// NVIDIA CUDA backend — cuBLAS + custom GEMM kernels
// Targets: SM 5.0+ (Maxwell → Blackwell)
// Features: FP16 tensor cores, INT8 dp4a, mixed-precision accumulation
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <cstdint>
#ifdef INFERENCE_X_CUBLAS
#include <cublas_v2.h>
#endif

// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
// Inference-X — Universal Inference Protocol
// Morocco

// ── Dequantize Q4_K block on GPU ──
// Unpacks one quantized block into `dst`.
// Block layout assumed here: [float d (scale)][float m (min)][k/2 bytes of
// packed 4-bit values, two per byte] — TODO confirm against the CPU-side
// Q4_K definition.
// Precondition: must be called with blockDim.x >= k/2; threads with
// tid >= k/2 do nothing, so smaller blocks silently leave outputs unwritten.
__device__ void dequantize_q4_k_cuda(const void* src, float* dst, int k) {
    const uint8_t* qs = (const uint8_t*)src + sizeof(float) * 2; // skip d/m header
    const float d = *(const float*)src;        // per-block scale
    const float m = *((const float*)src + 1);  // per-block offset (min)
    int tid = threadIdx.x;
    if (tid < k / 2) {
        // Each byte packs two 4-bit quants: low nibble first, high nibble second.
        uint8_t byte = qs[tid];
        dst[tid * 2 + 0] = d * (float)(byte & 0x0F) + m;
        dst[tid * 2 + 1] = d * (float)(byte >> 4) + m;
    }
}

// ── Q4 GEMM kernel — fused dequant + matmul ──
// Computes C[row][col] = sum_k dequant(A[row][k]) * B[k][col] with one
// thread per output element.
//   A      : packed 4-bit weights, row-major, K/2 bytes per row
//   B      : fp32 activations, row-major [K x N]
//   C      : fp32 output, row-major [M x N]
//   scales : per-row dequant scale  (length M)
//   mins   : per-row dequant offset (length M)
// Expected launch: 2-D grid of 2-D blocks covering [N x M] (x → col, y → row);
// out-of-range threads exit via the bounds guard. No dynamic shared memory
// is required. Precondition: K must be even (two quants per packed byte).
__global__ void q4_gemm_cuda_kernel(
    const void*  __restrict__ A,   // quantized weights [M x K/2]
    const float* __restrict__ B,   // activations [K x N]
    float*       __restrict__ C,   // output [M x N]
    int M, int N, int K,
    const float* scales, const float* mins
) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) return;

    float sum = 0.0f;
    const uint8_t* weight_row = (const uint8_t*)A + (size_t)row * (K / 2);
    const float scale = scales[row]; // hoist row-invariant loads out of the loop
    const float min_v = mins[row];

    // Fused dequant + dot product: unpack two weights per byte, accumulate
    // two multiply-adds per iteration. B accesses at fixed k are contiguous
    // in `col`, so they coalesce across the warp.
    for (int k = 0; k < K; k += 2) {
        uint8_t packed = weight_row[k / 2];
        float w0 = scale * (float)(packed & 0x0F) + min_v;
        float w1 = scale * (float)(packed >> 4)   + min_v;
        sum += w0 * B[k * N + col] + w1 * B[(k + 1) * N + col];
    }
    C[row * N + col] = sum;
}

// ── FP16 path (SM >= 7.0) ──
// Same contract as q4_gemm_cuda_kernel but with half-precision activations,
// output, and per-row scale/min. Dequant math is done in fp32 and converted
// back; the running sum is accumulated in fp16 (__hadd/__hmul require SM53+,
// used here on the SM70+ path).
// NOTE(review): despite the original comment, this is a scalar per-thread
// kernel — no nvcuda::wmma fragments are used yet.
// The !defined(__CUDA_ARCH__) arm keeps the host-side kernel stub alive:
// __CUDA_ARCH__ is undefined during the host compilation pass, and guarding
// a __global__ definition on it alone would strip the launchable symbol.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
__global__ void q4_gemm_cuda_fp16(
    const void* __restrict__ A,
    const half* __restrict__ B,
    half*       __restrict__ C,
    int M, int N, int K,
    const half* scales, const half* mins
) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) return;

    half sum = __float2half(0.0f);
    const uint8_t* weight_row = (const uint8_t*)A + (size_t)row * (K / 2);
    const float scale = __half2float(scales[row]); // convert once per thread
    const float min_v = __half2float(mins[row]);

    for (int k = 0; k < K; k += 2) {
        uint8_t packed = weight_row[k / 2];
        half w0 = __float2half(scale * (float)(packed & 0x0F) + min_v);
        half w1 = __float2half(scale * (float)(packed >> 4)   + min_v);
        sum = __hadd(sum, __hadd(__hmul(w0, B[k * N + col]),
                                 __hmul(w1, B[(k + 1) * N + col])));
    }
    C[row * N + col] = sum;
}
#endif

// ── Launch wrapper ──
// Enqueues the fp32 Q4 GEMM on `stream` (asynchronous; caller is responsible
// for stream synchronization and for checking cudaGetLastError() to catch
// launch-configuration failures — this wrapper returns void by contract).
// All pointers must be device pointers.
extern "C" void q4_gemm_cuda(
    const void* weights, const float* input, float* output,
    int M, int N, int K,
    const float* scales, const float* mins,
    cudaStream_t stream
) {
    // 16x16 = 256 threads per block; ceil-div grid covers the [N x M] output
    // so partial edge tiles are handled by the kernel's bounds guard.
    dim3 block(16, 16);
    dim3 grid((N + 15) / 16, (M + 15) / 16);
    q4_gemm_cuda_kernel<<<grid, block, 0, stream>>>(
        weights, input, output, M, N, K, scales, mins
    );
}