// inference-x/backends/q4_kernels/cuda/q4_gemm_cuda.cu
// Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
// Better output from the same model. Fused computation, adaptive precision,
// surgical expert loading. 305 KB, 19 backends, zero dependencies.
//
// https://inference-x.com
// 2026-02-23 07:10:47 +00:00
//
// (103 lines, 3.4 KiB — viewer metadata retained as a comment)
// NVIDIA CUDA backend — cuBLAS + custom GEMM kernels
// Targets: SM 5.0+ (Maxwell → Blackwell)
// Features: FP16 tensor cores, INT8 dp4a, mixed-precision accumulation
#include <cstdint>        // uint8_t (quantized weight bytes)
#include <cstdio>         // fprintf for launch-error reporting
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#ifdef INFERENCE_X_CUBLAS
#include <cublas_v2.h>
#endif
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
// Inference-X — Universal Inference Protocol
// Morocco
// ── Dequantize Q4_K block on GPU ──
// Unpacks k 4-bit weights from `src` into k floats in `dst`.
// Layout, as established by the reads below: [float d][float m][k/2 packed bytes],
// where each byte holds two weights (low nibble first): w = d * nibble + m.
// NOTE(review): canonical GGML Q4_K uses half-precision super-block scales; this
// simpler [f32 d, f32 m] header must match whatever packed the data — confirm upstream.
// Cooperative: all threads of the block participate; a block-stride loop covers
// all k/2 bytes regardless of blockDim.x (no unwritten tail when blockDim.x < k/2).
__device__ void dequantize_q4_k_cuda(const void* src, float* dst, int k) {
    const float d = *(const float*)src;            // per-block scale
    const float m = *((const float*)src + 1);      // per-block minimum (offset)
    const uint8_t* qs = (const uint8_t*)src + 2 * sizeof(float); // packed nibbles
    for (int i = threadIdx.x; i < k / 2; i += blockDim.x) {
        uint8_t byte = qs[i];
        dst[i * 2 + 0] = d * (float)(byte & 0x0F) + m; // low nibble
        dst[i * 2 + 1] = d * (float)(byte >> 4) + m;   // high nibble
    }
}
// ── Q4 GEMM kernel — fused dequant + matmul ──
// One thread computes one output element C[row][col].
// Launch: 2D grid/block covering (N cols on x, M rows on y); no shared memory
// is used, so launch with 0 dynamic smem. Ragged edges are bounds-checked.
// A is row-major packed: row r occupies K/2 bytes, two 4-bit weights per byte
// (low nibble = even k, high nibble = odd k); requires K even.
// B is dense row-major [K x N] — adjacent threads (consecutive col) touch
// adjacent floats, so B loads coalesce; all lanes of a row share A bytes.
__global__ void q4_gemm_cuda_kernel(
    const void* __restrict__ A,        // quantized weights, [M x K/2] bytes
    const float* __restrict__ B,       // activations [K x N]
    float* __restrict__ C,             // output [M x N]
    int M, int N, int K,
    const float* __restrict__ scales,  // per-row dequant scale [M]
    const float* __restrict__ mins     // per-row dequant offset [M]
) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) return;

    // Hoist loop-invariant per-row dequant constants out of the K loop.
    const float s = scales[row];
    const float m = mins[row];
    // size_t promotion: row * (K/2) and k * N can overflow int for large models.
    const uint8_t* weight_row = (const uint8_t*)A + (size_t)row * (size_t)(K / 2);

    float sum = 0.0f;
    // Fused dequant + dot product: unpack two nibbles per byte of A.
    for (int k = 0; k < K; k += 2) {
        uint8_t packed = weight_row[k / 2];
        float w0 = s * (float)(packed & 0x0F) + m;
        float w1 = s * (float)(packed >> 4) + m;
        sum += w0 * B[(size_t)k * N + col] + w1 * B[(size_t)(k + 1) * N + col];
    }
    C[(size_t)row * N + col] = sum;
}
// ── FP16 path (SM >= 7.0) ──
// NOTE(review): despite the original "tensor core" label, this is a scalar
// per-thread kernel — no nvcuda::wmma fragments are used. B and C are half,
// but the dot product accumulates in float: chained half additions lose
// precision rapidly over long K, and fp32 accumulation matches the file's
// stated "mixed-precision accumulation" design (header, L15 of this file).
// Guard note: when __CUDA_ARCH__ is undefined (host compilation pass) this
// kernel is compiled out entirely, so any launcher must sit behind the same
// guard or dispatch at runtime on device compute capability.
#if __CUDA_ARCH__ >= 700
__global__ void q4_gemm_cuda_fp16(
    const void* __restrict__ A,      // quantized weights, [M x K/2] bytes
    const half* __restrict__ B,      // activations [K x N], fp16
    half* __restrict__ C,            // output [M x N], fp16
    int M, int N, int K,
    const half* scales, const half* mins  // per-row dequant params [M]
) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= M || col >= N) return;

    // Hoist per-row dequant constants and widen them to float once.
    const float s = __half2float(scales[row]);
    const float m = __half2float(mins[row]);
    const uint8_t* weight_row = (const uint8_t*)A + (size_t)row * (size_t)(K / 2);

    float sum = 0.0f;  // fp32 accumulator (see header note)
    for (int k = 0; k < K; k += 2) {
        uint8_t packed = weight_row[k / 2];
        float w0 = s * (float)(packed & 0x0F) + m;
        float w1 = s * (float)(packed >> 4) + m;
        sum += w0 * __half2float(B[(size_t)k * N + col])
             + w1 * __half2float(B[(size_t)(k + 1) * N + col]);
    }
    // Single rounding to half at the store.
    C[(size_t)row * N + col] = __float2half(sum);
}
#endif
// ── Launch wrapper ──
// Enqueues the fp32 Q4 GEMM on `stream` with a 16x16 thread block and a grid
// ceil-divided to cover the full [M x N] output (the kernel bounds-checks the
// ragged edge, so any M/N work).
// Preconditions: all pointers are device pointers; K must be even (two 4-bit
// weights per byte of `weights`).
// Asynchronous: returns immediately; the caller must synchronize `stream`
// before reading `output`.
extern "C" void q4_gemm_cuda(
    const void* weights, const float* input, float* output,
    int M, int N, int K,
    const float* scales, const float* mins,
    cudaStream_t stream
) {
    const dim3 block(16, 16);
    // Derive the grid from the block dims instead of hard-coding 15/16 twice.
    const dim3 grid((N + block.x - 1) / block.x,
                    (M + block.y - 1) / block.y);
    q4_gemm_cuda_kernel<<<grid, block, 0, stream>>>(
        weights, input, output, M, N, K, scales, mins
    );
    // Kernel launches return no status; launch-configuration errors only
    // surface via cudaGetLastError (execution errors appear at the next sync).
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "q4_gemm_cuda: kernel launch failed: %s\n",
                cudaGetErrorString(err));
    }
}