inference-x/backends/q4_kernels/opencl/q4_gemm_opencl.cpp
Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
Better output from the same model. Fused computation, adaptive precision,
surgical expert loading. 305 KB, 19 backends, zero dependencies.

https://inference-x.com
2026-02-23 07:10:47 +00:00

88 lines
2.7 KiB
C++

// OpenCL backend — Generic GPU compute
// Targets: Any OpenCL 1.2+ device (NVIDIA, AMD, Intel, Mali, PowerVR)
// Features: Portable GPU compute, work-group optimization
#include <CL/cl.h>
#include <cstdint>
#include <cstring>
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
// Inference-X — Universal Inference Protocol
// Morocco
// ── OpenCL kernel source ──
// OpenCL C source for the 4-bit quantized GEMM kernel; compiled at runtime
// by q4_gemm_opencl_init and launched by q4_gemm_opencl.
//
// Data layout as consumed by this kernel:
//   weights : M x (K/2) bytes, row-major; two 4-bit values packed per byte,
//             low nibble = element k, high nibble = element k+1.
//   input   : K x N floats, row-major (indexed input[k * N + col]).
//   output  : M x N floats, row-major; one element per work-item.
//   scales / mins : one float per output row — per-row affine
//             dequantization  w = scales[row] * q + mins[row].
//
// Global work size is (N, M) rounded up by the host; out-of-range
// work-items exit early via the row/col bounds check.
static const char* q4_gemm_cl_src = R"CL(
__kernel void q4_gemm(
__global const uchar* weights,
__global const float* input,
__global float* output,
__global const float* scales,
__global const float* mins,
const int M, const int N, const int K
) {
int row = get_global_id(1);
int col = get_global_id(0);
if (row >= M || col >= N) return;
float sum = 0.0f;
__global const uchar* w_row = weights + row * (K / 2);
for (int k = 0; k < K; k += 2) {
uchar packed = w_row[k / 2];
float w0 = scales[row] * (float)(packed & 0x0F) + mins[row];
float w1 = scales[row] * (float)(packed >> 4) + mins[row];
sum += w0 * input[k * N + col] + w1 * input[(k + 1) * N + col];
}
output[row * N + col] = sum;
}
)CL";
// Per-instance OpenCL state for the q4 GEMM backend.
// Populated by q4_gemm_opencl_init; used by q4_gemm_opencl.
// (Fix: the generated declaration used doubled braces "{{ ... }}",
// a templating artifact that does not compile.)
struct OpenCLGemmContext {
    cl_context context;      // owning OpenCL context
    cl_command_queue queue;  // in-order command queue on `device`
    cl_program program;      // program built from q4_gemm_cl_src
    cl_kernel kernel;        // "q4_gemm" kernel object
    cl_device_id device;     // selected GPU device
};
// Initialize the OpenCL backend: pick the first GPU device on the first
// platform, create a context and command queue, and compile the q4_gemm
// kernel from q4_gemm_cl_src with -cl-fast-relaxed-math.
//
// Returns 0 on success, -1 on any failure (no platform/GPU, context or
// queue creation error, program build error, kernel lookup error).
// On failure the context may be partially initialized and must not be used.
// (Fixes: doubled-brace syntax errors, and every OpenCL error status was
// previously discarded — a failed build still returned 0.)
extern "C" int q4_gemm_opencl_init(OpenCLGemmContext* ctx) {
    if (ctx == NULL) return -1;
    cl_platform_id platform = NULL;
    cl_uint num_platforms = 0;
    cl_int err = clGetPlatformIDs(1, &platform, &num_platforms);
    if (err != CL_SUCCESS || num_platforms == 0) return -1;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ctx->device, NULL);
    if (err != CL_SUCCESS) return -1;
    ctx->context = clCreateContext(NULL, 1, &ctx->device, NULL, NULL, &err);
    if (err != CL_SUCCESS) return -1;
    // Plain in-order queue; clCreateCommandQueue is the OpenCL 1.2 API
    // this backend targets.
    ctx->queue = clCreateCommandQueue(ctx->context, ctx->device, 0, &err);
    if (err != CL_SUCCESS) return -1;
    size_t src_len = strlen(q4_gemm_cl_src);
    ctx->program = clCreateProgramWithSource(ctx->context, 1,
                                             &q4_gemm_cl_src, &src_len, &err);
    if (err != CL_SUCCESS) return -1;
    err = clBuildProgram(ctx->program, 1, &ctx->device,
                         "-cl-fast-relaxed-math", NULL, NULL);
    if (err != CL_SUCCESS) return -1;
    ctx->kernel = clCreateKernel(ctx->program, "q4_gemm", &err);
    return (err == CL_SUCCESS) ? 0 : -1;
}
// Run one quantized GEMM on the device: output[M x N] = dequant(weights) * input.
//   weights : M x (K/2) bytes, two 4-bit values per byte (host memory)
//   input   : K x N floats (host memory)
//   output  : M x N floats, written on success (host memory)
//   scales, mins : M floats each, per-row dequantization parameters
// Returns 0 on success, -1 on any OpenCL error. K must be even.
//
// (Fixes: doubled-brace syntax errors; the original never created device
// buffers, never set kernel args 0-4, and never read the result back, so
// the kernel ran with unset arguments and `output` was never written.)
extern "C" int q4_gemm_opencl(
    OpenCLGemmContext* ctx,
    const void* weights, const float* input, float* output,
    int M, int N, int K,
    const float* scales, const float* mins
) {
    int rc = -1;
    cl_int err = CL_SUCCESS;
    const size_t w_bytes   = (size_t)M * (size_t)(K / 2);
    const size_t in_bytes  = (size_t)K * (size_t)N * sizeof(float);
    const size_t out_bytes = (size_t)M * (size_t)N * sizeof(float);
    const size_t row_bytes = (size_t)M * sizeof(float);
    cl_mem d_w = NULL, d_in = NULL, d_out = NULL, d_scales = NULL, d_mins = NULL;
    // Round global sizes up to the 16x16 work-group; the kernel's bounds
    // check discards the padding work-items.
    size_t global[2] = { (size_t)((N + 15) & ~15), (size_t)((M + 15) & ~15) };
    size_t local[2] = { 16, 16 };

    // Upload inputs (COPY_HOST_PTR snapshots the host data at creation).
    d_w = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                         w_bytes, (void*)weights, &err);
    if (err != CL_SUCCESS) goto cleanup;
    d_in = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                          in_bytes, (void*)input, &err);
    if (err != CL_SUCCESS) goto cleanup;
    d_out = clCreateBuffer(ctx->context, CL_MEM_WRITE_ONLY, out_bytes, NULL, &err);
    if (err != CL_SUCCESS) goto cleanup;
    d_scales = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              row_bytes, (void*)scales, &err);
    if (err != CL_SUCCESS) goto cleanup;
    d_mins = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            row_bytes, (void*)mins, &err);
    if (err != CL_SUCCESS) goto cleanup;

    // Bind all eight kernel arguments (order matches the kernel signature).
    err |= clSetKernelArg(ctx->kernel, 0, sizeof(cl_mem), &d_w);
    err |= clSetKernelArg(ctx->kernel, 1, sizeof(cl_mem), &d_in);
    err |= clSetKernelArg(ctx->kernel, 2, sizeof(cl_mem), &d_out);
    err |= clSetKernelArg(ctx->kernel, 3, sizeof(cl_mem), &d_scales);
    err |= clSetKernelArg(ctx->kernel, 4, sizeof(cl_mem), &d_mins);
    err |= clSetKernelArg(ctx->kernel, 5, sizeof(int), &M);
    err |= clSetKernelArg(ctx->kernel, 6, sizeof(int), &N);
    err |= clSetKernelArg(ctx->kernel, 7, sizeof(int), &K);
    if (err != CL_SUCCESS) goto cleanup;

    err = clEnqueueNDRangeKernel(ctx->queue, ctx->kernel, 2, NULL,
                                 global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS) goto cleanup;
    // Blocking read also synchronizes the queue before we return.
    err = clEnqueueReadBuffer(ctx->queue, d_out, CL_TRUE, 0, out_bytes,
                              output, 0, NULL, NULL);
    if (err == CL_SUCCESS) rc = 0;

cleanup:
    if (d_mins) clReleaseMemObject(d_mins);
    if (d_scales) clReleaseMemObject(d_scales);
    if (d_out) clReleaseMemObject(d_out);
    if (d_in) clReleaseMemObject(d_in);
    if (d_w) clReleaseMemObject(d_w);
    return rc;
}