Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
88 lines
2.7 KiB
C++
88 lines
2.7 KiB
C++
// OpenCL backend — Generic GPU compute
|
|
// Targets: Any OpenCL 1.2+ device (NVIDIA, AMD, Intel, Mali, PowerVR)
|
|
// Features: Portable GPU compute, work-group optimization
|
|
|
|
#include <CL/cl.h>
|
|
#include <cstdint>
|
|
#include <cstring>
|
|
|
|
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
|
|
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
|
|
// Inference-X — Universal Inference Protocol
|
|
// Morocco
|
|
|
|
// ── OpenCL kernel source ──
|
|
// OpenCL C source for the 4-bit quantized GEMM kernel.
//
// Layout assumed from the indexing below — confirm against the host packer:
//   weights: M x K, two 4-bit codes per byte (low nibble = even k, high = odd k)
//   input:   K x N, row-major float
//   output:  M x N, row-major float
//   scales/mins: length-M per-row dequantization params: w = scale * q + min
static const char* q4_gemm_cl_src = R"CL(
__kernel void q4_gemm(
    __global const uchar* weights,
    __global const float* input,
    __global float* output,
    __global const float* scales,
    __global const float* mins,
    const int M, const int N, const int K
) {
    int row = get_global_id(1);
    int col = get_global_id(0);
    // The host rounds the NDRange up to the work-group size; drop padding threads.
    if (row >= M || col >= N) return;

    // Hoist the per-row dequantization constants out of the K loop —
    // they were previously re-read from global memory on every iteration.
    const float scale = scales[row];
    const float base  = mins[row];

    float sum = 0.0f;
    __global const uchar* w_row = weights + row * (K / 2);

    for (int k = 0; k < K; k += 2) {
        uchar packed = w_row[k / 2];
        float w0 = scale * (float)(packed & 0x0F) + base;
        float w1 = scale * (float)(packed >> 4) + base;
        sum += w0 * input[k * N + col] + w1 * input[(k + 1) * N + col];
    }

    output[row * N + col] = sum;
}
)CL";
|
|
|
|
struct OpenCLGemmContext {{
|
|
cl_context context;
|
|
cl_command_queue queue;
|
|
cl_program program;
|
|
cl_kernel kernel;
|
|
cl_device_id device;
|
|
}};
|
|
|
|
// Initialize OpenCL state: pick the first GPU of the first platform, build
// the Q4 GEMM program, and create the kernel.
// Returns 0 on success, a distinct negative code on each failure point
// (previously every CL call was unchecked, so a missing GPU or a kernel
// build error surfaced only as a crash on first use).
extern "C" int q4_gemm_opencl_init(OpenCLGemmContext* ctx) {
    cl_platform_id platform;
    cl_uint num_platforms = 0;
    if (clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS ||
        num_platforms == 0) {
        return -1;  // no OpenCL platform installed
    }

    if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1,
                       &ctx->device, NULL) != CL_SUCCESS) {
        return -2;  // platform has no GPU device
    }

    cl_int err = CL_SUCCESS;
    ctx->context = clCreateContext(NULL, 1, &ctx->device, NULL, NULL, &err);
    if (err != CL_SUCCESS) return -3;

    // clCreateCommandQueue is deprecated in OpenCL 2.0+, but it is the
    // portable call for the 1.2+ devices this backend targets.
    ctx->queue = clCreateCommandQueue(ctx->context, ctx->device, 0, &err);
    if (err != CL_SUCCESS) return -4;

    size_t src_len = strlen(q4_gemm_cl_src);
    ctx->program = clCreateProgramWithSource(ctx->context, 1,
                                             &q4_gemm_cl_src, &src_len, &err);
    if (err != CL_SUCCESS) return -5;

    // Fail fast on a build error instead of letting clCreateKernel return
    // an invalid kernel later.
    if (clBuildProgram(ctx->program, 1, &ctx->device,
                       "-cl-fast-relaxed-math", NULL, NULL) != CL_SUCCESS) {
        return -6;
    }

    ctx->kernel = clCreateKernel(ctx->program, "q4_gemm", &err);
    if (err != CL_SUCCESS) return -7;

    return 0;
}
|
|
|
|
// Run the Q4 GEMM: output[M x N] = dequant(weights[M x K/2]) * input[K x N].
// weights packs two 4-bit codes per byte; scales/mins are per-row (length M).
// Returns 0 on success, a negative value on any OpenCL error.
//
// Fixes a fatal defect in the original: it never created device buffers,
// never uploaded weights/input/scales/mins, bound only kernel args 5-7
// (args 0-4 were never set, so the enqueue fails with
// CL_INVALID_KERNEL_ARGS), never read the result back into `output`, and
// unconditionally returned 0.
extern "C" int q4_gemm_opencl(
    OpenCLGemmContext* ctx,
    const void* weights, const float* input, float* output,
    int M, int N, int K,
    const float* scales, const float* mins
) {
    int rc = -1;
    cl_int err = CL_SUCCESS;

    // Byte sizes of each operand (weights hold two 4-bit values per byte).
    const size_t w_bytes   = (size_t)M * (size_t)(K / 2);
    const size_t in_bytes  = (size_t)K * (size_t)N * sizeof(float);
    const size_t out_bytes = (size_t)M * (size_t)N * sizeof(float);
    const size_t row_bytes = (size_t)M * sizeof(float);

    // Round the NDRange up to the 16x16 work-group size; the kernel's own
    // bounds check discards the padded threads.
    size_t global[2] = { (size_t)((N + 15) & ~15), (size_t)((M + 15) & ~15) };
    size_t local[2]  = { 16, 16 };

    cl_mem d_w = NULL, d_in = NULL, d_out = NULL, d_scales = NULL, d_mins = NULL;

    // Upload inputs; CL_MEM_COPY_HOST_PTR performs the host->device copy.
    d_w = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                         w_bytes, (void*)weights, &err);
    if (err != CL_SUCCESS) goto out;
    d_in = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                          in_bytes, (void*)input, &err);
    if (err != CL_SUCCESS) goto out;
    d_scales = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                              row_bytes, (void*)scales, &err);
    if (err != CL_SUCCESS) goto out;
    d_mins = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                            row_bytes, (void*)mins, &err);
    if (err != CL_SUCCESS) goto out;
    d_out = clCreateBuffer(ctx->context, CL_MEM_WRITE_ONLY, out_bytes, NULL, &err);
    if (err != CL_SUCCESS) goto out;

    // Bind all eight arguments in kernel-signature order.
    err  = clSetKernelArg(ctx->kernel, 0, sizeof(cl_mem), &d_w);
    err |= clSetKernelArg(ctx->kernel, 1, sizeof(cl_mem), &d_in);
    err |= clSetKernelArg(ctx->kernel, 2, sizeof(cl_mem), &d_out);
    err |= clSetKernelArg(ctx->kernel, 3, sizeof(cl_mem), &d_scales);
    err |= clSetKernelArg(ctx->kernel, 4, sizeof(cl_mem), &d_mins);
    err |= clSetKernelArg(ctx->kernel, 5, sizeof(int), &M);
    err |= clSetKernelArg(ctx->kernel, 6, sizeof(int), &N);
    err |= clSetKernelArg(ctx->kernel, 7, sizeof(int), &K);
    if (err != CL_SUCCESS) goto out;

    err = clEnqueueNDRangeKernel(ctx->queue, ctx->kernel, 2, NULL,
                                 global, local, 0, NULL, NULL);
    if (err != CL_SUCCESS) goto out;

    // Blocking read: waits for the kernel and copies the result into `output`.
    err = clEnqueueReadBuffer(ctx->queue, d_out, CL_TRUE, 0, out_bytes,
                              output, 0, NULL, NULL);
    if (err != CL_SUCCESS) goto out;

    rc = 0;

out:
    if (d_mins)   clReleaseMemObject(d_mins);
    if (d_scales) clReleaseMemObject(d_scales);
    if (d_out)    clReleaseMemObject(d_out);
    if (d_in)     clReleaseMemObject(d_in);
    if (d_w)      clReleaseMemObject(d_w);
    return rc;
}
|