// OpenCL backend — Generic GPU compute // Targets: Any OpenCL 1.2+ device (NVIDIA, AMD, Intel, Mali, PowerVR) // Features: Portable GPU compute, work-group optimization #include #include #include // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1 // Inference-X — Universal Inference Protocol // Morocco // ── OpenCL kernel source ── static const char* q4_gemm_cl_src = R"CL( __kernel void q4_gemm( __global const uchar* weights, __global const float* input, __global float* output, __global const float* scales, __global const float* mins, const int M, const int N, const int K ) { int row = get_global_id(1); int col = get_global_id(0); if (row >= M || col >= N) return; float sum = 0.0f; __global const uchar* w_row = weights + row * (K / 2); for (int k = 0; k < K; k += 2) { uchar packed = w_row[k / 2]; float w0 = scales[row] * (float)(packed & 0x0F) + mins[row]; float w1 = scales[row] * (float)(packed >> 4) + mins[row]; sum += w0 * input[k * N + col] + w1 * input[(k + 1) * N + col]; } output[row * N + col] = sum; } )CL"; struct OpenCLGemmContext {{ cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_device_id device; }}; extern "C" int q4_gemm_opencl_init(OpenCLGemmContext* ctx) {{ cl_platform_id platform; cl_uint num_platforms; clGetPlatformIDs(1, &platform, &num_platforms); if (num_platforms == 0) return -1; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ctx->device, NULL); ctx->context = clCreateContext(NULL, 1, &ctx->device, NULL, NULL, NULL); ctx->queue = clCreateCommandQueue(ctx->context, ctx->device, 0, NULL); size_t src_len = strlen(q4_gemm_cl_src); ctx->program = clCreateProgramWithSource(ctx->context, 1, &q4_gemm_cl_src, &src_len, NULL); clBuildProgram(ctx->program, 1, &ctx->device, "-cl-fast-relaxed-math", NULL, NULL); ctx->kernel = clCreateKernel(ctx->program, "q4_gemm", NULL); return 0; }} extern "C" int q4_gemm_opencl( OpenCLGemmContext* ctx, const void* weights, const float* input, float* output, int M, int N, int K, const float* scales, const float* mins ) {{ size_t global[2] = {{ (size_t)((N + 15) & ~15), (size_t)((M + 15) & ~15) }}; size_t local[2] = {{ 16, 16 }}; // Set kernel arguments and enqueue clSetKernelArg(ctx->kernel, 5, sizeof(int), &M); clSetKernelArg(ctx->kernel, 6, sizeof(int), &N); clSetKernelArg(ctx->kernel, 7, sizeof(int), &K); clEnqueueNDRangeKernel(ctx->queue, ctx->kernel, 2, NULL, global, local, 0, NULL, NULL); clFinish(ctx->queue); return 0; }}