// OpenCL backend — Generic GPU compute
// Targets: Any OpenCL 1.2+ device (NVIDIA, AMD, Intel, Mali, PowerVR)
// Features: Portable GPU compute, work-group optimization

#include <CL/cl.h>
#include <cstdint>
#include <cstring>

// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
// Inference-X — Universal Inference Protocol
// Morocco

// ── OpenCL kernel source ──
// OpenCL C source for the `q4_gemm` kernel, compiled at runtime by the host code.
//
// Each work-item produces one output element: global id 0 selects the column,
// global id 1 selects the row, and work-items outside the M x N range exit early.
//
// Data layout as indexed by the kernel:
//   weights : M rows of K/2 bytes; every byte packs two 4-bit values. The low
//             nibble pairs with input row k, the high nibble with input row k+1.
//   scales, mins : one float per weight row; a nibble q is expanded to
//             scales[row] * q + mins[row] before the multiply-accumulate.
//   input   : read as input[k * N + col]
//   output  : written as output[row * N + col]
//
// NOTE(review): the loop advances k by 2 and always reads row k+1, so an odd K
// makes the last iteration index input row K. Callers presumably guarantee an
// even K - confirm.
static const char* q4_gemm_cl_src = R"CL(
__kernel void q4_gemm(
    __global const uchar*  weights,
    __global const float*  input,
    __global float*        output,
    __global const float*  scales,
    __global const float*  mins,
    const int M, const int N, const int K
) {
    int row = get_global_id(1);
    int col = get_global_id(0);
    if (row >= M || col >= N) return;

    float sum = 0.0f;
    __global const uchar* w_row = weights + row * (K / 2);

    for (int k = 0; k < K; k += 2) {
        uchar packed = w_row[k / 2];
        float w0 = scales[row] * (float)(packed & 0x0F) + mins[row];
        float w1 = scales[row] * (float)(packed >> 4)   + mins[row];
        sum += w0 * input[k * N + col] + w1 * input[(k + 1) * N + col];
    }

    output[row * N + col] = sum;
}
)CL";

struct OpenCLGemmContext {{
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernel;
    cl_device_id device;
}};

// Releases every OpenCL object a failed q4_gemm_opencl_init() managed to create,
// clears the handles, and returns -1 so the caller can `return` it directly.
static int q4_gemm_opencl_init_fail(OpenCLGemmContext* ctx) {
    if (ctx->kernel) {
        clReleaseKernel(ctx->kernel);
    }
    if (ctx->program) {
        clReleaseProgram(ctx->program);
    }
    if (ctx->queue) {
        clReleaseCommandQueue(ctx->queue);
    }
    if (ctx->context) {
        clReleaseContext(ctx->context);
    }
    ctx->kernel = NULL;
    ctx->program = NULL;
    ctx->queue = NULL;
    ctx->context = NULL;
    ctx->device = NULL;
    return -1;
}

// Initializes `ctx` for Q4 GEMM: picks the first GPU device of the first
// OpenCL platform, creates a context and command queue on it, builds
// q4_gemm_cl_src with -cl-fast-relaxed-math, and creates the "q4_gemm" kernel.
//
// Returns 0 on success. Returns -1 if `ctx` is NULL, no platform or GPU device
// is available, or any OpenCL call fails; in that case every handle in `ctx`
// is left NULL and nothing is leaked.
extern "C" int q4_gemm_opencl_init(OpenCLGemmContext* ctx) {
    if (ctx == NULL) {
        return -1;
    }

    // Start from a known state so the failure path can release unconditionally.
    ctx->context = NULL;
    ctx->queue = NULL;
    ctx->program = NULL;
    ctx->kernel = NULL;
    ctx->device = NULL;

    cl_platform_id platform = NULL;
    cl_uint num_platforms = 0;
    if (clGetPlatformIDs(1, &platform, &num_platforms) != CL_SUCCESS || num_platforms == 0) {
        return -1;
    }

    cl_uint num_devices = 0;
    if (clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &ctx->device, &num_devices) != CL_SUCCESS
        || num_devices == 0) {
        ctx->device = NULL;
        return -1;
    }

    cl_int err = CL_SUCCESS;

    ctx->context = clCreateContext(NULL, 1, &ctx->device, NULL, NULL, &err);
    if (err != CL_SUCCESS || ctx->context == NULL) {
        return q4_gemm_opencl_init_fail(ctx);
    }

    ctx->queue = clCreateCommandQueue(ctx->context, ctx->device, 0, &err);
    if (err != CL_SUCCESS || ctx->queue == NULL) {
        return q4_gemm_opencl_init_fail(ctx);
    }

    size_t src_len = strlen(q4_gemm_cl_src);
    ctx->program = clCreateProgramWithSource(ctx->context, 1,
        &q4_gemm_cl_src, &src_len, &err);
    if (err != CL_SUCCESS || ctx->program == NULL) {
        return q4_gemm_opencl_init_fail(ctx);
    }

    // A failed build leaves the program without an executable; creating the
    // kernel from it would fail later with a less specific error.
    if (clBuildProgram(ctx->program, 1, &ctx->device, "-cl-fast-relaxed-math", NULL, NULL)
        != CL_SUCCESS) {
        return q4_gemm_opencl_init_fail(ctx);
    }

    ctx->kernel = clCreateKernel(ctx->program, "q4_gemm", &err);
    if (err != CL_SUCCESS || ctx->kernel == NULL) {
        return q4_gemm_opencl_init_fail(ctx);
    }

    return 0;
}

// Runs the q4_gemm kernel on host data and writes the result back to `output`.
//
// Buffer sizes follow the kernel's indexing:
//   weights : M * (K / 2) bytes, two 4-bit values per byte
//   input   : K * N floats
//   output  : M * N floats (written)
//   scales, mins : M floats each
//
// Device buffers are created, filled, and released on every call.
//
// Returns 0 on success. Returns -1 when `ctx` is not initialized, a required
// pointer is NULL, a dimension is negative, K is odd (the kernel always
// consumes weights in pairs), an index would not fit the kernel's 32-bit
// `int`, or any OpenCL call fails. M == 0 or N == 0 is a successful no-op;
// K == 0 zero-fills `output`.
extern "C" int q4_gemm_opencl(
    OpenCLGemmContext* ctx,
    const void* weights, const float* input, float* output,
    int M, int N, int K,
    const float* scales, const float* mins
) {
    if (ctx == NULL || ctx->context == NULL || ctx->queue == NULL || ctx->kernel == NULL) {
        return -1;
    }
    if (M < 0 || N < 0 || K < 0 || (K % 2) != 0) {
        return -1;
    }
    if (M == 0 || N == 0) {
        return 0;  // empty output, nothing to compute
    }
    if (output == NULL) {
        return -1;
    }

    // The kernel indexes with 32-bit int, so every element count must fit in
    // one; byte counts must additionally fit in the host's size_t.
    const cl_ulong index_limit = (cl_ulong)CL_INT_MAX;
    const cl_ulong float_limit = (cl_ulong)(~(size_t)0) / sizeof(float);
    const cl_ulong out_elems = (cl_ulong)M * (cl_ulong)N;
    const cl_ulong in_elems = (cl_ulong)K * (cl_ulong)N;
    const cl_ulong packed_count = (cl_ulong)M * (cl_ulong)(K / 2);
    if (out_elems > index_limit || in_elems > index_limit || packed_count > index_limit) {
        return -1;
    }
    if (out_elems > float_limit || in_elems > float_limit) {
        return -1;
    }

    const size_t weights_bytes = (size_t)packed_count;
    const size_t input_bytes = (size_t)in_elems * sizeof(float);
    const size_t output_bytes = (size_t)out_elems * sizeof(float);
    const size_t per_row_bytes = (size_t)M * sizeof(float);

    if (K == 0) {
        // An empty reduction yields zero for every element; zero-sized device
        // buffers cannot be created, so handle this on the host.
        memset(output, 0, output_bytes);
        return 0;
    }
    if (weights == NULL || input == NULL || scales == NULL || mins == NULL) {
        return -1;
    }

    // All locals are declared before the first `goto` so no jump crosses an
    // initialization.
    int rc = -1;
    cl_int err = CL_SUCCESS;
    cl_mem d_weights = NULL;
    cl_mem d_input = NULL;
    cl_mem d_output = NULL;
    cl_mem d_scales = NULL;
    cl_mem d_mins = NULL;
    const cl_mem_flags upload = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;

    // Global size is rounded up to a multiple of the 16x16 work-group; the
    // kernel's bounds check discards the padding work-items.
    size_t global[2] = {
        ((size_t)N + 15) & ~(size_t)15,
        ((size_t)M + 15) & ~(size_t)15
    };
    size_t local[2] = { 16, 16 };

    d_weights = clCreateBuffer(ctx->context, upload, weights_bytes, (void*)weights, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }
    d_input = clCreateBuffer(ctx->context, upload, input_bytes, (void*)input, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }
    d_output = clCreateBuffer(ctx->context, CL_MEM_WRITE_ONLY, output_bytes, NULL, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }
    d_scales = clCreateBuffer(ctx->context, upload, per_row_bytes, (void*)scales, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }
    d_mins = clCreateBuffer(ctx->context, upload, per_row_bytes, (void*)mins, &err);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    // Argument order matches the kernel signature in q4_gemm_cl_src.
    err = clSetKernelArg(ctx->kernel, 0, sizeof(cl_mem), &d_weights);
    err |= clSetKernelArg(ctx->kernel, 1, sizeof(cl_mem), &d_input);
    err |= clSetKernelArg(ctx->kernel, 2, sizeof(cl_mem), &d_output);
    err |= clSetKernelArg(ctx->kernel, 3, sizeof(cl_mem), &d_scales);
    err |= clSetKernelArg(ctx->kernel, 4, sizeof(cl_mem), &d_mins);
    err |= clSetKernelArg(ctx->kernel, 5, sizeof(int), &M);
    err |= clSetKernelArg(ctx->kernel, 6, sizeof(int), &N);
    err |= clSetKernelArg(ctx->kernel, 7, sizeof(int), &K);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    err = clEnqueueNDRangeKernel(ctx->queue, ctx->kernel, 2, NULL, global, local, 0, NULL, NULL);
    if (err == CL_INVALID_WORK_GROUP_SIZE || err == CL_INVALID_WORK_ITEM_SIZE) {
        // Device cannot run 16x16 work-groups; let the implementation choose.
        err = clEnqueueNDRangeKernel(ctx->queue, ctx->kernel, 2, NULL, global, NULL, 0, NULL, NULL);
    }
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    if (clFinish(ctx->queue) != CL_SUCCESS) {
        goto cleanup;
    }

    err = clEnqueueReadBuffer(ctx->queue, d_output, CL_TRUE, 0, output_bytes, output,
                              0, NULL, NULL);
    if (err != CL_SUCCESS) {
        goto cleanup;
    }

    rc = 0;

cleanup:
    if (d_mins) {
        clReleaseMemObject(d_mins);
    }
    if (d_scales) {
        clReleaseMemObject(d_scales);
    }
    if (d_output) {
        clReleaseMemObject(d_output);
    }
    if (d_input) {
        clReleaseMemObject(d_input);
    }
    if (d_weights) {
        clReleaseMemObject(d_weights);
    }
    return rc;
}