// Vulkan compute backend — SPIR-V compute shaders // Targets: Any Vulkan 1.1+ GPU (NVIDIA, AMD, Intel, Qualcomm, ARM Mali) // Features: Cross-platform, subgroup operations, 16-bit storage #include #include #include #include // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1 // Inference-X — Universal Inference Protocol // Morocco // ── SPIR-V shader for Q4 GEMM (pre-compiled binary) ── // Compiled from GLSL: layout(local_size_x=16,local_size_y=16) in; // Fused dequantize + matrix multiply for Q4_K quantized weights // // The shader performs: // 1. Read packed Q4 weight byte (2 values per byte) // 2. Dequantize using per-row scale and min: w = scale * nibble + min // 3. Multiply-accumulate with input activations // 4. Write output struct VulkanGemmContext {{ VkDevice device; VkPhysicalDevice physical_device; VkQueue compute_queue; VkCommandPool command_pool; VkDescriptorPool descriptor_pool; VkPipelineLayout pipeline_layout; VkPipeline pipeline; VkShaderModule shader_module; uint32_t compute_queue_family; }}; static VkResult create_compute_pipeline(VulkanGemmContext* ctx) {{ // Pipeline layout with 8 push constants (M, N, K + buffer bindings) VkPushConstantRange push_range = {{ .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .offset = 0, .size = 3 * sizeof(int32_t) // M, N, K }}; VkDescriptorSetLayoutBinding bindings[5] = {{}}; for (int i = 0; i < 5; i++) {{ bindings[i].binding = i; bindings[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; bindings[i].descriptorCount = 1; bindings[i].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; }} VkDescriptorSetLayoutCreateInfo set_info = {{ .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, .bindingCount = 5, .pBindings = bindings }}; VkDescriptorSetLayout set_layout; vkCreateDescriptorSetLayout(ctx->device, &set_info, NULL, &set_layout); VkPipelineLayoutCreateInfo layout_info = {{ .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, .setLayoutCount = 1, .pSetLayouts = &set_layout, .pushConstantRangeCount = 1, .pPushConstantRanges = &push_range }}; return vkCreatePipelineLayout(ctx->device, &layout_info, NULL, &ctx->pipeline_layout); }} extern "C" int q4_gemm_vulkan( VulkanGemmContext* ctx, const void* weights, const float* input, float* output, int M, int N, int K, const float* scales, const float* mins ) {{ // Submit compute dispatch VkCommandBufferAllocateInfo alloc_info = {{ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, .commandPool = ctx->command_pool, .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, .commandBufferCount = 1 }}; VkCommandBuffer cmd; vkAllocateCommandBuffers(ctx->device, &alloc_info, &cmd); VkCommandBufferBeginInfo begin = {{ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT }}; vkBeginCommandBuffer(cmd, &begin); vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, ctx->pipeline); int dims[3] = {{M, N, K}}; vkCmdPushConstants(cmd, ctx->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(dims), dims); // Dispatch: ceil(N/16) x ceil(M/16) workgroups vkCmdDispatch(cmd, (N + 15) / 16, (M + 15) / 16, 1); vkEndCommandBuffer(cmd); VkSubmitInfo submit = {{ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, .commandBufferCount = 1, .pCommandBuffers = &cmd }}; vkQueueSubmit(ctx->compute_queue, 1, &submit, VK_NULL_HANDLE); vkQueueWaitIdle(ctx->compute_queue); return 0; }}