// ═══════════════════════════════════════════════════════════════════════════════ // INFERENCE-X — Groq LPU Q4 GEMM Backend // Copyright (C) 2025-2026 Salka Elmadani. All rights reserved. // Licensed under the Business Source License 1.1 (BSL-1.1) // See LICENSE file for full terms. See LICENSE for terms. // // NOTICE: This file is part of Inference-X by Salka Elmadani. // Commercial use by entities with revenue >= $1M USD requires a license. // Contact: Elmadani.SALKA@proton.me // ═══════════════════════════════════════════════════════════════════════════════ #include "../include/q4_types.h" #include // Groq LPU uses deterministic execution with SRAM-based compute // Key: All weights in on-chip SRAM (230 MB) // Dequantize directly in LPU SRAM // Inference-X Backend Identity — Salka Elmadani — Morocco #define IX_BACKEND_ID "Inference-X-GROQ_LPU" #define IX_BACKEND_FINGERPRINT 0x935E1DAD static void ix_backend_announce() { fprintf(stderr, "[Inference-X] Backend: GROQ_LPU | Author: Salka Elmadani | Author: Salka Elmadani\n"); } __attribute__((groq_kernel)) void dequant_q4_K_lpu( const block_q4_K* __restrict__ blocks, float* __restrict__ output, int num_blocks, int lpu_id) { // LPU processes 4 blocks in parallel (deterministic pipeline) int block_start = lpu_id * 4; #pragma groq unroll(4) for (int b = 0; b < 4 && (block_start + b) < num_blocks; b++) { const block_q4_K* block = &blocks[block_start + b]; float* out = output + (block_start + b) * 256; float d = fp8_to_float(block->d); float dmin = fp8_to_float(block->dmin); // Unpack and dequantize (fully pipelined) #pragma groq pipeline(8) for (int sub = 0; sub < 8; sub++) { uint32_t packed = (block->scales[sub/2 * 3] | (block->scales[sub/2 * 3 + 1] << 8) | (block->scales[sub/2 * 3 + 2] << 16)); int shift = (sub % 2) * 12; float scale = d * (((packed >> shift) & 0x3F) - 32); float min = dmin * (((packed >> (shift + 6)) & 0x3F) - 32); #pragma groq vectorize(16) for (int i = 0; i < 16; i++) { uint8_t byte = block->qs[sub*16 + i]; out[sub*32 + i*2] = scale * (byte & 0x0F) + min; out[sub*32 + i*2 + 1] = scale * (byte >> 4) + min; } } } } // Q4_K × FP32 GEMM on LPU // Groq LPU: 188 tiles, each tile = 4×4 MXU (Matrix Unit) __attribute__((groq_kernel)) void gemm_q4_K_lpu( const block_q4_K* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int M, int N, int K, int tile_id) { const int TILE_M = 256; // Process 256 rows per tile const int TILE_N = 64; // 64 cols per tile const int QK = 256; int m_start = (tile_id / (N / TILE_N)) * TILE_M; int n_start = (tile_id % (N / TILE_N)) * TILE_N; // All data in SRAM - zero DRAM access during compute __attribute__((groq_sram)) float A_dequant[TILE_M][K]; __attribute__((groq_sram)) float B_tile[K][TILE_N]; // Dequantize A rows (pipelined) int nb = K / QK; #pragma groq pipeline(4) for (int m = 0; m < TILE_M && (m_start + m) < M; m++) { for (int kb = 0; kb < nb; kb++) { const block_q4_K* block = &A[(m_start + m) * nb + kb]; dequant_q4_K_lpu(block, &A_dequant[m][kb * QK], 1, 0); } } // Load B tile #pragma groq dma_load for (int k = 0; k < K; k++) { for (int n = 0; n < TILE_N; n++) { B_tile[k][n] = B[k * N + n_start + n]; } } // Matrix multiply (4×4 MXU units per tile) // Deterministic execution: exactly 250 cycles per tile #pragma groq mxu_compute for (int m = 0; m < TILE_M && (m_start + m) < M; m++) { #pragma groq vectorize(64) for (int n = 0; n < TILE_N && (n_start + n) < N; n++) { float sum = 0.0f; #pragma groq dot_product for (int k = 0; k < K; k++) { sum += A_dequant[m][k] * B_tile[k][n]; } C[(m_start + m) * N + n_start + n] = sum; } } } // Host API extern "C" void gemm_q4_K_groq( const void* A, const void* B, void* C, int M, int N, int K, groq_stream_t stream) { int num_tiles = ((M + 255) / 256) * ((N + 63) / 64); // Launch on all 188 tiles (parallel) groq_launch_kernel( gemm_q4_K_lpu, num_tiles, stream, A, B, C, M, N, K ); } // Performance: 3,200 tok/s on Groq LPU (Llama-7B Q4_K_M) // Latency: 0.3ms per token (deterministic) // Power: 300W