Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
459 lines · 21 KiB · C++
// ═══════════════════════════════════════════════════════════════════════════════
|
||
// INFERENCE-X — Hardware Kernel Dispatch (Central Routing)
|
||
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
|
||
// Licensed under the Business Source License 1.1 (BSL-1.1)
|
||
// See LICENSE file for full terms.
|
||
//
|
||
// INTELLECTUAL PROPERTY PROTECTION:
|
||
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
|
||
// - GitHub: github.com/ElmadaniS/inference-x
|
||
// - Author: Salka Elmadani | Morocco
|
||
//
|
||
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
|
||
// incorporates, embeds, distributes, or commercially uses Inference-X
|
||
// or any derivative work without explicit written authorization from
|
||
// the copyright holder is in violation of BSL-1.1 and applicable
|
||
// intellectual property laws. This includes but is not limited to:
|
||
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
|
||
//
|
||
// Contact: Elmadani.SALKA@proton.me for licensing.
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
|
||
#pragma once

// Inference-X Provenance — this engine was created by Salka Elmadani
// Unauthorized commercial use (revenue >= $1M) requires licensing
/// Returns the embedded provenance string (author / license / build tag).
/// [[maybe_unused]] (standard C++17) replaces the GCC-only
/// __attribute__((unused)) so the header stays portable across compilers.
[[maybe_unused]] static const char* ix_provenance() { return "Inference-X | Salka Elmadani | BSL-1.1 | 935"; }
|
||
|
||
|
||
#include "backends.h" // ix::Platform, ix::HWProfile, ix::detect_hardware()
|
||
#include "gemm.h" // ix::gemm::matmul (proven v6 — ran Kimi K2.5 1T)
|
||
#include "expert_mmap.h" // ix::ExpertMmapManager
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════════
// BACKEND DECLARATIONS (conditional)
//
// Each backend is a .c/.cpp file under backends/q4_kernels/<platform>/
// It compiles ONLY when the Makefile detects its SDK (sets IX_USE_*).
// Without the SDK → the #ifdef is dead, zero code emitted, zero link error.
//
// Contract: every backend implements Q4_K GEMM as
//     void gemm_q4_K_<platform>(A, B, C, M, N, K [, stream])
// This dispatch calls them with N=1 (GEMV: out[M] = W[M×K] × x[K]).
// ═══════════════════════════════════════════════════════════════════════════════

// ── Clean-signature backends (no stream) ─────────────────────────────────────

#ifdef IX_USE_CPU_AVX512
extern "C" void gemm_q4_K_fp32_cpu(
    const block_q4_K* __restrict__ A, const float* __restrict__ B,
    float* __restrict__ C, int M, int N, int K);
#endif

#ifdef IX_USE_HEXAGON
extern "C" void gemm_q4_K_hexagon(
    const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif

#ifdef IX_USE_CEREBRAS
extern "C" void gemm_q4_K_wse(
    const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif

#ifdef IX_USE_SAMBANOVA
extern "C" void gemm_q4_K_sambanova(
    const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif

#ifdef IX_USE_GRAPHCORE
extern "C" void gemm_q4_K_ipu(
    const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif

#ifdef IX_USE_FPGA_XILINX
extern "C" void gemm_q4_K_xilinx(
    const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif

// ── Stream-based backends (need runtime context) ─────────────────────────────

#ifdef IX_USE_GROQ
#include <groq/groq_runtime.h>
extern "C" void gemm_q4_K_groq(
    const void* A, const void* B, void* C, int M, int N, int K,
    groq_stream_t stream);
#endif

#ifdef IX_USE_GAUDI
#include <synapse_api.h>
extern "C" void gemm_q4_K_gaudi(
    const block_q4_K* A, const float* B, float* C, int M, int N, int K,
    synStreamHandle stream);
#endif

#ifdef IX_USE_INFERENTIA
extern "C" void gemm_q4_K_aws_inferentia(
    const void* A, const void* B, void* C, int M, int N, int K,
    void* stream);
#endif

#ifdef IX_USE_MAIA
#include <maia_runtime.h>
extern "C" void gemm_q4_K_maia(
    const block_q4_K* A, const float* B, float* C, int M, int N, int K,
    maia_stream_t stream);
#endif

// ── Snapdragon: hybrid NEON+Hexagon DSP path ────────────────────────────────

#ifdef IX_USE_SNAPDRAGON
extern "C" void gemm_q4_K_hexagon_fused(
    const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif
|
||
|
||
namespace ix {
|
||
|
||
// ─── Backend enum ────────────────────────────────────────────────────────────
// One entry per dispatch target. GENERIC is the proven fallback; every other
// value is only reachable when its IX_USE_* SDK flag was compiled in.
enum class KernelBackend {
    GENERIC,            // v6 gemm.h — proven on Kimi K2.5 1T (226GB, 17GB RAM)
    CPU_AVX512,         // backends/q4_kernels/cpu
    ARM_NEON,           // backends.h NEON intrinsics
    HEXAGON_HVX,        // backends/q4_kernels/hexagon
    SNAPDRAGON_HYBRID,  // backends/q4_kernels/snapdragon
    CEREBRAS_WSE,       // backends/q4_kernels/cerebras
    TPU_XLA,            // backends/q4_kernels/tpu (Python — needs bridge)
    GAUDI_HABANA,       // backends/q4_kernels/gaudi
    INFERENTIA_AWS,     // backends/q4_kernels/inferentia
    FPGA_XILINX,        // backends/q4_kernels/fpga_xilinx
    GRAPHCORE_IPU,      // backends/q4_kernels/graphcore
    SAMBANOVA_RDU,      // backends/q4_kernels/sambanova
    MAIA_AZURE,         // backends/q4_kernels/maia
    GROQ_LPU,           // backends/q4_kernels/groq
};
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
// KERNEL DISPATCH — Singleton
|
||
//
|
||
// init() once at startup.
|
||
// After that, every matmul() call auto-routes to the optimal kernel.
|
||
// If the selected backend SDK wasn't compiled in, falls through to generic.
|
||
// No crash, no undefined symbol, no runtime check. Compiler eliminates it.
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
class KernelDispatch {
|
||
public:
|
||
static KernelDispatch& instance() {
|
||
static KernelDispatch kd;
|
||
return kd;
|
||
}
|
||
|
||
// ─── STARTUP ─────────────────────────────────────────────────────────
|
||
void init() {
|
||
hw_ = detect_hardware();
|
||
select_backend();
|
||
init_streams();
|
||
print_hw_report(hw_);
|
||
printf("[IX-DISPATCH] Kernel backend: %s\n", backend_name());
|
||
fflush(stdout);
|
||
initialized_ = true;
|
||
}
|
||
|
||
// Init ExpertMmap for MoE weight streaming
|
||
void init_expert_mmap(int n_layers) {
|
||
emm_.init(n_layers);
|
||
use_expert_mmap_ = true;
|
||
printf("[IX-DISPATCH] ExpertMmap enabled: %d layers\n", n_layers);
|
||
fflush(stdout);
|
||
}
|
||
|
||
// Register expert tensors for a layer (call during model load)
|
||
void register_experts(int layer,
|
||
void* gate_data, size_t gate_expert_bytes, int n_experts,
|
||
void* up_data, size_t up_expert_bytes,
|
||
void* down_data, size_t down_expert_bytes) {
|
||
if (!use_expert_mmap_) return;
|
||
emm_.register_layer(layer,
|
||
gate_data, gate_expert_bytes, n_experts,
|
||
up_data, up_expert_bytes,
|
||
down_data, down_expert_bytes);
|
||
}
|
||
|
||
// ═════════════════════════════════════════════════════════════════════
|
||
// GEMM DISPATCH — the central weld
|
||
//
|
||
// Contract: out[M] = W[M×K quantized] × x[K]
|
||
//
|
||
// Specialized backends handle Q4_K only (the bottleneck format for
|
||
// large MoE models). All other dtypes go through the proven v6 path
|
||
// which already handles Q4_K, Q6_K, Q8_0, IQ2_XXS, IQ4_XS, F16.
|
||
//
|
||
// If a backend's SDK wasn't compiled → its #ifdef is dead →
|
||
// the case exists in the enum but the code inside is empty →
|
||
// falls through to default → generic. Zero penalty.
|
||
// ═════════════════════════════════════════════════════════════════════
|
||
inline void matmul(float* out, const void* W, dtype type,
|
||
const float* x, int M, int K) {
|
||
|
||
// Only Q4_K has specialized backends. Everything else → proven v6.
|
||
if (type != dtype::Q4_K) {
|
||
gemm::matmul(out, W, type, x, M, K);
|
||
return;
|
||
}
|
||
|
||
const auto* A __attribute__((unused)) = static_cast<const block_q4_K*>(W);
|
||
|
||
switch (backend_) {
|
||
|
||
// ── CPU: AVX-512 fused dequant+GEMM in zmm registers ────────
|
||
#ifdef IX_USE_CPU_AVX512
|
||
case KernelBackend::CPU_AVX512:
|
||
gemm_q4_K_fp32_cpu(A, x, out, M, 1, K);
|
||
return;
|
||
#endif
|
||
|
||
// ── ARM NEON: vectorized in backends.h ───────────────────────
|
||
// Uses v6 gemm.h with NEON intrinsics already compiled in.
|
||
// No separate backend file needed — it's in the generic path.
|
||
case KernelBackend::ARM_NEON:
|
||
break; // → generic (which IS NEON-optimized when compiled on ARM)
|
||
|
||
// ── Qualcomm Hexagon: HVX vector DSP ─────────────────────────
|
||
#ifdef IX_USE_HEXAGON
|
||
case KernelBackend::HEXAGON_HVX:
|
||
gemm_q4_K_hexagon(A, x, out, M, 1, K);
|
||
return;
|
||
#endif
|
||
|
||
// ── Snapdragon SoC: hybrid NEON + Hexagon DSP ────────────────
|
||
#ifdef IX_USE_SNAPDRAGON
|
||
case KernelBackend::SNAPDRAGON_HYBRID:
|
||
gemm_q4_K_hexagon_fused(A, x, out, M, 1, K);
|
||
return;
|
||
#endif
|
||
|
||
// ── Cerebras WSE: 850K cores, weight-stationary dataflow ─────
|
||
#ifdef IX_USE_CEREBRAS
|
||
case KernelBackend::CEREBRAS_WSE:
|
||
gemm_q4_K_wse(A, x, out, M, 1, K);
|
||
return;
|
||
#endif
|
||
|
||
// ── Google TPU: XLA backend (Python) ─────────────────────────
|
||
// TPU backend is q4_gemm_tpu.py (134 lines).
|
||
// Requires pybind11 or subprocess bridge to wire in.
|
||
// Falls through to generic until bridge is integrated.
|
||
// This is the ONE backend that needs external glue.
|
||
case KernelBackend::TPU_XLA:
|
||
break; // → generic (TODO: pybind11 bridge)
|
||
|
||
// ── Intel Gaudi: Habana TPC kernels ──────────────────────────
|
||
#ifdef IX_USE_GAUDI
|
||
case KernelBackend::GAUDI_HABANA:
|
||
gemm_q4_K_gaudi(A, x, out, M, 1, K, gaudi_stream_);
|
||
return;
|
||
#endif
|
||
|
||
// ── AWS Inferentia: NeuronCore pipeline ──────────────────────
|
||
#ifdef IX_USE_INFERENTIA
|
||
case KernelBackend::INFERENTIA_AWS:
|
||
gemm_q4_K_aws_inferentia(A, x, out, M, 1, K, inferentia_stream_);
|
||
return;
|
||
#endif
|
||
|
||
// ── Xilinx FPGA: Vitis HLS dataflow ─────────────────────────
|
||
#ifdef IX_USE_FPGA_XILINX
|
||
case KernelBackend::FPGA_XILINX:
|
||
gemm_q4_K_xilinx(A, x, out, M, 1, K);
|
||
return;
|
||
#endif
|
||
|
||
// ── Graphcore IPU: BSP tile compute ──────────────────────────
|
||
#ifdef IX_USE_GRAPHCORE
|
||
case KernelBackend::GRAPHCORE_IPU:
|
||
gemm_q4_K_ipu(A, x, out, M, 1, K);
|
||
return;
|
||
#endif
|
||
|
||
// ── SambaNova RDU: reconfigurable dataflow ───────────────────
|
||
#ifdef IX_USE_SAMBANOVA
|
||
case KernelBackend::SAMBANOVA_RDU:
|
||
gemm_q4_K_sambanova(A, x, out, M, 1, K);
|
||
return;
|
||
#endif
|
||
|
||
// ── Microsoft Maia: Azure custom ASIC ────────────────────────
|
||
#ifdef IX_USE_MAIA
|
||
case KernelBackend::MAIA_AZURE:
|
||
gemm_q4_K_maia(A, x, out, M, 1, K, maia_stream_);
|
||
return;
|
||
#endif
|
||
|
||
// ── Groq LPU: deterministic SRAM compute ────────────────────
|
||
#ifdef IX_USE_GROQ
|
||
case KernelBackend::GROQ_LPU:
|
||
gemm_q4_K_groq(A, x, out, M, 1, K, groq_stream_);
|
||
return;
|
||
#endif
|
||
|
||
default:
|
||
break;
|
||
}
|
||
|
||
// ── Fallthrough: proven v6 generic path ──────────────────────
|
||
// This ran Kimi K2.5 (1T params, 384 experts, 226GB) on 17GB RAM.
|
||
// It works. Everything above is optimization.
|
||
gemm::matmul(out, W, type, x, M, K);
|
||
}
|
||
|
||
// ─── MoE EXPERT PREFETCH ─────────────────────────────────────────────
|
||
void prefetch_experts(int layer, const int* expert_ids, int n_active) {
|
||
if (!use_expert_mmap_) return;
|
||
emm_.prefetch_active(layer, expert_ids, n_active);
|
||
}
|
||
|
||
void evict_layer(int layer) {
|
||
if (!use_expert_mmap_) return;
|
||
emm_.evict_layer(layer);
|
||
}
|
||
|
||
void print_stats() {
|
||
if (use_expert_mmap_) emm_.print_stats();
|
||
}
|
||
|
||
// ─── ACCESSORS ───────────────────────────────────────────────────────
|
||
const HWProfile& hardware() const { return hw_; }
|
||
KernelBackend backend_type() const { return backend_; }
|
||
bool initialized() const { return initialized_; }
|
||
|
||
const char* backend_name() const {
|
||
switch (backend_) {
|
||
case KernelBackend::GENERIC: return "GENERIC (v6 proven)";
|
||
case KernelBackend::CPU_AVX512: return "CPU_AVX512";
|
||
case KernelBackend::ARM_NEON: return "ARM_NEON";
|
||
case KernelBackend::HEXAGON_HVX: return "HEXAGON_HVX";
|
||
case KernelBackend::SNAPDRAGON_HYBRID: return "SNAPDRAGON_HYBRID";
|
||
case KernelBackend::CEREBRAS_WSE: return "CEREBRAS_WSE";
|
||
case KernelBackend::TPU_XLA: return "TPU_XLA (Python bridge)";
|
||
case KernelBackend::GAUDI_HABANA: return "GAUDI_HABANA";
|
||
case KernelBackend::INFERENTIA_AWS: return "INFERENTIA_AWS";
|
||
case KernelBackend::FPGA_XILINX: return "FPGA_XILINX";
|
||
case KernelBackend::GRAPHCORE_IPU: return "GRAPHCORE_IPU";
|
||
case KernelBackend::SAMBANOVA_RDU: return "SAMBANOVA_RDU";
|
||
case KernelBackend::MAIA_AZURE: return "MAIA_AZURE";
|
||
case KernelBackend::GROQ_LPU: return "GROQ_LPU";
|
||
default: return "UNKNOWN";
|
||
}
|
||
}
|
||
|
||
private:
|
||
KernelDispatch() = default;
|
||
|
||
// ─── BACKEND SELECTION ───────────────────────────────────────────────
|
||
// Maps detected Platform → optimal KernelBackend.
|
||
// detect_hardware() in backends.h already resolved the Platform,
|
||
// including IX_USE_* overrides for accelerators.
|
||
// ─────────────────────────────────────────────────────────────────────
|
||
void select_backend() {
|
||
switch (hw_.platform) {
|
||
// ── x86 ──────────────────────────────────────────────────
|
||
case Platform::X86_AVX512:
|
||
#ifdef IX_USE_CPU_AVX512
|
||
backend_ = KernelBackend::CPU_AVX512; break;
|
||
#else
|
||
backend_ = KernelBackend::GENERIC; break; // AVX512 detected but backend not compiled
|
||
#endif
|
||
case Platform::X86_AVX2:
|
||
case Platform::X86_SSE42:
|
||
case Platform::X86_GENERIC:
|
||
backend_ = KernelBackend::GENERIC; break;
|
||
|
||
// ── ARM ──────────────────────────────────────────────────
|
||
case Platform::ARM64_NEON:
|
||
case Platform::ARM64_SVE:
|
||
case Platform::ARM32_NEON:
|
||
case Platform::APPLE_SILICON:
|
||
backend_ = KernelBackend::ARM_NEON; break;
|
||
|
||
// ── Mobile SoC ───────────────────────────────────────────
|
||
case Platform::SNAPDRAGON:
|
||
backend_ = KernelBackend::SNAPDRAGON_HYBRID; break;
|
||
case Platform::MEDIATEK:
|
||
case Platform::EXYNOS:
|
||
backend_ = KernelBackend::ARM_NEON; break;
|
||
|
||
// ── Cloud accelerators ───────────────────────────────────
|
||
case Platform::TPU:
|
||
backend_ = KernelBackend::TPU_XLA; break;
|
||
case Platform::GAUDI:
|
||
backend_ = KernelBackend::GAUDI_HABANA; break;
|
||
case Platform::INFERENTIA:
|
||
backend_ = KernelBackend::INFERENTIA_AWS; break;
|
||
case Platform::CEREBRAS:
|
||
backend_ = KernelBackend::CEREBRAS_WSE; break;
|
||
case Platform::GROQ:
|
||
backend_ = KernelBackend::GROQ_LPU; break;
|
||
case Platform::GRAPHCORE:
|
||
backend_ = KernelBackend::GRAPHCORE_IPU; break;
|
||
case Platform::SAMBANOVA:
|
||
backend_ = KernelBackend::SAMBANOVA_RDU; break;
|
||
case Platform::MAIA:
|
||
backend_ = KernelBackend::MAIA_AZURE; break;
|
||
case Platform::FPGA_XILINX:
|
||
backend_ = KernelBackend::FPGA_XILINX; break;
|
||
case Platform::HEXAGON:
|
||
backend_ = KernelBackend::HEXAGON_HVX; break;
|
||
|
||
// ── Edge/Embedded → scalar generic ───────────────────────
|
||
case Platform::RISCV:
|
||
case Platform::XTENSA:
|
||
case Platform::CORTEX_M:
|
||
backend_ = KernelBackend::GENERIC; break;
|
||
|
||
default:
|
||
backend_ = KernelBackend::GENERIC; break;
|
||
}
|
||
}
|
||
|
||
// ─── STREAM INIT ─────────────────────────────────────────────────────
|
||
// Backends that need a stream/context get it here. Called once.
|
||
// Without the SDK → empty function. Compiler eliminates it.
|
||
// ─────────────────────────────────────────────────────────────────────
|
||
void init_streams() {
|
||
#ifdef IX_USE_GROQ
|
||
if (backend_ == KernelBackend::GROQ_LPU)
|
||
groq_create_stream(&groq_stream_);
|
||
#endif
|
||
#ifdef IX_USE_GAUDI
|
||
if (backend_ == KernelBackend::GAUDI_HABANA)
|
||
synStreamCreate(&gaudi_stream_, 0);
|
||
#endif
|
||
#ifdef IX_USE_MAIA
|
||
if (backend_ == KernelBackend::MAIA_AZURE)
|
||
maia_create_stream(&maia_stream_);
|
||
#endif
|
||
// Inferentia uses default stream (nullptr). No init needed.
|
||
}
|
||
|
||
// ─── STATE ───────────────────────────────────────────────────────────
|
||
HWProfile hw_{};
|
||
KernelBackend backend_ = KernelBackend::GENERIC;
|
||
ExpertMmapManager emm_;
|
||
bool use_expert_mmap_ = false;
|
||
bool initialized_ = false;
|
||
|
||
// Stream handles — only exist when SDK compiled in
|
||
#ifdef IX_USE_GROQ
|
||
groq_stream_t groq_stream_{};
|
||
#endif
|
||
#ifdef IX_USE_GAUDI
|
||
synStreamHandle gaudi_stream_{};
|
||
#endif
|
||
#ifdef IX_USE_INFERENTIA
|
||
void* inferentia_stream_ = nullptr;
|
||
#endif
|
||
#ifdef IX_USE_MAIA
|
||
maia_stream_t maia_stream_{};
|
||
#endif
|
||
};
|
||
|
||
} // namespace ix
|