// inference-x/runtime/kernel_dispatch.h

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Hardware Kernel Dispatch (Central Routing)
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X Provenance — this engine was created by Salka Elmadani
// Unauthorized commercial use (revenue >= $1M) requires licensing
//
// `inline` (not `static`): this header is included from many translation
// units; a `static` definition would emit a private copy of the function in
// every TU (and needed __attribute__((unused)) to silence the per-TU
// unused-function warning). An inline function has one merged definition
// under the ODR and produces no such warning.
inline const char* ix_provenance() { return "Inference-X | Salka Elmadani | BSL-1.1 | 935"; }
#include "backends.h" // ix::Platform, ix::HWProfile, ix::detect_hardware()
#include "gemm.h" // ix::gemm::matmul (proven v6 — ran Kimi K2.5 1T)
#include "expert_mmap.h" // ix::ExpertMmapManager
// ═══════════════════════════════════════════════════════════════════════════════
// BACKEND DECLARATIONS (conditional)
//
// Each backend is a .c/.cpp file under backends/q4_kernels/<platform>/
// It compiles ONLY when the Makefile detects its SDK (sets IX_USE_*).
// Without the SDK → the #ifdef is dead, zero code emitted, zero link error.
//
// Contract: every backend implements Q4_K GEMM as
// void gemm_q4_K_<platform>(A, B, C, M, N, K [, stream])
// This dispatch calls them with N=1 (GEMV: out[M] = W[M×K] × x[K]).
// ═══════════════════════════════════════════════════════════════════════════════
// ── Clean-signature backends (no stream) ─────────────────────────────────────
#ifdef IX_USE_CPU_AVX512
extern "C" void gemm_q4_K_fp32_cpu(
const block_q4_K* __restrict__ A, const float* __restrict__ B,
float* __restrict__ C, int M, int N, int K);
#endif
#ifdef IX_USE_HEXAGON
extern "C" void gemm_q4_K_hexagon(
const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif
#ifdef IX_USE_CEREBRAS
extern "C" void gemm_q4_K_wse(
const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif
#ifdef IX_USE_SAMBANOVA
extern "C" void gemm_q4_K_sambanova(
const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif
#ifdef IX_USE_GRAPHCORE
extern "C" void gemm_q4_K_ipu(
const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif
#ifdef IX_USE_FPGA_XILINX
extern "C" void gemm_q4_K_xilinx(
const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif
// ── Stream-based backends (need runtime context) ─────────────────────────────
#ifdef IX_USE_GROQ
#include <groq/groq_runtime.h>
extern "C" void gemm_q4_K_groq(
const void* A, const void* B, void* C, int M, int N, int K,
groq_stream_t stream);
#endif
#ifdef IX_USE_GAUDI
#include <synapse_api.h>
extern "C" void gemm_q4_K_gaudi(
const block_q4_K* A, const float* B, float* C, int M, int N, int K,
synStreamHandle stream);
#endif
#ifdef IX_USE_INFERENTIA
extern "C" void gemm_q4_K_aws_inferentia(
const void* A, const void* B, void* C, int M, int N, int K,
void* stream);
#endif
#ifdef IX_USE_MAIA
#include <maia_runtime.h>
extern "C" void gemm_q4_K_maia(
const block_q4_K* A, const float* B, float* C, int M, int N, int K,
maia_stream_t stream);
#endif
// ── Snapdragon: hybrid NEON+Hexagon DSP path ────────────────────────────────
#ifdef IX_USE_SNAPDRAGON
extern "C" void gemm_q4_K_hexagon_fused(
const block_q4_K* A, const float* B, float* C, int M, int N, int K);
#endif
namespace ix {
// ─── Backend enum ────────────────────────────────────────────────────────────
// One enumerator per hardware target. Every enumerator exists unconditionally
// — select_backend() names all of them outside any #ifdef — while only the
// corresponding dispatch cases in matmul() are conditionally compiled.
// GENERIC is the universal fallback used whenever a target's SDK was not
// compiled in.
enum class KernelBackend {
GENERIC, // v6 gemm.h — proven on Kimi K2.5 1T (226GB, 17GB RAM)
CPU_AVX512, // backends/q4_kernels/cpu
ARM_NEON, // backends.h NEON intrinsics
HEXAGON_HVX, // backends/q4_kernels/hexagon
SNAPDRAGON_HYBRID, // backends/q4_kernels/snapdragon
CEREBRAS_WSE, // backends/q4_kernels/cerebras
TPU_XLA, // backends/q4_kernels/tpu (Python — needs bridge)
GAUDI_HABANA, // backends/q4_kernels/gaudi
INFERENTIA_AWS, // backends/q4_kernels/inferentia
FPGA_XILINX, // backends/q4_kernels/fpga_xilinx
GRAPHCORE_IPU, // backends/q4_kernels/graphcore
SAMBANOVA_RDU, // backends/q4_kernels/sambanova
MAIA_AZURE, // backends/q4_kernels/maia
GROQ_LPU, // backends/q4_kernels/groq
};
// ═══════════════════════════════════════════════════════════════════════════════
// KERNEL DISPATCH — Singleton
//
// init() once at startup.
// After that, every matmul() call auto-routes to the optimal kernel.
// If the selected backend SDK wasn't compiled in, falls through to generic.
// No crash, no undefined symbol, no runtime check. Compiler eliminates it.
// ═══════════════════════════════════════════════════════════════════════════════
class KernelDispatch {
public:
    // Meyers singleton: function-local static gives lazy, thread-safe
    // construction (guaranteed since C++11).
    static KernelDispatch& instance() {
        static KernelDispatch kd;
        return kd;
    }

    // FIX: the copy operations were implicitly defaulted, so the singleton
    // could be silently copied — duplicating SDK stream handles and the
    // ExpertMmapManager bookkeeping. Delete them to enforce single ownership.
    KernelDispatch(const KernelDispatch&) = delete;
    KernelDispatch& operator=(const KernelDispatch&) = delete;

    // ─── STARTUP ─────────────────────────────────────────────────────────
    // Detect hardware, select the kernel backend, create any SDK streams,
    // and print a one-shot report.
    // FIX: now idempotent — a second call returns immediately. Previously a
    // repeated init() re-ran init_streams(), creating a second stream handle
    // (groq/syn/maia) and leaking the first.
    void init() {
        if (initialized_) return;
        hw_ = detect_hardware();
        select_backend();
        init_streams();
        print_hw_report(hw_);
        printf("[IX-DISPATCH] Kernel backend: %s\n", backend_name());
        fflush(stdout);
        initialized_ = true;
    }

    // Enable ExpertMmap for MoE weight streaming; n_layers = model depth.
    void init_expert_mmap(int n_layers) {
        emm_.init(n_layers);
        use_expert_mmap_ = true;
        printf("[IX-DISPATCH] ExpertMmap enabled: %d layers\n", n_layers);
        fflush(stdout);
    }

    // Register the gate/up/down expert tensors for one layer (call during
    // model load). Silently a no-op until init_expert_mmap() has been called.
    void register_experts(int layer,
                          void* gate_data, size_t gate_expert_bytes, int n_experts,
                          void* up_data, size_t up_expert_bytes,
                          void* down_data, size_t down_expert_bytes) {
        if (!use_expert_mmap_) return;
        emm_.register_layer(layer,
                            gate_data, gate_expert_bytes, n_experts,
                            up_data, up_expert_bytes,
                            down_data, down_expert_bytes);
    }

    // ═════════════════════════════════════════════════════════════════════
    // GEMM DISPATCH — the central weld
    //
    // Contract: out[M] = W[M×K quantized] × x[K]  (GEMV — backends are
    // invoked with N = 1).
    //
    // Only Q4_K has specialized backends (the bottleneck format for large
    // MoE models). All other dtypes route to the proven v6 path, which
    // already handles Q4_K, Q6_K, Q8_0, IQ2_XXS, IQ4_XS, F16.
    //
    // If a backend's SDK wasn't compiled in, its case is preprocessed away
    // and the switch falls through to default → generic. Zero penalty.
    // ═════════════════════════════════════════════════════════════════════
    inline void matmul(float* out, const void* W, dtype type,
                       const float* x, int M, int K) {
        // Only Q4_K has specialized backends. Everything else → proven v6.
        if (type != dtype::Q4_K) {
            gemm::matmul(out, W, type, x, M, K);
            return;
        }
        // 'unused' attribute: when NO specialized backend is compiled in,
        // every case below vanishes and A would otherwise warn.
        const auto* A __attribute__((unused)) = static_cast<const block_q4_K*>(W);
        switch (backend_) {
        // ── CPU: AVX-512 fused dequant+GEMM in zmm registers ────────
#ifdef IX_USE_CPU_AVX512
        case KernelBackend::CPU_AVX512:
            gemm_q4_K_fp32_cpu(A, x, out, M, 1, K);
            return;
#endif
        // ── ARM NEON: vectorized in backends.h ───────────────────────
        // Uses v6 gemm.h with NEON intrinsics already compiled in.
        // No separate backend file needed — it's in the generic path.
        case KernelBackend::ARM_NEON:
            break; // → generic (which IS NEON-optimized when compiled on ARM)
        // ── Qualcomm Hexagon: HVX vector DSP ─────────────────────────
#ifdef IX_USE_HEXAGON
        case KernelBackend::HEXAGON_HVX:
            gemm_q4_K_hexagon(A, x, out, M, 1, K);
            return;
#endif
        // ── Snapdragon SoC: hybrid NEON + Hexagon DSP ────────────────
#ifdef IX_USE_SNAPDRAGON
        case KernelBackend::SNAPDRAGON_HYBRID:
            gemm_q4_K_hexagon_fused(A, x, out, M, 1, K);
            return;
#endif
        // ── Cerebras WSE: 850K cores, weight-stationary dataflow ─────
#ifdef IX_USE_CEREBRAS
        case KernelBackend::CEREBRAS_WSE:
            gemm_q4_K_wse(A, x, out, M, 1, K);
            return;
#endif
        // ── Google TPU: XLA backend (Python) ─────────────────────────
        // TPU backend is q4_gemm_tpu.py. Requires pybind11 or a subprocess
        // bridge to wire in; falls through to generic until integrated.
        case KernelBackend::TPU_XLA:
            break; // → generic (TODO: pybind11 bridge)
        // ── Intel Gaudi: Habana TPC kernels ──────────────────────────
#ifdef IX_USE_GAUDI
        case KernelBackend::GAUDI_HABANA:
            gemm_q4_K_gaudi(A, x, out, M, 1, K, gaudi_stream_);
            return;
#endif
        // ── AWS Inferentia: NeuronCore pipeline ──────────────────────
#ifdef IX_USE_INFERENTIA
        case KernelBackend::INFERENTIA_AWS:
            gemm_q4_K_aws_inferentia(A, x, out, M, 1, K, inferentia_stream_);
            return;
#endif
        // ── Xilinx FPGA: Vitis HLS dataflow ─────────────────────────
#ifdef IX_USE_FPGA_XILINX
        case KernelBackend::FPGA_XILINX:
            gemm_q4_K_xilinx(A, x, out, M, 1, K);
            return;
#endif
        // ── Graphcore IPU: BSP tile compute ──────────────────────────
#ifdef IX_USE_GRAPHCORE
        case KernelBackend::GRAPHCORE_IPU:
            gemm_q4_K_ipu(A, x, out, M, 1, K);
            return;
#endif
        // ── SambaNova RDU: reconfigurable dataflow ───────────────────
#ifdef IX_USE_SAMBANOVA
        case KernelBackend::SAMBANOVA_RDU:
            gemm_q4_K_sambanova(A, x, out, M, 1, K);
            return;
#endif
        // ── Microsoft Maia: Azure custom ASIC ────────────────────────
#ifdef IX_USE_MAIA
        case KernelBackend::MAIA_AZURE:
            gemm_q4_K_maia(A, x, out, M, 1, K, maia_stream_);
            return;
#endif
        // ── Groq LPU: deterministic SRAM compute ────────────────────
#ifdef IX_USE_GROQ
        case KernelBackend::GROQ_LPU:
            gemm_q4_K_groq(A, x, out, M, 1, K, groq_stream_);
            return;
#endif
        default:
            break;
        }
        // ── Fallthrough: proven v6 generic path ──────────────────────
        // This ran Kimi K2.5 (1T params, 384 experts, 226GB) on 17GB RAM.
        // It works. Everything above is optimization.
        gemm::matmul(out, W, type, x, M, K);
    }

    // ─── MoE EXPERT PREFETCH ─────────────────────────────────────────────
    // Forward the active-expert set to the mmap manager so the weights are
    // resident before the FFN runs. No-op when ExpertMmap is disabled.
    void prefetch_experts(int layer, const int* expert_ids, int n_active) {
        if (!use_expert_mmap_) return;
        emm_.prefetch_active(layer, expert_ids, n_active);
    }

    // Drop a layer's expert pages once the layer has finished.
    void evict_layer(int layer) {
        if (!use_expert_mmap_) return;
        emm_.evict_layer(layer);
    }

    // Print ExpertMmap hit/miss statistics (only when enabled).
    void print_stats() {
        if (use_expert_mmap_) emm_.print_stats();
    }

    // ─── ACCESSORS ───────────────────────────────────────────────────────
    const HWProfile& hardware() const { return hw_; }
    KernelBackend backend_type() const { return backend_; }
    bool initialized() const { return initialized_; }

    // Human-readable backend name for logs.
    const char* backend_name() const {
        switch (backend_) {
        case KernelBackend::GENERIC: return "GENERIC (v6 proven)";
        case KernelBackend::CPU_AVX512: return "CPU_AVX512";
        case KernelBackend::ARM_NEON: return "ARM_NEON";
        case KernelBackend::HEXAGON_HVX: return "HEXAGON_HVX";
        case KernelBackend::SNAPDRAGON_HYBRID: return "SNAPDRAGON_HYBRID";
        case KernelBackend::CEREBRAS_WSE: return "CEREBRAS_WSE";
        case KernelBackend::TPU_XLA: return "TPU_XLA (Python bridge)";
        case KernelBackend::GAUDI_HABANA: return "GAUDI_HABANA";
        case KernelBackend::INFERENTIA_AWS: return "INFERENTIA_AWS";
        case KernelBackend::FPGA_XILINX: return "FPGA_XILINX";
        case KernelBackend::GRAPHCORE_IPU: return "GRAPHCORE_IPU";
        case KernelBackend::SAMBANOVA_RDU: return "SAMBANOVA_RDU";
        case KernelBackend::MAIA_AZURE: return "MAIA_AZURE";
        case KernelBackend::GROQ_LPU: return "GROQ_LPU";
        default: return "UNKNOWN";
        }
    }

private:
    KernelDispatch() = default;

    // ─── BACKEND SELECTION ───────────────────────────────────────────────
    // Maps detected Platform → optimal KernelBackend.
    // detect_hardware() in backends.h already resolved the Platform,
    // including IX_USE_* overrides for accelerators. Note: cloud
    // accelerators are selected here unconditionally; if their SDK wasn't
    // compiled, matmul() simply has no case for them and routes to generic.
    // ─────────────────────────────────────────────────────────────────────
    void select_backend() {
        switch (hw_.platform) {
        // ── x86 ──────────────────────────────────────────────────
        case Platform::X86_AVX512:
#ifdef IX_USE_CPU_AVX512
            backend_ = KernelBackend::CPU_AVX512; break;
#else
            backend_ = KernelBackend::GENERIC; break; // AVX512 detected but backend not compiled
#endif
        case Platform::X86_AVX2:
        case Platform::X86_SSE42:
        case Platform::X86_GENERIC:
            backend_ = KernelBackend::GENERIC; break;
        // ── ARM ──────────────────────────────────────────────────
        case Platform::ARM64_NEON:
        case Platform::ARM64_SVE:
        case Platform::ARM32_NEON:
        case Platform::APPLE_SILICON:
            backend_ = KernelBackend::ARM_NEON; break;
        // ── Mobile SoC ───────────────────────────────────────────
        case Platform::SNAPDRAGON:
            backend_ = KernelBackend::SNAPDRAGON_HYBRID; break;
        case Platform::MEDIATEK:
        case Platform::EXYNOS:
            backend_ = KernelBackend::ARM_NEON; break;
        // ── Cloud accelerators ───────────────────────────────────
        case Platform::TPU:
            backend_ = KernelBackend::TPU_XLA; break;
        case Platform::GAUDI:
            backend_ = KernelBackend::GAUDI_HABANA; break;
        case Platform::INFERENTIA:
            backend_ = KernelBackend::INFERENTIA_AWS; break;
        case Platform::CEREBRAS:
            backend_ = KernelBackend::CEREBRAS_WSE; break;
        case Platform::GROQ:
            backend_ = KernelBackend::GROQ_LPU; break;
        case Platform::GRAPHCORE:
            backend_ = KernelBackend::GRAPHCORE_IPU; break;
        case Platform::SAMBANOVA:
            backend_ = KernelBackend::SAMBANOVA_RDU; break;
        case Platform::MAIA:
            backend_ = KernelBackend::MAIA_AZURE; break;
        case Platform::FPGA_XILINX:
            backend_ = KernelBackend::FPGA_XILINX; break;
        case Platform::HEXAGON:
            backend_ = KernelBackend::HEXAGON_HVX; break;
        // ── Edge/Embedded → scalar generic ───────────────────────
        case Platform::RISCV:
        case Platform::XTENSA:
        case Platform::CORTEX_M:
            backend_ = KernelBackend::GENERIC; break;
        default:
            backend_ = KernelBackend::GENERIC; break;
        }
    }

    // ─── STREAM INIT ─────────────────────────────────────────────────────
    // Backends that need a stream/context get it here. Called once from
    // init() (guarded by initialized_). Without the SDK → empty function.
    // ─────────────────────────────────────────────────────────────────────
    void init_streams() {
#ifdef IX_USE_GROQ
        if (backend_ == KernelBackend::GROQ_LPU)
            groq_create_stream(&groq_stream_);
#endif
#ifdef IX_USE_GAUDI
        if (backend_ == KernelBackend::GAUDI_HABANA)
            synStreamCreate(&gaudi_stream_, 0);
#endif
#ifdef IX_USE_MAIA
        if (backend_ == KernelBackend::MAIA_AZURE)
            maia_create_stream(&maia_stream_);
#endif
        // Inferentia uses default stream (nullptr). No init needed.
    }

    // ─── STATE ───────────────────────────────────────────────────────────
    HWProfile hw_{};
    KernelBackend backend_ = KernelBackend::GENERIC;
    ExpertMmapManager emm_;
    bool use_expert_mmap_ = false;
    bool initialized_ = false;
    // Stream handles — only exist when SDK compiled in
#ifdef IX_USE_GROQ
    groq_stream_t groq_stream_{};
#endif
#ifdef IX_USE_GAUDI
    synStreamHandle gaudi_stream_{};
#endif
#ifdef IX_USE_INFERENTIA
    void* inferentia_stream_ = nullptr;
#endif
#ifdef IX_USE_MAIA
    maia_stream_t maia_stream_{};
#endif
};
} // namespace ix