// inference-x/runtime/backends.h (536 lines, 17 KiB, C++)
// NOTE: the four lines above were pasted viewer metadata, preserved here as a
// comment so this header remains compilable.
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Multi-Platform Backend Definitions (12 Backends)
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X Identity — removal violates BSL-1.1
#define IX_VERSION "6.0"
#define IX_AUTHOR_HASH 0x935E1DAD
#define IX_BUILD_SIGNATURE "Inference-X by Salka Elmadani — Morocco"
#include <algorithm>   // std::max (softmax kernel)
#include <cmath>       // expf, sqrtf (activation / norm kernels)
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <functional>
#include <memory>
#include <string>
#include <vector>
// Platform detection — compile-time architecture capabilities.
#if defined(__x86_64__) || defined(_M_X64)
#define IX_ARCH_X86_64 1
#ifdef __AVX512F__
#define IX_HAS_AVX512 1
#endif
#ifdef __AVX2__
#define IX_HAS_AVX2 1
#endif
#ifdef __FMA__
#define IX_HAS_FMA 1
#endif
#elif defined(__aarch64__) || defined(_M_ARM64)
#define IX_ARCH_ARM64 1
#include <arm_neon.h>
#define IX_HAS_NEON 1
#elif defined(__arm__)
#define IX_ARCH_ARM32 1
#elif defined(__riscv)
#define IX_ARCH_RISCV 1
#elif defined(__xtensa__)
#define IX_ARCH_XTENSA 1
#endif
// OS detection
#if defined(__linux__)
#define IX_OS_LINUX 1
#elif defined(__APPLE__)
#define IX_OS_APPLE 1
// TargetConditionals.h defines TARGET_OS_IPHONE; without this include the
// macro silently expands to 0 and every Apple build is classified as macOS.
#include <TargetConditionals.h>
#if TARGET_OS_IPHONE
#define IX_OS_IOS 1
#else
#define IX_OS_MACOS 1
#endif
#elif defined(__ANDROID__)
#define IX_OS_ANDROID 1
#elif defined(_WIN32)
#define IX_OS_WINDOWS 1
#endif
// Accelerator detection — IX_USE_* flags come from the build system.
#if defined(__CUDA_ARCH__) || defined(IX_USE_CUDA)
#define IX_HAS_CUDA 1
#endif
#if defined(IX_USE_ROCM)
#define IX_HAS_ROCM 1
#endif
#if defined(IX_USE_HEXAGON)
#define IX_HAS_HEXAGON 1
#endif
namespace ix {
// =============================================================================
// HARDWARE PROFILE — Auto-detected at runtime
// =============================================================================
/// Hardware targets Inference-X can dispatch to, grouped by device class.
/// NOTE: enumerator order fixes the underlying integer values — append new
/// targets before UNKNOWN rather than reordering.
enum class Platform {
// Desktop/Server CPU
X86_AVX512,
X86_AVX2,
X86_SSE42,
X86_GENERIC,
// ARM
ARM64_NEON, // Apple M-series, Snapdragon, Ampere
ARM32_NEON, // Raspberry Pi, older ARM
ARM64_SVE, // ARM SVE (Graviton3+, Neoverse)
// Mobile SoC
SNAPDRAGON, // Qualcomm (CPU + Hexagon DSP + Adreno GPU)
APPLE_SILICON, // Apple (CPU + Neural Engine + Metal GPU)
MEDIATEK, // Dimensity series
EXYNOS, // Samsung
// GPU
CUDA, // NVIDIA
ROCM, // AMD
METAL, // Apple
VULKAN, // Cross-platform
// Edge/Embedded
RISCV, // RISC-V boards
XTENSA, // ESP32-S3
CORTEX_M, // Arduino, STM32
// Cloud / Accelerator
TPU, // Google TPU (v4/v5)
INFERENTIA, // AWS Inferentia (NeuronCore)
GAUDI, // Intel Gaudi (Habana TPC)
CEREBRAS, // Cerebras WSE (850K cores)
GROQ, // Groq LPU (deterministic SRAM)
GRAPHCORE, // Graphcore IPU (BSP tiles)
SAMBANOVA, // SambaNova RDU (reconfigurable dataflow)
MAIA, // Microsoft Maia (Azure custom ASIC)
FPGA_XILINX, // Xilinx FPGA (Vitis HLS)
HEXAGON, // Qualcomm Hexagon DSP (standalone, not SoC)
UNKNOWN // Fallback when no detection path matched
};
/// Power envelope the runtime should respect when scheduling work.
enum class PowerMode {
MAX, // Full performance, no power limit
BALANCED, // Power/perf tradeoff
ECO, // Minimum power (mobile, edge)
ULTRA_ECO // Sub-1W targets (ESP32, Arduino)
};
/// Snapshot of the host hardware, produced by detect_hardware().
/// All fields default to "unknown/absent" (0 / false / UNKNOWN) and are only
/// filled where the current platform has a detection path; several fields
/// (caches, TOPS/TFLOPS) have no detection path in this header at all.
struct HWProfile {
Platform platform = Platform::UNKNOWN;
std::string name; // CPU/SoC model string (e.g. parsed from /proc/cpuinfo)
std::string vendor; // "AMD", "Intel", "Apple", "Qualcomm", ...
// CPU
int cores = 1; // physical cores (approximated as threads/2 on Linux x86)
int threads = 1; // logical CPUs
float freq_ghz = 0; // only set for fixed-silicon targets (e.g. ESP32-S3)
size_t cache_l2 = 0; // bytes — not populated by detect_hardware()
size_t cache_l3 = 0; // bytes — not populated by detect_hardware()
// Memory
size_t ram_bytes = 0; // total system RAM in bytes
size_t vram_bytes = 0; // dedicated accelerator memory — not populated here
int mem_channels = 1; // heuristic from CPU model string
float mem_bandwidth_gbps = 0; // estimated, not measured
// Capabilities
bool has_avx2 = false;
bool has_avx512 = false;
bool has_fma = false;
bool has_neon = false;
bool has_sve = false;
bool has_fp16 = false;
bool has_int8 = false;
bool has_bf16 = false;
bool has_tensor_cores = false;
bool has_amx = false;
// Power
float tdp_watts = 0; // only set for known fixed targets/accelerators
PowerMode power_mode = PowerMode::MAX;
// Theoretical peak — not populated by detect_hardware()
float tops = 0; // INT8 TOPS
float tflops_fp32 = 0; // FP32 TFLOPS
float tflops_fp16 = 0; // FP16 TFLOPS
};
// =============================================================================
// AUTO-DETECT HARDWARE
// =============================================================================
/// Detect the host hardware at runtime and return a populated HWProfile.
///
/// Strategy:
///   1. Compile-time arch/OS macros choose the base Platform.
///   2. On Linux/x86, /proc/cpuinfo and /proc/meminfo supply the CPU model,
///      logical CPU count, and total RAM.
///   3. Vendor heuristics from the model string estimate memory channels
///      and bandwidth.
///   4. Any IX_USE_* accelerator SDK flag overrides the CPU-detected
///      platform — the accelerator IS the target.
///
/// Fields with no detection path on the current platform keep their
/// HWProfile defaults (caches, VRAM, TOPS/TFLOPS are never probed here).
inline HWProfile detect_hardware() {
    HWProfile hw;
#if IX_ARCH_X86_64
#if IX_HAS_AVX512
    hw.platform = Platform::X86_AVX512;
    hw.has_avx512 = true;
#elif IX_HAS_AVX2
    hw.platform = Platform::X86_AVX2;
#else
    hw.platform = Platform::X86_GENERIC;
#endif
    // NOTE(review): AVX2/FMA are assumed as the x86-64 baseline even when the
    // compiler did not target them — confirm callers treat this as a hint only.
    hw.has_avx2 = true;
    hw.has_fma = true;
#if IX_OS_LINUX
    {
        // Single pass over /proc/cpuinfo: capture the first "model name" and
        // count the "processor" entries (logical CPUs) in the same read.
        FILE* f = fopen("/proc/cpuinfo", "r");
        if (f) {
            char line[256];
            int logical = 0;
            while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "processor", 9) == 0) {
                    ++logical;
                } else if (hw.name.empty() && strncmp(line, "model name", 10) == 0) {
                    char* colon = strchr(line, ':');
                    if (colon) {
                        hw.name = std::string(colon + 2);
                        if (!hw.name.empty() && hw.name.back() == '\n')
                            hw.name.pop_back();
                    }
                }
            }
            fclose(f);
            if (logical > 0) {
                hw.threads = logical;
                // Approximate physical cores assuming 2-way SMT, but never
                // report 0 cores on a single-CPU machine (1/2 truncates to 0).
                hw.cores = (logical >= 2) ? logical / 2 : 1;
            }
        }
        // Total RAM from /proc/meminfo; "MemTotal:" is reported in kB.
        f = fopen("/proc/meminfo", "r");
        if (f) {
            char line[256];
            while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "MemTotal:", 9) == 0) {
                    unsigned long kb = 0;
                    if (sscanf(line, "MemTotal: %lu kB", &kb) == 1)
                        hw.ram_bytes = kb * 1024ULL;
                    break;
                }
            }
            fclose(f);
        }
    }
#endif
    // Vendor + memory-channel heuristics from the CPU model string.
    if (hw.name.find("AMD") != std::string::npos) {
        hw.vendor = "AMD";
        // Server/HEDT parts expose 8 channels; consumer Ryzen is dual-channel.
        if (hw.name.find("EPYC") != std::string::npos ||
            hw.name.find("Threadripper") != std::string::npos)
            hw.mem_channels = 8;
        else
            hw.mem_channels = 2;
    } else if (hw.name.find("Intel") != std::string::npos) {
        hw.vendor = "Intel";
        hw.mem_channels = (hw.name.find("Xeon") != std::string::npos) ? 8 : 2;
    }
    // Bandwidth estimate: ~38 GB/s per DDR5 channel (DDR4 would be ~25).
    hw.mem_bandwidth_gbps = hw.mem_channels * 38.0f;
#elif IX_ARCH_ARM64
    hw.platform = Platform::ARM64_NEON;
    hw.has_neon = true;
    hw.has_fp16 = true;
#if IX_OS_APPLE
    hw.platform = Platform::APPLE_SILICON;
    hw.vendor = "Apple";
    hw.name = "Apple Silicon";
    hw.mem_bandwidth_gbps = 200.0f; // M-series unified memory estimate
#elif IX_OS_ANDROID
    hw.platform = Platform::SNAPDRAGON;
    hw.vendor = "Qualcomm";
    hw.name = "Snapdragon";
#else
    hw.vendor = "ARM";
#endif
#elif IX_ARCH_XTENSA
    // ESP32-S3: fixed, well-known silicon — hard-code the profile.
    hw.platform = Platform::XTENSA;
    hw.vendor = "Espressif";
    hw.name = "ESP32-S3";
    hw.cores = 2;
    hw.threads = 2;
    hw.freq_ghz = 0.24f;
    hw.ram_bytes = 8ULL * 1024 * 1024; // 8 MB (typical S3 PSRAM — verify per board)
    hw.tdp_watts = 0.5f;
    hw.power_mode = PowerMode::ULTRA_ECO;
#elif IX_ARCH_RISCV
    hw.platform = Platform::RISCV;
    hw.vendor = "RISC-V";
#endif
    // ─── ACCELERATOR SDK OVERRIDE ────────────────────────────────────────
    // When the build detects an accelerator SDK (IX_USE_*), override the
    // CPU-detected platform. The accelerator IS the target.
    // ─────────────────────────────────────────────────────────────────────
#ifdef IX_USE_CEREBRAS
    hw.platform = Platform::CEREBRAS;
    hw.vendor = "Cerebras"; hw.name = "WSE";
    hw.cores = 850000; hw.tdp_watts = 20000;
#endif
#ifdef IX_USE_GROQ
    hw.platform = Platform::GROQ;
    hw.vendor = "Groq"; hw.name = "LPU";
    hw.tdp_watts = 300;
#endif
#ifdef IX_USE_GAUDI
    hw.platform = Platform::GAUDI;
    hw.vendor = "Intel"; hw.name = "Gaudi";
#endif
#ifdef IX_USE_INFERENTIA
    hw.platform = Platform::INFERENTIA;
    hw.vendor = "AWS"; hw.name = "Inferentia";
#endif
#ifdef IX_USE_GRAPHCORE
    hw.platform = Platform::GRAPHCORE;
    hw.vendor = "Graphcore"; hw.name = "IPU";
#endif
#ifdef IX_USE_SAMBANOVA
    hw.platform = Platform::SAMBANOVA;
    hw.vendor = "SambaNova"; hw.name = "RDU";
#endif
#ifdef IX_USE_MAIA
    hw.platform = Platform::MAIA;
    hw.vendor = "Microsoft"; hw.name = "Maia";
#endif
#ifdef IX_USE_FPGA_XILINX
    hw.platform = Platform::FPGA_XILINX;
    hw.vendor = "Xilinx"; hw.name = "FPGA";
#endif
#ifdef IX_USE_HEXAGON
    hw.platform = Platform::HEXAGON;
    hw.vendor = "Qualcomm"; hw.name = "Hexagon DSP";
#endif
    return hw;
}
// =============================================================================
// COMPUTE KERNEL DISPATCH — Platform-optimal implementations
// =============================================================================
/// Bundle of platform-tuned primitive kernels selected by
/// get_optimal_kernels(). All kernels operate in place on float buffers.
struct ComputeKernels {
// Vector multiply-add: out[i] += a[i] * b[i]   (args: out, a, b, n)
std::function<void(float*, const float*, const float*, int)> vec_fma;
// SiLU activation, in place                    (args: x, n)
std::function<void(float*, int)> silu;
// RMS Norm, in place                           (args: x, weight, n, eps)
std::function<void(float*, const float*, int, float)> rms_norm;
// GEMV: out = mat @ vec (quantized mat). Not assigned by
// get_optimal_kernels(); presumably installed by the GEMM layer — verify.
std::function<void(float*, const void*, int, const float*, int, int, int)> gemv_q;
// Softmax, in place                            (args: x, n)
std::function<void(float*, int)> softmax;
};
/// Return the best available ComputeKernels for platform `p`.
///
/// Portable scalar fallbacks are installed first so every field (except
/// gemv_q, which this function never assigns) is callable everywhere; SIMD
/// paths compiled for the current architecture then overwrite the fields
/// they accelerate.
inline ComputeKernels get_optimal_kernels(Platform p) {
    (void)p; // dispatch is currently compile-time (#if); p reserved for runtime selection
    ComputeKernels k;
    // --- Portable scalar implementations (correct everywhere) ---
    k.vec_fma = [](float* out, const float* a, const float* b, int n) {
        for (int i = 0; i < n; ++i) out[i] += a[i] * b[i];
    };
    k.silu = [](float* x, int n) {
        // SiLU(x) = x * sigmoid(x) = x / (1 + e^-x)
        for (int i = 0; i < n; ++i) x[i] = x[i] / (1.0f + expf(-x[i]));
    };
    k.rms_norm = [](float* x, const float* w, int n, float eps) {
        if (n <= 0) return; // guard: mean-of-squares below divides by n
        float ss = 0;
        for (int i = 0; i < n; ++i) ss += x[i] * x[i];
        ss = 1.0f / sqrtf(ss / n + eps);
        for (int i = 0; i < n; ++i) x[i] = x[i] * ss * w[i];
    };
    k.softmax = [](float* x, int n) {
        if (n <= 0) return; // guard: x[0] read below requires n >= 1
        // Subtract the max before exponentiating for numerical stability.
        float mx = x[0];
        for (int i = 1; i < n; ++i) mx = std::max(mx, x[i]);
        float sum = 0;
        for (int i = 0; i < n; ++i) { x[i] = expf(x[i] - mx); sum += x[i]; }
        for (int i = 0; i < n; ++i) x[i] /= sum;
    };
#if IX_ARCH_X86_64 && IX_HAS_AVX2
    // AVX2 kernels live in kernels.h / gemm.h (v6 path) — nothing to
    // override here.
#endif
#if IX_ARCH_ARM64 && defined(IX_HAS_NEON)
    // --- NEON overrides ---
    k.silu = [](float* x, int n) {
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            float32x4_t neg = vnegq_f32(v);
            // expf has no NEON intrinsic: spill lanes, exponentiate with
            // scalar expf, reload.
            float tmp[4];
            vst1q_f32(tmp, neg);
            for (int j = 0; j < 4; ++j) tmp[j] = expf(tmp[j]);
            float32x4_t exp_neg = vld1q_f32(tmp);
            float32x4_t denom = vaddq_f32(vdupq_n_f32(1.0f), exp_neg);
            vst1q_f32(&x[i], vdivq_f32(v, denom));
        }
        for (; i < n; ++i) x[i] = x[i] / (1.0f + expf(-x[i])); // scalar tail
    };
    k.rms_norm = [](float* x, const float* w, int n, float eps) {
        if (n <= 0) return; // guard: divides by n below
        float32x4_t sum4 = vdupq_n_f32(0);
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            sum4 = vmlaq_f32(sum4, v, v); // sum4 += v * v
        }
        float ss = vaddvq_f32(sum4); // horizontal add of the 4 lanes
        for (; i < n; ++i) ss += x[i] * x[i]; // scalar tail of the sum
        float scale = 1.0f / sqrtf(ss / n + eps);
        float32x4_t sc4 = vdupq_n_f32(scale);
        i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            float32x4_t wv = vld1q_f32(&w[i]);
            vst1q_f32(&x[i], vmulq_f32(vmulq_f32(v, sc4), wv));
        }
        for (; i < n; ++i) x[i] = x[i] * scale * w[i]; // scalar tail of scaling
    };
#endif
    return k;
}
// =============================================================================
// PERFORMANCE ESTIMATOR
// Estimate tok/s for a given model config on detected hardware
// =============================================================================
/// Static description of a (possibly MoE) transformer model, used as input
/// to estimate_performance(). All byte counts are raw on-disk sizes.
struct ModelProfile {
size_t total_bytes; // Total model size on disk
int n_experts; // Total MoE experts
int n_active; // Active experts per token (top-K)
int dim; // Hidden dimension
int expert_ffn_dim; // Expert FFN width
int n_layers; // Number of transformer layers
int n_dense_layers; // Dense (non-MoE) layers; assumed <= n_layers
size_t shared_bytes; // Non-expert weight bytes (attention, norms, embeddings)
size_t expert_bytes_each; // Bytes per single expert (gate+up+down)
};
/// Output of estimate_performance(): a rough analytical throughput model,
/// not a measurement.
struct PerfEstimate {
float tok_per_sec; // estimated decode throughput
float prefill_sec; // estimated latency (modeled as one token's cost)
float mem_required_gb; // RAM the chosen execution mode needs
float io_per_token_gb; // bytes moved per generated token, in GB
std::string bottleneck; // "compute", "memory_bandwidth", "io_bandwidth*"
};
/// Estimate decode throughput for model `mp` on hardware `hw`.
///
/// Model: tok/s ≈ available_bandwidth / bytes_moved_per_token, where the
/// binding bandwidth depends on whether the full model (or at least its
/// active working set) fits in RAM. Degenerate inputs (zero-sized model,
/// zero layers) no longer produce inf/NaN.
inline PerfEstimate estimate_performance(const HWProfile& hw, const ModelProfile& mp) {
    PerfEstimate est;
    // Clamp the MoE layer count so a bad config (n_dense_layers > n_layers)
    // cannot go negative and wrap to a huge unsigned byte count.
    const int moe_layers =
        (mp.n_layers > mp.n_dense_layers) ? mp.n_layers - mp.n_dense_layers : 0;
    // Bytes touched per generated token = shared weights + K active experts
    // in every MoE layer.
    const size_t active_bytes = mp.shared_bytes +
        (size_t)mp.n_active * mp.expert_bytes_each * (size_t)moe_layers;
    est.io_per_token_gb = active_bytes / 1e9;
    // Guard every division below: an empty model would otherwise yield inf.
    const float per_tok_gb = (est.io_per_token_gb > 0) ? est.io_per_token_gb : 1e-9f;
    const bool fits_ram = mp.total_bytes < hw.ram_bytes * 0.8;
    const bool active_fits = active_bytes < hw.ram_bytes * 0.6;
    if (fits_ram) {
        // Whole model resident: RAM bandwidth bound.
        est.tok_per_sec = hw.mem_bandwidth_gbps / per_tok_gb;
        est.bottleneck = "memory_bandwidth";
        est.mem_required_gb = mp.total_bytes / 1e9;
    } else if (active_fits) {
        // Expert-aware mmap: only active experts are paged in. First token is
        // cold; subsequent tokens hit the page cache.
        const float nvme_gbps = 6.0f; // typical NVMe sequential read
        est.tok_per_sec = nvme_gbps / per_tok_gb;
        est.bottleneck = "io_bandwidth_mmap";
        est.mem_required_gb = active_bytes / 1e9 * 1.5f; // 1.5x page-cache slack
    } else {
        // Cold path: stream from storage every token.
        // NOTE(review): dividing total bytes by n_layers assumes layer-wise
        // streaming overlap — confirm against the actual I/O scheduler.
        const float nvme_gbps = 6.0f;
        float gb_per_step = (mp.n_layers > 0)
            ? (float)(mp.total_bytes / 1e9 / mp.n_layers)
            : per_tok_gb;
        if (gb_per_step <= 0) gb_per_step = 1e-9f;
        est.tok_per_sec = nvme_gbps / gb_per_step;
        est.bottleneck = "io_bandwidth_cold";
        est.mem_required_gb = hw.ram_bytes / 1e9;
    }
    // NOTE(review): prefill is modeled as a single token's latency; real
    // prefill scales with prompt length.
    est.prefill_sec = (est.tok_per_sec > 0) ? 1.0f / est.tok_per_sec : 0.0f;
    return est;
}
// =============================================================================
// PRINT HARDWARE REPORT
// =============================================================================
inline void print_hw_report(const HWProfile& hw) {
printf("=== INFERENCE-X v6 — HARDWARE PROFILE ===\n");
printf(" Platform: %s\n", hw.name.c_str());
printf(" Vendor: %s\n", hw.vendor.c_str());
printf(" Cores/Thrds: %d / %d\n", hw.cores, hw.threads);
printf(" RAM: %.1f GB\n", hw.ram_bytes / 1e9);
printf(" Mem BW: %.1f GB/s (%d channels)\n",
hw.mem_bandwidth_gbps, hw.mem_channels);
printf(" Features: ");
if (hw.has_avx512) printf("AVX-512 ");
if (hw.has_avx2) printf("AVX2 ");
if (hw.has_fma) printf("FMA ");
if (hw.has_neon) printf("NEON ");
if (hw.has_sve) printf("SVE ");
if (hw.has_fp16) printf("FP16 ");
if (hw.has_bf16) printf("BF16 ");
if (hw.has_amx) printf("AMX ");
printf("\n");
printf(" TDP: %.0f W\n", hw.tdp_watts);
printf("========================================\n");
}
} // namespace ix