Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
536 lines
17 KiB
C++
536 lines
17 KiB
C++
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// INFERENCE-X — Multi-Platform Backend Definitions (12 Backends)
|
|
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
|
|
// Licensed under the Business Source License 1.1 (BSL-1.1)
|
|
// See LICENSE file for full terms.
|
|
//
|
|
// INTELLECTUAL PROPERTY PROTECTION:
|
|
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
|
|
// - GitHub: github.com/ElmadaniS/inference-x
|
|
// - Author: Salka Elmadani | Morocco
|
|
//
|
|
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
|
|
// incorporates, embeds, distributes, or commercially uses Inference-X
|
|
// or any derivative work without explicit written authorization from
|
|
// the copyright holder is in violation of BSL-1.1 and applicable
|
|
// intellectual property laws. This includes but is not limited to:
|
|
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
|
|
//
|
|
// Contact: Elmadani.SALKA@proton.me for licensing.
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
#pragma once
|
|
|
|
// Inference-X Identity — removal violates BSL-1.1
|
|
#define IX_VERSION "6.0"
|
|
#define IX_AUTHOR_HASH 0x935E1DAD
|
|
#define IX_BUILD_SIGNATURE "Inference-X by Salka Elmadani — Morocco"
|
|
|
|
|
|
#include <algorithm>   // std::max (kernel softmax, core-count clamp)
#include <cmath>       // std::exp, std::sqrt (scalar kernels)
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <functional>
#include <memory>
#include <string>
#include <vector>

#if defined(__APPLE__)
// TARGET_OS_IPHONE (tested in the OS-detection section below) is only
// defined after including TargetConditionals.h; without this include the
// #if silently evaluates to 0 and iOS is never detected.
#include <TargetConditionals.h>
#endif
|
|
|
|
// Platform detection
|
|
#if defined(__x86_64__) || defined(_M_X64)
|
|
#define IX_ARCH_X86_64 1
|
|
#ifdef __AVX512F__
|
|
#define IX_HAS_AVX512 1
|
|
#endif
|
|
#ifdef __AVX2__
|
|
#define IX_HAS_AVX2 1
|
|
#endif
|
|
#ifdef __FMA__
|
|
#define IX_HAS_FMA 1
|
|
#endif
|
|
#elif defined(__aarch64__) || defined(_M_ARM64)
|
|
#define IX_ARCH_ARM64 1
|
|
#include <arm_neon.h>
|
|
#define IX_HAS_NEON 1
|
|
#elif defined(__arm__)
|
|
#define IX_ARCH_ARM32 1
|
|
#elif defined(__riscv)
|
|
#define IX_ARCH_RISCV 1
|
|
#elif defined(__xtensa__)
|
|
#define IX_ARCH_XTENSA 1
|
|
#endif
|
|
|
|
// OS detection
|
|
#if defined(__linux__)
|
|
#define IX_OS_LINUX 1
|
|
#elif defined(__APPLE__)
|
|
#define IX_OS_APPLE 1
|
|
#if TARGET_OS_IPHONE
|
|
#define IX_OS_IOS 1
|
|
#else
|
|
#define IX_OS_MACOS 1
|
|
#endif
|
|
#elif defined(__ANDROID__)
|
|
#define IX_OS_ANDROID 1
|
|
#elif defined(_WIN32)
|
|
#define IX_OS_WINDOWS 1
|
|
#endif
|
|
|
|
// Accelerator detection
|
|
#if defined(__CUDA_ARCH__) || defined(IX_USE_CUDA)
|
|
#define IX_HAS_CUDA 1
|
|
#endif
|
|
#if defined(IX_USE_ROCM)
|
|
#define IX_HAS_ROCM 1
|
|
#endif
|
|
#if defined(IX_USE_HEXAGON)
|
|
#define IX_HAS_HEXAGON 1
|
|
#endif
|
|
|
|
namespace ix {
|
|
|
|
// =============================================================================
// HARDWARE PROFILE — Auto-detected at runtime
// =============================================================================

// Backend/target identifier. Resolved at runtime by detect_hardware();
// accelerator entries are selected via compile-time IX_USE_* flags instead.
enum class Platform {
    // Desktop/Server CPU (x86-64, by best available ISA)
    X86_AVX512,
    X86_AVX2,
    X86_SSE42,
    X86_GENERIC,

    // ARM
    ARM64_NEON,     // Apple M-series, Snapdragon, Ampere
    ARM32_NEON,     // Raspberry Pi, older ARM
    ARM64_SVE,      // ARM SVE (Graviton3+, Neoverse)

    // Mobile SoC (CPU + vendor accelerators treated as one target)
    SNAPDRAGON,     // Qualcomm (CPU + Hexagon DSP + Adreno GPU)
    APPLE_SILICON,  // Apple (CPU + Neural Engine + Metal GPU)
    MEDIATEK,       // Dimensity series
    EXYNOS,         // Samsung

    // GPU
    CUDA,           // NVIDIA
    ROCM,           // AMD
    METAL,          // Apple
    VULKAN,         // Cross-platform

    // Edge/Embedded
    RISCV,          // RISC-V boards
    XTENSA,         // ESP32-S3
    CORTEX_M,       // Arduino, STM32

    // Cloud / Accelerator (selected only via IX_USE_* build flags)
    TPU,            // Google TPU (v4/v5)
    INFERENTIA,     // AWS Inferentia (NeuronCore)
    GAUDI,          // Intel Gaudi (Habana TPC)
    CEREBRAS,       // Cerebras WSE (850K cores)
    GROQ,           // Groq LPU (deterministic SRAM)
    GRAPHCORE,      // Graphcore IPU (BSP tiles)
    SAMBANOVA,      // SambaNova RDU (reconfigurable dataflow)
    MAIA,           // Microsoft Maia (Azure custom ASIC)
    FPGA_XILINX,    // Xilinx FPGA (Vitis HLS)
    HEXAGON,        // Qualcomm Hexagon DSP (standalone, not SoC)

    UNKNOWN
};
|
|
|
|
// Power envelope the runtime should target when scheduling work.
enum class PowerMode {
    MAX,        // Full performance, no power limit
    BALANCED,   // Power/perf tradeoff
    ECO,        // Minimum power (mobile, edge)
    ULTRA_ECO   // Sub-1W (ESP32, Arduino)
};
|
|
|
|
// Snapshot of the host machine, filled in by detect_hardware().
// All fields are best-effort; zero/false/empty means "unknown or absent".
struct HWProfile {
    Platform platform = Platform::UNKNOWN;
    std::string name;    // human-readable CPU/SoC model string
    std::string vendor;  // e.g. "AMD", "Intel", "Apple", "Qualcomm"

    // CPU topology
    int cores = 1;       // physical cores (approximated on Linux as threads/2)
    int threads = 1;     // logical CPUs
    float freq_ghz = 0;
    size_t cache_l2 = 0; // bytes
    size_t cache_l3 = 0; // bytes

    // Memory
    size_t ram_bytes = 0;
    size_t vram_bytes = 0;
    int mem_channels = 1;
    float mem_bandwidth_gbps = 0;  // estimated from channel count, not measured

    // Capabilities (instruction-set / datatype support)
    bool has_avx2 = false;
    bool has_avx512 = false;
    bool has_fma = false;
    bool has_neon = false;
    bool has_sve = false;
    bool has_fp16 = false;
    bool has_int8 = false;
    bool has_bf16 = false;
    bool has_tensor_cores = false;
    bool has_amx = false;

    // Power
    float tdp_watts = 0;
    PowerMode power_mode = PowerMode::MAX;

    // Theoretical peak throughput (vendor-claimed, not benchmarked)
    float tops = 0;         // INT8 TOPS
    float tflops_fp32 = 0;  // FP32 TFLOPS
    float tflops_fp16 = 0;  // FP16 TFLOPS
};
|
|
|
|
// =============================================================================
|
|
// AUTO-DETECT HARDWARE
|
|
// =============================================================================
|
|
inline HWProfile detect_hardware() {
|
|
HWProfile hw;
|
|
|
|
#if IX_ARCH_X86_64
|
|
#if IX_HAS_AVX512
|
|
hw.platform = Platform::X86_AVX512;
|
|
hw.has_avx512 = true;
|
|
#elif IX_HAS_AVX2
|
|
hw.platform = Platform::X86_AVX2;
|
|
#else
|
|
hw.platform = Platform::X86_GENERIC;
|
|
#endif
|
|
hw.has_avx2 = true; // Assume baseline
|
|
hw.has_fma = true;
|
|
|
|
// Detect CPU info from /proc/cpuinfo on Linux
|
|
#if IX_OS_LINUX
|
|
{
|
|
FILE* f = fopen("/proc/cpuinfo", "r");
|
|
if (f) {
|
|
char line[256];
|
|
while (fgets(line, sizeof(line), f)) {
|
|
if (strncmp(line, "model name", 10) == 0) {
|
|
char* p = strchr(line, ':');
|
|
if (p) {
|
|
hw.name = std::string(p + 2);
|
|
if (!hw.name.empty() && hw.name.back() == '\n')
|
|
hw.name.pop_back();
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
fclose(f);
|
|
}
|
|
|
|
// Count cores
|
|
f = fopen("/proc/cpuinfo", "r");
|
|
if (f) {
|
|
int count = 0;
|
|
char line[256];
|
|
while (fgets(line, sizeof(line), f)) {
|
|
if (strncmp(line, "processor", 9) == 0) count++;
|
|
}
|
|
hw.threads = count;
|
|
hw.cores = count / 2; // Approximate
|
|
fclose(f);
|
|
}
|
|
|
|
// Memory
|
|
f = fopen("/proc/meminfo", "r");
|
|
if (f) {
|
|
char line[256];
|
|
while (fgets(line, sizeof(line), f)) {
|
|
if (strncmp(line, "MemTotal:", 9) == 0) {
|
|
unsigned long kb = 0;
|
|
sscanf(line, "MemTotal: %lu kB", &kb);
|
|
hw.ram_bytes = kb * 1024ULL;
|
|
break;
|
|
}
|
|
}
|
|
fclose(f);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
// Vendor detection from name
|
|
if (hw.name.find("AMD") != std::string::npos) {
|
|
hw.vendor = "AMD";
|
|
if (hw.name.find("EPYC") != std::string::npos) hw.mem_channels = 8;
|
|
else if (hw.name.find("Threadripper") != std::string::npos) hw.mem_channels = 8;
|
|
else if (hw.name.find("Ryzen 9") != std::string::npos) hw.mem_channels = 2;
|
|
else if (hw.name.find("Ryzen 7") != std::string::npos) hw.mem_channels = 2;
|
|
else hw.mem_channels = 2;
|
|
} else if (hw.name.find("Intel") != std::string::npos) {
|
|
hw.vendor = "Intel";
|
|
if (hw.name.find("Xeon") != std::string::npos) hw.mem_channels = 8;
|
|
else hw.mem_channels = 2;
|
|
}
|
|
|
|
// Estimate bandwidth: DDR5 ~38 GB/s per channel, DDR4 ~25 GB/s
|
|
hw.mem_bandwidth_gbps = hw.mem_channels * 38.0f; // Assume DDR5
|
|
|
|
#elif IX_ARCH_ARM64
|
|
hw.platform = Platform::ARM64_NEON;
|
|
hw.has_neon = true;
|
|
hw.has_fp16 = true;
|
|
|
|
#if IX_OS_APPLE
|
|
hw.platform = Platform::APPLE_SILICON;
|
|
hw.vendor = "Apple";
|
|
hw.name = "Apple Silicon";
|
|
hw.mem_bandwidth_gbps = 200.0f; // M-series unified memory
|
|
#elif IX_OS_ANDROID
|
|
hw.platform = Platform::SNAPDRAGON;
|
|
hw.vendor = "Qualcomm";
|
|
hw.name = "Snapdragon";
|
|
#else
|
|
hw.vendor = "ARM";
|
|
#endif
|
|
|
|
#elif IX_ARCH_XTENSA
|
|
hw.platform = Platform::XTENSA;
|
|
hw.vendor = "Espressif";
|
|
hw.name = "ESP32-S3";
|
|
hw.cores = 2;
|
|
hw.threads = 2;
|
|
hw.freq_ghz = 0.24f;
|
|
hw.ram_bytes = 8ULL * 1024 * 1024;
|
|
hw.tdp_watts = 0.5f;
|
|
hw.power_mode = PowerMode::ULTRA_ECO;
|
|
|
|
#elif IX_ARCH_RISCV
|
|
hw.platform = Platform::RISCV;
|
|
hw.vendor = "RISC-V";
|
|
|
|
#endif
|
|
|
|
// ─── ACCELERATOR SDK OVERRIDE ────────────────────────────────────────
|
|
// When Makefile detects an accelerator SDK (IX_USE_*), override
|
|
// the CPU-detected platform. The accelerator IS the target.
|
|
// ─────────────────────────────────────────────────────────────────────
|
|
#ifdef IX_USE_CEREBRAS
|
|
hw.platform = Platform::CEREBRAS;
|
|
hw.vendor = "Cerebras"; hw.name = "WSE";
|
|
hw.cores = 850000; hw.tdp_watts = 20000;
|
|
#endif
|
|
#ifdef IX_USE_GROQ
|
|
hw.platform = Platform::GROQ;
|
|
hw.vendor = "Groq"; hw.name = "LPU";
|
|
hw.tdp_watts = 300;
|
|
#endif
|
|
#ifdef IX_USE_GAUDI
|
|
hw.platform = Platform::GAUDI;
|
|
hw.vendor = "Intel"; hw.name = "Gaudi";
|
|
#endif
|
|
#ifdef IX_USE_INFERENTIA
|
|
hw.platform = Platform::INFERENTIA;
|
|
hw.vendor = "AWS"; hw.name = "Inferentia";
|
|
#endif
|
|
#ifdef IX_USE_GRAPHCORE
|
|
hw.platform = Platform::GRAPHCORE;
|
|
hw.vendor = "Graphcore"; hw.name = "IPU";
|
|
#endif
|
|
#ifdef IX_USE_SAMBANOVA
|
|
hw.platform = Platform::SAMBANOVA;
|
|
hw.vendor = "SambaNova"; hw.name = "RDU";
|
|
#endif
|
|
#ifdef IX_USE_MAIA
|
|
hw.platform = Platform::MAIA;
|
|
hw.vendor = "Microsoft"; hw.name = "Maia";
|
|
#endif
|
|
#ifdef IX_USE_FPGA_XILINX
|
|
hw.platform = Platform::FPGA_XILINX;
|
|
hw.vendor = "Xilinx"; hw.name = "FPGA";
|
|
#endif
|
|
#ifdef IX_USE_HEXAGON
|
|
hw.platform = Platform::HEXAGON;
|
|
hw.vendor = "Qualcomm"; hw.name = "Hexagon DSP";
|
|
#endif
|
|
|
|
return hw;
|
|
}
|
|
|
|
// =============================================================================
// COMPUTE KERNEL DISPATCH — Platform-optimal implementations
// =============================================================================

// Table of primitive ops filled in by get_optimal_kernels(). Entries are
// std::function so SIMD variants can overwrite the scalar defaults.
struct ComputeKernels {
    // Vector multiply-add: out[i] += a[i] * b[i]
    std::function<void(float*, const float*, const float*, int)> vec_fma;

    // SiLU activation, applied in place over n floats
    std::function<void(float*, int)> silu;

    // RMS Norm: (x, weight, n, eps), applied in place over x
    std::function<void(float*, const float*, int, float)> rms_norm;

    // GEMV: out = mat @ vec (quantized mat)
    std::function<void(float*, const void*, int, const float*, int, int, int)> gemv_q;

    // Softmax, applied in place over n floats
    std::function<void(float*, int)> softmax;
};
|
|
|
|
/// Build the kernel table for platform `p`.
///
/// Portable scalar kernels are installed first so every entry is callable on
/// any target; SIMD variants then overwrite them when the matching ISA was
/// enabled at compile time. `p` is currently unused — dispatch is purely
/// compile-time — but kept for future runtime selection.
inline ComputeKernels get_optimal_kernels(Platform p) {
    (void)p;  // reserved for runtime dispatch; see note above
    ComputeKernels k;

    // ─── Default scalar implementations (work everywhere) ───
    k.vec_fma = [](float* out, const float* a, const float* b, int n) {
        for (int i = 0; i < n; ++i) out[i] += a[i] * b[i];
    };

    // silu(x) = x * sigmoid(x) = x / (1 + e^-x)
    k.silu = [](float* x, int n) {
        for (int i = 0; i < n; ++i) x[i] = x[i] / (1.0f + std::exp(-x[i]));
    };

    k.rms_norm = [](float* x, const float* w, int n, float eps) {
        if (n <= 0) return;  // guard: division by n below
        float ss = 0;
        for (int i = 0; i < n; ++i) ss += x[i] * x[i];
        ss = 1.0f / std::sqrt(ss / n + eps);
        for (int i = 0; i < n; ++i) x[i] = x[i] * ss * w[i];
    };

    k.softmax = [](float* x, int n) {
        if (n <= 0) return;  // guard: x[0] read below
        // Subtract the running max before exponentiating (numerical stability).
        float mx = x[0];
        for (int i = 1; i < n; ++i) mx = std::max(mx, x[i]);
        float sum = 0;
        for (int i = 0; i < n; ++i) { x[i] = std::exp(x[i] - mx); sum += x[i]; }
        for (int i = 0; i < n; ++i) x[i] /= sum;
    };

#if IX_ARCH_X86_64 && IX_HAS_AVX2
    // AVX2 optimized kernels (current v6 path)
    // These delegate to kernels.h and gemm.h implementations
    // No change needed — v6 already has optimal AVX2 paths
#endif

#if IX_ARCH_ARM64 && IX_HAS_NEON
    // NEON SiLU: the divide is vectorized but exp is still scalar (no fast
    // NEON exp here), so the win is modest.
    k.silu = [](float* x, int n) {
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            float32x4_t neg = vnegq_f32(v);
            float tmp[4];
            vst1q_f32(tmp, neg);
            for (int j = 0; j < 4; ++j) tmp[j] = std::exp(tmp[j]);
            float32x4_t exp_neg = vld1q_f32(tmp);
            float32x4_t denom = vaddq_f32(vdupq_n_f32(1.0f), exp_neg);
            vst1q_f32(&x[i], vdivq_f32(v, denom));
        }
        for (; i < n; ++i) x[i] = x[i] / (1.0f + std::exp(-x[i]));
    };

    // NEON RMS norm: vector sum-of-squares + vector scale, scalar tails.
    k.rms_norm = [](float* x, const float* w, int n, float eps) {
        if (n <= 0) return;  // guard: division by n below
        float32x4_t sum4 = vdupq_n_f32(0);
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            sum4 = vmlaq_f32(sum4, v, v);
        }
        float ss = vaddvq_f32(sum4);
        for (; i < n; ++i) ss += x[i] * x[i];
        float scale = 1.0f / std::sqrt(ss / n + eps);
        float32x4_t sc4 = vdupq_n_f32(scale);
        i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            float32x4_t wv = vld1q_f32(&w[i]);
            vst1q_f32(&x[i], vmulq_f32(vmulq_f32(v, sc4), wv));
        }
        for (; i < n; ++i) x[i] = x[i] * scale * w[i];
    };
#endif

    return k;
}
|
|
|
|
// =============================================================================
// PERFORMANCE ESTIMATOR
// Estimate tok/s for a given model config on detected hardware
// =============================================================================

// Static description of a (possibly MoE) transformer checkpoint; input to
// estimate_performance(). All byte counts are on-disk sizes.
struct ModelProfile {
    size_t total_bytes;       // Total model size on disk
    int n_experts;            // Total MoE experts
    int n_active;             // Active experts per token (top-k routing)
    int dim;                  // Hidden dimension
    int expert_ffn_dim;       // Expert FFN width
    int n_layers;             // Number of transformer layers
    int n_dense_layers;       // Dense (non-MoE) layers
    size_t shared_bytes;      // Non-expert weight bytes (attention, norms, embeddings)
    size_t expert_bytes_each; // Bytes per single expert (gate+up+down)
};
|
|
|
|
// Output of estimate_performance(): first-order streaming-throughput model.
struct PerfEstimate {
    float tok_per_sec;      // estimated decode throughput
    float prefill_sec;      // rough prefill latency estimate
    float mem_required_gb;  // RAM needed for the chosen execution regime
    float io_per_token_gb;  // bytes streamed per generated token
    std::string bottleneck; // "compute", "memory_bandwidth", "io_bandwidth"
};
|
|
|
|
/// Estimate decode throughput for model `mp` on hardware `hw`.
///
/// Three regimes, chosen by how much of the model fits in RAM:
///  1. whole model resident      → RAM-bandwidth bound
///  2. active experts resident   → NVMe/mmap bound (warm page cache)
///  3. neither                   → fully storage bound ("cold")
///
/// All divisions are guarded so a zero-sized model or zero-layer config
/// yields 0 tok/s instead of inf/NaN propagating into callers.
inline PerfEstimate estimate_performance(const HWProfile& hw, const ModelProfile& mp) {
    PerfEstimate est;

    // Active bytes per token = shared weights + K active experts per MoE layer.
    size_t active_bytes = mp.shared_bytes +
        (size_t)mp.n_active * mp.expert_bytes_each * (mp.n_layers - mp.n_dense_layers);

    est.io_per_token_gb = active_bytes / 1e9;

    // If the model fits in RAM: bandwidth-bound. If only the active set
    // fits: mmap/storage-I/O bound. Otherwise: cold streaming.
    bool fits_ram = mp.total_bytes < hw.ram_bytes * 0.8;
    bool active_fits = active_bytes < hw.ram_bytes * 0.6;

    if (fits_ram) {
        // RAM-bandwidth bound: every active byte streamed from DRAM once.
        est.tok_per_sec = (est.io_per_token_gb > 0)
            ? hw.mem_bandwidth_gbps / est.io_per_token_gb : 0.0f;
        est.bottleneck = "memory_bandwidth";
        est.mem_required_gb = mp.total_bytes / 1e9;
    } else if (active_fits) {
        // Expert-aware mmap: only active experts paged in.
        // First token cold, subsequent tokens warm from the page cache.
        const float nvme_gbps = 6.0f;  // typical NVMe sequential read
        est.tok_per_sec = (est.io_per_token_gb > 0)
            ? nvme_gbps / est.io_per_token_gb : 0.0f;
        est.bottleneck = "io_bandwidth_mmap";
        est.mem_required_gb = active_bytes / 1e9 * 1.5f;  // +50% page-cache slack
    } else {
        // Cold: everything streamed from storage.
        const float nvme_gbps = 6.0f;
        float gb_per_layer = (mp.n_layers > 0)
            ? (float)(mp.total_bytes / 1e9 / mp.n_layers) : 0.0f;
        est.tok_per_sec = (gb_per_layer > 0) ? nvme_gbps / gb_per_layer : 0.0f;
        est.bottleneck = "io_bandwidth_cold";
        est.mem_required_gb = hw.ram_bytes / 1e9;
    }

    // Rough prefill estimate: one token's worth of streaming.
    est.prefill_sec = (est.tok_per_sec > 0) ? 1.0f / est.tok_per_sec : 0.0f;

    return est;
}
|
|
|
|
// =============================================================================
|
|
// PRINT HARDWARE REPORT
|
|
// =============================================================================
|
|
inline void print_hw_report(const HWProfile& hw) {
|
|
printf("=== INFERENCE-X v6 — HARDWARE PROFILE ===\n");
|
|
printf(" Platform: %s\n", hw.name.c_str());
|
|
printf(" Vendor: %s\n", hw.vendor.c_str());
|
|
printf(" Cores/Thrds: %d / %d\n", hw.cores, hw.threads);
|
|
printf(" RAM: %.1f GB\n", hw.ram_bytes / 1e9);
|
|
printf(" Mem BW: %.1f GB/s (%d channels)\n",
|
|
hw.mem_bandwidth_gbps, hw.mem_channels);
|
|
printf(" Features: ");
|
|
if (hw.has_avx512) printf("AVX-512 ");
|
|
if (hw.has_avx2) printf("AVX2 ");
|
|
if (hw.has_fma) printf("FMA ");
|
|
if (hw.has_neon) printf("NEON ");
|
|
if (hw.has_sve) printf("SVE ");
|
|
if (hw.has_fp16) printf("FP16 ");
|
|
if (hw.has_bf16) printf("BF16 ");
|
|
if (hw.has_amx) printf("AMX ");
|
|
printf("\n");
|
|
printf(" TDP: %.0f W\n", hw.tdp_watts);
|
|
printf("========================================\n");
|
|
}
|
|
|
|
} // namespace ix
|