// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Multi-Platform Backend Definitions (12 Backends)
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once

// Inference-X Identity — removal violates BSL-1.1
#define IX_VERSION "6.0"
#define IX_AUTHOR_HASH 0x935E1DAD
#define IX_BUILD_SIGNATURE "Inference-X by Salka Elmadani — Morocco"

// Reconstructed includes — the original directives were bare "#include" with
// the header names stripped (does not compile). Headers chosen from actual
// usage in this file: stdio (fopen/fgets/sscanf/printf), cstring
// (strncmp/strchr), cmath (expf/sqrtf), cstddef (size_t), string,
// functional (std::function), algorithm (std::max).
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <functional>
#include <string>

// Platform detection
#if defined(__x86_64__) || defined(_M_X64)
  #define IX_ARCH_X86_64 1
  #ifdef __AVX512F__
    #define IX_HAS_AVX512 1
  #endif
  #ifdef __AVX2__
    #define IX_HAS_AVX2 1
  #endif
  #ifdef __FMA__
    #define IX_HAS_FMA 1
  #endif
#elif defined(__aarch64__) || defined(_M_ARM64)
  #define IX_ARCH_ARM64 1
  // Reconstructed: the NEON intrinsics used below (vld1q_f32, vmlaq_f32,
  // vaddvq_f32, ...) are declared in <arm_neon.h>.
  #include <arm_neon.h>
  #define IX_HAS_NEON 1
#elif defined(__arm__)
  #define IX_ARCH_ARM32 1
#elif defined(__riscv)
  #define IX_ARCH_RISCV 1
#elif defined(__xtensa__)
  #define IX_ARCH_XTENSA 1
#endif

// OS detection
#if defined(__linux__)
  #define IX_OS_LINUX 1
#elif defined(__APPLE__)
  #define IX_OS_APPLE 1
  // TARGET_OS_IPHONE is defined by Apple's TargetConditionals.h; without
  // this include the #if below silently evaluates to 0 on every toolchain.
  #include <TargetConditionals.h>
  #if TARGET_OS_IPHONE
    #define IX_OS_IOS 1
  #else
    #define IX_OS_MACOS 1
  #endif
#elif defined(__ANDROID__)
  #define IX_OS_ANDROID 1
#elif defined(_WIN32)
  #define IX_OS_WINDOWS 1
#endif

// Accelerator detection
#if defined(__CUDA_ARCH__) || defined(IX_USE_CUDA)
  #define IX_HAS_CUDA 1
#endif
#if defined(IX_USE_ROCM)
  #define IX_HAS_ROCM 1
#endif
#if defined(IX_USE_HEXAGON)
  #define IX_HAS_HEXAGON 1
#endif

namespace ix {

// =============================================================================
// HARDWARE PROFILE — Auto-detected at runtime
// =============================================================================

enum class Platform {
    // Desktop/Server CPU
    X86_AVX512,
    X86_AVX2,
    X86_SSE42,
    X86_GENERIC,
    // ARM
    ARM64_NEON,     // Apple M-series, Snapdragon, Ampere
    ARM32_NEON,     // Raspberry Pi, older ARM
    ARM64_SVE,      // ARM SVE (Graviton3+, Neoverse)
    // Mobile SoC
    SNAPDRAGON,     // Qualcomm (CPU + Hexagon DSP + Adreno GPU)
    APPLE_SILICON,  // Apple (CPU + Neural Engine + Metal GPU)
    MEDIATEK,       // Dimensity series
    EXYNOS,         // Samsung
    // GPU
    CUDA,           // NVIDIA
    ROCM,           // AMD
    METAL,          // Apple
    VULKAN,         // Cross-platform
    // Edge/Embedded
    RISCV,          // RISC-V boards
    XTENSA,         // ESP32-S3
    CORTEX_M,       // Arduino, STM32
    // Cloud / Accelerator
    TPU,            // Google TPU (v4/v5)
    INFERENTIA,     // AWS Inferentia (NeuronCore)
    GAUDI,          // Intel Gaudi (Habana TPC)
    CEREBRAS,       // Cerebras WSE (850K cores)
    GROQ,           // Groq LPU (deterministic SRAM)
    GRAPHCORE,      // Graphcore IPU (BSP tiles)
    SAMBANOVA,      // SambaNova RDU (reconfigurable dataflow)
    MAIA,           // Microsoft Maia (Azure custom ASIC)
    FPGA_XILINX,    // Xilinx FPGA (Vitis HLS)
    HEXAGON,        // Qualcomm Hexagon DSP (standalone, not SoC)
    UNKNOWN
};

enum class PowerMode {
    MAX,        // Full performance, no power limit
    BALANCED,   // Power/perf tradeoff
    ECO,        // Minimum power (mobile, edge)
    ULTRA_ECO   // Sub-1W (ESP32, Arduino)
};

// Runtime hardware description filled in by detect_hardware().
struct HWProfile {
    Platform platform = Platform::UNKNOWN;
    std::string name;
    std::string vendor;
    // CPU
    int cores = 1;
    int threads = 1;
    float freq_ghz = 0;
    size_t cache_l2 = 0;
    size_t cache_l3 = 0;
    // Memory
    size_t ram_bytes = 0;
    size_t vram_bytes = 0;
    int mem_channels = 1;
    float mem_bandwidth_gbps = 0;
    // Capabilities
    bool has_avx2 = false;
    bool has_avx512 = false;
    bool has_fma = false;
    bool has_neon = false;
    bool has_sve = false;
    bool has_fp16 = false;
    bool has_int8 = false;
    bool has_bf16 = false;
    bool has_tensor_cores = false;
    bool has_amx = false;
    // Power
    float tdp_watts = 0;
    PowerMode power_mode = PowerMode::MAX;
    // Theoretical peak
    float tops = 0;          // INT8 TOPS
    float tflops_fp32 = 0;   // FP32 TFLOPS
    float tflops_fp16 = 0;   // FP16 TFLOPS
};

// =============================================================================
// AUTO-DETECT HARDWARE
// =============================================================================

// Builds an HWProfile for the machine we are running on. Compile-time arch/OS
// macros pick the branch; on Linux/x86 the CPU model, logical-CPU count, and
// RAM size are read from /proc. Accelerator-SDK defines (IX_USE_*) override
// the CPU-detected platform at the end.
inline HWProfile detect_hardware() {
    HWProfile hw;

#if defined(IX_ARCH_X86_64)
  #if defined(IX_HAS_AVX512)
    hw.platform = Platform::X86_AVX512;
    hw.has_avx512 = true;
  #elif defined(IX_HAS_AVX2)
    hw.platform = Platform::X86_AVX2;
  #else
    hw.platform = Platform::X86_GENERIC;
  #endif
    hw.has_avx2 = true;  // Assume baseline (NOTE(review): also set on the
    hw.has_fma = true;   // X86_GENERIC path — confirm this is intentional)

    // Detect CPU info from /proc/cpuinfo on Linux
  #if defined(IX_OS_LINUX)
    {
        FILE* f = fopen("/proc/cpuinfo", "r");
        if (f) {
            char line[256];
            while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "model name", 10) == 0) {
                    char* p = strchr(line, ':');
                    if (p) {
                        hw.name = std::string(p + 2);
                        if (!hw.name.empty() && hw.name.back() == '\n') hw.name.pop_back();
                    }
                    break;
                }
            }
            fclose(f);
        }
        // Count logical CPUs ("processor" entries)
        f = fopen("/proc/cpuinfo", "r");
        if (f) {
            int count = 0;
            char line[256];
            while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "processor", 9) == 0) count++;
            }
            hw.threads = count;
            // Approximate physical cores assuming 2-way SMT; clamp so a
            // single-CPU box does not report cores == 0.
            hw.cores = (count > 1) ? count / 2 : count;
            fclose(f);
        }
        // Memory
        f = fopen("/proc/meminfo", "r");
        if (f) {
            char line[256];
            while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "MemTotal:", 9) == 0) {
                    unsigned long kb = 0;
                    sscanf(line, "MemTotal: %lu kB", &kb);
                    hw.ram_bytes = kb * 1024ULL;
                    break;
                }
            }
            fclose(f);
        }
    }
  #endif

    // Vendor + memory-channel heuristics keyed off the model-name string
    if (hw.name.find("AMD") != std::string::npos) {
        hw.vendor = "AMD";
        if (hw.name.find("EPYC") != std::string::npos) hw.mem_channels = 8;
        else if (hw.name.find("Threadripper") != std::string::npos) hw.mem_channels = 8;
        else if (hw.name.find("Ryzen 9") != std::string::npos) hw.mem_channels = 2;
        else if (hw.name.find("Ryzen 7") != std::string::npos) hw.mem_channels = 2;
        else hw.mem_channels = 2;
    } else if (hw.name.find("Intel") != std::string::npos) {
        hw.vendor = "Intel";
        if (hw.name.find("Xeon") != std::string::npos) hw.mem_channels = 8;
        else hw.mem_channels = 2;
    }
    // Estimate bandwidth: DDR5 ~38 GB/s per channel, DDR4 ~25 GB/s.
    // NOTE(review): DDR5 is assumed unconditionally — DDR4 systems will be
    // over-estimated by ~50%.
    hw.mem_bandwidth_gbps = hw.mem_channels * 38.0f;

#elif defined(IX_ARCH_ARM64)
    hw.platform = Platform::ARM64_NEON;
    hw.has_neon = true;
    hw.has_fp16 = true;
  #if defined(IX_OS_APPLE)
    hw.platform = Platform::APPLE_SILICON;
    hw.vendor = "Apple";
    hw.name = "Apple Silicon";
    hw.mem_bandwidth_gbps = 200.0f;  // M-series unified memory
  #elif defined(IX_OS_ANDROID)
    hw.platform = Platform::SNAPDRAGON;
    hw.vendor = "Qualcomm";
    hw.name = "Snapdragon";
  #else
    hw.vendor = "ARM";
  #endif

#elif defined(IX_ARCH_XTENSA)
    hw.platform = Platform::XTENSA;
    hw.vendor = "Espressif";
    hw.name = "ESP32-S3";
    hw.cores = 2;
    hw.threads = 2;
    hw.freq_ghz = 0.24f;
    hw.ram_bytes = 8ULL * 1024 * 1024;
    hw.tdp_watts = 0.5f;
    hw.power_mode = PowerMode::ULTRA_ECO;

#elif defined(IX_ARCH_RISCV)
    hw.platform = Platform::RISCV;
    hw.vendor = "RISC-V";
#endif

    // ─── ACCELERATOR SDK OVERRIDE ────────────────────────────────────────
    // When Makefile detects an accelerator SDK (IX_USE_*), override
    // the CPU-detected platform. The accelerator IS the target.
    // ─────────────────────────────────────────────────────────────────────
#ifdef IX_USE_CEREBRAS
    hw.platform = Platform::CEREBRAS; hw.vendor = "Cerebras"; hw.name = "WSE";
    hw.cores = 850000; hw.tdp_watts = 20000;
#endif
#ifdef IX_USE_GROQ
    hw.platform = Platform::GROQ; hw.vendor = "Groq"; hw.name = "LPU";
    hw.tdp_watts = 300;
#endif
#ifdef IX_USE_GAUDI
    hw.platform = Platform::GAUDI; hw.vendor = "Intel"; hw.name = "Gaudi";
#endif
#ifdef IX_USE_INFERENTIA
    hw.platform = Platform::INFERENTIA; hw.vendor = "AWS"; hw.name = "Inferentia";
#endif
#ifdef IX_USE_GRAPHCORE
    hw.platform = Platform::GRAPHCORE; hw.vendor = "Graphcore"; hw.name = "IPU";
#endif
#ifdef IX_USE_SAMBANOVA
    hw.platform = Platform::SAMBANOVA; hw.vendor = "SambaNova"; hw.name = "RDU";
#endif
#ifdef IX_USE_MAIA
    hw.platform = Platform::MAIA; hw.vendor = "Microsoft"; hw.name = "Maia";
#endif
#ifdef IX_USE_FPGA_XILINX
    hw.platform = Platform::FPGA_XILINX; hw.vendor = "Xilinx"; hw.name = "FPGA";
#endif
#ifdef IX_USE_HEXAGON
    hw.platform = Platform::HEXAGON; hw.vendor = "Qualcomm"; hw.name = "Hexagon DSP";
#endif

    return hw;
}

// =============================================================================
// COMPUTE KERNEL DISPATCH — Platform-optimal implementations
// =============================================================================

// Bundle of hot-loop kernels. The original declarations were bare
// `std::function` with the template argument lists stripped (does not
// compile); signatures below are reconstructed from the lambdas assigned in
// get_optimal_kernels().
struct ComputeKernels {
    // Vector multiply-add: out[i] += a[i] * b[i]
    std::function<void(float* out, const float* a, const float* b, int n)> vec_fma;
    // SiLU activation, in place: x[i] = x[i] * sigmoid(x[i])
    std::function<void(float* x, int n)> silu;
    // RMS Norm, in place: x[i] = x[i] * w[i] / rms(x)
    std::function<void(float* x, const float* w, int n, float eps)> rms_norm;
    // GEMV: out = mat @ vec (quantized mat).
    // NOTE(review): no assignment or call site for gemv_q is visible in this
    // file, so this signature is a best-effort reconstruction — confirm
    // against the kernels.h / gemm.h implementations referenced below.
    std::function<void(float* out, const void* mat_q, const float* vec, int rows, int cols)> gemv_q;
    // Softmax, in place (numerically stabilized by max-subtraction)
    std::function<void(float* x, int n)> softmax;
};

// Returns the kernel set for platform `p`. Scalar fallbacks are installed
// first (correct everywhere); SIMD-specialized versions then overwrite them
// where the build target supports it.
inline ComputeKernels get_optimal_kernels(Platform p) {
    ComputeKernels k;

    // Default scalar implementations (works everywhere)
    k.vec_fma = [](float* out, const float* a, const float* b, int n) {
        for (int i = 0; i < n; ++i) out[i] += a[i] * b[i];
    };
    k.silu = [](float* x, int n) {
        for (int i = 0; i < n; ++i) x[i] = x[i] / (1.0f + expf(-x[i]));
    };
    k.rms_norm = [](float* x, const float* w, int n, float eps) {
        float ss = 0;
        for (int i = 0; i < n; ++i) ss += x[i] * x[i];
        ss = 1.0f / sqrtf(ss / n + eps);
        for (int i = 0; i < n; ++i) x[i] = x[i] * ss * w[i];
    };
    k.softmax = [](float* x, int n) {
        float mx = x[0];
        for (int i = 1; i < n; ++i) mx = std::max(mx, x[i]);
        float sum = 0;
        for (int i = 0; i < n; ++i) { x[i] = expf(x[i] - mx); sum += x[i]; }
        for (int i = 0; i < n; ++i) x[i] /= sum;
    };

#if defined(IX_ARCH_X86_64) && defined(IX_HAS_AVX2)
    // AVX2 optimized kernels (current v6 path)
    // These delegate to kernels.h and gemm.h implementations
    // No change needed — v6 already has optimal AVX2 paths
#endif

#if defined(IX_ARCH_ARM64) && defined(IX_HAS_NEON)
    // NEON optimized kernels
    k.silu = [](float* x, int n) {
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            float32x4_t neg = vnegq_f32(v);
            // expf has no NEON intrinsic here — round-trip the lanes through
            // a scalar buffer for the exponential only.
            float tmp[4];
            vst1q_f32(tmp, neg);
            for (int j = 0; j < 4; ++j) tmp[j] = expf(tmp[j]);
            float32x4_t exp_neg = vld1q_f32(tmp);
            float32x4_t denom = vaddq_f32(vdupq_n_f32(1.0f), exp_neg);
            float32x4_t result = vdivq_f32(v, denom);
            vst1q_f32(&x[i], result);
        }
        for (; i < n; ++i) x[i] = x[i] / (1.0f + expf(-x[i]));  // scalar tail
    };
    k.rms_norm = [](float* x, const float* w, int n, float eps) {
        float32x4_t sum4 = vdupq_n_f32(0);
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            sum4 = vmlaq_f32(sum4, v, v);
        }
        float ss = vaddvq_f32(sum4);
        for (; i < n; ++i) ss += x[i] * x[i];
        float scale = 1.0f / sqrtf(ss / n + eps);
        float32x4_t sc4 = vdupq_n_f32(scale);
        i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            float32x4_t wv = vld1q_f32(&w[i]);
            vst1q_f32(&x[i], vmulq_f32(vmulq_f32(v, sc4), wv));
        }
        for (; i < n; ++i) x[i] = x[i] * scale * w[i];  // scalar tail
    };
#endif

    return k;
}

// =============================================================================
// PERFORMANCE ESTIMATOR
// Estimate tok/s for a given model config on detected hardware
// =============================================================================

struct ModelProfile {
    size_t total_bytes;        // Total model size on disk
    int n_experts;             // Total MoE experts
    int n_active;              // Active experts per token
    int dim;                   // Hidden dimension
    int expert_ffn_dim;        // Expert FFN width
    int n_layers;              // Number of transformer layers
    int n_dense_layers;        // Dense (non-MoE) layers
    size_t shared_bytes;       // Non-expert weight bytes (attention, norms, embeddings)
    size_t expert_bytes_each;  // Bytes per single expert (gate+up+down)
};

struct PerfEstimate {
    float tok_per_sec;
    float prefill_sec;
    float mem_required_gb;
    float io_per_token_gb;
    std::string bottleneck;  // "compute", "memory_bandwidth", "io_bandwidth"
};

// Back-of-envelope throughput model: tokens/s is bytes-touched-per-token
// divided by the bandwidth of whichever tier (RAM or NVMe) the weights
// stream from.
inline PerfEstimate estimate_performance(const HWProfile& hw, const ModelProfile& mp) {
    PerfEstimate est;

    // Active bytes per token = shared weights + K active experts per MoE layer
    size_t active_bytes = mp.shared_bytes +
        (size_t)mp.n_active * mp.expert_bytes_each * (mp.n_layers - mp.n_dense_layers);
    est.io_per_token_gb = active_bytes / 1e9;

    // If model fits in RAM: bandwidth-bound.  If not: storage I/O bound.
    bool fits_ram = mp.total_bytes < hw.ram_bytes * 0.8;
    bool active_fits = active_bytes < hw.ram_bytes * 0.6;

    if (fits_ram) {
        // RAM bandwidth bound
        est.tok_per_sec = (hw.mem_bandwidth_gbps) / est.io_per_token_gb;
        est.bottleneck = "memory_bandwidth";
        est.mem_required_gb = mp.total_bytes / 1e9;
    } else if (active_fits) {
        // Expert-aware mmap: only active experts paged.
        // First token cold, subsequent warm from page cache.
        float nvme_gbps = 6.0f;  // Typical NVMe
        est.tok_per_sec = nvme_gbps / est.io_per_token_gb;
        est.bottleneck = "io_bandwidth_mmap";
        est.mem_required_gb = active_bytes / 1e9 * 1.5f;
    } else {
        // Cold: everything from storage
        float nvme_gbps = 6.0f;
        est.tok_per_sec = nvme_gbps / (mp.total_bytes / 1e9 / mp.n_layers);
        est.bottleneck = "io_bandwidth_cold";
        est.mem_required_gb = hw.ram_bytes / 1e9;
    }

    // NOTE(review): this is per-token latency (1/throughput), not a true
    // prompt-prefill estimate — confirm the intended semantics with callers.
    est.prefill_sec = 1.0f / est.tok_per_sec;
    return est;
}

// =============================================================================
// PRINT HARDWARE REPORT
// =============================================================================

// Dumps a human-readable summary of the detected profile to stdout.
inline void print_hw_report(const HWProfile& hw) {
    printf("=== INFERENCE-X v6 — HARDWARE PROFILE ===\n");
    printf(" Platform: %s\n", hw.name.c_str());
    printf(" Vendor: %s\n", hw.vendor.c_str());
    printf(" Cores/Thrds: %d / %d\n", hw.cores, hw.threads);
    printf(" RAM: %.1f GB\n", hw.ram_bytes / 1e9);
    printf(" Mem BW: %.1f GB/s (%d channels)\n", hw.mem_bandwidth_gbps, hw.mem_channels);
    printf(" Features: ");
    if (hw.has_avx512) printf("AVX-512 ");
    if (hw.has_avx2) printf("AVX2 ");
    if (hw.has_fma) printf("FMA ");
    if (hw.has_neon) printf("NEON ");
    if (hw.has_sve) printf("SVE ");
    if (hw.has_fp16) printf("FP16 ");
    if (hw.has_bf16) printf("BF16 ");
    if (hw.has_amx) printf("AMX ");
    printf("\n");
    printf(" TDP: %.0f W\n", hw.tdp_watts);
    printf("========================================\n");
}

} // namespace ix