// inference-x/runtime/backends.h (536 lines, 17 KiB, C++)
// NOTE: the four lines above were pasted viewer metadata, preserved here as a
// comment so this header remains compilable.
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Multi-Platform Backend Definitions (12 Backends)
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X Identity — removal violates BSL-1.1
#define IX_VERSION "6.0"
#define IX_AUTHOR_HASH 0x935E1DAD
#define IX_BUILD_SIGNATURE "Inference-X by Salka Elmadani — Morocco"
#include <algorithm>   // std::max (softmax kernel)
#include <cmath>       // expf, sqrtf (activation / norm kernels)
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <functional>
#include <memory>
#include <string>
#include <vector>
// Platform detection — compile-time architecture capabilities.
#if defined(__x86_64__) || defined(_M_X64)
#define IX_ARCH_X86_64 1
#ifdef __AVX512F__
#define IX_HAS_AVX512 1
#endif
#ifdef __AVX2__
#define IX_HAS_AVX2 1
#endif
#ifdef __FMA__
#define IX_HAS_FMA 1
#endif
#elif defined(__aarch64__) || defined(_M_ARM64)
#define IX_ARCH_ARM64 1
#include <arm_neon.h>
#define IX_HAS_NEON 1
#elif defined(__arm__)
#define IX_ARCH_ARM32 1
#elif defined(__riscv)
#define IX_ARCH_RISCV 1
#elif defined(__xtensa__)
#define IX_ARCH_XTENSA 1
#endif
// OS detection
#if defined(__linux__)
#define IX_OS_LINUX 1
#elif defined(__APPLE__)
#define IX_OS_APPLE 1
// TargetConditionals.h defines TARGET_OS_IPHONE; without this include the
// macro silently expands to 0 and every Apple build is classified as macOS.
#include <TargetConditionals.h>
#if TARGET_OS_IPHONE
#define IX_OS_IOS 1
#else
#define IX_OS_MACOS 1
#endif
#elif defined(__ANDROID__)
#define IX_OS_ANDROID 1
#elif defined(_WIN32)
#define IX_OS_WINDOWS 1
#endif
// Accelerator detection — IX_USE_* flags come from the build system.
#if defined(__CUDA_ARCH__) || defined(IX_USE_CUDA)
#define IX_HAS_CUDA 1
#endif
#if defined(IX_USE_ROCM)
#define IX_HAS_ROCM 1
#endif
#if defined(IX_USE_HEXAGON)
#define IX_HAS_HEXAGON 1
#endif
namespace ix {
// =============================================================================
// HARDWARE PROFILE — Auto-detected at runtime
// =============================================================================
/// Hardware targets Inference-X can dispatch to, grouped by device class.
/// NOTE: enumerator order fixes the underlying integer values — append new
/// targets before UNKNOWN rather than reordering.
enum class Platform {
// Desktop/Server CPU
X86_AVX512,
X86_AVX2,
X86_SSE42,
X86_GENERIC,
// ARM
ARM64_NEON, // Apple M-series, Snapdragon, Ampere
ARM32_NEON, // Raspberry Pi, older ARM
ARM64_SVE, // ARM SVE (Graviton3+, Neoverse)
// Mobile SoC
SNAPDRAGON, // Qualcomm (CPU + Hexagon DSP + Adreno GPU)
APPLE_SILICON, // Apple (CPU + Neural Engine + Metal GPU)
MEDIATEK, // Dimensity series
EXYNOS, // Samsung
// GPU
CUDA, // NVIDIA
ROCM, // AMD
METAL, // Apple
VULKAN, // Cross-platform
// Edge/Embedded
RISCV, // RISC-V boards
XTENSA, // ESP32-S3
CORTEX_M, // Arduino, STM32
// Cloud / Accelerator
TPU, // Google TPU (v4/v5)
INFERENTIA, // AWS Inferentia (NeuronCore)
GAUDI, // Intel Gaudi (Habana TPC)
CEREBRAS, // Cerebras WSE (850K cores)
GROQ, // Groq LPU (deterministic SRAM)
GRAPHCORE, // Graphcore IPU (BSP tiles)
SAMBANOVA, // SambaNova RDU (reconfigurable dataflow)
MAIA, // Microsoft Maia (Azure custom ASIC)
FPGA_XILINX, // Xilinx FPGA (Vitis HLS)
HEXAGON, // Qualcomm Hexagon DSP (standalone, not SoC)
UNKNOWN // Fallback when no detection path matched
};
/// Power envelope the runtime should respect when scheduling work.
enum class PowerMode {
MAX, // Full performance, no power limit
BALANCED, // Power/perf tradeoff
ECO, // Minimum power (mobile, edge)
ULTRA_ECO // Sub-1W targets (ESP32, Arduino)
};
/// Snapshot of the host hardware, produced by detect_hardware().
/// All fields default to "unknown/absent" (0 / false / UNKNOWN) and are only
/// filled where the current platform has a detection path; several fields
/// (caches, TOPS/TFLOPS) have no detection path in this header at all.
struct HWProfile {
Platform platform = Platform::UNKNOWN;
std::string name; // CPU/SoC model string (e.g. parsed from /proc/cpuinfo)
std::string vendor; // "AMD", "Intel", "Apple", "Qualcomm", ...
// CPU
int cores = 1; // physical cores (approximated as threads/2 on Linux x86)
int threads = 1; // logical CPUs
float freq_ghz = 0; // only set for fixed-silicon targets (e.g. ESP32-S3)
size_t cache_l2 = 0; // bytes — not populated by detect_hardware()
size_t cache_l3 = 0; // bytes — not populated by detect_hardware()
// Memory
size_t ram_bytes = 0; // total system RAM in bytes
size_t vram_bytes = 0; // dedicated accelerator memory — not populated here
int mem_channels = 1; // heuristic from CPU model string
float mem_bandwidth_gbps = 0; // estimated, not measured
// Capabilities
bool has_avx2 = false;
bool has_avx512 = false;
bool has_fma = false;
bool has_neon = false;
bool has_sve = false;
bool has_fp16 = false;
bool has_int8 = false;
bool has_bf16 = false;
bool has_tensor_cores = false;
bool has_amx = false;
// Power
float tdp_watts = 0; // only set for known fixed targets/accelerators
PowerMode power_mode = PowerMode::MAX;
// Theoretical peak — not populated by detect_hardware()
float tops = 0; // INT8 TOPS
float tflops_fp32 = 0; // FP32 TFLOPS
float tflops_fp16 = 0; // FP16 TFLOPS
};
// =============================================================================
// AUTO-DETECT HARDWARE
// =============================================================================
/// Detect the host hardware at runtime and return a populated HWProfile.
///
/// Strategy:
///   1. Compile-time arch/OS macros choose the base Platform.
///   2. On Linux/x86, /proc/cpuinfo and /proc/meminfo supply the CPU model,
///      logical CPU count, and total RAM.
///   3. Vendor heuristics from the model string estimate memory channels
///      and bandwidth.
///   4. Any IX_USE_* accelerator SDK flag overrides the CPU-detected
///      platform — the accelerator IS the target.
///
/// Fields with no detection path on the current platform keep their
/// HWProfile defaults (caches, VRAM, TOPS/TFLOPS are never probed here).
inline HWProfile detect_hardware() {
    HWProfile hw;
#if IX_ARCH_X86_64
#if IX_HAS_AVX512
    hw.platform = Platform::X86_AVX512;
    hw.has_avx512 = true;
#elif IX_HAS_AVX2
    hw.platform = Platform::X86_AVX2;
#else
    hw.platform = Platform::X86_GENERIC;
#endif
    // NOTE(review): AVX2/FMA are assumed as the x86-64 baseline even when the
    // compiler did not target them — confirm callers treat this as a hint only.
    hw.has_avx2 = true;
    hw.has_fma = true;
#if IX_OS_LINUX
    {
        // Single pass over /proc/cpuinfo: capture the first "model name" and
        // count the "processor" entries (logical CPUs) in the same read.
        FILE* f = fopen("/proc/cpuinfo", "r");
        if (f) {
            char line[256];
            int logical = 0;
            while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "processor", 9) == 0) {
                    ++logical;
                } else if (hw.name.empty() && strncmp(line, "model name", 10) == 0) {
                    char* colon = strchr(line, ':');
                    if (colon) {
                        hw.name = std::string(colon + 2);
                        if (!hw.name.empty() && hw.name.back() == '\n')
                            hw.name.pop_back();
                    }
                }
            }
            fclose(f);
            if (logical > 0) {
                hw.threads = logical;
                // Approximate physical cores assuming 2-way SMT, but never
                // report 0 cores on a single-CPU machine (1/2 truncates to 0).
                hw.cores = (logical >= 2) ? logical / 2 : 1;
            }
        }
        // Total RAM from /proc/meminfo; "MemTotal:" is reported in kB.
        f = fopen("/proc/meminfo", "r");
        if (f) {
            char line[256];
            while (fgets(line, sizeof(line), f)) {
                if (strncmp(line, "MemTotal:", 9) == 0) {
                    unsigned long kb = 0;
                    if (sscanf(line, "MemTotal: %lu kB", &kb) == 1)
                        hw.ram_bytes = kb * 1024ULL;
                    break;
                }
            }
            fclose(f);
        }
    }
#endif
    // Vendor + memory-channel heuristics from the CPU model string.
    if (hw.name.find("AMD") != std::string::npos) {
        hw.vendor = "AMD";
        // Server/HEDT parts expose 8 channels; consumer Ryzen is dual-channel.
        if (hw.name.find("EPYC") != std::string::npos ||
            hw.name.find("Threadripper") != std::string::npos)
            hw.mem_channels = 8;
        else
            hw.mem_channels = 2;
    } else if (hw.name.find("Intel") != std::string::npos) {
        hw.vendor = "Intel";
        hw.mem_channels = (hw.name.find("Xeon") != std::string::npos) ? 8 : 2;
    }
    // Bandwidth estimate: ~38 GB/s per DDR5 channel (DDR4 would be ~25).
    hw.mem_bandwidth_gbps = hw.mem_channels * 38.0f;
#elif IX_ARCH_ARM64
    hw.platform = Platform::ARM64_NEON;
    hw.has_neon = true;
    hw.has_fp16 = true;
#if IX_OS_APPLE
    hw.platform = Platform::APPLE_SILICON;
    hw.vendor = "Apple";
    hw.name = "Apple Silicon";
    hw.mem_bandwidth_gbps = 200.0f; // M-series unified memory estimate
#elif IX_OS_ANDROID
    hw.platform = Platform::SNAPDRAGON;
    hw.vendor = "Qualcomm";
    hw.name = "Snapdragon";
#else
    hw.vendor = "ARM";
#endif
#elif IX_ARCH_XTENSA
    // ESP32-S3: fixed, well-known silicon — hard-code the profile.
    hw.platform = Platform::XTENSA;
    hw.vendor = "Espressif";
    hw.name = "ESP32-S3";
    hw.cores = 2;
    hw.threads = 2;
    hw.freq_ghz = 0.24f;
    hw.ram_bytes = 8ULL * 1024 * 1024; // 8 MB (typical S3 PSRAM — verify per board)
    hw.tdp_watts = 0.5f;
    hw.power_mode = PowerMode::ULTRA_ECO;
#elif IX_ARCH_RISCV
    hw.platform = Platform::RISCV;
    hw.vendor = "RISC-V";
#endif
    // ─── ACCELERATOR SDK OVERRIDE ────────────────────────────────────────
    // When the build detects an accelerator SDK (IX_USE_*), override the
    // CPU-detected platform. The accelerator IS the target.
    // ─────────────────────────────────────────────────────────────────────
#ifdef IX_USE_CEREBRAS
    hw.platform = Platform::CEREBRAS;
    hw.vendor = "Cerebras"; hw.name = "WSE";
    hw.cores = 850000; hw.tdp_watts = 20000;
#endif
#ifdef IX_USE_GROQ
    hw.platform = Platform::GROQ;
    hw.vendor = "Groq"; hw.name = "LPU";
    hw.tdp_watts = 300;
#endif
#ifdef IX_USE_GAUDI
    hw.platform = Platform::GAUDI;
    hw.vendor = "Intel"; hw.name = "Gaudi";
#endif
#ifdef IX_USE_INFERENTIA
    hw.platform = Platform::INFERENTIA;
    hw.vendor = "AWS"; hw.name = "Inferentia";
#endif
#ifdef IX_USE_GRAPHCORE
    hw.platform = Platform::GRAPHCORE;
    hw.vendor = "Graphcore"; hw.name = "IPU";
#endif
#ifdef IX_USE_SAMBANOVA
    hw.platform = Platform::SAMBANOVA;
    hw.vendor = "SambaNova"; hw.name = "RDU";
#endif
#ifdef IX_USE_MAIA
    hw.platform = Platform::MAIA;
    hw.vendor = "Microsoft"; hw.name = "Maia";
#endif
#ifdef IX_USE_FPGA_XILINX
    hw.platform = Platform::FPGA_XILINX;
    hw.vendor = "Xilinx"; hw.name = "FPGA";
#endif
#ifdef IX_USE_HEXAGON
    hw.platform = Platform::HEXAGON;
    hw.vendor = "Qualcomm"; hw.name = "Hexagon DSP";
#endif
    return hw;
}
// =============================================================================
// COMPUTE KERNEL DISPATCH — Platform-optimal implementations
// =============================================================================
/// Bundle of platform-tuned primitive kernels selected by
/// get_optimal_kernels(). All kernels operate in place on float buffers.
struct ComputeKernels {
// Vector multiply-add: out[i] += a[i] * b[i]   (args: out, a, b, n)
std::function<void(float*, const float*, const float*, int)> vec_fma;
// SiLU activation, in place                    (args: x, n)
std::function<void(float*, int)> silu;
// RMS Norm, in place                           (args: x, weight, n, eps)
std::function<void(float*, const float*, int, float)> rms_norm;
// GEMV: out = mat @ vec (quantized mat). Not assigned by
// get_optimal_kernels(); presumably installed by the GEMM layer — verify.
std::function<void(float*, const void*, int, const float*, int, int, int)> gemv_q;
// Softmax, in place                            (args: x, n)
std::function<void(float*, int)> softmax;
};
/// Return the best available ComputeKernels for platform `p`.
///
/// Portable scalar fallbacks are installed first so every field (except
/// gemv_q, which this function never assigns) is callable everywhere; SIMD
/// paths compiled for the current architecture then overwrite the fields
/// they accelerate.
inline ComputeKernels get_optimal_kernels(Platform p) {
    (void)p; // dispatch is currently compile-time (#if); p reserved for runtime selection
    ComputeKernels k;
    // --- Portable scalar implementations (correct everywhere) ---
    k.vec_fma = [](float* out, const float* a, const float* b, int n) {
        for (int i = 0; i < n; ++i) out[i] += a[i] * b[i];
    };
    k.silu = [](float* x, int n) {
        // SiLU(x) = x * sigmoid(x) = x / (1 + e^-x)
        for (int i = 0; i < n; ++i) x[i] = x[i] / (1.0f + expf(-x[i]));
    };
    k.rms_norm = [](float* x, const float* w, int n, float eps) {
        if (n <= 0) return; // guard: mean-of-squares below divides by n
        float ss = 0;
        for (int i = 0; i < n; ++i) ss += x[i] * x[i];
        ss = 1.0f / sqrtf(ss / n + eps);
        for (int i = 0; i < n; ++i) x[i] = x[i] * ss * w[i];
    };
    k.softmax = [](float* x, int n) {
        if (n <= 0) return; // guard: x[0] read below requires n >= 1
        // Subtract the max before exponentiating for numerical stability.
        float mx = x[0];
        for (int i = 1; i < n; ++i) mx = std::max(mx, x[i]);
        float sum = 0;
        for (int i = 0; i < n; ++i) { x[i] = expf(x[i] - mx); sum += x[i]; }
        for (int i = 0; i < n; ++i) x[i] /= sum;
    };
#if IX_ARCH_X86_64 && IX_HAS_AVX2
    // AVX2 kernels live in kernels.h / gemm.h (v6 path) — nothing to
    // override here.
#endif
#if IX_ARCH_ARM64 && defined(IX_HAS_NEON)
    // --- NEON overrides ---
    k.silu = [](float* x, int n) {
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            float32x4_t neg = vnegq_f32(v);
            // expf has no NEON intrinsic: spill lanes, exponentiate with
            // scalar expf, reload.
            float tmp[4];
            vst1q_f32(tmp, neg);
            for (int j = 0; j < 4; ++j) tmp[j] = expf(tmp[j]);
            float32x4_t exp_neg = vld1q_f32(tmp);
            float32x4_t denom = vaddq_f32(vdupq_n_f32(1.0f), exp_neg);
            vst1q_f32(&x[i], vdivq_f32(v, denom));
        }
        for (; i < n; ++i) x[i] = x[i] / (1.0f + expf(-x[i])); // scalar tail
    };
    k.rms_norm = [](float* x, const float* w, int n, float eps) {
        if (n <= 0) return; // guard: divides by n below
        float32x4_t sum4 = vdupq_n_f32(0);
        int i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            sum4 = vmlaq_f32(sum4, v, v); // sum4 += v * v
        }
        float ss = vaddvq_f32(sum4); // horizontal add of the 4 lanes
        for (; i < n; ++i) ss += x[i] * x[i]; // scalar tail of the sum
        float scale = 1.0f / sqrtf(ss / n + eps);
        float32x4_t sc4 = vdupq_n_f32(scale);
        i = 0;
        for (; i + 4 <= n; i += 4) {
            float32x4_t v = vld1q_f32(&x[i]);
            float32x4_t wv = vld1q_f32(&w[i]);
            vst1q_f32(&x[i], vmulq_f32(vmulq_f32(v, sc4), wv));
        }
        for (; i < n; ++i) x[i] = x[i] * scale * w[i]; // scalar tail of scaling
    };
#endif
    return k;
}
// =============================================================================
// PERFORMANCE ESTIMATOR
// Estimate tok/s for a given model config on detected hardware
// =============================================================================
/// Static description of a (possibly MoE) transformer model, used as input
/// to estimate_performance(). All byte counts are raw on-disk sizes.
struct ModelProfile {
size_t total_bytes; // Total model size on disk
int n_experts; // Total MoE experts
int n_active; // Active experts per token (top-K)
int dim; // Hidden dimension
int expert_ffn_dim; // Expert FFN width
int n_layers; // Number of transformer layers
int n_dense_layers; // Dense (non-MoE) layers; assumed <= n_layers
size_t shared_bytes; // Non-expert weight bytes (attention, norms, embeddings)
size_t expert_bytes_each; // Bytes per single expert (gate+up+down)
};
/// Output of estimate_performance(): a rough analytical throughput model,
/// not a measurement.
struct PerfEstimate {
float tok_per_sec; // estimated decode throughput
float prefill_sec; // estimated latency (modeled as one token's cost)
float mem_required_gb; // RAM the chosen execution mode needs
float io_per_token_gb; // bytes moved per generated token, in GB
std::string bottleneck; // "compute", "memory_bandwidth", "io_bandwidth*"
};
/// Estimate decode throughput for model `mp` on hardware `hw`.
///
/// Model: tok/s ≈ available_bandwidth / bytes_moved_per_token, where the
/// binding bandwidth depends on whether the full model (or at least its
/// active working set) fits in RAM. Degenerate inputs (zero-sized model,
/// zero layers) no longer produce inf/NaN.
inline PerfEstimate estimate_performance(const HWProfile& hw, const ModelProfile& mp) {
    PerfEstimate est;
    // Clamp the MoE layer count so a bad config (n_dense_layers > n_layers)
    // cannot go negative and wrap to a huge unsigned byte count.
    const int moe_layers =
        (mp.n_layers > mp.n_dense_layers) ? mp.n_layers - mp.n_dense_layers : 0;
    // Bytes touched per generated token = shared weights + K active experts
    // in every MoE layer.
    const size_t active_bytes = mp.shared_bytes +
        (size_t)mp.n_active * mp.expert_bytes_each * (size_t)moe_layers;
    est.io_per_token_gb = active_bytes / 1e9;
    // Guard every division below: an empty model would otherwise yield inf.
    const float per_tok_gb = (est.io_per_token_gb > 0) ? est.io_per_token_gb : 1e-9f;
    const bool fits_ram = mp.total_bytes < hw.ram_bytes * 0.8;
    const bool active_fits = active_bytes < hw.ram_bytes * 0.6;
    if (fits_ram) {
        // Whole model resident: RAM bandwidth bound.
        est.tok_per_sec = hw.mem_bandwidth_gbps / per_tok_gb;
        est.bottleneck = "memory_bandwidth";
        est.mem_required_gb = mp.total_bytes / 1e9;
    } else if (active_fits) {
        // Expert-aware mmap: only active experts are paged in. First token is
        // cold; subsequent tokens hit the page cache.
        const float nvme_gbps = 6.0f; // typical NVMe sequential read
        est.tok_per_sec = nvme_gbps / per_tok_gb;
        est.bottleneck = "io_bandwidth_mmap";
        est.mem_required_gb = active_bytes / 1e9 * 1.5f; // 1.5x page-cache slack
    } else {
        // Cold path: stream from storage every token.
        // NOTE(review): dividing total bytes by n_layers assumes layer-wise
        // streaming overlap — confirm against the actual I/O scheduler.
        const float nvme_gbps = 6.0f;
        float gb_per_step = (mp.n_layers > 0)
            ? (float)(mp.total_bytes / 1e9 / mp.n_layers)
            : per_tok_gb;
        if (gb_per_step <= 0) gb_per_step = 1e-9f;
        est.tok_per_sec = nvme_gbps / gb_per_step;
        est.bottleneck = "io_bandwidth_cold";
        est.mem_required_gb = hw.ram_bytes / 1e9;
    }
    // NOTE(review): prefill is modeled as a single token's latency; real
    // prefill scales with prompt length.
    est.prefill_sec = (est.tok_per_sec > 0) ? 1.0f / est.tok_per_sec : 0.0f;
    return est;
}
// =============================================================================
// PRINT HARDWARE REPORT
// =============================================================================
inline void print_hw_report(const HWProfile& hw) {
printf("=== INFERENCE-X v6 — HARDWARE PROFILE ===\n");
printf(" Platform: %s\n", hw.name.c_str());
printf(" Vendor: %s\n", hw.vendor.c_str());
printf(" Cores/Thrds: %d / %d\n", hw.cores, hw.threads);
printf(" RAM: %.1f GB\n", hw.ram_bytes / 1e9);
printf(" Mem BW: %.1f GB/s (%d channels)\n",
hw.mem_bandwidth_gbps, hw.mem_channels);
printf(" Features: ");
if (hw.has_avx512) printf("AVX-512 ");
if (hw.has_avx2) printf("AVX2 ");
if (hw.has_fma) printf("FMA ");
if (hw.has_neon) printf("NEON ");
if (hw.has_sve) printf("SVE ");
if (hw.has_fp16) printf("FP16 ");
if (hw.has_bf16) printf("BF16 ");
if (hw.has_amx) printf("AMX ");
printf("\n");
printf(" TDP: %.0f W\n", hw.tdp_watts);
printf("========================================\n");
}
} // namespace ix