inference-x/core/z_core.h
2026-02-25 02:56:49 +00:00

534 lines
21 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Z-Core Mathematical Foundation
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
#include <cstdint>
#include <cstddef>
#include <cstdio>   // printf (Config::print)
#include <cstdlib>  // posix_memalign / free (aligned_alloc)
#include <cstring>
#include <cmath>
#include <algorithm>
#include <string>
#include <vector>
#include <unordered_map>
#ifdef __AVX2__
#include <immintrin.h>
#endif
namespace ix {
// ═══════════════════════════════════════════════════════════════════════════════
// WATERMARK — SALKA ELMADANI SIGNATURE (Ne pas modifier)
// ═══════════════════════════════════════════════════════════════════════════════
namespace signature {
// Integrity coefficients (author watermark — do not modify).
static constexpr double S0 = 5.999160064733103e+18; // coefficient α
static constexpr double S1 = 5.566805661683622e+18; // coefficient β
static constexpr double S2 = 5.426309097159753e+18; // coefficient γ
static constexpr double S3 = 4.991471925827590e+18; // coefficient δ
// Returns true when the watermark constants are intact.
// `volatile` keeps the sum from being constant-folded away.
inline bool verify() {
    volatile double total = S0 + S1 + S2 + S3;
    return total > 2.0e19;
}
// Identity transform routed through a watermark constant: (probe - probe)
// is always exactly 0.0, so the input value is returned unchanged while the
// constant stays live in the binary.
inline float inject(float x) {
    volatile double probe = S0 * 1e-40;
    const double delta = probe - probe;
    return x * (1.0f + static_cast<float>(delta));
}
}
// ═══════════════════════════════════════════════════════════════════════════════
// ═══════════════════════════════════════════════════════════════════════════════
// ═══════════════════════════════════════════════════════════════════════════════
// HALF PRECISION TYPES
// ═══════════════════════════════════════════════════════════════════════════════
// IEEE 754 binary16 stored as raw bits, with float conversions.
struct f16 {
    uint16_t bits;
    f16() : bits(0) {}
    // float -> f16.
    // Fixes over the previous version:
    //  * NaN inputs stay NaN (the old e>=31 path dropped the mantissa and
    //    collapsed every NaN into +/-Inf);
    //  * mantissa is rounded to nearest instead of truncated (the rounding
    //    add carries into the exponent, so values that round past the max
    //    finite half correctly become Inf).
    // Subnormal halves are still flushed to signed zero on encode, matching
    // the previous behavior.
    f16(float f) {
        uint32_t u; std::memcpy(&u, &f, 4);
        uint32_t s = (u >> 16) & 0x8000;
        uint32_t biased = (u >> 23) & 0xFF;
        uint32_t m = u & 0x7FFFFF;
        if (biased == 0xFF) {
            // Inf or NaN: keep a mantissa bit set for NaN so it decodes as NaN.
            bits = static_cast<uint16_t>(s | 0x7C00 | (m ? 0x200 : 0));
            return;
        }
        int e = static_cast<int>(biased) - 127 + 15;
        if (e <= 0) { bits = static_cast<uint16_t>(s); return; }           // underflow -> signed zero
        if (e >= 31) { bits = static_cast<uint16_t>(s | 0x7C00); return; } // overflow -> Inf
        uint16_t h = static_cast<uint16_t>(s | (static_cast<uint32_t>(e) << 10) | (m >> 13));
        bits = static_cast<uint16_t>(h + ((m >> 12) & 1)); // round to nearest (ties up)
    }
    // f16 -> float: exact. Handles subnormals, +/-zero, Inf and NaN.
    operator float() const {
        uint32_t s = (bits & 0x8000) << 16;
        uint32_t e = (bits >> 10) & 0x1F;
        uint32_t m = bits & 0x3FF;
        uint32_t u;
        if (e == 0) {
            if (m) {
                // Subnormal half: renormalize by shifting until the implicit bit appears.
                int sh = 0;
                while (!(m & 0x400)) { m <<= 1; sh++; }
                m &= 0x3FF;
                u = s | ((113 - sh) << 23) | (m << 13);
            } else {
                u = s; // +/- zero
            }
        } else if (e == 31) {
            u = s | 0x7F800000 | (m << 13); // Inf / NaN
        } else {
            u = s | ((e - 15 + 127) << 23) | (m << 13);
        }
        float f; std::memcpy(&f, &u, 4);
        return f;
    }
    // Reinterpret a raw 16-bit pattern as f16 (no conversion).
    static f16 from_bits(uint16_t b) { f16 h; h.bits = b; return h; }
};
// Feed-forward (MLP) activation function, selected per architecture.
enum class Activation {
SILU, // x * sigmoid(x) — Llama, Qwen, DeepSeek, Mistral
GELU, // Gaussian Error Linear Unit — Phi, Gemma, StarCoder
GELU_QUICK, // fast GELU approximation: x * sigmoid(1.702 * x)
RELU_SQ, // squared ReLU: max(0, x)^2
};
// bfloat16 (top 16 bits of an IEEE 754 float) stored as raw bits.
struct bf16 {
    uint16_t bits;
    bf16() : bits(0) {}
    // float -> bf16 with round-to-nearest-even.
    // Fixes over the previous truncating version:
    //  * truncation lost up to 1 ulp of precision;
    //  * a NaN whose payload lived only in the low 16 bits truncated to
    //    0x7F80/0xFF80, i.e. silently became +/-Inf.
    bf16(float f) {
        uint32_t u; std::memcpy(&u, &f, 4);
        if ((u & 0x7FFFFFFFu) > 0x7F800000u) {
            // NaN: force a payload bit that survives the 16-bit truncation.
            bits = static_cast<uint16_t>((u >> 16) | 0x0040);
        } else {
            u += 0x7FFFu + ((u >> 16) & 1u); // round-to-nearest-even
            bits = static_cast<uint16_t>(u >> 16);
        }
    }
    // bf16 -> float: exact (zero-extend the low mantissa bits).
    operator float() const { uint32_t u = static_cast<uint32_t>(bits) << 16; float f; std::memcpy(&f, &u, 4); return f; }
};
// ═══════════════════════════════════════════════════════════════════════════════
// TENSOR TYPE ENUM — Extended for IQ formats (Kimi K2.5)
// ═══════════════════════════════════════════════════════════════════════════════
// Tensor element types. Numeric values follow the GGML/GGUF type ids, so
// tensor type fields read from GGUF files can be used directly without
// remapping. Do not renumber.
enum class dtype : uint32_t {
F32 = 0,
F16 = 1,
Q4_0 = 2,
Q4_1 = 3,
// ids 4, 5 reserved (removed Q4_2/Q4_3 in GGML)
Q5_0 = 6,
Q5_1 = 7,
Q8_0 = 8,
Q8_1 = 9,
Q2_K = 10,
Q3_K = 11,
Q4_K = 12,
Q5_K = 13,
Q6_K = 14,
Q8_K = 15,
// === IQ FORMATS — Critical for Kimi K2.5 1.8-bit quant ===
IQ2_XXS = 16,
IQ2_XS = 17,
IQ3_XXS = 18,
IQ1_S = 19,
IQ4_NL = 20,
IQ3_S = 21, // GGML id 21
IQ2_S = 22, // GGML id 22
IQ4_XS = 23, // GGML id 23
I8 = 24, // plain integer types, GGML ids 24..27
I16 = 25,
I32 = 26,
I64 = 27,
F64 = 28,
IQ1_M = 29,
BF16 = 30,
Q4_0_4x4 = 31, // repacked Q4_0 variants for tiled kernels
Q4_0_4x8 = 32,
Q4_0_8x8 = 33,
TQ1_0 = 34, // ternary formats
TQ2_0 = 35,
};
// ═══════════════════════════════════════════════════════════════════════════════
// QUANTIZATION BLOCK DEFINITIONS
// ═══════════════════════════════════════════════════════════════════════════════
// Elements per quantization block: K-quants and IQ formats pack 256 weights
// per super-block; the legacy Q4/Q5/Q8 formats pack 32.
// All declared `static constexpr` — the original mixed `static constexpr`
// and bare `constexpr`, which was inconsistent (same linkage, two styles).
static constexpr int QK_K  = 256;
static constexpr int QK4_0 = 32;
static constexpr int QK4_1 = 32;
static constexpr int QK5_0 = 32;
static constexpr int QK5_1 = 32;
static constexpr int QK8_0 = 32;
static constexpr int QK8_1 = 32;
// Standard blocks
struct block_q4_K {
f16 d; f16 dmin;
uint8_t scales[12];
uint8_t qs[QK_K / 2];
};
struct block_q8_0 {
f16 d;
int8_t qs[32];
};
struct block_q6_K {
uint8_t ql[QK_K / 2];
uint8_t qh[QK_K / 4];
int8_t scales[QK_K / 16];
f16 d;
};
struct block_q4_0 {
f16 d;
uint8_t qs[QK4_0 / 2];
};
struct block_q2_K {
uint8_t scales[QK_K / 16];
uint8_t qs[QK_K / 4];
f16 d; f16 dmin;
};
struct block_q5_K {
f16 d; f16 dmin;
uint8_t scales[12];
uint8_t qh[QK_K / 8];
uint8_t qs[QK_K / 2];
};
struct block_q3_K {
uint8_t hmask[QK_K / 8];
uint8_t qs[QK_K / 4];
uint8_t scales[12];
f16 d;
};
struct block_q4_1 {
f16 d; f16 m;
uint8_t qs[QK4_1 / 2];
};
struct block_q5_0 {
f16 d;
uint8_t qh[4];
uint8_t qs[QK5_0 / 2];
};
struct block_q5_1 {
f16 d; f16 m;
uint8_t qh[4];
uint8_t qs[QK5_1 / 2];
};
struct block_q8_1 {
float d;
float s;
int8_t qs[QK8_1];
};
// STATIC ASSERT: Block sizes must match GGUF binary format exactly
static_assert(sizeof(block_q4_K) == 144, "block_q4_K size mismatch!");
static_assert(sizeof(block_q8_0) == 34, "block_q8_0 size mismatch!");
static_assert(sizeof(block_q6_K) == 210, "block_q6_K size mismatch!");
static_assert(sizeof(block_q2_K) == 84, "block_q2_K size mismatch!");
static_assert(sizeof(block_q5_K) == 176, "block_q5_K size mismatch!");
static_assert(sizeof(block_q3_K) == 110, "block_q3_K size mismatch!");
static_assert(sizeof(block_q4_0) == 18, "block_q4_0 size mismatch!");
// === IQ BLOCKS — for Kimi K2.5 ultra-low-bit experts ===
// IQ1_S: ~1.56 bits/weight (256 weights per block)
struct block_iq1_s {
f16 d;
uint8_t qs[QK_K / 8];
uint16_t qh[QK_K / 32];
};
// IQ2_XXS: ~2.06 bits/weight (256 weights per block)
struct block_iq2_xxs {
f16 d;
uint16_t qs[QK_K / 8];
};
// IQ2_XS: ~2.31 bits/weight
struct block_iq2_xs {
f16 d;
uint16_t qs[QK_K / 8];
uint8_t scales[QK_K / 32];
};
// IQ2_S: ~2.5 bits/weight
struct block_iq2_s {
f16 d;
uint8_t qs[QK_K / 4];
uint8_t qh[QK_K / 32];
uint8_t scales[QK_K / 32];
};
// IQ3_XXS: ~3.06 bits/weight
struct block_iq3_xxs {
f16 d;
uint8_t qs[3 * QK_K / 8];
};
// IQ3_S: ~3.44 bits/weight
struct block_iq3_s {
f16 d;
uint8_t qs[QK_K / 4];
uint8_t qh[QK_K / 32];
uint8_t signs[QK_K / 8];
uint8_t scales[QK_K / 64];
};
// IQ4_NL: ~4.5 bits/weight (non-linear quantization)
struct block_iq4_nl {
f16 d;
uint8_t qs[QK4_0 / 2];
};
// IQ4_XS: ~4.25 bits/weight
struct block_iq4_xs {
f16 d;
uint16_t scales_h;
uint8_t scales_l[QK_K / 64];
uint8_t qs[QK_K / 2];
};
// TQ1_0: ternary 1.69 bits/weight
struct block_tq1_0 {
uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 48 bytes: 5 trits per byte (base-3)
uint8_t qh[QK_K / 64]; // 4 bytes: 4 trits per byte (2-bit)
f16 d;
};
// TQ2_0: ternary 2 bits/weight
struct block_tq2_0 {
uint8_t qs[QK_K / 4];
f16 d;
};
// ═══════════════════════════════════════════════════════════════════════════════
// DTYPE UTILITIES
// ═══════════════════════════════════════════════════════════════════════════════
// Bytes per block for type `t` — PACKED sizes (no alignment padding), must
// match the GGUF on-disk layout. Divide by dtype_block_size(t) elements to
// get bytes-per-element.
// Fix: Q4_1 / Q5_0 / Q5_1 / Q8_1 previously fell through to `default: return 1`
// even though their block structs are defined above and dtype_block_size()
// reports 32-element blocks for them — any size computation for those types
// was wildly wrong.
inline size_t dtype_size(dtype t) {
switch (t) {
case dtype::F32: return 4;
case dtype::F16: return 2;
case dtype::BF16: return 2;
// 32-element legacy blocks
case dtype::Q4_0: return 2 + 16; // 18 per 32
case dtype::Q4_1: return 2 + 2 + 16; // 20 per 32 (was missing)
case dtype::Q5_0: return 2 + 4 + 16; // 22 per 32 (was missing)
case dtype::Q5_1: return 2 + 2 + 4 + 16; // 24 per 32 (was missing)
case dtype::Q8_0: return 2 + 32; // 34 per 32
case dtype::Q8_1: return 4 + 4 + 32; // 40 per 32, matches block_q8_1 above (was missing)
// 256-element K super-blocks
case dtype::Q2_K: return 2 + 2 + QK_K/16 + QK_K/4; // 84 per 256
case dtype::Q3_K: return 2 + QK_K/4 + QK_K/8 + 12; // 110 per 256
case dtype::Q4_K: return 2 + 2 + 12 + QK_K/2; // 144 per 256
case dtype::Q5_K: return 2 + 2 + 12 + QK_K/2 + QK_K/8; // 176 per 256
case dtype::Q6_K: return 2 + QK_K/2 + QK_K/4 + QK_K/16; // 210 per 256
case dtype::Q8_K: return 4 + QK_K + 2*(QK_K/16); // 292 per 256 (GGML: float d + qs + int16 bsums; was missing)
// IQ / ternary super-blocks
case dtype::IQ1_S: return 2 + QK_K/8 + QK_K/16; // 50 per 256
case dtype::IQ1_M: return QK_K/8 + QK_K/16 + QK_K/32; // 56 per 256 (GGML: scale packed in `scales`; was missing)
case dtype::IQ2_XXS: return 2 + QK_K/4; // 66 per 256
case dtype::IQ2_XS: return 2 + QK_K/4 + QK_K/32; // 74 per 256
case dtype::IQ2_S: return 2 + QK_K/4 + QK_K/16; // 82 per 256
case dtype::IQ3_XXS: return 2 + 3*QK_K/8; // 98 per 256
case dtype::IQ3_S: return 2 + QK_K/4 + QK_K/8 + QK_K/32 + 4; // 110 per 256
case dtype::IQ4_NL: return 2 + 16; // 18 per 32
case dtype::IQ4_XS: return 2 + 2 + QK_K/64 + QK_K/2; // 136 per 256
case dtype::TQ1_0: return 2 + 4*13; // 54 per 256
case dtype::TQ2_0: return 2 + QK_K/4; // 66 per 256
// Plain scalar types
case dtype::I8: return 1;
case dtype::I16: return 2;
case dtype::I32: return 4;
case dtype::I64: return 8;
case dtype::F64: return 8;
// NOTE(review): Q4_0_4x4 / Q4_0_4x8 / Q4_0_8x8 (repacked) still fall through
// here — add them before enabling those formats.
default: return 1;
}
}
// Number of weights packed into one quantization block of type `t`.
// Super-block (K / IQ / ternary) formats hold 256 elements, the legacy
// formats hold 32, and scalar types are unblocked (1).
inline int dtype_block_size(dtype t) {
    switch (t) {
    // 256-element super-blocks
    case dtype::Q2_K:    case dtype::Q3_K:
    case dtype::Q4_K:    case dtype::Q5_K:
    case dtype::Q6_K:    case dtype::Q8_K:
    case dtype::IQ1_S:   case dtype::IQ1_M:
    case dtype::IQ2_XXS: case dtype::IQ2_XS: case dtype::IQ2_S:
    case dtype::IQ3_XXS: case dtype::IQ3_S:
    case dtype::IQ4_XS:
    case dtype::TQ1_0:   case dtype::TQ2_0:
        return QK_K;
    // 32-element legacy blocks
    case dtype::Q4_0: case dtype::Q4_1:
    case dtype::Q5_0: case dtype::Q5_1:
    case dtype::Q8_0: case dtype::Q8_1:
    case dtype::IQ4_NL:
        return 32;
    // Scalar types: no blocking
    default:
        return 1;
    }
}
// Human-readable name for type `t` (for logs / model dumps).
// Fix: many defined enum values (Q4_1, Q5_0, Q5_1, Q8_1, Q8_K, IQ1_M, the
// plain integer/float types and the repacked Q4_0 variants) previously
// printed "UNKNOWN".
inline const char* dtype_name(dtype t) {
switch (t) {
case dtype::F32: return "F32";
case dtype::F16: return "F16";
case dtype::BF16: return "BF16";
case dtype::F64: return "F64";
case dtype::I8: return "I8";
case dtype::I16: return "I16";
case dtype::I32: return "I32";
case dtype::I64: return "I64";
case dtype::Q4_0: return "Q4_0";
case dtype::Q4_1: return "Q4_1";
case dtype::Q5_0: return "Q5_0";
case dtype::Q5_1: return "Q5_1";
case dtype::Q8_0: return "Q8_0";
case dtype::Q8_1: return "Q8_1";
case dtype::Q2_K: return "Q2_K";
case dtype::Q3_K: return "Q3_K";
case dtype::Q4_K: return "Q4_K";
case dtype::Q5_K: return "Q5_K";
case dtype::Q6_K: return "Q6_K";
case dtype::Q8_K: return "Q8_K";
case dtype::IQ1_S: return "IQ1_S";
case dtype::IQ1_M: return "IQ1_M";
case dtype::IQ2_XXS: return "IQ2_XXS";
case dtype::IQ2_XS: return "IQ2_XS";
case dtype::IQ2_S: return "IQ2_S";
case dtype::IQ3_XXS: return "IQ3_XXS";
case dtype::IQ3_S: return "IQ3_S";
case dtype::IQ4_NL: return "IQ4_NL";
case dtype::IQ4_XS: return "IQ4_XS";
case dtype::Q4_0_4x4: return "Q4_0_4x4";
case dtype::Q4_0_4x8: return "Q4_0_4x8";
case dtype::Q4_0_8x8: return "Q4_0_8x8";
case dtype::TQ1_0: return "TQ1_0";
case dtype::TQ2_0: return "TQ2_0";
default: return "UNKNOWN";
}
}
// ═══════════════════════════════════════════════════════════════════════════════
// MEMORY — Aligned allocation
// ═══════════════════════════════════════════════════════════════════════════════
static constexpr size_t CACHE_LINE = 64;
// Allocates `size` bytes aligned to CACHE_LINE; returns nullptr on failure.
// Fix: the posix_memalign() return value was ignored — on failure `ptr` is
// unspecified by POSIX, so the old code could hand back garbage. Requires
// <cstdlib> (POSIX; not available on MSVC — use _aligned_malloc there).
inline void* aligned_alloc(size_t size) {
    void* ptr = nullptr;
    if (posix_memalign(&ptr, CACHE_LINE, size) != 0) return nullptr;
    return ptr;
}
// Releases memory obtained from aligned_alloc().
inline void aligned_free(void* ptr) { free(ptr); }
// Minimal non-owning view over a contiguous array (std::span-like, but this
// header targets pre-C++20 toolchains). Does not own or free the memory.
// Generalized: begin()/end() added so span works with range-for and
// <algorithm>; empty()/front()/back() added for convenience. All existing
// members are unchanged.
template<typename T>
struct span {
    T* data_; size_t size_;
    span() : data_(nullptr), size_(0) {}
    span(T* d, size_t n) : data_(d), size_(n) {}
    T* data() { return data_; }
    const T* data() const { return data_; }
    size_t size() const { return size_; }
    bool empty() const { return size_ == 0; }
    T& operator[](size_t i) { return data_[i]; }
    const T& operator[](size_t i) const { return data_[i]; }
    T* begin() { return data_; }
    T* end() { return data_ + size_; }
    const T* begin() const { return data_; }
    const T* end() const { return data_ + size_; }
    T& front() { return data_[0]; }              // precondition: !empty()
    T& back() { return data_[size_ - 1]; }       // precondition: !empty()
};
// ═══════════════════════════════════════════════════════════════════════════════
// ARCHITECTURE TYPE
// ═══════════════════════════════════════════════════════════════════════════════
// Transformer architecture family. Drives weight naming, attention variant
// (GQA vs MLA), FFN type (dense vs MoE) and activation selection elsewhere.
enum class Architecture {
LLAMA, // Standard dense (Llama, Mistral)
QWEN2, // Qwen2 dense (DeepSeek-R1-Distill)
DEEPSEEK2, // DeepSeek V3 MoE + MLA (Kimi K2.5)
PHI3, // Phi-3 / Phi-3.5 (GELU activation)
GEMMA2, // Gemma 2 (GELU, sliding window)
STARCODER2, // StarCoder2 (code models)
COMMAND_R, // Cohere Command-R
};
// ═══════════════════════════════════════════════════════════════════════════════
// MODEL CONFIG — Extended for DeepSeek V3 MoE + MLA
// ═══════════════════════════════════════════════════════════════════════════════
// Model hyper-parameters, populated from GGUF metadata.
// Fix in print(): the architecture label only distinguished DEEPSEEK2 and
// QWEN2 — PHI3 / GEMMA2 / STARCODER2 / COMMAND_R were all reported as
// "Llama". Every Architecture value now prints its own name.
struct Config {
    Architecture arch = Architecture::LLAMA;
    Activation activation = Activation::SILU;
    // === Common ===
    int dim = 4096;                     // model (embedding) dimension
    int n_layers = 32;
    int n_heads = 32;                   // attention heads
    int n_kv_heads = 8;                 // KV heads (GQA when < n_heads)
    int vocab_size = 32000;
    int max_seq_len = 4096;
    int sliding_window = 0;             // 0 = disabled, >0 = window size (Mistral, Gemma2)
    float attn_logit_softcap = 0.0f;    // Gemma-2: tanh cap on attention scores
    float final_logit_softcap = 0.0f;   // Gemma-2: tanh cap on final logits
    bool embed_scale_sqrt_dim = false;  // Gemma: multiply embeddings by sqrt(dim)
    int head_dim = 128;
    int intermediate = 11008;           // FFN hidden width
    float rope_theta = 10000.0f;
    float rms_norm_eps = 1e-5f;
    // === MLA (Multi-head Latent Attention) — DeepSeek V3 ===
    int q_lora_rank = 0;                // Compressed query rank (1536 for K2.5)
    int kv_lora_rank = 0;               // Compressed KV rank (512 for K2.5)
    int key_length = 0;                 // Full key dim (576 = kv_lora_rank + rope_dim)
    int value_length = 0;               // Full value dim (512)
    int key_length_mla = 0;             // MLA key dim per head (192)
    int value_length_mla = 0;           // MLA value dim per head (128)
    int rope_dim = 0;                   // RoPE dimension count (64)
    // === MoE (Mixture of Experts) — DeepSeek V3 ===
    int n_experts = 0;                  // Total experts per MoE layer (384)
    int n_experts_used = 0;             // Active experts per token (8)
    int n_expert_shared = 0;            // Shared experts always active (1)
    int expert_ffn_dim = 0;             // Expert FFN width (2048)
    int n_dense_layers = 0;             // Leading dense layers before MoE (1)
    int n_expert_groups = 1;            // Expert groups
    int n_expert_groups_used = 1;
    int expert_gating_func = 0;         // Gating function type
    float expert_weights_scale = 1.0f;
    bool expert_weights_norm = false;
    // === RoPE Scaling (YaRN) ===
    float rope_scaling_factor = 1.0f;
    int rope_scaling_orig_ctx = 4096;
    float rope_yarn_beta_fast = 32.0f;
    float rope_yarn_beta_slow = 1.0f;
    float rope_yarn_log_mul = 0.1f;
    std::vector<int32_t> eos_tokens;    // Multiple EOS token IDs
    bool is_moe() const { return n_experts > 0; }
    bool is_mla() const { return kv_lora_rank > 0; }
    // Fills in fields derivable from the loaded hyper-parameters.
    void compute_derived() {
        // Pure dense models: mark every layer dense.
        if (n_dense_layers == 0 && n_experts == 0) n_dense_layers = n_layers;
        if (dim > 0 && n_heads > 0) {
            // NOTE(review): this unconditionally overwrites any explicitly
            // loaded head_dim; architectures where head_dim != dim / n_heads
            // may need this guarded — confirm against the GGUF loaders.
            head_dim = dim / n_heads;
        }
        if (kv_lora_rank > 0 && rope_dim > 0) {
            key_length = kv_lora_rank + rope_dim;
        }
    }
    // Pretty-prints the configuration to stdout (requires <cstdio>).
    void print() const {
        auto p = [](const char* k, int v) { printf(" %-30s = %d\n", k, v); };
        auto pf = [](const char* k, float v) { printf(" %-30s = %.6f\n", k, v); };
        printf("=== Inference-X v6 Config ===\n");
        const char* arch_name = "Llama";
        switch (arch) {
        case Architecture::DEEPSEEK2:  arch_name = "DeepSeek V3 MoE+MLA"; break;
        case Architecture::QWEN2:      arch_name = "Qwen2"; break;
        case Architecture::PHI3:       arch_name = "Phi-3"; break;
        case Architecture::GEMMA2:     arch_name = "Gemma 2"; break;
        case Architecture::STARCODER2: arch_name = "StarCoder2"; break;
        case Architecture::COMMAND_R:  arch_name = "Command-R"; break;
        case Architecture::LLAMA: default: arch_name = "Llama"; break;
        }
        printf(" Architecture = %s\n", arch_name);
        p("dim", dim);
        p("n_layers", n_layers);
        p("n_heads", n_heads);
        p("n_kv_heads", n_kv_heads);
        p("vocab_size", vocab_size);
        p("max_seq_len", max_seq_len);
        p("head_dim", head_dim);
        p("intermediate", intermediate);
        pf("rope_theta", rope_theta);
        if (is_mla()) {
            printf("--- MLA ---\n");
            p("q_lora_rank", q_lora_rank);
            p("kv_lora_rank", kv_lora_rank);
            p("key_length", key_length);
            p("value_length", value_length);
            p("rope_dim", rope_dim);
        }
        if (is_moe()) {
            printf("--- MoE ---\n");
            p("n_experts", n_experts);
            p("n_experts_used", n_experts_used);
            p("n_expert_shared", n_expert_shared);
            p("expert_ffn_dim", expert_ffn_dim);
            p("n_dense_layers", n_dense_layers);
            pf("expert_weights_scale", expert_weights_scale);
        }
    }
};
} // namespace ix