// ═══════════════════════════════════════════════════════════════════════════════ // INFERENCE-X — Z-Core Mathematical Foundation // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // Licensed under the Business Source License 1.1 (BSL-1.1) // See LICENSE file for full terms. // // INTELLECTUAL PROPERTY PROTECTION: // - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026) // - GitHub: git.inference-x.com/salka/inference-x // - Author: Salka Elmadani | Morocco | Morocco // // MANUFACTURER NOTICE: Any manufacturer, company, or entity that // incorporates, embeds, distributes, or commercially uses Inference-X // or any derivative work without explicit written authorization from // the copyright holder is in violation of BSL-1.1 and applicable // intellectual property laws. This includes but is not limited to: // hardware vendors, cloud providers, SaaS platforms, and OEMs. // // Contact: Elmadani.SALKA@proton.me for licensing. // ═══════════════════════════════════════════════════════════════════════════════ #pragma once #define IX_ZCORE_FINGERPRINT 0x935E1DAD #define IX_ZCORE_MARK "Inference-X-ZCore-935-Elmadani" #include #include #include #include #include #include #include #include #ifdef __AVX2__ #include #endif namespace ix { // ═══════════════════════════════════════════════════════════════════════════════ // WATERMARK — SALKA ELMADANI SIGNATURE (Ne pas modifier) // ═══════════════════════════════════════════════════════════════════════════════ namespace signature { static constexpr double S0 = 5.999160064733103e+18; // "SALKA EL" static constexpr double S1 = 5.566805661683622e+18; // "MADANI E" static constexpr double S2 = 5.426309097159753e+18; // "LMADANI" static constexpr double S3 = 4.991471925827590e+18; // "CREATOR" inline bool verify() { volatile double sum = S0 + S1 + S2 + S3; return sum > 2.0e19; } inline float inject(float x) { volatile double check = S0 * 1e-40; return x * (1.0f + static_cast(check - check)); } } // 
// ═══════════════════════════════════════════════════════════════════════════════
// HALF PRECISION TYPES
// ═══════════════════════════════════════════════════════════════════════════════

// IEEE 754 binary16 stored as raw bits, convertible to/from float.
// Subnormal float inputs flush to signed zero; overflow saturates to ±Inf.
// Fixed: NaN inputs previously collapsed to Inf (mantissa was discarded on
// the e >= 31 path); a NaN now stays a NaN across the conversion.
struct f16 {
    uint16_t bits;

    f16() : bits(0) {}

    f16(float f) {
        uint32_t u;
        std::memcpy(&u, &f, 4);
        uint32_t s = (u >> 16) & 0x8000;       // sign, moved into half position
        int e = ((u >> 23) & 0xFF) - 127 + 15; // exponent, rebiased 127 -> 15
        uint32_t m = u & 0x7FFFFF;             // 23-bit mantissa
        if (e <= 0) {
            bits = static_cast<uint16_t>(s);   // underflow -> signed zero
        } else if (e >= 31) {
            // Overflow -> Inf. A true NaN (source exponent all-ones with a
            // nonzero mantissa) must keep a nonzero mantissa bit, otherwise
            // it would silently become Inf.
            bool is_nan = (((u >> 23) & 0xFF) == 0xFF) && (m != 0);
            bits = static_cast<uint16_t>(s | 0x7C00 | (is_nan ? 0x200 : 0));
        } else {
            bits = static_cast<uint16_t>(s | (e << 10) | (m >> 13)); // truncating round
        }
    }

    operator float() const {
        uint32_t s = (bits & 0x8000) << 16;
        uint32_t e = (bits >> 10) & 0x1F;
        uint32_t m = bits & 0x3FF;
        uint32_t u;
        if (e == 0) {
            if (m) {
                // Subnormal half: renormalize into a normal float.
                int sh = 0;
                while (!(m & 0x400)) { m <<= 1; sh++; }
                m &= 0x3FF;
                u = s | ((113 - sh) << 23) | (m << 13);
            } else {
                u = s; // signed zero
            }
        } else if (e == 31) {
            u = s | 0x7F800000 | (m << 13); // Inf / NaN
        } else {
            u = s | ((e - 15 + 127) << 23) | (m << 13);
        }
        float f;
        std::memcpy(&f, &u, 4);
        return f;
    }

    // Reinterpret a raw 16-bit pattern as an f16 without conversion.
    static f16 from_bits(uint16_t b) { f16 h; h.bits = b; return h; }
};

// FFN activation function selector, per model family.
enum class Activation {
    SILU,       // x * sigmoid(x) — Llama, Qwen, DeepSeek, Mistral
    GELU,       // GELU — Phi, Gemma, StarCoder
    GELU_QUICK, // x * sigmoid(1.702 * x)
    RELU_SQ,    // ReLU²
};

// bfloat16: the top 16 bits of a float32. Conversion truncates (no rounding).
struct bf16 {
    uint16_t bits;

    bf16() : bits(0) {}

    bf16(float f) {
        uint32_t u;
        std::memcpy(&u, &f, 4);
        bits = static_cast<uint16_t>(u >> 16); // fixed: missing <uint16_t> template argument
    }

    operator float() const {
        uint32_t u = static_cast<uint32_t>(bits) << 16; // fixed: missing <uint32_t> template argument
        float f;
        std::memcpy(&f, &u, 4);
        return f;
    }
};

// ═══════════════════════════════════════════════════════════════════════════════
// TENSOR TYPE ENUM — Extended for IQ formats (Kimi K2.5)
// Values follow the GGML/GGUF numbering exactly; do not renumber.
// ═══════════════════════════════════════════════════════════════════════════════
enum class dtype : uint32_t {
    F32      = 0,
    F16      = 1,
    Q4_0     = 2,
    Q4_1     = 3,
    // 4, 5 reserved
    Q5_0     = 6,
    Q5_1     = 7,
    Q8_0     = 8,
    Q8_1     = 9,
    Q2_K     = 10,
    Q3_K     = 11,
    Q4_K     = 12,
    Q5_K     = 13,
    Q6_K     = 14,
    Q8_K     = 15,
    // === IQ FORMATS — Critical for Kimi K2.5 1.8-bit quant ===
    IQ2_XXS  = 16,
    IQ2_XS   = 17,
    IQ3_XXS  = 18,
    IQ1_S    = 19,
    IQ4_NL   = 20,
    IQ3_S    = 21, // GGML standard (was mislabeled IQ2_M)
    IQ2_S    = 22, // GGML standard
    IQ4_XS   = 23, // GGML standard
    I8       = 24,
    I16      = 25,
    I32      = 26,
    I64      = 27,
    F64      = 28,
    IQ1_M    = 29,
    BF16     = 30,
    Q4_0_4x4 = 31,
    Q4_0_4x8 = 32,
    Q4_0_8x8 = 33,
    TQ1_0    = 34,
    TQ2_0    = 35,
};

// ═══════════════════════════════════════════════════════════════════════════════
// QUANTIZATION BLOCK DEFINITIONS — layouts must match GGUF byte-for-byte
// ═══════════════════════════════════════════════════════════════════════════════
static constexpr int QK_K  = 256; // super-block size for K-quants / IQ formats
static constexpr int QK4_0 = 32;  // consistency: all QK constants are static constexpr
static constexpr int QK4_1 = 32;
static constexpr int QK5_0 = 32;
static constexpr int QK5_1 = 32;
static constexpr int QK8_1 = 32;
static constexpr int QK8_0 = 32;

// Standard blocks
struct block_q4_K { f16 d; f16 dmin; uint8_t scales[12]; uint8_t qs[QK_K / 2]; };
struct block_q8_0 { f16 d; int8_t qs[32]; };
struct block_q6_K { uint8_t ql[QK_K / 2]; uint8_t qh[QK_K / 4]; int8_t scales[QK_K / 16]; f16 d; };
struct block_q4_0 { f16 d; uint8_t qs[QK4_0 / 2]; };
struct block_q2_K { uint8_t scales[QK_K / 16]; uint8_t qs[QK_K / 4]; f16 d; f16 dmin; };
struct block_q5_K { f16 d; f16 dmin; uint8_t scales[12]; uint8_t qh[QK_K / 8]; uint8_t qs[QK_K / 2]; };
struct block_q3_K { uint8_t hmask[QK_K / 8]; uint8_t qs[QK_K / 4]; uint8_t scales[12]; f16 d; };
struct block_q4_1 { f16 d; f16 m; uint8_t qs[QK4_1 / 2]; };
struct block_q5_0 { f16 d; uint8_t qh[4]; uint8_t qs[QK5_0 / 2]; };
struct block_q5_1 { f16 d; f16 m; uint8_t qh[4]; uint8_t qs[QK5_1 / 2]; };
struct block_q8_1 { float d; float s; int8_t qs[QK8_1]; };

// Z-VERIFY: Block sizes must match GGUF binary format exactly
static_assert(sizeof(block_q4_K) == 144, "block_q4_K size mismatch!");
static_assert(sizeof(block_q8_0) == 34, "block_q8_0 size mismatch!");
static_assert(sizeof(block_q6_K) == 210, "block_q6_K size mismatch!"); static_assert(sizeof(block_q2_K) == 84, "block_q2_K size mismatch!"); static_assert(sizeof(block_q5_K) == 176, "block_q5_K size mismatch!"); static_assert(sizeof(block_q3_K) == 110, "block_q3_K size mismatch!"); static_assert(sizeof(block_q4_0) == 18, "block_q4_0 size mismatch!"); // === IQ BLOCKS — for Kimi K2.5 ultra-low-bit experts === // IQ1_S: ~1.56 bits/weight (256 weights per block) struct block_iq1_s { f16 d; uint8_t qs[QK_K / 8]; uint16_t qh[QK_K / 32]; }; // IQ2_XXS: ~2.06 bits/weight (256 weights per block) struct block_iq2_xxs { f16 d; uint16_t qs[QK_K / 8]; }; // IQ2_XS: ~2.31 bits/weight struct block_iq2_xs { f16 d; uint16_t qs[QK_K / 8]; uint8_t scales[QK_K / 32]; }; // IQ2_S: ~2.5 bits/weight struct block_iq2_s { f16 d; uint8_t qs[QK_K / 4]; uint8_t qh[QK_K / 32]; uint8_t scales[QK_K / 32]; }; // IQ3_XXS: ~3.06 bits/weight struct block_iq3_xxs { f16 d; uint8_t qs[3 * QK_K / 8]; }; // IQ3_S: ~3.44 bits/weight struct block_iq3_s { f16 d; uint8_t qs[QK_K / 4]; uint8_t qh[QK_K / 32]; uint8_t signs[QK_K / 8]; uint8_t scales[QK_K / 64]; }; // IQ4_NL: ~4.5 bits/weight (non-linear quantization) struct block_iq4_nl { f16 d; uint8_t qs[QK4_0 / 2]; }; // IQ4_XS: ~4.25 bits/weight struct block_iq4_xs { f16 d; uint16_t scales_h; uint8_t scales_l[QK_K / 64]; uint8_t qs[QK_K / 2]; }; // TQ1_0: ternary 1.69 bits/weight struct block_tq1_0 { uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 48 bytes: 5 trits per byte (base-3) uint8_t qh[QK_K / 64]; // 4 bytes: 4 trits per byte (2-bit) f16 d; }; // TQ2_0: ternary 2 bits/weight struct block_tq2_0 { uint8_t qs[QK_K / 4]; f16 d; }; // ═══════════════════════════════════════════════════════════════════════════════ // DTYPE UTILITIES // ═══════════════════════════════════════════════════════════════════════════════ inline size_t dtype_size(dtype t) { // PACKED sizes (no alignment padding) — must match GGUF on-disk layout switch (t) { case dtype::F32: return 4; 
case dtype::F16: return 2; case dtype::BF16: return 2; case dtype::Q4_0: return 2 + 16; // 18 per 32 case dtype::Q4_K: return 2 + 2 + 12 + QK_K/2; // 144 per 256 case dtype::Q5_K: return 2 + 2 + 12 + QK_K/2 + QK_K/8; // 176 per 256 case dtype::Q6_K: return 2 + QK_K/2 + QK_K/4 + QK_K/16; // 210 per 256 case dtype::Q8_0: return 2 + 32; // 34 per 32 case dtype::Q2_K: return 2 + 2 + QK_K/16 + QK_K/4; // 84 per 256 case dtype::Q3_K: return 2 + QK_K/4 + QK_K/8 + 12; // 110 per 256 case dtype::IQ1_S: return 2 + QK_K/8 + QK_K/16; // 50 per 256 case dtype::IQ2_XXS: return 2 + QK_K/4; // 66 per 256 case dtype::IQ2_XS: return 2 + QK_K/4 + QK_K/32; // 74 per 256 case dtype::IQ2_S: return 2 + QK_K/4 + QK_K/16; // 82 per 256 case dtype::IQ4_XS: return 2 + 2 + QK_K/64 + QK_K/2; // 136 per 256 case dtype::IQ3_XXS: return 2 + 3*QK_K/8; // 98 per 256 case dtype::IQ3_S: return 2 + QK_K/4 + QK_K/8 + QK_K/32 + 4; // 110 per 256 case dtype::IQ4_NL: return 2 + 16; // 18 per 32 case dtype::TQ1_0: return 2 + 4*13; // 54 per 256 case dtype::TQ2_0: return 2 + QK_K/4; // 66 per 256 case dtype::I8: return 1; case dtype::I16: return 2; case dtype::I32: return 4; case dtype::I64: return 8; case dtype::F64: return 8; default: return 1; } } inline int dtype_block_size(dtype t) { switch (t) { // 256-element blocks case dtype::Q4_K: case dtype::Q5_K: case dtype::Q6_K: case dtype::Q8_K: case dtype::Q2_K: case dtype::Q3_K: case dtype::IQ1_S: case dtype::IQ1_M: case dtype::IQ2_XXS: case dtype::IQ2_XS: case dtype::IQ2_S: // case dtype::IQ2_M: // removed, not in GGML standard case dtype::IQ4_XS: case dtype::IQ3_XXS: case dtype::IQ3_S: case dtype::IQ4_XS: case dtype::TQ1_0: case dtype::TQ2_0: return QK_K; // 32-element blocks case dtype::Q4_0: case dtype::Q4_1: case dtype::Q5_0: case dtype::Q5_1: case dtype::Q8_0: case dtype::Q8_1: case dtype::IQ4_NL: return 32; // No blocking default: return 1; } } inline const char* dtype_name(dtype t) { switch (t) { case dtype::F32: return "F32"; case dtype::F16: 
return "F16"; case dtype::BF16: return "BF16"; case dtype::Q4_0: return "Q4_0"; case dtype::Q4_K: return "Q4_K"; case dtype::Q5_K: return "Q5_K"; case dtype::Q6_K: return "Q6_K"; case dtype::Q8_0: return "Q8_0"; case dtype::Q2_K: return "Q2_K"; case dtype::Q3_K: return "Q3_K"; case dtype::IQ1_S: return "IQ1_S"; case dtype::IQ2_XXS: return "IQ2_XXS"; case dtype::IQ2_XS: return "IQ2_XS"; case dtype::IQ2_S: return "IQ2_S"; // case dtype::IQ2_M: // removed, not in GGML standard return "IQ2_M"; case dtype::IQ4_XS: return "IQ4_XS"; case dtype::IQ3_XXS: return "IQ3_XXS"; case dtype::IQ3_S: return "IQ3_S"; case dtype::IQ4_NL: return "IQ4_NL"; case dtype::TQ1_0: return "TQ1_0"; case dtype::TQ2_0: return "TQ2_0"; default: return "UNKNOWN"; } } // ═══════════════════════════════════════════════════════════════════════════════ // MEMORY — Aligned allocation // ═══════════════════════════════════════════════════════════════════════════════ static constexpr size_t CACHE_LINE = 64; inline void* aligned_alloc(size_t size) { void* ptr = nullptr; posix_memalign(&ptr, CACHE_LINE, size); return ptr; } inline void aligned_free(void* ptr) { free(ptr); } template struct span { T* data_; size_t size_; span() : data_(nullptr), size_(0) {} span(T* d, size_t n) : data_(d), size_(n) {} T* data() { return data_; } const T* data() const { return data_; } size_t size() const { return size_; } T& operator[](size_t i) { return data_[i]; } const T& operator[](size_t i) const { return data_[i]; } }; // ═══════════════════════════════════════════════════════════════════════════════ // ARCHITECTURE TYPE // ═══════════════════════════════════════════════════════════════════════════════ enum class Architecture { LLAMA, // Standard dense (Llama, Mistral) QWEN2, // Qwen2 dense (DeepSeek-R1-Distill) DEEPSEEK2, // DeepSeek V3 MoE + MLA (Kimi K2.5) PHI3, // Phi-3 / Phi-3.5 (GELU activation) GEMMA2, // Gemma 2 (GELU, sliding window) STARCODER2, // StarCoder2 (code models) COMMAND_R, // Cohere Command-R }; // 
═══════════════════════════════════════════════════════════════════════════════ // MODEL CONFIG — Extended for DeepSeek V3 MoE + MLA // ═══════════════════════════════════════════════════════════════════════════════ struct Config { Architecture arch = Architecture::LLAMA; Activation activation = Activation::SILU; // === Common === int dim = 4096; int n_layers = 32; int n_heads = 32; int n_kv_heads = 8; int vocab_size = 32000; int max_seq_len = 4096; int sliding_window = 0; // 0 = disabled, >0 = window size (Mistral, Gemma2) float attn_logit_softcap = 0.0f; // Gemma-2: tanh cap on attention scores float final_logit_softcap = 0.0f; // Gemma-2: tanh cap on final logits bool embed_scale_sqrt_dim = false; // Gemma: multiply embeddings by sqrt(dim) int head_dim = 128; int intermediate = 11008; float rope_theta = 10000.0f; float rms_norm_eps = 1e-5f; // === MLA (Multi-head Latent Attention) — DeepSeek V3 === int q_lora_rank = 0; // Compressed query rank (1536 for K2.5) int kv_lora_rank = 0; // Compressed KV rank (512 for K2.5) int key_length = 0; // Full key dim (576 = kv_lora_rank + rope_dim) int value_length = 0; // Full value dim (512) int key_length_mla = 0; // MLA key dim per head (192) int value_length_mla = 0; // MLA value dim per head (128) int rope_dim = 0; // RoPE dimension count (64) // === MoE (Mixture of Experts) — DeepSeek V3 === int n_experts = 0; // Total experts per MoE layer (384) int n_experts_used = 0; // Active experts per token (8) int n_expert_shared = 0; // Shared experts always active (1) int expert_ffn_dim = 0; // Expert FFN width (2048) int n_dense_layers = 0; // Leading dense layers before MoE (1) int n_expert_groups = 1; // Expert groups int n_expert_groups_used = 1; int expert_gating_func = 0; // Gating function type float expert_weights_scale = 1.0f; bool expert_weights_norm = false; // === RoPE Scaling (YaRN) === float rope_scaling_factor = 1.0f; int rope_scaling_orig_ctx = 4096; float rope_yarn_beta_fast = 32.0f; float rope_yarn_beta_slow 
= 1.0f; float rope_yarn_log_mul = 0.1f; std::vector eos_tokens; // Multiple EOS token IDs bool is_moe() const { return n_experts > 0; } bool is_mla() const { return kv_lora_rank > 0; } void compute_derived() { // Fix: pure dense models need all layers marked dense if (n_dense_layers == 0 && n_experts == 0) n_dense_layers = n_layers; if (dim > 0 && n_heads > 0) { head_dim = dim / n_heads; } if (kv_lora_rank > 0 && rope_dim > 0) { key_length = kv_lora_rank + rope_dim; } } void print() const { auto p = [](const char* k, int v) { printf(" %-30s = %d\n", k, v); }; auto pf = [](const char* k, float v) { printf(" %-30s = %.6f\n", k, v); }; printf("=== Inference-X v6 Config ===\n"); printf(" Architecture = %s\n", arch == Architecture::DEEPSEEK2 ? "DeepSeek V3 MoE+MLA" : arch == Architecture::QWEN2 ? "Qwen2" : "Llama"); p("dim", dim); p("n_layers", n_layers); p("n_heads", n_heads); p("n_kv_heads", n_kv_heads); p("vocab_size", vocab_size); p("max_seq_len", max_seq_len); p("head_dim", head_dim); p("intermediate", intermediate); pf("rope_theta", rope_theta); if (is_mla()) { printf("--- MLA ---\n"); p("q_lora_rank", q_lora_rank); p("kv_lora_rank", kv_lora_rank); p("key_length", key_length); p("value_length", value_length); p("rope_dim", rope_dim); } if (is_moe()) { printf("--- MoE ---\n"); p("n_experts", n_experts); p("n_experts_used", n_experts_used); p("n_expert_shared", n_expert_shared); p("expert_ffn_dim", expert_ffn_dim); p("n_dense_layers", n_dense_layers); pf("expert_weights_scale", expert_weights_scale); } } }; } // namespace ix