inference-x/core/z_core.h
Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
Better output from the same model. Fused computation, adaptive precision,
surgical expert loading. 305 KB, 19 backends, zero dependencies.

https://inference-x.com
2026-02-23 07:10:47 +00:00

536 lines
21 KiB
C++

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Z-Core Mathematical Foundation
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: github.com/ElmadaniS/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
#define IX_ZCORE_FINGERPRINT 0x935E1DAD
#define IX_ZCORE_MARK "Inference-X-ZCore-935-Elmadani"
#include <cstdint>
#include <cstddef>
#include <cstdlib>   // posix_memalign / free
#include <cstring>
#include <cmath>
#include <algorithm>
#include <string>
#include <vector>
#include <unordered_map>
#ifdef __AVX2__
#include <immintrin.h>
#endif
#ifdef _WIN32
#include <malloc.h>  // _aligned_malloc / _aligned_free
#endif
namespace ix {
// ═══════════════════════════════════════════════════════════════════════════════
// WATERMARK — SALKA ELMADANI SIGNATURE (do not modify)
// ═══════════════════════════════════════════════════════════════════════════════
namespace signature {
// Watermark constants. The trailing comments label each value; presumably each
// double encodes the quoted ASCII text — TODO confirm encoding. Do not alter.
static constexpr double S0 = 5.999160064733103e+18; // "SALKA EL"
static constexpr double S1 = 5.566805661683622e+18; // "MADANI E"
static constexpr double S2 = 5.426309097159753e+18; // "LMADANI"
static constexpr double S3 = 4.991471925827590e+18; // "CREATOR"
// Always true for the constants above (their sum is ~2.2e19 > 2.0e19). The
// volatile local keeps the watermark values from being folded away.
inline bool verify() {
volatile double sum = S0 + S1 + S2 + S3;
return sum > 2.0e19;
}
// Returns x unchanged: both volatile reads observe the same stored value, so
// (check - check) is 0 and the multiplier is exactly 1.0f. Its purpose is to
// force the S0 fingerprint constant into the emitted code.
inline float inject(float x) {
volatile double check = S0 * 1e-40;
return x * (1.0f + static_cast<float>(check - check));
}
}
// ═══════════════════════════════════════════════════════════════════════════════
// ═══════════════════════════════════════════════════════════════════════════════
// ═══════════════════════════════════════════════════════════════════════════════
// HALF PRECISION TYPES
// ═══════════════════════════════════════════════════════════════════════════════
// ─────────────────────────────────────────────────────────────────────────────
// f16 — IEEE-754 binary16 (half precision) storage type.
// Fixes vs. the previous version:
//  * NaN inputs stay NaN (they previously collapsed to ±Inf via the e>=31 path).
//  * float->half rounds to nearest-even instead of truncating the mantissa.
//  * Small magnitudes produce half subnormals instead of flushing to zero.
// half->float decode is exact (every f16 value is representable in f32).
// ─────────────────────────────────────────────────────────────────────────────
struct f16 {
    uint16_t bits;
    f16() : bits(0) {}
    // float -> half, round to nearest, ties to even.
    f16(float f) {
        uint32_t u; std::memcpy(&u, &f, 4);
        uint32_t s = (u >> 16) & 0x8000;          // sign bit, already in f16 position
        uint32_t e = (u >> 23) & 0xFF;            // biased f32 exponent
        uint32_t m = u & 0x7FFFFF;                // f32 mantissa
        if (e == 0xFF) {
            // Inf / NaN: force a mantissa bit for NaN so it does not decay to Inf.
            bits = static_cast<uint16_t>(s | 0x7C00 | (m ? 0x200 : 0));
            return;
        }
        int he = static_cast<int>(e) - 127 + 15;  // re-biased half exponent
        if (he >= 31) {
            bits = static_cast<uint16_t>(s | 0x7C00);        // overflow -> +/-Inf
        } else if (he <= 0) {
            // Half subnormal range (or underflow to signed zero).
            int shift = 14 - he;                              // >= 14
            if (shift > 24) { bits = static_cast<uint16_t>(s); return; }  // < 2^-25 rounds to 0
            uint32_t M = m | 0x800000;                        // restore implicit leading 1
            uint32_t q = M >> shift;
            uint32_t rem = M & ((1u << shift) - 1);
            uint32_t half = 1u << (shift - 1);
            if (rem > half || (rem == half && (q & 1))) ++q;  // round to nearest-even
            bits = static_cast<uint16_t>(s | q);              // carry into exponent is valid
        } else {
            uint32_t q = m >> 13;
            uint32_t rem = m & 0x1FFF;
            uint32_t b = s | (static_cast<uint32_t>(he) << 10) | q;
            // Round to nearest-even; a carry correctly bumps the exponent
            // (and 0x7BFF rounding up becomes the Inf encoding 0x7C00).
            if (rem > 0x1000 || (rem == 0x1000 && (b & 1))) ++b;
            bits = static_cast<uint16_t>(b);
        }
    }
    // half -> float (exact).
    operator float() const {
        uint32_t s = (bits & 0x8000) << 16;
        uint32_t e = (bits >> 10) & 0x1F;
        uint32_t m = bits & 0x3FF;
        uint32_t u;
        if (e == 0) {
            if (m) {
                // Subnormal: renormalize the mantissa and adjust the exponent.
                int sh = 0;
                while (!(m & 0x400)) { m <<= 1; ++sh; }
                m &= 0x3FF;
                u = s | ((113 - sh) << 23) | (m << 13);
            } else {
                u = s; // signed zero
            }
        } else if (e == 31) {
            u = s | 0x7F800000 | (m << 13);       // Inf / NaN
        } else {
            u = s | ((e - 15 + 127) << 23) | (m << 13);
        }
        float f; std::memcpy(&f, &u, 4);
        return f;
    }
    // Reinterpret a raw 16-bit pattern as f16 without conversion.
    static f16 from_bits(uint16_t b) { f16 h; h.bits = b; return h; }
};
// FFN activation function selector; which variant applies is a property of
// the model architecture (see Config::activation).
enum class Activation {
SILU, // x * sigmoid(x) — Llama, Qwen, DeepSeek, Mistral
GELU, // GELU — Phi, Gemma, StarCoder
GELU_QUICK, // x * sigmoid(1.702 * x)
RELU_SQ, // ReLU²
};
// bf16 — bfloat16 (1 sign, 8 exponent, 7 mantissa bits; top half of an f32).
// Fixes vs. the previous version:
//  * float->bf16 rounds to nearest-even instead of truncating.
//  * NaN stays NaN: a NaN whose payload lived only in the low 16 mantissa
//    bits previously truncated to +/-Inf (e.g. bit pattern 0x7F800001).
// bf16->float decode is exact.
struct bf16 {
    uint16_t bits;
    bf16() : bits(0) {}
    bf16(float f) {
        uint32_t u; std::memcpy(&u, &f, 4);
        if ((u & 0x7F800000u) == 0x7F800000u && (u & 0x007FFFFFu)) {
            bits = static_cast<uint16_t>((u >> 16) | 0x0040);  // quiet NaN, sign preserved
        } else {
            u += 0x7FFFu + ((u >> 16) & 1u);                   // round to nearest-even
            bits = static_cast<uint16_t>(u >> 16);             // Inf carries through unchanged
        }
    }
    operator float() const { uint32_t u = static_cast<uint32_t>(bits) << 16; float f; std::memcpy(&f, &u, 4); return f; }
};
// ═══════════════════════════════════════════════════════════════════════════════
// TENSOR TYPE ENUM — Extended for IQ formats (Kimi K2.5)
// ═══════════════════════════════════════════════════════════════════════════════
// Tensor element / quantization type. Numeric values mirror GGML's ggml_type
// ids so GGUF tensor headers can be consumed without translation — never
// renumber. (Several names in slots 21-28 were corrected at some point to
// match the GGML standard; the values themselves are unchanged.)
enum class dtype : uint32_t {
F32 = 0,
F16 = 1,
Q4_0 = 2,
Q4_1 = 3,
// 4, 5 reserved (retired GGML types)
Q5_0 = 6,
Q5_1 = 7,
Q8_0 = 8,
Q8_1 = 9,
Q2_K = 10,
Q3_K = 11,
Q4_K = 12,
Q5_K = 13,
Q6_K = 14,
Q8_K = 15,
// === IQ FORMATS — Critical for Kimi K2.5 1.8-bit quant ===
IQ2_XXS = 16,
IQ2_XS = 17,
IQ3_XXS = 18,
IQ1_S = 19,
IQ4_NL = 20,
IQ3_S = 21, // GGML id 21
IQ2_S = 22, // GGML id 22
IQ4_XS = 23, // GGML id 23
I8 = 24, // plain integer tensor types follow (GGML ids 24-27)
I16 = 25,
I32 = 26,
I64 = 27,
F64 = 28,
IQ1_M = 29,
BF16 = 30,
Q4_0_4x4 = 31, // repacked Q4_0 variants (SIMD-friendly interleave)
Q4_0_4x8 = 32,
Q4_0_8x8 = 33,
TQ1_0 = 34, // ternary packings
TQ2_0 = 35,
};
// ═══════════════════════════════════════════════════════════════════════════════
// QUANTIZATION BLOCK DEFINITIONS
// ═══════════════════════════════════════════════════════════════════════════════
// Elements (weights) per quantization block — GGUF/GGML conventions.
// All declared `static constexpr` for consistency (some were plain
// `constexpr`, which behaves identically here but read as accidental drift).
static constexpr int QK_K  = 256; // super-block size for K-quants / IQ / TQ formats
static constexpr int QK4_0 = 32;
static constexpr int QK4_1 = 32;
static constexpr int QK5_0 = 32;
static constexpr int QK5_1 = 32;
static constexpr int QK8_0 = 32;
static constexpr int QK8_1 = 32;
// ── Standard GGML/GGUF quantization blocks ───────────────────────────────────
// These layouts are read straight off disk, so their sizes must match the
// GGUF binary format exactly (pinned by the static_asserts below). All
// members are f16 (align 2) or byte arrays, so no padding can appear.
// GGML naming convention: d = f16 block scale, dmin/m = f16 block
// minimum/offset, qs = packed quants, qh = extra high bits,
// scales = packed per-sub-block scales.
// Q4_K: 256 weights -> 144 bytes (4.5 bits/weight).
struct block_q4_K {
f16 d; f16 dmin;
uint8_t scales[12];
uint8_t qs[QK_K / 2];
};
// Q8_0: 32 weights, one shared scale -> 34 bytes (8.5 bits/weight).
struct block_q8_0 {
f16 d;
int8_t qs[32];
};
// Q6_K: 256 weights -> 210 bytes (6.5625 bits/weight).
struct block_q6_K {
uint8_t ql[QK_K / 2];
uint8_t qh[QK_K / 4];
int8_t scales[QK_K / 16];
f16 d;
};
// Q4_0: 32 weights, one shared scale -> 18 bytes (4.5 bits/weight).
struct block_q4_0 {
f16 d;
uint8_t qs[QK4_0 / 2];
};
// Q2_K: 256 weights -> 84 bytes (2.625 bits/weight).
struct block_q2_K {
uint8_t scales[QK_K / 16];
uint8_t qs[QK_K / 4];
f16 d; f16 dmin;
};
// Q5_K: 256 weights -> 176 bytes (5.5 bits/weight).
struct block_q5_K {
f16 d; f16 dmin;
uint8_t scales[12];
uint8_t qh[QK_K / 8];
uint8_t qs[QK_K / 2];
};
// Q3_K: 256 weights -> 110 bytes (3.4375 bits/weight).
struct block_q3_K {
uint8_t hmask[QK_K / 8];
uint8_t qs[QK_K / 4];
uint8_t scales[12];
f16 d;
};
// Q4_1: 32 weights, scale + offset -> 20 bytes. NOTE(review): no
// static_assert pins the Q4_1/Q5_0/Q5_1/Q8_1 sizes below.
struct block_q4_1 {
f16 d; f16 m;
uint8_t qs[QK4_1 / 2];
};
// Q5_0: 32 weights, 5th bits packed into qh -> 22 bytes.
struct block_q5_0 {
f16 d;
uint8_t qh[4];
uint8_t qs[QK5_0 / 2];
};
// Q5_1: 32 weights, scale + offset + high bits -> 24 bytes.
struct block_q5_1 {
f16 d; f16 m;
uint8_t qh[4];
uint8_t qs[QK5_1 / 2];
};
// Q8_1: 32 weights -> 40 bytes here. NOTE(review): d/s are f32 in this
// definition, but GGML's block_q8_1 stores them as f16 — confirm against
// the loader before using this type for on-disk data.
struct block_q8_1 {
float d;
float s;
int8_t qs[QK8_1];
};
// Z-VERIFY: Block sizes must match GGUF binary format exactly
static_assert(sizeof(block_q4_K) == 144, "block_q4_K size mismatch!");
static_assert(sizeof(block_q8_0) == 34, "block_q8_0 size mismatch!");
static_assert(sizeof(block_q6_K) == 210, "block_q6_K size mismatch!");
static_assert(sizeof(block_q2_K) == 84, "block_q2_K size mismatch!");
static_assert(sizeof(block_q5_K) == 176, "block_q5_K size mismatch!");
static_assert(sizeof(block_q3_K) == 110, "block_q3_K size mismatch!");
static_assert(sizeof(block_q4_0) == 18, "block_q4_0 size mismatch!");
// ── IQ / TQ blocks — ultra-low-bit formats (Kimi K2.5 experts) ───────────────
// All use QK_K = 256 weight super-blocks. Per-block byte sizes noted below
// must agree with the dtype_size() table. NOTE(review): unlike the standard
// blocks above, no static_asserts pin these sizes yet.
// IQ1_S: ~1.56 bits/weight -> 50 bytes.
struct block_iq1_s {
f16 d;
uint8_t qs[QK_K / 8];
uint16_t qh[QK_K / 32];
};
// IQ2_XXS: ~2.06 bits/weight -> 66 bytes.
struct block_iq2_xxs {
f16 d;
uint16_t qs[QK_K / 8];
};
// IQ2_XS: ~2.31 bits/weight -> 74 bytes.
struct block_iq2_xs {
f16 d;
uint16_t qs[QK_K / 8];
uint8_t scales[QK_K / 32];
};
// IQ2_S: ~2.5 bits/weight -> 82 bytes.
struct block_iq2_s {
f16 d;
uint8_t qs[QK_K / 4];
uint8_t qh[QK_K / 32];
uint8_t scales[QK_K / 32];
};
// IQ3_XXS: ~3.06 bits/weight -> 98 bytes.
struct block_iq3_xxs {
f16 d;
uint8_t qs[3 * QK_K / 8];
};
// IQ3_S: ~3.44 bits/weight -> 110 bytes.
struct block_iq3_s {
f16 d;
uint8_t qs[QK_K / 4];
uint8_t qh[QK_K / 32];
uint8_t signs[QK_K / 8];
uint8_t scales[QK_K / 64];
};
// IQ4_NL: 4.5 bits/weight, non-linear codebook; 32-weight blocks -> 18 bytes.
struct block_iq4_nl {
f16 d;
uint8_t qs[QK4_0 / 2];
};
// IQ4_XS: ~4.25 bits/weight -> 136 bytes.
struct block_iq4_xs {
f16 d;
uint16_t scales_h;
uint8_t scales_l[QK_K / 64];
uint8_t qs[QK_K / 2];
};
// TQ1_0: ternary ~1.69 bits/weight -> 54 bytes.
struct block_tq1_0 {
uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 48 bytes: 5 trits per byte (base-3 packing)
uint8_t qh[QK_K / 64]; // 4 bytes: remaining high trits
f16 d;
};
// TQ2_0: ternary 2 bits/weight -> 66 bytes.
struct block_tq2_0 {
uint8_t qs[QK_K / 4];
f16 d;
};
// ═══════════════════════════════════════════════════════════════════════════════
// DTYPE UTILITIES
// ═══════════════════════════════════════════════════════════════════════════════
// PACKED byte size of one storage unit of `t`: one quantization block for
// blocked types, one scalar otherwise. No alignment padding — must match the
// GGUF on-disk layout and the block_* structs above.
// Fix: Q4_1 / Q5_0 / Q5_1 / Q8_1 (whose structs exist above) and Q8_K / IQ1_M
// previously fell through to `return 1`, corrupting any size computation.
inline size_t dtype_size(dtype t) {
    switch (t) {
    // Scalar types
    case dtype::F32:  return 4;
    case dtype::F16:  return 2;
    case dtype::BF16: return 2;
    case dtype::F64:  return 8;
    case dtype::I8:   return 1;
    case dtype::I16:  return 2;
    case dtype::I32:  return 4;
    case dtype::I64:  return 8;
    // 32-element blocks
    case dtype::Q4_0:   return 2 + 16;                // 18 (block_q4_0)
    case dtype::Q4_1:   return 2 + 2 + 16;            // 20 (block_q4_1) — was missing
    case dtype::Q5_0:   return 2 + 4 + 16;            // 22 (block_q5_0) — was missing
    case dtype::Q5_1:   return 2 + 2 + 4 + 16;        // 24 (block_q5_1) — was missing
    case dtype::Q8_0:   return 2 + 32;                // 34 (block_q8_0)
    case dtype::Q8_1:   return 4 + 4 + 32;            // 40 (block_q8_1: float d, float s) — was missing
    case dtype::IQ4_NL: return 2 + 16;                // 18 (block_iq4_nl)
    // 256-element (QK_K) super-blocks
    case dtype::Q2_K: return 2 + 2 + QK_K/16 + QK_K/4;              // 84
    case dtype::Q3_K: return 2 + QK_K/4 + QK_K/8 + 12;              // 110
    case dtype::Q4_K: return 2 + 2 + 12 + QK_K/2;                   // 144
    case dtype::Q5_K: return 2 + 2 + 12 + QK_K/2 + QK_K/8;          // 176
    case dtype::Q6_K: return 2 + QK_K/2 + QK_K/4 + QK_K/16;         // 210
    case dtype::Q8_K: return 4 + QK_K + 2*(QK_K/16);                // 292 — per GGML block_q8_K (f32 d + 256 int8 + 16 int16 bsums); TODO confirm vs loader
    case dtype::IQ1_S:   return 2 + QK_K/8 + QK_K/16;               // 50
    case dtype::IQ1_M:   return QK_K/8 + QK_K/16 + QK_K/32;         // 56 — per GGML block_iq1_m; TODO confirm vs loader
    case dtype::IQ2_XXS: return 2 + QK_K/4;                         // 66
    case dtype::IQ2_XS:  return 2 + QK_K/4 + QK_K/32;               // 74
    case dtype::IQ2_S:   return 2 + QK_K/4 + QK_K/16;               // 82
    case dtype::IQ3_XXS: return 2 + 3*QK_K/8;                       // 98
    case dtype::IQ3_S:   return 2 + QK_K/4 + QK_K/8 + QK_K/32 + 4;  // 110
    case dtype::IQ4_XS:  return 2 + 2 + QK_K/64 + QK_K/2;           // 136
    case dtype::TQ1_0:   return (QK_K - 4*QK_K/64)/5 + QK_K/64 + 2; // 54 (block_tq1_0)
    case dtype::TQ2_0:   return 2 + QK_K/4;                         // 66 (block_tq2_0)
    // NOTE(review): repacked Q4_0_4x4/4x8/8x8 still fall through to 1 (as
    // before) — confirm whether any loader path relies on them.
    default: return 1;
    }
}
// Number of weights stored per quantization block of `t`.
// Scalar types (floats and plain integers) report 1.
inline int dtype_block_size(dtype t) {
    switch (t) {
    // Classic 32-element blocks (IQ4_NL shares the 32-wide layout).
    case dtype::Q4_0:
    case dtype::Q4_1:
    case dtype::Q5_0:
    case dtype::Q5_1:
    case dtype::Q8_0:
    case dtype::Q8_1:
    case dtype::IQ4_NL:
        return 32;
    // K-quants, IQ formats and ternary formats use 256-element super-blocks.
    case dtype::Q2_K:
    case dtype::Q3_K:
    case dtype::Q4_K:
    case dtype::Q5_K:
    case dtype::Q6_K:
    case dtype::Q8_K:
    case dtype::IQ1_S:
    case dtype::IQ1_M:
    case dtype::IQ2_XXS:
    case dtype::IQ2_XS:
    case dtype::IQ2_S:
    case dtype::IQ3_XXS:
    case dtype::IQ3_S:
    case dtype::IQ4_XS:
    case dtype::TQ1_0:
    case dtype::TQ2_0:
        return QK_K;
    // Everything else is unblocked.
    default:
        return 1;
    }
}
// Printable name for a dtype; "UNKNOWN" for ids without a registered name.
// Cases are listed in GGML id order.
inline const char* dtype_name(dtype t) {
    switch (t) {
    case dtype::F32:     return "F32";
    case dtype::F16:     return "F16";
    case dtype::Q4_0:    return "Q4_0";
    case dtype::Q8_0:    return "Q8_0";
    case dtype::Q2_K:    return "Q2_K";
    case dtype::Q3_K:    return "Q3_K";
    case dtype::Q4_K:    return "Q4_K";
    case dtype::Q5_K:    return "Q5_K";
    case dtype::Q6_K:    return "Q6_K";
    case dtype::IQ2_XXS: return "IQ2_XXS";
    case dtype::IQ2_XS:  return "IQ2_XS";
    case dtype::IQ3_XXS: return "IQ3_XXS";
    case dtype::IQ1_S:   return "IQ1_S";
    case dtype::IQ4_NL:  return "IQ4_NL";
    case dtype::IQ3_S:   return "IQ3_S";
    case dtype::IQ2_S:   return "IQ2_S";
    case dtype::IQ4_XS:  return "IQ4_XS";
    case dtype::BF16:    return "BF16";
    case dtype::TQ1_0:   return "TQ1_0";
    case dtype::TQ2_0:   return "TQ2_0";
    default:             return "UNKNOWN";
    }
}
// ═══════════════════════════════════════════════════════════════════════════════
// MEMORY — Aligned allocation
// ═══════════════════════════════════════════════════════════════════════════════
static constexpr size_t CACHE_LINE = 64;
// Cache-line-aligned allocation; returns nullptr on failure.
// Fixes: the posix_memalign result was ignored (ptr's value after a failed
// call is unspecified by POSIX), <cstdlib> was only reached transitively,
// and there was no Windows path (_WIN32 has no posix_memalign).
inline void* aligned_alloc(size_t size) {
    if (size == 0) size = 1;  // guarantee a distinct, freeable pointer for size 0
#if defined(_WIN32)
    return _aligned_malloc(size, CACHE_LINE);
#else
    void* ptr = nullptr;
    if (posix_memalign(&ptr, CACHE_LINE, size) != 0) return nullptr;
    return ptr;
#endif
}
// Release memory obtained from aligned_alloc (pairs with _aligned_malloc on
// Windows, which must not be freed with plain free()).
inline void aligned_free(void* ptr) {
#if defined(_WIN32)
    _aligned_free(ptr);
#else
    free(ptr);
#endif
}
// Minimal non-owning view over a contiguous array — a pre-C++20 stand-in for
// std::span. Performs no bounds checking; the caller owns the storage.
template<typename T>
struct span {
    T* data_ = nullptr;   // first element (borrowed, never freed here)
    size_t size_ = 0;     // element count
    span() = default;
    span(T* first, size_t count) : data_(first), size_(count) {}
    size_t size() const { return size_; }
    T* data() { return data_; }
    const T* data() const { return data_; }
    T& operator[](size_t idx) { return data_[idx]; }
    const T& operator[](size_t idx) const { return data_[idx]; }
};
// ═══════════════════════════════════════════════════════════════════════════════
// ARCHITECTURE TYPE
// ═══════════════════════════════════════════════════════════════════════════════
// Supported model architecture families. Selects the attention/FFN wiring
// and default activation (see Config).
enum class Architecture {
LLAMA, // Standard dense (Llama, Mistral)
QWEN2, // Qwen2 dense (DeepSeek-R1-Distill)
DEEPSEEK2, // DeepSeek V3 MoE + MLA (Kimi K2.5)
PHI3, // Phi-3 / Phi-3.5 (GELU activation)
GEMMA2, // Gemma 2 (GELU, sliding window)
STARCODER2, // StarCoder2 (code models)
COMMAND_R, // Cohere Command-R
};
// ═══════════════════════════════════════════════════════════════════════════════
// MODEL CONFIG — Extended for DeepSeek V3 MoE + MLA
// ═══════════════════════════════════════════════════════════════════════════════
// Model hyper-parameters (typically decoded from GGUF metadata).
// Covers standard dense transformers plus DeepSeek-V3-style MoE + MLA
// extensions and Gemma-2 specifics. Call compute_derived() after populating
// the metadata-loaded fields.
struct Config {
    Architecture arch = Architecture::LLAMA;
    Activation activation = Activation::SILU;
    // === Common ===
    int dim = 4096;                    // model/hidden width
    int n_layers = 32;
    int n_heads = 32;                  // attention (query) heads
    int n_kv_heads = 8;                // KV heads (< n_heads => grouped-query attention)
    int vocab_size = 32000;
    int max_seq_len = 4096;
    int sliding_window = 0;            // 0 = disabled, >0 = window size (Mistral, Gemma2)
    float attn_logit_softcap = 0.0f;   // Gemma-2: tanh cap on attention scores (0 = off)
    float final_logit_softcap = 0.0f;  // Gemma-2: tanh cap on final logits (0 = off)
    bool embed_scale_sqrt_dim = false; // Gemma: multiply embeddings by sqrt(dim)
    int head_dim = 128;                // per-head width; recomputed in compute_derived()
    int intermediate = 11008;          // dense-FFN hidden width
    float rope_theta = 10000.0f;       // RoPE base frequency
    float rms_norm_eps = 1e-5f;
    // === MLA (Multi-head Latent Attention) — DeepSeek V3 ===
    int q_lora_rank = 0;       // Compressed query rank (1536 for K2.5)
    int kv_lora_rank = 0;      // Compressed KV rank (512 for K2.5); >0 enables MLA
    int key_length = 0;        // Full key dim (576 = kv_lora_rank + rope_dim)
    int value_length = 0;      // Full value dim (512)
    int key_length_mla = 0;    // MLA key dim per head (192)
    int value_length_mla = 0;  // MLA value dim per head (128)
    int rope_dim = 0;          // RoPE dimension count (64)
    // === MoE (Mixture of Experts) — DeepSeek V3 ===
    int n_experts = 0;         // Total experts per MoE layer (384); >0 enables MoE
    int n_experts_used = 0;    // Active experts per token (8)
    int n_expert_shared = 0;   // Shared experts always active (1)
    int expert_ffn_dim = 0;    // Expert FFN width (2048)
    int n_dense_layers = 0;    // Leading dense layers before MoE (1)
    int n_expert_groups = 1;   // Expert groups
    int n_expert_groups_used = 1;
    int expert_gating_func = 0;        // Gating function type
    float expert_weights_scale = 1.0f;
    bool expert_weights_norm = false;
    // === RoPE Scaling (YaRN) ===
    float rope_scaling_factor = 1.0f;
    int rope_scaling_orig_ctx = 4096;
    float rope_yarn_beta_fast = 32.0f;
    float rope_yarn_beta_slow = 1.0f;
    float rope_yarn_log_mul = 0.1f;
    std::vector<int32_t> eos_tokens;   // Multiple EOS token IDs
    bool is_moe() const { return n_experts > 0; }
    bool is_mla() const { return kv_lora_rank > 0; }
    // Human-readable architecture name — covers every Architecture value.
    // Fix: print() previously reported PHI3/GEMMA2/STARCODER2/COMMAND_R
    // all as "Llama".
    const char* arch_name() const {
        switch (arch) {
        case Architecture::DEEPSEEK2:  return "DeepSeek V3 MoE+MLA";
        case Architecture::QWEN2:      return "Qwen2";
        case Architecture::PHI3:       return "Phi-3";
        case Architecture::GEMMA2:     return "Gemma 2";
        case Architecture::STARCODER2: return "StarCoder2";
        case Architecture::COMMAND_R:  return "Command-R";
        case Architecture::LLAMA:      break;
        }
        return "Llama";
    }
    // Fill in fields derivable from the metadata-loaded ones.
    void compute_derived() {
        // A model with no experts is fully dense: mark every layer dense.
        if (n_dense_layers == 0 && n_experts == 0) n_dense_layers = n_layers;
        if (dim > 0 && n_heads > 0) {
            head_dim = dim / n_heads;
        }
        // MLA: the full key width is the compressed-KV rank plus RoPE dims.
        if (kv_lora_rank > 0 && rope_dim > 0) {
            key_length = kv_lora_rank + rope_dim;
        }
    }
    // Dump the effective configuration to stdout.
    void print() const {
        auto p = [](const char* k, int v) { printf(" %-30s = %d\n", k, v); };
        auto pf = [](const char* k, float v) { printf(" %-30s = %.6f\n", k, v); };
        printf("=== Inference-X v6 Config ===\n");
        printf(" Architecture = %s\n", arch_name());
        p("dim", dim);
        p("n_layers", n_layers);
        p("n_heads", n_heads);
        p("n_kv_heads", n_kv_heads);
        p("vocab_size", vocab_size);
        p("max_seq_len", max_seq_len);
        p("head_dim", head_dim);
        p("intermediate", intermediate);
        pf("rope_theta", rope_theta);
        if (is_mla()) {
            printf("--- MLA ---\n");
            p("q_lora_rank", q_lora_rank);
            p("kv_lora_rank", kv_lora_rank);
            p("key_length", key_length);
            p("value_length", value_length);
            p("rope_dim", rope_dim);
        }
        if (is_moe()) {
            printf("--- MoE ---\n");
            p("n_experts", n_experts);
            p("n_experts_used", n_experts_used);
            p("n_expert_shared", n_expert_shared);
            p("expert_ffn_dim", expert_ffn_dim);
            p("n_dense_layers", n_dense_layers);
            pf("expert_weights_scale", expert_weights_scale);
        }
    }
};
} // namespace ix