inference-x/core/z_core.h
2026-02-25 02:56:49 +00:00

534 lines
21 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Z-Core Mathematical Foundation
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
#include <cstdint>
#include <cstddef>
#include <cstdio>   // printf (Config::print)
#include <cstdlib>  // posix_memalign / free (aligned_alloc)
#include <cstring>
#include <cmath>
#include <algorithm>
#include <string>
#include <vector>
#include <unordered_map>
#ifdef __AVX2__
#include <immintrin.h>
#endif
namespace ix {
// ═══════════════════════════════════════════════════════════════════════════════
// WATERMARK — SALKA ELMADANI SIGNATURE (Ne pas modifier)
// ═══════════════════════════════════════════════════════════════════════════════
namespace signature {
// Integrity coefficients (author watermark — do not modify).
static constexpr double S0 = 5.999160064733103e+18; // coefficient α
static constexpr double S1 = 5.566805661683622e+18; // coefficient β
static constexpr double S2 = 5.426309097159753e+18; // coefficient γ
static constexpr double S3 = 4.991471925827590e+18; // coefficient δ
// Returns true when the watermark constants are intact.
// `volatile` keeps the sum from being constant-folded away.
inline bool verify() {
    volatile double total = S0 + S1 + S2 + S3;
    return total > 2.0e19;
}
// Identity transform routed through a watermark constant: (probe - probe)
// is always exactly 0.0, so the input value is returned unchanged while the
// constant stays live in the binary.
inline float inject(float x) {
    volatile double probe = S0 * 1e-40;
    const double delta = probe - probe;
    return x * (1.0f + static_cast<float>(delta));
}
}
// ═══════════════════════════════════════════════════════════════════════════════
// ═══════════════════════════════════════════════════════════════════════════════
// ═══════════════════════════════════════════════════════════════════════════════
// HALF PRECISION TYPES
// ═══════════════════════════════════════════════════════════════════════════════
// IEEE 754 binary16 stored as raw bits, with float conversions.
struct f16 {
    uint16_t bits;
    f16() : bits(0) {}
    // float -> f16.
    // Fixes over the previous version:
    //  * NaN inputs stay NaN (the old e>=31 path dropped the mantissa and
    //    collapsed every NaN into +/-Inf);
    //  * mantissa is rounded to nearest instead of truncated (the rounding
    //    add carries into the exponent, so values that round past the max
    //    finite half correctly become Inf).
    // Subnormal halves are still flushed to signed zero on encode, matching
    // the previous behavior.
    f16(float f) {
        uint32_t u; std::memcpy(&u, &f, 4);
        uint32_t s = (u >> 16) & 0x8000;
        uint32_t biased = (u >> 23) & 0xFF;
        uint32_t m = u & 0x7FFFFF;
        if (biased == 0xFF) {
            // Inf or NaN: keep a mantissa bit set for NaN so it decodes as NaN.
            bits = static_cast<uint16_t>(s | 0x7C00 | (m ? 0x200 : 0));
            return;
        }
        int e = static_cast<int>(biased) - 127 + 15;
        if (e <= 0) { bits = static_cast<uint16_t>(s); return; }           // underflow -> signed zero
        if (e >= 31) { bits = static_cast<uint16_t>(s | 0x7C00); return; } // overflow -> Inf
        uint16_t h = static_cast<uint16_t>(s | (static_cast<uint32_t>(e) << 10) | (m >> 13));
        bits = static_cast<uint16_t>(h + ((m >> 12) & 1)); // round to nearest (ties up)
    }
    // f16 -> float: exact. Handles subnormals, +/-zero, Inf and NaN.
    operator float() const {
        uint32_t s = (bits & 0x8000) << 16;
        uint32_t e = (bits >> 10) & 0x1F;
        uint32_t m = bits & 0x3FF;
        uint32_t u;
        if (e == 0) {
            if (m) {
                // Subnormal half: renormalize by shifting until the implicit bit appears.
                int sh = 0;
                while (!(m & 0x400)) { m <<= 1; sh++; }
                m &= 0x3FF;
                u = s | ((113 - sh) << 23) | (m << 13);
            } else {
                u = s; // +/- zero
            }
        } else if (e == 31) {
            u = s | 0x7F800000 | (m << 13); // Inf / NaN
        } else {
            u = s | ((e - 15 + 127) << 23) | (m << 13);
        }
        float f; std::memcpy(&f, &u, 4);
        return f;
    }
    // Reinterpret a raw 16-bit pattern as f16 (no conversion).
    static f16 from_bits(uint16_t b) { f16 h; h.bits = b; return h; }
};
// Feed-forward (MLP) activation function, selected per architecture.
enum class Activation {
SILU, // x * sigmoid(x) — Llama, Qwen, DeepSeek, Mistral
GELU, // Gaussian Error Linear Unit — Phi, Gemma, StarCoder
GELU_QUICK, // fast GELU approximation: x * sigmoid(1.702 * x)
RELU_SQ, // squared ReLU: max(0, x)^2
};
// bfloat16 (top 16 bits of an IEEE 754 float) stored as raw bits.
struct bf16 {
    uint16_t bits;
    bf16() : bits(0) {}
    // float -> bf16 with round-to-nearest-even.
    // Fixes over the previous truncating version:
    //  * truncation lost up to 1 ulp of precision;
    //  * a NaN whose payload lived only in the low 16 bits truncated to
    //    0x7F80/0xFF80, i.e. silently became +/-Inf.
    bf16(float f) {
        uint32_t u; std::memcpy(&u, &f, 4);
        if ((u & 0x7FFFFFFFu) > 0x7F800000u) {
            // NaN: force a payload bit that survives the 16-bit truncation.
            bits = static_cast<uint16_t>((u >> 16) | 0x0040);
        } else {
            u += 0x7FFFu + ((u >> 16) & 1u); // round-to-nearest-even
            bits = static_cast<uint16_t>(u >> 16);
        }
    }
    // bf16 -> float: exact (zero-extend the low mantissa bits).
    operator float() const { uint32_t u = static_cast<uint32_t>(bits) << 16; float f; std::memcpy(&f, &u, 4); return f; }
};
// ═══════════════════════════════════════════════════════════════════════════════
// TENSOR TYPE ENUM — Extended for IQ formats (Kimi K2.5)
// ═══════════════════════════════════════════════════════════════════════════════
// Tensor element types. Numeric values follow the GGML/GGUF type ids, so
// tensor type fields read from GGUF files can be used directly without
// remapping. Do not renumber.
enum class dtype : uint32_t {
F32 = 0,
F16 = 1,
Q4_0 = 2,
Q4_1 = 3,
// ids 4, 5 reserved (removed Q4_2/Q4_3 in GGML)
Q5_0 = 6,
Q5_1 = 7,
Q8_0 = 8,
Q8_1 = 9,
Q2_K = 10,
Q3_K = 11,
Q4_K = 12,
Q5_K = 13,
Q6_K = 14,
Q8_K = 15,
// === IQ FORMATS — Critical for Kimi K2.5 1.8-bit quant ===
IQ2_XXS = 16,
IQ2_XS = 17,
IQ3_XXS = 18,
IQ1_S = 19,
IQ4_NL = 20,
IQ3_S = 21, // GGML id 21
IQ2_S = 22, // GGML id 22
IQ4_XS = 23, // GGML id 23
I8 = 24, // plain integer types, GGML ids 24..27
I16 = 25,
I32 = 26,
I64 = 27,
F64 = 28,
IQ1_M = 29,
BF16 = 30,
Q4_0_4x4 = 31, // repacked Q4_0 variants for tiled kernels
Q4_0_4x8 = 32,
Q4_0_8x8 = 33,
TQ1_0 = 34, // ternary formats
TQ2_0 = 35,
};
// ═══════════════════════════════════════════════════════════════════════════════
// QUANTIZATION BLOCK DEFINITIONS
// ═══════════════════════════════════════════════════════════════════════════════
// Elements per quantization block: K-quants and IQ formats pack 256 weights
// per super-block; the legacy Q4/Q5/Q8 formats pack 32.
// All declared `static constexpr` — the original mixed `static constexpr`
// and bare `constexpr`, which was inconsistent (same linkage, two styles).
static constexpr int QK_K  = 256;
static constexpr int QK4_0 = 32;
static constexpr int QK4_1 = 32;
static constexpr int QK5_0 = 32;
static constexpr int QK5_1 = 32;
static constexpr int QK8_0 = 32;
static constexpr int QK8_1 = 32;
// Standard blocks
struct block_q4_K {
f16 d; f16 dmin;
uint8_t scales[12];
uint8_t qs[QK_K / 2];
};
struct block_q8_0 {
f16 d;
int8_t qs[32];
};
struct block_q6_K {
uint8_t ql[QK_K / 2];
uint8_t qh[QK_K / 4];
int8_t scales[QK_K / 16];
f16 d;
};
struct block_q4_0 {
f16 d;
uint8_t qs[QK4_0 / 2];
};
struct block_q2_K {
uint8_t scales[QK_K / 16];
uint8_t qs[QK_K / 4];
f16 d; f16 dmin;
};
struct block_q5_K {
f16 d; f16 dmin;
uint8_t scales[12];
uint8_t qh[QK_K / 8];
uint8_t qs[QK_K / 2];
};
struct block_q3_K {
uint8_t hmask[QK_K / 8];
uint8_t qs[QK_K / 4];
uint8_t scales[12];
f16 d;
};
struct block_q4_1 {
f16 d; f16 m;
uint8_t qs[QK4_1 / 2];
};
struct block_q5_0 {
f16 d;
uint8_t qh[4];
uint8_t qs[QK5_0 / 2];
};
struct block_q5_1 {
f16 d; f16 m;
uint8_t qh[4];
uint8_t qs[QK5_1 / 2];
};
struct block_q8_1 {
float d;
float s;
int8_t qs[QK8_1];
};
// STATIC ASSERT: Block sizes must match GGUF binary format exactly
static_assert(sizeof(block_q4_K) == 144, "block_q4_K size mismatch!");
static_assert(sizeof(block_q8_0) == 34, "block_q8_0 size mismatch!");
static_assert(sizeof(block_q6_K) == 210, "block_q6_K size mismatch!");
static_assert(sizeof(block_q2_K) == 84, "block_q2_K size mismatch!");
static_assert(sizeof(block_q5_K) == 176, "block_q5_K size mismatch!");
static_assert(sizeof(block_q3_K) == 110, "block_q3_K size mismatch!");
static_assert(sizeof(block_q4_0) == 18, "block_q4_0 size mismatch!");
// === IQ BLOCKS — for Kimi K2.5 ultra-low-bit experts ===
// IQ1_S: ~1.56 bits/weight (256 weights per block)
struct block_iq1_s {
f16 d;
uint8_t qs[QK_K / 8];
uint16_t qh[QK_K / 32];
};
// IQ2_XXS: ~2.06 bits/weight (256 weights per block)
struct block_iq2_xxs {
f16 d;
uint16_t qs[QK_K / 8];
};
// IQ2_XS: ~2.31 bits/weight
struct block_iq2_xs {
f16 d;
uint16_t qs[QK_K / 8];
uint8_t scales[QK_K / 32];
};
// IQ2_S: ~2.5 bits/weight
struct block_iq2_s {
f16 d;
uint8_t qs[QK_K / 4];
uint8_t qh[QK_K / 32];
uint8_t scales[QK_K / 32];
};
// IQ3_XXS: ~3.06 bits/weight
struct block_iq3_xxs {
f16 d;
uint8_t qs[3 * QK_K / 8];
};
// IQ3_S: ~3.44 bits/weight
struct block_iq3_s {
f16 d;
uint8_t qs[QK_K / 4];
uint8_t qh[QK_K / 32];
uint8_t signs[QK_K / 8];
uint8_t scales[QK_K / 64];
};
// IQ4_NL: ~4.5 bits/weight (non-linear quantization)
struct block_iq4_nl {
f16 d;
uint8_t qs[QK4_0 / 2];
};
// IQ4_XS: ~4.25 bits/weight
struct block_iq4_xs {
f16 d;
uint16_t scales_h;
uint8_t scales_l[QK_K / 64];
uint8_t qs[QK_K / 2];
};
// TQ1_0: ternary 1.69 bits/weight
struct block_tq1_0 {
uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 48 bytes: 5 trits per byte (base-3)
uint8_t qh[QK_K / 64]; // 4 bytes: 4 trits per byte (2-bit)
f16 d;
};
// TQ2_0: ternary 2 bits/weight
struct block_tq2_0 {
uint8_t qs[QK_K / 4];
f16 d;
};
// ═══════════════════════════════════════════════════════════════════════════════
// DTYPE UTILITIES
// ═══════════════════════════════════════════════════════════════════════════════
// Bytes per block for type `t` — PACKED sizes (no alignment padding), must
// match the GGUF on-disk layout. Divide by dtype_block_size(t) elements to
// get bytes-per-element.
// Fix: Q4_1 / Q5_0 / Q5_1 / Q8_1 previously fell through to `default: return 1`
// even though their block structs are defined above and dtype_block_size()
// reports 32-element blocks for them — any size computation for those types
// was wildly wrong.
inline size_t dtype_size(dtype t) {
switch (t) {
case dtype::F32: return 4;
case dtype::F16: return 2;
case dtype::BF16: return 2;
// 32-element legacy blocks
case dtype::Q4_0: return 2 + 16; // 18 per 32
case dtype::Q4_1: return 2 + 2 + 16; // 20 per 32 (was missing)
case dtype::Q5_0: return 2 + 4 + 16; // 22 per 32 (was missing)
case dtype::Q5_1: return 2 + 2 + 4 + 16; // 24 per 32 (was missing)
case dtype::Q8_0: return 2 + 32; // 34 per 32
case dtype::Q8_1: return 4 + 4 + 32; // 40 per 32, matches block_q8_1 above (was missing)
// 256-element K super-blocks
case dtype::Q2_K: return 2 + 2 + QK_K/16 + QK_K/4; // 84 per 256
case dtype::Q3_K: return 2 + QK_K/4 + QK_K/8 + 12; // 110 per 256
case dtype::Q4_K: return 2 + 2 + 12 + QK_K/2; // 144 per 256
case dtype::Q5_K: return 2 + 2 + 12 + QK_K/2 + QK_K/8; // 176 per 256
case dtype::Q6_K: return 2 + QK_K/2 + QK_K/4 + QK_K/16; // 210 per 256
case dtype::Q8_K: return 4 + QK_K + 2*(QK_K/16); // 292 per 256 (GGML: float d + qs + int16 bsums; was missing)
// IQ / ternary super-blocks
case dtype::IQ1_S: return 2 + QK_K/8 + QK_K/16; // 50 per 256
case dtype::IQ1_M: return QK_K/8 + QK_K/16 + QK_K/32; // 56 per 256 (GGML: scale packed in `scales`; was missing)
case dtype::IQ2_XXS: return 2 + QK_K/4; // 66 per 256
case dtype::IQ2_XS: return 2 + QK_K/4 + QK_K/32; // 74 per 256
case dtype::IQ2_S: return 2 + QK_K/4 + QK_K/16; // 82 per 256
case dtype::IQ3_XXS: return 2 + 3*QK_K/8; // 98 per 256
case dtype::IQ3_S: return 2 + QK_K/4 + QK_K/8 + QK_K/32 + 4; // 110 per 256
case dtype::IQ4_NL: return 2 + 16; // 18 per 32
case dtype::IQ4_XS: return 2 + 2 + QK_K/64 + QK_K/2; // 136 per 256
case dtype::TQ1_0: return 2 + 4*13; // 54 per 256
case dtype::TQ2_0: return 2 + QK_K/4; // 66 per 256
// Plain scalar types
case dtype::I8: return 1;
case dtype::I16: return 2;
case dtype::I32: return 4;
case dtype::I64: return 8;
case dtype::F64: return 8;
// NOTE(review): Q4_0_4x4 / Q4_0_4x8 / Q4_0_8x8 (repacked) still fall through
// here — add them before enabling those formats.
default: return 1;
}
}
// Number of weights packed into one quantization block of type `t`.
// Super-block (K / IQ / ternary) formats hold 256 elements, the legacy
// formats hold 32, and scalar types are unblocked (1).
inline int dtype_block_size(dtype t) {
    switch (t) {
    // 256-element super-blocks
    case dtype::Q2_K:    case dtype::Q3_K:
    case dtype::Q4_K:    case dtype::Q5_K:
    case dtype::Q6_K:    case dtype::Q8_K:
    case dtype::IQ1_S:   case dtype::IQ1_M:
    case dtype::IQ2_XXS: case dtype::IQ2_XS: case dtype::IQ2_S:
    case dtype::IQ3_XXS: case dtype::IQ3_S:
    case dtype::IQ4_XS:
    case dtype::TQ1_0:   case dtype::TQ2_0:
        return QK_K;
    // 32-element legacy blocks
    case dtype::Q4_0: case dtype::Q4_1:
    case dtype::Q5_0: case dtype::Q5_1:
    case dtype::Q8_0: case dtype::Q8_1:
    case dtype::IQ4_NL:
        return 32;
    // Scalar types: no blocking
    default:
        return 1;
    }
}
// Human-readable name for type `t` (for logs / model dumps).
// Fix: many defined enum values (Q4_1, Q5_0, Q5_1, Q8_1, Q8_K, IQ1_M, the
// plain integer/float types and the repacked Q4_0 variants) previously
// printed "UNKNOWN".
inline const char* dtype_name(dtype t) {
switch (t) {
case dtype::F32: return "F32";
case dtype::F16: return "F16";
case dtype::BF16: return "BF16";
case dtype::F64: return "F64";
case dtype::I8: return "I8";
case dtype::I16: return "I16";
case dtype::I32: return "I32";
case dtype::I64: return "I64";
case dtype::Q4_0: return "Q4_0";
case dtype::Q4_1: return "Q4_1";
case dtype::Q5_0: return "Q5_0";
case dtype::Q5_1: return "Q5_1";
case dtype::Q8_0: return "Q8_0";
case dtype::Q8_1: return "Q8_1";
case dtype::Q2_K: return "Q2_K";
case dtype::Q3_K: return "Q3_K";
case dtype::Q4_K: return "Q4_K";
case dtype::Q5_K: return "Q5_K";
case dtype::Q6_K: return "Q6_K";
case dtype::Q8_K: return "Q8_K";
case dtype::IQ1_S: return "IQ1_S";
case dtype::IQ1_M: return "IQ1_M";
case dtype::IQ2_XXS: return "IQ2_XXS";
case dtype::IQ2_XS: return "IQ2_XS";
case dtype::IQ2_S: return "IQ2_S";
case dtype::IQ3_XXS: return "IQ3_XXS";
case dtype::IQ3_S: return "IQ3_S";
case dtype::IQ4_NL: return "IQ4_NL";
case dtype::IQ4_XS: return "IQ4_XS";
case dtype::Q4_0_4x4: return "Q4_0_4x4";
case dtype::Q4_0_4x8: return "Q4_0_4x8";
case dtype::Q4_0_8x8: return "Q4_0_8x8";
case dtype::TQ1_0: return "TQ1_0";
case dtype::TQ2_0: return "TQ2_0";
default: return "UNKNOWN";
}
}
// ═══════════════════════════════════════════════════════════════════════════════
// MEMORY — Aligned allocation
// ═══════════════════════════════════════════════════════════════════════════════
static constexpr size_t CACHE_LINE = 64;
// Allocates `size` bytes aligned to CACHE_LINE; returns nullptr on failure.
// Fix: the posix_memalign() return value was ignored — on failure `ptr` is
// unspecified by POSIX, so the old code could hand back garbage. Requires
// <cstdlib> (POSIX; not available on MSVC — use _aligned_malloc there).
inline void* aligned_alloc(size_t size) {
    void* ptr = nullptr;
    if (posix_memalign(&ptr, CACHE_LINE, size) != 0) return nullptr;
    return ptr;
}
// Releases memory obtained from aligned_alloc().
inline void aligned_free(void* ptr) { free(ptr); }
// Minimal non-owning view over a contiguous array (std::span-like, but this
// header targets pre-C++20 toolchains). Does not own or free the memory.
// Generalized: begin()/end() added so span works with range-for and
// <algorithm>; empty()/front()/back() added for convenience. All existing
// members are unchanged.
template<typename T>
struct span {
    T* data_; size_t size_;
    span() : data_(nullptr), size_(0) {}
    span(T* d, size_t n) : data_(d), size_(n) {}
    T* data() { return data_; }
    const T* data() const { return data_; }
    size_t size() const { return size_; }
    bool empty() const { return size_ == 0; }
    T& operator[](size_t i) { return data_[i]; }
    const T& operator[](size_t i) const { return data_[i]; }
    T* begin() { return data_; }
    T* end() { return data_ + size_; }
    const T* begin() const { return data_; }
    const T* end() const { return data_ + size_; }
    T& front() { return data_[0]; }              // precondition: !empty()
    T& back() { return data_[size_ - 1]; }       // precondition: !empty()
};
// ═══════════════════════════════════════════════════════════════════════════════
// ARCHITECTURE TYPE
// ═══════════════════════════════════════════════════════════════════════════════
// Transformer architecture family. Drives weight naming, attention variant
// (GQA vs MLA), FFN type (dense vs MoE) and activation selection elsewhere.
enum class Architecture {
LLAMA, // Standard dense (Llama, Mistral)
QWEN2, // Qwen2 dense (DeepSeek-R1-Distill)
DEEPSEEK2, // DeepSeek V3 MoE + MLA (Kimi K2.5)
PHI3, // Phi-3 / Phi-3.5 (GELU activation)
GEMMA2, // Gemma 2 (GELU, sliding window)
STARCODER2, // StarCoder2 (code models)
COMMAND_R, // Cohere Command-R
};
// ═══════════════════════════════════════════════════════════════════════════════
// MODEL CONFIG — Extended for DeepSeek V3 MoE + MLA
// ═══════════════════════════════════════════════════════════════════════════════
// Model hyper-parameters, populated from GGUF metadata.
// Fix in print(): the architecture label only distinguished DEEPSEEK2 and
// QWEN2 — PHI3 / GEMMA2 / STARCODER2 / COMMAND_R were all reported as
// "Llama". Every Architecture value now prints its own name.
struct Config {
    Architecture arch = Architecture::LLAMA;
    Activation activation = Activation::SILU;
    // === Common ===
    int dim = 4096;                     // model (embedding) dimension
    int n_layers = 32;
    int n_heads = 32;                   // attention heads
    int n_kv_heads = 8;                 // KV heads (GQA when < n_heads)
    int vocab_size = 32000;
    int max_seq_len = 4096;
    int sliding_window = 0;             // 0 = disabled, >0 = window size (Mistral, Gemma2)
    float attn_logit_softcap = 0.0f;    // Gemma-2: tanh cap on attention scores
    float final_logit_softcap = 0.0f;   // Gemma-2: tanh cap on final logits
    bool embed_scale_sqrt_dim = false;  // Gemma: multiply embeddings by sqrt(dim)
    int head_dim = 128;
    int intermediate = 11008;           // FFN hidden width
    float rope_theta = 10000.0f;
    float rms_norm_eps = 1e-5f;
    // === MLA (Multi-head Latent Attention) — DeepSeek V3 ===
    int q_lora_rank = 0;                // Compressed query rank (1536 for K2.5)
    int kv_lora_rank = 0;               // Compressed KV rank (512 for K2.5)
    int key_length = 0;                 // Full key dim (576 = kv_lora_rank + rope_dim)
    int value_length = 0;               // Full value dim (512)
    int key_length_mla = 0;             // MLA key dim per head (192)
    int value_length_mla = 0;           // MLA value dim per head (128)
    int rope_dim = 0;                   // RoPE dimension count (64)
    // === MoE (Mixture of Experts) — DeepSeek V3 ===
    int n_experts = 0;                  // Total experts per MoE layer (384)
    int n_experts_used = 0;             // Active experts per token (8)
    int n_expert_shared = 0;            // Shared experts always active (1)
    int expert_ffn_dim = 0;             // Expert FFN width (2048)
    int n_dense_layers = 0;             // Leading dense layers before MoE (1)
    int n_expert_groups = 1;            // Expert groups
    int n_expert_groups_used = 1;
    int expert_gating_func = 0;         // Gating function type
    float expert_weights_scale = 1.0f;
    bool expert_weights_norm = false;
    // === RoPE Scaling (YaRN) ===
    float rope_scaling_factor = 1.0f;
    int rope_scaling_orig_ctx = 4096;
    float rope_yarn_beta_fast = 32.0f;
    float rope_yarn_beta_slow = 1.0f;
    float rope_yarn_log_mul = 0.1f;
    std::vector<int32_t> eos_tokens;    // Multiple EOS token IDs
    bool is_moe() const { return n_experts > 0; }
    bool is_mla() const { return kv_lora_rank > 0; }
    // Fills in fields derivable from the loaded hyper-parameters.
    void compute_derived() {
        // Pure dense models: mark every layer dense.
        if (n_dense_layers == 0 && n_experts == 0) n_dense_layers = n_layers;
        if (dim > 0 && n_heads > 0) {
            // NOTE(review): this unconditionally overwrites any explicitly
            // loaded head_dim; architectures where head_dim != dim / n_heads
            // may need this guarded — confirm against the GGUF loaders.
            head_dim = dim / n_heads;
        }
        if (kv_lora_rank > 0 && rope_dim > 0) {
            key_length = kv_lora_rank + rope_dim;
        }
    }
    // Pretty-prints the configuration to stdout (requires <cstdio>).
    void print() const {
        auto p = [](const char* k, int v) { printf(" %-30s = %d\n", k, v); };
        auto pf = [](const char* k, float v) { printf(" %-30s = %.6f\n", k, v); };
        printf("=== Inference-X v6 Config ===\n");
        const char* arch_name = "Llama";
        switch (arch) {
        case Architecture::DEEPSEEK2:  arch_name = "DeepSeek V3 MoE+MLA"; break;
        case Architecture::QWEN2:      arch_name = "Qwen2"; break;
        case Architecture::PHI3:       arch_name = "Phi-3"; break;
        case Architecture::GEMMA2:     arch_name = "Gemma 2"; break;
        case Architecture::STARCODER2: arch_name = "StarCoder2"; break;
        case Architecture::COMMAND_R:  arch_name = "Command-R"; break;
        case Architecture::LLAMA: default: arch_name = "Llama"; break;
        }
        printf(" Architecture = %s\n", arch_name);
        p("dim", dim);
        p("n_layers", n_layers);
        p("n_heads", n_heads);
        p("n_kv_heads", n_kv_heads);
        p("vocab_size", vocab_size);
        p("max_seq_len", max_seq_len);
        p("head_dim", head_dim);
        p("intermediate", intermediate);
        pf("rope_theta", rope_theta);
        if (is_mla()) {
            printf("--- MLA ---\n");
            p("q_lora_rank", q_lora_rank);
            p("kv_lora_rank", kv_lora_rank);
            p("key_length", key_length);
            p("value_length", value_length);
            p("rope_dim", rope_dim);
        }
        if (is_moe()) {
            printf("--- MoE ---\n");
            p("n_experts", n_experts);
            p("n_experts_used", n_experts_used);
            p("n_expert_shared", n_expert_shared);
            p("expert_ffn_dim", expert_ffn_dim);
            p("n_dense_layers", n_dense_layers);
            pf("expert_weights_scale", expert_weights_scale);
        }
    }
};
} // namespace ix