// (scraper metadata removed: 536 lines · 21 KiB · C++)
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Z-Core Mathematical Foundation
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════

#pragma once

#define IX_ZCORE_FINGERPRINT 0x935E1DAD
#define IX_ZCORE_MARK "Inference-X-ZCore-935-Elmadani"

#include <cstdint>
#include <cstddef>
#include <cstdio>   // printf (Config::print)
#include <cstdlib>  // posix_memalign, free
#include <cstring>
#include <cmath>
#include <algorithm>
#include <string>
#include <vector>
#include <unordered_map>

#ifdef __AVX2__
#include <immintrin.h>
#endif

namespace ix {

// ═══════════════════════════════════════════════════════════════════════════════
// WATERMARK — SALKA ELMADANI SIGNATURE (do not modify)
// ═══════════════════════════════════════════════════════════════════════════════
namespace signature {
|
|
static constexpr double S0 = 5.999160064733103e+18; // "SALKA EL"
|
|
static constexpr double S1 = 5.566805661683622e+18; // "MADANI E"
|
|
static constexpr double S2 = 5.426309097159753e+18; // "LMADANI"
|
|
static constexpr double S3 = 4.991471925827590e+18; // "CREATOR"
|
|
|
|
inline bool verify() {
|
|
volatile double sum = S0 + S1 + S2 + S3;
|
|
return sum > 2.0e19;
|
|
}
|
|
|
|
inline float inject(float x) {
|
|
volatile double check = S0 * 1e-40;
|
|
return x * (1.0f + static_cast<float>(check - check));
|
|
}
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// HALF PRECISION TYPES
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
struct f16 {
|
|
uint16_t bits;
|
|
f16() : bits(0) {}
|
|
f16(float f) {
|
|
uint32_t u; std::memcpy(&u, &f, 4);
|
|
uint32_t s = (u >> 16) & 0x8000;
|
|
int e = ((u >> 23) & 0xFF) - 127 + 15;
|
|
uint32_t m = u & 0x7FFFFF;
|
|
if (e <= 0) bits = static_cast<uint16_t>(s);
|
|
else if (e >= 31) bits = static_cast<uint16_t>(s | 0x7C00);
|
|
else bits = static_cast<uint16_t>(s | (e << 10) | (m >> 13));
|
|
}
|
|
operator float() const {
|
|
uint32_t s = (bits & 0x8000) << 16;
|
|
uint32_t e = (bits >> 10) & 0x1F;
|
|
uint32_t m = bits & 0x3FF;
|
|
uint32_t u;
|
|
if (e == 0) { if (m) { int sh=0; while(!(m&0x400)){m<<=1;sh++;} m&=0x3FF; u=s|((113-sh)<<23)|(m<<13); } else u=s; }
|
|
else if (e == 31) u = s | 0x7F800000 | (m << 13);
|
|
else u = s | ((e - 15 + 127) << 23) | (m << 13);
|
|
float f; std::memcpy(&f, &u, 4);
|
|
return f;
|
|
}
|
|
static f16 from_bits(uint16_t b) { f16 h; h.bits = b; return h; }
|
|
};
|
|
|
|
enum class Activation {
|
|
SILU, // x * sigmoid(x) — Llama, Qwen, DeepSeek, Mistral
|
|
GELU, // GELU — Phi, Gemma, StarCoder
|
|
GELU_QUICK, // x * sigmoid(1.702 * x)
|
|
RELU_SQ, // ReLU²
|
|
};
|
|
|
|
struct bf16 {
|
|
uint16_t bits;
|
|
bf16() : bits(0) {}
|
|
bf16(float f) { uint32_t u; std::memcpy(&u, &f, 4); bits = static_cast<uint16_t>(u >> 16); }
|
|
operator float() const { uint32_t u = static_cast<uint32_t>(bits) << 16; float f; std::memcpy(&f, &u, 4); return f; }
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// TENSOR TYPE ENUM — Extended for IQ formats (Kimi K2.5)
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
enum class dtype : uint32_t {
|
|
F32 = 0,
|
|
F16 = 1,
|
|
Q4_0 = 2,
|
|
Q4_1 = 3,
|
|
// 4, 5 reserved
|
|
Q5_0 = 6,
|
|
Q5_1 = 7,
|
|
Q8_0 = 8,
|
|
Q8_1 = 9,
|
|
Q2_K = 10,
|
|
Q3_K = 11,
|
|
Q4_K = 12,
|
|
Q5_K = 13,
|
|
Q6_K = 14,
|
|
Q8_K = 15,
|
|
// === IQ FORMATS — Critical for Kimi K2.5 1.8-bit quant ===
|
|
IQ2_XXS = 16,
|
|
IQ2_XS = 17,
|
|
IQ3_XXS = 18,
|
|
IQ1_S = 19,
|
|
IQ4_NL = 20,
|
|
IQ3_S = 21, // was IQ2_M, corrected to GGML standard
|
|
IQ2_S = 22, // GGML standard
|
|
IQ4_XS = 23, // was IQ4_XS, corrected: GGML IQ4_XS
|
|
I8 = 24, // was IQ3_S (moved to 21)
|
|
I16 = 25, // GGML standard
|
|
I32 = 26, // was IQ2_S (moved to 22)
|
|
I64 = 27, // GGML standard
|
|
F64 = 28, // was IQ4_XS (moved to 23)
|
|
IQ1_M = 29,
|
|
BF16 = 30,
|
|
Q4_0_4x4 = 31,
|
|
Q4_0_4x8 = 32,
|
|
Q4_0_8x8 = 33,
|
|
TQ1_0 = 34,
|
|
TQ2_0 = 35,
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// QUANTIZATION BLOCK DEFINITIONS
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
static constexpr int QK_K = 256;
|
|
static constexpr int QK4_0 = 32;
|
|
constexpr int QK4_1 = 32;
|
|
constexpr int QK5_0 = 32;
|
|
constexpr int QK5_1 = 32;
|
|
constexpr int QK8_1 = 32;
|
|
static constexpr int QK8_0 = 32;
|
|
|
|
// Standard blocks
|
|
struct block_q4_K {
|
|
f16 d; f16 dmin;
|
|
uint8_t scales[12];
|
|
uint8_t qs[QK_K / 2];
|
|
};
|
|
|
|
struct block_q8_0 {
|
|
f16 d;
|
|
int8_t qs[32];
|
|
};
|
|
|
|
struct block_q6_K {
|
|
uint8_t ql[QK_K / 2];
|
|
uint8_t qh[QK_K / 4];
|
|
int8_t scales[QK_K / 16];
|
|
f16 d;
|
|
};
|
|
|
|
struct block_q4_0 {
|
|
f16 d;
|
|
uint8_t qs[QK4_0 / 2];
|
|
};
|
|
|
|
struct block_q2_K {
|
|
uint8_t scales[QK_K / 16];
|
|
uint8_t qs[QK_K / 4];
|
|
f16 d; f16 dmin;
|
|
};
|
|
|
|
struct block_q5_K {
|
|
f16 d; f16 dmin;
|
|
uint8_t scales[12];
|
|
uint8_t qh[QK_K / 8];
|
|
uint8_t qs[QK_K / 2];
|
|
};
|
|
|
|
struct block_q3_K {
|
|
uint8_t hmask[QK_K / 8];
|
|
uint8_t qs[QK_K / 4];
|
|
uint8_t scales[12];
|
|
f16 d;
|
|
};
|
|
|
|
struct block_q4_1 {
|
|
f16 d; f16 m;
|
|
uint8_t qs[QK4_1 / 2];
|
|
};
|
|
|
|
struct block_q5_0 {
|
|
f16 d;
|
|
uint8_t qh[4];
|
|
uint8_t qs[QK5_0 / 2];
|
|
};
|
|
|
|
struct block_q5_1 {
|
|
f16 d; f16 m;
|
|
uint8_t qh[4];
|
|
uint8_t qs[QK5_1 / 2];
|
|
};
|
|
|
|
struct block_q8_1 {
|
|
float d;
|
|
float s;
|
|
int8_t qs[QK8_1];
|
|
};
|
|
|
|
|
|
// Z-VERIFY: Block sizes must match GGUF binary format exactly
|
|
static_assert(sizeof(block_q4_K) == 144, "block_q4_K size mismatch!");
|
|
static_assert(sizeof(block_q8_0) == 34, "block_q8_0 size mismatch!");
|
|
static_assert(sizeof(block_q6_K) == 210, "block_q6_K size mismatch!");
|
|
static_assert(sizeof(block_q2_K) == 84, "block_q2_K size mismatch!");
|
|
static_assert(sizeof(block_q5_K) == 176, "block_q5_K size mismatch!");
|
|
static_assert(sizeof(block_q3_K) == 110, "block_q3_K size mismatch!");
|
|
static_assert(sizeof(block_q4_0) == 18, "block_q4_0 size mismatch!");
|
|
// === IQ BLOCKS — for Kimi K2.5 ultra-low-bit experts ===
|
|
|
|
// IQ1_S: ~1.56 bits/weight (256 weights per block)
|
|
struct block_iq1_s {
|
|
f16 d;
|
|
uint8_t qs[QK_K / 8];
|
|
uint16_t qh[QK_K / 32];
|
|
};
|
|
|
|
// IQ2_XXS: ~2.06 bits/weight (256 weights per block)
|
|
struct block_iq2_xxs {
|
|
f16 d;
|
|
uint16_t qs[QK_K / 8];
|
|
};
|
|
|
|
// IQ2_XS: ~2.31 bits/weight
|
|
struct block_iq2_xs {
|
|
f16 d;
|
|
uint16_t qs[QK_K / 8];
|
|
uint8_t scales[QK_K / 32];
|
|
};
|
|
|
|
// IQ2_S: ~2.5 bits/weight
|
|
struct block_iq2_s {
|
|
f16 d;
|
|
uint8_t qs[QK_K / 4];
|
|
uint8_t qh[QK_K / 32];
|
|
uint8_t scales[QK_K / 32];
|
|
};
|
|
|
|
// IQ3_XXS: ~3.06 bits/weight
|
|
struct block_iq3_xxs {
|
|
f16 d;
|
|
uint8_t qs[3 * QK_K / 8];
|
|
};
|
|
|
|
// IQ3_S: ~3.44 bits/weight
|
|
struct block_iq3_s {
|
|
f16 d;
|
|
uint8_t qs[QK_K / 4];
|
|
uint8_t qh[QK_K / 32];
|
|
uint8_t signs[QK_K / 8];
|
|
uint8_t scales[QK_K / 64];
|
|
};
|
|
|
|
// IQ4_NL: ~4.5 bits/weight (non-linear quantization)
|
|
struct block_iq4_nl {
|
|
f16 d;
|
|
uint8_t qs[QK4_0 / 2];
|
|
};
|
|
|
|
// IQ4_XS: ~4.25 bits/weight
|
|
struct block_iq4_xs {
|
|
f16 d;
|
|
uint16_t scales_h;
|
|
uint8_t scales_l[QK_K / 64];
|
|
uint8_t qs[QK_K / 2];
|
|
};
|
|
|
|
// TQ1_0: ternary 1.69 bits/weight
|
|
struct block_tq1_0 {
|
|
uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 48 bytes: 5 trits per byte (base-3)
|
|
uint8_t qh[QK_K / 64]; // 4 bytes: 4 trits per byte (2-bit)
|
|
f16 d;
|
|
};
|
|
|
|
// TQ2_0: ternary 2 bits/weight
|
|
struct block_tq2_0 {
|
|
uint8_t qs[QK_K / 4];
|
|
f16 d;
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// DTYPE UTILITIES
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
inline size_t dtype_size(dtype t) {
|
|
// PACKED sizes (no alignment padding) — must match GGUF on-disk layout
|
|
switch (t) {
|
|
case dtype::F32: return 4;
|
|
case dtype::F16: return 2;
|
|
case dtype::BF16: return 2;
|
|
case dtype::Q4_0: return 2 + 16; // 18 per 32
|
|
case dtype::Q4_K: return 2 + 2 + 12 + QK_K/2; // 144 per 256
|
|
case dtype::Q5_K: return 2 + 2 + 12 + QK_K/2 + QK_K/8; // 176 per 256
|
|
case dtype::Q6_K: return 2 + QK_K/2 + QK_K/4 + QK_K/16; // 210 per 256
|
|
case dtype::Q8_0: return 2 + 32; // 34 per 32
|
|
case dtype::Q2_K: return 2 + 2 + QK_K/16 + QK_K/4; // 84 per 256
|
|
case dtype::Q3_K: return 2 + QK_K/4 + QK_K/8 + 12; // 110 per 256
|
|
case dtype::IQ1_S: return 2 + QK_K/8 + QK_K/16; // 50 per 256
|
|
case dtype::IQ2_XXS: return 2 + QK_K/4; // 66 per 256
|
|
case dtype::IQ2_XS: return 2 + QK_K/4 + QK_K/32; // 74 per 256
|
|
case dtype::IQ2_S: return 2 + QK_K/4 + QK_K/16; // 82 per 256
|
|
case dtype::IQ4_XS: return 2 + 2 + QK_K/64 + QK_K/2; // 136 per 256
|
|
case dtype::IQ3_XXS: return 2 + 3*QK_K/8; // 98 per 256
|
|
case dtype::IQ3_S: return 2 + QK_K/4 + QK_K/8 + QK_K/32 + 4; // 110 per 256
|
|
case dtype::IQ4_NL: return 2 + 16; // 18 per 32
|
|
case dtype::TQ1_0: return 2 + 4*13; // 54 per 256
|
|
case dtype::TQ2_0: return 2 + QK_K/4; // 66 per 256
|
|
case dtype::I8: return 1;
|
|
case dtype::I16: return 2;
|
|
case dtype::I32: return 4;
|
|
case dtype::I64: return 8;
|
|
case dtype::F64: return 8;
|
|
default: return 1;
|
|
}
|
|
}
|
|
|
|
inline int dtype_block_size(dtype t) {
|
|
switch (t) {
|
|
// 256-element blocks
|
|
case dtype::Q4_K: case dtype::Q5_K: case dtype::Q6_K: case dtype::Q8_K:
|
|
case dtype::Q2_K: case dtype::Q3_K:
|
|
case dtype::IQ1_S: case dtype::IQ1_M:
|
|
case dtype::IQ2_XXS: case dtype::IQ2_XS: case dtype::IQ2_S: // case dtype::IQ2_M: // removed, not in GGML standard case dtype::IQ4_XS:
|
|
case dtype::IQ3_XXS: case dtype::IQ3_S:
|
|
case dtype::IQ4_XS:
|
|
case dtype::TQ1_0: case dtype::TQ2_0:
|
|
return QK_K;
|
|
// 32-element blocks
|
|
case dtype::Q4_0: case dtype::Q4_1: case dtype::Q5_0: case dtype::Q5_1:
|
|
case dtype::Q8_0: case dtype::Q8_1:
|
|
case dtype::IQ4_NL:
|
|
return 32;
|
|
// No blocking
|
|
default: return 1;
|
|
}
|
|
}
|
|
|
|
inline const char* dtype_name(dtype t) {
|
|
switch (t) {
|
|
case dtype::F32: return "F32";
|
|
case dtype::F16: return "F16";
|
|
case dtype::BF16: return "BF16";
|
|
case dtype::Q4_0: return "Q4_0";
|
|
case dtype::Q4_K: return "Q4_K";
|
|
case dtype::Q5_K: return "Q5_K";
|
|
case dtype::Q6_K: return "Q6_K";
|
|
case dtype::Q8_0: return "Q8_0";
|
|
case dtype::Q2_K: return "Q2_K";
|
|
case dtype::Q3_K: return "Q3_K";
|
|
case dtype::IQ1_S: return "IQ1_S";
|
|
case dtype::IQ2_XXS: return "IQ2_XXS";
|
|
case dtype::IQ2_XS: return "IQ2_XS";
|
|
case dtype::IQ2_S: return "IQ2_S";
|
|
// case dtype::IQ2_M: // removed, not in GGML standard return "IQ2_M";
|
|
case dtype::IQ4_XS: return "IQ4_XS";
|
|
case dtype::IQ3_XXS: return "IQ3_XXS";
|
|
case dtype::IQ3_S: return "IQ3_S";
|
|
case dtype::IQ4_NL: return "IQ4_NL";
|
|
case dtype::TQ1_0: return "TQ1_0";
|
|
case dtype::TQ2_0: return "TQ2_0";
|
|
default: return "UNKNOWN";
|
|
}
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// MEMORY — Aligned allocation
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
static constexpr size_t CACHE_LINE = 64;
|
|
|
|
inline void* aligned_alloc(size_t size) {
|
|
void* ptr = nullptr;
|
|
posix_memalign(&ptr, CACHE_LINE, size);
|
|
return ptr;
|
|
}
|
|
|
|
inline void aligned_free(void* ptr) { free(ptr); }
|
|
|
|
template<typename T>
|
|
struct span {
|
|
T* data_; size_t size_;
|
|
span() : data_(nullptr), size_(0) {}
|
|
span(T* d, size_t n) : data_(d), size_(n) {}
|
|
T* data() { return data_; }
|
|
const T* data() const { return data_; }
|
|
size_t size() const { return size_; }
|
|
T& operator[](size_t i) { return data_[i]; }
|
|
const T& operator[](size_t i) const { return data_[i]; }
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// ARCHITECTURE TYPE
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
enum class Architecture {
|
|
LLAMA, // Standard dense (Llama, Mistral)
|
|
QWEN2, // Qwen2 dense (DeepSeek-R1-Distill)
|
|
DEEPSEEK2, // DeepSeek V3 MoE + MLA (Kimi K2.5)
|
|
PHI3, // Phi-3 / Phi-3.5 (GELU activation)
|
|
GEMMA2, // Gemma 2 (GELU, sliding window)
|
|
STARCODER2, // StarCoder2 (code models)
|
|
COMMAND_R, // Cohere Command-R
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// MODEL CONFIG — Extended for DeepSeek V3 MoE + MLA
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
struct Config {
|
|
Architecture arch = Architecture::LLAMA;
|
|
Activation activation = Activation::SILU;
|
|
|
|
// === Common ===
|
|
int dim = 4096;
|
|
int n_layers = 32;
|
|
int n_heads = 32;
|
|
int n_kv_heads = 8;
|
|
int vocab_size = 32000;
|
|
int max_seq_len = 4096;
|
|
int sliding_window = 0; // 0 = disabled, >0 = window size (Mistral, Gemma2)
|
|
float attn_logit_softcap = 0.0f; // Gemma-2: tanh cap on attention scores
|
|
float final_logit_softcap = 0.0f; // Gemma-2: tanh cap on final logits
|
|
bool embed_scale_sqrt_dim = false; // Gemma: multiply embeddings by sqrt(dim)
|
|
int head_dim = 128;
|
|
int intermediate = 11008;
|
|
float rope_theta = 10000.0f;
|
|
float rms_norm_eps = 1e-5f;
|
|
|
|
// === MLA (Multi-head Latent Attention) — DeepSeek V3 ===
|
|
int q_lora_rank = 0; // Compressed query rank (1536 for K2.5)
|
|
int kv_lora_rank = 0; // Compressed KV rank (512 for K2.5)
|
|
int key_length = 0; // Full key dim (576 = kv_lora_rank + rope_dim)
|
|
int value_length = 0; // Full value dim (512)
|
|
int key_length_mla = 0; // MLA key dim per head (192)
|
|
int value_length_mla = 0; // MLA value dim per head (128)
|
|
int rope_dim = 0; // RoPE dimension count (64)
|
|
|
|
// === MoE (Mixture of Experts) — DeepSeek V3 ===
|
|
int n_experts = 0; // Total experts per MoE layer (384)
|
|
int n_experts_used = 0; // Active experts per token (8)
|
|
int n_expert_shared = 0; // Shared experts always active (1)
|
|
int expert_ffn_dim = 0; // Expert FFN width (2048)
|
|
int n_dense_layers = 0; // Leading dense layers before MoE (1)
|
|
int n_expert_groups = 1; // Expert groups
|
|
int n_expert_groups_used = 1;
|
|
int expert_gating_func = 0; // Gating function type
|
|
float expert_weights_scale = 1.0f;
|
|
bool expert_weights_norm = false;
|
|
|
|
// === RoPE Scaling (YaRN) ===
|
|
float rope_scaling_factor = 1.0f;
|
|
int rope_scaling_orig_ctx = 4096;
|
|
float rope_yarn_beta_fast = 32.0f;
|
|
float rope_yarn_beta_slow = 1.0f;
|
|
float rope_yarn_log_mul = 0.1f;
|
|
|
|
std::vector<int32_t> eos_tokens; // Multiple EOS token IDs
|
|
bool is_moe() const { return n_experts > 0; }
|
|
bool is_mla() const { return kv_lora_rank > 0; }
|
|
|
|
void compute_derived() {
|
|
// Fix: pure dense models need all layers marked dense
|
|
if (n_dense_layers == 0 && n_experts == 0) n_dense_layers = n_layers;
|
|
if (dim > 0 && n_heads > 0) {
|
|
head_dim = dim / n_heads;
|
|
}
|
|
if (kv_lora_rank > 0 && rope_dim > 0) {
|
|
key_length = kv_lora_rank + rope_dim;
|
|
}
|
|
}
|
|
|
|
void print() const {
|
|
auto p = [](const char* k, int v) { printf(" %-30s = %d\n", k, v); };
|
|
auto pf = [](const char* k, float v) { printf(" %-30s = %.6f\n", k, v); };
|
|
|
|
printf("=== Inference-X v6 Config ===\n");
|
|
printf(" Architecture = %s\n",
|
|
arch == Architecture::DEEPSEEK2 ? "DeepSeek V3 MoE+MLA" :
|
|
arch == Architecture::QWEN2 ? "Qwen2" : "Llama");
|
|
p("dim", dim);
|
|
p("n_layers", n_layers);
|
|
p("n_heads", n_heads);
|
|
p("n_kv_heads", n_kv_heads);
|
|
p("vocab_size", vocab_size);
|
|
p("max_seq_len", max_seq_len);
|
|
p("head_dim", head_dim);
|
|
p("intermediate", intermediate);
|
|
pf("rope_theta", rope_theta);
|
|
|
|
if (is_mla()) {
|
|
printf("--- MLA ---\n");
|
|
p("q_lora_rank", q_lora_rank);
|
|
p("kv_lora_rank", kv_lora_rank);
|
|
p("key_length", key_length);
|
|
p("value_length", value_length);
|
|
p("rope_dim", rope_dim);
|
|
}
|
|
|
|
if (is_moe()) {
|
|
printf("--- MoE ---\n");
|
|
p("n_experts", n_experts);
|
|
p("n_experts_used", n_experts_used);
|
|
p("n_expert_shared", n_expert_shared);
|
|
p("expert_ffn_dim", expert_ffn_dim);
|
|
p("n_dense_layers", n_dense_layers);
|
|
pf("expert_weights_scale", expert_weights_scale);
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace ix
|