// ═══════════════════════════════════════════════════════════════════════════════ // INFERENCE-X — Mixture-of-Experts + Multi-Latent Attention // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // Licensed under the Business Source License 1.1 (BSL-1.1) // See LICENSE file for full terms. // // INTELLECTUAL PROPERTY PROTECTION: // - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026) // - GitHub: github.com/ElmadaniS/inference-x // - Author: Salka Elmadani | Morocco | Morocco // // MANUFACTURER NOTICE: Any manufacturer, company, or entity that // incorporates, embeds, distributes, or commercially uses Inference-X // or any derivative work without explicit written authorization from // the copyright holder is in violation of BSL-1.1 and applicable // intellectual property laws. This includes but is not limited to: // hardware vendors, cloud providers, SaaS platforms, and OEMs. // // Contact: Elmadani.SALKA@proton.me for licensing. // ═══════════════════════════════════════════════════════════════════════════════ #pragma once // Inference-X MoE+MLA — Salka Elmadani — Morocco #define IX_MOE_FINGERPRINT "935-ELMADANI-MOE" #include "../core/z_core.h" #include "kernels.h" #include "gemm.h" #include #include #include #include #include namespace ix { // ═══════════════════════════════════════════════════════════════════════════════ // MLA KV CACHE — Compressed latent space (3x more efficient than GQA) // Instead of storing full K,V per head, store compressed kv_lora_rank vectors // + rope_dim separate RoPE keys // ═══════════════════════════════════════════════════════════════════════════════ class MLAKVCache { public: void init(const Config& cfg, int max_ctx = 4096) { n_layers_ = cfg.n_layers; kv_lora_rank_ = cfg.kv_lora_rank; // 512 rope_dim_ = cfg.rope_dim; // 64 max_seq_len_ = max_ctx; // Per layer per position: kv_lora_rank (compressed KV) + rope_dim (RoPE keys) int per_pos = kv_lora_rank_ + rope_dim_; size_t total = (size_t)n_layers_ * max_seq_len_ * per_pos; data_.resize(total, 0.0f); pos_ = 0; } void clear() { pos_ = 0; std::fill(data_.begin(), data_.end(), 0.0f); } // Store compressed KV latent for this layer and position float* kv_latent(int layer, int pos) { return data_.data() + layer_offset(layer) + pos * (kv_lora_rank_ + rope_dim_); } // Get compressed KV latent (read) const float* kv_latent(int layer, int pos) const { return data_.data() + layer_offset(layer) + pos * (kv_lora_rank_ + rope_dim_); } // Just the compressed KV part (first kv_lora_rank elements) const float* kv_compressed(int layer, int pos) const { return kv_latent(layer, pos); } // Just the RoPE key part (last rope_dim elements) const float* rope_key(int layer, int pos) const { return kv_latent(layer, pos) + kv_lora_rank_; } float* rope_key_mut(int layer, int pos) { return const_cast(rope_key(layer, pos)); } void advance() { ++pos_; } int position() const { return pos_; } int max_seq_len() const { return max_seq_len_; } int kv_lora_rank() const { return kv_lora_rank_; } int rope_dim() const { return rope_dim_; } size_t memory_bytes() const { return data_.size() * sizeof(float); } private: std::vector data_; int n_layers_ = 0; int kv_lora_rank_ = 512; int rope_dim_ = 64; int max_seq_len_ = 4096; int pos_ = 0; size_t layer_offset(int layer) const { return (size_t)layer * max_seq_len_ * (kv_lora_rank_ + rope_dim_); } }; // ═══════════════════════════════════════════════════════════════════════════════ // MLA ROPE — RoPE only on rope_dim dimensions (64), not full head_dim // ═══════════════════════════════════════════════════════════════════════════════ class MLARoPE { public: void init(int rope_dim, int max_seq_len, float theta, float scaling_factor = 1.0f) { rope_dim_ = rope_dim; max_seq_len_ = max_seq_len; half_dim_ = rope_dim / 2; cos_.resize(max_seq_len * half_dim_); sin_.resize(max_seq_len * half_dim_); for (int pos = 0; pos < max_seq_len; ++pos) { float adjusted_pos = pos; // YaRN scaling could adjust this for (int i = 0; i < half_dim_; ++i) { float freq = 1.0f / std::pow(theta, 2.0f * i / rope_dim); if (scaling_factor > 1.0f) freq /= scaling_factor; float angle = adjusted_pos * freq; cos_[pos * half_dim_ + i] = std::cos(angle); sin_[pos * half_dim_ + i] = std::sin(angle); } } } // Apply RoPE to a vector of rope_dim dimensions at given position void apply(float* x, int pos) const { const float* c = cos_.data() + pos * half_dim_; const float* s = sin_.data() + pos * half_dim_; for (int i = 0; i < half_dim_; ++i) { float x0 = x[i]; float x1 = x[i + half_dim_]; x[i] = x0 * c[i] - x1 * s[i]; x[i + half_dim_] = x0 * s[i] + x1 * c[i]; } } // Apply to N heads' RoPE portion (stride = head_key_len) void apply_heads(float* q_rope, int n_heads, int stride, int pos) const { for (int h = 0; h < n_heads; ++h) { apply(q_rope + h * stride, pos); } } private: int rope_dim_ = 64; int half_dim_ = 32; int max_seq_len_ = 4096; std::vector cos_, sin_; }; // ═══════════════════════════════════════════════════════════════════════════════ // MLA ATTENTION — Multi-head Latent Attention (DeepSeek V3) // // Flow: // Q path: x → q_a_proj → q_a_norm → q_b_proj → [q_nope | q_rope] per head // KV path: x → kv_a_proj → kv_a_norm → {k_b_proj, v_b_proj} // Cache: store compressed kv_a (kv_lora_rank) + k_rope (rope_dim) // Attention: standard scaled dot-product with decoupled RoPE // ═══════════════════════════════════════════════════════════════════════════════ class MLAAttention { public: void init(const Config& cfg, int max_ctx = 4096) { dim_ = cfg.dim; n_heads_ = cfg.n_heads; q_lora_rank_ = cfg.q_lora_rank; kv_lora_rank_ = cfg.kv_lora_rank; rope_dim_ = cfg.rope_dim; key_len_mla_ = cfg.key_length_mla; // 192 per head val_len_mla_ = cfg.value_length_mla; // 128 per head nope_head_dim_ = key_len_mla_ - rope_dim_; // 128 max_ctx_ = max_ctx; // Scratch buffers q_a_.resize(q_lora_rank_); // compressed Q q_b_.resize(n_heads_ * key_len_mla_); // decompressed Q [nope|rope] per head kv_a_.resize(kv_lora_rank_ + rope_dim_); // compressed KV + rope key attn_out_.resize(n_heads_ * val_len_mla_); // attention output per head scores_.resize(n_heads_ * max_ctx); // attention scores // Absorbed attention buffers q_absorbed_.resize(n_heads_ * kv_lora_rank_); // q in compressed K space v_compressed_.resize(n_heads_ * kv_lora_rank_); // accumulated V in compressed space dq_row_.resize(std::max(n_heads_ * nope_head_dim_, n_heads_ * val_len_mla_)); // dequant row buffer } // ═══════════════════════════════════════════════════════════════════════════ // MLA FORWARD — Single token, proper absorbed attention // ═══════════════════════════════════════════════════════════════════════════ void forward( float* out, // [dim] output const float* x, // [dim] input // Q path weights const void* w_q_a, dtype t_q_a, // [dim, q_lora_rank] Q compress const float* q_a_norm, // [q_lora_rank] Q norm const void* w_q_b, dtype t_q_b, // [q_lora_rank, n_heads * key_len_mla] Q decompress // KV path weights const void* w_kv_a, dtype t_kv_a, // [dim, kv_lora_rank + rope_dim] KV compress const float* kv_a_norm, // [kv_lora_rank] KV norm (only on kv part, not rope) const void* w_k_b, dtype t_k_b, // [kv_lora_rank, n_heads * nope_head_dim] K decompress const void* w_v_b, dtype t_v_b, // [kv_lora_rank, n_heads * val_len_mla] V decompress // Output const void* w_o, dtype t_o, // [n_heads * val_len_mla, dim] output proj MLAKVCache& cache, MLARoPE& rope, int layer ) { int pos = cache.position(); int seq_len = pos + 1; // ─── Q PATH ──────────────────────────────────────────────────────── // x → q_a_proj → compress to q_lora_rank gemm::matmul(q_a_.data(), w_q_a, t_q_a, x, q_lora_rank_, dim_); kernel::rms_norm(q_a_.data(), q_a_.data(), q_a_norm, q_lora_rank_); // q_a → q_b_proj → decompress to n_heads * key_len_mla // q_b layout per head: [q_nope(nope_head_dim=128) | q_rope(rope_dim=64)] gemm::matmul(q_b_.data(), w_q_b, t_q_b, q_a_.data(), n_heads_ * key_len_mla_, q_lora_rank_); // Apply RoPE to q_rope portion of each head for (int h = 0; h < n_heads_; ++h) { float* q_rope_h = q_b_.data() + h * key_len_mla_ + nope_head_dim_; rope.apply(q_rope_h, pos); } // ─── K ABSORPTION ────────────────────────────────────────────────── // q_absorbed_h = q_nope_h @ W_k_b_h^T → [kv_lora_rank] per head // W_k_b: [kv_lora_rank, n_heads * nope_head_dim] (rows × cols) // For row r, head h: W_k_b[r][h*nope_head_dim .. (h+1)*nope_head_dim-1] { kernel::vec_zero(q_absorbed_.data(), n_heads_ * kv_lora_rank_); int k_b_cols = n_heads_ * nope_head_dim_; size_t k_b_row_bytes = gemm::row_bytes(t_k_b, k_b_cols); const uint8_t* k_b_ptr = static_cast(w_k_b); for (int r = 0; r < kv_lora_rank_; ++r) { // Dequantize row r of W_k_b gemm::dequantize_row(dq_row_.data(), k_b_ptr + r * k_b_row_bytes, t_k_b, k_b_cols); // For each head: q_absorbed[h][r] = dot(q_nope_h, dq_row[h*nope..]) for (int h = 0; h < n_heads_; ++h) { const float* q_nope_h = q_b_.data() + h * key_len_mla_; const float* k_b_h = dq_row_.data() + h * nope_head_dim_; float dot = 0.0f; for (int d = 0; d < nope_head_dim_; ++d) { dot += q_nope_h[d] * k_b_h[d]; } q_absorbed_[h * kv_lora_rank_ + r] = dot; } } } // ─── KV PATH ────────────────────────────────────────────────────── // x → kv_a_proj → compress to [kv_lora_rank | rope_dim] gemm::matmul(kv_a_.data(), w_kv_a, t_kv_a, x, kv_lora_rank_ + rope_dim_, dim_); // Norm only the kv_lora_rank part (not the rope keys) kernel::rms_norm(kv_a_.data(), kv_a_.data(), kv_a_norm, kv_lora_rank_); // Apply RoPE to the rope key portion float* k_rope_curr = kv_a_.data() + kv_lora_rank_; rope.apply(k_rope_curr, pos); // Store compressed KV + rope key in cache float* cache_slot = cache.kv_latent(layer, pos); std::memcpy(cache_slot, kv_a_.data(), (kv_lora_rank_ + rope_dim_) * sizeof(float)); // ─── SCORING (absorbed) ──────────────────────────────────────────── // score_h[t] = q_absorbed_h · kv_compressed[t] + q_rope_h · k_rope[t] float scale = 1.0f / std::sqrt(static_cast(key_len_mla_)); #pragma omp parallel for schedule(dynamic) for (int h = 0; h < n_heads_; ++h) { const float* qa_h = q_absorbed_.data() + h * kv_lora_rank_; const float* qr_h = q_b_.data() + h * key_len_mla_ + nope_head_dim_; float* sh = scores_.data() + h * seq_len; for (int t = 0; t < seq_len; ++t) { const float* cached_kv = cache.kv_compressed(layer, t); const float* cached_rope = cache.rope_key(layer, t); float score = 0.0f; // Nope: q_absorbed · kv_compressed (both kv_lora_rank dims) for (int d = 0; d < kv_lora_rank_; ++d) { score += qa_h[d] * cached_kv[d]; } // Rope: q_rope · k_rope (rope_dim dims) for (int d = 0; d < rope_dim_; ++d) { score += qr_h[d] * cached_rope[d]; } sh[t] = score * scale; } kernel::softmax(sh, seq_len); // ─── V ACCUMULATION (compressed space) ───────────────────────── float* vc_h = v_compressed_.data() + h * kv_lora_rank_; kernel::vec_zero(vc_h, kv_lora_rank_); for (int t = 0; t < seq_len; ++t) { const float* cached_kv = cache.kv_compressed(layer, t); float w = sh[t]; for (int d = 0; d < kv_lora_rank_; ++d) { vc_h[d] += w * cached_kv[d]; } } } // ─── V DECOMPRESSION ─────────────────────────────────────────────── // v_h = W_v_b_h^T @ v_compressed_h → [val_len_mla] per head // W_v_b: [kv_lora_rank, n_heads * val_len_mla] { kernel::vec_zero(attn_out_.data(), n_heads_ * val_len_mla_); int v_b_cols = n_heads_ * val_len_mla_; size_t v_b_row_bytes = gemm::row_bytes(t_v_b, v_b_cols); const uint8_t* v_b_ptr = static_cast(w_v_b); for (int r = 0; r < kv_lora_rank_; ++r) { // Dequantize row r of W_v_b gemm::dequantize_row(dq_row_.data(), v_b_ptr + r * v_b_row_bytes, t_v_b, v_b_cols); // For each head: v_h[d] += W_v_b[r][h*val_len + d] * v_compressed_h[r] for (int h = 0; h < n_heads_; ++h) { float vc_r = v_compressed_[h * kv_lora_rank_ + r]; const float* vb_h = dq_row_.data() + h * val_len_mla_; float* oh = attn_out_.data() + h * val_len_mla_; for (int d = 0; d < val_len_mla_; ++d) { oh[d] += vb_h[d] * vc_r; } } } } // ─── OUTPUT PROJECTION ───────────────────────────────────────────── gemm::matmul(out, w_o, t_o, attn_out_.data(), dim_, n_heads_ * val_len_mla_); } private: int dim_ = 7168; int n_heads_ = 64; int q_lora_rank_ = 1536; int kv_lora_rank_ = 512; int rope_dim_ = 64; int nope_head_dim_ = 128; int key_len_mla_ = 192; int val_len_mla_ = 128; int max_ctx_ = 4096; // Scratch std::vector q_a_; std::vector q_b_; std::vector kv_a_; std::vector attn_out_; std::vector scores_; // Absorbed attention std::vector q_absorbed_; // [n_heads * kv_lora_rank] std::vector v_compressed_; // [n_heads * kv_lora_rank] std::vector dq_row_; // dequant row buffer }; // ═══════════════════════════════════════════════════════════════════════════════ // MoE ROUTER — Top-K expert selection with gating // Route token to top-K experts out of N total // ═══════════════════════════════════════════════════════════════════════════════ struct ExpertSelection { int expert_id; float weight; }; class MoERouter { public: void init(const Config& cfg) { n_experts_ = cfg.n_experts; n_used_ = cfg.n_experts_used; dim_ = cfg.dim; gating_func_ = cfg.expert_gating_func; weights_scale_ = cfg.expert_weights_scale; weights_norm_ = cfg.expert_weights_norm; gate_scores_.resize(n_experts_); sorted_indices_.resize(n_experts_); } // Route: compute gating scores and select top-K experts // gate_inp: [dim, n_experts] router weight (F32) // x: [dim] input token // Returns selected experts with normalized weights std::vector route( const float* gate_inp, // [dim, n_experts] or [n_experts, dim] const float* x ) { // Compute gate scores: gate_inp^T @ x → [n_experts] for (int e = 0; e < n_experts_; ++e) { float score = 0.0f; const float* ge = gate_inp + e * dim_; // row e of gate_inp for (int d = 0; d < dim_; ++d) { score += ge[d] * x[d]; } gate_scores_[e] = score; } // Gating function if (gating_func_ == 2) { // Type 2: sigmoid gating (DeepSeek V3 / Kimi K2.5) for (int e = 0; e < n_experts_; ++e) { gate_scores_[e] = 1.0f / (1.0f + std::exp(-gate_scores_[e])); } } else { // Type 0/1: softmax gating kernel::softmax(gate_scores_.data(), n_experts_); } // Top-K selection std::iota(sorted_indices_.begin(), sorted_indices_.end(), 0); std::partial_sort(sorted_indices_.begin(), sorted_indices_.begin() + n_used_, sorted_indices_.end(), [this](int a, int b) { return gate_scores_[a] > gate_scores_[b]; }); std::vector selected(n_used_); float weight_sum = 0.0f; for (int i = 0; i < n_used_; ++i) { selected[i].expert_id = sorted_indices_[i]; selected[i].weight = gate_scores_[sorted_indices_[i]]; weight_sum += selected[i].weight; } // Normalize weights if (weights_norm_ && weight_sum > 0.0f) { for (int i = 0; i < n_used_; ++i) { selected[i].weight /= weight_sum; } } // Apply scale if (weights_scale_ != 1.0f) { for (int i = 0; i < n_used_; ++i) { selected[i].weight *= weights_scale_; } } return selected; } // Stats int n_experts() const { return n_experts_; } int n_used() const { return n_used_; } private: int n_experts_ = 384; int n_used_ = 8; int dim_ = 7168; int gating_func_ = 2; float weights_scale_ = 1.0f; bool weights_norm_ = false; std::vector gate_scores_; std::vector sorted_indices_; }; // ═══════════════════════════════════════════════════════════════════════════════ // EXPERT FFN — Single expert forward pass // Expert weights are 3D tensors: [expert_ffn_dim, dim, n_experts] // We extract the slice for one expert // ═══════════════════════════════════════════════════════════════════════════════ class ExpertFFN { public: void init(const Config& cfg) { dim_ = cfg.dim; expert_ffn_dim_ = cfg.expert_ffn_dim; n_experts_ = cfg.n_experts; gate_buf_.resize(expert_ffn_dim_); up_buf_.resize(expert_ffn_dim_); } // Forward one expert // expert_gate/up/down are the FULL 3D expert tensors // We extract the slice for expert_id void forward( float* out, const float* x, int expert_id, const void* w_gate_exps, dtype t_gate, // [dim, expert_ffn_dim, n_experts] const void* w_up_exps, dtype t_up, const void* w_down_exps, dtype t_down ) { // Expert slice: offset into the 3D tensor // Layout in GGUF: [outer_dim, inner_dim, n_experts] // For gate_exps: shape is [dim, expert_ffn_dim, n_experts] // Wait, from our analysis: ffn_gate_exps [7168, 2048, 384] IQ2_XXS // So shape is [dim=7168, expert_ffn=2048, n_experts=384] // Expert slice = all [dim, expert_ffn] at index expert_id along axis 2 // For quantized tensors, we need to compute the byte offset for expert_id // The 3D tensor is stored as n_experts slices of [dim, expert_ffn] matrices // Actually GGUF stores them as contiguous: expert0_data, expert1_data, ... // Each expert is a [dim, expert_ffn] matrix // For gate: input x[dim] → output[expert_ffn] (matmul x @ gate^T) // gate_exps slice for expert e: offset = e * (dim * expert_ffn * bits/8) // The gemm::matmul_expert function handles the offset size_t expert_gate_bytes = expert_tensor_bytes(dim_, expert_ffn_dim_, t_gate); size_t expert_up_bytes = expert_tensor_bytes(dim_, expert_ffn_dim_, t_up); size_t expert_down_bytes = expert_tensor_bytes(expert_ffn_dim_, dim_, t_down); const uint8_t* gate_ptr = static_cast(w_gate_exps) + expert_id * expert_gate_bytes; const uint8_t* up_ptr = static_cast(w_up_exps) + expert_id * expert_up_bytes; const uint8_t* down_ptr = static_cast(w_down_exps) + expert_id * expert_down_bytes; // gate(x) → gate_buf [expert_ffn] gemm::matmul(gate_buf_.data(), gate_ptr, t_gate, x, expert_ffn_dim_, dim_); // up(x) → up_buf [expert_ffn] gemm::matmul(up_buf_.data(), up_ptr, t_up, x, expert_ffn_dim_, dim_); // SiLU(gate) * up kernel::silu(gate_buf_.data(), expert_ffn_dim_); kernel::vec_mul(gate_buf_.data(), gate_buf_.data(), up_buf_.data(), expert_ffn_dim_); // down → out [dim] gemm::matmul(out, down_ptr, t_down, gate_buf_.data(), dim_, expert_ffn_dim_); } private: int dim_ = 7168; int expert_ffn_dim_ = 2048; int n_experts_ = 384; std::vector gate_buf_; std::vector up_buf_; // Compute bytes for one expert's matrix size_t expert_tensor_bytes(int rows, int cols, dtype t) const { size_t n_elements = (size_t)rows * cols; int bs = dtype_block_size(t); if (bs > 1) { return (n_elements / bs) * dtype_size(t); } return n_elements * dtype_size(t); } }; // ═══════════════════════════════════════════════════════════════════════════════ // SHARED EXPERT FFN — Always-active expert(s) // Uses standard FFN weights (not 3D tensors) // ═══════════════════════════════════════════════════════════════════════════════ class SharedExpertFFN { public: void init(const Config& cfg) { dim_ = cfg.dim; expert_ffn_dim_ = cfg.expert_ffn_dim; gate_buf_.resize(expert_ffn_dim_); up_buf_.resize(expert_ffn_dim_); } void forward( float* out, const float* x, const void* w_gate, dtype t_gate, const void* w_up, dtype t_up, const void* w_down, dtype t_down ) { gemm::matmul(gate_buf_.data(), w_gate, t_gate, x, expert_ffn_dim_, dim_); gemm::matmul(up_buf_.data(), w_up, t_up, x, expert_ffn_dim_, dim_); kernel::silu(gate_buf_.data(), expert_ffn_dim_); kernel::vec_mul(gate_buf_.data(), gate_buf_.data(), up_buf_.data(), expert_ffn_dim_); gemm::matmul(out, w_down, t_down, gate_buf_.data(), dim_, expert_ffn_dim_); } private: int dim_ = 7168; int expert_ffn_dim_ = 2048; std::vector gate_buf_, up_buf_; }; // ═══════════════════════════════════════════════════════════════════════════════ // EXPERT CACHE — LRU tracking for hot expert pages // Track which experts are frequently activated to keep them hot in page cache // ═══════════════════════════════════════════════════════════════════════════════ class ExpertCache { public: void init(int n_layers, int n_experts) { n_layers_ = n_layers; n_experts_ = n_experts; // Frequency counter per layer per expert freq_.resize(n_layers * n_experts, 0); total_calls_ = 0; } void record(int layer, int expert_id) { freq_[layer * n_experts_ + expert_id]++; total_calls_++; } void record_batch(int layer, const std::vector& selected) { for (const auto& s : selected) { record(layer, s.expert_id); } } // Get top N hottest experts for prefetching std::vector hot_experts(int layer, int top_n = 32) const { std::vector> counts; for (int e = 0; e < n_experts_; ++e) { int f = freq_[layer * n_experts_ + e]; if (f > 0) counts.push_back({f, e}); } std::partial_sort(counts.begin(), counts.begin() + std::min(top_n, (int)counts.size()), counts.end(), [](const auto& a, const auto& b) { return a.first > b.first; }); std::vector result; for (int i = 0; i < std::min(top_n, (int)counts.size()); ++i) { result.push_back(counts[i].second); } return result; } void print_stats() const { printf("Expert Cache: %lu total activations\n", total_calls_); // Print hottest experts per layer sample for (int l = 0; l < std::min(3, n_layers_); ++l) { auto hot = hot_experts(l, 5); printf(" Layer %d top-5: ", l); for (int e : hot) { printf("%d(%d) ", e, freq_[l * n_experts_ + e]); } printf("\n"); } } // KIMI-SIGNAL-935 PROFILING void dump_csv(const char* path) const { FILE* fp = fopen(path, "w"); if (!fp) return; fprintf(fp, "layer,expert_id,count\n"); for (int l = 0; l < n_layers_; ++l) for (int e = 0; e < n_experts_; ++e) { int c = freq_[l * n_experts_ + e]; if (c > 0) fprintf(fp, "%d,%d,%d\n", l, e, c); } fclose(fp); printf("[PROFILE] -> %s (%zu calls)\n", path, total_calls_); } private: int n_layers_ = 0; int n_experts_ = 0; std::vector freq_; size_t total_calls_ = 0; }; } // namespace ix