inference-x/runtime/moe_mla.h

694 lines
30 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Mixture-of-Experts + Multi-Latent Attention
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X MoE+MLA — Salka Elmadani — Morocco
#define IX_MOE_FINGERPRINT "935-ELMADANI-MOE"
#include "../core/z_core.h"
#include "kernels.h"
#include "gemm.h"
#include <algorithm>
#include <cmath>
#include <cstdint>   // uint8_t (expert/weight byte offsets)
#include <cstdio>
#include <cstring>   // std::memcpy (KV cache slot store)
#include <numeric>
#include <vector>
namespace ix {
// ═══════════════════════════════════════════════════════════════════════════════
// MLA KV CACHE — Compressed latent space (3x more efficient than GQA)
// Instead of storing full K,V per head, store compressed kv_lora_rank vectors
// + rope_dim separate RoPE keys
// ═══════════════════════════════════════════════════════════════════════════════
class MLAKVCache {
public:
    /// Allocate cache storage: n_layers × max_ctx slots, where each slot
    /// holds kv_lora_rank compressed-KV floats followed by rope_dim
    /// decoupled RoPE-key floats.
    void init(const Config& cfg, int max_ctx = 4096) {
        n_layers_ = cfg.n_layers;
        kv_lora_rank_ = cfg.kv_lora_rank; // typically 512
        rope_dim_ = cfg.rope_dim;         // typically 64
        max_seq_len_ = max_ctx;
        const int slot_floats = kv_lora_rank_ + rope_dim_;
        data_.resize((size_t)n_layers_ * max_seq_len_ * slot_floats, 0.0f);
        pos_ = 0;
    }

    /// Reset to an empty cache: position 0 and all values zeroed.
    void clear() {
        pos_ = 0;
        std::fill(data_.begin(), data_.end(), 0.0f);
    }

    /// Mutable pointer to the full latent slot (compressed KV + RoPE key).
    float* kv_latent(int layer, int pos) {
        return data_.data() + slot_index(layer, pos);
    }
    /// Read-only pointer to the full latent slot.
    const float* kv_latent(int layer, int pos) const {
        return data_.data() + slot_index(layer, pos);
    }
    /// First kv_lora_rank elements of the slot: the compressed KV latent.
    const float* kv_compressed(int layer, int pos) const {
        return kv_latent(layer, pos);
    }
    /// Last rope_dim elements of the slot: the decoupled RoPE key.
    const float* rope_key(int layer, int pos) const {
        return kv_latent(layer, pos) + kv_lora_rank_;
    }
    /// Mutable access to the RoPE-key portion of a slot.
    float* rope_key_mut(int layer, int pos) {
        return const_cast<float*>(rope_key(layer, pos));
    }

    // NOTE(review): no bounds check — the caller must keep
    // position() < max_seq_len() or slot writes run past the buffer.
    void advance() { ++pos_; }
    int position() const { return pos_; }
    int max_seq_len() const { return max_seq_len_; }
    int kv_lora_rank() const { return kv_lora_rank_; }
    int rope_dim() const { return rope_dim_; }
    size_t memory_bytes() const { return data_.size() * sizeof(float); }

private:
    // Flat index of the first float of (layer, pos)'s slot.
    size_t slot_index(int layer, int pos) const {
        const size_t slot_floats = (size_t)(kv_lora_rank_ + rope_dim_);
        return (size_t)layer * max_seq_len_ * slot_floats
             + (size_t)pos * slot_floats;
    }
    std::vector<float> data_;
    int n_layers_ = 0;
    int kv_lora_rank_ = 512;
    int rope_dim_ = 64;
    int max_seq_len_ = 4096;
    int pos_ = 0;
};
// ═══════════════════════════════════════════════════════════════════════════════
// MLA ROPE — RoPE only on rope_dim dimensions (64), not full head_dim
// ═══════════════════════════════════════════════════════════════════════════════
class MLARoPE {
public:
    /// Precompute cos/sin tables for positions [0, max_seq_len) over the
    /// rope_dim rotary dimensions. A scaling_factor > 1 divides every
    /// frequency, which is equivalent to linear position interpolation.
    void init(int rope_dim, int max_seq_len, float theta, float scaling_factor = 1.0f) {
        rope_dim_ = rope_dim;
        max_seq_len_ = max_seq_len;
        half_dim_ = rope_dim / 2;
        cos_.resize((size_t)max_seq_len * half_dim_);
        sin_.resize((size_t)max_seq_len * half_dim_);
        for (int p = 0; p < max_seq_len; ++p) {
            for (int k = 0; k < half_dim_; ++k) {
                float inv_freq = 1.0f / std::pow(theta, 2.0f * k / rope_dim);
                if (scaling_factor > 1.0f) inv_freq /= scaling_factor;
                const float angle = (float)p * inv_freq;
                const size_t idx = (size_t)p * half_dim_ + k;
                cos_[idx] = std::cos(angle);
                sin_[idx] = std::sin(angle);
            }
        }
    }

    /// Rotate the rope_dim-long vector x in place for the given position.
    /// Uses the half-split pairing: (x[k], x[k + half]) form one 2D rotation.
    void apply(float* x, int pos) const {
        const float* cp = cos_.data() + (size_t)pos * half_dim_;
        const float* sp = sin_.data() + (size_t)pos * half_dim_;
        for (int k = 0; k < half_dim_; ++k) {
            const float a = x[k];
            const float b = x[k + half_dim_];
            x[k] = a * cp[k] - b * sp[k];
            x[k + half_dim_] = a * sp[k] + b * cp[k];
        }
    }

    /// Apply RoPE to the rope portion of n_heads heads laid out with
    /// `stride` floats between consecutive heads.
    void apply_heads(float* q_rope, int n_heads, int stride, int pos) const {
        for (int h = 0; h < n_heads; ++h)
            apply(q_rope + (size_t)h * stride, pos);
    }

private:
    int rope_dim_ = 64;
    int half_dim_ = 32;
    int max_seq_len_ = 4096;
    std::vector<float> cos_, sin_;
};
// ═══════════════════════════════════════════════════════════════════════════════
// MLA ATTENTION — Multi-head Latent Attention (DeepSeek V3)
//
// Flow:
// Q path: x → q_a_proj → q_a_norm → q_b_proj → [q_nope | q_rope] per head
// KV path: x → kv_a_proj → kv_a_norm → {k_b_proj, v_b_proj}
// Cache: store compressed kv_a (kv_lora_rank) + k_rope (rope_dim)
// Attention: standard scaled dot-product with decoupled RoPE
// ═══════════════════════════════════════════════════════════════════════════════
// Multi-head Latent Attention with weight absorption: rather than
// decompressing K for every cached position, the per-head q_nope is
// projected through W_k_b into the compressed latent space once per token,
// so scoring and V accumulation run over kv_lora_rank dims per head and
// V is decompressed through W_v_b a single time at the end.
class MLAAttention {
public:
// Size all scratch buffers from the model config. max_ctx bounds the
// number of cached positions scores_ can address for one forward call.
void init(const Config& cfg, int max_ctx = 4096) {
dim_ = cfg.dim;
n_heads_ = cfg.n_heads;
q_lora_rank_ = cfg.q_lora_rank;
kv_lora_rank_ = cfg.kv_lora_rank;
rope_dim_ = cfg.rope_dim;
key_len_mla_ = cfg.key_length_mla; // 192 per head
val_len_mla_ = cfg.value_length_mla; // 128 per head
nope_head_dim_ = key_len_mla_ - rope_dim_; // 128
max_ctx_ = max_ctx;
// Scratch buffers
q_a_.resize(q_lora_rank_); // compressed Q
q_b_.resize(n_heads_ * key_len_mla_); // decompressed Q [nope|rope] per head
kv_a_.resize(kv_lora_rank_ + rope_dim_); // compressed KV + rope key
attn_out_.resize(n_heads_ * val_len_mla_); // attention output per head
scores_.resize(n_heads_ * max_ctx); // attention scores
// Absorbed attention buffers
q_absorbed_.resize(n_heads_ * kv_lora_rank_); // q in compressed K space
v_compressed_.resize(n_heads_ * kv_lora_rank_); // accumulated V in compressed space
// dq_row_ is reused for both W_k_b rows (n_heads*nope_head_dim cols)
// and W_v_b rows (n_heads*val_len_mla cols), hence the max of the two.
dq_row_.resize(std::max(n_heads_ * nope_head_dim_,
n_heads_ * val_len_mla_)); // dequant row buffer
}
// ═══════════════════════════════════════════════════════════════════════════
// MLA FORWARD — Single token, proper absorbed attention
// ═══════════════════════════════════════════════════════════════════════════
// Computes out = W_o · concat_h(attn_h) for the token at cache.position(),
// storing this token's compressed KV + RoPE key into the cache first so the
// scoring loop covers positions [0, pos]. Does NOT advance the cache; the
// caller is responsible for cache.advance() after all layers run.
void forward(
float* out, // [dim] output
const float* x, // [dim] input
// Q path weights
const void* w_q_a, dtype t_q_a, // [dim, q_lora_rank] Q compress
const float* q_a_norm, // [q_lora_rank] Q norm
const void* w_q_b, dtype t_q_b, // [q_lora_rank, n_heads * key_len_mla] Q decompress
// KV path weights
const void* w_kv_a, dtype t_kv_a, // [dim, kv_lora_rank + rope_dim] KV compress
const float* kv_a_norm, // [kv_lora_rank] KV norm (only on kv part, not rope)
const void* w_k_b, dtype t_k_b, // [kv_lora_rank, n_heads * nope_head_dim] K decompress
const void* w_v_b, dtype t_v_b, // [kv_lora_rank, n_heads * val_len_mla] V decompress
// Output
const void* w_o, dtype t_o, // [n_heads * val_len_mla, dim] output proj
MLAKVCache& cache,
MLARoPE& rope,
int layer
) {
int pos = cache.position();
int seq_len = pos + 1;
// NOTE(review): assumes pos < max_ctx_ (scores_ is indexed with a
// seq_len stride below) and pos < cache.max_seq_len(); there is no
// bounds check here — confirm callers clamp the context length.
// ─── Q PATH ────────────────────────────────────────────────────────
// x → q_a_proj → compress to q_lora_rank
gemm::matmul(q_a_.data(), w_q_a, t_q_a, x, q_lora_rank_, dim_);
gemm::rms_norm style note: q_a is normalized in place before decompression.
kernel::rms_norm(q_a_.data(), q_a_.data(), q_a_norm, q_lora_rank_);
// q_a → q_b_proj → decompress to n_heads * key_len_mla
// q_b layout per head: [q_nope(nope_head_dim=128) | q_rope(rope_dim=64)]
gemm::matmul(q_b_.data(), w_q_b, t_q_b, q_a_.data(),
n_heads_ * key_len_mla_, q_lora_rank_);
// Apply RoPE to q_rope portion of each head
for (int h = 0; h < n_heads_; ++h) {
float* q_rope_h = q_b_.data() + h * key_len_mla_ + nope_head_dim_;
rope.apply(q_rope_h, pos);
}
// ─── K ABSORPTION ──────────────────────────────────────────────────
// q_absorbed_h = q_nope_h @ W_k_b_h^T → [kv_lora_rank] per head
// W_k_b: [kv_lora_rank, n_heads * nope_head_dim] (rows × cols)
// For row r, head h: W_k_b[r][h*nope_head_dim .. (h+1)*nope_head_dim-1]
{
kernel::vec_zero(q_absorbed_.data(), n_heads_ * kv_lora_rank_);
int k_b_cols = n_heads_ * nope_head_dim_;
size_t k_b_row_bytes = gemm::row_bytes(t_k_b, k_b_cols);
const uint8_t* k_b_ptr = static_cast<const uint8_t*>(w_k_b);
// Row-major sweep: each quantized row is dequantized once into
// dq_row_ and consumed by every head before moving on.
for (int r = 0; r < kv_lora_rank_; ++r) {
// Dequantize row r of W_k_b
gemm::dequantize_row(dq_row_.data(),
k_b_ptr + r * k_b_row_bytes,
t_k_b, k_b_cols);
// For each head: q_absorbed[h][r] = dot(q_nope_h, dq_row[h*nope..])
for (int h = 0; h < n_heads_; ++h) {
const float* q_nope_h = q_b_.data() + h * key_len_mla_;
const float* k_b_h = dq_row_.data() + h * nope_head_dim_;
float dot = 0.0f;
for (int d = 0; d < nope_head_dim_; ++d) {
dot += q_nope_h[d] * k_b_h[d];
}
q_absorbed_[h * kv_lora_rank_ + r] = dot;
}
}
}
// ─── KV PATH ──────────────────────────────────────────────────────
// x → kv_a_proj → compress to [kv_lora_rank | rope_dim]
gemm::matmul(kv_a_.data(), w_kv_a, t_kv_a, x,
kv_lora_rank_ + rope_dim_, dim_);
// Norm only the kv_lora_rank part (not the rope keys)
kernel::rms_norm(kv_a_.data(), kv_a_.data(), kv_a_norm, kv_lora_rank_);
// Apply RoPE to the rope key portion
float* k_rope_curr = kv_a_.data() + kv_lora_rank_;
rope.apply(k_rope_curr, pos);
// Store compressed KV + rope key in cache
float* cache_slot = cache.kv_latent(layer, pos);
std::memcpy(cache_slot, kv_a_.data(),
(kv_lora_rank_ + rope_dim_) * sizeof(float));
// ─── SCORING (absorbed) ────────────────────────────────────────────
// score_h[t] = q_absorbed_h · kv_compressed[t] + q_rope_h · k_rope[t]
// Scale uses the full (nope + rope) key length, matching the
// decoupled-RoPE formulation.
float scale = 1.0f / std::sqrt(static_cast<float>(key_len_mla_));
// Heads are independent: each writes only its own slice of scores_
// and v_compressed_, so the per-head loop parallelizes safely.
#pragma omp parallel for schedule(dynamic)
for (int h = 0; h < n_heads_; ++h) {
const float* qa_h = q_absorbed_.data() + h * kv_lora_rank_;
const float* qr_h = q_b_.data() + h * key_len_mla_ + nope_head_dim_;
float* sh = scores_.data() + h * seq_len;
for (int t = 0; t < seq_len; ++t) {
const float* cached_kv = cache.kv_compressed(layer, t);
const float* cached_rope = cache.rope_key(layer, t);
float score = 0.0f;
// Nope: q_absorbed · kv_compressed (both kv_lora_rank dims)
for (int d = 0; d < kv_lora_rank_; ++d) {
score += qa_h[d] * cached_kv[d];
}
// Rope: q_rope · k_rope (rope_dim dims)
for (int d = 0; d < rope_dim_; ++d) {
score += qr_h[d] * cached_rope[d];
}
sh[t] = score * scale;
}
kernel::softmax(sh, seq_len);
// ─── V ACCUMULATION (compressed space) ─────────────────────────
// Weighted sum of cached latents; decompression happens after the
// parallel region, once per token instead of once per position.
float* vc_h = v_compressed_.data() + h * kv_lora_rank_;
kernel::vec_zero(vc_h, kv_lora_rank_);
for (int t = 0; t < seq_len; ++t) {
const float* cached_kv = cache.kv_compressed(layer, t);
float w = sh[t];
for (int d = 0; d < kv_lora_rank_; ++d) {
vc_h[d] += w * cached_kv[d];
}
}
}
// ─── V DECOMPRESSION ───────────────────────────────────────────────
// v_h = W_v_b_h^T @ v_compressed_h → [val_len_mla] per head
// W_v_b: [kv_lora_rank, n_heads * val_len_mla]
{
kernel::vec_zero(attn_out_.data(), n_heads_ * val_len_mla_);
int v_b_cols = n_heads_ * val_len_mla_;
size_t v_b_row_bytes = gemm::row_bytes(t_v_b, v_b_cols);
const uint8_t* v_b_ptr = static_cast<const uint8_t*>(w_v_b);
for (int r = 0; r < kv_lora_rank_; ++r) {
// Dequantize row r of W_v_b
gemm::dequantize_row(dq_row_.data(),
v_b_ptr + r * v_b_row_bytes,
t_v_b, v_b_cols);
// For each head: v_h[d] += W_v_b[r][h*val_len + d] * v_compressed_h[r]
for (int h = 0; h < n_heads_; ++h) {
float vc_r = v_compressed_[h * kv_lora_rank_ + r];
const float* vb_h = dq_row_.data() + h * val_len_mla_;
float* oh = attn_out_.data() + h * val_len_mla_;
for (int d = 0; d < val_len_mla_; ++d) {
oh[d] += vb_h[d] * vc_r;
}
}
}
}
// ─── OUTPUT PROJECTION ─────────────────────────────────────────────
gemm::matmul(out, w_o, t_o, attn_out_.data(), dim_,
n_heads_ * val_len_mla_);
}
private:
int dim_ = 7168;
int n_heads_ = 64;
int q_lora_rank_ = 1536;
int kv_lora_rank_ = 512;
int rope_dim_ = 64;
int nope_head_dim_ = 128;
int key_len_mla_ = 192;
int val_len_mla_ = 128;
int max_ctx_ = 4096;
// Scratch
std::vector<float> q_a_;
std::vector<float> q_b_;
std::vector<float> kv_a_;
std::vector<float> attn_out_;
std::vector<float> scores_;
// Absorbed attention
std::vector<float> q_absorbed_; // [n_heads * kv_lora_rank]
std::vector<float> v_compressed_; // [n_heads * kv_lora_rank]
std::vector<float> dq_row_; // dequant row buffer
};
// ═══════════════════════════════════════════════════════════════════════════════
// MoE ROUTER — Top-K expert selection with gating
// Route token to top-K experts out of N total
// ═══════════════════════════════════════════════════════════════════════════════
// Result of MoE routing: one selected expert and its gating weight
// (after any normalization/scaling performed by MoERouter::route).
struct ExpertSelection {
int expert_id; // index into [0, n_experts)
float weight; // gate weight applied to this expert's FFN output
};
class MoERouter {
public:
void init(const Config& cfg) {
n_experts_ = cfg.n_experts;
n_used_ = cfg.n_experts_used;
dim_ = cfg.dim;
gating_func_ = cfg.expert_gating_func;
weights_scale_ = cfg.expert_weights_scale;
weights_norm_ = cfg.expert_weights_norm;
gate_scores_.resize(n_experts_);
sorted_indices_.resize(n_experts_);
}
// Route: compute gating scores and select top-K experts
// gate_inp: [dim, n_experts] router weight (F32)
// x: [dim] input token
// Returns selected experts with normalized weights
std::vector<ExpertSelection> route(
const float* gate_inp, // [dim, n_experts] or [n_experts, dim]
const float* x
) {
// Compute gate scores: gate_inp^T @ x → [n_experts]
for (int e = 0; e < n_experts_; ++e) {
float score = 0.0f;
const float* ge = gate_inp + e * dim_; // row e of gate_inp
for (int d = 0; d < dim_; ++d) {
score += ge[d] * x[d];
}
gate_scores_[e] = score;
}
// Gating function
if (gating_func_ == 2) {
// Type 2: sigmoid gating (DeepSeek V3 / Kimi K2.5)
for (int e = 0; e < n_experts_; ++e) {
gate_scores_[e] = 1.0f / (1.0f + std::exp(-gate_scores_[e]));
}
} else {
// Type 0/1: softmax gating
kernel::softmax(gate_scores_.data(), n_experts_);
}
// Top-K selection
std::iota(sorted_indices_.begin(), sorted_indices_.end(), 0);
std::partial_sort(sorted_indices_.begin(),
sorted_indices_.begin() + n_used_,
sorted_indices_.end(),
[this](int a, int b) {
return gate_scores_[a] > gate_scores_[b];
});
std::vector<ExpertSelection> selected(n_used_);
float weight_sum = 0.0f;
for (int i = 0; i < n_used_; ++i) {
selected[i].expert_id = sorted_indices_[i];
selected[i].weight = gate_scores_[sorted_indices_[i]];
weight_sum += selected[i].weight;
}
// Normalize weights
if (weights_norm_ && weight_sum > 0.0f) {
for (int i = 0; i < n_used_; ++i) {
selected[i].weight /= weight_sum;
}
}
// Apply scale
if (weights_scale_ != 1.0f) {
for (int i = 0; i < n_used_; ++i) {
selected[i].weight *= weights_scale_;
}
}
return selected;
}
// Stats
int n_experts() const { return n_experts_; }
int n_used() const { return n_used_; }
private:
int n_experts_ = 384;
int n_used_ = 8;
int dim_ = 7168;
int gating_func_ = 2;
float weights_scale_ = 1.0f;
bool weights_norm_ = false;
std::vector<float> gate_scores_;
std::vector<int> sorted_indices_;
};
// ═══════════════════════════════════════════════════════════════════════════════
// EXPERT FFN — Single expert forward pass
// Expert weights are 3D tensors: [expert_ffn_dim, dim, n_experts]
// We extract the slice for one expert
// ═══════════════════════════════════════════════════════════════════════════════
class ExpertFFN {
public:
    /// Size the gate/up scratch buffers from the model config.
    void init(const Config& cfg) {
        dim_ = cfg.dim;
        expert_ffn_dim_ = cfg.expert_ffn_dim;
        n_experts_ = cfg.n_experts;
        gate_buf_.resize(expert_ffn_dim_);
        up_buf_.resize(expert_ffn_dim_);
    }

    /// SwiGLU forward for a single expert:
    ///   out = down( silu(gate(x)) * up(x) )
    /// The 3D expert tensors (e.g. gate: [dim, expert_ffn_dim, n_experts])
    /// are stored as n_experts contiguous 2D slices, so the slice for
    /// expert_id starts at expert_id * bytes-per-slice.
    void forward(
        float* out,
        const float* x,
        int expert_id,
        const void* w_gate_exps, dtype t_gate, // [dim, expert_ffn_dim, n_experts]
        const void* w_up_exps, dtype t_up,
        const void* w_down_exps, dtype t_down
    ) {
        // Byte offset of this expert's 2D slice inside each packed tensor.
        const uint8_t* gate_ptr = static_cast<const uint8_t*>(w_gate_exps)
            + expert_id * expert_tensor_bytes(dim_, expert_ffn_dim_, t_gate);
        const uint8_t* up_ptr = static_cast<const uint8_t*>(w_up_exps)
            + expert_id * expert_tensor_bytes(dim_, expert_ffn_dim_, t_up);
        const uint8_t* down_ptr = static_cast<const uint8_t*>(w_down_exps)
            + expert_id * expert_tensor_bytes(expert_ffn_dim_, dim_, t_down);

        // gate(x) and up(x): both map [dim] → [expert_ffn_dim].
        gemm::matmul(gate_buf_.data(), gate_ptr, t_gate, x,
                     expert_ffn_dim_, dim_);
        gemm::matmul(up_buf_.data(), up_ptr, t_up, x,
                     expert_ffn_dim_, dim_);
        // SwiGLU activation: silu(gate) elementwise-times up, in place.
        kernel::silu(gate_buf_.data(), expert_ffn_dim_);
        kernel::vec_mul(gate_buf_.data(), gate_buf_.data(), up_buf_.data(),
                        expert_ffn_dim_);
        // Down-projection back to the model dimension.
        gemm::matmul(out, down_ptr, t_down, gate_buf_.data(),
                     dim_, expert_ffn_dim_);
    }

private:
    int dim_ = 7168;
    int expert_ffn_dim_ = 2048;
    int n_experts_ = 384;
    std::vector<float> gate_buf_;
    std::vector<float> up_buf_;

    /// Bytes occupied by one expert's [rows, cols] matrix; block-quantized
    /// dtypes store dtype_size(t) bytes per block of dtype_block_size(t)
    /// elements. Assumes rows*cols is a multiple of the block size.
    size_t expert_tensor_bytes(int rows, int cols, dtype t) const {
        const size_t n = (size_t)rows * cols;
        const int bs = dtype_block_size(t);
        return (bs > 1) ? (n / bs) * dtype_size(t)
                        : n * dtype_size(t);
    }
};
// ═══════════════════════════════════════════════════════════════════════════════
// SHARED EXPERT FFN — Always-active expert(s)
// Uses standard FFN weights (not 3D tensors)
// ═══════════════════════════════════════════════════════════════════════════════
class SharedExpertFFN {
public:
    /// Size the gate/up scratch buffers from the model config.
    void init(const Config& cfg) {
        dim_ = cfg.dim;
        expert_ffn_dim_ = cfg.expert_ffn_dim;
        gate_buf_.resize(expert_ffn_dim_);
        up_buf_.resize(expert_ffn_dim_);
    }

    /// SwiGLU forward for the always-active shared expert, using ordinary
    /// 2D weight matrices: out = down( silu(gate(x)) * up(x) ).
    void forward(
        float* out,
        const float* x,
        const void* w_gate, dtype t_gate,
        const void* w_up, dtype t_up,
        const void* w_down, dtype t_down
    ) {
        // Two parallel projections [dim] → [expert_ffn_dim].
        gemm::matmul(gate_buf_.data(), w_gate, t_gate, x, expert_ffn_dim_, dim_);
        gemm::matmul(up_buf_.data(), w_up, t_up, x, expert_ffn_dim_, dim_);
        // silu(gate) ⊙ up, computed in place in gate_buf_.
        kernel::silu(gate_buf_.data(), expert_ffn_dim_);
        kernel::vec_mul(gate_buf_.data(), gate_buf_.data(), up_buf_.data(),
                        expert_ffn_dim_);
        // Down-projection back to the model dimension.
        gemm::matmul(out, w_down, t_down, gate_buf_.data(), dim_, expert_ffn_dim_);
    }

private:
    int dim_ = 7168;
    int expert_ffn_dim_ = 2048;
    std::vector<float> gate_buf_, up_buf_;
};
// ═══════════════════════════════════════════════════════════════════════════════
// EXPERT CACHE — LRU tracking for hot expert pages
// Track which experts are frequently activated to keep them hot in page cache
// ═══════════════════════════════════════════════════════════════════════════════
class ExpertCache {
public:
    /// Reset activation counters for n_layers × n_experts.
    /// Uses assign() (not resize) so re-initialization zeroes any previous
    /// counts — the old resize(n, 0) kept stale values while total_calls_
    /// was reset, leaving the stats inconsistent.
    void init(int n_layers, int n_experts) {
        n_layers_ = n_layers;
        n_experts_ = n_experts;
        // Frequency counter per layer per expert
        freq_.assign((size_t)n_layers * n_experts, 0);
        total_calls_ = 0;
    }

    /// Count one activation of expert_id in layer. No bounds checking;
    /// indices must be valid for the init() dimensions.
    void record(int layer, int expert_id) {
        freq_[(size_t)layer * n_experts_ + expert_id]++;
        total_calls_++;
    }

    /// Record every expert chosen by the router for one token.
    void record_batch(int layer, const std::vector<ExpertSelection>& selected) {
        for (const auto& s : selected) {
            record(layer, s.expert_id);
        }
    }

    /// Top-N most frequently activated experts in layer, hottest first.
    /// Never-activated experts are excluded, so fewer than top_n entries
    /// may be returned. top_n is clamped to [0, activated-count] — the old
    /// code passed begin()+top_n to partial_sort unclamped, which is UB
    /// for negative top_n.
    std::vector<int> hot_experts(int layer, int top_n = 32) const {
        std::vector<std::pair<int, int>> counts; // {count, expert_id}
        counts.reserve(n_experts_);
        for (int e = 0; e < n_experts_; ++e) {
            int f = freq_[(size_t)layer * n_experts_ + e];
            if (f > 0) counts.push_back({f, e});
        }
        const int k = std::min(std::max(top_n, 0), (int)counts.size());
        std::partial_sort(counts.begin(), counts.begin() + k, counts.end(),
                          [](const auto& a, const auto& b) { return a.first > b.first; });
        std::vector<int> result;
        result.reserve(k);
        for (int i = 0; i < k; ++i) {
            result.push_back(counts[i].second);
        }
        return result;
    }

    /// Print total activations and the top-5 experts for the first layers.
    void print_stats() const {
        // %zu: total_calls_ is size_t — %lu was wrong on LLP64 targets
        // (and inconsistent with dump_csv below).
        printf("Expert Cache: %zu total activations\n", total_calls_);
        // Print hottest experts per layer sample
        for (int l = 0; l < std::min(3, n_layers_); ++l) {
            auto hot = hot_experts(l, 5);
            printf(" Layer %d top-5: ", l);
            for (int e : hot) {
                printf("%d(%d) ", e, freq_[(size_t)l * n_experts_ + e]);
            }
            printf("\n");
        }
    }

    // KIMI-SIGNAL-935 PROFILING
    /// Dump every non-zero (layer, expert_id, count) triple as CSV.
    /// Silently returns if the file cannot be opened.
    void dump_csv(const char* path) const {
        FILE* fp = fopen(path, "w");
        if (!fp) return;
        fprintf(fp, "layer,expert_id,count\n");
        for (int l = 0; l < n_layers_; ++l)
            for (int e = 0; e < n_experts_; ++e) {
                int c = freq_[(size_t)l * n_experts_ + e];
                if (c > 0) fprintf(fp, "%d,%d,%d\n", l, e, c);
            }
        fclose(fp);
        printf("[PROFILE] -> %s (%zu calls)\n", path, total_calls_);
    }

private:
    int n_layers_ = 0;
    int n_experts_ = 0;
    std::vector<int> freq_;      // [n_layers * n_experts] activation counts
    size_t total_calls_ = 0;     // sum of all counts
};
} // namespace ix