inference-x/runtime/moe_mla.h

694 lines
30 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Mixture-of-Experts + Multi-Latent Attention
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X MoE+MLA — Salka Elmadani — Morocco
#define IX_MOE_FINGERPRINT "935-ELMADANI-MOE"
#include "../core/z_core.h"
#include "kernels.h"
#include "gemm.h"
#include <algorithm>
#include <cmath>
#include <cstdint>   // uint8_t (expert/weight byte offsets)
#include <cstdio>
#include <cstring>   // std::memcpy (KV cache slot store)
#include <numeric>
#include <vector>
namespace ix {
// ═══════════════════════════════════════════════════════════════════════════════
// MLA KV CACHE — Compressed latent space (3x more efficient than GQA)
// Instead of storing full K,V per head, store compressed kv_lora_rank vectors
// + rope_dim separate RoPE keys
// ═══════════════════════════════════════════════════════════════════════════════
class MLAKVCache {
public:
    /// Allocate cache storage: n_layers × max_ctx slots, where each slot
    /// holds kv_lora_rank compressed-KV floats followed by rope_dim
    /// decoupled RoPE-key floats.
    void init(const Config& cfg, int max_ctx = 4096) {
        n_layers_ = cfg.n_layers;
        kv_lora_rank_ = cfg.kv_lora_rank; // typically 512
        rope_dim_ = cfg.rope_dim;         // typically 64
        max_seq_len_ = max_ctx;
        const int slot_floats = kv_lora_rank_ + rope_dim_;
        data_.resize((size_t)n_layers_ * max_seq_len_ * slot_floats, 0.0f);
        pos_ = 0;
    }

    /// Reset to an empty cache: position 0 and all values zeroed.
    void clear() {
        pos_ = 0;
        std::fill(data_.begin(), data_.end(), 0.0f);
    }

    /// Mutable pointer to the full latent slot (compressed KV + RoPE key).
    float* kv_latent(int layer, int pos) {
        return data_.data() + slot_index(layer, pos);
    }
    /// Read-only pointer to the full latent slot.
    const float* kv_latent(int layer, int pos) const {
        return data_.data() + slot_index(layer, pos);
    }
    /// First kv_lora_rank elements of the slot: the compressed KV latent.
    const float* kv_compressed(int layer, int pos) const {
        return kv_latent(layer, pos);
    }
    /// Last rope_dim elements of the slot: the decoupled RoPE key.
    const float* rope_key(int layer, int pos) const {
        return kv_latent(layer, pos) + kv_lora_rank_;
    }
    /// Mutable access to the RoPE-key portion of a slot.
    float* rope_key_mut(int layer, int pos) {
        return const_cast<float*>(rope_key(layer, pos));
    }

    // NOTE(review): no bounds check — the caller must keep
    // position() < max_seq_len() or slot writes run past the buffer.
    void advance() { ++pos_; }
    int position() const { return pos_; }
    int max_seq_len() const { return max_seq_len_; }
    int kv_lora_rank() const { return kv_lora_rank_; }
    int rope_dim() const { return rope_dim_; }
    size_t memory_bytes() const { return data_.size() * sizeof(float); }

private:
    // Flat index of the first float of (layer, pos)'s slot.
    size_t slot_index(int layer, int pos) const {
        const size_t slot_floats = (size_t)(kv_lora_rank_ + rope_dim_);
        return (size_t)layer * max_seq_len_ * slot_floats
             + (size_t)pos * slot_floats;
    }
    std::vector<float> data_;
    int n_layers_ = 0;
    int kv_lora_rank_ = 512;
    int rope_dim_ = 64;
    int max_seq_len_ = 4096;
    int pos_ = 0;
};
// ═══════════════════════════════════════════════════════════════════════════════
// MLA ROPE — RoPE only on rope_dim dimensions (64), not full head_dim
// ═══════════════════════════════════════════════════════════════════════════════
class MLARoPE {
public:
    /// Precompute cos/sin tables for positions [0, max_seq_len) over the
    /// rope_dim rotary dimensions. A scaling_factor > 1 divides every
    /// frequency, which is equivalent to linear position interpolation.
    void init(int rope_dim, int max_seq_len, float theta, float scaling_factor = 1.0f) {
        rope_dim_ = rope_dim;
        max_seq_len_ = max_seq_len;
        half_dim_ = rope_dim / 2;
        cos_.resize((size_t)max_seq_len * half_dim_);
        sin_.resize((size_t)max_seq_len * half_dim_);
        for (int p = 0; p < max_seq_len; ++p) {
            for (int k = 0; k < half_dim_; ++k) {
                float inv_freq = 1.0f / std::pow(theta, 2.0f * k / rope_dim);
                if (scaling_factor > 1.0f) inv_freq /= scaling_factor;
                const float angle = (float)p * inv_freq;
                const size_t idx = (size_t)p * half_dim_ + k;
                cos_[idx] = std::cos(angle);
                sin_[idx] = std::sin(angle);
            }
        }
    }

    /// Rotate the rope_dim-long vector x in place for the given position.
    /// Uses the half-split pairing: (x[k], x[k + half]) form one 2D rotation.
    void apply(float* x, int pos) const {
        const float* cp = cos_.data() + (size_t)pos * half_dim_;
        const float* sp = sin_.data() + (size_t)pos * half_dim_;
        for (int k = 0; k < half_dim_; ++k) {
            const float a = x[k];
            const float b = x[k + half_dim_];
            x[k] = a * cp[k] - b * sp[k];
            x[k + half_dim_] = a * sp[k] + b * cp[k];
        }
    }

    /// Apply RoPE to the rope portion of n_heads heads laid out with
    /// `stride` floats between consecutive heads.
    void apply_heads(float* q_rope, int n_heads, int stride, int pos) const {
        for (int h = 0; h < n_heads; ++h)
            apply(q_rope + (size_t)h * stride, pos);
    }

private:
    int rope_dim_ = 64;
    int half_dim_ = 32;
    int max_seq_len_ = 4096;
    std::vector<float> cos_, sin_;
};
// ═══════════════════════════════════════════════════════════════════════════════
// MLA ATTENTION — Multi-head Latent Attention (DeepSeek V3)
//
// Flow:
// Q path: x → q_a_proj → q_a_norm → q_b_proj → [q_nope | q_rope] per head
// KV path: x → kv_a_proj → kv_a_norm → {k_b_proj, v_b_proj}
// Cache: store compressed kv_a (kv_lora_rank) + k_rope (rope_dim)
// Attention: standard scaled dot-product with decoupled RoPE
// ═══════════════════════════════════════════════════════════════════════════════
// Multi-head Latent Attention with weight absorption: rather than
// decompressing K for every cached position, the per-head q_nope is
// projected through W_k_b into the compressed latent space once per token,
// so scoring and V accumulation run over kv_lora_rank dims per head and
// V is decompressed through W_v_b a single time at the end.
class MLAAttention {
public:
// Size all scratch buffers from the model config. max_ctx bounds the
// number of cached positions scores_ can address for one forward call.
void init(const Config& cfg, int max_ctx = 4096) {
dim_ = cfg.dim;
n_heads_ = cfg.n_heads;
q_lora_rank_ = cfg.q_lora_rank;
kv_lora_rank_ = cfg.kv_lora_rank;
rope_dim_ = cfg.rope_dim;
key_len_mla_ = cfg.key_length_mla; // 192 per head
val_len_mla_ = cfg.value_length_mla; // 128 per head
nope_head_dim_ = key_len_mla_ - rope_dim_; // 128
max_ctx_ = max_ctx;
// Scratch buffers
q_a_.resize(q_lora_rank_); // compressed Q
q_b_.resize(n_heads_ * key_len_mla_); // decompressed Q [nope|rope] per head
kv_a_.resize(kv_lora_rank_ + rope_dim_); // compressed KV + rope key
attn_out_.resize(n_heads_ * val_len_mla_); // attention output per head
scores_.resize(n_heads_ * max_ctx); // attention scores
// Absorbed attention buffers
q_absorbed_.resize(n_heads_ * kv_lora_rank_); // q in compressed K space
v_compressed_.resize(n_heads_ * kv_lora_rank_); // accumulated V in compressed space
// dq_row_ is reused for both W_k_b rows (n_heads*nope_head_dim cols)
// and W_v_b rows (n_heads*val_len_mla cols), hence the max of the two.
dq_row_.resize(std::max(n_heads_ * nope_head_dim_,
n_heads_ * val_len_mla_)); // dequant row buffer
}
// ═══════════════════════════════════════════════════════════════════════════
// MLA FORWARD — Single token, proper absorbed attention
// ═══════════════════════════════════════════════════════════════════════════
// Computes out = W_o · concat_h(attn_h) for the token at cache.position(),
// storing this token's compressed KV + RoPE key into the cache first so the
// scoring loop covers positions [0, pos]. Does NOT advance the cache; the
// caller is responsible for cache.advance() after all layers run.
void forward(
float* out, // [dim] output
const float* x, // [dim] input
// Q path weights
const void* w_q_a, dtype t_q_a, // [dim, q_lora_rank] Q compress
const float* q_a_norm, // [q_lora_rank] Q norm
const void* w_q_b, dtype t_q_b, // [q_lora_rank, n_heads * key_len_mla] Q decompress
// KV path weights
const void* w_kv_a, dtype t_kv_a, // [dim, kv_lora_rank + rope_dim] KV compress
const float* kv_a_norm, // [kv_lora_rank] KV norm (only on kv part, not rope)
const void* w_k_b, dtype t_k_b, // [kv_lora_rank, n_heads * nope_head_dim] K decompress
const void* w_v_b, dtype t_v_b, // [kv_lora_rank, n_heads * val_len_mla] V decompress
// Output
const void* w_o, dtype t_o, // [n_heads * val_len_mla, dim] output proj
MLAKVCache& cache,
MLARoPE& rope,
int layer
) {
int pos = cache.position();
int seq_len = pos + 1;
// NOTE(review): assumes pos < max_ctx_ (scores_ is indexed with a
// seq_len stride below) and pos < cache.max_seq_len(); there is no
// bounds check here — confirm callers clamp the context length.
// ─── Q PATH ────────────────────────────────────────────────────────
// x → q_a_proj → compress to q_lora_rank
gemm::matmul(q_a_.data(), w_q_a, t_q_a, x, q_lora_rank_, dim_);
gemm::rms_norm style note: q_a is normalized in place before decompression.
kernel::rms_norm(q_a_.data(), q_a_.data(), q_a_norm, q_lora_rank_);
// q_a → q_b_proj → decompress to n_heads * key_len_mla
// q_b layout per head: [q_nope(nope_head_dim=128) | q_rope(rope_dim=64)]
gemm::matmul(q_b_.data(), w_q_b, t_q_b, q_a_.data(),
n_heads_ * key_len_mla_, q_lora_rank_);
// Apply RoPE to q_rope portion of each head
for (int h = 0; h < n_heads_; ++h) {
float* q_rope_h = q_b_.data() + h * key_len_mla_ + nope_head_dim_;
rope.apply(q_rope_h, pos);
}
// ─── K ABSORPTION ──────────────────────────────────────────────────
// q_absorbed_h = q_nope_h @ W_k_b_h^T → [kv_lora_rank] per head
// W_k_b: [kv_lora_rank, n_heads * nope_head_dim] (rows × cols)
// For row r, head h: W_k_b[r][h*nope_head_dim .. (h+1)*nope_head_dim-1]
{
kernel::vec_zero(q_absorbed_.data(), n_heads_ * kv_lora_rank_);
int k_b_cols = n_heads_ * nope_head_dim_;
size_t k_b_row_bytes = gemm::row_bytes(t_k_b, k_b_cols);
const uint8_t* k_b_ptr = static_cast<const uint8_t*>(w_k_b);
// Row-major sweep: each quantized row is dequantized once into
// dq_row_ and consumed by every head before moving on.
for (int r = 0; r < kv_lora_rank_; ++r) {
// Dequantize row r of W_k_b
gemm::dequantize_row(dq_row_.data(),
k_b_ptr + r * k_b_row_bytes,
t_k_b, k_b_cols);
// For each head: q_absorbed[h][r] = dot(q_nope_h, dq_row[h*nope..])
for (int h = 0; h < n_heads_; ++h) {
const float* q_nope_h = q_b_.data() + h * key_len_mla_;
const float* k_b_h = dq_row_.data() + h * nope_head_dim_;
float dot = 0.0f;
for (int d = 0; d < nope_head_dim_; ++d) {
dot += q_nope_h[d] * k_b_h[d];
}
q_absorbed_[h * kv_lora_rank_ + r] = dot;
}
}
}
// ─── KV PATH ──────────────────────────────────────────────────────
// x → kv_a_proj → compress to [kv_lora_rank | rope_dim]
gemm::matmul(kv_a_.data(), w_kv_a, t_kv_a, x,
kv_lora_rank_ + rope_dim_, dim_);
// Norm only the kv_lora_rank part (not the rope keys)
kernel::rms_norm(kv_a_.data(), kv_a_.data(), kv_a_norm, kv_lora_rank_);
// Apply RoPE to the rope key portion
float* k_rope_curr = kv_a_.data() + kv_lora_rank_;
rope.apply(k_rope_curr, pos);
// Store compressed KV + rope key in cache
float* cache_slot = cache.kv_latent(layer, pos);
std::memcpy(cache_slot, kv_a_.data(),
(kv_lora_rank_ + rope_dim_) * sizeof(float));
// ─── SCORING (absorbed) ────────────────────────────────────────────
// score_h[t] = q_absorbed_h · kv_compressed[t] + q_rope_h · k_rope[t]
// Scale uses the full (nope + rope) key length, matching the
// decoupled-RoPE formulation.
float scale = 1.0f / std::sqrt(static_cast<float>(key_len_mla_));
// Heads are independent: each writes only its own slice of scores_
// and v_compressed_, so the per-head loop parallelizes safely.
#pragma omp parallel for schedule(dynamic)
for (int h = 0; h < n_heads_; ++h) {
const float* qa_h = q_absorbed_.data() + h * kv_lora_rank_;
const float* qr_h = q_b_.data() + h * key_len_mla_ + nope_head_dim_;
float* sh = scores_.data() + h * seq_len;
for (int t = 0; t < seq_len; ++t) {
const float* cached_kv = cache.kv_compressed(layer, t);
const float* cached_rope = cache.rope_key(layer, t);
float score = 0.0f;
// Nope: q_absorbed · kv_compressed (both kv_lora_rank dims)
for (int d = 0; d < kv_lora_rank_; ++d) {
score += qa_h[d] * cached_kv[d];
}
// Rope: q_rope · k_rope (rope_dim dims)
for (int d = 0; d < rope_dim_; ++d) {
score += qr_h[d] * cached_rope[d];
}
sh[t] = score * scale;
}
kernel::softmax(sh, seq_len);
// ─── V ACCUMULATION (compressed space) ─────────────────────────
// Weighted sum of cached latents; decompression happens after the
// parallel region, once per token instead of once per position.
float* vc_h = v_compressed_.data() + h * kv_lora_rank_;
kernel::vec_zero(vc_h, kv_lora_rank_);
for (int t = 0; t < seq_len; ++t) {
const float* cached_kv = cache.kv_compressed(layer, t);
float w = sh[t];
for (int d = 0; d < kv_lora_rank_; ++d) {
vc_h[d] += w * cached_kv[d];
}
}
}
// ─── V DECOMPRESSION ───────────────────────────────────────────────
// v_h = W_v_b_h^T @ v_compressed_h → [val_len_mla] per head
// W_v_b: [kv_lora_rank, n_heads * val_len_mla]
{
kernel::vec_zero(attn_out_.data(), n_heads_ * val_len_mla_);
int v_b_cols = n_heads_ * val_len_mla_;
size_t v_b_row_bytes = gemm::row_bytes(t_v_b, v_b_cols);
const uint8_t* v_b_ptr = static_cast<const uint8_t*>(w_v_b);
for (int r = 0; r < kv_lora_rank_; ++r) {
// Dequantize row r of W_v_b
gemm::dequantize_row(dq_row_.data(),
v_b_ptr + r * v_b_row_bytes,
t_v_b, v_b_cols);
// For each head: v_h[d] += W_v_b[r][h*val_len + d] * v_compressed_h[r]
for (int h = 0; h < n_heads_; ++h) {
float vc_r = v_compressed_[h * kv_lora_rank_ + r];
const float* vb_h = dq_row_.data() + h * val_len_mla_;
float* oh = attn_out_.data() + h * val_len_mla_;
for (int d = 0; d < val_len_mla_; ++d) {
oh[d] += vb_h[d] * vc_r;
}
}
}
}
// ─── OUTPUT PROJECTION ─────────────────────────────────────────────
gemm::matmul(out, w_o, t_o, attn_out_.data(), dim_,
n_heads_ * val_len_mla_);
}
private:
int dim_ = 7168;
int n_heads_ = 64;
int q_lora_rank_ = 1536;
int kv_lora_rank_ = 512;
int rope_dim_ = 64;
int nope_head_dim_ = 128;
int key_len_mla_ = 192;
int val_len_mla_ = 128;
int max_ctx_ = 4096;
// Scratch
std::vector<float> q_a_;
std::vector<float> q_b_;
std::vector<float> kv_a_;
std::vector<float> attn_out_;
std::vector<float> scores_;
// Absorbed attention
std::vector<float> q_absorbed_; // [n_heads * kv_lora_rank]
std::vector<float> v_compressed_; // [n_heads * kv_lora_rank]
std::vector<float> dq_row_; // dequant row buffer
};
// ═══════════════════════════════════════════════════════════════════════════════
// MoE ROUTER — Top-K expert selection with gating
// Route token to top-K experts out of N total
// ═══════════════════════════════════════════════════════════════════════════════
// Result of MoE routing: one selected expert and its gating weight
// (after any normalization/scaling performed by MoERouter::route).
struct ExpertSelection {
int expert_id; // index into [0, n_experts)
float weight; // gate weight applied to this expert's FFN output
};
class MoERouter {
public:
void init(const Config& cfg) {
n_experts_ = cfg.n_experts;
n_used_ = cfg.n_experts_used;
dim_ = cfg.dim;
gating_func_ = cfg.expert_gating_func;
weights_scale_ = cfg.expert_weights_scale;
weights_norm_ = cfg.expert_weights_norm;
gate_scores_.resize(n_experts_);
sorted_indices_.resize(n_experts_);
}
// Route: compute gating scores and select top-K experts
// gate_inp: [dim, n_experts] router weight (F32)
// x: [dim] input token
// Returns selected experts with normalized weights
std::vector<ExpertSelection> route(
const float* gate_inp, // [dim, n_experts] or [n_experts, dim]
const float* x
) {
// Compute gate scores: gate_inp^T @ x → [n_experts]
for (int e = 0; e < n_experts_; ++e) {
float score = 0.0f;
const float* ge = gate_inp + e * dim_; // row e of gate_inp
for (int d = 0; d < dim_; ++d) {
score += ge[d] * x[d];
}
gate_scores_[e] = score;
}
// Gating function
if (gating_func_ == 2) {
// Type 2: sigmoid gating (DeepSeek V3 / Kimi K2.5)
for (int e = 0; e < n_experts_; ++e) {
gate_scores_[e] = 1.0f / (1.0f + std::exp(-gate_scores_[e]));
}
} else {
// Type 0/1: softmax gating
kernel::softmax(gate_scores_.data(), n_experts_);
}
// Top-K selection
std::iota(sorted_indices_.begin(), sorted_indices_.end(), 0);
std::partial_sort(sorted_indices_.begin(),
sorted_indices_.begin() + n_used_,
sorted_indices_.end(),
[this](int a, int b) {
return gate_scores_[a] > gate_scores_[b];
});
std::vector<ExpertSelection> selected(n_used_);
float weight_sum = 0.0f;
for (int i = 0; i < n_used_; ++i) {
selected[i].expert_id = sorted_indices_[i];
selected[i].weight = gate_scores_[sorted_indices_[i]];
weight_sum += selected[i].weight;
}
// Normalize weights
if (weights_norm_ && weight_sum > 0.0f) {
for (int i = 0; i < n_used_; ++i) {
selected[i].weight /= weight_sum;
}
}
// Apply scale
if (weights_scale_ != 1.0f) {
for (int i = 0; i < n_used_; ++i) {
selected[i].weight *= weights_scale_;
}
}
return selected;
}
// Stats
int n_experts() const { return n_experts_; }
int n_used() const { return n_used_; }
private:
int n_experts_ = 384;
int n_used_ = 8;
int dim_ = 7168;
int gating_func_ = 2;
float weights_scale_ = 1.0f;
bool weights_norm_ = false;
std::vector<float> gate_scores_;
std::vector<int> sorted_indices_;
};
// ═══════════════════════════════════════════════════════════════════════════════
// EXPERT FFN — Single expert forward pass
// Expert weights are 3D tensors: [expert_ffn_dim, dim, n_experts]
// We extract the slice for one expert
// ═══════════════════════════════════════════════════════════════════════════════
class ExpertFFN {
public:
    /// Size the gate/up scratch buffers from the model config.
    void init(const Config& cfg) {
        dim_ = cfg.dim;
        expert_ffn_dim_ = cfg.expert_ffn_dim;
        n_experts_ = cfg.n_experts;
        gate_buf_.resize(expert_ffn_dim_);
        up_buf_.resize(expert_ffn_dim_);
    }

    /// SwiGLU forward for a single expert:
    ///   out = down( silu(gate(x)) * up(x) )
    /// The 3D expert tensors (e.g. gate: [dim, expert_ffn_dim, n_experts])
    /// are stored as n_experts contiguous 2D slices, so the slice for
    /// expert_id starts at expert_id * bytes-per-slice.
    void forward(
        float* out,
        const float* x,
        int expert_id,
        const void* w_gate_exps, dtype t_gate, // [dim, expert_ffn_dim, n_experts]
        const void* w_up_exps, dtype t_up,
        const void* w_down_exps, dtype t_down
    ) {
        // Byte offset of this expert's 2D slice inside each packed tensor.
        const uint8_t* gate_ptr = static_cast<const uint8_t*>(w_gate_exps)
            + expert_id * expert_tensor_bytes(dim_, expert_ffn_dim_, t_gate);
        const uint8_t* up_ptr = static_cast<const uint8_t*>(w_up_exps)
            + expert_id * expert_tensor_bytes(dim_, expert_ffn_dim_, t_up);
        const uint8_t* down_ptr = static_cast<const uint8_t*>(w_down_exps)
            + expert_id * expert_tensor_bytes(expert_ffn_dim_, dim_, t_down);

        // gate(x) and up(x): both map [dim] → [expert_ffn_dim].
        gemm::matmul(gate_buf_.data(), gate_ptr, t_gate, x,
                     expert_ffn_dim_, dim_);
        gemm::matmul(up_buf_.data(), up_ptr, t_up, x,
                     expert_ffn_dim_, dim_);
        // SwiGLU activation: silu(gate) elementwise-times up, in place.
        kernel::silu(gate_buf_.data(), expert_ffn_dim_);
        kernel::vec_mul(gate_buf_.data(), gate_buf_.data(), up_buf_.data(),
                        expert_ffn_dim_);
        // Down-projection back to the model dimension.
        gemm::matmul(out, down_ptr, t_down, gate_buf_.data(),
                     dim_, expert_ffn_dim_);
    }

private:
    int dim_ = 7168;
    int expert_ffn_dim_ = 2048;
    int n_experts_ = 384;
    std::vector<float> gate_buf_;
    std::vector<float> up_buf_;

    /// Bytes occupied by one expert's [rows, cols] matrix; block-quantized
    /// dtypes store dtype_size(t) bytes per block of dtype_block_size(t)
    /// elements. Assumes rows*cols is a multiple of the block size.
    size_t expert_tensor_bytes(int rows, int cols, dtype t) const {
        const size_t n = (size_t)rows * cols;
        const int bs = dtype_block_size(t);
        return (bs > 1) ? (n / bs) * dtype_size(t)
                        : n * dtype_size(t);
    }
};
// ═══════════════════════════════════════════════════════════════════════════════
// SHARED EXPERT FFN — Always-active expert(s)
// Uses standard FFN weights (not 3D tensors)
// ═══════════════════════════════════════════════════════════════════════════════
class SharedExpertFFN {
public:
    /// Size the gate/up scratch buffers from the model config.
    void init(const Config& cfg) {
        dim_ = cfg.dim;
        expert_ffn_dim_ = cfg.expert_ffn_dim;
        gate_buf_.resize(expert_ffn_dim_);
        up_buf_.resize(expert_ffn_dim_);
    }

    /// SwiGLU forward for the always-active shared expert, using ordinary
    /// 2D weight matrices: out = down( silu(gate(x)) * up(x) ).
    void forward(
        float* out,
        const float* x,
        const void* w_gate, dtype t_gate,
        const void* w_up, dtype t_up,
        const void* w_down, dtype t_down
    ) {
        // Two parallel projections [dim] → [expert_ffn_dim].
        gemm::matmul(gate_buf_.data(), w_gate, t_gate, x, expert_ffn_dim_, dim_);
        gemm::matmul(up_buf_.data(), w_up, t_up, x, expert_ffn_dim_, dim_);
        // silu(gate) ⊙ up, computed in place in gate_buf_.
        kernel::silu(gate_buf_.data(), expert_ffn_dim_);
        kernel::vec_mul(gate_buf_.data(), gate_buf_.data(), up_buf_.data(),
                        expert_ffn_dim_);
        // Down-projection back to the model dimension.
        gemm::matmul(out, w_down, t_down, gate_buf_.data(), dim_, expert_ffn_dim_);
    }

private:
    int dim_ = 7168;
    int expert_ffn_dim_ = 2048;
    std::vector<float> gate_buf_, up_buf_;
};
// ═══════════════════════════════════════════════════════════════════════════════
// EXPERT CACHE — LRU tracking for hot expert pages
// Track which experts are frequently activated to keep them hot in page cache
// ═══════════════════════════════════════════════════════════════════════════════
class ExpertCache {
public:
    /// Reset activation counters for n_layers × n_experts.
    /// Uses assign() (not resize) so re-initialization zeroes any previous
    /// counts — the old resize(n, 0) kept stale values while total_calls_
    /// was reset, leaving the stats inconsistent.
    void init(int n_layers, int n_experts) {
        n_layers_ = n_layers;
        n_experts_ = n_experts;
        // Frequency counter per layer per expert
        freq_.assign((size_t)n_layers * n_experts, 0);
        total_calls_ = 0;
    }

    /// Count one activation of expert_id in layer. No bounds checking;
    /// indices must be valid for the init() dimensions.
    void record(int layer, int expert_id) {
        freq_[(size_t)layer * n_experts_ + expert_id]++;
        total_calls_++;
    }

    /// Record every expert chosen by the router for one token.
    void record_batch(int layer, const std::vector<ExpertSelection>& selected) {
        for (const auto& s : selected) {
            record(layer, s.expert_id);
        }
    }

    /// Top-N most frequently activated experts in layer, hottest first.
    /// Never-activated experts are excluded, so fewer than top_n entries
    /// may be returned. top_n is clamped to [0, activated-count] — the old
    /// code passed begin()+top_n to partial_sort unclamped, which is UB
    /// for negative top_n.
    std::vector<int> hot_experts(int layer, int top_n = 32) const {
        std::vector<std::pair<int, int>> counts; // {count, expert_id}
        counts.reserve(n_experts_);
        for (int e = 0; e < n_experts_; ++e) {
            int f = freq_[(size_t)layer * n_experts_ + e];
            if (f > 0) counts.push_back({f, e});
        }
        const int k = std::min(std::max(top_n, 0), (int)counts.size());
        std::partial_sort(counts.begin(), counts.begin() + k, counts.end(),
                          [](const auto& a, const auto& b) { return a.first > b.first; });
        std::vector<int> result;
        result.reserve(k);
        for (int i = 0; i < k; ++i) {
            result.push_back(counts[i].second);
        }
        return result;
    }

    /// Print total activations and the top-5 experts for the first layers.
    void print_stats() const {
        // %zu: total_calls_ is size_t — %lu was wrong on LLP64 targets
        // (and inconsistent with dump_csv below).
        printf("Expert Cache: %zu total activations\n", total_calls_);
        // Print hottest experts per layer sample
        for (int l = 0; l < std::min(3, n_layers_); ++l) {
            auto hot = hot_experts(l, 5);
            printf(" Layer %d top-5: ", l);
            for (int e : hot) {
                printf("%d(%d) ", e, freq_[(size_t)l * n_experts_ + e]);
            }
            printf("\n");
        }
    }

    // KIMI-SIGNAL-935 PROFILING
    /// Dump every non-zero (layer, expert_id, count) triple as CSV.
    /// Silently returns if the file cannot be opened.
    void dump_csv(const char* path) const {
        FILE* fp = fopen(path, "w");
        if (!fp) return;
        fprintf(fp, "layer,expert_id,count\n");
        for (int l = 0; l < n_layers_; ++l)
            for (int e = 0; e < n_experts_; ++e) {
                int c = freq_[(size_t)l * n_experts_ + e];
                if (c > 0) fprintf(fp, "%d,%d,%d\n", l, e, c);
            }
        fclose(fp);
        printf("[PROFILE] -> %s (%zu calls)\n", path, total_calls_);
    }

private:
    int n_layers_ = 0;
    int n_experts_ = 0;
    std::vector<int> freq_;      // [n_layers * n_experts] activation counts
    size_t total_calls_ = 0;     // sum of all counts
};
} // namespace ix