// Better output from the same model. Fused computation, adaptive precision,
// surgical expert loading. 305 KB, 19 backends, zero dependencies.
// https://inference-x.com
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Mixture-of-Experts + Multi-Latent Attention
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: github.com/ElmadaniS/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once

// Inference-X MoE+MLA — Salka Elmadani — Morocco
#define IX_MOE_FINGERPRINT "935-ELMADANI-MOE"
#include "../core/z_core.h"
#include "kernels.h"
#include "gemm.h"

#include <algorithm>
#include <cmath>
#include <cstdint>   // uint8_t: byte arithmetic over quantized weight rows
#include <cstdio>
#include <cstring>   // std::memcpy (MLAAttention::forward) — was missing
#include <numeric>
#include <vector>
namespace ix {
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
// MLA KV CACHE — Compressed latent space (3x more efficient than GQA)
|
||
// Instead of storing full K,V per head, store compressed kv_lora_rank vectors
|
||
// + rope_dim separate RoPE keys
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
class MLAKVCache {
|
||
public:
|
||
void init(const Config& cfg, int max_ctx = 4096) {
|
||
n_layers_ = cfg.n_layers;
|
||
kv_lora_rank_ = cfg.kv_lora_rank; // 512
|
||
rope_dim_ = cfg.rope_dim; // 64
|
||
max_seq_len_ = max_ctx;
|
||
|
||
// Per layer per position: kv_lora_rank (compressed KV) + rope_dim (RoPE keys)
|
||
int per_pos = kv_lora_rank_ + rope_dim_;
|
||
size_t total = (size_t)n_layers_ * max_seq_len_ * per_pos;
|
||
data_.resize(total, 0.0f);
|
||
pos_ = 0;
|
||
}
|
||
|
||
void clear() {
|
||
pos_ = 0;
|
||
std::fill(data_.begin(), data_.end(), 0.0f);
|
||
}
|
||
|
||
// Store compressed KV latent for this layer and position
|
||
float* kv_latent(int layer, int pos) {
|
||
return data_.data() + layer_offset(layer) + pos * (kv_lora_rank_ + rope_dim_);
|
||
}
|
||
|
||
// Get compressed KV latent (read)
|
||
const float* kv_latent(int layer, int pos) const {
|
||
return data_.data() + layer_offset(layer) + pos * (kv_lora_rank_ + rope_dim_);
|
||
}
|
||
|
||
// Just the compressed KV part (first kv_lora_rank elements)
|
||
const float* kv_compressed(int layer, int pos) const {
|
||
return kv_latent(layer, pos);
|
||
}
|
||
|
||
// Just the RoPE key part (last rope_dim elements)
|
||
const float* rope_key(int layer, int pos) const {
|
||
return kv_latent(layer, pos) + kv_lora_rank_;
|
||
}
|
||
|
||
float* rope_key_mut(int layer, int pos) {
|
||
return const_cast<float*>(rope_key(layer, pos));
|
||
}
|
||
|
||
void advance() { ++pos_; }
|
||
int position() const { return pos_; }
|
||
int max_seq_len() const { return max_seq_len_; }
|
||
int kv_lora_rank() const { return kv_lora_rank_; }
|
||
int rope_dim() const { return rope_dim_; }
|
||
|
||
size_t memory_bytes() const {
|
||
return data_.size() * sizeof(float);
|
||
}
|
||
|
||
private:
|
||
std::vector<float> data_;
|
||
int n_layers_ = 0;
|
||
int kv_lora_rank_ = 512;
|
||
int rope_dim_ = 64;
|
||
int max_seq_len_ = 4096;
|
||
int pos_ = 0;
|
||
|
||
size_t layer_offset(int layer) const {
|
||
return (size_t)layer * max_seq_len_ * (kv_lora_rank_ + rope_dim_);
|
||
}
|
||
};
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
// MLA ROPE — RoPE only on rope_dim dimensions (64), not full head_dim
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
class MLARoPE {
|
||
public:
|
||
void init(int rope_dim, int max_seq_len, float theta, float scaling_factor = 1.0f) {
|
||
rope_dim_ = rope_dim;
|
||
max_seq_len_ = max_seq_len;
|
||
half_dim_ = rope_dim / 2;
|
||
|
||
cos_.resize(max_seq_len * half_dim_);
|
||
sin_.resize(max_seq_len * half_dim_);
|
||
|
||
for (int pos = 0; pos < max_seq_len; ++pos) {
|
||
float adjusted_pos = pos; // YaRN scaling could adjust this
|
||
for (int i = 0; i < half_dim_; ++i) {
|
||
float freq = 1.0f / std::pow(theta, 2.0f * i / rope_dim);
|
||
if (scaling_factor > 1.0f) freq /= scaling_factor;
|
||
float angle = adjusted_pos * freq;
|
||
cos_[pos * half_dim_ + i] = std::cos(angle);
|
||
sin_[pos * half_dim_ + i] = std::sin(angle);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Apply RoPE to a vector of rope_dim dimensions at given position
|
||
void apply(float* x, int pos) const {
|
||
const float* c = cos_.data() + pos * half_dim_;
|
||
const float* s = sin_.data() + pos * half_dim_;
|
||
for (int i = 0; i < half_dim_; ++i) {
|
||
float x0 = x[i];
|
||
float x1 = x[i + half_dim_];
|
||
x[i] = x0 * c[i] - x1 * s[i];
|
||
x[i + half_dim_] = x0 * s[i] + x1 * c[i];
|
||
}
|
||
}
|
||
|
||
// Apply to N heads' RoPE portion (stride = head_key_len)
|
||
void apply_heads(float* q_rope, int n_heads, int stride, int pos) const {
|
||
for (int h = 0; h < n_heads; ++h) {
|
||
apply(q_rope + h * stride, pos);
|
||
}
|
||
}
|
||
|
||
private:
|
||
int rope_dim_ = 64;
|
||
int half_dim_ = 32;
|
||
int max_seq_len_ = 4096;
|
||
std::vector<float> cos_, sin_;
|
||
};
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
// MLA ATTENTION — Multi-head Latent Attention (DeepSeek V3)
|
||
//
|
||
// Flow:
|
||
// Q path: x → q_a_proj → q_a_norm → q_b_proj → [q_nope | q_rope] per head
|
||
// KV path: x → kv_a_proj → kv_a_norm → {k_b_proj, v_b_proj}
|
||
// Cache: store compressed kv_a (kv_lora_rank) + k_rope (rope_dim)
|
||
// Attention: standard scaled dot-product with decoupled RoPE
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
class MLAAttention {
|
||
public:
|
||
void init(const Config& cfg, int max_ctx = 4096) {
|
||
dim_ = cfg.dim;
|
||
n_heads_ = cfg.n_heads;
|
||
q_lora_rank_ = cfg.q_lora_rank;
|
||
kv_lora_rank_ = cfg.kv_lora_rank;
|
||
rope_dim_ = cfg.rope_dim;
|
||
|
||
key_len_mla_ = cfg.key_length_mla; // 192 per head
|
||
val_len_mla_ = cfg.value_length_mla; // 128 per head
|
||
nope_head_dim_ = key_len_mla_ - rope_dim_; // 128
|
||
|
||
max_ctx_ = max_ctx;
|
||
|
||
// Scratch buffers
|
||
q_a_.resize(q_lora_rank_); // compressed Q
|
||
q_b_.resize(n_heads_ * key_len_mla_); // decompressed Q [nope|rope] per head
|
||
kv_a_.resize(kv_lora_rank_ + rope_dim_); // compressed KV + rope key
|
||
attn_out_.resize(n_heads_ * val_len_mla_); // attention output per head
|
||
scores_.resize(n_heads_ * max_ctx); // attention scores
|
||
|
||
// Absorbed attention buffers
|
||
q_absorbed_.resize(n_heads_ * kv_lora_rank_); // q in compressed K space
|
||
v_compressed_.resize(n_heads_ * kv_lora_rank_); // accumulated V in compressed space
|
||
dq_row_.resize(std::max(n_heads_ * nope_head_dim_,
|
||
n_heads_ * val_len_mla_)); // dequant row buffer
|
||
}
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════
|
||
// MLA FORWARD — Single token, proper absorbed attention
|
||
// ═══════════════════════════════════════════════════════════════════════════
|
||
void forward(
|
||
float* out, // [dim] output
|
||
const float* x, // [dim] input
|
||
|
||
// Q path weights
|
||
const void* w_q_a, dtype t_q_a, // [dim, q_lora_rank] Q compress
|
||
const float* q_a_norm, // [q_lora_rank] Q norm
|
||
const void* w_q_b, dtype t_q_b, // [q_lora_rank, n_heads * key_len_mla] Q decompress
|
||
|
||
// KV path weights
|
||
const void* w_kv_a, dtype t_kv_a, // [dim, kv_lora_rank + rope_dim] KV compress
|
||
const float* kv_a_norm, // [kv_lora_rank] KV norm (only on kv part, not rope)
|
||
const void* w_k_b, dtype t_k_b, // [kv_lora_rank, n_heads * nope_head_dim] K decompress
|
||
const void* w_v_b, dtype t_v_b, // [kv_lora_rank, n_heads * val_len_mla] V decompress
|
||
|
||
// Output
|
||
const void* w_o, dtype t_o, // [n_heads * val_len_mla, dim] output proj
|
||
|
||
MLAKVCache& cache,
|
||
MLARoPE& rope,
|
||
int layer
|
||
) {
|
||
int pos = cache.position();
|
||
int seq_len = pos + 1;
|
||
|
||
// ─── Q PATH ────────────────────────────────────────────────────────
|
||
// x → q_a_proj → compress to q_lora_rank
|
||
gemm::matmul(q_a_.data(), w_q_a, t_q_a, x, q_lora_rank_, dim_);
|
||
kernel::rms_norm(q_a_.data(), q_a_.data(), q_a_norm, q_lora_rank_);
|
||
|
||
// q_a → q_b_proj → decompress to n_heads * key_len_mla
|
||
// q_b layout per head: [q_nope(nope_head_dim=128) | q_rope(rope_dim=64)]
|
||
gemm::matmul(q_b_.data(), w_q_b, t_q_b, q_a_.data(),
|
||
n_heads_ * key_len_mla_, q_lora_rank_);
|
||
|
||
// Apply RoPE to q_rope portion of each head
|
||
for (int h = 0; h < n_heads_; ++h) {
|
||
float* q_rope_h = q_b_.data() + h * key_len_mla_ + nope_head_dim_;
|
||
rope.apply(q_rope_h, pos);
|
||
}
|
||
|
||
// ─── K ABSORPTION ──────────────────────────────────────────────────
|
||
// q_absorbed_h = q_nope_h @ W_k_b_h^T → [kv_lora_rank] per head
|
||
// W_k_b: [kv_lora_rank, n_heads * nope_head_dim] (rows × cols)
|
||
// For row r, head h: W_k_b[r][h*nope_head_dim .. (h+1)*nope_head_dim-1]
|
||
{
|
||
kernel::vec_zero(q_absorbed_.data(), n_heads_ * kv_lora_rank_);
|
||
int k_b_cols = n_heads_ * nope_head_dim_;
|
||
size_t k_b_row_bytes = gemm::row_bytes(t_k_b, k_b_cols);
|
||
const uint8_t* k_b_ptr = static_cast<const uint8_t*>(w_k_b);
|
||
|
||
|
||
for (int r = 0; r < kv_lora_rank_; ++r) {
|
||
// Dequantize row r of W_k_b
|
||
gemm::dequantize_row(dq_row_.data(),
|
||
k_b_ptr + r * k_b_row_bytes,
|
||
t_k_b, k_b_cols);
|
||
// For each head: q_absorbed[h][r] = dot(q_nope_h, dq_row[h*nope..])
|
||
for (int h = 0; h < n_heads_; ++h) {
|
||
const float* q_nope_h = q_b_.data() + h * key_len_mla_;
|
||
const float* k_b_h = dq_row_.data() + h * nope_head_dim_;
|
||
float dot = 0.0f;
|
||
for (int d = 0; d < nope_head_dim_; ++d) {
|
||
dot += q_nope_h[d] * k_b_h[d];
|
||
}
|
||
q_absorbed_[h * kv_lora_rank_ + r] = dot;
|
||
}
|
||
}
|
||
}
|
||
|
||
// ─── KV PATH ──────────────────────────────────────────────────────
|
||
// x → kv_a_proj → compress to [kv_lora_rank | rope_dim]
|
||
gemm::matmul(kv_a_.data(), w_kv_a, t_kv_a, x,
|
||
kv_lora_rank_ + rope_dim_, dim_);
|
||
|
||
// Norm only the kv_lora_rank part (not the rope keys)
|
||
kernel::rms_norm(kv_a_.data(), kv_a_.data(), kv_a_norm, kv_lora_rank_);
|
||
|
||
// Apply RoPE to the rope key portion
|
||
float* k_rope_curr = kv_a_.data() + kv_lora_rank_;
|
||
rope.apply(k_rope_curr, pos);
|
||
|
||
// Store compressed KV + rope key in cache
|
||
float* cache_slot = cache.kv_latent(layer, pos);
|
||
std::memcpy(cache_slot, kv_a_.data(),
|
||
(kv_lora_rank_ + rope_dim_) * sizeof(float));
|
||
|
||
// ─── SCORING (absorbed) ────────────────────────────────────────────
|
||
// score_h[t] = q_absorbed_h · kv_compressed[t] + q_rope_h · k_rope[t]
|
||
float scale = 1.0f / std::sqrt(static_cast<float>(key_len_mla_));
|
||
|
||
#pragma omp parallel for schedule(dynamic)
|
||
for (int h = 0; h < n_heads_; ++h) {
|
||
const float* qa_h = q_absorbed_.data() + h * kv_lora_rank_;
|
||
const float* qr_h = q_b_.data() + h * key_len_mla_ + nope_head_dim_;
|
||
float* sh = scores_.data() + h * seq_len;
|
||
|
||
for (int t = 0; t < seq_len; ++t) {
|
||
const float* cached_kv = cache.kv_compressed(layer, t);
|
||
const float* cached_rope = cache.rope_key(layer, t);
|
||
|
||
float score = 0.0f;
|
||
// Nope: q_absorbed · kv_compressed (both kv_lora_rank dims)
|
||
for (int d = 0; d < kv_lora_rank_; ++d) {
|
||
score += qa_h[d] * cached_kv[d];
|
||
}
|
||
// Rope: q_rope · k_rope (rope_dim dims)
|
||
for (int d = 0; d < rope_dim_; ++d) {
|
||
score += qr_h[d] * cached_rope[d];
|
||
}
|
||
sh[t] = score * scale;
|
||
}
|
||
|
||
kernel::softmax(sh, seq_len);
|
||
|
||
// ─── V ACCUMULATION (compressed space) ─────────────────────────
|
||
float* vc_h = v_compressed_.data() + h * kv_lora_rank_;
|
||
kernel::vec_zero(vc_h, kv_lora_rank_);
|
||
for (int t = 0; t < seq_len; ++t) {
|
||
const float* cached_kv = cache.kv_compressed(layer, t);
|
||
float w = sh[t];
|
||
for (int d = 0; d < kv_lora_rank_; ++d) {
|
||
vc_h[d] += w * cached_kv[d];
|
||
}
|
||
}
|
||
}
|
||
|
||
// ─── V DECOMPRESSION ───────────────────────────────────────────────
|
||
// v_h = W_v_b_h^T @ v_compressed_h → [val_len_mla] per head
|
||
// W_v_b: [kv_lora_rank, n_heads * val_len_mla]
|
||
{
|
||
kernel::vec_zero(attn_out_.data(), n_heads_ * val_len_mla_);
|
||
int v_b_cols = n_heads_ * val_len_mla_;
|
||
size_t v_b_row_bytes = gemm::row_bytes(t_v_b, v_b_cols);
|
||
const uint8_t* v_b_ptr = static_cast<const uint8_t*>(w_v_b);
|
||
|
||
|
||
for (int r = 0; r < kv_lora_rank_; ++r) {
|
||
// Dequantize row r of W_v_b
|
||
gemm::dequantize_row(dq_row_.data(),
|
||
v_b_ptr + r * v_b_row_bytes,
|
||
t_v_b, v_b_cols);
|
||
// For each head: v_h[d] += W_v_b[r][h*val_len + d] * v_compressed_h[r]
|
||
for (int h = 0; h < n_heads_; ++h) {
|
||
float vc_r = v_compressed_[h * kv_lora_rank_ + r];
|
||
const float* vb_h = dq_row_.data() + h * val_len_mla_;
|
||
float* oh = attn_out_.data() + h * val_len_mla_;
|
||
for (int d = 0; d < val_len_mla_; ++d) {
|
||
oh[d] += vb_h[d] * vc_r;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
// ─── OUTPUT PROJECTION ─────────────────────────────────────────────
|
||
gemm::matmul(out, w_o, t_o, attn_out_.data(), dim_,
|
||
n_heads_ * val_len_mla_);
|
||
}
|
||
|
||
private:
|
||
int dim_ = 7168;
|
||
int n_heads_ = 64;
|
||
int q_lora_rank_ = 1536;
|
||
int kv_lora_rank_ = 512;
|
||
int rope_dim_ = 64;
|
||
int nope_head_dim_ = 128;
|
||
int key_len_mla_ = 192;
|
||
int val_len_mla_ = 128;
|
||
int max_ctx_ = 4096;
|
||
|
||
// Scratch
|
||
std::vector<float> q_a_;
|
||
std::vector<float> q_b_;
|
||
std::vector<float> kv_a_;
|
||
std::vector<float> attn_out_;
|
||
std::vector<float> scores_;
|
||
|
||
// Absorbed attention
|
||
std::vector<float> q_absorbed_; // [n_heads * kv_lora_rank]
|
||
std::vector<float> v_compressed_; // [n_heads * kv_lora_rank]
|
||
std::vector<float> dq_row_; // dequant row buffer
|
||
};
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
// MoE ROUTER — Top-K expert selection with gating
|
||
// Route token to top-K experts out of N total
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
struct ExpertSelection {
|
||
int expert_id;
|
||
float weight;
|
||
};
|
||
|
||
class MoERouter {
|
||
public:
|
||
void init(const Config& cfg) {
|
||
n_experts_ = cfg.n_experts;
|
||
n_used_ = cfg.n_experts_used;
|
||
dim_ = cfg.dim;
|
||
gating_func_ = cfg.expert_gating_func;
|
||
weights_scale_ = cfg.expert_weights_scale;
|
||
weights_norm_ = cfg.expert_weights_norm;
|
||
|
||
gate_scores_.resize(n_experts_);
|
||
sorted_indices_.resize(n_experts_);
|
||
}
|
||
|
||
// Route: compute gating scores and select top-K experts
|
||
// gate_inp: [dim, n_experts] router weight (F32)
|
||
// x: [dim] input token
|
||
// Returns selected experts with normalized weights
|
||
std::vector<ExpertSelection> route(
|
||
const float* gate_inp, // [dim, n_experts] or [n_experts, dim]
|
||
const float* x
|
||
) {
|
||
// Compute gate scores: gate_inp^T @ x → [n_experts]
|
||
for (int e = 0; e < n_experts_; ++e) {
|
||
float score = 0.0f;
|
||
const float* ge = gate_inp + e * dim_; // row e of gate_inp
|
||
for (int d = 0; d < dim_; ++d) {
|
||
score += ge[d] * x[d];
|
||
}
|
||
gate_scores_[e] = score;
|
||
}
|
||
|
||
// Gating function
|
||
if (gating_func_ == 2) {
|
||
// Type 2: sigmoid gating (DeepSeek V3 / Kimi K2.5)
|
||
for (int e = 0; e < n_experts_; ++e) {
|
||
gate_scores_[e] = 1.0f / (1.0f + std::exp(-gate_scores_[e]));
|
||
}
|
||
} else {
|
||
// Type 0/1: softmax gating
|
||
kernel::softmax(gate_scores_.data(), n_experts_);
|
||
}
|
||
|
||
// Top-K selection
|
||
std::iota(sorted_indices_.begin(), sorted_indices_.end(), 0);
|
||
std::partial_sort(sorted_indices_.begin(),
|
||
sorted_indices_.begin() + n_used_,
|
||
sorted_indices_.end(),
|
||
[this](int a, int b) {
|
||
return gate_scores_[a] > gate_scores_[b];
|
||
});
|
||
|
||
std::vector<ExpertSelection> selected(n_used_);
|
||
float weight_sum = 0.0f;
|
||
|
||
for (int i = 0; i < n_used_; ++i) {
|
||
selected[i].expert_id = sorted_indices_[i];
|
||
selected[i].weight = gate_scores_[sorted_indices_[i]];
|
||
weight_sum += selected[i].weight;
|
||
}
|
||
|
||
// Normalize weights
|
||
if (weights_norm_ && weight_sum > 0.0f) {
|
||
for (int i = 0; i < n_used_; ++i) {
|
||
selected[i].weight /= weight_sum;
|
||
}
|
||
}
|
||
|
||
// Apply scale
|
||
if (weights_scale_ != 1.0f) {
|
||
for (int i = 0; i < n_used_; ++i) {
|
||
selected[i].weight *= weights_scale_;
|
||
}
|
||
}
|
||
|
||
return selected;
|
||
}
|
||
|
||
// Stats
|
||
int n_experts() const { return n_experts_; }
|
||
int n_used() const { return n_used_; }
|
||
|
||
private:
|
||
int n_experts_ = 384;
|
||
int n_used_ = 8;
|
||
int dim_ = 7168;
|
||
int gating_func_ = 2;
|
||
float weights_scale_ = 1.0f;
|
||
bool weights_norm_ = false;
|
||
|
||
std::vector<float> gate_scores_;
|
||
std::vector<int> sorted_indices_;
|
||
};
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
// EXPERT FFN — Single expert forward pass
|
||
// Expert weights are 3D tensors: [expert_ffn_dim, dim, n_experts]
|
||
// We extract the slice for one expert
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
class ExpertFFN {
|
||
public:
|
||
void init(const Config& cfg) {
|
||
dim_ = cfg.dim;
|
||
expert_ffn_dim_ = cfg.expert_ffn_dim;
|
||
n_experts_ = cfg.n_experts;
|
||
|
||
gate_buf_.resize(expert_ffn_dim_);
|
||
up_buf_.resize(expert_ffn_dim_);
|
||
}
|
||
|
||
// Forward one expert
|
||
// expert_gate/up/down are the FULL 3D expert tensors
|
||
// We extract the slice for expert_id
|
||
void forward(
|
||
float* out,
|
||
const float* x,
|
||
int expert_id,
|
||
const void* w_gate_exps, dtype t_gate, // [dim, expert_ffn_dim, n_experts]
|
||
const void* w_up_exps, dtype t_up,
|
||
const void* w_down_exps, dtype t_down
|
||
) {
|
||
// Expert slice: offset into the 3D tensor
|
||
// Layout in GGUF: [outer_dim, inner_dim, n_experts]
|
||
// For gate_exps: shape is [dim, expert_ffn_dim, n_experts]
|
||
// Wait, from our analysis: ffn_gate_exps [7168, 2048, 384] IQ2_XXS
|
||
// So shape is [dim=7168, expert_ffn=2048, n_experts=384]
|
||
// Expert slice = all [dim, expert_ffn] at index expert_id along axis 2
|
||
|
||
// For quantized tensors, we need to compute the byte offset for expert_id
|
||
// The 3D tensor is stored as n_experts slices of [dim, expert_ffn] matrices
|
||
|
||
// Actually GGUF stores them as contiguous: expert0_data, expert1_data, ...
|
||
// Each expert is a [dim, expert_ffn] matrix
|
||
|
||
// For gate: input x[dim] → output[expert_ffn] (matmul x @ gate^T)
|
||
// gate_exps slice for expert e: offset = e * (dim * expert_ffn * bits/8)
|
||
|
||
// The gemm::matmul_expert function handles the offset
|
||
size_t expert_gate_bytes = expert_tensor_bytes(dim_, expert_ffn_dim_, t_gate);
|
||
size_t expert_up_bytes = expert_tensor_bytes(dim_, expert_ffn_dim_, t_up);
|
||
size_t expert_down_bytes = expert_tensor_bytes(expert_ffn_dim_, dim_, t_down);
|
||
|
||
const uint8_t* gate_ptr = static_cast<const uint8_t*>(w_gate_exps)
|
||
+ expert_id * expert_gate_bytes;
|
||
const uint8_t* up_ptr = static_cast<const uint8_t*>(w_up_exps)
|
||
+ expert_id * expert_up_bytes;
|
||
const uint8_t* down_ptr = static_cast<const uint8_t*>(w_down_exps)
|
||
+ expert_id * expert_down_bytes;
|
||
|
||
// gate(x) → gate_buf [expert_ffn]
|
||
gemm::matmul(gate_buf_.data(), gate_ptr, t_gate, x,
|
||
expert_ffn_dim_, dim_);
|
||
|
||
// up(x) → up_buf [expert_ffn]
|
||
gemm::matmul(up_buf_.data(), up_ptr, t_up, x,
|
||
expert_ffn_dim_, dim_);
|
||
|
||
// SiLU(gate) * up
|
||
kernel::silu(gate_buf_.data(), expert_ffn_dim_);
|
||
kernel::vec_mul(gate_buf_.data(), gate_buf_.data(), up_buf_.data(),
|
||
expert_ffn_dim_);
|
||
|
||
// down → out [dim]
|
||
gemm::matmul(out, down_ptr, t_down, gate_buf_.data(),
|
||
dim_, expert_ffn_dim_);
|
||
}
|
||
|
||
private:
|
||
int dim_ = 7168;
|
||
int expert_ffn_dim_ = 2048;
|
||
int n_experts_ = 384;
|
||
|
||
std::vector<float> gate_buf_;
|
||
std::vector<float> up_buf_;
|
||
|
||
// Compute bytes for one expert's matrix
|
||
size_t expert_tensor_bytes(int rows, int cols, dtype t) const {
|
||
size_t n_elements = (size_t)rows * cols;
|
||
int bs = dtype_block_size(t);
|
||
if (bs > 1) {
|
||
return (n_elements / bs) * dtype_size(t);
|
||
}
|
||
return n_elements * dtype_size(t);
|
||
}
|
||
};
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
// SHARED EXPERT FFN — Always-active expert(s)
|
||
// Uses standard FFN weights (not 3D tensors)
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
class SharedExpertFFN {
|
||
public:
|
||
void init(const Config& cfg) {
|
||
dim_ = cfg.dim;
|
||
expert_ffn_dim_ = cfg.expert_ffn_dim;
|
||
gate_buf_.resize(expert_ffn_dim_);
|
||
up_buf_.resize(expert_ffn_dim_);
|
||
}
|
||
|
||
void forward(
|
||
float* out,
|
||
const float* x,
|
||
const void* w_gate, dtype t_gate,
|
||
const void* w_up, dtype t_up,
|
||
const void* w_down, dtype t_down
|
||
) {
|
||
gemm::matmul(gate_buf_.data(), w_gate, t_gate, x, expert_ffn_dim_, dim_);
|
||
gemm::matmul(up_buf_.data(), w_up, t_up, x, expert_ffn_dim_, dim_);
|
||
kernel::silu(gate_buf_.data(), expert_ffn_dim_);
|
||
kernel::vec_mul(gate_buf_.data(), gate_buf_.data(), up_buf_.data(),
|
||
expert_ffn_dim_);
|
||
gemm::matmul(out, w_down, t_down, gate_buf_.data(), dim_, expert_ffn_dim_);
|
||
}
|
||
|
||
private:
|
||
int dim_ = 7168;
|
||
int expert_ffn_dim_ = 2048;
|
||
std::vector<float> gate_buf_, up_buf_;
|
||
};
|
||
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
// EXPERT CACHE — LRU tracking for hot expert pages
|
||
// Track which experts are frequently activated to keep them hot in page cache
|
||
// ═══════════════════════════════════════════════════════════════════════════════
|
||
class ExpertCache {
|
||
public:
|
||
void init(int n_layers, int n_experts) {
|
||
n_layers_ = n_layers;
|
||
n_experts_ = n_experts;
|
||
// Frequency counter per layer per expert
|
||
freq_.resize(n_layers * n_experts, 0);
|
||
total_calls_ = 0;
|
||
}
|
||
|
||
void record(int layer, int expert_id) {
|
||
freq_[layer * n_experts_ + expert_id]++;
|
||
total_calls_++;
|
||
}
|
||
|
||
void record_batch(int layer, const std::vector<ExpertSelection>& selected) {
|
||
for (const auto& s : selected) {
|
||
record(layer, s.expert_id);
|
||
}
|
||
}
|
||
|
||
// Get top N hottest experts for prefetching
|
||
std::vector<int> hot_experts(int layer, int top_n = 32) const {
|
||
std::vector<std::pair<int, int>> counts;
|
||
for (int e = 0; e < n_experts_; ++e) {
|
||
int f = freq_[layer * n_experts_ + e];
|
||
if (f > 0) counts.push_back({f, e});
|
||
}
|
||
std::partial_sort(counts.begin(),
|
||
counts.begin() + std::min(top_n, (int)counts.size()),
|
||
counts.end(),
|
||
[](const auto& a, const auto& b) { return a.first > b.first; });
|
||
|
||
std::vector<int> result;
|
||
for (int i = 0; i < std::min(top_n, (int)counts.size()); ++i) {
|
||
result.push_back(counts[i].second);
|
||
}
|
||
return result;
|
||
}
|
||
|
||
void print_stats() const {
|
||
printf("Expert Cache: %lu total activations\n", total_calls_);
|
||
// Print hottest experts per layer sample
|
||
for (int l = 0; l < std::min(3, n_layers_); ++l) {
|
||
auto hot = hot_experts(l, 5);
|
||
printf(" Layer %d top-5: ", l);
|
||
for (int e : hot) {
|
||
printf("%d(%d) ", e, freq_[l * n_experts_ + e]);
|
||
}
|
||
printf("\n");
|
||
}
|
||
}
|
||
|
||
// KIMI-SIGNAL-935 PROFILING
|
||
void dump_csv(const char* path) const {
|
||
FILE* fp = fopen(path, "w");
|
||
if (!fp) return;
|
||
fprintf(fp, "layer,expert_id,count\n");
|
||
for (int l = 0; l < n_layers_; ++l)
|
||
for (int e = 0; e < n_experts_; ++e) {
|
||
int c = freq_[l * n_experts_ + e];
|
||
if (c > 0) fprintf(fp, "%d,%d,%d\n", l, e, c);
|
||
}
|
||
fclose(fp);
|
||
printf("[PROFILE] -> %s (%zu calls)\n", path, total_calls_);
|
||
}
|
||
|
||
private:
|
||
int n_layers_ = 0;
|
||
int n_experts_ = 0;
|
||
std::vector<int> freq_;
|
||
size_t total_calls_ = 0;
|
||
};
|
||
|
||
} // namespace ix
|