inference-x/runtime/expert_profiler.h
Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
Better output from the same model. Fused computation, adaptive precision,
surgical expert loading. 305 KB, 19 backends, zero dependencies.

https://inference-x.com
2026-02-23 07:10:47 +00:00

208 lines
8.2 KiB
C++

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCEX — Expert Profiler (Kimi-Signal-935 Genesis)
// Copyright (C) 2025-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms. Morocco.
//
// NOTICE: This file is part of InferenceX by Salka Elmadani.
// Commercial use by entities with revenue >= $1M USD requires a license.
// Contact: Elmadani.SALKA@proton.me
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
#include <cstdint>
#include <cstdio>
#include <vector>
#include <algorithm>
#include <numeric>
#include <string>
#include <cstring>
namespace ix {
// Records which experts are activated per layer and turns the counts into a
// CSV dump, a text summary and a ranked list of experts.
//
// Usage: call init() once the layer and expert counts are known, call record()
// for every routing decision, then dump_csv() / dump_summary() or
// get_essential_experts(). A default-constructed profiler has zero layers, so
// record() does nothing until init() has been called.
//
// The class does no locking of its own.
class ExpertProfiler {
public:
    // (Re)initialises the profiler for `n_layers` layers of `n_experts` experts
    // each, clears every counter and enables recording. Negative sizes are
    // treated as zero.
    void init(int n_layers, int n_experts) {
        n_layers_ = std::max(n_layers, 0);
        n_experts_ = std::max(n_experts, 0);
        const size_t layers = static_cast<size_t>(n_layers_);
        const size_t experts = static_cast<size_t>(n_experts_);
        // assign(), not resize(): resize() leaves already-existing rows
        // untouched, so a second init() would keep stale counts and, if the
        // expert count grew, rows that are too short for the new expert ids.
        counts_.assign(layers, std::vector<uint64_t>(experts, 0));
        co_occur_.assign(experts, std::vector<uint32_t>(experts, 0));
        total_tokens_ = 0;
        enabled_ = true;
    }

    // Records one routing decision for `layer`.
    //
    // expert_ids points to n_active expert indices; ids outside
    // [0, n_experts) are skipped. Calls are ignored entirely when the profiler
    // is disabled, when `layer` is outside [0, n_layers), or when `expert_ids`
    // is null while n_active is positive.
    //
    // Only calls for layer 0 update the pairwise co-occurrence matrix and
    // advance total_tokens(), i.e. one layer-0 call is counted as one token.
    void record(int layer, const int* expert_ids, int n_active) {
        if (!enabled_ || layer < 0 || layer >= n_layers_) return;
        if (expert_ids == nullptr && n_active > 0) return;

        std::vector<uint64_t>& layer_counts = counts_[layer];
        for (int i = 0; i < n_active; ++i) {
            const int eid = expert_ids[i];
            if (valid_expert(eid)) ++layer_counts[eid];
        }

        if (layer != 0) return;

        // Co-occurrence (layer 0 only, for correlation analysis). The matrix is
        // kept symmetric by updating both [a][b] and [b][a].
        for (int i = 0; i < n_active; ++i) {
            for (int j = i + 1; j < n_active; ++j) {
                const int a = expert_ids[i];
                const int b = expert_ids[j];
                if (valid_expert(a) && valid_expert(b)) {
                    ++co_occur_[a][b];
                    ++co_occur_[b][a];
                }
            }
        }
        ++total_tokens_;
    }

    // Writes one CSV row per (layer, expert) pair that was activated at least
    // once. Columns: layer, expert_id, count, pct_of_tokens. The last column is
    // count / total_tokens() as a plain ratio (it is not multiplied by 100) and
    // is 0 when no token has been recorded.
    //
    // Logs to stdout on success and when the file cannot be opened.
    void dump_csv(const char* path) const {
        FILE* f = open_for_write(path);
        if (!f) return;

        fprintf(f, "layer,expert_id,count,pct_of_tokens\n");
        for (int l = 0; l < n_layers_; ++l) {
            for (int e = 0; e < n_experts_; ++e) {
                const uint64_t count = counts_[l][e];
                if (count == 0) continue;
                const double share = total_tokens_ > 0
                    ? static_cast<double>(count) / total_tokens_
                    : 0.0;
                // %llu + unsigned long long: `unsigned long` is only 32 bits on
                // LLP64 platforms and would truncate large counters.
                fprintf(f, "%d,%d,%llu,%.6f\n", l, e,
                        static_cast<unsigned long long>(count), share);
            }
        }
        fclose(f);
        printf("[PROFILER] Expert activations → %s (%llu tokens)\n",
               path, static_cast<unsigned long long>(total_tokens_));
    }

    // Writes a human-readable report to `path`:
    //   * one line per layer: number of active / never-activated experts, how
    //     many of the most active experts cover 90 / 95 / 99 % of the layer's
    //     activations, and the single most active expert with its share;
    //   * the 90 / 95 / 99 % expert counts averaged over the layers that
    //     recorded at least one activation;
    //   * rough model-size estimates for keeping 32 / 64 / 128 experts.
    //
    // Logs to stdout on success and when the file cannot be opened.
    void dump_summary(const char* path) const {
        FILE* f = open_for_write(path);
        if (!f) return;

        fprintf(f, "# KIMI-SIGNAL-935 Expert Profile | %llu tokens\n\n",
                static_cast<unsigned long long>(total_tokens_));

        // Running totals for the cross-layer averages. They are accumulated per
        // layer from that layer's own thresholds, so every profiled layer
        // contributes exactly once.
        uint64_t sum_90 = 0, sum_95 = 0, sum_99 = 0;
        int profiled_layers = 0;

        for (int l = 0; l < n_layers_; ++l) {
            const Ranked ranked = ranked_active(l);
            uint64_t layer_total = 0;
            for (const auto& entry : ranked) layer_total += entry.first;

            const int n_90 = coverage_count(ranked, layer_total, 0.90);
            const int n_95 = coverage_count(ranked, layer_total, 0.95);
            const int n_99 = coverage_count(ranked, layer_total, 0.99);

            if (layer_total > 0) {
                sum_90 += static_cast<uint64_t>(n_90);
                sum_95 += static_cast<uint64_t>(n_95);
                sum_99 += static_cast<uint64_t>(n_99);
                ++profiled_layers;
            }

            const int active = static_cast<int>(ranked.size());
            const int dead = n_experts_ - active;
            const int top_id = ranked.empty() ? -1 : ranked[0].second;
            const double top_pct = ranked.empty()
                ? 0.0
                : 100.0 * ranked[0].first / layer_total;
            fprintf(f, "Layer %2d: %3d active, %3d dead | "
                       "90%%=%3d experts, 95%%=%3d, 99%%=%3d | "
                       "top expert: #%d (%.1f%%)\n",
                    l, active, dead, n_90, n_95, n_99, top_id, top_pct);
        }

        // Global recommendation
        fprintf(f, "\n# PRUNING RECOMMENDATION\n");
        // Layers without any activation are left out of the average; with no
        // profiled layer at all the averages are reported as 0.
        const double denom = profiled_layers > 0 ? profiled_layers : 1;
        fprintf(f, "\nAverage experts for 90%% signal: %.0f\n", sum_90 / denom);
        fprintf(f, "Average experts for 95%% signal: %.0f\n", sum_95 / denom);
        fprintf(f, "Average experts for 99%% signal: %.0f\n", sum_99 / denom);

        if (n_experts_ > 0) {
            // NOTE(review): both constants are hard-coded; presumably they
            // describe the specific checkpoint this profiler was written
            // for -- confirm before relying on the estimates for other models.
            const double size_full = 226.0;  // GB
            const int non_prunable = 8;      // experts excluded from the prunable share
            // Share of the model assumed to scale with the number of kept
            // experts; clamped so tiny expert counts cannot make it negative.
            const double expert_ratio = std::max(
                0.0, static_cast<double>(n_experts_ - non_prunable) / n_experts_);

            fprintf(f, "\nEstimated model sizes:\n");
            const int targets[] = {32, 64, 128};
            for (int kept : targets) {
                // Keeping more experts than exist is the same as keeping all.
                const double kept_fraction =
                    std::min(1.0, static_cast<double>(kept) / n_experts_);
                const double size =
                    size_full * (1.0 - expert_ratio * (1.0 - kept_fraction));
                fprintf(f, " %d experts: ~%.0f GB\n", kept, size);
            }
        }

        fclose(f);
        printf("[PROFILER] Summary → %s\n", path);
    }

    // Returns how many of `layer`'s most active experts are needed for their
    // combined activation count to reach `fraction` (e.g. 0.90) of the layer's
    // total. Returns 0 when the layer is out of range, has no recorded
    // activation, or the fraction is never reached (fraction > 1).
    int experts_for_coverage(int layer, double fraction) const {
        if (layer < 0 || layer >= n_layers_) return 0;
        const Ranked ranked = ranked_active(layer);
        uint64_t layer_total = 0;
        for (const auto& entry : ranked) layer_total += entry.first;
        return coverage_count(ranked, layer_total, fraction);
    }

    // Get the top-N experts globally (union across all layers).
    //
    // For every layer the `top_n_per_layer` most active experts are selected
    // and their counts added to a per-expert global score. The result lists
    // the ids of all experts with a non-zero score, highest score first; equal
    // scores are ordered by ascending expert id so the output is reproducible.
    // A non-positive `top_n_per_layer` yields an empty list.
    std::vector<int> get_essential_experts(int top_n_per_layer) const {
        std::vector<uint64_t> global(static_cast<size_t>(n_experts_), 0);
        const size_t limit =
            top_n_per_layer > 0 ? static_cast<size_t>(top_n_per_layer) : 0;

        for (int l = 0; l < n_layers_; ++l) {
            // Never-activated experts are absent from the ranking; they would
            // add nothing to the score anyway.
            const Ranked ranked = ranked_active(l);
            const size_t keep = std::min(limit, ranked.size());
            for (size_t i = 0; i < keep; ++i) {
                global[ranked[i].second] += ranked[i].first;
            }
        }

        Ranked merged;
        for (int e = 0; e < n_experts_; ++e) {
            if (global[e] > 0) merged.emplace_back(global[e], e);
        }
        std::sort(merged.begin(), merged.end(), by_count_desc);

        std::vector<int> result;
        result.reserve(merged.size());
        for (const auto& entry : merged) result.push_back(entry.second);
        return result;
    }

    // Number of layer-0 record() calls accepted since the last init().
    uint64_t total_tokens() const { return total_tokens_; }
    bool enabled() const { return enabled_; }
    void enable() { enabled_ = true; }
    void disable() { enabled_ = false; }

private:
    // (activation count, expert id) pairs.
    using Ranked = std::vector<std::pair<uint64_t, int>>;

    // Orders by descending count; ties are broken by ascending expert id so
    // rankings do not depend on the sort implementation.
    static bool by_count_desc(const std::pair<uint64_t, int>& a,
                              const std::pair<uint64_t, int>& b) {
        if (a.first != b.first) return a.first > b.first;
        return a.second < b.second;
    }

    // Opens `path` for writing; logs to stdout and returns nullptr on failure.
    static FILE* open_for_write(const char* path) {
        FILE* f = path ? fopen(path, "w") : nullptr;
        if (!f) printf("[PROFILER] Cannot write %s\n", path ? path : "(null)");
        return f;
    }

    // Smallest prefix length of `ranked` whose summed counts reach `fraction`
    // of `total`; 0 if `total` is 0 or the fraction is never reached.
    static int coverage_count(const Ranked& ranked, uint64_t total,
                              double fraction) {
        if (total == 0) return 0;
        uint64_t running = 0;
        for (size_t i = 0; i < ranked.size(); ++i) {
            running += ranked[i].first;
            if (static_cast<double>(running) / total >= fraction) {
                return static_cast<int>(i) + 1;
            }
        }
        return 0;
    }

    bool valid_expert(int eid) const { return eid >= 0 && eid < n_experts_; }

    // Experts of `layer` with at least one activation, most active first.
    // `layer` must already be validated by the caller.
    Ranked ranked_active(int layer) const {
        Ranked ranked;
        const std::vector<uint64_t>& row = counts_[layer];
        for (int e = 0; e < n_experts_; ++e) {
            if (row[e] > 0) ranked.emplace_back(row[e], e);
        }
        std::sort(ranked.begin(), ranked.end(), by_count_desc);
        return ranked;
    }

    int n_layers_ = 0;
    int n_experts_ = 0;
    std::vector<std::vector<uint64_t>> counts_;   // [layer][expert_id]
    std::vector<std::vector<uint32_t>> co_occur_; // [expert][expert], layer 0 only
    uint64_t total_tokens_ = 0;
    bool enabled_ = false;
};
// Global profiler instance, shared by every translation unit that includes
// this header. Declared `inline` (C++17) rather than `static`: a namespace-scope
// `static` object in a header has internal linkage, so each including source
// file would get its own separate profiler, and activations recorded from one
// file would be missing from a dump issued in another.
inline ExpertProfiler g_expert_profiler;
} // namespace ix