// inference-x/runtime/expert_profiler.h

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCEX — Expert Profiler (Kimi-Signal-935 Genesis)
// Copyright (C) 2025-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms. Morocco.
//
// NOTICE: This file is part of InferenceX by Salka Elmadani.
// Commercial use by entities with revenue >= $1M USD requires a license.
// Contact: Elmadani.SALKA@proton.me
// ═══════════════════════════════════════════════════════════════════════════════

#pragma once

#include <cstdint>
#include <cstdio>
#include <vector>
#include <algorithm>
#include <numeric>
#include <string>
#include <cstring>

namespace ix {

// Collects per-layer expert activation statistics for a mixture-of-experts
// model and writes them out as a CSV file or a human-readable summary.
//
// Usage: call init() once with the model dimensions, call record() for every
// (token, layer) routing decision, then call dump_csv() / dump_summary() or
// get_essential_experts(). The class performs no locking of its own.
class ExpertProfiler {
public:
    // (Re)initialises the profiler for `n_layers` layers of `n_experts`
    // experts each, clears every counter and enables recording.
    // Negative dimensions are treated as 0.
    void init(int n_layers, int n_experts) {
        n_layers_ = std::max(n_layers, 0);
        n_experts_ = std::max(n_experts, 0);
        const size_t layers = static_cast<size_t>(n_layers_);
        const size_t experts = static_cast<size_t>(n_experts_);
        // assign() instead of resize(): resize() keeps already-existing rows
        // (with their old width and old counts), so a second init() would
        // neither reset the statistics nor adapt to a new expert count.
        counts_.assign(layers, std::vector<uint64_t>(experts, uint64_t{0}));
        co_occur_.assign(experts, std::vector<uint32_t>(experts, uint32_t{0}));
        total_tokens_ = 0;
        enabled_ = true;
    }

    // Records the experts activated for one token in `layer`.
    //   layer      - layer index; calls with an index outside [0, n_layers)
    //                are ignored.
    //   expert_ids - array of `n_active` expert indices; ids outside
    //                [0, n_experts) are skipped. A null pointer is treated
    //                as an empty list.
    //   n_active   - number of entries in `expert_ids`.
    // Every call for layer 0 advances the token counter by one (even when
    // the id list is empty) and updates the co-occurrence matrix.
    void record(int layer, const int* expert_ids, int n_active) {
        if (!enabled_ || layer < 0 || layer >= n_layers_) return;

        const int n_ids = (expert_ids != nullptr && n_active > 0) ? n_active : 0;

        std::vector<uint64_t>& layer_counts = counts_[static_cast<size_t>(layer)];
        for (int i = 0; i < n_ids; ++i) {
            const int eid = expert_ids[i];
            if (valid_expert(eid)) {
                layer_counts[static_cast<size_t>(eid)]++;
            }
        }

        // Co-occurrence (layer 0 only, for correlation analysis)
        if (layer != 0) return;
        for (int i = 0; i < n_ids; ++i) {
            for (int j = i + 1; j < n_ids; ++j) {
                const int a = expert_ids[i];
                const int b = expert_ids[j];
                if (valid_expert(a) && valid_expert(b)) {
                    // Kept symmetric so either index order can be looked up.
                    co_occur_[static_cast<size_t>(a)][static_cast<size_t>(b)]++;
                    co_occur_[static_cast<size_t>(b)][static_cast<size_t>(a)]++;
                }
            }
        }
        total_tokens_++;
    }

    // Dump CSV: layer, expert_id, count, pct
    // Writes one row per (layer, expert) pair with a non-zero count. The
    // last column is count / total_tokens as a fraction (not multiplied by
    // 100), or 0 when no token has been recorded. Prints a message and
    // returns if the file cannot be opened.
    void dump_csv(const char* path) const {
        FILE* f = (path != nullptr) ? fopen(path, "w") : nullptr;
        if (!f) {
            printf("[PROFILER] Cannot write %s\n", path != nullptr ? path : "(null)");
            return;
        }

        fprintf(f, "layer,expert_id,count,pct_of_tokens\n");
        for (int l = 0; l < n_layers_; ++l) {
            const std::vector<uint64_t>& layer_counts = counts_[static_cast<size_t>(l)];
            for (int e = 0; e < n_experts_; ++e) {
                const uint64_t hits = layer_counts[static_cast<size_t>(e)];
                if (hits == 0) continue;
                const double share = total_tokens_ > 0
                    ? (double)hits / total_tokens_
                    : 0.0;
                // %llu: unsigned long is only 32 bits wide on LLP64 targets.
                fprintf(f, "%d,%d,%llu,%.6f\n", l, e,
                        static_cast<unsigned long long>(hits), share);
            }
        }
        fclose(f);
        printf("[PROFILER] Expert activations → %s (%llu tokens)\n",
               path, static_cast<unsigned long long>(total_tokens_));
    }

    // Dump summary: per-layer analysis
    // For every layer writes the number of active / dead experts, how many of
    // the most-used experts are needed to cover 90/95/99 % of that layer's
    // activations, and the single most-used expert. Then writes the same
    // coverage numbers averaged over the layers that recorded at least one
    // activation, followed by rough model-size estimates.
    // Prints a message and returns if the file cannot be opened.
    void dump_summary(const char* path) const {
        FILE* f = (path != nullptr) ? fopen(path, "w") : nullptr;
        if (!f) {
            printf("[PROFILER] Cannot write %s\n", path != nullptr ? path : "(null)");
            return;
        }

        fprintf(f, "# KIMI-SIGNAL-935 Expert Profile | %llu tokens\n\n",
                static_cast<unsigned long long>(total_tokens_));

        // Accumulated per layer so the averages below reflect every layer,
        // not just the first one that has data.
        double sum_90 = 0.0, sum_95 = 0.0, sum_99 = 0.0;
        int layers_with_data = 0;

        for (int l = 0; l < n_layers_; ++l) {
            // Active experts of this layer, most used first.
            const std::vector<Ranked> ranked = ranked_layer(l);
            uint64_t layer_total = 0;
            for (const Ranked& r : ranked) layer_total += r.first;

            const Coverage cov = coverage_of(ranked, layer_total);
            const int active = static_cast<int>(ranked.size());
            const int dead = n_experts_ - active;

            fprintf(f, "Layer %2d: %3d active, %3d dead | "
                       "90%%=%3d experts, 95%%=%3d, 99%%=%3d | "
                       "top expert: #%d (%.1f%%)\n",
                    l, active, dead,
                    cov.n90, cov.n95, cov.n99,
                    ranked.empty() ? -1 : ranked[0].second,
                    ranked.empty() ? 0.0 :
                        100.0 * ranked[0].first / layer_total);

            if (layer_total > 0) {
                sum_90 += cov.n90;
                sum_95 += cov.n95;
                sum_99 += cov.n99;
                ++layers_with_data;
            }
        }

        // Global recommendation
        fprintf(f, "\n# PRUNING RECOMMENDATION\n");

        // Average across layers that saw traffic; layers without any
        // activation carry no information and would only dilute the result.
        const double avg_90 = layers_with_data > 0 ? sum_90 / layers_with_data : 0.0;
        const double avg_95 = layers_with_data > 0 ? sum_95 / layers_with_data : 0.0;
        const double avg_99 = layers_with_data > 0 ? sum_99 / layers_with_data : 0.0;

        fprintf(f, "\nAverage experts for 90%% signal: %.0f\n", avg_90);
        fprintf(f, "Average experts for 95%% signal: %.0f\n", avg_95);
        fprintf(f, "Average experts for 99%% signal: %.0f\n", avg_99);
        fprintf(f, "\nEstimated model sizes:\n");

        // NOTE(review): 226 GB and the 8 "non-shared" experts look specific to
        // the profiled checkpoint — confirm before using with another model.
        const double size_full = 226.0; // GB
        const int shared_experts = 8;
        if (n_experts_ > 0) {  // the ratios below divide by n_experts_
            const double expert_ratio =
                (double)(n_experts_ - shared_experts) / n_experts_; // non-shared
            const int kept_options[] = {32, 64, 128};
            for (const int kept : kept_options) {
                const double kept_fraction = (double)kept / n_experts_;
                fprintf(f, "%4d experts: ~%.0f GB\n", kept,
                        size_full * (1.0 - expert_ratio * (1.0 - kept_fraction)));
            }
        }

        fclose(f);
        printf("[PROFILER] Summary → %s\n", path);
    }

    // Get the top-N experts globally (union across all layers)
    // For each layer the `top_n_per_layer` most-activated experts are taken
    // and their counts added to a per-expert global score. Returns the ids of
    // all experts with a non-zero score, highest score first (ties: lower id
    // first). A non-positive `top_n_per_layer` yields an empty result.
    std::vector<int> get_essential_experts(int top_n_per_layer) const {
        const size_t keep = static_cast<size_t>(
            std::max(0, std::min(top_n_per_layer, n_experts_)));

        std::vector<uint64_t> global(static_cast<size_t>(n_experts_), 0);
        for (int l = 0; l < n_layers_; ++l) {
            // Experts with a zero count are omitted by ranked_layer(); they
            // would add nothing to the score anyway.
            const std::vector<Ranked> ranked = ranked_layer(l);
            const size_t take = std::min(keep, ranked.size());
            for (size_t i = 0; i < take; ++i) {
                global[static_cast<size_t>(ranked[i].second)] += ranked[i].first;
            }
        }

        // Sort globally
        std::vector<Ranked> gsorted;
        for (int e = 0; e < n_experts_; ++e) {
            const uint64_t score = global[static_cast<size_t>(e)];
            if (score > 0) gsorted.push_back(Ranked(score, e));
        }
        std::sort(gsorted.begin(), gsorted.end(), more_active);

        std::vector<int> result;
        result.reserve(gsorted.size());
        for (const Ranked& p : gsorted) result.push_back(p.second);
        return result;
    }

    // Number of record() calls made for layer 0 since the last init().
    uint64_t total_tokens() const { return total_tokens_; }
    // Whether record() currently collects data.
    bool enabled() const { return enabled_; }
    void enable() { enabled_ = true; }
    void disable() { enabled_ = false; }

private:
    // (activation count, expert id)
    using Ranked = std::pair<uint64_t, int>;

    // How many of the most-used experts are needed to reach 90/95/99 % of a
    // layer's activations; 0 means the layer recorded nothing.
    struct Coverage {
        int n90 = 0;
        int n95 = 0;
        int n99 = 0;
    };

    // True when `eid` is a usable index into the per-expert tables.
    bool valid_expert(int eid) const { return eid >= 0 && eid < n_experts_; }

    // Orders by count descending; equal counts by ascending expert id so the
    // resulting order is deterministic.
    static bool more_active(const Ranked& a, const Ranked& b) {
        if (a.first != b.first) return a.first > b.first;
        return a.second < b.second;
    }

    // Returns the experts of `layer` that have a non-zero count, most used
    // first.
    std::vector<Ranked> ranked_layer(int layer) const {
        const std::vector<uint64_t>& layer_counts = counts_[static_cast<size_t>(layer)];
        std::vector<Ranked> ranked;
        for (int e = 0; e < n_experts_; ++e) {
            const uint64_t hits = layer_counts[static_cast<size_t>(e)];
            if (hits > 0) ranked.push_back(Ranked(hits, e));
        }
        std::sort(ranked.begin(), ranked.end(), more_active);
        return ranked;
    }

    // Walks `ranked` (already sorted, most used first) and reports after how
    // many experts the cumulative share of `total` first reaches each
    // threshold.
    static Coverage coverage_of(const std::vector<Ranked>& ranked, uint64_t total) {
        Coverage cov;
        if (total == 0) return cov;
        uint64_t running = 0;
        for (size_t i = 0; i < ranked.size(); ++i) {
            running += ranked[i].first;
            const double share = (double)running / total;
            const int needed = static_cast<int>(i) + 1;
            if (cov.n90 == 0 && share >= 0.90) cov.n90 = needed;
            if (cov.n95 == 0 && share >= 0.95) cov.n95 = needed;
            if (cov.n99 == 0 && share >= 0.99) cov.n99 = needed;
        }
        return cov;
    }

    int n_layers_ = 0;
    int n_experts_ = 0;
    std::vector<std::vector<uint64_t>> counts_;      // [layer][expert_id]
    std::vector<std::vector<uint32_t>> co_occur_;     // [expert][expert], layer 0 only
    uint64_t total_tokens_ = 0;
    bool enabled_ = false;
};

// Global profiler instance.
// NOTE(review): `static` at namespace scope gives this object internal linkage,
// so every translation unit that includes this header gets its own separate
// ExpertProfiler. Confirm that init()/record()/dump_*() are all called from the
// same translation unit; otherwise data recorded in one file will not be
// visible to a dump issued from another.
static ExpertProfiler g_expert_profiler;

} // namespace ix