Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
324 lines · 14 KiB · C++
// runtime/fractal.h — Fractal Inference Protocol
|
|
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
|
|
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
|
|
//
|
|
// The same model breathes Q2→Q4→Q8→FP16 based on what the query needs.
|
|
// No reloading. No switching files. The precision adapts in real-time.
|
|
//
|
|
// Principle: intelligence compression follows the principle of least action.
|
|
// Simple queries use simple precision. Complex reasoning uses full precision.
|
|
// The model is one. The view changes.
|
|
//
|
|
// Precision selection uses information-theoretic complexity analysis:
|
|
// H(X) = -Σ p(x)·log2(p(x)) — Shannon entropy of input tokens
|
|
// C(q) = w₁·H + w₂·len/ctx — composite complexity score
|
|
// P(l) = quantize(C(q), depth(l)/L) — layer precision mapping
|
|
//
|
|
// This follows standard rate-distortion theory (Shannon 1959):
|
|
// minimize distortion D subject to rate constraint R ≤ R_max
|
|
//
|
|
#pragma once
|
|
#include <cmath>
|
|
#include <vector>
|
|
#include <cstdio>
|
|
#include <algorithm>
|
|
#include "gemm.h"
|
|
|
|
namespace ix {
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// Query Complexity Analysis
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
// Complexity profile of a single query, derived purely from the input
// token statistics (no ML model involved). Produced by analyze_query()
// and consumed by PrecisionMap::compute() to choose per-layer precision.
struct QueryProfile {
    float entropy;         // Token entropy of the input (0=trivial, >2=complex)
    float depth_demand;    // How many layers are likely critical (0-1)
    float reasoning_score; // Presence of reasoning markers (0-1)
    int token_count;       // Input length in tokens
};
|
|
|
|
// Analyze input tokens to determine complexity
|
|
// No ML model needed — pure information theory
|
|
// Analyze input tokens to determine query complexity.
// No ML model needed — pure information theory.
//
// tokens:     input token ids (out-of-range ids are ignored)
// vocab_size: vocabulary size used to bound the histogram
// Returns a zeroed profile (except token_count) for empty input or a
// non-positive vocab_size.
inline QueryProfile analyze_query(const std::vector<int32_t>& tokens, int vocab_size) {
    QueryProfile profile = {};
    profile.token_count = (int)tokens.size();

    if (tokens.empty() || vocab_size <= 0) return profile;

    // ── Token entropy: H = -Σ p(x)·log2 p(x) ───────────────────────────
    // Diverse vocabulary (high entropy) signals a complex query.
    // The histogram is capped at 128K buckets to bound memory; token ids
    // beyond the cap are simply not counted.
    const int hist_size = std::min(vocab_size, 131072);
    std::vector<int> hist(hist_size, 0);
    for (int32_t tok : tokens) {
        if (tok >= 0 && tok < (int32_t)hist.size()) hist[tok]++;
    }

    const float total = (float)tokens.size();
    float entropy = 0.0f;
    int distinct = 0;
    for (int count : hist) {
        if (count <= 0) continue;
        distinct++;
        const float p = (float)count / total;
        entropy -= p * log2f(p);
    }
    profile.entropy = entropy;

    // ── Depth demand ───────────────────────────────────────────────────
    // Longer, more diverse inputs need deeper processing.
    // Normalized: short simple query → ~0.2, long complex → ~0.95.
    const float len_factor = std::min(1.0f, (float)tokens.size() / 2048.0f);
    const float ent_factor = std::min(1.0f, entropy / 8.0f); // max useful entropy ~8 bits
    profile.depth_demand = 0.3f * len_factor + 0.7f * ent_factor;

    // ── Reasoning score ────────────────────────────────────────────────
    // High unique-token ratio combined with high entropy reads as
    // analytical/reasoning; a low unique ratio means repetitive/simple.
    const float unique_ratio = (float)distinct / total;
    profile.reasoning_score = std::min(1.0f, unique_ratio * ent_factor);

    return profile;
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// Precision Map — which dtype for which layer given query complexity
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
// Functional role of a weight matrix inside the model. Used to reason
// about how much quantization a given tensor can tolerate.
enum class LayerRole {
    EMBED,    // Embedding layer — always needs decent precision
    ATTN_Q,   // Query projection — critical for attention quality
    ATTN_K,   // Key projection — critical for attention quality
    ATTN_V,   // Value projection — can tolerate lower precision
    ATTN_O,   // Output projection
    FFN_GATE, // FFN gate — determines information flow
    FFN_UP,   // FFN up projection
    FFN_DOWN, // FFN down projection — output path, precision matters
    MOE_GATE, // MoE router — must be precise
    EXPERT,   // MoE expert — can vary by activation frequency
    HEAD,     // Output head — always high precision
};
|
|
|
|
struct PrecisionMap {
|
|
int n_layers;
|
|
// For each layer, the target dtype for attention and FFN
|
|
std::vector<dtype> attn_dtype; // Precision for attention projections
|
|
std::vector<dtype> ffn_dtype; // Precision for FFN/expert layers
|
|
dtype embed_dtype; // Embedding precision
|
|
dtype head_dtype; // Output head precision
|
|
|
|
// The fractal schedule: which layers get which precision
|
|
// Based on the observation that:
|
|
// - Early layers (pattern matching) can be lower precision
|
|
// - Middle layers (composition) need moderate precision
|
|
// - Late layers (decision) need higher precision
|
|
// - Output head always needs highest available
|
|
|
|
void compute(int layers, dtype base_type, const QueryProfile& qp) {
|
|
n_layers = layers;
|
|
attn_dtype.resize(layers, base_type);
|
|
ffn_dtype.resize(layers, base_type);
|
|
|
|
// Head and embed always at base precision
|
|
embed_dtype = base_type;
|
|
head_dtype = base_type;
|
|
|
|
// Trivial query: everything can drop
|
|
// Complex query: maintain precision throughout
|
|
float complexity = (qp.depth_demand + qp.reasoning_score) / 2.0f;
|
|
|
|
if (complexity < 0.3f) {
|
|
// ── FAST MODE: Simple query, aggressive compression ──────
|
|
// Early 40% of layers → drop 2 levels
|
|
// Middle 40% → drop 1 level
|
|
// Last 20% → keep base precision
|
|
for (int i = 0; i < layers; i++) {
|
|
float pos = (float)i / (float)layers; // 0=first, 1=last
|
|
if (pos < 0.4f) {
|
|
attn_dtype[i] = drop_precision(base_type, 2);
|
|
ffn_dtype[i] = drop_precision(base_type, 2);
|
|
} else if (pos < 0.8f) {
|
|
attn_dtype[i] = drop_precision(base_type, 1);
|
|
ffn_dtype[i] = drop_precision(base_type, 1);
|
|
}
|
|
// else: keep base
|
|
}
|
|
} else if (complexity < 0.6f) {
|
|
// ── BALANCED MODE: Moderate compression ──────────────────
|
|
// Early 30% drop 1 level, rest at base
|
|
for (int i = 0; i < layers; i++) {
|
|
float pos = (float)i / (float)layers;
|
|
if (pos < 0.3f) {
|
|
attn_dtype[i] = drop_precision(base_type, 1);
|
|
ffn_dtype[i] = drop_precision(base_type, 1);
|
|
}
|
|
}
|
|
}
|
|
// complexity >= 0.6: FULL MODE — keep everything at base precision
|
|
}
|
|
|
|
// Drop precision by N levels on the K-quant scale
|
|
// Q8_0 → Q6_K → Q5_K → Q4_K → Q3_K → Q2_K
|
|
static dtype drop_precision(dtype base, int levels) {
|
|
// Define the precision ladder
|
|
static const dtype ladder[] = {
|
|
dtype::Q2_K, // 0 - lowest
|
|
dtype::Q3_K, // 1
|
|
dtype::Q4_K, // 2
|
|
dtype::Q5_K, // 3
|
|
dtype::Q6_K, // 4
|
|
dtype::Q8_0, // 5
|
|
dtype::F16, // 6
|
|
dtype::F32, // 7 - highest
|
|
};
|
|
static const int ladder_size = 8;
|
|
|
|
// Find base position
|
|
int pos = -1;
|
|
for (int i = 0; i < ladder_size; i++) {
|
|
if (ladder[i] == base) { pos = i; break; }
|
|
}
|
|
if (pos < 0) return base; // Unknown type, don't touch
|
|
|
|
int new_pos = std::max(0, pos - levels);
|
|
return ladder[new_pos];
|
|
}
|
|
|
|
// Memory savings estimate
|
|
float memory_ratio() const {
|
|
if (n_layers == 0) return 1.0f;
|
|
float base_bytes = 0, fractal_bytes = 0;
|
|
dtype base = head_dtype; // Assume head is at base precision
|
|
|
|
for (int i = 0; i < n_layers; i++) {
|
|
// Rough: each layer has attention (4 matrices) + FFN (3 matrices)
|
|
float base_layer = dtype_bytes_approx(base) * 7;
|
|
float frac_layer = dtype_bytes_approx(attn_dtype[i]) * 4
|
|
+ dtype_bytes_approx(ffn_dtype[i]) * 3;
|
|
base_bytes += base_layer;
|
|
fractal_bytes += frac_layer;
|
|
}
|
|
return (base_bytes > 0) ? fractal_bytes / base_bytes : 1.0f;
|
|
}
|
|
|
|
static float dtype_bytes_approx(dtype t) {
|
|
switch (t) {
|
|
case dtype::F32: return 4.0f;
|
|
case dtype::F16: return 2.0f;
|
|
case dtype::BF16: return 2.0f;
|
|
case dtype::Q8_0: return 1.0625f; // 34/32
|
|
case dtype::Q6_K: return 0.8203f; // 210/256
|
|
case dtype::Q5_K: return 0.6875f; // 176/256
|
|
case dtype::Q4_K: return 0.5625f; // 144/256
|
|
case dtype::Q3_K: return 0.4297f; // 110/256
|
|
case dtype::Q2_K: return 0.3281f; // 84/256
|
|
default: return 2.0f;
|
|
}
|
|
}
|
|
|
|
void print_schedule() const {
|
|
printf("\n╔═══════════════════════════════════════════════════╗\n");
|
|
printf("║ Fractal Inference — Precision Schedule ║\n");
|
|
printf("╠═══════════════════════════════════════════════════╣\n");
|
|
printf("║ Embed: %-8s Head: %-8s ║\n",
|
|
dtype_name(embed_dtype), dtype_name(head_dtype));
|
|
printf("╠═══════════════════════════════════════════════════╣\n");
|
|
|
|
// Group consecutive identical layers
|
|
int i = 0;
|
|
while (i < n_layers) {
|
|
int j = i;
|
|
while (j < n_layers && attn_dtype[j] == attn_dtype[i]
|
|
&& ffn_dtype[j] == ffn_dtype[i]) j++;
|
|
|
|
if (j - i == 1) {
|
|
printf("║ Layer %2d : attn=%-6s ffn=%-6s ║\n",
|
|
i, dtype_name(attn_dtype[i]), dtype_name(ffn_dtype[i]));
|
|
} else {
|
|
printf("║ Layers %2d-%-2d : attn=%-6s ffn=%-6s ║\n",
|
|
i, j-1, dtype_name(attn_dtype[i]), dtype_name(ffn_dtype[i]));
|
|
}
|
|
i = j;
|
|
}
|
|
|
|
printf("╠═══════════════════════════════════════════════════╣\n");
|
|
printf("║ Memory ratio: %.1f%% of base ║\n",
|
|
memory_ratio() * 100.0f);
|
|
printf("╚═══════════════════════════════════════════════════╝\n");
|
|
}
|
|
|
|
static const char* dtype_name(dtype t) {
|
|
switch (t) {
|
|
case dtype::F32: return "F32";
|
|
case dtype::F16: return "F16";
|
|
case dtype::BF16: return "BF16";
|
|
case dtype::Q8_0: return "Q8_0";
|
|
case dtype::Q6_K: return "Q6_K";
|
|
case dtype::Q5_K: return "Q5_K";
|
|
case dtype::Q4_K: return "Q4_K";
|
|
case dtype::Q3_K: return "Q3_K";
|
|
case dtype::Q2_K: return "Q2_K";
|
|
default: return "???";
|
|
}
|
|
}
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// Fractal Engine — orchestrates dynamic precision inference
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
class FractalEngine {
|
|
public:
|
|
bool enabled = false;
|
|
PrecisionMap current_map;
|
|
QueryProfile last_profile;
|
|
|
|
// Stats
|
|
int queries_total = 0;
|
|
int queries_fast = 0; // complexity < 0.3
|
|
int queries_balanced = 0; // complexity 0.3-0.6
|
|
int queries_full = 0; // complexity >= 0.6
|
|
float total_savings = 0; // Cumulative memory ratio savings
|
|
|
|
void enable() { enabled = true; }
|
|
|
|
// Analyze query and compute precision map
|
|
PrecisionMap plan(const std::vector<int32_t>& tokens,
|
|
int vocab_size, int n_layers, dtype base_type) {
|
|
last_profile = analyze_query(tokens, vocab_size);
|
|
current_map.compute(n_layers, base_type, last_profile);
|
|
|
|
// Stats
|
|
queries_total++;
|
|
float complexity = (last_profile.depth_demand + last_profile.reasoning_score) / 2.0f;
|
|
if (complexity < 0.3f) queries_fast++;
|
|
else if (complexity < 0.6f) queries_balanced++;
|
|
else queries_full++;
|
|
total_savings += current_map.memory_ratio();
|
|
|
|
return current_map;
|
|
}
|
|
|
|
// Get the dtype that should be used for a specific layer
|
|
dtype layer_attn_type(int layer) const {
|
|
if (!enabled || layer >= current_map.n_layers) return dtype::Q4_K; // fallback
|
|
return current_map.attn_dtype[layer];
|
|
}
|
|
|
|
dtype layer_ffn_type(int layer) const {
|
|
if (!enabled || layer >= current_map.n_layers) return dtype::Q4_K;
|
|
return current_map.ffn_dtype[layer];
|
|
}
|
|
|
|
void print_stats() const {
|
|
if (queries_total == 0) return;
|
|
printf("\n[FRACTAL] Queries: %d (fast:%d balanced:%d full:%d)\n",
|
|
queries_total, queries_fast, queries_balanced, queries_full);
|
|
printf("[FRACTAL] Avg memory ratio: %.1f%%\n",
|
|
(total_savings / queries_total) * 100.0f);
|
|
}
|
|
};
|
|
|
|
} // namespace ix
|