// runtime/fractal.h — Fractal Inference Protocol // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1 // // The same model breathes Q2→Q4→Q8→FP16 based on what the query needs. // No reloading. No switching files. The precision adapts in real-time. // // Principle: intelligence compression follows the principle of least action. // Simple queries use simple precision. Complex reasoning uses full precision. // The model is one. The view changes. // // Precision selection uses information-theoretic complexity analysis: // H(X) = -Σ p(x)·log2(p(x)) — Shannon entropy of input tokens // C(q) = w₁·H + w₂·len/ctx — composite complexity score // P(l) = quantize(C(q), depth(l)/L) — layer precision mapping // // This follows standard rate-distortion theory (Shannon 1959): // minimize distortion D subject to rate constraint R ≤ R_max // #pragma once #include #include #include #include #include "gemm.h" namespace ix { // ═══════════════════════════════════════════════════════════════════════════ // Query Complexity Analysis // ═══════════════════════════════════════════════════════════════════════════ struct QueryProfile { float entropy; // Token entropy of the input (0=trivial, >2=complex) float depth_demand; // How many layers are likely critical (0-1) float reasoning_score; // Presence of reasoning markers (0-1) int token_count; // Input length }; // Analyze input tokens to determine complexity // No ML model needed — pure information theory inline QueryProfile analyze_query(const std::vector& tokens, int vocab_size) { QueryProfile qp = {}; qp.token_count = (int)tokens.size(); if (tokens.empty() || vocab_size <= 0) return qp; // ── Token entropy ────────────────────────────────────────────────── // H = -Σ p(x) log2 p(x) // High entropy = diverse vocabulary = complex query std::vector freq(std::min(vocab_size, 131072), 0); for (int32_t t : tokens) { if (t >= 0 && t < (int32_t)freq.size()) freq[t]++; } float H = 0.0f; float n = (float)tokens.size(); for (int f : freq) { if (f > 0) { float p = (float)f / n; H -= p * log2f(p); } } qp.entropy = H; // ── Depth demand ─────────────────────────────────────────────────── // Longer, more diverse inputs need deeper processing // Normalized: short simple query → 0.2, long complex → 0.95 float len_factor = std::min(1.0f, (float)tokens.size() / 2048.0f); float ent_factor = std::min(1.0f, H / 8.0f); // max useful entropy ~8 bits qp.depth_demand = 0.3f * len_factor + 0.7f * ent_factor; // ── Reasoning score ──────────────────────────────────────────────── // Repetition ratio: reasoning often revisits concepts int unique = 0; for (int f : freq) if (f > 0) unique++; float unique_ratio = (float)unique / n; // High unique ratio + high entropy = analytical/reasoning // Low unique ratio = repetitive/simple qp.reasoning_score = std::min(1.0f, unique_ratio * ent_factor); return qp; } // ═══════════════════════════════════════════════════════════════════════════ // Precision Map — which dtype for which layer given query complexity // ═══════════════════════════════════════════════════════════════════════════ enum class LayerRole { EMBED, // Embedding layer — always needs decent precision ATTN_Q, // Query projection — critical for attention quality ATTN_K, // Key projection — critical for attention quality ATTN_V, // Value projection — can tolerate lower precision ATTN_O, // Output projection FFN_GATE, // FFN gate — determines information flow FFN_UP, // FFN up projection FFN_DOWN, // FFN down projection — output path, precision matters MOE_GATE, // MoE router — must be precise EXPERT, // MoE expert — can vary by activation frequency HEAD, // Output head — always high precision }; struct PrecisionMap { int n_layers; // For each layer, the target dtype for attention and FFN std::vector attn_dtype; // Precision for attention projections std::vector ffn_dtype; // Precision for FFN/expert layers dtype embed_dtype; // Embedding precision dtype head_dtype; // Output head precision // The fractal schedule: which layers get which precision // Based on the observation that: // - Early layers (pattern matching) can be lower precision // - Middle layers (composition) need moderate precision // - Late layers (decision) need higher precision // - Output head always needs highest available void compute(int layers, dtype base_type, const QueryProfile& qp) { n_layers = layers; attn_dtype.resize(layers, base_type); ffn_dtype.resize(layers, base_type); // Head and embed always at base precision embed_dtype = base_type; head_dtype = base_type; // Trivial query: everything can drop // Complex query: maintain precision throughout float complexity = (qp.depth_demand + qp.reasoning_score) / 2.0f; if (complexity < 0.3f) { // ── FAST MODE: Simple query, aggressive compression ────── // Early 40% of layers → drop 2 levels // Middle 40% → drop 1 level // Last 20% → keep base precision for (int i = 0; i < layers; i++) { float pos = (float)i / (float)layers; // 0=first, 1=last if (pos < 0.4f) { attn_dtype[i] = drop_precision(base_type, 2); ffn_dtype[i] = drop_precision(base_type, 2); } else if (pos < 0.8f) { attn_dtype[i] = drop_precision(base_type, 1); ffn_dtype[i] = drop_precision(base_type, 1); } // else: keep base } } else if (complexity < 0.6f) { // ── BALANCED MODE: Moderate compression ────────────────── // Early 30% drop 1 level, rest at base for (int i = 0; i < layers; i++) { float pos = (float)i / (float)layers; if (pos < 0.3f) { attn_dtype[i] = drop_precision(base_type, 1); ffn_dtype[i] = drop_precision(base_type, 1); } } } // complexity >= 0.6: FULL MODE — keep everything at base precision } // Drop precision by N levels on the K-quant scale // Q8_0 → Q6_K → Q5_K → Q4_K → Q3_K → Q2_K static dtype drop_precision(dtype base, int levels) { // Define the precision ladder static const dtype ladder[] = { dtype::Q2_K, // 0 - lowest dtype::Q3_K, // 1 dtype::Q4_K, // 2 dtype::Q5_K, // 3 dtype::Q6_K, // 4 dtype::Q8_0, // 5 dtype::F16, // 6 dtype::F32, // 7 - highest }; static const int ladder_size = 8; // Find base position int pos = -1; for (int i = 0; i < ladder_size; i++) { if (ladder[i] == base) { pos = i; break; } } if (pos < 0) return base; // Unknown type, don't touch int new_pos = std::max(0, pos - levels); return ladder[new_pos]; } // Memory savings estimate float memory_ratio() const { if (n_layers == 0) return 1.0f; float base_bytes = 0, fractal_bytes = 0; dtype base = head_dtype; // Assume head is at base precision for (int i = 0; i < n_layers; i++) { // Rough: each layer has attention (4 matrices) + FFN (3 matrices) float base_layer = dtype_bytes_approx(base) * 7; float frac_layer = dtype_bytes_approx(attn_dtype[i]) * 4 + dtype_bytes_approx(ffn_dtype[i]) * 3; base_bytes += base_layer; fractal_bytes += frac_layer; } return (base_bytes > 0) ? fractal_bytes / base_bytes : 1.0f; } static float dtype_bytes_approx(dtype t) { switch (t) { case dtype::F32: return 4.0f; case dtype::F16: return 2.0f; case dtype::BF16: return 2.0f; case dtype::Q8_0: return 1.0625f; // 34/32 case dtype::Q6_K: return 0.8203f; // 210/256 case dtype::Q5_K: return 0.6875f; // 176/256 case dtype::Q4_K: return 0.5625f; // 144/256 case dtype::Q3_K: return 0.4297f; // 110/256 case dtype::Q2_K: return 0.3281f; // 84/256 default: return 2.0f; } } void print_schedule() const { printf("\n╔═══════════════════════════════════════════════════╗\n"); printf("║ Fractal Inference — Precision Schedule ║\n"); printf("╠═══════════════════════════════════════════════════╣\n"); printf("║ Embed: %-8s Head: %-8s ║\n", dtype_name(embed_dtype), dtype_name(head_dtype)); printf("╠═══════════════════════════════════════════════════╣\n"); // Group consecutive identical layers int i = 0; while (i < n_layers) { int j = i; while (j < n_layers && attn_dtype[j] == attn_dtype[i] && ffn_dtype[j] == ffn_dtype[i]) j++; if (j - i == 1) { printf("║ Layer %2d : attn=%-6s ffn=%-6s ║\n", i, dtype_name(attn_dtype[i]), dtype_name(ffn_dtype[i])); } else { printf("║ Layers %2d-%-2d : attn=%-6s ffn=%-6s ║\n", i, j-1, dtype_name(attn_dtype[i]), dtype_name(ffn_dtype[i])); } i = j; } printf("╠═══════════════════════════════════════════════════╣\n"); printf("║ Memory ratio: %.1f%% of base ║\n", memory_ratio() * 100.0f); printf("╚═══════════════════════════════════════════════════╝\n"); } static const char* dtype_name(dtype t) { switch (t) { case dtype::F32: return "F32"; case dtype::F16: return "F16"; case dtype::BF16: return "BF16"; case dtype::Q8_0: return "Q8_0"; case dtype::Q6_K: return "Q6_K"; case dtype::Q5_K: return "Q5_K"; case dtype::Q4_K: return "Q4_K"; case dtype::Q3_K: return "Q3_K"; case dtype::Q2_K: return "Q2_K"; default: return "???"; } } }; // ═══════════════════════════════════════════════════════════════════════════ // Fractal Engine — orchestrates dynamic precision inference // ═══════════════════════════════════════════════════════════════════════════ class FractalEngine { public: bool enabled = false; PrecisionMap current_map; QueryProfile last_profile; // Stats int queries_total = 0; int queries_fast = 0; // complexity < 0.3 int queries_balanced = 0; // complexity 0.3-0.6 int queries_full = 0; // complexity >= 0.6 float total_savings = 0; // Cumulative memory ratio savings void enable() { enabled = true; } // Analyze query and compute precision map PrecisionMap plan(const std::vector& tokens, int vocab_size, int n_layers, dtype base_type) { last_profile = analyze_query(tokens, vocab_size); current_map.compute(n_layers, base_type, last_profile); // Stats queries_total++; float complexity = (last_profile.depth_demand + last_profile.reasoning_score) / 2.0f; if (complexity < 0.3f) queries_fast++; else if (complexity < 0.6f) queries_balanced++; else queries_full++; total_savings += current_map.memory_ratio(); return current_map; } // Get the dtype that should be used for a specific layer dtype layer_attn_type(int layer) const { if (!enabled || layer >= current_map.n_layers) return dtype::Q4_K; // fallback return current_map.attn_dtype[layer]; } dtype layer_ffn_type(int layer) const { if (!enabled || layer >= current_map.n_layers) return dtype::Q4_K; return current_map.ffn_dtype[layer]; } void print_stats() const { if (queries_total == 0) return; printf("\n[FRACTAL] Queries: %d (fast:%d balanced:%d full:%d)\n", queries_total, queries_fast, queries_balanced, queries_full); printf("[FRACTAL] Avg memory ratio: %.1f%%\n", (total_savings / queries_total) * 100.0f); } }; } // namespace ix