inference-x/runtime/fractal.h
Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
Better output from the same model. Fused computation, adaptive precision,
surgical expert loading. 305 KB, 19 backends, zero dependencies.

https://inference-x.com
2026-02-23 07:10:47 +00:00

324 lines
14 KiB
C++

// runtime/fractal.h — Fractal Inference Protocol
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
//
// The same model breathes Q2→Q4→Q8→FP16 based on what the query needs.
// No reloading. No switching files. The precision adapts in real-time.
//
// Principle: intelligence compression follows the principle of least action.
// Simple queries use simple precision. Complex reasoning uses full precision.
// The model is one. The view changes.
//
// Precision selection uses information-theoretic complexity analysis:
// H(X) = -Σ p(x)·log2(p(x)) — Shannon entropy of input tokens
// C(q) = w₁·H + w₂·len/ctx — composite complexity score
// P(l) = quantize(C(q), depth(l)/L) — layer precision mapping
//
// This follows standard rate-distortion theory (Shannon 1959):
// minimize distortion D subject to rate constraint R ≤ R_max
//
#pragma once
#include <cmath>
#include <vector>
#include <cstdio>
#include <algorithm>
#include "gemm.h"
namespace ix {
// ═══════════════════════════════════════════════════════════════════════════
// Query Complexity Analysis
// ═══════════════════════════════════════════════════════════════════════════
// Complexity profile of a single query, derived purely from its token stream.
struct QueryProfile {
float entropy; // Token entropy of the input (0=trivial, >2=complex)
float depth_demand; // How many layers are likely critical (0-1)
float reasoning_score; // Presence of reasoning markers (0-1)
int token_count; // Input length
};
// Profile the input tokens with plain information theory — no ML model.
// Empty input or a non-positive vocab yields a zeroed profile.
inline QueryProfile analyze_query(const std::vector<int32_t>& tokens, int vocab_size) {
    QueryProfile profile = {};
    profile.token_count = (int)tokens.size();
    if (tokens.empty() || vocab_size <= 0) return profile;

    // ── Token histogram ────────────────────────────────────────────────
    // Capped at 128K entries so a pathological vocab can't blow up memory;
    // ids outside the table are simply not counted.
    const int table_size = std::min(vocab_size, 131072);
    std::vector<int> counts(table_size, 0);
    for (int32_t id : tokens) {
        if (id >= 0 && id < (int32_t)counts.size()) counts[id]++;
    }

    // ── Entropy + distinct-token tally in one ascending pass ──────────
    // H = -Σ p(x) log2 p(x); high entropy = diverse vocabulary = complex.
    const float total = (float)tokens.size();
    float shannon = 0.0f;
    int distinct = 0;
    for (int c : counts) {
        if (c > 0) {
            distinct++;
            const float p = (float)c / total;
            shannon -= p * log2f(p);
        }
    }
    profile.entropy = shannon;

    // ── Depth demand ───────────────────────────────────────────────────
    // Blend of length and vocabulary diversity; longer, more diverse
    // inputs are assumed to need deeper processing.
    const float by_length = std::min(1.0f, (float)tokens.size() / 2048.0f);
    const float by_entropy = std::min(1.0f, shannon / 8.0f); // ~8 bits is the useful ceiling
    profile.depth_demand = 0.3f * by_length + 0.7f * by_entropy;

    // ── Reasoning score ────────────────────────────────────────────────
    // Diverse, low-repetition inputs look analytical; repetitive ones
    // look simple.
    const float distinct_ratio = (float)distinct / total;
    profile.reasoning_score = std::min(1.0f, distinct_ratio * by_entropy);
    return profile;
}
// ═══════════════════════════════════════════════════════════════════════════
// Precision Map — which dtype for which layer given query complexity
// ═══════════════════════════════════════════════════════════════════════════
// Role a weight matrix plays inside the transformer, used to reason about
// how much precision each matrix class deserves.
// NOTE(review): not referenced by the visible code in this header —
// presumably consumed by the loader/backends; confirm before removing.
enum class LayerRole {
EMBED, // Embedding layer — always needs decent precision
ATTN_Q, // Query projection — critical for attention quality
ATTN_K, // Key projection — critical for attention quality
ATTN_V, // Value projection — can tolerate lower precision
ATTN_O, // Output projection
FFN_GATE, // FFN gate — determines information flow
FFN_UP, // FFN up projection
FFN_DOWN, // FFN down projection — output path, precision matters
MOE_GATE, // MoE router — must be precise
EXPERT, // MoE expert — can vary by activation frequency
HEAD, // Output head — always high precision
};
struct PrecisionMap {
int n_layers;
// For each layer, the target dtype for attention and FFN
std::vector<dtype> attn_dtype; // Precision for attention projections
std::vector<dtype> ffn_dtype; // Precision for FFN/expert layers
dtype embed_dtype; // Embedding precision
dtype head_dtype; // Output head precision
// The fractal schedule: which layers get which precision
// Based on the observation that:
// - Early layers (pattern matching) can be lower precision
// - Middle layers (composition) need moderate precision
// - Late layers (decision) need higher precision
// - Output head always needs highest available
void compute(int layers, dtype base_type, const QueryProfile& qp) {
n_layers = layers;
attn_dtype.resize(layers, base_type);
ffn_dtype.resize(layers, base_type);
// Head and embed always at base precision
embed_dtype = base_type;
head_dtype = base_type;
// Trivial query: everything can drop
// Complex query: maintain precision throughout
float complexity = (qp.depth_demand + qp.reasoning_score) / 2.0f;
if (complexity < 0.3f) {
// ── FAST MODE: Simple query, aggressive compression ──────
// Early 40% of layers → drop 2 levels
// Middle 40% → drop 1 level
// Last 20% → keep base precision
for (int i = 0; i < layers; i++) {
float pos = (float)i / (float)layers; // 0=first, 1=last
if (pos < 0.4f) {
attn_dtype[i] = drop_precision(base_type, 2);
ffn_dtype[i] = drop_precision(base_type, 2);
} else if (pos < 0.8f) {
attn_dtype[i] = drop_precision(base_type, 1);
ffn_dtype[i] = drop_precision(base_type, 1);
}
// else: keep base
}
} else if (complexity < 0.6f) {
// ── BALANCED MODE: Moderate compression ──────────────────
// Early 30% drop 1 level, rest at base
for (int i = 0; i < layers; i++) {
float pos = (float)i / (float)layers;
if (pos < 0.3f) {
attn_dtype[i] = drop_precision(base_type, 1);
ffn_dtype[i] = drop_precision(base_type, 1);
}
}
}
// complexity >= 0.6: FULL MODE — keep everything at base precision
}
// Drop precision by N levels on the K-quant scale
// Q8_0 → Q6_K → Q5_K → Q4_K → Q3_K → Q2_K
static dtype drop_precision(dtype base, int levels) {
// Define the precision ladder
static const dtype ladder[] = {
dtype::Q2_K, // 0 - lowest
dtype::Q3_K, // 1
dtype::Q4_K, // 2
dtype::Q5_K, // 3
dtype::Q6_K, // 4
dtype::Q8_0, // 5
dtype::F16, // 6
dtype::F32, // 7 - highest
};
static const int ladder_size = 8;
// Find base position
int pos = -1;
for (int i = 0; i < ladder_size; i++) {
if (ladder[i] == base) { pos = i; break; }
}
if (pos < 0) return base; // Unknown type, don't touch
int new_pos = std::max(0, pos - levels);
return ladder[new_pos];
}
// Memory savings estimate
float memory_ratio() const {
if (n_layers == 0) return 1.0f;
float base_bytes = 0, fractal_bytes = 0;
dtype base = head_dtype; // Assume head is at base precision
for (int i = 0; i < n_layers; i++) {
// Rough: each layer has attention (4 matrices) + FFN (3 matrices)
float base_layer = dtype_bytes_approx(base) * 7;
float frac_layer = dtype_bytes_approx(attn_dtype[i]) * 4
+ dtype_bytes_approx(ffn_dtype[i]) * 3;
base_bytes += base_layer;
fractal_bytes += frac_layer;
}
return (base_bytes > 0) ? fractal_bytes / base_bytes : 1.0f;
}
static float dtype_bytes_approx(dtype t) {
switch (t) {
case dtype::F32: return 4.0f;
case dtype::F16: return 2.0f;
case dtype::BF16: return 2.0f;
case dtype::Q8_0: return 1.0625f; // 34/32
case dtype::Q6_K: return 0.8203f; // 210/256
case dtype::Q5_K: return 0.6875f; // 176/256
case dtype::Q4_K: return 0.5625f; // 144/256
case dtype::Q3_K: return 0.4297f; // 110/256
case dtype::Q2_K: return 0.3281f; // 84/256
default: return 2.0f;
}
}
void print_schedule() const {
printf("\n╔═══════════════════════════════════════════════════╗\n");
printf("║ Fractal Inference — Precision Schedule ║\n");
printf("╠═══════════════════════════════════════════════════╣\n");
printf("║ Embed: %-8s Head: %-8s ║\n",
dtype_name(embed_dtype), dtype_name(head_dtype));
printf("╠═══════════════════════════════════════════════════╣\n");
// Group consecutive identical layers
int i = 0;
while (i < n_layers) {
int j = i;
while (j < n_layers && attn_dtype[j] == attn_dtype[i]
&& ffn_dtype[j] == ffn_dtype[i]) j++;
if (j - i == 1) {
printf("║ Layer %2d : attn=%-6s ffn=%-6s ║\n",
i, dtype_name(attn_dtype[i]), dtype_name(ffn_dtype[i]));
} else {
printf("║ Layers %2d-%-2d : attn=%-6s ffn=%-6s ║\n",
i, j-1, dtype_name(attn_dtype[i]), dtype_name(ffn_dtype[i]));
}
i = j;
}
printf("╠═══════════════════════════════════════════════════╣\n");
printf("║ Memory ratio: %.1f%% of base ║\n",
memory_ratio() * 100.0f);
printf("╚═══════════════════════════════════════════════════╝\n");
}
static const char* dtype_name(dtype t) {
switch (t) {
case dtype::F32: return "F32";
case dtype::F16: return "F16";
case dtype::BF16: return "BF16";
case dtype::Q8_0: return "Q8_0";
case dtype::Q6_K: return "Q6_K";
case dtype::Q5_K: return "Q5_K";
case dtype::Q4_K: return "Q4_K";
case dtype::Q3_K: return "Q3_K";
case dtype::Q2_K: return "Q2_K";
default: return "???";
}
}
};
// ═══════════════════════════════════════════════════════════════════════════
// Fractal Engine — orchestrates dynamic precision inference
// ═══════════════════════════════════════════════════════════════════════════
// Orchestrates dynamic-precision inference: profiles each query, builds a
// PrecisionMap for it, and serves per-layer dtype lookups plus usage stats.
class FractalEngine {
public:
    bool enabled = false;
    PrecisionMap current_map;  // Schedule from the most recent plan() call
    QueryProfile last_profile; // Complexity profile from the most recent plan() call

    // Stats
    int queries_total = 0;
    int queries_fast = 0;     // complexity < 0.3
    int queries_balanced = 0; // complexity 0.3-0.6
    int queries_full = 0;     // complexity >= 0.6
    float total_savings = 0;  // Sum of per-query memory ratios (avg = total/queries)

    void enable() { enabled = true; }

    // Analyze the query and compute the precision map for it.
    // The complexity thresholds here mirror PrecisionMap::compute(), so the
    // fast/balanced/full counters match the mode the map actually used.
    PrecisionMap plan(const std::vector<int32_t>& tokens,
                      int vocab_size, int n_layers, dtype base_type) {
        last_profile = analyze_query(tokens, vocab_size);
        current_map.compute(n_layers, base_type, last_profile);
        // Stats
        queries_total++;
        float complexity = (last_profile.depth_demand + last_profile.reasoning_score) / 2.0f;
        if (complexity < 0.3f) queries_fast++;
        else if (complexity < 0.6f) queries_balanced++;
        else queries_full++;
        total_savings += current_map.memory_ratio();
        return current_map;
    }

    // dtype for a layer's attention projections. Falls back to Q4_K when
    // the engine is disabled or `layer` is out of range — including
    // negative indices, which the previous check let through (OOB read).
    dtype layer_attn_type(int layer) const {
        if (!enabled || layer < 0 || layer >= current_map.n_layers) return dtype::Q4_K; // fallback
        return current_map.attn_dtype[layer];
    }
    // dtype for a layer's FFN/expert matrices; same bounds rules as above.
    dtype layer_ffn_type(int layer) const {
        if (!enabled || layer < 0 || layer >= current_map.n_layers) return dtype::Q4_K;
        return current_map.ffn_dtype[layer];
    }

    // Print cumulative usage stats; silent until at least one query planned.
    void print_stats() const {
        if (queries_total == 0) return;
        printf("\n[FRACTAL] Queries: %d (fast:%d balanced:%d full:%d)\n",
               queries_total, queries_fast, queries_balanced, queries_full);
        printf("[FRACTAL] Avg memory ratio: %.1f%%\n",
               (total_savings / queries_total) * 100.0f);
    }
};
} // namespace ix