Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
324 lines · 14 KiB · C++
// runtime/fractal.h — Fractal Inference Protocol
|
|
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
|
|
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
|
|
//
|
|
// The same model breathes Q2→Q4→Q8→FP16 based on what the query needs.
|
|
// No reloading. No switching files. The precision adapts in real-time.
|
|
//
|
|
// Principle: intelligence compression follows the principle of least action.
|
|
// Simple queries use simple precision. Complex reasoning uses full precision.
|
|
// The model is one. The view changes.
|
|
//
|
|
// Precision selection uses information-theoretic complexity analysis:
|
|
// H(X) = -Σ p(x)·log2(p(x)) — Shannon entropy of input tokens
|
|
// C(q) = w₁·H + w₂·len/ctx — composite complexity score
|
|
// P(l) = quantize(C(q), depth(l)/L) — layer precision mapping
|
|
//
|
|
// This follows standard rate-distortion theory (Shannon 1959):
|
|
// minimize distortion D subject to rate constraint R ≤ R_max
|
|
//
|
|
#pragma once
|
|
#include <cmath>
|
|
#include <vector>
|
|
#include <cstdio>
|
|
#include <algorithm>
|
|
#include "gemm.h"
|
|
|
|
namespace ix {
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// Query Complexity Analysis
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
// Complexity profile of a single query, derived purely from the input
// token statistics (no ML model involved). Produced by analyze_query()
// and consumed by PrecisionMap::compute() to choose per-layer precision.
struct QueryProfile {
    float entropy;         // Token entropy of the input (0=trivial, >2=complex)
    float depth_demand;    // How many layers are likely critical (0-1)
    float reasoning_score; // Presence of reasoning markers (0-1)
    int token_count;       // Input length in tokens
};
|
|
|
|
// Analyze input tokens to determine complexity
|
|
// No ML model needed — pure information theory
|
|
// Analyze input tokens to determine query complexity.
// No ML model needed — pure information theory.
//
// tokens:     input token ids (out-of-range ids are ignored)
// vocab_size: vocabulary size used to bound the histogram
// Returns a zeroed profile (except token_count) for empty input or a
// non-positive vocab_size.
inline QueryProfile analyze_query(const std::vector<int32_t>& tokens, int vocab_size) {
    QueryProfile profile = {};
    profile.token_count = (int)tokens.size();

    if (tokens.empty() || vocab_size <= 0) return profile;

    // ── Token entropy: H = -Σ p(x)·log2 p(x) ───────────────────────────
    // Diverse vocabulary (high entropy) signals a complex query.
    // The histogram is capped at 128K buckets to bound memory; token ids
    // beyond the cap are simply not counted.
    const int hist_size = std::min(vocab_size, 131072);
    std::vector<int> hist(hist_size, 0);
    for (int32_t tok : tokens) {
        if (tok >= 0 && tok < (int32_t)hist.size()) hist[tok]++;
    }

    const float total = (float)tokens.size();
    float entropy = 0.0f;
    int distinct = 0;
    for (int count : hist) {
        if (count <= 0) continue;
        distinct++;
        const float p = (float)count / total;
        entropy -= p * log2f(p);
    }
    profile.entropy = entropy;

    // ── Depth demand ───────────────────────────────────────────────────
    // Longer, more diverse inputs need deeper processing.
    // Normalized: short simple query → ~0.2, long complex → ~0.95.
    const float len_factor = std::min(1.0f, (float)tokens.size() / 2048.0f);
    const float ent_factor = std::min(1.0f, entropy / 8.0f); // max useful entropy ~8 bits
    profile.depth_demand = 0.3f * len_factor + 0.7f * ent_factor;

    // ── Reasoning score ────────────────────────────────────────────────
    // High unique-token ratio combined with high entropy reads as
    // analytical/reasoning; a low unique ratio means repetitive/simple.
    const float unique_ratio = (float)distinct / total;
    profile.reasoning_score = std::min(1.0f, unique_ratio * ent_factor);

    return profile;
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// Precision Map — which dtype for which layer given query complexity
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
// Functional role of a weight matrix inside the model. Used to reason
// about how much quantization a given tensor can tolerate.
enum class LayerRole {
    EMBED,    // Embedding layer — always needs decent precision
    ATTN_Q,   // Query projection — critical for attention quality
    ATTN_K,   // Key projection — critical for attention quality
    ATTN_V,   // Value projection — can tolerate lower precision
    ATTN_O,   // Output projection
    FFN_GATE, // FFN gate — determines information flow
    FFN_UP,   // FFN up projection
    FFN_DOWN, // FFN down projection — output path, precision matters
    MOE_GATE, // MoE router — must be precise
    EXPERT,   // MoE expert — can vary by activation frequency
    HEAD,     // Output head — always high precision
};
|
|
|
|
struct PrecisionMap {
|
|
int n_layers;
|
|
// For each layer, the target dtype for attention and FFN
|
|
std::vector<dtype> attn_dtype; // Precision for attention projections
|
|
std::vector<dtype> ffn_dtype; // Precision for FFN/expert layers
|
|
dtype embed_dtype; // Embedding precision
|
|
dtype head_dtype; // Output head precision
|
|
|
|
// The fractal schedule: which layers get which precision
|
|
// Based on the observation that:
|
|
// - Early layers (pattern matching) can be lower precision
|
|
// - Middle layers (composition) need moderate precision
|
|
// - Late layers (decision) need higher precision
|
|
// - Output head always needs highest available
|
|
|
|
void compute(int layers, dtype base_type, const QueryProfile& qp) {
|
|
n_layers = layers;
|
|
attn_dtype.resize(layers, base_type);
|
|
ffn_dtype.resize(layers, base_type);
|
|
|
|
// Head and embed always at base precision
|
|
embed_dtype = base_type;
|
|
head_dtype = base_type;
|
|
|
|
// Trivial query: everything can drop
|
|
// Complex query: maintain precision throughout
|
|
float complexity = (qp.depth_demand + qp.reasoning_score) / 2.0f;
|
|
|
|
if (complexity < 0.3f) {
|
|
// ── FAST MODE: Simple query, aggressive compression ──────
|
|
// Early 40% of layers → drop 2 levels
|
|
// Middle 40% → drop 1 level
|
|
// Last 20% → keep base precision
|
|
for (int i = 0; i < layers; i++) {
|
|
float pos = (float)i / (float)layers; // 0=first, 1=last
|
|
if (pos < 0.4f) {
|
|
attn_dtype[i] = drop_precision(base_type, 2);
|
|
ffn_dtype[i] = drop_precision(base_type, 2);
|
|
} else if (pos < 0.8f) {
|
|
attn_dtype[i] = drop_precision(base_type, 1);
|
|
ffn_dtype[i] = drop_precision(base_type, 1);
|
|
}
|
|
// else: keep base
|
|
}
|
|
} else if (complexity < 0.6f) {
|
|
// ── BALANCED MODE: Moderate compression ──────────────────
|
|
// Early 30% drop 1 level, rest at base
|
|
for (int i = 0; i < layers; i++) {
|
|
float pos = (float)i / (float)layers;
|
|
if (pos < 0.3f) {
|
|
attn_dtype[i] = drop_precision(base_type, 1);
|
|
ffn_dtype[i] = drop_precision(base_type, 1);
|
|
}
|
|
}
|
|
}
|
|
// complexity >= 0.6: FULL MODE — keep everything at base precision
|
|
}
|
|
|
|
// Drop precision by N levels on the K-quant scale
|
|
// Q8_0 → Q6_K → Q5_K → Q4_K → Q3_K → Q2_K
|
|
static dtype drop_precision(dtype base, int levels) {
|
|
// Define the precision ladder
|
|
static const dtype ladder[] = {
|
|
dtype::Q2_K, // 0 - lowest
|
|
dtype::Q3_K, // 1
|
|
dtype::Q4_K, // 2
|
|
dtype::Q5_K, // 3
|
|
dtype::Q6_K, // 4
|
|
dtype::Q8_0, // 5
|
|
dtype::F16, // 6
|
|
dtype::F32, // 7 - highest
|
|
};
|
|
static const int ladder_size = 8;
|
|
|
|
// Find base position
|
|
int pos = -1;
|
|
for (int i = 0; i < ladder_size; i++) {
|
|
if (ladder[i] == base) { pos = i; break; }
|
|
}
|
|
if (pos < 0) return base; // Unknown type, don't touch
|
|
|
|
int new_pos = std::max(0, pos - levels);
|
|
return ladder[new_pos];
|
|
}
|
|
|
|
// Memory savings estimate
|
|
float memory_ratio() const {
|
|
if (n_layers == 0) return 1.0f;
|
|
float base_bytes = 0, fractal_bytes = 0;
|
|
dtype base = head_dtype; // Assume head is at base precision
|
|
|
|
for (int i = 0; i < n_layers; i++) {
|
|
// Rough: each layer has attention (4 matrices) + FFN (3 matrices)
|
|
float base_layer = dtype_bytes_approx(base) * 7;
|
|
float frac_layer = dtype_bytes_approx(attn_dtype[i]) * 4
|
|
+ dtype_bytes_approx(ffn_dtype[i]) * 3;
|
|
base_bytes += base_layer;
|
|
fractal_bytes += frac_layer;
|
|
}
|
|
return (base_bytes > 0) ? fractal_bytes / base_bytes : 1.0f;
|
|
}
|
|
|
|
static float dtype_bytes_approx(dtype t) {
|
|
switch (t) {
|
|
case dtype::F32: return 4.0f;
|
|
case dtype::F16: return 2.0f;
|
|
case dtype::BF16: return 2.0f;
|
|
case dtype::Q8_0: return 1.0625f; // 34/32
|
|
case dtype::Q6_K: return 0.8203f; // 210/256
|
|
case dtype::Q5_K: return 0.6875f; // 176/256
|
|
case dtype::Q4_K: return 0.5625f; // 144/256
|
|
case dtype::Q3_K: return 0.4297f; // 110/256
|
|
case dtype::Q2_K: return 0.3281f; // 84/256
|
|
default: return 2.0f;
|
|
}
|
|
}
|
|
|
|
void print_schedule() const {
|
|
printf("\n╔═══════════════════════════════════════════════════╗\n");
|
|
printf("║ Fractal Inference — Precision Schedule ║\n");
|
|
printf("╠═══════════════════════════════════════════════════╣\n");
|
|
printf("║ Embed: %-8s Head: %-8s ║\n",
|
|
dtype_name(embed_dtype), dtype_name(head_dtype));
|
|
printf("╠═══════════════════════════════════════════════════╣\n");
|
|
|
|
// Group consecutive identical layers
|
|
int i = 0;
|
|
while (i < n_layers) {
|
|
int j = i;
|
|
while (j < n_layers && attn_dtype[j] == attn_dtype[i]
|
|
&& ffn_dtype[j] == ffn_dtype[i]) j++;
|
|
|
|
if (j - i == 1) {
|
|
printf("║ Layer %2d : attn=%-6s ffn=%-6s ║\n",
|
|
i, dtype_name(attn_dtype[i]), dtype_name(ffn_dtype[i]));
|
|
} else {
|
|
printf("║ Layers %2d-%-2d : attn=%-6s ffn=%-6s ║\n",
|
|
i, j-1, dtype_name(attn_dtype[i]), dtype_name(ffn_dtype[i]));
|
|
}
|
|
i = j;
|
|
}
|
|
|
|
printf("╠═══════════════════════════════════════════════════╣\n");
|
|
printf("║ Memory ratio: %.1f%% of base ║\n",
|
|
memory_ratio() * 100.0f);
|
|
printf("╚═══════════════════════════════════════════════════╝\n");
|
|
}
|
|
|
|
static const char* dtype_name(dtype t) {
|
|
switch (t) {
|
|
case dtype::F32: return "F32";
|
|
case dtype::F16: return "F16";
|
|
case dtype::BF16: return "BF16";
|
|
case dtype::Q8_0: return "Q8_0";
|
|
case dtype::Q6_K: return "Q6_K";
|
|
case dtype::Q5_K: return "Q5_K";
|
|
case dtype::Q4_K: return "Q4_K";
|
|
case dtype::Q3_K: return "Q3_K";
|
|
case dtype::Q2_K: return "Q2_K";
|
|
default: return "???";
|
|
}
|
|
}
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// Fractal Engine — orchestrates dynamic precision inference
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
class FractalEngine {
|
|
public:
|
|
bool enabled = false;
|
|
PrecisionMap current_map;
|
|
QueryProfile last_profile;
|
|
|
|
// Stats
|
|
int queries_total = 0;
|
|
int queries_fast = 0; // complexity < 0.3
|
|
int queries_balanced = 0; // complexity 0.3-0.6
|
|
int queries_full = 0; // complexity >= 0.6
|
|
float total_savings = 0; // Cumulative memory ratio savings
|
|
|
|
void enable() { enabled = true; }
|
|
|
|
// Analyze query and compute precision map
|
|
PrecisionMap plan(const std::vector<int32_t>& tokens,
|
|
int vocab_size, int n_layers, dtype base_type) {
|
|
last_profile = analyze_query(tokens, vocab_size);
|
|
current_map.compute(n_layers, base_type, last_profile);
|
|
|
|
// Stats
|
|
queries_total++;
|
|
float complexity = (last_profile.depth_demand + last_profile.reasoning_score) / 2.0f;
|
|
if (complexity < 0.3f) queries_fast++;
|
|
else if (complexity < 0.6f) queries_balanced++;
|
|
else queries_full++;
|
|
total_savings += current_map.memory_ratio();
|
|
|
|
return current_map;
|
|
}
|
|
|
|
// Get the dtype that should be used for a specific layer
|
|
dtype layer_attn_type(int layer) const {
|
|
if (!enabled || layer >= current_map.n_layers) return dtype::Q4_K; // fallback
|
|
return current_map.attn_dtype[layer];
|
|
}
|
|
|
|
dtype layer_ffn_type(int layer) const {
|
|
if (!enabled || layer >= current_map.n_layers) return dtype::Q4_K;
|
|
return current_map.ffn_dtype[layer];
|
|
}
|
|
|
|
void print_stats() const {
|
|
if (queries_total == 0) return;
|
|
printf("\n[FRACTAL] Queries: %d (fast:%d balanced:%d full:%d)\n",
|
|
queries_total, queries_fast, queries_balanced, queries_full);
|
|
printf("[FRACTAL] Avg memory ratio: %.1f%%\n",
|
|
(total_savings / queries_total) * 100.0f);
|
|
}
|
|
};
|
|
|
|
} // namespace ix
|