// inference-x/infer.cpp
// Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
// Better output from the same model. Fused computation, adaptive precision,
// surgical expert loading. 305 KB, 19 backends, zero dependencies.
//
// https://inference-x.com
// 2026-02-23 07:10:47 +00:00
//
// 572 lines
// 27 KiB
// C++

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Universal Inference Protocol (Main Entry Point)
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: github.com/ElmadaniS/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#include <cstdio>
#include <cstdint>
#include <cstring>
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X IDENTITY
// This watermark is integral to Inference-X. Removal violates BSL-1.1 Section 4.
// ═══════════════════════════════════════════════════════════════════════════════
// Identity constants. Pointers are themselves const (`* const`) so the
// watermark strings cannot be silently re-pointed at runtime; the previous
// declarations only protected the pointee.
static const char* const IX_AUTHOR = "Salka Elmadani";
static const char* const IX_LICENSE __attribute__((unused)) = "BSL-1.1";
static const char* const IX_CONTACT __attribute__((unused)) = "Elmadani.SALKA@proton.me";
static const char* const IX_SIGNATURE = "IX";
static const uint32_t IX_FINGERPRINT = 0x935E1DAD; // build fingerprint checked by ix_verify_integrity()
// Emit the startup banner to stderr, one entry per output line.
// The strings (including the UTF-8 box-drawing characters) are written
// verbatim, so the bytes produced are identical to the original printf chain.
static void ix_print_banner() {
    static const char* const kBanner[] = {
        "\n",
        " ╔═══════════════════════════════════════════════════════════╗\n",
        " ║ Inference-X — Universal Inference Protocol ║\n",
        " ║ Copyright (C) 2025-2026 Salka Elmadani ║\n",
        " ║ Licensed under BSL-1.1 | Morocco ║\n",
        " ║ https://inference-x.com | github.com/ElmadaniS/inference-x║\n",
        " ╚═══════════════════════════════════════════════════════════╝\n",
        "\n",
    };
    for (const char* const line : kBanner) {
        fputs(line, stderr);
    }
}
// Integrity check — fingerprint must match.
// Tampering with this function violates the license.
// Returns true only when the compile-time fingerprint and the first
// characters of the signature/author strings are intact.
static bool ix_verify_integrity() {
    const bool fingerprint_ok = (IX_FINGERPRINT == 0x935E1DAD);
    const bool signature_ok   = (IX_SIGNATURE[0] == 'I');
    const bool author_ok      = (IX_AUTHOR[0] == 'S');
    return fingerprint_ok && signature_ok && author_ok;
}
#include "runtime/gguf.h"
#include "runtime/tokenizer.h"
#include "runtime/transformer_v6.h"
#include "runtime/server.h"
#include "runtime/fractal.h"
#include "runtime/platform.h"
#include "runtime/identity.h"
#include "runtime/kernel_dispatch.h"
#include <cstdlib>
#include <string>
#include <vector>
#include <signal.h>
using namespace ix;
// SIGINT flag. `volatile sig_atomic_t` is the only object type the C/C++
// standards guarantee may be written from an asynchronous signal handler;
// a plain `volatile bool` has no such guarantee. All readers use the flag
// in boolean context, so the type change is transparent to them.
static volatile sig_atomic_t g_interrupted = 0;
static void sigint_handler(int) { g_interrupted = 1; }
// ═══════════════════════════════════════════════════════════════════════════════
// CHAT TEMPLATE — DeepSeek V3 / Kimi K2.5 / ChatML format
// ═══════════════════════════════════════════════════════════════════════════════
// Chat-prompt formatter. detect() infers the model family from the
// tokenizer's special-token inventory; format_ids() then renders a
// (system, user) pair into the token-id sequence that family expects.
struct ChatTemplate {
    enum Style { DEEPSEEK, CHATML, KIMI, LLAMA3, GEMMA, PHI3, MISTRAL, RAW };
    Style style = RAW;
    // Kimi K2.5 special token IDs (set during detect)
    int kimi_bos = -1;          // tokenizer bos id
    int kimi_im_system = -1;    // <|im_system|>
    int kimi_im_user = -1;      // <|im_user|>
    int kimi_im_assistant = -1; // <|im_assistant|>
    int kimi_im_middle = -1;    // <|im_middle|>
    int kimi_im_end = -1;       // <|im_end|>
    int kimi_think = -1;        // <think>
    // Format as token IDs (handles special tokens for Kimi).
    //
    // KIMI emits role blocks using the raw special-token ids cached above;
    // every other style looks special tokens up by name via add_special()
    // (which silently emits nothing when the token is absent) and encodes
    // plain text via add_text().
    // NOTE(review): the KIMI branch pushes kimi_* ids without checking for
    // -1; if detect() failed to resolve one of them, -1 ends up in the
    // sequence — confirm the transformer tolerates that.
    std::vector<int32_t> format_ids(
        const std::string& system, const std::string& user,
        const Tokenizer& tok
    ) const {
        std::vector<int32_t> ids;
        if (style == KIMI) {
            ids.push_back(kimi_bos);
            // System (default persona used when no system prompt was given)
            std::string sys_text = system.empty() ?
                "You are Kimi, an AI assistant created by Moonshot AI." : system;
            ids.push_back(kimi_im_system);
            auto sr = tok.encode("system");
            ids.insert(ids.end(), sr.begin(), sr.end());
            ids.push_back(kimi_im_middle);
            auto sc = tok.encode(sys_text);
            ids.insert(ids.end(), sc.begin(), sc.end());
            ids.push_back(kimi_im_end);
            // User
            ids.push_back(kimi_im_user);
            auto ur = tok.encode("user");
            ids.insert(ids.end(), ur.begin(), ur.end());
            ids.push_back(kimi_im_middle);
            auto uc = tok.encode(user);
            ids.insert(ids.end(), uc.begin(), uc.end());
            ids.push_back(kimi_im_end);
            // Assistant + <think> (pre-opens the reasoning block)
            ids.push_back(kimi_im_assistant);
            auto ar = tok.encode("assistant");
            ids.insert(ids.end(), ar.begin(), ar.end());
            ids.push_back(kimi_im_middle);
            ids.push_back(kimi_think);
        } else {
            // Helper: insert special token ID or encode text.
            // Returns false (emitting nothing) when the token is unknown.
            auto add_special = [&](const char* name) -> bool {
                int32_t id = tok.find_token(name);
                if (id >= 0) { ids.push_back(id); return true; }
                return false;
            };
            // Helper: tokenize plain text and append the ids.
            auto add_text = [&](const std::string& text) {
                auto enc = tok.encode(text);
                ids.insert(ids.end(), enc.begin(), enc.end());
            };
            switch (style) {
            case DEEPSEEK:
                // "<|begin▁of▁sentence|>" — U+2581 (lower one-eighth block)
                // spelled out as UTF-8 escapes.
                add_special("<|begin\xe2\x96\x81of\xe2\x96\x81sentence|>");
                if (!system.empty()) {
                    add_special("<|System|>");
                    add_text(system);
                }
                add_special("<|User|>");
                add_text(user);
                add_special("<|Assistant|>");
                break;
            case CHATML:
                if (!system.empty()) {
                    add_special("<|im_start|>");
                    add_text("system\n" + system);
                    add_special("<|im_end|>");
                }
                add_special("<|im_start|>");
                add_text("user\n" + user);
                add_special("<|im_end|>");
                add_special("<|im_start|>");
                add_text("assistant\n");
                break;
            case LLAMA3:
                add_special("<|begin_of_text|>");
                if (!system.empty()) {
                    add_special("<|start_header_id|>");
                    add_text("system");
                    add_special("<|end_header_id|>");
                    add_text("\n\n" + system);
                    add_special("<|eot_id|>");
                }
                add_special("<|start_header_id|>");
                add_text("user");
                add_special("<|end_header_id|>");
                add_text("\n\n" + user);
                add_special("<|eot_id|>");
                add_special("<|start_header_id|>");
                add_text("assistant");
                add_special("<|end_header_id|>");
                add_text("\n\n");
                break;
            case GEMMA:
                // NOTE(review): Gemma has no system-role turn here; a system
                // prompt is silently dropped in this style.
                add_special("<start_of_turn>");
                add_text("user\n" + user);
                add_special("<end_of_turn>");
                add_text("\n");
                add_special("<start_of_turn>");
                add_text("model\n");
                break;
            case PHI3:
                if (!system.empty()) {
                    add_special("<|system|>");
                    add_text("\n" + system);
                    add_special("<|end|>");
                    add_text("\n");
                }
                add_special("<|user|>");
                add_text("\n" + user);
                add_special("<|end|>");
                add_text("\n");
                add_special("<|assistant|>");
                add_text("\n");
                break;
            case MISTRAL:
                ids.push_back(tok.bos_id());
                // If [INST] is not a special token, fall back to encoding it
                // as literal text (with the template's surrounding spaces).
                if (!add_special("[INST]")) add_text("[INST] ");
                else add_text(" ");
                add_text(user + " ");
                if (!add_special("[/INST]")) add_text("[/INST]");
                break;
            default: // RAW — no template; encode the prompt as-is
                add_text(user);
                break;
            }
        }
        return ids;
    }
    // Probe the tokenizer for family-specific special tokens and return the
    // matching Style. For Kimi, also caches the resolved token ids in tmpl.
    // Detection order matters: most specific markers are tested first.
    static Style detect(const Tokenizer& tok, ChatTemplate& tmpl) {
        // Kimi K2.5: has <|im_user|> token
        int im_user = tok.find_token("<|im_user|>");
        if (im_user >= 0) {
            tmpl.kimi_bos = tok.bos_id();
            tmpl.kimi_im_system = tok.find_token("<|im_system|>");
            tmpl.kimi_im_user = im_user;
            tmpl.kimi_im_assistant = tok.find_token("<|im_assistant|>");
            tmpl.kimi_im_middle = tok.find_token("<|im_middle|>");
            tmpl.kimi_im_end = tok.find_token("<|im_end|>");
            tmpl.kimi_think = tok.find_token("<think>");
            printf("[KIMI] Special tokens: sys=%d user=%d asst=%d mid=%d end=%d think=%d\n",
                   tmpl.kimi_im_system, tmpl.kimi_im_user, tmpl.kimi_im_assistant,
                   tmpl.kimi_im_middle, tmpl.kimi_im_end, tmpl.kimi_think);
            return KIMI;
        }
        // Llama 3.x: has <|start_header_id|>
        if (tok.find_token("<|start_header_id|>") >= 0) return LLAMA3;
        // Gemma: has <start_of_turn>
        if (tok.find_token("<start_of_turn>") >= 0) return GEMMA;
        // Phi-3: has <|user|>
        if (tok.find_token("<|user|>") >= 0) return PHI3;
        // Mistral: has [INST]
        if (tok.find_token("[INST]") >= 0) return MISTRAL;
        // ChatML: has <|im_start|> (Qwen, SmolLM)
        if (tok.im_start_id() >= 0) return CHATML;
        // DeepSeek: has begin_of_sentence
        if (tok.bos_id() >= 0 && tok.find_token("<|User|>") >= 0) return DEEPSEEK;
        // Fallback: RAW
        // Qwen-family fallback: large vocab (>150k) = ChatML
        if (tok.vocab_size() > 150000) {
            printf("[DETECT] Large vocab (%d) → Qwen family, using ChatML\n", tok.vocab_size());
            return CHATML;
        }
        printf("[WARN] No known chat template detected, using RAW mode\n");
        return RAW;
    }
};
// ═══════════════════════════════════════════════════════════════════════════════
// CONFIG
// ═══════════════════════════════════════════════════════════════════════════════
// Runtime options, populated from the command line by parse_args().
// Each field's flag is noted; defaults are the in-class initializers.
struct InferConfig {
    std::string model_path;                       // positional arg 1: GGUF model path
    std::string prompt = "Hello! Who are you?";   // -p
    std::string system = "";                      // -s
    int max_tokens = 512;                         // -n
    float temperature = 0.6f;                     // -t
    float top_p = 0.9f;                           // --top-p
    int top_k = 40;                               // --top-k
    int max_ctx = 4096;                           // --ctx
    bool interactive = false;                     // -i
    bool raw_mode = false; // --raw: No chat template
    bool bench_mode = false; // --bench: Benchmark: just measure tok/s
    bool serve_mode = false;                      // --serve
    int serve_port = 8080;                        // optional value after --serve
    bool fractal_mode = false; // --fractal: Fractal inference (dynamic precision)
    std::string profile_path; // --profile: expert activation CSV
};
// Print CLI help to stdout. Only the first line interpolates the program
// name; the remaining lines are constant and written verbatim, so the
// output bytes match the original printf chain exactly.
void print_usage(const char* prog) {
    printf("Usage: %s <model_path> [options]\n", prog);
    static const char* const kHelp[] = {
        "Options:\n",
        " -p <prompt> User prompt (default: \"Hello! Who are you?\")\n",
        " -s <system> System prompt\n",
        " -n <max_tokens> Max tokens to generate (default: 512)\n",
        " -t <temp> Temperature (default: 0.6)\n",
        " --top-p <val> Top-P sampling (default: 0.9)\n",
        " --top-k <val> Top-K sampling (default: 40)\n",
        " --ctx <size> Max context window (default: 4096)\n",
        " -i Interactive chat mode\n",
        " --raw No chat template\n",
        " --bench Benchmark mode (no output)\n",
        " --serve [port] Start OpenAI-compatible API server (default: 8080)\n",
        " --fractal Enable fractal inference (dynamic precision per layer)\n",
        " --profile <path> Dump expert activation profile\n",
    };
    for (const char* const line : kHelp) {
        fputs(line, stdout);
    }
}
// Parse the command line into an InferConfig.
// argv[1] must be the model path; with no arguments, usage is printed and
// the process exits with status 1. Flags that expect a value are ignored
// when the value is missing; unknown flags are silently skipped.
// --serve takes an optional port: consumed only when the next argument
// does not start with '-'.
InferConfig parse_args(int argc, char** argv) {
    InferConfig cfg;
    if (argc < 2) {
        print_usage(argv[0]);
        exit(1);
    }
    cfg.model_path = argv[1];
    int i = 2;
    while (i < argc) {
        const std::string opt = argv[i];
        const bool has_val = (i + 1 < argc);
        if (opt == "-p" && has_val) {
            cfg.prompt = argv[++i];
        } else if (opt == "-s" && has_val) {
            cfg.system = argv[++i];
        } else if (opt == "-n" && has_val) {
            cfg.max_tokens = atoi(argv[++i]);
        } else if (opt == "-t" && has_val) {
            cfg.temperature = atof(argv[++i]);
        } else if (opt == "--top-p" && has_val) {
            cfg.top_p = atof(argv[++i]);
        } else if (opt == "--top-k" && has_val) {
            cfg.top_k = atoi(argv[++i]);
        } else if (opt == "--ctx" && has_val) {
            cfg.max_ctx = atoi(argv[++i]);
        } else if (opt == "-i") {
            cfg.interactive = true;
        } else if (opt == "--raw") {
            cfg.raw_mode = true;
        } else if (opt == "--bench") {
            cfg.bench_mode = true;
        } else if (opt == "--serve") {
            cfg.serve_mode = true;
            if (has_val && argv[i + 1][0] != '-') cfg.serve_port = atoi(argv[++i]);
        } else if (opt == "--fractal") {
            cfg.fractal_mode = true;
        } else if (opt == "--profile" && has_val) {
            cfg.profile_path = argv[++i];
        }
        ++i;
    }
    return cfg;
}
// ═══════════════════════════════════════════════════════════════════════════════
// MAIN
// ═══════════════════════════════════════════════════════════════════════════════
// Entry point. Pipeline: banner + integrity/license checks, model +
// tokenizer + transformer setup, kernel dispatch init, chat-template
// detection, then one of four run modes: --serve, -i (interactive),
// --bench, or a single prompt. Returns 0 on success, 1 on setup failure.
int main(int argc, char** argv) {
    ix_print_banner();
    if (!ix_verify_integrity()) { fprintf(stderr, "INTEGRITY CHECK FAILED\n"); return 1; }
    printf("╔══════════════════════════════════════════════════════════════╗\n");
    printf("║ INFERENCE-X v6 — UNIVERSAL INFERENCE PROTOCOL ║\n");
    printf("║ COPYRIGHT (C) 2025-2026 SALKA ELMADANI ║\n");
    printf("╚══════════════════════════════════════════════════════════════╝\n\n");
    // Ctrl+C sets g_interrupted; checked in the generation callback and
    // the interactive loop below.
    signal(SIGINT, sigint_handler);
    InferConfig icfg = parse_args(argc, argv);
    // ─── LOAD MODEL ────────────────────────────────────────────────────────
    ix::identity::print_identity();
    ix::identity::license().verify();
    printf("=== Loading model: %s ===\n", icfg.model_path.c_str());
    GGUF gguf;
    if (!gguf.open(icfg.model_path)) {
        printf("ERROR: Failed to open model at %s\n", icfg.model_path.c_str());
        return 1;
    }
    // ─── LOAD TOKENIZER ────────────────────────────────────────────────────
    printf("\n=== Loading tokenizer ===\n");
    Tokenizer tokenizer;
    if (!tokenizer.load(gguf)) {
        printf("ERROR: Failed to load tokenizer from GGUF\n");
        return 1;
    }
    // ─── INIT TRANSFORMER ──────────────────────────────────────────────────
    printf("\n=== Initializing transformer ===\n");
    TransformerV6 transformer;
    if (!transformer.init(gguf, icfg.max_ctx)) {
        printf("ERROR: Failed to initialize transformer\n");
        return 1;
    }
    transformer.set_eos_token(tokenizer.eos_id());
    // ─── INIT KERNEL DISPATCH ──────────────────────────────────────────────
    printf("\n=== Initializing kernel dispatch ===\n");
    ix::KernelDispatch::instance().init();
    // Enable ExpertMmap for MoE models (surgical prefetch, ÷48 I/O)
    auto& kcfg = transformer.config_mut();
    if (kcfg.n_experts > 0) {
        ix::KernelDispatch::instance().init_expert_mmap(kcfg.n_layers);
        printf("[IX] MoE detected: %d experts, %d active per layer\n",
            kcfg.n_experts, kcfg.n_experts_used);
    }
    // ─── FIX VOCAB SIZE ───────────────────────────────────────────────────
    // Reconcile GGUF metadata with the tokenizer: 0 or 32000 is treated as
    // "missing" and replaced by the tokenizer's count; otherwise the larger
    // of the two wins so the output head covers every token id.
    // NOTE(review): a model whose real vocab is exactly 32000 is also
    // treated as missing here — this relies on the tokenizer agreeing.
    if (kcfg.vocab_size == 0 || kcfg.vocab_size == 32000) {
        int tok_vocab = tokenizer.vocab_size();
        if (tok_vocab > 0) {
            printf("[FIX] vocab_size: GGUF missing → using tokenizer=%d\n", tok_vocab);
            kcfg.vocab_size = tok_vocab;
        } else {
            kcfg.vocab_size = 32000; // ultimate fallback
            printf("[FIX] vocab_size: fallback to 32000\n");
        }
    } else {
        int tok_vocab = tokenizer.vocab_size();
        if (tok_vocab > 0 && tok_vocab != (int)kcfg.vocab_size) {
            printf("[FIX] vocab_size: GGUF=%u, tokenizer=%d → using max\n",
                kcfg.vocab_size, tok_vocab);
            if (tok_vocab > (int)kcfg.vocab_size) kcfg.vocab_size = tok_vocab;
        }
    }
    // ─── DETECT CHAT TEMPLATE ──────────────────────────────────────────────
    ChatTemplate tmpl;
    if (icfg.raw_mode) {
        tmpl.style = ChatTemplate::RAW;
    } else {
        // detect() also fills tmpl's kimi_* token ids when Kimi is found.
        tmpl.style = ChatTemplate::detect(tokenizer, tmpl);
    }
    printf("[CHAT] Template: %s\n",
        tmpl.style == ChatTemplate::DEEPSEEK ? "DeepSeek V3" :
        tmpl.style == ChatTemplate::CHATML ? "ChatML" :
        tmpl.style == ChatTemplate::KIMI ? "Kimi K2.5" :
        tmpl.style == ChatTemplate::LLAMA3 ? "Llama 3" :
        tmpl.style == ChatTemplate::GEMMA ? "Gemma" :
        tmpl.style == ChatTemplate::PHI3 ? "Phi-3" :
        tmpl.style == ChatTemplate::MISTRAL ? "Mistral" : "Raw");
    // Override EOS for Kimi K2.5
    if (tmpl.style == ChatTemplate::KIMI && tmpl.kimi_im_end >= 0) {
        transformer.set_eos_token(tmpl.kimi_im_end);
        printf("[KIMI] EOS overridden to <|im_end|> = %d\n", tmpl.kimi_im_end);
    }
    // Multi-EOS: detect additional stop tokens. Each name present in the
    // vocab is registered as an extra stop token; absent names are skipped.
    auto try_add_eos = [&](const char* name) {
        int32_t id = tokenizer.find_token(name);
        if (id >= 0) {
            transformer.add_eos_token(id);
            fprintf(stderr, "[EOS] Stop: %s → %d\n", name, id);
        }
    };
    try_add_eos("<|eot_id|>");
    try_add_eos("<|end_of_text|>");
    try_add_eos("<|endoftext|>");
    try_add_eos("<|im_end|>");
    try_add_eos("<|end|>");
    try_add_eos("<end_of_turn>");
    // ─── INFERENCE LOOP ────────────────────────────────────────────────────
    // ─── FRACTAL INFERENCE PROTOCOL ──────────────────────────────────────
    ix::FractalEngine fractal;
    if (icfg.fractal_mode) {
        fractal.enable();
        printf("[FRACTAL] Dynamic precision enabled — model breathes Q2→FP16\n");
    }
    // Shared generation routine used by both interactive and single-prompt
    // modes: formats the prompt, optionally plans fractal precision,
    // truncates to the context budget, then streams tokens to stdout.
    auto run_inference = [&](const std::string& user_prompt) {
        // Format + tokenize (handles special tokens for Kimi)
        auto tokens = tmpl.format_ids(icfg.system, user_prompt, tokenizer);
        printf("\n[TOK] Input: %zu tokens\n", tokens.size());
        // Fractal: analyze query and plan precision
        if (fractal.enabled) {
            auto pmap = fractal.plan(tokens, kcfg.vocab_size, kcfg.n_layers, dtype::Q4_K);
            pmap.print_schedule();
        }
        // NOTE(review): if icfg.max_tokens >= icfg.max_ctx this subtraction
        // wraps (the int is converted to size_t), and the resize() argument
        // goes negative — confirm callers keep max_tokens < max_ctx.
        if (tokens.size() > (size_t)icfg.max_ctx - icfg.max_tokens) {
            printf("[WARN] Prompt too long (%zu tokens), truncating to %d\n",
                tokens.size(), icfg.max_ctx - icfg.max_tokens);
            tokens.resize(icfg.max_ctx - icfg.max_tokens);
        }
        // Benchmark mode: just measure throughput
        if (icfg.bench_mode) {
            auto output = transformer.generate(tokens, icfg.max_tokens,
                icfg.temperature, icfg.top_p, icfg.top_k);
            printf("[BENCH] Output: %zu tokens\n", output.size());
            return;
        }
        // Streaming generation: the callback returns false to abort (Ctrl+C)
        // and true to keep generating.
        printf("\n─── OUTPUT ───────────────────────────────────────────────────\n");
        fflush(stdout);
        fprintf(stderr, "[DBG] calling generate_stream\n"); fflush(stderr);
        int gen_count = 0;
        transformer.generate_stream(
            tokens, icfg.max_tokens,
            icfg.temperature, icfg.top_p, icfg.top_k,
            [&](int32_t token_id) -> bool {
                if (g_interrupted) return false;
                // Skip special tokens (control tokens, template markers)
                if (tokenizer.is_special(token_id)) return true;
                std::string piece = tokenizer.decode_token(token_id);
                // Skip tokens that look like template markers.
                // NOTE(review): this also suppresses legitimate model output
                // of the form "<word>" (e.g. XML/HTML tags in one token).
                if (piece.size() > 2 && piece[0] == '<' && piece[piece.size()-1] == '>')
                    return true;
                printf("%s", piece.c_str());
                fflush(stdout);
                gen_count++;
                // INCREMENTAL PROFILING: dump CSV after each token
                if (!icfg.profile_path.empty()) {
                    transformer.expert_cache_ref().dump_csv(icfg.profile_path.c_str());
                }
                return true;
            }
        );
        fprintf(stderr, "[DBG] generate_stream returned, gen_count=%d\n", gen_count); fflush(stderr);
        printf("\n──────────────────────────────────────────────────────────────\n");
        printf("[GEN] %d tokens generated\n", gen_count);
    };
    // --- SERVE MODE: OpenAI-compatible API server ---
    if (icfg.serve_mode) {
        // Derive the served model name from the file name: strip directory
        // prefix and extension.
        std::string mname = icfg.model_path;
        size_t slash = mname.rfind('/');
        if (slash != std::string::npos) mname = mname.substr(slash + 1);
        size_t dot = mname.rfind('.');
        if (dot != std::string::npos) mname = mname.substr(0, dot);
        // The server invokes this handler per request; on_token streams
        // decoded pieces back to the HTTP client and returns false to stop.
        ix::Server server(icfg.serve_port, mname,
            [&](const std::string& sys, const std::string& user,
                int max_tok, float temp, float tp,
                std::function<bool(const std::string&)> on_token) {
                auto tokens = tmpl.format_ids(sys, user, tokenizer);
                // Same context-budget truncation (and the same potential
                // wraparound) as run_inference above.
                if (tokens.size() > (size_t)icfg.max_ctx - max_tok)
                    tokens.resize(icfg.max_ctx - max_tok);
                transformer.generate_stream(
                    tokens, max_tok, temp, tp, icfg.top_k,
                    [&](int32_t token_id) -> bool {
                        if (tokenizer.is_special(token_id)) return true;
                        std::string piece = tokenizer.decode_token(token_id);
                        if (piece.size() > 2 && piece[0] == '<' && piece[piece.size()-1] == '>')
                            return true;
                        return on_token(piece);
                    }
                );
            }
        );
        server.run(); // blocks until the server shuts down
        return 0;
    }
    if (icfg.interactive) {
        // ─── INTERACTIVE CHAT ──────────────────────────────────────────────
        printf("\n=== Interactive mode (Ctrl+C to exit) ===\n");
        char line[4096];
        while (!g_interrupted) {
            printf("\n> ");
            fflush(stdout);
            if (!fgets(line, sizeof(line), stdin)) break; // EOF ends the session
            // Strip newline
            size_t len = strlen(line);
            if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
            if (strlen(line) == 0) continue;
            if (strcmp(line, "/quit") == 0 || strcmp(line, "/exit") == 0) break;
            if (strcmp(line, "/reset") == 0) {
                // transformer.reset(); // DISABLED: multi-turn preserves KV cache
                // NOTE(review): the reset call is disabled, so this message
                // is printed without actually clearing any context.
                printf("[RESET] Context cleared\n");
                continue;
            }
            run_inference(line);
            // Multi-turn: KV cache preserved between turns
            // Use /reset command to manually clear context
        }
        printf("\nGoodbye.\n");
    } else {
        // ─── SINGLE PROMPT ─────────────────────────────────────────────────
        run_inference(icfg.prompt);
    }
    // ─── FINAL STATS ──────────────────────────────────────────────────────
    if (!icfg.profile_path.empty()) {
        // Final profile dump (also dumped incrementally during streaming).
        transformer.expert_cache_ref().dump_csv(icfg.profile_path.c_str());
    }
    printf("\n=== Inference-X Unified — Session Stats ===\n");
    ix::KernelDispatch::instance().print_stats();
    if (fractal.enabled) fractal.print_stats();
    printf("[IX] Backend: %s\n", ix::KernelDispatch::instance().backend_name());
    return 0;
}
}