// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Universal Inference Protocol (Main Entry Point)
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: github.com/ElmadaniS/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════

// NOTE(review): the original paste lost the header names inside the angle
// brackets ("#include #include #include"). Reconstructed from what this
// translation unit visibly uses (fprintf, uint32_t, strlen/strcmp) —
// TODO confirm against the repository copy.
#include <cstdio>
#include <cstdint>
#include <cstring>

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X IDENTITY
// This watermark is integral to Inference-X. Removal violates BSL-1.1 Section 4.
// ═══════════════════════════════════════════════════════════════════════════════ static const char* IX_AUTHOR = "Salka Elmadani"; static const char* IX_LICENSE __attribute__((unused)) = "BSL-1.1"; static const char* IX_CONTACT __attribute__((unused)) = "Elmadani.SALKA@proton.me"; static const char* IX_SIGNATURE = "IX"; static const uint32_t IX_FINGERPRINT = 0x935E1DAD; // Elmadani in hex static void ix_print_banner() { fprintf(stderr, "\n"); fprintf(stderr, " ╔═══════════════════════════════════════════════════════════╗\n"); fprintf(stderr, " ║ Inference-X — Universal Inference Protocol ║\n"); fprintf(stderr, " ║ Copyright (C) 2025-2026 Salka Elmadani ║\n"); fprintf(stderr, " ║ Licensed under BSL-1.1 | Morocco ║\n"); fprintf(stderr, " ║ https://inference-x.com | github.com/ElmadaniS/inference-x║\n"); fprintf(stderr, " ╚═══════════════════════════════════════════════════════════╝\n"); fprintf(stderr, "\n"); } static bool ix_verify_integrity() { // Integrity check — fingerprint must match // Tampering with this function violates the license return (IX_FINGERPRINT == 0x935E1DAD) && (IX_SIGNATURE[0] == 'I') && (IX_AUTHOR[0] == 'S'); } #include "runtime/gguf.h" #include "runtime/tokenizer.h" #include "runtime/transformer_v6.h" #include "runtime/server.h" #include "runtime/fractal.h" #include "runtime/platform.h" #include "runtime/identity.h" #include "runtime/kernel_dispatch.h" #include #include #include #include using namespace ix; static volatile bool g_interrupted = false; static void sigint_handler(int) { g_interrupted = true; } // ═══════════════════════════════════════════════════════════════════════════════ // CHAT TEMPLATE — DeepSeek V3 / Kimi K2.5 / ChatML format // ═══════════════════════════════════════════════════════════════════════════════ struct ChatTemplate { enum Style { DEEPSEEK, CHATML, KIMI, LLAMA3, GEMMA, PHI3, MISTRAL, RAW }; Style style = RAW; // Kimi K2.5 special token IDs (set during detect) int kimi_bos = -1; int kimi_im_system = -1; int 
kimi_im_user = -1; int kimi_im_assistant = -1; int kimi_im_middle = -1; int kimi_im_end = -1; int kimi_think = -1; // Format as token IDs (handles special tokens for Kimi) std::vector format_ids( const std::string& system, const std::string& user, const Tokenizer& tok ) const { std::vector ids; if (style == KIMI) { ids.push_back(kimi_bos); // System std::string sys_text = system.empty() ? "You are Kimi, an AI assistant created by Moonshot AI." : system; ids.push_back(kimi_im_system); auto sr = tok.encode("system"); ids.insert(ids.end(), sr.begin(), sr.end()); ids.push_back(kimi_im_middle); auto sc = tok.encode(sys_text); ids.insert(ids.end(), sc.begin(), sc.end()); ids.push_back(kimi_im_end); // User ids.push_back(kimi_im_user); auto ur = tok.encode("user"); ids.insert(ids.end(), ur.begin(), ur.end()); ids.push_back(kimi_im_middle); auto uc = tok.encode(user); ids.insert(ids.end(), uc.begin(), uc.end()); ids.push_back(kimi_im_end); // Assistant + ids.push_back(kimi_im_assistant); auto ar = tok.encode("assistant"); ids.insert(ids.end(), ar.begin(), ar.end()); ids.push_back(kimi_im_middle); ids.push_back(kimi_think); } else { // Helper: insert special token ID or encode text auto add_special = [&](const char* name) -> bool { int32_t id = tok.find_token(name); if (id >= 0) { ids.push_back(id); return true; } return false; }; auto add_text = [&](const std::string& text) { auto enc = tok.encode(text); ids.insert(ids.end(), enc.begin(), enc.end()); }; switch (style) { case DEEPSEEK: add_special("<|begin\xe2\x96\x81of\xe2\x96\x81sentence|>"); if (!system.empty()) { add_special("<|System|>"); add_text(system); } add_special("<|User|>"); add_text(user); add_special("<|Assistant|>"); break; case CHATML: if (!system.empty()) { add_special("<|im_start|>"); add_text("system\n" + system); add_special("<|im_end|>"); } add_special("<|im_start|>"); add_text("user\n" + user); add_special("<|im_end|>"); add_special("<|im_start|>"); add_text("assistant\n"); break; case LLAMA3: 
add_special("<|begin_of_text|>"); if (!system.empty()) { add_special("<|start_header_id|>"); add_text("system"); add_special("<|end_header_id|>"); add_text("\n\n" + system); add_special("<|eot_id|>"); } add_special("<|start_header_id|>"); add_text("user"); add_special("<|end_header_id|>"); add_text("\n\n" + user); add_special("<|eot_id|>"); add_special("<|start_header_id|>"); add_text("assistant"); add_special("<|end_header_id|>"); add_text("\n\n"); break; case GEMMA: add_special(""); add_text("user\n" + user); add_special(""); add_text("\n"); add_special(""); add_text("model\n"); break; case PHI3: if (!system.empty()) { add_special("<|system|>"); add_text("\n" + system); add_special("<|end|>"); add_text("\n"); } add_special("<|user|>"); add_text("\n" + user); add_special("<|end|>"); add_text("\n"); add_special("<|assistant|>"); add_text("\n"); break; case MISTRAL: ids.push_back(tok.bos_id()); if (!add_special("[INST]")) add_text("[INST] "); else add_text(" "); add_text(user + " "); if (!add_special("[/INST]")) add_text("[/INST]"); break; default: // RAW add_text(user); break; } } return ids; } static Style detect(const Tokenizer& tok, ChatTemplate& tmpl) { // Kimi K2.5: has <|im_user|> token int im_user = tok.find_token("<|im_user|>"); if (im_user >= 0) { tmpl.kimi_bos = tok.bos_id(); tmpl.kimi_im_system = tok.find_token("<|im_system|>"); tmpl.kimi_im_user = im_user; tmpl.kimi_im_assistant = tok.find_token("<|im_assistant|>"); tmpl.kimi_im_middle = tok.find_token("<|im_middle|>"); tmpl.kimi_im_end = tok.find_token("<|im_end|>"); tmpl.kimi_think = tok.find_token(""); printf("[KIMI] Special tokens: sys=%d user=%d asst=%d mid=%d end=%d think=%d\n", tmpl.kimi_im_system, tmpl.kimi_im_user, tmpl.kimi_im_assistant, tmpl.kimi_im_middle, tmpl.kimi_im_end, tmpl.kimi_think); return KIMI; } // Llama 3.x: has <|start_header_id|> if (tok.find_token("<|start_header_id|>") >= 0) return LLAMA3; // Gemma: has if (tok.find_token("") >= 0) return GEMMA; // Phi-3: has <|user|> if 
(tok.find_token("<|user|>") >= 0) return PHI3; // Mistral: has [INST] if (tok.find_token("[INST]") >= 0) return MISTRAL; // ChatML: has <|im_start|> (Qwen, SmolLM) if (tok.im_start_id() >= 0) return CHATML; // DeepSeek: has begin_of_sentence if (tok.bos_id() >= 0 && tok.find_token("<|User|>") >= 0) return DEEPSEEK; // Fallback: RAW // Qwen-family fallback: large vocab (>150k) = ChatML if (tok.vocab_size() > 150000) { printf("[DETECT] Large vocab (%d) → Qwen family, using ChatML\n", tok.vocab_size()); return CHATML; } printf("[WARN] No known chat template detected, using RAW mode\n"); return RAW; } }; // ═══════════════════════════════════════════════════════════════════════════════ // CONFIG // ═══════════════════════════════════════════════════════════════════════════════ struct InferConfig { std::string model_path; std::string prompt = "Hello! Who are you?"; std::string system = ""; int max_tokens = 512; float temperature = 0.6f; float top_p = 0.9f; int top_k = 40; int max_ctx = 4096; bool interactive = false; bool raw_mode = false; // No chat template bool bench_mode = false; // Benchmark: just measure tok/s bool serve_mode = false; int serve_port = 8080; bool fractal_mode = false; // Fractal inference (dynamic precision) std::string profile_path; // --profile: expert activation CSV }; void print_usage(const char* prog) { printf("Usage: %s [options]\n", prog); printf("Options:\n"); printf(" -p User prompt (default: \"Hello! 
Who are you?\")\n"); printf(" -s System prompt\n"); printf(" -n Max tokens to generate (default: 512)\n"); printf(" -t Temperature (default: 0.6)\n"); printf(" --top-p Top-P sampling (default: 0.9)\n"); printf(" --top-k Top-K sampling (default: 40)\n"); printf(" --ctx Max context window (default: 4096)\n"); printf(" -i Interactive chat mode\n"); printf(" --raw No chat template\n"); printf(" --bench Benchmark mode (no output)\n"); printf(" --serve [port] Start OpenAI-compatible API server (default: 8080)\n"); printf(" --fractal Enable fractal inference (dynamic precision per layer)\n"); printf(" --profile Dump expert activation profile\n"); } InferConfig parse_args(int argc, char** argv) { InferConfig cfg; if (argc < 2) { print_usage(argv[0]); exit(1); } cfg.model_path = argv[1]; for (int i = 2; i < argc; ++i) { std::string arg = argv[i]; if (arg == "-p" && i + 1 < argc) cfg.prompt = argv[++i]; else if (arg == "-s" && i + 1 < argc) cfg.system = argv[++i]; else if (arg == "-n" && i + 1 < argc) cfg.max_tokens = atoi(argv[++i]); else if (arg == "-t" && i + 1 < argc) cfg.temperature = atof(argv[++i]); else if (arg == "--top-p" && i + 1 < argc) cfg.top_p = atof(argv[++i]); else if (arg == "--top-k" && i + 1 < argc) cfg.top_k = atoi(argv[++i]); else if (arg == "--ctx" && i + 1 < argc) cfg.max_ctx = atoi(argv[++i]); else if (arg == "-i") cfg.interactive = true; else if (arg == "--raw") cfg.raw_mode = true; else if (arg == "--bench") cfg.bench_mode = true; else if (arg == "--serve") { cfg.serve_mode = true; if (i+1 < argc && argv[i+1][0] != '-') cfg.serve_port = atoi(argv[++i]); } else if (arg == "--fractal") cfg.fractal_mode = true; else if (arg == "--profile" && i + 1 < argc) cfg.profile_path = argv[++i]; } return cfg; } // ═══════════════════════════════════════════════════════════════════════════════ // MAIN // ═══════════════════════════════════════════════════════════════════════════════ int main(int argc, char** argv) { ix_print_banner(); if (!ix_verify_integrity()) 
{ fprintf(stderr, "INTEGRITY CHECK FAILED\n"); return 1; } printf("╔══════════════════════════════════════════════════════════════╗\n"); printf("║ INFERENCE-X v6 — UNIVERSAL INFERENCE PROTOCOL ║\n"); printf("║ COPYRIGHT (C) 2025-2026 SALKA ELMADANI ║\n"); printf("╚══════════════════════════════════════════════════════════════╝\n\n"); signal(SIGINT, sigint_handler); InferConfig icfg = parse_args(argc, argv); // ─── LOAD MODEL ──────────────────────────────────────────────────────── ix::identity::print_identity(); ix::identity::license().verify(); printf("=== Loading model: %s ===\n", icfg.model_path.c_str()); GGUF gguf; if (!gguf.open(icfg.model_path)) { printf("ERROR: Failed to open model at %s\n", icfg.model_path.c_str()); return 1; } // ─── LOAD TOKENIZER ──────────────────────────────────────────────────── printf("\n=== Loading tokenizer ===\n"); Tokenizer tokenizer; if (!tokenizer.load(gguf)) { printf("ERROR: Failed to load tokenizer from GGUF\n"); return 1; } // ─── INIT TRANSFORMER ────────────────────────────────────────────────── printf("\n=== Initializing transformer ===\n"); TransformerV6 transformer; if (!transformer.init(gguf, icfg.max_ctx)) { printf("ERROR: Failed to initialize transformer\n"); return 1; } transformer.set_eos_token(tokenizer.eos_id()); // ─── INIT KERNEL DISPATCH ────────────────────────────────────────────── printf("\n=== Initializing kernel dispatch ===\n"); ix::KernelDispatch::instance().init(); // Enable ExpertMmap for MoE models (surgical prefetch, ÷48 I/O) auto& kcfg = transformer.config_mut(); if (kcfg.n_experts > 0) { ix::KernelDispatch::instance().init_expert_mmap(kcfg.n_layers); printf("[IX] MoE detected: %d experts, %d active per layer\n", kcfg.n_experts, kcfg.n_experts_used); } // ─── FIX VOCAB SIZE ─────────────────────────────────────────────────── if (kcfg.vocab_size == 0 || kcfg.vocab_size == 32000) { int tok_vocab = tokenizer.vocab_size(); if (tok_vocab > 0) { printf("[FIX] vocab_size: GGUF missing → using 
tokenizer=%d\n", tok_vocab); kcfg.vocab_size = tok_vocab; } else { kcfg.vocab_size = 32000; // ultimate fallback printf("[FIX] vocab_size: fallback to 32000\n"); } } else { int tok_vocab = tokenizer.vocab_size(); if (tok_vocab > 0 && tok_vocab != (int)kcfg.vocab_size) { printf("[FIX] vocab_size: GGUF=%u, tokenizer=%d → using max\n", kcfg.vocab_size, tok_vocab); if (tok_vocab > (int)kcfg.vocab_size) kcfg.vocab_size = tok_vocab; } } // ─── DETECT CHAT TEMPLATE ────────────────────────────────────────────── ChatTemplate tmpl; if (icfg.raw_mode) { tmpl.style = ChatTemplate::RAW; } else { tmpl.style = ChatTemplate::detect(tokenizer, tmpl); } printf("[CHAT] Template: %s\n", tmpl.style == ChatTemplate::DEEPSEEK ? "DeepSeek V3" : tmpl.style == ChatTemplate::CHATML ? "ChatML" : tmpl.style == ChatTemplate::KIMI ? "Kimi K2.5" : tmpl.style == ChatTemplate::LLAMA3 ? "Llama 3" : tmpl.style == ChatTemplate::GEMMA ? "Gemma" : tmpl.style == ChatTemplate::PHI3 ? "Phi-3" : tmpl.style == ChatTemplate::MISTRAL ? 
"Mistral" : "Raw"); // Override EOS for Kimi K2.5 if (tmpl.style == ChatTemplate::KIMI && tmpl.kimi_im_end >= 0) { transformer.set_eos_token(tmpl.kimi_im_end); printf("[KIMI] EOS overridden to <|im_end|> = %d\n", tmpl.kimi_im_end); } // Multi-EOS: detect additional stop tokens auto try_add_eos = [&](const char* name) { int32_t id = tokenizer.find_token(name); if (id >= 0) { transformer.add_eos_token(id); fprintf(stderr, "[EOS] Stop: %s → %d\n", name, id); } }; try_add_eos("<|eot_id|>"); try_add_eos("<|end_of_text|>"); try_add_eos("<|endoftext|>"); try_add_eos("<|im_end|>"); try_add_eos("<|end|>"); try_add_eos(""); // ─── INFERENCE LOOP ──────────────────────────────────────────────────── // ─── FRACTAL INFERENCE PROTOCOL ────────────────────────────────────── ix::FractalEngine fractal; if (icfg.fractal_mode) { fractal.enable(); printf("[FRACTAL] Dynamic precision enabled — model breathes Q2→FP16\n"); } auto run_inference = [&](const std::string& user_prompt) { // Format + tokenize (handles special tokens for Kimi) auto tokens = tmpl.format_ids(icfg.system, user_prompt, tokenizer); printf("\n[TOK] Input: %zu tokens\n", tokens.size()); // Fractal: analyze query and plan precision if (fractal.enabled) { auto pmap = fractal.plan(tokens, kcfg.vocab_size, kcfg.n_layers, dtype::Q4_K); pmap.print_schedule(); } if (tokens.size() > (size_t)icfg.max_ctx - icfg.max_tokens) { printf("[WARN] Prompt too long (%zu tokens), truncating to %d\n", tokens.size(), icfg.max_ctx - icfg.max_tokens); tokens.resize(icfg.max_ctx - icfg.max_tokens); } // Benchmark mode: just measure throughput if (icfg.bench_mode) { auto output = transformer.generate(tokens, icfg.max_tokens, icfg.temperature, icfg.top_p, icfg.top_k); printf("[BENCH] Output: %zu tokens\n", output.size()); return; } // Streaming generation printf("\n─── OUTPUT ───────────────────────────────────────────────────\n"); fflush(stdout); fprintf(stderr, "[DBG] calling generate_stream\n"); fflush(stderr); int gen_count = 0; 
transformer.generate_stream( tokens, icfg.max_tokens, icfg.temperature, icfg.top_p, icfg.top_k, [&](int32_t token_id) -> bool { if (g_interrupted) return false; // Skip special tokens (control tokens, template markers) if (tokenizer.is_special(token_id)) return true; std::string piece = tokenizer.decode_token(token_id); // Skip tokens that look like template markers if (piece.size() > 2 && piece[0] == '<' && piece[piece.size()-1] == '>') return true; printf("%s", piece.c_str()); fflush(stdout); gen_count++; // INCREMENTAL PROFILING: dump CSV after each token if (!icfg.profile_path.empty()) { transformer.expert_cache_ref().dump_csv(icfg.profile_path.c_str()); } return true; } ); fprintf(stderr, "[DBG] generate_stream returned, gen_count=%d\n", gen_count); fflush(stderr); printf("\n──────────────────────────────────────────────────────────────\n"); printf("[GEN] %d tokens generated\n", gen_count); }; // --- SERVE MODE: OpenAI-compatible API server --- if (icfg.serve_mode) { std::string mname = icfg.model_path; size_t slash = mname.rfind('/'); if (slash != std::string::npos) mname = mname.substr(slash + 1); size_t dot = mname.rfind('.'); if (dot != std::string::npos) mname = mname.substr(0, dot); ix::Server server(icfg.serve_port, mname, [&](const std::string& sys, const std::string& user, int max_tok, float temp, float tp, std::function on_token) { auto tokens = tmpl.format_ids(sys, user, tokenizer); if (tokens.size() > (size_t)icfg.max_ctx - max_tok) tokens.resize(icfg.max_ctx - max_tok); transformer.generate_stream( tokens, max_tok, temp, tp, icfg.top_k, [&](int32_t token_id) -> bool { if (tokenizer.is_special(token_id)) return true; std::string piece = tokenizer.decode_token(token_id); if (piece.size() > 2 && piece[0] == '<' && piece[piece.size()-1] == '>') return true; return on_token(piece); } ); } ); server.run(); return 0; } if (icfg.interactive) { // ─── INTERACTIVE CHAT ────────────────────────────────────────────── printf("\n=== Interactive mode (Ctrl+C to 
exit) ===\n"); char line[4096]; while (!g_interrupted) { printf("\n> "); fflush(stdout); if (!fgets(line, sizeof(line), stdin)) break; // Strip newline size_t len = strlen(line); if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0'; if (strlen(line) == 0) continue; if (strcmp(line, "/quit") == 0 || strcmp(line, "/exit") == 0) break; if (strcmp(line, "/reset") == 0) { // transformer.reset(); // DISABLED: multi-turn preserves KV cache printf("[RESET] Context cleared\n"); continue; } run_inference(line); // Multi-turn: KV cache preserved between turns // Use /reset command to manually clear context } printf("\nGoodbye.\n"); } else { // ─── SINGLE PROMPT ───────────────────────────────────────────────── run_inference(icfg.prompt); } // ─── FINAL STATS ────────────────────────────────────────────────────── if (!icfg.profile_path.empty()) { transformer.expert_cache_ref().dump_csv(icfg.profile_path.c_str()); } printf("\n=== Inference-X Unified — Session Stats ===\n"); ix::KernelDispatch::instance().print_stats(); if (fractal.enabled) fractal.print_stats(); printf("[IX] Backend: %s\n", ix::KernelDispatch::instance().backend_name()); return 0; }