// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Universal Inference Protocol (Main Entry Point)
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════

#include <cstdio>
#include <cstdint>
#include <cstring>

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X IDENTITY
// This watermark is integral to Inference-X. Removal violates BSL-1.1 Section 4.
// ═══════════════════════════════════════════════════════════════════════════════
// Build-identity constants, read by ix_verify_integrity() below.
// `constexpr` guarantees compile-time initialization; the standard C++17
// [[maybe_unused]] attribute replaces the GCC-specific
// __attribute__((unused)) so the file also builds warning-free on
// non-GNU compilers.
static constexpr const char* IX_AUTHOR = "Salka Elmadani";
[[maybe_unused]] static constexpr const char* IX_LICENSE = "BSL-1.1";
[[maybe_unused]] static constexpr const char* IX_CONTACT = "Elmadani.SALKA@proton.me";
static constexpr const char* IX_SIGNATURE = "IX";
static constexpr uint32_t IX_FINGERPRINT = 0x935E1DAD; // Build integrity constant
// Emit the Inference-X startup banner on stderr, one line per entry.
static void ix_print_banner() {
    static const char* const kBanner[] = {
        "\n",
        " ╔═══════════════════════════════════════════════════════════╗\n",
        " ║   Inference-X — Universal Inference Protocol              ║\n",
        " ║   Copyright (C) 2025-2026 Salka Elmadani                  ║\n",
        " ║   Licensed under BSL-1.1 | Morocco                        ║\n",
        " ║   https://inference-x.com | git.inference-x.com/salka/inference-x║\n",
        " ╚═══════════════════════════════════════════════════════════╝\n",
        "\n",
    };
    for (const char* line : kBanner) fputs(line, stderr);
}
static bool ix_verify_integrity() {
|
|
// Integrity check — fingerprint must match
|
|
// Tampering with this function violates the license
|
|
return (IX_FINGERPRINT == 0x935E1DAD) &&
|
|
(IX_SIGNATURE[0] == 'I') &&
|
|
(IX_AUTHOR[0] == 'S');
|
|
}
|
|
|
|
|
|
// C system
#include <signal.h>
// C++ standard library
#include <cstdlib>
#include <functional>
#include <string>
#include <vector>
// Inference-X runtime
#include "runtime/gguf.h"
#include "runtime/tokenizer.h"
#include "runtime/transformer_v6.h"
#include "runtime/server.h"
#include "runtime/fractal.h"
#include "runtime/platform.h"
#include "runtime/identity.h"
#include "runtime/kernel_dispatch.h"

using namespace ix;

// Ctrl+C flag for cooperative shutdown of the generation loops.
// FIX: `volatile sig_atomic_t` (from <signal.h>) is the only object type
// the C/C++ standards guarantee may be safely written from an async
// signal handler; a plain `volatile bool` carries no such guarantee.
static volatile sig_atomic_t g_interrupted = 0;

// SIGINT handler: just record the interrupt; loops poll g_interrupted.
static void sigint_handler(int) { g_interrupted = 1; }
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// CHAT TEMPLATE — DeepSeek V3 / Kimi K2.5 / ChatML format
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
struct ChatTemplate {
|
|
enum Style { DEEPSEEK, CHATML, KIMI, LLAMA3, GEMMA, PHI3, MISTRAL, RAW };
|
|
Style style = RAW;
|
|
|
|
// Kimi K2.5 special token IDs (set during detect)
|
|
int kimi_bos = -1;
|
|
int kimi_im_system = -1;
|
|
int kimi_im_user = -1;
|
|
int kimi_im_assistant = -1;
|
|
int kimi_im_middle = -1;
|
|
int kimi_im_end = -1;
|
|
int kimi_think = -1;
|
|
|
|
// Format as token IDs (handles special tokens for Kimi)
|
|
std::vector<int32_t> format_ids(
|
|
const std::string& system, const std::string& user,
|
|
const Tokenizer& tok
|
|
) const {
|
|
std::vector<int32_t> ids;
|
|
|
|
if (style == KIMI) {
|
|
ids.push_back(kimi_bos);
|
|
|
|
// System
|
|
std::string sys_text = system.empty() ?
|
|
"You are Kimi, an AI assistant created by Moonshot AI." : system;
|
|
ids.push_back(kimi_im_system);
|
|
auto sr = tok.encode("system");
|
|
ids.insert(ids.end(), sr.begin(), sr.end());
|
|
ids.push_back(kimi_im_middle);
|
|
auto sc = tok.encode(sys_text);
|
|
ids.insert(ids.end(), sc.begin(), sc.end());
|
|
ids.push_back(kimi_im_end);
|
|
|
|
// User
|
|
ids.push_back(kimi_im_user);
|
|
auto ur = tok.encode("user");
|
|
ids.insert(ids.end(), ur.begin(), ur.end());
|
|
ids.push_back(kimi_im_middle);
|
|
auto uc = tok.encode(user);
|
|
ids.insert(ids.end(), uc.begin(), uc.end());
|
|
ids.push_back(kimi_im_end);
|
|
|
|
// Assistant + <think>
|
|
ids.push_back(kimi_im_assistant);
|
|
auto ar = tok.encode("assistant");
|
|
ids.insert(ids.end(), ar.begin(), ar.end());
|
|
ids.push_back(kimi_im_middle);
|
|
ids.push_back(kimi_think);
|
|
|
|
} else {
|
|
// Helper: insert special token ID or encode text
|
|
auto add_special = [&](const char* name) -> bool {
|
|
int32_t id = tok.find_token(name);
|
|
if (id >= 0) { ids.push_back(id); return true; }
|
|
return false;
|
|
};
|
|
auto add_text = [&](const std::string& text) {
|
|
auto enc = tok.encode(text);
|
|
ids.insert(ids.end(), enc.begin(), enc.end());
|
|
};
|
|
|
|
switch (style) {
|
|
case DEEPSEEK:
|
|
add_special("<|begin\xe2\x96\x81of\xe2\x96\x81sentence|>");
|
|
if (!system.empty()) {
|
|
add_special("<|System|>");
|
|
add_text(system);
|
|
}
|
|
add_special("<|User|>");
|
|
add_text(user);
|
|
add_special("<|Assistant|>");
|
|
break;
|
|
case CHATML:
|
|
if (!system.empty()) {
|
|
add_special("<|im_start|>");
|
|
add_text("system\n" + system);
|
|
add_special("<|im_end|>");
|
|
}
|
|
add_special("<|im_start|>");
|
|
add_text("user\n" + user);
|
|
add_special("<|im_end|>");
|
|
add_special("<|im_start|>");
|
|
add_text("assistant\n");
|
|
break;
|
|
case LLAMA3:
|
|
add_special("<|begin_of_text|>");
|
|
if (!system.empty()) {
|
|
add_special("<|start_header_id|>");
|
|
add_text("system");
|
|
add_special("<|end_header_id|>");
|
|
add_text("\n\n" + system);
|
|
add_special("<|eot_id|>");
|
|
}
|
|
add_special("<|start_header_id|>");
|
|
add_text("user");
|
|
add_special("<|end_header_id|>");
|
|
add_text("\n\n" + user);
|
|
add_special("<|eot_id|>");
|
|
add_special("<|start_header_id|>");
|
|
add_text("assistant");
|
|
add_special("<|end_header_id|>");
|
|
add_text("\n\n");
|
|
break;
|
|
case GEMMA:
|
|
add_special("<start_of_turn>");
|
|
add_text("user\n" + user);
|
|
add_special("<end_of_turn>");
|
|
add_text("\n");
|
|
add_special("<start_of_turn>");
|
|
add_text("model\n");
|
|
break;
|
|
case PHI3:
|
|
if (!system.empty()) {
|
|
add_special("<|system|>");
|
|
add_text("\n" + system);
|
|
add_special("<|end|>");
|
|
add_text("\n");
|
|
}
|
|
add_special("<|user|>");
|
|
add_text("\n" + user);
|
|
add_special("<|end|>");
|
|
add_text("\n");
|
|
add_special("<|assistant|>");
|
|
add_text("\n");
|
|
break;
|
|
case MISTRAL:
|
|
ids.push_back(tok.bos_id());
|
|
if (!add_special("[INST]")) add_text("[INST] ");
|
|
else add_text(" ");
|
|
add_text(user + " ");
|
|
if (!add_special("[/INST]")) add_text("[/INST]");
|
|
break;
|
|
default: // RAW
|
|
add_text(user);
|
|
break;
|
|
}
|
|
}
|
|
return ids;
|
|
}
|
|
|
|
static Style detect(const Tokenizer& tok, ChatTemplate& tmpl) {
|
|
// Kimi K2.5: has <|im_user|> token
|
|
int im_user = tok.find_token("<|im_user|>");
|
|
if (im_user >= 0) {
|
|
tmpl.kimi_bos = tok.bos_id();
|
|
tmpl.kimi_im_system = tok.find_token("<|im_system|>");
|
|
tmpl.kimi_im_user = im_user;
|
|
tmpl.kimi_im_assistant = tok.find_token("<|im_assistant|>");
|
|
tmpl.kimi_im_middle = tok.find_token("<|im_middle|>");
|
|
tmpl.kimi_im_end = tok.find_token("<|im_end|>");
|
|
tmpl.kimi_think = tok.find_token("<think>");
|
|
printf("[KIMI] Special tokens: sys=%d user=%d asst=%d mid=%d end=%d think=%d\n",
|
|
tmpl.kimi_im_system, tmpl.kimi_im_user, tmpl.kimi_im_assistant,
|
|
tmpl.kimi_im_middle, tmpl.kimi_im_end, tmpl.kimi_think);
|
|
return KIMI;
|
|
}
|
|
// Llama 3.x: has <|start_header_id|>
|
|
if (tok.find_token("<|start_header_id|>") >= 0) return LLAMA3;
|
|
// Gemma: has <start_of_turn>
|
|
if (tok.find_token("<start_of_turn>") >= 0) return GEMMA;
|
|
// Phi-3: has <|user|>
|
|
if (tok.find_token("<|user|>") >= 0) return PHI3;
|
|
// Mistral: has [INST]
|
|
if (tok.find_token("[INST]") >= 0) return MISTRAL;
|
|
// ChatML: has <|im_start|> (Qwen, SmolLM)
|
|
if (tok.im_start_id() >= 0) return CHATML;
|
|
// DeepSeek: has begin_of_sentence
|
|
if (tok.bos_id() >= 0 && tok.find_token("<|User|>") >= 0) return DEEPSEEK;
|
|
// Fallback: RAW
|
|
// Qwen-family fallback: large vocab (>150k) = ChatML
|
|
if (tok.vocab_size() > 150000) {
|
|
printf("[DETECT] Large vocab (%d) → Qwen family, using ChatML\n", tok.vocab_size());
|
|
return CHATML;
|
|
}
|
|
printf("[WARN] No known chat template detected, using RAW mode\n");
|
|
return RAW;
|
|
}
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
// CONFIG
// ═══════════════════════════════════════════════════════════════════════════════
// Runtime configuration, populated from the command line (see parse_args).
struct InferConfig {
    std::string model_path;                      // positional argument 1
    std::string prompt = "Hello! Who are you?";  // -p
    std::string system = "";                     // -s
    int max_tokens = 512;                        // -n
    float temperature = 0.6f;                    // -t
    float top_p = 0.9f;                          // --top-p
    int top_k = 40;                              // --top-k
    int max_ctx = 4096;                          // --ctx
    bool interactive = false;                    // -i: interactive chat mode
    bool raw_mode = false;                       // --raw: no chat template
    bool bench_mode = false;                     // --bench: just measure tok/s
    bool serve_mode = false;                     // --serve
    int serve_port = 8080;                       // --serve [port]
    bool fractal_mode = false;                   // --fractal: dynamic precision
    std::string profile_path;                    // --profile: expert activation CSV
};
// Print CLI usage and the option table to stdout.
void print_usage(const char* prog) {
    printf("Usage: %s <model_path> [options]\n", prog);
    printf("Options:\n");
    static const char* const kOptionLines[] = {
        "  -p <prompt>       User prompt (default: \"Hello! Who are you?\")\n",
        "  -s <system>       System prompt\n",
        "  -n <max_tokens>   Max tokens to generate (default: 512)\n",
        "  -t <temp>         Temperature (default: 0.6)\n",
        "  --top-p <val>     Top-P sampling (default: 0.9)\n",
        "  --top-k <val>     Top-K sampling (default: 40)\n",
        "  --ctx <size>      Max context window (default: 4096)\n",
        "  -i                Interactive chat mode\n",
        "  --raw             No chat template\n",
        "  --bench           Benchmark mode (no output)\n",
        "  --serve [port]    Start OpenAI-compatible API server (default: 8080)\n",
        "  --fractal         Enable fractal inference (dynamic precision per layer)\n",
        "  --profile <path>  Dump expert activation profile\n",
    };
    for (const char* line : kOptionLines) fputs(line, stdout);
}
// Parse argv into an InferConfig. argv[1] is the model path (required);
// option flags follow. Unrecognized flags are silently ignored, and a flag
// that needs a value but is last on the line is ignored too — both match
// the established behavior of this tool.
InferConfig parse_args(int argc, char** argv) {
    InferConfig cfg;
    if (argc < 2) { print_usage(argv[0]); exit(1); }
    cfg.model_path = argv[1];

    int i = 2;
    // True when the current flag still has a value argument after it.
    auto has_value = [&]() { return i + 1 < argc; };
    while (i < argc) {
        const std::string flag = argv[i];
        if (flag == "-p" && has_value())           cfg.prompt      = argv[++i];
        else if (flag == "-s" && has_value())      cfg.system      = argv[++i];
        else if (flag == "-n" && has_value())      cfg.max_tokens  = atoi(argv[++i]);
        else if (flag == "-t" && has_value())      cfg.temperature = (float)atof(argv[++i]);
        else if (flag == "--top-p" && has_value()) cfg.top_p       = (float)atof(argv[++i]);
        else if (flag == "--top-k" && has_value()) cfg.top_k       = atoi(argv[++i]);
        else if (flag == "--ctx" && has_value())   cfg.max_ctx     = atoi(argv[++i]);
        else if (flag == "-i")                     cfg.interactive = true;
        else if (flag == "--raw")                  cfg.raw_mode    = true;
        else if (flag == "--bench")                cfg.bench_mode  = true;
        else if (flag == "--serve") {
            cfg.serve_mode = true;
            // Optional port argument: consume only if it isn't another flag.
            if (has_value() && argv[i + 1][0] != '-') cfg.serve_port = atoi(argv[++i]);
        }
        else if (flag == "--fractal")              cfg.fractal_mode = true;
        else if (flag == "--profile" && has_value()) cfg.profile_path = argv[++i];
        ++i;
    }
    return cfg;
}
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// MAIN
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
int main(int argc, char** argv) {
|
|
ix_print_banner();
|
|
if (!ix_verify_integrity()) { fprintf(stderr, "INTEGRITY CHECK FAILED\n"); return 1; }
|
|
|
|
printf("╔══════════════════════════════════════════════════════════════╗\n");
|
|
printf("║ INFERENCE-X v6 — UNIVERSAL INFERENCE PROTOCOL ║\n");
|
|
printf("║ COPYRIGHT (C) 2025-2026 SALKA ELMADANI ║\n");
|
|
printf("╚══════════════════════════════════════════════════════════════╝\n\n");
|
|
|
|
signal(SIGINT, sigint_handler);
|
|
InferConfig icfg = parse_args(argc, argv);
|
|
|
|
// ─── LOAD MODEL ────────────────────────────────────────────────────────
|
|
ix::identity::print_identity();
|
|
ix::identity::license().verify();
|
|
printf("=== Loading model: %s ===\n", icfg.model_path.c_str());
|
|
GGUF gguf;
|
|
if (!gguf.open(icfg.model_path)) {
|
|
printf("ERROR: Failed to open model at %s\n", icfg.model_path.c_str());
|
|
return 1;
|
|
}
|
|
|
|
// ─── LOAD TOKENIZER ────────────────────────────────────────────────────
|
|
printf("\n=== Loading tokenizer ===\n");
|
|
Tokenizer tokenizer;
|
|
if (!tokenizer.load(gguf)) {
|
|
printf("ERROR: Failed to load tokenizer from GGUF\n");
|
|
return 1;
|
|
}
|
|
|
|
// ─── INIT TRANSFORMER ──────────────────────────────────────────────────
|
|
printf("\n=== Initializing transformer ===\n");
|
|
TransformerV6 transformer;
|
|
if (!transformer.init(gguf, icfg.max_ctx)) {
|
|
printf("ERROR: Failed to initialize transformer\n");
|
|
return 1;
|
|
}
|
|
transformer.set_eos_token(tokenizer.eos_id());
|
|
|
|
// ─── INIT KERNEL DISPATCH ──────────────────────────────────────────────
|
|
printf("\n=== Initializing kernel dispatch ===\n");
|
|
ix::KernelDispatch::instance().init();
|
|
|
|
// Enable ExpertMmap for MoE models (surgical prefetch, ÷48 I/O)
|
|
auto& kcfg = transformer.config_mut();
|
|
if (kcfg.n_experts > 0) {
|
|
ix::KernelDispatch::instance().init_expert_mmap(kcfg.n_layers);
|
|
printf("[IX] MoE detected: %d experts, %d active per layer\n",
|
|
kcfg.n_experts, kcfg.n_experts_used);
|
|
}
|
|
|
|
// ─── FIX VOCAB SIZE ───────────────────────────────────────────────────
|
|
if (kcfg.vocab_size == 0 || kcfg.vocab_size == 32000) {
|
|
int tok_vocab = tokenizer.vocab_size();
|
|
if (tok_vocab > 0) {
|
|
printf("[FIX] vocab_size: GGUF missing → using tokenizer=%d\n", tok_vocab);
|
|
kcfg.vocab_size = tok_vocab;
|
|
} else {
|
|
kcfg.vocab_size = 32000; // ultimate fallback
|
|
printf("[FIX] vocab_size: fallback to 32000\n");
|
|
}
|
|
} else {
|
|
int tok_vocab = tokenizer.vocab_size();
|
|
if (tok_vocab > 0 && tok_vocab != (int)kcfg.vocab_size) {
|
|
printf("[FIX] vocab_size: GGUF=%u, tokenizer=%d → using max\n",
|
|
kcfg.vocab_size, tok_vocab);
|
|
if (tok_vocab > (int)kcfg.vocab_size) kcfg.vocab_size = tok_vocab;
|
|
}
|
|
}
|
|
|
|
// ─── DETECT CHAT TEMPLATE ──────────────────────────────────────────────
|
|
ChatTemplate tmpl;
|
|
if (icfg.raw_mode) {
|
|
tmpl.style = ChatTemplate::RAW;
|
|
} else {
|
|
tmpl.style = ChatTemplate::detect(tokenizer, tmpl);
|
|
}
|
|
printf("[CHAT] Template: %s\n",
|
|
tmpl.style == ChatTemplate::DEEPSEEK ? "DeepSeek V3" :
|
|
tmpl.style == ChatTemplate::CHATML ? "ChatML" :
|
|
tmpl.style == ChatTemplate::KIMI ? "Kimi K2.5" :
|
|
tmpl.style == ChatTemplate::LLAMA3 ? "Llama 3" :
|
|
tmpl.style == ChatTemplate::GEMMA ? "Gemma" :
|
|
tmpl.style == ChatTemplate::PHI3 ? "Phi-3" :
|
|
tmpl.style == ChatTemplate::MISTRAL ? "Mistral" : "Raw");
|
|
|
|
// Override EOS for Kimi K2.5
|
|
if (tmpl.style == ChatTemplate::KIMI && tmpl.kimi_im_end >= 0) {
|
|
transformer.set_eos_token(tmpl.kimi_im_end);
|
|
printf("[KIMI] EOS overridden to <|im_end|> = %d\n", tmpl.kimi_im_end);
|
|
}
|
|
// Multi-EOS: detect additional stop tokens
|
|
auto try_add_eos = [&](const char* name) {
|
|
int32_t id = tokenizer.find_token(name);
|
|
if (id >= 0) {
|
|
transformer.add_eos_token(id);
|
|
fprintf(stderr, "[EOS] Stop: %s → %d\n", name, id);
|
|
}
|
|
};
|
|
try_add_eos("<|eot_id|>");
|
|
try_add_eos("<|end_of_text|>");
|
|
try_add_eos("<|endoftext|>");
|
|
try_add_eos("<|im_end|>");
|
|
try_add_eos("<|end|>");
|
|
try_add_eos("<end_of_turn>");
|
|
|
|
|
|
// ─── INFERENCE LOOP ────────────────────────────────────────────────────
|
|
|
|
// ─── FRACTAL INFERENCE PROTOCOL ──────────────────────────────────────
|
|
ix::FractalEngine fractal;
|
|
if (icfg.fractal_mode) {
|
|
fractal.enable();
|
|
printf("[FRACTAL] Dynamic precision enabled — model breathes Q2→FP16\n");
|
|
}
|
|
|
|
auto run_inference = [&](const std::string& user_prompt) {
|
|
// Format + tokenize (handles special tokens for Kimi)
|
|
auto tokens = tmpl.format_ids(icfg.system, user_prompt, tokenizer);
|
|
printf("\n[TOK] Input: %zu tokens\n", tokens.size());
|
|
|
|
|
|
// Fractal: analyze query and plan precision
|
|
if (fractal.enabled) {
|
|
auto pmap = fractal.plan(tokens, kcfg.vocab_size, kcfg.n_layers, dtype::Q4_K);
|
|
pmap.print_schedule();
|
|
}
|
|
if (tokens.size() > (size_t)icfg.max_ctx - icfg.max_tokens) {
|
|
printf("[WARN] Prompt too long (%zu tokens), truncating to %d\n",
|
|
tokens.size(), icfg.max_ctx - icfg.max_tokens);
|
|
tokens.resize(icfg.max_ctx - icfg.max_tokens);
|
|
}
|
|
|
|
// Benchmark mode: just measure throughput
|
|
if (icfg.bench_mode) {
|
|
auto output = transformer.generate(tokens, icfg.max_tokens,
|
|
icfg.temperature, icfg.top_p, icfg.top_k);
|
|
printf("[BENCH] Output: %zu tokens\n", output.size());
|
|
return;
|
|
}
|
|
|
|
// Streaming generation
|
|
printf("\n─── OUTPUT ───────────────────────────────────────────────────\n");
|
|
fflush(stdout);
|
|
fprintf(stderr, "[DBG] calling generate_stream\n"); fflush(stderr);
|
|
int gen_count = 0;
|
|
transformer.generate_stream(
|
|
tokens, icfg.max_tokens,
|
|
icfg.temperature, icfg.top_p, icfg.top_k,
|
|
[&](int32_t token_id) -> bool {
|
|
if (g_interrupted) return false;
|
|
|
|
// Skip special tokens (control tokens, template markers)
|
|
if (tokenizer.is_special(token_id)) return true;
|
|
|
|
std::string piece = tokenizer.decode_token(token_id);
|
|
|
|
// Skip tokens that look like template markers
|
|
if (piece.size() > 2 && piece[0] == '<' && piece[piece.size()-1] == '>')
|
|
return true;
|
|
|
|
printf("%s", piece.c_str());
|
|
fflush(stdout);
|
|
gen_count++;
|
|
|
|
// INCREMENTAL PROFILING: dump CSV after each token
|
|
if (!icfg.profile_path.empty()) {
|
|
transformer.expert_cache_ref().dump_csv(icfg.profile_path.c_str());
|
|
}
|
|
return true;
|
|
}
|
|
);
|
|
fprintf(stderr, "[DBG] generate_stream returned, gen_count=%d\n", gen_count); fflush(stderr);
|
|
printf("\n──────────────────────────────────────────────────────────────\n");
|
|
printf("[GEN] %d tokens generated\n", gen_count);
|
|
};
|
|
|
|
// --- SERVE MODE: OpenAI-compatible API server ---
|
|
if (icfg.serve_mode) {
|
|
std::string mname = icfg.model_path;
|
|
size_t slash = mname.rfind('/');
|
|
if (slash != std::string::npos) mname = mname.substr(slash + 1);
|
|
size_t dot = mname.rfind('.');
|
|
if (dot != std::string::npos) mname = mname.substr(0, dot);
|
|
|
|
ix::Server server(icfg.serve_port, mname,
|
|
[&](const std::string& sys, const std::string& user,
|
|
int max_tok, float temp, float tp,
|
|
std::function<bool(const std::string&)> on_token) {
|
|
auto tokens = tmpl.format_ids(sys, user, tokenizer);
|
|
if (tokens.size() > (size_t)icfg.max_ctx - max_tok)
|
|
tokens.resize(icfg.max_ctx - max_tok);
|
|
transformer.generate_stream(
|
|
tokens, max_tok, temp, tp, icfg.top_k,
|
|
[&](int32_t token_id) -> bool {
|
|
if (tokenizer.is_special(token_id)) return true;
|
|
std::string piece = tokenizer.decode_token(token_id);
|
|
if (piece.size() > 2 && piece[0] == '<' && piece[piece.size()-1] == '>')
|
|
return true;
|
|
return on_token(piece);
|
|
}
|
|
);
|
|
}
|
|
);
|
|
server.run();
|
|
return 0;
|
|
}
|
|
if (icfg.interactive) {
|
|
// ─── INTERACTIVE CHAT ──────────────────────────────────────────────
|
|
printf("\n=== Interactive mode (Ctrl+C to exit) ===\n");
|
|
char line[4096];
|
|
while (!g_interrupted) {
|
|
printf("\n> ");
|
|
fflush(stdout);
|
|
if (!fgets(line, sizeof(line), stdin)) break;
|
|
|
|
// Strip newline
|
|
size_t len = strlen(line);
|
|
if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
|
|
if (strlen(line) == 0) continue;
|
|
|
|
if (strcmp(line, "/quit") == 0 || strcmp(line, "/exit") == 0) break;
|
|
if (strcmp(line, "/reset") == 0) {
|
|
// transformer.reset(); // DISABLED: multi-turn preserves KV cache
|
|
printf("[RESET] Context cleared\n");
|
|
continue;
|
|
}
|
|
|
|
run_inference(line);
|
|
// Multi-turn: KV cache preserved between turns
|
|
// Use /reset command to manually clear context
|
|
}
|
|
printf("\nGoodbye.\n");
|
|
} else {
|
|
// ─── SINGLE PROMPT ─────────────────────────────────────────────────
|
|
run_inference(icfg.prompt);
|
|
}
|
|
|
|
// ─── FINAL STATS ──────────────────────────────────────────────────────
|
|
if (!icfg.profile_path.empty()) {
|
|
transformer.expert_cache_ref().dump_csv(icfg.profile_path.c_str());
|
|
}
|
|
printf("\n=== Inference-X Unified — Session Stats ===\n");
|
|
ix::KernelDispatch::instance().print_stats();
|
|
if (fractal.enabled) fractal.print_stats();
|
|
printf("[IX] Backend: %s\n", ix::KernelDispatch::instance().backend_name());
|
|
|
|
return 0;
|
|
}
|