// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Universal Inference Protocol (Main Entry Point)
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════

#include <cstdio>
#include <cstdint>
#include <cstring>

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X IDENTITY
// This watermark is integral to Inference-X. Removal violates BSL-1.1 Section 4.
// ═══════════════════════════════════════════════════════════════════════════════
// Build-identity constants, read by ix_verify_integrity() below.
// `constexpr` guarantees compile-time initialization; the standard C++17
// [[maybe_unused]] attribute replaces the GCC-specific
// __attribute__((unused)) so the file also builds warning-free on
// non-GNU compilers.
static constexpr const char* IX_AUTHOR = "Salka Elmadani";
[[maybe_unused]] static constexpr const char* IX_LICENSE = "BSL-1.1";
[[maybe_unused]] static constexpr const char* IX_CONTACT = "Elmadani.SALKA@proton.me";
static constexpr const char* IX_SIGNATURE = "IX";
static constexpr uint32_t IX_FINGERPRINT = 0x935E1DAD; // Build integrity constant
// Emit the Inference-X startup banner on stderr, one line per entry.
static void ix_print_banner() {
    static const char* const kBanner[] = {
        "\n",
        " ╔═══════════════════════════════════════════════════════════╗\n",
        " ║   Inference-X — Universal Inference Protocol              ║\n",
        " ║   Copyright (C) 2025-2026 Salka Elmadani                  ║\n",
        " ║   Licensed under BSL-1.1 | Morocco                        ║\n",
        " ║   https://inference-x.com | git.inference-x.com/salka/inference-x║\n",
        " ╚═══════════════════════════════════════════════════════════╝\n",
        "\n",
    };
    for (const char* line : kBanner) fputs(line, stderr);
}
static bool ix_verify_integrity() {
|
|
// Integrity check — fingerprint must match
|
|
// Tampering with this function violates the license
|
|
return (IX_FINGERPRINT == 0x935E1DAD) &&
|
|
(IX_SIGNATURE[0] == 'I') &&
|
|
(IX_AUTHOR[0] == 'S');
|
|
}
|
|
|
|
|
|
// C system
#include <signal.h>
// C++ standard library
#include <cstdlib>
#include <functional>
#include <string>
#include <vector>
// Inference-X runtime
#include "runtime/gguf.h"
#include "runtime/tokenizer.h"
#include "runtime/transformer_v6.h"
#include "runtime/server.h"
#include "runtime/fractal.h"
#include "runtime/platform.h"
#include "runtime/identity.h"
#include "runtime/kernel_dispatch.h"

using namespace ix;

// Ctrl+C flag for cooperative shutdown of the generation loops.
// FIX: `volatile sig_atomic_t` (from <signal.h>) is the only object type
// the C/C++ standards guarantee may be safely written from an async
// signal handler; a plain `volatile bool` carries no such guarantee.
static volatile sig_atomic_t g_interrupted = 0;

// SIGINT handler: just record the interrupt; loops poll g_interrupted.
static void sigint_handler(int) { g_interrupted = 1; }
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// CHAT TEMPLATE — DeepSeek V3 / Kimi K2.5 / ChatML format
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
struct ChatTemplate {
|
|
enum Style { DEEPSEEK, CHATML, KIMI, LLAMA3, GEMMA, PHI3, MISTRAL, RAW };
|
|
Style style = RAW;
|
|
|
|
// Kimi K2.5 special token IDs (set during detect)
|
|
int kimi_bos = -1;
|
|
int kimi_im_system = -1;
|
|
int kimi_im_user = -1;
|
|
int kimi_im_assistant = -1;
|
|
int kimi_im_middle = -1;
|
|
int kimi_im_end = -1;
|
|
int kimi_think = -1;
|
|
|
|
// Format as token IDs (handles special tokens for Kimi)
|
|
std::vector<int32_t> format_ids(
|
|
const std::string& system, const std::string& user,
|
|
const Tokenizer& tok
|
|
) const {
|
|
std::vector<int32_t> ids;
|
|
|
|
if (style == KIMI) {
|
|
ids.push_back(kimi_bos);
|
|
|
|
// System
|
|
std::string sys_text = system.empty() ?
|
|
"You are Kimi, an AI assistant created by Moonshot AI." : system;
|
|
ids.push_back(kimi_im_system);
|
|
auto sr = tok.encode("system");
|
|
ids.insert(ids.end(), sr.begin(), sr.end());
|
|
ids.push_back(kimi_im_middle);
|
|
auto sc = tok.encode(sys_text);
|
|
ids.insert(ids.end(), sc.begin(), sc.end());
|
|
ids.push_back(kimi_im_end);
|
|
|
|
// User
|
|
ids.push_back(kimi_im_user);
|
|
auto ur = tok.encode("user");
|
|
ids.insert(ids.end(), ur.begin(), ur.end());
|
|
ids.push_back(kimi_im_middle);
|
|
auto uc = tok.encode(user);
|
|
ids.insert(ids.end(), uc.begin(), uc.end());
|
|
ids.push_back(kimi_im_end);
|
|
|
|
// Assistant + <think>
|
|
ids.push_back(kimi_im_assistant);
|
|
auto ar = tok.encode("assistant");
|
|
ids.insert(ids.end(), ar.begin(), ar.end());
|
|
ids.push_back(kimi_im_middle);
|
|
ids.push_back(kimi_think);
|
|
|
|
} else {
|
|
// Helper: insert special token ID or encode text
|
|
auto add_special = [&](const char* name) -> bool {
|
|
int32_t id = tok.find_token(name);
|
|
if (id >= 0) { ids.push_back(id); return true; }
|
|
return false;
|
|
};
|
|
auto add_text = [&](const std::string& text) {
|
|
auto enc = tok.encode(text);
|
|
ids.insert(ids.end(), enc.begin(), enc.end());
|
|
};
|
|
|
|
switch (style) {
|
|
case DEEPSEEK:
|
|
add_special("<|begin\xe2\x96\x81of\xe2\x96\x81sentence|>");
|
|
if (!system.empty()) {
|
|
add_special("<|System|>");
|
|
add_text(system);
|
|
}
|
|
add_special("<|User|>");
|
|
add_text(user);
|
|
add_special("<|Assistant|>");
|
|
break;
|
|
case CHATML:
|
|
if (!system.empty()) {
|
|
add_special("<|im_start|>");
|
|
add_text("system\n" + system);
|
|
add_special("<|im_end|>");
|
|
}
|
|
add_special("<|im_start|>");
|
|
add_text("user\n" + user);
|
|
add_special("<|im_end|>");
|
|
add_special("<|im_start|>");
|
|
add_text("assistant\n");
|
|
break;
|
|
case LLAMA3:
|
|
add_special("<|begin_of_text|>");
|
|
if (!system.empty()) {
|
|
add_special("<|start_header_id|>");
|
|
add_text("system");
|
|
add_special("<|end_header_id|>");
|
|
add_text("\n\n" + system);
|
|
add_special("<|eot_id|>");
|
|
}
|
|
add_special("<|start_header_id|>");
|
|
add_text("user");
|
|
add_special("<|end_header_id|>");
|
|
add_text("\n\n" + user);
|
|
add_special("<|eot_id|>");
|
|
add_special("<|start_header_id|>");
|
|
add_text("assistant");
|
|
add_special("<|end_header_id|>");
|
|
add_text("\n\n");
|
|
break;
|
|
case GEMMA:
|
|
add_special("<start_of_turn>");
|
|
add_text("user\n" + user);
|
|
add_special("<end_of_turn>");
|
|
add_text("\n");
|
|
add_special("<start_of_turn>");
|
|
add_text("model\n");
|
|
break;
|
|
case PHI3:
|
|
if (!system.empty()) {
|
|
add_special("<|system|>");
|
|
add_text("\n" + system);
|
|
add_special("<|end|>");
|
|
add_text("\n");
|
|
}
|
|
add_special("<|user|>");
|
|
add_text("\n" + user);
|
|
add_special("<|end|>");
|
|
add_text("\n");
|
|
add_special("<|assistant|>");
|
|
add_text("\n");
|
|
break;
|
|
case MISTRAL:
|
|
ids.push_back(tok.bos_id());
|
|
if (!add_special("[INST]")) add_text("[INST] ");
|
|
else add_text(" ");
|
|
add_text(user + " ");
|
|
if (!add_special("[/INST]")) add_text("[/INST]");
|
|
break;
|
|
default: // RAW
|
|
add_text(user);
|
|
break;
|
|
}
|
|
}
|
|
return ids;
|
|
}
|
|
|
|
static Style detect(const Tokenizer& tok, ChatTemplate& tmpl) {
|
|
// Kimi K2.5: has <|im_user|> token
|
|
int im_user = tok.find_token("<|im_user|>");
|
|
if (im_user >= 0) {
|
|
tmpl.kimi_bos = tok.bos_id();
|
|
tmpl.kimi_im_system = tok.find_token("<|im_system|>");
|
|
tmpl.kimi_im_user = im_user;
|
|
tmpl.kimi_im_assistant = tok.find_token("<|im_assistant|>");
|
|
tmpl.kimi_im_middle = tok.find_token("<|im_middle|>");
|
|
tmpl.kimi_im_end = tok.find_token("<|im_end|>");
|
|
tmpl.kimi_think = tok.find_token("<think>");
|
|
printf("[KIMI] Special tokens: sys=%d user=%d asst=%d mid=%d end=%d think=%d\n",
|
|
tmpl.kimi_im_system, tmpl.kimi_im_user, tmpl.kimi_im_assistant,
|
|
tmpl.kimi_im_middle, tmpl.kimi_im_end, tmpl.kimi_think);
|
|
return KIMI;
|
|
}
|
|
// Llama 3.x: has <|start_header_id|>
|
|
if (tok.find_token("<|start_header_id|>") >= 0) return LLAMA3;
|
|
// Gemma: has <start_of_turn>
|
|
if (tok.find_token("<start_of_turn>") >= 0) return GEMMA;
|
|
// Phi-3: has <|user|>
|
|
if (tok.find_token("<|user|>") >= 0) return PHI3;
|
|
// Mistral: has [INST]
|
|
if (tok.find_token("[INST]") >= 0) return MISTRAL;
|
|
// ChatML: has <|im_start|> (Qwen, SmolLM)
|
|
if (tok.im_start_id() >= 0) return CHATML;
|
|
// DeepSeek: has begin_of_sentence
|
|
if (tok.bos_id() >= 0 && tok.find_token("<|User|>") >= 0) return DEEPSEEK;
|
|
// Fallback: RAW
|
|
// Qwen-family fallback: large vocab (>150k) = ChatML
|
|
if (tok.vocab_size() > 150000) {
|
|
printf("[DETECT] Large vocab (%d) → Qwen family, using ChatML\n", tok.vocab_size());
|
|
return CHATML;
|
|
}
|
|
printf("[WARN] No known chat template detected, using RAW mode\n");
|
|
return RAW;
|
|
}
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
// CONFIG
// ═══════════════════════════════════════════════════════════════════════════════
// Runtime configuration, populated from the command line (see parse_args).
struct InferConfig {
    std::string model_path;                      // positional argument 1
    std::string prompt = "Hello! Who are you?";  // -p
    std::string system = "";                     // -s
    int max_tokens = 512;                        // -n
    float temperature = 0.6f;                    // -t
    float top_p = 0.9f;                          // --top-p
    int top_k = 40;                              // --top-k
    int max_ctx = 4096;                          // --ctx
    bool interactive = false;                    // -i: interactive chat mode
    bool raw_mode = false;                       // --raw: no chat template
    bool bench_mode = false;                     // --bench: just measure tok/s
    bool serve_mode = false;                     // --serve
    int serve_port = 8080;                       // --serve [port]
    bool fractal_mode = false;                   // --fractal: dynamic precision
    std::string profile_path;                    // --profile: expert activation CSV
};
// Print CLI usage and the option table to stdout.
void print_usage(const char* prog) {
    printf("Usage: %s <model_path> [options]\n", prog);
    printf("Options:\n");
    static const char* const kOptionLines[] = {
        "  -p <prompt>       User prompt (default: \"Hello! Who are you?\")\n",
        "  -s <system>       System prompt\n",
        "  -n <max_tokens>   Max tokens to generate (default: 512)\n",
        "  -t <temp>         Temperature (default: 0.6)\n",
        "  --top-p <val>     Top-P sampling (default: 0.9)\n",
        "  --top-k <val>     Top-K sampling (default: 40)\n",
        "  --ctx <size>      Max context window (default: 4096)\n",
        "  -i                Interactive chat mode\n",
        "  --raw             No chat template\n",
        "  --bench           Benchmark mode (no output)\n",
        "  --serve [port]    Start OpenAI-compatible API server (default: 8080)\n",
        "  --fractal         Enable fractal inference (dynamic precision per layer)\n",
        "  --profile <path>  Dump expert activation profile\n",
    };
    for (const char* line : kOptionLines) fputs(line, stdout);
}
// Parse argv into an InferConfig. argv[1] is the model path (required);
// option flags follow. Unrecognized flags are silently ignored, and a flag
// that needs a value but is last on the line is ignored too — both match
// the established behavior of this tool.
InferConfig parse_args(int argc, char** argv) {
    InferConfig cfg;
    if (argc < 2) { print_usage(argv[0]); exit(1); }
    cfg.model_path = argv[1];

    int i = 2;
    // True when the current flag still has a value argument after it.
    auto has_value = [&]() { return i + 1 < argc; };
    while (i < argc) {
        const std::string flag = argv[i];
        if (flag == "-p" && has_value())           cfg.prompt      = argv[++i];
        else if (flag == "-s" && has_value())      cfg.system      = argv[++i];
        else if (flag == "-n" && has_value())      cfg.max_tokens  = atoi(argv[++i]);
        else if (flag == "-t" && has_value())      cfg.temperature = (float)atof(argv[++i]);
        else if (flag == "--top-p" && has_value()) cfg.top_p       = (float)atof(argv[++i]);
        else if (flag == "--top-k" && has_value()) cfg.top_k       = atoi(argv[++i]);
        else if (flag == "--ctx" && has_value())   cfg.max_ctx     = atoi(argv[++i]);
        else if (flag == "-i")                     cfg.interactive = true;
        else if (flag == "--raw")                  cfg.raw_mode    = true;
        else if (flag == "--bench")                cfg.bench_mode  = true;
        else if (flag == "--serve") {
            cfg.serve_mode = true;
            // Optional port argument: consume only if it isn't another flag.
            if (has_value() && argv[i + 1][0] != '-') cfg.serve_port = atoi(argv[++i]);
        }
        else if (flag == "--fractal")              cfg.fractal_mode = true;
        else if (flag == "--profile" && has_value()) cfg.profile_path = argv[++i];
        ++i;
    }
    return cfg;
}
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// MAIN
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
int main(int argc, char** argv) {
|
|
ix_print_banner();
|
|
if (!ix_verify_integrity()) { fprintf(stderr, "INTEGRITY CHECK FAILED\n"); return 1; }
|
|
|
|
printf("╔══════════════════════════════════════════════════════════════╗\n");
|
|
printf("║ INFERENCE-X v6 — UNIVERSAL INFERENCE PROTOCOL ║\n");
|
|
printf("║ COPYRIGHT (C) 2025-2026 SALKA ELMADANI ║\n");
|
|
printf("╚══════════════════════════════════════════════════════════════╝\n\n");
|
|
|
|
signal(SIGINT, sigint_handler);
|
|
InferConfig icfg = parse_args(argc, argv);
|
|
|
|
// ─── LOAD MODEL ────────────────────────────────────────────────────────
|
|
ix::identity::print_identity();
|
|
ix::identity::license().verify();
|
|
printf("=== Loading model: %s ===\n", icfg.model_path.c_str());
|
|
GGUF gguf;
|
|
if (!gguf.open(icfg.model_path)) {
|
|
printf("ERROR: Failed to open model at %s\n", icfg.model_path.c_str());
|
|
return 1;
|
|
}
|
|
|
|
// ─── LOAD TOKENIZER ────────────────────────────────────────────────────
|
|
printf("\n=== Loading tokenizer ===\n");
|
|
Tokenizer tokenizer;
|
|
if (!tokenizer.load(gguf)) {
|
|
printf("ERROR: Failed to load tokenizer from GGUF\n");
|
|
return 1;
|
|
}
|
|
|
|
// ─── INIT TRANSFORMER ──────────────────────────────────────────────────
|
|
printf("\n=== Initializing transformer ===\n");
|
|
TransformerV6 transformer;
|
|
if (!transformer.init(gguf, icfg.max_ctx)) {
|
|
printf("ERROR: Failed to initialize transformer\n");
|
|
return 1;
|
|
}
|
|
transformer.set_eos_token(tokenizer.eos_id());
|
|
|
|
// ─── INIT KERNEL DISPATCH ──────────────────────────────────────────────
|
|
printf("\n=== Initializing kernel dispatch ===\n");
|
|
ix::KernelDispatch::instance().init();
|
|
|
|
// Enable ExpertMmap for MoE models (surgical prefetch, ÷48 I/O)
|
|
auto& kcfg = transformer.config_mut();
|
|
if (kcfg.n_experts > 0) {
|
|
ix::KernelDispatch::instance().init_expert_mmap(kcfg.n_layers);
|
|
printf("[IX] MoE detected: %d experts, %d active per layer\n",
|
|
kcfg.n_experts, kcfg.n_experts_used);
|
|
}
|
|
|
|
// ─── FIX VOCAB SIZE ───────────────────────────────────────────────────
|
|
if (kcfg.vocab_size == 0 || kcfg.vocab_size == 32000) {
|
|
int tok_vocab = tokenizer.vocab_size();
|
|
if (tok_vocab > 0) {
|
|
printf("[FIX] vocab_size: GGUF missing → using tokenizer=%d\n", tok_vocab);
|
|
kcfg.vocab_size = tok_vocab;
|
|
} else {
|
|
kcfg.vocab_size = 32000; // ultimate fallback
|
|
printf("[FIX] vocab_size: fallback to 32000\n");
|
|
}
|
|
} else {
|
|
int tok_vocab = tokenizer.vocab_size();
|
|
if (tok_vocab > 0 && tok_vocab != (int)kcfg.vocab_size) {
|
|
printf("[FIX] vocab_size: GGUF=%u, tokenizer=%d → using max\n",
|
|
kcfg.vocab_size, tok_vocab);
|
|
if (tok_vocab > (int)kcfg.vocab_size) kcfg.vocab_size = tok_vocab;
|
|
}
|
|
}
|
|
|
|
// ─── DETECT CHAT TEMPLATE ──────────────────────────────────────────────
|
|
ChatTemplate tmpl;
|
|
if (icfg.raw_mode) {
|
|
tmpl.style = ChatTemplate::RAW;
|
|
} else {
|
|
tmpl.style = ChatTemplate::detect(tokenizer, tmpl);
|
|
}
|
|
printf("[CHAT] Template: %s\n",
|
|
tmpl.style == ChatTemplate::DEEPSEEK ? "DeepSeek V3" :
|
|
tmpl.style == ChatTemplate::CHATML ? "ChatML" :
|
|
tmpl.style == ChatTemplate::KIMI ? "Kimi K2.5" :
|
|
tmpl.style == ChatTemplate::LLAMA3 ? "Llama 3" :
|
|
tmpl.style == ChatTemplate::GEMMA ? "Gemma" :
|
|
tmpl.style == ChatTemplate::PHI3 ? "Phi-3" :
|
|
tmpl.style == ChatTemplate::MISTRAL ? "Mistral" : "Raw");
|
|
|
|
// Override EOS for Kimi K2.5
|
|
if (tmpl.style == ChatTemplate::KIMI && tmpl.kimi_im_end >= 0) {
|
|
transformer.set_eos_token(tmpl.kimi_im_end);
|
|
printf("[KIMI] EOS overridden to <|im_end|> = %d\n", tmpl.kimi_im_end);
|
|
}
|
|
// Multi-EOS: detect additional stop tokens
|
|
auto try_add_eos = [&](const char* name) {
|
|
int32_t id = tokenizer.find_token(name);
|
|
if (id >= 0) {
|
|
transformer.add_eos_token(id);
|
|
fprintf(stderr, "[EOS] Stop: %s → %d\n", name, id);
|
|
}
|
|
};
|
|
try_add_eos("<|eot_id|>");
|
|
try_add_eos("<|end_of_text|>");
|
|
try_add_eos("<|endoftext|>");
|
|
try_add_eos("<|im_end|>");
|
|
try_add_eos("<|end|>");
|
|
try_add_eos("<end_of_turn>");
|
|
|
|
|
|
// ─── INFERENCE LOOP ────────────────────────────────────────────────────
|
|
|
|
// ─── FRACTAL INFERENCE PROTOCOL ──────────────────────────────────────
|
|
ix::FractalEngine fractal;
|
|
if (icfg.fractal_mode) {
|
|
fractal.enable();
|
|
printf("[FRACTAL] Dynamic precision enabled — model breathes Q2→FP16\n");
|
|
}
|
|
|
|
auto run_inference = [&](const std::string& user_prompt) {
|
|
// Format + tokenize (handles special tokens for Kimi)
|
|
auto tokens = tmpl.format_ids(icfg.system, user_prompt, tokenizer);
|
|
printf("\n[TOK] Input: %zu tokens\n", tokens.size());
|
|
|
|
|
|
// Fractal: analyze query and plan precision
|
|
if (fractal.enabled) {
|
|
auto pmap = fractal.plan(tokens, kcfg.vocab_size, kcfg.n_layers, dtype::Q4_K);
|
|
pmap.print_schedule();
|
|
}
|
|
if (tokens.size() > (size_t)icfg.max_ctx - icfg.max_tokens) {
|
|
printf("[WARN] Prompt too long (%zu tokens), truncating to %d\n",
|
|
tokens.size(), icfg.max_ctx - icfg.max_tokens);
|
|
tokens.resize(icfg.max_ctx - icfg.max_tokens);
|
|
}
|
|
|
|
// Benchmark mode: just measure throughput
|
|
if (icfg.bench_mode) {
|
|
auto output = transformer.generate(tokens, icfg.max_tokens,
|
|
icfg.temperature, icfg.top_p, icfg.top_k);
|
|
printf("[BENCH] Output: %zu tokens\n", output.size());
|
|
return;
|
|
}
|
|
|
|
// Streaming generation
|
|
printf("\n─── OUTPUT ───────────────────────────────────────────────────\n");
|
|
fflush(stdout);
|
|
fprintf(stderr, "[DBG] calling generate_stream\n"); fflush(stderr);
|
|
int gen_count = 0;
|
|
transformer.generate_stream(
|
|
tokens, icfg.max_tokens,
|
|
icfg.temperature, icfg.top_p, icfg.top_k,
|
|
[&](int32_t token_id) -> bool {
|
|
if (g_interrupted) return false;
|
|
|
|
// Skip special tokens (control tokens, template markers)
|
|
if (tokenizer.is_special(token_id)) return true;
|
|
|
|
std::string piece = tokenizer.decode_token(token_id);
|
|
|
|
// Skip tokens that look like template markers
|
|
if (piece.size() > 2 && piece[0] == '<' && piece[piece.size()-1] == '>')
|
|
return true;
|
|
|
|
printf("%s", piece.c_str());
|
|
fflush(stdout);
|
|
gen_count++;
|
|
|
|
// INCREMENTAL PROFILING: dump CSV after each token
|
|
if (!icfg.profile_path.empty()) {
|
|
transformer.expert_cache_ref().dump_csv(icfg.profile_path.c_str());
|
|
}
|
|
return true;
|
|
}
|
|
);
|
|
fprintf(stderr, "[DBG] generate_stream returned, gen_count=%d\n", gen_count); fflush(stderr);
|
|
printf("\n──────────────────────────────────────────────────────────────\n");
|
|
printf("[GEN] %d tokens generated\n", gen_count);
|
|
};
|
|
|
|
// --- SERVE MODE: OpenAI-compatible API server ---
|
|
if (icfg.serve_mode) {
|
|
std::string mname = icfg.model_path;
|
|
size_t slash = mname.rfind('/');
|
|
if (slash != std::string::npos) mname = mname.substr(slash + 1);
|
|
size_t dot = mname.rfind('.');
|
|
if (dot != std::string::npos) mname = mname.substr(0, dot);
|
|
|
|
ix::Server server(icfg.serve_port, mname,
|
|
[&](const std::string& sys, const std::string& user,
|
|
int max_tok, float temp, float tp,
|
|
std::function<bool(const std::string&)> on_token) {
|
|
auto tokens = tmpl.format_ids(sys, user, tokenizer);
|
|
if (tokens.size() > (size_t)icfg.max_ctx - max_tok)
|
|
tokens.resize(icfg.max_ctx - max_tok);
|
|
transformer.generate_stream(
|
|
tokens, max_tok, temp, tp, icfg.top_k,
|
|
[&](int32_t token_id) -> bool {
|
|
if (tokenizer.is_special(token_id)) return true;
|
|
std::string piece = tokenizer.decode_token(token_id);
|
|
if (piece.size() > 2 && piece[0] == '<' && piece[piece.size()-1] == '>')
|
|
return true;
|
|
return on_token(piece);
|
|
}
|
|
);
|
|
}
|
|
);
|
|
server.run();
|
|
return 0;
|
|
}
|
|
if (icfg.interactive) {
|
|
// ─── INTERACTIVE CHAT ──────────────────────────────────────────────
|
|
printf("\n=== Interactive mode (Ctrl+C to exit) ===\n");
|
|
char line[4096];
|
|
while (!g_interrupted) {
|
|
printf("\n> ");
|
|
fflush(stdout);
|
|
if (!fgets(line, sizeof(line), stdin)) break;
|
|
|
|
// Strip newline
|
|
size_t len = strlen(line);
|
|
if (len > 0 && line[len - 1] == '\n') line[len - 1] = '\0';
|
|
if (strlen(line) == 0) continue;
|
|
|
|
if (strcmp(line, "/quit") == 0 || strcmp(line, "/exit") == 0) break;
|
|
if (strcmp(line, "/reset") == 0) {
|
|
// transformer.reset(); // DISABLED: multi-turn preserves KV cache
|
|
printf("[RESET] Context cleared\n");
|
|
continue;
|
|
}
|
|
|
|
run_inference(line);
|
|
// Multi-turn: KV cache preserved between turns
|
|
// Use /reset command to manually clear context
|
|
}
|
|
printf("\nGoodbye.\n");
|
|
} else {
|
|
// ─── SINGLE PROMPT ─────────────────────────────────────────────────
|
|
run_inference(icfg.prompt);
|
|
}
|
|
|
|
// ─── FINAL STATS ──────────────────────────────────────────────────────
|
|
if (!icfg.profile_path.empty()) {
|
|
transformer.expert_cache_ref().dump_csv(icfg.profile_path.c_str());
|
|
}
|
|
printf("\n=== Inference-X Unified — Session Stats ===\n");
|
|
ix::KernelDispatch::instance().print_stats();
|
|
if (fractal.enabled) fractal.print_stats();
|
|
printf("[IX] Backend: %s\n", ix::KernelDispatch::instance().backend_name());
|
|
|
|
return 0;
|
|
}
|