// inference-x/runtime/tokenizer.h
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Tokenizer Engine
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X Tokenizer — Salka Elmadani
#define IX_TOKENIZER_MARK "IX-TOK"
#include "gguf.h"
#include <string>
#include <vector>
#include <unordered_map>
#include <algorithm>
#include <cstdio>
#include <cstring>
namespace ix {
class Tokenizer {
public:
    // ─── Loading ─────────────────────────────────────────────────────────────
    /// Populate vocab, BPE merge ranks, token types and special-token ids from
    /// GGUF metadata. Returns false if the mandatory token list is missing.
    bool load(const GGUF& gguf) {
        // Vocabulary is mandatory; everything else below is optional metadata.
        auto* tokens = gguf.get_str_arr("tokenizer.ggml.tokens");
        if (!tokens || tokens->empty()) {
            printf("[TOK] ERROR: No tokenizer.ggml.tokens in GGUF\n");
            return false;
        }
        vocab_ = *tokens;
        vocab_size_ = (int)vocab_.size();
        // token string -> id (reserve to avoid rehashing while filling).
        token_to_id_.reserve(vocab_.size());
        for (int i = 0; i < vocab_size_; ++i) {
            token_to_id_[vocab_[i]] = i;
        }
        // BPE merges: a merge's rank is its list index; lower rank wins first.
        // Each entry is stored as "left right"; keying on the full string is
        // identical to splitting at the first space and re-joining.
        auto* merges = gguf.get_str_arr("tokenizer.ggml.merges");
        if (merges && !merges->empty()) {
            merge_rank_.reserve(merges->size());
            for (int i = 0; i < (int)merges->size(); ++i) {
                const std::string& m = (*merges)[i];
                if (m.find(' ') != std::string::npos) merge_rank_[m] = i;
            }
        }
        // Optional per-token type tags (type 1 = normal; others = special).
        auto* types = gguf.get_i32_arr("tokenizer.ggml.token_type");
        if (types) token_types_ = *types;
        // Special-token ids from metadata, with llama-style defaults.
        bos_id_ = (int)gguf.get_u32("tokenizer.ggml.bos_token_id", 1);
        eos_id_ = (int)gguf.get_u32("tokenizer.ggml.eos_token_id", 2);
        pad_id_ = (int)gguf.get_u32("tokenizer.ggml.padding_token_id", 0);
        // Override from well-known literal spellings when present in the vocab
        // (DeepSeek-style BOS/EOS, ChatML <|im_start|>/<|im_end|>).
        auto find_tok = [&](const std::string& s) -> int {
            auto it = token_to_id_.find(s);
            return it != token_to_id_.end() ? it->second : -1;
        };
        int id;
        if ((id = find_tok("<|begin▁of▁sentence|>")) >= 0) bos_id_ = id;
        if ((id = find_tok("<|end▁of▁sentence|>")) >= 0) eos_id_ = id;
        if ((id = find_tok("<|im_start|>")) >= 0) im_start_id_ = id;
        if ((id = find_tok("<|im_end|>")) >= 0) im_end_id_ = id;
        // GPT-2 byte-level BPE is signalled either by the "Ġ" token
        // (U+0120, UTF-8 C4 A0) or by the declared tokenizer model name.
        is_byte_level_ = (token_to_id_.count("\xC4\xA0") > 0);
        std::string tok_model = gguf.get_str("tokenizer.ggml.model", "");
        if (tok_model == "gpt2") is_byte_level_ = true;
        if (is_byte_level_) printf("[TOK] Byte-level BPE detected\n");
        printf("[TOK] Loaded: vocab=%d, merges=%zu, bos=%d, eos=%d\n",
               vocab_size_, merge_rank_.size(), bos_id_, eos_id_);
        return true;
    }

    // ─── Encoding: text -> token IDs ─────────────────────────────────────────
    /// Greedy byte-fallback BPE encode.
    /// NOTE(review): the pre-tokenizer is a simplified whitespace split and the
    /// input is NOT remapped through the GPT-2 byte<->unicode table before
    /// merging, so round-trips may be lossy on byte-level vocabs — confirm
    /// against the reference tokenizer for each supported model.
    std::vector<int> encode(const std::string& text) const {
        if (text.empty()) return {};
        // Without merge rules, fall back to greedy longest-match encoding.
        if (merge_rank_.empty()) return encode_bytes(text);
        std::vector<int> ids;
        for (const auto& word : pretokenize(text)) {
            auto word_ids = encode_word(word);
            ids.insert(ids.end(), word_ids.begin(), word_ids.end());
        }
        return ids;
    }

    /// encode() with the BOS token prepended.
    std::vector<int> encode_with_bos(const std::string& text) const {
        auto ids = encode(text);
        ids.insert(ids.begin(), bos_id_);
        return ids;
    }

    // ─── Decoding: token IDs -> text ─────────────────────────────────────────
    /// Handles <0xNN> byte-fallback tokens, GPT-2 byte-level BPE tokens and
    /// SentencePiece "▁" word-boundary markers. Out-of-range ids are skipped.
    std::string decode(const std::vector<int>& ids) const {
        std::string result;
        for (int id : ids) {
            if (id < 0 || id >= vocab_size_) continue;   // ignore bad ids
            const std::string& tok = vocab_[id];         // no per-token copy
            // Byte-fallback tokens spelled exactly "<0xNN>" emit the raw byte.
            if (tok.size() == 6 && tok[0] == '<' && tok[1] == '0' && tok[2] == 'x') {
                int byte_val = 0;
                if (sscanf(tok.c_str(), "<0x%02X>", &byte_val) == 1) {
                    result += (char)byte_val;
                    continue;
                }
            }
            if (is_byte_level_) {
                // GPT-2 byte-level BPE stores each original byte as a printable
                // Unicode codepoint; undo that mapping here.
                std::string out;
                out.reserve(tok.size());
                for (size_t i = 0; i < tok.size(); ) {
                    uint8_t c = (uint8_t)tok[i];
                    // Leads 0xC2..0xC7 cover codepoints U+0080..U+01FF — the
                    // full range the GPT-2 byte<->unicode table can produce.
                    // (BUGFIX: 0xC2/0xC3 were previously unhandled, so kept
                    // bytes 0xA1..0xFF never decoded back to single bytes.)
                    if (c >= 0xC2 && c <= 0xC7 && i + 1 < tok.size()) {
                        uint8_t c2 = (uint8_t)tok[i + 1];
                        uint32_t cp = ((uint32_t)(c & 0x1F) << 6) | (c2 & 0x3F);
                        // BUGFIX: always consult the inverse table. The old
                        // shortcut "cp - 0x100" is wrong for cp >= U+0121
                        // (e.g. U+0122 is byte 0x80, not 0x22), which corrupted
                        // every decoded UTF-8 continuation byte 0x80..0x9F.
                        int byte = gpt2_unicode_to_byte(cp);
                        if (byte >= 0) { out.push_back((char)byte); i += 2; continue; }
                    }
                    // SentencePiece word-boundary marker U+2581 -> space.
                    if (c == 0xE2 && i + 2 < tok.size() &&
                        (uint8_t)tok[i+1] == 0x96 && (uint8_t)tok[i+2] == 0x81) {
                        out += ' '; i += 3; continue;
                    }
                    out.push_back(tok[i]); i++;   // pass anything else through
                }
                result += out;
            } else {
                // Non-byte-level vocab: translate the common markers only —
                // "Ġ" (C4 A0) -> space, "▁" (E2 96 81) -> space, "Ċ" (C4 8A) -> '\n'.
                std::string out;
                for (size_t i = 0; i < tok.size(); ) {
                    unsigned char c = tok[i];
                    if (c == 0xC4 && i + 1 < tok.size() && (unsigned char)tok[i+1] == 0xA0)
                    { out += ' '; i += 2; }
                    else if (c == 0xE2 && i + 2 < tok.size() &&
                             (unsigned char)tok[i+1] == 0x96 && (unsigned char)tok[i+2] == 0x81)
                    { out += ' '; i += 3; }
                    else if (c == 0xC4 && i + 1 < tok.size() && (unsigned char)tok[i+1] == 0x8A)
                    { out += '\n'; i += 2; }
                    else { out += (char)c; i++; }
                }
                result += out;
            }
        }
        return result;
    }

    /// Inverse of GPT-2's bytes_to_unicode(): kept (printable) codepoints map
    /// to themselves; the 68 remapped bytes live at U+0100..U+0143.
    /// Returns -1 for codepoints outside the mapping.
    static int gpt2_unicode_to_byte(uint32_t cp) {
        // Bytes GPT-2 keeps as-is: printable ASCII and most of Latin-1.
        if (cp >= 0x21 && cp <= 0x7E) return (int)cp;
        if (cp >= 0xA1 && cp <= 0xAC) return (int)cp;
        if (cp >= 0xAE && cp <= 0xFF) return (int)cp;
        // Remaining 68 bytes are shifted to U+0100 + running index, in
        // ascending byte order: 0x00-0x20, 0x7F-0xA0, 0xAD.
        // BUGFIX: upper bound was 0x142, silently dropping byte 0xAD (U+0143).
        if (cp >= 0x100 && cp <= 0x143) {
            static const int map[] = {
                0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
                0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
                0x20,0x7F,
                0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
                0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
                0xA0,0xAD,
            };
            int idx = (int)(cp - 0x100);
            if (idx < (int)(sizeof(map)/sizeof(map[0]))) return map[idx];
        }
        return -1;
    }

    /// Decode a single token id.
    std::string decode_token(int id) const {
        return decode({id});
    }

    // ─── Accessors ───────────────────────────────────────────────────────────
    int vocab_size() const { return vocab_size_; }
    int bos_id() const { return bos_id_; }
    int eos_id() const { return eos_id_; }
    int pad_id() const { return pad_id_; }
    int im_start_id() const { return im_start_id_; }
    int im_end_id() const { return im_end_id_; }

    /// Exact vocab lookup; returns -1 when the string is not a token.
    int find_token(const std::string& s) const {
        auto it = token_to_id_.find(s);
        return it != token_to_id_.end() ? it->second : -1;
    }

    /// True when the token's GGUF type tag marks it as non-normal
    /// (control/unknown/byte/...). Ids without a type tag count as normal.
    bool is_special(int id) const {
        if (id < 0 || id >= (int)token_types_.size()) return false;
        return token_types_[id] != 1; // type 1 = normal, others = special
    }

private:
    std::vector<std::string> vocab_;                // id -> token string
    std::unordered_map<std::string, int> token_to_id_;
    std::unordered_map<std::string, int> merge_rank_; // "left right" -> rank
    std::vector<int32_t> token_types_;              // GGUF per-token type tags
    int vocab_size_ = 0;
    int bos_id_ = 1;
    int eos_id_ = 2;
    int pad_id_ = 0;
    int im_start_id_ = -1;                          // -1 = not in this vocab
    int im_end_id_ = -1;
    bool is_byte_level_ = false;                    // GPT-2 style byte-level BPE

    /// Byte length of a UTF-8 sequence given its lead byte; returns 1 for
    /// ASCII and for invalid leads (stray continuation bytes advance by one).
    static size_t utf8_len(unsigned char c) {
        if ((c & 0x80) == 0) return 1;
        if ((c & 0xE0) == 0xC0) return 2;
        if ((c & 0xF0) == 0xE0) return 3;
        if ((c & 0xF8) == 0xF0) return 4;
        return 1;
    }

    /// Split text into units: runs of non-whitespace characters, plus each
    /// individual whitespace character as its own unit. Never splits a UTF-8
    /// sequence mid-character.
    std::vector<std::string> pretokenize(const std::string& text) const {
        std::vector<std::string> words;
        std::string current;
        for (size_t i = 0; i < text.size(); ) {
            unsigned char c = text[i];
            size_t clen = utf8_len(c);
            std::string ch = text.substr(i, clen);
            if (c == ' ' || c == '\n' || c == '\t' || c == '\r') {
                if (!current.empty()) { words.push_back(current); current.clear(); }
                words.push_back(ch);   // whitespace char stands alone
            } else {
                current += ch;
            }
            i += clen;
        }
        if (!current.empty()) words.push_back(current);
        return words;
    }

    /// BPE-encode one pre-tokenized word: start from single UTF-8 characters,
    /// repeatedly apply the lowest-ranked applicable merge, then map the
    /// surviving symbols to ids (with <0xNN> byte fallback for unknowns).
    std::vector<int> encode_word(const std::string& word) const {
        std::vector<std::string> symbols;
        for (size_t i = 0; i < word.size(); ) {
            size_t clen = utf8_len((unsigned char)word[i]);
            symbols.push_back(word.substr(i, clen));
            i += clen;
        }
        // O(n^2) greedy merge loop — fine for word-sized inputs.
        while (symbols.size() > 1) {
            int best_rank = INT32_MAX;
            int best_pos = -1;
            for (int i = 0; i + 1 < (int)symbols.size(); ++i) {
                auto it = merge_rank_.find(symbols[i] + " " + symbols[i + 1]);
                if (it != merge_rank_.end() && it->second < best_rank) {
                    best_rank = it->second;
                    best_pos = i;
                }
            }
            if (best_pos < 0) break;   // no applicable merge left
            symbols[best_pos] += symbols[best_pos + 1];
            symbols.erase(symbols.begin() + best_pos + 1);
        }
        std::vector<int> ids;
        ids.reserve(symbols.size());
        for (const auto& sym : symbols) {
            auto it = token_to_id_.find(sym);
            if (it != token_to_id_.end()) {
                ids.push_back(it->second);
                continue;
            }
            // Unknown symbol: fall back to one <0xNN> token per byte.
            // Bytes with no fallback token in the vocab are silently dropped
            // (matches prior behavior).
            for (unsigned char b : sym) {
                char buf[8];
                snprintf(buf, sizeof(buf), "<0x%02X>", b);
                auto bit = token_to_id_.find(buf);
                if (bit != token_to_id_.end()) ids.push_back(bit->second);
            }
        }
        return ids;
    }

    /// Merge-free fallback encoder: greedy longest-match against the vocab
    /// (capped at 32 bytes per attempt), with <0xNN> byte fallback.
    std::vector<int> encode_bytes(const std::string& text) const {
        std::vector<int> ids;
        // Whole string may itself be a single token.
        auto it = token_to_id_.find(text);
        if (it != token_to_id_.end()) {
            ids.push_back(it->second);
            return ids;
        }
        size_t i = 0;
        while (i < text.size()) {
            int best_len = 0;
            int best_id = -1;
            int max_try = std::min((int)(text.size() - i), 32);
            for (int len = max_try; len >= 1; --len) {
                auto it2 = token_to_id_.find(text.substr(i, len));
                if (it2 != token_to_id_.end()) {
                    best_len = len;
                    best_id = it2->second;
                    break;
                }
            }
            if (best_id >= 0) {
                ids.push_back(best_id);
                i += (size_t)best_len;
            } else {
                // Byte fallback; a byte with no <0xNN> token is dropped.
                char buf[8];
                snprintf(buf, sizeof(buf), "<0x%02X>", (unsigned char)text[i]);
                auto bit = token_to_id_.find(buf);
                if (bit != token_to_id_.end()) ids.push_back(bit->second);
                i++;
            }
        }
        return ids;
    }
};
} // namespace ix