// ═══════════════════════════════════════════════════════════════════════════════ // INFERENCE-X — Tokenizer Engine // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // Licensed under the Business Source License 1.1 (BSL-1.1) // See LICENSE file for full terms. // // INTELLECTUAL PROPERTY PROTECTION: // - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026) // - GitHub: github.com/ElmadaniS/inference-x // - Author: Salka Elmadani | Morocco | Morocco // // MANUFACTURER NOTICE: Any manufacturer, company, or entity that // incorporates, embeds, distributes, or commercially uses Inference-X // or any derivative work without explicit written authorization from // the copyright holder is in violation of BSL-1.1 and applicable // intellectual property laws. This includes but is not limited to: // hardware vendors, cloud providers, SaaS platforms, and OEMs. // // Contact: Elmadani.SALKA@proton.me for licensing. // ═══════════════════════════════════════════════════════════════════════════════ #pragma once // Inference-X Tokenizer — Salka Elmadani #define IX_TOKENIZER_MARK "IX-TOK" #include "gguf.h" #include #include #include #include #include #include namespace ix { class Tokenizer { public: // Load from GGUF metadata bool load(const GGUF& gguf) { // Get vocab tokens auto* tokens = gguf.get_str_arr("tokenizer.ggml.tokens"); if (!tokens || tokens->empty()) { printf("[TOK] ERROR: No tokenizer.ggml.tokens in GGUF\n"); return false; } vocab_ = *tokens; vocab_size_ = (int)vocab_.size(); // Build token → id map for (int i = 0; i < vocab_size_; ++i) { token_to_id_[vocab_[i]] = i; } // Get BPE merges auto* merges = gguf.get_str_arr("tokenizer.ggml.merges"); if (merges && !merges->empty()) { for (int i = 0; i < (int)merges->size(); ++i) { const std::string& m = (*merges)[i]; size_t sp = m.find(' '); if (sp != std::string::npos) { std::string a = m.substr(0, sp); std::string b = m.substr(sp + 1); merge_rank_[a + " " + b] = i; } } } // Get token types auto* types = gguf.get_i32_arr("tokenizer.ggml.token_type"); if (types) token_types_ = *types; // Special tokens bos_id_ = (int)gguf.get_u32("tokenizer.ggml.bos_token_id", 1); eos_id_ = (int)gguf.get_u32("tokenizer.ggml.eos_token_id", 2); pad_id_ = (int)gguf.get_u32("tokenizer.ggml.padding_token_id", 0); // Check for special token strings in vocab auto find_tok = [&](const std::string& s) -> int { auto it = token_to_id_.find(s); return it != token_to_id_.end() ? it->second : -1; }; // Common special tokens if (find_tok("<|begin▁of▁sentence|>") >= 0) bos_id_ = find_tok("<|begin▁of▁sentence|>"); if (find_tok("<|end▁of▁sentence|>") >= 0) eos_id_ = find_tok("<|end▁of▁sentence|>"); if (find_tok("<|im_start|>") >= 0) im_start_id_ = find_tok("<|im_start|>"); if (find_tok("<|im_end|>") >= 0) im_end_id_ = find_tok("<|im_end|>"); // Detect GPT-2 byte-level BPE is_byte_level_ = (token_to_id_.count("\xC4\xA0") > 0); std::string tok_model = gguf.get_str("tokenizer.ggml.model", ""); if (tok_model == "gpt2") is_byte_level_ = true; if (is_byte_level_) printf("[TOK] Byte-level BPE detected\n"); printf("[TOK] Loaded: vocab=%d, merges=%zu, bos=%d, eos=%d\n", vocab_size_, merge_rank_.size(), bos_id_, eos_id_); return true; } // ═══════════════════════════════════════════════════════════════════════════ // ENCODE — text → token IDs // Uses byte-fallback BPE: first split to bytes, then merge greedily // ═══════════════════════════════════════════════════════════════════════════ std::vector encode(const std::string& text) const { if (text.empty()) return {}; // If no merges, use byte-level encoding if (merge_rank_.empty()) { return encode_bytes(text); } // Pre-tokenize: split on whitespace/punctuation boundaries std::vector words = pretokenize(text); std::vector ids; for (const auto& word : words) { auto word_ids = encode_word(word); ids.insert(ids.end(), word_ids.begin(), word_ids.end()); } return ids; } // Encode with BOS prefix std::vector encode_with_bos(const std::string& text) const { auto ids = encode(text); ids.insert(ids.begin(), bos_id_); return ids; } // ═══════════════════════════════════════════════════════════════════════════ // DECODE — token IDs → text // Handles: byte tokens (<0xNN>), SentencePiece (▁), GPT-2 byte-level BPE // ═══════════════════════════════════════════════════════════════════════════ std::string decode(const std::vector& ids) const { std::string result; for (int id : ids) { if (id < 0 || id >= vocab_size_) continue; std::string tok = vocab_[id]; // Handle byte tokens: <0xNN> if (tok.size() == 6 && tok[0] == '<' && tok[1] == '0' && tok[2] == 'x') { int byte_val = 0; if (sscanf(tok.c_str(), "<0x%02X>", &byte_val) == 1) { result += (char)byte_val; continue; } } // GPT-2 byte-level BPE: full Unicode→byte decode if (is_byte_level_) { std::string out; out.reserve(tok.size()); for (size_t i = 0; i < tok.size(); ) { uint8_t c = (uint8_t)tok[i]; if (c >= 0xC4 && c <= 0xC7 && i + 1 < tok.size()) { uint8_t c2 = (uint8_t)tok[i+1]; uint32_t cp = ((c & 0x1F) << 6) | (c2 & 0x3F); if (cp >= 0x100 && cp <= 0x1FF) { out.push_back((char)(cp - 0x100)); i += 2; continue; } int byte = gpt2_unicode_to_byte(cp); if (byte >= 0) { out.push_back((char)byte); i += 2; continue; } } // SentencePiece ▁ → space if (c == 0xE2 && i + 2 < tok.size() && (uint8_t)tok[i+1] == 0x96 && (uint8_t)tok[i+2] == 0x81) { out += ' '; i += 3; continue; } out.push_back(tok[i]); i++; } result += out; } else { // Non-byte-level: SentencePiece + basic GPT-2 markers std::string out; for (size_t i = 0; i < tok.size(); ) { unsigned char c = tok[i]; if (c == 0xC4 && i+1 < tok.size() && (unsigned char)tok[i+1] == 0xA0) { out += ' '; i += 2; } else if (c == 0xE2 && i+2 < tok.size() && (unsigned char)tok[i+1] == 0x96 && (unsigned char)tok[i+2] == 0x81) { out += ' '; i += 3; } else if (c == 0xC4 && i+1 < tok.size() && (unsigned char)tok[i+1] == 0x8A) { out += '\n'; i += 2; } else { out += (char)c; i++; } } result += out; } } return result; } // GPT-2 byte-level BPE: Unicode codepoint → byte value static int gpt2_unicode_to_byte(uint32_t cp) { if (cp >= 0x21 && cp <= 0x7E) return (int)cp; if (cp >= 0xA1 && cp <= 0xAC) return (int)cp; if (cp >= 0xAE && cp <= 0xFF) return (int)cp; if (cp >= 0x100 && cp <= 0x142) { static const int map[] = { 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, 0x20,0x7F, 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, 0xA0,0xAD, }; int idx = (int)(cp - 0x100); if (idx < (int)(sizeof(map)/sizeof(map[0]))) return map[idx]; } return -1; } std::string decode_token(int id) const { return decode({id}); } // Accessors int vocab_size() const { return vocab_size_; } int bos_id() const { return bos_id_; } int eos_id() const { return eos_id_; } int pad_id() const { return pad_id_; } int im_start_id() const { return im_start_id_; } int im_end_id() const { return im_end_id_; } // Public token lookup int find_token(const std::string& s) const { auto it = token_to_id_.find(s); return it != token_to_id_.end() ? it->second : -1; } bool is_special(int id) const { if (id < 0 || id >= (int)token_types_.size()) return false; return token_types_[id] != 1; // type 1 = normal, others = special } private: std::vector vocab_; std::unordered_map token_to_id_; std::unordered_map merge_rank_; std::vector token_types_; int vocab_size_ = 0; int bos_id_ = 1; int eos_id_ = 2; int pad_id_ = 0; int im_start_id_ = -1; int im_end_id_ = -1; bool is_byte_level_ = false; // Pre-tokenize: split text into words std::vector pretokenize(const std::string& text) const { std::vector words; std::string current; for (size_t i = 0; i < text.size(); ) { unsigned char c = text[i]; // UTF-8 character length int clen = 1; if ((c & 0x80) == 0) clen = 1; else if ((c & 0xE0) == 0xC0) clen = 2; else if ((c & 0xF0) == 0xE0) clen = 3; else if ((c & 0xF8) == 0xF0) clen = 4; std::string ch = text.substr(i, clen); if (c == ' ' || c == '\n' || c == '\t' || c == '\r') { if (!current.empty()) { words.push_back(current); current.clear(); } current = ch; words.push_back(current); current.clear(); } else { current += ch; } i += clen; } if (!current.empty()) words.push_back(current); return words; } // BPE encode a single word std::vector encode_word(const std::string& word) const { // Start with individual byte/character tokens std::vector symbols; for (size_t i = 0; i < word.size(); ) { unsigned char c = word[i]; int clen = 1; if ((c & 0x80) == 0) clen = 1; else if ((c & 0xE0) == 0xC0) clen = 2; else if ((c & 0xF0) == 0xE0) clen = 3; else if ((c & 0xF8) == 0xF0) clen = 4; symbols.push_back(word.substr(i, clen)); i += clen; } // Iteratively apply BPE merges while (symbols.size() > 1) { int best_rank = INT32_MAX; int best_pos = -1; for (int i = 0; i < (int)symbols.size() - 1; ++i) { std::string pair = symbols[i] + " " + symbols[i + 1]; auto it = merge_rank_.find(pair); if (it != merge_rank_.end() && it->second < best_rank) { best_rank = it->second; best_pos = i; } } if (best_pos < 0) break; // No more merges possible // Apply merge symbols[best_pos] = symbols[best_pos] + symbols[best_pos + 1]; symbols.erase(symbols.begin() + best_pos + 1); } // Convert symbols to IDs std::vector ids; for (const auto& sym : symbols) { auto it = token_to_id_.find(sym); if (it != token_to_id_.end()) { ids.push_back(it->second); } else { // Byte fallback: encode each byte as <0xNN> for (unsigned char c : sym) { char buf[8]; snprintf(buf, sizeof(buf), "<0x%02X>", c); auto bit = token_to_id_.find(buf); if (bit != token_to_id_.end()) { ids.push_back(bit->second); } } } } return ids; } // Byte-level encoding (fallback when no merges) std::vector encode_bytes(const std::string& text) const { std::vector ids; // Try whole-string match first auto it = token_to_id_.find(text); if (it != token_to_id_.end()) { ids.push_back(it->second); return ids; } // Greedy forward match size_t i = 0; while (i < text.size()) { int best_len = 0; int best_id = -1; int max_try = std::min((int)(text.size() - i), 32); for (int len = max_try; len >= 1; --len) { auto it2 = token_to_id_.find(text.substr(i, len)); if (it2 != token_to_id_.end()) { best_len = len; best_id = it2->second; break; } } if (best_id >= 0) { ids.push_back(best_id); i += best_len; } else { // Byte fallback char buf[8]; snprintf(buf, sizeof(buf), "<0x%02X>", (unsigned char)text[i]); auto bit = token_to_id_.find(buf); if (bit != token_to_id_.end()) ids.push_back(bit->second); i++; } } return ids; } }; } // namespace ix