// inference-x/runtime/tokenizer.h
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Tokenizer Engine
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X Tokenizer — Salka Elmadani
#define IX_TOKENIZER_MARK "IX-TOK"
#include "gguf.h"
#include <string>
#include <vector>
#include <unordered_map>
#include <algorithm>
#include <cstdio>
#include <cstring>
namespace ix {
class Tokenizer {
public:
    // ─── Loading ─────────────────────────────────────────────────────────────
    /// Populate vocab, BPE merge ranks, token types and special-token ids from
    /// GGUF metadata. Returns false if the mandatory token list is missing.
    bool load(const GGUF& gguf) {
        // Vocabulary is mandatory; everything else below is optional metadata.
        auto* tokens = gguf.get_str_arr("tokenizer.ggml.tokens");
        if (!tokens || tokens->empty()) {
            printf("[TOK] ERROR: No tokenizer.ggml.tokens in GGUF\n");
            return false;
        }
        vocab_ = *tokens;
        vocab_size_ = (int)vocab_.size();
        // token string -> id (reserve to avoid rehashing while filling).
        token_to_id_.reserve(vocab_.size());
        for (int i = 0; i < vocab_size_; ++i) {
            token_to_id_[vocab_[i]] = i;
        }
        // BPE merges: a merge's rank is its list index; lower rank wins first.
        // Each entry is stored as "left right"; keying on the full string is
        // identical to splitting at the first space and re-joining.
        auto* merges = gguf.get_str_arr("tokenizer.ggml.merges");
        if (merges && !merges->empty()) {
            merge_rank_.reserve(merges->size());
            for (int i = 0; i < (int)merges->size(); ++i) {
                const std::string& m = (*merges)[i];
                if (m.find(' ') != std::string::npos) merge_rank_[m] = i;
            }
        }
        // Optional per-token type tags (type 1 = normal; others = special).
        auto* types = gguf.get_i32_arr("tokenizer.ggml.token_type");
        if (types) token_types_ = *types;
        // Special-token ids from metadata, with llama-style defaults.
        bos_id_ = (int)gguf.get_u32("tokenizer.ggml.bos_token_id", 1);
        eos_id_ = (int)gguf.get_u32("tokenizer.ggml.eos_token_id", 2);
        pad_id_ = (int)gguf.get_u32("tokenizer.ggml.padding_token_id", 0);
        // Override from well-known literal spellings when present in the vocab
        // (DeepSeek-style BOS/EOS, ChatML <|im_start|>/<|im_end|>).
        auto find_tok = [&](const std::string& s) -> int {
            auto it = token_to_id_.find(s);
            return it != token_to_id_.end() ? it->second : -1;
        };
        int id;
        if ((id = find_tok("<|begin▁of▁sentence|>")) >= 0) bos_id_ = id;
        if ((id = find_tok("<|end▁of▁sentence|>")) >= 0) eos_id_ = id;
        if ((id = find_tok("<|im_start|>")) >= 0) im_start_id_ = id;
        if ((id = find_tok("<|im_end|>")) >= 0) im_end_id_ = id;
        // GPT-2 byte-level BPE is signalled either by the "Ġ" token
        // (U+0120, UTF-8 C4 A0) or by the declared tokenizer model name.
        is_byte_level_ = (token_to_id_.count("\xC4\xA0") > 0);
        std::string tok_model = gguf.get_str("tokenizer.ggml.model", "");
        if (tok_model == "gpt2") is_byte_level_ = true;
        if (is_byte_level_) printf("[TOK] Byte-level BPE detected\n");
        printf("[TOK] Loaded: vocab=%d, merges=%zu, bos=%d, eos=%d\n",
               vocab_size_, merge_rank_.size(), bos_id_, eos_id_);
        return true;
    }

    // ─── Encoding: text -> token IDs ─────────────────────────────────────────
    /// Greedy byte-fallback BPE encode.
    /// NOTE(review): the pre-tokenizer is a simplified whitespace split and the
    /// input is NOT remapped through the GPT-2 byte<->unicode table before
    /// merging, so round-trips may be lossy on byte-level vocabs — confirm
    /// against the reference tokenizer for each supported model.
    std::vector<int> encode(const std::string& text) const {
        if (text.empty()) return {};
        // Without merge rules, fall back to greedy longest-match encoding.
        if (merge_rank_.empty()) return encode_bytes(text);
        std::vector<int> ids;
        for (const auto& word : pretokenize(text)) {
            auto word_ids = encode_word(word);
            ids.insert(ids.end(), word_ids.begin(), word_ids.end());
        }
        return ids;
    }

    /// encode() with the BOS token prepended.
    std::vector<int> encode_with_bos(const std::string& text) const {
        auto ids = encode(text);
        ids.insert(ids.begin(), bos_id_);
        return ids;
    }

    // ─── Decoding: token IDs -> text ─────────────────────────────────────────
    /// Handles <0xNN> byte-fallback tokens, GPT-2 byte-level BPE tokens and
    /// SentencePiece "▁" word-boundary markers. Out-of-range ids are skipped.
    std::string decode(const std::vector<int>& ids) const {
        std::string result;
        for (int id : ids) {
            if (id < 0 || id >= vocab_size_) continue;   // ignore bad ids
            const std::string& tok = vocab_[id];         // no per-token copy
            // Byte-fallback tokens spelled exactly "<0xNN>" emit the raw byte.
            if (tok.size() == 6 && tok[0] == '<' && tok[1] == '0' && tok[2] == 'x') {
                int byte_val = 0;
                if (sscanf(tok.c_str(), "<0x%02X>", &byte_val) == 1) {
                    result += (char)byte_val;
                    continue;
                }
            }
            if (is_byte_level_) {
                // GPT-2 byte-level BPE stores each original byte as a printable
                // Unicode codepoint; undo that mapping here.
                std::string out;
                out.reserve(tok.size());
                for (size_t i = 0; i < tok.size(); ) {
                    uint8_t c = (uint8_t)tok[i];
                    // Leads 0xC2..0xC7 cover codepoints U+0080..U+01FF — the
                    // full range the GPT-2 byte<->unicode table can produce.
                    // (BUGFIX: 0xC2/0xC3 were previously unhandled, so kept
                    // bytes 0xA1..0xFF never decoded back to single bytes.)
                    if (c >= 0xC2 && c <= 0xC7 && i + 1 < tok.size()) {
                        uint8_t c2 = (uint8_t)tok[i + 1];
                        uint32_t cp = ((uint32_t)(c & 0x1F) << 6) | (c2 & 0x3F);
                        // BUGFIX: always consult the inverse table. The old
                        // shortcut "cp - 0x100" is wrong for cp >= U+0121
                        // (e.g. U+0122 is byte 0x80, not 0x22), which corrupted
                        // every decoded UTF-8 continuation byte 0x80..0x9F.
                        int byte = gpt2_unicode_to_byte(cp);
                        if (byte >= 0) { out.push_back((char)byte); i += 2; continue; }
                    }
                    // SentencePiece word-boundary marker U+2581 -> space.
                    if (c == 0xE2 && i + 2 < tok.size() &&
                        (uint8_t)tok[i+1] == 0x96 && (uint8_t)tok[i+2] == 0x81) {
                        out += ' '; i += 3; continue;
                    }
                    out.push_back(tok[i]); i++;   // pass anything else through
                }
                result += out;
            } else {
                // Non-byte-level vocab: translate the common markers only —
                // "Ġ" (C4 A0) -> space, "▁" (E2 96 81) -> space, "Ċ" (C4 8A) -> '\n'.
                std::string out;
                for (size_t i = 0; i < tok.size(); ) {
                    unsigned char c = tok[i];
                    if (c == 0xC4 && i + 1 < tok.size() && (unsigned char)tok[i+1] == 0xA0)
                    { out += ' '; i += 2; }
                    else if (c == 0xE2 && i + 2 < tok.size() &&
                             (unsigned char)tok[i+1] == 0x96 && (unsigned char)tok[i+2] == 0x81)
                    { out += ' '; i += 3; }
                    else if (c == 0xC4 && i + 1 < tok.size() && (unsigned char)tok[i+1] == 0x8A)
                    { out += '\n'; i += 2; }
                    else { out += (char)c; i++; }
                }
                result += out;
            }
        }
        return result;
    }

    /// Inverse of GPT-2's bytes_to_unicode(): kept (printable) codepoints map
    /// to themselves; the 68 remapped bytes live at U+0100..U+0143.
    /// Returns -1 for codepoints outside the mapping.
    static int gpt2_unicode_to_byte(uint32_t cp) {
        // Bytes GPT-2 keeps as-is: printable ASCII and most of Latin-1.
        if (cp >= 0x21 && cp <= 0x7E) return (int)cp;
        if (cp >= 0xA1 && cp <= 0xAC) return (int)cp;
        if (cp >= 0xAE && cp <= 0xFF) return (int)cp;
        // Remaining 68 bytes are shifted to U+0100 + running index, in
        // ascending byte order: 0x00-0x20, 0x7F-0xA0, 0xAD.
        // BUGFIX: upper bound was 0x142, silently dropping byte 0xAD (U+0143).
        if (cp >= 0x100 && cp <= 0x143) {
            static const int map[] = {
                0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
                0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
                0x20,0x7F,
                0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
                0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
                0xA0,0xAD,
            };
            int idx = (int)(cp - 0x100);
            if (idx < (int)(sizeof(map)/sizeof(map[0]))) return map[idx];
        }
        return -1;
    }

    /// Decode a single token id.
    std::string decode_token(int id) const {
        return decode({id});
    }

    // ─── Accessors ───────────────────────────────────────────────────────────
    int vocab_size() const { return vocab_size_; }
    int bos_id() const { return bos_id_; }
    int eos_id() const { return eos_id_; }
    int pad_id() const { return pad_id_; }
    int im_start_id() const { return im_start_id_; }
    int im_end_id() const { return im_end_id_; }

    /// Exact vocab lookup; returns -1 when the string is not a token.
    int find_token(const std::string& s) const {
        auto it = token_to_id_.find(s);
        return it != token_to_id_.end() ? it->second : -1;
    }

    /// True when the token's GGUF type tag marks it as non-normal
    /// (control/unknown/byte/...). Ids without a type tag count as normal.
    bool is_special(int id) const {
        if (id < 0 || id >= (int)token_types_.size()) return false;
        return token_types_[id] != 1; // type 1 = normal, others = special
    }

private:
    std::vector<std::string> vocab_;                // id -> token string
    std::unordered_map<std::string, int> token_to_id_;
    std::unordered_map<std::string, int> merge_rank_; // "left right" -> rank
    std::vector<int32_t> token_types_;              // GGUF per-token type tags
    int vocab_size_ = 0;
    int bos_id_ = 1;
    int eos_id_ = 2;
    int pad_id_ = 0;
    int im_start_id_ = -1;                          // -1 = not in this vocab
    int im_end_id_ = -1;
    bool is_byte_level_ = false;                    // GPT-2 style byte-level BPE

    /// Byte length of a UTF-8 sequence given its lead byte; returns 1 for
    /// ASCII and for invalid leads (stray continuation bytes advance by one).
    static size_t utf8_len(unsigned char c) {
        if ((c & 0x80) == 0) return 1;
        if ((c & 0xE0) == 0xC0) return 2;
        if ((c & 0xF0) == 0xE0) return 3;
        if ((c & 0xF8) == 0xF0) return 4;
        return 1;
    }

    /// Split text into units: runs of non-whitespace characters, plus each
    /// individual whitespace character as its own unit. Never splits a UTF-8
    /// sequence mid-character.
    std::vector<std::string> pretokenize(const std::string& text) const {
        std::vector<std::string> words;
        std::string current;
        for (size_t i = 0; i < text.size(); ) {
            unsigned char c = text[i];
            size_t clen = utf8_len(c);
            std::string ch = text.substr(i, clen);
            if (c == ' ' || c == '\n' || c == '\t' || c == '\r') {
                if (!current.empty()) { words.push_back(current); current.clear(); }
                words.push_back(ch);   // whitespace char stands alone
            } else {
                current += ch;
            }
            i += clen;
        }
        if (!current.empty()) words.push_back(current);
        return words;
    }

    /// BPE-encode one pre-tokenized word: start from single UTF-8 characters,
    /// repeatedly apply the lowest-ranked applicable merge, then map the
    /// surviving symbols to ids (with <0xNN> byte fallback for unknowns).
    std::vector<int> encode_word(const std::string& word) const {
        std::vector<std::string> symbols;
        for (size_t i = 0; i < word.size(); ) {
            size_t clen = utf8_len((unsigned char)word[i]);
            symbols.push_back(word.substr(i, clen));
            i += clen;
        }
        // O(n^2) greedy merge loop — fine for word-sized inputs.
        while (symbols.size() > 1) {
            int best_rank = INT32_MAX;
            int best_pos = -1;
            for (int i = 0; i + 1 < (int)symbols.size(); ++i) {
                auto it = merge_rank_.find(symbols[i] + " " + symbols[i + 1]);
                if (it != merge_rank_.end() && it->second < best_rank) {
                    best_rank = it->second;
                    best_pos = i;
                }
            }
            if (best_pos < 0) break;   // no applicable merge left
            symbols[best_pos] += symbols[best_pos + 1];
            symbols.erase(symbols.begin() + best_pos + 1);
        }
        std::vector<int> ids;
        ids.reserve(symbols.size());
        for (const auto& sym : symbols) {
            auto it = token_to_id_.find(sym);
            if (it != token_to_id_.end()) {
                ids.push_back(it->second);
                continue;
            }
            // Unknown symbol: fall back to one <0xNN> token per byte.
            // Bytes with no fallback token in the vocab are silently dropped
            // (matches prior behavior).
            for (unsigned char b : sym) {
                char buf[8];
                snprintf(buf, sizeof(buf), "<0x%02X>", b);
                auto bit = token_to_id_.find(buf);
                if (bit != token_to_id_.end()) ids.push_back(bit->second);
            }
        }
        return ids;
    }

    /// Merge-free fallback encoder: greedy longest-match against the vocab
    /// (capped at 32 bytes per attempt), with <0xNN> byte fallback.
    std::vector<int> encode_bytes(const std::string& text) const {
        std::vector<int> ids;
        // Whole string may itself be a single token.
        auto it = token_to_id_.find(text);
        if (it != token_to_id_.end()) {
            ids.push_back(it->second);
            return ids;
        }
        size_t i = 0;
        while (i < text.size()) {
            int best_len = 0;
            int best_id = -1;
            int max_try = std::min((int)(text.size() - i), 32);
            for (int len = max_try; len >= 1; --len) {
                auto it2 = token_to_id_.find(text.substr(i, len));
                if (it2 != token_to_id_.end()) {
                    best_len = len;
                    best_id = it2->second;
                    break;
                }
            }
            if (best_id >= 0) {
                ids.push_back(best_id);
                i += (size_t)best_len;
            } else {
                // Byte fallback; a byte with no <0xNN> token is dropped.
                char buf[8];
                snprintf(buf, sizeof(buf), "<0x%02X>", (unsigned char)text[i]);
                auto bit = token_to_id_.find(buf);
                if (bit != token_to_id_.end()) ids.push_back(bit->second);
                i++;
            }
        }
        return ids;
    }
};
} // namespace ix