Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
380 lines
15 KiB
C++
380 lines
15 KiB
C++
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// INFERENCE-X — Tokenizer Engine
|
|
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
|
|
// Licensed under the Business Source License 1.1 (BSL-1.1)
|
|
// See LICENSE file for full terms.
|
|
//
|
|
// INTELLECTUAL PROPERTY PROTECTION:
|
|
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
|
|
// - GitHub: github.com/ElmadaniS/inference-x
|
|
// - Author: Salka Elmadani | Morocco
|
|
//
|
|
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
|
|
// incorporates, embeds, distributes, or commercially uses Inference-X
|
|
// or any derivative work without explicit written authorization from
|
|
// the copyright holder is in violation of BSL-1.1 and applicable
|
|
// intellectual property laws. This includes but is not limited to:
|
|
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
|
|
//
|
|
// Contact: Elmadani.SALKA@proton.me for licensing.
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
#pragma once
|
|
|
|
// Inference-X Tokenizer — Salka Elmadani
|
|
#define IX_TOKENIZER_MARK "IX-TOK"
|
|
|
|
|
|
#include "gguf.h"

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <unordered_map>
#include <vector>
|
|
|
|
namespace ix {
|
|
|
|
class Tokenizer {
|
|
public:
|
|
// Load from GGUF metadata
|
|
bool load(const GGUF& gguf) {
|
|
// Get vocab tokens
|
|
auto* tokens = gguf.get_str_arr("tokenizer.ggml.tokens");
|
|
if (!tokens || tokens->empty()) {
|
|
printf("[TOK] ERROR: No tokenizer.ggml.tokens in GGUF\n");
|
|
return false;
|
|
}
|
|
|
|
vocab_ = *tokens;
|
|
vocab_size_ = (int)vocab_.size();
|
|
|
|
// Build token → id map
|
|
for (int i = 0; i < vocab_size_; ++i) {
|
|
token_to_id_[vocab_[i]] = i;
|
|
}
|
|
|
|
// Get BPE merges
|
|
auto* merges = gguf.get_str_arr("tokenizer.ggml.merges");
|
|
if (merges && !merges->empty()) {
|
|
for (int i = 0; i < (int)merges->size(); ++i) {
|
|
const std::string& m = (*merges)[i];
|
|
size_t sp = m.find(' ');
|
|
if (sp != std::string::npos) {
|
|
std::string a = m.substr(0, sp);
|
|
std::string b = m.substr(sp + 1);
|
|
merge_rank_[a + " " + b] = i;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Get token types
|
|
auto* types = gguf.get_i32_arr("tokenizer.ggml.token_type");
|
|
if (types) token_types_ = *types;
|
|
|
|
// Special tokens
|
|
bos_id_ = (int)gguf.get_u32("tokenizer.ggml.bos_token_id", 1);
|
|
eos_id_ = (int)gguf.get_u32("tokenizer.ggml.eos_token_id", 2);
|
|
pad_id_ = (int)gguf.get_u32("tokenizer.ggml.padding_token_id", 0);
|
|
|
|
// Check for special token strings in vocab
|
|
auto find_tok = [&](const std::string& s) -> int {
|
|
auto it = token_to_id_.find(s);
|
|
return it != token_to_id_.end() ? it->second : -1;
|
|
};
|
|
|
|
// Common special tokens
|
|
if (find_tok("<|begin▁of▁sentence|>") >= 0) bos_id_ = find_tok("<|begin▁of▁sentence|>");
|
|
if (find_tok("<|end▁of▁sentence|>") >= 0) eos_id_ = find_tok("<|end▁of▁sentence|>");
|
|
if (find_tok("<|im_start|>") >= 0) im_start_id_ = find_tok("<|im_start|>");
|
|
if (find_tok("<|im_end|>") >= 0) im_end_id_ = find_tok("<|im_end|>");
|
|
|
|
// Detect GPT-2 byte-level BPE
|
|
is_byte_level_ = (token_to_id_.count("\xC4\xA0") > 0);
|
|
std::string tok_model = gguf.get_str("tokenizer.ggml.model", "");
|
|
if (tok_model == "gpt2") is_byte_level_ = true;
|
|
if (is_byte_level_) printf("[TOK] Byte-level BPE detected\n");
|
|
|
|
printf("[TOK] Loaded: vocab=%d, merges=%zu, bos=%d, eos=%d\n",
|
|
vocab_size_, merge_rank_.size(), bos_id_, eos_id_);
|
|
|
|
return true;
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// ENCODE — text → token IDs
|
|
// Uses byte-fallback BPE: first split to bytes, then merge greedily
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
std::vector<int> encode(const std::string& text) const {
|
|
if (text.empty()) return {};
|
|
|
|
// If no merges, use byte-level encoding
|
|
if (merge_rank_.empty()) {
|
|
return encode_bytes(text);
|
|
}
|
|
|
|
// Pre-tokenize: split on whitespace/punctuation boundaries
|
|
std::vector<std::string> words = pretokenize(text);
|
|
|
|
std::vector<int> ids;
|
|
for (const auto& word : words) {
|
|
auto word_ids = encode_word(word);
|
|
ids.insert(ids.end(), word_ids.begin(), word_ids.end());
|
|
}
|
|
return ids;
|
|
}
|
|
|
|
// Encode with BOS prefix
|
|
std::vector<int> encode_with_bos(const std::string& text) const {
|
|
auto ids = encode(text);
|
|
ids.insert(ids.begin(), bos_id_);
|
|
return ids;
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// DECODE — token IDs → text
|
|
// Handles: byte tokens (<0xNN>), SentencePiece (▁), GPT-2 byte-level BPE
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
std::string decode(const std::vector<int>& ids) const {
|
|
std::string result;
|
|
for (int id : ids) {
|
|
if (id < 0 || id >= vocab_size_) continue;
|
|
std::string tok = vocab_[id];
|
|
|
|
// Handle byte tokens: <0xNN>
|
|
if (tok.size() == 6 && tok[0] == '<' && tok[1] == '0' && tok[2] == 'x') {
|
|
int byte_val = 0;
|
|
if (sscanf(tok.c_str(), "<0x%02X>", &byte_val) == 1) {
|
|
result += (char)byte_val;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// GPT-2 byte-level BPE: full Unicode→byte decode
|
|
if (is_byte_level_) {
|
|
std::string out;
|
|
out.reserve(tok.size());
|
|
for (size_t i = 0; i < tok.size(); ) {
|
|
uint8_t c = (uint8_t)tok[i];
|
|
if (c >= 0xC4 && c <= 0xC7 && i + 1 < tok.size()) {
|
|
uint8_t c2 = (uint8_t)tok[i+1];
|
|
uint32_t cp = ((c & 0x1F) << 6) | (c2 & 0x3F);
|
|
if (cp >= 0x100 && cp <= 0x1FF) {
|
|
out.push_back((char)(cp - 0x100));
|
|
i += 2; continue;
|
|
}
|
|
int byte = gpt2_unicode_to_byte(cp);
|
|
if (byte >= 0) { out.push_back((char)byte); i += 2; continue; }
|
|
}
|
|
// SentencePiece ▁ → space
|
|
if (c == 0xE2 && i + 2 < tok.size() &&
|
|
(uint8_t)tok[i+1] == 0x96 && (uint8_t)tok[i+2] == 0x81) {
|
|
out += ' '; i += 3; continue;
|
|
}
|
|
out.push_back(tok[i]); i++;
|
|
}
|
|
result += out;
|
|
} else {
|
|
// Non-byte-level: SentencePiece + basic GPT-2 markers
|
|
std::string out;
|
|
for (size_t i = 0; i < tok.size(); ) {
|
|
unsigned char c = tok[i];
|
|
if (c == 0xC4 && i+1 < tok.size() && (unsigned char)tok[i+1] == 0xA0)
|
|
{ out += ' '; i += 2; }
|
|
else if (c == 0xE2 && i+2 < tok.size() && (unsigned char)tok[i+1] == 0x96 && (unsigned char)tok[i+2] == 0x81)
|
|
{ out += ' '; i += 3; }
|
|
else if (c == 0xC4 && i+1 < tok.size() && (unsigned char)tok[i+1] == 0x8A)
|
|
{ out += '\n'; i += 2; }
|
|
else { out += (char)c; i++; }
|
|
}
|
|
result += out;
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// GPT-2 byte-level BPE: Unicode codepoint → byte value
|
|
static int gpt2_unicode_to_byte(uint32_t cp) {
|
|
if (cp >= 0x21 && cp <= 0x7E) return (int)cp;
|
|
if (cp >= 0xA1 && cp <= 0xAC) return (int)cp;
|
|
if (cp >= 0xAE && cp <= 0xFF) return (int)cp;
|
|
if (cp >= 0x100 && cp <= 0x142) {
|
|
static const int map[] = {
|
|
0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
|
|
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
|
|
0x20,0x7F,
|
|
0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
|
|
0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
|
|
0xA0,0xAD,
|
|
};
|
|
int idx = (int)(cp - 0x100);
|
|
if (idx < (int)(sizeof(map)/sizeof(map[0]))) return map[idx];
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
std::string decode_token(int id) const {
|
|
return decode({id});
|
|
}
|
|
|
|
// Accessors
|
|
int vocab_size() const { return vocab_size_; }
|
|
int bos_id() const { return bos_id_; }
|
|
int eos_id() const { return eos_id_; }
|
|
int pad_id() const { return pad_id_; }
|
|
int im_start_id() const { return im_start_id_; }
|
|
int im_end_id() const { return im_end_id_; }
|
|
|
|
// Public token lookup
|
|
int find_token(const std::string& s) const {
|
|
auto it = token_to_id_.find(s);
|
|
return it != token_to_id_.end() ? it->second : -1;
|
|
}
|
|
|
|
bool is_special(int id) const {
|
|
if (id < 0 || id >= (int)token_types_.size()) return false;
|
|
return token_types_[id] != 1; // type 1 = normal, others = special
|
|
}
|
|
|
|
private:
|
|
std::vector<std::string> vocab_;
|
|
std::unordered_map<std::string, int> token_to_id_;
|
|
std::unordered_map<std::string, int> merge_rank_;
|
|
std::vector<int32_t> token_types_;
|
|
int vocab_size_ = 0;
|
|
int bos_id_ = 1;
|
|
int eos_id_ = 2;
|
|
int pad_id_ = 0;
|
|
int im_start_id_ = -1;
|
|
int im_end_id_ = -1;
|
|
bool is_byte_level_ = false;
|
|
|
|
// Pre-tokenize: split text into words
|
|
std::vector<std::string> pretokenize(const std::string& text) const {
|
|
std::vector<std::string> words;
|
|
std::string current;
|
|
|
|
for (size_t i = 0; i < text.size(); ) {
|
|
unsigned char c = text[i];
|
|
|
|
// UTF-8 character length
|
|
int clen = 1;
|
|
if ((c & 0x80) == 0) clen = 1;
|
|
else if ((c & 0xE0) == 0xC0) clen = 2;
|
|
else if ((c & 0xF0) == 0xE0) clen = 3;
|
|
else if ((c & 0xF8) == 0xF0) clen = 4;
|
|
|
|
std::string ch = text.substr(i, clen);
|
|
|
|
if (c == ' ' || c == '\n' || c == '\t' || c == '\r') {
|
|
if (!current.empty()) { words.push_back(current); current.clear(); }
|
|
current = ch;
|
|
words.push_back(current);
|
|
current.clear();
|
|
} else {
|
|
current += ch;
|
|
}
|
|
i += clen;
|
|
}
|
|
if (!current.empty()) words.push_back(current);
|
|
return words;
|
|
}
|
|
|
|
// BPE encode a single word
|
|
std::vector<int> encode_word(const std::string& word) const {
|
|
// Start with individual byte/character tokens
|
|
std::vector<std::string> symbols;
|
|
for (size_t i = 0; i < word.size(); ) {
|
|
unsigned char c = word[i];
|
|
int clen = 1;
|
|
if ((c & 0x80) == 0) clen = 1;
|
|
else if ((c & 0xE0) == 0xC0) clen = 2;
|
|
else if ((c & 0xF0) == 0xE0) clen = 3;
|
|
else if ((c & 0xF8) == 0xF0) clen = 4;
|
|
symbols.push_back(word.substr(i, clen));
|
|
i += clen;
|
|
}
|
|
|
|
// Iteratively apply BPE merges
|
|
while (symbols.size() > 1) {
|
|
int best_rank = INT32_MAX;
|
|
int best_pos = -1;
|
|
|
|
for (int i = 0; i < (int)symbols.size() - 1; ++i) {
|
|
std::string pair = symbols[i] + " " + symbols[i + 1];
|
|
auto it = merge_rank_.find(pair);
|
|
if (it != merge_rank_.end() && it->second < best_rank) {
|
|
best_rank = it->second;
|
|
best_pos = i;
|
|
}
|
|
}
|
|
|
|
if (best_pos < 0) break; // No more merges possible
|
|
|
|
// Apply merge
|
|
symbols[best_pos] = symbols[best_pos] + symbols[best_pos + 1];
|
|
symbols.erase(symbols.begin() + best_pos + 1);
|
|
}
|
|
|
|
// Convert symbols to IDs
|
|
std::vector<int> ids;
|
|
for (const auto& sym : symbols) {
|
|
auto it = token_to_id_.find(sym);
|
|
if (it != token_to_id_.end()) {
|
|
ids.push_back(it->second);
|
|
} else {
|
|
// Byte fallback: encode each byte as <0xNN>
|
|
for (unsigned char c : sym) {
|
|
char buf[8];
|
|
snprintf(buf, sizeof(buf), "<0x%02X>", c);
|
|
auto bit = token_to_id_.find(buf);
|
|
if (bit != token_to_id_.end()) {
|
|
ids.push_back(bit->second);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return ids;
|
|
}
|
|
|
|
// Byte-level encoding (fallback when no merges)
|
|
std::vector<int> encode_bytes(const std::string& text) const {
|
|
std::vector<int> ids;
|
|
// Try whole-string match first
|
|
auto it = token_to_id_.find(text);
|
|
if (it != token_to_id_.end()) {
|
|
ids.push_back(it->second);
|
|
return ids;
|
|
}
|
|
// Greedy forward match
|
|
size_t i = 0;
|
|
while (i < text.size()) {
|
|
int best_len = 0;
|
|
int best_id = -1;
|
|
int max_try = std::min((int)(text.size() - i), 32);
|
|
for (int len = max_try; len >= 1; --len) {
|
|
auto it2 = token_to_id_.find(text.substr(i, len));
|
|
if (it2 != token_to_id_.end()) {
|
|
best_len = len;
|
|
best_id = it2->second;
|
|
break;
|
|
}
|
|
}
|
|
if (best_id >= 0) {
|
|
ids.push_back(best_id);
|
|
i += best_len;
|
|
} else {
|
|
// Byte fallback
|
|
char buf[8];
|
|
snprintf(buf, sizeof(buf), "<0x%02X>", (unsigned char)text[i]);
|
|
auto bit = token_to_id_.find(buf);
|
|
if (bit != token_to_id_.end()) ids.push_back(bit->second);
|
|
i++;
|
|
}
|
|
}
|
|
return ids;
|
|
}
|
|
};
|
|
|
|
} // namespace ix
|