// ═══════════════════════════════════════════════════════════════════════════════ // INFERENCE-X — GGUF Multi-Shard Model Parser // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // Licensed under the Business Source License 1.1 (BSL-1.1) // See LICENSE file for full terms. // // INTELLECTUAL PROPERTY PROTECTION: // - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026) // - GitHub: git.inference-x.com/salka/inference-x // - Author: Salka Elmadani | Morocco | Morocco // // MANUFACTURER NOTICE: Any manufacturer, company, or entity that // incorporates, embeds, distributes, or commercially uses Inference-X // or any derivative work without explicit written authorization from // the copyright holder is in violation of BSL-1.1 and applicable // intellectual property laws. This includes but is not limited to: // hardware vendors, cloud providers, SaaS platforms, and OEMs. // // Contact: Elmadani.SALKA@proton.me for licensing. // ═══════════════════════════════════════════════════════════════════════════════ #pragma once // Inference-X GGUF Parser — Salka Elmadani — Morocco #define IX_GGUF_WATERMARK "Inference-X-GGUF-935-Elmadani" #include "../core/z_core.h" #include #include #include #include #include #include #include #include #include #include #include namespace ix { // ═══════════════════════════════════════════════════════════════════════════════ // GGUF CONSTANTS // ═══════════════════════════════════════════════════════════════════════════════ static constexpr uint32_t GGUF_MAGIC = 0x46554747; // "GGUF" enum class GGUFType : uint32_t { UINT8 = 0, INT8 = 1, UINT16 = 2, INT16 = 3, UINT32 = 4, INT32 = 5, FLOAT32 = 6, BOOL = 7, STRING = 8, ARRAY = 9, UINT64 = 10, INT64 = 11, FLOAT64 = 12 }; // ═══════════════════════════════════════════════════════════════════════════════ // TENSOR INFO — Points into mmap'd shard // ═══════════════════════════════════════════════════════════════════════════════ struct TensorInfo { std::string name; uint32_t n_dims; uint64_t dims[4]; dtype type; uint64_t offset; // offset within shard's data segment uint64_t n_elements; uint64_t n_bytes; void* data; // pointer into mmap'd memory int shard_idx; // which shard this tensor lives in void compute_size() { n_elements = 1; for (uint32_t i = 0; i < n_dims; ++i) n_elements *= dims[i]; int bs = dtype_block_size(type); if (bs > 1) { n_bytes = (n_elements / bs) * dtype_size(type); } else { n_bytes = n_elements * dtype_size(type); } } }; // ═══════════════════════════════════════════════════════════════════════════════ // MMAP'D SHARD — One per GGUF file // ═══════════════════════════════════════════════════════════════════════════════ struct Shard { int fd = -1; uint8_t* data = nullptr; size_t size = 0; std::string path; bool open(const std::string& p) { path = p; fd = ::open(p.c_str(), O_RDONLY); if (fd < 0) return false; struct stat st; if (fstat(fd, &st) < 0) { close(); return false; } size = st.st_size; data = static_cast(mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0)); if (data == MAP_FAILED) { data = nullptr; close(); return false; } // MADV_RANDOM for MoE: experts accessed non-sequentially madvise(data, size, MADV_SEQUENTIAL); madvise(data, size, MADV_WILLNEED); printf("[GGUF] Preloading %.1f MB into RAM...\n", size / 1e6); return true; } void close() { if (data) { munmap(data, size); data = nullptr; } if (fd >= 0) { ::close(fd); fd = -1; } } ~Shard() { close(); } Shard() = default; Shard(Shard&& o) noexcept : fd(o.fd), data(o.data), size(o.size), path(std::move(o.path)) { o.fd = -1; o.data = nullptr; o.size = 0; } Shard& operator=(Shard&& o) noexcept { close(); fd = o.fd; data = o.data; size = o.size; path = std::move(o.path); o.fd = -1; o.data = nullptr; o.size = 0; return *this; } }; // ═══════════════════════════════════════════════════════════════════════════════ // GGUF MULTI-SHARD — Mmap-based zero-copy across N shards // ═══════════════════════════════════════════════════════════════════════════════ class GGUF { public: GGUF() = default; ~GGUF() { close(); } GGUF(const GGUF&) = delete; GGUF& operator=(const GGUF&) = delete; // ═══════════════════════════════════════════════════════════════════════════ // OPEN — Detects single vs multi-shard automatically // Pass any shard or the first shard; will discover the rest. // ═══════════════════════════════════════════════════════════════════════════ bool open(const std::string& path) { if (!signature::verify()) return false; // If path is a directory, find GGUF files inside std::string resolved = path; struct stat st; if (stat(path.c_str(), &st) == 0 && S_ISDIR(st.st_mode)) { // Glob for *.gguf in directory std::vector gguf_files; for (const auto& entry : std::filesystem::directory_iterator(path)) { if (entry.path().extension() == ".gguf") { gguf_files.push_back(entry.path().string()); } } std::sort(gguf_files.begin(), gguf_files.end()); if (gguf_files.empty()) { printf("[GGUF] ERROR: No .gguf files in directory: %s\n", path.c_str()); return false; } resolved = gguf_files[0]; // Use first shard for discovery printf("[GGUF] Directory mode: found %zu .gguf file(s)\n", gguf_files.size()); } // Discover all shards auto shard_paths = discover_shards(resolved); if (shard_paths.empty()) { shard_paths.push_back(resolved); // single file } printf("[GGUF] Loading %zu shard(s)...\n", shard_paths.size()); shards_.resize(shard_paths.size()); for (size_t i = 0; i < shard_paths.size(); ++i) { if (!shards_[i].open(shard_paths[i])) { printf("[GGUF] ERROR: Failed to mmap shard %zu: %s\n", i, shard_paths[i].c_str()); close(); return false; } printf("[GGUF] Shard %zu: %s (%.1f GB)\n", i, shard_paths[i].c_str(), shards_[i].size / 1e9); } // Parse metadata from shard 0 (contains all KV pairs) if (!parse_shard(0, true)) { printf("[GGUF] ERROR: Failed to parse metadata from shard 0\n"); close(); return false; } // Parse tensor index from all shards for (size_t i = 1; i < shards_.size(); ++i) { if (!parse_shard(i, false)) { printf("[GGUF] ERROR: Failed to parse shard %zu\n", i); close(); return false; } } printf("[GGUF] Total tensors: %zu across %zu shards\n", tensors_.size(), shards_.size()); // Validate tensor pointers against shard boundaries int bad = 0; for (size_t i = 0; i < tensors_.size(); ++i) { const auto& ti = tensors_[i]; if (!ti.data) { printf("[VALIDATE] WARN: %s has null data\n", ti.name.c_str()); bad++; continue; } uint8_t* start = static_cast(ti.data); uint8_t* end = start + ti.n_bytes; uint8_t* shard_start = shards_[ti.shard_idx].data; uint8_t* shard_end = shard_start + shards_[ti.shard_idx].size; if (start < shard_start || end > shard_end) { printf("[VALIDATE] BAD: %s type=%d(%s) nelem=%lu nbytes=%lu shard=%d off=%lu shard_sz=%zu overflow=%ld\n", ti.name.c_str(), (int)ti.type, dtype_name(ti.type), ti.n_elements, ti.n_bytes, ti.shard_idx, ti.offset, shards_[ti.shard_idx].size, (long)(end - shard_end)); bad++; } } if (bad > 0) printf("[VALIDATE] %d tensors have invalid pointers!\n", bad); else printf("[VALIDATE] All %zu tensors OK\n", tensors_.size()); return true; } void close() { shards_.clear(); tensors_.clear(); tensor_map_.clear(); meta_u32_.clear(); meta_u64_.clear(); meta_f32_.clear(); meta_str_.clear(); meta_str_arr_.clear(); meta_i32_arr_.clear(); } // ═══════════════════════════════════════════════════════════════════════════ // METADATA ACCESS // ═══════════════════════════════════════════════════════════════════════════ uint32_t get_u32(const std::string& k, uint32_t d = 0) const { auto it = meta_u32_.find(k); return it != meta_u32_.end() ? it->second : d; } uint64_t get_u64(const std::string& k, uint64_t d = 0) const { auto it = meta_u64_.find(k); return it != meta_u64_.end() ? it->second : d; } float get_f32(const std::string& k, float d = 0) const { auto it = meta_f32_.find(k); return it != meta_f32_.end() ? it->second : d; } std::string get_str(const std::string& k, const std::string& d = "") const { auto it = meta_str_.find(k); return it != meta_str_.end() ? it->second : d; } const std::vector* get_str_arr(const std::string& k) const { auto it = meta_str_arr_.find(k); return it != meta_str_arr_.end() ? &it->second : nullptr; } const std::vector* get_i32_arr(const std::string& k) const { auto it = meta_i32_arr_.find(k); return it != meta_i32_arr_.end() ? &it->second : nullptr; } // ═══════════════════════════════════════════════════════════════════════════ // TENSOR ACCESS // ═══════════════════════════════════════════════════════════════════════════ const TensorInfo* tensor(const std::string& name) const { auto it = tensor_map_.find(name); return it != tensor_map_.end() ? &tensors_[it->second] : nullptr; } const std::vector& tensors() const { return tensors_; } size_t num_shards() const { return shards_.size(); } // ═══════════════════════════════════════════════════════════════════════════ // PREFETCH — Hint kernel to preload expert pages // ═══════════════════════════════════════════════════════════════════════════ void prefetch_tensor(const std::string& name) const { const TensorInfo* ti = tensor(name); if (ti && ti->data) { madvise(const_cast(ti->data), ti->n_bytes, MADV_WILLNEED); } } void prefetch_experts(int layer, const std::vector& expert_ids) const { for (int eid __attribute__((unused)) : expert_ids) { // Expert tensors are 3D: [expert_ffn_dim, dim, n_experts] // We need to prefetch the slice for this expert std::string prefix = "blk." + std::to_string(layer) + "."; prefetch_tensor(prefix + "ffn_gate_exps.weight"); prefetch_tensor(prefix + "ffn_up_exps.weight"); prefetch_tensor(prefix + "ffn_down_exps.weight"); } } // ═══════════════════════════════════════════════════════════════════════════ // CONFIG EXTRACTION — Architecture-aware // ═══════════════════════════════════════════════════════════════════════════ Config extract_config() const { Config c; // Detect architecture — all model families std::string arch_str = get_str("general.architecture", "llama"); if (arch_str == "deepseek2") { c.arch = Architecture::DEEPSEEK2; c.activation = Activation::SILU; } else if (arch_str == "qwen2") { c.arch = Architecture::QWEN2; c.activation = Activation::SILU; } else if (arch_str == "phi3" || arch_str == "phi") { c.arch = Architecture::PHI3; c.activation = Activation::GELU; } else if (arch_str == "gemma" || arch_str == "gemma2") { c.arch = Architecture::GEMMA2; c.activation = Activation::GELU; } else if (arch_str == "starcoder2") { c.arch = Architecture::STARCODER2; c.activation = Activation::GELU; } else if (arch_str == "command-r" || arch_str == "cohere") { c.arch = Architecture::COMMAND_R; c.activation = Activation::SILU; } else { c.arch = Architecture::LLAMA; c.activation = Activation::SILU; } // Architecture prefix for KV lookups std::string pfx = arch_str + "."; // === Common === c.dim = get_u32(pfx + "embedding_length", 4096); c.n_layers = get_u32(pfx + "block_count", 32); c.n_heads = get_u32(pfx + "attention.head_count", 32); c.n_kv_heads = get_u32(pfx + "attention.head_count_kv", c.n_heads); c.vocab_size = get_u32(pfx + "vocab_size", 0); // Fallback: count tokenizer tokens if metadata key missing if (c.vocab_size == 0) { auto* toks = get_str_arr("tokenizer.ggml.tokens"); if (toks && !toks->empty()) { c.vocab_size = (uint32_t)toks->size(); fprintf(stderr, "[GGUF] vocab_size from token array: %u\n", c.vocab_size); } } // 0 = fixup from tokenizer c.max_seq_len = get_u32(pfx + "context_length", 4096); c.intermediate = get_u32(pfx + "feed_forward_length", 11008); // Sliding window attention c.sliding_window = get_u32(pfx + "attention.sliding_window", 0); c.rope_theta = get_f32(pfx + "rope.freq_base", 10000.0f); c.rms_norm_eps = get_f32(pfx + "attention.layer_norm_rms_epsilon", 1e-5f); // === MLA (DeepSeek V3) === if (c.arch == Architecture::DEEPSEEK2) { c.q_lora_rank = get_u32(pfx + "attention.q_lora_rank", 0); c.kv_lora_rank = get_u32(pfx + "attention.kv_lora_rank", 0); c.key_length = get_u32(pfx + "attention.key_length", 576); c.value_length = get_u32(pfx + "attention.value_length", 512); c.key_length_mla = get_u32(pfx + "attention.key_length_mla", 192); c.value_length_mla = get_u32(pfx + "attention.value_length_mla", 128); c.rope_dim = get_u32(pfx + "rope.dimension_count", 64); // === MoE === c.n_experts = get_u32(pfx + "expert_count", 0); c.n_experts_used = get_u32(pfx + "expert_used_count", 0); c.n_expert_shared = get_u32(pfx + "expert_shared_count", 0); c.expert_ffn_dim = get_u32(pfx + "expert_feed_forward_length", 2048); c.n_dense_layers = get_u32(pfx + "leading_dense_block_count", 1); c.n_expert_groups = get_u32(pfx + "expert_group_count", 1); c.n_expert_groups_used = get_u32(pfx + "expert_group_used_count", 1); c.expert_gating_func = get_u32(pfx + "expert_gating_func", 0); c.expert_weights_scale = get_f32(pfx + "expert_weights_scale", 1.0f); c.expert_weights_norm = get_u32(pfx + "expert_weights_norm", 0) != 0; // === YaRN RoPE scaling === c.rope_scaling_factor = get_f32(pfx + "rope.scaling.factor", 1.0f); c.rope_scaling_orig_ctx = get_u32(pfx + "rope.scaling.original_context_length", 4096); c.rope_yarn_beta_fast = get_f32(pfx + "rope.scaling.yarn_beta_fast", 32.0f); c.rope_yarn_beta_slow = get_f32(pfx + "rope.scaling.yarn_beta_slow", 1.0f); c.rope_yarn_log_mul = get_f32(pfx + "rope.scaling.yarn_log_multiplier", 0.1f); } // Universal adapter: logit caps, embedding scale c.attn_logit_softcap = get_f32(pfx + "attn_logit_softcapping", 0.0f); c.final_logit_softcap = get_f32(pfx + "final_logit_softcapping", 0.0f); // Gemma: embeddings scaled by sqrt(dim) if (arch_str == "gemma" || arch_str == "gemma2") { c.embed_scale_sqrt_dim = true; } c.compute_derived(); return c; } // ═══════════════════════════════════════════════════════════════════════════ // MEMORY STATS // ═══════════════════════════════════════════════════════════════════════════ void print_memory_map() const { size_t total_mmap = 0; for (const auto& s : shards_) total_mmap += s.size; printf("\n=== Memory Map ===\n"); printf(" Total mmap'd: %.1f GB\n", total_mmap / 1e9); printf(" Total tensors: %zu\n", tensors_.size()); // Categorize size_t expert_bytes = 0, attn_bytes = 0, router_bytes = 0; size_t norm_bytes = 0, embed_bytes = 0, other_bytes = 0; int expert_count = 0, attn_count = 0; for (const auto& t : tensors_) { if (t.name.find("exps") != std::string::npos) { expert_bytes += t.n_bytes; expert_count++; } else if (t.name.find("attn") != std::string::npos) { attn_bytes += t.n_bytes; attn_count++; } else if (t.name.find("gate_inp") != std::string::npos) { router_bytes += t.n_bytes; } else if (t.name.find("norm") != std::string::npos) { norm_bytes += t.n_bytes; } else if (t.name.find("embd") != std::string::npos || t.name.find("output") != std::string::npos) { embed_bytes += t.n_bytes; } else { other_bytes += t.n_bytes; } } printf(" Expert tensors: %d (%.1f GB) — MoE sleeping experts\n", expert_count, expert_bytes / 1e9); printf(" Attention: %d (%.1f GB) — MLA projections\n", attn_count, attn_bytes / 1e9); printf(" Router: %.1f MB — F32 gating (sacred)\n", router_bytes / 1e6); printf(" Embed+Output: %.1f GB\n", embed_bytes / 1e9); printf(" Norms: %.1f MB\n", norm_bytes / 1e6); printf(" Other: %.1f GB\n", other_bytes / 1e9); // Active memory estimate for MoE Config cfg = extract_config(); if (cfg.is_moe()) { float active_expert_ratio = (float)cfg.n_experts_used / cfg.n_experts; float active_expert_gb = expert_bytes * active_expert_ratio / 1e9; float active_total = active_expert_gb + attn_bytes / 1e9 + router_bytes / 1e9 + norm_bytes / 1e9; printf("\n=== Active per Token (MoE) ===\n"); printf(" Experts active: %d/%d (%.1f%%)\n", cfg.n_experts_used, cfg.n_experts, active_expert_ratio * 100); printf(" Active experts: %.1f GB\n", active_expert_gb); printf(" + Attention: %.1f GB\n", attn_bytes / 1e9); printf(" + Router (F32): %.1f MB\n", router_bytes / 1e6); printf(" ≈ Active total: %.1f GB ← fits in 17GB RAM\n", active_total); } } private: std::vector shards_; std::vector tensors_; std::unordered_map tensor_map_; std::unordered_map meta_u32_; std::unordered_map meta_u64_; std::unordered_map meta_f32_; std::unordered_map meta_str_; std::unordered_map> meta_str_arr_; std::unordered_map> meta_i32_arr_; // ═══════════════════════════════════════════════════════════════════════════ // SHARD DISCOVERY — Find all -NNNNN-of-NNNNN.gguf siblings // ═══════════════════════════════════════════════════════════════════════════ std::vector discover_shards(const std::string& path) { std::vector result; // Check if this looks like a split GGUF // Pattern: *-00001-of-00005.gguf auto pos = path.rfind("-of-"); if (pos == std::string::npos) return result; // single file auto dash_before = path.rfind('-', pos - 1); if (dash_before == std::string::npos) return result; std::string prefix = path.substr(0, dash_before + 1); // includes trailing dash std::string suffix_after_of = path.substr(pos + 4); // "00005.gguf" // Extract total count auto dot = suffix_after_of.find('.'); if (dot == std::string::npos) return result; int total = std::atoi(suffix_after_of.substr(0, dot).c_str()); std::string ext = suffix_after_of.substr(dot); // ".gguf" // Build shard paths for (int i = 1; i <= total; ++i) { char buf[16]; snprintf(buf, sizeof(buf), "%05d", i); char buf2[16]; snprintf(buf2, sizeof(buf2), "%05d", total); std::string shard_path = prefix + buf + "-of-" + buf2 + ext; // Verify file exists struct stat st; if (stat(shard_path.c_str(), &st) == 0) { result.push_back(shard_path); } else { printf("[GGUF] WARNING: Expected shard not found: %s\n", shard_path.c_str()); } } return result; } // ═══════════════════════════════════════════════════════════════════════════ // PARSE ONE SHARD // ═══════════════════════════════════════════════════════════════════════════ // Binary reader helper — avoid template lambdas for C++17 compat struct Reader { const uint8_t*& ptr; Reader(const uint8_t*& p) : ptr(p) {} template T read() { T v; std::memcpy(&v, ptr, sizeof(T)); ptr += sizeof(T); return v; } std::string read_str() { uint64_t len = read(); std::string s(reinterpret_cast(ptr), len); ptr += len; return s; } void skip_val(GGUFType t) { switch (t) { case GGUFType::UINT8: case GGUFType::INT8: case GGUFType::BOOL: ptr += 1; break; case GGUFType::UINT16: case GGUFType::INT16: ptr += 2; break; case GGUFType::UINT32: case GGUFType::INT32: case GGUFType::FLOAT32: ptr += 4; break; case GGUFType::UINT64: case GGUFType::INT64: case GGUFType::FLOAT64: ptr += 8; break; case GGUFType::STRING: read_str(); break; case GGUFType::ARRAY: { GGUFType at = static_cast(read()); uint64_t n = read(); for (uint64_t i = 0; i < n; ++i) skip_val(at); break; } } } }; bool parse_shard(int shard_idx, bool read_metadata) { const uint8_t* ptr = shards_[shard_idx].data; Reader r(ptr); // Header if (r.read() != GGUF_MAGIC) return false; uint32_t ver = r.read(); if (ver < 2 || ver > 3) return false; uint64_t n_tensors = r.read(); uint64_t n_kv = r.read(); // KV pairs for (uint64_t i = 0; i < n_kv; ++i) { std::string key = r.read_str(); GGUFType type = static_cast(r.read()); if (read_metadata) { switch (type) { case GGUFType::UINT8: meta_u32_[key] = r.read(); break; case GGUFType::BOOL: meta_u32_[key] = r.read(); break; case GGUFType::UINT16: meta_u32_[key] = r.read(); break; case GGUFType::INT16: meta_u32_[key] = static_cast(r.read()); break; case GGUFType::UINT32: meta_u32_[key] = r.read(); break; case GGUFType::INT32: meta_u32_[key] = static_cast(r.read()); break; case GGUFType::FLOAT32: meta_f32_[key] = r.read(); break; case GGUFType::UINT64: meta_u64_[key] = r.read(); break; case GGUFType::INT64: meta_u64_[key] = static_cast(r.read()); break; case GGUFType::FLOAT64: meta_f32_[key] = static_cast(r.read()); break; case GGUFType::STRING: meta_str_[key] = r.read_str(); break; case GGUFType::ARRAY: { GGUFType at = static_cast(r.read()); uint64_t n = r.read(); if (at == GGUFType::STRING) { auto& arr = meta_str_arr_[key]; arr.resize(n); for (uint64_t j = 0; j < n; ++j) arr[j] = r.read_str(); } else if (at == GGUFType::INT32 || at == GGUFType::UINT32) { auto& arr = meta_i32_arr_[key]; arr.resize(n); for (uint64_t j = 0; j < n; ++j) arr[j] = r.read(); } else { for (uint64_t j = 0; j < n; ++j) r.skip_val(at); } break; } default: r.skip_val(type); break; } } else { r.skip_val(type); } } // Tensor index size_t base_idx = tensors_.size(); tensors_.reserve(base_idx + n_tensors); for (uint64_t i = 0; i < n_tensors; ++i) { TensorInfo ti; ti.name = r.read_str(); ti.n_dims = r.read(); for (uint32_t d = 0; d < 4; ++d) { ti.dims[d] = (d < ti.n_dims) ? r.read() : 1; } ti.type = static_cast(r.read()); ti.offset = r.read(); ti.shard_idx = shard_idx; ti.compute_size(); ti.data = nullptr; tensor_map_[ti.name] = tensors_.size(); tensors_.push_back(std::move(ti)); } // Resolve data pointers uint32_t align = read_metadata ? get_u32("general.alignment", 32) : 32; uint64_t data_start = ((ptr - shards_[shard_idx].data) + align - 1) & ~(uint64_t)(align - 1); for (size_t i = base_idx; i < tensors_.size(); ++i) { auto& ti = tensors_[i]; if (ti.shard_idx == shard_idx) { ti.data = shards_[shard_idx].data + data_start + ti.offset; } } return true; } }; } // namespace ix