// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Expert-Aware Memory-Mapped I/O for MoE
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once

// Inference-X Expert MMAP — Salka Elmadani — Morocco
#define IX_MMAP_IDENTITY "Inference-X-ExpertMMAP-935"

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

#ifdef __linux__
#include <sys/mman.h>
#endif
#ifdef __APPLE__
#include <sys/mman.h>
#endif

// Fallback for platforms without madvise: define the advice constants and a
// no-op shim so the calls below compile everywhere and simply do nothing.
#if !defined(__linux__) && !defined(__APPLE__)
#define MADV_WILLNEED 0
#define MADV_DONTNEED 0
inline int madvise(void*, size_t, int) { return 0; }
#endif

namespace ix {

// NOTE(review): hard-coded 4 KiB page size — assumed for the target platform;
// some systems use larger pages (e.g. 16 KiB). Confirm against sysconf(_SC_PAGESIZE).
static constexpr size_t PAGE_SIZE = 4096;

/// Round an address down to the start of its containing page.
inline uintptr_t page_align_down(uintptr_t addr) { return addr & ~(PAGE_SIZE - 1); }

/// Round a byte count up to a whole number of pages.
inline size_t page_align_up(size_t size) { return (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1); }

// =============================================================================
// EXPERT MMAP MANAGER
// Surgical madvise on individual expert slices within 3D MoE tensors.
// Expert tensors are laid out with the expert index as the outermost (slowest)
// dimension, so expert `eid` occupies the contiguous byte range
// [base + eid * expert_bytes, base + (eid + 1) * expert_bytes).
// All madvise return values are intentionally ignored: the calls are advisory
// hints and failure only costs performance, never correctness.
// =============================================================================
class ExpertMmapManager {
public:
    /// Location of one expert tensor inside the mmap'd model file.
    struct ExpertSlice {
        void*  base;         // Base pointer of full 3D tensor (mmap'd)
        size_t expert_bytes; // Bytes per single expert slice
        int    n_experts;    // Total experts in tensor
    };

    /// The three expert weight tensors of a single MoE layer.
    struct LayerExperts {
        ExpertSlice gate_exps; // [dim, expert_ffn, n_experts]
        ExpertSlice up_exps;   // [dim, expert_ffn, n_experts]
        ExpertSlice down_exps; // [expert_ffn, dim, n_experts]
    };

    /// Size internal tables for `n_layers` layers and reset statistics.
    /// Must be called before register_layer / prefetch / evict.
    void init(int n_layers) {
        n_layers_ = n_layers;
        layers_.resize(n_layers);
        prev_active_.resize(n_layers);
        stats_ = {};
    }

    /// Register expert tensor locations for one layer (called during model load).
    /// Out-of-range layer indices are ignored.
    void register_layer(int layer,
                        void* gate_data, size_t gate_expert_bytes, int n_experts,
                        void* up_data,   size_t up_expert_bytes,
                        void* down_data, size_t down_expert_bytes) {
        if (layer < 0 || layer >= n_layers_) return;
        layers_[layer] = {
            {gate_data, gate_expert_bytes, n_experts},
            {up_data,   up_expert_bytes,   n_experts},
            {down_data, down_expert_bytes, n_experts}
        };
    }

    // =========================================================================
    // SURGICAL PREFETCH — Only page in the K active experts (K=8)
    // Called AFTER routing, BEFORE expert FFN computation
    // =========================================================================
    /// Advise the kernel to page in the `n_active` experts in `expert_ids`
    /// and to drop the pages of previously-active experts that are no longer
    /// needed. Records prefetch/eviction statistics.
    void prefetch_active(int layer, const int* expert_ids, int n_active) {
        if (layer < 0 || layer >= n_layers_) return;
        auto& le   = layers_[layer];
        auto& prev = prev_active_[layer];

        // Prefetch active experts across all three tensors.
        for (int i = 0; i < n_active; ++i) {
            int eid = expert_ids[i];
            prefetch_slice(le.gate_exps, eid);
            prefetch_slice(le.up_exps, eid);
            prefetch_slice(le.down_exps, eid);
            stats_.prefetches++;
        }

        // Evict previously active experts that are no longer needed.
        for (int prev_eid : prev) {
            bool still = false;
            for (int i = 0; i < n_active; ++i) {
                if (expert_ids[i] == prev_eid) { still = true; break; }
            }
            if (!still) {
                evict_slice(le.gate_exps, prev_eid);
                evict_slice(le.up_exps, prev_eid);
                evict_slice(le.down_exps, prev_eid);
                stats_.evictions++;
                stats_.bytes_evicted += le.gate_exps.expert_bytes +
                                        le.up_exps.expert_bytes +
                                        le.down_exps.expert_bytes;
            }
        }

        // Remember this layer's active set for the next call.
        prev.assign(expert_ids, expert_ids + n_active);
    }

    /// Convenience overload taking a vector of expert ids.
    void prefetch_active(int layer, const std::vector<int>& expert_ids) {
        prefetch_active(layer, expert_ids.data(), (int)expert_ids.size());
    }

    // =========================================================================
    // PREDICTIVE PREFETCH — Pre-load statistically hot experts
    // Uses ExpertCache frequency data to predict next layer's active experts
    // =========================================================================
    /// Advise the kernel to page in `hot_experts` for `layer`. Does not touch
    /// the active-set bookkeeping or statistics.
    void prefetch_predicted(int layer, const std::vector<int>& hot_experts) {
        if (layer < 0 || layer >= n_layers_) return;
        auto& le = layers_[layer];
        for (int eid : hot_experts) {
            prefetch_slice(le.gate_exps, eid);
            prefetch_slice(le.up_exps, eid);
            prefetch_slice(le.down_exps, eid);
        }
    }

    // =========================================================================
    // LAYER EVICT — Release all expert pages after layer processing
    // Frees page cache pressure for next layer
    // =========================================================================
    /// Drop all expert pages of `layer` (all three tensors, every expert) and
    /// clear its remembered active set.
    void evict_layer(int layer) {
        if (layer < 0 || layer >= n_layers_) return;
        auto& le = layers_[layer];
        evict_tensor(le.gate_exps);
        evict_tensor(le.up_exps);
        evict_tensor(le.down_exps);
        prev_active_[layer].clear();
    }

    // =========================================================================
    // SHARED WEIGHTS LOCK — Keep non-expert weights hot in page cache
    // Embeddings, attention projections, RMS norm, output head
    // =========================================================================
    /// Advise the kernel to keep [ptr, ptr+bytes) resident (MADV_WILLNEED).
    /// Null or empty ranges are ignored.
    void lock_shared(void* ptr, size_t bytes) {
        if (!ptr || bytes == 0) return;
        uintptr_t aligned = page_align_down((uintptr_t)ptr);
        // Extend the length by the bytes lost to downward alignment.
        size_t len = page_align_up(bytes + ((uintptr_t)ptr - aligned));
        madvise((void*)aligned, len, MADV_WILLNEED);
    }

    /// Print cumulative prefetch/eviction statistics to stdout.
    void print_stats() const {
        printf("[EXPERT-MMAP] Prefetches: %zu | Evictions: %zu | I/O saved: %.2f GB\n",
               stats_.prefetches, stats_.evictions, stats_.bytes_evicted / 1e9);
    }

    /// Total bytes advised away via expert eviction so far.
    size_t bytes_saved() const { return stats_.bytes_evicted; }

private:
    int n_layers_ = 0;
    std::vector<LayerExperts> layers_;            // per-layer tensor locations
    std::vector<std::vector<int>> prev_active_;   // per-layer last active expert set

    struct Stats {
        size_t prefetches    = 0;
        size_t evictions     = 0;
        size_t bytes_evicted = 0;
    } stats_;

    /// MADV_WILLNEED on the page-aligned range covering expert `eid` of `es`.
    /// Invalid slices or out-of-range ids are silently ignored.
    void prefetch_slice(const ExpertSlice& es, int eid) {
        if (!es.base || eid < 0 || eid >= es.n_experts) return;
        uintptr_t start   = (uintptr_t)es.base + (size_t)eid * es.expert_bytes;
        uintptr_t aligned = page_align_down(start);
        size_t len = page_align_up(es.expert_bytes + (start - aligned));
        madvise((void*)aligned, len, MADV_WILLNEED);
    }

    /// MADV_DONTNEED on the page-aligned range covering expert `eid` of `es`.
    void evict_slice(const ExpertSlice& es, int eid) {
        if (!es.base || eid < 0 || eid >= es.n_experts) return;
        uintptr_t start   = (uintptr_t)es.base + (size_t)eid * es.expert_bytes;
        uintptr_t aligned = page_align_down(start);
        size_t len = page_align_up(es.expert_bytes + (start - aligned));
        madvise((void*)aligned, len, MADV_DONTNEED);
    }

    /// MADV_DONTNEED on the full tensor (all experts) of `es`.
    void evict_tensor(const ExpertSlice& es) {
        if (!es.base) return;
        size_t total      = (size_t)es.n_experts * es.expert_bytes;
        uintptr_t aligned = page_align_down((uintptr_t)es.base);
        size_t len = page_align_up(total + ((uintptr_t)es.base - aligned));
        madvise((void*)aligned, len, MADV_DONTNEED);
    }
};

} // namespace ix