// inference-x/runtime/expert_mmap.h
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Expert-Aware Memory-Mapped I/O for MoE
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X Expert MMAP — Salka Elmadani — Morocco
#define IX_MMAP_IDENTITY "Inference-X-ExpertMMAP-935"
#include <cstdint>
#include <cstdio>
#include <vector>
#include <algorithm>
#ifdef __linux__
#include <sys/mman.h>
#endif
#ifdef __APPLE__
#include <sys/mman.h>
#endif
// Fallback for platforms without madvise: both advice values collapse to 0
// and the call becomes a no-op, so every prefetch/evict hint in this file
// silently does nothing on such platforms (functionally safe, no I/O savings).
#if !defined(__linux__) && !defined(__APPLE__)
#define MADV_WILLNEED 0
#define MADV_DONTNEED 0
// NOTE(review): relies on the unqualified ::size_t that <cstdio>/<cstdint>
// usually expose; strictly portable code would spell std::size_t or include
// <cstddef> — confirm on the targeted toolchains.
inline int madvise(void*, size_t, int) { return 0; }
#endif
namespace ix {
// Page geometry used for all madvise ranges.
// NOTE(review): hard-codes 4 KiB pages; platforms with larger pages (e.g.
// 16 KiB on Apple Silicon) would need runtime detection — confirm targets.
static constexpr size_t PAGE_SIZE = 4096;

// Round an address down to the start of its containing page.
inline uintptr_t page_align_down(uintptr_t addr) {
    return addr - (addr % PAGE_SIZE);
}

// Round a byte count up to a whole number of pages.
inline size_t page_align_up(size_t size) {
    const size_t rem = size % PAGE_SIZE;
    return rem == 0 ? size : size + (PAGE_SIZE - rem);
}
// =============================================================================
// EXPERT MMAP MANAGER
// Surgical madvise on individual expert slices within 3D MoE tensors
// =============================================================================
class ExpertMmapManager {
public:
    // One expert tensor: `base` points at expert 0's data; the remaining
    // experts follow contiguously, `expert_bytes` apart.
    struct ExpertSlice {
        void* base;          // Base pointer of full 3D tensor (mmap'd)
        size_t expert_bytes; // Bytes per single expert slice
        int n_experts;       // Total experts in tensor
    };
    // The three expert projections of one MoE layer.
    struct LayerExperts {
        ExpertSlice gate_exps; // [dim, expert_ffn, n_experts]
        ExpertSlice up_exps;   // [dim, expert_ffn, n_experts]
        ExpertSlice down_exps; // [expert_ffn, dim, n_experts]
    };

    // (Re)initialize for a model with `n_layers` MoE layers.
    // Fully resets registered slices, per-layer active sets, and stats so a
    // manager can be reused across model reloads (resize() alone would keep
    // stale entries when the layer count is unchanged).
    void init(int n_layers) {
        n_layers_ = n_layers < 0 ? 0 : n_layers;
        layers_.clear();
        layers_.resize(n_layers_);
        prev_active_.clear();
        prev_active_.resize(n_layers_);
        stats_ = {};
    }

    // Register expert tensor locations (called during model load).
    // Out-of-range layers — including negative ones — are ignored.
    void register_layer(int layer,
                        void* gate_data, size_t gate_expert_bytes, int n_experts,
                        void* up_data, size_t up_expert_bytes,
                        void* down_data, size_t down_expert_bytes) {
        if (layer < 0 || layer >= n_layers_) return; // was UB for layer < 0
        layers_[layer] = {
            {gate_data, gate_expert_bytes, n_experts},
            {up_data, up_expert_bytes, n_experts},
            {down_data, down_expert_bytes, n_experts}
        };
    }

    // =========================================================================
    // SURGICAL PREFETCH — Only page in the K active experts (K=8)
    // Called AFTER routing, BEFORE expert FFN computation.
    // Also evicts experts active on the previous call that are no longer in
    // the active set, and updates the I/O stats.
    // =========================================================================
    void prefetch_active(int layer, const int* expert_ids, int n_active) {
        if (layer < 0 || layer >= n_layers_) return; // guard negative layer too
        if (!expert_ids || n_active < 0) n_active = 0; // null/negative tolerated
        auto& le = layers_[layer];
        auto& prev = prev_active_[layer];
        // Prefetch the newly active experts.
        for (int i = 0; i < n_active; ++i) {
            int eid = expert_ids[i];
            prefetch_slice(le.gate_exps, eid);
            prefetch_slice(le.up_exps, eid);
            prefetch_slice(le.down_exps, eid);
            stats_.prefetches++;
        }
        // Evict previously active experts that are no longer needed.
        for (int prev_eid : prev) {
            bool still = false;
            for (int i = 0; i < n_active; ++i) {
                if (expert_ids[i] == prev_eid) { still = true; break; }
            }
            if (!still) {
                evict_slice(le.gate_exps, prev_eid);
                evict_slice(le.up_exps, prev_eid);
                evict_slice(le.down_exps, prev_eid);
                stats_.evictions++;
                // Approximate: counts whole slices even though the madvise
                // range is shrunk to pages fully inside each slice.
                stats_.bytes_evicted += le.gate_exps.expert_bytes +
                                        le.up_exps.expert_bytes +
                                        le.down_exps.expert_bytes;
            }
        }
        // Remember the new active set for the next call.
        if (n_active > 0) {
            prev.assign(expert_ids, expert_ids + n_active);
        } else {
            prev.clear(); // never do pointer math on a null expert_ids
        }
    }

    // Convenience overload for callers holding a vector of expert ids.
    void prefetch_active(int layer, const std::vector<int>& expert_ids) {
        prefetch_active(layer, expert_ids.data(), (int)expert_ids.size());
    }

    // =========================================================================
    // PREDICTIVE PREFETCH — Pre-load statistically hot experts
    // Uses ExpertCache frequency data to predict next layer's active experts.
    // Does not touch the active set or stats.
    // =========================================================================
    void prefetch_predicted(int layer, const std::vector<int>& hot_experts) {
        if (layer < 0 || layer >= n_layers_) return;
        auto& le = layers_[layer];
        for (int eid : hot_experts) {
            prefetch_slice(le.gate_exps, eid);
            prefetch_slice(le.up_exps, eid);
            prefetch_slice(le.down_exps, eid);
        }
    }

    // =========================================================================
    // LAYER EVICT — Release all expert pages after layer processing
    // Frees page cache pressure for the next layer and clears the active set.
    // =========================================================================
    void evict_layer(int layer) {
        if (layer < 0 || layer >= n_layers_) return;
        auto& le = layers_[layer];
        evict_tensor(le.gate_exps);
        evict_tensor(le.up_exps);
        evict_tensor(le.down_exps);
        prev_active_[layer].clear();
    }

    // =========================================================================
    // SHARED WEIGHTS LOCK — Keep non-expert weights hot in page cache
    // Embeddings, attention projections, RMS norm, output head.
    // NOTE(review): MADV_WILLNEED is an advisory prefetch, not a pin — the
    // kernel may still reclaim these pages; mlock() would be a true lock.
    // =========================================================================
    void lock_shared(void* ptr, size_t bytes) {
        if (!ptr || bytes == 0) return;
        want_pages((uintptr_t)ptr, (uintptr_t)ptr + bytes);
    }

    // Dump prefetch/eviction counters to stdout.
    void print_stats() const {
        printf("[EXPERT-MMAP] Prefetches: %zu | Evictions: %zu | I/O saved: %.2f GB\n",
               stats_.prefetches, stats_.evictions, stats_.bytes_evicted / 1e9);
    }
    // Total bytes of expert weights evicted so far (approximate, see above).
    size_t bytes_saved() const { return stats_.bytes_evicted; }

private:
    int n_layers_ = 0;
    std::vector<LayerExperts> layers_;
    std::vector<std::vector<int>> prev_active_; // active expert ids, per layer
    struct Stats {
        size_t prefetches = 0;
        size_t evictions = 0;
        size_t bytes_evicted = 0;
    } stats_;

    // Page geometry (kept in sync with ix::PAGE_SIZE; 4 KiB assumed).
    static constexpr size_t kPageSize = 4096;
    static uintptr_t page_down(uintptr_t a) {
        return a & ~(uintptr_t)(kPageSize - 1);
    }
    static uintptr_t page_up(uintptr_t a) {
        return (a + kPageSize - 1) & ~(uintptr_t)(kPageSize - 1);
    }

    // Hint the kernel to page in [start, end), expanded OUTWARD to page
    // boundaries so the whole slice is covered.
    static void want_pages(uintptr_t start, uintptr_t end) {
        uintptr_t lo = page_down(start), hi = page_up(end);
        if (hi > lo) madvise((void*)lo, (size_t)(hi - lo), MADV_WILLNEED);
    }
    // Hint the kernel to drop [start, end), shrunk INWARD to whole pages that
    // lie entirely inside the range. The previous code aligned outward here,
    // which could MADV_DONTNEED a boundary page shared with an adjacent —
    // possibly still active — expert slice, forcing a spurious re-fault.
    static void drop_pages(uintptr_t start, uintptr_t end) {
        uintptr_t lo = page_up(start), hi = page_down(end);
        if (hi > lo) madvise((void*)lo, (size_t)(hi - lo), MADV_DONTNEED);
    }

    // Prefetch one expert's slice; invalid ids and empty slices are no-ops.
    void prefetch_slice(const ExpertSlice& es, int eid) {
        if (!es.base || eid < 0 || eid >= es.n_experts || es.expert_bytes == 0) return;
        uintptr_t start = (uintptr_t)es.base + (size_t)eid * es.expert_bytes;
        want_pages(start, start + es.expert_bytes);
    }
    // Drop one expert's slice; invalid ids and empty slices are no-ops.
    void evict_slice(const ExpertSlice& es, int eid) {
        if (!es.base || eid < 0 || eid >= es.n_experts || es.expert_bytes == 0) return;
        uintptr_t start = (uintptr_t)es.base + (size_t)eid * es.expert_bytes;
        drop_pages(start, start + es.expert_bytes);
    }
    // Drop an entire expert tensor (all experts of one projection).
    void evict_tensor(const ExpertSlice& es) {
        if (!es.base || es.n_experts <= 0 || es.expert_bytes == 0) return;
        uintptr_t start = (uintptr_t)es.base;
        drop_pages(start, start + (size_t)es.n_experts * es.expert_bytes);
    }
};
} // namespace ix