// Better output from the same model. Fused computation, adaptive precision,
// surgical expert loading. 305 KB, 19 backends, zero dependencies.
// https://inference-x.com
// 224 lines · 8.5 KiB · C++
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Expert-Aware Memory-Mapped I/O for MoE
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: github.com/ElmadaniS/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
|
#pragma once
|
|
|
|
// Inference-X Expert MMAP — Salka Elmadani — Morocco
|
|
#define IX_MMAP_IDENTITY "Inference-X-ExpertMMAP-935"
|
|
|
|
|
|
#include <cstdint>
|
|
#include <cstdio>
|
|
#include <vector>
|
|
#include <algorithm>
|
|
|
|
// madvise() is available on both Linux and macOS via <sys/mman.h>.
// On every other platform the advice constants and the call itself
// degrade to a no-op stub that ignores its arguments and reports success.
#if defined(__linux__) || defined(__APPLE__)
#include <sys/mman.h>
#else
#define MADV_WILLNEED 0
#define MADV_DONTNEED 0
inline int madvise(void*, size_t, int) { return 0; }
#endif
|
|
|
|
namespace ix {
|
|
|
|
static constexpr size_t PAGE_SIZE = 4096;
|
|
|
|
inline uintptr_t page_align_down(uintptr_t addr) {
|
|
return addr & ~(PAGE_SIZE - 1);
|
|
}
|
|
|
|
inline size_t page_align_up(size_t size) {
|
|
return (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
|
|
}
|
|
|
|
// =============================================================================
|
|
// EXPERT MMAP MANAGER
|
|
// Surgical madvise on individual expert slices within 3D MoE tensors
|
|
// =============================================================================
|
|
class ExpertMmapManager {
|
|
public:
|
|
struct ExpertSlice {
|
|
void* base; // Base pointer of full 3D tensor (mmap'd)
|
|
size_t expert_bytes; // Bytes per single expert slice
|
|
int n_experts; // Total experts in tensor
|
|
};
|
|
|
|
struct LayerExperts {
|
|
ExpertSlice gate_exps; // [dim, expert_ffn, n_experts]
|
|
ExpertSlice up_exps; // [dim, expert_ffn, n_experts]
|
|
ExpertSlice down_exps; // [expert_ffn, dim, n_experts]
|
|
};
|
|
|
|
void init(int n_layers) {
|
|
n_layers_ = n_layers;
|
|
layers_.resize(n_layers);
|
|
prev_active_.resize(n_layers);
|
|
stats_ = {};
|
|
}
|
|
|
|
// Register expert tensor locations (called during model load)
|
|
void register_layer(int layer,
|
|
void* gate_data, size_t gate_expert_bytes, int n_experts,
|
|
void* up_data, size_t up_expert_bytes,
|
|
void* down_data, size_t down_expert_bytes) {
|
|
if (layer >= n_layers_) return;
|
|
layers_[layer] = {
|
|
{gate_data, gate_expert_bytes, n_experts},
|
|
{up_data, up_expert_bytes, n_experts},
|
|
{down_data, down_expert_bytes, n_experts}
|
|
};
|
|
}
|
|
|
|
// =========================================================================
|
|
// SURGICAL PREFETCH — Only page in the K active experts (K=8)
|
|
// Called AFTER routing, BEFORE expert FFN computation
|
|
// =========================================================================
|
|
void prefetch_active(int layer, const int* expert_ids, int n_active) {
|
|
if (layer >= n_layers_) return;
|
|
auto& le = layers_[layer];
|
|
auto& prev = prev_active_[layer];
|
|
|
|
// Prefetch active
|
|
for (int i = 0; i < n_active; ++i) {
|
|
int eid = expert_ids[i];
|
|
prefetch_slice(le.gate_exps, eid);
|
|
prefetch_slice(le.up_exps, eid);
|
|
prefetch_slice(le.down_exps, eid);
|
|
stats_.prefetches++;
|
|
}
|
|
|
|
// Evict previously active that are no longer needed
|
|
for (int prev_eid : prev) {
|
|
bool still = false;
|
|
for (int i = 0; i < n_active; ++i) {
|
|
if (expert_ids[i] == prev_eid) { still = true; break; }
|
|
}
|
|
if (!still) {
|
|
evict_slice(le.gate_exps, prev_eid);
|
|
evict_slice(le.up_exps, prev_eid);
|
|
evict_slice(le.down_exps, prev_eid);
|
|
stats_.evictions++;
|
|
stats_.bytes_evicted += le.gate_exps.expert_bytes +
|
|
le.up_exps.expert_bytes +
|
|
le.down_exps.expert_bytes;
|
|
}
|
|
}
|
|
|
|
// Update active set
|
|
prev.assign(expert_ids, expert_ids + n_active);
|
|
}
|
|
|
|
// Overload for vector
|
|
void prefetch_active(int layer, const std::vector<int>& expert_ids) {
|
|
prefetch_active(layer, expert_ids.data(), (int)expert_ids.size());
|
|
}
|
|
|
|
// =========================================================================
|
|
// PREDICTIVE PREFETCH — Pre-load statistically hot experts
|
|
// Uses ExpertCache frequency data to predict next layer's active experts
|
|
// =========================================================================
|
|
void prefetch_predicted(int layer, const std::vector<int>& hot_experts) {
|
|
if (layer >= n_layers_) return;
|
|
auto& le = layers_[layer];
|
|
for (int eid : hot_experts) {
|
|
prefetch_slice(le.gate_exps, eid);
|
|
prefetch_slice(le.up_exps, eid);
|
|
prefetch_slice(le.down_exps, eid);
|
|
}
|
|
}
|
|
|
|
// =========================================================================
|
|
// LAYER EVICT — Release all expert pages after layer processing
|
|
// Frees page cache pressure for next layer
|
|
// =========================================================================
|
|
void evict_layer(int layer) {
|
|
if (layer >= n_layers_) return;
|
|
auto& le = layers_[layer];
|
|
evict_tensor(le.gate_exps);
|
|
evict_tensor(le.up_exps);
|
|
evict_tensor(le.down_exps);
|
|
prev_active_[layer].clear();
|
|
}
|
|
|
|
// =========================================================================
|
|
// SHARED WEIGHTS LOCK — Keep non-expert weights hot in page cache
|
|
// Embeddings, attention projections, RMS norm, output head
|
|
// =========================================================================
|
|
void lock_shared(void* ptr, size_t bytes) {
|
|
if (!ptr || bytes == 0) return;
|
|
uintptr_t aligned = page_align_down((uintptr_t)ptr);
|
|
size_t len = page_align_up(bytes + ((uintptr_t)ptr - aligned));
|
|
madvise((void*)aligned, len, MADV_WILLNEED);
|
|
}
|
|
|
|
void print_stats() const {
|
|
printf("[EXPERT-MMAP] Prefetches: %zu | Evictions: %zu | I/O saved: %.2f GB\n",
|
|
stats_.prefetches, stats_.evictions, stats_.bytes_evicted / 1e9);
|
|
}
|
|
|
|
size_t bytes_saved() const { return stats_.bytes_evicted; }
|
|
|
|
private:
|
|
int n_layers_ = 0;
|
|
std::vector<LayerExperts> layers_;
|
|
std::vector<std::vector<int>> prev_active_;
|
|
|
|
struct Stats {
|
|
size_t prefetches = 0;
|
|
size_t evictions = 0;
|
|
size_t bytes_evicted = 0;
|
|
} stats_;
|
|
|
|
void prefetch_slice(const ExpertSlice& es, int eid) {
|
|
if (!es.base || eid < 0 || eid >= es.n_experts) return;
|
|
uintptr_t start = (uintptr_t)es.base + (size_t)eid * es.expert_bytes;
|
|
uintptr_t aligned = page_align_down(start);
|
|
size_t len = page_align_up(es.expert_bytes + (start - aligned));
|
|
madvise((void*)aligned, len, MADV_WILLNEED);
|
|
}
|
|
|
|
void evict_slice(const ExpertSlice& es, int eid) {
|
|
if (!es.base || eid < 0 || eid >= es.n_experts) return;
|
|
uintptr_t start = (uintptr_t)es.base + (size_t)eid * es.expert_bytes;
|
|
uintptr_t aligned = page_align_down(start);
|
|
size_t len = page_align_up(es.expert_bytes + (start - aligned));
|
|
madvise((void*)aligned, len, MADV_DONTNEED);
|
|
}
|
|
|
|
void evict_tensor(const ExpertSlice& es) {
|
|
if (!es.base) return;
|
|
size_t total = (size_t)es.n_experts * es.expert_bytes;
|
|
uintptr_t aligned = page_align_down((uintptr_t)es.base);
|
|
size_t len = page_align_up(total + ((uintptr_t)es.base - aligned));
|
|
madvise((void*)aligned, len, MADV_DONTNEED);
|
|
}
|
|
};
|
|
|
|
} // namespace ix
|