// inference-x/runtime/expert_mmap.h
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Expert-Aware Memory-Mapped I/O for MoE
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: git.inference-x.com/salka/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X Expert MMAP — Salka Elmadani — Morocco
#define IX_MMAP_IDENTITY "Inference-X-ExpertMMAP-935"
#include <cstdint>
#include <cstdio>
#include <vector>
#include <algorithm>
#ifdef __linux__
#include <sys/mman.h>
#endif
#ifdef __APPLE__
#include <sys/mman.h>
#endif
// Fallback for platforms without madvise: both advice values collapse to 0
// and the call becomes a no-op, so every prefetch/evict hint in this file
// silently does nothing on such platforms (functionally safe, no I/O savings).
#if !defined(__linux__) && !defined(__APPLE__)
#define MADV_WILLNEED 0
#define MADV_DONTNEED 0
// NOTE(review): relies on the unqualified ::size_t that <cstdio>/<cstdint>
// usually expose; strictly portable code would spell std::size_t or include
// <cstddef> — confirm on the targeted toolchains.
inline int madvise(void*, size_t, int) { return 0; }
#endif
namespace ix {
// Page geometry used for all madvise ranges.
// NOTE(review): hard-codes 4 KiB pages; platforms with larger pages (e.g.
// 16 KiB on Apple Silicon) would need runtime detection — confirm targets.
static constexpr size_t PAGE_SIZE = 4096;

// Round an address down to the start of its containing page.
inline uintptr_t page_align_down(uintptr_t addr) {
    return addr - (addr % PAGE_SIZE);
}

// Round a byte count up to a whole number of pages.
inline size_t page_align_up(size_t size) {
    const size_t rem = size % PAGE_SIZE;
    return rem == 0 ? size : size + (PAGE_SIZE - rem);
}
// =============================================================================
// EXPERT MMAP MANAGER
// Surgical madvise on individual expert slices within 3D MoE tensors
// =============================================================================
class ExpertMmapManager {
public:
    // One expert tensor: `base` points at expert 0's data; the remaining
    // experts follow contiguously, `expert_bytes` apart.
    struct ExpertSlice {
        void* base;          // Base pointer of full 3D tensor (mmap'd)
        size_t expert_bytes; // Bytes per single expert slice
        int n_experts;       // Total experts in tensor
    };
    // The three expert projections of one MoE layer.
    struct LayerExperts {
        ExpertSlice gate_exps; // [dim, expert_ffn, n_experts]
        ExpertSlice up_exps;   // [dim, expert_ffn, n_experts]
        ExpertSlice down_exps; // [expert_ffn, dim, n_experts]
    };

    // (Re)initialize for a model with `n_layers` MoE layers.
    // Fully resets registered slices, per-layer active sets, and stats so a
    // manager can be reused across model reloads (resize() alone would keep
    // stale entries when the layer count is unchanged).
    void init(int n_layers) {
        n_layers_ = n_layers < 0 ? 0 : n_layers;
        layers_.clear();
        layers_.resize(n_layers_);
        prev_active_.clear();
        prev_active_.resize(n_layers_);
        stats_ = {};
    }

    // Register expert tensor locations (called during model load).
    // Out-of-range layers — including negative ones — are ignored.
    void register_layer(int layer,
                        void* gate_data, size_t gate_expert_bytes, int n_experts,
                        void* up_data, size_t up_expert_bytes,
                        void* down_data, size_t down_expert_bytes) {
        if (layer < 0 || layer >= n_layers_) return; // was UB for layer < 0
        layers_[layer] = {
            {gate_data, gate_expert_bytes, n_experts},
            {up_data, up_expert_bytes, n_experts},
            {down_data, down_expert_bytes, n_experts}
        };
    }

    // =========================================================================
    // SURGICAL PREFETCH — Only page in the K active experts (K=8)
    // Called AFTER routing, BEFORE expert FFN computation.
    // Also evicts experts active on the previous call that are no longer in
    // the active set, and updates the I/O stats.
    // =========================================================================
    void prefetch_active(int layer, const int* expert_ids, int n_active) {
        if (layer < 0 || layer >= n_layers_) return; // guard negative layer too
        if (!expert_ids || n_active < 0) n_active = 0; // null/negative tolerated
        auto& le = layers_[layer];
        auto& prev = prev_active_[layer];
        // Prefetch the newly active experts.
        for (int i = 0; i < n_active; ++i) {
            int eid = expert_ids[i];
            prefetch_slice(le.gate_exps, eid);
            prefetch_slice(le.up_exps, eid);
            prefetch_slice(le.down_exps, eid);
            stats_.prefetches++;
        }
        // Evict previously active experts that are no longer needed.
        for (int prev_eid : prev) {
            bool still = false;
            for (int i = 0; i < n_active; ++i) {
                if (expert_ids[i] == prev_eid) { still = true; break; }
            }
            if (!still) {
                evict_slice(le.gate_exps, prev_eid);
                evict_slice(le.up_exps, prev_eid);
                evict_slice(le.down_exps, prev_eid);
                stats_.evictions++;
                // Approximate: counts whole slices even though the madvise
                // range is shrunk to pages fully inside each slice.
                stats_.bytes_evicted += le.gate_exps.expert_bytes +
                                        le.up_exps.expert_bytes +
                                        le.down_exps.expert_bytes;
            }
        }
        // Remember the new active set for the next call.
        if (n_active > 0) {
            prev.assign(expert_ids, expert_ids + n_active);
        } else {
            prev.clear(); // never do pointer math on a null expert_ids
        }
    }

    // Convenience overload for callers holding a vector of expert ids.
    void prefetch_active(int layer, const std::vector<int>& expert_ids) {
        prefetch_active(layer, expert_ids.data(), (int)expert_ids.size());
    }

    // =========================================================================
    // PREDICTIVE PREFETCH — Pre-load statistically hot experts
    // Uses ExpertCache frequency data to predict next layer's active experts.
    // Does not touch the active set or stats.
    // =========================================================================
    void prefetch_predicted(int layer, const std::vector<int>& hot_experts) {
        if (layer < 0 || layer >= n_layers_) return;
        auto& le = layers_[layer];
        for (int eid : hot_experts) {
            prefetch_slice(le.gate_exps, eid);
            prefetch_slice(le.up_exps, eid);
            prefetch_slice(le.down_exps, eid);
        }
    }

    // =========================================================================
    // LAYER EVICT — Release all expert pages after layer processing
    // Frees page cache pressure for the next layer and clears the active set.
    // =========================================================================
    void evict_layer(int layer) {
        if (layer < 0 || layer >= n_layers_) return;
        auto& le = layers_[layer];
        evict_tensor(le.gate_exps);
        evict_tensor(le.up_exps);
        evict_tensor(le.down_exps);
        prev_active_[layer].clear();
    }

    // =========================================================================
    // SHARED WEIGHTS LOCK — Keep non-expert weights hot in page cache
    // Embeddings, attention projections, RMS norm, output head.
    // NOTE(review): MADV_WILLNEED is an advisory prefetch, not a pin — the
    // kernel may still reclaim these pages; mlock() would be a true lock.
    // =========================================================================
    void lock_shared(void* ptr, size_t bytes) {
        if (!ptr || bytes == 0) return;
        want_pages((uintptr_t)ptr, (uintptr_t)ptr + bytes);
    }

    // Dump prefetch/eviction counters to stdout.
    void print_stats() const {
        printf("[EXPERT-MMAP] Prefetches: %zu | Evictions: %zu | I/O saved: %.2f GB\n",
               stats_.prefetches, stats_.evictions, stats_.bytes_evicted / 1e9);
    }
    // Total bytes of expert weights evicted so far (approximate, see above).
    size_t bytes_saved() const { return stats_.bytes_evicted; }

private:
    int n_layers_ = 0;
    std::vector<LayerExperts> layers_;
    std::vector<std::vector<int>> prev_active_; // active expert ids, per layer
    struct Stats {
        size_t prefetches = 0;
        size_t evictions = 0;
        size_t bytes_evicted = 0;
    } stats_;

    // Page geometry (kept in sync with ix::PAGE_SIZE; 4 KiB assumed).
    static constexpr size_t kPageSize = 4096;
    static uintptr_t page_down(uintptr_t a) {
        return a & ~(uintptr_t)(kPageSize - 1);
    }
    static uintptr_t page_up(uintptr_t a) {
        return (a + kPageSize - 1) & ~(uintptr_t)(kPageSize - 1);
    }

    // Hint the kernel to page in [start, end), expanded OUTWARD to page
    // boundaries so the whole slice is covered.
    static void want_pages(uintptr_t start, uintptr_t end) {
        uintptr_t lo = page_down(start), hi = page_up(end);
        if (hi > lo) madvise((void*)lo, (size_t)(hi - lo), MADV_WILLNEED);
    }
    // Hint the kernel to drop [start, end), shrunk INWARD to whole pages that
    // lie entirely inside the range. The previous code aligned outward here,
    // which could MADV_DONTNEED a boundary page shared with an adjacent —
    // possibly still active — expert slice, forcing a spurious re-fault.
    static void drop_pages(uintptr_t start, uintptr_t end) {
        uintptr_t lo = page_up(start), hi = page_down(end);
        if (hi > lo) madvise((void*)lo, (size_t)(hi - lo), MADV_DONTNEED);
    }

    // Prefetch one expert's slice; invalid ids and empty slices are no-ops.
    void prefetch_slice(const ExpertSlice& es, int eid) {
        if (!es.base || eid < 0 || eid >= es.n_experts || es.expert_bytes == 0) return;
        uintptr_t start = (uintptr_t)es.base + (size_t)eid * es.expert_bytes;
        want_pages(start, start + es.expert_bytes);
    }
    // Drop one expert's slice; invalid ids and empty slices are no-ops.
    void evict_slice(const ExpertSlice& es, int eid) {
        if (!es.base || eid < 0 || eid >= es.n_experts || es.expert_bytes == 0) return;
        uintptr_t start = (uintptr_t)es.base + (size_t)eid * es.expert_bytes;
        drop_pages(start, start + es.expert_bytes);
    }
    // Drop an entire expert tensor (all experts of one projection).
    void evict_tensor(const ExpertSlice& es) {
        if (!es.base || es.n_experts <= 0 || es.expert_bytes == 0) return;
        uintptr_t start = (uintptr_t)es.base;
        drop_pages(start, start + (size_t)es.n_experts * es.expert_bytes);
    }
};
} // namespace ix