Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
291 lines · 11 KiB · C++
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Attention Mechanisms
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: github.com/ElmadaniS/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
#pragma once
|
|
|
|
// Inference-X Attention — Salka Elmadani — Morocco
|
|
#define IX_ATTENTION_SIGNATURE 0x935
|
|
#define IX_ATTENTION_MARK "Inference-X-Attention-935-Elmadani"
|
|
|
|
|
|
#include "../core/z_core.h"
#include "kernels.h"
#include "gemm.h"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>
|
|
|
namespace ix {
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// KV CACHE — Pre-allocated, O(n) memory
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
class KVCache {
|
|
public:
|
|
KVCache() = default;
|
|
|
|
void init(const Config& cfg) {
|
|
n_layers_ = cfg.n_layers;
|
|
n_kv_heads_ = cfg.n_kv_heads;
|
|
head_dim_ = cfg.head_dim;
|
|
max_seq_len_ = cfg.max_seq_len;
|
|
|
|
// Allocate: [n_layers, 2 (K+V), n_kv_heads, max_seq_len, head_dim]
|
|
size_t layer_size = 2 * n_kv_heads_ * max_seq_len_ * head_dim_;
|
|
size_t total_size = n_layers_ * layer_size;
|
|
|
|
data_.resize(total_size, 0.0f);
|
|
pos_ = 0;
|
|
}
|
|
|
|
void clear() {
|
|
pos_ = 0;
|
|
std::fill(data_.begin(), data_.end(), 0.0f);
|
|
}
|
|
|
|
// Get K cache for layer at position
|
|
float* key(int layer, int pos) {
|
|
return data_.data() + layer_offset(layer) + pos * n_kv_heads_ * head_dim_;
|
|
}
|
|
|
|
// Get V cache for layer at position
|
|
float* value(int layer, int pos) {
|
|
size_t v_offset = n_kv_heads_ * max_seq_len_ * head_dim_;
|
|
return data_.data() + layer_offset(layer) + v_offset + pos * n_kv_heads_ * head_dim_;
|
|
}
|
|
|
|
// Get all K for layer up to current position
|
|
const float* key_seq(int layer) const {
|
|
return data_.data() + layer_offset(layer);
|
|
}
|
|
|
|
// Get all V for layer up to current position
|
|
const float* value_seq(int layer) const {
|
|
size_t v_offset = n_kv_heads_ * max_seq_len_ * head_dim_;
|
|
return data_.data() + layer_offset(layer) + v_offset;
|
|
}
|
|
|
|
void advance() { ++pos_; }
|
|
int position() const { return pos_; }
|
|
int max_seq_len() const { return max_seq_len_; }
|
|
int n_kv_heads() const { return n_kv_heads_; }
|
|
int head_dim() const { return head_dim_; }
|
|
|
|
private:
|
|
std::vector<float> data_;
|
|
int n_layers_ = 0;
|
|
int n_kv_heads_ = 0;
|
|
int head_dim_ = 0;
|
|
int max_seq_len_ = 0;
|
|
int pos_ = 0;
|
|
|
|
size_t layer_offset(int layer) const {
|
|
return layer * 2 * n_kv_heads_ * max_seq_len_ * head_dim_;
|
|
}
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// ATTENTION — Grouped Query Attention (GQA)
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
class Attention {
|
|
public:
|
|
void init(const Config& cfg) {
|
|
n_heads_ = cfg.n_heads;
|
|
n_kv_heads_ = cfg.n_kv_heads;
|
|
head_dim_ = cfg.head_dim;
|
|
dim_ = cfg.dim;
|
|
|
|
// GQA: heads_per_kv_head
|
|
gqa_ratio_ = n_heads_ / n_kv_heads_;
|
|
|
|
// Scratch buffers
|
|
q_.resize(n_heads_ * head_dim_);
|
|
k_.resize(n_kv_heads_ * head_dim_);
|
|
v_.resize(n_kv_heads_ * head_dim_);
|
|
attn_out_.resize(n_heads_ * head_dim_);
|
|
scores_.resize(n_heads_ * cfg.max_seq_len);
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// FORWARD PASS
|
|
// Input: x[dim], Output: out[dim]
|
|
// Weights: wq[dim, dim], wk[dim, kv_dim], wv[dim, kv_dim], wo[dim, dim]
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
void forward(
|
|
float* out,
|
|
const float* x,
|
|
const void* wq, dtype tq,
|
|
const void* wk, dtype tk,
|
|
const void* wv, dtype tv,
|
|
const void* wo, dtype to,
|
|
KVCache& kv,
|
|
kernel::RoPE& rope,
|
|
int layer,
|
|
const float* bq = nullptr,
|
|
const float* bk = nullptr,
|
|
const float* bv = nullptr
|
|
) {
|
|
int pos = kv.position();
|
|
int seq_len = pos + 1;
|
|
int kv_dim = n_kv_heads_ * head_dim_;
|
|
|
|
// Q, K, V projections
|
|
gemm::matmul(q_.data(), wq, tq, x, n_heads_ * head_dim_, dim_);
|
|
gemm::matmul(k_.data(), wk, tk, x, kv_dim, dim_);
|
|
gemm::matmul(v_.data(), wv, tv, x, kv_dim, dim_);
|
|
|
|
// Apply QKV bias (Qwen2)
|
|
if (bq) for (int i = 0; i < n_heads_ * head_dim_; ++i) q_[i] += bq[i];
|
|
if (bk) for (int i = 0; i < kv_dim; ++i) k_[i] += bk[i];
|
|
if (bv) for (int i = 0; i < kv_dim; ++i) v_[i] += bv[i];
|
|
|
|
// Apply RoPE to Q and K
|
|
rope.apply(q_.data(), k_.data(), pos, n_heads_, n_kv_heads_);
|
|
|
|
// Store K, V in cache
|
|
float* k_cache = kv.key(layer, pos);
|
|
float* v_cache = kv.value(layer, pos);
|
|
kernel::vec_copy(k_cache, k_.data(), kv_dim);
|
|
kernel::vec_copy(v_cache, v_.data(), kv_dim);
|
|
|
|
// Attention for each head
|
|
float scale = 1.0f / std::sqrt(static_cast<float>(head_dim_));
|
|
|
|
#pragma omp parallel for
|
|
for (int h = 0; h < n_heads_; ++h) {
|
|
int kv_h = h / gqa_ratio_; // GQA: which KV head to use
|
|
|
|
const float* qh = q_.data() + h * head_dim_;
|
|
float* sh = scores_.data() + h * seq_len;
|
|
float* oh = attn_out_.data() + h * head_dim_;
|
|
|
|
// Compute attention scores: Q @ K^T
|
|
for (int t = 0; t < seq_len; ++t) {
|
|
const float* kh = kv.key_seq(layer) + t * n_kv_heads_ * head_dim_ + kv_h * head_dim_;
|
|
|
|
float score = 0.0f;
|
|
for (int d = 0; d < head_dim_; ++d) {
|
|
score += qh[d] * kh[d];
|
|
}
|
|
sh[t] = score * scale;
|
|
}
|
|
|
|
// Causal mask: -inf for future positions (not needed, seq_len is already limited)
|
|
|
|
// Softmax
|
|
kernel::softmax(sh, seq_len);
|
|
|
|
// Weighted sum of V
|
|
kernel::vec_zero(oh, head_dim_);
|
|
for (int t = 0; t < seq_len; ++t) {
|
|
const float* vh = kv.value_seq(layer) + t * n_kv_heads_ * head_dim_ + kv_h * head_dim_;
|
|
float w = sh[t];
|
|
|
|
for (int d = 0; d < head_dim_; ++d) {
|
|
oh[d] += w * vh[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
// Output projection
|
|
gemm::matmul(out, wo, to, attn_out_.data(), dim_, n_heads_ * head_dim_);
|
|
}
|
|
|
|
private:
|
|
int n_heads_ = 32;
|
|
int n_kv_heads_ = 8;
|
|
int head_dim_ = 128;
|
|
int dim_ = 4096;
|
|
int gqa_ratio_ = 4;
|
|
|
|
std::vector<float> q_;
|
|
std::vector<float> k_;
|
|
std::vector<float> v_;
|
|
std::vector<float> attn_out_;
|
|
std::vector<float> scores_;
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// FFN — SwiGLU Feed-Forward Network
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
class FFN {
|
|
public:
|
|
void init(const Config& cfg) {
|
|
dim_ = cfg.dim;
|
|
intermediate_ = cfg.intermediate;
|
|
activation_ = cfg.activation;
|
|
|
|
gate_.resize(intermediate_);
|
|
up_.resize(intermediate_);
|
|
}
|
|
|
|
// FFN(x) = down(act(gate(x)) * up(x))
|
|
void forward(
|
|
float* out,
|
|
const float* x,
|
|
const void* w_gate, dtype t_gate,
|
|
const void* w_up, dtype t_up,
|
|
const void* w_down, dtype t_down,
|
|
Activation act = Activation::SILU
|
|
) {
|
|
// Gate and Up projections
|
|
gemm::matmul(gate_.data(), w_gate, t_gate, x, intermediate_, dim_);
|
|
gemm::matmul(up_.data(), w_up, t_up, x, intermediate_, dim_);
|
|
|
|
// Activation(gate) * up — supports all model families
|
|
switch (act) {
|
|
case Activation::GELU:
|
|
kernel::gelu(gate_.data(), intermediate_);
|
|
break;
|
|
case Activation::GELU_QUICK: {
|
|
// GELU_QUICK: x * sigmoid(1.702 * x)
|
|
for (int i = 0; i < intermediate_; ++i) {
|
|
float sig = 1.0f / (1.0f + std::exp(-1.702f * gate_[i]));
|
|
gate_[i] *= sig;
|
|
}
|
|
break;
|
|
}
|
|
case Activation::RELU_SQ:
|
|
for (int i = 0; i < intermediate_; ++i) {
|
|
gate_[i] = std::max(0.0f, gate_[i]);
|
|
gate_[i] *= gate_[i]; // ReLU²
|
|
}
|
|
break;
|
|
default: // SILU
|
|
kernel::silu(gate_.data(), intermediate_);
|
|
break;
|
|
}
|
|
kernel::vec_mul(gate_.data(), gate_.data(), up_.data(), intermediate_);
|
|
|
|
// Down projection
|
|
gemm::matmul(out, w_down, t_down, gate_.data(), dim_, intermediate_);
|
|
}
|
|
|
|
private:
|
|
int dim_ = 4096;
|
|
int intermediate_ = 11008;
|
|
Activation activation_ = Activation::SILU;
|
|
|
|
std::vector<float> gate_;
|
|
std::vector<float> up_;
|
|
};
|
|
|
|
} // namespace ix
|