// ═══════════════════════════════════════════════════════════════════════════════ // INFERENCE-X — Attention Mechanisms // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // Licensed under the Business Source License 1.1 (BSL-1.1) // See LICENSE file for full terms. // // INTELLECTUAL PROPERTY PROTECTION: // - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026) // - GitHub: github.com/ElmadaniS/inference-x // - Author: Salka Elmadani | Morocco | Morocco // // MANUFACTURER NOTICE: Any manufacturer, company, or entity that // incorporates, embeds, distributes, or commercially uses Inference-X // or any derivative work without explicit written authorization from // the copyright holder is in violation of BSL-1.1 and applicable // intellectual property laws. This includes but is not limited to: // hardware vendors, cloud providers, SaaS platforms, and OEMs. // // Contact: Elmadani.SALKA@proton.me for licensing. // ═══════════════════════════════════════════════════════════════════════════════ #pragma once // Inference-X Attention — Salka Elmadani — Morocco #define IX_ATTENTION_SIGNATURE 0x935 #define IX_ATTENTION_MARK "Inference-X-Attention-935-Elmadani" #include "../core/z_core.h" #include "kernels.h" #include "gemm.h" #include #include namespace ix { // ═══════════════════════════════════════════════════════════════════════════════ // KV CACHE — Pre-allocated, O(n) memory // ═══════════════════════════════════════════════════════════════════════════════ class KVCache { public: KVCache() = default; void init(const Config& cfg) { n_layers_ = cfg.n_layers; n_kv_heads_ = cfg.n_kv_heads; head_dim_ = cfg.head_dim; max_seq_len_ = cfg.max_seq_len; // Allocate: [n_layers, 2 (K+V), n_kv_heads, max_seq_len, head_dim] size_t layer_size = 2 * n_kv_heads_ * max_seq_len_ * head_dim_; size_t total_size = n_layers_ * layer_size; data_.resize(total_size, 0.0f); pos_ = 0; } void clear() { pos_ = 0; std::fill(data_.begin(), data_.end(), 0.0f); } // Get K cache for layer at position float* key(int layer, int pos) { return data_.data() + layer_offset(layer) + pos * n_kv_heads_ * head_dim_; } // Get V cache for layer at position float* value(int layer, int pos) { size_t v_offset = n_kv_heads_ * max_seq_len_ * head_dim_; return data_.data() + layer_offset(layer) + v_offset + pos * n_kv_heads_ * head_dim_; } // Get all K for layer up to current position const float* key_seq(int layer) const { return data_.data() + layer_offset(layer); } // Get all V for layer up to current position const float* value_seq(int layer) const { size_t v_offset = n_kv_heads_ * max_seq_len_ * head_dim_; return data_.data() + layer_offset(layer) + v_offset; } void advance() { ++pos_; } int position() const { return pos_; } int max_seq_len() const { return max_seq_len_; } int n_kv_heads() const { return n_kv_heads_; } int head_dim() const { return head_dim_; } private: std::vector data_; int n_layers_ = 0; int n_kv_heads_ = 0; int head_dim_ = 0; int max_seq_len_ = 0; int pos_ = 0; size_t layer_offset(int layer) const { return layer * 2 * n_kv_heads_ * max_seq_len_ * head_dim_; } }; // ═══════════════════════════════════════════════════════════════════════════════ // ATTENTION — Grouped Query Attention (GQA) // ═══════════════════════════════════════════════════════════════════════════════ class Attention { public: void init(const Config& cfg) { n_heads_ = cfg.n_heads; n_kv_heads_ = cfg.n_kv_heads; head_dim_ = cfg.head_dim; dim_ = cfg.dim; // GQA: heads_per_kv_head gqa_ratio_ = n_heads_ / n_kv_heads_; // Scratch buffers q_.resize(n_heads_ * head_dim_); k_.resize(n_kv_heads_ * head_dim_); v_.resize(n_kv_heads_ * head_dim_); attn_out_.resize(n_heads_ * head_dim_); scores_.resize(n_heads_ * cfg.max_seq_len); } // ═══════════════════════════════════════════════════════════════════════════ // FORWARD PASS // Input: x[dim], Output: out[dim] // Weights: wq[dim, dim], wk[dim, kv_dim], wv[dim, kv_dim], wo[dim, dim] // ═══════════════════════════════════════════════════════════════════════════ void forward( float* out, const float* x, const void* wq, dtype tq, const void* wk, dtype tk, const void* wv, dtype tv, const void* wo, dtype to, KVCache& kv, kernel::RoPE& rope, int layer, const float* bq = nullptr, const float* bk = nullptr, const float* bv = nullptr ) { int pos = kv.position(); int seq_len = pos + 1; int kv_dim = n_kv_heads_ * head_dim_; // Q, K, V projections gemm::matmul(q_.data(), wq, tq, x, n_heads_ * head_dim_, dim_); gemm::matmul(k_.data(), wk, tk, x, kv_dim, dim_); gemm::matmul(v_.data(), wv, tv, x, kv_dim, dim_); // Apply QKV bias (Qwen2) if (bq) for (int i = 0; i < n_heads_ * head_dim_; ++i) q_[i] += bq[i]; if (bk) for (int i = 0; i < kv_dim; ++i) k_[i] += bk[i]; if (bv) for (int i = 0; i < kv_dim; ++i) v_[i] += bv[i]; // Apply RoPE to Q and K rope.apply(q_.data(), k_.data(), pos, n_heads_, n_kv_heads_); // Store K, V in cache float* k_cache = kv.key(layer, pos); float* v_cache = kv.value(layer, pos); kernel::vec_copy(k_cache, k_.data(), kv_dim); kernel::vec_copy(v_cache, v_.data(), kv_dim); // Attention for each head float scale = 1.0f / std::sqrt(static_cast(head_dim_)); #pragma omp parallel for for (int h = 0; h < n_heads_; ++h) { int kv_h = h / gqa_ratio_; // GQA: which KV head to use const float* qh = q_.data() + h * head_dim_; float* sh = scores_.data() + h * seq_len; float* oh = attn_out_.data() + h * head_dim_; // Compute attention scores: Q @ K^T for (int t = 0; t < seq_len; ++t) { const float* kh = kv.key_seq(layer) + t * n_kv_heads_ * head_dim_ + kv_h * head_dim_; float score = 0.0f; for (int d = 0; d < head_dim_; ++d) { score += qh[d] * kh[d]; } sh[t] = score * scale; } // Causal mask: -inf for future positions (not needed, seq_len is already limited) // Softmax kernel::softmax(sh, seq_len); // Weighted sum of V kernel::vec_zero(oh, head_dim_); for (int t = 0; t < seq_len; ++t) { const float* vh = kv.value_seq(layer) + t * n_kv_heads_ * head_dim_ + kv_h * head_dim_; float w = sh[t]; for (int d = 0; d < head_dim_; ++d) { oh[d] += w * vh[d]; } } } // Output projection gemm::matmul(out, wo, to, attn_out_.data(), dim_, n_heads_ * head_dim_); } private: int n_heads_ = 32; int n_kv_heads_ = 8; int head_dim_ = 128; int dim_ = 4096; int gqa_ratio_ = 4; std::vector q_; std::vector k_; std::vector v_; std::vector attn_out_; std::vector scores_; }; // ═══════════════════════════════════════════════════════════════════════════════ // FFN — SwiGLU Feed-Forward Network // ═══════════════════════════════════════════════════════════════════════════════ class FFN { public: void init(const Config& cfg) { dim_ = cfg.dim; intermediate_ = cfg.intermediate; activation_ = cfg.activation; gate_.resize(intermediate_); up_.resize(intermediate_); } // FFN(x) = down(act(gate(x)) * up(x)) void forward( float* out, const float* x, const void* w_gate, dtype t_gate, const void* w_up, dtype t_up, const void* w_down, dtype t_down, Activation act = Activation::SILU ) { // Gate and Up projections gemm::matmul(gate_.data(), w_gate, t_gate, x, intermediate_, dim_); gemm::matmul(up_.data(), w_up, t_up, x, intermediate_, dim_); // Activation(gate) * up — supports all model families switch (act) { case Activation::GELU: kernel::gelu(gate_.data(), intermediate_); break; case Activation::GELU_QUICK: { // GELU_QUICK: x * sigmoid(1.702 * x) for (int i = 0; i < intermediate_; ++i) { float sig = 1.0f / (1.0f + std::exp(-1.702f * gate_[i])); gate_[i] *= sig; } break; } case Activation::RELU_SQ: for (int i = 0; i < intermediate_; ++i) { gate_[i] = std::max(0.0f, gate_[i]); gate_[i] *= gate_[i]; // ReLU² } break; default: // SILU kernel::silu(gate_.data(), intermediate_); break; } kernel::vec_mul(gate_.data(), gate_.data(), up_.data(), intermediate_); // Down projection gemm::matmul(out, w_down, t_down, gate_.data(), dim_, intermediate_); } private: int dim_ = 4096; int intermediate_ = 11008; Activation activation_ = Activation::SILU; std::vector gate_; std::vector up_; }; } // namespace ix