inference-x/runtime/attention.h
Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
Better output from the same model. Fused computation, adaptive precision,
surgical expert loading. 305 KB, 19 backends, zero dependencies.

https://inference-x.com
2026-02-23 07:10:47 +00:00

291 lines
11 KiB
C++

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Attention Mechanisms
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: github.com/ElmadaniS/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X Attention — Salka Elmadani — Morocco
#define IX_ATTENTION_SIGNATURE 0x935
#define IX_ATTENTION_MARK "Inference-X-Attention-935-Elmadani"
#include "../core/z_core.h"
#include "kernels.h"
#include "gemm.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>
namespace ix {
// ═══════════════════════════════════════════════════════════════════════════════
// KV CACHE — Pre-allocated, O(n) memory
// ═══════════════════════════════════════════════════════════════════════════════
class KVCache {
public:
    KVCache() = default;

    /// Pre-allocate the full cache.
    /// Layout: [n_layers][2 (K then V)][max_seq_len][n_kv_heads * head_dim].
    ///
    /// Fix vs. original: all size/offset arithmetic is widened to size_t
    /// BEFORE multiplying — the original all-int products (e.g.
    /// 2 * n_kv_heads * max_seq_len * head_dim) overflow 32-bit int for
    /// large contexts. Also uses assign() instead of resize(): resize()
    /// keeps old element values on a re-init(), so a second init() would
    /// leave stale K/V data in the buffer.
    void init(const Config& cfg) {
        n_layers_ = cfg.n_layers;
        n_kv_heads_ = cfg.n_kv_heads;
        head_dim_ = cfg.head_dim;
        max_seq_len_ = cfg.max_seq_len;
        const size_t layer_size = size_t{2} * value_offset();
        data_.assign(static_cast<size_t>(n_layers_) * layer_size, 0.0f);
        pos_ = 0;
    }

    /// Reset to an empty cache without releasing memory.
    void clear() {
        pos_ = 0;
        std::fill(data_.begin(), data_.end(), 0.0f);
    }

    /// Writable K slot for `layer` at timestep `pos`
    /// (n_kv_heads * head_dim floats).
    float* key(int layer, int pos) {
        return data_.data() + layer_offset(layer) + static_cast<size_t>(pos) * row_stride();
    }

    /// Writable V slot for `layer` at timestep `pos`.
    float* value(int layer, int pos) {
        return data_.data() + layer_offset(layer) + value_offset() + static_cast<size_t>(pos) * row_stride();
    }

    /// Start of the K sequence for `layer`; rows [0, position()] are valid.
    const float* key_seq(int layer) const {
        return data_.data() + layer_offset(layer);
    }

    /// Start of the V sequence for `layer`; rows [0, position()] are valid.
    const float* value_seq(int layer) const {
        return data_.data() + layer_offset(layer) + value_offset();
    }

    // Caller advances the write position once per decoded token.
    // No bounds check: callers must never exceed max_seq_len().
    void advance() { ++pos_; }
    int position() const { return pos_; }
    int max_seq_len() const { return max_seq_len_; }
    int n_kv_heads() const { return n_kv_heads_; }
    int head_dim() const { return head_dim_; }

private:
    std::vector<float> data_;
    int n_layers_ = 0;
    int n_kv_heads_ = 0;
    int head_dim_ = 0;
    int max_seq_len_ = 0;
    int pos_ = 0;

    // One timestep row (all KV heads), in floats.
    size_t row_stride() const {
        return static_cast<size_t>(n_kv_heads_) * static_cast<size_t>(head_dim_);
    }
    // Distance in floats from a layer's K block to its V block.
    size_t value_offset() const {
        return row_stride() * static_cast<size_t>(max_seq_len_);
    }
    // Start of `layer`'s K block within data_.
    size_t layer_offset(int layer) const {
        return static_cast<size_t>(layer) * 2 * value_offset();
    }
};
// ═══════════════════════════════════════════════════════════════════════════════
// ATTENTION — Grouped Query Attention (GQA)
// ═══════════════════════════════════════════════════════════════════════════════
// Single-token (autoregressive decode) grouped-query attention.
// Each forward() call processes exactly one new token at kv.position():
// it projects Q/K/V, rotates Q/K with RoPE, appends K/V to the cache, then
// attends over the cached prefix [0, pos]. Prompt prefill must feed tokens
// through this path one at a time.
class Attention {
public:
// Size scratch buffers from the model config.
// NOTE(review): gqa_ratio_ is integer division — assumes n_heads is an
// exact multiple of n_kv_heads (true for standard GQA configs); confirm
// the config loader validates this upstream.
void init(const Config& cfg) {
n_heads_ = cfg.n_heads;
n_kv_heads_ = cfg.n_kv_heads;
head_dim_ = cfg.head_dim;
dim_ = cfg.dim;
// GQA: heads_per_kv_head
gqa_ratio_ = n_heads_ / n_kv_heads_;
// Scratch buffers, sized for the worst case (seq_len == max_seq_len)
q_.resize(n_heads_ * head_dim_);
k_.resize(n_kv_heads_ * head_dim_);
v_.resize(n_kv_heads_ * head_dim_);
attn_out_.resize(n_heads_ * head_dim_);
scores_.resize(n_heads_ * cfg.max_seq_len);
}
// ═══════════════════════════════════════════════════════════════════════════
// FORWARD PASS
// Input: x[dim], Output: out[dim]
// Weights: wq[dim, dim], wk[dim, kv_dim], wv[dim, kv_dim], wo[dim, dim]
// bq/bk/bv are optional QKV biases (Qwen2-style checkpoints).
// Does NOT call kv.advance(): the caller advances the cache position once
// after every layer has processed the current token.
// ═══════════════════════════════════════════════════════════════════════════
void forward(
float* out,
const float* x,
const void* wq, dtype tq,
const void* wk, dtype tk,
const void* wv, dtype tv,
const void* wo, dtype to,
KVCache& kv,
kernel::RoPE& rope,
int layer,
const float* bq = nullptr,
const float* bk = nullptr,
const float* bv = nullptr
) {
int pos = kv.position();
int seq_len = pos + 1;
int kv_dim = n_kv_heads_ * head_dim_;
// Q, K, V projections
gemm::matmul(q_.data(), wq, tq, x, n_heads_ * head_dim_, dim_);
gemm::matmul(k_.data(), wk, tk, x, kv_dim, dim_);
gemm::matmul(v_.data(), wv, tv, x, kv_dim, dim_);
// Apply QKV bias (Qwen2)
if (bq) for (int i = 0; i < n_heads_ * head_dim_; ++i) q_[i] += bq[i];
if (bk) for (int i = 0; i < kv_dim; ++i) k_[i] += bk[i];
if (bv) for (int i = 0; i < kv_dim; ++i) v_[i] += bv[i];
// Apply RoPE to Q and K. K is rotated BEFORE caching, so cached keys
// never need re-rotation on later steps.
rope.apply(q_.data(), k_.data(), pos, n_heads_, n_kv_heads_);
// Store K, V in cache at the current position
float* k_cache = kv.key(layer, pos);
float* v_cache = kv.value(layer, pos);
kernel::vec_copy(k_cache, k_.data(), kv_dim);
kernel::vec_copy(v_cache, v_.data(), kv_dim);
// Attention per head. Heads are independent: each thread writes a
// disjoint slice of scores_ and attn_out_, so the omp loop is race-free.
float scale = 1.0f / std::sqrt(static_cast<float>(head_dim_));
#pragma omp parallel for
for (int h = 0; h < n_heads_; ++h) {
int kv_h = h / gqa_ratio_; // GQA: which KV head to use
const float* qh = q_.data() + h * head_dim_;
// Score row for this head, packed with stride seq_len (not max_seq_len);
// fits because (h + 1) * seq_len <= n_heads * max_seq_len.
float* sh = scores_.data() + h * seq_len;
float* oh = attn_out_.data() + h * head_dim_;
// Compute attention scores: Q @ K^T over the cached prefix
for (int t = 0; t < seq_len; ++t) {
const float* kh = kv.key_seq(layer) + t * n_kv_heads_ * head_dim_ + kv_h * head_dim_;
float score = 0.0f;
for (int d = 0; d < head_dim_; ++d) {
score += qh[d] * kh[d];
}
sh[t] = score * scale;
}
// Causal mask is implicit: only positions [0, pos] exist in the loop above.
// Softmax over the score row
kernel::softmax(sh, seq_len);
// Weighted sum of V
kernel::vec_zero(oh, head_dim_);
for (int t = 0; t < seq_len; ++t) {
const float* vh = kv.value_seq(layer) + t * n_kv_heads_ * head_dim_ + kv_h * head_dim_;
float w = sh[t];
for (int d = 0; d < head_dim_; ++d) {
oh[d] += w * vh[d];
}
}
}
// Output projection: concat(heads) @ wo
gemm::matmul(out, wo, to, attn_out_.data(), dim_, n_heads_ * head_dim_);
}
private:
int n_heads_ = 32;
int n_kv_heads_ = 8;
int head_dim_ = 128;
int dim_ = 4096;
int gqa_ratio_ = 4; // n_heads_ / n_kv_heads_
std::vector<float> q_; // [n_heads * head_dim] query scratch
std::vector<float> k_; // [n_kv_heads * head_dim] key scratch
std::vector<float> v_; // [n_kv_heads * head_dim] value scratch
std::vector<float> attn_out_; // [n_heads * head_dim] per-head outputs
std::vector<float> scores_; // [n_heads * max_seq_len] score rows
};
// ═══════════════════════════════════════════════════════════════════════════════
// FFN — SwiGLU Feed-Forward Network
// ═══════════════════════════════════════════════════════════════════════════════
// Gated feed-forward block: out = down( act(gate(x)) ⊙ up(x) ).
class FFN {
public:
    /// Capture dimensions from the model config and size scratch buffers.
    void init(const Config& cfg) {
        dim_ = cfg.dim;
        intermediate_ = cfg.intermediate;
        activation_ = cfg.activation;
        gate_.resize(intermediate_);
        up_.resize(intermediate_);
    }

    /// Apply the feed-forward network to x[dim_], writing out[dim_].
    /// NOTE(review): activation_ (stored from cfg in init) is never consulted
    /// here — the nonlinearity actually applied is the `act` argument, which
    /// defaults to SILU. Confirm callers pass cfg.activation explicitly.
    void forward(
        float* out,
        const float* x,
        const void* w_gate, dtype t_gate,
        const void* w_up, dtype t_up,
        const void* w_down, dtype t_down,
        Activation act = Activation::SILU
    ) {
        // Two parallel projections into the intermediate dimension.
        gemm::matmul(gate_.data(), w_gate, t_gate, x, intermediate_, dim_);
        gemm::matmul(up_.data(), w_up, t_up, x, intermediate_, dim_);
        // In-place nonlinearity on the gate branch; one case per model family.
        if (act == Activation::GELU) {
            kernel::gelu(gate_.data(), intermediate_);
        } else if (act == Activation::GELU_QUICK) {
            // Quick GELU approximation: x * sigmoid(1.702 * x).
            for (int idx = 0; idx < intermediate_; ++idx) {
                const float s = 1.0f / (1.0f + std::exp(-1.702f * gate_[idx]));
                gate_[idx] *= s;
            }
        } else if (act == Activation::RELU_SQ) {
            // Squared ReLU: max(0, x)^2.
            for (int idx = 0; idx < intermediate_; ++idx) {
                const float r = std::max(0.0f, gate_[idx]);
                gate_[idx] = r * r;
            }
        } else {
            // Default (Llama-style): SiLU.
            kernel::silu(gate_.data(), intermediate_);
        }
        // Elementwise gate ⊙ up, then project back to the model dimension.
        kernel::vec_mul(gate_.data(), gate_.data(), up_.data(), intermediate_);
        gemm::matmul(out, w_down, t_down, gate_.data(), dim_, intermediate_);
    }

private:
    int dim_ = 4096;
    int intermediate_ = 11008;
    Activation activation_ = Activation::SILU;
    std::vector<float> gate_;
    std::vector<float> up_;
};
} // namespace ix