Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
291 lines · 11 KiB · C++
// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Attention Mechanisms
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: github.com/ElmadaniS/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
#pragma once
|
|
|
|
// Inference-X Attention — Salka Elmadani — Morocco
|
|
#define IX_ATTENTION_SIGNATURE 0x935
|
|
#define IX_ATTENTION_MARK "Inference-X-Attention-935-Elmadani"
|
|
|
|
|
|
#include "../core/z_core.h"
#include "kernels.h"
#include "gemm.h"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>
|
|
|
namespace ix {
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// KV CACHE — Pre-allocated, O(n) memory
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
class KVCache {
|
|
public:
|
|
KVCache() = default;
|
|
|
|
void init(const Config& cfg) {
|
|
n_layers_ = cfg.n_layers;
|
|
n_kv_heads_ = cfg.n_kv_heads;
|
|
head_dim_ = cfg.head_dim;
|
|
max_seq_len_ = cfg.max_seq_len;
|
|
|
|
// Allocate: [n_layers, 2 (K+V), n_kv_heads, max_seq_len, head_dim]
|
|
size_t layer_size = 2 * n_kv_heads_ * max_seq_len_ * head_dim_;
|
|
size_t total_size = n_layers_ * layer_size;
|
|
|
|
data_.resize(total_size, 0.0f);
|
|
pos_ = 0;
|
|
}
|
|
|
|
void clear() {
|
|
pos_ = 0;
|
|
std::fill(data_.begin(), data_.end(), 0.0f);
|
|
}
|
|
|
|
// Get K cache for layer at position
|
|
float* key(int layer, int pos) {
|
|
return data_.data() + layer_offset(layer) + pos * n_kv_heads_ * head_dim_;
|
|
}
|
|
|
|
// Get V cache for layer at position
|
|
float* value(int layer, int pos) {
|
|
size_t v_offset = n_kv_heads_ * max_seq_len_ * head_dim_;
|
|
return data_.data() + layer_offset(layer) + v_offset + pos * n_kv_heads_ * head_dim_;
|
|
}
|
|
|
|
// Get all K for layer up to current position
|
|
const float* key_seq(int layer) const {
|
|
return data_.data() + layer_offset(layer);
|
|
}
|
|
|
|
// Get all V for layer up to current position
|
|
const float* value_seq(int layer) const {
|
|
size_t v_offset = n_kv_heads_ * max_seq_len_ * head_dim_;
|
|
return data_.data() + layer_offset(layer) + v_offset;
|
|
}
|
|
|
|
void advance() { ++pos_; }
|
|
int position() const { return pos_; }
|
|
int max_seq_len() const { return max_seq_len_; }
|
|
int n_kv_heads() const { return n_kv_heads_; }
|
|
int head_dim() const { return head_dim_; }
|
|
|
|
private:
|
|
std::vector<float> data_;
|
|
int n_layers_ = 0;
|
|
int n_kv_heads_ = 0;
|
|
int head_dim_ = 0;
|
|
int max_seq_len_ = 0;
|
|
int pos_ = 0;
|
|
|
|
size_t layer_offset(int layer) const {
|
|
return layer * 2 * n_kv_heads_ * max_seq_len_ * head_dim_;
|
|
}
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// ATTENTION — Grouped Query Attention (GQA)
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
class Attention {
|
|
public:
|
|
void init(const Config& cfg) {
|
|
n_heads_ = cfg.n_heads;
|
|
n_kv_heads_ = cfg.n_kv_heads;
|
|
head_dim_ = cfg.head_dim;
|
|
dim_ = cfg.dim;
|
|
|
|
// GQA: heads_per_kv_head
|
|
gqa_ratio_ = n_heads_ / n_kv_heads_;
|
|
|
|
// Scratch buffers
|
|
q_.resize(n_heads_ * head_dim_);
|
|
k_.resize(n_kv_heads_ * head_dim_);
|
|
v_.resize(n_kv_heads_ * head_dim_);
|
|
attn_out_.resize(n_heads_ * head_dim_);
|
|
scores_.resize(n_heads_ * cfg.max_seq_len);
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// FORWARD PASS
|
|
// Input: x[dim], Output: out[dim]
|
|
// Weights: wq[dim, dim], wk[dim, kv_dim], wv[dim, kv_dim], wo[dim, dim]
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
void forward(
|
|
float* out,
|
|
const float* x,
|
|
const void* wq, dtype tq,
|
|
const void* wk, dtype tk,
|
|
const void* wv, dtype tv,
|
|
const void* wo, dtype to,
|
|
KVCache& kv,
|
|
kernel::RoPE& rope,
|
|
int layer,
|
|
const float* bq = nullptr,
|
|
const float* bk = nullptr,
|
|
const float* bv = nullptr
|
|
) {
|
|
int pos = kv.position();
|
|
int seq_len = pos + 1;
|
|
int kv_dim = n_kv_heads_ * head_dim_;
|
|
|
|
// Q, K, V projections
|
|
gemm::matmul(q_.data(), wq, tq, x, n_heads_ * head_dim_, dim_);
|
|
gemm::matmul(k_.data(), wk, tk, x, kv_dim, dim_);
|
|
gemm::matmul(v_.data(), wv, tv, x, kv_dim, dim_);
|
|
|
|
// Apply QKV bias (Qwen2)
|
|
if (bq) for (int i = 0; i < n_heads_ * head_dim_; ++i) q_[i] += bq[i];
|
|
if (bk) for (int i = 0; i < kv_dim; ++i) k_[i] += bk[i];
|
|
if (bv) for (int i = 0; i < kv_dim; ++i) v_[i] += bv[i];
|
|
|
|
// Apply RoPE to Q and K
|
|
rope.apply(q_.data(), k_.data(), pos, n_heads_, n_kv_heads_);
|
|
|
|
// Store K, V in cache
|
|
float* k_cache = kv.key(layer, pos);
|
|
float* v_cache = kv.value(layer, pos);
|
|
kernel::vec_copy(k_cache, k_.data(), kv_dim);
|
|
kernel::vec_copy(v_cache, v_.data(), kv_dim);
|
|
|
|
// Attention for each head
|
|
float scale = 1.0f / std::sqrt(static_cast<float>(head_dim_));
|
|
|
|
#pragma omp parallel for
|
|
for (int h = 0; h < n_heads_; ++h) {
|
|
int kv_h = h / gqa_ratio_; // GQA: which KV head to use
|
|
|
|
const float* qh = q_.data() + h * head_dim_;
|
|
float* sh = scores_.data() + h * seq_len;
|
|
float* oh = attn_out_.data() + h * head_dim_;
|
|
|
|
// Compute attention scores: Q @ K^T
|
|
for (int t = 0; t < seq_len; ++t) {
|
|
const float* kh = kv.key_seq(layer) + t * n_kv_heads_ * head_dim_ + kv_h * head_dim_;
|
|
|
|
float score = 0.0f;
|
|
for (int d = 0; d < head_dim_; ++d) {
|
|
score += qh[d] * kh[d];
|
|
}
|
|
sh[t] = score * scale;
|
|
}
|
|
|
|
// Causal mask: -inf for future positions (not needed, seq_len is already limited)
|
|
|
|
// Softmax
|
|
kernel::softmax(sh, seq_len);
|
|
|
|
// Weighted sum of V
|
|
kernel::vec_zero(oh, head_dim_);
|
|
for (int t = 0; t < seq_len; ++t) {
|
|
const float* vh = kv.value_seq(layer) + t * n_kv_heads_ * head_dim_ + kv_h * head_dim_;
|
|
float w = sh[t];
|
|
|
|
for (int d = 0; d < head_dim_; ++d) {
|
|
oh[d] += w * vh[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
// Output projection
|
|
gemm::matmul(out, wo, to, attn_out_.data(), dim_, n_heads_ * head_dim_);
|
|
}
|
|
|
|
private:
|
|
int n_heads_ = 32;
|
|
int n_kv_heads_ = 8;
|
|
int head_dim_ = 128;
|
|
int dim_ = 4096;
|
|
int gqa_ratio_ = 4;
|
|
|
|
std::vector<float> q_;
|
|
std::vector<float> k_;
|
|
std::vector<float> v_;
|
|
std::vector<float> attn_out_;
|
|
std::vector<float> scores_;
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// FFN — SwiGLU Feed-Forward Network
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
class FFN {
|
|
public:
|
|
void init(const Config& cfg) {
|
|
dim_ = cfg.dim;
|
|
intermediate_ = cfg.intermediate;
|
|
activation_ = cfg.activation;
|
|
|
|
gate_.resize(intermediate_);
|
|
up_.resize(intermediate_);
|
|
}
|
|
|
|
// FFN(x) = down(act(gate(x)) * up(x))
|
|
void forward(
|
|
float* out,
|
|
const float* x,
|
|
const void* w_gate, dtype t_gate,
|
|
const void* w_up, dtype t_up,
|
|
const void* w_down, dtype t_down,
|
|
Activation act = Activation::SILU
|
|
) {
|
|
// Gate and Up projections
|
|
gemm::matmul(gate_.data(), w_gate, t_gate, x, intermediate_, dim_);
|
|
gemm::matmul(up_.data(), w_up, t_up, x, intermediate_, dim_);
|
|
|
|
// Activation(gate) * up — supports all model families
|
|
switch (act) {
|
|
case Activation::GELU:
|
|
kernel::gelu(gate_.data(), intermediate_);
|
|
break;
|
|
case Activation::GELU_QUICK: {
|
|
// GELU_QUICK: x * sigmoid(1.702 * x)
|
|
for (int i = 0; i < intermediate_; ++i) {
|
|
float sig = 1.0f / (1.0f + std::exp(-1.702f * gate_[i]));
|
|
gate_[i] *= sig;
|
|
}
|
|
break;
|
|
}
|
|
case Activation::RELU_SQ:
|
|
for (int i = 0; i < intermediate_; ++i) {
|
|
gate_[i] = std::max(0.0f, gate_[i]);
|
|
gate_[i] *= gate_[i]; // ReLU²
|
|
}
|
|
break;
|
|
default: // SILU
|
|
kernel::silu(gate_.data(), intermediate_);
|
|
break;
|
|
}
|
|
kernel::vec_mul(gate_.data(), gate_.data(), up_.data(), intermediate_);
|
|
|
|
// Down projection
|
|
gemm::matmul(out, w_down, t_down, gate_.data(), dim_, intermediate_);
|
|
}
|
|
|
|
private:
|
|
int dim_ = 4096;
|
|
int intermediate_ = 11008;
|
|
Activation activation_ = Activation::SILU;
|
|
|
|
std::vector<float> gate_;
|
|
std::vector<float> up_;
|
|
};
|
|
|
|
} // namespace ix
|