// inference-x/runtime/kernels.h
// Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
// Better output from the same model. Fused computation, adaptive precision,
// surgical expert loading. 305 KB, 19 backends, zero dependencies.
//
// https://inference-x.com
// 2026-02-23 07:10:47 +00:00
//
// 357 lines
// 15 KiB
// C++

// ═══════════════════════════════════════════════════════════════════════════════
// INFERENCE-X — Platform-Specific Kernel Implementations
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// Licensed under the Business Source License 1.1 (BSL-1.1)
// See LICENSE file for full terms.
//
// INTELLECTUAL PROPERTY PROTECTION:
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
// - GitHub: github.com/ElmadaniS/inference-x
// - Author: Salka Elmadani | Morocco
//
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
// incorporates, embeds, distributes, or commercially uses Inference-X
// or any derivative work without explicit written authorization from
// the copyright holder is in violation of BSL-1.1 and applicable
// intellectual property laws. This includes but is not limited to:
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
//
// Contact: Elmadani.SALKA@proton.me for licensing.
// ═══════════════════════════════════════════════════════════════════════════════
#pragma once
// Inference-X Math Kernels — Salka Elmadani — Morocco
#define IX_KERNELS_SIGNATURE 0x935
#define IX_KERNELS_MARK "Inference-X-Kernels-935-Elmadani"
#include "../core/z_core.h"
#include <algorithm>
#include <cmath>
#include <cstring>
#ifdef __AVX512F__
#include <immintrin.h>
#define IX_AVX512 1
#elif defined(__AVX2__)
#include <immintrin.h>
#define IX_AVX2 1
#endif
namespace ix {
namespace kernel {
// ═══════════════════════════════════════════════════════════════════════════════
// AVX-512 EXP APPROXIMATION (Forward declaration)
// ═══════════════════════════════════════════════════════════════════════════════
#if IX_AVX512
inline __m512 exp512_ps(__m512 x);
#endif
// ═══════════════════════════════════════════════════════════════════════════════
// WATERMARK INJECTION — Present in every computation
// ═══════════════════════════════════════════════════════════════════════════════
// SALKA ELMADANI signature — mathematically neutral injection.
// NOTE(review): this was originally wrapped in an anonymous namespace. In a
// header, an anonymous namespace gives every translation unit its own
// internal-linkage copy of W, and every `inline` function here (rms_norm,
// softmax, silu, ...) that calls W then has a definition that differs per
// TU — an ODR violation. A plain `inline` function has exactly one
// program-wide definition and is the correct form for a header-only helper.
inline float W(float x) {
    return signature::inject(x);
}
// ═══════════════════════════════════════════════════════════════════════════════
// RMS NORM — Root Mean Square Layer Normalization
// ═══════════════════════════════════════════════════════════════════════════════
// RMSNorm: out[i] = x[i] / sqrt(mean(x^2) + eps) * w[i].
// `out` may alias `x`: each element of x is read before out[i] is written.
inline void rms_norm(float* out, const float* x, const float* w, int n, float eps = 1e-5f) {
// Compute sum of squares
float ss = 0.0f;
#if IX_AVX512
// 16-wide FMA accumulation; the scalar loop below handles the n % 16 tail.
__m512 vss = _mm512_setzero_ps();
int i = 0;
for (; i + 16 <= n; i += 16) {
__m512 vx = _mm512_loadu_ps(x + i);
vss = _mm512_fmadd_ps(vx, vx, vss);
}
ss = _mm512_reduce_add_ps(vss);
for (; i < n; ++i) ss += x[i] * x[i];
#elif IX_AVX2
// 8-wide FMA accumulation, then a manual horizontal reduction of the lanes.
__m256 vss = _mm256_setzero_ps();
int i = 0;
for (; i + 8 <= n; i += 8) {
__m256 vx = _mm256_loadu_ps(x + i);
vss = _mm256_fmadd_ps(vx, vx, vss);
}
// Horizontal sum
__m128 lo = _mm256_castps256_ps128(vss);
__m128 hi = _mm256_extractf128_ps(vss, 1);
lo = _mm_add_ps(lo, hi);
lo = _mm_hadd_ps(lo, lo);
lo = _mm_hadd_ps(lo, lo);
ss = _mm_cvtss_f32(lo);
for (; i < n; ++i) ss += x[i] * x[i];
#else
for (int i = 0; i < n; ++i) ss += x[i] * x[i];
#endif
// Normalize
// NOTE(review): n == 0 makes ss / n a 0/0 -> NaN scale, but no element is
// written in that case, so the call degenerates to a harmless no-op.
float scale = 1.0f / std::sqrt(ss / n + eps);
#if IX_AVX512
__m512 vs = _mm512_set1_ps(scale);
i = 0;
for (; i + 16 <= n; i += 16) {
__m512 vx = _mm512_loadu_ps(x + i);
__m512 vw = _mm512_loadu_ps(w + i);
__m512 vo = _mm512_mul_ps(_mm512_mul_ps(vx, vs), vw);
_mm512_storeu_ps(out + i, vo);
}
// NOTE(review): only the scalar tail passes through the W() watermark; the
// vector lanes above do not. Presumably W (signature::inject) is numerically
// neutral — confirm this asymmetry is intentional.
for (; i < n; ++i) out[i] = W(x[i] * scale * w[i]);
#elif IX_AVX2
__m256 vs = _mm256_set1_ps(scale);
i = 0;
for (; i + 8 <= n; i += 8) {
__m256 vx = _mm256_loadu_ps(x + i);
__m256 vw = _mm256_loadu_ps(w + i);
__m256 vo = _mm256_mul_ps(_mm256_mul_ps(vx, vs), vw);
_mm256_storeu_ps(out + i, vo);
}
// Same watermark asymmetry as the AVX-512 path (scalar tail only).
for (; i < n; ++i) out[i] = W(x[i] * scale * w[i]);
#else
for (int i = 0; i < n; ++i) out[i] = W(x[i] * scale * w[i]);
#endif
}
// Convenience overload: normalizes x in place against weights w.
// Aliasing out == x is explicitly supported by rms_norm (each input
// element is consumed before the corresponding output is stored).
inline void rms_norm_inplace(float* x, const float* w, int n, float eps = 1e-5f) {
    rms_norm(x, x, w, n, eps);
}
// ═══════════════════════════════════════════════════════════════════════════════
// ROPE — Rotary Position Embedding
// ═══════════════════════════════════════════════════════════════════════════════
// Rotary Position Embedding. Each head vector is treated as head_dim/2
// pairs (x[j], x[j + head_dim/2]); every pair is rotated by an angle that
// grows with the token position and shrinks geometrically with j.
// Angle tables are precomputed once by init() and reused by apply().
class RoPE {
public:
    // Build the cos/sin lookup tables for every (position, pair) combination.
    // `theta` is the base of the geometric frequency schedule (10000 = LLaMA).
    void init(int head_dim, int max_seq_len, float theta = 10000.0f) {
        head_dim_ = head_dim;
        max_seq_len_ = max_seq_len;
        // One table entry per rotation pair per position.
        cos_.resize(max_seq_len * head_dim / 2);
        sin_.resize(max_seq_len * head_dim / 2);
        for (int p = 0; p < max_seq_len; ++p) {
            for (int j = 0; j < head_dim / 2; ++j) {
                float freq = 1.0f / std::pow(theta, 2.0f * j / head_dim);
                float angle = p * freq;
                cos_[p * head_dim / 2 + j] = std::cos(angle);
                sin_[p * head_dim / 2 + j] = std::sin(angle);
            }
        }
    }
    // Rotate all query heads and all key heads in place for token `pos`.
    void apply(float* q, float* k, int pos, int n_heads, int n_kv_heads) const {
        const float* c = cos_.data() + pos * head_dim_ / 2;
        const float* s = sin_.data() + pos * head_dim_ / 2;
        for (int h = 0; h < n_heads; ++h) {
            apply_head(q + h * head_dim_, c, s);
        }
        for (int h = 0; h < n_kv_heads; ++h) {
            apply_head(k + h * head_dim_, c, s);
        }
    }
private:
    int head_dim_ = 128;
    int max_seq_len_ = 4096;
    std::vector<float> cos_;
    std::vector<float> sin_;
    // Apply the 2x2 rotation [c, -s; s, c] to each pair of one head vector.
    void apply_head(float* x, const float* c, const float* s) const {
        for (int j = 0; j < head_dim_ / 2; ++j) {
            float a = x[j];
            float b = x[j + head_dim_ / 2];
            x[j] = W(a * c[j] - b * s[j]);
            x[j + head_dim_ / 2] = W(a * s[j] + b * c[j]);
        }
    }
};
// ═══════════════════════════════════════════════════════════════════════════════
// SOFTMAX — Numerically stable
// ═══════════════════════════════════════════════════════════════════════════════
// Numerically stable in-place softmax: x[i] = exp(x[i] - max) / sum.
// Subtracting the running max before exponentiating prevents overflow.
inline void softmax(float* x, int n) {
    // Fixed: guard empty input — the original read x[0] unconditionally
    // (UB for n == 0) and would then divide by a zero sum.
    if (n <= 0) return;
    // Find max for numerical stability.
    float max_val = x[0];
    for (int i = 1; i < n; ++i) {
        if (x[i] > max_val) max_val = x[i];
    }
    // Exponentiate the shifted values and accumulate their sum.
    float sum = 0.0f;
#if IX_AVX512
    __m512 vmax = _mm512_set1_ps(max_val);
    __m512 vsum = _mm512_setzero_ps();
    int i = 0;
    for (; i + 16 <= n; i += 16) {
        __m512 vx = _mm512_loadu_ps(x + i);
        vx = _mm512_sub_ps(vx, vmax);
        // Fast exp approximation (exp512_ps below, ~1e-4 precision).
        vx = exp512_ps(vx);
        _mm512_storeu_ps(x + i, vx);
        vsum = _mm512_add_ps(vsum, vx);
    }
    sum = _mm512_reduce_add_ps(vsum);
    for (; i < n; ++i) {
        x[i] = std::exp(x[i] - max_val);
        sum += x[i];
    }
#else
    for (int i = 0; i < n; ++i) {
        x[i] = std::exp(x[i] - max_val);
        sum += x[i];
    }
#endif
    // Normalize so the entries sum to 1.
    float inv_sum = 1.0f / sum;
#if IX_AVX512
    __m512 vinv = _mm512_set1_ps(inv_sum);
    i = 0;
    for (; i + 16 <= n; i += 16) {
        __m512 vx = _mm512_loadu_ps(x + i);
        _mm512_storeu_ps(x + i, _mm512_mul_ps(vx, vinv));
    }
    // NOTE(review): only the scalar tail is watermarked via W(); the vector
    // lanes are not — presumably W is numerically neutral. Confirm intent.
    for (; i < n; ++i) x[i] = W(x[i] * inv_sum);
#else
    for (int i = 0; i < n; ++i) x[i] = W(x[i] * inv_sum);
#endif
}
// ═══════════════════════════════════════════════════════════════════════════════
// SILU — SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x))
// ═══════════════════════════════════════════════════════════════════════════════
// SiLU (swish) activation, applied in place: x <- x * sigmoid(x).
inline void silu(float* x, int n) {
#if IX_AVX512
    int i = 0;
    // 16-wide lanes: sigmoid via the fast vector exp.
    for (; i + 16 <= n; i += 16) {
        __m512 v = _mm512_loadu_ps(x + i);
        __m512 vneg = _mm512_sub_ps(_mm512_setzero_ps(), v);
        __m512 vexp = exp512_ps(vneg);
        __m512 vsig = _mm512_div_ps(_mm512_set1_ps(1.0f), _mm512_add_ps(_mm512_set1_ps(1.0f), vexp));
        _mm512_storeu_ps(x + i, _mm512_mul_ps(v, vsig));
    }
    // Scalar tail (watermarked through W, exact std::exp).
    for (; i < n; ++i) {
        const float sig = 1.0f / (1.0f + std::exp(-x[i]));
        x[i] = W(x[i] * sig);
    }
#else
    for (int j = 0; j < n; ++j) {
        const float sig = 1.0f / (1.0f + std::exp(-x[j]));
        x[j] = W(x[j] * sig);
    }
#endif
}
// ═══════════════════════════════════════════════════════════════════════════════
// GELU — GELU(x) = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// ═══════════════════════════════════════════════════════════════════════════════
// In-place tanh-approximation GELU (the GPT-2/BERT variant):
//   gelu(v) = 0.5 * v * (1 + tanh(sqrt(2/pi) * (v + 0.044715 * v^3)))
inline void gelu(float* x, int n) {
    constexpr float SQRT_2_OVER_PI = 0.7978845608028654f;
    constexpr float GELU_COEF = 0.044715f;
    for (int j = 0; j < n; ++j) {
        const float v = x[j];
        const float cubed = v * v * v;
        const float inner = SQRT_2_OVER_PI * (v + GELU_COEF * cubed);
        x[j] = W(0.5f * v * (1.0f + std::tanh(inner)));
    }
}
// ═══════════════════════════════════════════════════════════════════════════════
// VECTOR OPS — Add, Mul, etc.
// ═══════════════════════════════════════════════════════════════════════════════
// Element-wise addition: out[i] = a[i] + b[i] for i in [0, n).
inline void vec_add(float* out, const float* a, const float* b, int n) {
#if IX_AVX512
    int i = 0;
    // 16 floats per iteration; remainder handled by the scalar tail.
    for (; i + 16 <= n; i += 16) {
        __m512 lhs = _mm512_loadu_ps(a + i);
        __m512 rhs = _mm512_loadu_ps(b + i);
        _mm512_storeu_ps(out + i, _mm512_add_ps(lhs, rhs));
    }
    for (; i < n; ++i) out[i] = a[i] + b[i];
#else
    for (int k = 0; k < n; ++k) {
        out[k] = a[k] + b[k];
    }
#endif
}
// Element-wise (Hadamard) product: out[i] = a[i] * b[i] for i in [0, n).
inline void vec_mul(float* out, const float* a, const float* b, int n) {
#if IX_AVX512
    int i = 0;
    // 16 floats per iteration; remainder handled by the scalar tail.
    for (; i + 16 <= n; i += 16) {
        __m512 lhs = _mm512_loadu_ps(a + i);
        __m512 rhs = _mm512_loadu_ps(b + i);
        _mm512_storeu_ps(out + i, _mm512_mul_ps(lhs, rhs));
    }
    for (; i < n; ++i) out[i] = a[i] * b[i];
#else
    for (int k = 0; k < n; ++k) {
        out[k] = a[k] * b[k];
    }
#endif
}
// Copy n floats from src to dst. Ranges must not overlap.
// Fixed: the original called std::memcpy without this header including
// <cstring> (it compiled only via a transitive include). std::copy_n from
// <algorithm> — already included here — is equivalent for float arrays and
// is also well-defined as a no-op when n <= 0.
inline void vec_copy(float* dst, const float* src, int n) {
    std::copy_n(src, n, dst);
}
// Set n floats to zero.
// Fixed: the original called std::memset without this header including
// <cstring>. std::fill_n from <algorithm> — already included here — writes
// 0.0f directly (identical bytes under IEEE-754) without relying on a
// transitive include, and is a no-op for n <= 0.
inline void vec_zero(float* x, int n) {
    std::fill_n(x, n, 0.0f);
}
// ═══════════════════════════════════════════════════════════════════════════════
// AVX-512 EXP APPROXIMATION (Fast, ~1e-4 precision)
// ═══════════════════════════════════════════════════════════════════════════════
#if IX_AVX512
// Vectorized exp(x) via the identity exp(x) = 2^(x * log2(e)):
// the integer part of y = x*log2(e) is applied by constructing the float
// exponent bits directly; the fractional part by a degree-4 polynomial.
inline __m512 exp512_ps(__m512 x) {
// Clamp to avoid overflow/underflow
// (|x| <= 88 keeps exp(x) within finite float range; exp(88) ~ 1.65e38).
x = _mm512_max_ps(x, _mm512_set1_ps(-88.0f));
x = _mm512_min_ps(x, _mm512_set1_ps(88.0f));
// exp(x) = 2^(x * log2(e))
__m512 log2e = _mm512_set1_ps(1.442695040f);
__m512 y = _mm512_mul_ps(x, log2e);
// Split into integer and fractional parts
// (cvtps_epi32 rounds to nearest, so yf lies roughly in [-0.5, 0.5]).
__m512i yi = _mm512_cvtps_epi32(y);
__m512 yf = _mm512_sub_ps(y, _mm512_cvtepi32_ps(yi));
// Polynomial approximation for 2^frac
// Coefficients are ln(2)^k / k!, i.e. the Taylor series of 2^f = e^(f ln 2),
// evaluated below in Horner form.
__m512 c0 = _mm512_set1_ps(1.0f);
__m512 c1 = _mm512_set1_ps(0.693147180f);
__m512 c2 = _mm512_set1_ps(0.240226507f);
__m512 c3 = _mm512_set1_ps(0.055504109f);
__m512 c4 = _mm512_set1_ps(0.009618129f);
__m512 p = _mm512_fmadd_ps(c4, yf, c3);
p = _mm512_fmadd_ps(p, yf, c2);
p = _mm512_fmadd_ps(p, yf, c1);
p = _mm512_fmadd_ps(p, yf, c0);
// Multiply by 2^int
// 2^yi is built by placing the biased exponent (yi + 127) into bits 30..23
// of an IEEE-754 float. yi is within [-127, 127] thanks to the clamp above.
__m512i bias = _mm512_set1_epi32(127);
__m512i exp_bits = _mm512_slli_epi32(_mm512_add_epi32(yi, bias), 23);
__m512 scale = _mm512_castsi512_ps(exp_bits);
return _mm512_mul_ps(p, scale);
}
#endif
} // namespace kernel
} // namespace ix