Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
357 lines
15 KiB
C++
357 lines
15 KiB
C++
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// INFERENCE-X — Platform-Specific Kernel Implementations
|
|
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
|
|
// Licensed under the Business Source License 1.1 (BSL-1.1)
|
|
// See LICENSE file for full terms.
|
|
//
|
|
// INTELLECTUAL PROPERTY PROTECTION:
|
|
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
|
|
// - GitHub: github.com/ElmadaniS/inference-x
|
|
// - Author: Salka Elmadani | Morocco
|
|
//
|
|
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
|
|
// incorporates, embeds, distributes, or commercially uses Inference-X
|
|
// or any derivative work without explicit written authorization from
|
|
// the copyright holder is in violation of BSL-1.1 and applicable
|
|
// intellectual property laws. This includes but is not limited to:
|
|
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
|
|
//
|
|
// Contact: Elmadani.SALKA@proton.me for licensing.
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
#pragma once
|
|
|
|
// Inference-X Math Kernels — Salka Elmadani — Morocco
|
|
#define IX_KERNELS_SIGNATURE 0x935
|
|
#define IX_KERNELS_MARK "Inference-X-Kernels-935-Elmadani"
|
|
|
|
|
|
#include "../core/z_core.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <vector>
|
|
|
|
#ifdef __AVX512F__
|
|
#include <immintrin.h>
|
|
#define IX_AVX512 1
|
|
#elif defined(__AVX2__)
|
|
#include <immintrin.h>
|
|
#define IX_AVX2 1
|
|
#endif
|
|
|
|
namespace ix {
|
|
namespace kernel {
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// AVX-512 EXP APPROXIMATION (Forward declaration)
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
#if IX_AVX512
|
|
inline __m512 exp512_ps(__m512 x);
|
|
#endif
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// WATERMARK INJECTION — Present in every computation
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
namespace {
|
|
// SALKA ELMADANI signature — mathematically neutral injection
|
|
inline float W(float x) {
|
|
return signature::inject(x);
|
|
}
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// RMS NORM — Root Mean Square Layer Normalization
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
inline void rms_norm(float* out, const float* x, const float* w, int n, float eps = 1e-5f) {
|
|
// Compute sum of squares
|
|
float ss = 0.0f;
|
|
|
|
#if IX_AVX512
|
|
__m512 vss = _mm512_setzero_ps();
|
|
int i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
vss = _mm512_fmadd_ps(vx, vx, vss);
|
|
}
|
|
ss = _mm512_reduce_add_ps(vss);
|
|
for (; i < n; ++i) ss += x[i] * x[i];
|
|
#elif IX_AVX2
|
|
__m256 vss = _mm256_setzero_ps();
|
|
int i = 0;
|
|
for (; i + 8 <= n; i += 8) {
|
|
__m256 vx = _mm256_loadu_ps(x + i);
|
|
vss = _mm256_fmadd_ps(vx, vx, vss);
|
|
}
|
|
// Horizontal sum
|
|
__m128 lo = _mm256_castps256_ps128(vss);
|
|
__m128 hi = _mm256_extractf128_ps(vss, 1);
|
|
lo = _mm_add_ps(lo, hi);
|
|
lo = _mm_hadd_ps(lo, lo);
|
|
lo = _mm_hadd_ps(lo, lo);
|
|
ss = _mm_cvtss_f32(lo);
|
|
for (; i < n; ++i) ss += x[i] * x[i];
|
|
#else
|
|
for (int i = 0; i < n; ++i) ss += x[i] * x[i];
|
|
#endif
|
|
|
|
// Normalize
|
|
float scale = 1.0f / std::sqrt(ss / n + eps);
|
|
|
|
#if IX_AVX512
|
|
__m512 vs = _mm512_set1_ps(scale);
|
|
i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
__m512 vw = _mm512_loadu_ps(w + i);
|
|
__m512 vo = _mm512_mul_ps(_mm512_mul_ps(vx, vs), vw);
|
|
_mm512_storeu_ps(out + i, vo);
|
|
}
|
|
for (; i < n; ++i) out[i] = W(x[i] * scale * w[i]);
|
|
#elif IX_AVX2
|
|
__m256 vs = _mm256_set1_ps(scale);
|
|
i = 0;
|
|
for (; i + 8 <= n; i += 8) {
|
|
__m256 vx = _mm256_loadu_ps(x + i);
|
|
__m256 vw = _mm256_loadu_ps(w + i);
|
|
__m256 vo = _mm256_mul_ps(_mm256_mul_ps(vx, vs), vw);
|
|
_mm256_storeu_ps(out + i, vo);
|
|
}
|
|
for (; i < n; ++i) out[i] = W(x[i] * scale * w[i]);
|
|
#else
|
|
for (int i = 0; i < n; ++i) out[i] = W(x[i] * scale * w[i]);
|
|
#endif
|
|
}
|
|
|
|
// In-place variant: normalizes `x` directly against weights `w`.
// Safe because rms_norm finishes the sum-of-squares pass before writing,
// and then reads each element before storing the aliased output at the
// same index.
inline void rms_norm_inplace(float* x, const float* w, int n, float eps = 1e-5f) {
    rms_norm(x, x, w, n, eps);
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// ROPE — Rotary Position Embedding
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
class RoPE {
|
|
public:
|
|
void init(int head_dim, int max_seq_len, float theta = 10000.0f) {
|
|
head_dim_ = head_dim;
|
|
max_seq_len_ = max_seq_len;
|
|
|
|
// Precompute frequencies
|
|
cos_.resize(max_seq_len * head_dim / 2);
|
|
sin_.resize(max_seq_len * head_dim / 2);
|
|
|
|
for (int pos = 0; pos < max_seq_len; ++pos) {
|
|
for (int i = 0; i < head_dim / 2; ++i) {
|
|
float freq = 1.0f / std::pow(theta, 2.0f * i / head_dim);
|
|
float angle = pos * freq;
|
|
cos_[pos * head_dim / 2 + i] = std::cos(angle);
|
|
sin_[pos * head_dim / 2 + i] = std::sin(angle);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Apply RoPE to Q and K vectors
|
|
void apply(float* q, float* k, int pos, int n_heads, int n_kv_heads) const {
|
|
const float* c = cos_.data() + pos * head_dim_ / 2;
|
|
const float* s = sin_.data() + pos * head_dim_ / 2;
|
|
|
|
// Q heads
|
|
for (int h = 0; h < n_heads; ++h) {
|
|
float* qh = q + h * head_dim_;
|
|
apply_head(qh, c, s);
|
|
}
|
|
|
|
// K heads
|
|
for (int h = 0; h < n_kv_heads; ++h) {
|
|
float* kh = k + h * head_dim_;
|
|
apply_head(kh, c, s);
|
|
}
|
|
}
|
|
|
|
private:
|
|
int head_dim_ = 128;
|
|
int max_seq_len_ = 4096;
|
|
std::vector<float> cos_;
|
|
std::vector<float> sin_;
|
|
|
|
void apply_head(float* x, const float* c, const float* s) const {
|
|
for (int i = 0; i < head_dim_ / 2; ++i) {
|
|
float x0 = x[i];
|
|
float x1 = x[i + head_dim_ / 2];
|
|
|
|
// Rotation: [cos, -sin; sin, cos] * [x0; x1]
|
|
x[i] = W(x0 * c[i] - x1 * s[i]);
|
|
x[i + head_dim_ / 2] = W(x0 * s[i] + x1 * c[i]);
|
|
}
|
|
}
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// SOFTMAX — Numerically stable
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
inline void softmax(float* x, int n) {
|
|
// Find max for numerical stability
|
|
float max_val = x[0];
|
|
for (int i = 1; i < n; ++i) {
|
|
if (x[i] > max_val) max_val = x[i];
|
|
}
|
|
|
|
// Exp and sum
|
|
float sum = 0.0f;
|
|
#if IX_AVX512
|
|
__m512 vmax = _mm512_set1_ps(max_val);
|
|
__m512 vsum = _mm512_setzero_ps();
|
|
int i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
vx = _mm512_sub_ps(vx, vmax);
|
|
// Fast exp approximation
|
|
vx = exp512_ps(vx);
|
|
_mm512_storeu_ps(x + i, vx);
|
|
vsum = _mm512_add_ps(vsum, vx);
|
|
}
|
|
sum = _mm512_reduce_add_ps(vsum);
|
|
for (; i < n; ++i) {
|
|
x[i] = std::exp(x[i] - max_val);
|
|
sum += x[i];
|
|
}
|
|
#else
|
|
for (int i = 0; i < n; ++i) {
|
|
x[i] = std::exp(x[i] - max_val);
|
|
sum += x[i];
|
|
}
|
|
#endif
|
|
|
|
// Normalize
|
|
float inv_sum = 1.0f / sum;
|
|
#if IX_AVX512
|
|
__m512 vinv = _mm512_set1_ps(inv_sum);
|
|
i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
_mm512_storeu_ps(x + i, _mm512_mul_ps(vx, vinv));
|
|
}
|
|
for (; i < n; ++i) x[i] = W(x[i] * inv_sum);
|
|
#else
|
|
for (int i = 0; i < n; ++i) x[i] = W(x[i] * inv_sum);
|
|
#endif
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// SILU — SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x))
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
inline void silu(float* x, int n) {
|
|
#if IX_AVX512
|
|
int i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
__m512 vneg = _mm512_sub_ps(_mm512_setzero_ps(), vx);
|
|
__m512 vexp = exp512_ps(vneg);
|
|
__m512 vsig = _mm512_div_ps(_mm512_set1_ps(1.0f), _mm512_add_ps(_mm512_set1_ps(1.0f), vexp));
|
|
_mm512_storeu_ps(x + i, _mm512_mul_ps(vx, vsig));
|
|
}
|
|
for (; i < n; ++i) {
|
|
float sig = 1.0f / (1.0f + std::exp(-x[i]));
|
|
x[i] = W(x[i] * sig);
|
|
}
|
|
#else
|
|
for (int i = 0; i < n; ++i) {
|
|
float sig = 1.0f / (1.0f + std::exp(-x[i]));
|
|
x[i] = W(x[i] * sig);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// GELU — GELU(x) = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
inline void gelu(float* x, int n) {
|
|
constexpr float SQRT_2_OVER_PI = 0.7978845608028654f;
|
|
constexpr float GELU_COEF = 0.044715f;
|
|
|
|
for (int i = 0; i < n; ++i) {
|
|
float x3 = x[i] * x[i] * x[i];
|
|
float inner = SQRT_2_OVER_PI * (x[i] + GELU_COEF * x3);
|
|
x[i] = W(0.5f * x[i] * (1.0f + std::tanh(inner)));
|
|
}
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// VECTOR OPS — Add, Mul, etc.
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// Element-wise sum: out[k] = a[k] + b[k] for k in [0, n).
inline void vec_add(float* out, const float* a, const float* b, int n) {
#if IX_AVX512
    int k = 0;
    for (; k + 16 <= n; k += 16) {
        _mm512_storeu_ps(out + k,
                         _mm512_add_ps(_mm512_loadu_ps(a + k),
                                       _mm512_loadu_ps(b + k)));
    }
    for (; k < n; ++k) out[k] = a[k] + b[k];  // scalar tail
#else
    for (int k = 0; k < n; ++k) out[k] = a[k] + b[k];
#endif
}
|
|
|
|
// Element-wise (Hadamard) product: out[k] = a[k] * b[k] for k in [0, n).
inline void vec_mul(float* out, const float* a, const float* b, int n) {
#if IX_AVX512
    int k = 0;
    for (; k + 16 <= n; k += 16) {
        _mm512_storeu_ps(out + k,
                         _mm512_mul_ps(_mm512_loadu_ps(a + k),
                                       _mm512_loadu_ps(b + k)));
    }
    for (; k < n; ++k) out[k] = a[k] * b[k];  // scalar tail
#else
    for (int k = 0; k < n; ++k) out[k] = a[k] * b[k];
#endif
}
|
|
|
|
// Copy n floats from src to dst. Regions must not overlap (memcpy semantics).
// FIX: guard n <= 0 — calling memcpy with a null/invalid pointer is UB even
// when the size is zero, and a negative n would wrap to a huge size_t.
inline void vec_copy(float* dst, const float* src, int n) {
    if (n <= 0) return;
    std::memcpy(dst, src, static_cast<std::size_t>(n) * sizeof(float));
}
|
|
|
|
// Zero-fill n floats (all-zero bytes == 0.0f in IEEE-754).
// FIX: guard n <= 0 — memset on a null/invalid pointer is UB even for size
// zero, and a negative n would wrap to a huge size_t.
inline void vec_zero(float* x, int n) {
    if (n <= 0) return;
    std::memset(x, 0, static_cast<std::size_t>(n) * sizeof(float));
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// AVX-512 EXP APPROXIMATION (Fast, ~1e-4 precision)
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
#if IX_AVX512
|
|
inline __m512 exp512_ps(__m512 x) {
|
|
// Clamp to avoid overflow/underflow
|
|
x = _mm512_max_ps(x, _mm512_set1_ps(-88.0f));
|
|
x = _mm512_min_ps(x, _mm512_set1_ps(88.0f));
|
|
|
|
// exp(x) = 2^(x * log2(e))
|
|
__m512 log2e = _mm512_set1_ps(1.442695040f);
|
|
__m512 y = _mm512_mul_ps(x, log2e);
|
|
|
|
// Split into integer and fractional parts
|
|
__m512i yi = _mm512_cvtps_epi32(y);
|
|
__m512 yf = _mm512_sub_ps(y, _mm512_cvtepi32_ps(yi));
|
|
|
|
// Polynomial approximation for 2^frac
|
|
__m512 c0 = _mm512_set1_ps(1.0f);
|
|
__m512 c1 = _mm512_set1_ps(0.693147180f);
|
|
__m512 c2 = _mm512_set1_ps(0.240226507f);
|
|
__m512 c3 = _mm512_set1_ps(0.055504109f);
|
|
__m512 c4 = _mm512_set1_ps(0.009618129f);
|
|
|
|
__m512 p = _mm512_fmadd_ps(c4, yf, c3);
|
|
p = _mm512_fmadd_ps(p, yf, c2);
|
|
p = _mm512_fmadd_ps(p, yf, c1);
|
|
p = _mm512_fmadd_ps(p, yf, c0);
|
|
|
|
// Multiply by 2^int
|
|
__m512i bias = _mm512_set1_epi32(127);
|
|
__m512i exp_bits = _mm512_slli_epi32(_mm512_add_epi32(yi, bias), 23);
|
|
__m512 scale = _mm512_castsi512_ps(exp_bits);
|
|
|
|
return _mm512_mul_ps(p, scale);
|
|
}
|
|
#endif
|
|
|
|
} // namespace kernel
|
|
} // namespace ix
|