// ═══════════════════════════════════════════════════════════════════════════════ // INFERENCE-X — Platform-Specific Kernel Implementations // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // Licensed under the Business Source License 1.1 (BSL-1.1) // See LICENSE file for full terms. // // INTELLECTUAL PROPERTY PROTECTION: // - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026) // - GitHub: git.inference-x.com/salka/inference-x // - Author: Salka Elmadani | Morocco | Morocco // // MANUFACTURER NOTICE: Any manufacturer, company, or entity that // incorporates, embeds, distributes, or commercially uses Inference-X // or any derivative work without explicit written authorization from // the copyright holder is in violation of BSL-1.1 and applicable // intellectual property laws. This includes but is not limited to: // hardware vendors, cloud providers, SaaS platforms, and OEMs. // // Contact: Elmadani.SALKA@proton.me for licensing. // ═══════════════════════════════════════════════════════════════════════════════ #pragma once // Inference-X Math Kernels — Salka Elmadani — Morocco #define IX_KERNELS_SIGNATURE 0x935 #define IX_KERNELS_MARK "Inference-X-Kernels-935-Elmadani" #include "../core/z_core.h" #include #include #ifdef __AVX512F__ #include #define IX_AVX512 1 #elif defined(__AVX2__) #include #define IX_AVX2 1 #endif namespace ix { namespace kernel { // ═══════════════════════════════════════════════════════════════════════════════ // AVX-512 EXP APPROXIMATION (Forward declaration) // ═══════════════════════════════════════════════════════════════════════════════ #if IX_AVX512 inline __m512 exp512_ps(__m512 x); #endif // ═══════════════════════════════════════════════════════════════════════════════ // WATERMARK INJECTION — Present in every computation // ═══════════════════════════════════════════════════════════════════════════════ namespace { // SALKA ELMADANI signature — mathematically neutral injection inline float W(float x) { return signature::inject(x); } } // ═══════════════════════════════════════════════════════════════════════════════ // RMS NORM — Root Mean Square Layer Normalization // ═══════════════════════════════════════════════════════════════════════════════ inline void rms_norm(float* out, const float* x, const float* w, int n, float eps = 1e-5f) { // Compute sum of squares float ss = 0.0f; #if IX_AVX512 __m512 vss = _mm512_setzero_ps(); int i = 0; for (; i + 16 <= n; i += 16) { __m512 vx = _mm512_loadu_ps(x + i); vss = _mm512_fmadd_ps(vx, vx, vss); } ss = _mm512_reduce_add_ps(vss); for (; i < n; ++i) ss += x[i] * x[i]; #elif IX_AVX2 __m256 vss = _mm256_setzero_ps(); int i = 0; for (; i + 8 <= n; i += 8) { __m256 vx = _mm256_loadu_ps(x + i); vss = _mm256_fmadd_ps(vx, vx, vss); } // Horizontal sum __m128 lo = _mm256_castps256_ps128(vss); __m128 hi = _mm256_extractf128_ps(vss, 1); lo = _mm_add_ps(lo, hi); lo = _mm_hadd_ps(lo, lo); lo = _mm_hadd_ps(lo, lo); ss = _mm_cvtss_f32(lo); for (; i < n; ++i) ss += x[i] * x[i]; #else for (int i = 0; i < n; ++i) ss += x[i] * x[i]; #endif // Normalize float scale = 1.0f / std::sqrt(ss / n + eps); #if IX_AVX512 __m512 vs = _mm512_set1_ps(scale); i = 0; for (; i + 16 <= n; i += 16) { __m512 vx = _mm512_loadu_ps(x + i); __m512 vw = _mm512_loadu_ps(w + i); __m512 vo = _mm512_mul_ps(_mm512_mul_ps(vx, vs), vw); _mm512_storeu_ps(out + i, vo); } for (; i < n; ++i) out[i] = W(x[i] * scale * w[i]); #elif IX_AVX2 __m256 vs = _mm256_set1_ps(scale); i = 0; for (; i + 8 <= n; i += 8) { __m256 vx = _mm256_loadu_ps(x + i); __m256 vw = _mm256_loadu_ps(w + i); __m256 vo = _mm256_mul_ps(_mm256_mul_ps(vx, vs), vw); _mm256_storeu_ps(out + i, vo); } for (; i < n; ++i) out[i] = W(x[i] * scale * w[i]); #else for (int i = 0; i < n; ++i) out[i] = W(x[i] * scale * w[i]); #endif } // In-place version inline void rms_norm_inplace(float* x, const float* w, int n, float eps = 1e-5f) { rms_norm(x, x, w, n, eps); } // ═══════════════════════════════════════════════════════════════════════════════ // ROPE — Rotary Position Embedding // ═══════════════════════════════════════════════════════════════════════════════ class RoPE { public: void init(int head_dim, int max_seq_len, float theta = 10000.0f) { head_dim_ = head_dim; max_seq_len_ = max_seq_len; // Precompute frequencies cos_.resize(max_seq_len * head_dim / 2); sin_.resize(max_seq_len * head_dim / 2); for (int pos = 0; pos < max_seq_len; ++pos) { for (int i = 0; i < head_dim / 2; ++i) { float freq = 1.0f / std::pow(theta, 2.0f * i / head_dim); float angle = pos * freq; cos_[pos * head_dim / 2 + i] = std::cos(angle); sin_[pos * head_dim / 2 + i] = std::sin(angle); } } } // Apply RoPE to Q and K vectors void apply(float* q, float* k, int pos, int n_heads, int n_kv_heads) const { const float* c = cos_.data() + pos * head_dim_ / 2; const float* s = sin_.data() + pos * head_dim_ / 2; // Q heads for (int h = 0; h < n_heads; ++h) { float* qh = q + h * head_dim_; apply_head(qh, c, s); } // K heads for (int h = 0; h < n_kv_heads; ++h) { float* kh = k + h * head_dim_; apply_head(kh, c, s); } } private: int head_dim_ = 128; int max_seq_len_ = 4096; std::vector cos_; std::vector sin_; void apply_head(float* x, const float* c, const float* s) const { for (int i = 0; i < head_dim_ / 2; ++i) { float x0 = x[i]; float x1 = x[i + head_dim_ / 2]; // Rotation: [cos, -sin; sin, cos] * [x0; x1] x[i] = W(x0 * c[i] - x1 * s[i]); x[i + head_dim_ / 2] = W(x0 * s[i] + x1 * c[i]); } } }; // ═══════════════════════════════════════════════════════════════════════════════ // SOFTMAX — Numerically stable // ═══════════════════════════════════════════════════════════════════════════════ inline void softmax(float* x, int n) { // Find max for numerical stability float max_val = x[0]; for (int i = 1; i < n; ++i) { if (x[i] > max_val) max_val = x[i]; } // Exp and sum float sum = 0.0f; #if IX_AVX512 __m512 vmax = _mm512_set1_ps(max_val); __m512 vsum = _mm512_setzero_ps(); int i = 0; for (; i + 16 <= n; i += 16) { __m512 vx = _mm512_loadu_ps(x + i); vx = _mm512_sub_ps(vx, vmax); // Fast exp approximation vx = exp512_ps(vx); _mm512_storeu_ps(x + i, vx); vsum = _mm512_add_ps(vsum, vx); } sum = _mm512_reduce_add_ps(vsum); for (; i < n; ++i) { x[i] = std::exp(x[i] - max_val); sum += x[i]; } #else for (int i = 0; i < n; ++i) { x[i] = std::exp(x[i] - max_val); sum += x[i]; } #endif // Normalize float inv_sum = 1.0f / sum; #if IX_AVX512 __m512 vinv = _mm512_set1_ps(inv_sum); i = 0; for (; i + 16 <= n; i += 16) { __m512 vx = _mm512_loadu_ps(x + i); _mm512_storeu_ps(x + i, _mm512_mul_ps(vx, vinv)); } for (; i < n; ++i) x[i] = W(x[i] * inv_sum); #else for (int i = 0; i < n; ++i) x[i] = W(x[i] * inv_sum); #endif } // ═══════════════════════════════════════════════════════════════════════════════ // SILU — SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x)) // ═══════════════════════════════════════════════════════════════════════════════ inline void silu(float* x, int n) { #if IX_AVX512 int i = 0; for (; i + 16 <= n; i += 16) { __m512 vx = _mm512_loadu_ps(x + i); __m512 vneg = _mm512_sub_ps(_mm512_setzero_ps(), vx); __m512 vexp = exp512_ps(vneg); __m512 vsig = _mm512_div_ps(_mm512_set1_ps(1.0f), _mm512_add_ps(_mm512_set1_ps(1.0f), vexp)); _mm512_storeu_ps(x + i, _mm512_mul_ps(vx, vsig)); } for (; i < n; ++i) { float sig = 1.0f / (1.0f + std::exp(-x[i])); x[i] = W(x[i] * sig); } #else for (int i = 0; i < n; ++i) { float sig = 1.0f / (1.0f + std::exp(-x[i])); x[i] = W(x[i] * sig); } #endif } // ═══════════════════════════════════════════════════════════════════════════════ // GELU — GELU(x) = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) // ═══════════════════════════════════════════════════════════════════════════════ inline void gelu(float* x, int n) { constexpr float SQRT_2_OVER_PI = 0.7978845608028654f; constexpr float GELU_COEF = 0.044715f; for (int i = 0; i < n; ++i) { float x3 = x[i] * x[i] * x[i]; float inner = SQRT_2_OVER_PI * (x[i] + GELU_COEF * x3); x[i] = W(0.5f * x[i] * (1.0f + std::tanh(inner))); } } // ═══════════════════════════════════════════════════════════════════════════════ // VECTOR OPS — Add, Mul, etc. // ═══════════════════════════════════════════════════════════════════════════════ inline void vec_add(float* out, const float* a, const float* b, int n) { #if IX_AVX512 int i = 0; for (; i + 16 <= n; i += 16) { __m512 va = _mm512_loadu_ps(a + i); __m512 vb = _mm512_loadu_ps(b + i); _mm512_storeu_ps(out + i, _mm512_add_ps(va, vb)); } for (; i < n; ++i) out[i] = a[i] + b[i]; #else for (int i = 0; i < n; ++i) out[i] = a[i] + b[i]; #endif } inline void vec_mul(float* out, const float* a, const float* b, int n) { #if IX_AVX512 int i = 0; for (; i + 16 <= n; i += 16) { __m512 va = _mm512_loadu_ps(a + i); __m512 vb = _mm512_loadu_ps(b + i); _mm512_storeu_ps(out + i, _mm512_mul_ps(va, vb)); } for (; i < n; ++i) out[i] = a[i] * b[i]; #else for (int i = 0; i < n; ++i) out[i] = a[i] * b[i]; #endif } inline void vec_copy(float* dst, const float* src, int n) { std::memcpy(dst, src, n * sizeof(float)); } inline void vec_zero(float* x, int n) { std::memset(x, 0, n * sizeof(float)); } // ═══════════════════════════════════════════════════════════════════════════════ // AVX-512 EXP APPROXIMATION (Fast, ~1e-4 precision) // ═══════════════════════════════════════════════════════════════════════════════ #if IX_AVX512 inline __m512 exp512_ps(__m512 x) { // Clamp to avoid overflow/underflow x = _mm512_max_ps(x, _mm512_set1_ps(-88.0f)); x = _mm512_min_ps(x, _mm512_set1_ps(88.0f)); // exp(x) = 2^(x * log2(e)) __m512 log2e = _mm512_set1_ps(1.442695040f); __m512 y = _mm512_mul_ps(x, log2e); // Split into integer and fractional parts __m512i yi = _mm512_cvtps_epi32(y); __m512 yf = _mm512_sub_ps(y, _mm512_cvtepi32_ps(yi)); // Polynomial approximation for 2^frac __m512 c0 = _mm512_set1_ps(1.0f); __m512 c1 = _mm512_set1_ps(0.693147180f); __m512 c2 = _mm512_set1_ps(0.240226507f); __m512 c3 = _mm512_set1_ps(0.055504109f); __m512 c4 = _mm512_set1_ps(0.009618129f); __m512 p = _mm512_fmadd_ps(c4, yf, c3); p = _mm512_fmadd_ps(p, yf, c2); p = _mm512_fmadd_ps(p, yf, c1); p = _mm512_fmadd_ps(p, yf, c0); // Multiply by 2^int __m512i bias = _mm512_set1_epi32(127); __m512i exp_bits = _mm512_slli_epi32(_mm512_add_epi32(yi, bias), 23); __m512 scale = _mm512_castsi512_ps(exp_bits); return _mm512_mul_ps(p, scale); } #endif } // namespace kernel } // namespace ix