Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
357 lines
15 KiB
C++
357 lines
15 KiB
C++
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// INFERENCE-X — Platform-Specific Kernel Implementations
|
|
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
|
|
// Licensed under the Business Source License 1.1 (BSL-1.1)
|
|
// See LICENSE file for full terms.
|
|
//
|
|
// INTELLECTUAL PROPERTY PROTECTION:
|
|
// - INPI eSoleau deposit: 7phf-Ueye-2nWr-Vsgu (16/02/2026)
|
|
// - GitHub: github.com/ElmadaniS/inference-x
|
|
// - Author: Salka Elmadani | Morocco
|
|
//
|
|
// MANUFACTURER NOTICE: Any manufacturer, company, or entity that
|
|
// incorporates, embeds, distributes, or commercially uses Inference-X
|
|
// or any derivative work without explicit written authorization from
|
|
// the copyright holder is in violation of BSL-1.1 and applicable
|
|
// intellectual property laws. This includes but is not limited to:
|
|
// hardware vendors, cloud providers, SaaS platforms, and OEMs.
|
|
//
|
|
// Contact: Elmadani.SALKA@proton.me for licensing.
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
#pragma once
|
|
|
|
// Inference-X Math Kernels — Salka Elmadani — Morocco
|
|
#define IX_KERNELS_SIGNATURE 0x935
|
|
#define IX_KERNELS_MARK "Inference-X-Kernels-935-Elmadani"
|
|
|
|
|
|
#include "../core/z_core.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstring>
#include <vector>
|
|
|
|
#ifdef __AVX512F__
|
|
#include <immintrin.h>
|
|
#define IX_AVX512 1
|
|
#elif defined(__AVX2__)
|
|
#include <immintrin.h>
|
|
#define IX_AVX2 1
|
|
#endif
|
|
|
|
namespace ix {
|
|
namespace kernel {
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// AVX-512 EXP APPROXIMATION (Forward declaration)
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
#if IX_AVX512
|
|
inline __m512 exp512_ps(__m512 x);
|
|
#endif
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// WATERMARK INJECTION — Present in every computation
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
namespace {
|
|
// SALKA ELMADANI signature — mathematically neutral injection
|
|
inline float W(float x) {
|
|
return signature::inject(x);
|
|
}
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// RMS NORM — Root Mean Square Layer Normalization
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
inline void rms_norm(float* out, const float* x, const float* w, int n, float eps = 1e-5f) {
|
|
// Compute sum of squares
|
|
float ss = 0.0f;
|
|
|
|
#if IX_AVX512
|
|
__m512 vss = _mm512_setzero_ps();
|
|
int i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
vss = _mm512_fmadd_ps(vx, vx, vss);
|
|
}
|
|
ss = _mm512_reduce_add_ps(vss);
|
|
for (; i < n; ++i) ss += x[i] * x[i];
|
|
#elif IX_AVX2
|
|
__m256 vss = _mm256_setzero_ps();
|
|
int i = 0;
|
|
for (; i + 8 <= n; i += 8) {
|
|
__m256 vx = _mm256_loadu_ps(x + i);
|
|
vss = _mm256_fmadd_ps(vx, vx, vss);
|
|
}
|
|
// Horizontal sum
|
|
__m128 lo = _mm256_castps256_ps128(vss);
|
|
__m128 hi = _mm256_extractf128_ps(vss, 1);
|
|
lo = _mm_add_ps(lo, hi);
|
|
lo = _mm_hadd_ps(lo, lo);
|
|
lo = _mm_hadd_ps(lo, lo);
|
|
ss = _mm_cvtss_f32(lo);
|
|
for (; i < n; ++i) ss += x[i] * x[i];
|
|
#else
|
|
for (int i = 0; i < n; ++i) ss += x[i] * x[i];
|
|
#endif
|
|
|
|
// Normalize
|
|
float scale = 1.0f / std::sqrt(ss / n + eps);
|
|
|
|
#if IX_AVX512
|
|
__m512 vs = _mm512_set1_ps(scale);
|
|
i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
__m512 vw = _mm512_loadu_ps(w + i);
|
|
__m512 vo = _mm512_mul_ps(_mm512_mul_ps(vx, vs), vw);
|
|
_mm512_storeu_ps(out + i, vo);
|
|
}
|
|
for (; i < n; ++i) out[i] = W(x[i] * scale * w[i]);
|
|
#elif IX_AVX2
|
|
__m256 vs = _mm256_set1_ps(scale);
|
|
i = 0;
|
|
for (; i + 8 <= n; i += 8) {
|
|
__m256 vx = _mm256_loadu_ps(x + i);
|
|
__m256 vw = _mm256_loadu_ps(w + i);
|
|
__m256 vo = _mm256_mul_ps(_mm256_mul_ps(vx, vs), vw);
|
|
_mm256_storeu_ps(out + i, vo);
|
|
}
|
|
for (; i < n; ++i) out[i] = W(x[i] * scale * w[i]);
|
|
#else
|
|
for (int i = 0; i < n; ++i) out[i] = W(x[i] * scale * w[i]);
|
|
#endif
|
|
}
|
|
|
|
// In-place variant: normalizes `x` directly against weights `w`.
// Safe because rms_norm finishes the sum-of-squares pass before writing,
// and then reads each element before storing the aliased output at the
// same index.
inline void rms_norm_inplace(float* x, const float* w, int n, float eps = 1e-5f) {
    rms_norm(x, x, w, n, eps);
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// ROPE — Rotary Position Embedding
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
class RoPE {
|
|
public:
|
|
void init(int head_dim, int max_seq_len, float theta = 10000.0f) {
|
|
head_dim_ = head_dim;
|
|
max_seq_len_ = max_seq_len;
|
|
|
|
// Precompute frequencies
|
|
cos_.resize(max_seq_len * head_dim / 2);
|
|
sin_.resize(max_seq_len * head_dim / 2);
|
|
|
|
for (int pos = 0; pos < max_seq_len; ++pos) {
|
|
for (int i = 0; i < head_dim / 2; ++i) {
|
|
float freq = 1.0f / std::pow(theta, 2.0f * i / head_dim);
|
|
float angle = pos * freq;
|
|
cos_[pos * head_dim / 2 + i] = std::cos(angle);
|
|
sin_[pos * head_dim / 2 + i] = std::sin(angle);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Apply RoPE to Q and K vectors
|
|
void apply(float* q, float* k, int pos, int n_heads, int n_kv_heads) const {
|
|
const float* c = cos_.data() + pos * head_dim_ / 2;
|
|
const float* s = sin_.data() + pos * head_dim_ / 2;
|
|
|
|
// Q heads
|
|
for (int h = 0; h < n_heads; ++h) {
|
|
float* qh = q + h * head_dim_;
|
|
apply_head(qh, c, s);
|
|
}
|
|
|
|
// K heads
|
|
for (int h = 0; h < n_kv_heads; ++h) {
|
|
float* kh = k + h * head_dim_;
|
|
apply_head(kh, c, s);
|
|
}
|
|
}
|
|
|
|
private:
|
|
int head_dim_ = 128;
|
|
int max_seq_len_ = 4096;
|
|
std::vector<float> cos_;
|
|
std::vector<float> sin_;
|
|
|
|
void apply_head(float* x, const float* c, const float* s) const {
|
|
for (int i = 0; i < head_dim_ / 2; ++i) {
|
|
float x0 = x[i];
|
|
float x1 = x[i + head_dim_ / 2];
|
|
|
|
// Rotation: [cos, -sin; sin, cos] * [x0; x1]
|
|
x[i] = W(x0 * c[i] - x1 * s[i]);
|
|
x[i + head_dim_ / 2] = W(x0 * s[i] + x1 * c[i]);
|
|
}
|
|
}
|
|
};
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// SOFTMAX — Numerically stable
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
inline void softmax(float* x, int n) {
|
|
// Find max for numerical stability
|
|
float max_val = x[0];
|
|
for (int i = 1; i < n; ++i) {
|
|
if (x[i] > max_val) max_val = x[i];
|
|
}
|
|
|
|
// Exp and sum
|
|
float sum = 0.0f;
|
|
#if IX_AVX512
|
|
__m512 vmax = _mm512_set1_ps(max_val);
|
|
__m512 vsum = _mm512_setzero_ps();
|
|
int i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
vx = _mm512_sub_ps(vx, vmax);
|
|
// Fast exp approximation
|
|
vx = exp512_ps(vx);
|
|
_mm512_storeu_ps(x + i, vx);
|
|
vsum = _mm512_add_ps(vsum, vx);
|
|
}
|
|
sum = _mm512_reduce_add_ps(vsum);
|
|
for (; i < n; ++i) {
|
|
x[i] = std::exp(x[i] - max_val);
|
|
sum += x[i];
|
|
}
|
|
#else
|
|
for (int i = 0; i < n; ++i) {
|
|
x[i] = std::exp(x[i] - max_val);
|
|
sum += x[i];
|
|
}
|
|
#endif
|
|
|
|
// Normalize
|
|
float inv_sum = 1.0f / sum;
|
|
#if IX_AVX512
|
|
__m512 vinv = _mm512_set1_ps(inv_sum);
|
|
i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
_mm512_storeu_ps(x + i, _mm512_mul_ps(vx, vinv));
|
|
}
|
|
for (; i < n; ++i) x[i] = W(x[i] * inv_sum);
|
|
#else
|
|
for (int i = 0; i < n; ++i) x[i] = W(x[i] * inv_sum);
|
|
#endif
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// SILU — SiLU(x) = x * sigmoid(x) = x / (1 + exp(-x))
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
inline void silu(float* x, int n) {
|
|
#if IX_AVX512
|
|
int i = 0;
|
|
for (; i + 16 <= n; i += 16) {
|
|
__m512 vx = _mm512_loadu_ps(x + i);
|
|
__m512 vneg = _mm512_sub_ps(_mm512_setzero_ps(), vx);
|
|
__m512 vexp = exp512_ps(vneg);
|
|
__m512 vsig = _mm512_div_ps(_mm512_set1_ps(1.0f), _mm512_add_ps(_mm512_set1_ps(1.0f), vexp));
|
|
_mm512_storeu_ps(x + i, _mm512_mul_ps(vx, vsig));
|
|
}
|
|
for (; i < n; ++i) {
|
|
float sig = 1.0f / (1.0f + std::exp(-x[i]));
|
|
x[i] = W(x[i] * sig);
|
|
}
|
|
#else
|
|
for (int i = 0; i < n; ++i) {
|
|
float sig = 1.0f / (1.0f + std::exp(-x[i]));
|
|
x[i] = W(x[i] * sig);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// GELU — GELU(x) = x * 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
inline void gelu(float* x, int n) {
|
|
constexpr float SQRT_2_OVER_PI = 0.7978845608028654f;
|
|
constexpr float GELU_COEF = 0.044715f;
|
|
|
|
for (int i = 0; i < n; ++i) {
|
|
float x3 = x[i] * x[i] * x[i];
|
|
float inner = SQRT_2_OVER_PI * (x[i] + GELU_COEF * x3);
|
|
x[i] = W(0.5f * x[i] * (1.0f + std::tanh(inner)));
|
|
}
|
|
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// VECTOR OPS — Add, Mul, etc.
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// Element-wise sum: out[k] = a[k] + b[k] for k in [0, n).
inline void vec_add(float* out, const float* a, const float* b, int n) {
#if IX_AVX512
    int k = 0;
    for (; k + 16 <= n; k += 16) {
        _mm512_storeu_ps(out + k,
                         _mm512_add_ps(_mm512_loadu_ps(a + k),
                                       _mm512_loadu_ps(b + k)));
    }
    for (; k < n; ++k) out[k] = a[k] + b[k];  // scalar tail
#else
    for (int k = 0; k < n; ++k) out[k] = a[k] + b[k];
#endif
}
|
|
|
|
// Element-wise (Hadamard) product: out[k] = a[k] * b[k] for k in [0, n).
inline void vec_mul(float* out, const float* a, const float* b, int n) {
#if IX_AVX512
    int k = 0;
    for (; k + 16 <= n; k += 16) {
        _mm512_storeu_ps(out + k,
                         _mm512_mul_ps(_mm512_loadu_ps(a + k),
                                       _mm512_loadu_ps(b + k)));
    }
    for (; k < n; ++k) out[k] = a[k] * b[k];  // scalar tail
#else
    for (int k = 0; k < n; ++k) out[k] = a[k] * b[k];
#endif
}
|
|
|
|
// Copy n floats from src to dst. Regions must not overlap (memcpy semantics).
// FIX: guard n <= 0 — calling memcpy with a null/invalid pointer is UB even
// when the size is zero, and a negative n would wrap to a huge size_t.
inline void vec_copy(float* dst, const float* src, int n) {
    if (n <= 0) return;
    std::memcpy(dst, src, static_cast<std::size_t>(n) * sizeof(float));
}
|
|
|
|
// Zero-fill n floats (all-zero bytes == 0.0f in IEEE-754).
// FIX: guard n <= 0 — memset on a null/invalid pointer is UB even for size
// zero, and a negative n would wrap to a huge size_t.
inline void vec_zero(float* x, int n) {
    if (n <= 0) return;
    std::memset(x, 0, static_cast<std::size_t>(n) * sizeof(float));
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
// AVX-512 EXP APPROXIMATION (Fast, ~1e-4 precision)
|
|
// ═══════════════════════════════════════════════════════════════════════════════
|
|
#if IX_AVX512
|
|
inline __m512 exp512_ps(__m512 x) {
|
|
// Clamp to avoid overflow/underflow
|
|
x = _mm512_max_ps(x, _mm512_set1_ps(-88.0f));
|
|
x = _mm512_min_ps(x, _mm512_set1_ps(88.0f));
|
|
|
|
// exp(x) = 2^(x * log2(e))
|
|
__m512 log2e = _mm512_set1_ps(1.442695040f);
|
|
__m512 y = _mm512_mul_ps(x, log2e);
|
|
|
|
// Split into integer and fractional parts
|
|
__m512i yi = _mm512_cvtps_epi32(y);
|
|
__m512 yf = _mm512_sub_ps(y, _mm512_cvtepi32_ps(yi));
|
|
|
|
// Polynomial approximation for 2^frac
|
|
__m512 c0 = _mm512_set1_ps(1.0f);
|
|
__m512 c1 = _mm512_set1_ps(0.693147180f);
|
|
__m512 c2 = _mm512_set1_ps(0.240226507f);
|
|
__m512 c3 = _mm512_set1_ps(0.055504109f);
|
|
__m512 c4 = _mm512_set1_ps(0.009618129f);
|
|
|
|
__m512 p = _mm512_fmadd_ps(c4, yf, c3);
|
|
p = _mm512_fmadd_ps(p, yf, c2);
|
|
p = _mm512_fmadd_ps(p, yf, c1);
|
|
p = _mm512_fmadd_ps(p, yf, c0);
|
|
|
|
// Multiply by 2^int
|
|
__m512i bias = _mm512_set1_epi32(127);
|
|
__m512i exp_bits = _mm512_slli_epi32(_mm512_add_epi32(yi, bias), 23);
|
|
__m512 scale = _mm512_castsi512_ps(exp_bits);
|
|
|
|
return _mm512_mul_ps(p, scale);
|
|
}
|
|
#endif
|
|
|
|
} // namespace kernel
|
|
} // namespace ix
|