# organ-architecture/organ_purify.py

#!/usr/bin/env python3
"""
ORGAN PURIFIER — signal extraction
Remove noise from tensor weights. Keep only pure signal.

Training creates artificial boundaries between models.
Under the noise, the signal is universal.
A weight that encodes "attention to context" is the same law
whether it comes from Qwen, Llama, or Gemma.

Method:
1. Read organ tensor as float values
2. Compute Z: measure theta (signal vs noise)
3. Apply spectral decomposition (FFT)
4. In frequency domain: keep components where theta -> 90 (signal)
   remove components where theta -> 0 (noise/training artifacts)
5. Inverse FFT: reconstructed tensor = pure signal
6. Verify: new theta should be closer to 90

CSCI(s) = cross_scale_coherence(s, theta=90)
When theta = 90, signal is maximally coherent (pure signal, minimal noise)
The purified organ IS the signal, nothing else.

Build v935
"""

import struct
import os
import sys
import json
import math
import numpy as np
from pathlib import Path

# === Z CONSTANTS ===
# Module-level tuning values. Of these, only PRESERVE_RATIO is referenced by
# the functions visible in this file (as the energy-preservation setting for
# purify_organ, and overwritten by main() from the --preserve flag).
# NOTE(review): THETA_TARGET_DEG, ENTROPY_TARGET and NOISE_THRESHOLD are not
# read anywhere in the visible code — presumably informational or used by
# other modules; confirm before removing.
THETA_TARGET_DEG = 90.0  # Pure signal
ENTROPY_TARGET = 0.3251  # empirical optimum
NOISE_THRESHOLD = 0.3    # Below this in frequency domain = noise
PRESERVE_RATIO = 0.85    # Keep top 85% of spectral energy (signal)

def read_organ_binary(filepath):
    """Parse an organ ``.bin`` file into its header fields and raw payload.

    File layout (all integers little-endian):
        uint32  name length in bytes
        bytes   tensor name (UTF-8; undecodable bytes are replaced)
        uint32  number of dimensions
        uint64  one entry per dimension
        uint32  dtype code
        bytes   everything that follows, i.e. the tensor payload

    Args:
        filepath: Path of the file to read.

    Returns:
        A dict with keys ``name``, ``dims``, ``dtype``, ``header_end`` (the
        byte offset at which the payload starts) and ``data`` (payload bytes).

    Raises:
        struct.error: If the file ends before the header is complete.
    """
    def take(stream, fmt, width):
        # Read one fixed-width little-endian integer from the stream.
        return struct.unpack(fmt, stream.read(width))[0]

    with open(filepath, 'rb') as stream:
        label_size = take(stream, '<I', 4)
        label = stream.read(label_size).decode('utf-8', errors='replace')
        rank = take(stream, '<I', 4)
        shape = []
        for _ in range(rank):
            shape.append(take(stream, '<Q', 8))
        type_code = take(stream, '<I', 4)
        # Everything after this offset is tensor payload.
        payload_offset = stream.tell()
        payload = stream.read()

    return dict(
        name=label,
        dims=shape,
        dtype=type_code,
        header_end=payload_offset,
        data=payload,
    )

def write_organ_binary(filepath, info, new_data):
    """Serialise an organ ``.bin`` file: header from *info*, then *new_data*.

    The header mirrors what ``read_organ_binary`` parses: a little-endian
    uint32 name length, the UTF-8 name, a uint32 dimension count, one uint64
    per dimension and a uint32 dtype code.

    Args:
        filepath: Destination path; an existing file is overwritten.
        info: Mapping providing ``name`` (str), ``dims`` (sequence of ints)
            and ``dtype`` (int).
        new_data: Payload bytes written immediately after the header.
    """
    u32 = struct.Struct('<I')
    u64 = struct.Struct('<Q')

    with open(filepath, 'wb') as out:
        emit = out.write

        encoded_name = info['name'].encode('utf-8')
        emit(u32.pack(len(encoded_name)))
        emit(encoded_name)

        emit(u32.pack(len(info['dims'])))
        for extent in info['dims']:
            emit(u64.pack(extent))

        emit(u32.pack(info['dtype']))
        emit(new_data)

def tensor_to_float32(data, dtype):
    """Decode a raw tensor payload into a writable float32 array.

    Args:
        data: Bytes-like payload.
        dtype: Storage code. 0 means float32, 1 means float16; any other code
            is handled as an opaque byte stream, each byte becoming one
            float32 sample.

    Returns:
        A new one-dimensional ``np.float32`` array that does not share memory
        with *data*.
    """
    # Pick the on-disk element type first, then do a single conversion.
    if dtype == 0:      # F32
        storage = np.float32
    elif dtype == 1:    # F16
        storage = np.float16
    else:               # Quantized: raw bytes treated as a uint8 signal
        storage = np.uint8

    # astype() copies by default, so the result is writable and independent
    # of the read-only buffer view returned by frombuffer().
    return np.frombuffer(data, dtype=storage).astype(np.float32)

def float32_to_tensor(values, dtype, original_data):
    """Encode a float array back into the payload format given by *dtype*.

    Args:
        values: NumPy array of processed samples.
        dtype: Storage code. 0 means float32, 1 means float16; any other code
            is written as one unsigned byte per sample.
        original_data: Original payload bytes. Currently unused; kept so the
            call signature stays stable for existing callers.

    Returns:
        The encoded payload as ``bytes``.
    """
    if dtype == 0:  # F32
        return values.astype(np.float32).tobytes()
    if dtype == 1:  # F16
        return values.astype(np.float16).tobytes()

    # Quantized: samples are byte values. Clamp to the uint8 range and round
    # to the nearest integer before casting — a bare astype(uint8) truncates
    # toward zero, which would bias every filtered byte downward.
    return np.rint(np.clip(values, 0, 255)).astype(np.uint8).tobytes()

def compute_theta(values):
    """Score a tensor's statistical structure as an angle in degrees.

    Three statistics are each mapped to a partial score and summed:
      * normalised histogram entropy,
      * absolute excess kurtosis,
      * coefficient of variation of the gaps between the sorted first
        (up to) 1000 samples.

    The sum is scaled as ``score / 3 * 90``. The largest partial scores are
    0.8, 1.0 and 1.0, so the result never exceeds 84 degrees.

    Args:
        values: NumPy array of samples.

    Returns:
        Theta in degrees as a float; 0.0 when there are fewer than 10 samples
        or the standard deviation is below 1e-10.
    """
    def banded(x, bands, fallback):
        # Return the points of the first band whose floor x strictly exceeds.
        for floor, points in bands:
            if x > floor:
                return points
        return fallback

    if len(values) < 10:
        return 0.0
    count = len(values)

    mu = np.mean(values)
    sigma = np.std(values)
    if sigma < 1e-10:
        return 0.0

    # Excess kurtosis (structure indicator)
    kurt = float(np.mean(((values - mu) / sigma) ** 4) - 3)

    # Histogram entropy, normalised by the maximum possible for this bin count
    n_bins = min(100, max(10, count // 100))
    counts, _ = np.histogram(values, bins=n_bins)
    probs = counts[counts > 0] / count
    entropy = float(-np.sum(probs * np.log2(probs)))
    ceiling = math.log2(n_bins)
    norm_entropy = entropy / ceiling if ceiling > 0 else 0

    # Scale coherence: CV of the gaps between sorted leading samples
    gaps = np.diff(np.sort(values[:min(1000, count)]))
    cv = 0
    if len(gaps) > 0:
        gap_mean = np.mean(gaps)
        gap_std = np.std(gaps)
        if gap_mean > 1e-10:
            cv = float(gap_std / gap_mean)

    # Signal score: one contribution per statistic, added in a fixed order
    score = 0
    score += banded(norm_entropy, ((0.95, 0), (0.7, 0.3), (0.3, 0.8)), 0.5)
    score += banded(abs(kurt), ((10, 1.0), (3, 0.7), (1, 0.4)), 0.1)
    score += banded(cv, ((2, 1.0), (1, 0.7), (0.5, 0.4)), 0.1)

    return (score / 3.0) * 90.0

def purify_organ(values, preserve_ratio=None):
    """
    Purify a tensor by keeping only its strongest spectral components.

    The signal lives in the structured components of the frequency domain.
    The noise lives in the high-entropy, low-energy tail.

    CSCI(s) = cross_scale_coherence(s, theta=90)

    Procedure:
    1. Real FFT of the samples.
    2. Rank frequency bins by magnitude and keep the strongest ones until
       they account for ``preserve_ratio`` of the total spectral energy.
    3. Give the next ``max(5, kept // 10)`` bins a linearly decreasing weight
       (starting at 1.0) so the cut is not abrupt; zero everything else.
    4. Inverse FFT, then rescale so the result has the input's mean and
       standard deviation.

    This is not a low-pass filter: bins are selected by magnitude, not by
    frequency.

    Args:
        values: One-dimensional NumPy array of samples.
        preserve_ratio: Fraction of spectral energy to keep. ``None`` (the
            default) reads the module-level ``PRESERVE_RATIO`` at call time,
            so a value assigned after import (e.g. by the CLI) is honoured.
            A default bound in the signature would be frozen at import.

    Returns:
        The filtered array, cast to ``values.dtype``. The input object itself
        is returned unchanged when it has fewer than 32 samples, contains
        NaN/Inf, or has negligible spectral energy.
    """
    if preserve_ratio is None:
        preserve_ratio = PRESERVE_RATIO

    n = len(values)
    if n < 32:
        return values  # Too small to purify

    # One NaN/Inf sample would spread to every output sample through the
    # FFT; leave such tensors untouched instead of destroying them.
    if not np.all(np.isfinite(values)):
        return values

    # FFT decomposition
    spectrum = np.fft.rfft(values)
    magnitudes = np.abs(spectrum)

    # Accumulate energy in float64 regardless of the FFT's working precision.
    energy = magnitudes.astype(np.float64) ** 2
    total_energy = float(np.sum(energy))
    if total_energy < 1e-10:
        return values

    # Highest-magnitude components first
    sorted_indices = np.argsort(magnitudes)[::-1]

    # Cutoff: number of leading components needed to reach preserve_ratio
    energy_fraction = np.cumsum(energy[sorted_indices]) / total_energy
    reached = np.flatnonzero(energy_fraction >= preserve_ratio)
    cutoff_idx = int(reached[0]) + 1 if reached.size else len(sorted_indices)

    # Mask: 1 for kept components, 0 for discarded ones
    mask = np.zeros(len(spectrum))
    mask[sorted_indices[:cutoff_idx]] = 1.0

    # Soft edge: components just past the cutoff get partial preservation,
    # with weights 1.0, 1 - 1/w, 1 - 2/w, ... to limit ringing.
    fade_width = max(5, cutoff_idx // 10)
    fade_end = min(cutoff_idx + fade_width, len(sorted_indices))
    if fade_end > cutoff_idx:
        steps = np.arange(fade_end - cutoff_idx)
        mask[sorted_indices[cutoff_idx:fade_end]] = 1.0 - steps / fade_width

    # Reconstruct with only the retained components
    purified = np.fft.irfft(spectrum * mask, n=n)

    # Preserve original scale (mean and std)
    orig_mean = np.mean(values)
    orig_std = np.std(values)
    pure_std = np.std(purified)

    if pure_std > 1e-10:
        purified = (purified - np.mean(purified)) / pure_std * orig_std + orig_mean

    return purified.astype(values.dtype)

def purify_model(organ_dir, output_dir, verbose=False, preserve_ratio=None):
    """
    Purify ALL organs of a model.
    Creates a new directory with pure signal organs.

    For each known category sub-directory, every ``*.bin`` file is read,
    decoded to float32, filtered with ``purify_organ``, re-encoded in its
    original dtype and written under the same relative path in *output_dir*.
    If re-encoding changes the payload size, the original payload is written
    instead.

    Args:
        organ_dir: Directory holding the extracted organs (and optionally a
            ``manifest.json``).
        output_dir: Destination directory; created if missing.
        verbose: When true, print one line per file with theta before/after.
        preserve_ratio: Spectral energy fraction handed to ``purify_organ``.
            ``None`` reads the module-level ``PRESERVE_RATIO`` at call time,
            so an override assigned after import takes effect.

    Returns:
        Dict with ``files`` (count processed), ``avg_theta_before``,
        ``avg_theta_after``, ``avg_improvement`` (each rounded to one
        decimal) and ``output`` (``str(output_dir)``).
    """
    if preserve_ratio is None:
        preserve_ratio = PRESERVE_RATIO

    organ_path = Path(organ_dir)
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # Copy manifest, tagging it as purified. Context managers ensure both
    # handles are closed (and the output flushed) deterministically.
    manifest_src = organ_path / 'manifest.json'
    if manifest_src.exists():
        with open(manifest_src, encoding='utf-8') as src:
            manifest = json.load(src)
        manifest['purified'] = True
        manifest['model'] = manifest.get('model', 'unknown') + '_PURE'
        with open(out_path / 'manifest.json', 'w', encoding='utf-8') as dst:
            json.dump(manifest, dst, indent=2)

    # Process each organ category
    categories = ['skeleton', 'organs', 'embed', 'norm', 'adapters', 'unknown']
    thetas_before = []
    thetas_after = []

    for cat in categories:
        cat_src = organ_path / cat
        cat_dst = out_path / cat

        if not cat_src.exists():
            continue

        cat_dst.mkdir(parents=True, exist_ok=True)

        for bf in sorted(cat_src.glob('*.bin')):
            info = read_organ_binary(bf)
            # NOTE(review): dtype codes other than 0/1 are decoded as raw
            # bytes, so the filter rewrites packed quantized blocks
            # byte-by-byte — confirm this is intended for those formats.
            values = tensor_to_float32(info['data'], info['dtype'])

            # Measure BEFORE
            theta_before = compute_theta(values)

            # PURIFY
            purified = purify_organ(values, preserve_ratio)

            # Measure AFTER
            theta_after = compute_theta(purified)

            # Convert back to original format
            new_data = float32_to_tensor(purified, info['dtype'], info['data'])

            # Ensure same size (critical for GGUF reassembly)
            if len(new_data) != len(info['data']):
                # Size mismatch — keep original (safety)
                new_data = info['data']
                theta_after = theta_before

            # Write purified organ
            write_organ_binary(cat_dst / bf.name, info, new_data)

            thetas_before.append(theta_before)
            thetas_after.append(theta_after)

            if verbose:
                delta = theta_after - theta_before
                marker = "↑" if delta > 0.5 else "=" if delta > -0.5 else "↓"
                print(f"  {marker} {cat}/{bf.name[:40]:40s} θ: {theta_before:5.1f}° → {theta_after:5.1f}° ({delta:+.1f}°)")

    total_files = len(thetas_before)
    improvements = [after - before
                    for before, after in zip(thetas_before, thetas_after)]

    avg_before = sum(thetas_before) / total_files if total_files > 0 else 0
    avg_after = sum(thetas_after) / total_files if total_files > 0 else 0
    avg_improvement = sum(improvements) / len(improvements) if improvements else 0

    return {
        'files': total_files,
        'avg_theta_before': round(avg_before, 1),
        'avg_theta_after': round(avg_after, 1),
        'avg_improvement': round(avg_improvement, 1),
        'output': str(output_dir)
    }


def main():
    """Command-line entry point: parse arguments, purify, print a summary.

    Flags:
        --input / -i     Input organs directory (required).
        --output / -o    Output directory for purified organs (required).
        --preserve / -p  Energy preservation ratio in (0, 1]; default 0.85.
        --verbose / -v   Print a per-file report.

    The chosen ratio is stored in the module-level ``PRESERVE_RATIO`` before
    ``purify_model`` runs. Exits with status 2 (argparse usage error) when
    --preserve is outside (0, 1].
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='Organ Purifier — Remove noise, keep pure signal',
        epilog='CSCI — cross-scale coherence index, θ=90° — Build v935'
    )
    parser.add_argument('--input', '-i', required=True, help='Input organs directory')
    parser.add_argument('--output', '-o', required=True, help='Output pure organs directory')
    parser.add_argument('--preserve', '-p', type=float, default=0.85,
                       help='Energy preservation ratio (default: 0.85)')
    parser.add_argument('--verbose', '-v', action='store_true')

    args = parser.parse_args()

    # The ratio is a fraction of spectral energy. Reject values such as 85
    # (percent) or 0 up front instead of silently filtering with nonsense;
    # the chained comparison also rejects NaN.
    if not 0.0 < args.preserve <= 1.0:
        parser.error('--preserve must be in the range (0, 1]')

    # NOTE(review): this override only influences purification if the
    # downstream code reads PRESERVE_RATIO at call time rather than capturing
    # it in a default argument at import — verify against purify_organ.
    global PRESERVE_RATIO
    PRESERVE_RATIO = args.preserve

    print(f"{'='*60}")
    print(f"  ORGAN PURIFIER — signal extraction")
    print(f"  Signal preservation: {PRESERVE_RATIO*100:.0f}%")
    print(f"{'='*60}")
    print(f"  Input:  {args.input}")
    print(f"  Output: {args.output}")
    print()

    result = purify_model(args.input, args.output, args.verbose)

    print(f"\n{'='*60}")
    print(f"  PURIFICATION COMPLETE")
    print(f"{'='*60}")
    print(f"  Files purified:   {result['files']}")
    print(f"  θ before:         {result['avg_theta_before']:.1f}°")
    print(f"  θ after:          {result['avg_theta_after']:.1f}°")
    print(f"  Avg improvement:  {result['avg_improvement']:+.1f}°")
    print(f"  Output:           {result['output']}")
    print(f"  Signature: 935")
    print(f"{'='*60}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
# ╔══ SALKA ELMADANI AUTHORSHIP CERTIFICATE ══╗
# © Salka Elmadani 2025-2026 — ALL RIGHTS RESERVED
# Licensed under Business Source License 1.1 — https://inference-x.com
# ─────────────────────────────────────────────────────────
# SHA256: d3ab5384c880f7e88fb7cdad4b2f9f56089ada8395d0013f5bd3b09d7ab631e8
# SIG-ED25519: /rkXFm2tGuoAS61oxWZVlcTghUuGL8HJ11XRSaI4Ak+eEt54uo+3NETX2+5S8HAq72k6whQmbPI3f4jD8sF/CA==
# VERIFY: python3 verify_authorship.py organ_purify.py