341 lines
12 KiB
Python
Executable File
341 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
ORGAN PURIFIER — signal extraction
|
|
Remove noise from tensor weights. Keep only pure signal.
|
|
|
|
Training creates artificial boundaries between models.
|
|
Under the noise, the signal is universal.
|
|
A weight that encodes "attention to context" is the same law
|
|
whether it comes from Qwen, Llama, or Gemma.
|
|
|
|
Method:
|
|
1. Read organ tensor as float values
|
|
2. Compute Z: measure theta (signal vs noise)
|
|
3. Apply spectral decomposition (FFT)
|
|
4. In frequency domain: keep components where theta -> 90 (signal)
|
|
remove components where theta -> 0 (noise/training artifacts)
|
|
5. Inverse FFT: reconstructed tensor = pure signal
|
|
6. Verify: new theta should be closer to 90
|
|
|
|
CSCI(s) = cross_scale_coherence(s, theta=90)
|
|
When theta = 90, signal is maximally coherent (pure signal, minimal noise)
|
|
The purified organ IS the signal, nothing else.
|
|
|
|
Build v935
|
|
"""
|
|
|
|
import struct
|
|
import os
|
|
import sys
|
|
import json
|
|
import math
|
|
import numpy as np
|
|
from pathlib import Path
|
|
|
|
# === Z CONSTANTS ===
# NOTE(review): in the code visible in this file only PRESERVE_RATIO is read
# (by purify_organ's default and by main); the other three constants look
# informational — confirm before removing them.
THETA_TARGET_DEG = 90.0   # Pure signal: upper end of the theta scale returned by compute_theta
ENTROPY_TARGET = 0.3251   # empirical optimum
NOISE_THRESHOLD = 0.3     # Below this in frequency domain = noise
PRESERVE_RATIO = 0.85     # Keep top 85% of spectral energy (signal); overridden by --preserve
|
|
|
|
def read_organ_binary(filepath):
    """Parse an organ ``.bin`` file into its header fields and raw payload.

    The on-disk layout, all little-endian, is read in this order:
    uint32 name length, the UTF-8 name bytes, uint32 dimension count,
    one uint64 per dimension, uint32 dtype code, then the tensor bytes
    up to end of file.

    Returns a dict with keys ``name`` (str, undecodable bytes replaced),
    ``dims`` (list of int), ``dtype`` (int code), ``header_end`` (byte
    offset where the payload starts) and ``data`` (payload bytes).

    Raises ``struct.error`` when the header is truncated.
    """
    u32 = struct.Struct('<I')
    u64 = struct.Struct('<Q')

    with open(filepath, 'rb') as stream:

        def take(field):
            # Read exactly one fixed-size integer field from the stream.
            return field.unpack(stream.read(field.size))[0]

        name_len = take(u32)
        raw_name = stream.read(name_len)
        rank = take(u32)
        shape = [take(u64) for _ in range(rank)]
        dtype_code = take(u32)
        payload_offset = stream.tell()
        payload = stream.read()

    return dict(
        name=raw_name.decode('utf-8', errors='replace'),
        dims=shape,
        dtype=dtype_code,
        header_end=payload_offset,
        data=payload,
    )
|
|
|
|
def write_organ_binary(filepath, info, new_data):
    """Write an organ ``.bin`` file: header rebuilt from *info*, then *new_data*.

    The header mirrors what ``read_organ_binary`` parses (little-endian):
    uint32 name length, UTF-8 name, uint32 dimension count, one uint64 per
    dimension, uint32 dtype code. Only ``info['name']``, ``info['dims']``
    and ``info['dtype']`` are used; *new_data* is written verbatim after
    the header.
    """
    with open(filepath, 'wb') as out:
        encoded_name = info['name'].encode('utf-8')
        shape = info['dims']
        header_parts = [
            struct.pack('<I', len(encoded_name)),
            encoded_name,
            struct.pack('<I', len(shape)),
            *(struct.pack('<Q', extent) for extent in shape),
            struct.pack('<I', info['dtype']),
        ]
        out.write(b''.join(header_parts))
        out.write(new_data)
|
|
|
|
def tensor_to_float32(data, dtype):
    """Decode raw tensor bytes into a fresh, writable float32 array.

    dtype code 0 is read as float32 and code 1 as float16; any other code
    is treated as an opaque byte stream, one float32 sample per byte.
    """
    # Storage type per dtype code; everything unlisted falls back to raw bytes.
    storage = {0: np.float32, 1: np.float16}.get(dtype, np.uint8)
    # astype() copies by default, so the result never aliases `data`.
    return np.frombuffer(data, dtype=storage).astype(np.float32)
|
|
|
|
def float32_to_tensor(values, dtype, original_data):
    """Encode a float array back into the byte layout of the given dtype code.

    Args:
        values: numeric NumPy array to encode.
        dtype: storage code — 0 writes float32, 1 writes float16, any other
            code writes one unsigned byte per sample.
        original_data: unused; kept so existing callers keep working.

    Returns:
        The encoded bytes.
    """
    if dtype == 0:  # F32
        return values.astype(np.float32).tobytes()

    if dtype == 1:  # F16
        # Saturate at the largest finite float16 instead of letting
        # out-of-range samples overflow to +/-inf during the cast.
        limit = float(np.finfo(np.float16).max)
        return np.clip(values, -limit, limit).astype(np.float16).tobytes()

    # Byte-valued data: round to nearest before the integer cast. A plain
    # astype(uint8) truncates toward zero, which shifts every filtered
    # sample downward by up to one step.
    return np.rint(np.clip(values, 0, 255)).astype(np.uint8).tobytes()
|
|
|
|
def compute_theta(values):
    """Score a tensor on a 0-90 degree "theta" scale.

    Three statistics each contribute points, and the total (maximum 3.0)
    is mapped linearly onto 0..90:

    * normalized histogram entropy — mid-range entropy scores highest,
      near-maximal entropy scores nothing;
    * absolute excess kurtosis — heavier departure from a normal shape
      scores higher;
    * coefficient of variation of the gaps between the sorted first
      (up to) 1000 samples — more uneven spacing scores higher.

    Returns 0.0 for fewer than 10 samples or an (almost) constant tensor.
    """
    count = len(values)
    if count < 10:
        return 0.0

    mu = np.mean(values)
    sigma = np.std(values)
    if sigma < 1e-10:
        return 0.0

    # Excess kurtosis: fourth standardized moment minus 3.
    standardized = (values - mu) / sigma
    excess_kurtosis = float(np.mean(standardized ** 4) - 3)

    # Shannon entropy of a histogram, normalized by the maximum for its bin count.
    bin_count = min(100, max(10, count // 100))
    counts, _ = np.histogram(values, bins=bin_count)
    occupied = counts[counts > 0] / count
    shannon = float(-np.sum(occupied * np.log2(occupied)))
    ceiling = math.log2(bin_count)
    relative_entropy = shannon / ceiling if ceiling > 0 else 0

    # Spread of the gaps between neighbouring sorted samples (leading slice only).
    gaps = np.diff(np.sort(values[:min(1000, count)]))
    gap_cv = 0
    if len(gaps) > 0:
        gap_mean = np.mean(gaps)
        gap_std = np.std(gaps)
        if gap_mean > 1e-10:
            gap_cv = float(gap_std / gap_mean)

    # Each ladder is scanned top-down; the first bound exceeded wins.
    entropy_ladder = ((0.95, 0), (0.7, 0.3), (0.3, 0.8))
    kurtosis_ladder = ((10, 1.0), (3, 0.7), (1, 0.4))
    gap_ladder = ((2, 1.0), (1, 0.7), (0.5, 0.4))

    score = 0
    score += next((pts for bound, pts in entropy_ladder if relative_entropy > bound), 0.5)
    score += next((pts for bound, pts in kurtosis_ladder if abs(excess_kurtosis) > bound), 0.1)
    score += next((pts for bound, pts in gap_ladder if gap_cv > bound), 0.1)

    return (score / 3.0) * 90.0
|
|
|
|
def purify_organ(values, preserve_ratio=None):
    """
    Purify a 1-D tensor by keeping only its strongest spectral components.

    The tensor is transformed with a real FFT, the frequency bins are
    ranked by magnitude, and the strongest bins are kept until their
    cumulative energy reaches ``preserve_ratio`` of the total. A short
    linear fade over the next-strongest bins softens the cut. The result
    is transformed back and rescaled to the original mean and standard
    deviation.

    Selection is by magnitude only; the frequency of a bin plays no role,
    so this is not a low-pass filter.

    Args:
        values: 1-D NumPy array of samples.
        preserve_ratio: fraction of spectral energy to keep. ``None``
            (the default) uses the module-level ``PRESERVE_RATIO`` as it
            is *at call time*, so a value set from the command line is
            honoured.

    Returns:
        A new array with the dtype of ``values``. ``values`` itself is
        returned unchanged when it has fewer than 32 samples, contains
        NaN/inf, or has (almost) no spectral energy.

    NOTE(review): the DC bin takes part in the energy ranking even though
    the mean is restored afterwards; for a tensor whose mean is large
    compared to its spread the DC bin alone can satisfy the budget.
    Confirm that this is intended.
    """
    # A default written as `preserve_ratio=PRESERVE_RATIO` is frozen when the
    # function is defined, which made later changes to the global invisible.
    if preserve_ratio is None:
        preserve_ratio = PRESERVE_RATIO

    n = len(values)
    if n < 32:
        return values  # Too small to purify

    # One NaN/inf would spread through the FFT to every output sample.
    if not np.all(np.isfinite(values)):
        return values

    # FFT decomposition
    spectrum = np.fft.rfft(values)
    magnitudes = np.abs(spectrum)
    # Accumulate in float64 so a long running sum stays accurate even when
    # the FFT itself was computed in single precision.
    energy = magnitudes.astype(np.float64) ** 2

    total_energy = energy.sum()
    if total_energy < 1e-10:
        return values

    # Bins ordered from strongest to weakest.
    order = np.argsort(magnitudes)[::-1]

    # Smallest number of leading bins whose energy share reaches the budget;
    # keep everything if the budget is never reached.
    cumulative = np.cumsum(energy[order])
    reached = np.flatnonzero(cumulative / total_energy >= preserve_ratio)
    cutoff_idx = int(reached[0]) + 1 if reached.size else len(order)

    # Hard part of the mask: 1 for kept bins, 0 for the rest.
    mask = np.zeros(len(spectrum))
    mask[order[:cutoff_idx]] = 1.0

    # Soft part: the next `fade_width` bins ramp down linearly from 1.0
    # to avoid ringing from an abrupt cut.
    fade_width = max(5, cutoff_idx // 10)
    fading = order[cutoff_idx:cutoff_idx + fade_width]
    mask[fading] = 1.0 - np.arange(len(fading)) / fade_width

    # Reconstruct with only the kept components.
    purified = np.fft.irfft(spectrum * mask, n=n)

    # Preserve original scale (mean and std).
    orig_mean = np.mean(values)
    orig_std = np.std(values)
    pure_std = np.std(purified)
    if pure_std > 1e-10:
        purified = (purified - np.mean(purified)) / pure_std * orig_std + orig_mean

    return purified.astype(values.dtype)
|
|
|
|
def purify_model(organ_dir, output_dir, verbose=False, preserve_ratio=None):
    """
    Purify ALL organs of a model.
    Creates a new directory with pure signal organs.

    For every ``*.bin`` file in the known category sub-directories of
    ``organ_dir`` the tensor is decoded, scored with ``compute_theta``,
    filtered with ``purify_organ``, scored again, re-encoded and written
    under the same relative name in ``output_dir``. If the re-encoded
    payload does not have the original byte length, the original payload
    is written instead. ``manifest.json``, when present, is rewritten with
    ``purified: true`` and ``_PURE`` appended to the model name.

    Args:
        organ_dir: directory holding the extracted organs.
        output_dir: destination directory (created if missing).
        verbose: print one line per organ with theta before/after.
        preserve_ratio: energy fraction handed to ``purify_organ``;
            ``None`` uses the module-level ``PRESERVE_RATIO`` as it is
            when this function is called.

    Returns:
        dict with ``files``, ``avg_theta_before``, ``avg_theta_after``,
        ``avg_improvement`` (each rounded to one decimal) and ``output``.

    NOTE(review): dtype codes other than 0/1 are filtered as a raw byte
    stream (see tensor_to_float32 / float32_to_tensor). Confirm that this
    is safe for block-quantized payloads.
    """
    organ_path = Path(organ_dir)
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    # Read the global here, at call time, and pass it explicitly: relying on
    # purify_organ's default would use the value frozen at import.
    ratio = PRESERVE_RATIO if preserve_ratio is None else preserve_ratio

    # Rewrite the manifest, tagged as purified. Context managers make sure
    # both handles are closed (and the output flushed) deterministically.
    manifest_src = organ_path / 'manifest.json'
    if manifest_src.exists():
        with open(manifest_src) as src:
            manifest = json.load(src)
        manifest['purified'] = True
        manifest['model'] = manifest.get('model', 'unknown') + '_PURE'
        with open(out_path / 'manifest.json', 'w') as dst:
            json.dump(manifest, dst, indent=2)

    # Process each organ category
    categories = ['skeleton', 'organs', 'embed', 'norm', 'adapters', 'unknown']
    total_before = 0
    total_after = 0
    total_files = 0
    improvements = []

    for cat in categories:
        cat_src = organ_path / cat
        cat_dst = out_path / cat

        if not cat_src.exists():
            continue

        cat_dst.mkdir(parents=True, exist_ok=True)

        for bf in sorted(cat_src.glob('*.bin')):
            info = read_organ_binary(bf)
            values = tensor_to_float32(info['data'], info['dtype'])

            # Measure BEFORE
            theta_before = compute_theta(values)

            # PURIFY
            purified = purify_organ(values, ratio)

            # Measure AFTER
            theta_after = compute_theta(purified)

            # Convert back to original format
            new_data = float32_to_tensor(purified, info['dtype'], info['data'])

            # Ensure same size (critical for GGUF reassembly)
            if len(new_data) != len(info['data']):
                # Size mismatch — keep original (safety)
                new_data = info['data']
                theta_after = theta_before

            # Write purified organ
            write_organ_binary(cat_dst / bf.name, info, new_data)

            delta = theta_after - theta_before
            total_before += theta_before
            total_after += theta_after
            total_files += 1
            improvements.append(delta)

            if verbose:
                marker = "↑" if delta > 0.5 else "=" if delta > -0.5 else "↓"
                print(f" {marker} {cat}/{bf.name[:40]:40s} θ: {theta_before:5.1f}° → {theta_after:5.1f}° ({delta:+.1f}°)")

    avg_before = total_before / total_files if total_files > 0 else 0
    avg_after = total_after / total_files if total_files > 0 else 0
    avg_improvement = sum(improvements) / len(improvements) if improvements else 0

    return {
        'files': total_files,
        'avg_theta_before': round(avg_before, 1),
        'avg_theta_after': round(avg_after, 1),
        'avg_improvement': round(avg_improvement, 1),
        'output': str(output_dir)
    }
|
|
|
|
|
|
def main():
    """Command-line entry point: parse options, purify a model, print a summary.

    ``--input``/``-i`` and ``--output``/``-o`` are required directories;
    ``--preserve``/``-p`` is stored in the module-level ``PRESERVE_RATIO``
    before ``purify_model`` runs; ``--verbose``/``-v`` enables the
    per-organ report.
    """
    import argparse

    global PRESERVE_RATIO

    cli = argparse.ArgumentParser(
        description='Organ Purifier — Remove noise, keep pure signal',
        epilog='CSCI — cross-scale coherence index, θ=90° — Build v935',
    )
    cli.add_argument('--input', '-i', required=True, help='Input organs directory')
    cli.add_argument('--output', '-o', required=True, help='Output pure organs directory')
    cli.add_argument('--preserve', '-p', type=float, default=0.85,
                     help='Energy preservation ratio (default: 0.85)')
    cli.add_argument('--verbose', '-v', action='store_true')
    opts = cli.parse_args()

    PRESERVE_RATIO = opts.preserve

    rule = '=' * 60

    # Opening banner.
    banner = (
        rule,
        " ORGAN PURIFIER — signal extraction",
        f" Signal preservation: {PRESERVE_RATIO*100:.0f}%",
        rule,
        f" Input: {opts.input}",
        f" Output: {opts.output}",
        "",
    )
    for line in banner:
        print(line)

    summary = purify_model(opts.input, opts.output, opts.verbose)

    # Closing report.
    report = (
        f"\n{rule}",
        " PURIFICATION COMPLETE",
        rule,
        f" Files purified: {summary['files']}",
        f" θ before: {summary['avg_theta_before']:.1f}°",
        f" θ after: {summary['avg_theta_after']:.1f}°",
        f" Avg improvement: {summary['avg_improvement']:+.1f}°",
        f" Output: {summary['output']}",
        " Signature: 935",
        rule,
    )
    for line in report:
        print(line)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script; importing the module has no side effects beyond its definitions.
if __name__ == '__main__':
    main()
|
|
# ╔══ SALKA ELMADANI AUTHORSHIP CERTIFICATE ══╗
|
|
# © Salka Elmadani 2025-2026 — ALL RIGHTS RESERVED
|
|
# Licensed under Business Source License 1.1 — https://inference-x.com
|
|
# ─────────────────────────────────────────────────────────
|
|
# SHA256: d3ab5384c880f7e88fb7cdad4b2f9f56089ada8395d0013f5bd3b09d7ab631e8
|
|
# SIG-ED25519: /rkXFm2tGuoAS61oxWZVlcTghUuGL8HJ11XRSaI4Ak+eEt54uo+3NETX2+5S8HAq72k6whQmbPI3f4jD8sF/CA==
|
|
# VERIFY: python3 verify_authorship.py organ_purify.py
|