#!/usr/bin/env python3
"""
kimi_z_stream.py — Streaming quality measure for large models

Downloads each shard, measures Z for every tensor, deletes shard.
Final output: z_report_kimi_k25.json (few KB)
"""
import json
import math
import os
import shutil
import struct
import sys
import time

import numpy as np

# Config
REPO = "unsloth/Kimi-K2.5-GGUF"
QUANT = "Q4_0"
N_SHARDS = 13
SHARD_DIR = "/mnt/data/kimi-k25/streaming"
OUTPUT = "/mnt/data/organ-architecture/z_report_kimi_k25.json"
LOG = "/tmp/kimi_z_stream.log"

try:
    os.makedirs(SHARD_DIR, exist_ok=True)
except OSError as exc:
    # Keep the pure measurement helpers importable on hosts that lack the
    # data volume; any later step that needs SHARD_DIR will still fail.
    print(f"warning: cannot create {SHARD_DIR}: {exc}", file=sys.stderr)
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'


def log(msg):
    """Print *msg* with an HH:MM:SS prefix and append the same line to LOG."""
    ts = time.strftime("%H:%M:%S")
    line = f"[{ts}] {msg}"
    print(line, flush=True)
    with open(LOG, 'a', encoding="utf-8") as f:
        f.write(line + "\n")


# GGUF type info for dequantization (ids follow ggml's `enum ggml_type`;
# ids 4 and 5 were removed upstream).
GGML_TYPES = {0: 'F32', 1: 'F16', 2: 'Q4_0', 3: 'Q4_1', 6: 'Q5_0', 7: 'Q5_1',
              8: 'Q8_0', 9: 'Q8_1',
              10: 'Q2_K', 11: 'Q3_K', 12: 'Q4_K', 13: 'Q5_K', 14: 'Q6_K',
              15: 'Q8_K',
              16: 'IQ2_XXS', 17: 'IQ2_XS', 18: 'IQ3_XXS', 19: 'IQ1_S',
              20: 'IQ4_NL', 21: 'IQ3_S', 22: 'IQ2_S', 23: 'IQ4_XS',
              24: 'I8', 25: 'I16', 26: 'I32', 27: 'I64', 28: 'F64',
              29: 'IQ1_M', 30: 'BF16',
              31: 'Q4_0_4_4', 32: 'Q4_0_4_8', 33: 'Q4_0_8_8',
              34: 'TQ1_0', 35: 'TQ2_0'}

# Block sizes for each quant type: (elements per block, bytes per block)
BLOCK_SIZES = {
    0: (1, 4),       # F32: 1 element per block, 4 bytes
    1: (1, 2),       # F16: 1 element per block, 2 bytes
    2: (32, 18),     # Q4_0: 32 elements, 18 bytes (2 byte scale + 16 byte quants)
    3: (32, 20),     # Q4_1: 32 elements, 20 bytes
    8: (32, 34),     # Q8_0: 32 elements, 34 bytes (2 byte scale + 32 byte quants)
    12: (256, 144),  # Q4_K: 256 elements, 144 bytes
    13: (256, 176),  # Q5_K: 256 elements, 176 bytes
    14: (256, 210),  # Q6_K: 256 elements, 210 bytes
}

# For the block-quantized types fast_z_measure samples:
# (byte offset of the fp16 scale `d` inside a block, max blocks to sample).
_SCALE_LAYOUT = {
    2: (0, 50000),     # Q4_0
    8: (0, 10000),     # Q8_0
    12: (0, 50000),    # Q4_K
    13: (0, 50000),    # Q5_K
    14: (208, 50000),  # Q6_K: d is the LAST field (after ql[128], qh[64], scales[16])
}

# GGUF payloads are little-endian; be explicit instead of relying on the host.
_F16_LE = np.dtype('<f2')


def dequant_q4_0(data, n_elements):
    """Dequantize Q4_0 block format to float32.

    Each 18-byte block holds a little-endian fp16 scale followed by 16 bytes
    of packed 4-bit codes.  Following ggml's layout, the low nibble of byte
    ``j`` is element ``j`` of the block and the high nibble is element
    ``j + 16``; each code is offset by -8 and multiplied by the scale.

    Args:
        data: Buffer (bytes-like) holding at least ``n_elements // 32`` blocks.
        n_elements: Number of output values.

    Returns:
        float32 array of length ``n_elements``.  Elements past the last whole
        block (when ``n_elements`` is not a multiple of 32) are left at 0.

    Raises:
        ValueError: If ``data`` is too short for the requested blocks.
    """
    block_size = 32
    block_bytes = 18
    n_blocks = n_elements // block_size
    result = np.zeros(n_elements, dtype=np.float32)
    if n_blocks <= 0:
        return result

    # frombuffer raises ValueError when the buffer is shorter than `count`.
    raw = np.frombuffer(data, dtype=np.uint8, count=n_blocks * block_bytes)
    blocks = raw.reshape(n_blocks, block_bytes)

    scales = (np.ascontiguousarray(blocks[:, :2])
              .view(_F16_LE)
              .astype(np.float32))            # shape (n_blocks, 1)

    # Widen to a signed type BEFORE subtracting 8: uint8 arithmetic would wrap
    # the negative codes around to 248..255.
    codes = blocks[:, 2:].astype(np.int16)
    nibbles = np.concatenate((codes & 0x0F, codes >> 4), axis=1) - 8

    result[:n_blocks * block_size] = (nibbles.astype(np.float32) * scales).ravel()
    return result


def _sample_block_scales(data, dtype, n_elements):
    """Return the fp16 block scales of the first sampled blocks as float32."""
    elems_per_block, block_bytes = BLOCK_SIZES[dtype]
    scale_offset, max_blocks = _SCALE_LAYOUT[dtype]
    n_sample = min(n_elements // elems_per_block, max_blocks)
    if n_sample <= 0:
        return np.zeros(0, dtype=np.float32)
    # One strided read instead of a Python loop over every block.
    raw = np.frombuffer(data, dtype=np.uint8, count=n_sample * block_bytes)
    blocks = raw.reshape(n_sample, block_bytes)
    scale_bytes = np.ascontiguousarray(blocks[:, scale_offset:scale_offset + 2])
    return scale_bytes.view(_F16_LE).ravel().astype(np.float32)


def fast_z_measure(data, dtype, n_elements):
    """
    Compute Z-angle (theta) for a tensor.

    Uses statistical properties of the raw quantized data: for F32/F16 the
    values themselves, for Q4_0 / Q8_0 / Q4_K / Q5_K / Q6_K the per-block
    fp16 scales of the leading blocks (up to 10K blocks for Q8_0, 50K for
    the others).

    theta = arccos(|Pearson correlation with a linear ramp on [-1, 1]|),
    in degrees.  For pure signal: theta -> 90 degrees.

    Args:
        data: Raw tensor bytes.
        dtype: GGML type id (see GGML_TYPES).
        n_elements: Number of elements in the tensor.

    Returns:
        ``(theta, status)``.  ``theta`` is a float in degrees with status
        ``"ok"``, ``0.0`` with status ``"constant"``, or ``None`` with a
        status string explaining why no measurement was made.
    """
    try:
        if dtype == 0:  # F32
            vals = np.frombuffer(data[:n_elements * 4], dtype=np.float32)
        elif dtype == 1:  # F16
            vals = np.frombuffer(data[:n_elements * 2], dtype=np.float16).astype(np.float32)
        elif dtype in _SCALE_LAYOUT:  # block-quantized: sample block scales
            vals = _sample_block_scales(data, dtype, n_elements)
        else:
            return None, f"unsupported_dtype_{dtype}"

        if len(vals) < 10:
            return None, "too_few_values"

        # Drop NaN and infinities (zeros are kept)
        vals = vals[np.isfinite(vals)]
        if len(vals) < 10:
            return None, "too_few_finite"

        # theta = arccos(|correlation with linear reference|)
        # Pure signal -> decorrelated -> theta near 90
        # Noise/bias -> correlated with something simple -> theta near 0
        n = len(vals)
        ref = np.linspace(-1, 1, n)

        # Normalize
        vals_norm = vals - np.mean(vals)
        ref_norm = ref - np.mean(ref)

        std_v = np.std(vals_norm)
        std_r = np.std(ref_norm)

        if std_v < 1e-10 or std_r < 1e-10:
            return 0.0, "constant"

        corr = np.dot(vals_norm, ref_norm) / (n * std_v * std_r)
        # Rounding can push |corr| marginally past 1, which acos rejects.
        corr = max(-1.0, min(1.0, corr))
        theta = math.degrees(math.acos(abs(corr)))
        return theta, "ok"
    except Exception as e:
        # Deliberate best-effort: one unreadable tensor must not abort the
        # whole streaming run, so the failure is reported via the status.
        return None, str(e)