418 lines
14 KiB
Python
418 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
kimi_z_stream.py — Stream Z-measure for Kimi K2.5 1T
|
|
Downloads each shard, measures Z for every tensor, deletes shard.
|
|
Final output: z_report_kimi_k25.json (few KB)
|
|
"""
|
|
|
|
import struct, os, sys, json, time, math, shutil
|
|
import numpy as np
|
|
|
|
# ---------------------------------------------------------------------------
# Run configuration
# ---------------------------------------------------------------------------
REPO = "unsloth/Kimi-K2.5-GGUF"      # repo_id passed to hf_hub_download
QUANT = "Q4_0"                       # quant sub-folder / filename component
N_SHARDS = 13                        # number of shard files to stream
SHARD_DIR = "/mnt/data/kimi-k25/streaming"                      # download scratch dir
OUTPUT = "/mnt/data/organ-architecture/z_report_kimi_k25.json"  # JSON report path
LOG = "/tmp/kimi_z_stream.log"       # append-only log file written by log()

# Set in the environment before main() imports huggingface_hub.
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

# The scratch directory must exist before the first download.
os.makedirs(SHARD_DIR, exist_ok=True)
|
|
|
|
def log(msg):
    """Print *msg* with an HH:MM:SS timestamp and append it to the LOG file.

    Args:
        msg: Text to log. It is prefixed with ``[HH:MM:SS] ``.

    The line is printed to stdout (flushed immediately) and appended to the
    file named by the module-level ``LOG`` constant.
    """
    stamp = time.strftime("%H:%M:%S")
    line = f"[{stamp}] {msg}"
    print(line, flush=True)
    # Explicit UTF-8: log messages in this script contain non-ASCII characters
    # (e.g. "θ", "°", "—"); the locale default encoding could reject them.
    with open(LOG, 'a', encoding='utf-8') as f:
        f.write(line + "\n")
|
|
|
|
# ggml tensor type id -> human-readable name, used only for labelling report
# entries. Ids follow the `ggml_type` enum in ggml.h. Ids 4/5 (Q4_2/Q4_3) were
# removed upstream; 31-33 are legacy repacked Q4_0 layouts that no longer
# appear in current GGUF files but are kept here so old files get a label.
GGML_TYPES = {
    0: 'F32',
    1: 'F16',
    2: 'Q4_0',
    3: 'Q4_1',
    6: 'Q5_0',
    7: 'Q5_1',
    8: 'Q8_0',
    9: 'Q8_1',
    10: 'Q2_K',
    11: 'Q3_K',
    12: 'Q4_K',
    13: 'Q5_K',
    14: 'Q6_K',
    15: 'Q8_K',
    16: 'IQ2_XXS',
    17: 'IQ2_XS',
    18: 'IQ3_XXS',
    19: 'IQ1_S',
    20: 'IQ4_NL',
    21: 'IQ3_S',
    22: 'IQ2_S',
    23: 'IQ4_XS',
    24: 'I8',
    25: 'I16',
    26: 'I32',
    27: 'I64',
    28: 'F64',
    29: 'IQ1_M',
    30: 'BF16',
    31: 'Q4_0_4_4',
    32: 'Q4_0_4_8',
    33: 'Q4_0_8_8',
    34: 'TQ1_0',
    35: 'TQ2_0',
    39: 'MXFP4',
}
|
|
|
|
# On-disk block geometry per ggml type id:
#     type id -> (elements per block, bytes per block)
# Only types listed here have their tensor data read; any other type is
# reported with an "unknown_block_size_dtype_<id>" status by process_shard.
BLOCK_SIZES = {
    # Unquantized: one element per "block".
    0: (1, 4),        # F32
    1: (1, 2),        # F16
    # 32-element blocks.
    2: (32, 18),      # Q4_0: 2-byte scale + 16 bytes of packed 4-bit quants
    3: (32, 20),      # Q4_1
    8: (32, 34),      # Q8_0: 2-byte scale + 32 bytes of 8-bit quants
    # 256-element super-blocks (K-quants).
    12: (256, 144),   # Q4_K
    13: (256, 176),   # Q5_K
    14: (256, 210),   # Q6_K
}
|
|
|
|
def dequant_q4_0(data, n_elements):
    """Dequantize a Q4_0 byte buffer to a float32 array.

    Each Q4_0 block is 18 bytes and encodes 32 values: a little-endian
    float16 scale ``d`` followed by 16 bytes of packed 4-bit quants. Following
    ggml's reference dequantizer, byte ``j`` of a block holds element ``j`` in
    its low nibble and element ``j + 16`` in its high nibble; each value is
    ``(nibble - 8) * d``.

    Args:
        data: Bytes-like object holding at least ``(n_elements // 32) * 18``
            bytes of Q4_0 blocks.
        n_elements: Number of output elements.

    Returns:
        ``np.ndarray`` of shape ``(n_elements,)`` and dtype float32. Elements
        beyond the last complete 32-element block are left at zero.

    Raises:
        ValueError: If *data* is shorter than the blocks it must contain.
    """
    block_size = 32
    block_bytes = 18
    n_blocks = n_elements // block_size
    result = np.zeros(n_elements, dtype=np.float32)
    if n_blocks == 0:
        return result

    # View the buffer as one row of raw bytes per block.
    raw = np.frombuffer(data, dtype=np.uint8, count=n_blocks * block_bytes)
    raw = raw.reshape(n_blocks, block_bytes)

    # First two bytes of every block: fp16 scale -> column vector (n_blocks, 1).
    scales = np.ascontiguousarray(raw[:, :2]).view('<f2').astype(np.float32)

    # Remaining 16 bytes: packed nibbles. Convert to a signed type *before*
    # subtracting 8 so that values below 8 become negative instead of wrapping
    # around in uint8 arithmetic.
    packed = raw[:, 2:]
    low = (packed & 0x0F).astype(np.int8) - 8    # elements 0..15 of each block
    high = (packed >> 4).astype(np.int8) - 8     # elements 16..31 of each block

    blocks = np.concatenate([low, high], axis=1).astype(np.float32) * scales
    result[:n_blocks * block_size] = blocks.reshape(-1)
    return result
|
|
|
|
# Quantized ggml types whose per-block fp16 scale is sampled instead of
# dequantizing the tensor:
#     type id -> (elements per block, bytes per block,
#                 byte offset of the fp16 scale inside a block,
#                 maximum number of blocks sampled)
# Q4_0, Q8_0, Q4_K and Q5_K store the scale in the first two bytes of a block.
# Q6_K stores it in the LAST two bytes (after ql[128], qh[64], scales[16]).
_QUANT_SCALE_LAYOUT = {
    2: (32, 18, 0, 50000),       # Q4_0
    8: (32, 34, 0, 10000),       # Q8_0
    12: (256, 144, 0, 50000),    # Q4_K
    13: (256, 176, 0, 50000),    # Q5_K
    14: (256, 210, 208, 50000),  # Q6_K
}


def _sample_block_scales(data, block_bytes, n_sample, scale_offset=0):
    """Return the fp16 scale of the first *n_sample* blocks as a float32 array."""
    if n_sample <= 0:
        return np.zeros(0, dtype=np.float32)
    # One row of raw bytes per block; raises ValueError if data is too short.
    raw = np.frombuffer(data, dtype=np.uint8, count=n_sample * block_bytes)
    raw = raw.reshape(n_sample, block_bytes)
    scale_bytes = np.ascontiguousarray(raw[:, scale_offset:scale_offset + 2])
    return scale_bytes.view('<f2').reshape(-1).astype(np.float32)


def fast_z_measure(data, dtype, n_elements):
    """Compute the Z-angle (theta, in degrees) for one tensor.

    For F32/F16 tensors all element values are used. For the quantized types
    Q4_0, Q8_0, Q4_K, Q5_K and Q6_K only the fp16 block scales of the leading
    blocks are sampled (up to 10,000 blocks for Q8_0, 50,000 otherwise).

    ``theta = degrees(arccos(|r|))`` where ``r`` is the Pearson correlation
    between the sampled values, in storage order, and a linear ramp from -1 to
    1 of the same length. Values perfectly linear in their index give theta
    near 0; values uncorrelated with their index give theta near 90.

    Args:
        data: Bytes-like object with the tensor's raw data.
        dtype: ggml type id of the tensor.
        n_elements: Number of elements in the tensor.

    Returns:
        ``(theta, status)``. ``theta`` is a float, or ``None`` when nothing
        could be measured. ``status`` is ``"ok"``, ``"constant"`` (theta is
        0.0), ``"too_few_values"``, ``"too_few_finite"``,
        ``"unsupported_dtype_<id>"``, or the message of an exception raised
        while decoding (best-effort: decoding errors never propagate).
    """
    try:
        if dtype == 0:    # F32
            vals = np.frombuffer(data[:n_elements * 4], dtype='<f4')
        elif dtype == 1:  # F16
            vals = np.frombuffer(data[:n_elements * 2], dtype='<f2').astype(np.float32)
        elif dtype in _QUANT_SCALE_LAYOUT:
            elems_per_block, block_bytes, scale_offset, max_blocks = _QUANT_SCALE_LAYOUT[dtype]
            n_blocks = n_elements // elems_per_block
            vals = _sample_block_scales(
                data, block_bytes, min(n_blocks, max_blocks), scale_offset
            )
        else:
            return None, f"unsupported_dtype_{dtype}"

        if len(vals) < 10:
            return None, "too_few_values"

        # Drop NaN and +/-inf (zeros are kept).
        vals = vals[np.isfinite(vals)]
        if len(vals) < 10:
            return None, "too_few_finite"

        # Reference signal: a linear ramp over the sample index.
        n = len(vals)
        ref = np.linspace(-1, 1, n)

        # Centre both series.
        vals_norm = vals - np.mean(vals)
        ref_norm = ref - np.mean(ref)

        std_v = np.std(vals_norm)
        std_r = np.std(ref_norm)

        # A (near-)constant series has no defined correlation.
        if std_v < 1e-10 or std_r < 1e-10:
            return 0.0, "constant"

        corr = np.dot(vals_norm, ref_norm) / (n * std_v * std_r)
        # Clamp rounding noise so acos() stays within its domain.
        corr = max(-1.0, min(1.0, corr))

        theta = math.degrees(math.acos(abs(corr)))
        return theta, "ok"

    except Exception as e:
        # Deliberate best-effort: a tensor that cannot be decoded is reported
        # through its status string rather than aborting the whole shard.
        return None, str(e)
|
|
|
|
def read_string(f):
    """Read one length-prefixed string from the binary stream *f*.

    The stream must be positioned at a little-endian uint64 byte count, which
    is followed by that many bytes of text. The bytes are decoded as UTF-8;
    undecodable sequences are replaced rather than raising.
    """
    (length,) = struct.unpack('<Q', f.read(8))
    raw = f.read(length)
    return raw.decode('utf-8', errors='replace')
|
|
|
|
# GGUF metadata value type id -> struct format character for fixed-size
# scalars. Type 7 (BOOL) is stored as one byte and returned as an int 0/1.
# Type 8 (STRING) and type 9 (ARRAY) are variable-length and handled
# separately in read_kv_value.
_KV_SCALAR_FORMATS = {
    0: 'B',    # UINT8
    1: 'b',    # INT8
    2: 'H',    # UINT16
    3: 'h',    # INT16
    4: 'I',    # UINT32
    5: 'i',    # INT32
    6: 'f',    # FLOAT32
    7: 'B',    # BOOL
    10: 'Q',   # UINT64
    11: 'q',   # INT64
    12: 'd',   # FLOAT64
}


def read_kv_value(f, vtype):
    """Read one GGUF metadata value of type *vtype* from the stream *f*.

    Args:
        f: Binary file-like object positioned at the start of the value.
        vtype: GGUF metadata value type id (0-12).

    Returns:
        The decoded value: an int or float for scalar types, a str for
        strings, a list for arrays. ``None`` is returned for an unknown type
        id; in that case NO bytes are consumed, so the stream position is not
        meaningful afterwards.

    Raises:
        struct.error: If the stream ends before the value is complete.
    """
    code = _KV_SCALAR_FORMATS.get(vtype)
    if code is not None:
        fmt = '<' + code
        return struct.unpack(fmt, f.read(struct.calcsize(fmt)))[0]

    if vtype == 8:    # STRING
        return read_string(f)

    if vtype == 9:    # ARRAY: uint32 element type, uint64 length, elements
        atype, alen = struct.unpack('<IQ', f.read(12))
        elem_code = _KV_SCALAR_FORMATS.get(atype)
        if elem_code is not None:
            # Fixed-size elements: decode the whole array with one read.
            fmt = f'<{alen}{elem_code}'
            return list(struct.unpack(fmt, f.read(struct.calcsize(fmt))))
        # Strings, nested arrays (or unknown element types) go one by one.
        return [read_kv_value(f, atype) for _ in range(alen)]

    return None
|
|
|
|
def _tensor_record(ti, theta, status):
    """Build the report entry for one tensor-info dict *ti*."""
    dtype = ti['dtype']
    return {
        'name': ti['name'],
        'dims': ti['dims'],
        'dtype': GGML_TYPES.get(dtype, f'unk_{dtype}'),
        'n_elements': ti['n_elements'],
        'theta': round(theta, 2) if theta is not None else None,
        'status': status,
    }


def process_shard(shard_path, shard_idx):
    """Parse one GGUF shard and Z-measure every tensor in it.

    Args:
        shard_path: Path of the ``.gguf`` file to read.
        shard_idx: Index of the shard; used only in log messages.

    Returns:
        A list with one dict per tensor, each with the keys ``name``,
        ``dims``, ``dtype`` (type name), ``n_elements``, ``theta`` (rounded to
        two decimals, or ``None``) and ``status``. The list is empty when the
        file does not start with the ``GGUF`` magic.

    Raises:
        OSError: If the file cannot be opened or read.
        struct.error: If the header ends prematurely.
    """
    results = []

    # The context manager closes the file on every exit path, including the
    # bad-magic early return and parse errors.
    with open(shard_path, 'rb') as f:
        magic = f.read(4)
        if magic != b'GGUF':
            log(f" ERROR: Not GGUF (magic={magic})")
            return results

        # Header: uint32 version (read but unused), uint64 tensor count,
        # uint64 metadata key/value count.
        _version, n_tensors, n_kv = struct.unpack('<IQQ', f.read(20))

        log(f" Shard {shard_idx}: {n_tensors} tensors, {n_kv} KV pairs")

        # Walk the metadata. Values are discarded except `general.alignment`,
        # which overrides the default 32-byte alignment of the data section.
        alignment = 32
        for _ in range(n_kv):
            key = read_string(f)
            (vtype,) = struct.unpack('<I', f.read(4))
            value = read_kv_value(f, vtype)
            if key == 'general.alignment' and isinstance(value, int) and value > 0:
                alignment = value

        # Tensor directory: name, dims, type id, offset into the data section.
        tensor_infos = []
        for _ in range(n_tensors):
            name = read_string(f)
            (n_dims,) = struct.unpack('<I', f.read(4))
            dims = list(struct.unpack(f'<{n_dims}Q', f.read(8 * n_dims)))
            dtype, offset = struct.unpack('<IQ', f.read(12))
            n_elements = 1
            for d in dims:
                n_elements *= d
            tensor_infos.append({
                'name': name, 'dims': dims, 'dtype': dtype,
                'offset': offset, 'n_elements': n_elements,
            })

        # The data section starts at the next alignment boundary after the
        # tensor directory; tensor offsets are relative to it.
        pos = f.tell()
        data_start = ((pos + alignment - 1) // alignment) * alignment

        for i, ti in enumerate(tensor_infos):
            dtype = ti['dtype']
            n_elem = ti['n_elements']

            if dtype not in BLOCK_SIZES:
                # Size on disk is unknown for this type, so it cannot be read.
                results.append(
                    _tensor_record(ti, None, f'unknown_block_size_dtype_{dtype}')
                )
                continue

            elems_per_block, bytes_per_block = BLOCK_SIZES[dtype]
            n_blocks = (n_elem + elems_per_block - 1) // elems_per_block
            data_size = n_blocks * bytes_per_block

            f.seek(data_start + ti['offset'])
            data = f.read(data_size)

            if len(data) < data_size:
                results.append(_tensor_record(ti, None, 'truncated'))
                continue

            theta, status = fast_z_measure(data, dtype, n_elem)
            results.append(_tensor_record(ti, theta, status))

            if (i + 1) % 20 == 0:
                log(f" Measured {i+1}/{n_tensors} tensors")

    return results
|
|
|
|
def _tensor_group(name):
    """Map a tensor name to a coarse group label for the summary."""
    if 'attn' in name:
        return 'attention'
    if 'ffn' in name:
        # 'shexp' contains 'exp', so the shared-expert test must come first.
        if 'shexp' in name:
            return 'shared_expert'
        if 'exp' in name:
            return 'moe_experts'
        return 'ffn'
    if 'embed' in name or 'embd' in name:
        return 'embed'
    if 'norm' in name:
        return 'norm'
    if 'output' in name:
        return 'output'
    return 'other'


def _mean_std(values):
    """Return (mean, std) rounded to 2 decimals, or (None, None) if empty."""
    if not values:
        return None, None
    return round(float(np.mean(values)), 2), round(float(np.std(values)), 2)


def _build_report(tensors, shards_completed, failed_shards):
    """Assemble the JSON-serializable report dict."""
    return {
        'model': 'Kimi-K2.5-1T',
        'quant': QUANT,
        'total_shards': N_SHARDS,
        'shards_completed': shards_completed,
        'failed_shards': failed_shards,
        'total_tensors': len(tensors),
        'tensors': tensors,
    }


def _save_report(report):
    """Write *report* to OUTPUT atomically (temp file + rename)."""
    out_dir = os.path.dirname(OUTPUT)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    tmp_path = OUTPUT + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    # A crash mid-write must not corrupt the file the resume logic reads.
    os.replace(tmp_path, OUTPUT)


def main():
    """Stream every shard: download, Z-measure, save progress, delete.

    Progress is written to OUTPUT after each shard so an interrupted run can
    resume from ``shards_completed``. Shards whose download fails are skipped
    and recorded (1-based) under ``failed_shards`` in the report. After the
    last shard, per-group and overall theta statistics are logged and stored
    under ``summary``; statistics of empty groups are ``None``.
    """
    import subprocess
    from huggingface_hub import hf_hub_download

    log("=" * 60)
    log("KIMI K2.5 1T — STREAMING Z-MEASURE")
    log(f"Repo: {REPO}, Quant: {QUANT}, Shards: {N_SHARDS}")
    log("=" * 60)

    all_results = []
    failed_shards = []
    start_shard = 0
    total_start = time.time()

    # Pick up partial results from an earlier run, if any.
    if os.path.exists(OUTPUT):
        with open(OUTPUT, encoding='utf-8') as f:
            existing = json.load(f)
        if 'shards_completed' in existing:
            start_shard = existing['shards_completed']
            all_results = existing.get('tensors', [])
            failed_shards = existing.get('failed_shards', [])
            log(f"Resuming from shard {start_shard + 1}")

    # Always defined, even when the loop below processes nothing (all shards
    # already completed, or every download failed).
    report = _build_report(all_results, start_shard, failed_shards)

    for shard_idx in range(start_shard, N_SHARDS):
        shard_num = shard_idx + 1
        filename = f"{QUANT}/Kimi-K2.5-{QUANT}-{shard_num:05d}-of-{N_SHARDS:05d}.gguf"

        # Download
        log(f"\n--- SHARD {shard_num}/{N_SHARDS} ---")
        log(f"Downloading {filename}...")
        dl_start = time.time()

        try:
            path = hf_hub_download(
                repo_id=REPO,
                filename=filename,
                local_dir=SHARD_DIR,
            )
            dl_time = time.time() - dl_start
            size_gb = os.path.getsize(path) / (1024**3)
            # Guard against a zero elapsed time (e.g. file already present).
            rate = size_gb * 1024 / dl_time if dl_time > 0 else float('inf')
            log(f"Downloaded: {size_gb:.1f}GB in {dl_time:.0f}s ({rate:.0f} MB/s)")
        except Exception as e:
            # Best-effort: keep going with the remaining shards, but record
            # the gap so the report does not silently look complete.
            log(f"DOWNLOAD ERROR: {e}")
            if shard_num not in failed_shards:
                failed_shards.append(shard_num)
            continue

        # Z-measure
        log(f"Z-measuring tensors...")
        measure_start = time.time()
        shard_results = process_shard(path, shard_idx)
        measure_time = time.time() - measure_start
        log(f"Measured {len(shard_results)} tensors in {measure_time:.1f}s")

        all_results.extend(shard_results)

        # Save intermediate results
        report = _build_report(all_results, shard_idx + 1, failed_shards)
        _save_report(report)
        log(f"Saved {len(all_results)} tensor measurements to {OUTPUT}")

        # Delete shard
        os.remove(path)
        log(f"Deleted shard {shard_num}")

        # Clean any HF cache
        cache_dir = os.path.join(SHARD_DIR, '.cache')
        if os.path.exists(cache_dir):
            shutil.rmtree(cache_dir)

        # Check disk (informational only; never abort the run over it)
        try:
            r = subprocess.run(['df', '-h', '/mnt/data'], capture_output=True, text=True)
        except OSError as e:
            log(f"Disk check skipped: {e}")
        else:
            for line in r.stdout.strip().split('\n')[1:]:
                log(f"Disk: {line.strip()}")

    # Final summary
    total_time = time.time() - total_start

    thetas = [r['theta'] for r in all_results if r['theta'] is not None]

    # Group measured thetas by tensor role; a group exists as soon as one
    # tensor maps to it, even if none of its tensors could be measured.
    groups = {}
    for r in all_results:
        bucket = groups.setdefault(_tensor_group(r['name']), [])
        if r['theta'] is not None:
            bucket.append(r['theta'])

    log(f"\n{'='*60}")
    log(f"FINAL Z-REPORT — Kimi K2.5 1T")
    log(f"{'='*60}")
    log(f"Total tensors: {len(all_results)}")
    log(f"Total time: {total_time/3600:.1f}h")
    if failed_shards:
        log(f"Shards that failed to download: {failed_shards}")
    if thetas:
        log(f"Overall θ: {np.mean(thetas):.1f}° (std={np.std(thetas):.1f}°)")
    else:
        log("Overall θ: n/a (no tensors measured)")
    log(f"\nBy group:")
    for g, vals in sorted(groups.items()):
        if vals:
            log(f" {g}: θ={np.mean(vals):.1f}° (n={len(vals)}, std={np.std(vals):.1f}°)")
        else:
            log(f" {g}: θ=n/a (n=0)")

    # Save final report with aggregates
    overall_theta, overall_std = _mean_std(thetas)
    group_summary = {}
    for g, v in groups.items():
        g_theta, g_std = _mean_std(v)
        group_summary[g] = {'theta': g_theta, 'std': g_std, 'count': len(v)}

    report = _build_report(all_results, report['shards_completed'], failed_shards)
    report['summary'] = {
        'total_time_hours': round(total_time/3600, 2),
        'overall_theta': overall_theta,
        'overall_std': overall_std,
        'groups': group_summary,
    }
    _save_report(report)

    log(f"\nFinal report saved to {OUTPUT}")
    log("COMPLETE")
|
|
|
|
if __name__ == '__main__':
|
|
main()
|