organ-architecture/assemble_935.py

151 lines
5.1 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Model 935 Assembler — Fixed organ header handling.
Reads source GGUF, replaces tensor DATA (skipping organ bin headers).
Z = dI/d(log s) · exp(iθ) — Signature 935
"""
import struct, sys, os, json
def read_organ_data_only(filepath):
    """Return the tensor payload of an organ ``.bin`` file, without its header.

    The header is skipped in the order the fields are laid out on disk:
    a little-endian uint32 name length, the name bytes, a little-endian
    uint32 dimension count, one 8-byte entry per dimension and a 4-byte
    dtype field. Everything after that is returned verbatim.

    Args:
        filepath: Path of the organ file to read.

    Returns:
        The bytes following the header (``b""`` when the file ends right
        after a complete header).

    Raises:
        struct.error: If the file ends before the header is complete.
        OSError: If the file cannot be opened or read.
    """
    with open(filepath, "rb") as f:
        file_size = f.seek(0, os.SEEK_END)
        f.seek(0)

        # Seek over the skipped fields instead of reading them: a corrupt
        # length then cannot trigger a huge allocation.
        (name_len,) = struct.unpack("<I", f.read(4))
        f.seek(name_len, os.SEEK_CUR)
        (ndims,) = struct.unpack("<I", f.read(4))
        f.seek(8 * ndims + 4, os.SEEK_CUR)  # dims, then the dtype field

        # Seeking past EOF does not fail by itself, so check explicitly;
        # otherwise a truncated header would look like an empty payload.
        if f.tell() > file_size:
            raise struct.error(
                f"organ header of {filepath!r} is truncated "
                f"(needs {f.tell()} bytes, file has {file_size})"
            )
        return f.read()
_GGUF_MAGIC = 0x46554747  # the bytes b"GGUF" read as a little-endian uint32
_GGUF_MIN_VERSION = 2  # v1 stored counts and lengths as uint32, not uint64
_GGUF_DEFAULT_ALIGNMENT = 32  # used when "general.alignment" is absent
_GGUF_TYPE_UINT32 = 4
_GGUF_TYPE_STRING = 8
_GGUF_TYPE_ARRAY = 9
# Byte widths of the fixed-size metadata value types (type id -> size).
_GGUF_FIXED_SIZES = {
    0: 1, 1: 1, 2: 2, 3: 2, 4: 4, 5: 4, 6: 4, 7: 1, 10: 8, 11: 8, 12: 8,
}
# Sub-directories of the organs dir that are scanned for ``*.bin`` files.
_ORGAN_CATEGORIES = ("skeleton", "organs", "embed", "norm", "adapters", "unknown")
_COPY_CHUNK = 16 * 1024 * 1024


def _read_exact(f, size):
    """Read exactly *size* bytes from *f*; raise ValueError on a short read."""
    data = f.read(size)
    if len(data) != size:
        raise ValueError(
            f"unexpected end of file (wanted {size} bytes, got {len(data)})"
        )
    return data


def _read_u32(f):
    """Read one little-endian uint32 from *f*."""
    return struct.unpack("<I", _read_exact(f, 4))[0]


def _read_u64(f):
    """Read one little-endian uint64 from *f*."""
    return struct.unpack("<Q", _read_exact(f, 8))[0]


def _read_string(f):
    """Read a GGUF string (uint64 length + UTF-8 bytes) from *f*."""
    return _read_exact(f, _read_u64(f)).decode("utf-8")


def _skip_value(f, vtype):
    """Advance *f* past one metadata value of type *vtype* without decoding it."""
    size = _GGUF_FIXED_SIZES.get(vtype)
    if size is not None:
        f.seek(size, os.SEEK_CUR)
    elif vtype == _GGUF_TYPE_STRING:
        # Skipped strings are never decoded, so non-UTF-8 bytes are harmless.
        f.seek(_read_u64(f), os.SEEK_CUR)
    elif vtype == _GGUF_TYPE_ARRAY:
        elem_type = _read_u32(f)
        count = _read_u64(f)
        elem_size = _GGUF_FIXED_SIZES.get(elem_type)
        if elem_size is not None:
            # Fixed-width elements: skip the whole array with one seek.
            f.seek(elem_size * count, os.SEEK_CUR)
        else:
            for _ in range(count):
                _skip_value(f, elem_type)
    else:
        # Guessing a width here would desynchronise the rest of the parse.
        raise ValueError(f"unknown metadata value type {vtype}")


def _parse_gguf_header(f):
    """Parse the GGUF header at the start of *f* and locate the data section.

    Returns a dict with keys ``version``, ``n_tensors``, ``n_metadata``,
    ``alignment``, ``tensors`` (list of name/dims/dtype/offset dicts in file
    order) and ``data_start`` (absolute offset of the tensor data section).
    Raises ValueError when the header is malformed or truncated.
    """
    magic = _read_u32(f)
    if magic != _GGUF_MAGIC:
        raise ValueError(f"not a GGUF file (magic=0x{magic:08x})")
    version = _read_u32(f)
    if version < _GGUF_MIN_VERSION:
        raise ValueError(f"unsupported GGUF version {version}")
    n_tensors = _read_u64(f)
    n_metadata = _read_u64(f)

    alignment = _GGUF_DEFAULT_ALIGNMENT
    for _ in range(n_metadata):
        key = _read_string(f)
        vtype = _read_u32(f)
        if key == "general.alignment" and vtype == _GGUF_TYPE_UINT32:
            alignment = _read_u32(f)
            if alignment == 0:
                raise ValueError("general.alignment is 0")
        else:
            _skip_value(f, vtype)

    tensors = []
    for _ in range(n_tensors):
        name = _read_string(f)
        n_dims = _read_u32(f)
        dims = [_read_u64(f) for _ in range(n_dims)]
        dtype = _read_u32(f)
        offset = _read_u64(f)
        tensors.append(
            {"name": name, "dims": dims, "dtype": dtype, "offset": offset}
        )

    # The data section starts at the next multiple of the alignment.
    pos = f.tell()
    data_start = pos + (-pos) % alignment
    return {
        "version": version,
        "n_tensors": n_tensors,
        "n_metadata": n_metadata,
        "alignment": alignment,
        "tensors": tensors,
        "data_start": data_start,
    }


def _tensor_sizes(tensors, data_len):
    """Return each tensor's byte span, derived from consecutive offsets.

    The last tensor extends to *data_len* (the length of the data section).
    Raises ValueError if a span is negative, i.e. the offsets are not in
    ascending order or point past the end of the file.
    """
    sizes = []
    for i, ti in enumerate(tensors):
        end = tensors[i + 1]["offset"] if i + 1 < len(tensors) else data_len
        size = end - ti["offset"]
        if size < 0:
            raise ValueError(
                f"tensor {ti['name']!r} has an invalid offset {ti['offset']}"
            )
        sizes.append(size)
    return sizes


def _copy_range(src, dst, start, size):
    """Copy *size* bytes of *src* beginning at *start* into *dst*, in chunks."""
    src.seek(start)
    remaining = size
    while remaining > 0:
        chunk = src.read(min(remaining, _COPY_CHUNK))
        if not chunk:
            raise EOFError("source file ended while copying tensor data")
        dst.write(chunk)
        remaining -= len(chunk)


def main():
    """Assemble an output GGUF from a source GGUF plus organ ``.bin`` files.

    Command line: ``assemble_935.py <source.gguf> <organs_dir> <output.gguf>``.

    The source header (everything before the tensor data section) is copied
    unchanged. For every tensor, the organ file named after the tensor (dots
    replaced by underscores, ``.bin`` suffix) is looked up in the category
    sub-directories of ``organs_dir``; its payload is written only when its
    length equals the byte span of the tensor's slot in the source, which is
    derived from consecutive tensor offsets. In every other case (no organ,
    unreadable organ header, size mismatch) the source bytes are copied.

    Exits with status 1 on a usage error, when the source is not a parseable
    GGUF file, or when the output path refers to the source file itself.
    """
    if len(sys.argv) < 4:
        print("Usage: assemble_935.py <source.gguf> <organs_dir> <output.gguf>")
        sys.exit(1)
    source_gguf, organs_dir, output_gguf = sys.argv[1:4]

    replaced = 0
    fallback = 0
    with open(source_gguf, "rb") as src:
        # Opening the output for writing truncates it, so it must not be
        # the file we are about to read from.
        if os.path.exists(output_gguf) and os.path.samefile(source_gguf, output_gguf):
            print(f"Error: output would overwrite the source file: {output_gguf}")
            sys.exit(1)

        # Parse and validate everything before the output file is created.
        try:
            info = _parse_gguf_header(src)
            data_start = info["data_start"]
            file_end = src.seek(0, os.SEEK_END)
            sizes = _tensor_sizes(info["tensors"], file_end - data_start)
        except ValueError as err:
            print(f"Error: cannot parse {source_gguf}: {err}")
            sys.exit(1)
        tensor_info = info["tensors"]

        print(f"Source: {os.path.basename(source_gguf)}")
        print(
            f" Version: {info['version']}, Tensors: {info['n_tensors']},"
            f" Metadata: {info['n_metadata']}"
        )

        # Copy full header (everything before data_start)
        src.seek(0)
        header = src.read(data_start)

        # Build organ map (name -> filepath); later categories win on clashes.
        organ_map = {}
        for category in _ORGAN_CATEGORIES:
            cat_dir = os.path.join(organs_dir, category)
            if not os.path.isdir(cat_dir):
                continue
            for fname in os.listdir(cat_dir):
                if fname.endswith(".bin"):
                    organ_map[fname[:-4]] = os.path.join(cat_dir, fname)
        print(f" Organ files: {len(organ_map)}")
        print(f" Header: {data_start} bytes")

        with open(output_gguf, "wb") as out:
            out.write(header)
            for ti, tensor_size in zip(tensor_info, sizes):
                name = ti["name"]
                organ_path = organ_map.get(name.replace(".", "_"))

                organ_data = None
                if organ_path and os.path.exists(organ_path):
                    try:
                        # Organ payload only; the organ's own header is skipped.
                        organ_data = read_organ_data_only(organ_path)
                    except struct.error as err:
                        # Unreadable organ header: treat like a mismatch.
                        print(f" [BAD ORGAN] {name}: {err}")

                if organ_data is not None and len(organ_data) == tensor_size:
                    out.write(organ_data)
                    replaced += 1
                    continue

                # Small differences are not reported (threshold kept as-is).
                if organ_data is not None and abs(len(organ_data) - tensor_size) > 100:
                    print(f" [MISMATCH] {name}: organ={len(organ_data)} vs gguf={tensor_size}")
                _copy_range(src, out, data_start + ti["offset"], tensor_size)
                fallback += 1

    final_size = os.path.getsize(output_gguf)
    source_size = os.path.getsize(source_gguf)
    print(f"\n{'='*60}")
    print(" MODEL 935 ASSEMBLED")
    print(f"{'='*60}")
    print(f" Source: {os.path.basename(source_gguf)} ({source_size/(1024**3):.2f} GB)")
    print(f" Output: {output_gguf} ({final_size/(1024**3):.2f} GB)")
    print(f" Replaced: {replaced} tensors from organs")
    print(f" Fallback: {fallback} tensors from source")
    print(f" Size match: {'YES' if abs(final_size - source_size) < 1024 else 'NO — DELTA=' + str(final_size - source_size)}")
    print(" Signature: 935")
    print(f"{'='*60}")
# Script entry point: run the assembler only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()