Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
165 lines
9.4 KiB
Bash
Executable File
165 lines
9.4 KiB
Bash
Executable File
#!/usr/bin/env bash
# ix — Inference-X Model Hub & Benchmark
# Salka Elmadani | Morocco
#
# Subcommands (dispatched at the bottom of this file): list, pull, bench, serve.
#
# Environment overrides:
#   IX   path to the inference binary      (default: ./inference-x)
#   HUB  directory that holds model files  (default: ./models)
#   RES  directory for benchmark logs/CSV  (default: ./benchmarks)

# Note: -e is intentionally left off here; the rest of the script uses
# `[[ … ]] && …` short-circuits whose non-zero status must not abort the run.
set -uo pipefail

IX="${IX:-./inference-x}"
HUB="${HUB:-./models}"
RES="${RES:-./benchmarks}"

# Best effort: mkdir reports its own error if a directory cannot be created.
mkdir -p "$HUB" "$RES"

# CPU label for report headers: the first "model name" entry of /proc/cpuinfo
# with runs of whitespace collapsed. Some platforms expose no such line (or no
# /proc at all), so fall back to a placeholder instead of an empty string.
CPU=$(grep -m1 "model name" /proc/cpuinfo 2>/dev/null \
  | sed -e 's/.*: *//' -e 's/[[:space:]]\{1,\}/ /g')
CPU="${CPU:-unknown CPU}"

# Total RAM in whole GiB (MemTotal is in KiB), rounded to nearest.
# Left empty when /proc/meminfo cannot be read.
RAM_GB=$(awk '/MemTotal/ {printf "%.0f", $2/1024/1024}' /proc/meminfo 2>/dev/null)

# Online core count; fall back to getconf, then to 1, so it is always numeric.
CORES=$(nproc 2>/dev/null) \
  || CORES=$(getconf _NPROCESSORS_ONLN 2>/dev/null) \
  || CORES=1
[[ "$CORES" =~ ^[1-9][0-9]*$ ]] || CORES=1
|
|
|
|
#######################################
# Locate a model file by name.
# Globals:   HUB (read), HOME (read, optional)
# Arguments: $1 - model file name (e.g. foo.gguf)
# Outputs:   full path of the first match on stdout
# Returns:   0 if found, 1 otherwise (including an empty file name)
#######################################
find_model() {
  local fn="${1:-}"
  [[ -n "$fn" ]] || return 1

  # Search order: the hub directory first, then the per-user models directory.
  # HOME is quoted (paths with spaces) and skipped entirely when unset.
  local -a dirs=("$HUB")
  if [[ -n "${HOME:-}" ]]; then
    dirs+=("$HOME/models")
  fi

  local dir
  for dir in "${dirs[@]}"; do
    if [[ -f "$dir/$fn" ]]; then
      printf '%s\n' "$dir/$fn"
      return 0
    fi
  done
  return 1
}
|
|
|
|
#######################################
# Benchmark a single model and record the result.
# Runs "$IX" on the model with a fixed prompt, prints one table row and
# appends one CSV row to "$RES/results.csv". Raw output goes to
# "$RES/<name>.log".
# Globals:   IX, RES (read); uses find_model
# Arguments: $1 name  $2 file name  $3 size (GB)  $4 params  $5 quant
#            $6 tokens to generate (default 4)
# Outputs:   one formatted result row on stdout
# Returns:   1 if the model file is not found, 0 otherwise
# Quality:   OK      tokens were generated and the output contains a word
#            GARB    tokens were generated but no two consecutive letters
#            TIMEOUT no tokens and the run hit the 600 s limit (status 124)
#            CRASH   no tokens and any other non-zero exit status
#            FAIL    no tokens although the process exited 0
#######################################
bench_one() {
  local name="$1" fn="$2" size="$3" params="$4" quant="$5" ntok="${6:-4}"

  local path
  path=$(find_model "$fn") || path=""
  if [[ -z "$path" ]]; then
    printf " %-20s NOT FOUND\n" "$name"
    return 1
  fi

  # Ask the kernel to drop the page cache before the run (best effort).
  # The redirection is wrapped in a group so that the "Permission denied"
  # raised while opening the file is silenced as well.
  sync
  { echo 3 > /proc/sys/vm/drop_caches; } 2>/dev/null || true

  local log="$RES/${name}.log"
  local t0 t1 rc=0
  t0=$(date +%s%N)
  timeout 600 "$IX" "$path" --raw -p "The capital of France is" -n "$ntok" -t 0.1 \
    > "$log" 2>&1 || rc=$?
  t1=$(date +%s%N)

  # Wall-clock time in ms, then seconds with one (truncated) decimal.
  # Integer arithmetic keeps the leading zero and needs no external tool.
  local ms=$(( (t1 - t0) / 1000000 ))
  local secs
  printf -v secs '%d.%d' "$(( ms / 1000 ))" "$(( (ms % 1000) / 100 ))"

  # Generated-token count from the first "[GEN] <n>" marker in the log.
  local gen
  gen=$(sed -n 's/.*\[GEN\] \([0-9][0-9]*\).*/\1/p' "$log" 2>/dev/null | head -n 1)
  [[ "$gen" =~ ^[0-9]+$ ]] || gen=0
  gen=$(( 10#$gen ))  # force base 10 so a leading zero is not read as octal

  # Text between the OUTPUT marker and the following rule line, flattened to
  # one line and capped at 60 bytes.
  local output
  output=$(awk '/OUTPUT/{f=1;next} /────/{if(f)exit} f' "$log" 2>/dev/null \
    | tr '\n' ' ' | sed 's/^[[:space:]]*//' | head -c 60)

  # Tokens per second with two (truncated) decimals.
  local tps="0"
  if (( gen > 0 && ms > 0 )); then
    printf -v tps '%d.%02d' "$(( gen * 1000 / ms ))" "$(( (gen * 100000 / ms) % 100 ))"
  fi

  # Classify. The garbage check applies only to runs that generated tokens,
  # so it can no longer overwrite TIMEOUT / CRASH / FAIL.
  local q
  if (( gen > 0 )); then
    q="OK"
    LC_ALL=C grep -qiE '[a-z]{2,}' <<< "$output" || q="GARB"
  elif (( rc == 124 )); then
    q="TIMEOUT"
  elif (( rc != 0 )); then
    q="CRASH"
  else
    q="FAIL"
  fi

  printf " %-20s %5s %7s %5sGB %7ss %6s/s %-7s %.50s\n" "$name" "$params" "$quant" "$size" "$secs" "$tps" "$q" "$output"
  echo "$name,$params,$quant,$size,$secs,$tps,$q" >> "$RES/results.csv"
}
|
|
|
|
# ── Model registry ────────────────────────────────────────────────────────────
# One model per line: name|huggingface_repo|file_name|size_gb|params|quant
# Kept in a single place; the bench and serve subsets are selected by name.
ix_registry() {
  cat << 'REGISTRY'
smollm2-135m|HuggingFaceTB/SmolLM2-135M-Instruct-GGUF|smollm2-135m-instruct-q8_0.gguf|0.1|135M|Q8_0
llama-3.2-1b|bartowski/Llama-3.2-1B-Instruct-GGUF|Llama-3.2-1B-Instruct-Q4_K_M.gguf|0.8|1B|Q4_K_M
llama-3.2-3b|bartowski/Llama-3.2-3B-Instruct-GGUF|Llama-3.2-3B-Instruct-Q4_K_M.gguf|2.0|3B|Q4_K_M
qwen2.5-3b|Qwen/Qwen2.5-3B-Instruct-GGUF|qwen2.5-3b-instruct-q4_k_m.gguf|2.0|3B|Q4_K_M
phi-3.5-mini|bartowski/Phi-3.5-mini-instruct-GGUF|Phi-3.5-mini-instruct-Q4_K_M.gguf|2.3|3.8B|Q4_K_M
deepseek-r1-7b|bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF|DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf|4.7|7B|Q4_K_M
qwen2.5-7b|Qwen/Qwen2.5-7B-Instruct-GGUF|qwen2.5-7b-instruct-q4_k_m.gguf|4.7|7B|Q4_K_M
mistral-7b|bartowski/Mistral-7B-Instruct-v0.3-GGUF|Mistral-7B-Instruct-v0.3-Q4_K_M.gguf|4.4|7B|Q4_K_M
llama-3.1-8b|bartowski/Meta-Llama-3.1-8B-Instruct-GGUF|Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf|4.9|8B|Q4_K_M
gemma-2-9b|bartowski/gemma-2-9b-it-GGUF|gemma-2-9b-it-Q4_K_M.gguf|5.8|9B|Q4_K_M
deepseek-r1-14b|bartowski/DeepSeek-R1-Distill-Qwen-14B-GGUF|DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.gguf|8.7|14B|Q4_K_M
qwen2.5-14b|Qwen/Qwen2.5-14B-Instruct-GGUF|qwen2.5-14b-instruct-q4_k_m.gguf|9.0|14B|Q4_K_M
qwen2.5-32b|Qwen/Qwen2.5-32B-Instruct-GGUF|qwen2.5-32b-instruct-q4_k_m.gguf|19.8|32B|Q4_K_M
deepseek-r1-32b|bartowski/DeepSeek-R1-Distill-Qwen-32B-GGUF|DeepSeek-R1-Distill-Qwen-32B-Q4_K_M.gguf|19.8|32B|Q4_K_M
llama-3.1-70b|bartowski/Meta-Llama-3.1-70B-Instruct-GGUF|Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf|42.5|70B|Q4_K_M
qwen2.5-72b|Qwen/Qwen2.5-72B-Instruct-GGUF|qwen2.5-72b-instruct-q4_k_m.gguf|44.0|72B|Q4_K_M
REGISTRY
}

# Print the registry rows whose name equals one of the arguments, in
# registry order. Prints nothing when no name matches.
ix_registry_pick() {
  local row wanted
  while IFS= read -r row; do
    for wanted in "$@"; do
      if [[ "${row%%|*}" == "$wanted" ]]; then
        printf '%s\n' "$row"
        break
      fi
    done
  done < <(ix_registry)
}

# Models run by `ix bench all`.
ix_registry_bench() {
  ix_registry_pick smollm2-135m llama-3.2-3b phi-3.5-mini deepseek-r1-7b deepseek-r1-14b
}

# Candidates for `ix serve` auto-selection, smallest first.
ix_registry_serve() {
  ix_registry_pick smollm2-135m llama-3.2-3b deepseek-r1-7b
}

# Succeeds when the whole-GB part of size $1 exceeds RAM_GB.
# When RAM_GB is not a number (detection failed) nothing is considered too big.
ix_too_big() {
  local whole="${1%.*}"
  [[ "${RAM_GB:-}" =~ ^[0-9]+$ && "$whole" =~ ^[0-9]+$ ]] || return 1
  (( 10#$whole > 10#$RAM_GB ))
}

# ix list — table of every registry model with LOCAL / REMOTE / TOO BIG status.
ix_cmd_list() {
  local name repo fn size params quant path st
  echo ""
  echo " INFERENCE-X MODEL HUB | $CPU | ${RAM_GB}GB | $CORES cores"
  echo ""
  printf " %-20s %5s %7s %6s %s\n" "MODEL" "PARAM" "QUANT" "SIZE" "STATUS"
  echo " ════════════════════════════════════════════════════════════"
  while IFS='|' read -r name repo fn size params quant; do
    path=$(find_model "$fn" 2>/dev/null) || path=""
    if [[ -n "$path" ]]; then
      st="LOCAL"
    elif ix_too_big "$size"; then
      st="TOO BIG"
    else
      st="REMOTE"
    fi
    printf " %-20s %5s %7s %5sGB %s\n" "$name" "$params" "$quant" "$size" "$st"
  done < <(ix_registry)
  echo ""
}

# ix pull <model|all> — download missing model files into "$HUB".
# Returns non-zero for a missing/unknown name or when any download failed.
ix_cmd_pull() {
  local name="${1:-}"
  if [[ -z "$name" ]]; then
    echo "Usage: ix pull <model>"
    return 1
  fi

  local n repo fn size params quant path part
  local matched=0 failed=0
  while IFS='|' read -r n repo fn size params quant; do
    [[ "$name" == "all" || "$name" == "$n" ]] || continue
    matched=1

    # With "all", skip models larger than RAM; an explicit name is honoured.
    if [[ "$name" == "all" ]] && ix_too_big "$size"; then
      echo "SKIP $n (${size}GB > ${RAM_GB}GB)"
      continue
    fi

    path=$(find_model "$fn" 2>/dev/null) || path=""
    if [[ -n "$path" ]]; then
      echo "✓ $n: $path"
      continue
    fi

    echo "⬇ $n (${size}GB)..."
    # Download to a .part file and rename only on success, so an interrupted
    # transfer is never mistaken for a complete model by find_model.
    # -c lets a later run resume the .part file. stdin is detached so wget
    # cannot consume the registry stream feeding this loop.
    part="$HUB/$fn.part"
    if wget -q --show-progress -c -O "$part" \
        "https://huggingface.co/$repo/resolve/main/$fn" < /dev/null \
      && mv -f -- "$part" "$HUB/$fn"; then
      echo "✓ $n"
    else
      echo "✗ $n FAILED"
      failed=1
    fi
  done < <(ix_registry)

  if (( matched == 0 )); then
    echo "Unknown model: $name (see: ix list)" >&2
    return 1
  fi
  return "$failed"
}

# ix bench [model|all] [ntok] — run bench_one over the benchmark suite, or
# over one named registry model. Rewrites "$RES/results.csv".
ix_cmd_bench() {
  local target="${1:-all}" ntok="${2:-4}"

  # "all" runs the curated suite; a specific name may be any registry model.
  local rows
  if [[ "$target" == "all" ]]; then
    rows=$(ix_registry_bench)
  else
    rows=$(ix_registry_pick "$target")
  fi
  if [[ -z "$rows" ]]; then
    # Checked before results.csv is truncated.
    echo "Unknown model: $target (see: ix list)" >&2
    return 1
  fi

  echo ""
  echo "═══════════════════════════════════════════════════════════════"
  echo " INFERENCE-X VALIDATION | $CPU | ${RAM_GB}GB | $CORES cores"
  echo " $(date -u +%Y-%m-%dT%H:%M:%SZ) | $ntok tokens/model"
  echo "═══════════════════════════════════════════════════════════════"
  echo ""
  printf " %-20s %5s %7s %6s %7s %8s %-7s %s\n" "MODEL" "PARAM" "QUANT" "SIZE" "TIME" "SPEED" "QUAL" "OUTPUT"
  echo " ════════════════════════════════════════════════════════════════════════════════════"
  echo "model,params,quant,size_gb,time_s,tok_s,quality" > "$RES/results.csv"

  local name repo fn size params quant
  while IFS='|' read -r name repo fn size params quant; do
    # stdin detached: the benchmarked process must not read the row list.
    bench_one "$name" "$fn" "$size" "$params" "$quant" "$ntok" < /dev/null
  done <<< "$rows"

  echo ""
  echo " Results: $RES/results.csv"
  echo "═══════════════════════════════════════════════════════════════"
}

# ix serve [port] [model] — start "$IX" in server mode. Without a model path,
# pick the last (largest) serve candidate that is present locally and not
# larger than RAM.
ix_cmd_serve() {
  local port="${1:-8080}" model="${2:-}"

  if [[ -z "$model" ]]; then
    local n repo fn size params quant path best=""
    while IFS='|' read -r n repo fn size params quant; do
      if ix_too_big "$size"; then
        continue
      fi
      path=$(find_model "$fn" 2>/dev/null) || path=""
      if [[ -n "$path" ]]; then
        best="$path"
      fi
    done < <(ix_registry_serve)

    if [[ -z "$best" ]]; then
      echo "No model found. Run: ix pull <model>"
      return 1
    fi
    model="$best"
  fi

  echo "Starting IX server on port $port with $model"
  "$IX" "$model" --serve "$port"
}

# Entry point: dispatch on the first argument; anything else prints usage.
main() {
  case "${1:-help}" in
    list)
      ix_cmd_list
      ;;
    pull)
      ix_cmd_pull "${2:-}"
      ;;
    bench)
      ix_cmd_bench "${2:-all}" "${3:-4}"
      ;;
    serve)
      ix_cmd_serve "${2:-8080}" "${3:-}"
      ;;
    *)
      echo " ix list | pull <model|all> | bench [model|all] [ntok] | serve [port] [model]"
      ;;
  esac
}

main "$@"
|