inference-x/runtime/server.h
Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
Better output from the same model. Fused computation, adaptive precision,
surgical expert loading. 305 KB, 19 backends, zero dependencies.

https://inference-x.com
2026-02-23 07:10:47 +00:00

534 lines
21 KiB
C++

// runtime/server.h — OpenAI-Compatible HTTP Server for Inference-X
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
//
// Zero dependencies. POSIX sockets. Drop-in replacement for OpenAI API.
// Any app that talks to GPT-4 talks to your local model. No code change.
//
// Endpoints:
// POST /v1/chat/completions — Chat with streaming (SSE)
// POST /v1/completions — Text completion
// GET /v1/models — List loaded model
// GET /health — Health check
//
#pragma once
#include <string>
#include <vector>
#include <functional>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <atomic>
#include <sstream>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <signal.h>
#include "identity.h"
namespace ix {
// ═══════════════════════════════════════════════════════════════════════════
// Minimal JSON helpers — just enough for OpenAI protocol, no external lib
// ═══════════════════════════════════════════════════════════════════════════
// A single chat turn as received in the OpenAI "messages" array.
struct ChatMessage {
std::string role;    // "system", "user" or "assistant" (values checked in handle_chat)
std::string content; // message text, already JSON-unescaped by json_str
};
// Parsed /v1/chat/completions request body (built by parse_chat_request).
// The member defaults are used whenever the client omits a field.
struct ChatRequest {
std::string model;                 // requested model name (informational only)
std::vector<ChatMessage> messages; // conversation turns, in request order
int max_tokens = 512;              // cap on generated tokens
float temperature = 0.6f;          // sampling temperature
float top_p = 0.9f;                // nucleus-sampling cutoff
bool stream = false;               // true => reply as SSE chunks
};
// Extract the string value for a key from minimal JSON.
// Handles the common backslash escapes (\n, \t, \", \\); any other
// escaped character is kept literally (e.g. "\u" yields 'u').
// Returns "" when the key or a string value cannot be located.
static std::string json_str(const std::string& json, const std::string& key) {
    const std::string quoted_key = "\"" + key + "\"";
    size_t at = json.find(quoted_key);
    if (at == std::string::npos) return "";
    at = json.find(':', at + quoted_key.size());
    if (at == std::string::npos) return "";
    at = json.find('"', at + 1);
    if (at == std::string::npos) return "";
    std::string out;
    for (size_t i = at + 1; i < json.size(); ++i) {
        const char c = json[i];
        if (c == '"') break;  // unescaped closing quote ends the value
        if (c == '\\' && i + 1 < json.size()) {
            const char esc = json[++i];
            switch (esc) {
                case 'n':  out += '\n'; break;
                case 't':  out += '\t'; break;
                case '"':  out += '"';  break;
                case '\\': out += '\\'; break;
                default:   out += esc;  break;  // unknown escape: literal char
            }
        } else {
            out += c;
        }
    }
    return out;
}
// Extract a numeric value for a key from minimal JSON.
// Skips spaces/tabs after the colon and parses at most 20 characters.
// Returns `def` when the key is missing or the value does not parse.
static double json_num(const std::string& json, const std::string& key, double def) {
    const std::string quoted_key = "\"" + key + "\"";
    size_t at = json.find(quoted_key);
    if (at == std::string::npos) return def;
    at = json.find(':', at + quoted_key.size());
    if (at == std::string::npos) return def;
    try {
        // find_first_not_of may return npos; substr then throws and we
        // fall through to the default, same as a failed stod.
        const size_t start = json.find_first_not_of(" \t", at + 1);
        return std::stod(json.substr(start, 20));
    } catch (...) {
        return def;
    }
}
// Extract a boolean value for a key from minimal JSON.
// Accepts the literals "true" / "false" after the colon (spaces skipped);
// anything else — including a missing key — yields `def`.
static bool json_bool(const std::string& json, const std::string& key, bool def) {
    const std::string quoted_key = "\"" + key + "\"";
    size_t at = json.find(quoted_key);
    if (at == std::string::npos) return def;
    at = json.find(':', at + quoted_key.size());
    if (at == std::string::npos) return def;
    for (++at; at < json.size() && json[at] == ' '; ++at) {}
    if (json.compare(at, 4, "true") == 0) return true;
    if (json.compare(at, 5, "false") == 0) return false;
    return def;
}
// Parse the "messages" array from a chat request body.
//
// Scans the array character-by-character, tracking string literals (with
// backslash escapes) and brace nesting depth, so that role/content values
// containing '{', '}' or ']' no longer truncate the parse — the previous
// version used naive find('}') / find(']') and broke on such content.
//
// Returns messages in request order; objects without a "role" are skipped.
static std::vector<ChatMessage> parse_messages(const std::string& json) {
    std::vector<ChatMessage> msgs;
    size_t pos = json.find("\"messages\"");
    if (pos == std::string::npos) return msgs;
    pos = json.find('[', pos);
    if (pos == std::string::npos) return msgs;
    pos++;                      // step inside the array
    int depth = 0;              // brace nesting depth; 0 = between objects
    bool in_string = false;     // currently inside a JSON string literal
    size_t obj_start = 0;       // index of the current object's '{'
    for (size_t i = pos; i < json.size(); i++) {
        char c = json[i];
        if (in_string) {
            if (c == '\\') { i++; continue; }   // skip the escaped character
            if (c == '"') in_string = false;
            continue;
        }
        if (c == '"') { in_string = true; continue; }
        if (c == '{') {
            if (depth == 0) obj_start = i;
            depth++;
        } else if (c == '}') {
            if (depth > 0 && --depth == 0) {
                // One complete top-level object: extract role/content.
                std::string obj = json.substr(obj_start, i - obj_start + 1);
                ChatMessage msg;
                msg.role = json_str(obj, "role");
                msg.content = json_str(obj, "content");
                if (!msg.role.empty()) msgs.push_back(msg);
            }
        } else if (c == ']' && depth == 0) {
            break;  // end of the messages array
        }
    }
    return msgs;
}
// Decode an OpenAI chat-completions request body into a ChatRequest.
// Fields absent from the body fall back to the same defaults declared
// on ChatRequest (512 tokens, T=0.6, top_p=0.9, non-streaming).
static ChatRequest parse_chat_request(const std::string& body) {
    ChatRequest req;
    req.stream = json_bool(body, "stream", false);
    req.top_p = (float)json_num(body, "top_p", 0.9);
    req.temperature = (float)json_num(body, "temperature", 0.6);
    req.max_tokens = (int)json_num(body, "max_tokens", 512);
    req.messages = parse_messages(body);
    req.model = json_str(body, "model");
    return req;
}
// Escape a string for embedding inside a JSON string value.
// Handles quote/backslash and the named whitespace escapes, and emits
// \u00XX for any other control character below 0x20 — RFC 8259 forbids
// raw control bytes in strings, and the previous version passed them
// through, producing JSON that strict clients reject.
static std::string json_escape(const std::string& s) {
    std::string r;
    r.reserve(s.size() + 16);
    for (char c : s) {
        switch (c) {
        case '"': r += "\\\""; break;
        case '\\': r += "\\\\"; break;
        case '\n': r += "\\n"; break;
        case '\r': r += "\\r"; break;
        case '\t': r += "\\t"; break;
        case '\b': r += "\\b"; break;
        case '\f': r += "\\f"; break;
        default:
            if ((unsigned char)c < 0x20) {
                // Remaining control characters need the \u00XX form.
                char buf[8];
                snprintf(buf, sizeof(buf), "\\u%04x", (unsigned char)c);
                r += buf;
            } else {
                r += c;
            }
        }
    }
    return r;
}
// Generate a unique completion ID ("chatcmpl-<hex>").
// Mixes the wall-clock time with a process-wide atomic counter so IDs
// stay unique even when several requests land in the same second —
// time() alone (the previous version) produced duplicates.
static std::string gen_id() {
    static std::atomic<unsigned> counter{0};
    char buf[48];
    snprintf(buf, sizeof(buf), "chatcmpl-%lx%x",
             (long)time(nullptr), counter.fetch_add(1));
    return buf;
}
// ═══════════════════════════════════════════════════════════════════════════
// HTTP Server
// ═══════════════════════════════════════════════════════════════════════════
// Callback: given system+user prompt, stream tokens
//   system      — concatenated "system" message contents (may be empty)
//   user        — user/assistant transcript to complete
//   max_tokens  — cap on generated tokens
//   temperature — sampling temperature
//   top_p       — nucleus-sampling cutoff
//   on_token    — invoked once per generated token; NOTE(review): all call
//                 sites in this file return true — presumably false means
//                 "stop generating", confirm with the backend implementation
using GenerateFn = std::function<void(
const std::string& system,
const std::string& user,
int max_tokens,
float temperature,
float top_p,
std::function<bool(const std::string& token)> on_token
)>;
// Minimal OpenAI-compatible HTTP server over raw POSIX sockets.
// Requests are handled sequentially on the thread that calls run();
// the generate_ callback is assumed single-threaded.
class Server {
public:
    // port       — TCP port to listen on (all interfaces)
    // model_name — reported by /v1/models and echoed in every completion
    // generate   — backend callback that produces tokens for a prompt
    Server(int port, const std::string& model_name, GenerateFn generate)
        : port_(port), model_name_(model_name), generate_(generate) {}

    // Accept loop. Blocks the calling thread until stop() is observed.
    // One request is served at a time, so a slow generation delays later
    // clients.
    void run() {
        // FIX: a client that disconnects mid-response would otherwise
        // raise SIGPIPE on the next send() and kill the whole process
        // (send_response/send_sse_start did not pass MSG_NOSIGNAL).
        signal(SIGPIPE, SIG_IGN);
        int server_fd = socket(AF_INET, SOCK_STREAM, 0);
        if (server_fd < 0) { perror("socket"); return; }
        int opt = 1;
        // Allow quick restarts without waiting out TIME_WAIT.
        setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
        struct sockaddr_in addr;
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_addr.s_addr = INADDR_ANY;
        addr.sin_port = htons(port_);
        if (bind(server_fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
            perror("bind"); close(server_fd); return;
        }
        if (listen(server_fd, 16) < 0) {
            perror("listen"); close(server_fd); return;
        }
        printf("\n");
        printf("╔══════════════════════════════════════════════════════════════╗\n");
        printf("║ Inference-X Server — OpenAI-Compatible API ║\n");
        printf("╠══════════════════════════════════════════════════════════════╣\n");
        printf("║ Model : %-49s ║\n", model_name_.c_str());
        printf("║ Port : %-49d ║\n", port_);
        printf("║ API : http://0.0.0.0:%-35d ║\n", port_);
        printf("╠══════════════════════════════════════════════════════════════╣\n");
        printf("║ POST /v1/chat/completions Chat (streaming + sync) ║\n");
        printf("║ POST /v1/completions Text completion ║\n");
        printf("║ GET /v1/models Model info ║\n");
        printf("║ GET /health Health check ║\n");
        printf("╚══════════════════════════════════════════════════════════════╝\n");
        printf("\nReady. Ctrl+C to stop.\n\n");
        fflush(stdout);
        while (!stopped_) {
            struct sockaddr_in client_addr;
            socklen_t client_len = sizeof(client_addr);
            int client_fd = accept(server_fd, (struct sockaddr*)&client_addr, &client_len);
            if (client_fd < 0) continue;
            // Handle in-thread (sequential for now — model is single-threaded)
            handle_client(client_fd);
            close(client_fd);
        }
        close(server_fd);
    }

    // Request the accept loop to exit. NOTE(review): accept() blocks, so
    // the flag is only checked after the next connection arrives.
    void stop() { stopped_ = true; }

private:
    int port_;
    std::string model_name_;
    GenerateFn generate_;
    std::atomic<bool> stopped_{false};
    int total_requests_ = 0;  // served requests (single-threaded: no lock)
    int total_tokens_ = 0;    // generated tokens across all requests

    // ─── HTTP parsing ──────────────────────────────────────────────────
    // Minimal decoded HTTP request: request line, body, Content-Length.
    struct HttpRequest {
        std::string method;
        std::string path;
        std::string body;
        int content_length = 0;
    };

    // Read one HTTP request from fd and parse method, path, body.
    // Assumes the full header block fits in the first recv(); continues
    // reading only until the body reaches Content-Length bytes.
    // An empty method signals a failed/closed connection.
    HttpRequest read_request(int fd) {
        HttpRequest req;
        char buf[65536];
        int n = recv(fd, buf, sizeof(buf) - 1, 0);
        if (n <= 0) return req;
        buf[n] = '\0';
        std::string raw(buf, n);
        // Parse method + path from the request line ("GET /path HTTP/1.1").
        size_t sp1 = raw.find(' ');
        if (sp1 == std::string::npos) return req;
        req.method = raw.substr(0, sp1);
        size_t sp2 = raw.find(' ', sp1 + 1);
        req.path = raw.substr(sp1 + 1, sp2 - sp1 - 1);
        // Content-Length header (both common capitalizations; 16 = strlen
        // of "Content-Length: ").
        size_t cl_pos = raw.find("Content-Length: ");
        if (cl_pos == std::string::npos) cl_pos = raw.find("content-length: ");
        if (cl_pos != std::string::npos) {
            req.content_length = atoi(raw.c_str() + cl_pos + 16);
        }
        // Body starts after the blank line terminating the headers.
        size_t body_start = raw.find("\r\n\r\n");
        if (body_start != std::string::npos) {
            body_start += 4;
            req.body = raw.substr(body_start);
            // Keep reading until the advertised body length is reached.
            while ((int)req.body.size() < req.content_length) {
                n = recv(fd, buf, sizeof(buf) - 1, 0);
                if (n <= 0) break;
                buf[n] = '\0';
                req.body.append(buf, n);
            }
        }
        return req;
    }

    // ─── HTTP responses ────────────────────────────────────────────────
    // Send a complete HTTP response (headers + body) with permissive CORS.
    void send_response(int fd, int status, const std::string& body,
                       const std::string& content_type = "application/json") {
        std::string status_text = (status == 200) ? "OK" : "Not Found";
        char header[512];
        snprintf(header, sizeof(header),
                 "HTTP/1.1 %d %s\r\n"
                 "Content-Type: %s\r\n"
                 "Content-Length: %zu\r\n"
                 "Access-Control-Allow-Origin: *\r\n"
                 "Access-Control-Allow-Methods: POST, GET, OPTIONS\r\n"
                 "Access-Control-Allow-Headers: Content-Type, Authorization\r\n"
                 "X-Powered-By: %s\r\n"
                 "\r\n",
                 status, status_text.c_str(),
                 content_type.c_str(), body.size(),
                 ix::identity::license().server_header().c_str());
        // MSG_NOSIGNAL: never raise SIGPIPE if the client has gone away.
        send(fd, header, strlen(header), MSG_NOSIGNAL);
        send(fd, body.c_str(), body.size(), MSG_NOSIGNAL);
    }

    // Begin a Server-Sent-Events response; chunks follow via send_sse_event.
    void send_sse_start(int fd) {
        const char* header =
            "HTTP/1.1 200 OK\r\n"
            "Content-Type: text/event-stream\r\n"
            "Cache-Control: no-cache\r\n"
            "Connection: keep-alive\r\n"
            "Access-Control-Allow-Origin: *\r\n"
            "\r\n";
        send(fd, header, strlen(header), MSG_NOSIGNAL);
    }

    // Emit one SSE frame: "data: <payload>\n\n".
    void send_sse_event(int fd, const std::string& data) {
        std::string event = "data: " + data + "\n\n";
        send(fd, event.c_str(), event.size(), MSG_NOSIGNAL);
    }

    // ─── Route handlers ────────────────────────────────────────────────
    // Dispatch a single connection: parse the request and route it.
    void handle_client(int fd) {
        HttpRequest req = read_request(fd);
        if (req.method.empty()) return;
        total_requests_++;
        ix::identity::license().on_request();
        // CORS preflight
        if (req.method == "OPTIONS") {
            send_response(fd, 200, "");
            return;
        }
        // Health check
        if (req.path == "/health") {
            char json[256];
            snprintf(json, sizeof(json),
                     "{\"status\":\"ok\",\"model\":\"%s\",\"requests\":%d,\"tokens\":%d}",
                     model_name_.c_str(), total_requests_, total_tokens_);
            send_response(fd, 200, json);
            return;
        }
        // List models
        if (req.path == "/v1/models" && req.method == "GET") {
            char json[512];
            snprintf(json, sizeof(json),
                     "{\"object\":\"list\",\"data\":[{\"id\":\"%s\","
                     "\"object\":\"model\",\"owned_by\":\"inference-x\"}]}",
                     model_name_.c_str());
            send_response(fd, 200, json);
            return;
        }
        // Chat completions
        if (req.path == "/v1/chat/completions" && req.method == "POST") {
            handle_chat(fd, req.body);
            return;
        }
        // Text completions
        if (req.path == "/v1/completions" && req.method == "POST") {
            handle_completion(fd, req.body);
            return;
        }
        send_response(fd, 404, "{\"error\":\"not found\"}");
    }

    // POST /v1/chat/completions — streaming (SSE) or synchronous JSON.
    void handle_chat(int fd, const std::string& body) {
        ChatRequest req = parse_chat_request(body);
        // Flatten the messages array into a system prompt and a user
        // transcript for the single-prompt generate_ callback.
        std::string system_prompt, user_prompt;
        for (auto& msg : req.messages) {
            if (msg.role == "system") system_prompt += msg.content + "\n";
            else if (msg.role == "user") user_prompt += msg.content + "\n";
            else if (msg.role == "assistant") {
                // For multi-turn context, append assistant messages too
                user_prompt += "[Assistant]: " + msg.content + "\n[User]: ";
            }
        }
        // Fallback: no user/assistant content — use the last message as-is.
        if (user_prompt.empty() && !req.messages.empty()) {
            user_prompt = req.messages.back().content;
        }
        std::string chat_id = gen_id();
        long created = (long)time(nullptr);
        if (req.stream) {
            // ─── Streaming (SSE): one chunk per token ───
            send_sse_start(fd);
            int token_count = 0;
            generate_(system_prompt, user_prompt, req.max_tokens,
                      req.temperature, req.top_p,
                      [&](const std::string& token) -> bool {
                          token_count++;
                          total_tokens_++;
                          char chunk[2048];
                          snprintf(chunk, sizeof(chunk),
                                   "{\"id\":\"%s\",\"object\":\"chat.completion.chunk\","
                                   "\"created\":%ld,\"model\":\"%s\","
                                   "\"choices\":[{\"index\":0,\"delta\":"
                                   "{\"content\":\"%s\"},\"finish_reason\":null}]}",
                                   chat_id.c_str(), created,
                                   model_name_.c_str(),
                                   json_escape(token).c_str());
                          send_sse_event(fd, chunk);
                          return true;
                      });
            // Final chunk with finish_reason, then the SSE terminator.
            char done[512];
            snprintf(done, sizeof(done),
                     "{\"id\":\"%s\",\"object\":\"chat.completion.chunk\","
                     "\"created\":%ld,\"model\":\"%s\","
                     "\"choices\":[{\"index\":0,\"delta\":{},"
                     "\"finish_reason\":\"stop\"}]}",
                     chat_id.c_str(), created, model_name_.c_str());
            send_sse_event(fd, done);
            send_sse_event(fd, "[DONE]");
        } else {
            // ─── Non-streaming: collect all tokens, reply once ───
            std::string full_response;
            int token_count = 0;
            generate_(system_prompt, user_prompt, req.max_tokens,
                      req.temperature, req.top_p,
                      [&](const std::string& token) -> bool {
                          full_response += token;
                          token_count++;
                          total_tokens_++;
                          return true;
                      });
            // NOTE(review): responses longer than this buffer are silently
            // truncated by snprintf — confirm acceptable for max_tokens caps.
            char json[65536];
            snprintf(json, sizeof(json),
                     "{\"id\":\"%s\",\"object\":\"chat.completion\","
                     "\"created\":%ld,\"model\":\"%s\","
                     "\"choices\":[{\"index\":0,\"message\":"
                     "{\"role\":\"assistant\",\"content\":\"%s\"},"
                     "\"finish_reason\":\"stop\"}],"
                     "\"usage\":{\"prompt_tokens\":0,"
                     "\"completion_tokens\":%d,\"total_tokens\":%d}}",
                     chat_id.c_str(), created,
                     model_name_.c_str(),
                     json_escape(full_response).c_str(),
                     token_count, token_count);
            send_response(fd, 200, json);
        }
    }

    // POST /v1/completions — legacy text-completion endpoint.
    void handle_completion(int fd, const std::string& body) {
        std::string prompt = json_str(body, "prompt");
        int max_tokens = (int)json_num(body, "max_tokens", 256);
        float temperature = (float)json_num(body, "temperature", 0.6);
        float top_p = (float)json_num(body, "top_p", 0.9);
        bool stream = json_bool(body, "stream", false);
        std::string comp_id = gen_id();
        long created = (long)time(nullptr);
        if (stream) {
            // ─── Streaming (SSE): one chunk per token ───
            send_sse_start(fd);
            int token_count = 0;
            generate_("", prompt, max_tokens, temperature, top_p,
                      [&](const std::string& token) -> bool {
                          token_count++;
                          total_tokens_++;
                          char chunk[2048];
                          snprintf(chunk, sizeof(chunk),
                                   "{\"id\":\"%s\",\"object\":\"text_completion\","
                                   "\"created\":%ld,\"model\":\"%s\","
                                   "\"choices\":[{\"text\":\"%s\",\"index\":0,"
                                   "\"finish_reason\":null}]}",
                                   comp_id.c_str(), created,
                                   model_name_.c_str(),
                                   json_escape(token).c_str());
                          send_sse_event(fd, chunk);
                          return true;
                      });
            char done[256];
            snprintf(done, sizeof(done),
                     "{\"id\":\"%s\",\"object\":\"text_completion\","
                     "\"created\":%ld,\"choices\":[{\"text\":\"\","
                     "\"finish_reason\":\"stop\"}]}",
                     comp_id.c_str(), created);
            send_sse_event(fd, done);
            send_sse_event(fd, "[DONE]");
        } else {
            // ─── Non-streaming: collect all tokens, reply once ───
            std::string full;
            int token_count = 0;
            generate_("", prompt, max_tokens, temperature, top_p,
                      [&](const std::string& token) -> bool {
                          full += token;
                          token_count++;
                          total_tokens_++;
                          return true;
                      });
            char json[65536];
            snprintf(json, sizeof(json),
                     "{\"id\":\"%s\",\"object\":\"text_completion\","
                     "\"created\":%ld,\"model\":\"%s\","
                     "\"choices\":[{\"text\":\"%s\",\"index\":0,"
                     "\"finish_reason\":\"stop\"}],"
                     "\"usage\":{\"prompt_tokens\":0,"
                     "\"completion_tokens\":%d,\"total_tokens\":%d}}",
                     comp_id.c_str(), created,
                     model_name_.c_str(),
                     json_escape(full).c_str(),
                     token_count, token_count);
            send_response(fd, 200, json);
        }
    }
};
} // namespace ix