// runtime/server.h — OpenAI-Compatible HTTP Server for Inference-X // Copyright (C) 2024-2026 Salka Elmadani. All rights reserved. // INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1 // // Zero dependencies. POSIX sockets. Drop-in replacement for OpenAI API. // Any app that talks to GPT-4 talks to your local model. No code change. // // Endpoints: // POST /v1/chat/completions — Chat with streaming (SSE) // POST /v1/completions — Text completion // GET /v1/models — List loaded model // GET /health — Health check // #pragma once #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "identity.h" namespace ix { // ═══════════════════════════════════════════════════════════════════════════ // Minimal JSON helpers — just enough for OpenAI protocol, no external lib // ═══════════════════════════════════════════════════════════════════════════ struct ChatMessage { std::string role; std::string content; }; struct ChatRequest { std::string model; std::vector messages; int max_tokens = 512; float temperature = 0.6f; float top_p = 0.9f; bool stream = false; }; // Extract string value for a key from JSON (minimal, handles escaped quotes) static std::string json_str(const std::string& json, const std::string& key) { std::string needle = "\"" + key + "\""; size_t pos = json.find(needle); if (pos == std::string::npos) return ""; pos = json.find(':', pos + needle.size()); if (pos == std::string::npos) return ""; pos = json.find('"', pos + 1); if (pos == std::string::npos) return ""; pos++; std::string result; while (pos < json.size() && json[pos] != '"') { if (json[pos] == '\\' && pos + 1 < json.size()) { pos++; if (json[pos] == 'n') result += '\n'; else if (json[pos] == 't') result += '\t'; else if (json[pos] == '"') result += '"'; else if (json[pos] == '\\') result += '\\'; else result += json[pos]; } else { result += json[pos]; } pos++; } return result; } // Extract numeric value static double json_num(const std::string& json, const std::string& key, double def) { std::string needle = "\"" + key + "\""; size_t pos = json.find(needle); if (pos == std::string::npos) return def; pos = json.find(':', pos + needle.size()); if (pos == std::string::npos) return def; pos++; while (pos < json.size() && (json[pos] == ' ' || json[pos] == '\t')) pos++; try { return std::stod(json.substr(pos, 20)); } catch (...) { return def; } } // Extract bool value static bool json_bool(const std::string& json, const std::string& key, bool def) { std::string needle = "\"" + key + "\""; size_t pos = json.find(needle); if (pos == std::string::npos) return def; pos = json.find(':', pos + needle.size()); if (pos == std::string::npos) return def; pos++; while (pos < json.size() && json[pos] == ' ') pos++; if (json.substr(pos, 4) == "true") return true; if (json.substr(pos, 5) == "false") return false; return def; } // Parse messages array from chat request static std::vector parse_messages(const std::string& json) { std::vector msgs; size_t pos = json.find("\"messages\""); if (pos == std::string::npos) return msgs; pos = json.find('[', pos); if (pos == std::string::npos) return msgs; // Find each message object size_t end = json.find(']', pos); if (end == std::string::npos) end = json.size(); size_t cur = pos; while (cur < end) { size_t obj_start = json.find('{', cur); if (obj_start == std::string::npos || obj_start >= end) break; size_t obj_end = json.find('}', obj_start); if (obj_end == std::string::npos) break; std::string obj = json.substr(obj_start, obj_end - obj_start + 1); ChatMessage msg; msg.role = json_str(obj, "role"); msg.content = json_str(obj, "content"); if (!msg.role.empty()) msgs.push_back(msg); cur = obj_end + 1; } return msgs; } static ChatRequest parse_chat_request(const std::string& body) { ChatRequest req; req.model = json_str(body, "model"); req.messages = parse_messages(body); req.max_tokens = (int)json_num(body, "max_tokens", 512); req.temperature = (float)json_num(body, "temperature", 0.6); req.top_p = (float)json_num(body, "top_p", 0.9); req.stream = json_bool(body, "stream", false); return req; } // JSON string escape static std::string json_escape(const std::string& s) { std::string r; r.reserve(s.size() + 16); for (char c : s) { switch (c) { case '"': r += "\\\""; break; case '\\': r += "\\\\"; break; case '\n': r += "\\n"; break; case '\r': r += "\\r"; break; case '\t': r += "\\t"; break; default: r += c; } } return r; } // Generate unique ID static std::string gen_id() { char buf[32]; snprintf(buf, sizeof(buf), "chatcmpl-%lx", (long)time(nullptr)); return buf; } // ═══════════════════════════════════════════════════════════════════════════ // HTTP Server // ═══════════════════════════════════════════════════════════════════════════ // Callback: given system+user prompt, stream tokens using GenerateFn = std::function on_token )>; class Server { public: Server(int port, const std::string& model_name, GenerateFn generate) : port_(port), model_name_(model_name), generate_(generate) {} void run() { int server_fd = socket(AF_INET, SOCK_STREAM, 0); if (server_fd < 0) { perror("socket"); return; } int opt = 1; setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); struct sockaddr_in addr; memset(&addr, 0, sizeof(addr)); addr.sin_family = AF_INET; addr.sin_addr.s_addr = INADDR_ANY; addr.sin_port = htons(port_); if (bind(server_fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) { perror("bind"); close(server_fd); return; } if (listen(server_fd, 16) < 0) { perror("listen"); close(server_fd); return; } printf("\n"); printf("╔══════════════════════════════════════════════════════════════╗\n"); printf("║ Inference-X Server — OpenAI-Compatible API ║\n"); printf("╠══════════════════════════════════════════════════════════════╣\n"); printf("║ Model : %-49s ║\n", model_name_.c_str()); printf("║ Port : %-49d ║\n", port_); printf("║ API : http://0.0.0.0:%-35d ║\n", port_); printf("╠══════════════════════════════════════════════════════════════╣\n"); printf("║ POST /v1/chat/completions Chat (streaming + sync) ║\n"); printf("║ POST /v1/completions Text completion ║\n"); printf("║ GET /v1/models Model info ║\n"); printf("║ GET /health Health check ║\n"); printf("╚══════════════════════════════════════════════════════════════╝\n"); printf("\nReady. Ctrl+C to stop.\n\n"); fflush(stdout); while (!stopped_) { struct sockaddr_in client_addr; socklen_t client_len = sizeof(client_addr); int client_fd = accept(server_fd, (struct sockaddr*)&client_addr, &client_len); if (client_fd < 0) continue; // Handle in-thread (sequential for now — model is single-threaded) handle_client(client_fd); close(client_fd); } close(server_fd); } void stop() { stopped_ = true; } private: int port_; std::string model_name_; GenerateFn generate_; std::atomic stopped_{false}; int total_requests_ = 0; int total_tokens_ = 0; // ─── HTTP parsing ────────────────────────────────────────────────── struct HttpRequest { std::string method; std::string path; std::string body; int content_length = 0; }; HttpRequest read_request(int fd) { HttpRequest req; char buf[65536]; int n = recv(fd, buf, sizeof(buf) - 1, 0); if (n <= 0) return req; buf[n] = '\0'; std::string raw(buf, n); // Parse method + path size_t sp1 = raw.find(' '); if (sp1 == std::string::npos) return req; req.method = raw.substr(0, sp1); size_t sp2 = raw.find(' ', sp1 + 1); req.path = raw.substr(sp1 + 1, sp2 - sp1 - 1); // Content-Length size_t cl_pos = raw.find("Content-Length: "); if (cl_pos == std::string::npos) cl_pos = raw.find("content-length: "); if (cl_pos != std::string::npos) { req.content_length = atoi(raw.c_str() + cl_pos + 16); } // Body (after \r\n\r\n) size_t body_start = raw.find("\r\n\r\n"); if (body_start != std::string::npos) { body_start += 4; req.body = raw.substr(body_start); // Read remaining body if needed while ((int)req.body.size() < req.content_length) { n = recv(fd, buf, sizeof(buf) - 1, 0); if (n <= 0) break; buf[n] = '\0'; req.body.append(buf, n); } } return req; } // ─── HTTP responses ──────────────────────────────────────────────── void send_response(int fd, int status, const std::string& body, const std::string& content_type = "application/json") { std::string status_text = (status == 200) ? "OK" : "Not Found"; char header[512]; snprintf(header, sizeof(header), "HTTP/1.1 %d %s\r\n" "Content-Type: %s\r\n" "Content-Length: %zu\r\n" "Access-Control-Allow-Origin: *\r\n" "Access-Control-Allow-Methods: POST, GET, OPTIONS\r\n" "Access-Control-Allow-Headers: Content-Type, Authorization\r\n" "X-Powered-By: %s\r\n" "\r\n", status, status_text.c_str(), content_type.c_str(), body.size(), ix::identity::license().server_header().c_str()); send(fd, header, strlen(header), 0); send(fd, body.c_str(), body.size(), 0); } void send_sse_start(int fd) { const char* header = "HTTP/1.1 200 OK\r\n" "Content-Type: text/event-stream\r\n" "Cache-Control: no-cache\r\n" "Connection: keep-alive\r\n" "Access-Control-Allow-Origin: *\r\n" "\r\n"; send(fd, header, strlen(header), 0); } void send_sse_event(int fd, const std::string& data) { std::string event = "data: " + data + "\n\n"; send(fd, event.c_str(), event.size(), MSG_NOSIGNAL); } // ─── Route handlers ──────────────────────────────────────────────── void handle_client(int fd) { HttpRequest req = read_request(fd); if (req.method.empty()) return; total_requests_++; ix::identity::license().on_request(); // CORS preflight if (req.method == "OPTIONS") { send_response(fd, 200, ""); return; } // Health check if (req.path == "/health") { char json[256]; snprintf(json, sizeof(json), "{\"status\":\"ok\",\"model\":\"%s\",\"requests\":%d,\"tokens\":%d}", model_name_.c_str(), total_requests_, total_tokens_); send_response(fd, 200, json); return; } // List models if (req.path == "/v1/models" && req.method == "GET") { char json[512]; snprintf(json, sizeof(json), "{\"object\":\"list\",\"data\":[{\"id\":\"%s\"," "\"object\":\"model\",\"owned_by\":\"inference-x\"}]}", model_name_.c_str()); send_response(fd, 200, json); return; } // Chat completions if (req.path == "/v1/chat/completions" && req.method == "POST") { handle_chat(fd, req.body); return; } // Text completions if (req.path == "/v1/completions" && req.method == "POST") { handle_completion(fd, req.body); return; } send_response(fd, 404, "{\"error\":\"not found\"}"); } void handle_chat(int fd, const std::string& body) { ChatRequest req = parse_chat_request(body); // Build system + user from messages std::string system_prompt, user_prompt; for (auto& msg : req.messages) { if (msg.role == "system") system_prompt += msg.content + "\n"; else if (msg.role == "user") user_prompt += msg.content + "\n"; else if (msg.role == "assistant") { // For multi-turn context, append assistant messages too user_prompt += "[Assistant]: " + msg.content + "\n[User]: "; } } if (user_prompt.empty() && !req.messages.empty()) { user_prompt = req.messages.back().content; } std::string chat_id = gen_id(); long created = (long)time(nullptr); if (req.stream) { // ─── Streaming (SSE) ─── send_sse_start(fd); int token_count = 0; generate_(system_prompt, user_prompt, req.max_tokens, req.temperature, req.top_p, [&](const std::string& token) -> bool { token_count++; total_tokens_++; char chunk[2048]; snprintf(chunk, sizeof(chunk), "{\"id\":\"%s\",\"object\":\"chat.completion.chunk\"," "\"created\":%ld,\"model\":\"%s\"," "\"choices\":[{\"index\":0,\"delta\":" "{\"content\":\"%s\"},\"finish_reason\":null}]}", chat_id.c_str(), created, model_name_.c_str(), json_escape(token).c_str()); send_sse_event(fd, chunk); return true; }); // Final chunk with finish_reason char done[512]; snprintf(done, sizeof(done), "{\"id\":\"%s\",\"object\":\"chat.completion.chunk\"," "\"created\":%ld,\"model\":\"%s\"," "\"choices\":[{\"index\":0,\"delta\":{}," "\"finish_reason\":\"stop\"}]}", chat_id.c_str(), created, model_name_.c_str()); send_sse_event(fd, done); send_sse_event(fd, "[DONE]"); } else { // ─── Non-streaming ─── std::string full_response; int token_count = 0; generate_(system_prompt, user_prompt, req.max_tokens, req.temperature, req.top_p, [&](const std::string& token) -> bool { full_response += token; token_count++; total_tokens_++; return true; }); char json[65536]; snprintf(json, sizeof(json), "{\"id\":\"%s\",\"object\":\"chat.completion\"," "\"created\":%ld,\"model\":\"%s\"," "\"choices\":[{\"index\":0,\"message\":" "{\"role\":\"assistant\",\"content\":\"%s\"}," "\"finish_reason\":\"stop\"}]," "\"usage\":{\"prompt_tokens\":0," "\"completion_tokens\":%d,\"total_tokens\":%d}}", chat_id.c_str(), created, model_name_.c_str(), json_escape(full_response).c_str(), token_count, token_count); send_response(fd, 200, json); } } void handle_completion(int fd, const std::string& body) { std::string prompt = json_str(body, "prompt"); int max_tokens = (int)json_num(body, "max_tokens", 256); float temperature = (float)json_num(body, "temperature", 0.6); float top_p = (float)json_num(body, "top_p", 0.9); bool stream = json_bool(body, "stream", false); std::string comp_id = gen_id(); long created = (long)time(nullptr); if (stream) { send_sse_start(fd); int token_count = 0; generate_("", prompt, max_tokens, temperature, top_p, [&](const std::string& token) -> bool { token_count++; total_tokens_++; char chunk[2048]; snprintf(chunk, sizeof(chunk), "{\"id\":\"%s\",\"object\":\"text_completion\"," "\"created\":%ld,\"model\":\"%s\"," "\"choices\":[{\"text\":\"%s\",\"index\":0," "\"finish_reason\":null}]}", comp_id.c_str(), created, model_name_.c_str(), json_escape(token).c_str()); send_sse_event(fd, chunk); return true; }); char done[256]; snprintf(done, sizeof(done), "{\"id\":\"%s\",\"object\":\"text_completion\"," "\"created\":%ld,\"choices\":[{\"text\":\"\"," "\"finish_reason\":\"stop\"}]}", comp_id.c_str(), created); send_sse_event(fd, done); send_sse_event(fd, "[DONE]"); } else { std::string full; int token_count = 0; generate_("", prompt, max_tokens, temperature, top_p, [&](const std::string& token) -> bool { full += token; token_count++; total_tokens_++; return true; }); char json[65536]; snprintf(json, sizeof(json), "{\"id\":\"%s\",\"object\":\"text_completion\"," "\"created\":%ld,\"model\":\"%s\"," "\"choices\":[{\"text\":\"%s\",\"index\":0," "\"finish_reason\":\"stop\"}]," "\"usage\":{\"prompt_tokens\":0," "\"completion_tokens\":%d,\"total_tokens\":%d}}", comp_id.c_str(), created, model_name_.c_str(), json_escape(full).c_str(), token_count, token_count); send_response(fd, 200, json); } } }; } // namespace ix