Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
534 lines
21 KiB
C++
534 lines
21 KiB
C++
// runtime/server.h — OpenAI-Compatible HTTP Server for Inference-X
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
//
// Zero dependencies. POSIX sockets. Drop-in replacement for OpenAI API.
// Any app that talks to GPT-4 talks to your local model. No code change.
//
// Endpoints:
//   POST /v1/chat/completions — Chat with streaming (SSE)
//   POST /v1/completions      — Text completion
//   GET  /v1/models           — List loaded model
//   GET  /health              — Health check
//
|
|
#pragma once
|
|
#include <string>
|
|
#include <vector>
|
|
#include <functional>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <ctime>
|
|
#include <thread>
|
|
#include <atomic>
|
|
#include <sstream>
|
|
|
|
#include <sys/socket.h>
|
|
#include <netinet/in.h>
|
|
#include <unistd.h>
|
|
#include <arpa/inet.h>
|
|
#include <signal.h>
|
|
|
|
#include "identity.h"
|
|
|
|
namespace ix {
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// Minimal JSON helpers — just enough for OpenAI protocol, no external lib
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
// One turn of an OpenAI-style conversation, as decoded by parse_messages().
struct ChatMessage {
    std::string role;     // "system", "user", or "assistant" (see handle_chat)
    std::string content;  // message text, JSON escapes already decoded
};
|
|
|
|
// Decoded body of a POST /v1/chat/completions request.
// Field initializers are the defaults applied when the JSON omits the key.
struct ChatRequest {
    std::string model;                  // requested model id (informational only)
    std::vector<ChatMessage> messages;  // conversation turns, in request order
    int max_tokens = 512;               // generation length cap
    float temperature = 0.6f;           // sampling temperature
    float top_p = 0.9f;                 // nucleus-sampling threshold
    bool stream = false;                // true => reply as SSE chunks
};
|
|
|
|
// Extract the string value for `key` from a JSON document (minimal parser,
// no external lib). Handles the standard two-character escapes and \uXXXX
// unicode escapes (decoded to UTF-8; surrogate pairs are not combined).
// Returns "" when the key is absent or no string value follows it.
// NOTE(review): like the other json_* helpers this can match `key` inside
// a string value — acceptable for the OpenAI request shapes handled here.
static std::string json_str(const std::string& json, const std::string& key) {
    std::string needle = "\"" + key + "\"";
    size_t pos = json.find(needle);
    if (pos == std::string::npos) return "";
    pos = json.find(':', pos + needle.size());
    if (pos == std::string::npos) return "";
    pos = json.find('"', pos + 1);
    if (pos == std::string::npos) return "";
    pos++;
    std::string result;
    while (pos < json.size() && json[pos] != '"') {
        if (json[pos] == '\\' && pos + 1 < json.size()) {
            pos++;
            char c = json[pos];
            switch (c) {
                case 'n':  result += '\n'; break;
                case 't':  result += '\t'; break;
                case 'r':  result += '\r'; break;
                case 'b':  result += '\b'; break;
                case 'f':  result += '\f'; break;
                case '"':  result += '"';  break;
                case '\\': result += '\\'; break;
                case '/':  result += '/';  break;
                case 'u': {
                    // FIX: decode \uXXXX (previously emitted literally, e.g.
                    // "\u0041" came out as "u0041" instead of "A").
                    unsigned cp = 0;
                    bool ok = (pos + 4 < json.size());
                    for (int i = 1; ok && i <= 4; i++) {
                        char h = json[pos + i];
                        cp <<= 4;
                        if (h >= '0' && h <= '9')      cp |= (unsigned)(h - '0');
                        else if (h >= 'a' && h <= 'f') cp |= (unsigned)(h - 'a' + 10);
                        else if (h >= 'A' && h <= 'F') cp |= (unsigned)(h - 'A' + 10);
                        else ok = false;
                    }
                    if (ok) {
                        // Encode the code point as UTF-8 (BMP range).
                        if (cp < 0x80) {
                            result += (char)cp;
                        } else if (cp < 0x800) {
                            result += (char)(0xC0 | (cp >> 6));
                            result += (char)(0x80 | (cp & 0x3F));
                        } else {
                            result += (char)(0xE0 | (cp >> 12));
                            result += (char)(0x80 | ((cp >> 6) & 0x3F));
                            result += (char)(0x80 | (cp & 0x3F));
                        }
                        pos += 4;
                    } else {
                        result += 'u';  // malformed escape: keep old behavior
                    }
                    break;
                }
                default: result += c;  // unknown escape: emit char as-is
            }
        } else {
            result += json[pos];
        }
        pos++;
    }
    return result;
}
|
|
|
|
// Extract the numeric value for `key`; returns `def` when the key is
// absent or the text after the colon does not parse as a number.
static double json_num(const std::string& json, const std::string& key, double def) {
    const std::string quoted = "\"" + key + "\"";
    size_t at = json.find(quoted);
    if (at == std::string::npos) return def;
    at = json.find(':', at + quoted.size());
    if (at == std::string::npos) return def;
    // Skip whitespace after the colon, then parse at most 20 chars.
    at = json.find_first_not_of(" \t", at + 1);
    try {
        return std::stod(json.substr(at, 20));
    } catch (...) {
        return def;
    }
}
|
|
|
|
// Extract the boolean value for `key`; returns `def` when the key is
// absent or the token after the colon is neither "true" nor "false".
static bool json_bool(const std::string& json, const std::string& key, bool def) {
    const std::string quoted = "\"" + key + "\"";
    size_t at = json.find(quoted);
    if (at == std::string::npos) return def;
    at = json.find(':', at + quoted.size());
    if (at == std::string::npos) return def;
    ++at;
    while (at < json.size() && json[at] == ' ') ++at;
    if (json.compare(at, 4, "true") == 0) return true;
    if (json.compare(at, 5, "false") == 0) return false;
    return def;
}
|
|
|
|
// Parse messages array from chat request
|
|
static std::vector<ChatMessage> parse_messages(const std::string& json) {
|
|
std::vector<ChatMessage> msgs;
|
|
size_t pos = json.find("\"messages\"");
|
|
if (pos == std::string::npos) return msgs;
|
|
pos = json.find('[', pos);
|
|
if (pos == std::string::npos) return msgs;
|
|
|
|
// Find each message object
|
|
size_t end = json.find(']', pos);
|
|
if (end == std::string::npos) end = json.size();
|
|
|
|
size_t cur = pos;
|
|
while (cur < end) {
|
|
size_t obj_start = json.find('{', cur);
|
|
if (obj_start == std::string::npos || obj_start >= end) break;
|
|
size_t obj_end = json.find('}', obj_start);
|
|
if (obj_end == std::string::npos) break;
|
|
|
|
std::string obj = json.substr(obj_start, obj_end - obj_start + 1);
|
|
ChatMessage msg;
|
|
msg.role = json_str(obj, "role");
|
|
msg.content = json_str(obj, "content");
|
|
if (!msg.role.empty()) msgs.push_back(msg);
|
|
cur = obj_end + 1;
|
|
}
|
|
return msgs;
|
|
}
|
|
|
|
// Decode a /v1/chat/completions JSON body into a ChatRequest, applying
// the documented defaults for any field the client omitted.
static ChatRequest parse_chat_request(const std::string& body) {
    ChatRequest out;
    out.model       = json_str(body, "model");
    out.messages    = parse_messages(body);
    out.max_tokens  = static_cast<int>(json_num(body, "max_tokens", 512));
    out.temperature = static_cast<float>(json_num(body, "temperature", 0.6));
    out.top_p       = static_cast<float>(json_num(body, "top_p", 0.9));
    out.stream      = json_bool(body, "stream", false);
    return out;
}
|
|
|
|
// Escape a string for embedding in a JSON string literal.
//
// FIX: control characters other than \n, \r, \t (e.g. 0x01, 0x1B) were
// previously copied through verbatim, producing invalid JSON that strict
// clients reject. RFC 8259 requires every code point below 0x20 to be
// escaped, so the remaining ones are emitted as \u00XX.
static std::string json_escape(const std::string& s) {
    std::string r;
    r.reserve(s.size() + 16);
    for (unsigned char c : s) {
        switch (c) {
            case '"':  r += "\\\""; break;
            case '\\': r += "\\\\"; break;
            case '\n': r += "\\n";  break;
            case '\r': r += "\\r";  break;
            case '\t': r += "\\t";  break;
            default:
                if (c < 0x20) {
                    // Other control characters must be \u-escaped.
                    char buf[8];
                    snprintf(buf, sizeof(buf), "\\u%04x", c);
                    r += buf;
                } else {
                    r += (char)c;
                }
        }
    }
    return r;
}
|
|
|
|
// Generate a unique completion id ("chatcmpl-<time-hex>-<seq-hex>").
//
// FIX: the id was derived from time() alone, so every request within the
// same second received an identical id. A process-wide atomic counter is
// appended to guarantee uniqueness (and safety if handlers ever run on
// multiple threads).
static std::string gen_id() {
    static std::atomic<unsigned> seq{0};
    char buf[48];
    snprintf(buf, sizeof(buf), "chatcmpl-%lx-%x",
             (long)time(nullptr), seq.fetch_add(1));
    return buf;
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// HTTP Server
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
// Token-generation callback supplied by the embedding application.
// Receives the flattened system and user prompts plus sampling settings,
// and must invoke on_token once per generated token.
// on_token returns bool — presumably false requests early stop; the
// handlers in this file always return true. NOTE(review): confirm the
// stop contract against the GenerateFn implementation (outside this file).
using GenerateFn = std::function<void(
    const std::string& system,             // concatenated system messages
    const std::string& user,               // concatenated user/assistant turns
    int max_tokens,                        // generation length cap
    float temperature,                     // sampling temperature
    float top_p,                           // nucleus-sampling threshold
    std::function<bool(const std::string& token)> on_token
)>;
|
|
|
|
class Server {
|
|
public:
|
|
Server(int port, const std::string& model_name, GenerateFn generate)
|
|
: port_(port), model_name_(model_name), generate_(generate) {}
|
|
|
|
void run() {
|
|
int server_fd = socket(AF_INET, SOCK_STREAM, 0);
|
|
if (server_fd < 0) { perror("socket"); return; }
|
|
|
|
int opt = 1;
|
|
setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
|
|
|
|
struct sockaddr_in addr;
|
|
memset(&addr, 0, sizeof(addr));
|
|
addr.sin_family = AF_INET;
|
|
addr.sin_addr.s_addr = INADDR_ANY;
|
|
addr.sin_port = htons(port_);
|
|
|
|
if (bind(server_fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
|
|
perror("bind"); close(server_fd); return;
|
|
}
|
|
if (listen(server_fd, 16) < 0) {
|
|
perror("listen"); close(server_fd); return;
|
|
}
|
|
|
|
printf("\n");
|
|
printf("╔══════════════════════════════════════════════════════════════╗\n");
|
|
printf("║ Inference-X Server — OpenAI-Compatible API ║\n");
|
|
printf("╠══════════════════════════════════════════════════════════════╣\n");
|
|
printf("║ Model : %-49s ║\n", model_name_.c_str());
|
|
printf("║ Port : %-49d ║\n", port_);
|
|
printf("║ API : http://0.0.0.0:%-35d ║\n", port_);
|
|
printf("╠══════════════════════════════════════════════════════════════╣\n");
|
|
printf("║ POST /v1/chat/completions Chat (streaming + sync) ║\n");
|
|
printf("║ POST /v1/completions Text completion ║\n");
|
|
printf("║ GET /v1/models Model info ║\n");
|
|
printf("║ GET /health Health check ║\n");
|
|
printf("╚══════════════════════════════════════════════════════════════╝\n");
|
|
printf("\nReady. Ctrl+C to stop.\n\n");
|
|
fflush(stdout);
|
|
|
|
while (!stopped_) {
|
|
struct sockaddr_in client_addr;
|
|
socklen_t client_len = sizeof(client_addr);
|
|
int client_fd = accept(server_fd, (struct sockaddr*)&client_addr, &client_len);
|
|
if (client_fd < 0) continue;
|
|
|
|
// Handle in-thread (sequential for now — model is single-threaded)
|
|
handle_client(client_fd);
|
|
close(client_fd);
|
|
}
|
|
close(server_fd);
|
|
}
|
|
|
|
void stop() { stopped_ = true; }
|
|
|
|
private:
|
|
int port_;
|
|
std::string model_name_;
|
|
GenerateFn generate_;
|
|
std::atomic<bool> stopped_{false};
|
|
int total_requests_ = 0;
|
|
int total_tokens_ = 0;
|
|
|
|
// ─── HTTP parsing ──────────────────────────────────────────────────
|
|
|
|
struct HttpRequest {
|
|
std::string method;
|
|
std::string path;
|
|
std::string body;
|
|
int content_length = 0;
|
|
};
|
|
|
|
HttpRequest read_request(int fd) {
|
|
HttpRequest req;
|
|
char buf[65536];
|
|
int n = recv(fd, buf, sizeof(buf) - 1, 0);
|
|
if (n <= 0) return req;
|
|
buf[n] = '\0';
|
|
|
|
std::string raw(buf, n);
|
|
|
|
// Parse method + path
|
|
size_t sp1 = raw.find(' ');
|
|
if (sp1 == std::string::npos) return req;
|
|
req.method = raw.substr(0, sp1);
|
|
size_t sp2 = raw.find(' ', sp1 + 1);
|
|
req.path = raw.substr(sp1 + 1, sp2 - sp1 - 1);
|
|
|
|
// Content-Length
|
|
size_t cl_pos = raw.find("Content-Length: ");
|
|
if (cl_pos == std::string::npos) cl_pos = raw.find("content-length: ");
|
|
if (cl_pos != std::string::npos) {
|
|
req.content_length = atoi(raw.c_str() + cl_pos + 16);
|
|
}
|
|
|
|
// Body (after \r\n\r\n)
|
|
size_t body_start = raw.find("\r\n\r\n");
|
|
if (body_start != std::string::npos) {
|
|
body_start += 4;
|
|
req.body = raw.substr(body_start);
|
|
|
|
// Read remaining body if needed
|
|
while ((int)req.body.size() < req.content_length) {
|
|
n = recv(fd, buf, sizeof(buf) - 1, 0);
|
|
if (n <= 0) break;
|
|
buf[n] = '\0';
|
|
req.body.append(buf, n);
|
|
}
|
|
}
|
|
return req;
|
|
}
|
|
|
|
// ─── HTTP responses ────────────────────────────────────────────────
|
|
|
|
void send_response(int fd, int status, const std::string& body,
|
|
const std::string& content_type = "application/json") {
|
|
std::string status_text = (status == 200) ? "OK" : "Not Found";
|
|
char header[512];
|
|
snprintf(header, sizeof(header),
|
|
"HTTP/1.1 %d %s\r\n"
|
|
"Content-Type: %s\r\n"
|
|
"Content-Length: %zu\r\n"
|
|
"Access-Control-Allow-Origin: *\r\n"
|
|
"Access-Control-Allow-Methods: POST, GET, OPTIONS\r\n"
|
|
"Access-Control-Allow-Headers: Content-Type, Authorization\r\n"
|
|
"X-Powered-By: %s\r\n"
|
|
"\r\n",
|
|
status, status_text.c_str(),
|
|
content_type.c_str(), body.size(),
|
|
ix::identity::license().server_header().c_str());
|
|
send(fd, header, strlen(header), 0);
|
|
send(fd, body.c_str(), body.size(), 0);
|
|
}
|
|
|
|
void send_sse_start(int fd) {
|
|
const char* header =
|
|
"HTTP/1.1 200 OK\r\n"
|
|
"Content-Type: text/event-stream\r\n"
|
|
"Cache-Control: no-cache\r\n"
|
|
"Connection: keep-alive\r\n"
|
|
"Access-Control-Allow-Origin: *\r\n"
|
|
"\r\n";
|
|
send(fd, header, strlen(header), 0);
|
|
}
|
|
|
|
void send_sse_event(int fd, const std::string& data) {
|
|
std::string event = "data: " + data + "\n\n";
|
|
send(fd, event.c_str(), event.size(), MSG_NOSIGNAL);
|
|
}
|
|
|
|
// ─── Route handlers ────────────────────────────────────────────────
|
|
|
|
void handle_client(int fd) {
|
|
HttpRequest req = read_request(fd);
|
|
if (req.method.empty()) return;
|
|
|
|
total_requests_++;
|
|
ix::identity::license().on_request();
|
|
|
|
// CORS preflight
|
|
if (req.method == "OPTIONS") {
|
|
send_response(fd, 200, "");
|
|
return;
|
|
}
|
|
|
|
// Health check
|
|
if (req.path == "/health") {
|
|
char json[256];
|
|
snprintf(json, sizeof(json),
|
|
"{\"status\":\"ok\",\"model\":\"%s\",\"requests\":%d,\"tokens\":%d}",
|
|
model_name_.c_str(), total_requests_, total_tokens_);
|
|
send_response(fd, 200, json);
|
|
return;
|
|
}
|
|
|
|
// List models
|
|
if (req.path == "/v1/models" && req.method == "GET") {
|
|
char json[512];
|
|
snprintf(json, sizeof(json),
|
|
"{\"object\":\"list\",\"data\":[{\"id\":\"%s\","
|
|
"\"object\":\"model\",\"owned_by\":\"inference-x\"}]}",
|
|
model_name_.c_str());
|
|
send_response(fd, 200, json);
|
|
return;
|
|
}
|
|
|
|
// Chat completions
|
|
if (req.path == "/v1/chat/completions" && req.method == "POST") {
|
|
handle_chat(fd, req.body);
|
|
return;
|
|
}
|
|
|
|
// Text completions
|
|
if (req.path == "/v1/completions" && req.method == "POST") {
|
|
handle_completion(fd, req.body);
|
|
return;
|
|
}
|
|
|
|
send_response(fd, 404, "{\"error\":\"not found\"}");
|
|
}
|
|
|
|
void handle_chat(int fd, const std::string& body) {
|
|
ChatRequest req = parse_chat_request(body);
|
|
|
|
// Build system + user from messages
|
|
std::string system_prompt, user_prompt;
|
|
for (auto& msg : req.messages) {
|
|
if (msg.role == "system") system_prompt += msg.content + "\n";
|
|
else if (msg.role == "user") user_prompt += msg.content + "\n";
|
|
else if (msg.role == "assistant") {
|
|
// For multi-turn context, append assistant messages too
|
|
user_prompt += "[Assistant]: " + msg.content + "\n[User]: ";
|
|
}
|
|
}
|
|
if (user_prompt.empty() && !req.messages.empty()) {
|
|
user_prompt = req.messages.back().content;
|
|
}
|
|
|
|
std::string chat_id = gen_id();
|
|
long created = (long)time(nullptr);
|
|
|
|
if (req.stream) {
|
|
// ─── Streaming (SSE) ───
|
|
send_sse_start(fd);
|
|
|
|
int token_count = 0;
|
|
generate_(system_prompt, user_prompt, req.max_tokens,
|
|
req.temperature, req.top_p,
|
|
[&](const std::string& token) -> bool {
|
|
token_count++;
|
|
total_tokens_++;
|
|
char chunk[2048];
|
|
snprintf(chunk, sizeof(chunk),
|
|
"{\"id\":\"%s\",\"object\":\"chat.completion.chunk\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"index\":0,\"delta\":"
|
|
"{\"content\":\"%s\"},\"finish_reason\":null}]}",
|
|
chat_id.c_str(), created,
|
|
model_name_.c_str(),
|
|
json_escape(token).c_str());
|
|
send_sse_event(fd, chunk);
|
|
return true;
|
|
});
|
|
|
|
// Final chunk with finish_reason
|
|
char done[512];
|
|
snprintf(done, sizeof(done),
|
|
"{\"id\":\"%s\",\"object\":\"chat.completion.chunk\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"index\":0,\"delta\":{},"
|
|
"\"finish_reason\":\"stop\"}]}",
|
|
chat_id.c_str(), created, model_name_.c_str());
|
|
send_sse_event(fd, done);
|
|
send_sse_event(fd, "[DONE]");
|
|
} else {
|
|
// ─── Non-streaming ───
|
|
std::string full_response;
|
|
int token_count = 0;
|
|
generate_(system_prompt, user_prompt, req.max_tokens,
|
|
req.temperature, req.top_p,
|
|
[&](const std::string& token) -> bool {
|
|
full_response += token;
|
|
token_count++;
|
|
total_tokens_++;
|
|
return true;
|
|
});
|
|
|
|
char json[65536];
|
|
snprintf(json, sizeof(json),
|
|
"{\"id\":\"%s\",\"object\":\"chat.completion\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"index\":0,\"message\":"
|
|
"{\"role\":\"assistant\",\"content\":\"%s\"},"
|
|
"\"finish_reason\":\"stop\"}],"
|
|
"\"usage\":{\"prompt_tokens\":0,"
|
|
"\"completion_tokens\":%d,\"total_tokens\":%d}}",
|
|
chat_id.c_str(), created,
|
|
model_name_.c_str(),
|
|
json_escape(full_response).c_str(),
|
|
token_count, token_count);
|
|
send_response(fd, 200, json);
|
|
}
|
|
}
|
|
|
|
void handle_completion(int fd, const std::string& body) {
|
|
std::string prompt = json_str(body, "prompt");
|
|
int max_tokens = (int)json_num(body, "max_tokens", 256);
|
|
float temperature = (float)json_num(body, "temperature", 0.6);
|
|
float top_p = (float)json_num(body, "top_p", 0.9);
|
|
bool stream = json_bool(body, "stream", false);
|
|
|
|
std::string comp_id = gen_id();
|
|
long created = (long)time(nullptr);
|
|
|
|
if (stream) {
|
|
send_sse_start(fd);
|
|
int token_count = 0;
|
|
generate_("", prompt, max_tokens, temperature, top_p,
|
|
[&](const std::string& token) -> bool {
|
|
token_count++;
|
|
total_tokens_++;
|
|
char chunk[2048];
|
|
snprintf(chunk, sizeof(chunk),
|
|
"{\"id\":\"%s\",\"object\":\"text_completion\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"text\":\"%s\",\"index\":0,"
|
|
"\"finish_reason\":null}]}",
|
|
comp_id.c_str(), created,
|
|
model_name_.c_str(),
|
|
json_escape(token).c_str());
|
|
send_sse_event(fd, chunk);
|
|
return true;
|
|
});
|
|
char done[256];
|
|
snprintf(done, sizeof(done),
|
|
"{\"id\":\"%s\",\"object\":\"text_completion\","
|
|
"\"created\":%ld,\"choices\":[{\"text\":\"\","
|
|
"\"finish_reason\":\"stop\"}]}",
|
|
comp_id.c_str(), created);
|
|
send_sse_event(fd, done);
|
|
send_sse_event(fd, "[DONE]");
|
|
} else {
|
|
std::string full;
|
|
int token_count = 0;
|
|
generate_("", prompt, max_tokens, temperature, top_p,
|
|
[&](const std::string& token) -> bool {
|
|
full += token;
|
|
token_count++;
|
|
total_tokens_++;
|
|
return true;
|
|
});
|
|
char json[65536];
|
|
snprintf(json, sizeof(json),
|
|
"{\"id\":\"%s\",\"object\":\"text_completion\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"text\":\"%s\",\"index\":0,"
|
|
"\"finish_reason\":\"stop\"}],"
|
|
"\"usage\":{\"prompt_tokens\":0,"
|
|
"\"completion_tokens\":%d,\"total_tokens\":%d}}",
|
|
comp_id.c_str(), created,
|
|
model_name_.c_str(),
|
|
json_escape(full).c_str(),
|
|
token_count, token_count);
|
|
send_response(fd, 200, json);
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace ix
|