Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
534 lines
21 KiB
C++
534 lines
21 KiB
C++
// runtime/server.h — OpenAI-Compatible HTTP Server for Inference-X
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
//
// Zero dependencies. POSIX sockets. Drop-in replacement for OpenAI API.
// Any app that talks to GPT-4 talks to your local model. No code change.
//
// Endpoints:
//   POST /v1/chat/completions — Chat with streaming (SSE)
//   POST /v1/completions      — Text completion
//   GET  /v1/models           — List loaded model
//   GET  /health              — Health check
//
|
|
#pragma once
|
|
#include <string>
|
|
#include <vector>
|
|
#include <functional>
|
|
#include <cstdio>
|
|
#include <cstring>
|
|
#include <ctime>
|
|
#include <thread>
|
|
#include <atomic>
|
|
#include <sstream>
|
|
|
|
#include <sys/socket.h>
|
|
#include <netinet/in.h>
|
|
#include <unistd.h>
|
|
#include <arpa/inet.h>
|
|
#include <signal.h>
|
|
|
|
#include "identity.h"
|
|
|
|
namespace ix {
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// Minimal JSON helpers — just enough for OpenAI protocol, no external lib
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
// One turn of an OpenAI-style conversation, as decoded by parse_messages().
struct ChatMessage {
    std::string role;     // "system", "user", or "assistant" (see handle_chat)
    std::string content;  // message text, JSON escapes already decoded
};
|
|
|
|
// Decoded body of a POST /v1/chat/completions request.
// Field initializers are the defaults applied when the JSON omits the key.
struct ChatRequest {
    std::string model;                  // requested model id (informational only)
    std::vector<ChatMessage> messages;  // conversation turns, in request order
    int max_tokens = 512;               // generation length cap
    float temperature = 0.6f;           // sampling temperature
    float top_p = 0.9f;                 // nucleus-sampling threshold
    bool stream = false;                // true => reply as SSE chunks
};
|
|
|
|
// Extract the string value for `key` from a JSON document (minimal parser,
// no external lib). Handles the standard two-character escapes and \uXXXX
// unicode escapes (decoded to UTF-8; surrogate pairs are not combined).
// Returns "" when the key is absent or no string value follows it.
// NOTE(review): like the other json_* helpers this can match `key` inside
// a string value — acceptable for the OpenAI request shapes handled here.
static std::string json_str(const std::string& json, const std::string& key) {
    std::string needle = "\"" + key + "\"";
    size_t pos = json.find(needle);
    if (pos == std::string::npos) return "";
    pos = json.find(':', pos + needle.size());
    if (pos == std::string::npos) return "";
    pos = json.find('"', pos + 1);
    if (pos == std::string::npos) return "";
    pos++;
    std::string result;
    while (pos < json.size() && json[pos] != '"') {
        if (json[pos] == '\\' && pos + 1 < json.size()) {
            pos++;
            char c = json[pos];
            switch (c) {
                case 'n':  result += '\n'; break;
                case 't':  result += '\t'; break;
                case 'r':  result += '\r'; break;
                case 'b':  result += '\b'; break;
                case 'f':  result += '\f'; break;
                case '"':  result += '"';  break;
                case '\\': result += '\\'; break;
                case '/':  result += '/';  break;
                case 'u': {
                    // FIX: decode \uXXXX (previously emitted literally, e.g.
                    // "\u0041" came out as "u0041" instead of "A").
                    unsigned cp = 0;
                    bool ok = (pos + 4 < json.size());
                    for (int i = 1; ok && i <= 4; i++) {
                        char h = json[pos + i];
                        cp <<= 4;
                        if (h >= '0' && h <= '9')      cp |= (unsigned)(h - '0');
                        else if (h >= 'a' && h <= 'f') cp |= (unsigned)(h - 'a' + 10);
                        else if (h >= 'A' && h <= 'F') cp |= (unsigned)(h - 'A' + 10);
                        else ok = false;
                    }
                    if (ok) {
                        // Encode the code point as UTF-8 (BMP range).
                        if (cp < 0x80) {
                            result += (char)cp;
                        } else if (cp < 0x800) {
                            result += (char)(0xC0 | (cp >> 6));
                            result += (char)(0x80 | (cp & 0x3F));
                        } else {
                            result += (char)(0xE0 | (cp >> 12));
                            result += (char)(0x80 | ((cp >> 6) & 0x3F));
                            result += (char)(0x80 | (cp & 0x3F));
                        }
                        pos += 4;
                    } else {
                        result += 'u';  // malformed escape: keep old behavior
                    }
                    break;
                }
                default: result += c;  // unknown escape: emit char as-is
            }
        } else {
            result += json[pos];
        }
        pos++;
    }
    return result;
}
|
|
|
|
// Extract the numeric value for `key`; returns `def` when the key is
// absent or the text after the colon does not parse as a number.
static double json_num(const std::string& json, const std::string& key, double def) {
    const std::string quoted = "\"" + key + "\"";
    size_t at = json.find(quoted);
    if (at == std::string::npos) return def;
    at = json.find(':', at + quoted.size());
    if (at == std::string::npos) return def;
    // Skip whitespace after the colon, then parse at most 20 chars.
    at = json.find_first_not_of(" \t", at + 1);
    try {
        return std::stod(json.substr(at, 20));
    } catch (...) {
        return def;
    }
}
|
|
|
|
// Extract the boolean value for `key`; returns `def` when the key is
// absent or the token after the colon is neither "true" nor "false".
static bool json_bool(const std::string& json, const std::string& key, bool def) {
    const std::string quoted = "\"" + key + "\"";
    size_t at = json.find(quoted);
    if (at == std::string::npos) return def;
    at = json.find(':', at + quoted.size());
    if (at == std::string::npos) return def;
    ++at;
    while (at < json.size() && json[at] == ' ') ++at;
    if (json.compare(at, 4, "true") == 0) return true;
    if (json.compare(at, 5, "false") == 0) return false;
    return def;
}
|
|
|
|
// Parse messages array from chat request
|
|
static std::vector<ChatMessage> parse_messages(const std::string& json) {
|
|
std::vector<ChatMessage> msgs;
|
|
size_t pos = json.find("\"messages\"");
|
|
if (pos == std::string::npos) return msgs;
|
|
pos = json.find('[', pos);
|
|
if (pos == std::string::npos) return msgs;
|
|
|
|
// Find each message object
|
|
size_t end = json.find(']', pos);
|
|
if (end == std::string::npos) end = json.size();
|
|
|
|
size_t cur = pos;
|
|
while (cur < end) {
|
|
size_t obj_start = json.find('{', cur);
|
|
if (obj_start == std::string::npos || obj_start >= end) break;
|
|
size_t obj_end = json.find('}', obj_start);
|
|
if (obj_end == std::string::npos) break;
|
|
|
|
std::string obj = json.substr(obj_start, obj_end - obj_start + 1);
|
|
ChatMessage msg;
|
|
msg.role = json_str(obj, "role");
|
|
msg.content = json_str(obj, "content");
|
|
if (!msg.role.empty()) msgs.push_back(msg);
|
|
cur = obj_end + 1;
|
|
}
|
|
return msgs;
|
|
}
|
|
|
|
// Decode a /v1/chat/completions JSON body into a ChatRequest, applying
// the documented defaults for any field the client omitted.
static ChatRequest parse_chat_request(const std::string& body) {
    ChatRequest out;
    out.model       = json_str(body, "model");
    out.messages    = parse_messages(body);
    out.max_tokens  = static_cast<int>(json_num(body, "max_tokens", 512));
    out.temperature = static_cast<float>(json_num(body, "temperature", 0.6));
    out.top_p       = static_cast<float>(json_num(body, "top_p", 0.9));
    out.stream      = json_bool(body, "stream", false);
    return out;
}
|
|
|
|
// Escape a string for embedding in a JSON string literal.
//
// FIX: control characters other than \n, \r, \t (e.g. 0x01, 0x1B) were
// previously copied through verbatim, producing invalid JSON that strict
// clients reject. RFC 8259 requires every code point below 0x20 to be
// escaped, so the remaining ones are emitted as \u00XX.
static std::string json_escape(const std::string& s) {
    std::string r;
    r.reserve(s.size() + 16);
    for (unsigned char c : s) {
        switch (c) {
            case '"':  r += "\\\""; break;
            case '\\': r += "\\\\"; break;
            case '\n': r += "\\n";  break;
            case '\r': r += "\\r";  break;
            case '\t': r += "\\t";  break;
            default:
                if (c < 0x20) {
                    // Other control characters must be \u-escaped.
                    char buf[8];
                    snprintf(buf, sizeof(buf), "\\u%04x", c);
                    r += buf;
                } else {
                    r += (char)c;
                }
        }
    }
    return r;
}
|
|
|
|
// Generate a unique completion id ("chatcmpl-<time-hex>-<seq-hex>").
//
// FIX: the id was derived from time() alone, so every request within the
// same second received an identical id. A process-wide atomic counter is
// appended to guarantee uniqueness (and safety if handlers ever run on
// multiple threads).
static std::string gen_id() {
    static std::atomic<unsigned> seq{0};
    char buf[48];
    snprintf(buf, sizeof(buf), "chatcmpl-%lx-%x",
             (long)time(nullptr), seq.fetch_add(1));
    return buf;
}
|
|
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
// HTTP Server
|
|
// ═══════════════════════════════════════════════════════════════════════════
|
|
|
|
// Token-generation callback supplied by the embedding application.
// Receives the flattened system and user prompts plus sampling settings,
// and must invoke on_token once per generated token.
// on_token returns bool — presumably false requests early stop; the
// handlers in this file always return true. NOTE(review): confirm the
// stop contract against the GenerateFn implementation (outside this file).
using GenerateFn = std::function<void(
    const std::string& system,             // concatenated system messages
    const std::string& user,               // concatenated user/assistant turns
    int max_tokens,                        // generation length cap
    float temperature,                     // sampling temperature
    float top_p,                           // nucleus-sampling threshold
    std::function<bool(const std::string& token)> on_token
)>;
|
|
|
|
class Server {
|
|
public:
|
|
Server(int port, const std::string& model_name, GenerateFn generate)
|
|
: port_(port), model_name_(model_name), generate_(generate) {}
|
|
|
|
void run() {
|
|
int server_fd = socket(AF_INET, SOCK_STREAM, 0);
|
|
if (server_fd < 0) { perror("socket"); return; }
|
|
|
|
int opt = 1;
|
|
setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
|
|
|
|
struct sockaddr_in addr;
|
|
memset(&addr, 0, sizeof(addr));
|
|
addr.sin_family = AF_INET;
|
|
addr.sin_addr.s_addr = INADDR_ANY;
|
|
addr.sin_port = htons(port_);
|
|
|
|
if (bind(server_fd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
|
|
perror("bind"); close(server_fd); return;
|
|
}
|
|
if (listen(server_fd, 16) < 0) {
|
|
perror("listen"); close(server_fd); return;
|
|
}
|
|
|
|
printf("\n");
|
|
printf("╔══════════════════════════════════════════════════════════════╗\n");
|
|
printf("║ Inference-X Server — OpenAI-Compatible API ║\n");
|
|
printf("╠══════════════════════════════════════════════════════════════╣\n");
|
|
printf("║ Model : %-49s ║\n", model_name_.c_str());
|
|
printf("║ Port : %-49d ║\n", port_);
|
|
printf("║ API : http://0.0.0.0:%-35d ║\n", port_);
|
|
printf("╠══════════════════════════════════════════════════════════════╣\n");
|
|
printf("║ POST /v1/chat/completions Chat (streaming + sync) ║\n");
|
|
printf("║ POST /v1/completions Text completion ║\n");
|
|
printf("║ GET /v1/models Model info ║\n");
|
|
printf("║ GET /health Health check ║\n");
|
|
printf("╚══════════════════════════════════════════════════════════════╝\n");
|
|
printf("\nReady. Ctrl+C to stop.\n\n");
|
|
fflush(stdout);
|
|
|
|
while (!stopped_) {
|
|
struct sockaddr_in client_addr;
|
|
socklen_t client_len = sizeof(client_addr);
|
|
int client_fd = accept(server_fd, (struct sockaddr*)&client_addr, &client_len);
|
|
if (client_fd < 0) continue;
|
|
|
|
// Handle in-thread (sequential for now — model is single-threaded)
|
|
handle_client(client_fd);
|
|
close(client_fd);
|
|
}
|
|
close(server_fd);
|
|
}
|
|
|
|
void stop() { stopped_ = true; }
|
|
|
|
private:
|
|
int port_;
|
|
std::string model_name_;
|
|
GenerateFn generate_;
|
|
std::atomic<bool> stopped_{false};
|
|
int total_requests_ = 0;
|
|
int total_tokens_ = 0;
|
|
|
|
// ─── HTTP parsing ──────────────────────────────────────────────────
|
|
|
|
struct HttpRequest {
|
|
std::string method;
|
|
std::string path;
|
|
std::string body;
|
|
int content_length = 0;
|
|
};
|
|
|
|
HttpRequest read_request(int fd) {
|
|
HttpRequest req;
|
|
char buf[65536];
|
|
int n = recv(fd, buf, sizeof(buf) - 1, 0);
|
|
if (n <= 0) return req;
|
|
buf[n] = '\0';
|
|
|
|
std::string raw(buf, n);
|
|
|
|
// Parse method + path
|
|
size_t sp1 = raw.find(' ');
|
|
if (sp1 == std::string::npos) return req;
|
|
req.method = raw.substr(0, sp1);
|
|
size_t sp2 = raw.find(' ', sp1 + 1);
|
|
req.path = raw.substr(sp1 + 1, sp2 - sp1 - 1);
|
|
|
|
// Content-Length
|
|
size_t cl_pos = raw.find("Content-Length: ");
|
|
if (cl_pos == std::string::npos) cl_pos = raw.find("content-length: ");
|
|
if (cl_pos != std::string::npos) {
|
|
req.content_length = atoi(raw.c_str() + cl_pos + 16);
|
|
}
|
|
|
|
// Body (after \r\n\r\n)
|
|
size_t body_start = raw.find("\r\n\r\n");
|
|
if (body_start != std::string::npos) {
|
|
body_start += 4;
|
|
req.body = raw.substr(body_start);
|
|
|
|
// Read remaining body if needed
|
|
while ((int)req.body.size() < req.content_length) {
|
|
n = recv(fd, buf, sizeof(buf) - 1, 0);
|
|
if (n <= 0) break;
|
|
buf[n] = '\0';
|
|
req.body.append(buf, n);
|
|
}
|
|
}
|
|
return req;
|
|
}
|
|
|
|
// ─── HTTP responses ────────────────────────────────────────────────
|
|
|
|
void send_response(int fd, int status, const std::string& body,
|
|
const std::string& content_type = "application/json") {
|
|
std::string status_text = (status == 200) ? "OK" : "Not Found";
|
|
char header[512];
|
|
snprintf(header, sizeof(header),
|
|
"HTTP/1.1 %d %s\r\n"
|
|
"Content-Type: %s\r\n"
|
|
"Content-Length: %zu\r\n"
|
|
"Access-Control-Allow-Origin: *\r\n"
|
|
"Access-Control-Allow-Methods: POST, GET, OPTIONS\r\n"
|
|
"Access-Control-Allow-Headers: Content-Type, Authorization\r\n"
|
|
"X-Powered-By: %s\r\n"
|
|
"\r\n",
|
|
status, status_text.c_str(),
|
|
content_type.c_str(), body.size(),
|
|
ix::identity::license().server_header().c_str());
|
|
send(fd, header, strlen(header), 0);
|
|
send(fd, body.c_str(), body.size(), 0);
|
|
}
|
|
|
|
void send_sse_start(int fd) {
|
|
const char* header =
|
|
"HTTP/1.1 200 OK\r\n"
|
|
"Content-Type: text/event-stream\r\n"
|
|
"Cache-Control: no-cache\r\n"
|
|
"Connection: keep-alive\r\n"
|
|
"Access-Control-Allow-Origin: *\r\n"
|
|
"\r\n";
|
|
send(fd, header, strlen(header), 0);
|
|
}
|
|
|
|
void send_sse_event(int fd, const std::string& data) {
|
|
std::string event = "data: " + data + "\n\n";
|
|
send(fd, event.c_str(), event.size(), MSG_NOSIGNAL);
|
|
}
|
|
|
|
// ─── Route handlers ────────────────────────────────────────────────
|
|
|
|
void handle_client(int fd) {
|
|
HttpRequest req = read_request(fd);
|
|
if (req.method.empty()) return;
|
|
|
|
total_requests_++;
|
|
ix::identity::license().on_request();
|
|
|
|
// CORS preflight
|
|
if (req.method == "OPTIONS") {
|
|
send_response(fd, 200, "");
|
|
return;
|
|
}
|
|
|
|
// Health check
|
|
if (req.path == "/health") {
|
|
char json[256];
|
|
snprintf(json, sizeof(json),
|
|
"{\"status\":\"ok\",\"model\":\"%s\",\"requests\":%d,\"tokens\":%d}",
|
|
model_name_.c_str(), total_requests_, total_tokens_);
|
|
send_response(fd, 200, json);
|
|
return;
|
|
}
|
|
|
|
// List models
|
|
if (req.path == "/v1/models" && req.method == "GET") {
|
|
char json[512];
|
|
snprintf(json, sizeof(json),
|
|
"{\"object\":\"list\",\"data\":[{\"id\":\"%s\","
|
|
"\"object\":\"model\",\"owned_by\":\"inference-x\"}]}",
|
|
model_name_.c_str());
|
|
send_response(fd, 200, json);
|
|
return;
|
|
}
|
|
|
|
// Chat completions
|
|
if (req.path == "/v1/chat/completions" && req.method == "POST") {
|
|
handle_chat(fd, req.body);
|
|
return;
|
|
}
|
|
|
|
// Text completions
|
|
if (req.path == "/v1/completions" && req.method == "POST") {
|
|
handle_completion(fd, req.body);
|
|
return;
|
|
}
|
|
|
|
send_response(fd, 404, "{\"error\":\"not found\"}");
|
|
}
|
|
|
|
void handle_chat(int fd, const std::string& body) {
|
|
ChatRequest req = parse_chat_request(body);
|
|
|
|
// Build system + user from messages
|
|
std::string system_prompt, user_prompt;
|
|
for (auto& msg : req.messages) {
|
|
if (msg.role == "system") system_prompt += msg.content + "\n";
|
|
else if (msg.role == "user") user_prompt += msg.content + "\n";
|
|
else if (msg.role == "assistant") {
|
|
// For multi-turn context, append assistant messages too
|
|
user_prompt += "[Assistant]: " + msg.content + "\n[User]: ";
|
|
}
|
|
}
|
|
if (user_prompt.empty() && !req.messages.empty()) {
|
|
user_prompt = req.messages.back().content;
|
|
}
|
|
|
|
std::string chat_id = gen_id();
|
|
long created = (long)time(nullptr);
|
|
|
|
if (req.stream) {
|
|
// ─── Streaming (SSE) ───
|
|
send_sse_start(fd);
|
|
|
|
int token_count = 0;
|
|
generate_(system_prompt, user_prompt, req.max_tokens,
|
|
req.temperature, req.top_p,
|
|
[&](const std::string& token) -> bool {
|
|
token_count++;
|
|
total_tokens_++;
|
|
char chunk[2048];
|
|
snprintf(chunk, sizeof(chunk),
|
|
"{\"id\":\"%s\",\"object\":\"chat.completion.chunk\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"index\":0,\"delta\":"
|
|
"{\"content\":\"%s\"},\"finish_reason\":null}]}",
|
|
chat_id.c_str(), created,
|
|
model_name_.c_str(),
|
|
json_escape(token).c_str());
|
|
send_sse_event(fd, chunk);
|
|
return true;
|
|
});
|
|
|
|
// Final chunk with finish_reason
|
|
char done[512];
|
|
snprintf(done, sizeof(done),
|
|
"{\"id\":\"%s\",\"object\":\"chat.completion.chunk\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"index\":0,\"delta\":{},"
|
|
"\"finish_reason\":\"stop\"}]}",
|
|
chat_id.c_str(), created, model_name_.c_str());
|
|
send_sse_event(fd, done);
|
|
send_sse_event(fd, "[DONE]");
|
|
} else {
|
|
// ─── Non-streaming ───
|
|
std::string full_response;
|
|
int token_count = 0;
|
|
generate_(system_prompt, user_prompt, req.max_tokens,
|
|
req.temperature, req.top_p,
|
|
[&](const std::string& token) -> bool {
|
|
full_response += token;
|
|
token_count++;
|
|
total_tokens_++;
|
|
return true;
|
|
});
|
|
|
|
char json[65536];
|
|
snprintf(json, sizeof(json),
|
|
"{\"id\":\"%s\",\"object\":\"chat.completion\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"index\":0,\"message\":"
|
|
"{\"role\":\"assistant\",\"content\":\"%s\"},"
|
|
"\"finish_reason\":\"stop\"}],"
|
|
"\"usage\":{\"prompt_tokens\":0,"
|
|
"\"completion_tokens\":%d,\"total_tokens\":%d}}",
|
|
chat_id.c_str(), created,
|
|
model_name_.c_str(),
|
|
json_escape(full_response).c_str(),
|
|
token_count, token_count);
|
|
send_response(fd, 200, json);
|
|
}
|
|
}
|
|
|
|
void handle_completion(int fd, const std::string& body) {
|
|
std::string prompt = json_str(body, "prompt");
|
|
int max_tokens = (int)json_num(body, "max_tokens", 256);
|
|
float temperature = (float)json_num(body, "temperature", 0.6);
|
|
float top_p = (float)json_num(body, "top_p", 0.9);
|
|
bool stream = json_bool(body, "stream", false);
|
|
|
|
std::string comp_id = gen_id();
|
|
long created = (long)time(nullptr);
|
|
|
|
if (stream) {
|
|
send_sse_start(fd);
|
|
int token_count = 0;
|
|
generate_("", prompt, max_tokens, temperature, top_p,
|
|
[&](const std::string& token) -> bool {
|
|
token_count++;
|
|
total_tokens_++;
|
|
char chunk[2048];
|
|
snprintf(chunk, sizeof(chunk),
|
|
"{\"id\":\"%s\",\"object\":\"text_completion\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"text\":\"%s\",\"index\":0,"
|
|
"\"finish_reason\":null}]}",
|
|
comp_id.c_str(), created,
|
|
model_name_.c_str(),
|
|
json_escape(token).c_str());
|
|
send_sse_event(fd, chunk);
|
|
return true;
|
|
});
|
|
char done[256];
|
|
snprintf(done, sizeof(done),
|
|
"{\"id\":\"%s\",\"object\":\"text_completion\","
|
|
"\"created\":%ld,\"choices\":[{\"text\":\"\","
|
|
"\"finish_reason\":\"stop\"}]}",
|
|
comp_id.c_str(), created);
|
|
send_sse_event(fd, done);
|
|
send_sse_event(fd, "[DONE]");
|
|
} else {
|
|
std::string full;
|
|
int token_count = 0;
|
|
generate_("", prompt, max_tokens, temperature, top_p,
|
|
[&](const std::string& token) -> bool {
|
|
full += token;
|
|
token_count++;
|
|
total_tokens_++;
|
|
return true;
|
|
});
|
|
char json[65536];
|
|
snprintf(json, sizeof(json),
|
|
"{\"id\":\"%s\",\"object\":\"text_completion\","
|
|
"\"created\":%ld,\"model\":\"%s\","
|
|
"\"choices\":[{\"text\":\"%s\",\"index\":0,"
|
|
"\"finish_reason\":\"stop\"}],"
|
|
"\"usage\":{\"prompt_tokens\":0,"
|
|
"\"completion_tokens\":%d,\"total_tokens\":%d}}",
|
|
comp_id.c_str(), created,
|
|
model_name_.c_str(),
|
|
json_escape(full).c_str(),
|
|
token_count, token_count);
|
|
send_response(fd, 200, json);
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace ix
|