// Better output from the same model. Fused computation, adaptive precision,
// surgical expert loading. 305 KB, 19 backends, zero dependencies.
// https://inference-x.com
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
// Inference-X — Universal Inference Protocol
// Morocco
// Backend Manager Header — Device enumeration and routing

#pragma once
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <string>
#include <vector>
namespace inference_x {
namespace compute {
// Every compute backend the runtime can route work to. Auto (0) lets the
// manager pick a target. Enumerator values are stable; do not reorder.
enum class ComputeBackend {
    Auto = 0,    // let BackendManager choose
    CPU,         // host processor
    CUDA,        // NVIDIA GPU runtime
    ROCm,        // AMD GPU runtime
    Metal,       // Apple GPU runtime
    Vulkan,      // cross-vendor GPU compute
    OpenCL,      // cross-vendor compute (legacy)
    Hexagon,     // Qualcomm DSP
    Snapdragon,  // Qualcomm SoC accelerator
    TPU,         // tensor processing unit
    Groq,        // Groq accelerator
    Cerebras,    // Cerebras wafer-scale engine
    FPGA,        // field-programmable gate array
    Gaudi,       // Intel/Habana Gaudi
    Inferentia,  // AWS Inferentia
    Maia,        // Microsoft Maia
    SambaNova,   // SambaNova accelerator
    GraphCore,   // Graphcore IPU
    ARM_NEON,    // ARM SIMD extension
    WebGPU       // browser/WASM GPU compute
};
// Status codes for BackendManager operations. Success is zero, so results
// can be checked with `err == ComputeError::Success`.
enum class ComputeError {
    Success = 0,     // operation completed
    NotInitialized,  // manager/backend not initialized
    InvalidDevice,   // bad device selection
    NotSupported,    // backend or feature unavailable
    OutOfMemory,     // allocation failure
    LaunchFailed,    // work submission failure
    SyncFailed       // synchronization failure
};
struct DeviceInfo {
|
|
ComputeBackend backend;
|
|
int device_id;
|
|
std::string name;
|
|
size_t total_memory;
|
|
size_t free_memory;
|
|
int compute_capability_major;
|
|
int compute_capability_minor;
|
|
int num_sm;
|
|
int max_threads_per_block;
|
|
int warp_size;
|
|
bool supports_fp16;
|
|
bool supports_bf16;
|
|
bool supports_int8;
|
|
};
class BackendManager {
|
|
public:
|
|
static BackendManager& instance();
|
|
ComputeError initialize();
|
|
bool is_available(ComputeBackend backend) const;
|
|
int get_device_count(ComputeBackend backend) const;
|
|
const std::vector<DeviceInfo>& get_devices() const { return devices_; }
|
|
DeviceInfo get_best_device() const;
|
|
const char* backend_name(ComputeBackend b) const;
|
|
|
|
private:
|
|
BackendManager() = default;
|
|
ComputeError initialize_cpu();
|
|
ComputeError initialize_cuda();
|
|
ComputeError initialize_rocm();
|
|
DeviceInfo query_cpu_info() const;
|
|
DeviceInfo query_cuda_info(int device_id) const;
|
|
DeviceInfo query_rocm_info(int device_id) const;
|
|
|
|
mutable std::mutex mutex_;
|
|
bool initialized_ = false;
|
|
bool cpu_available_ = false;
|
|
bool cuda_available_ = false;
|
|
bool rocm_available_ = false;
|
|
int cuda_device_count_ = 0;
|
|
int rocm_device_count_ = 0;
|
|
std::vector<DeviceInfo> devices_;
|
|
};
} // namespace compute
} // namespace inference_x