inference-x/compute/backend_manager.cpp
Salka Elmadani ec36668cf5 Inference-X v1.0 — Universal AI Inference Engine
Better output from the same model. Fused computation, adaptive precision,
surgical expert loading. 305 KB, 19 backends, zero dependencies.

https://inference-x.com
2026-02-23 07:10:47 +00:00

423 lines
12 KiB
C++

// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
// Inference-X — Universal Inference Protocol
// Morocco
// Backend Manager — GPU/CPU auto-detection and routing
#include "inference_x/compute/backend_manager.h"
#include <cstring>
#include <algorithm>
#ifdef INFERENCE_X_CUDA_ENABLED
#include <cuda_runtime.h>
#include <cublas_v2.h>
#endif
#ifdef INFERENCE_X_ROCM_ENABLED
#include <hip/hip_runtime.h>
#include <rocblas/rocblas.h>
#endif
#if defined(__x86_64__) || defined(_M_X64)
#include <cpuid.h>
#elif defined(__aarch64__) || defined(_M_ARM64)
#include <sys/auxv.h>
#include <asm/hwcap.h>
#endif
namespace inference_x {
namespace compute {
// Process-wide accessor. Uses a function-local static (Meyers singleton),
// whose construction is thread-safe under C++11 and later.
BackendManager& BackendManager::instance() {
    static BackendManager singleton;
    return singleton;
}
// Probe every compiled-in backend and cache one DeviceInfo per device.
// Idempotent: repeated calls after a successful init return Success
// immediately. Returns NotInitialized if even the CPU backend is missing.
ComputeError BackendManager::initialize() {
    std::lock_guard<std::mutex> lock(mutex_);
    if (initialized_) {
        return ComputeError::Success;
    }
    cpu_available_  = (initialize_cpu()  == ComputeError::Success);
    cuda_available_ = (initialize_cuda() == ComputeError::Success);
    rocm_available_ = (initialize_rocm() == ComputeError::Success);
    // The CPU backend is the baseline requirement.
    if (!cpu_available_) {
        return ComputeError::NotInitialized;
    }
    // Snapshot per-device descriptors for later lookups.
    devices_.clear();
    devices_.push_back(query_cpu_info());  // CPU is known available here
    if (cuda_available_) {
        for (int id = 0; id < cuda_device_count_; ++id) {
            devices_.push_back(query_cuda_info(id));
        }
    }
    if (rocm_available_) {
        for (int id = 0; id < rocm_device_count_; ++id) {
            devices_.push_back(query_rocm_info(id));
        }
    }
    initialized_ = true;
    return ComputeError::Success;
}
// The host CPU backend needs no setup; it is unconditionally available.
ComputeError BackendManager::initialize_cpu() {
// CPU always available
return ComputeError::Success;
}
// Detect CUDA GPUs. Side effect: sets cuda_device_count_ (0 on any failure).
// Returns Success when at least one device is visible, InvalidDevice when the
// runtime is present but reports none, NotSupported when CUDA was not
// compiled in (INFERENCE_X_CUDA_ENABLED undefined).
ComputeError BackendManager::initialize_cuda() {
#ifdef INFERENCE_X_CUDA_ENABLED
cudaError_t err = cudaGetDeviceCount(&cuda_device_count_);
if (err != cudaSuccess || cuda_device_count_ == 0) {
// Normalize the count so callers never see a stale/partial value.
cuda_device_count_ = 0;
return ComputeError::InvalidDevice;
}
return ComputeError::Success;
#else
cuda_device_count_ = 0;
return ComputeError::NotSupported;
#endif
}
// Detect ROCm GPUs. Side effect: sets rocm_device_count_ (0 on any failure).
// Mirrors initialize_cuda(): Success with >=1 device, InvalidDevice when the
// HIP runtime reports none, NotSupported when ROCm was not compiled in.
ComputeError BackendManager::initialize_rocm() {
#ifdef INFERENCE_X_ROCM_ENABLED
hipError_t err = hipGetDeviceCount(&rocm_device_count_);
if (err != hipSuccess || rocm_device_count_ == 0) {
// Normalize the count so callers never see a stale/partial value.
rocm_device_count_ = 0;
return ComputeError::InvalidDevice;
}
return ComputeError::Success;
#else
rocm_device_count_ = 0;
return ComputeError::NotSupported;
#endif
}
// Build a DeviceInfo describing the host CPU, with a best-effort SIMD
// capability tag appended to its name on x86-64/AArch64.
//
// Fixes vs. previous version:
//  * The CPUID brand string was constructed with a fixed length of 48,
//    which kept the NUL padding inside the std::string; the SIMD suffix
//    was then appended AFTER an embedded '\0', making it invisible to any
//    C-string consumer. The name is now truncated at the first NUL.
//  * Extended leaves 0x80000002..4 and basic leaf 7 are now validated
//    against the maximum supported leaf before use (garbage on old CPUs).
DeviceInfo BackendManager::query_cpu_info() const {
    DeviceInfo info;
    info.backend = ComputeBackend::CPU;
    info.device_id = 0;
#if defined(__x86_64__) || defined(_M_X64)
    // Query the CPU brand string via extended CPUID leaves 0x80000002-4,
    // but only if the processor actually implements them.
    if (__get_cpuid_max(0x80000000, nullptr) >= 0x80000004) {
        uint32_t brand[13] = {0};  // 48 brand bytes + guaranteed NUL terminator
        for (int i = 0; i < 3; ++i) {
            __cpuid_count(0x80000002 + i, 0,
                          brand[i*4 + 0], brand[i*4 + 1],
                          brand[i*4 + 2], brand[i*4 + 3]);
        }
        // Construct from the C string so the name stops at the first NUL.
        info.name = reinterpret_cast<const char*>(brand);
    } else {
        info.name = "x86-64 CPU";
    }
    // Check SIMD support.
    // NOTE(review): the CPUID feature bits alone can over-report AVX when
    // the OS does not enable YMM/ZMM state saving; a strict check would
    // also test OSXSAVE + XGETBV. Confirm whether that matters here.
    uint32_t eax, ebx, ecx, edx;
    __cpuid_count(1, 0, eax, ebx, ecx, edx);
    bool has_sse4_2 = (ecx & (1 << 20)) != 0;
    bool has_avx = (ecx & (1 << 28)) != 0;
    bool has_avx2 = false;
    bool has_avx512f = false;
    // Leaf 7 exists only when the max basic leaf is >= 7.
    if (__get_cpuid_max(0, nullptr) >= 7) {
        __cpuid_count(7, 0, eax, ebx, ecx, edx);
        has_avx2 = (ebx & (1 << 5)) != 0;
        has_avx512f = (ebx & (1 << 16)) != 0;
    }
    if (has_avx512f) {
        info.name += " (AVX-512)";
    } else if (has_avx2) {
        info.name += " (AVX2)";
    } else if (has_avx) {
        info.name += " (AVX)";
    } else if (has_sse4_2) {
        info.name += " (SSE4.2)";
    }
#elif defined(__aarch64__) || defined(_M_ARM64)
    info.name = "ARM CPU";
    // Advanced SIMD (NEON) is mandatory on ARMv8, but verify via hwcaps.
    unsigned long hwcaps = getauxval(AT_HWCAP);
    if (hwcaps & HWCAP_ASIMD) {
        info.name += " (NEON)";
    }
#else
    info.name = "Generic CPU";
#endif
    // System memory is not queried here (would need platform-specific code).
    info.total_memory = 0;
    info.free_memory = 0;
    // Placeholder "capabilities" so CPU entries are comparable to GPU ones.
    info.compute_capability_major = 1;
    info.compute_capability_minor = 0;
    info.num_sm = 1; // Logical cores
    info.max_threads_per_block = 1;
    info.warp_size = 1;
    // CPU paths handle all precisions in software.
    info.supports_fp16 = true;
    info.supports_bf16 = true;
    info.supports_int8 = true;
    return info;
}
// Build a DeviceInfo for CUDA device `device_id`.
//
// Bug fix: cudaMemGetInfo() reports memory for the CURRENT device, not an
// arbitrary one, so the previous code returned the active device's free
// memory for every device. We now switch to `device_id` for the query and
// restore the caller's device afterwards.
DeviceInfo BackendManager::query_cuda_info(int device_id) const {
    DeviceInfo info;
    info.backend = ComputeBackend::CUDA;
    info.device_id = device_id;
#ifdef INFERENCE_X_CUDA_ENABLED
    cudaDeviceProp prop;
    cudaError_t err = cudaGetDeviceProperties(&prop, device_id);
    if (err == cudaSuccess) {
        info.name = prop.name;
        info.total_memory = prop.totalGlobalMem;
        // Temporarily make device_id current so cudaMemGetInfo targets it.
        int prev_device = 0;
        if (cudaGetDevice(&prev_device) == cudaSuccess &&
            cudaSetDevice(device_id) == cudaSuccess) {
            size_t free_mem = 0, total_mem = 0;
            if (cudaMemGetInfo(&free_mem, &total_mem) == cudaSuccess) {
                info.free_memory = free_mem;
            }
            cudaSetDevice(prev_device);  // restore caller's current device
        }
        info.compute_capability_major = prop.major;
        info.compute_capability_minor = prop.minor;
        info.num_sm = prop.multiProcessorCount;
        info.max_threads_per_block = prop.maxThreadsPerBlock;
        info.warp_size = prop.warpSize;
        // FP16 support: compute capability >= 5.3
        info.supports_fp16 = (prop.major > 5) || (prop.major == 5 && prop.minor >= 3);
        // BF16 support: compute capability >= 8.0 (Ampere)
        info.supports_bf16 = (prop.major >= 8);
        // INT8 support: compute capability >= 6.1 (Pascal)
        info.supports_int8 = (prop.major > 6) || (prop.major == 6 && prop.minor >= 1);
    } else {
        info.name = "CUDA Device (query failed)";
    }
#else
    info.name = "CUDA Device (not compiled)";
#endif
    return info;
}
// Build a DeviceInfo for ROCm (HIP) device `device_id`.
//
// Bug fix: hipMemGetInfo() reports memory for the CURRENT device, mirroring
// the CUDA API, so we switch to `device_id` for the query and restore the
// caller's device afterwards (previously the active device's free memory
// was reported for every device).
DeviceInfo BackendManager::query_rocm_info(int device_id) const {
    DeviceInfo info;
    info.backend = ComputeBackend::ROCm;
    info.device_id = device_id;
#ifdef INFERENCE_X_ROCM_ENABLED
    hipDeviceProp_t prop;
    hipError_t err = hipGetDeviceProperties(&prop, device_id);
    if (err == hipSuccess) {
        info.name = prop.name;
        info.total_memory = prop.totalGlobalMem;
        // Temporarily make device_id current so hipMemGetInfo targets it.
        int prev_device = 0;
        if (hipGetDevice(&prev_device) == hipSuccess &&
            hipSetDevice(device_id) == hipSuccess) {
            size_t free_mem = 0, total_mem = 0;
            if (hipMemGetInfo(&free_mem, &total_mem) == hipSuccess) {
                info.free_memory = free_mem;
            }
            hipSetDevice(prev_device);  // restore caller's current device
        }
        info.compute_capability_major = prop.major;
        info.compute_capability_minor = prop.minor;
        info.num_sm = prop.multiProcessorCount;
        info.max_threads_per_block = prop.maxThreadsPerBlock;
        info.warp_size = prop.warpSize; // 64 for AMD
        // AMD GPUs generally support FP16, BF16, INT8
        info.supports_fp16 = true;
        info.supports_bf16 = (prop.major >= 9); // gfx9+
        info.supports_int8 = true;
    } else {
        info.name = "ROCm Device (query failed)";
    }
#else
    info.name = "ROCm Device (not compiled)";
#endif
    return info;
}
// Whether `backend` can be used right now. Everything reports unavailable
// until initialize() has succeeded. Auto is always available because it
// can fall back to the CPU backend.
bool BackendManager::is_available(ComputeBackend backend) const {
    std::lock_guard<std::mutex> lock(mutex_);
    if (!initialized_) {
        return false;
    }
    if (backend == ComputeBackend::Auto) {
        return true; // Always available (falls back to CPU)
    }
    if (backend == ComputeBackend::CPU) {
        return cpu_available_;
    }
    if (backend == ComputeBackend::CUDA) {
        return cuda_available_;
    }
    if (backend == ComputeBackend::ROCm) {
        return rocm_available_;
    }
    return false;
}
// Number of usable devices for `backend` (0 for Auto/unknown backends,
// which have no direct device list of their own).
//
// Consistency fix: mirrors is_available() by reporting 0 before
// initialize() has run (and after shutdown()), instead of reading
// possibly-stale device counts.
int BackendManager::get_device_count(ComputeBackend backend) const {
    std::lock_guard<std::mutex> lock(mutex_);
    if (!initialized_) {
        return 0;
    }
    switch (backend) {
    case ComputeBackend::CPU:
        return cpu_available_ ? 1 : 0;
    case ComputeBackend::CUDA:
        return cuda_device_count_;
    case ComputeBackend::ROCm:
        return rocm_device_count_;
    default:
        return 0;
    }
}
// Look up the cached descriptor for (backend, device_id). Returns a
// default-constructed DeviceInfo when no matching device was detected.
DeviceInfo BackendManager::get_device_info(ComputeBackend backend, int device_id) const {
    std::lock_guard<std::mutex> lock(mutex_);
    const auto match = std::find_if(
        devices_.begin(), devices_.end(),
        [backend, device_id](const DeviceInfo& d) {
            return d.backend == backend && d.device_id == device_id;
        });
    return match != devices_.end() ? *match : DeviceInfo();
}
std::vector<DeviceInfo> BackendManager::get_available_devices() const {
std::lock_guard<std::mutex> lock(mutex_);
return devices_;
}
// Choose the most capable device. Preference order: CUDA, then ROCm, then
// CPU. Within one GPU backend, the device with the most free memory wins;
// a backend whose devices all report zero free memory is skipped entirely.
DeviceInfo BackendManager::select_best_device() const {
    std::lock_guard<std::mutex> lock(mutex_);
    // Find the device of `backend` with the largest free memory.
    // Returns false when no device of that backend reported free memory > 0.
    const auto pick_roomiest = [this](ComputeBackend backend, DeviceInfo& out) {
        size_t best_free = 0;
        for (const auto& candidate : devices_) {
            if (candidate.backend == backend && candidate.free_memory > best_free) {
                best_free = candidate.free_memory;
                out = candidate;
            }
        }
        return best_free > 0;
    };
    DeviceInfo chosen;
    if (cuda_available_ && cuda_device_count_ > 0 &&
        pick_roomiest(ComputeBackend::CUDA, chosen)) {
        return chosen;
    }
    if (rocm_available_ && rocm_device_count_ > 0 &&
        pick_roomiest(ComputeBackend::ROCm, chosen)) {
        return chosen;
    }
    // No suitable GPU — fall back to the host CPU.
    return query_cpu_info();
}
// Query low-level memory/transfer capabilities for one device of `backend`.
// Falls through to a default-constructed BackendCapabilities when the
// backend was not compiled in or the property query fails.
// NOTE(review): unlike the other accessors this takes no mutex_ lock and
// does not consult initialized_ — confirm whether that is intentional.
// device_id is unused when neither GPU backend is compiled in.
BackendCapabilities BackendManager::get_capabilities(ComputeBackend backend, int device_id) const {
BackendCapabilities caps;
#ifdef INFERENCE_X_CUDA_ENABLED
if (backend == ComputeBackend::CUDA) {
cudaDeviceProp prop;
if (cudaGetDeviceProperties(&prop, device_id) == cudaSuccess) {
caps.can_map_host_memory = prop.canMapHostMemory;
caps.can_use_unified_memory = prop.unifiedAddressing;
caps.supports_async_copy = true;
// NOTE(review): peer access is keyed off unifiedAddressing here; a
// per-pair cudaDeviceCanAccessPeer check would be more precise.
caps.supports_peer_access = prop.unifiedAddressing;
caps.max_shared_memory_per_block = prop.sharedMemPerBlock;
caps.max_constant_memory = prop.totalConstMem;
}
}
#endif
#ifdef INFERENCE_X_ROCM_ENABLED
if (backend == ComputeBackend::ROCm) {
hipDeviceProp_t prop;
if (hipGetDeviceProperties(&prop, device_id) == hipSuccess) {
caps.can_map_host_memory = prop.canMapHostMemory;
// Unified memory and peer access are assumed true for ROCm targets
// here — TODO confirm for the supported gfx generations.
caps.can_use_unified_memory = true;
caps.supports_async_copy = true;
caps.supports_peer_access = true;
caps.max_shared_memory_per_block = prop.sharedMemPerBlock;
caps.max_constant_memory = prop.totalConstMem;
}
}
#endif
if (backend == ComputeBackend::CPU) {
// Host memory is trivially mappable/unified; there is no async copy
// engine or peer topology, and no shared/constant memory concept.
caps.can_map_host_memory = true;
caps.can_use_unified_memory = true;
caps.supports_async_copy = false;
caps.supports_peer_access = false;
caps.max_shared_memory_per_block = 0;
caps.max_constant_memory = 0;
}
return caps;
}
void BackendManager::shutdown() {
std::lock_guard<std::mutex> lock(mutex_);
if (!initialized_) {
return;
}
#ifdef INFERENCE_X_CUDA_ENABLED
if (cuda_available_) {
cudaDeviceReset();
}
#endif
#ifdef INFERENCE_X_ROCM_ENABLED
if (rocm_available_) {
hipDeviceReset();
}
#endif
devices_.clear();
initialized_ = false;
}
} // namespace compute
} // namespace inference_x