Better output from the same model. Fused computation, adaptive precision, surgical expert loading. 305 KB, 19 backends, zero dependencies. https://inference-x.com
423 lines
12 KiB
C++
423 lines
12 KiB
C++
// Copyright (C) 2024-2026 Salka Elmadani. All rights reserved.
|
|
// INPI eSoleau: 7phf-Ueye-2nWr-Vsgu — BSL-1.1
|
|
// Inference-X — Universal Inference Protocol
|
|
// Morocco
|
|
// Backend Manager — GPU/CPU auto-detection and routing
|
|
|
|
#include "inference_x/compute/backend_manager.h"
|
|
#include <cstring>
|
|
#include <algorithm>
|
|
|
|
#ifdef INFERENCE_X_CUDA_ENABLED
|
|
#include <cuda_runtime.h>
|
|
#include <cublas_v2.h>
|
|
#endif
|
|
|
|
#ifdef INFERENCE_X_ROCM_ENABLED
|
|
#include <hip/hip_runtime.h>
|
|
#include <rocblas/rocblas.h>
|
|
#endif
|
|
|
|
#if defined(__x86_64__) || defined(_M_X64)
|
|
#include <cpuid.h>
|
|
#elif defined(__aarch64__) || defined(_M_ARM64)
|
|
#include <sys/auxv.h>
|
|
#include <asm/hwcap.h>
|
|
#endif
|
|
|
|
namespace inference_x {
|
|
namespace compute {
|
|
|
|
BackendManager& BackendManager::instance() {
  // Meyers singleton: the static local is constructed on first use and,
  // since C++11, that construction is guaranteed thread-safe.
  static BackendManager singleton;
  return singleton;
}
|
|
|
|
ComputeError BackendManager::initialize() {
  // Probe every compute backend and cache one DeviceInfo entry per usable
  // device. Idempotent: once initialized, repeat calls are cheap no-ops.
  std::lock_guard<std::mutex> lock(mutex_);

  if (initialized_) {
    return ComputeError::Success;
  }

  // A backend counts as available only if its initializer reported success.
  cpu_available_ = initialize_cpu() == ComputeError::Success;
  cuda_available_ = initialize_cuda() == ComputeError::Success;
  rocm_available_ = initialize_rocm() == ComputeError::Success;

  // The CPU backend is the mandatory fallback; without it nothing can run.
  if (!cpu_available_) {
    return ComputeError::NotInitialized;
  }

  // Rebuild the cached device table from scratch.
  devices_.clear();
  if (cpu_available_) {
    devices_.push_back(query_cpu_info());
  }
  for (int id = 0; cuda_available_ && id < cuda_device_count_; ++id) {
    devices_.push_back(query_cuda_info(id));
  }
  for (int id = 0; rocm_available_ && id < rocm_device_count_; ++id) {
    devices_.push_back(query_rocm_info(id));
  }

  initialized_ = true;
  return ComputeError::Success;
}
|
|
|
|
ComputeError BackendManager::initialize_cpu() {
  // Host execution requires no setup, so the CPU backend always succeeds.
  return ComputeError::Success;
}
|
|
|
|
ComputeError BackendManager::initialize_cuda() {
#ifdef INFERENCE_X_CUDA_ENABLED
  // Enumerate CUDA devices; a runtime error and an empty system are treated
  // the same way (backend unusable, count forced to zero).
  int count = 0;
  const cudaError_t status = cudaGetDeviceCount(&count);
  if (status != cudaSuccess || count == 0) {
    cuda_device_count_ = 0;
    return ComputeError::InvalidDevice;
  }
  cuda_device_count_ = count;
  return ComputeError::Success;
#else
  // Built without CUDA support.
  cuda_device_count_ = 0;
  return ComputeError::NotSupported;
#endif
}
|
|
|
|
ComputeError BackendManager::initialize_rocm() {
#ifdef INFERENCE_X_ROCM_ENABLED
  // Enumerate HIP devices; a runtime error and an empty system are treated
  // the same way (backend unusable, count forced to zero).
  int count = 0;
  const hipError_t status = hipGetDeviceCount(&count);
  if (status != hipSuccess || count == 0) {
    rocm_device_count_ = 0;
    return ComputeError::InvalidDevice;
  }
  rocm_device_count_ = count;
  return ComputeError::Success;
#else
  // Built without ROCm support.
  rocm_device_count_ = 0;
  return ComputeError::NotSupported;
#endif
}
|
|
|
|
DeviceInfo BackendManager::query_cpu_info() const {
  // Build a DeviceInfo describing the host CPU: a human-readable name with
  // the best detected SIMD tier appended, plus neutral values for the
  // GPU-oriented fields so the CPU can take part in device selection.
  DeviceInfo info;
  info.backend = ComputeBackend::CPU;
  info.device_id = 0;

#if defined(__x86_64__) || defined(_M_X64)
  // Query the 48-byte brand string via CPUID leaves 0x80000002-0x80000004,
  // but only if those extended leaves exist — otherwise the registers hold
  // garbage (the original code skipped this check).
  if (__get_cpuid_max(0x80000000, nullptr) >= 0x80000004) {
    uint32_t brand[12];
    for (int i = 0; i < 3; ++i) {
      __cpuid_count(0x80000002 + i, 0,
                    brand[i*4 + 0], brand[i*4 + 1],
                    brand[i*4 + 2], brand[i*4 + 3]);
    }
    // Copy into a NUL-terminated buffer so the string stops at the first
    // terminator. The original constructed std::string with an explicit
    // length of 48, embedding trailing NUL bytes into the name, which made
    // the " (AVX...)" suffix below land after the embedded NULs.
    char brand_str[49] = {};
    std::memcpy(brand_str, brand, sizeof(brand));
    info.name = brand_str;
  } else {
    info.name = "x86-64 CPU";
  }

  // Probe SIMD feature bits. NOTE(review): these are raw CPUID bits; fully
  // usable AVX also requires OS XSAVE support (OSXSAVE + XGETBV) — confirm
  // whether downstream kernels rely on that distinction.
  uint32_t eax, ebx, ecx, edx;
  __cpuid_count(1, 0, eax, ebx, ecx, edx);
  const bool has_sse4_2 = (ecx & (1u << 20)) != 0;
  const bool has_avx = (ecx & (1u << 28)) != 0;

  // Leaf 7 is only valid when the max basic leaf is >= 7.
  bool has_avx2 = false;
  bool has_avx512f = false;
  if (__get_cpuid_max(0, nullptr) >= 7) {
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    has_avx2 = (ebx & (1u << 5)) != 0;
    has_avx512f = (ebx & (1u << 16)) != 0;
  }

  // Append the highest supported SIMD tier to the device name.
  if (has_avx512f) {
    info.name += " (AVX-512)";
  } else if (has_avx2) {
    info.name += " (AVX2)";
  } else if (has_avx) {
    info.name += " (AVX)";
  } else if (has_sse4_2) {
    info.name += " (SSE4.2)";
  }

#elif defined(__aarch64__) || defined(_M_ARM64)
  info.name = "ARM CPU";

  // Advanced SIMD (NEON) is mandatory on ARMv8, but report it from hwcaps
  // anyway for symmetry with the x86 path.
  unsigned long hwcaps = getauxval(AT_HWCAP);
  if (hwcaps & HWCAP_ASIMD) {
    info.name += " (NEON)";
  }
#else
  info.name = "Generic CPU";
#endif

  // System memory sizes are not queried yet — would need platform-specific
  // code (e.g. sysinfo() / GlobalMemoryStatusEx()).
  info.total_memory = 0;
  info.free_memory = 0;

  // GPU-oriented fields have no direct CPU analogue; use neutral values.
  info.compute_capability_major = 1;
  info.compute_capability_minor = 0;
  info.num_sm = 1;  // TODO(review): comment said "Logical cores" but value is fixed at 1 — confirm intent
  info.max_threads_per_block = 1;
  info.warp_size = 1;

  // CPU kernels can emulate every precision in software.
  info.supports_fp16 = true;
  info.supports_bf16 = true;
  info.supports_int8 = true;

  return info;
}
|
|
|
|
DeviceInfo BackendManager::query_cuda_info(int device_id) const {
  // Collect identity, memory, and capability information for one CUDA device.
  DeviceInfo info;
  info.backend = ComputeBackend::CUDA;
  info.device_id = device_id;
  info.free_memory = 0;  // explicit default in case the memory query fails

#ifdef INFERENCE_X_CUDA_ENABLED
  cudaDeviceProp prop;
  cudaError_t err = cudaGetDeviceProperties(&prop, device_id);

  if (err == cudaSuccess) {
    info.name = prop.name;
    info.total_memory = prop.totalGlobalMem;

    // cudaMemGetInfo reports on the *current* device, so switch to device_id
    // first and restore afterwards. The original queried whichever device
    // happened to be current, giving wrong free-memory numbers on multi-GPU
    // systems, and read free_mem uninitialized when the call failed.
    int prev_device = 0;
    cudaGetDevice(&prev_device);
    if (cudaSetDevice(device_id) == cudaSuccess) {
      size_t free_mem = 0;
      size_t total_mem = 0;
      if (cudaMemGetInfo(&free_mem, &total_mem) == cudaSuccess) {
        info.free_memory = free_mem;
      }
      cudaSetDevice(prev_device);  // restore the caller's device context
    }

    info.compute_capability_major = prop.major;
    info.compute_capability_minor = prop.minor;
    info.num_sm = prop.multiProcessorCount;
    info.max_threads_per_block = prop.maxThreadsPerBlock;
    info.warp_size = prop.warpSize;

    // FP16 support: compute capability >= 5.3.
    info.supports_fp16 = (prop.major > 5) || (prop.major == 5 && prop.minor >= 3);

    // BF16 support: compute capability >= 8.0 (Ampere).
    info.supports_bf16 = (prop.major >= 8);

    // INT8 support: compute capability >= 6.1 (Pascal).
    info.supports_int8 = (prop.major > 6) || (prop.major == 6 && prop.minor >= 1);
  } else {
    info.name = "CUDA Device (query failed)";
  }
#else
  info.name = "CUDA Device (not compiled)";
#endif

  return info;
}
|
|
|
|
DeviceInfo BackendManager::query_rocm_info(int device_id) const {
  // Collect identity, memory, and capability information for one ROCm device.
  DeviceInfo info;
  info.backend = ComputeBackend::ROCm;
  info.device_id = device_id;
  info.free_memory = 0;  // explicit default in case the memory query fails

#ifdef INFERENCE_X_ROCM_ENABLED
  hipDeviceProp_t prop;
  hipError_t err = hipGetDeviceProperties(&prop, device_id);

  if (err == hipSuccess) {
    info.name = prop.name;
    info.total_memory = prop.totalGlobalMem;

    // hipMemGetInfo reports on the *current* device, so switch to device_id
    // first and restore afterwards. The original queried whichever device
    // happened to be current (wrong on multi-GPU systems) and read free_mem
    // uninitialized when the call failed.
    int prev_device = 0;
    hipGetDevice(&prev_device);
    if (hipSetDevice(device_id) == hipSuccess) {
      size_t free_mem = 0;
      size_t total_mem = 0;
      if (hipMemGetInfo(&free_mem, &total_mem) == hipSuccess) {
        info.free_memory = free_mem;
      }
      hipSetDevice(prev_device);  // restore the caller's device context
    }

    info.compute_capability_major = prop.major;
    info.compute_capability_minor = prop.minor;
    info.num_sm = prop.multiProcessorCount;
    info.max_threads_per_block = prop.maxThreadsPerBlock;
    info.warp_size = prop.warpSize;  // 64 on most AMD wavefronts

    // AMD GPUs generally support FP16 and INT8 throughout.
    info.supports_fp16 = true;
    info.supports_bf16 = (prop.major >= 9);  // gfx9+
    info.supports_int8 = true;
  } else {
    info.name = "ROCm Device (query failed)";
  }
#else
  info.name = "ROCm Device (not compiled)";
#endif

  return info;
}
|
|
|
|
bool BackendManager::is_available(ComputeBackend backend) const {
  // A backend is usable only after initialize() has succeeded; Auto is
  // always available because it ultimately falls back to the CPU.
  std::lock_guard<std::mutex> lock(mutex_);

  if (!initialized_) {
    return false;
  }
  if (backend == ComputeBackend::Auto) {
    return true;
  }
  if (backend == ComputeBackend::CPU) {
    return cpu_available_;
  }
  if (backend == ComputeBackend::CUDA) {
    return cuda_available_;
  }
  if (backend == ComputeBackend::ROCm) {
    return rocm_available_;
  }
  return false;
}
|
|
|
|
int BackendManager::get_device_count(ComputeBackend backend) const {
  // Number of usable devices for a backend. Returns 0 before initialize()
  // has succeeded, for consistency with is_available() — the original read
  // the raw counters regardless of initialization state.
  std::lock_guard<std::mutex> lock(mutex_);

  if (!initialized_) {
    return 0;
  }

  switch (backend) {
    case ComputeBackend::CPU:
      return cpu_available_ ? 1 : 0;
    case ComputeBackend::CUDA:
      return cuda_device_count_;
    case ComputeBackend::ROCm:
      return rocm_device_count_;
    default:
      // Auto and unknown values have no well-defined device count.
      return 0;
  }
}
|
|
|
|
DeviceInfo BackendManager::get_device_info(ComputeBackend backend, int device_id) const {
  // Look up a cached device entry by (backend, device_id). A
  // default-constructed DeviceInfo signals "not found".
  std::lock_guard<std::mutex> lock(mutex_);

  const auto match = std::find_if(
      devices_.begin(), devices_.end(),
      [backend, device_id](const DeviceInfo& d) {
        return d.backend == backend && d.device_id == device_id;
      });
  return match != devices_.end() ? *match : DeviceInfo();
}
|
|
|
|
std::vector<DeviceInfo> BackendManager::get_available_devices() const {
|
|
std::lock_guard<std::mutex> lock(mutex_);
|
|
return devices_;
|
|
}
|
|
|
|
DeviceInfo BackendManager::select_best_device() const {
  // Selection policy: CUDA > ROCm > CPU. Within a GPU backend, pick the
  // device with the most free memory; a backend is skipped entirely when
  // every one of its devices reports zero free memory (e.g. the memory
  // query failed). The original duplicated this scan verbatim for CUDA and
  // ROCm — factored into one lambda.
  std::lock_guard<std::mutex> lock(mutex_);

  // Scan the cached table for `backend`'s device with the most free memory.
  // Writes the winner into `best` and returns true iff one had free_memory > 0.
  const auto pick_most_free_memory =
      [this](ComputeBackend backend, DeviceInfo& best) -> bool {
    size_t max_memory = 0;
    for (const auto& device : devices_) {
      if (device.backend == backend && device.free_memory > max_memory) {
        max_memory = device.free_memory;
        best = device;
      }
    }
    return max_memory > 0;
  };

  DeviceInfo best;
  if (cuda_available_ && cuda_device_count_ > 0 &&
      pick_most_free_memory(ComputeBackend::CUDA, best)) {
    return best;
  }
  if (rocm_available_ && rocm_device_count_ > 0 &&
      pick_most_free_memory(ComputeBackend::ROCm, best)) {
    return best;
  }

  // Fall back to the host CPU.
  return query_cpu_info();
}
|
|
|
|
BackendCapabilities BackendManager::get_capabilities(ComputeBackend backend, int device_id) const {
  // Describe memory-mapping / transfer capabilities of one backend device.
  // Unknown backends (or failed property queries) return a default-constructed
  // BackendCapabilities.
  BackendCapabilities caps;
  (void)device_id;  // silences -Wunused-parameter when no GPU backend is compiled in

#ifdef INFERENCE_X_CUDA_ENABLED
  if (backend == ComputeBackend::CUDA) {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, device_id) == cudaSuccess) {
      caps.can_map_host_memory = prop.canMapHostMemory;
      caps.can_use_unified_memory = prop.unifiedAddressing;
      caps.supports_async_copy = true;
      // NOTE(review): unifiedAddressing is only a proxy for peer access —
      // the authoritative per-device-pair check is cudaDeviceCanAccessPeer().
      caps.supports_peer_access = prop.unifiedAddressing;
      caps.max_shared_memory_per_block = prop.sharedMemPerBlock;
      caps.max_constant_memory = prop.totalConstMem;
    }
  }
#endif

#ifdef INFERENCE_X_ROCM_ENABLED
  if (backend == ComputeBackend::ROCm) {
    hipDeviceProp_t prop;
    if (hipGetDeviceProperties(&prop, device_id) == hipSuccess) {
      caps.can_map_host_memory = prop.canMapHostMemory;
      // NOTE(review): unified memory / peer access are assumed true for all
      // ROCm devices here — confirm against the targeted gfx architectures.
      caps.can_use_unified_memory = true;
      caps.supports_async_copy = true;
      caps.supports_peer_access = true;
      caps.max_shared_memory_per_block = prop.sharedMemPerBlock;
      caps.max_constant_memory = prop.totalConstMem;
    }
  }
#endif

  if (backend == ComputeBackend::CPU) {
    // Host memory is trivially mappable and unified; there is no DMA engine
    // or peer-to-peer concept on the CPU backend.
    caps.can_map_host_memory = true;
    caps.can_use_unified_memory = true;
    caps.supports_async_copy = false;
    caps.supports_peer_access = false;
    caps.max_shared_memory_per_block = 0;
    caps.max_constant_memory = 0;
  }

  return caps;
}
|
|
|
|
void BackendManager::shutdown() {
|
|
std::lock_guard<std::mutex> lock(mutex_);
|
|
|
|
if (!initialized_) {
|
|
return;
|
|
}
|
|
|
|
#ifdef INFERENCE_X_CUDA_ENABLED
|
|
if (cuda_available_) {
|
|
cudaDeviceReset();
|
|
}
|
|
#endif
|
|
|
|
#ifdef INFERENCE_X_ROCM_ENABLED
|
|
if (rocm_available_) {
|
|
hipDeviceReset();
|
|
}
|
|
#endif
|
|
|
|
devices_.clear();
|
|
initialized_ = false;
|
|
}
|
|
|
|
} // namespace compute
|
|
} // namespace inference_x
|