Refactor: Phase 4: Communication Layer

This commit is contained in:
ziesorx 2025-09-12 15:26:31 +07:00
parent cdeaaf4a4f
commit 54f21672aa
6 changed files with 2876 additions and 0 deletions

View file

@@ -0,0 +1,379 @@
"""
System monitoring utilities.
This module provides functions to monitor system resources including
CPU, memory, and GPU usage.
"""
import logging
import time
import psutil
from typing import Dict, Any, Optional, List
try:
import pynvml
NVIDIA_AVAILABLE = True
except ImportError:
NVIDIA_AVAILABLE = False
# Module-level logger for this monitoring module
logger = logging.getLogger("detector_worker.system_monitor")
# Tracks whether pynvml.nvmlInit() has succeeded (set by
# initialize_nvidia_monitoring, cleared by cleanup_nvidia_monitoring)
_nvidia_initialized = False
# Wall-clock time captured at import; used to report process uptime
_start_time = time.time()
def initialize_nvidia_monitoring() -> bool:
    """
    Initialize NVIDIA GPU monitoring via pynvml.

    Safe to call repeatedly: a successful initialization is cached in
    the module-level ``_nvidia_initialized`` flag.

    Returns:
        True if NVML is (or already was) initialized, False otherwise
    """
    global _nvidia_initialized

    # pynvml could not even be imported -> nothing to initialize.
    if not NVIDIA_AVAILABLE:
        return False

    # A previous call already succeeded.
    if _nvidia_initialized:
        return True

    try:
        pynvml.nvmlInit()
    except Exception as e:
        logger.warning(f"Failed to initialize NVIDIA monitoring: {e}")
        return False

    _nvidia_initialized = True
    logger.info("NVIDIA GPU monitoring initialized")
    return True
def get_gpu_info() -> List[Dict[str, Any]]:
    """
    Get information about available GPUs.

    Returns:
        List of per-GPU dictionaries with name, utilization percentages,
        memory figures (GB) and temperature. Empty list when NVML is
        unavailable or not initialized, or when enumeration fails.
    """
    if not NVIDIA_AVAILABLE or not _nvidia_initialized:
        return []
    gpu_info = []
    try:
        device_count = pynvml.nvmlDeviceGetCount()
        for i in range(device_count):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            # nvmlDeviceGetName returns bytes in older pynvml releases
            # but str in newer ones (>= 11.5). Unconditional .decode()
            # raised AttributeError on new pynvml, which the outer
            # except swallowed and GPU listing came back empty.
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):
                name = name.decode('utf-8')
            # Memory counters are reported in bytes; convert to GB.
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            total_memory = mem_info.total / (1024 ** 3)
            used_memory = mem_info.used / (1024 ** 3)
            free_memory = mem_info.free / (1024 ** 3)
            # Utilization can be unsupported on some devices/drivers;
            # fall back to 0 rather than dropping the GPU entry.
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpu_util = util.gpu
                memory_util = util.memory
            except Exception:
                gpu_util = 0
                memory_util = 0
            # Temperature query may likewise fail; default to 0.
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            except Exception:
                temp = 0
            gpu_info.append({
                "index": i,
                "name": name,
                "gpu_utilization": gpu_util,
                "memory_utilization": memory_util,
                "total_memory_gb": round(total_memory, 2),
                "used_memory_gb": round(used_memory, 2),
                "free_memory_gb": round(free_memory, 2),
                "temperature": temp
            })
    except Exception as e:
        logger.error(f"Error getting GPU info: {e}")
    return gpu_info
def get_gpu_metrics() -> Dict[str, float]:
    """
    Get current GPU metrics for the first GPU (index 0).

    Returns:
        Dictionary with "gpu_percent" and "gpu_memory_percent";
        both 0.0 when NVML is unavailable or the query fails.
    """
    if not NVIDIA_AVAILABLE or not _nvidia_initialized:
        return {"gpu_percent": 0.0, "gpu_memory_percent": 0.0}
    try:
        # Only the first device is sampled here.
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
        utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        used_fraction = memory.used / memory.total
        return {
            "gpu_percent": float(utilization.gpu),
            "gpu_memory_percent": float(used_fraction * 100)
        }
    except Exception as e:
        logger.debug(f"Error getting GPU metrics: {e}")
        return {"gpu_percent": 0.0, "gpu_memory_percent": 0.0}
def get_cpu_metrics() -> Dict[str, float]:
    """
    Get current CPU metrics.

    Returns:
        Dictionary with overall CPU percent ("cpu_percent"), per-core
        percentages ("cpu_per_core") and logical core count
        ("cpu_count"). Zeroed/empty values on failure.
    """
    try:
        # Take a single blocking 0.1 s per-core sample and derive the
        # overall figure from it. The previous implementation blocked
        # twice (two back-to-back 0.1 s intervals, ~0.2 s total) and the
        # overall and per-core numbers came from different windows.
        cpu_per_core = psutil.cpu_percent(interval=0.1, percpu=True)
        if cpu_per_core:
            cpu_percent = round(sum(cpu_per_core) / len(cpu_per_core), 1)
        else:
            cpu_percent = 0.0
        return {
            "cpu_percent": cpu_percent,
            "cpu_per_core": cpu_per_core,
            "cpu_count": psutil.cpu_count()
        }
    except Exception as e:
        logger.error(f"Error getting CPU metrics: {e}")
        return {"cpu_percent": 0.0, "cpu_per_core": [], "cpu_count": 0}
def get_memory_metrics() -> Dict[str, Any]:
    """
    Get current memory metrics.

    Returns:
        Dictionary with virtual-memory and swap usage: percentages as
        reported by psutil, sizes converted to GB. Zeroed values on
        failure.
    """
    gb = 1024 ** 3  # bytes -> gigabytes divisor
    try:
        vm = psutil.virtual_memory()
        swap = psutil.swap_memory()
        return {
            "memory_percent": vm.percent,
            "memory_used_gb": round(vm.used / gb, 2),
            "memory_available_gb": round(vm.available / gb, 2),
            "memory_total_gb": round(vm.total / gb, 2),
            "swap_percent": swap.percent,
            "swap_used_gb": round(swap.used / gb, 2)
        }
    except Exception as e:
        logger.error(f"Error getting memory metrics: {e}")
        return {
            "memory_percent": 0.0,
            "memory_used_gb": 0.0,
            "memory_available_gb": 0.0,
            "memory_total_gb": 0.0,
            "swap_percent": 0.0,
            "swap_used_gb": 0.0
        }
def get_disk_metrics(path: str = "/") -> Dict[str, Any]:
    """
    Get disk usage metrics for specified path.

    Args:
        path: Filesystem path whose containing disk/partition is measured

    Returns:
        Dictionary with usage percent and used/free/total sizes in GB;
        zeroed values if the path cannot be queried.
    """
    gb = 1024 ** 3  # bytes -> gigabytes divisor
    try:
        usage = psutil.disk_usage(path)
        return {
            "disk_percent": usage.percent,
            "disk_used_gb": round(usage.used / gb, 2),
            "disk_free_gb": round(usage.free / gb, 2),
            "disk_total_gb": round(usage.total / gb, 2)
        }
    except Exception as e:
        logger.error(f"Error getting disk metrics: {e}")
        return {
            "disk_percent": 0.0,
            "disk_used_gb": 0.0,
            "disk_free_gb": 0.0,
            "disk_total_gb": 0.0
        }
def get_process_metrics() -> Dict[str, Any]:
    """
    Get metrics for the current process.

    Returns:
        Dictionary with process CPU percent, RSS in MB, memory percent,
        thread count and open-file count. Zeroed values on failure.
    """
    try:
        proc = psutil.Process()
        # oneshot() batches the underlying system reads for speed.
        with proc.oneshot():
            cpu_percent = proc.cpu_percent()
            mem = proc.memory_info()
            mem_percent = proc.memory_percent()
            thread_count = proc.num_threads()
            # Enumerating open files can fail (e.g. permissions);
            # report 0 in that case rather than failing the whole call.
            try:
                open_file_count = len(proc.open_files())
            except Exception:
                open_file_count = 0
        return {
            "process_cpu_percent": cpu_percent,
            "process_memory_mb": round(mem.rss / (1024 ** 2), 2),
            "process_memory_percent": round(mem_percent, 2),
            "process_threads": thread_count,
            "process_open_files": open_file_count
        }
    except Exception as e:
        logger.error(f"Error getting process metrics: {e}")
        return {
            "process_cpu_percent": 0.0,
            "process_memory_mb": 0.0,
            "process_memory_percent": 0.0,
            "process_threads": 0,
            "process_open_files": 0
        }
def get_network_metrics() -> Dict[str, Any]:
    """
    Get system-wide cumulative network I/O counters.

    Returns:
        Dictionary with bytes/packets sent and received plus error and
        drop counts; all zeros on failure.
    """
    # Field names match the psutil snetio counter attributes one-to-one.
    fields = (
        "bytes_sent", "bytes_recv",
        "packets_sent", "packets_recv",
        "errin", "errout",
        "dropin", "dropout",
    )
    try:
        counters = psutil.net_io_counters()
        return {field: getattr(counters, field) for field in fields}
    except Exception as e:
        logger.error(f"Error getting network metrics: {e}")
        return {field: 0 for field in fields}
def get_system_metrics() -> Dict[str, Any]:
    """
    Get comprehensive system metrics.

    Lazily initializes NVIDIA monitoring on first use, then merges the
    CPU, memory, GPU and process metric dictionaries onto a base of
    timestamp/uptime information.

    Returns:
        Dictionary containing all system metrics
    """
    # Lazy one-time GPU monitoring setup.
    if NVIDIA_AVAILABLE and not _nvidia_initialized:
        initialize_nvidia_monitoring()

    metrics: Dict[str, Any] = {
        "timestamp": time.time(),
        "uptime": time.time() - _start_time,
        "start_time": _start_time,
    }
    # Later sources win on key collisions (none expected in practice).
    for partial in (
        get_cpu_metrics(),
        get_memory_metrics(),
        get_gpu_metrics(),
        get_process_metrics(),
    ):
        metrics.update(partial)
    return metrics
def get_resource_summary() -> str:
    """
    Get a one-line, pipe-separated summary of system resources.

    Returns:
        String such as "CPU: 12.0% | Memory: 40.0% (...) | ...".
    """
    m = get_system_metrics()
    parts = [
        f"CPU: {m['cpu_percent']:.1f}%",
        f"Memory: {m['memory_percent']:.1f}% ({m['memory_used_gb']:.1f}GB/{m['memory_total_gb']:.1f}GB)",
    ]
    # GPU figures only appear when the GPU shows nonzero utilization.
    if m['gpu_percent'] > 0:
        parts.append(f"GPU: {m['gpu_percent']:.1f}%")
        parts.append(f"GPU Memory: {m['gpu_memory_percent']:.1f}%")
    parts.append(f"Process Memory: {m['process_memory_mb']:.1f}MB")
    parts.append(f"Threads: {m['process_threads']}")
    return " | ".join(parts)
def cleanup_nvidia_monitoring():
    """Clean up NVIDIA monitoring resources (shuts down NVML)."""
    global _nvidia_initialized

    # Nothing to release unless NVML was actually initialized.
    if not (NVIDIA_AVAILABLE and _nvidia_initialized):
        return
    try:
        pynvml.nvmlShutdown()
    except Exception as e:
        logger.error(f"Error cleaning up NVIDIA monitoring: {e}")
    else:
        _nvidia_initialized = False
        logger.info("NVIDIA GPU monitoring cleaned up")
# Best-effort NVML initialization at import time when pynvml imported
# successfully; failures are logged and get_system_metrics() retries lazily.
if NVIDIA_AVAILABLE:
    initialize_nvidia_monitoring()