Refactor: Phase 4: Communication Layer
This commit is contained in:
parent cdeaaf4a4f
commit 54f21672aa

6 changed files with 2876 additions and 0 deletions
379  detector_worker/utils/system_monitor.py  Normal file
@@ -0,0 +1,379 @@
"""
|
||||
System monitoring utilities.
|
||||
|
||||
This module provides functions to monitor system resources including
|
||||
CPU, memory, and GPU usage.
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
import psutil
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
try:
|
||||
import pynvml
|
||||
NVIDIA_AVAILABLE = True
|
||||
except ImportError:
|
||||
NVIDIA_AVAILABLE = False
|
||||
|
||||
# Setup logging
|
||||
logger = logging.getLogger("detector_worker.system_monitor")
|
||||
|
||||
# Global initialization flag
|
||||
_nvidia_initialized = False
|
||||
|
||||
# Process start time
|
||||
_start_time = time.time()
|
||||
|
||||
|
||||
def initialize_nvidia_monitoring() -> bool:
|
||||
"""
|
||||
Initialize NVIDIA GPU monitoring.
|
||||
|
||||
Returns:
|
||||
True if initialization successful, False otherwise
|
||||
"""
|
||||
global _nvidia_initialized
|
||||
|
||||
if not NVIDIA_AVAILABLE:
|
||||
return False
|
||||
|
||||
if _nvidia_initialized:
|
||||
return True
|
||||
|
||||
try:
|
||||
pynvml.nvmlInit()
|
||||
_nvidia_initialized = True
|
||||
logger.info("NVIDIA GPU monitoring initialized")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to initialize NVIDIA monitoring: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def get_gpu_info() -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Get information about available GPUs.
|
||||
|
||||
Returns:
|
||||
List of GPU information dictionaries
|
||||
"""
|
||||
if not NVIDIA_AVAILABLE or not _nvidia_initialized:
|
||||
return []
|
||||
|
||||
gpu_info = []
|
||||
|
||||
try:
|
||||
device_count = pynvml.nvmlDeviceGetCount()
|
||||
|
||||
for i in range(device_count):
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
|
||||
|
||||
# Get GPU information
|
||||
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):  # older pynvml builds return bytes, newer return str
                name = name.decode('utf-8')

            # Get memory info
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            total_memory = mem_info.total / (1024 ** 3)  # Convert to GB
            used_memory = mem_info.used / (1024 ** 3)
            free_memory = mem_info.free / (1024 ** 3)

            # Get utilization
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpu_util = util.gpu
                memory_util = util.memory
            except Exception:
                gpu_util = 0
                memory_util = 0

            # Get temperature
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            except Exception:
                temp = 0

            gpu_info.append({
                "index": i,
                "name": name,
                "gpu_utilization": gpu_util,
                "memory_utilization": memory_util,
                "total_memory_gb": round(total_memory, 2),
                "used_memory_gb": round(used_memory, 2),
                "free_memory_gb": round(free_memory, 2),
                "temperature": temp
            })

    except Exception as e:
        logger.error(f"Error getting GPU info: {e}")

    return gpu_info


def get_gpu_metrics() -> Dict[str, float]:
    """
    Get current GPU metrics.

    Returns:
        Dictionary with GPU utilization and memory usage
    """
    if not NVIDIA_AVAILABLE or not _nvidia_initialized:
        return {"gpu_percent": 0.0, "gpu_memory_percent": 0.0}

    try:
        # Get first GPU metrics
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)

        # Get utilization
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)

        # Get memory info
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        memory_percent = (mem_info.used / mem_info.total) * 100

        return {
            "gpu_percent": float(util.gpu),
            "gpu_memory_percent": float(memory_percent)
        }

    except Exception as e:
        logger.debug(f"Error getting GPU metrics: {e}")
        return {"gpu_percent": 0.0, "gpu_memory_percent": 0.0}


def get_cpu_metrics() -> Dict[str, Any]:
    """
    Get current CPU metrics.

    Returns:
        Dictionary with CPU usage statistics (overall, per-core, and core count)
    """
    try:
        # Get overall CPU percent; each cpu_percent call blocks for its 0.1s interval
        cpu_percent = psutil.cpu_percent(interval=0.1)

        # Get per-core CPU usage
        cpu_per_core = psutil.cpu_percent(interval=0.1, percpu=True)

        return {
            "cpu_percent": cpu_percent,
            "cpu_per_core": cpu_per_core,
            "cpu_count": psutil.cpu_count()
        }
    except Exception as e:
        logger.error(f"Error getting CPU metrics: {e}")
        return {"cpu_percent": 0.0, "cpu_per_core": [], "cpu_count": 0}


def get_memory_metrics() -> Dict[str, Any]:
    """
    Get current memory metrics.

    Returns:
        Dictionary with memory usage information
    """
    try:
        # Get virtual memory info
        virtual_mem = psutil.virtual_memory()

        # Get swap memory info
        swap_mem = psutil.swap_memory()

        return {
            "memory_percent": virtual_mem.percent,
            "memory_used_gb": round(virtual_mem.used / (1024 ** 3), 2),
            "memory_available_gb": round(virtual_mem.available / (1024 ** 3), 2),
            "memory_total_gb": round(virtual_mem.total / (1024 ** 3), 2),
            "swap_percent": swap_mem.percent,
            "swap_used_gb": round(swap_mem.used / (1024 ** 3), 2)
        }
    except Exception as e:
        logger.error(f"Error getting memory metrics: {e}")
        return {
            "memory_percent": 0.0,
            "memory_used_gb": 0.0,
            "memory_available_gb": 0.0,
            "memory_total_gb": 0.0,
            "swap_percent": 0.0,
            "swap_used_gb": 0.0
        }


def get_disk_metrics(path: str = "/") -> Dict[str, Any]:
    """
    Get disk usage metrics for specified path.

    Args:
        path: Path to check disk usage for

    Returns:
        Dictionary with disk usage information
    """
    try:
        disk_usage = psutil.disk_usage(path)

        return {
            "disk_percent": disk_usage.percent,
            "disk_used_gb": round(disk_usage.used / (1024 ** 3), 2),
            "disk_free_gb": round(disk_usage.free / (1024 ** 3), 2),
            "disk_total_gb": round(disk_usage.total / (1024 ** 3), 2)
        }
    except Exception as e:
        logger.error(f"Error getting disk metrics: {e}")
        return {
            "disk_percent": 0.0,
            "disk_used_gb": 0.0,
            "disk_free_gb": 0.0,
            "disk_total_gb": 0.0
        }


def get_process_metrics() -> Dict[str, Any]:
    """
    Get current process metrics.

    Returns:
        Dictionary with process-specific metrics
    """
    try:
        process = psutil.Process()

        # Get process info
        with process.oneshot():
            cpu_percent = process.cpu_percent()
            memory_info = process.memory_info()
            memory_percent = process.memory_percent()
            num_threads = process.num_threads()

        # Get open file descriptors
        try:
            num_fds = len(process.open_files())
        except Exception:
            num_fds = 0

        return {
            "process_cpu_percent": cpu_percent,
            "process_memory_mb": round(memory_info.rss / (1024 ** 2), 2),
            "process_memory_percent": round(memory_percent, 2),
            "process_threads": num_threads,
            "process_open_files": num_fds
        }
    except Exception as e:
        logger.error(f"Error getting process metrics: {e}")
        return {
            "process_cpu_percent": 0.0,
            "process_memory_mb": 0.0,
            "process_memory_percent": 0.0,
            "process_threads": 0,
            "process_open_files": 0
        }


def get_network_metrics() -> Dict[str, Any]:
    """
    Get network I/O metrics.

    Returns:
        Dictionary with network statistics
    """
    try:
        net_io = psutil.net_io_counters()

        return {
            "bytes_sent": net_io.bytes_sent,
            "bytes_recv": net_io.bytes_recv,
            "packets_sent": net_io.packets_sent,
            "packets_recv": net_io.packets_recv,
            "errin": net_io.errin,
            "errout": net_io.errout,
            "dropin": net_io.dropin,
            "dropout": net_io.dropout
        }
    except Exception as e:
        logger.error(f"Error getting network metrics: {e}")
        return {
            "bytes_sent": 0,
            "bytes_recv": 0,
            "packets_sent": 0,
            "packets_recv": 0,
            "errin": 0,
            "errout": 0,
            "dropin": 0,
            "dropout": 0
        }


def get_system_metrics() -> Dict[str, Any]:
    """
    Get comprehensive system metrics.

    Returns:
        Dictionary containing all system metrics
    """
    # Initialize GPU monitoring if not done
    if NVIDIA_AVAILABLE and not _nvidia_initialized:
        initialize_nvidia_monitoring()

    metrics = {
        "timestamp": time.time(),
        "uptime": time.time() - _start_time,
        "start_time": _start_time
    }

    # Get CPU metrics
    cpu_metrics = get_cpu_metrics()
    metrics.update(cpu_metrics)

    # Get memory metrics
    memory_metrics = get_memory_metrics()
    metrics.update(memory_metrics)

    # Get GPU metrics
    gpu_metrics = get_gpu_metrics()
    metrics.update(gpu_metrics)

    # Get process metrics
    process_metrics = get_process_metrics()
    metrics.update(process_metrics)

    return metrics


def get_resource_summary() -> str:
    """
    Get a formatted summary of system resources.

    Returns:
        Formatted string with resource summary
    """
    metrics = get_system_metrics()

    summary = []
    summary.append(f"CPU: {metrics['cpu_percent']:.1f}%")
    summary.append(f"Memory: {metrics['memory_percent']:.1f}% ({metrics['memory_used_gb']:.1f}GB/{metrics['memory_total_gb']:.1f}GB)")

    if metrics['gpu_percent'] > 0:
        summary.append(f"GPU: {metrics['gpu_percent']:.1f}%")
        summary.append(f"GPU Memory: {metrics['gpu_memory_percent']:.1f}%")

    summary.append(f"Process Memory: {metrics['process_memory_mb']:.1f}MB")
    summary.append(f"Threads: {metrics['process_threads']}")

    return " | ".join(summary)


def cleanup_nvidia_monitoring():
    """Clean up NVIDIA monitoring resources."""
    global _nvidia_initialized

    if NVIDIA_AVAILABLE and _nvidia_initialized:
        try:
            pynvml.nvmlShutdown()
            _nvidia_initialized = False
            logger.info("NVIDIA GPU monitoring cleaned up")
        except Exception as e:
            logger.error(f"Error cleaning up NVIDIA monitoring: {e}")


# Initialize on import
if NVIDIA_AVAILABLE:
    initialize_nvidia_monitoring()
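For reference, a minimal usage sketch of the new module. This is illustrative only and not part of the commit; the polling loop, iteration count, and sleep interval are assumptions:

    import time

    from detector_worker.utils.system_monitor import (
        cleanup_nvidia_monitoring,
        get_resource_summary,
        get_system_metrics,
    )

    try:
        # Poll resource usage a few times, two seconds apart.
        for _ in range(3):
            metrics = get_system_metrics()   # full metrics dict (CPU, memory, GPU, process)
            print(get_resource_summary())    # e.g. "CPU: 12.3% | Memory: 41.0% (6.5GB/16.0GB) | ..."
            time.sleep(2)
    finally:
        # Release NVML if it was initialized on import.
        cleanup_nvidia_monitoring()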