Refactor: Phase 4: Communication Layer
This commit is contained in:
parent cdeaaf4a4f
commit 54f21672aa

6 changed files with 2876 additions and 0 deletions
379  detector_worker/utils/system_monitor.py  Normal file
@@ -0,0 +1,379 @@
"""
|
||||
System monitoring utilities.
|
||||
|
||||
This module provides functions to monitor system resources including
|
||||
CPU, memory, and GPU usage.
|
||||
"""
|
||||
import logging
|
||||
import time
|
||||
import psutil
|
||||
from typing import Dict, Any, Optional, List
|
||||
|
||||
try:
|
||||
import pynvml
|
||||
NVIDIA_AVAILABLE = True
|
||||
except ImportError:
|
||||
NVIDIA_AVAILABLE = False
|
||||
|
||||
# Setup logging
|
||||
logger = logging.getLogger("detector_worker.system_monitor")
|
||||
|
||||
# Global initialization flag
|
||||
_nvidia_initialized = False
|
||||
|
||||
# Process start time
|
||||
_start_time = time.time()
|
||||
|
||||
|
||||
def initialize_nvidia_monitoring() -> bool:
|
||||
"""
|
||||
Initialize NVIDIA GPU monitoring.
|
||||
|
||||
Returns:
|
||||
True if initialization successful, False otherwise
|
||||
"""
|
||||
global _nvidia_initialized
|
||||
|
||||
if not NVIDIA_AVAILABLE:
|
||||
return False
|
||||
|
||||
if _nvidia_initialized:
|
||||
return True
|
||||
|
||||
try:
|
||||
pynvml.nvmlInit()
|
||||
_nvidia_initialized = True
|
||||
logger.info("NVIDIA GPU monitoring initialized")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to initialize NVIDIA monitoring: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def get_gpu_info() -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Get information about available GPUs.
|
||||
|
||||
Returns:
|
||||
List of GPU information dictionaries
|
||||
"""
|
||||
if not NVIDIA_AVAILABLE or not _nvidia_initialized:
|
||||
return []
|
||||
|
||||
gpu_info = []
|
||||
|
||||
try:
|
||||
device_count = pynvml.nvmlDeviceGetCount()
|
||||
|
||||
for i in range(device_count):
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(i)
|
||||
|
||||
# Get GPU information
|
||||
            name = pynvml.nvmlDeviceGetName(handle)
            if isinstance(name, bytes):  # older pynvml builds return bytes, newer return str
                name = name.decode('utf-8')

            # Get memory info
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            total_memory = mem_info.total / (1024 ** 3)  # Convert to GB
            used_memory = mem_info.used / (1024 ** 3)
            free_memory = mem_info.free / (1024 ** 3)

            # Get utilization
            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(handle)
                gpu_util = util.gpu
                memory_util = util.memory
            except Exception:
                gpu_util = 0
                memory_util = 0

            # Get temperature
            try:
                temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
            except Exception:
                temp = 0

            gpu_info.append({
                "index": i,
                "name": name,
                "gpu_utilization": gpu_util,
                "memory_utilization": memory_util,
                "total_memory_gb": round(total_memory, 2),
                "used_memory_gb": round(used_memory, 2),
                "free_memory_gb": round(free_memory, 2),
                "temperature": temp
            })

    except Exception as e:
        logger.error(f"Error getting GPU info: {e}")

    return gpu_info


def get_gpu_metrics() -> Dict[str, float]:
    """
    Get current GPU metrics.

    Returns:
        Dictionary with GPU utilization and memory usage
    """
    if not NVIDIA_AVAILABLE or not _nvidia_initialized:
        return {"gpu_percent": 0.0, "gpu_memory_percent": 0.0}

    try:
        # Get first GPU metrics
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)

        # Get utilization
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)

        # Get memory info
        mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        memory_percent = (mem_info.used / mem_info.total) * 100

        return {
            "gpu_percent": float(util.gpu),
            "gpu_memory_percent": float(memory_percent)
        }

    except Exception as e:
        logger.debug(f"Error getting GPU metrics: {e}")
        return {"gpu_percent": 0.0, "gpu_memory_percent": 0.0}


def get_cpu_metrics() -> Dict[str, Any]:
    """
    Get current CPU metrics.

    Returns:
        Dictionary with CPU usage statistics (overall, per-core, and core count)
    """
    try:
        # Get overall CPU percent; each cpu_percent call blocks for its 0.1s interval
        cpu_percent = psutil.cpu_percent(interval=0.1)

        # Get per-core CPU usage
        cpu_per_core = psutil.cpu_percent(interval=0.1, percpu=True)

        return {
            "cpu_percent": cpu_percent,
            "cpu_per_core": cpu_per_core,
            "cpu_count": psutil.cpu_count()
        }
    except Exception as e:
        logger.error(f"Error getting CPU metrics: {e}")
        return {"cpu_percent": 0.0, "cpu_per_core": [], "cpu_count": 0}


def get_memory_metrics() -> Dict[str, Any]:
    """
    Get current memory metrics.

    Returns:
        Dictionary with memory usage information
    """
    try:
        # Get virtual memory info
        virtual_mem = psutil.virtual_memory()

        # Get swap memory info
        swap_mem = psutil.swap_memory()

        return {
            "memory_percent": virtual_mem.percent,
            "memory_used_gb": round(virtual_mem.used / (1024 ** 3), 2),
            "memory_available_gb": round(virtual_mem.available / (1024 ** 3), 2),
            "memory_total_gb": round(virtual_mem.total / (1024 ** 3), 2),
            "swap_percent": swap_mem.percent,
            "swap_used_gb": round(swap_mem.used / (1024 ** 3), 2)
        }
    except Exception as e:
        logger.error(f"Error getting memory metrics: {e}")
        return {
            "memory_percent": 0.0,
            "memory_used_gb": 0.0,
            "memory_available_gb": 0.0,
            "memory_total_gb": 0.0,
            "swap_percent": 0.0,
            "swap_used_gb": 0.0
        }


def get_disk_metrics(path: str = "/") -> Dict[str, Any]:
    """
    Get disk usage metrics for specified path.

    Args:
        path: Path to check disk usage for

    Returns:
        Dictionary with disk usage information
    """
    try:
        disk_usage = psutil.disk_usage(path)

        return {
            "disk_percent": disk_usage.percent,
            "disk_used_gb": round(disk_usage.used / (1024 ** 3), 2),
            "disk_free_gb": round(disk_usage.free / (1024 ** 3), 2),
            "disk_total_gb": round(disk_usage.total / (1024 ** 3), 2)
        }
    except Exception as e:
        logger.error(f"Error getting disk metrics: {e}")
        return {
            "disk_percent": 0.0,
            "disk_used_gb": 0.0,
            "disk_free_gb": 0.0,
            "disk_total_gb": 0.0
        }


def get_process_metrics() -> Dict[str, Any]:
    """
    Get current process metrics.

    Returns:
        Dictionary with process-specific metrics
    """
    try:
        process = psutil.Process()

        # Get process info
        with process.oneshot():
            cpu_percent = process.cpu_percent()
            memory_info = process.memory_info()
            memory_percent = process.memory_percent()
            num_threads = process.num_threads()

        # Get open file descriptors
        try:
            num_fds = len(process.open_files())
        except Exception:
            num_fds = 0

        return {
            "process_cpu_percent": cpu_percent,
            "process_memory_mb": round(memory_info.rss / (1024 ** 2), 2),
            "process_memory_percent": round(memory_percent, 2),
            "process_threads": num_threads,
            "process_open_files": num_fds
        }
    except Exception as e:
        logger.error(f"Error getting process metrics: {e}")
        return {
            "process_cpu_percent": 0.0,
            "process_memory_mb": 0.0,
            "process_memory_percent": 0.0,
            "process_threads": 0,
            "process_open_files": 0
        }


def get_network_metrics() -> Dict[str, Any]:
    """
    Get network I/O metrics.

    Returns:
        Dictionary with network statistics
    """
    try:
        net_io = psutil.net_io_counters()

        return {
            "bytes_sent": net_io.bytes_sent,
            "bytes_recv": net_io.bytes_recv,
            "packets_sent": net_io.packets_sent,
            "packets_recv": net_io.packets_recv,
            "errin": net_io.errin,
            "errout": net_io.errout,
            "dropin": net_io.dropin,
            "dropout": net_io.dropout
        }
    except Exception as e:
        logger.error(f"Error getting network metrics: {e}")
        return {
            "bytes_sent": 0,
            "bytes_recv": 0,
            "packets_sent": 0,
            "packets_recv": 0,
            "errin": 0,
            "errout": 0,
            "dropin": 0,
            "dropout": 0
        }


def get_system_metrics() -> Dict[str, Any]:
    """
    Get comprehensive system metrics.

    Returns:
        Dictionary containing all system metrics
    """
    # Initialize GPU monitoring if not done
    if NVIDIA_AVAILABLE and not _nvidia_initialized:
        initialize_nvidia_monitoring()

    metrics = {
        "timestamp": time.time(),
        "uptime": time.time() - _start_time,
        "start_time": _start_time
    }

    # Get CPU metrics
    cpu_metrics = get_cpu_metrics()
    metrics.update(cpu_metrics)

    # Get memory metrics
    memory_metrics = get_memory_metrics()
    metrics.update(memory_metrics)

    # Get GPU metrics
    gpu_metrics = get_gpu_metrics()
    metrics.update(gpu_metrics)

    # Get process metrics
    process_metrics = get_process_metrics()
    metrics.update(process_metrics)

    return metrics


def get_resource_summary() -> str:
    """
    Get a formatted summary of system resources.

    Returns:
        Formatted string with resource summary
    """
    metrics = get_system_metrics()

    summary = []
    summary.append(f"CPU: {metrics['cpu_percent']:.1f}%")
    summary.append(f"Memory: {metrics['memory_percent']:.1f}% ({metrics['memory_used_gb']:.1f}GB/{metrics['memory_total_gb']:.1f}GB)")

    if metrics['gpu_percent'] > 0:
        summary.append(f"GPU: {metrics['gpu_percent']:.1f}%")
        summary.append(f"GPU Memory: {metrics['gpu_memory_percent']:.1f}%")

    summary.append(f"Process Memory: {metrics['process_memory_mb']:.1f}MB")
    summary.append(f"Threads: {metrics['process_threads']}")

    return " | ".join(summary)


def cleanup_nvidia_monitoring():
    """Clean up NVIDIA monitoring resources."""
    global _nvidia_initialized

    if NVIDIA_AVAILABLE and _nvidia_initialized:
        try:
            pynvml.nvmlShutdown()
            _nvidia_initialized = False
            logger.info("NVIDIA GPU monitoring cleaned up")
        except Exception as e:
            logger.error(f"Error cleaning up NVIDIA monitoring: {e}")


# Initialize on import
if NVIDIA_AVAILABLE:
    initialize_nvidia_monitoring()
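For reference, a minimal usage sketch of the new module. This is illustrative only and not part of the commit; the polling loop, iteration count, and sleep interval are assumptions:

    import time

    from detector_worker.utils.system_monitor import (
        cleanup_nvidia_monitoring,
        get_resource_summary,
        get_system_metrics,
    )

    try:
        # Poll resource usage a few times, two seconds apart.
        for _ in range(3):
            metrics = get_system_metrics()   # full metrics dict (CPU, memory, GPU, process)
            print(get_resource_summary())    # e.g. "CPU: 12.3% | Memory: 41.0% (6.5GB/16.0GB) | ..."
            time.sleep(2)
    finally:
        # Release NVML if it was initialized on import.
        cleanup_nvidia_monitoring()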