feat: update pynvml in linux
All checks were successful
Build Worker Base and Application Images / check-base-changes (push) Successful in 12s
Build Worker Base and Application Images / build-base (push) Successful in 4m44s
Build Worker Base and Application Images / build-docker (push) Successful in 3m3s
Build Worker Base and Application Images / deploy-stack (push) Successful in 24s

ziesorx 2025-09-25 01:26:19 +07:00
parent 67096d4141
commit 5065e43837
3 changed files with 27 additions and 9 deletions

.gitignore

@@ -2,6 +2,8 @@
 app.log
 *.pt
 images
+# All pycache directories
+__pycache__/
 .mptacache


@@ -10,7 +10,7 @@ from .models import CameraConnection, SubscriptionObject

 logger = logging.getLogger(__name__)

-# Try to import torch for GPU monitoring
+# Try to import torch and pynvml for GPU monitoring
 try:
     import torch
     TORCH_AVAILABLE = True
@@ -18,6 +18,18 @@ except ImportError:
     TORCH_AVAILABLE = False
     logger.warning("PyTorch not available, GPU metrics will not be collected")

+try:
+    import pynvml
+    PYNVML_AVAILABLE = True
+    pynvml.nvmlInit()
+    logger.info("NVIDIA ML Python (pynvml) initialized successfully")
+except ImportError:
+    PYNVML_AVAILABLE = False
+    logger.debug("pynvml not available, falling back to PyTorch GPU monitoring")
+except Exception as e:
+    PYNVML_AVAILABLE = False
+    logger.warning(f"Failed to initialize pynvml: {e}")
+

 @dataclass
 class WorkerState:
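For reviewers unfamiliar with NVML: once nvmlInit() succeeds, per-device handles expose utilization and memory counters directly, which is what the guarded import above makes available. A minimal standalone sketch, not part of this diff (the device loop and print formatting are illustrative; the worker code only queries index 0):

import pynvml

pynvml.nvmlInit()
try:
    # Illustrative only: report utilization and memory for every visible GPU,
    # not just device 0 as the worker code does.
    for i in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # .gpu / .memory, in percent
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)         # .total / .used / .free, in bytes
        print(f"GPU {i}: {util.gpu}% busy, {mem.used / mem.total:.0%} memory used")
finally:
    pynvml.nvmlShutdown()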
@@ -180,21 +192,24 @@ class SystemMetrics:
     @staticmethod
     def get_gpu_usage() -> Optional[float]:
         """Get current GPU usage percentage."""
-        if not TORCH_AVAILABLE:
-            return None
-
         try:
-            if torch.cuda.is_available():
-                # PyTorch doesn't provide direct GPU utilization
-                # This is a placeholder - real implementation might use nvidia-ml-py
+            # Prefer pynvml for accurate GPU utilization
+            if PYNVML_AVAILABLE:
+                handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # First GPU
+                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
+                return float(utilization.gpu)
+            # Fallback to PyTorch memory-based estimation
+            elif TORCH_AVAILABLE and torch.cuda.is_available():
                 if hasattr(torch.cuda, 'utilization'):
                     return torch.cuda.utilization()
                 else:
-                    # Fallback: estimate based on memory usage
+                    # Estimate based on memory usage
                     allocated = torch.cuda.memory_allocated()
                     reserved = torch.cuda.memory_reserved()
                     if reserved > 0:
                         return (allocated / reserved) * 100
             return None
         except Exception as e:
             logger.error(f"Failed to get GPU usage: {e}")


@@ -6,3 +6,4 @@ scipy
 filterpy
 psycopg2-binary
 lap>=0.5.12
+pynvml
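One packaging note: both the pynvml distribution added here and NVIDIA's nvidia-ml-py provide the importable pynvml module, and the requirement is left unpinned. A quick smoke test for the rebuilt image, assuming an NVIDIA driver is visible in the container (on older pynvml releases the version string is returned as bytes rather than str):

# Smoke test sketch, not part of this commit.
import pynvml

pynvml.nvmlInit()
print(pynvml.nvmlSystemGetDriverVersion())  # installed NVIDIA driver version
pynvml.nvmlShutdown()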