feat: update pynvml in linux
All checks were successful
Build Worker Base and Application Images / check-base-changes (push) Successful in 12s
Build Worker Base and Application Images / build-base (push) Successful in 4m44s
Build Worker Base and Application Images / build-docker (push) Successful in 3m3s
Build Worker Base and Application Images / deploy-stack (push) Successful in 24s
All checks were successful
Build Worker Base and Application Images / check-base-changes (push) Successful in 12s
Build Worker Base and Application Images / build-base (push) Successful in 4m44s
Build Worker Base and Application Images / build-docker (push) Successful in 3m3s
Build Worker Base and Application Images / deploy-stack (push) Successful in 24s
This commit is contained in:
parent
67096d4141
commit
5065e43837
3 changed files with 27 additions and 9 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -2,6 +2,8 @@
|
||||||
app.log
|
app.log
|
||||||
*.pt
|
*.pt
|
||||||
|
|
||||||
|
images
|
||||||
|
|
||||||
# All pycache directories
|
# All pycache directories
|
||||||
__pycache__/
|
__pycache__/
|
||||||
.mptacache
|
.mptacache
|
||||||
|
|
|
@ -10,7 +10,7 @@ from .models import CameraConnection, SubscriptionObject
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Try to import torch for GPU monitoring
|
# Try to import torch and pynvml for GPU monitoring
|
||||||
try:
|
try:
|
||||||
import torch
|
import torch
|
||||||
TORCH_AVAILABLE = True
|
TORCH_AVAILABLE = True
|
||||||
|
@ -18,6 +18,18 @@ except ImportError:
|
||||||
TORCH_AVAILABLE = False
|
TORCH_AVAILABLE = False
|
||||||
logger.warning("PyTorch not available, GPU metrics will not be collected")
|
logger.warning("PyTorch not available, GPU metrics will not be collected")
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pynvml
|
||||||
|
PYNVML_AVAILABLE = True
|
||||||
|
pynvml.nvmlInit()
|
||||||
|
logger.info("NVIDIA ML Python (pynvml) initialized successfully")
|
||||||
|
except ImportError:
|
||||||
|
PYNVML_AVAILABLE = False
|
||||||
|
logger.debug("pynvml not available, falling back to PyTorch GPU monitoring")
|
||||||
|
except Exception as e:
|
||||||
|
PYNVML_AVAILABLE = False
|
||||||
|
logger.warning(f"Failed to initialize pynvml: {e}")
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class WorkerState:
|
class WorkerState:
|
||||||
|
@ -180,21 +192,24 @@ class SystemMetrics:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_gpu_usage() -> Optional[float]:
|
def get_gpu_usage() -> Optional[float]:
|
||||||
"""Get current GPU usage percentage."""
|
"""Get current GPU usage percentage."""
|
||||||
if not TORCH_AVAILABLE:
|
|
||||||
return None
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if torch.cuda.is_available():
|
# Prefer pynvml for accurate GPU utilization
|
||||||
# PyTorch doesn't provide direct GPU utilization
|
if PYNVML_AVAILABLE:
|
||||||
# This is a placeholder - real implementation might use nvidia-ml-py
|
handle = pynvml.nvmlDeviceGetHandleByIndex(0) # First GPU
|
||||||
|
utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
|
||||||
|
return float(utilization.gpu)
|
||||||
|
|
||||||
|
# Fallback to PyTorch memory-based estimation
|
||||||
|
elif TORCH_AVAILABLE and torch.cuda.is_available():
|
||||||
if hasattr(torch.cuda, 'utilization'):
|
if hasattr(torch.cuda, 'utilization'):
|
||||||
return torch.cuda.utilization()
|
return torch.cuda.utilization()
|
||||||
else:
|
else:
|
||||||
# Fallback: estimate based on memory usage
|
# Estimate based on memory usage
|
||||||
allocated = torch.cuda.memory_allocated()
|
allocated = torch.cuda.memory_allocated()
|
||||||
reserved = torch.cuda.memory_reserved()
|
reserved = torch.cuda.memory_reserved()
|
||||||
if reserved > 0:
|
if reserved > 0:
|
||||||
return (allocated / reserved) * 100
|
return (allocated / reserved) * 100
|
||||||
|
|
||||||
return None
|
return None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to get GPU usage: {e}")
|
logger.error(f"Failed to get GPU usage: {e}")
|
||||||
|
|
|
@ -6,3 +6,4 @@ scipy
|
||||||
filterpy
|
filterpy
|
||||||
psycopg2-binary
|
psycopg2-binary
|
||||||
lap>=0.5.12
|
lap>=0.5.12
|
||||||
|
pynvml
|
Loading…
Add table
Add a link
Reference in a new issue