python-detector-worker/core/communication/state.py
ziesorx 5065e43837
All checks were successful
Build Worker Base and Application Images / check-base-changes (push) Successful in 12s
Build Worker Base and Application Images / build-base (push) Successful in 4m44s
Build Worker Base and Application Images / build-docker (push) Successful in 3m3s
Build Worker Base and Application Images / deploy-stack (push) Successful in 24s
feat: update pynvml in linux
2025-09-25 01:26:19 +07:00

234 lines
No EOL
8.5 KiB
Python

"""
Worker state management for system metrics and subscription tracking.
"""
import logging
import psutil
import threading
from typing import Dict, Set, Optional, List
from dataclasses import dataclass, field
from .models import CameraConnection, SubscriptionObject
logger = logging.getLogger(__name__)

# Probe the optional GPU monitoring backends once, at import time.
# Each flag records whether the corresponding backend can be used later.

# PyTorch: only an ImportError is treated as "not installed".
TORCH_AVAILABLE = True
try:
    import torch
except ImportError:
    TORCH_AVAILABLE = False
    logger.warning("PyTorch not available, GPU metrics will not be collected")

# pynvml: must both import and initialise NVML successfully to count as usable.
PYNVML_AVAILABLE = False
try:
    import pynvml
    pynvml.nvmlInit()
except ImportError:
    logger.debug("pynvml not available, falling back to PyTorch GPU monitoring")
except Exception as e:
    # Import worked but NVML initialisation (or something else) failed.
    logger.warning(f"Failed to initialize pynvml: {e}")
else:
    PYNVML_AVAILABLE = True
    logger.info("NVIDIA ML Python (pynvml) initialized successfully")
@dataclass
class WorkerState:
    """Lock-guarded container for the detector worker's runtime state.

    Holds the active subscriptions, the per-display session IDs and
    progression stages, and the camera connection list that is rebuilt
    from the subscriptions whenever they are replaced.
    """

    # subscriptionIdentifier -> subscription object
    subscriptions: Dict[str, SubscriptionObject] = field(default_factory=dict)

    # display_identifier -> session_id
    session_ids: Dict[str, int] = field(default_factory=dict)

    # display_identifier -> progression stage
    progression_stages: Dict[str, str] = field(default_factory=dict)

    # Camera connections exposed for state reporting
    camera_connections: List[CameraConnection] = field(default_factory=list)

    # Re-entrant lock taken by the accessors below
    _lock: threading.RLock = field(default_factory=threading.RLock)

    def set_subscriptions(self, new_subscriptions: List[SubscriptionObject]) -> None:
        """
        Replace the active subscriptions with the list sent by the backend.

        The list is declarative: anything not present in it is dropped.

        Args:
            new_subscriptions: Complete list of desired subscriptions
        """
        with self._lock:
            # Index the incoming list by identifier (later duplicates win)
            incoming = {}
            for entry in new_subscriptions:
                incoming[entry.subscriptionIdentifier] = entry

            previous_ids = set(self.subscriptions.keys())
            incoming_ids = set(incoming.keys())

            # Report the delta for debugging
            added = incoming_ids - previous_ids
            if added:
                logger.info(f"[State Update] Adding subscriptions: {added}")

            removed = previous_ids - incoming_ids
            if removed:
                logger.info(f"[State Update] Removing subscriptions: {removed}")

            updated = previous_ids & incoming_ids
            if updated:
                logger.info(f"[State Update] Updating subscriptions: {updated}")

            # Swap in the new mapping wholesale, then refresh the derived list
            self.subscriptions = incoming
            self._update_camera_connections()

    def get_subscription(self, subscription_identifier: str) -> Optional[SubscriptionObject]:
        """Return the subscription stored under the identifier, or None."""
        with self._lock:
            found = self.subscriptions.get(subscription_identifier)
        return found

    def get_all_subscriptions(self) -> List[SubscriptionObject]:
        """Return a new list holding every active subscription."""
        with self._lock:
            return [*self.subscriptions.values()]

    def set_session_id(self, display_identifier: str, session_id: Optional[int]) -> None:
        """
        Store the session ID for a display, or remove it.

        Args:
            display_identifier: Display identifier
            session_id: Session ID to set, or None to clear
        """
        with self._lock:
            if session_id is not None:
                self.session_ids[display_identifier] = session_id
                logger.info(f"[State Update] Set session ID {session_id} for display {display_identifier}")
                return

            # None means "clear"; a missing entry is not an error
            self.session_ids.pop(display_identifier, None)
            logger.info(f"[State Update] Cleared session ID for display {display_identifier}")

    def get_session_id(self, display_identifier: str) -> Optional[int]:
        """Return the session ID recorded for the display, or None."""
        with self._lock:
            current = self.session_ids.get(display_identifier)
        return current

    def get_session_id_for_subscription(self, subscription_identifier: str) -> Optional[int]:
        """Return the session ID of the display extracted from a subscription identifier."""
        # Imported at call time, as in the rest of this method's history
        from .messages import extract_display_identifier

        display_id = extract_display_identifier(subscription_identifier)
        if not display_id:
            return None
        return self.get_session_id(display_id)

    def set_progression_stage(self, display_identifier: str, stage: Optional[str]) -> None:
        """
        Store the progression stage for a display, or remove it.

        Args:
            display_identifier: Display identifier
            stage: Progression stage to set, or None to clear
        """
        with self._lock:
            if stage is not None:
                self.progression_stages[display_identifier] = stage
                logger.info(f"[State Update] Set progression stage '{stage}' for display {display_identifier}")
                return

            # None means "clear"; a missing entry is not an error
            self.progression_stages.pop(display_identifier, None)
            logger.info(f"[State Update] Cleared progression stage for display {display_identifier}")

    def get_progression_stage(self, display_identifier: str) -> Optional[str]:
        """Return the progression stage recorded for the display, or None."""
        with self._lock:
            current = self.progression_stages.get(display_identifier)
        return current

    def _update_camera_connections(self) -> None:
        """Rebuild the camera connection list from the current subscriptions."""
        self.camera_connections = [
            CameraConnection(
                subscriptionIdentifier=sub.subscriptionIdentifier,
                modelId=sub.modelId,
                modelName=sub.modelName,
                online=True,  # TODO: Add actual online status tracking
                cropX1=sub.cropX1,
                cropY1=sub.cropY1,
                cropX2=sub.cropX2,
                cropY2=sub.cropY2
            )
            for sub in self.subscriptions.values()
        ]

    def get_camera_connections(self) -> List[CameraConnection]:
        """Return a shallow copy of the camera connection list."""
        with self._lock:
            snapshot = self.camera_connections.copy()
        return snapshot
class SystemMetrics:
    """System metrics collection for state reporting.

    Every method is a static, best-effort probe: failures are logged and
    turned into a neutral return value (0.0 or None) instead of raising.
    """

    @staticmethod
    def get_cpu_usage() -> float:
        """Get current CPU usage percentage.

        Returns:
            The value of ``psutil.cpu_percent`` sampled over a 0.1 second
            interval, or 0.0 if the probe fails.
        """
        try:
            return psutil.cpu_percent(interval=0.1)
        except Exception as e:
            logger.error(f"Failed to get CPU usage: {e}")
            return 0.0

    @staticmethod
    def get_memory_usage() -> float:
        """Get current memory usage percentage.

        Returns:
            ``psutil.virtual_memory().percent``, or 0.0 if the probe fails.
        """
        try:
            return psutil.virtual_memory().percent
        except Exception as e:
            logger.error(f"Failed to get memory usage: {e}")
            return 0.0

    @staticmethod
    def get_gpu_usage() -> Optional[float]:
        """Get current GPU usage percentage.

        Sources, in order of preference:
            1. pynvml utilization of the first GPU (index 0).
            2. ``torch.cuda.utilization()`` when PyTorch provides it and the
               call succeeds.
            3. A rough estimate: allocated / reserved CUDA memory in PyTorch.

        Returns:
            Usage percentage as a float, or None when no source is available
            or the probe fails.
        """
        try:
            # Prefer pynvml for accurate GPU utilization
            if PYNVML_AVAILABLE:
                handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # First GPU
                utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                return float(utilization.gpu)

            # Fallback to PyTorch
            if TORCH_AVAILABLE and torch.cuda.is_available():
                if hasattr(torch.cuda, 'utilization'):
                    try:
                        # torch.cuda.utilization() relies on NVML itself, so it
                        # can raise exactly when pynvml is unusable. Do not let
                        # that failure skip the memory-based estimate below.
                        return float(torch.cuda.utilization())
                    except Exception as util_err:
                        logger.debug(
                            f"torch.cuda.utilization() unavailable, "
                            f"estimating from memory usage: {util_err}"
                        )

                # Estimate based on memory usage
                allocated = torch.cuda.memory_allocated()
                reserved = torch.cuda.memory_reserved()
                if reserved > 0:
                    return (allocated / reserved) * 100

            return None
        except Exception as e:
            logger.error(f"Failed to get GPU usage: {e}")
            return None

    @staticmethod
    def get_gpu_memory_usage() -> Optional[float]:
        """Get current GPU memory usage in MB.

        Returns:
            ``torch.cuda.memory_reserved()`` converted from bytes to MB, or
            None when PyTorch or CUDA is unavailable or the probe fails.
        """
        if not TORCH_AVAILABLE:
            return None

        try:
            if torch.cuda.is_available():
                return torch.cuda.memory_reserved() / (1024 ** 2)  # Convert to MB
            return None
        except Exception as e:
            logger.error(f"Failed to get GPU memory usage: {e}")
            return None
# Global worker state instance.
# Created once at import time with the dataclass defaults: empty
# subscriptions, session IDs, progression stages and camera connections.
worker_state = WorkerState()