"""
Core health monitoring system for comprehensive stream and system health tracking.
Provides centralized health status, alerting, and recovery coordination.
"""

import time
import threading
import logging
import psutil
from typing import Dict, List, Optional, Any, Callable
from dataclasses import dataclass, field, fields
from enum import Enum
from collections import defaultdict, deque

logger = logging.getLogger(__name__)


class HealthStatus(Enum):
    """Health status levels."""
    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"
    UNKNOWN = "unknown"


# Relative severity used when combining statuses. UNKNOWN ranks with HEALTHY
# so that it never escalates an aggregated status.
_SEVERITY_RANK = {
    HealthStatus.HEALTHY: 0,
    HealthStatus.UNKNOWN: 0,
    HealthStatus.WARNING: 1,
    HealthStatus.CRITICAL: 2,
}


def _escalate(current: HealthStatus, candidate: HealthStatus) -> HealthStatus:
    """Return whichever of the two statuses is more severe (ties keep ``current``)."""
    if _SEVERITY_RANK[candidate] > _SEVERITY_RANK[current]:
        return candidate
    return current


def _callable_name(fn: Callable[..., Any]) -> str:
    """Return a printable name for any callable.

    ``functools.partial`` objects and callable instances have no ``__name__``,
    so fall back to ``repr`` instead of raising ``AttributeError``.
    """
    return getattr(fn, '__name__', None) or repr(fn)


@dataclass
class HealthCheck:
    """Individual health check result."""
    name: str
    status: HealthStatus
    message: str
    timestamp: float = field(default_factory=time.time)
    details: Dict[str, Any] = field(default_factory=dict)
    recovery_action: Optional[str] = None


@dataclass
class HealthMetrics:
    """Health metrics for a component."""
    component_id: str
    last_update: float
    frame_count: int = 0
    error_count: int = 0
    warning_count: int = 0
    restart_count: int = 0
    avg_frame_interval: float = 0.0
    last_frame_time: Optional[float] = None
    thread_alive: bool = True
    connection_healthy: bool = True
    memory_usage_mb: float = 0.0
    cpu_usage_percent: float = 0.0


# Names that update_metrics() is allowed to assign on a HealthMetrics instance.
_METRIC_FIELD_NAMES = frozenset(f.name for f in fields(HealthMetrics))


class HealthMonitor:
    """Comprehensive health monitoring system.

    Components push metrics through the ``report_*`` / ``update_metrics``
    methods; a background thread started by :meth:`start` periodically runs
    the registered health checkers, records alerts for non-healthy results and
    invokes recovery callbacks for critical ones. All access to the shared
    metric/alert stores is guarded by ``self._lock``.
    """

    def __init__(self, check_interval: float = 30.0):
        """
        Initialize health monitor.

        Args:
            check_interval: Interval between health checks in seconds
        """
        self.check_interval = check_interval
        self.running = False
        self.monitor_thread = None
        self._lock = threading.RLock()
        # Set by stop() so the monitor loop wakes up immediately instead of
        # finishing a full check_interval sleep. Replaced on every start().
        self._stop_event = threading.Event()

        # Health data storage
        self.health_checks: Dict[str, HealthCheck] = {}
        self.metrics: Dict[str, HealthMetrics] = {}
        self.alert_history: deque = deque(maxlen=1000)
        self.recovery_actions: deque = deque(maxlen=500)

        # Thresholds (configurable)
        # NOTE(review): only the frame_stale_*, error_rate_* and
        # restart_threshold entries are consulted by the checks in this file;
        # the memory/cpu/thread_unresponsive entries appear unused here -
        # confirm whether other modules read them.
        self.thresholds = {
            'frame_stale_warning_seconds': 120,   # 2 minutes
            'frame_stale_critical_seconds': 300,  # 5 minutes
            'thread_unresponsive_seconds': 60,    # 1 minute
            'memory_warning_mb': 500,             # 500MB per stream
            'memory_critical_mb': 1000,           # 1GB per stream
            'cpu_warning_percent': 80,            # 80% CPU
            'cpu_critical_percent': 95,           # 95% CPU
            'error_rate_warning': 0.1,            # 10% error rate
            'error_rate_critical': 0.3,           # 30% error rate
            'restart_threshold': 3                # Max restarts per hour
        }

        # Health check functions
        self.health_checkers: List[Callable[[], List[HealthCheck]]] = []
        self.recovery_callbacks: Dict[str, Callable[[str, HealthCheck], bool]] = {}

        # System monitoring
        self.process = psutil.Process()
        self.system_start_time = time.time()

    def start(self):
        """Start health monitoring in a daemon thread (no-op if already running)."""
        if self.running:
            logger.warning("Health monitor already running")
            return

        # A fresh event per run: a loop left over from a previous run keeps
        # its own (already set) event and therefore still terminates.
        self._stop_event = threading.Event()
        self.running = True
        self.monitor_thread = threading.Thread(
            target=self._monitor_loop,
            args=(self._stop_event,),
            daemon=True,
        )
        self.monitor_thread.start()
        logger.info(f"Health monitor started (check interval: {self.check_interval}s)")

    def stop(self):
        """Stop health monitoring and wait up to 5 seconds for the loop to exit."""
        self.running = False
        # Wake the loop if it is waiting between check cycles.
        self._stop_event.set()

        thread = self.monitor_thread
        # Joining the current thread raises RuntimeError, which would happen
        # if stop() were called from a checker or recovery callback.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=5.0)
        logger.info("Health monitor stopped")

    def register_health_checker(self, checker: Callable[[], List[HealthCheck]]):
        """Register a health check function.

        Args:
            checker: Zero-argument callable returning a list of HealthCheck
                results; it is invoked once per monitoring cycle.
        """
        self.health_checkers.append(checker)
        logger.debug(f"Registered health checker: {_callable_name(checker)}")

    def register_recovery_callback(self, component: str,
                                   callback: Callable[[str, HealthCheck], bool]):
        """Register a recovery callback for a component.

        Args:
            component: Name matched against ``HealthCheck.name``.
            callback: Called with ``(component, check)`` for critical checks
                that carry a recovery action; its return value is recorded
                as the recovery's success flag.
        """
        self.recovery_callbacks[component] = callback
        logger.debug(f"Registered recovery callback for {component}")

    def _get_or_create_metrics(self, component_id: str, now: float) -> HealthMetrics:
        """Return the metrics record for a component, creating it if missing.

        Must be called with ``self._lock`` held.
        """
        metrics = self.metrics.get(component_id)
        if metrics is None:
            metrics = HealthMetrics(component_id=component_id, last_update=now)
            self.metrics[component_id] = metrics
        return metrics

    def update_metrics(self, component_id: str, **kwargs):
        """Update metrics for a component.

        Args:
            component_id: Component whose record is updated (created on
                first use).
            **kwargs: ``HealthMetrics`` field names mapped to new values.
                Names that are not ``HealthMetrics`` fields are ignored.
        """
        with self._lock:
            now = time.time()
            metrics = self._get_or_create_metrics(component_id, now)
            metrics.last_update = now

            # Update provided metrics. Only real dataclass fields are
            # assignable, so a stray key cannot clobber a method or dunder.
            for key, value in kwargs.items():
                if key in _METRIC_FIELD_NAMES:
                    setattr(metrics, key, value)
                else:
                    logger.debug(f"Ignoring unknown metric '{key}' for {component_id}")

    def report_frame_received(self, component_id: str):
        """Report that a frame was received for a component."""
        current_time = time.time()

        with self._lock:
            metrics = self._get_or_create_metrics(component_id, current_time)

            # Update frame metrics
            if metrics.last_frame_time is not None:
                interval = current_time - metrics.last_frame_time
                # Exponential moving average of frame intervals, seeded with
                # the first observed interval.
                if metrics.avg_frame_interval == 0:
                    metrics.avg_frame_interval = interval
                else:
                    metrics.avg_frame_interval = (
                        (metrics.avg_frame_interval * 0.9) + (interval * 0.1)
                    )

            metrics.last_frame_time = current_time
            metrics.frame_count += 1
            metrics.last_update = current_time

    def report_error(self, component_id: str, error_type: str = "general"):
        """Report an error for a component."""
        with self._lock:
            now = time.time()
            metrics = self._get_or_create_metrics(component_id, now)
            metrics.error_count += 1
            metrics.last_update = time.time()

        logger.debug(f"Error reported for {component_id}: {error_type}")

    def report_warning(self, component_id: str, warning_type: str = "general"):
        """Report a warning for a component."""
        with self._lock:
            now = time.time()
            metrics = self._get_or_create_metrics(component_id, now)
            metrics.warning_count += 1
            metrics.last_update = time.time()

        logger.debug(f"Warning reported for {component_id}: {warning_type}")

    def report_restart(self, component_id: str):
        """Report that a component was restarted and log it as a recovery action."""
        with self._lock:
            now = time.time()
            metrics = self._get_or_create_metrics(component_id, now)
            metrics.restart_count += 1
            metrics.last_update = time.time()

            # Log recovery action
            self.recovery_actions.append({
                'timestamp': time.time(),
                'component': component_id,
                'action': 'restart',
                'reason': 'manual_restart'
            })

        logger.info(f"Restart reported for {component_id}")

    def get_health_status(self, component_id: Optional[str] = None) -> Dict[str, Any]:
        """Get comprehensive health status.

        Args:
            component_id: If given, return that component's health only;
                otherwise return the overall system summary.

        Returns:
            A dict describing the requested health status.
        """
        with self._lock:
            if component_id:
                # Get health for specific component
                return self._get_component_health(component_id)
            # Get overall health status
            return self._get_overall_health()

    def _get_component_health(self, component_id: str) -> Dict[str, Any]:
        """Get health status for a specific component.

        Returns a dict with ``component_id``, ``status``, ``message`` and
        ``metrics`` (plus ``last_update`` when metrics exist). Components
        without metrics are reported as UNKNOWN.
        """
        if component_id not in self.metrics:
            return {
                'component_id': component_id,
                'status': HealthStatus.UNKNOWN.value,
                'message': 'No metrics available',
                'metrics': {}
            }

        metrics = self.metrics[component_id]
        current_time = time.time()
        thresholds = self.thresholds

        # Determine health status
        status = HealthStatus.HEALTHY
        issues = []

        # Check frame freshness
        frame_age = None
        if metrics.last_frame_time is not None:
            frame_age = current_time - metrics.last_frame_time
            if frame_age > thresholds['frame_stale_critical_seconds']:
                status = _escalate(status, HealthStatus.CRITICAL)
                issues.append(f"Frames stale for {frame_age:.1f}s")
            elif frame_age > thresholds['frame_stale_warning_seconds']:
                status = _escalate(status, HealthStatus.WARNING)
                issues.append(f"Frames aging ({frame_age:.1f}s)")

        # Check error rates (errors per received frame; skipped until the
        # first frame arrives to avoid dividing by zero)
        if metrics.frame_count > 0:
            error_rate = metrics.error_count / metrics.frame_count
            if error_rate > thresholds['error_rate_critical']:
                status = _escalate(status, HealthStatus.CRITICAL)
                issues.append(f"High error rate ({error_rate:.1%})")
            elif error_rate > thresholds['error_rate_warning']:
                status = _escalate(status, HealthStatus.WARNING)
                issues.append(f"Elevated error rate ({error_rate:.1%})")

        # Check restart frequency. Uptime is clamped to at least one hour so
        # a couple of early restarts do not produce an inflated hourly rate.
        uptime_hours = (current_time - self.system_start_time) / 3600
        restart_rate = metrics.restart_count / max(1, uptime_hours)
        if restart_rate > thresholds['restart_threshold']:
            status = _escalate(status, HealthStatus.CRITICAL)
            issues.append(f"Frequent restarts ({restart_rate:.1f}/hour)")

        # Check thread health
        if not metrics.thread_alive:
            status = _escalate(status, HealthStatus.CRITICAL)
            issues.append("Thread not alive")

        # Check connection health
        if not metrics.connection_healthy:
            status = _escalate(status, HealthStatus.WARNING)
            issues.append("Connection unhealthy")

        return {
            'component_id': component_id,
            'status': status.value,
            'message': '; '.join(issues) if issues else 'All checks passing',
            'metrics': {
                'frame_count': metrics.frame_count,
                'error_count': metrics.error_count,
                'warning_count': metrics.warning_count,
                'restart_count': metrics.restart_count,
                'avg_frame_interval': metrics.avg_frame_interval,
                'last_frame_age': frame_age,
                'thread_alive': metrics.thread_alive,
                'connection_healthy': metrics.connection_healthy,
                'memory_usage_mb': metrics.memory_usage_mb,
                'cpu_usage_percent': metrics.cpu_usage_percent,
                'uptime_seconds': current_time - self.system_start_time
            },
            'last_update': metrics.last_update
        }

    def _get_overall_health(self) -> Dict[str, Any]:
        """Get overall system health status.

        The overall status is the most severe status among all components
        that have metrics; process memory/CPU and the ten most recent alerts
        and recovery actions are included as well.
        """
        current_time = time.time()
        components = {}
        overall_status = HealthStatus.HEALTHY

        # Get health for all components
        for component_id in self.metrics:
            component_health = self._get_component_health(component_id)
            components[component_id] = component_health

            # Determine overall status
            overall_status = _escalate(
                overall_status, HealthStatus(component_health['status'])
            )

        # System metrics (best effort: fall back to zeros if psutil fails)
        try:
            system_memory = self.process.memory_info()
            system_cpu = self.process.cpu_percent()
        except Exception as e:
            logger.debug(f"Could not read process metrics: {e}")
            system_memory = None
            system_cpu = 0.0

        return {
            'overall_status': overall_status.value,
            'timestamp': current_time,
            'uptime_seconds': current_time - self.system_start_time,
            'total_components': len(self.metrics),
            'components': components,
            'system_metrics': {
                'memory_mb': system_memory.rss / (1024 * 1024) if system_memory else 0,
                'cpu_percent': system_cpu,
                'process_id': self.process.pid
            },
            'recent_alerts': list(self.alert_history)[-10:],        # Last 10 alerts
            'recent_recoveries': list(self.recovery_actions)[-10:]  # Last 10 recovery actions
        }

    def _monitor_loop(self, stop_event: Optional[threading.Event] = None):
        """Main health monitoring loop.

        Args:
            stop_event: Event that ends this loop when set. Defaults to the
                monitor's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        logger.info("Health monitor loop started")

        while self.running and not stop_event.is_set():
            try:
                cycle_started = time.monotonic()

                # Run all registered health checks. Iterate over a snapshot
                # so checkers may be registered while a cycle is running.
                all_checks = []
                for checker in list(self.health_checkers):
                    try:
                        checks = checker()
                        # Tolerate a checker that returns None for "nothing to report".
                        all_checks.extend(checks or ())
                    except Exception as e:
                        logger.error(
                            f"Error in health checker {_callable_name(checker)}: {e}"
                        )

                # Process health checks and trigger recovery if needed
                for check in all_checks:
                    self._process_health_check(check)

                # Update system metrics
                self._update_system_metrics()

                # Wait until the next check; returns early when stop() is called
                elapsed = time.monotonic() - cycle_started
                stop_event.wait(max(0, self.check_interval - elapsed))

            except Exception as e:
                logger.error(f"Error in health monitor loop: {e}")
                stop_event.wait(5.0)  # Fallback sleep

        logger.info("Health monitor loop ended")

    def _process_health_check(self, check: HealthCheck):
        """Process a health check result and trigger recovery if needed.

        The result is stored under ``check.name``; any non-healthy status is
        appended to the alert history and logged. Critical results that name
        a recovery action are passed to :meth:`_trigger_recovery`.
        """
        with self._lock:
            # Store health check
            self.health_checks[check.name] = check

            # Log alerts for non-healthy status
            if check.status != HealthStatus.HEALTHY:
                self.alert_history.append({
                    'timestamp': check.timestamp,
                    'component': check.name,
                    'status': check.status.value,
                    'message': check.message,
                    'details': check.details
                })
                logger.warning(
                    f"Health alert [{check.status.value.upper()}] {check.name}: {check.message}"
                )

        # Trigger recovery if critical and recovery action available. This
        # runs after the lock is released: a callback may take a long time or
        # wait on worker threads that themselves call report_*(), and holding
        # the lock here would stall or deadlock them.
        if check.status == HealthStatus.CRITICAL and check.recovery_action:
            self._trigger_recovery(check.name, check)

    def _trigger_recovery(self, component: str, check: HealthCheck):
        """Trigger recovery action for a component.

        Does nothing when no callback is registered for ``component``.
        Every attempt is appended to ``recovery_actions``, including ones
        where the callback raised (recorded with ``success`` False).
        """
        callback = self.recovery_callbacks.get(component)
        if callback is None:
            return

        logger.info(f"Triggering recovery for {component}: {check.recovery_action}")
        recovery_action = {
            'timestamp': time.time(),
            'component': component,
            'action': check.recovery_action,
            'reason': check.message,
        }

        try:
            success = callback(component, check)
        except Exception as e:
            logger.error(f"Error in recovery callback for {component}: {e}")
            recovery_action['success'] = False
            recovery_action['error'] = str(e)
            with self._lock:
                self.recovery_actions.append(recovery_action)
            return

        recovery_action['success'] = success
        with self._lock:
            self.recovery_actions.append(recovery_action)

        if success:
            logger.info(f"Recovery successful for {component}")
        else:
            logger.error(f"Recovery failed for {component}")

    def _update_system_metrics(self):
        """Update system-level metrics.

        This is a simplified approach: process-wide CPU and memory usage are
        split evenly across all known components rather than tracked per
        thread or per component.
        """
        try:
            if not self.metrics:
                return

            # Sample once per cycle. cpu_percent() with no interval reports
            # usage since the previous call, so calling it once per component
            # would yield a real value for the first component only.
            try:
                process_cpu = self.process.cpu_percent()
                process_rss_mb = self.process.memory_info().rss / (1024 * 1024)
            except Exception as e:
                logger.debug(f"Could not sample process metrics: {e}")
                return

            with self._lock:
                component_count = len(self.metrics)
                if component_count == 0:
                    return
                cpu_share = process_cpu / component_count
                memory_share = process_rss_mb / component_count
                for metrics in self.metrics.values():
                    metrics.cpu_usage_percent = cpu_share
                    metrics.memory_usage_mb = memory_share

        except Exception as e:
            logger.error(f"Error updating system metrics: {e}")


# Global health monitor instance
health_monitor = HealthMonitor()