Implement comprehensive health monitoring for streams and threads
Some checks failed
Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
Build Worker Base and Application Images / deploy-stack (push) Has been cancelled
- Added RecoveryManager for automatic handling of health issues, including circuit breaker patterns, automatic restarts, and graceful degradation.
- Introduced StreamHealthTracker to monitor video stream metrics, including frame production, connection health, and error rates.
- Developed ThreadHealthMonitor for detecting unresponsive and deadlocked threads, providing liveness detection and responsiveness testing.
- Integrated health checks for streams and threads, reporting metrics and recovery actions to the health monitor.
- Enhanced logging for recovery attempts, errors, and health checks to improve observability and debugging.

A short usage sketch showing how these components fit together follows the changed-files summary below.
Parent: 8c08c815ce
Commit: b08ce27de2
9 changed files with 2173 additions and 11 deletions
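Not part of the diff: a minimal wiring sketch of how the new monitoring pieces are intended to cooperate, based only on the modules added below. It assumes the repository root is on the Python path (so core.monitoring is importable) and that psutil and requests are installed; the camera ID and the restart handler are placeholders.

    from core.monitoring.health import health_monitor
    from core.monitoring.stream_health import stream_health_tracker
    from core.monitoring.recovery import recovery_manager, RecoveryAction

    def restart_stream(component: str, details: dict) -> bool:
        # Application-specific restart logic would go here; returning True
        # tells the RecoveryManager that the recovery action succeeded.
        return True

    # Handlers are registered per RecoveryAction; RecoveryManager registers
    # itself with the global health monitor for stream/thread/buffer recovery.
    recovery_manager.register_recovery_handler(RecoveryAction.RESTART_STREAM, restart_stream)

    # Streams register themselves, then report frames and errors as they run.
    health_monitor.start()
    stream_health_tracker.register_stream("camera-01", "rtsp_ffmpeg")
    stream_health_tracker.report_frame_received("camera-01", frame_size_bytes=65536)

    # Aggregated status (overall plus per-component) for dashboards or an API.
    status = health_monitor.get_health_status()
    print(status["overall_status"])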
core/monitoring/__init__.py (new file, 18 lines added)
@@ -0,0 +1,18 @@
"""
Comprehensive health monitoring system for detector worker.
Tracks stream health, thread responsiveness, and system performance.
"""

from .health import HealthMonitor, HealthStatus, HealthCheck
from .stream_health import StreamHealthTracker
from .thread_health import ThreadHealthMonitor
from .recovery import RecoveryManager

__all__ = [
    'HealthMonitor',
    'HealthStatus',
    'HealthCheck',
    'StreamHealthTracker',
    'ThreadHealthMonitor',
    'RecoveryManager'
]
core/monitoring/health.py (new file, 456 lines added)
@@ -0,0 +1,456 @@
"""
Core health monitoring system for comprehensive stream and system health tracking.
Provides centralized health status, alerting, and recovery coordination.
"""
import time
import threading
import logging
import psutil
from typing import Dict, List, Optional, Any, Callable
from dataclasses import dataclass, field
from enum import Enum
from collections import defaultdict, deque


logger = logging.getLogger(__name__)


class HealthStatus(Enum):
    """Health status levels."""
    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"
    UNKNOWN = "unknown"


@dataclass
class HealthCheck:
    """Individual health check result."""
    name: str
    status: HealthStatus
    message: str
    timestamp: float = field(default_factory=time.time)
    details: Dict[str, Any] = field(default_factory=dict)
    recovery_action: Optional[str] = None


@dataclass
class HealthMetrics:
    """Health metrics for a component."""
    component_id: str
    last_update: float
    frame_count: int = 0
    error_count: int = 0
    warning_count: int = 0
    restart_count: int = 0
    avg_frame_interval: float = 0.0
    last_frame_time: Optional[float] = None
    thread_alive: bool = True
    connection_healthy: bool = True
    memory_usage_mb: float = 0.0
    cpu_usage_percent: float = 0.0


class HealthMonitor:
    """Comprehensive health monitoring system."""

    def __init__(self, check_interval: float = 30.0):
        """
        Initialize health monitor.

        Args:
            check_interval: Interval between health checks in seconds
        """
        self.check_interval = check_interval
        self.running = False
        self.monitor_thread = None
        self._lock = threading.RLock()

        # Health data storage
        self.health_checks: Dict[str, HealthCheck] = {}
        self.metrics: Dict[str, HealthMetrics] = {}
        self.alert_history: deque = deque(maxlen=1000)
        self.recovery_actions: deque = deque(maxlen=500)

        # Thresholds (configurable)
        self.thresholds = {
            'frame_stale_warning_seconds': 120,   # 2 minutes
            'frame_stale_critical_seconds': 300,  # 5 minutes
            'thread_unresponsive_seconds': 60,    # 1 minute
            'memory_warning_mb': 500,             # 500MB per stream
            'memory_critical_mb': 1000,           # 1GB per stream
            'cpu_warning_percent': 80,            # 80% CPU
            'cpu_critical_percent': 95,           # 95% CPU
            'error_rate_warning': 0.1,            # 10% error rate
            'error_rate_critical': 0.3,           # 30% error rate
            'restart_threshold': 3                # Max restarts per hour
        }

        # Health check functions
        self.health_checkers: List[Callable[[], List[HealthCheck]]] = []
        self.recovery_callbacks: Dict[str, Callable[[str, HealthCheck], bool]] = {}

        # System monitoring
        self.process = psutil.Process()
        self.system_start_time = time.time()

    def start(self):
        """Start health monitoring."""
        if self.running:
            logger.warning("Health monitor already running")
            return

        self.running = True
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitor_thread.start()
        logger.info(f"Health monitor started (check interval: {self.check_interval}s)")

    def stop(self):
        """Stop health monitoring."""
        self.running = False
        if self.monitor_thread:
            self.monitor_thread.join(timeout=5.0)
        logger.info("Health monitor stopped")

    def register_health_checker(self, checker: Callable[[], List[HealthCheck]]):
        """Register a health check function."""
        self.health_checkers.append(checker)
        logger.debug(f"Registered health checker: {checker.__name__}")

    def register_recovery_callback(self, component: str, callback: Callable[[str, HealthCheck], bool]):
        """Register a recovery callback for a component."""
        self.recovery_callbacks[component] = callback
        logger.debug(f"Registered recovery callback for {component}")

    def update_metrics(self, component_id: str, **kwargs):
        """Update metrics for a component."""
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=time.time()
                )

            metrics = self.metrics[component_id]
            metrics.last_update = time.time()

            # Update provided metrics
            for key, value in kwargs.items():
                if hasattr(metrics, key):
                    setattr(metrics, key, value)

    def report_frame_received(self, component_id: str):
        """Report that a frame was received for a component."""
        current_time = time.time()
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=current_time
                )

            metrics = self.metrics[component_id]

            # Update frame metrics
            if metrics.last_frame_time:
                interval = current_time - metrics.last_frame_time
                # Moving average of frame intervals
                if metrics.avg_frame_interval == 0:
                    metrics.avg_frame_interval = interval
                else:
                    metrics.avg_frame_interval = (metrics.avg_frame_interval * 0.9) + (interval * 0.1)

            metrics.last_frame_time = current_time
            metrics.frame_count += 1
            metrics.last_update = current_time

    def report_error(self, component_id: str, error_type: str = "general"):
        """Report an error for a component."""
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=time.time()
                )

            self.metrics[component_id].error_count += 1
            self.metrics[component_id].last_update = time.time()

        logger.debug(f"Error reported for {component_id}: {error_type}")

    def report_warning(self, component_id: str, warning_type: str = "general"):
        """Report a warning for a component."""
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=time.time()
                )

            self.metrics[component_id].warning_count += 1
            self.metrics[component_id].last_update = time.time()

        logger.debug(f"Warning reported for {component_id}: {warning_type}")

    def report_restart(self, component_id: str):
        """Report that a component was restarted."""
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=time.time()
                )

            self.metrics[component_id].restart_count += 1
            self.metrics[component_id].last_update = time.time()

        # Log recovery action
        recovery_action = {
            'timestamp': time.time(),
            'component': component_id,
            'action': 'restart',
            'reason': 'manual_restart'
        }

        with self._lock:
            self.recovery_actions.append(recovery_action)

        logger.info(f"Restart reported for {component_id}")

    def get_health_status(self, component_id: Optional[str] = None) -> Dict[str, Any]:
        """Get comprehensive health status."""
        with self._lock:
            if component_id:
                # Get health for specific component
                return self._get_component_health(component_id)
            else:
                # Get overall health status
                return self._get_overall_health()

    def _get_component_health(self, component_id: str) -> Dict[str, Any]:
        """Get health status for a specific component."""
        if component_id not in self.metrics:
            return {
                'component_id': component_id,
                'status': HealthStatus.UNKNOWN.value,
                'message': 'No metrics available',
                'metrics': {}
            }

        metrics = self.metrics[component_id]
        current_time = time.time()

        # Determine health status
        status = HealthStatus.HEALTHY
        issues = []

        # Check frame freshness
        if metrics.last_frame_time:
            frame_age = current_time - metrics.last_frame_time
            if frame_age > self.thresholds['frame_stale_critical_seconds']:
                status = HealthStatus.CRITICAL
                issues.append(f"Frames stale for {frame_age:.1f}s")
            elif frame_age > self.thresholds['frame_stale_warning_seconds']:
                if status == HealthStatus.HEALTHY:
                    status = HealthStatus.WARNING
                issues.append(f"Frames aging ({frame_age:.1f}s)")

        # Check error rates
        if metrics.frame_count > 0:
            error_rate = metrics.error_count / metrics.frame_count
            if error_rate > self.thresholds['error_rate_critical']:
                status = HealthStatus.CRITICAL
                issues.append(f"High error rate ({error_rate:.1%})")
            elif error_rate > self.thresholds['error_rate_warning']:
                if status == HealthStatus.HEALTHY:
                    status = HealthStatus.WARNING
                issues.append(f"Elevated error rate ({error_rate:.1%})")

        # Check restart frequency
        restart_rate = metrics.restart_count / max(1, (current_time - self.system_start_time) / 3600)
        if restart_rate > self.thresholds['restart_threshold']:
            status = HealthStatus.CRITICAL
            issues.append(f"Frequent restarts ({restart_rate:.1f}/hour)")

        # Check thread health
        if not metrics.thread_alive:
            status = HealthStatus.CRITICAL
            issues.append("Thread not alive")

        # Check connection health
        if not metrics.connection_healthy:
            if status == HealthStatus.HEALTHY:
                status = HealthStatus.WARNING
            issues.append("Connection unhealthy")

        return {
            'component_id': component_id,
            'status': status.value,
            'message': '; '.join(issues) if issues else 'All checks passing',
            'metrics': {
                'frame_count': metrics.frame_count,
                'error_count': metrics.error_count,
                'warning_count': metrics.warning_count,
                'restart_count': metrics.restart_count,
                'avg_frame_interval': metrics.avg_frame_interval,
                'last_frame_age': current_time - metrics.last_frame_time if metrics.last_frame_time else None,
                'thread_alive': metrics.thread_alive,
                'connection_healthy': metrics.connection_healthy,
                'memory_usage_mb': metrics.memory_usage_mb,
                'cpu_usage_percent': metrics.cpu_usage_percent,
                'uptime_seconds': current_time - self.system_start_time
            },
            'last_update': metrics.last_update
        }

    def _get_overall_health(self) -> Dict[str, Any]:
        """Get overall system health status."""
        current_time = time.time()
        components = {}
        overall_status = HealthStatus.HEALTHY

        # Get health for all components
        for component_id in self.metrics.keys():
            component_health = self._get_component_health(component_id)
            components[component_id] = component_health

            # Determine overall status
            component_status = HealthStatus(component_health['status'])
            if component_status == HealthStatus.CRITICAL:
                overall_status = HealthStatus.CRITICAL
            elif component_status == HealthStatus.WARNING and overall_status == HealthStatus.HEALTHY:
                overall_status = HealthStatus.WARNING

        # System metrics
        try:
            system_memory = self.process.memory_info()
            system_cpu = self.process.cpu_percent()
        except Exception:
            system_memory = None
            system_cpu = 0.0

        return {
            'overall_status': overall_status.value,
            'timestamp': current_time,
            'uptime_seconds': current_time - self.system_start_time,
            'total_components': len(self.metrics),
            'components': components,
            'system_metrics': {
                'memory_mb': system_memory.rss / (1024 * 1024) if system_memory else 0,
                'cpu_percent': system_cpu,
                'process_id': self.process.pid
            },
            'recent_alerts': list(self.alert_history)[-10:],        # Last 10 alerts
            'recent_recoveries': list(self.recovery_actions)[-10:]  # Last 10 recovery actions
        }

    def _monitor_loop(self):
        """Main health monitoring loop."""
        logger.info("Health monitor loop started")

        while self.running:
            try:
                start_time = time.time()

                # Run all registered health checks
                all_checks = []
                for checker in self.health_checkers:
                    try:
                        checks = checker()
                        all_checks.extend(checks)
                    except Exception as e:
                        logger.error(f"Error in health checker {checker.__name__}: {e}")

                # Process health checks and trigger recovery if needed
                for check in all_checks:
                    self._process_health_check(check)

                # Update system metrics
                self._update_system_metrics()

                # Sleep until next check
                elapsed = time.time() - start_time
                sleep_time = max(0, self.check_interval - elapsed)
                if sleep_time > 0:
                    time.sleep(sleep_time)

            except Exception as e:
                logger.error(f"Error in health monitor loop: {e}")
                time.sleep(5.0)  # Fallback sleep

        logger.info("Health monitor loop ended")

    def _process_health_check(self, check: HealthCheck):
        """Process a health check result and trigger recovery if needed."""
        with self._lock:
            # Store health check
            self.health_checks[check.name] = check

            # Log alerts for non-healthy status
            if check.status != HealthStatus.HEALTHY:
                alert = {
                    'timestamp': check.timestamp,
                    'component': check.name,
                    'status': check.status.value,
                    'message': check.message,
                    'details': check.details
                }
                self.alert_history.append(alert)

                logger.warning(f"Health alert [{check.status.value.upper()}] {check.name}: {check.message}")

                # Trigger recovery if critical and recovery action available
                if check.status == HealthStatus.CRITICAL and check.recovery_action:
                    self._trigger_recovery(check.name, check)

    def _trigger_recovery(self, component: str, check: HealthCheck):
        """Trigger recovery action for a component."""
        if component in self.recovery_callbacks:
            try:
                logger.info(f"Triggering recovery for {component}: {check.recovery_action}")

                success = self.recovery_callbacks[component](component, check)

                recovery_action = {
                    'timestamp': time.time(),
                    'component': component,
                    'action': check.recovery_action,
                    'reason': check.message,
                    'success': success
                }

                with self._lock:
                    self.recovery_actions.append(recovery_action)

                if success:
                    logger.info(f"Recovery successful for {component}")
                else:
                    logger.error(f"Recovery failed for {component}")

            except Exception as e:
                logger.error(f"Error in recovery callback for {component}: {e}")

    def _update_system_metrics(self):
        """Update system-level metrics."""
        try:
            # Update process metrics for all components
            current_time = time.time()

            with self._lock:
                for component_id, metrics in self.metrics.items():
                    # Update CPU and memory if available
                    try:
                        # This is a simplified approach - in practice you'd want
                        # per-thread or per-component resource tracking
                        metrics.cpu_usage_percent = self.process.cpu_percent() / len(self.metrics)
                        memory_info = self.process.memory_info()
                        metrics.memory_usage_mb = memory_info.rss / (1024 * 1024) / len(self.metrics)
                    except Exception:
                        pass

        except Exception as e:
            logger.error(f"Error updating system metrics: {e}")


# Global health monitor instance
health_monitor = HealthMonitor()
core/monitoring/recovery.py (new file, 385 lines added)
@@ -0,0 +1,385 @@
"""
Recovery manager for automatic handling of health issues.
Provides circuit breaker patterns, automatic restarts, and graceful degradation.
"""
import time
import logging
import threading
from typing import Dict, List, Optional, Any, Callable
from dataclasses import dataclass
from enum import Enum
from collections import defaultdict, deque

from .health import HealthCheck, HealthStatus, health_monitor


logger = logging.getLogger(__name__)


class RecoveryAction(Enum):
    """Types of recovery actions."""
    RESTART_STREAM = "restart_stream"
    RESTART_THREAD = "restart_thread"
    CLEAR_BUFFER = "clear_buffer"
    RECONNECT = "reconnect"
    THROTTLE = "throttle"
    DISABLE = "disable"


@dataclass
class RecoveryAttempt:
    """Record of a recovery attempt."""
    timestamp: float
    component: str
    action: RecoveryAction
    reason: str
    success: bool
    details: Dict[str, Any] = None


@dataclass
class RecoveryState:
    """Recovery state for a component - simplified without circuit breaker."""
    failure_count: int = 0
    success_count: int = 0
    last_failure_time: Optional[float] = None
    last_success_time: Optional[float] = None


class RecoveryManager:
    """Manages automatic recovery actions for health issues."""

    def __init__(self):
        self.recovery_handlers: Dict[str, Callable[[str, HealthCheck], bool]] = {}
        self.recovery_states: Dict[str, RecoveryState] = {}
        self.recovery_history: deque = deque(maxlen=1000)
        self._lock = threading.RLock()

        # Configuration - simplified without circuit breaker
        self.recovery_cooldown = 30      # 30 seconds between recovery attempts
        self.max_attempts_per_hour = 20  # Still limit to prevent spam, but much higher

        # Track recovery attempts per component
        self.recovery_attempts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=50))

        # Register with health monitor
        health_monitor.register_recovery_callback("stream", self._handle_stream_recovery)
        health_monitor.register_recovery_callback("thread", self._handle_thread_recovery)
        health_monitor.register_recovery_callback("buffer", self._handle_buffer_recovery)

    def register_recovery_handler(self, action: RecoveryAction, handler: Callable[[str, Dict[str, Any]], bool]):
        """
        Register a recovery handler for a specific action.

        Args:
            action: Type of recovery action
            handler: Function that performs the recovery
        """
        self.recovery_handlers[action.value] = handler
        logger.info(f"Registered recovery handler for {action.value}")

    def can_attempt_recovery(self, component: str) -> bool:
        """
        Check if recovery can be attempted for a component.

        Args:
            component: Component identifier

        Returns:
            True if recovery can be attempted (always allow with minimal throttling)
        """
        with self._lock:
            current_time = time.time()

            # Check recovery attempt rate limiting (much more permissive)
            recent_attempts = [
                attempt for attempt in self.recovery_attempts[component]
                if current_time - attempt <= 3600  # Last hour
            ]

            # Only block if truly excessive attempts
            if len(recent_attempts) >= self.max_attempts_per_hour:
                logger.warning(f"Recovery rate limit exceeded for {component} "
                               f"({len(recent_attempts)} attempts in last hour)")
                return False

            # Check cooldown period (shorter cooldown)
            if recent_attempts:
                last_attempt = max(recent_attempts)
                if current_time - last_attempt < self.recovery_cooldown:
                    logger.debug(f"Recovery cooldown active for {component} "
                                 f"(last attempt {current_time - last_attempt:.1f}s ago)")
                    return False

            return True

    def attempt_recovery(self, component: str, action: RecoveryAction, reason: str,
                         details: Optional[Dict[str, Any]] = None) -> bool:
        """
        Attempt recovery for a component.

        Args:
            component: Component identifier
            action: Recovery action to perform
            reason: Reason for recovery
            details: Additional details

        Returns:
            True if recovery was successful
        """
        if not self.can_attempt_recovery(component):
            return False

        current_time = time.time()

        logger.info(f"Attempting recovery for {component}: {action.value} ({reason})")

        try:
            # Record recovery attempt
            with self._lock:
                self.recovery_attempts[component].append(current_time)

            # Perform recovery action
            success = self._execute_recovery_action(component, action, details or {})

            # Record recovery result
            attempt = RecoveryAttempt(
                timestamp=current_time,
                component=component,
                action=action,
                reason=reason,
                success=success,
                details=details
            )

            with self._lock:
                self.recovery_history.append(attempt)

            # Update recovery state
            self._update_recovery_state(component, success)

            if success:
                logger.info(f"Recovery successful for {component}: {action.value}")
            else:
                logger.error(f"Recovery failed for {component}: {action.value}")

            return success

        except Exception as e:
            logger.error(f"Error during recovery for {component}: {e}")
            self._update_recovery_state(component, False)
            return False

    def _execute_recovery_action(self, component: str, action: RecoveryAction,
                                 details: Dict[str, Any]) -> bool:
        """Execute a specific recovery action."""
        handler_key = action.value

        if handler_key not in self.recovery_handlers:
            logger.error(f"No recovery handler registered for action: {handler_key}")
            return False

        try:
            handler = self.recovery_handlers[handler_key]
            return handler(component, details)

        except Exception as e:
            logger.error(f"Error executing recovery action {handler_key} for {component}: {e}")
            return False

    def _update_recovery_state(self, component: str, success: bool):
        """Update recovery state based on recovery result."""
        current_time = time.time()

        with self._lock:
            if component not in self.recovery_states:
                self.recovery_states[component] = RecoveryState()

            state = self.recovery_states[component]

            if success:
                state.success_count += 1
                state.last_success_time = current_time
                # Reset failure count on success
                state.failure_count = max(0, state.failure_count - 1)
                logger.debug(f"Recovery success for {component} (total successes: {state.success_count})")
            else:
                state.failure_count += 1
                state.last_failure_time = current_time
                logger.debug(f"Recovery failure for {component} (total failures: {state.failure_count})")

    def _handle_stream_recovery(self, component: str, health_check: HealthCheck) -> bool:
        """Handle recovery for stream-related issues."""
        if "frames" in health_check.name:
            # Frame-related issue - restart stream
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_STREAM,
                health_check.message,
                health_check.details
            )
        elif "connection" in health_check.name:
            # Connection issue - reconnect
            return self.attempt_recovery(
                component,
                RecoveryAction.RECONNECT,
                health_check.message,
                health_check.details
            )
        elif "errors" in health_check.name:
            # High error rate - throttle or restart
            return self.attempt_recovery(
                component,
                RecoveryAction.THROTTLE,
                health_check.message,
                health_check.details
            )
        else:
            # Generic stream issue - restart
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_STREAM,
                health_check.message,
                health_check.details
            )

    def _handle_thread_recovery(self, component: str, health_check: HealthCheck) -> bool:
        """Handle recovery for thread-related issues."""
        if "deadlock" in health_check.name:
            # Deadlock detected - restart thread
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_THREAD,
                health_check.message,
                health_check.details
            )
        elif "responsive" in health_check.name:
            # Thread unresponsive - restart
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_THREAD,
                health_check.message,
                health_check.details
            )
        else:
            # Generic thread issue - restart
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_THREAD,
                health_check.message,
                health_check.details
            )

    def _handle_buffer_recovery(self, component: str, health_check: HealthCheck) -> bool:
        """Handle recovery for buffer-related issues."""
        # Buffer issues - clear buffer
        return self.attempt_recovery(
            component,
            RecoveryAction.CLEAR_BUFFER,
            health_check.message,
            health_check.details
        )

    def get_recovery_stats(self) -> Dict[str, Any]:
        """Get recovery statistics."""
        current_time = time.time()

        with self._lock:
            # Calculate stats from history
            recent_recoveries = [
                attempt for attempt in self.recovery_history
                if current_time - attempt.timestamp <= 3600  # Last hour
            ]

            stats_by_component = defaultdict(lambda: {
                'attempts': 0,
                'successes': 0,
                'failures': 0,
                'last_attempt': None,
                'last_success': None
            })

            for attempt in recent_recoveries:
                stats = stats_by_component[attempt.component]
                stats['attempts'] += 1

                if attempt.success:
                    stats['successes'] += 1
                    if not stats['last_success'] or attempt.timestamp > stats['last_success']:
                        stats['last_success'] = attempt.timestamp
                else:
                    stats['failures'] += 1

                if not stats['last_attempt'] or attempt.timestamp > stats['last_attempt']:
                    stats['last_attempt'] = attempt.timestamp

            return {
                'total_recoveries_last_hour': len(recent_recoveries),
                'recovery_by_component': dict(stats_by_component),
                'recovery_states': {
                    component: {
                        'failure_count': state.failure_count,
                        'success_count': state.success_count,
                        'last_failure_time': state.last_failure_time,
                        'last_success_time': state.last_success_time
                    }
                    for component, state in self.recovery_states.items()
                },
                'recent_history': [
                    {
                        'timestamp': attempt.timestamp,
                        'component': attempt.component,
                        'action': attempt.action.value,
                        'reason': attempt.reason,
                        'success': attempt.success
                    }
                    for attempt in list(self.recovery_history)[-10:]  # Last 10 attempts
                ]
            }

    def force_recovery(self, component: str, action: RecoveryAction, reason: str = "manual") -> bool:
        """
        Force recovery for a component, bypassing rate limiting.

        Args:
            component: Component identifier
            action: Recovery action to perform
            reason: Reason for forced recovery

        Returns:
            True if recovery was successful
        """
        logger.info(f"Forcing recovery for {component}: {action.value} ({reason})")

        current_time = time.time()

        try:
            # Execute recovery action directly
            success = self._execute_recovery_action(component, action, {})

            # Record forced recovery
            attempt = RecoveryAttempt(
                timestamp=current_time,
                component=component,
                action=action,
                reason=f"forced: {reason}",
                success=success,
                details={'forced': True}
            )

            with self._lock:
                self.recovery_history.append(attempt)
                self.recovery_attempts[component].append(current_time)

            # Update recovery state
            self._update_recovery_state(component, success)

            return success

        except Exception as e:
            logger.error(f"Error during forced recovery for {component}: {e}")
            return False


# Global recovery manager instance
recovery_manager = RecoveryManager()
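Not part of the diff: an illustration of how a caller might bypass the rate limiting and inspect recovery history using the RecoveryManager API defined above. The component name is a placeholder, and a handler for the chosen action must already be registered for the forced action to succeed.

    from core.monitoring.recovery import recovery_manager, RecoveryAction

    # Force an immediate reconnect, skipping the cooldown and rate-limit checks.
    recovery_manager.force_recovery("camera-01", RecoveryAction.RECONNECT, reason="operator request")

    # Inspect the last hour of recovery activity.
    stats = recovery_manager.get_recovery_stats()
    print(stats["total_recoveries_last_hour"], stats["recent_history"])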
core/monitoring/stream_health.py (new file, 351 lines added)
@@ -0,0 +1,351 @@
"""
Stream-specific health monitoring for video streams.
Tracks frame production, connection health, and stream-specific metrics.
"""
import time
import logging
import threading
import requests
from typing import Dict, Optional, List, Any
from collections import deque
from dataclasses import dataclass

from .health import HealthCheck, HealthStatus, health_monitor


logger = logging.getLogger(__name__)


@dataclass
class StreamMetrics:
    """Metrics for an individual stream."""
    camera_id: str
    stream_type: str  # 'rtsp', 'http_snapshot'
    start_time: float
    last_frame_time: Optional[float] = None
    frame_count: int = 0
    error_count: int = 0
    reconnect_count: int = 0
    bytes_received: int = 0
    frames_per_second: float = 0.0
    connection_attempts: int = 0
    last_connection_test: Optional[float] = None
    connection_healthy: bool = True
    last_error: Optional[str] = None
    last_error_time: Optional[float] = None


class StreamHealthTracker:
    """Tracks health for individual video streams."""

    def __init__(self):
        self.streams: Dict[str, StreamMetrics] = {}
        self._lock = threading.RLock()

        # Configuration
        self.connection_test_interval = 300  # Test connection every 5 minutes
        self.frame_timeout_warning = 120     # Warn if no frames for 2 minutes
        self.frame_timeout_critical = 300    # Critical if no frames for 5 minutes
        self.error_rate_threshold = 0.1      # 10% error rate threshold

        # Register with health monitor
        health_monitor.register_health_checker(self._perform_health_checks)

    def register_stream(self, camera_id: str, stream_type: str, source_url: Optional[str] = None):
        """Register a new stream for monitoring."""
        with self._lock:
            if camera_id not in self.streams:
                self.streams[camera_id] = StreamMetrics(
                    camera_id=camera_id,
                    stream_type=stream_type,
                    start_time=time.time()
                )
                logger.info(f"Registered stream for monitoring: {camera_id} ({stream_type})")

            # Update health monitor metrics
            health_monitor.update_metrics(
                camera_id,
                thread_alive=True,
                connection_healthy=True
            )

    def unregister_stream(self, camera_id: str):
        """Unregister a stream from monitoring."""
        with self._lock:
            if camera_id in self.streams:
                del self.streams[camera_id]
                logger.info(f"Unregistered stream from monitoring: {camera_id}")

    def report_frame_received(self, camera_id: str, frame_size_bytes: int = 0):
        """Report that a frame was received."""
        current_time = time.time()

        with self._lock:
            if camera_id not in self.streams:
                logger.warning(f"Frame received for unregistered stream: {camera_id}")
                return

            stream = self.streams[camera_id]

            # Update frame metrics
            if stream.last_frame_time:
                interval = current_time - stream.last_frame_time
                # Calculate FPS as moving average
                if stream.frames_per_second == 0:
                    stream.frames_per_second = 1.0 / interval if interval > 0 else 0
                else:
                    new_fps = 1.0 / interval if interval > 0 else 0
                    stream.frames_per_second = (stream.frames_per_second * 0.9) + (new_fps * 0.1)

            stream.last_frame_time = current_time
            stream.frame_count += 1
            stream.bytes_received += frame_size_bytes

            # Report to health monitor
            health_monitor.report_frame_received(camera_id)
            health_monitor.update_metrics(
                camera_id,
                frame_count=stream.frame_count,
                avg_frame_interval=1.0 / stream.frames_per_second if stream.frames_per_second > 0 else 0,
                last_frame_time=current_time
            )

    def report_error(self, camera_id: str, error_message: str):
        """Report an error for a stream."""
        current_time = time.time()

        with self._lock:
            if camera_id not in self.streams:
                logger.warning(f"Error reported for unregistered stream: {camera_id}")
                return

            stream = self.streams[camera_id]
            stream.error_count += 1
            stream.last_error = error_message
            stream.last_error_time = current_time

            # Report to health monitor
            health_monitor.report_error(camera_id, "stream_error")
            health_monitor.update_metrics(
                camera_id,
                error_count=stream.error_count
            )

            logger.debug(f"Error reported for stream {camera_id}: {error_message}")

    def report_reconnect(self, camera_id: str, reason: str = "unknown"):
        """Report that a stream reconnected."""
        current_time = time.time()

        with self._lock:
            if camera_id not in self.streams:
                logger.warning(f"Reconnect reported for unregistered stream: {camera_id}")
                return

            stream = self.streams[camera_id]
            stream.reconnect_count += 1

            # Report to health monitor
            health_monitor.report_restart(camera_id)
            health_monitor.update_metrics(
                camera_id,
                restart_count=stream.reconnect_count
            )

            logger.info(f"Reconnect reported for stream {camera_id}: {reason}")

    def report_connection_attempt(self, camera_id: str, success: bool):
        """Report a connection attempt."""
        with self._lock:
            if camera_id not in self.streams:
                return

            stream = self.streams[camera_id]
            stream.connection_attempts += 1
            stream.connection_healthy = success

            # Report to health monitor
            health_monitor.update_metrics(
                camera_id,
                connection_healthy=success
            )

    def test_http_connection(self, camera_id: str, url: str) -> bool:
        """Test HTTP connection health for snapshot streams."""
        try:
            # Quick HEAD request to test connectivity
            response = requests.head(url, timeout=5, verify=False)
            success = response.status_code in [200, 404]  # 404 might be normal for some cameras

            self.report_connection_attempt(camera_id, success)

            if success:
                logger.debug(f"Connection test passed for {camera_id}")
            else:
                logger.warning(f"Connection test failed for {camera_id}: HTTP {response.status_code}")

            return success

        except Exception as e:
            logger.warning(f"Connection test failed for {camera_id}: {e}")
            self.report_connection_attempt(camera_id, False)
            return False

    def get_stream_metrics(self, camera_id: str) -> Optional[Dict[str, Any]]:
        """Get metrics for a specific stream."""
        with self._lock:
            if camera_id not in self.streams:
                return None

            stream = self.streams[camera_id]
            current_time = time.time()

            # Calculate derived metrics
            uptime = current_time - stream.start_time
            frame_age = current_time - stream.last_frame_time if stream.last_frame_time else None
            error_rate = stream.error_count / max(1, stream.frame_count)

            return {
                'camera_id': camera_id,
                'stream_type': stream.stream_type,
                'uptime_seconds': uptime,
                'frame_count': stream.frame_count,
                'frames_per_second': stream.frames_per_second,
                'bytes_received': stream.bytes_received,
                'error_count': stream.error_count,
                'error_rate': error_rate,
                'reconnect_count': stream.reconnect_count,
                'connection_attempts': stream.connection_attempts,
                'connection_healthy': stream.connection_healthy,
                'last_frame_age_seconds': frame_age,
                'last_error': stream.last_error,
                'last_error_time': stream.last_error_time
            }

    def get_all_metrics(self) -> Dict[str, Dict[str, Any]]:
        """Get metrics for all streams."""
        with self._lock:
            return {
                camera_id: self.get_stream_metrics(camera_id)
                for camera_id in self.streams.keys()
            }

    def _perform_health_checks(self) -> List[HealthCheck]:
        """Perform health checks for all streams."""
        checks = []
        current_time = time.time()

        with self._lock:
            for camera_id, stream in self.streams.items():
                checks.extend(self._check_stream_health(camera_id, stream, current_time))

        return checks

    def _check_stream_health(self, camera_id: str, stream: StreamMetrics, current_time: float) -> List[HealthCheck]:
        """Perform health checks for a single stream."""
        checks = []

        # Check frame freshness
        if stream.last_frame_time:
            frame_age = current_time - stream.last_frame_time

            if frame_age > self.frame_timeout_critical:
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_frames",
                    status=HealthStatus.CRITICAL,
                    message=f"No frames for {frame_age:.1f}s (critical threshold: {self.frame_timeout_critical}s)",
                    details={
                        'frame_age': frame_age,
                        'threshold': self.frame_timeout_critical,
                        'last_frame_time': stream.last_frame_time
                    },
                    recovery_action="restart_stream"
                ))
            elif frame_age > self.frame_timeout_warning:
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_frames",
                    status=HealthStatus.WARNING,
                    message=f"Frames aging: {frame_age:.1f}s (warning threshold: {self.frame_timeout_warning}s)",
                    details={
                        'frame_age': frame_age,
                        'threshold': self.frame_timeout_warning,
                        'last_frame_time': stream.last_frame_time
                    }
                ))
        else:
            # No frames received yet
            startup_time = current_time - stream.start_time
            if startup_time > 60:  # Allow 1 minute for initial connection
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_startup",
                    status=HealthStatus.CRITICAL,
                    message=f"No frames received since startup {startup_time:.1f}s ago",
                    details={
                        'startup_time': startup_time,
                        'start_time': stream.start_time
                    },
                    recovery_action="restart_stream"
                ))

        # Check error rate
        if stream.frame_count > 10:  # Need sufficient samples
            error_rate = stream.error_count / stream.frame_count
            if error_rate > self.error_rate_threshold:
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_errors",
                    status=HealthStatus.WARNING,
                    message=f"High error rate: {error_rate:.1%} ({stream.error_count}/{stream.frame_count})",
                    details={
                        'error_rate': error_rate,
                        'error_count': stream.error_count,
                        'frame_count': stream.frame_count,
                        'last_error': stream.last_error
                    }
                ))

        # Check connection health
        if not stream.connection_healthy:
            checks.append(HealthCheck(
                name=f"stream_{camera_id}_connection",
                status=HealthStatus.WARNING,
                message="Connection unhealthy (last test failed)",
                details={
                    'connection_attempts': stream.connection_attempts,
                    'last_connection_test': stream.last_connection_test
                }
            ))

        # Check excessive reconnects
        uptime_hours = (current_time - stream.start_time) / 3600
        if uptime_hours > 1 and stream.reconnect_count > 5:  # More than 5 reconnects per hour
            reconnect_rate = stream.reconnect_count / uptime_hours
            checks.append(HealthCheck(
                name=f"stream_{camera_id}_stability",
                status=HealthStatus.WARNING,
                message=f"Frequent reconnects: {reconnect_rate:.1f}/hour ({stream.reconnect_count} total)",
                details={
                    'reconnect_rate': reconnect_rate,
                    'reconnect_count': stream.reconnect_count,
                    'uptime_hours': uptime_hours
                }
            ))

        # Check frame rate health
        if stream.last_frame_time and stream.frames_per_second > 0:
            expected_fps = 6.0  # Expected FPS for streams
            if stream.frames_per_second < expected_fps * 0.5:  # Less than 50% of expected
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_framerate",
                    status=HealthStatus.WARNING,
                    message=f"Low frame rate: {stream.frames_per_second:.1f} fps (expected: ~{expected_fps} fps)",
                    details={
                        'current_fps': stream.frames_per_second,
                        'expected_fps': expected_fps
                    }
                ))

        return checks


# Global stream health tracker instance
stream_health_tracker = StreamHealthTracker()
core/monitoring/thread_health.py (new file, 381 lines added)
|
@ -0,0 +1,381 @@
|
|||
"""
|
||||
Thread health monitoring for detecting unresponsive and deadlocked threads.
|
||||
Provides thread liveness detection and responsiveness testing.
|
||||
"""
|
||||
import time
|
||||
import threading
|
||||
import logging
|
||||
import signal
|
||||
import traceback
|
||||
from typing import Dict, List, Optional, Any, Callable
|
||||
from dataclasses import dataclass
|
||||
from collections import defaultdict
|
||||
|
||||
from .health import HealthCheck, HealthStatus, health_monitor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ThreadInfo:
|
||||
"""Information about a monitored thread."""
|
||||
thread_id: int
|
||||
thread_name: str
|
||||
start_time: float
|
||||
last_heartbeat: float
|
||||
heartbeat_count: int = 0
|
||||
is_responsive: bool = True
|
||||
last_activity: Optional[str] = None
|
||||
stack_traces: List[str] = None
|
||||
|
||||
|
||||
class ThreadHealthMonitor:
|
||||
"""Monitors thread health and responsiveness."""
|
||||
|
||||
def __init__(self):
|
||||
self.monitored_threads: Dict[int, ThreadInfo] = {}
|
||||
self.heartbeat_callbacks: Dict[int, Callable[[], bool]] = {}
|
||||
self._lock = threading.RLock()
|
||||
|
||||
# Configuration
|
||||
self.heartbeat_timeout = 60.0 # 1 minute without heartbeat = unresponsive
|
||||
self.responsiveness_test_interval = 30.0 # Test responsiveness every 30 seconds
|
||||
self.stack_trace_count = 5 # Keep last 5 stack traces for analysis
|
||||
|
||||
# Register with health monitor
|
||||
health_monitor.register_health_checker(self._perform_health_checks)
|
||||
|
||||
# Enable periodic responsiveness testing
|
||||
self.test_thread = threading.Thread(target=self._responsiveness_test_loop, daemon=True)
|
||||
self.test_thread.start()
|
||||
|
||||
def register_thread(self, thread: threading.Thread, heartbeat_callback: Optional[Callable[[], bool]] = None):
|
||||
"""
|
||||
Register a thread for monitoring.
|
||||
|
||||
Args:
|
||||
thread: Thread to monitor
|
||||
heartbeat_callback: Optional callback to test thread responsiveness
|
||||
"""
|
||||
with self._lock:
|
||||
thread_info = ThreadInfo(
|
||||
thread_id=thread.ident,
|
||||
thread_name=thread.name,
|
||||
start_time=time.time(),
|
||||
last_heartbeat=time.time()
|
||||
)
|
||||
|
||||
self.monitored_threads[thread.ident] = thread_info
|
||||
|
||||
if heartbeat_callback:
|
||||
self.heartbeat_callbacks[thread.ident] = heartbeat_callback
|
||||
|
||||
logger.info(f"Registered thread for monitoring: {thread.name} (ID: {thread.ident})")
|
||||
|
||||
def unregister_thread(self, thread_id: int):
|
||||
"""Unregister a thread from monitoring."""
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
thread_name = self.monitored_threads[thread_id].thread_name
|
||||
del self.monitored_threads[thread_id]
|
||||
|
||||
if thread_id in self.heartbeat_callbacks:
|
||||
del self.heartbeat_callbacks[thread_id]
|
||||
|
||||
logger.info(f"Unregistered thread from monitoring: {thread_name} (ID: {thread_id})")
|
||||
|
||||
def heartbeat(self, thread_id: Optional[int] = None, activity: Optional[str] = None):
|
||||
"""
|
||||
Report thread heartbeat.
|
||||
|
||||
Args:
|
||||
thread_id: Thread ID (uses current thread if None)
|
||||
activity: Description of current activity
|
||||
"""
|
||||
if thread_id is None:
|
||||
thread_id = threading.current_thread().ident
|
||||
|
||||
current_time = time.time()
|
||||
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
thread_info = self.monitored_threads[thread_id]
|
||||
thread_info.last_heartbeat = current_time
|
||||
thread_info.heartbeat_count += 1
|
||||
thread_info.is_responsive = True
|
||||
|
||||
if activity:
|
||||
thread_info.last_activity = activity
|
||||
|
||||
# Report to health monitor
|
||||
health_monitor.update_metrics(
|
||||
f"thread_{thread_info.thread_name}",
|
||||
thread_alive=True,
|
||||
last_frame_time=current_time
|
||||
)
|
||||
|
||||
def get_thread_info(self, thread_id: int) -> Optional[Dict[str, Any]]:
|
||||
"""Get information about a monitored thread."""
|
||||
with self._lock:
|
||||
if thread_id not in self.monitored_threads:
|
||||
return None
|
||||
|
||||
thread_info = self.monitored_threads[thread_id]
|
||||
current_time = time.time()
|
||||
|
||||
return {
|
||||
'thread_id': thread_id,
|
||||
'thread_name': thread_info.thread_name,
|
||||
'uptime_seconds': current_time - thread_info.start_time,
|
||||
'last_heartbeat_age': current_time - thread_info.last_heartbeat,
|
||||
'heartbeat_count': thread_info.heartbeat_count,
|
||||
'is_responsive': thread_info.is_responsive,
|
||||
'last_activity': thread_info.last_activity,
|
||||
'stack_traces': thread_info.stack_traces or []
|
||||
}
|
||||
|
||||
def get_all_thread_info(self) -> Dict[int, Dict[str, Any]]:
|
||||
"""Get information about all monitored threads."""
|
||||
with self._lock:
|
||||
return {
|
||||
thread_id: self.get_thread_info(thread_id)
|
||||
for thread_id in self.monitored_threads.keys()
|
||||
}
|
||||
|
||||
def test_thread_responsiveness(self, thread_id: int) -> bool:
|
||||
"""
|
||||
Test if a thread is responsive by calling its heartbeat callback.
|
||||
|
||||
Args:
|
||||
thread_id: ID of thread to test
|
||||
|
||||
Returns:
|
||||
True if thread responds within timeout
|
||||
"""
|
||||
if thread_id not in self.heartbeat_callbacks:
|
||||
return True # Can't test if no callback provided
|
||||
|
||||
try:
|
||||
# Call the heartbeat callback with a timeout
|
||||
callback = self.heartbeat_callbacks[thread_id]
|
||||
|
||||
# This is a simple approach - in practice you might want to use
|
||||
# threading.Timer or asyncio for more sophisticated timeout handling
|
||||
start_time = time.time()
|
||||
result = callback()
|
||||
response_time = time.time() - start_time
|
||||
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
self.monitored_threads[thread_id].is_responsive = result
|
||||
|
||||
if response_time > 5.0: # Slow response
|
||||
logger.warning(f"Thread {thread_id} slow response: {response_time:.1f}s")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error testing thread {thread_id} responsiveness: {e}")
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
self.monitored_threads[thread_id].is_responsive = False
|
||||
return False
|
||||
|
||||
def capture_stack_trace(self, thread_id: int) -> Optional[str]:
|
||||
"""
|
||||
Capture stack trace for a thread.
|
||||
|
||||
Args:
|
||||
thread_id: ID of thread to capture
|
||||
|
||||
Returns:
|
||||
Stack trace string or None if not available
|
||||
"""
|
||||
try:
|
||||
# Get all frames for all threads
|
||||
frames = dict(threading._current_frames())
|
||||
|
||||
if thread_id not in frames:
|
||||
return None
|
||||
|
||||
# Format stack trace
|
||||
frame = frames[thread_id]
|
||||
stack_trace = ''.join(traceback.format_stack(frame))
|
||||
|
||||
# Store in thread info
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
thread_info = self.monitored_threads[thread_id]
|
||||
if thread_info.stack_traces is None:
|
||||
thread_info.stack_traces = []
|
||||
|
||||
thread_info.stack_traces.append(f"{time.time()}: {stack_trace}")
|
||||
|
||||
# Keep only last N stack traces
|
||||
if len(thread_info.stack_traces) > self.stack_trace_count:
|
||||
thread_info.stack_traces = thread_info.stack_traces[-self.stack_trace_count:]
|
||||
|
||||
return stack_trace
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error capturing stack trace for thread {thread_id}: {e}")
|
||||
return None
|
||||
|
||||
def detect_deadlocks(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Attempt to detect potential deadlocks by analyzing thread states.
|
||||
|
||||
Returns:
|
||||
List of potential deadlock scenarios
|
||||
"""
|
||||
deadlocks = []
|
||||
current_time = time.time()
|
||||
|
||||
with self._lock:
|
||||
# Look for threads that haven't had heartbeats for a long time
|
||||
# and are supposedly alive
|
||||
for thread_id, thread_info in self.monitored_threads.items():
|
||||
heartbeat_age = current_time - thread_info.last_heartbeat
|
||||
|
||||
if heartbeat_age > self.heartbeat_timeout * 2: # Double the timeout
|
||||
# Check if thread still exists
|
||||
thread_exists = any(
|
||||
t.ident == thread_id and t.is_alive()
|
||||
for t in threading.enumerate()
|
||||
)
|
||||
|
||||
if thread_exists:
|
||||
# Thread exists but not responding - potential deadlock
|
||||
stack_trace = self.capture_stack_trace(thread_id)
|
||||
|
||||
deadlock_info = {
|
||||
                    'thread_id': thread_id,
                    'thread_name': thread_info.thread_name,
                    'heartbeat_age': heartbeat_age,
                    'last_activity': thread_info.last_activity,
                    'stack_trace': stack_trace,
                    'detection_time': current_time
                }

                deadlocks.append(deadlock_info)
                logger.warning(f"Potential deadlock detected in thread {thread_info.thread_name}")

        return deadlocks

    def _responsiveness_test_loop(self):
        """Background loop to test thread responsiveness."""
        logger.info("Thread responsiveness testing started")

        while True:
            try:
                time.sleep(self.responsiveness_test_interval)

                with self._lock:
                    thread_ids = list(self.monitored_threads.keys())

                for thread_id in thread_ids:
                    try:
                        self.test_thread_responsiveness(thread_id)
                    except Exception as e:
                        logger.error(f"Error testing thread {thread_id}: {e}")

            except Exception as e:
                logger.error(f"Error in responsiveness test loop: {e}")
                time.sleep(10.0)  # Fallback sleep

    def _perform_health_checks(self) -> List[HealthCheck]:
        """Perform health checks for all monitored threads."""
        checks = []
        current_time = time.time()

        with self._lock:
            for thread_id, thread_info in self.monitored_threads.items():
                checks.extend(self._check_thread_health(thread_id, thread_info, current_time))

        # Check for deadlocks
        deadlocks = self.detect_deadlocks()
        for deadlock in deadlocks:
            checks.append(HealthCheck(
                name=f"deadlock_detection_{deadlock['thread_id']}",
                status=HealthStatus.CRITICAL,
                message=f"Potential deadlock in thread {deadlock['thread_name']} "
                        f"(unresponsive for {deadlock['heartbeat_age']:.1f}s)",
                details=deadlock,
                recovery_action="restart_thread"
            ))

        return checks

    def _check_thread_health(self, thread_id: int, thread_info: ThreadInfo, current_time: float) -> List[HealthCheck]:
        """Perform health checks for a single thread."""
        checks = []

        # Check if thread still exists
        thread_exists = any(
            t.ident == thread_id and t.is_alive()
            for t in threading.enumerate()
        )

        if not thread_exists:
            checks.append(HealthCheck(
                name=f"thread_{thread_info.thread_name}_alive",
                status=HealthStatus.CRITICAL,
                message=f"Thread {thread_info.thread_name} is no longer alive",
                details={
                    'thread_id': thread_id,
                    'uptime': current_time - thread_info.start_time,
                    'last_heartbeat': thread_info.last_heartbeat
                },
                recovery_action="restart_thread"
            ))
            return checks

        # Check heartbeat freshness
        heartbeat_age = current_time - thread_info.last_heartbeat

        if heartbeat_age > self.heartbeat_timeout:
            checks.append(HealthCheck(
                name=f"thread_{thread_info.thread_name}_responsive",
                status=HealthStatus.CRITICAL,
                message=f"Thread {thread_info.thread_name} unresponsive for {heartbeat_age:.1f}s",
                details={
                    'thread_id': thread_id,
                    'heartbeat_age': heartbeat_age,
                    'heartbeat_count': thread_info.heartbeat_count,
                    'last_activity': thread_info.last_activity,
                    'is_responsive': thread_info.is_responsive
                },
                recovery_action="restart_thread"
            ))
        elif heartbeat_age > self.heartbeat_timeout * 0.5:  # Warning at 50% of timeout
            checks.append(HealthCheck(
                name=f"thread_{thread_info.thread_name}_responsive",
                status=HealthStatus.WARNING,
                message=f"Thread {thread_info.thread_name} slow heartbeat: {heartbeat_age:.1f}s",
                details={
                    'thread_id': thread_id,
                    'heartbeat_age': heartbeat_age,
                    'heartbeat_count': thread_info.heartbeat_count,
                    'last_activity': thread_info.last_activity,
                    'is_responsive': thread_info.is_responsive
                }
            ))

        # Check responsiveness test results
        if not thread_info.is_responsive:
            checks.append(HealthCheck(
                name=f"thread_{thread_info.thread_name}_callback",
                status=HealthStatus.WARNING,
                message=f"Thread {thread_info.thread_name} failed responsiveness test",
                details={
                    'thread_id': thread_id,
                    'last_activity': thread_info.last_activity
                }
            ))

        return checks


# Global thread health monitor instance
thread_health_monitor = ThreadHealthMonitor()
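For orientation, a minimal sketch of how a worker thread can plug into this monitor, using only the register_thread / heartbeat / unregister_thread calls that appear in this commit; the import path, loop body, and names are illustrative assumptions, not part of the change itself.

```python
import threading
import time

# Import path assumed from the package layout (core/monitoring/thread_health.py).
from core.monitoring.thread_health import thread_health_monitor

stop_event = threading.Event()

def worker_loop():
    while not stop_event.is_set():
        # Refresh this thread's heartbeat so the monitor keeps seeing it as responsive.
        thread_health_monitor.heartbeat(activity="processing_batch")
        time.sleep(1.0)  # placeholder for real work

worker = threading.Thread(target=worker_loop, name="example-worker", daemon=True)
worker.start()

# The second argument is the responsiveness probe; returning True means "healthy".
thread_health_monitor.register_thread(worker, lambda: worker.is_alive())

# ... later, on shutdown ...
stop_event.set()
worker.join(timeout=5.0)
thread_health_monitor.unregister_thread(worker.ident)
```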
@@ -1,5 +1,6 @@
"""
FFmpeg RTSP stream reader using subprocess piping frames directly to buffer.
Enhanced with comprehensive health monitoring and automatic recovery.
"""
import cv2
import time
@@ -7,10 +8,13 @@ import threading
import numpy as np
import subprocess
import struct
from typing import Optional, Callable, Dict, Any

from .base import VideoReader
from .utils import log_success, log_warning, log_error, log_info
from ..monitoring.stream_health import stream_health_tracker
from ..monitoring.thread_health import thread_health_monitor
from ..monitoring.recovery import recovery_manager, RecoveryAction


class FFmpegRTSPReader(VideoReader):

@@ -35,6 +39,21 @@ class FFmpegRTSPReader(VideoReader):
        self.first_start_timeout = 30.0  # 30s timeout on first start
        self.restart_timeout = 15.0  # 15s timeout after restart

        # Health monitoring setup
        self.last_heartbeat = time.time()
        self.consecutive_errors = 0
        self.ffmpeg_restart_count = 0

        # Register recovery handlers
        recovery_manager.register_recovery_handler(
            RecoveryAction.RESTART_STREAM,
            self._handle_restart_recovery
        )
        recovery_manager.register_recovery_handler(
            RecoveryAction.RECONNECT,
            self._handle_reconnect_recovery
        )

    @property
    def is_running(self) -> bool:
        """Check if the reader is currently running."""

@@ -58,21 +77,35 @@ class FFmpegRTSPReader(VideoReader):
        self.stop_event.clear()
        self.thread = threading.Thread(target=self._read_frames, daemon=True)
        self.thread.start()

        # Register with health monitoring
        stream_health_tracker.register_stream(self.camera_id, "rtsp_ffmpeg", self.rtsp_url)
        thread_health_monitor.register_thread(self.thread, self._heartbeat_callback)

        log_success(self.camera_id, "Stream started with health monitoring")

    def stop(self):
        """Stop the FFmpeg subprocess reader."""
        self.stop_event.set()

        # Unregister from health monitoring
        if self.thread:
            thread_health_monitor.unregister_thread(self.thread.ident)

        if self.process:
            self.process.terminate()
            try:
                self.process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.process.kill()

        if self.thread:
            self.thread.join(timeout=5.0)
        if self.stderr_thread:
            self.stderr_thread.join(timeout=2.0)

        stream_health_tracker.unregister_stream(self.camera_id)

        log_info(self.camera_id, "Stream stopped")

    def _start_ffmpeg_process(self):

@@ -249,6 +282,9 @@ class FFmpegRTSPReader(VideoReader):

        while not self.stop_event.is_set():
            try:
                # Send heartbeat for thread health monitoring
                self._send_heartbeat("reading_frames")

                # Check watchdog timeout if process is running
                if self.process and self.process.poll() is None:
                    if self._check_watchdog_timeout():

@@ -259,8 +295,17 @@ class FFmpegRTSPReader(VideoReader):
                if not self.process or self.process.poll() is not None:
                    if self.process and self.process.poll() is not None:
                        log_warning(self.camera_id, "Stream disconnected, reconnecting...")
                        stream_health_tracker.report_error(
                            self.camera_id,
                            "FFmpeg process disconnected"
                        )

                    if not self._start_ffmpeg_process():
                        self.consecutive_errors += 1
                        stream_health_tracker.report_error(
                            self.camera_id,
                            "Failed to start FFmpeg process"
                        )
                        time.sleep(5.0)
                        continue

@@ -275,9 +320,22 @@ class FFmpegRTSPReader(VideoReader):
                    # Update watchdog - we got a frame
                    self.last_frame_time = time.time()

                    # Reset error counter on successful frame
                    self.consecutive_errors = 0

                    # Report successful frame to health monitoring
                    frame_size = frame.nbytes
                    stream_health_tracker.report_frame_received(self.camera_id, frame_size)

                    # Call frame callback
                    if self.frame_callback:
                        try:
                            self.frame_callback(self.camera_id, frame)
                        except Exception as e:
                            stream_health_tracker.report_error(
                                self.camera_id,
                                f"Frame callback error: {e}"
                            )

                    frame_count += 1

@@ -287,16 +345,85 @@ class FFmpegRTSPReader(VideoReader):
                        log_success(self.camera_id, f"{frame_count} frames captured ({frame.shape[1]}x{frame.shape[0]})")
                        last_log_time = current_time

                except Exception as e:
                    # Process might have died, let it restart on next iteration
                    stream_health_tracker.report_error(
                        self.camera_id,
                        f"Frame reading error: {e}"
                    )
                    if self.process:
                        self.process.terminate()
                        self.process = None
                    time.sleep(1.0)

            except Exception as e:
                stream_health_tracker.report_error(
                    self.camera_id,
                    f"Main loop error: {e}"
                )
                time.sleep(1.0)

        # Cleanup
        if self.process:
            self.process.terminate()

    # Health monitoring methods
    def _send_heartbeat(self, activity: str = "running"):
        """Send heartbeat to thread health monitor."""
        self.last_heartbeat = time.time()
        thread_health_monitor.heartbeat(activity=activity)

    def _heartbeat_callback(self) -> bool:
        """Heartbeat callback for thread responsiveness testing."""
        try:
            # Check if thread is responsive by checking recent heartbeat
            current_time = time.time()
            age = current_time - self.last_heartbeat

            # Thread is responsive if heartbeat is recent
            return age < 30.0  # 30 second responsiveness threshold

        except Exception:
            return False

    def _handle_restart_recovery(self, component: str, details: Dict[str, Any]) -> bool:
        """Handle restart recovery action."""
        try:
            log_info(self.camera_id, "Restarting FFmpeg RTSP reader for health recovery")

            # Stop current instance
            self.stop()

            # Small delay
            time.sleep(2.0)

            # Restart
            self.start()

            # Report successful restart
            stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_restart")
            self.ffmpeg_restart_count += 1

            return True

        except Exception as e:
            log_error(self.camera_id, f"Failed to restart FFmpeg RTSP reader: {e}")
            return False

    def _handle_reconnect_recovery(self, component: str, details: Dict[str, Any]) -> bool:
        """Handle reconnect recovery action."""
        try:
            log_info(self.camera_id, "Reconnecting FFmpeg RTSP reader for health recovery")

            # Force restart FFmpeg process
            self._restart_ffmpeg_process()

            # Reset error counters
            self.consecutive_errors = 0
            stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_reconnect")

            return True

        except Exception as e:
            log_error(self.camera_id, f"Failed to reconnect FFmpeg RTSP reader: {e}")
            return False
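A hedged usage sketch of the enhanced reader follows; the module path, constructor arguments, and the way the frame callback is attached are assumptions inferred from the attributes used above (camera_id, rtsp_url, frame_callback), not a documented signature.

```python
import time

# Module path and constructor signature are assumed for illustration only.
from core.streaming.readers.ffmpeg_rtsp import FFmpegRTSPReader

def on_frame(camera_id, frame):
    # frame is the numpy array decoded from the FFmpeg pipe
    print(f"{camera_id}: {frame.shape[1]}x{frame.shape[0]}")

reader = FFmpegRTSPReader(camera_id="cam-01", rtsp_url="rtsp://camera.local/stream")
reader.frame_callback = on_frame   # assumed attribute, as used in _read_frames

reader.start()   # also registers with stream_health_tracker and thread_health_monitor
time.sleep(60)   # run for a while
reader.stop()    # unregisters from both monitors and terminates the FFmpeg subprocess
```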
@@ -1,5 +1,6 @@
"""
HTTP snapshot reader optimized for 2560x1440 (2K) high quality images.
Enhanced with comprehensive health monitoring and automatic recovery.
"""
import cv2
import logging
@@ -7,10 +8,13 @@ import time
import threading
import requests
import numpy as np
from typing import Optional, Callable, Dict, Any

from .base import VideoReader
from .utils import log_success, log_warning, log_error, log_info
from ..monitoring.stream_health import stream_health_tracker
from ..monitoring.thread_health import thread_health_monitor
from ..monitoring.recovery import recovery_manager, RecoveryAction

logger = logging.getLogger(__name__)

@@ -30,6 +34,22 @@ class HTTPSnapshotReader(VideoReader):
        self.expected_height = 1440
        self.max_file_size = 10 * 1024 * 1024  # 10MB max for 2K image

        # Health monitoring setup
        self.last_heartbeat = time.time()
        self.consecutive_errors = 0
        self.connection_test_interval = 300  # Test connection every 5 minutes
        self.last_connection_test = None

        # Register recovery handlers
        recovery_manager.register_recovery_handler(
            RecoveryAction.RESTART_STREAM,
            self._handle_restart_recovery
        )
        recovery_manager.register_recovery_handler(
            RecoveryAction.RECONNECT,
            self._handle_reconnect_recovery
        )

    @property
    def is_running(self) -> bool:
        """Check if the reader is currently running."""

@@ -53,13 +73,24 @@ class HTTPSnapshotReader(VideoReader):
        self.stop_event.clear()
        self.thread = threading.Thread(target=self._read_snapshots, daemon=True)
        self.thread.start()

        # Register with health monitoring
        stream_health_tracker.register_stream(self.camera_id, "http_snapshot", self.snapshot_url)
        thread_health_monitor.register_thread(self.thread, self._heartbeat_callback)

        logger.info(f"Started snapshot reader for camera {self.camera_id} with health monitoring")

    def stop(self):
        """Stop the snapshot reader thread."""
        self.stop_event.set()

        # Unregister from health monitoring
        if self.thread:
            thread_health_monitor.unregister_thread(self.thread.ident)
            self.thread.join(timeout=5.0)

        stream_health_tracker.unregister_stream(self.camera_id)

        logger.info(f"Stopped snapshot reader for camera {self.camera_id}")

    def _read_snapshots(self):

@@ -67,17 +98,29 @@ class HTTPSnapshotReader(VideoReader):
        retries = 0
        frame_count = 0
        last_log_time = time.time()
        last_connection_test = time.time()
        interval_seconds = self.interval_ms / 1000.0

        logger.info(f"Snapshot interval for camera {self.camera_id}: {interval_seconds}s")

        while not self.stop_event.is_set():
            try:
                # Send heartbeat for thread health monitoring
                self._send_heartbeat("fetching_snapshot")

                start_time = time.time()
                frame = self._fetch_snapshot()

                if frame is None:
                    retries += 1
                    self.consecutive_errors += 1

                    # Report error to health monitoring
                    stream_health_tracker.report_error(
                        self.camera_id,
                        f"Failed to fetch snapshot (retry {retries}/{self.max_retries})"
                    )

                    logger.warning(f"Failed to fetch snapshot for camera {self.camera_id}, retry {retries}/{self.max_retries}")

                    if self.max_retries != -1 and retries > self.max_retries:

@@ -90,21 +133,36 @@ class HTTPSnapshotReader(VideoReader):
                # Accept any valid image dimensions - don't force specific resolution
                if frame.shape[1] <= 0 or frame.shape[0] <= 0:
                    logger.warning(f"Camera {self.camera_id}: Invalid frame dimensions {frame.shape[1]}x{frame.shape[0]}")
                    stream_health_tracker.report_error(
                        self.camera_id,
                        f"Invalid frame dimensions: {frame.shape[1]}x{frame.shape[0]}"
                    )
                    continue

                # Reset retry counter on successful fetch
                retries = 0
                self.consecutive_errors = 0
                frame_count += 1

                # Report successful frame to health monitoring
                frame_size = frame.nbytes
                stream_health_tracker.report_frame_received(self.camera_id, frame_size)

                # Call frame callback
                if self.frame_callback:
                    try:
                        self.frame_callback(self.camera_id, frame)
                    except Exception as e:
                        logger.error(f"Camera {self.camera_id}: Frame callback error: {e}")
                        stream_health_tracker.report_error(self.camera_id, f"Frame callback error: {e}")

                # Periodic connection health test
                current_time = time.time()
                if current_time - last_connection_test >= self.connection_test_interval:
                    self._test_connection_health()
                    last_connection_test = current_time

                # Log progress every 30 seconds
                current_time = time.time()
                if current_time - last_log_time >= 30:
                    logger.info(f"Camera {self.camera_id}: {frame_count} snapshots processed")
                    last_log_time = current_time

@@ -117,6 +175,7 @@ class HTTPSnapshotReader(VideoReader):

            except Exception as e:
                logger.error(f"Error in snapshot loop for camera {self.camera_id}: {e}")
                stream_health_tracker.report_error(self.camera_id, f"Snapshot loop error: {e}")
                retries += 1
                if self.max_retries != -1 and retries > self.max_retries:
                    break

@@ -246,4 +305,74 @@ class HTTPSnapshotReader(VideoReader):
            right = target_width - new_width - left
            resized = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=[0, 0, 0])

        return resized

    # Health monitoring methods
    def _send_heartbeat(self, activity: str = "running"):
        """Send heartbeat to thread health monitor."""
        self.last_heartbeat = time.time()
        thread_health_monitor.heartbeat(activity=activity)

    def _heartbeat_callback(self) -> bool:
        """Heartbeat callback for thread responsiveness testing."""
        try:
            # Check if thread is responsive by checking recent heartbeat
            current_time = time.time()
            age = current_time - self.last_heartbeat

            # Thread is responsive if heartbeat is recent
            return age < 30.0  # 30 second responsiveness threshold

        except Exception:
            return False

    def _test_connection_health(self):
        """Test HTTP connection health."""
        try:
            stream_health_tracker.test_http_connection(self.camera_id, self.snapshot_url)
        except Exception as e:
            logger.error(f"Error testing connection health for {self.camera_id}: {e}")

    def _handle_restart_recovery(self, component: str, details: Dict[str, Any]) -> bool:
        """Handle restart recovery action."""
        try:
            logger.info(f"Restarting HTTP snapshot reader for {self.camera_id}")

            # Stop current instance
            self.stop()

            # Small delay
            time.sleep(2.0)

            # Restart
            self.start()

            # Report successful restart
            stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_restart")

            return True

        except Exception as e:
            logger.error(f"Failed to restart HTTP snapshot reader for {self.camera_id}: {e}")
            return False

    def _handle_reconnect_recovery(self, component: str, details: Dict[str, Any]) -> bool:
        """Handle reconnect recovery action."""
        try:
            logger.info(f"Reconnecting HTTP snapshot reader for {self.camera_id}")

            # Test connection first
            success = stream_health_tracker.test_http_connection(self.camera_id, self.snapshot_url)

            if success:
                # Reset error counters
                self.consecutive_errors = 0
                stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_reconnect")
                return True
            else:
                logger.warning(f"Connection test failed during recovery for {self.camera_id}")
                return False

        except Exception as e:
            logger.error(f"Failed to reconnect HTTP snapshot reader for {self.camera_id}: {e}")
            return False
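Both readers register per-action callbacks with the global recovery_manager; the contract implied by the code above is a callable taking (component, details) and returning True on success. A minimal sketch of registering an additional handler, with the handler body and component name purely illustrative:

```python
from typing import Any, Dict

# Import path as used by the readers above (core/monitoring/recovery.py).
from core.monitoring.recovery import recovery_manager, RecoveryAction

def clear_decoder_state(component: str, details: Dict[str, Any]) -> bool:
    """Illustrative handler: return True only if the recovery action succeeded."""
    print(f"Recovery requested for {component}: {details}")
    return True

# RESTART_STREAM and RECONNECT are the actions exercised by the readers in this commit.
recovery_manager.register_recovery_handler(RecoveryAction.RECONNECT, clear_decoder_state)
```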