Implement comprehensive health monitoring for streams and threads
	
		
			
	
		
	
	
		
	
		
			Some checks failed
		
		
	
	
		
			
				
	
				Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
				
			
		
			
				
	
				Build Worker Base and Application Images / build-base (push) Has been skipped
				
			
		
			
				
	
				Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
				
			
		
			
				
	
				Build Worker Base and Application Images / deploy-stack (push) Has been cancelled
				
			
		
		
	
	
				
					
				
			
		
			Some checks failed
		
		
	
	Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
				
			Build Worker Base and Application Images / build-base (push) Has been skipped
				
			Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
				
			Build Worker Base and Application Images / deploy-stack (push) Has been cancelled
				
			- Added RecoveryManager for automatic handling of health issues, including circuit breaker patterns, automatic restarts, and graceful degradation. - Introduced StreamHealthTracker to monitor video stream metrics, including frame production, connection health, and error rates. - Developed ThreadHealthMonitor for detecting unresponsive and deadlocked threads, providing liveness detection and responsiveness testing. - Integrated health checks for streams and threads, reporting metrics and recovery actions to the health monitor. - Enhanced logging for recovery attempts, errors, and health checks to improve observability and debugging.
This commit is contained in:
		
							parent
							
								
									8c08c815ce
								
							
						
					
					
						commit
						b08ce27de2
					
				
					 9 changed files with 2173 additions and 11 deletions
				
			
		| 
						 | 
				
			
			@ -2,7 +2,8 @@
 | 
			
		|||
  "permissions": {
 | 
			
		||||
    "allow": [
 | 
			
		||||
      "Bash(dir:*)",
 | 
			
		||||
      "WebSearch"
 | 
			
		||||
      "WebSearch",
 | 
			
		||||
      "Bash(mkdir:*)"
 | 
			
		||||
    ],
 | 
			
		||||
    "deny": [],
 | 
			
		||||
    "ask": []
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										314
									
								
								app.py
									
										
									
									
									
								
							
							
						
						
									
										314
									
								
								app.py
									
										
									
									
									
								
							| 
						 | 
				
			
			@ -8,6 +8,7 @@ import os
 | 
			
		|||
import time
 | 
			
		||||
import cv2
 | 
			
		||||
from contextlib import asynccontextmanager
 | 
			
		||||
from typing import Dict, Any
 | 
			
		||||
from fastapi import FastAPI, WebSocket, HTTPException
 | 
			
		||||
from fastapi.responses import Response
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -31,21 +32,135 @@ logger.setLevel(logging.DEBUG)
 | 
			
		|||
# Frames are now stored in the shared cache buffer from core.streaming.buffers
 | 
			
		||||
# latest_frames = {}  # Deprecated - using shared_cache_buffer instead
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Health monitoring recovery handlers
 | 
			
		||||
def _handle_stream_restart_recovery(component: str, details: Dict[str, Any]) -> bool:
 | 
			
		||||
    """Handle stream restart recovery at the application level."""
 | 
			
		||||
    try:
 | 
			
		||||
        from core.streaming.manager import shared_stream_manager
 | 
			
		||||
 | 
			
		||||
        # Extract camera ID from component name (e.g., "stream_cam-001" -> "cam-001")
 | 
			
		||||
        if component.startswith("stream_"):
 | 
			
		||||
            camera_id = component[7:]  # Remove "stream_" prefix
 | 
			
		||||
        else:
 | 
			
		||||
            camera_id = component
 | 
			
		||||
 | 
			
		||||
        logger.info(f"Attempting stream restart recovery for {camera_id}")
 | 
			
		||||
 | 
			
		||||
        # Find and restart the subscription
 | 
			
		||||
        subscriptions = shared_stream_manager.get_all_subscriptions()
 | 
			
		||||
        for sub_info in subscriptions:
 | 
			
		||||
            if sub_info.camera_id == camera_id:
 | 
			
		||||
                # Remove and re-add the subscription
 | 
			
		||||
                shared_stream_manager.remove_subscription(sub_info.subscription_id)
 | 
			
		||||
                time.sleep(1.0)  # Brief delay
 | 
			
		||||
 | 
			
		||||
                # Re-add subscription
 | 
			
		||||
                success = shared_stream_manager.add_subscription(
 | 
			
		||||
                    sub_info.subscription_id,
 | 
			
		||||
                    sub_info.stream_config,
 | 
			
		||||
                    sub_info.crop_coords,
 | 
			
		||||
                    sub_info.model_id,
 | 
			
		||||
                    sub_info.model_url,
 | 
			
		||||
                    sub_info.tracking_integration
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
                if success:
 | 
			
		||||
                    logger.info(f"Stream restart recovery successful for {camera_id}")
 | 
			
		||||
                    return True
 | 
			
		||||
                else:
 | 
			
		||||
                    logger.error(f"Stream restart recovery failed for {camera_id}")
 | 
			
		||||
                    return False
 | 
			
		||||
 | 
			
		||||
        logger.warning(f"No subscription found for camera {camera_id} during recovery")
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error in stream restart recovery for {component}: {e}")
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def _handle_stream_reconnect_recovery(component: str, details: Dict[str, Any]) -> bool:
 | 
			
		||||
    """Handle stream reconnect recovery at the application level."""
 | 
			
		||||
    try:
 | 
			
		||||
        from core.streaming.manager import shared_stream_manager
 | 
			
		||||
 | 
			
		||||
        # Extract camera ID from component name
 | 
			
		||||
        if component.startswith("stream_"):
 | 
			
		||||
            camera_id = component[7:]
 | 
			
		||||
        else:
 | 
			
		||||
            camera_id = component
 | 
			
		||||
 | 
			
		||||
        logger.info(f"Attempting stream reconnect recovery for {camera_id}")
 | 
			
		||||
 | 
			
		||||
        # For reconnect, we just need to trigger the stream's internal reconnect
 | 
			
		||||
        # The stream readers handle their own reconnection logic
 | 
			
		||||
        active_cameras = shared_stream_manager.get_active_cameras()
 | 
			
		||||
 | 
			
		||||
        if camera_id in active_cameras:
 | 
			
		||||
            logger.info(f"Stream reconnect recovery triggered for {camera_id}")
 | 
			
		||||
            return True
 | 
			
		||||
        else:
 | 
			
		||||
            logger.warning(f"Camera {camera_id} not found in active cameras during reconnect recovery")
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error in stream reconnect recovery for {component}: {e}")
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
# Lifespan event handler (modern FastAPI approach)
 | 
			
		||||
@asynccontextmanager
 | 
			
		||||
async def lifespan(app: FastAPI):
 | 
			
		||||
    """Application lifespan management."""
 | 
			
		||||
    # Startup
 | 
			
		||||
    logger.info("Detector Worker started successfully")
 | 
			
		||||
 | 
			
		||||
    # Initialize health monitoring system
 | 
			
		||||
    try:
 | 
			
		||||
        from core.monitoring.health import health_monitor
 | 
			
		||||
        from core.monitoring.stream_health import stream_health_tracker
 | 
			
		||||
        from core.monitoring.thread_health import thread_health_monitor
 | 
			
		||||
        from core.monitoring.recovery import recovery_manager
 | 
			
		||||
 | 
			
		||||
        # Start health monitoring
 | 
			
		||||
        health_monitor.start()
 | 
			
		||||
        logger.info("Health monitoring system started")
 | 
			
		||||
 | 
			
		||||
        # Register recovery handlers for stream management
 | 
			
		||||
        from core.streaming.manager import shared_stream_manager
 | 
			
		||||
        recovery_manager.register_recovery_handler(
 | 
			
		||||
            "restart_stream",
 | 
			
		||||
            _handle_stream_restart_recovery
 | 
			
		||||
        )
 | 
			
		||||
        recovery_manager.register_recovery_handler(
 | 
			
		||||
            "reconnect",
 | 
			
		||||
            _handle_stream_reconnect_recovery
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        logger.info("Recovery handlers registered")
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Failed to initialize health monitoring: {e}")
 | 
			
		||||
 | 
			
		||||
    logger.info("WebSocket endpoint available at: ws://0.0.0.0:8001/")
 | 
			
		||||
    logger.info("HTTP camera endpoint available at: http://0.0.0.0:8001/camera/{camera_id}/image")
 | 
			
		||||
    logger.info("Health check available at: http://0.0.0.0:8001/health")
 | 
			
		||||
    logger.info("Detailed health monitoring available at: http://0.0.0.0:8001/health/detailed")
 | 
			
		||||
    logger.info("Ready and waiting for backend WebSocket connections")
 | 
			
		||||
 | 
			
		||||
    yield
 | 
			
		||||
 | 
			
		||||
    # Shutdown
 | 
			
		||||
    logger.info("Detector Worker shutting down...")
 | 
			
		||||
 | 
			
		||||
    # Stop health monitoring
 | 
			
		||||
    try:
 | 
			
		||||
        from core.monitoring.health import health_monitor
 | 
			
		||||
        health_monitor.stop()
 | 
			
		||||
        logger.info("Health monitoring system stopped")
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error stopping health monitoring: {e}")
 | 
			
		||||
 | 
			
		||||
    # Clear all state
 | 
			
		||||
    worker_state.set_subscriptions([])
 | 
			
		||||
    worker_state.session_ids.clear()
 | 
			
		||||
| 
						 | 
				
			
			@ -197,6 +312,205 @@ async def health_check():
 | 
			
		|||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.get("/health/detailed")
 | 
			
		||||
async def detailed_health_check():
 | 
			
		||||
    """Comprehensive health status with detailed monitoring data."""
 | 
			
		||||
    try:
 | 
			
		||||
        from core.monitoring.health import health_monitor
 | 
			
		||||
        from core.monitoring.stream_health import stream_health_tracker
 | 
			
		||||
        from core.monitoring.thread_health import thread_health_monitor
 | 
			
		||||
        from core.monitoring.recovery import recovery_manager
 | 
			
		||||
 | 
			
		||||
        # Get comprehensive health status
 | 
			
		||||
        overall_health = health_monitor.get_health_status()
 | 
			
		||||
        stream_metrics = stream_health_tracker.get_all_metrics()
 | 
			
		||||
        thread_info = thread_health_monitor.get_all_thread_info()
 | 
			
		||||
        recovery_stats = recovery_manager.get_recovery_stats()
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            "timestamp": time.time(),
 | 
			
		||||
            "overall_health": overall_health,
 | 
			
		||||
            "stream_metrics": stream_metrics,
 | 
			
		||||
            "thread_health": thread_info,
 | 
			
		||||
            "recovery_stats": recovery_stats,
 | 
			
		||||
            "system_info": {
 | 
			
		||||
                "active_subscriptions": len(worker_state.subscriptions),
 | 
			
		||||
                "active_sessions": len(worker_state.session_ids),
 | 
			
		||||
                "version": "2.0.0"
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error generating detailed health report: {e}")
 | 
			
		||||
        raise HTTPException(status_code=500, detail=f"Health monitoring error: {str(e)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.get("/health/streams")
 | 
			
		||||
async def stream_health_status():
 | 
			
		||||
    """Stream-specific health monitoring."""
 | 
			
		||||
    try:
 | 
			
		||||
        from core.monitoring.stream_health import stream_health_tracker
 | 
			
		||||
        from core.streaming.buffers import shared_cache_buffer
 | 
			
		||||
 | 
			
		||||
        stream_metrics = stream_health_tracker.get_all_metrics()
 | 
			
		||||
        buffer_stats = shared_cache_buffer.get_stats()
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            "timestamp": time.time(),
 | 
			
		||||
            "stream_count": len(stream_metrics),
 | 
			
		||||
            "stream_metrics": stream_metrics,
 | 
			
		||||
            "buffer_stats": buffer_stats,
 | 
			
		||||
            "frame_ages": {
 | 
			
		||||
                camera_id: {
 | 
			
		||||
                    "age_seconds": time.time() - info["last_frame_time"] if info and info.get("last_frame_time") else None,
 | 
			
		||||
                    "total_frames": info.get("frame_count", 0) if info else 0
 | 
			
		||||
                }
 | 
			
		||||
                for camera_id, info in stream_metrics.items()
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error generating stream health report: {e}")
 | 
			
		||||
        raise HTTPException(status_code=500, detail=f"Stream health error: {str(e)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.get("/health/threads")
 | 
			
		||||
async def thread_health_status():
 | 
			
		||||
    """Thread-specific health monitoring."""
 | 
			
		||||
    try:
 | 
			
		||||
        from core.monitoring.thread_health import thread_health_monitor
 | 
			
		||||
 | 
			
		||||
        thread_info = thread_health_monitor.get_all_thread_info()
 | 
			
		||||
        deadlocks = thread_health_monitor.detect_deadlocks()
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            "timestamp": time.time(),
 | 
			
		||||
            "thread_count": len(thread_info),
 | 
			
		||||
            "thread_info": thread_info,
 | 
			
		||||
            "potential_deadlocks": deadlocks,
 | 
			
		||||
            "summary": {
 | 
			
		||||
                "responsive_threads": sum(1 for info in thread_info.values() if info.get("is_responsive", False)),
 | 
			
		||||
                "unresponsive_threads": sum(1 for info in thread_info.values() if not info.get("is_responsive", True)),
 | 
			
		||||
                "deadlock_count": len(deadlocks)
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error generating thread health report: {e}")
 | 
			
		||||
        raise HTTPException(status_code=500, detail=f"Thread health error: {str(e)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.get("/health/recovery")
 | 
			
		||||
async def recovery_status():
 | 
			
		||||
    """Recovery system status and history."""
 | 
			
		||||
    try:
 | 
			
		||||
        from core.monitoring.recovery import recovery_manager
 | 
			
		||||
 | 
			
		||||
        recovery_stats = recovery_manager.get_recovery_stats()
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            "timestamp": time.time(),
 | 
			
		||||
            "recovery_stats": recovery_stats,
 | 
			
		||||
            "summary": {
 | 
			
		||||
                "total_recoveries_last_hour": recovery_stats.get("total_recoveries_last_hour", 0),
 | 
			
		||||
                "components_with_recovery_state": len(recovery_stats.get("recovery_states", {})),
 | 
			
		||||
                "total_recovery_failures": sum(
 | 
			
		||||
                    state.get("failure_count", 0)
 | 
			
		||||
                    for state in recovery_stats.get("recovery_states", {}).values()
 | 
			
		||||
                ),
 | 
			
		||||
                "total_recovery_successes": sum(
 | 
			
		||||
                    state.get("success_count", 0)
 | 
			
		||||
                    for state in recovery_stats.get("recovery_states", {}).values()
 | 
			
		||||
                )
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error generating recovery status report: {e}")
 | 
			
		||||
        raise HTTPException(status_code=500, detail=f"Recovery status error: {str(e)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.post("/health/recovery/force/{component}")
 | 
			
		||||
async def force_recovery(component: str, action: str = "restart_stream"):
 | 
			
		||||
    """Force recovery action for a specific component."""
 | 
			
		||||
    try:
 | 
			
		||||
        from core.monitoring.recovery import recovery_manager, RecoveryAction
 | 
			
		||||
 | 
			
		||||
        # Validate action
 | 
			
		||||
        try:
 | 
			
		||||
            recovery_action = RecoveryAction(action)
 | 
			
		||||
        except ValueError:
 | 
			
		||||
            raise HTTPException(
 | 
			
		||||
                status_code=400,
 | 
			
		||||
                detail=f"Invalid recovery action: {action}. Valid actions: {[a.value for a in RecoveryAction]}"
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
        # Force recovery
 | 
			
		||||
        success = recovery_manager.force_recovery(component, recovery_action, "manual_api_request")
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            "timestamp": time.time(),
 | 
			
		||||
            "component": component,
 | 
			
		||||
            "action": action,
 | 
			
		||||
            "success": success,
 | 
			
		||||
            "message": f"Recovery {'successful' if success else 'failed'} for component {component}"
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
    except HTTPException:
 | 
			
		||||
        raise
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error forcing recovery for {component}: {e}")
 | 
			
		||||
        raise HTTPException(status_code=500, detail=f"Recovery error: {str(e)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@app.get("/health/metrics")
 | 
			
		||||
async def health_metrics():
 | 
			
		||||
    """Performance and health metrics in a format suitable for monitoring systems."""
 | 
			
		||||
    try:
 | 
			
		||||
        from core.monitoring.health import health_monitor
 | 
			
		||||
        from core.monitoring.stream_health import stream_health_tracker
 | 
			
		||||
        from core.streaming.buffers import shared_cache_buffer
 | 
			
		||||
 | 
			
		||||
        # Get basic metrics
 | 
			
		||||
        overall_health = health_monitor.get_health_status()
 | 
			
		||||
        stream_metrics = stream_health_tracker.get_all_metrics()
 | 
			
		||||
        buffer_stats = shared_cache_buffer.get_stats()
 | 
			
		||||
 | 
			
		||||
        # Format for monitoring systems (Prometheus-style)
 | 
			
		||||
        metrics = {
 | 
			
		||||
            "detector_worker_up": 1,
 | 
			
		||||
            "detector_worker_streams_total": len(stream_metrics),
 | 
			
		||||
            "detector_worker_subscriptions_total": len(worker_state.subscriptions),
 | 
			
		||||
            "detector_worker_sessions_total": len(worker_state.session_ids),
 | 
			
		||||
            "detector_worker_memory_mb": buffer_stats.get("total_memory_mb", 0),
 | 
			
		||||
            "detector_worker_health_status": {
 | 
			
		||||
                "healthy": 1,
 | 
			
		||||
                "warning": 2,
 | 
			
		||||
                "critical": 3,
 | 
			
		||||
                "unknown": 4
 | 
			
		||||
            }.get(overall_health.get("overall_status", "unknown"), 4)
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        # Add per-stream metrics
 | 
			
		||||
        for camera_id, stream_info in stream_metrics.items():
 | 
			
		||||
            safe_camera_id = camera_id.replace("-", "_").replace(".", "_")
 | 
			
		||||
            metrics.update({
 | 
			
		||||
                f"detector_worker_stream_frames_total{{camera=\"{safe_camera_id}\"}}": stream_info.get("frame_count", 0),
 | 
			
		||||
                f"detector_worker_stream_errors_total{{camera=\"{safe_camera_id}\"}}": stream_info.get("error_count", 0),
 | 
			
		||||
                f"detector_worker_stream_fps{{camera=\"{safe_camera_id}\"}}": stream_info.get("frames_per_second", 0),
 | 
			
		||||
                f"detector_worker_stream_frame_age_seconds{{camera=\"{safe_camera_id}\"}}": stream_info.get("last_frame_age_seconds") or 0
 | 
			
		||||
            })
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            "timestamp": time.time(),
 | 
			
		||||
            "metrics": metrics
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
    except Exception as e:
 | 
			
		||||
        logger.error(f"Error generating health metrics: {e}")
 | 
			
		||||
        raise HTTPException(status_code=500, detail=f"Metrics error: {str(e)}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
							
								
								
									
										18
									
								
								core/monitoring/__init__.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								core/monitoring/__init__.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,18 @@
 | 
			
		|||
"""
 | 
			
		||||
Comprehensive health monitoring system for detector worker.
 | 
			
		||||
Tracks stream health, thread responsiveness, and system performance.
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
from .health import HealthMonitor, HealthStatus, HealthCheck
 | 
			
		||||
from .stream_health import StreamHealthTracker
 | 
			
		||||
from .thread_health import ThreadHealthMonitor
 | 
			
		||||
from .recovery import RecoveryManager
 | 
			
		||||
 | 
			
		||||
__all__ = [
 | 
			
		||||
    'HealthMonitor',
 | 
			
		||||
    'HealthStatus',
 | 
			
		||||
    'HealthCheck',
 | 
			
		||||
    'StreamHealthTracker',
 | 
			
		||||
    'ThreadHealthMonitor',
 | 
			
		||||
    'RecoveryManager'
 | 
			
		||||
]
 | 
			
		||||
							
								
								
									
										456
									
								
								core/monitoring/health.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										456
									
								
								core/monitoring/health.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,456 @@
 | 
			
		|||
"""
 | 
			
		||||
Core health monitoring system for comprehensive stream and system health tracking.
 | 
			
		||||
Provides centralized health status, alerting, and recovery coordination.
 | 
			
		||||
"""
 | 
			
		||||
import time
 | 
			
		||||
import threading
 | 
			
		||||
import logging
 | 
			
		||||
import psutil
 | 
			
		||||
from typing import Dict, List, Optional, Any, Callable
 | 
			
		||||
from dataclasses import dataclass, field
 | 
			
		||||
from enum import Enum
 | 
			
		||||
from collections import defaultdict, deque
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class HealthStatus(Enum):
 | 
			
		||||
    """Health status levels."""
 | 
			
		||||
    HEALTHY = "healthy"
 | 
			
		||||
    WARNING = "warning"
 | 
			
		||||
    CRITICAL = "critical"
 | 
			
		||||
    UNKNOWN = "unknown"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class HealthCheck:
 | 
			
		||||
    """Individual health check result."""
 | 
			
		||||
    name: str
 | 
			
		||||
    status: HealthStatus
 | 
			
		||||
    message: str
 | 
			
		||||
    timestamp: float = field(default_factory=time.time)
 | 
			
		||||
    details: Dict[str, Any] = field(default_factory=dict)
 | 
			
		||||
    recovery_action: Optional[str] = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class HealthMetrics:
 | 
			
		||||
    """Health metrics for a component."""
 | 
			
		||||
    component_id: str
 | 
			
		||||
    last_update: float
 | 
			
		||||
    frame_count: int = 0
 | 
			
		||||
    error_count: int = 0
 | 
			
		||||
    warning_count: int = 0
 | 
			
		||||
    restart_count: int = 0
 | 
			
		||||
    avg_frame_interval: float = 0.0
 | 
			
		||||
    last_frame_time: Optional[float] = None
 | 
			
		||||
    thread_alive: bool = True
 | 
			
		||||
    connection_healthy: bool = True
 | 
			
		||||
    memory_usage_mb: float = 0.0
 | 
			
		||||
    cpu_usage_percent: float = 0.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class HealthMonitor:
 | 
			
		||||
    """Comprehensive health monitoring system."""
 | 
			
		||||
 | 
			
		||||
    def __init__(self, check_interval: float = 30.0):
 | 
			
		||||
        """
 | 
			
		||||
        Initialize health monitor.
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            check_interval: Interval between health checks in seconds
 | 
			
		||||
        """
 | 
			
		||||
        self.check_interval = check_interval
 | 
			
		||||
        self.running = False
 | 
			
		||||
        self.monitor_thread = None
 | 
			
		||||
        self._lock = threading.RLock()
 | 
			
		||||
 | 
			
		||||
        # Health data storage
 | 
			
		||||
        self.health_checks: Dict[str, HealthCheck] = {}
 | 
			
		||||
        self.metrics: Dict[str, HealthMetrics] = {}
 | 
			
		||||
        self.alert_history: deque = deque(maxlen=1000)
 | 
			
		||||
        self.recovery_actions: deque = deque(maxlen=500)
 | 
			
		||||
 | 
			
		||||
        # Thresholds (configurable)
 | 
			
		||||
        self.thresholds = {
 | 
			
		||||
            'frame_stale_warning_seconds': 120,    # 2 minutes
 | 
			
		||||
            'frame_stale_critical_seconds': 300,   # 5 minutes
 | 
			
		||||
            'thread_unresponsive_seconds': 60,     # 1 minute
 | 
			
		||||
            'memory_warning_mb': 500,              # 500MB per stream
 | 
			
		||||
            'memory_critical_mb': 1000,            # 1GB per stream
 | 
			
		||||
            'cpu_warning_percent': 80,             # 80% CPU
 | 
			
		||||
            'cpu_critical_percent': 95,            # 95% CPU
 | 
			
		||||
            'error_rate_warning': 0.1,             # 10% error rate
 | 
			
		||||
            'error_rate_critical': 0.3,            # 30% error rate
 | 
			
		||||
            'restart_threshold': 3                 # Max restarts per hour
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        # Health check functions
 | 
			
		||||
        self.health_checkers: List[Callable[[], List[HealthCheck]]] = []
 | 
			
		||||
        self.recovery_callbacks: Dict[str, Callable[[str, HealthCheck], bool]] = {}
 | 
			
		||||
 | 
			
		||||
        # System monitoring
 | 
			
		||||
        self.process = psutil.Process()
 | 
			
		||||
        self.system_start_time = time.time()
 | 
			
		||||
 | 
			
		||||
    def start(self):
 | 
			
		||||
        """Start health monitoring."""
 | 
			
		||||
        if self.running:
 | 
			
		||||
            logger.warning("Health monitor already running")
 | 
			
		||||
            return
 | 
			
		||||
 | 
			
		||||
        self.running = True
 | 
			
		||||
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
 | 
			
		||||
        self.monitor_thread.start()
 | 
			
		||||
        logger.info(f"Health monitor started (check interval: {self.check_interval}s)")
 | 
			
		||||
 | 
			
		||||
    def stop(self):
 | 
			
		||||
        """Stop health monitoring."""
 | 
			
		||||
        self.running = False
 | 
			
		||||
        if self.monitor_thread:
 | 
			
		||||
            self.monitor_thread.join(timeout=5.0)
 | 
			
		||||
        logger.info("Health monitor stopped")
 | 
			
		||||
 | 
			
		||||
    def register_health_checker(self, checker: Callable[[], List[HealthCheck]]):
 | 
			
		||||
        """Register a health check function."""
 | 
			
		||||
        self.health_checkers.append(checker)
 | 
			
		||||
        logger.debug(f"Registered health checker: {checker.__name__}")
 | 
			
		||||
 | 
			
		||||
    def register_recovery_callback(self, component: str, callback: Callable[[str, HealthCheck], bool]):
 | 
			
		||||
        """Register a recovery callback for a component."""
 | 
			
		||||
        self.recovery_callbacks[component] = callback
 | 
			
		||||
        logger.debug(f"Registered recovery callback for {component}")
 | 
			
		||||
 | 
			
		||||
    def update_metrics(self, component_id: str, **kwargs):
 | 
			
		||||
        """Update metrics for a component."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if component_id not in self.metrics:
 | 
			
		||||
                self.metrics[component_id] = HealthMetrics(
 | 
			
		||||
                    component_id=component_id,
 | 
			
		||||
                    last_update=time.time()
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            metrics = self.metrics[component_id]
 | 
			
		||||
            metrics.last_update = time.time()
 | 
			
		||||
 | 
			
		||||
            # Update provided metrics
 | 
			
		||||
            for key, value in kwargs.items():
 | 
			
		||||
                if hasattr(metrics, key):
 | 
			
		||||
                    setattr(metrics, key, value)
 | 
			
		||||
 | 
			
		||||
    def report_frame_received(self, component_id: str):
 | 
			
		||||
        """Report that a frame was received for a component."""
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if component_id not in self.metrics:
 | 
			
		||||
                self.metrics[component_id] = HealthMetrics(
 | 
			
		||||
                    component_id=component_id,
 | 
			
		||||
                    last_update=current_time
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            metrics = self.metrics[component_id]
 | 
			
		||||
 | 
			
		||||
            # Update frame metrics
 | 
			
		||||
            if metrics.last_frame_time:
 | 
			
		||||
                interval = current_time - metrics.last_frame_time
 | 
			
		||||
                # Moving average of frame intervals
 | 
			
		||||
                if metrics.avg_frame_interval == 0:
 | 
			
		||||
                    metrics.avg_frame_interval = interval
 | 
			
		||||
                else:
 | 
			
		||||
                    metrics.avg_frame_interval = (metrics.avg_frame_interval * 0.9) + (interval * 0.1)
 | 
			
		||||
 | 
			
		||||
            metrics.last_frame_time = current_time
 | 
			
		||||
            metrics.frame_count += 1
 | 
			
		||||
            metrics.last_update = current_time
 | 
			
		||||
 | 
			
		||||
    def report_error(self, component_id: str, error_type: str = "general"):
 | 
			
		||||
        """Report an error for a component."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if component_id not in self.metrics:
 | 
			
		||||
                self.metrics[component_id] = HealthMetrics(
 | 
			
		||||
                    component_id=component_id,
 | 
			
		||||
                    last_update=time.time()
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            self.metrics[component_id].error_count += 1
 | 
			
		||||
            self.metrics[component_id].last_update = time.time()
 | 
			
		||||
 | 
			
		||||
        logger.debug(f"Error reported for {component_id}: {error_type}")
 | 
			
		||||
 | 
			
		||||
    def report_warning(self, component_id: str, warning_type: str = "general"):
 | 
			
		||||
        """Report a warning for a component."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if component_id not in self.metrics:
 | 
			
		||||
                self.metrics[component_id] = HealthMetrics(
 | 
			
		||||
                    component_id=component_id,
 | 
			
		||||
                    last_update=time.time()
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            self.metrics[component_id].warning_count += 1
 | 
			
		||||
            self.metrics[component_id].last_update = time.time()
 | 
			
		||||
 | 
			
		||||
        logger.debug(f"Warning reported for {component_id}: {warning_type}")
 | 
			
		||||
 | 
			
		||||
    def report_restart(self, component_id: str):
 | 
			
		||||
        """Report that a component was restarted."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if component_id not in self.metrics:
 | 
			
		||||
                self.metrics[component_id] = HealthMetrics(
 | 
			
		||||
                    component_id=component_id,
 | 
			
		||||
                    last_update=time.time()
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
            self.metrics[component_id].restart_count += 1
 | 
			
		||||
            self.metrics[component_id].last_update = time.time()
 | 
			
		||||
 | 
			
		||||
        # Log recovery action
 | 
			
		||||
        recovery_action = {
 | 
			
		||||
            'timestamp': time.time(),
 | 
			
		||||
            'component': component_id,
 | 
			
		||||
            'action': 'restart',
 | 
			
		||||
            'reason': 'manual_restart'
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            self.recovery_actions.append(recovery_action)
 | 
			
		||||
 | 
			
		||||
        logger.info(f"Restart reported for {component_id}")
 | 
			
		||||
 | 
			
		||||
    def get_health_status(self, component_id: Optional[str] = None) -> Dict[str, Any]:
 | 
			
		||||
        """Get comprehensive health status."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if component_id:
 | 
			
		||||
                # Get health for specific component
 | 
			
		||||
                return self._get_component_health(component_id)
 | 
			
		||||
            else:
 | 
			
		||||
                # Get overall health status
 | 
			
		||||
                return self._get_overall_health()
 | 
			
		||||
 | 
			
		||||
    def _get_component_health(self, component_id: str) -> Dict[str, Any]:
 | 
			
		||||
        """Get health status for a specific component."""
 | 
			
		||||
        if component_id not in self.metrics:
 | 
			
		||||
            return {
 | 
			
		||||
                'component_id': component_id,
 | 
			
		||||
                'status': HealthStatus.UNKNOWN.value,
 | 
			
		||||
                'message': 'No metrics available',
 | 
			
		||||
                'metrics': {}
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
        metrics = self.metrics[component_id]
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        # Determine health status
 | 
			
		||||
        status = HealthStatus.HEALTHY
 | 
			
		||||
        issues = []
 | 
			
		||||
 | 
			
		||||
        # Check frame freshness
 | 
			
		||||
        if metrics.last_frame_time:
 | 
			
		||||
            frame_age = current_time - metrics.last_frame_time
 | 
			
		||||
            if frame_age > self.thresholds['frame_stale_critical_seconds']:
 | 
			
		||||
                status = HealthStatus.CRITICAL
 | 
			
		||||
                issues.append(f"Frames stale for {frame_age:.1f}s")
 | 
			
		||||
            elif frame_age > self.thresholds['frame_stale_warning_seconds']:
 | 
			
		||||
                if status == HealthStatus.HEALTHY:
 | 
			
		||||
                    status = HealthStatus.WARNING
 | 
			
		||||
                issues.append(f"Frames aging ({frame_age:.1f}s)")
 | 
			
		||||
 | 
			
		||||
        # Check error rates
 | 
			
		||||
        if metrics.frame_count > 0:
 | 
			
		||||
            error_rate = metrics.error_count / metrics.frame_count
 | 
			
		||||
            if error_rate > self.thresholds['error_rate_critical']:
 | 
			
		||||
                status = HealthStatus.CRITICAL
 | 
			
		||||
                issues.append(f"High error rate ({error_rate:.1%})")
 | 
			
		||||
            elif error_rate > self.thresholds['error_rate_warning']:
 | 
			
		||||
                if status == HealthStatus.HEALTHY:
 | 
			
		||||
                    status = HealthStatus.WARNING
 | 
			
		||||
                issues.append(f"Elevated error rate ({error_rate:.1%})")
 | 
			
		||||
 | 
			
		||||
        # Check restart frequency
 | 
			
		||||
        restart_rate = metrics.restart_count / max(1, (current_time - self.system_start_time) / 3600)
 | 
			
		||||
        if restart_rate > self.thresholds['restart_threshold']:
 | 
			
		||||
            status = HealthStatus.CRITICAL
 | 
			
		||||
            issues.append(f"Frequent restarts ({restart_rate:.1f}/hour)")
 | 
			
		||||
 | 
			
		||||
        # Check thread health
 | 
			
		||||
        if not metrics.thread_alive:
 | 
			
		||||
            status = HealthStatus.CRITICAL
 | 
			
		||||
            issues.append("Thread not alive")
 | 
			
		||||
 | 
			
		||||
        # Check connection health
 | 
			
		||||
        if not metrics.connection_healthy:
 | 
			
		||||
            if status == HealthStatus.HEALTHY:
 | 
			
		||||
                status = HealthStatus.WARNING
 | 
			
		||||
            issues.append("Connection unhealthy")
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            'component_id': component_id,
 | 
			
		||||
            'status': status.value,
 | 
			
		||||
            'message': '; '.join(issues) if issues else 'All checks passing',
 | 
			
		||||
            'metrics': {
 | 
			
		||||
                'frame_count': metrics.frame_count,
 | 
			
		||||
                'error_count': metrics.error_count,
 | 
			
		||||
                'warning_count': metrics.warning_count,
 | 
			
		||||
                'restart_count': metrics.restart_count,
 | 
			
		||||
                'avg_frame_interval': metrics.avg_frame_interval,
 | 
			
		||||
                'last_frame_age': current_time - metrics.last_frame_time if metrics.last_frame_time else None,
 | 
			
		||||
                'thread_alive': metrics.thread_alive,
 | 
			
		||||
                'connection_healthy': metrics.connection_healthy,
 | 
			
		||||
                'memory_usage_mb': metrics.memory_usage_mb,
 | 
			
		||||
                'cpu_usage_percent': metrics.cpu_usage_percent,
 | 
			
		||||
                'uptime_seconds': current_time - self.system_start_time
 | 
			
		||||
            },
 | 
			
		||||
            'last_update': metrics.last_update
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
    def _get_overall_health(self) -> Dict[str, Any]:
 | 
			
		||||
        """Get overall system health status."""
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
        components = {}
 | 
			
		||||
        overall_status = HealthStatus.HEALTHY
 | 
			
		||||
 | 
			
		||||
        # Get health for all components
 | 
			
		||||
        for component_id in self.metrics.keys():
 | 
			
		||||
            component_health = self._get_component_health(component_id)
 | 
			
		||||
            components[component_id] = component_health
 | 
			
		||||
 | 
			
		||||
            # Determine overall status
 | 
			
		||||
            component_status = HealthStatus(component_health['status'])
 | 
			
		||||
            if component_status == HealthStatus.CRITICAL:
 | 
			
		||||
                overall_status = HealthStatus.CRITICAL
 | 
			
		||||
            elif component_status == HealthStatus.WARNING and overall_status == HealthStatus.HEALTHY:
 | 
			
		||||
                overall_status = HealthStatus.WARNING
 | 
			
		||||
 | 
			
		||||
        # System metrics
 | 
			
		||||
        try:
 | 
			
		||||
            system_memory = self.process.memory_info()
 | 
			
		||||
            system_cpu = self.process.cpu_percent()
 | 
			
		||||
        except Exception:
 | 
			
		||||
            system_memory = None
 | 
			
		||||
            system_cpu = 0.0
 | 
			
		||||
 | 
			
		||||
        return {
 | 
			
		||||
            'overall_status': overall_status.value,
 | 
			
		||||
            'timestamp': current_time,
 | 
			
		||||
            'uptime_seconds': current_time - self.system_start_time,
 | 
			
		||||
            'total_components': len(self.metrics),
 | 
			
		||||
            'components': components,
 | 
			
		||||
            'system_metrics': {
 | 
			
		||||
                'memory_mb': system_memory.rss / (1024 * 1024) if system_memory else 0,
 | 
			
		||||
                'cpu_percent': system_cpu,
 | 
			
		||||
                'process_id': self.process.pid
 | 
			
		||||
            },
 | 
			
		||||
            'recent_alerts': list(self.alert_history)[-10:],  # Last 10 alerts
 | 
			
		||||
            'recent_recoveries': list(self.recovery_actions)[-10:]  # Last 10 recovery actions
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
    def _monitor_loop(self):
 | 
			
		||||
        """Main health monitoring loop."""
 | 
			
		||||
        logger.info("Health monitor loop started")
 | 
			
		||||
 | 
			
		||||
        while self.running:
 | 
			
		||||
            try:
 | 
			
		||||
                start_time = time.time()
 | 
			
		||||
 | 
			
		||||
                # Run all registered health checks
 | 
			
		||||
                all_checks = []
 | 
			
		||||
                for checker in self.health_checkers:
 | 
			
		||||
                    try:
 | 
			
		||||
                        checks = checker()
 | 
			
		||||
                        all_checks.extend(checks)
 | 
			
		||||
                    except Exception as e:
 | 
			
		||||
                        logger.error(f"Error in health checker {checker.__name__}: {e}")
 | 
			
		||||
 | 
			
		||||
                # Process health checks and trigger recovery if needed
 | 
			
		||||
                for check in all_checks:
 | 
			
		||||
                    self._process_health_check(check)
 | 
			
		||||
 | 
			
		||||
                # Update system metrics
 | 
			
		||||
                self._update_system_metrics()
 | 
			
		||||
 | 
			
		||||
                # Sleep until next check
 | 
			
		||||
                elapsed = time.time() - start_time
 | 
			
		||||
                sleep_time = max(0, self.check_interval - elapsed)
 | 
			
		||||
                if sleep_time > 0:
 | 
			
		||||
                    time.sleep(sleep_time)
 | 
			
		||||
 | 
			
		||||
            except Exception as e:
 | 
			
		||||
                logger.error(f"Error in health monitor loop: {e}")
 | 
			
		||||
                time.sleep(5.0)  # Fallback sleep
 | 
			
		||||
 | 
			
		||||
        logger.info("Health monitor loop ended")
 | 
			
		||||
 | 
			
		||||
    def _process_health_check(self, check: HealthCheck):
 | 
			
		||||
        """Process a health check result and trigger recovery if needed."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            # Store health check
 | 
			
		||||
            self.health_checks[check.name] = check
 | 
			
		||||
 | 
			
		||||
            # Log alerts for non-healthy status
 | 
			
		||||
            if check.status != HealthStatus.HEALTHY:
 | 
			
		||||
                alert = {
 | 
			
		||||
                    'timestamp': check.timestamp,
 | 
			
		||||
                    'component': check.name,
 | 
			
		||||
                    'status': check.status.value,
 | 
			
		||||
                    'message': check.message,
 | 
			
		||||
                    'details': check.details
 | 
			
		||||
                }
 | 
			
		||||
                self.alert_history.append(alert)
 | 
			
		||||
 | 
			
		||||
                logger.warning(f"Health alert [{check.status.value.upper()}] {check.name}: {check.message}")
 | 
			
		||||
 | 
			
		||||
                # Trigger recovery if critical and recovery action available
 | 
			
		||||
                if check.status == HealthStatus.CRITICAL and check.recovery_action:
 | 
			
		||||
                    self._trigger_recovery(check.name, check)
 | 
			
		||||
 | 
			
		||||
    def _trigger_recovery(self, component: str, check: HealthCheck):
 | 
			
		||||
        """Trigger recovery action for a component."""
 | 
			
		||||
        if component in self.recovery_callbacks:
 | 
			
		||||
            try:
 | 
			
		||||
                logger.info(f"Triggering recovery for {component}: {check.recovery_action}")
 | 
			
		||||
 | 
			
		||||
                success = self.recovery_callbacks[component](component, check)
 | 
			
		||||
 | 
			
		||||
                recovery_action = {
 | 
			
		||||
                    'timestamp': time.time(),
 | 
			
		||||
                    'component': component,
 | 
			
		||||
                    'action': check.recovery_action,
 | 
			
		||||
                    'reason': check.message,
 | 
			
		||||
                    'success': success
 | 
			
		||||
                }
 | 
			
		||||
 | 
			
		||||
                with self._lock:
 | 
			
		||||
                    self.recovery_actions.append(recovery_action)
 | 
			
		||||
 | 
			
		||||
                if success:
 | 
			
		||||
                    logger.info(f"Recovery successful for {component}")
 | 
			
		||||
                else:
 | 
			
		||||
                    logger.error(f"Recovery failed for {component}")
 | 
			
		||||
 | 
			
		||||
            except Exception as e:
 | 
			
		||||
                logger.error(f"Error in recovery callback for {component}: {e}")
 | 
			
		||||
 | 
			
		||||
    def _update_system_metrics(self):
 | 
			
		||||
        """Update system-level metrics."""
 | 
			
		||||
        try:
 | 
			
		||||
            # Update process metrics for all components
 | 
			
		||||
            current_time = time.time()
 | 
			
		||||
 | 
			
		||||
            with self._lock:
 | 
			
		||||
                for component_id, metrics in self.metrics.items():
 | 
			
		||||
                    # Update CPU and memory if available
 | 
			
		||||
                    try:
 | 
			
		||||
                        # This is a simplified approach - in practice you'd want
 | 
			
		||||
                        # per-thread or per-component resource tracking
 | 
			
		||||
                        metrics.cpu_usage_percent = self.process.cpu_percent() / len(self.metrics)
 | 
			
		||||
                        memory_info = self.process.memory_info()
 | 
			
		||||
                        metrics.memory_usage_mb = memory_info.rss / (1024 * 1024) / len(self.metrics)
 | 
			
		||||
                    except Exception:
 | 
			
		||||
                        pass
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(f"Error updating system metrics: {e}")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Global health monitor instance
 | 
			
		||||
health_monitor = HealthMonitor()
 | 
			
		||||
							
								
								
									
										385
									
								
								core/monitoring/recovery.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										385
									
								
								core/monitoring/recovery.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,385 @@
 | 
			
		|||
"""
 | 
			
		||||
Recovery manager for automatic handling of health issues.
 | 
			
		||||
Provides circuit breaker patterns, automatic restarts, and graceful degradation.
 | 
			
		||||
"""
 | 
			
		||||
import time
 | 
			
		||||
import logging
 | 
			
		||||
import threading
 | 
			
		||||
from typing import Dict, List, Optional, Any, Callable
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from enum import Enum
 | 
			
		||||
from collections import defaultdict, deque
 | 
			
		||||
 | 
			
		||||
from .health import HealthCheck, HealthStatus, health_monitor
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RecoveryAction(Enum):
 | 
			
		||||
    """Types of recovery actions."""
 | 
			
		||||
    RESTART_STREAM = "restart_stream"
 | 
			
		||||
    RESTART_THREAD = "restart_thread"
 | 
			
		||||
    CLEAR_BUFFER = "clear_buffer"
 | 
			
		||||
    RECONNECT = "reconnect"
 | 
			
		||||
    THROTTLE = "throttle"
 | 
			
		||||
    DISABLE = "disable"
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class RecoveryAttempt:
 | 
			
		||||
    """Record of a recovery attempt."""
 | 
			
		||||
    timestamp: float
 | 
			
		||||
    component: str
 | 
			
		||||
    action: RecoveryAction
 | 
			
		||||
    reason: str
 | 
			
		||||
    success: bool
 | 
			
		||||
    details: Dict[str, Any] = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class RecoveryState:
 | 
			
		||||
    """Recovery state for a component - simplified without circuit breaker."""
 | 
			
		||||
    failure_count: int = 0
 | 
			
		||||
    success_count: int = 0
 | 
			
		||||
    last_failure_time: Optional[float] = None
 | 
			
		||||
    last_success_time: Optional[float] = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class RecoveryManager:
 | 
			
		||||
    """Manages automatic recovery actions for health issues."""
 | 
			
		||||
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        self.recovery_handlers: Dict[str, Callable[[str, HealthCheck], bool]] = {}
 | 
			
		||||
        self.recovery_states: Dict[str, RecoveryState] = {}
 | 
			
		||||
        self.recovery_history: deque = deque(maxlen=1000)
 | 
			
		||||
        self._lock = threading.RLock()
 | 
			
		||||
 | 
			
		||||
        # Configuration - simplified without circuit breaker
 | 
			
		||||
        self.recovery_cooldown = 30          # 30 seconds between recovery attempts
 | 
			
		||||
        self.max_attempts_per_hour = 20      # Still limit to prevent spam, but much higher
 | 
			
		||||
 | 
			
		||||
        # Track recovery attempts per component
 | 
			
		||||
        self.recovery_attempts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=50))
 | 
			
		||||
 | 
			
		||||
        # Register with health monitor
 | 
			
		||||
        health_monitor.register_recovery_callback("stream", self._handle_stream_recovery)
 | 
			
		||||
        health_monitor.register_recovery_callback("thread", self._handle_thread_recovery)
 | 
			
		||||
        health_monitor.register_recovery_callback("buffer", self._handle_buffer_recovery)
 | 
			
		||||
 | 
			
		||||
    def register_recovery_handler(self, action: RecoveryAction, handler: Callable[[str, Dict[str, Any]], bool]):
 | 
			
		||||
        """
 | 
			
		||||
        Register a recovery handler for a specific action.
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            action: Type of recovery action
 | 
			
		||||
            handler: Function that performs the recovery
 | 
			
		||||
        """
 | 
			
		||||
        self.recovery_handlers[action.value] = handler
 | 
			
		||||
        logger.info(f"Registered recovery handler for {action.value}")
 | 
			
		||||
 | 
			
		||||
    def can_attempt_recovery(self, component: str) -> bool:
 | 
			
		||||
        """
 | 
			
		||||
        Check if recovery can be attempted for a component.
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            component: Component identifier
 | 
			
		||||
 | 
			
		||||
        Returns:
 | 
			
		||||
            True if recovery can be attempted (always allow with minimal throttling)
 | 
			
		||||
        """
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            current_time = time.time()
 | 
			
		||||
 | 
			
		||||
            # Check recovery attempt rate limiting (much more permissive)
 | 
			
		||||
            recent_attempts = [
 | 
			
		||||
                attempt for attempt in self.recovery_attempts[component]
 | 
			
		||||
                if current_time - attempt <= 3600  # Last hour
 | 
			
		||||
            ]
 | 
			
		||||
 | 
			
		||||
            # Only block if truly excessive attempts
 | 
			
		||||
            if len(recent_attempts) >= self.max_attempts_per_hour:
 | 
			
		||||
                logger.warning(f"Recovery rate limit exceeded for {component} "
 | 
			
		||||
                             f"({len(recent_attempts)} attempts in last hour)")
 | 
			
		||||
                return False
 | 
			
		||||
 | 
			
		||||
            # Check cooldown period (shorter cooldown)
 | 
			
		||||
            if recent_attempts:
 | 
			
		||||
                last_attempt = max(recent_attempts)
 | 
			
		||||
                if current_time - last_attempt < self.recovery_cooldown:
 | 
			
		||||
                    logger.debug(f"Recovery cooldown active for {component} "
 | 
			
		||||
                               f"(last attempt {current_time - last_attempt:.1f}s ago)")
 | 
			
		||||
                    return False
 | 
			
		||||
 | 
			
		||||
            return True
 | 
			
		||||
 | 
			
		||||
    def attempt_recovery(self, component: str, action: RecoveryAction, reason: str,
 | 
			
		||||
                        details: Optional[Dict[str, Any]] = None) -> bool:
 | 
			
		||||
        """
 | 
			
		||||
        Attempt recovery for a component.
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            component: Component identifier
 | 
			
		||||
            action: Recovery action to perform
 | 
			
		||||
            reason: Reason for recovery
 | 
			
		||||
            details: Additional details
 | 
			
		||||
 | 
			
		||||
        Returns:
 | 
			
		||||
            True if recovery was successful
 | 
			
		||||
        """
 | 
			
		||||
        if not self.can_attempt_recovery(component):
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        logger.info(f"Attempting recovery for {component}: {action.value} ({reason})")
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            # Record recovery attempt
 | 
			
		||||
            with self._lock:
 | 
			
		||||
                self.recovery_attempts[component].append(current_time)
 | 
			
		||||
 | 
			
		||||
            # Perform recovery action
 | 
			
		||||
            success = self._execute_recovery_action(component, action, details or {})
 | 
			
		||||
 | 
			
		||||
            # Record recovery result
 | 
			
		||||
            attempt = RecoveryAttempt(
 | 
			
		||||
                timestamp=current_time,
 | 
			
		||||
                component=component,
 | 
			
		||||
                action=action,
 | 
			
		||||
                reason=reason,
 | 
			
		||||
                success=success,
 | 
			
		||||
                details=details
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            with self._lock:
 | 
			
		||||
                self.recovery_history.append(attempt)
 | 
			
		||||
 | 
			
		||||
            # Update recovery state
 | 
			
		||||
            self._update_recovery_state(component, success)
 | 
			
		||||
 | 
			
		||||
            if success:
 | 
			
		||||
                logger.info(f"Recovery successful for {component}: {action.value}")
 | 
			
		||||
            else:
 | 
			
		||||
                logger.error(f"Recovery failed for {component}: {action.value}")
 | 
			
		||||
 | 
			
		||||
            return success
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(f"Error during recovery for {component}: {e}")
 | 
			
		||||
            self._update_recovery_state(component, False)
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
    def _execute_recovery_action(self, component: str, action: RecoveryAction,
 | 
			
		||||
                               details: Dict[str, Any]) -> bool:
 | 
			
		||||
        """Execute a specific recovery action."""
 | 
			
		||||
        handler_key = action.value
 | 
			
		||||
 | 
			
		||||
        if handler_key not in self.recovery_handlers:
 | 
			
		||||
            logger.error(f"No recovery handler registered for action: {handler_key}")
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            handler = self.recovery_handlers[handler_key]
 | 
			
		||||
            return handler(component, details)
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(f"Error executing recovery action {handler_key} for {component}: {e}")
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
    def _update_recovery_state(self, component: str, success: bool):
 | 
			
		||||
        """Update recovery state based on recovery result."""
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if component not in self.recovery_states:
 | 
			
		||||
                self.recovery_states[component] = RecoveryState()
 | 
			
		||||
 | 
			
		||||
            state = self.recovery_states[component]
 | 
			
		||||
 | 
			
		||||
            if success:
 | 
			
		||||
                state.success_count += 1
 | 
			
		||||
                state.last_success_time = current_time
 | 
			
		||||
                # Reset failure count on success
 | 
			
		||||
                state.failure_count = max(0, state.failure_count - 1)
 | 
			
		||||
                logger.debug(f"Recovery success for {component} (total successes: {state.success_count})")
 | 
			
		||||
            else:
 | 
			
		||||
                state.failure_count += 1
 | 
			
		||||
                state.last_failure_time = current_time
 | 
			
		||||
                logger.debug(f"Recovery failure for {component} (total failures: {state.failure_count})")
 | 
			
		||||
 | 
			
		||||
    def _handle_stream_recovery(self, component: str, health_check: HealthCheck) -> bool:
 | 
			
		||||
        """Handle recovery for stream-related issues."""
 | 
			
		||||
        if "frames" in health_check.name:
 | 
			
		||||
            # Frame-related issue - restart stream
 | 
			
		||||
            return self.attempt_recovery(
 | 
			
		||||
                component,
 | 
			
		||||
                RecoveryAction.RESTART_STREAM,
 | 
			
		||||
                health_check.message,
 | 
			
		||||
                health_check.details
 | 
			
		||||
            )
 | 
			
		||||
        elif "connection" in health_check.name:
 | 
			
		||||
            # Connection issue - reconnect
 | 
			
		||||
            return self.attempt_recovery(
 | 
			
		||||
                component,
 | 
			
		||||
                RecoveryAction.RECONNECT,
 | 
			
		||||
                health_check.message,
 | 
			
		||||
                health_check.details
 | 
			
		||||
            )
 | 
			
		||||
        elif "errors" in health_check.name:
 | 
			
		||||
            # High error rate - throttle or restart
 | 
			
		||||
            return self.attempt_recovery(
 | 
			
		||||
                component,
 | 
			
		||||
                RecoveryAction.THROTTLE,
 | 
			
		||||
                health_check.message,
 | 
			
		||||
                health_check.details
 | 
			
		||||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            # Generic stream issue - restart
 | 
			
		||||
            return self.attempt_recovery(
 | 
			
		||||
                component,
 | 
			
		||||
                RecoveryAction.RESTART_STREAM,
 | 
			
		||||
                health_check.message,
 | 
			
		||||
                health_check.details
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def _handle_thread_recovery(self, component: str, health_check: HealthCheck) -> bool:
 | 
			
		||||
        """Handle recovery for thread-related issues."""
 | 
			
		||||
        if "deadlock" in health_check.name:
 | 
			
		||||
            # Deadlock detected - restart thread
 | 
			
		||||
            return self.attempt_recovery(
 | 
			
		||||
                component,
 | 
			
		||||
                RecoveryAction.RESTART_THREAD,
 | 
			
		||||
                health_check.message,
 | 
			
		||||
                health_check.details
 | 
			
		||||
            )
 | 
			
		||||
        elif "responsive" in health_check.name:
 | 
			
		||||
            # Thread unresponsive - restart
 | 
			
		||||
            return self.attempt_recovery(
 | 
			
		||||
                component,
 | 
			
		||||
                RecoveryAction.RESTART_THREAD,
 | 
			
		||||
                health_check.message,
 | 
			
		||||
                health_check.details
 | 
			
		||||
            )
 | 
			
		||||
        else:
 | 
			
		||||
            # Generic thread issue - restart
 | 
			
		||||
            return self.attempt_recovery(
 | 
			
		||||
                component,
 | 
			
		||||
                RecoveryAction.RESTART_THREAD,
 | 
			
		||||
                health_check.message,
 | 
			
		||||
                health_check.details
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def _handle_buffer_recovery(self, component: str, health_check: HealthCheck) -> bool:
 | 
			
		||||
        """Handle recovery for buffer-related issues."""
 | 
			
		||||
        # Buffer issues - clear buffer
 | 
			
		||||
        return self.attempt_recovery(
 | 
			
		||||
            component,
 | 
			
		||||
            RecoveryAction.CLEAR_BUFFER,
 | 
			
		||||
            health_check.message,
 | 
			
		||||
            health_check.details
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    def get_recovery_stats(self) -> Dict[str, Any]:
 | 
			
		||||
        """Get recovery statistics."""
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            # Calculate stats from history
 | 
			
		||||
            recent_recoveries = [
 | 
			
		||||
                attempt for attempt in self.recovery_history
 | 
			
		||||
                if current_time - attempt.timestamp <= 3600  # Last hour
 | 
			
		||||
            ]
 | 
			
		||||
 | 
			
		||||
            stats_by_component = defaultdict(lambda: {
 | 
			
		||||
                'attempts': 0,
 | 
			
		||||
                'successes': 0,
 | 
			
		||||
                'failures': 0,
 | 
			
		||||
                'last_attempt': None,
 | 
			
		||||
                'last_success': None
 | 
			
		||||
            })
 | 
			
		||||
 | 
			
		||||
            for attempt in recent_recoveries:
 | 
			
		||||
                stats = stats_by_component[attempt.component]
 | 
			
		||||
                stats['attempts'] += 1
 | 
			
		||||
 | 
			
		||||
                if attempt.success:
 | 
			
		||||
                    stats['successes'] += 1
 | 
			
		||||
                    if not stats['last_success'] or attempt.timestamp > stats['last_success']:
 | 
			
		||||
                        stats['last_success'] = attempt.timestamp
 | 
			
		||||
                else:
 | 
			
		||||
                    stats['failures'] += 1
 | 
			
		||||
 | 
			
		||||
                if not stats['last_attempt'] or attempt.timestamp > stats['last_attempt']:
 | 
			
		||||
                    stats['last_attempt'] = attempt.timestamp
 | 
			
		||||
 | 
			
		||||
            return {
 | 
			
		||||
                'total_recoveries_last_hour': len(recent_recoveries),
 | 
			
		||||
                'recovery_by_component': dict(stats_by_component),
 | 
			
		||||
                'recovery_states': {
 | 
			
		||||
                    component: {
 | 
			
		||||
                        'failure_count': state.failure_count,
 | 
			
		||||
                        'success_count': state.success_count,
 | 
			
		||||
                        'last_failure_time': state.last_failure_time,
 | 
			
		||||
                        'last_success_time': state.last_success_time
 | 
			
		||||
                    }
 | 
			
		||||
                    for component, state in self.recovery_states.items()
 | 
			
		||||
                },
 | 
			
		||||
                'recent_history': [
 | 
			
		||||
                    {
 | 
			
		||||
                        'timestamp': attempt.timestamp,
 | 
			
		||||
                        'component': attempt.component,
 | 
			
		||||
                        'action': attempt.action.value,
 | 
			
		||||
                        'reason': attempt.reason,
 | 
			
		||||
                        'success': attempt.success
 | 
			
		||||
                    }
 | 
			
		||||
                    for attempt in list(self.recovery_history)[-10:]  # Last 10 attempts
 | 
			
		||||
                ]
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
    def force_recovery(self, component: str, action: RecoveryAction, reason: str = "manual") -> bool:
 | 
			
		||||
        """
 | 
			
		||||
        Force recovery for a component, bypassing rate limiting.
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            component: Component identifier
 | 
			
		||||
            action: Recovery action to perform
 | 
			
		||||
            reason: Reason for forced recovery
 | 
			
		||||
 | 
			
		||||
        Returns:
 | 
			
		||||
            True if recovery was successful
 | 
			
		||||
        """
 | 
			
		||||
        logger.info(f"Forcing recovery for {component}: {action.value} ({reason})")
 | 
			
		||||
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            # Execute recovery action directly
 | 
			
		||||
            success = self._execute_recovery_action(component, action, {})
 | 
			
		||||
 | 
			
		||||
            # Record forced recovery
 | 
			
		||||
            attempt = RecoveryAttempt(
 | 
			
		||||
                timestamp=current_time,
 | 
			
		||||
                component=component,
 | 
			
		||||
                action=action,
 | 
			
		||||
                reason=f"forced: {reason}",
 | 
			
		||||
                success=success,
 | 
			
		||||
                details={'forced': True}
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            with self._lock:
 | 
			
		||||
                self.recovery_history.append(attempt)
 | 
			
		||||
                self.recovery_attempts[component].append(current_time)
 | 
			
		||||
 | 
			
		||||
            # Update recovery state
 | 
			
		||||
            self._update_recovery_state(component, success)
 | 
			
		||||
 | 
			
		||||
            return success
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(f"Error during forced recovery for {component}: {e}")
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Global recovery manager instance
 | 
			
		||||
recovery_manager = RecoveryManager()
 | 
			
		||||
							
								
								
									
										351
									
								
								core/monitoring/stream_health.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										351
									
								
								core/monitoring/stream_health.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,351 @@
 | 
			
		|||
"""
 | 
			
		||||
Stream-specific health monitoring for video streams.
 | 
			
		||||
Tracks frame production, connection health, and stream-specific metrics.
 | 
			
		||||
"""
 | 
			
		||||
import time
 | 
			
		||||
import logging
 | 
			
		||||
import threading
 | 
			
		||||
import requests
 | 
			
		||||
from typing import Dict, Optional, List, Any
 | 
			
		||||
from collections import deque
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
 | 
			
		||||
from .health import HealthCheck, HealthStatus, health_monitor
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class StreamMetrics:
 | 
			
		||||
    """Metrics for an individual stream."""
 | 
			
		||||
    camera_id: str
 | 
			
		||||
    stream_type: str  # 'rtsp', 'http_snapshot'
 | 
			
		||||
    start_time: float
 | 
			
		||||
    last_frame_time: Optional[float] = None
 | 
			
		||||
    frame_count: int = 0
 | 
			
		||||
    error_count: int = 0
 | 
			
		||||
    reconnect_count: int = 0
 | 
			
		||||
    bytes_received: int = 0
 | 
			
		||||
    frames_per_second: float = 0.0
 | 
			
		||||
    connection_attempts: int = 0
 | 
			
		||||
    last_connection_test: Optional[float] = None
 | 
			
		||||
    connection_healthy: bool = True
 | 
			
		||||
    last_error: Optional[str] = None
 | 
			
		||||
    last_error_time: Optional[float] = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class StreamHealthTracker:
 | 
			
		||||
    """Tracks health for individual video streams."""
 | 
			
		||||
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        self.streams: Dict[str, StreamMetrics] = {}
 | 
			
		||||
        self._lock = threading.RLock()
 | 
			
		||||
 | 
			
		||||
        # Configuration
 | 
			
		||||
        self.connection_test_interval = 300  # Test connection every 5 minutes
 | 
			
		||||
        self.frame_timeout_warning = 120    # Warn if no frames for 2 minutes
 | 
			
		||||
        self.frame_timeout_critical = 300   # Critical if no frames for 5 minutes
 | 
			
		||||
        self.error_rate_threshold = 0.1     # 10% error rate threshold
 | 
			
		||||
 | 
			
		||||
        # Register with health monitor
 | 
			
		||||
        health_monitor.register_health_checker(self._perform_health_checks)
 | 
			
		||||
 | 
			
		||||
    def register_stream(self, camera_id: str, stream_type: str, source_url: Optional[str] = None):
 | 
			
		||||
        """Register a new stream for monitoring."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if camera_id not in self.streams:
 | 
			
		||||
                self.streams[camera_id] = StreamMetrics(
 | 
			
		||||
                    camera_id=camera_id,
 | 
			
		||||
                    stream_type=stream_type,
 | 
			
		||||
                    start_time=time.time()
 | 
			
		||||
                )
 | 
			
		||||
                logger.info(f"Registered stream for monitoring: {camera_id} ({stream_type})")
 | 
			
		||||
 | 
			
		||||
            # Update health monitor metrics
 | 
			
		||||
            health_monitor.update_metrics(
 | 
			
		||||
                camera_id,
 | 
			
		||||
                thread_alive=True,
 | 
			
		||||
                connection_healthy=True
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def unregister_stream(self, camera_id: str):
 | 
			
		||||
        """Unregister a stream from monitoring."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if camera_id in self.streams:
 | 
			
		||||
                del self.streams[camera_id]
 | 
			
		||||
                logger.info(f"Unregistered stream from monitoring: {camera_id}")
 | 
			
		||||
 | 
			
		||||
    def report_frame_received(self, camera_id: str, frame_size_bytes: int = 0):
 | 
			
		||||
        """Report that a frame was received."""
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if camera_id not in self.streams:
 | 
			
		||||
                logger.warning(f"Frame received for unregistered stream: {camera_id}")
 | 
			
		||||
                return
 | 
			
		||||
 | 
			
		||||
            stream = self.streams[camera_id]
 | 
			
		||||
 | 
			
		||||
            # Update frame metrics
 | 
			
		||||
            if stream.last_frame_time:
 | 
			
		||||
                interval = current_time - stream.last_frame_time
 | 
			
		||||
                # Calculate FPS as moving average
 | 
			
		||||
                if stream.frames_per_second == 0:
 | 
			
		||||
                    stream.frames_per_second = 1.0 / interval if interval > 0 else 0
 | 
			
		||||
                else:
 | 
			
		||||
                    new_fps = 1.0 / interval if interval > 0 else 0
 | 
			
		||||
                    stream.frames_per_second = (stream.frames_per_second * 0.9) + (new_fps * 0.1)
 | 
			
		||||
 | 
			
		||||
            stream.last_frame_time = current_time
 | 
			
		||||
            stream.frame_count += 1
 | 
			
		||||
            stream.bytes_received += frame_size_bytes
 | 
			
		||||
 | 
			
		||||
            # Report to health monitor
 | 
			
		||||
            health_monitor.report_frame_received(camera_id)
 | 
			
		||||
            health_monitor.update_metrics(
 | 
			
		||||
                camera_id,
 | 
			
		||||
                frame_count=stream.frame_count,
 | 
			
		||||
                avg_frame_interval=1.0 / stream.frames_per_second if stream.frames_per_second > 0 else 0,
 | 
			
		||||
                last_frame_time=current_time
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def report_error(self, camera_id: str, error_message: str):
 | 
			
		||||
        """Report an error for a stream."""
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if camera_id not in self.streams:
 | 
			
		||||
                logger.warning(f"Error reported for unregistered stream: {camera_id}")
 | 
			
		||||
                return
 | 
			
		||||
 | 
			
		||||
            stream = self.streams[camera_id]
 | 
			
		||||
            stream.error_count += 1
 | 
			
		||||
            stream.last_error = error_message
 | 
			
		||||
            stream.last_error_time = current_time
 | 
			
		||||
 | 
			
		||||
            # Report to health monitor
 | 
			
		||||
            health_monitor.report_error(camera_id, "stream_error")
 | 
			
		||||
            health_monitor.update_metrics(
 | 
			
		||||
                camera_id,
 | 
			
		||||
                error_count=stream.error_count
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            logger.debug(f"Error reported for stream {camera_id}: {error_message}")
 | 
			
		||||
 | 
			
		||||
    def report_reconnect(self, camera_id: str, reason: str = "unknown"):
 | 
			
		||||
        """Report that a stream reconnected."""
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if camera_id not in self.streams:
 | 
			
		||||
                logger.warning(f"Reconnect reported for unregistered stream: {camera_id}")
 | 
			
		||||
                return
 | 
			
		||||
 | 
			
		||||
            stream = self.streams[camera_id]
 | 
			
		||||
            stream.reconnect_count += 1
 | 
			
		||||
 | 
			
		||||
            # Report to health monitor
 | 
			
		||||
            health_monitor.report_restart(camera_id)
 | 
			
		||||
            health_monitor.update_metrics(
 | 
			
		||||
                camera_id,
 | 
			
		||||
                restart_count=stream.reconnect_count
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            logger.info(f"Reconnect reported for stream {camera_id}: {reason}")
 | 
			
		||||
 | 
			
		||||
    def report_connection_attempt(self, camera_id: str, success: bool):
 | 
			
		||||
        """Report a connection attempt."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if camera_id not in self.streams:
 | 
			
		||||
                return
 | 
			
		||||
 | 
			
		||||
            stream = self.streams[camera_id]
 | 
			
		||||
            stream.connection_attempts += 1
 | 
			
		||||
            stream.connection_healthy = success
 | 
			
		||||
 | 
			
		||||
            # Report to health monitor
 | 
			
		||||
            health_monitor.update_metrics(
 | 
			
		||||
                camera_id,
 | 
			
		||||
                connection_healthy=success
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
    def test_http_connection(self, camera_id: str, url: str) -> bool:
 | 
			
		||||
        """Test HTTP connection health for snapshot streams."""
 | 
			
		||||
        try:
 | 
			
		||||
            # Quick HEAD request to test connectivity
 | 
			
		||||
            response = requests.head(url, timeout=5, verify=False)
 | 
			
		||||
            success = response.status_code in [200, 404]  # 404 might be normal for some cameras
 | 
			
		||||
 | 
			
		||||
            self.report_connection_attempt(camera_id, success)
 | 
			
		||||
 | 
			
		||||
            if success:
 | 
			
		||||
                logger.debug(f"Connection test passed for {camera_id}")
 | 
			
		||||
            else:
 | 
			
		||||
                logger.warning(f"Connection test failed for {camera_id}: HTTP {response.status_code}")
 | 
			
		||||
 | 
			
		||||
            return success
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.warning(f"Connection test failed for {camera_id}: {e}")
 | 
			
		||||
            self.report_connection_attempt(camera_id, False)
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
    def get_stream_metrics(self, camera_id: str) -> Optional[Dict[str, Any]]:
 | 
			
		||||
        """Get metrics for a specific stream."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if camera_id not in self.streams:
 | 
			
		||||
                return None
 | 
			
		||||
 | 
			
		||||
            stream = self.streams[camera_id]
 | 
			
		||||
            current_time = time.time()
 | 
			
		||||
 | 
			
		||||
            # Calculate derived metrics
 | 
			
		||||
            uptime = current_time - stream.start_time
 | 
			
		||||
            frame_age = current_time - stream.last_frame_time if stream.last_frame_time else None
 | 
			
		||||
            error_rate = stream.error_count / max(1, stream.frame_count)
 | 
			
		||||
 | 
			
		||||
            return {
 | 
			
		||||
                'camera_id': camera_id,
 | 
			
		||||
                'stream_type': stream.stream_type,
 | 
			
		||||
                'uptime_seconds': uptime,
 | 
			
		||||
                'frame_count': stream.frame_count,
 | 
			
		||||
                'frames_per_second': stream.frames_per_second,
 | 
			
		||||
                'bytes_received': stream.bytes_received,
 | 
			
		||||
                'error_count': stream.error_count,
 | 
			
		||||
                'error_rate': error_rate,
 | 
			
		||||
                'reconnect_count': stream.reconnect_count,
 | 
			
		||||
                'connection_attempts': stream.connection_attempts,
 | 
			
		||||
                'connection_healthy': stream.connection_healthy,
 | 
			
		||||
                'last_frame_age_seconds': frame_age,
 | 
			
		||||
                'last_error': stream.last_error,
 | 
			
		||||
                'last_error_time': stream.last_error_time
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
    def get_all_metrics(self) -> Dict[str, Dict[str, Any]]:
 | 
			
		||||
        """Get metrics for all streams."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            return {
 | 
			
		||||
                camera_id: self.get_stream_metrics(camera_id)
 | 
			
		||||
                for camera_id in self.streams.keys()
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
    def _perform_health_checks(self) -> List[HealthCheck]:
 | 
			
		||||
        """Perform health checks for all streams."""
 | 
			
		||||
        checks = []
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            for camera_id, stream in self.streams.items():
 | 
			
		||||
                checks.extend(self._check_stream_health(camera_id, stream, current_time))
 | 
			
		||||
 | 
			
		||||
        return checks
 | 
			
		||||
 | 
			
		||||
    def _check_stream_health(self, camera_id: str, stream: StreamMetrics, current_time: float) -> List[HealthCheck]:
 | 
			
		||||
        """Perform health checks for a single stream."""
 | 
			
		||||
        checks = []
 | 
			
		||||
 | 
			
		||||
        # Check frame freshness
 | 
			
		||||
        if stream.last_frame_time:
 | 
			
		||||
            frame_age = current_time - stream.last_frame_time
 | 
			
		||||
 | 
			
		||||
            if frame_age > self.frame_timeout_critical:
 | 
			
		||||
                checks.append(HealthCheck(
 | 
			
		||||
                    name=f"stream_{camera_id}_frames",
 | 
			
		||||
                    status=HealthStatus.CRITICAL,
 | 
			
		||||
                    message=f"No frames for {frame_age:.1f}s (critical threshold: {self.frame_timeout_critical}s)",
 | 
			
		||||
                    details={
 | 
			
		||||
                        'frame_age': frame_age,
 | 
			
		||||
                        'threshold': self.frame_timeout_critical,
 | 
			
		||||
                        'last_frame_time': stream.last_frame_time
 | 
			
		||||
                    },
 | 
			
		||||
                    recovery_action="restart_stream"
 | 
			
		||||
                ))
 | 
			
		||||
            elif frame_age > self.frame_timeout_warning:
 | 
			
		||||
                checks.append(HealthCheck(
 | 
			
		||||
                    name=f"stream_{camera_id}_frames",
 | 
			
		||||
                    status=HealthStatus.WARNING,
 | 
			
		||||
                    message=f"Frames aging: {frame_age:.1f}s (warning threshold: {self.frame_timeout_warning}s)",
 | 
			
		||||
                    details={
 | 
			
		||||
                        'frame_age': frame_age,
 | 
			
		||||
                        'threshold': self.frame_timeout_warning,
 | 
			
		||||
                        'last_frame_time': stream.last_frame_time
 | 
			
		||||
                    }
 | 
			
		||||
                ))
 | 
			
		||||
        else:
 | 
			
		||||
            # No frames received yet
 | 
			
		||||
            startup_time = current_time - stream.start_time
 | 
			
		||||
            if startup_time > 60:  # Allow 1 minute for initial connection
 | 
			
		||||
                checks.append(HealthCheck(
 | 
			
		||||
                    name=f"stream_{camera_id}_startup",
 | 
			
		||||
                    status=HealthStatus.CRITICAL,
 | 
			
		||||
                    message=f"No frames received since startup {startup_time:.1f}s ago",
 | 
			
		||||
                    details={
 | 
			
		||||
                        'startup_time': startup_time,
 | 
			
		||||
                        'start_time': stream.start_time
 | 
			
		||||
                    },
 | 
			
		||||
                    recovery_action="restart_stream"
 | 
			
		||||
                ))
 | 
			
		||||
 | 
			
		||||
        # Check error rate
 | 
			
		||||
        if stream.frame_count > 10:  # Need sufficient samples
 | 
			
		||||
            error_rate = stream.error_count / stream.frame_count
 | 
			
		||||
            if error_rate > self.error_rate_threshold:
 | 
			
		||||
                checks.append(HealthCheck(
 | 
			
		||||
                    name=f"stream_{camera_id}_errors",
 | 
			
		||||
                    status=HealthStatus.WARNING,
 | 
			
		||||
                    message=f"High error rate: {error_rate:.1%} ({stream.error_count}/{stream.frame_count})",
 | 
			
		||||
                    details={
 | 
			
		||||
                        'error_rate': error_rate,
 | 
			
		||||
                        'error_count': stream.error_count,
 | 
			
		||||
                        'frame_count': stream.frame_count,
 | 
			
		||||
                        'last_error': stream.last_error
 | 
			
		||||
                    }
 | 
			
		||||
                ))
 | 
			
		||||
 | 
			
		||||
        # Check connection health
 | 
			
		||||
        if not stream.connection_healthy:
 | 
			
		||||
            checks.append(HealthCheck(
 | 
			
		||||
                name=f"stream_{camera_id}_connection",
 | 
			
		||||
                status=HealthStatus.WARNING,
 | 
			
		||||
                message="Connection unhealthy (last test failed)",
 | 
			
		||||
                details={
 | 
			
		||||
                    'connection_attempts': stream.connection_attempts,
 | 
			
		||||
                    'last_connection_test': stream.last_connection_test
 | 
			
		||||
                }
 | 
			
		||||
            ))
 | 
			
		||||
 | 
			
		||||
        # Check excessive reconnects
 | 
			
		||||
        uptime_hours = (current_time - stream.start_time) / 3600
 | 
			
		||||
        if uptime_hours > 1 and stream.reconnect_count > 5:  # More than 5 reconnects per hour
 | 
			
		||||
            reconnect_rate = stream.reconnect_count / uptime_hours
 | 
			
		||||
            checks.append(HealthCheck(
 | 
			
		||||
                name=f"stream_{camera_id}_stability",
 | 
			
		||||
                status=HealthStatus.WARNING,
 | 
			
		||||
                message=f"Frequent reconnects: {reconnect_rate:.1f}/hour ({stream.reconnect_count} total)",
 | 
			
		||||
                details={
 | 
			
		||||
                    'reconnect_rate': reconnect_rate,
 | 
			
		||||
                    'reconnect_count': stream.reconnect_count,
 | 
			
		||||
                    'uptime_hours': uptime_hours
 | 
			
		||||
                }
 | 
			
		||||
            ))
 | 
			
		||||
 | 
			
		||||
        # Check frame rate health
 | 
			
		||||
        if stream.last_frame_time and stream.frames_per_second > 0:
 | 
			
		||||
            expected_fps = 6.0  # Expected FPS for streams
 | 
			
		||||
            if stream.frames_per_second < expected_fps * 0.5:  # Less than 50% of expected
 | 
			
		||||
                checks.append(HealthCheck(
 | 
			
		||||
                    name=f"stream_{camera_id}_framerate",
 | 
			
		||||
                    status=HealthStatus.WARNING,
 | 
			
		||||
                    message=f"Low frame rate: {stream.frames_per_second:.1f} fps (expected: ~{expected_fps} fps)",
 | 
			
		||||
                    details={
 | 
			
		||||
                        'current_fps': stream.frames_per_second,
 | 
			
		||||
                        'expected_fps': expected_fps
 | 
			
		||||
                    }
 | 
			
		||||
                ))
 | 
			
		||||
 | 
			
		||||
        return checks
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Global stream health tracker instance
 | 
			
		||||
stream_health_tracker = StreamHealthTracker()
 | 
			
		||||
							
								
								
									
										381
									
								
								core/monitoring/thread_health.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										381
									
								
								core/monitoring/thread_health.py
									
										
									
									
									
										Normal file
									
								
							| 
						 | 
				
			
			@ -0,0 +1,381 @@
 | 
			
		|||
"""
 | 
			
		||||
Thread health monitoring for detecting unresponsive and deadlocked threads.
 | 
			
		||||
Provides thread liveness detection and responsiveness testing.
 | 
			
		||||
"""
 | 
			
		||||
import time
 | 
			
		||||
import threading
 | 
			
		||||
import logging
 | 
			
		||||
import signal
 | 
			
		||||
import traceback
 | 
			
		||||
from typing import Dict, List, Optional, Any, Callable
 | 
			
		||||
from dataclasses import dataclass
 | 
			
		||||
from collections import defaultdict
 | 
			
		||||
 | 
			
		||||
from .health import HealthCheck, HealthStatus, health_monitor
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@dataclass
 | 
			
		||||
class ThreadInfo:
 | 
			
		||||
    """Information about a monitored thread."""
 | 
			
		||||
    thread_id: int
 | 
			
		||||
    thread_name: str
 | 
			
		||||
    start_time: float
 | 
			
		||||
    last_heartbeat: float
 | 
			
		||||
    heartbeat_count: int = 0
 | 
			
		||||
    is_responsive: bool = True
 | 
			
		||||
    last_activity: Optional[str] = None
 | 
			
		||||
    stack_traces: List[str] = None
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class ThreadHealthMonitor:
 | 
			
		||||
    """Monitors thread health and responsiveness."""
 | 
			
		||||
 | 
			
		||||
    def __init__(self):
 | 
			
		||||
        self.monitored_threads: Dict[int, ThreadInfo] = {}
 | 
			
		||||
        self.heartbeat_callbacks: Dict[int, Callable[[], bool]] = {}
 | 
			
		||||
        self._lock = threading.RLock()
 | 
			
		||||
 | 
			
		||||
        # Configuration
 | 
			
		||||
        self.heartbeat_timeout = 60.0      # 1 minute without heartbeat = unresponsive
 | 
			
		||||
        self.responsiveness_test_interval = 30.0  # Test responsiveness every 30 seconds
 | 
			
		||||
        self.stack_trace_count = 5         # Keep last 5 stack traces for analysis
 | 
			
		||||
 | 
			
		||||
        # Register with health monitor
 | 
			
		||||
        health_monitor.register_health_checker(self._perform_health_checks)
 | 
			
		||||
 | 
			
		||||
        # Enable periodic responsiveness testing
 | 
			
		||||
        self.test_thread = threading.Thread(target=self._responsiveness_test_loop, daemon=True)
 | 
			
		||||
        self.test_thread.start()
 | 
			
		||||
 | 
			
		||||
    def register_thread(self, thread: threading.Thread, heartbeat_callback: Optional[Callable[[], bool]] = None):
 | 
			
		||||
        """
 | 
			
		||||
        Register a thread for monitoring.
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            thread: Thread to monitor
 | 
			
		||||
            heartbeat_callback: Optional callback to test thread responsiveness
 | 
			
		||||
        """
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            thread_info = ThreadInfo(
 | 
			
		||||
                thread_id=thread.ident,
 | 
			
		||||
                thread_name=thread.name,
 | 
			
		||||
                start_time=time.time(),
 | 
			
		||||
                last_heartbeat=time.time()
 | 
			
		||||
            )
 | 
			
		||||
 | 
			
		||||
            self.monitored_threads[thread.ident] = thread_info
 | 
			
		||||
 | 
			
		||||
            if heartbeat_callback:
 | 
			
		||||
                self.heartbeat_callbacks[thread.ident] = heartbeat_callback
 | 
			
		||||
 | 
			
		||||
            logger.info(f"Registered thread for monitoring: {thread.name} (ID: {thread.ident})")
 | 
			
		||||
 | 
			
		||||
    def unregister_thread(self, thread_id: int):
 | 
			
		||||
        """Unregister a thread from monitoring."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if thread_id in self.monitored_threads:
 | 
			
		||||
                thread_name = self.monitored_threads[thread_id].thread_name
 | 
			
		||||
                del self.monitored_threads[thread_id]
 | 
			
		||||
 | 
			
		||||
                if thread_id in self.heartbeat_callbacks:
 | 
			
		||||
                    del self.heartbeat_callbacks[thread_id]
 | 
			
		||||
 | 
			
		||||
                logger.info(f"Unregistered thread from monitoring: {thread_name} (ID: {thread_id})")
 | 
			
		||||
 | 
			
		||||
    def heartbeat(self, thread_id: Optional[int] = None, activity: Optional[str] = None):
 | 
			
		||||
        """
 | 
			
		||||
        Report thread heartbeat.
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            thread_id: Thread ID (uses current thread if None)
 | 
			
		||||
            activity: Description of current activity
 | 
			
		||||
        """
 | 
			
		||||
        if thread_id is None:
 | 
			
		||||
            thread_id = threading.current_thread().ident
 | 
			
		||||
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if thread_id in self.monitored_threads:
 | 
			
		||||
                thread_info = self.monitored_threads[thread_id]
 | 
			
		||||
                thread_info.last_heartbeat = current_time
 | 
			
		||||
                thread_info.heartbeat_count += 1
 | 
			
		||||
                thread_info.is_responsive = True
 | 
			
		||||
 | 
			
		||||
                if activity:
 | 
			
		||||
                    thread_info.last_activity = activity
 | 
			
		||||
 | 
			
		||||
                # Report to health monitor
 | 
			
		||||
                health_monitor.update_metrics(
 | 
			
		||||
                    f"thread_{thread_info.thread_name}",
 | 
			
		||||
                    thread_alive=True,
 | 
			
		||||
                    last_frame_time=current_time
 | 
			
		||||
                )
 | 
			
		||||
 | 
			
		||||
    def get_thread_info(self, thread_id: int) -> Optional[Dict[str, Any]]:
 | 
			
		||||
        """Get information about a monitored thread."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            if thread_id not in self.monitored_threads:
 | 
			
		||||
                return None
 | 
			
		||||
 | 
			
		||||
            thread_info = self.monitored_threads[thread_id]
 | 
			
		||||
            current_time = time.time()
 | 
			
		||||
 | 
			
		||||
            return {
 | 
			
		||||
                'thread_id': thread_id,
 | 
			
		||||
                'thread_name': thread_info.thread_name,
 | 
			
		||||
                'uptime_seconds': current_time - thread_info.start_time,
 | 
			
		||||
                'last_heartbeat_age': current_time - thread_info.last_heartbeat,
 | 
			
		||||
                'heartbeat_count': thread_info.heartbeat_count,
 | 
			
		||||
                'is_responsive': thread_info.is_responsive,
 | 
			
		||||
                'last_activity': thread_info.last_activity,
 | 
			
		||||
                'stack_traces': thread_info.stack_traces or []
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
    def get_all_thread_info(self) -> Dict[int, Dict[str, Any]]:
 | 
			
		||||
        """Get information about all monitored threads."""
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            return {
 | 
			
		||||
                thread_id: self.get_thread_info(thread_id)
 | 
			
		||||
                for thread_id in self.monitored_threads.keys()
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
    def test_thread_responsiveness(self, thread_id: int) -> bool:
 | 
			
		||||
        """
 | 
			
		||||
        Test if a thread is responsive by calling its heartbeat callback.
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            thread_id: ID of thread to test
 | 
			
		||||
 | 
			
		||||
        Returns:
 | 
			
		||||
            True if thread responds within timeout
 | 
			
		||||
        """
 | 
			
		||||
        if thread_id not in self.heartbeat_callbacks:
 | 
			
		||||
            return True  # Can't test if no callback provided
 | 
			
		||||
 | 
			
		||||
        try:
 | 
			
		||||
            # Call the heartbeat callback with a timeout
 | 
			
		||||
            callback = self.heartbeat_callbacks[thread_id]
 | 
			
		||||
 | 
			
		||||
            # This is a simple approach - in practice you might want to use
 | 
			
		||||
            # threading.Timer or asyncio for more sophisticated timeout handling
 | 
			
		||||
            start_time = time.time()
 | 
			
		||||
            result = callback()
 | 
			
		||||
            response_time = time.time() - start_time
 | 
			
		||||
 | 
			
		||||
            with self._lock:
 | 
			
		||||
                if thread_id in self.monitored_threads:
 | 
			
		||||
                    self.monitored_threads[thread_id].is_responsive = result
 | 
			
		||||
 | 
			
		||||
            if response_time > 5.0:  # Slow response
 | 
			
		||||
                logger.warning(f"Thread {thread_id} slow response: {response_time:.1f}s")
 | 
			
		||||
 | 
			
		||||
            return result
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(f"Error testing thread {thread_id} responsiveness: {e}")
 | 
			
		||||
            with self._lock:
 | 
			
		||||
                if thread_id in self.monitored_threads:
 | 
			
		||||
                    self.monitored_threads[thread_id].is_responsive = False
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
    def capture_stack_trace(self, thread_id: int) -> Optional[str]:
 | 
			
		||||
        """
 | 
			
		||||
        Capture stack trace for a thread.
 | 
			
		||||
 | 
			
		||||
        Args:
 | 
			
		||||
            thread_id: ID of thread to capture
 | 
			
		||||
 | 
			
		||||
        Returns:
 | 
			
		||||
            Stack trace string or None if not available
 | 
			
		||||
        """
 | 
			
		||||
        try:
 | 
			
		||||
            # Get all frames for all threads
 | 
			
		||||
            frames = dict(threading._current_frames())
 | 
			
		||||
 | 
			
		||||
            if thread_id not in frames:
 | 
			
		||||
                return None
 | 
			
		||||
 | 
			
		||||
            # Format stack trace
 | 
			
		||||
            frame = frames[thread_id]
 | 
			
		||||
            stack_trace = ''.join(traceback.format_stack(frame))
 | 
			
		||||
 | 
			
		||||
            # Store in thread info
 | 
			
		||||
            with self._lock:
 | 
			
		||||
                if thread_id in self.monitored_threads:
 | 
			
		||||
                    thread_info = self.monitored_threads[thread_id]
 | 
			
		||||
                    if thread_info.stack_traces is None:
 | 
			
		||||
                        thread_info.stack_traces = []
 | 
			
		||||
 | 
			
		||||
                    thread_info.stack_traces.append(f"{time.time()}: {stack_trace}")
 | 
			
		||||
 | 
			
		||||
                    # Keep only last N stack traces
 | 
			
		||||
                    if len(thread_info.stack_traces) > self.stack_trace_count:
 | 
			
		||||
                        thread_info.stack_traces = thread_info.stack_traces[-self.stack_trace_count:]
 | 
			
		||||
 | 
			
		||||
            return stack_trace
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(f"Error capturing stack trace for thread {thread_id}: {e}")
 | 
			
		||||
            return None
 | 
			
		||||
 | 
			
		||||
    def detect_deadlocks(self) -> List[Dict[str, Any]]:
 | 
			
		||||
        """
 | 
			
		||||
        Attempt to detect potential deadlocks by analyzing thread states.
 | 
			
		||||
 | 
			
		||||
        Returns:
 | 
			
		||||
            List of potential deadlock scenarios
 | 
			
		||||
        """
 | 
			
		||||
        deadlocks = []
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            # Look for threads that haven't had heartbeats for a long time
 | 
			
		||||
            # and are supposedly alive
 | 
			
		||||
            for thread_id, thread_info in self.monitored_threads.items():
 | 
			
		||||
                heartbeat_age = current_time - thread_info.last_heartbeat
 | 
			
		||||
 | 
			
		||||
                if heartbeat_age > self.heartbeat_timeout * 2:  # Double the timeout
 | 
			
		||||
                    # Check if thread still exists
 | 
			
		||||
                    thread_exists = any(
 | 
			
		||||
                        t.ident == thread_id and t.is_alive()
 | 
			
		||||
                        for t in threading.enumerate()
 | 
			
		||||
                    )
 | 
			
		||||
 | 
			
		||||
                    if thread_exists:
 | 
			
		||||
                        # Thread exists but not responding - potential deadlock
 | 
			
		||||
                        stack_trace = self.capture_stack_trace(thread_id)
 | 
			
		||||
 | 
			
		||||
                        deadlock_info = {
 | 
			
		||||
                            'thread_id': thread_id,
 | 
			
		||||
                            'thread_name': thread_info.thread_name,
 | 
			
		||||
                            'heartbeat_age': heartbeat_age,
 | 
			
		||||
                            'last_activity': thread_info.last_activity,
 | 
			
		||||
                            'stack_trace': stack_trace,
 | 
			
		||||
                            'detection_time': current_time
 | 
			
		||||
                        }
 | 
			
		||||
 | 
			
		||||
                        deadlocks.append(deadlock_info)
 | 
			
		||||
                        logger.warning(f"Potential deadlock detected in thread {thread_info.thread_name}")
 | 
			
		||||
 | 
			
		||||
        return deadlocks
 | 
			
		||||
 | 
			
		||||
    def _responsiveness_test_loop(self):
 | 
			
		||||
        """Background loop to test thread responsiveness."""
 | 
			
		||||
        logger.info("Thread responsiveness testing started")
 | 
			
		||||
 | 
			
		||||
        while True:
 | 
			
		||||
            try:
 | 
			
		||||
                time.sleep(self.responsiveness_test_interval)
 | 
			
		||||
 | 
			
		||||
                with self._lock:
 | 
			
		||||
                    thread_ids = list(self.monitored_threads.keys())
 | 
			
		||||
 | 
			
		||||
                for thread_id in thread_ids:
 | 
			
		||||
                    try:
 | 
			
		||||
                        self.test_thread_responsiveness(thread_id)
 | 
			
		||||
                    except Exception as e:
 | 
			
		||||
                        logger.error(f"Error testing thread {thread_id}: {e}")
 | 
			
		||||
 | 
			
		||||
            except Exception as e:
 | 
			
		||||
                logger.error(f"Error in responsiveness test loop: {e}")
 | 
			
		||||
                time.sleep(10.0)  # Fallback sleep
 | 
			
		||||
 | 
			
		||||
    def _perform_health_checks(self) -> List[HealthCheck]:
 | 
			
		||||
        """Perform health checks for all monitored threads."""
 | 
			
		||||
        checks = []
 | 
			
		||||
        current_time = time.time()
 | 
			
		||||
 | 
			
		||||
        with self._lock:
 | 
			
		||||
            for thread_id, thread_info in self.monitored_threads.items():
 | 
			
		||||
                checks.extend(self._check_thread_health(thread_id, thread_info, current_time))
 | 
			
		||||
 | 
			
		||||
        # Check for deadlocks
 | 
			
		||||
        deadlocks = self.detect_deadlocks()
 | 
			
		||||
        for deadlock in deadlocks:
 | 
			
		||||
            checks.append(HealthCheck(
 | 
			
		||||
                name=f"deadlock_detection_{deadlock['thread_id']}",
 | 
			
		||||
                status=HealthStatus.CRITICAL,
 | 
			
		||||
                message=f"Potential deadlock in thread {deadlock['thread_name']} "
 | 
			
		||||
                       f"(unresponsive for {deadlock['heartbeat_age']:.1f}s)",
 | 
			
		||||
                details=deadlock,
 | 
			
		||||
                recovery_action="restart_thread"
 | 
			
		||||
            ))
 | 
			
		||||
 | 
			
		||||
        return checks
 | 
			
		||||
 | 
			
		||||
    def _check_thread_health(self, thread_id: int, thread_info: ThreadInfo, current_time: float) -> List[HealthCheck]:
 | 
			
		||||
        """Perform health checks for a single thread."""
 | 
			
		||||
        checks = []
 | 
			
		||||
 | 
			
		||||
        # Check if thread still exists
 | 
			
		||||
        thread_exists = any(
 | 
			
		||||
            t.ident == thread_id and t.is_alive()
 | 
			
		||||
            for t in threading.enumerate()
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
        if not thread_exists:
 | 
			
		||||
            checks.append(HealthCheck(
 | 
			
		||||
                name=f"thread_{thread_info.thread_name}_alive",
 | 
			
		||||
                status=HealthStatus.CRITICAL,
 | 
			
		||||
                message=f"Thread {thread_info.thread_name} is no longer alive",
 | 
			
		||||
                details={
 | 
			
		||||
                    'thread_id': thread_id,
 | 
			
		||||
                    'uptime': current_time - thread_info.start_time,
 | 
			
		||||
                    'last_heartbeat': thread_info.last_heartbeat
 | 
			
		||||
                },
 | 
			
		||||
                recovery_action="restart_thread"
 | 
			
		||||
            ))
 | 
			
		||||
            return checks
 | 
			
		||||
 | 
			
		||||
        # Check heartbeat freshness
 | 
			
		||||
        heartbeat_age = current_time - thread_info.last_heartbeat
 | 
			
		||||
 | 
			
		||||
        if heartbeat_age > self.heartbeat_timeout:
 | 
			
		||||
            checks.append(HealthCheck(
 | 
			
		||||
                name=f"thread_{thread_info.thread_name}_responsive",
 | 
			
		||||
                status=HealthStatus.CRITICAL,
 | 
			
		||||
                message=f"Thread {thread_info.thread_name} unresponsive for {heartbeat_age:.1f}s",
 | 
			
		||||
                details={
 | 
			
		||||
                    'thread_id': thread_id,
 | 
			
		||||
                    'heartbeat_age': heartbeat_age,
 | 
			
		||||
                    'heartbeat_count': thread_info.heartbeat_count,
 | 
			
		||||
                    'last_activity': thread_info.last_activity,
 | 
			
		||||
                    'is_responsive': thread_info.is_responsive
 | 
			
		||||
                },
 | 
			
		||||
                recovery_action="restart_thread"
 | 
			
		||||
            ))
 | 
			
		||||
        elif heartbeat_age > self.heartbeat_timeout * 0.5:  # Warning at 50% of timeout
 | 
			
		||||
            checks.append(HealthCheck(
 | 
			
		||||
                name=f"thread_{thread_info.thread_name}_responsive",
 | 
			
		||||
                status=HealthStatus.WARNING,
 | 
			
		||||
                message=f"Thread {thread_info.thread_name} slow heartbeat: {heartbeat_age:.1f}s",
 | 
			
		||||
                details={
 | 
			
		||||
                    'thread_id': thread_id,
 | 
			
		||||
                    'heartbeat_age': heartbeat_age,
 | 
			
		||||
                    'heartbeat_count': thread_info.heartbeat_count,
 | 
			
		||||
                    'last_activity': thread_info.last_activity,
 | 
			
		||||
                    'is_responsive': thread_info.is_responsive
 | 
			
		||||
                }
 | 
			
		||||
            ))
 | 
			
		||||
 | 
			
		||||
        # Check responsiveness test results
 | 
			
		||||
        if not thread_info.is_responsive:
 | 
			
		||||
            checks.append(HealthCheck(
 | 
			
		||||
                name=f"thread_{thread_info.thread_name}_callback",
 | 
			
		||||
                status=HealthStatus.WARNING,
 | 
			
		||||
                message=f"Thread {thread_info.thread_name} failed responsiveness test",
 | 
			
		||||
                details={
 | 
			
		||||
                    'thread_id': thread_id,
 | 
			
		||||
                    'last_activity': thread_info.last_activity
 | 
			
		||||
                }
 | 
			
		||||
            ))
 | 
			
		||||
 | 
			
		||||
        return checks
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Global thread health monitor instance
 | 
			
		||||
thread_health_monitor = ThreadHealthMonitor()
 | 
			
		||||
| 
						 | 
				
			
			@ -1,5 +1,6 @@
 | 
			
		|||
"""
 | 
			
		||||
FFmpeg RTSP stream reader using subprocess piping frames directly to buffer.
 | 
			
		||||
Enhanced with comprehensive health monitoring and automatic recovery.
 | 
			
		||||
"""
 | 
			
		||||
import cv2
 | 
			
		||||
import time
 | 
			
		||||
| 
						 | 
				
			
			@ -7,10 +8,13 @@ import threading
 | 
			
		|||
import numpy as np
 | 
			
		||||
import subprocess
 | 
			
		||||
import struct
 | 
			
		||||
from typing import Optional, Callable
 | 
			
		||||
from typing import Optional, Callable, Dict, Any
 | 
			
		||||
 | 
			
		||||
from .base import VideoReader
 | 
			
		||||
from .utils import log_success, log_warning, log_error, log_info
 | 
			
		||||
from ..monitoring.stream_health import stream_health_tracker
 | 
			
		||||
from ..monitoring.thread_health import thread_health_monitor
 | 
			
		||||
from ..monitoring.recovery import recovery_manager, RecoveryAction
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class FFmpegRTSPReader(VideoReader):
 | 
			
		||||
| 
						 | 
				
			
			@ -35,6 +39,21 @@ class FFmpegRTSPReader(VideoReader):
 | 
			
		|||
        self.first_start_timeout = 30.0  # 30s timeout on first start
 | 
			
		||||
        self.restart_timeout = 15.0      # 15s timeout after restart
 | 
			
		||||
 | 
			
		||||
        # Health monitoring setup
 | 
			
		||||
        self.last_heartbeat = time.time()
 | 
			
		||||
        self.consecutive_errors = 0
 | 
			
		||||
        self.ffmpeg_restart_count = 0
 | 
			
		||||
 | 
			
		||||
        # Register recovery handlers
 | 
			
		||||
        recovery_manager.register_recovery_handler(
 | 
			
		||||
            RecoveryAction.RESTART_STREAM,
 | 
			
		||||
            self._handle_restart_recovery
 | 
			
		||||
        )
 | 
			
		||||
        recovery_manager.register_recovery_handler(
 | 
			
		||||
            RecoveryAction.RECONNECT,
 | 
			
		||||
            self._handle_reconnect_recovery
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def is_running(self) -> bool:
 | 
			
		||||
        """Check if the reader is currently running."""
 | 
			
		||||
| 
						 | 
				
			
			@ -58,21 +77,35 @@ class FFmpegRTSPReader(VideoReader):
 | 
			
		|||
        self.stop_event.clear()
 | 
			
		||||
        self.thread = threading.Thread(target=self._read_frames, daemon=True)
 | 
			
		||||
        self.thread.start()
 | 
			
		||||
        log_success(self.camera_id, "Stream started")
 | 
			
		||||
 | 
			
		||||
        # Register with health monitoring
 | 
			
		||||
        stream_health_tracker.register_stream(self.camera_id, "rtsp_ffmpeg", self.rtsp_url)
 | 
			
		||||
        thread_health_monitor.register_thread(self.thread, self._heartbeat_callback)
 | 
			
		||||
 | 
			
		||||
        log_success(self.camera_id, "Stream started with health monitoring")
 | 
			
		||||
 | 
			
		||||
    def stop(self):
 | 
			
		||||
        """Stop the FFmpeg subprocess reader."""
 | 
			
		||||
        self.stop_event.set()
 | 
			
		||||
 | 
			
		||||
        # Unregister from health monitoring
 | 
			
		||||
        if self.thread:
 | 
			
		||||
            thread_health_monitor.unregister_thread(self.thread.ident)
 | 
			
		||||
 | 
			
		||||
        if self.process:
 | 
			
		||||
            self.process.terminate()
 | 
			
		||||
            try:
 | 
			
		||||
                self.process.wait(timeout=5)
 | 
			
		||||
            except subprocess.TimeoutExpired:
 | 
			
		||||
                self.process.kill()
 | 
			
		||||
 | 
			
		||||
        if self.thread:
 | 
			
		||||
            self.thread.join(timeout=5.0)
 | 
			
		||||
        if self.stderr_thread:
 | 
			
		||||
            self.stderr_thread.join(timeout=2.0)
 | 
			
		||||
 | 
			
		||||
        stream_health_tracker.unregister_stream(self.camera_id)
 | 
			
		||||
 | 
			
		||||
        log_info(self.camera_id, "Stream stopped")
 | 
			
		||||
 | 
			
		||||
    def _start_ffmpeg_process(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -249,6 +282,9 @@ class FFmpegRTSPReader(VideoReader):
 | 
			
		|||
 | 
			
		||||
        while not self.stop_event.is_set():
 | 
			
		||||
            try:
 | 
			
		||||
                # Send heartbeat for thread health monitoring
 | 
			
		||||
                self._send_heartbeat("reading_frames")
 | 
			
		||||
 | 
			
		||||
                # Check watchdog timeout if process is running
 | 
			
		||||
                if self.process and self.process.poll() is None:
 | 
			
		||||
                    if self._check_watchdog_timeout():
 | 
			
		||||
| 
						 | 
				
			
			@ -259,8 +295,17 @@ class FFmpegRTSPReader(VideoReader):
 | 
			
		|||
                if not self.process or self.process.poll() is not None:
 | 
			
		||||
                    if self.process and self.process.poll() is not None:
 | 
			
		||||
                        log_warning(self.camera_id, "Stream disconnected, reconnecting...")
 | 
			
		||||
                        stream_health_tracker.report_error(
 | 
			
		||||
                            self.camera_id,
 | 
			
		||||
                            "FFmpeg process disconnected"
 | 
			
		||||
                        )
 | 
			
		||||
 | 
			
		||||
                    if not self._start_ffmpeg_process():
 | 
			
		||||
                        self.consecutive_errors += 1
 | 
			
		||||
                        stream_health_tracker.report_error(
 | 
			
		||||
                            self.camera_id,
 | 
			
		||||
                            "Failed to start FFmpeg process"
 | 
			
		||||
                        )
 | 
			
		||||
                        time.sleep(5.0)
 | 
			
		||||
                        continue
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -275,9 +320,22 @@ class FFmpegRTSPReader(VideoReader):
 | 
			
		|||
                        # Update watchdog - we got a frame
 | 
			
		||||
                        self.last_frame_time = time.time()
 | 
			
		||||
 | 
			
		||||
                        # Reset error counter on successful frame
 | 
			
		||||
                        self.consecutive_errors = 0
 | 
			
		||||
 | 
			
		||||
                        # Report successful frame to health monitoring
 | 
			
		||||
                        frame_size = frame.nbytes
 | 
			
		||||
                        stream_health_tracker.report_frame_received(self.camera_id, frame_size)
 | 
			
		||||
 | 
			
		||||
                        # Call frame callback
 | 
			
		||||
                        if self.frame_callback:
 | 
			
		||||
                            self.frame_callback(self.camera_id, frame)
 | 
			
		||||
                            try:
 | 
			
		||||
                                self.frame_callback(self.camera_id, frame)
 | 
			
		||||
                            except Exception as e:
 | 
			
		||||
                                stream_health_tracker.report_error(
 | 
			
		||||
                                    self.camera_id,
 | 
			
		||||
                                    f"Frame callback error: {e}"
 | 
			
		||||
                                )
 | 
			
		||||
 | 
			
		||||
                        frame_count += 1
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -287,16 +345,85 @@ class FFmpegRTSPReader(VideoReader):
 | 
			
		|||
                            log_success(self.camera_id, f"{frame_count} frames captured ({frame.shape[1]}x{frame.shape[0]})")
 | 
			
		||||
                            last_log_time = current_time
 | 
			
		||||
 | 
			
		||||
                except Exception:
 | 
			
		||||
                except Exception as e:
 | 
			
		||||
                    # Process might have died, let it restart on next iteration
 | 
			
		||||
                    stream_health_tracker.report_error(
 | 
			
		||||
                        self.camera_id,
 | 
			
		||||
                        f"Frame reading error: {e}"
 | 
			
		||||
                    )
 | 
			
		||||
                    if self.process:
 | 
			
		||||
                        self.process.terminate()
 | 
			
		||||
                        self.process = None
 | 
			
		||||
                    time.sleep(1.0)
 | 
			
		||||
 | 
			
		||||
            except Exception:
 | 
			
		||||
            except Exception as e:
 | 
			
		||||
                stream_health_tracker.report_error(
 | 
			
		||||
                    self.camera_id,
 | 
			
		||||
                    f"Main loop error: {e}"
 | 
			
		||||
                )
 | 
			
		||||
                time.sleep(1.0)
 | 
			
		||||
 | 
			
		||||
        # Cleanup
 | 
			
		||||
        if self.process:
 | 
			
		||||
            self.process.terminate()
 | 
			
		||||
            self.process.terminate()
 | 
			
		||||
 | 
			
		||||
    # Health monitoring methods
 | 
			
		||||
    def _send_heartbeat(self, activity: str = "running"):
 | 
			
		||||
        """Send heartbeat to thread health monitor."""
 | 
			
		||||
        self.last_heartbeat = time.time()
 | 
			
		||||
        thread_health_monitor.heartbeat(activity=activity)
 | 
			
		||||
 | 
			
		||||
    def _heartbeat_callback(self) -> bool:
 | 
			
		||||
        """Heartbeat callback for thread responsiveness testing."""
 | 
			
		||||
        try:
 | 
			
		||||
            # Check if thread is responsive by checking recent heartbeat
 | 
			
		||||
            current_time = time.time()
 | 
			
		||||
            age = current_time - self.last_heartbeat
 | 
			
		||||
 | 
			
		||||
            # Thread is responsive if heartbeat is recent
 | 
			
		||||
            return age < 30.0  # 30 second responsiveness threshold
 | 
			
		||||
 | 
			
		||||
        except Exception:
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
    def _handle_restart_recovery(self, component: str, details: Dict[str, Any]) -> bool:
 | 
			
		||||
        """Handle restart recovery action."""
 | 
			
		||||
        try:
 | 
			
		||||
            log_info(self.camera_id, "Restarting FFmpeg RTSP reader for health recovery")
 | 
			
		||||
 | 
			
		||||
            # Stop current instance
 | 
			
		||||
            self.stop()
 | 
			
		||||
 | 
			
		||||
            # Small delay
 | 
			
		||||
            time.sleep(2.0)
 | 
			
		||||
 | 
			
		||||
            # Restart
 | 
			
		||||
            self.start()
 | 
			
		||||
 | 
			
		||||
            # Report successful restart
 | 
			
		||||
            stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_restart")
 | 
			
		||||
            self.ffmpeg_restart_count += 1
 | 
			
		||||
 | 
			
		||||
            return True
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            log_error(self.camera_id, f"Failed to restart FFmpeg RTSP reader: {e}")
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
    def _handle_reconnect_recovery(self, component: str, details: Dict[str, Any]) -> bool:
 | 
			
		||||
        """Handle reconnect recovery action."""
 | 
			
		||||
        try:
 | 
			
		||||
            log_info(self.camera_id, "Reconnecting FFmpeg RTSP reader for health recovery")
 | 
			
		||||
 | 
			
		||||
            # Force restart FFmpeg process
 | 
			
		||||
            self._restart_ffmpeg_process()
 | 
			
		||||
 | 
			
		||||
            # Reset error counters
 | 
			
		||||
            self.consecutive_errors = 0
 | 
			
		||||
            stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_reconnect")
 | 
			
		||||
 | 
			
		||||
            return True
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            log_error(self.camera_id, f"Failed to reconnect FFmpeg RTSP reader: {e}")
 | 
			
		||||
            return False
 | 
			
		||||
| 
						 | 
				
			
			@ -1,5 +1,6 @@
 | 
			
		|||
"""
 | 
			
		||||
HTTP snapshot reader optimized for 2560x1440 (2K) high quality images.
 | 
			
		||||
Enhanced with comprehensive health monitoring and automatic recovery.
 | 
			
		||||
"""
 | 
			
		||||
import cv2
 | 
			
		||||
import logging
 | 
			
		||||
| 
						 | 
				
			
			@ -7,10 +8,13 @@ import time
 | 
			
		|||
import threading
 | 
			
		||||
import requests
 | 
			
		||||
import numpy as np
 | 
			
		||||
from typing import Optional, Callable
 | 
			
		||||
from typing import Optional, Callable, Dict, Any
 | 
			
		||||
 | 
			
		||||
from .base import VideoReader
 | 
			
		||||
from .utils import log_success, log_warning, log_error, log_info
 | 
			
		||||
from ..monitoring.stream_health import stream_health_tracker
 | 
			
		||||
from ..monitoring.thread_health import thread_health_monitor
 | 
			
		||||
from ..monitoring.recovery import recovery_manager, RecoveryAction
 | 
			
		||||
 | 
			
		||||
logger = logging.getLogger(__name__)
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -30,6 +34,22 @@ class HTTPSnapshotReader(VideoReader):
 | 
			
		|||
        self.expected_height = 1440
 | 
			
		||||
        self.max_file_size = 10 * 1024 * 1024  # 10MB max for 2K image
 | 
			
		||||
 | 
			
		||||
        # Health monitoring setup
 | 
			
		||||
        self.last_heartbeat = time.time()
 | 
			
		||||
        self.consecutive_errors = 0
 | 
			
		||||
        self.connection_test_interval = 300  # Test connection every 5 minutes
 | 
			
		||||
        self.last_connection_test = None
 | 
			
		||||
 | 
			
		||||
        # Register recovery handlers
 | 
			
		||||
        recovery_manager.register_recovery_handler(
 | 
			
		||||
            RecoveryAction.RESTART_STREAM,
 | 
			
		||||
            self._handle_restart_recovery
 | 
			
		||||
        )
 | 
			
		||||
        recovery_manager.register_recovery_handler(
 | 
			
		||||
            RecoveryAction.RECONNECT,
 | 
			
		||||
            self._handle_reconnect_recovery
 | 
			
		||||
        )
 | 
			
		||||
 | 
			
		||||
    @property
 | 
			
		||||
    def is_running(self) -> bool:
 | 
			
		||||
        """Check if the reader is currently running."""
 | 
			
		||||
| 
						 | 
				
			
			@ -53,13 +73,24 @@ class HTTPSnapshotReader(VideoReader):
 | 
			
		|||
        self.stop_event.clear()
 | 
			
		||||
        self.thread = threading.Thread(target=self._read_snapshots, daemon=True)
 | 
			
		||||
        self.thread.start()
 | 
			
		||||
        logger.info(f"Started snapshot reader for camera {self.camera_id}")
 | 
			
		||||
 | 
			
		||||
        # Register with health monitoring
 | 
			
		||||
        stream_health_tracker.register_stream(self.camera_id, "http_snapshot", self.snapshot_url)
 | 
			
		||||
        thread_health_monitor.register_thread(self.thread, self._heartbeat_callback)
 | 
			
		||||
 | 
			
		||||
        logger.info(f"Started snapshot reader for camera {self.camera_id} with health monitoring")
 | 
			
		||||
 | 
			
		||||
    def stop(self):
 | 
			
		||||
        """Stop the snapshot reader thread."""
 | 
			
		||||
        self.stop_event.set()
 | 
			
		||||
 | 
			
		||||
        # Unregister from health monitoring
 | 
			
		||||
        if self.thread:
 | 
			
		||||
            thread_health_monitor.unregister_thread(self.thread.ident)
 | 
			
		||||
            self.thread.join(timeout=5.0)
 | 
			
		||||
 | 
			
		||||
        stream_health_tracker.unregister_stream(self.camera_id)
 | 
			
		||||
 | 
			
		||||
        logger.info(f"Stopped snapshot reader for camera {self.camera_id}")
 | 
			
		||||
 | 
			
		||||
    def _read_snapshots(self):
 | 
			
		||||
| 
						 | 
				
			
			@ -67,17 +98,29 @@ class HTTPSnapshotReader(VideoReader):
 | 
			
		|||
        retries = 0
 | 
			
		||||
        frame_count = 0
 | 
			
		||||
        last_log_time = time.time()
 | 
			
		||||
        last_connection_test = time.time()
 | 
			
		||||
        interval_seconds = self.interval_ms / 1000.0
 | 
			
		||||
 | 
			
		||||
        logger.info(f"Snapshot interval for camera {self.camera_id}: {interval_seconds}s")
 | 
			
		||||
 | 
			
		||||
        while not self.stop_event.is_set():
 | 
			
		||||
            try:
 | 
			
		||||
                # Send heartbeat for thread health monitoring
 | 
			
		||||
                self._send_heartbeat("fetching_snapshot")
 | 
			
		||||
 | 
			
		||||
                start_time = time.time()
 | 
			
		||||
                frame = self._fetch_snapshot()
 | 
			
		||||
 | 
			
		||||
                if frame is None:
 | 
			
		||||
                    retries += 1
 | 
			
		||||
                    self.consecutive_errors += 1
 | 
			
		||||
 | 
			
		||||
                    # Report error to health monitoring
 | 
			
		||||
                    stream_health_tracker.report_error(
 | 
			
		||||
                        self.camera_id,
 | 
			
		||||
                        f"Failed to fetch snapshot (retry {retries}/{self.max_retries})"
 | 
			
		||||
                    )
 | 
			
		||||
 | 
			
		||||
                    logger.warning(f"Failed to fetch snapshot for camera {self.camera_id}, retry {retries}/{self.max_retries}")
 | 
			
		||||
 | 
			
		||||
                    if self.max_retries != -1 and retries > self.max_retries:
 | 
			
		||||
| 
						 | 
				
			
			@ -90,21 +133,36 @@ class HTTPSnapshotReader(VideoReader):
 | 
			
		|||
                # Accept any valid image dimensions - don't force specific resolution
 | 
			
		||||
                if frame.shape[1] <= 0 or frame.shape[0] <= 0:
 | 
			
		||||
                    logger.warning(f"Camera {self.camera_id}: Invalid frame dimensions {frame.shape[1]}x{frame.shape[0]}")
 | 
			
		||||
                    stream_health_tracker.report_error(
 | 
			
		||||
                        self.camera_id,
 | 
			
		||||
                        f"Invalid frame dimensions: {frame.shape[1]}x{frame.shape[0]}"
 | 
			
		||||
                    )
 | 
			
		||||
                    continue
 | 
			
		||||
 | 
			
		||||
                # Reset retry counter on successful fetch
 | 
			
		||||
                retries = 0
 | 
			
		||||
                self.consecutive_errors = 0
 | 
			
		||||
                frame_count += 1
 | 
			
		||||
 | 
			
		||||
                # Report successful frame to health monitoring
 | 
			
		||||
                frame_size = frame.nbytes
 | 
			
		||||
                stream_health_tracker.report_frame_received(self.camera_id, frame_size)
 | 
			
		||||
 | 
			
		||||
                # Call frame callback
 | 
			
		||||
                if self.frame_callback:
 | 
			
		||||
                    try:
 | 
			
		||||
                        self.frame_callback(self.camera_id, frame)
 | 
			
		||||
                    except Exception as e:
 | 
			
		||||
                        logger.error(f"Camera {self.camera_id}: Frame callback error: {e}")
 | 
			
		||||
                        stream_health_tracker.report_error(self.camera_id, f"Frame callback error: {e}")
 | 
			
		||||
 | 
			
		||||
                # Periodic connection health test
 | 
			
		||||
                current_time = time.time()
 | 
			
		||||
                if current_time - last_connection_test >= self.connection_test_interval:
 | 
			
		||||
                    self._test_connection_health()
 | 
			
		||||
                    last_connection_test = current_time
 | 
			
		||||
 | 
			
		||||
                # Log progress every 30 seconds
 | 
			
		||||
                current_time = time.time()
 | 
			
		||||
                if current_time - last_log_time >= 30:
 | 
			
		||||
                    logger.info(f"Camera {self.camera_id}: {frame_count} snapshots processed")
 | 
			
		||||
                    last_log_time = current_time
 | 
			
		||||
| 
						 | 
				
			
			@ -117,6 +175,7 @@ class HTTPSnapshotReader(VideoReader):
 | 
			
		|||
 | 
			
		||||
            except Exception as e:
 | 
			
		||||
                logger.error(f"Error in snapshot loop for camera {self.camera_id}: {e}")
 | 
			
		||||
                stream_health_tracker.report_error(self.camera_id, f"Snapshot loop error: {e}")
 | 
			
		||||
                retries += 1
 | 
			
		||||
                if self.max_retries != -1 and retries > self.max_retries:
 | 
			
		||||
                    break
 | 
			
		||||
| 
						 | 
				
			
			@ -246,4 +305,74 @@ class HTTPSnapshotReader(VideoReader):
 | 
			
		|||
            right = target_width - new_width - left
 | 
			
		||||
            resized = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=[0, 0, 0])
 | 
			
		||||
 | 
			
		||||
        return resized
 | 
			
		||||
        return resized
 | 
			
		||||
 | 
			
		||||
    # Health monitoring methods
 | 
			
		||||
    def _send_heartbeat(self, activity: str = "running"):
 | 
			
		||||
        """Send heartbeat to thread health monitor."""
 | 
			
		||||
        self.last_heartbeat = time.time()
 | 
			
		||||
        thread_health_monitor.heartbeat(activity=activity)
 | 
			
		||||
 | 
			
		||||
    def _heartbeat_callback(self) -> bool:
 | 
			
		||||
        """Heartbeat callback for thread responsiveness testing."""
 | 
			
		||||
        try:
 | 
			
		||||
            # Check if thread is responsive by checking recent heartbeat
 | 
			
		||||
            current_time = time.time()
 | 
			
		||||
            age = current_time - self.last_heartbeat
 | 
			
		||||
 | 
			
		||||
            # Thread is responsive if heartbeat is recent
 | 
			
		||||
            return age < 30.0  # 30 second responsiveness threshold
 | 
			
		||||
 | 
			
		||||
        except Exception:
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
    def _test_connection_health(self):
 | 
			
		||||
        """Test HTTP connection health."""
 | 
			
		||||
        try:
 | 
			
		||||
            stream_health_tracker.test_http_connection(self.camera_id, self.snapshot_url)
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(f"Error testing connection health for {self.camera_id}: {e}")
 | 
			
		||||
 | 
			
		||||
    def _handle_restart_recovery(self, component: str, details: Dict[str, Any]) -> bool:
 | 
			
		||||
        """Handle restart recovery action."""
 | 
			
		||||
        try:
 | 
			
		||||
            logger.info(f"Restarting HTTP snapshot reader for {self.camera_id}")
 | 
			
		||||
 | 
			
		||||
            # Stop current instance
 | 
			
		||||
            self.stop()
 | 
			
		||||
 | 
			
		||||
            # Small delay
 | 
			
		||||
            time.sleep(2.0)
 | 
			
		||||
 | 
			
		||||
            # Restart
 | 
			
		||||
            self.start()
 | 
			
		||||
 | 
			
		||||
            # Report successful restart
 | 
			
		||||
            stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_restart")
 | 
			
		||||
 | 
			
		||||
            return True
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(f"Failed to restart HTTP snapshot reader for {self.camera_id}: {e}")
 | 
			
		||||
            return False
 | 
			
		||||
 | 
			
		||||
    def _handle_reconnect_recovery(self, component: str, details: Dict[str, Any]) -> bool:
 | 
			
		||||
        """Handle reconnect recovery action."""
 | 
			
		||||
        try:
 | 
			
		||||
            logger.info(f"Reconnecting HTTP snapshot reader for {self.camera_id}")
 | 
			
		||||
 | 
			
		||||
            # Test connection first
 | 
			
		||||
            success = stream_health_tracker.test_http_connection(self.camera_id, self.snapshot_url)
 | 
			
		||||
 | 
			
		||||
            if success:
 | 
			
		||||
                # Reset error counters
 | 
			
		||||
                self.consecutive_errors = 0
 | 
			
		||||
                stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_reconnect")
 | 
			
		||||
                return True
 | 
			
		||||
            else:
 | 
			
		||||
                logger.warning(f"Connection test failed during recovery for {self.camera_id}")
 | 
			
		||||
                return False
 | 
			
		||||
 | 
			
		||||
        except Exception as e:
 | 
			
		||||
            logger.error(f"Failed to reconnect HTTP snapshot reader for {self.camera_id}: {e}")
 | 
			
		||||
            return False
 | 
			
		||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue