Implement comprehensive health monitoring for streams and threads

- Added RecoveryManager for automatic handling of health issues, including circuit breaker patterns, automatic restarts, and graceful degradation.
- Introduced StreamHealthTracker to monitor video stream metrics, including frame production, connection health, and error rates.
- Developed ThreadHealthMonitor for detecting unresponsive and deadlocked threads, providing liveness detection and responsiveness testing.
- Integrated health checks for streams and threads, reporting metrics and recovery actions to the health monitor.
- Enhanced logging for recovery attempts, errors, and health checks to improve observability and debugging.
2025-09-27 12:27:38 +07:00 · 2025-09-27 12:27:38 +07:00 · b08ce27de2
commit b08ce27de2
parent 8c08c815ce
9 changed files with 2173 additions and 11 deletions
--- a/app.py
+++ b/app.py
@ -8,6 +8,7 @@ import os
 import time
 import cv2
 from contextlib import asynccontextmanager
+from typing import Dict, Any
 from fastapi import FastAPI, WebSocket, HTTPException
 from fastapi.responses import Response

@ -31,21 +32,135 @@ logger.setLevel(logging.DEBUG)
 # Frames are now stored in the shared cache buffer from core.streaming.buffers
 # latest_frames = {}  # Deprecated - using shared_cache_buffer instead

+
+# Health monitoring recovery handlers
+def _handle_stream_restart_recovery(component: str, details: Dict[str, Any]) -> bool:
+    """Restart a camera stream by removing and re-adding its subscription.
+
+    Args:
+        component: Component name reported for recovery. A leading "stream_"
+            is stripped to obtain the camera ID (e.g. "stream_cam-001" ->
+            "cam-001"); any other name is used as the camera ID unchanged.
+        details: Recovery context supplied by the caller; not read here.
+
+    Returns:
+        True when the subscription was re-added successfully. False when no
+        subscription matches the camera, when the re-add reports failure, or
+        when any exception is raised (the exception is logged, not propagated).
+    """
+    try:
+        from core.streaming.manager import shared_stream_manager
+
+        prefix = "stream_"
+        camera_id = component[len(prefix):] if component.startswith(prefix) else component
+
+        logger.info(f"Attempting stream restart recovery for {camera_id}")
+
+        # Only the first subscription found for this camera is restarted.
+        target = next(
+            (
+                candidate
+                for candidate in shared_stream_manager.get_all_subscriptions()
+                if candidate.camera_id == camera_id
+            ),
+            None,
+        )
+        if target is None:
+            logger.warning(f"No subscription found for camera {camera_id} during recovery")
+            return False
+
+        # Drop the subscription, pause briefly, then register it again with
+        # exactly the parameters it had before.
+        shared_stream_manager.remove_subscription(target.subscription_id)
+        time.sleep(1.0)
+        restarted = shared_stream_manager.add_subscription(
+            target.subscription_id,
+            target.stream_config,
+            target.crop_coords,
+            target.model_id,
+            target.model_url,
+            target.tracking_integration
+        )
+
+        if not restarted:
+            logger.error(f"Stream restart recovery failed for {camera_id}")
+            return False
+
+        logger.info(f"Stream restart recovery successful for {camera_id}")
+        return True
+
+    except Exception as exc:
+        logger.error(f"Error in stream restart recovery for {component}: {exc}")
+        return False
+
+
+def _handle_stream_reconnect_recovery(component: str, details: Dict[str, Any]) -> bool:
+    """Report whether a reconnect recovery can proceed for a camera stream.
+
+    This handler does not initiate a reconnect itself: it only checks that the
+    camera is currently listed as active and relies on the stream readers'
+    own reconnection logic.
+
+    Args:
+        component: Component name reported for recovery. A leading "stream_"
+            is stripped to obtain the camera ID.
+        details: Recovery context supplied by the caller; not read here.
+
+    Returns:
+        True when the camera is among the active cameras, False when it is
+        not or when any exception is raised (logged, not propagated).
+    """
+    try:
+        from core.streaming.manager import shared_stream_manager
+
+        prefix = "stream_"
+        camera_id = component[len(prefix):] if component.startswith(prefix) else component
+
+        logger.info(f"Attempting stream reconnect recovery for {camera_id}")
+
+        if camera_id not in shared_stream_manager.get_active_cameras():
+            logger.warning(f"Camera {camera_id} not found in active cameras during reconnect recovery")
+            return False
+
+        logger.info(f"Stream reconnect recovery triggered for {camera_id}")
+        return True
+
+    except Exception as exc:
+        logger.error(f"Error in stream reconnect recovery for {component}: {exc}")
+        return False
+
 # Lifespan event handler (modern FastAPI approach)
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """Application lifespan management."""
    # Startup
    logger.info("Detector Worker started successfully")
+
+    # Initialize health monitoring system
+    try:
+        from core.monitoring.health import health_monitor
+        from core.monitoring.stream_health import stream_health_tracker
+        from core.monitoring.thread_health import thread_health_monitor
+        from core.monitoring.recovery import recovery_manager
+
+        # Start health monitoring
+        health_monitor.start()
+        logger.info("Health monitoring system started")
+
+        # Register recovery handlers for stream management
+        from core.streaming.manager import shared_stream_manager
+        recovery_manager.register_recovery_handler(
+            "restart_stream",
+            _handle_stream_restart_recovery
+        )
+        recovery_manager.register_recovery_handler(
+            "reconnect",
+            _handle_stream_reconnect_recovery
+        )
+
+        logger.info("Recovery handlers registered")
+
+    except Exception as e:
+        logger.error(f"Failed to initialize health monitoring: {e}")
+
    logger.info("WebSocket endpoint available at: ws://0.0.0.0:8001/")
    logger.info("HTTP camera endpoint available at: http://0.0.0.0:8001/camera/{camera_id}/image")
    logger.info("Health check available at: http://0.0.0.0:8001/health")
+    logger.info("Detailed health monitoring available at: http://0.0.0.0:8001/health/detailed")
    logger.info("Ready and waiting for backend WebSocket connections")

    yield

    # Shutdown
    logger.info("Detector Worker shutting down...")
+
+    # Stop health monitoring
+    try:
+        from core.monitoring.health import health_monitor
+        health_monitor.stop()
+        logger.info("Health monitoring system stopped")
+    except Exception as e:
+        logger.error(f"Error stopping health monitoring: {e}")
+
    # Clear all state
    worker_state.set_subscriptions([])
    worker_state.session_ids.clear()
@ -197,6 +312,205 @@ async def health_check():
    }


+@app.get("/health/detailed")
+async def detailed_health_check():
+    """Return the combined health report for the worker.
+
+    Gathers the overall health status, per-stream metrics, thread information
+    and recovery statistics from the monitoring singletons and adds basic
+    worker-state counts.
+
+    Raises:
+        HTTPException: 500 when any part of the report cannot be produced.
+    """
+    try:
+        from core.monitoring.health import health_monitor
+        from core.monitoring.stream_health import stream_health_tracker
+        from core.monitoring.thread_health import thread_health_monitor
+        from core.monitoring.recovery import recovery_manager
+
+        # Query every monitor before stamping the report.
+        health_status = health_monitor.get_health_status()
+        per_stream = stream_health_tracker.get_all_metrics()
+        per_thread = thread_health_monitor.get_all_thread_info()
+        recovery = recovery_manager.get_recovery_stats()
+
+        report = {"timestamp": time.time()}
+        report["overall_health"] = health_status
+        report["stream_metrics"] = per_stream
+        report["thread_health"] = per_thread
+        report["recovery_stats"] = recovery
+        report["system_info"] = {
+            "active_subscriptions": len(worker_state.subscriptions),
+            "active_sessions": len(worker_state.session_ids),
+            "version": "2.0.0"
+        }
+        return report
+
+    except Exception as exc:
+        logger.error(f"Error generating detailed health report: {exc}")
+        raise HTTPException(status_code=500, detail=f"Health monitoring error: {str(exc)}")
+
+
+@app.get("/health/streams")
+async def stream_health_status():
+    """Return per-stream health metrics, buffer statistics and frame ages.
+
+    All frame ages are measured against the same instant that is reported as
+    ``timestamp``, so the values within one response are mutually consistent.
+
+    Raises:
+        HTTPException: 500 when the report cannot be produced.
+    """
+    try:
+        from core.monitoring.stream_health import stream_health_tracker
+        from core.streaming.buffers import shared_cache_buffer
+
+        stream_metrics = stream_health_tracker.get_all_metrics()
+        buffer_stats = shared_cache_buffer.get_stats()
+
+        # Single clock reading shared by the report timestamp and every age.
+        now = time.time()
+
+        frame_ages = {}
+        for camera_id, info in stream_metrics.items():
+            # A missing/empty entry or a missing/zero last_frame_time yields
+            # no age rather than a misleading number.
+            last_frame_time = info.get("last_frame_time") if info else None
+            frame_ages[camera_id] = {
+                "age_seconds": now - last_frame_time if last_frame_time else None,
+                "total_frames": info.get("frame_count", 0) if info else 0
+            }
+
+        return {
+            "timestamp": now,
+            "stream_count": len(stream_metrics),
+            "stream_metrics": stream_metrics,
+            "buffer_stats": buffer_stats,
+            "frame_ages": frame_ages
+        }
+
+    except Exception as e:
+        logger.error(f"Error generating stream health report: {e}")
+        raise HTTPException(status_code=500, detail=f"Stream health error: {str(e)}")
+
+
+@app.get("/health/threads")
+async def thread_health_status():
+    """Return per-thread health information and potential deadlocks.
+
+    The summary counts a thread as responsive when its ``is_responsive`` value
+    is truthy and as unresponsive when the key is present with a falsy value;
+    entries without the key are counted in neither bucket.
+
+    Raises:
+        HTTPException: 500 when the report cannot be produced.
+    """
+    try:
+        from core.monitoring.thread_health import thread_health_monitor
+
+        threads = thread_health_monitor.get_all_thread_info()
+        suspected_deadlocks = thread_health_monitor.detect_deadlocks()
+
+        report = {
+            "timestamp": time.time(),
+            "thread_count": len(threads),
+            "thread_info": threads,
+            "potential_deadlocks": suspected_deadlocks,
+        }
+
+        responsive = 0
+        unresponsive = 0
+        for entry in threads.values():
+            # Different defaults on purpose: a missing key counts as neither.
+            if entry.get("is_responsive", False):
+                responsive += 1
+            if not entry.get("is_responsive", True):
+                unresponsive += 1
+
+        report["summary"] = {
+            "responsive_threads": responsive,
+            "unresponsive_threads": unresponsive,
+            "deadlock_count": len(suspected_deadlocks)
+        }
+        return report
+
+    except Exception as exc:
+        logger.error(f"Error generating thread health report: {exc}")
+        raise HTTPException(status_code=500, detail=f"Thread health error: {str(exc)}")
+
+
+@app.get("/health/recovery")
+async def recovery_status():
+    """Return recovery statistics together with an aggregated summary.
+
+    The summary reports the recoveries in the last hour, the number of
+    components that have a recovery state, and the failure and success counts
+    summed over all of those states.
+
+    Raises:
+        HTTPException: 500 when the report cannot be produced.
+    """
+    try:
+        from core.monitoring.recovery import recovery_manager
+
+        stats = recovery_manager.get_recovery_stats()
+        report = {"timestamp": time.time(), "recovery_stats": stats}
+
+        # Look the per-component states up once and total them in one pass.
+        states = stats.get("recovery_states", {})
+        failures = 0
+        successes = 0
+        for state in states.values():
+            failures += state.get("failure_count", 0)
+            successes += state.get("success_count", 0)
+
+        report["summary"] = {
+            "total_recoveries_last_hour": stats.get("total_recoveries_last_hour", 0),
+            "components_with_recovery_state": len(states),
+            "total_recovery_failures": failures,
+            "total_recovery_successes": successes
+        }
+        return report
+
+    except Exception as exc:
+        logger.error(f"Error generating recovery status report: {exc}")
+        raise HTTPException(status_code=500, detail=f"Recovery status error: {str(exc)}")
+
+
+@app.post("/health/recovery/force/{component}")
+async def force_recovery(component: str, action: str = "restart_stream"):
+    """Manually trigger a recovery action for one component.
+
+    Args:
+        component: Name of the component to recover (path parameter).
+        action: Recovery action value; must be a valid ``RecoveryAction``.
+
+    Returns:
+        A dict with the timestamp, component, requested action, the success
+        flag returned by the recovery manager, and a human-readable message.
+
+    Raises:
+        HTTPException: 400 when ``action`` is not a valid ``RecoveryAction``
+            value; 500 on any other error.
+    """
+    try:
+        from core.monitoring.recovery import recovery_manager, RecoveryAction
+
+        # Reject unknown actions with a 400 that lists the accepted values.
+        try:
+            recovery_action = RecoveryAction(action)
+        except ValueError:
+            valid_actions = [member.value for member in RecoveryAction]
+            raise HTTPException(
+                status_code=400,
+                detail=f"Invalid recovery action: {action}. Valid actions: {valid_actions}"
+            )
+
+        success = recovery_manager.force_recovery(component, recovery_action, "manual_api_request")
+        outcome = "successful" if success else "failed"
+
+        return {
+            "timestamp": time.time(),
+            "component": component,
+            "action": action,
+            "success": success,
+            "message": f"Recovery {outcome} for component {component}"
+        }
+
+    except HTTPException:
+        # Let the 400 above through instead of wrapping it in a 500.
+        raise
+    except Exception as exc:
+        logger.error(f"Error forcing recovery for {component}: {exc}")
+        raise HTTPException(status_code=500, detail=f"Recovery error: {str(exc)}")
+
+
+@app.get("/health/metrics")
+async def health_metrics():
+    """Return worker and per-stream metrics as a flat name -> value mapping.
+
+    Per-stream entries use keys of the form ``name{camera="<id>"}``, where
+    "-" and "." in the camera ID are replaced by "_". The overall status is
+    encoded as 1 (healthy), 2 (warning), 3 (critical) or 4 (unknown or any
+    unrecognised status).
+
+    Raises:
+        HTTPException: 500 when the metrics cannot be produced.
+    """
+    try:
+        from core.monitoring.health import health_monitor
+        from core.monitoring.stream_health import stream_health_tracker
+        from core.streaming.buffers import shared_cache_buffer
+
+        overall_health = health_monitor.get_health_status()
+        stream_metrics = stream_health_tracker.get_all_metrics()
+        buffer_stats = shared_cache_buffer.get_stats()
+
+        # Numeric encoding of the overall status; unrecognised values map to 4.
+        status_codes = {
+            "healthy": 1,
+            "warning": 2,
+            "critical": 3,
+            "unknown": 4
+        }
+        status_name = overall_health.get("overall_status", "unknown")
+
+        metrics = {
+            "detector_worker_up": 1,
+            "detector_worker_streams_total": len(stream_metrics),
+            "detector_worker_subscriptions_total": len(worker_state.subscriptions),
+            "detector_worker_sessions_total": len(worker_state.session_ids),
+            "detector_worker_memory_mb": buffer_stats.get("total_memory_mb", 0),
+            "detector_worker_health_status": status_codes.get(status_name, 4)
+        }
+
+        # One labelled entry per stream and metric.
+        for camera_id, stream_info in stream_metrics.items():
+            safe_camera_id = camera_id.replace("-", "_").replace(".", "_")
+            label = f'{{camera="{safe_camera_id}"}}'
+            metrics[f"detector_worker_stream_frames_total{label}"] = stream_info.get("frame_count", 0)
+            metrics[f"detector_worker_stream_errors_total{label}"] = stream_info.get("error_count", 0)
+            metrics[f"detector_worker_stream_fps{label}"] = stream_info.get("frames_per_second", 0)
+            metrics[f"detector_worker_stream_frame_age_seconds{label}"] = (
+                stream_info.get("last_frame_age_seconds") or 0
+            )
+
+        return {
+            "timestamp": time.time(),
+            "metrics": metrics
+        }
+
+    except Exception as exc:
+        logger.error(f"Error generating health metrics: {exc}")
+        raise HTTPException(status_code=500, detail=f"Metrics error: {str(exc)}")
+
+


 if __name__ == "__main__":