Implement comprehensive health monitoring for streams and threads
Some checks failed
Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
Build Worker Base and Application Images / deploy-stack (push) Has been cancelled

- Added RecoveryManager for automatic handling of health issues, including circuit breaker patterns, automatic restarts, and graceful degradation.
- Introduced StreamHealthTracker to monitor video stream metrics, including frame production, connection health, and error rates.
- Developed ThreadHealthMonitor for detecting unresponsive and deadlocked threads, providing liveness detection and responsiveness testing.
- Integrated health checks for streams and threads, reporting metrics and recovery actions to the health monitor.
- Enhanced logging for recovery attempts, errors, and health checks to improve observability and debugging.
This commit is contained in:
Siwat Sirichai 2025-09-27 12:27:38 +07:00
parent 8c08c815ce
commit b08ce27de2
9 changed files with 2173 additions and 11 deletions

314
app.py
View file

@ -8,6 +8,7 @@ import os
import time
import cv2
from contextlib import asynccontextmanager
from typing import Dict, Any
from fastapi import FastAPI, WebSocket, HTTPException
from fastapi.responses import Response
@ -31,21 +32,135 @@ logger.setLevel(logging.DEBUG)
# Frames are now stored in the shared cache buffer from core.streaming.buffers
# latest_frames = {} # Deprecated - using shared_cache_buffer instead
# Health monitoring recovery handlers
def _handle_stream_restart_recovery(component: str, details: Dict[str, Any]) -> bool:
"""Handle stream restart recovery at the application level."""
try:
from core.streaming.manager import shared_stream_manager
# Extract camera ID from component name (e.g., "stream_cam-001" -> "cam-001")
if component.startswith("stream_"):
camera_id = component[7:] # Remove "stream_" prefix
else:
camera_id = component
logger.info(f"Attempting stream restart recovery for {camera_id}")
# Find and restart the subscription
subscriptions = shared_stream_manager.get_all_subscriptions()
for sub_info in subscriptions:
if sub_info.camera_id == camera_id:
# Remove and re-add the subscription
shared_stream_manager.remove_subscription(sub_info.subscription_id)
time.sleep(1.0) # Brief delay
# Re-add subscription
success = shared_stream_manager.add_subscription(
sub_info.subscription_id,
sub_info.stream_config,
sub_info.crop_coords,
sub_info.model_id,
sub_info.model_url,
sub_info.tracking_integration
)
if success:
logger.info(f"Stream restart recovery successful for {camera_id}")
return True
else:
logger.error(f"Stream restart recovery failed for {camera_id}")
return False
logger.warning(f"No subscription found for camera {camera_id} during recovery")
return False
except Exception as e:
logger.error(f"Error in stream restart recovery for {component}: {e}")
return False
def _handle_stream_reconnect_recovery(component: str, details: Dict[str, Any]) -> bool:
"""Handle stream reconnect recovery at the application level."""
try:
from core.streaming.manager import shared_stream_manager
# Extract camera ID from component name
if component.startswith("stream_"):
camera_id = component[7:]
else:
camera_id = component
logger.info(f"Attempting stream reconnect recovery for {camera_id}")
# For reconnect, we just need to trigger the stream's internal reconnect
# The stream readers handle their own reconnection logic
active_cameras = shared_stream_manager.get_active_cameras()
if camera_id in active_cameras:
logger.info(f"Stream reconnect recovery triggered for {camera_id}")
return True
else:
logger.warning(f"Camera {camera_id} not found in active cameras during reconnect recovery")
return False
except Exception as e:
logger.error(f"Error in stream reconnect recovery for {component}: {e}")
return False
# Lifespan event handler (modern FastAPI approach)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan management."""
# Startup
logger.info("Detector Worker started successfully")
# Initialize health monitoring system
try:
from core.monitoring.health import health_monitor
from core.monitoring.stream_health import stream_health_tracker
from core.monitoring.thread_health import thread_health_monitor
from core.monitoring.recovery import recovery_manager
# Start health monitoring
health_monitor.start()
logger.info("Health monitoring system started")
# Register recovery handlers for stream management
from core.streaming.manager import shared_stream_manager
recovery_manager.register_recovery_handler(
"restart_stream",
_handle_stream_restart_recovery
)
recovery_manager.register_recovery_handler(
"reconnect",
_handle_stream_reconnect_recovery
)
logger.info("Recovery handlers registered")
except Exception as e:
logger.error(f"Failed to initialize health monitoring: {e}")
logger.info("WebSocket endpoint available at: ws://0.0.0.0:8001/")
logger.info("HTTP camera endpoint available at: http://0.0.0.0:8001/camera/{camera_id}/image")
logger.info("Health check available at: http://0.0.0.0:8001/health")
logger.info("Detailed health monitoring available at: http://0.0.0.0:8001/health/detailed")
logger.info("Ready and waiting for backend WebSocket connections")
yield
# Shutdown
logger.info("Detector Worker shutting down...")
# Stop health monitoring
try:
from core.monitoring.health import health_monitor
health_monitor.stop()
logger.info("Health monitoring system stopped")
except Exception as e:
logger.error(f"Error stopping health monitoring: {e}")
# Clear all state
worker_state.set_subscriptions([])
worker_state.session_ids.clear()
@ -197,6 +312,205 @@ async def health_check():
}
@app.get("/health/detailed")
async def detailed_health_check():
"""Comprehensive health status with detailed monitoring data."""
try:
from core.monitoring.health import health_monitor
from core.monitoring.stream_health import stream_health_tracker
from core.monitoring.thread_health import thread_health_monitor
from core.monitoring.recovery import recovery_manager
# Get comprehensive health status
overall_health = health_monitor.get_health_status()
stream_metrics = stream_health_tracker.get_all_metrics()
thread_info = thread_health_monitor.get_all_thread_info()
recovery_stats = recovery_manager.get_recovery_stats()
return {
"timestamp": time.time(),
"overall_health": overall_health,
"stream_metrics": stream_metrics,
"thread_health": thread_info,
"recovery_stats": recovery_stats,
"system_info": {
"active_subscriptions": len(worker_state.subscriptions),
"active_sessions": len(worker_state.session_ids),
"version": "2.0.0"
}
}
except Exception as e:
logger.error(f"Error generating detailed health report: {e}")
raise HTTPException(status_code=500, detail=f"Health monitoring error: {str(e)}")
@app.get("/health/streams")
async def stream_health_status():
"""Stream-specific health monitoring."""
try:
from core.monitoring.stream_health import stream_health_tracker
from core.streaming.buffers import shared_cache_buffer
stream_metrics = stream_health_tracker.get_all_metrics()
buffer_stats = shared_cache_buffer.get_stats()
return {
"timestamp": time.time(),
"stream_count": len(stream_metrics),
"stream_metrics": stream_metrics,
"buffer_stats": buffer_stats,
"frame_ages": {
camera_id: {
"age_seconds": time.time() - info["last_frame_time"] if info and info.get("last_frame_time") else None,
"total_frames": info.get("frame_count", 0) if info else 0
}
for camera_id, info in stream_metrics.items()
}
}
except Exception as e:
logger.error(f"Error generating stream health report: {e}")
raise HTTPException(status_code=500, detail=f"Stream health error: {str(e)}")
@app.get("/health/threads")
async def thread_health_status():
"""Thread-specific health monitoring."""
try:
from core.monitoring.thread_health import thread_health_monitor
thread_info = thread_health_monitor.get_all_thread_info()
deadlocks = thread_health_monitor.detect_deadlocks()
return {
"timestamp": time.time(),
"thread_count": len(thread_info),
"thread_info": thread_info,
"potential_deadlocks": deadlocks,
"summary": {
"responsive_threads": sum(1 for info in thread_info.values() if info.get("is_responsive", False)),
"unresponsive_threads": sum(1 for info in thread_info.values() if not info.get("is_responsive", True)),
"deadlock_count": len(deadlocks)
}
}
except Exception as e:
logger.error(f"Error generating thread health report: {e}")
raise HTTPException(status_code=500, detail=f"Thread health error: {str(e)}")
@app.get("/health/recovery")
async def recovery_status():
"""Recovery system status and history."""
try:
from core.monitoring.recovery import recovery_manager
recovery_stats = recovery_manager.get_recovery_stats()
return {
"timestamp": time.time(),
"recovery_stats": recovery_stats,
"summary": {
"total_recoveries_last_hour": recovery_stats.get("total_recoveries_last_hour", 0),
"components_with_recovery_state": len(recovery_stats.get("recovery_states", {})),
"total_recovery_failures": sum(
state.get("failure_count", 0)
for state in recovery_stats.get("recovery_states", {}).values()
),
"total_recovery_successes": sum(
state.get("success_count", 0)
for state in recovery_stats.get("recovery_states", {}).values()
)
}
}
except Exception as e:
logger.error(f"Error generating recovery status report: {e}")
raise HTTPException(status_code=500, detail=f"Recovery status error: {str(e)}")
@app.post("/health/recovery/force/{component}")
async def force_recovery(component: str, action: str = "restart_stream"):
"""Force recovery action for a specific component."""
try:
from core.monitoring.recovery import recovery_manager, RecoveryAction
# Validate action
try:
recovery_action = RecoveryAction(action)
except ValueError:
raise HTTPException(
status_code=400,
detail=f"Invalid recovery action: {action}. Valid actions: {[a.value for a in RecoveryAction]}"
)
# Force recovery
success = recovery_manager.force_recovery(component, recovery_action, "manual_api_request")
return {
"timestamp": time.time(),
"component": component,
"action": action,
"success": success,
"message": f"Recovery {'successful' if success else 'failed'} for component {component}"
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error forcing recovery for {component}: {e}")
raise HTTPException(status_code=500, detail=f"Recovery error: {str(e)}")
@app.get("/health/metrics")
async def health_metrics():
"""Performance and health metrics in a format suitable for monitoring systems."""
try:
from core.monitoring.health import health_monitor
from core.monitoring.stream_health import stream_health_tracker
from core.streaming.buffers import shared_cache_buffer
# Get basic metrics
overall_health = health_monitor.get_health_status()
stream_metrics = stream_health_tracker.get_all_metrics()
buffer_stats = shared_cache_buffer.get_stats()
# Format for monitoring systems (Prometheus-style)
metrics = {
"detector_worker_up": 1,
"detector_worker_streams_total": len(stream_metrics),
"detector_worker_subscriptions_total": len(worker_state.subscriptions),
"detector_worker_sessions_total": len(worker_state.session_ids),
"detector_worker_memory_mb": buffer_stats.get("total_memory_mb", 0),
"detector_worker_health_status": {
"healthy": 1,
"warning": 2,
"critical": 3,
"unknown": 4
}.get(overall_health.get("overall_status", "unknown"), 4)
}
# Add per-stream metrics
for camera_id, stream_info in stream_metrics.items():
safe_camera_id = camera_id.replace("-", "_").replace(".", "_")
metrics.update({
f"detector_worker_stream_frames_total{{camera=\"{safe_camera_id}\"}}": stream_info.get("frame_count", 0),
f"detector_worker_stream_errors_total{{camera=\"{safe_camera_id}\"}}": stream_info.get("error_count", 0),
f"detector_worker_stream_fps{{camera=\"{safe_camera_id}\"}}": stream_info.get("frames_per_second", 0),
f"detector_worker_stream_frame_age_seconds{{camera=\"{safe_camera_id}\"}}": stream_info.get("last_frame_age_seconds") or 0
})
return {
"timestamp": time.time(),
"metrics": metrics
}
except Exception as e:
logger.error(f"Error generating health metrics: {e}")
raise HTTPException(status_code=500, detail=f"Metrics error: {str(e)}")
if __name__ == "__main__":