Implement comprehensive health monitoring for streams and threads
Some checks failed
Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
Build Worker Base and Application Images / deploy-stack (push) Has been cancelled
Some checks failed
Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
Build Worker Base and Application Images / deploy-stack (push) Has been cancelled
- Added RecoveryManager for automatic handling of health issues, including circuit breaker patterns, automatic restarts, and graceful degradation. - Introduced StreamHealthTracker to monitor video stream metrics, including frame production, connection health, and error rates. - Developed ThreadHealthMonitor for detecting unresponsive and deadlocked threads, providing liveness detection and responsiveness testing. - Integrated health checks for streams and threads, reporting metrics and recovery actions to the health monitor. - Enhanced logging for recovery attempts, errors, and health checks to improve observability and debugging.
This commit is contained in:
parent
8c08c815ce
commit
b08ce27de2
9 changed files with 2173 additions and 11 deletions
314
app.py
314
app.py
|
@ -8,6 +8,7 @@ import os
|
|||
import time
|
||||
import cv2
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Dict, Any
|
||||
from fastapi import FastAPI, WebSocket, HTTPException
|
||||
from fastapi.responses import Response
|
||||
|
||||
|
@ -31,21 +32,135 @@ logger.setLevel(logging.DEBUG)
|
|||
# Frames are now stored in the shared cache buffer from core.streaming.buffers
|
||||
# latest_frames = {} # Deprecated - using shared_cache_buffer instead
|
||||
|
||||
|
||||
# Health monitoring recovery handlers
|
||||
def _handle_stream_restart_recovery(component: str, details: Dict[str, Any]) -> bool:
|
||||
"""Handle stream restart recovery at the application level."""
|
||||
try:
|
||||
from core.streaming.manager import shared_stream_manager
|
||||
|
||||
# Extract camera ID from component name (e.g., "stream_cam-001" -> "cam-001")
|
||||
if component.startswith("stream_"):
|
||||
camera_id = component[7:] # Remove "stream_" prefix
|
||||
else:
|
||||
camera_id = component
|
||||
|
||||
logger.info(f"Attempting stream restart recovery for {camera_id}")
|
||||
|
||||
# Find and restart the subscription
|
||||
subscriptions = shared_stream_manager.get_all_subscriptions()
|
||||
for sub_info in subscriptions:
|
||||
if sub_info.camera_id == camera_id:
|
||||
# Remove and re-add the subscription
|
||||
shared_stream_manager.remove_subscription(sub_info.subscription_id)
|
||||
time.sleep(1.0) # Brief delay
|
||||
|
||||
# Re-add subscription
|
||||
success = shared_stream_manager.add_subscription(
|
||||
sub_info.subscription_id,
|
||||
sub_info.stream_config,
|
||||
sub_info.crop_coords,
|
||||
sub_info.model_id,
|
||||
sub_info.model_url,
|
||||
sub_info.tracking_integration
|
||||
)
|
||||
|
||||
if success:
|
||||
logger.info(f"Stream restart recovery successful for {camera_id}")
|
||||
return True
|
||||
else:
|
||||
logger.error(f"Stream restart recovery failed for {camera_id}")
|
||||
return False
|
||||
|
||||
logger.warning(f"No subscription found for camera {camera_id} during recovery")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in stream restart recovery for {component}: {e}")
|
||||
return False
|
||||
|
||||
|
||||
def _handle_stream_reconnect_recovery(component: str, details: Dict[str, Any]) -> bool:
|
||||
"""Handle stream reconnect recovery at the application level."""
|
||||
try:
|
||||
from core.streaming.manager import shared_stream_manager
|
||||
|
||||
# Extract camera ID from component name
|
||||
if component.startswith("stream_"):
|
||||
camera_id = component[7:]
|
||||
else:
|
||||
camera_id = component
|
||||
|
||||
logger.info(f"Attempting stream reconnect recovery for {camera_id}")
|
||||
|
||||
# For reconnect, we just need to trigger the stream's internal reconnect
|
||||
# The stream readers handle their own reconnection logic
|
||||
active_cameras = shared_stream_manager.get_active_cameras()
|
||||
|
||||
if camera_id in active_cameras:
|
||||
logger.info(f"Stream reconnect recovery triggered for {camera_id}")
|
||||
return True
|
||||
else:
|
||||
logger.warning(f"Camera {camera_id} not found in active cameras during reconnect recovery")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in stream reconnect recovery for {component}: {e}")
|
||||
return False
|
||||
|
||||
# Lifespan event handler (modern FastAPI approach)
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""Application lifespan management."""
|
||||
# Startup
|
||||
logger.info("Detector Worker started successfully")
|
||||
|
||||
# Initialize health monitoring system
|
||||
try:
|
||||
from core.monitoring.health import health_monitor
|
||||
from core.monitoring.stream_health import stream_health_tracker
|
||||
from core.monitoring.thread_health import thread_health_monitor
|
||||
from core.monitoring.recovery import recovery_manager
|
||||
|
||||
# Start health monitoring
|
||||
health_monitor.start()
|
||||
logger.info("Health monitoring system started")
|
||||
|
||||
# Register recovery handlers for stream management
|
||||
from core.streaming.manager import shared_stream_manager
|
||||
recovery_manager.register_recovery_handler(
|
||||
"restart_stream",
|
||||
_handle_stream_restart_recovery
|
||||
)
|
||||
recovery_manager.register_recovery_handler(
|
||||
"reconnect",
|
||||
_handle_stream_reconnect_recovery
|
||||
)
|
||||
|
||||
logger.info("Recovery handlers registered")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to initialize health monitoring: {e}")
|
||||
|
||||
logger.info("WebSocket endpoint available at: ws://0.0.0.0:8001/")
|
||||
logger.info("HTTP camera endpoint available at: http://0.0.0.0:8001/camera/{camera_id}/image")
|
||||
logger.info("Health check available at: http://0.0.0.0:8001/health")
|
||||
logger.info("Detailed health monitoring available at: http://0.0.0.0:8001/health/detailed")
|
||||
logger.info("Ready and waiting for backend WebSocket connections")
|
||||
|
||||
yield
|
||||
|
||||
# Shutdown
|
||||
logger.info("Detector Worker shutting down...")
|
||||
|
||||
# Stop health monitoring
|
||||
try:
|
||||
from core.monitoring.health import health_monitor
|
||||
health_monitor.stop()
|
||||
logger.info("Health monitoring system stopped")
|
||||
except Exception as e:
|
||||
logger.error(f"Error stopping health monitoring: {e}")
|
||||
|
||||
# Clear all state
|
||||
worker_state.set_subscriptions([])
|
||||
worker_state.session_ids.clear()
|
||||
|
@ -197,6 +312,205 @@ async def health_check():
|
|||
}
|
||||
|
||||
|
||||
@app.get("/health/detailed")
|
||||
async def detailed_health_check():
|
||||
"""Comprehensive health status with detailed monitoring data."""
|
||||
try:
|
||||
from core.monitoring.health import health_monitor
|
||||
from core.monitoring.stream_health import stream_health_tracker
|
||||
from core.monitoring.thread_health import thread_health_monitor
|
||||
from core.monitoring.recovery import recovery_manager
|
||||
|
||||
# Get comprehensive health status
|
||||
overall_health = health_monitor.get_health_status()
|
||||
stream_metrics = stream_health_tracker.get_all_metrics()
|
||||
thread_info = thread_health_monitor.get_all_thread_info()
|
||||
recovery_stats = recovery_manager.get_recovery_stats()
|
||||
|
||||
return {
|
||||
"timestamp": time.time(),
|
||||
"overall_health": overall_health,
|
||||
"stream_metrics": stream_metrics,
|
||||
"thread_health": thread_info,
|
||||
"recovery_stats": recovery_stats,
|
||||
"system_info": {
|
||||
"active_subscriptions": len(worker_state.subscriptions),
|
||||
"active_sessions": len(worker_state.session_ids),
|
||||
"version": "2.0.0"
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating detailed health report: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Health monitoring error: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/health/streams")
|
||||
async def stream_health_status():
|
||||
"""Stream-specific health monitoring."""
|
||||
try:
|
||||
from core.monitoring.stream_health import stream_health_tracker
|
||||
from core.streaming.buffers import shared_cache_buffer
|
||||
|
||||
stream_metrics = stream_health_tracker.get_all_metrics()
|
||||
buffer_stats = shared_cache_buffer.get_stats()
|
||||
|
||||
return {
|
||||
"timestamp": time.time(),
|
||||
"stream_count": len(stream_metrics),
|
||||
"stream_metrics": stream_metrics,
|
||||
"buffer_stats": buffer_stats,
|
||||
"frame_ages": {
|
||||
camera_id: {
|
||||
"age_seconds": time.time() - info["last_frame_time"] if info and info.get("last_frame_time") else None,
|
||||
"total_frames": info.get("frame_count", 0) if info else 0
|
||||
}
|
||||
for camera_id, info in stream_metrics.items()
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating stream health report: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Stream health error: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/health/threads")
|
||||
async def thread_health_status():
|
||||
"""Thread-specific health monitoring."""
|
||||
try:
|
||||
from core.monitoring.thread_health import thread_health_monitor
|
||||
|
||||
thread_info = thread_health_monitor.get_all_thread_info()
|
||||
deadlocks = thread_health_monitor.detect_deadlocks()
|
||||
|
||||
return {
|
||||
"timestamp": time.time(),
|
||||
"thread_count": len(thread_info),
|
||||
"thread_info": thread_info,
|
||||
"potential_deadlocks": deadlocks,
|
||||
"summary": {
|
||||
"responsive_threads": sum(1 for info in thread_info.values() if info.get("is_responsive", False)),
|
||||
"unresponsive_threads": sum(1 for info in thread_info.values() if not info.get("is_responsive", True)),
|
||||
"deadlock_count": len(deadlocks)
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating thread health report: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Thread health error: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/health/recovery")
|
||||
async def recovery_status():
|
||||
"""Recovery system status and history."""
|
||||
try:
|
||||
from core.monitoring.recovery import recovery_manager
|
||||
|
||||
recovery_stats = recovery_manager.get_recovery_stats()
|
||||
|
||||
return {
|
||||
"timestamp": time.time(),
|
||||
"recovery_stats": recovery_stats,
|
||||
"summary": {
|
||||
"total_recoveries_last_hour": recovery_stats.get("total_recoveries_last_hour", 0),
|
||||
"components_with_recovery_state": len(recovery_stats.get("recovery_states", {})),
|
||||
"total_recovery_failures": sum(
|
||||
state.get("failure_count", 0)
|
||||
for state in recovery_stats.get("recovery_states", {}).values()
|
||||
),
|
||||
"total_recovery_successes": sum(
|
||||
state.get("success_count", 0)
|
||||
for state in recovery_stats.get("recovery_states", {}).values()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating recovery status report: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Recovery status error: {str(e)}")
|
||||
|
||||
|
||||
@app.post("/health/recovery/force/{component}")
|
||||
async def force_recovery(component: str, action: str = "restart_stream"):
|
||||
"""Force recovery action for a specific component."""
|
||||
try:
|
||||
from core.monitoring.recovery import recovery_manager, RecoveryAction
|
||||
|
||||
# Validate action
|
||||
try:
|
||||
recovery_action = RecoveryAction(action)
|
||||
except ValueError:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Invalid recovery action: {action}. Valid actions: {[a.value for a in RecoveryAction]}"
|
||||
)
|
||||
|
||||
# Force recovery
|
||||
success = recovery_manager.force_recovery(component, recovery_action, "manual_api_request")
|
||||
|
||||
return {
|
||||
"timestamp": time.time(),
|
||||
"component": component,
|
||||
"action": action,
|
||||
"success": success,
|
||||
"message": f"Recovery {'successful' if success else 'failed'} for component {component}"
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Error forcing recovery for {component}: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Recovery error: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/health/metrics")
|
||||
async def health_metrics():
|
||||
"""Performance and health metrics in a format suitable for monitoring systems."""
|
||||
try:
|
||||
from core.monitoring.health import health_monitor
|
||||
from core.monitoring.stream_health import stream_health_tracker
|
||||
from core.streaming.buffers import shared_cache_buffer
|
||||
|
||||
# Get basic metrics
|
||||
overall_health = health_monitor.get_health_status()
|
||||
stream_metrics = stream_health_tracker.get_all_metrics()
|
||||
buffer_stats = shared_cache_buffer.get_stats()
|
||||
|
||||
# Format for monitoring systems (Prometheus-style)
|
||||
metrics = {
|
||||
"detector_worker_up": 1,
|
||||
"detector_worker_streams_total": len(stream_metrics),
|
||||
"detector_worker_subscriptions_total": len(worker_state.subscriptions),
|
||||
"detector_worker_sessions_total": len(worker_state.session_ids),
|
||||
"detector_worker_memory_mb": buffer_stats.get("total_memory_mb", 0),
|
||||
"detector_worker_health_status": {
|
||||
"healthy": 1,
|
||||
"warning": 2,
|
||||
"critical": 3,
|
||||
"unknown": 4
|
||||
}.get(overall_health.get("overall_status", "unknown"), 4)
|
||||
}
|
||||
|
||||
# Add per-stream metrics
|
||||
for camera_id, stream_info in stream_metrics.items():
|
||||
safe_camera_id = camera_id.replace("-", "_").replace(".", "_")
|
||||
metrics.update({
|
||||
f"detector_worker_stream_frames_total{{camera=\"{safe_camera_id}\"}}": stream_info.get("frame_count", 0),
|
||||
f"detector_worker_stream_errors_total{{camera=\"{safe_camera_id}\"}}": stream_info.get("error_count", 0),
|
||||
f"detector_worker_stream_fps{{camera=\"{safe_camera_id}\"}}": stream_info.get("frames_per_second", 0),
|
||||
f"detector_worker_stream_frame_age_seconds{{camera=\"{safe_camera_id}\"}}": stream_info.get("last_frame_age_seconds") or 0
|
||||
})
|
||||
|
||||
return {
|
||||
"timestamp": time.time(),
|
||||
"metrics": metrics
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error generating health metrics: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Metrics error: {str(e)}")
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue