python-detector-worker/app.py
Siwat Sirichai b08ce27de2
Some checks failed
Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
Build Worker Base and Application Images / deploy-stack (push) Has been cancelled
Implement comprehensive health monitoring for streams and threads
- Added RecoveryManager for automatic handling of health issues, including circuit breaker patterns, automatic restarts, and graceful degradation.
- Introduced StreamHealthTracker to monitor video stream metrics, including frame production, connection health, and error rates.
- Developed ThreadHealthMonitor for detecting unresponsive and deadlocked threads, providing liveness detection and responsiveness testing.
- Integrated health checks for streams and threads, reporting metrics and recovery actions to the health monitor.
- Enhanced logging for recovery attempts, errors, and health checks to improve observability and debugging.
2025-09-27 12:27:38 +07:00

518 lines
No EOL
19 KiB
Python

"""
Detector Worker - Main FastAPI Application
Refactored modular architecture for computer vision pipeline processing.
"""
import json
import logging
import os
import time
import cv2
from contextlib import asynccontextmanager
from typing import Dict, Any
from fastapi import FastAPI, WebSocket, HTTPException
from fastapi.responses import Response
# Import new modular communication system
from core.communication.websocket import websocket_endpoint
from core.communication.state import worker_state
# Configure logging
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
handlers=[
logging.FileHandler("detector_worker.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger("detector_worker")
logger.setLevel(logging.DEBUG)
# Frames are now stored in the shared cache buffer from core.streaming.buffers
# latest_frames = {} # Deprecated - using shared_cache_buffer instead
# Health monitoring recovery handlers
def _handle_stream_restart_recovery(component: str, details: Dict[str, Any]) -> bool:
"""Handle stream restart recovery at the application level."""
try:
from core.streaming.manager import shared_stream_manager
# Extract camera ID from component name (e.g., "stream_cam-001" -> "cam-001")
if component.startswith("stream_"):
camera_id = component[7:] # Remove "stream_" prefix
else:
camera_id = component
logger.info(f"Attempting stream restart recovery for {camera_id}")
# Find and restart the subscription
subscriptions = shared_stream_manager.get_all_subscriptions()
for sub_info in subscriptions:
if sub_info.camera_id == camera_id:
# Remove and re-add the subscription
shared_stream_manager.remove_subscription(sub_info.subscription_id)
time.sleep(1.0) # Brief delay
# Re-add subscription
success = shared_stream_manager.add_subscription(
sub_info.subscription_id,
sub_info.stream_config,
sub_info.crop_coords,
sub_info.model_id,
sub_info.model_url,
sub_info.tracking_integration
)
if success:
logger.info(f"Stream restart recovery successful for {camera_id}")
return True
else:
logger.error(f"Stream restart recovery failed for {camera_id}")
return False
logger.warning(f"No subscription found for camera {camera_id} during recovery")
return False
except Exception as e:
logger.error(f"Error in stream restart recovery for {component}: {e}")
return False
def _handle_stream_reconnect_recovery(component: str, details: Dict[str, Any]) -> bool:
"""Handle stream reconnect recovery at the application level."""
try:
from core.streaming.manager import shared_stream_manager
# Extract camera ID from component name
if component.startswith("stream_"):
camera_id = component[7:]
else:
camera_id = component
logger.info(f"Attempting stream reconnect recovery for {camera_id}")
# For reconnect, we just need to trigger the stream's internal reconnect
# The stream readers handle their own reconnection logic
active_cameras = shared_stream_manager.get_active_cameras()
if camera_id in active_cameras:
logger.info(f"Stream reconnect recovery triggered for {camera_id}")
return True
else:
logger.warning(f"Camera {camera_id} not found in active cameras during reconnect recovery")
return False
except Exception as e:
logger.error(f"Error in stream reconnect recovery for {component}: {e}")
return False
# Lifespan event handler (modern FastAPI approach)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan management."""
# Startup
logger.info("Detector Worker started successfully")
# Initialize health monitoring system
try:
from core.monitoring.health import health_monitor
from core.monitoring.stream_health import stream_health_tracker
from core.monitoring.thread_health import thread_health_monitor
from core.monitoring.recovery import recovery_manager
# Start health monitoring
health_monitor.start()
logger.info("Health monitoring system started")
# Register recovery handlers for stream management
from core.streaming.manager import shared_stream_manager
recovery_manager.register_recovery_handler(
"restart_stream",
_handle_stream_restart_recovery
)
recovery_manager.register_recovery_handler(
"reconnect",
_handle_stream_reconnect_recovery
)
logger.info("Recovery handlers registered")
except Exception as e:
logger.error(f"Failed to initialize health monitoring: {e}")
logger.info("WebSocket endpoint available at: ws://0.0.0.0:8001/")
logger.info("HTTP camera endpoint available at: http://0.0.0.0:8001/camera/{camera_id}/image")
logger.info("Health check available at: http://0.0.0.0:8001/health")
logger.info("Detailed health monitoring available at: http://0.0.0.0:8001/health/detailed")
logger.info("Ready and waiting for backend WebSocket connections")
yield
# Shutdown
logger.info("Detector Worker shutting down...")
# Stop health monitoring
try:
from core.monitoring.health import health_monitor
health_monitor.stop()
logger.info("Health monitoring system stopped")
except Exception as e:
logger.error(f"Error stopping health monitoring: {e}")
# Clear all state
worker_state.set_subscriptions([])
worker_state.session_ids.clear()
worker_state.progression_stages.clear()
# latest_frames.clear() # No longer needed - frames are in shared_cache_buffer
logger.info("Detector Worker shutdown complete")
# Create FastAPI application with detailed WebSocket logging
app = FastAPI(title="Detector Worker", version="2.0.0", lifespan=lifespan)
# Add middleware to log all requests
@app.middleware("http")
async def log_requests(request, call_next):
start_time = time.time()
response = await call_next(request)
process_time = time.time() - start_time
logger.debug(f"HTTP {request.method} {request.url} - {response.status_code} ({process_time:.3f}s)")
return response
# Load configuration
config_path = "config.json"
if os.path.exists(config_path):
with open(config_path, "r") as f:
config = json.load(f)
logger.info(f"Loaded configuration from {config_path}")
else:
# Default configuration
config = {
"poll_interval_ms": 100,
"reconnect_interval_sec": 5,
"target_fps": 10,
"max_streams": 20,
"max_retries": 3
}
logger.warning(f"Configuration file {config_path} not found, using defaults")
# Ensure models directory exists
os.makedirs("models", exist_ok=True)
logger.info("Ensured models directory exists")
# Initialize stream manager with config value
from core.streaming import initialize_stream_manager
initialize_stream_manager(max_streams=config.get('max_streams', 10))
logger.info(f"Initialized stream manager with max_streams={config.get('max_streams', 10)}")
# Frames are now stored in the shared cache buffer from core.streaming.buffers
# latest_frames = {} # Deprecated - using shared_cache_buffer instead
logger.info("Starting detector worker application (refactored)")
logger.info(f"Configuration: Target FPS: {config.get('target_fps', 10)}, "
f"Max streams: {config.get('max_streams', 5)}, "
f"Max retries: {config.get('max_retries', 3)}")
@app.websocket("/")
async def websocket_handler(websocket: WebSocket):
"""
Main WebSocket endpoint for backend communication.
Handles all protocol messages according to worker.md specification.
"""
client_info = f"{websocket.client.host}:{websocket.client.port}" if websocket.client else "unknown"
logger.info(f"[RX ← Backend] New WebSocket connection request from {client_info}")
try:
await websocket_endpoint(websocket)
except Exception as e:
logger.error(f"WebSocket handler error for {client_info}: {e}", exc_info=True)
@app.get("/camera/{camera_id}/image")
async def get_camera_image(camera_id: str):
"""
HTTP endpoint to retrieve the latest frame from a camera as JPEG image.
This endpoint is preserved for backward compatibility with existing systems.
Args:
camera_id: The subscription identifier (e.g., "display-001;cam-001")
Returns:
JPEG image as binary response
Raises:
HTTPException: 404 if camera not found or no frame available
HTTPException: 500 if encoding fails
"""
try:
from urllib.parse import unquote
# URL decode the camera_id to handle encoded characters
original_camera_id = camera_id
camera_id = unquote(camera_id)
logger.debug(f"REST API request: original='{original_camera_id}', decoded='{camera_id}'")
# Check if camera is in active subscriptions
subscription = worker_state.get_subscription(camera_id)
if not subscription:
logger.warning(f"Camera ID '{camera_id}' not found in active subscriptions")
available_cameras = list(worker_state.subscriptions.keys())
logger.debug(f"Available cameras: {available_cameras}")
raise HTTPException(
status_code=404,
detail=f"Camera {camera_id} not found or not active"
)
# Extract actual camera_id from subscription identifier (displayId;cameraId)
# Frames are stored using just the camera_id part
actual_camera_id = camera_id.split(';')[-1] if ';' in camera_id else camera_id
# Get frame from the shared cache buffer
from core.streaming.buffers import shared_cache_buffer
# Only show buffer debug info if camera not found (to reduce log spam)
available_cameras = shared_cache_buffer.frame_buffer.get_camera_list()
frame = shared_cache_buffer.get_frame(actual_camera_id)
if frame is None:
logger.warning(f"\033[93m[API] No frame for '{actual_camera_id}' - Available: {available_cameras}\033[0m")
raise HTTPException(
status_code=404,
detail=f"No frame available for camera {actual_camera_id}"
)
# Successful frame retrieval - log only occasionally to avoid spam
# Encode frame as JPEG
success, buffer_img = cv2.imencode('.jpg', frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
if not success:
raise HTTPException(status_code=500, detail="Failed to encode image as JPEG")
# Return image as binary response
return Response(content=buffer_img.tobytes(), media_type="image/jpeg")
except HTTPException:
raise
except Exception as e:
logger.error(f"Error retrieving image for camera {camera_id}: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
@app.get("/health")
async def health_check():
"""Health check endpoint for monitoring."""
return {
"status": "healthy",
"version": "2.0.0",
"active_subscriptions": len(worker_state.subscriptions),
"active_sessions": len(worker_state.session_ids)
}
@app.get("/health/detailed")
async def detailed_health_check():
"""Comprehensive health status with detailed monitoring data."""
try:
from core.monitoring.health import health_monitor
from core.monitoring.stream_health import stream_health_tracker
from core.monitoring.thread_health import thread_health_monitor
from core.monitoring.recovery import recovery_manager
# Get comprehensive health status
overall_health = health_monitor.get_health_status()
stream_metrics = stream_health_tracker.get_all_metrics()
thread_info = thread_health_monitor.get_all_thread_info()
recovery_stats = recovery_manager.get_recovery_stats()
return {
"timestamp": time.time(),
"overall_health": overall_health,
"stream_metrics": stream_metrics,
"thread_health": thread_info,
"recovery_stats": recovery_stats,
"system_info": {
"active_subscriptions": len(worker_state.subscriptions),
"active_sessions": len(worker_state.session_ids),
"version": "2.0.0"
}
}
except Exception as e:
logger.error(f"Error generating detailed health report: {e}")
raise HTTPException(status_code=500, detail=f"Health monitoring error: {str(e)}")
@app.get("/health/streams")
async def stream_health_status():
"""Stream-specific health monitoring."""
try:
from core.monitoring.stream_health import stream_health_tracker
from core.streaming.buffers import shared_cache_buffer
stream_metrics = stream_health_tracker.get_all_metrics()
buffer_stats = shared_cache_buffer.get_stats()
return {
"timestamp": time.time(),
"stream_count": len(stream_metrics),
"stream_metrics": stream_metrics,
"buffer_stats": buffer_stats,
"frame_ages": {
camera_id: {
"age_seconds": time.time() - info["last_frame_time"] if info and info.get("last_frame_time") else None,
"total_frames": info.get("frame_count", 0) if info else 0
}
for camera_id, info in stream_metrics.items()
}
}
except Exception as e:
logger.error(f"Error generating stream health report: {e}")
raise HTTPException(status_code=500, detail=f"Stream health error: {str(e)}")
@app.get("/health/threads")
async def thread_health_status():
"""Thread-specific health monitoring."""
try:
from core.monitoring.thread_health import thread_health_monitor
thread_info = thread_health_monitor.get_all_thread_info()
deadlocks = thread_health_monitor.detect_deadlocks()
return {
"timestamp": time.time(),
"thread_count": len(thread_info),
"thread_info": thread_info,
"potential_deadlocks": deadlocks,
"summary": {
"responsive_threads": sum(1 for info in thread_info.values() if info.get("is_responsive", False)),
"unresponsive_threads": sum(1 for info in thread_info.values() if not info.get("is_responsive", True)),
"deadlock_count": len(deadlocks)
}
}
except Exception as e:
logger.error(f"Error generating thread health report: {e}")
raise HTTPException(status_code=500, detail=f"Thread health error: {str(e)}")
@app.get("/health/recovery")
async def recovery_status():
"""Recovery system status and history."""
try:
from core.monitoring.recovery import recovery_manager
recovery_stats = recovery_manager.get_recovery_stats()
return {
"timestamp": time.time(),
"recovery_stats": recovery_stats,
"summary": {
"total_recoveries_last_hour": recovery_stats.get("total_recoveries_last_hour", 0),
"components_with_recovery_state": len(recovery_stats.get("recovery_states", {})),
"total_recovery_failures": sum(
state.get("failure_count", 0)
for state in recovery_stats.get("recovery_states", {}).values()
),
"total_recovery_successes": sum(
state.get("success_count", 0)
for state in recovery_stats.get("recovery_states", {}).values()
)
}
}
except Exception as e:
logger.error(f"Error generating recovery status report: {e}")
raise HTTPException(status_code=500, detail=f"Recovery status error: {str(e)}")
@app.post("/health/recovery/force/{component}")
async def force_recovery(component: str, action: str = "restart_stream"):
"""Force recovery action for a specific component."""
try:
from core.monitoring.recovery import recovery_manager, RecoveryAction
# Validate action
try:
recovery_action = RecoveryAction(action)
except ValueError:
raise HTTPException(
status_code=400,
detail=f"Invalid recovery action: {action}. Valid actions: {[a.value for a in RecoveryAction]}"
)
# Force recovery
success = recovery_manager.force_recovery(component, recovery_action, "manual_api_request")
return {
"timestamp": time.time(),
"component": component,
"action": action,
"success": success,
"message": f"Recovery {'successful' if success else 'failed'} for component {component}"
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Error forcing recovery for {component}: {e}")
raise HTTPException(status_code=500, detail=f"Recovery error: {str(e)}")
@app.get("/health/metrics")
async def health_metrics():
"""Performance and health metrics in a format suitable for monitoring systems."""
try:
from core.monitoring.health import health_monitor
from core.monitoring.stream_health import stream_health_tracker
from core.streaming.buffers import shared_cache_buffer
# Get basic metrics
overall_health = health_monitor.get_health_status()
stream_metrics = stream_health_tracker.get_all_metrics()
buffer_stats = shared_cache_buffer.get_stats()
# Format for monitoring systems (Prometheus-style)
metrics = {
"detector_worker_up": 1,
"detector_worker_streams_total": len(stream_metrics),
"detector_worker_subscriptions_total": len(worker_state.subscriptions),
"detector_worker_sessions_total": len(worker_state.session_ids),
"detector_worker_memory_mb": buffer_stats.get("total_memory_mb", 0),
"detector_worker_health_status": {
"healthy": 1,
"warning": 2,
"critical": 3,
"unknown": 4
}.get(overall_health.get("overall_status", "unknown"), 4)
}
# Add per-stream metrics
for camera_id, stream_info in stream_metrics.items():
safe_camera_id = camera_id.replace("-", "_").replace(".", "_")
metrics.update({
f"detector_worker_stream_frames_total{{camera=\"{safe_camera_id}\"}}": stream_info.get("frame_count", 0),
f"detector_worker_stream_errors_total{{camera=\"{safe_camera_id}\"}}": stream_info.get("error_count", 0),
f"detector_worker_stream_fps{{camera=\"{safe_camera_id}\"}}": stream_info.get("frames_per_second", 0),
f"detector_worker_stream_frame_age_seconds{{camera=\"{safe_camera_id}\"}}": stream_info.get("last_frame_age_seconds") or 0
})
return {
"timestamp": time.time(),
"metrics": metrics
}
except Exception as e:
logger.error(f"Error generating health metrics: {e}")
raise HTTPException(status_code=500, detail=f"Metrics error: {str(e)}")
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8001)