Implement comprehensive health monitoring for streams and threads
Some checks failed
Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
Build Worker Base and Application Images / deploy-stack (push) Has been cancelled
- Added RecoveryManager for automatic handling of health issues, including circuit breaker patterns, automatic restarts, and graceful degradation.
- Introduced StreamHealthTracker to monitor video stream metrics, including frame production, connection health, and error rates.
- Developed ThreadHealthMonitor for detecting unresponsive and deadlocked threads, providing liveness detection and responsiveness testing.
- Integrated health checks for streams and threads, reporting metrics and recovery actions to the health monitor.
- Enhanced logging for recovery attempts, errors, and health checks to improve observability and debugging.

A short usage sketch showing how these components fit together follows the changed-files summary below.
Parent: 8c08c815ce
Commit: b08ce27de2
9 changed files with 2173 additions and 11 deletions
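Not part of the diff: a minimal wiring sketch of how the new monitoring pieces are intended to cooperate, based only on the modules added below. It assumes the repository root is on the Python path (so core.monitoring is importable) and that psutil and requests are installed; the camera ID and the restart handler are placeholders.

    from core.monitoring.health import health_monitor
    from core.monitoring.stream_health import stream_health_tracker
    from core.monitoring.recovery import recovery_manager, RecoveryAction

    def restart_stream(component: str, details: dict) -> bool:
        # Application-specific restart logic would go here; returning True
        # tells the RecoveryManager that the recovery action succeeded.
        return True

    # Handlers are registered per RecoveryAction; RecoveryManager registers
    # itself with the global health monitor for stream/thread/buffer recovery.
    recovery_manager.register_recovery_handler(RecoveryAction.RESTART_STREAM, restart_stream)

    # Streams register themselves, then report frames and errors as they run.
    health_monitor.start()
    stream_health_tracker.register_stream("camera-01", "rtsp_ffmpeg")
    stream_health_tracker.report_frame_received("camera-01", frame_size_bytes=65536)

    # Aggregated status (overall plus per-component) for dashboards or an API.
    status = health_monitor.get_health_status()
    print(status["overall_status"])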
core/monitoring/__init__.py (new file, 18 lines added)
@@ -0,0 +1,18 @@
"""
Comprehensive health monitoring system for detector worker.
Tracks stream health, thread responsiveness, and system performance.
"""

from .health import HealthMonitor, HealthStatus, HealthCheck
from .stream_health import StreamHealthTracker
from .thread_health import ThreadHealthMonitor
from .recovery import RecoveryManager

__all__ = [
    'HealthMonitor',
    'HealthStatus',
    'HealthCheck',
    'StreamHealthTracker',
    'ThreadHealthMonitor',
    'RecoveryManager'
]
core/monitoring/health.py (new file, 456 lines added)
@@ -0,0 +1,456 @@
"""
Core health monitoring system for comprehensive stream and system health tracking.
Provides centralized health status, alerting, and recovery coordination.
"""
import time
import threading
import logging
import psutil
from typing import Dict, List, Optional, Any, Callable
from dataclasses import dataclass, field
from enum import Enum
from collections import defaultdict, deque


logger = logging.getLogger(__name__)


class HealthStatus(Enum):
    """Health status levels."""
    HEALTHY = "healthy"
    WARNING = "warning"
    CRITICAL = "critical"
    UNKNOWN = "unknown"


@dataclass
class HealthCheck:
    """Individual health check result."""
    name: str
    status: HealthStatus
    message: str
    timestamp: float = field(default_factory=time.time)
    details: Dict[str, Any] = field(default_factory=dict)
    recovery_action: Optional[str] = None


@dataclass
class HealthMetrics:
    """Health metrics for a component."""
    component_id: str
    last_update: float
    frame_count: int = 0
    error_count: int = 0
    warning_count: int = 0
    restart_count: int = 0
    avg_frame_interval: float = 0.0
    last_frame_time: Optional[float] = None
    thread_alive: bool = True
    connection_healthy: bool = True
    memory_usage_mb: float = 0.0
    cpu_usage_percent: float = 0.0


class HealthMonitor:
    """Comprehensive health monitoring system."""

    def __init__(self, check_interval: float = 30.0):
        """
        Initialize health monitor.

        Args:
            check_interval: Interval between health checks in seconds
        """
        self.check_interval = check_interval
        self.running = False
        self.monitor_thread = None
        self._lock = threading.RLock()

        # Health data storage
        self.health_checks: Dict[str, HealthCheck] = {}
        self.metrics: Dict[str, HealthMetrics] = {}
        self.alert_history: deque = deque(maxlen=1000)
        self.recovery_actions: deque = deque(maxlen=500)

        # Thresholds (configurable)
        self.thresholds = {
            'frame_stale_warning_seconds': 120,   # 2 minutes
            'frame_stale_critical_seconds': 300,  # 5 minutes
            'thread_unresponsive_seconds': 60,    # 1 minute
            'memory_warning_mb': 500,             # 500MB per stream
            'memory_critical_mb': 1000,           # 1GB per stream
            'cpu_warning_percent': 80,            # 80% CPU
            'cpu_critical_percent': 95,           # 95% CPU
            'error_rate_warning': 0.1,            # 10% error rate
            'error_rate_critical': 0.3,           # 30% error rate
            'restart_threshold': 3                # Max restarts per hour
        }

        # Health check functions
        self.health_checkers: List[Callable[[], List[HealthCheck]]] = []
        self.recovery_callbacks: Dict[str, Callable[[str, HealthCheck], bool]] = {}

        # System monitoring
        self.process = psutil.Process()
        self.system_start_time = time.time()

    def start(self):
        """Start health monitoring."""
        if self.running:
            logger.warning("Health monitor already running")
            return

        self.running = True
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitor_thread.start()
        logger.info(f"Health monitor started (check interval: {self.check_interval}s)")

    def stop(self):
        """Stop health monitoring."""
        self.running = False
        if self.monitor_thread:
            self.monitor_thread.join(timeout=5.0)
        logger.info("Health monitor stopped")

    def register_health_checker(self, checker: Callable[[], List[HealthCheck]]):
        """Register a health check function."""
        self.health_checkers.append(checker)
        logger.debug(f"Registered health checker: {checker.__name__}")

    def register_recovery_callback(self, component: str, callback: Callable[[str, HealthCheck], bool]):
        """Register a recovery callback for a component."""
        self.recovery_callbacks[component] = callback
        logger.debug(f"Registered recovery callback for {component}")

    def update_metrics(self, component_id: str, **kwargs):
        """Update metrics for a component."""
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=time.time()
                )

            metrics = self.metrics[component_id]
            metrics.last_update = time.time()

            # Update provided metrics
            for key, value in kwargs.items():
                if hasattr(metrics, key):
                    setattr(metrics, key, value)

    def report_frame_received(self, component_id: str):
        """Report that a frame was received for a component."""
        current_time = time.time()
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=current_time
                )

            metrics = self.metrics[component_id]

            # Update frame metrics
            if metrics.last_frame_time:
                interval = current_time - metrics.last_frame_time
                # Moving average of frame intervals
                if metrics.avg_frame_interval == 0:
                    metrics.avg_frame_interval = interval
                else:
                    metrics.avg_frame_interval = (metrics.avg_frame_interval * 0.9) + (interval * 0.1)

            metrics.last_frame_time = current_time
            metrics.frame_count += 1
            metrics.last_update = current_time

    def report_error(self, component_id: str, error_type: str = "general"):
        """Report an error for a component."""
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=time.time()
                )

            self.metrics[component_id].error_count += 1
            self.metrics[component_id].last_update = time.time()

        logger.debug(f"Error reported for {component_id}: {error_type}")

    def report_warning(self, component_id: str, warning_type: str = "general"):
        """Report a warning for a component."""
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=time.time()
                )

            self.metrics[component_id].warning_count += 1
            self.metrics[component_id].last_update = time.time()

        logger.debug(f"Warning reported for {component_id}: {warning_type}")

    def report_restart(self, component_id: str):
        """Report that a component was restarted."""
        with self._lock:
            if component_id not in self.metrics:
                self.metrics[component_id] = HealthMetrics(
                    component_id=component_id,
                    last_update=time.time()
                )

            self.metrics[component_id].restart_count += 1
            self.metrics[component_id].last_update = time.time()

        # Log recovery action
        recovery_action = {
            'timestamp': time.time(),
            'component': component_id,
            'action': 'restart',
            'reason': 'manual_restart'
        }

        with self._lock:
            self.recovery_actions.append(recovery_action)

        logger.info(f"Restart reported for {component_id}")

    def get_health_status(self, component_id: Optional[str] = None) -> Dict[str, Any]:
        """Get comprehensive health status."""
        with self._lock:
            if component_id:
                # Get health for specific component
                return self._get_component_health(component_id)
            else:
                # Get overall health status
                return self._get_overall_health()

    def _get_component_health(self, component_id: str) -> Dict[str, Any]:
        """Get health status for a specific component."""
        if component_id not in self.metrics:
            return {
                'component_id': component_id,
                'status': HealthStatus.UNKNOWN.value,
                'message': 'No metrics available',
                'metrics': {}
            }

        metrics = self.metrics[component_id]
        current_time = time.time()

        # Determine health status
        status = HealthStatus.HEALTHY
        issues = []

        # Check frame freshness
        if metrics.last_frame_time:
            frame_age = current_time - metrics.last_frame_time
            if frame_age > self.thresholds['frame_stale_critical_seconds']:
                status = HealthStatus.CRITICAL
                issues.append(f"Frames stale for {frame_age:.1f}s")
            elif frame_age > self.thresholds['frame_stale_warning_seconds']:
                if status == HealthStatus.HEALTHY:
                    status = HealthStatus.WARNING
                issues.append(f"Frames aging ({frame_age:.1f}s)")

        # Check error rates
        if metrics.frame_count > 0:
            error_rate = metrics.error_count / metrics.frame_count
            if error_rate > self.thresholds['error_rate_critical']:
                status = HealthStatus.CRITICAL
                issues.append(f"High error rate ({error_rate:.1%})")
            elif error_rate > self.thresholds['error_rate_warning']:
                if status == HealthStatus.HEALTHY:
                    status = HealthStatus.WARNING
                issues.append(f"Elevated error rate ({error_rate:.1%})")

        # Check restart frequency
        restart_rate = metrics.restart_count / max(1, (current_time - self.system_start_time) / 3600)
        if restart_rate > self.thresholds['restart_threshold']:
            status = HealthStatus.CRITICAL
            issues.append(f"Frequent restarts ({restart_rate:.1f}/hour)")

        # Check thread health
        if not metrics.thread_alive:
            status = HealthStatus.CRITICAL
            issues.append("Thread not alive")

        # Check connection health
        if not metrics.connection_healthy:
            if status == HealthStatus.HEALTHY:
                status = HealthStatus.WARNING
            issues.append("Connection unhealthy")

        return {
            'component_id': component_id,
            'status': status.value,
            'message': '; '.join(issues) if issues else 'All checks passing',
            'metrics': {
                'frame_count': metrics.frame_count,
                'error_count': metrics.error_count,
                'warning_count': metrics.warning_count,
                'restart_count': metrics.restart_count,
                'avg_frame_interval': metrics.avg_frame_interval,
                'last_frame_age': current_time - metrics.last_frame_time if metrics.last_frame_time else None,
                'thread_alive': metrics.thread_alive,
                'connection_healthy': metrics.connection_healthy,
                'memory_usage_mb': metrics.memory_usage_mb,
                'cpu_usage_percent': metrics.cpu_usage_percent,
                'uptime_seconds': current_time - self.system_start_time
            },
            'last_update': metrics.last_update
        }

    def _get_overall_health(self) -> Dict[str, Any]:
        """Get overall system health status."""
        current_time = time.time()
        components = {}
        overall_status = HealthStatus.HEALTHY

        # Get health for all components
        for component_id in self.metrics.keys():
            component_health = self._get_component_health(component_id)
            components[component_id] = component_health

            # Determine overall status
            component_status = HealthStatus(component_health['status'])
            if component_status == HealthStatus.CRITICAL:
                overall_status = HealthStatus.CRITICAL
            elif component_status == HealthStatus.WARNING and overall_status == HealthStatus.HEALTHY:
                overall_status = HealthStatus.WARNING

        # System metrics
        try:
            system_memory = self.process.memory_info()
            system_cpu = self.process.cpu_percent()
        except Exception:
            system_memory = None
            system_cpu = 0.0

        return {
            'overall_status': overall_status.value,
            'timestamp': current_time,
            'uptime_seconds': current_time - self.system_start_time,
            'total_components': len(self.metrics),
            'components': components,
            'system_metrics': {
                'memory_mb': system_memory.rss / (1024 * 1024) if system_memory else 0,
                'cpu_percent': system_cpu,
                'process_id': self.process.pid
            },
            'recent_alerts': list(self.alert_history)[-10:],        # Last 10 alerts
            'recent_recoveries': list(self.recovery_actions)[-10:]  # Last 10 recovery actions
        }

    def _monitor_loop(self):
        """Main health monitoring loop."""
        logger.info("Health monitor loop started")

        while self.running:
            try:
                start_time = time.time()

                # Run all registered health checks
                all_checks = []
                for checker in self.health_checkers:
                    try:
                        checks = checker()
                        all_checks.extend(checks)
                    except Exception as e:
                        logger.error(f"Error in health checker {checker.__name__}: {e}")

                # Process health checks and trigger recovery if needed
                for check in all_checks:
                    self._process_health_check(check)

                # Update system metrics
                self._update_system_metrics()

                # Sleep until next check
                elapsed = time.time() - start_time
                sleep_time = max(0, self.check_interval - elapsed)
                if sleep_time > 0:
                    time.sleep(sleep_time)

            except Exception as e:
                logger.error(f"Error in health monitor loop: {e}")
                time.sleep(5.0)  # Fallback sleep

        logger.info("Health monitor loop ended")

    def _process_health_check(self, check: HealthCheck):
        """Process a health check result and trigger recovery if needed."""
        with self._lock:
            # Store health check
            self.health_checks[check.name] = check

            # Log alerts for non-healthy status
            if check.status != HealthStatus.HEALTHY:
                alert = {
                    'timestamp': check.timestamp,
                    'component': check.name,
                    'status': check.status.value,
                    'message': check.message,
                    'details': check.details
                }
                self.alert_history.append(alert)

                logger.warning(f"Health alert [{check.status.value.upper()}] {check.name}: {check.message}")

                # Trigger recovery if critical and recovery action available
                if check.status == HealthStatus.CRITICAL and check.recovery_action:
                    self._trigger_recovery(check.name, check)

    def _trigger_recovery(self, component: str, check: HealthCheck):
        """Trigger recovery action for a component."""
        if component in self.recovery_callbacks:
            try:
                logger.info(f"Triggering recovery for {component}: {check.recovery_action}")

                success = self.recovery_callbacks[component](component, check)

                recovery_action = {
                    'timestamp': time.time(),
                    'component': component,
                    'action': check.recovery_action,
                    'reason': check.message,
                    'success': success
                }

                with self._lock:
                    self.recovery_actions.append(recovery_action)

                if success:
                    logger.info(f"Recovery successful for {component}")
                else:
                    logger.error(f"Recovery failed for {component}")

            except Exception as e:
                logger.error(f"Error in recovery callback for {component}: {e}")

    def _update_system_metrics(self):
        """Update system-level metrics."""
        try:
            # Update process metrics for all components
            current_time = time.time()

            with self._lock:
                for component_id, metrics in self.metrics.items():
                    # Update CPU and memory if available
                    try:
                        # This is a simplified approach - in practice you'd want
                        # per-thread or per-component resource tracking
                        metrics.cpu_usage_percent = self.process.cpu_percent() / len(self.metrics)
                        memory_info = self.process.memory_info()
                        metrics.memory_usage_mb = memory_info.rss / (1024 * 1024) / len(self.metrics)
                    except Exception:
                        pass

        except Exception as e:
            logger.error(f"Error updating system metrics: {e}")


# Global health monitor instance
health_monitor = HealthMonitor()
core/monitoring/recovery.py (new file, 385 lines added)
@@ -0,0 +1,385 @@
"""
Recovery manager for automatic handling of health issues.
Provides circuit breaker patterns, automatic restarts, and graceful degradation.
"""
import time
import logging
import threading
from typing import Dict, List, Optional, Any, Callable
from dataclasses import dataclass
from enum import Enum
from collections import defaultdict, deque

from .health import HealthCheck, HealthStatus, health_monitor


logger = logging.getLogger(__name__)


class RecoveryAction(Enum):
    """Types of recovery actions."""
    RESTART_STREAM = "restart_stream"
    RESTART_THREAD = "restart_thread"
    CLEAR_BUFFER = "clear_buffer"
    RECONNECT = "reconnect"
    THROTTLE = "throttle"
    DISABLE = "disable"


@dataclass
class RecoveryAttempt:
    """Record of a recovery attempt."""
    timestamp: float
    component: str
    action: RecoveryAction
    reason: str
    success: bool
    details: Dict[str, Any] = None


@dataclass
class RecoveryState:
    """Recovery state for a component - simplified without circuit breaker."""
    failure_count: int = 0
    success_count: int = 0
    last_failure_time: Optional[float] = None
    last_success_time: Optional[float] = None


class RecoveryManager:
    """Manages automatic recovery actions for health issues."""

    def __init__(self):
        self.recovery_handlers: Dict[str, Callable[[str, HealthCheck], bool]] = {}
        self.recovery_states: Dict[str, RecoveryState] = {}
        self.recovery_history: deque = deque(maxlen=1000)
        self._lock = threading.RLock()

        # Configuration - simplified without circuit breaker
        self.recovery_cooldown = 30      # 30 seconds between recovery attempts
        self.max_attempts_per_hour = 20  # Still limit to prevent spam, but much higher

        # Track recovery attempts per component
        self.recovery_attempts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=50))

        # Register with health monitor
        health_monitor.register_recovery_callback("stream", self._handle_stream_recovery)
        health_monitor.register_recovery_callback("thread", self._handle_thread_recovery)
        health_monitor.register_recovery_callback("buffer", self._handle_buffer_recovery)

    def register_recovery_handler(self, action: RecoveryAction, handler: Callable[[str, Dict[str, Any]], bool]):
        """
        Register a recovery handler for a specific action.

        Args:
            action: Type of recovery action
            handler: Function that performs the recovery
        """
        self.recovery_handlers[action.value] = handler
        logger.info(f"Registered recovery handler for {action.value}")

    def can_attempt_recovery(self, component: str) -> bool:
        """
        Check if recovery can be attempted for a component.

        Args:
            component: Component identifier

        Returns:
            True if recovery can be attempted (always allow with minimal throttling)
        """
        with self._lock:
            current_time = time.time()

            # Check recovery attempt rate limiting (much more permissive)
            recent_attempts = [
                attempt for attempt in self.recovery_attempts[component]
                if current_time - attempt <= 3600  # Last hour
            ]

            # Only block if truly excessive attempts
            if len(recent_attempts) >= self.max_attempts_per_hour:
                logger.warning(f"Recovery rate limit exceeded for {component} "
                               f"({len(recent_attempts)} attempts in last hour)")
                return False

            # Check cooldown period (shorter cooldown)
            if recent_attempts:
                last_attempt = max(recent_attempts)
                if current_time - last_attempt < self.recovery_cooldown:
                    logger.debug(f"Recovery cooldown active for {component} "
                                 f"(last attempt {current_time - last_attempt:.1f}s ago)")
                    return False

            return True

    def attempt_recovery(self, component: str, action: RecoveryAction, reason: str,
                         details: Optional[Dict[str, Any]] = None) -> bool:
        """
        Attempt recovery for a component.

        Args:
            component: Component identifier
            action: Recovery action to perform
            reason: Reason for recovery
            details: Additional details

        Returns:
            True if recovery was successful
        """
        if not self.can_attempt_recovery(component):
            return False

        current_time = time.time()

        logger.info(f"Attempting recovery for {component}: {action.value} ({reason})")

        try:
            # Record recovery attempt
            with self._lock:
                self.recovery_attempts[component].append(current_time)

            # Perform recovery action
            success = self._execute_recovery_action(component, action, details or {})

            # Record recovery result
            attempt = RecoveryAttempt(
                timestamp=current_time,
                component=component,
                action=action,
                reason=reason,
                success=success,
                details=details
            )

            with self._lock:
                self.recovery_history.append(attempt)

            # Update recovery state
            self._update_recovery_state(component, success)

            if success:
                logger.info(f"Recovery successful for {component}: {action.value}")
            else:
                logger.error(f"Recovery failed for {component}: {action.value}")

            return success

        except Exception as e:
            logger.error(f"Error during recovery for {component}: {e}")
            self._update_recovery_state(component, False)
            return False

    def _execute_recovery_action(self, component: str, action: RecoveryAction,
                                 details: Dict[str, Any]) -> bool:
        """Execute a specific recovery action."""
        handler_key = action.value

        if handler_key not in self.recovery_handlers:
            logger.error(f"No recovery handler registered for action: {handler_key}")
            return False

        try:
            handler = self.recovery_handlers[handler_key]
            return handler(component, details)

        except Exception as e:
            logger.error(f"Error executing recovery action {handler_key} for {component}: {e}")
            return False

    def _update_recovery_state(self, component: str, success: bool):
        """Update recovery state based on recovery result."""
        current_time = time.time()

        with self._lock:
            if component not in self.recovery_states:
                self.recovery_states[component] = RecoveryState()

            state = self.recovery_states[component]

            if success:
                state.success_count += 1
                state.last_success_time = current_time
                # Reset failure count on success
                state.failure_count = max(0, state.failure_count - 1)
                logger.debug(f"Recovery success for {component} (total successes: {state.success_count})")
            else:
                state.failure_count += 1
                state.last_failure_time = current_time
                logger.debug(f"Recovery failure for {component} (total failures: {state.failure_count})")

    def _handle_stream_recovery(self, component: str, health_check: HealthCheck) -> bool:
        """Handle recovery for stream-related issues."""
        if "frames" in health_check.name:
            # Frame-related issue - restart stream
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_STREAM,
                health_check.message,
                health_check.details
            )
        elif "connection" in health_check.name:
            # Connection issue - reconnect
            return self.attempt_recovery(
                component,
                RecoveryAction.RECONNECT,
                health_check.message,
                health_check.details
            )
        elif "errors" in health_check.name:
            # High error rate - throttle or restart
            return self.attempt_recovery(
                component,
                RecoveryAction.THROTTLE,
                health_check.message,
                health_check.details
            )
        else:
            # Generic stream issue - restart
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_STREAM,
                health_check.message,
                health_check.details
            )

    def _handle_thread_recovery(self, component: str, health_check: HealthCheck) -> bool:
        """Handle recovery for thread-related issues."""
        if "deadlock" in health_check.name:
            # Deadlock detected - restart thread
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_THREAD,
                health_check.message,
                health_check.details
            )
        elif "responsive" in health_check.name:
            # Thread unresponsive - restart
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_THREAD,
                health_check.message,
                health_check.details
            )
        else:
            # Generic thread issue - restart
            return self.attempt_recovery(
                component,
                RecoveryAction.RESTART_THREAD,
                health_check.message,
                health_check.details
            )

    def _handle_buffer_recovery(self, component: str, health_check: HealthCheck) -> bool:
        """Handle recovery for buffer-related issues."""
        # Buffer issues - clear buffer
        return self.attempt_recovery(
            component,
            RecoveryAction.CLEAR_BUFFER,
            health_check.message,
            health_check.details
        )

    def get_recovery_stats(self) -> Dict[str, Any]:
        """Get recovery statistics."""
        current_time = time.time()

        with self._lock:
            # Calculate stats from history
            recent_recoveries = [
                attempt for attempt in self.recovery_history
                if current_time - attempt.timestamp <= 3600  # Last hour
            ]

            stats_by_component = defaultdict(lambda: {
                'attempts': 0,
                'successes': 0,
                'failures': 0,
                'last_attempt': None,
                'last_success': None
            })

            for attempt in recent_recoveries:
                stats = stats_by_component[attempt.component]
                stats['attempts'] += 1

                if attempt.success:
                    stats['successes'] += 1
                    if not stats['last_success'] or attempt.timestamp > stats['last_success']:
                        stats['last_success'] = attempt.timestamp
                else:
                    stats['failures'] += 1

                if not stats['last_attempt'] or attempt.timestamp > stats['last_attempt']:
                    stats['last_attempt'] = attempt.timestamp

            return {
                'total_recoveries_last_hour': len(recent_recoveries),
                'recovery_by_component': dict(stats_by_component),
                'recovery_states': {
                    component: {
                        'failure_count': state.failure_count,
                        'success_count': state.success_count,
                        'last_failure_time': state.last_failure_time,
                        'last_success_time': state.last_success_time
                    }
                    for component, state in self.recovery_states.items()
                },
                'recent_history': [
                    {
                        'timestamp': attempt.timestamp,
                        'component': attempt.component,
                        'action': attempt.action.value,
                        'reason': attempt.reason,
                        'success': attempt.success
                    }
                    for attempt in list(self.recovery_history)[-10:]  # Last 10 attempts
                ]
            }

    def force_recovery(self, component: str, action: RecoveryAction, reason: str = "manual") -> bool:
        """
        Force recovery for a component, bypassing rate limiting.

        Args:
            component: Component identifier
            action: Recovery action to perform
            reason: Reason for forced recovery

        Returns:
            True if recovery was successful
        """
        logger.info(f"Forcing recovery for {component}: {action.value} ({reason})")

        current_time = time.time()

        try:
            # Execute recovery action directly
            success = self._execute_recovery_action(component, action, {})

            # Record forced recovery
            attempt = RecoveryAttempt(
                timestamp=current_time,
                component=component,
                action=action,
                reason=f"forced: {reason}",
                success=success,
                details={'forced': True}
            )

            with self._lock:
                self.recovery_history.append(attempt)
                self.recovery_attempts[component].append(current_time)

            # Update recovery state
            self._update_recovery_state(component, success)

            return success

        except Exception as e:
            logger.error(f"Error during forced recovery for {component}: {e}")
            return False


# Global recovery manager instance
recovery_manager = RecoveryManager()
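Not part of the diff: an illustration of how a caller might bypass the rate limiting and inspect recovery history using the RecoveryManager API defined above. The component name is a placeholder, and a handler for the chosen action must already be registered for the forced action to succeed.

    from core.monitoring.recovery import recovery_manager, RecoveryAction

    # Force an immediate reconnect, skipping the cooldown and rate-limit checks.
    recovery_manager.force_recovery("camera-01", RecoveryAction.RECONNECT, reason="operator request")

    # Inspect the last hour of recovery activity.
    stats = recovery_manager.get_recovery_stats()
    print(stats["total_recoveries_last_hour"], stats["recent_history"])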
core/monitoring/stream_health.py (new file, 351 lines added)
@@ -0,0 +1,351 @@
"""
Stream-specific health monitoring for video streams.
Tracks frame production, connection health, and stream-specific metrics.
"""
import time
import logging
import threading
import requests
from typing import Dict, Optional, List, Any
from collections import deque
from dataclasses import dataclass

from .health import HealthCheck, HealthStatus, health_monitor


logger = logging.getLogger(__name__)


@dataclass
class StreamMetrics:
    """Metrics for an individual stream."""
    camera_id: str
    stream_type: str  # 'rtsp', 'http_snapshot'
    start_time: float
    last_frame_time: Optional[float] = None
    frame_count: int = 0
    error_count: int = 0
    reconnect_count: int = 0
    bytes_received: int = 0
    frames_per_second: float = 0.0
    connection_attempts: int = 0
    last_connection_test: Optional[float] = None
    connection_healthy: bool = True
    last_error: Optional[str] = None
    last_error_time: Optional[float] = None


class StreamHealthTracker:
    """Tracks health for individual video streams."""

    def __init__(self):
        self.streams: Dict[str, StreamMetrics] = {}
        self._lock = threading.RLock()

        # Configuration
        self.connection_test_interval = 300  # Test connection every 5 minutes
        self.frame_timeout_warning = 120     # Warn if no frames for 2 minutes
        self.frame_timeout_critical = 300    # Critical if no frames for 5 minutes
        self.error_rate_threshold = 0.1      # 10% error rate threshold

        # Register with health monitor
        health_monitor.register_health_checker(self._perform_health_checks)

    def register_stream(self, camera_id: str, stream_type: str, source_url: Optional[str] = None):
        """Register a new stream for monitoring."""
        with self._lock:
            if camera_id not in self.streams:
                self.streams[camera_id] = StreamMetrics(
                    camera_id=camera_id,
                    stream_type=stream_type,
                    start_time=time.time()
                )
                logger.info(f"Registered stream for monitoring: {camera_id} ({stream_type})")

            # Update health monitor metrics
            health_monitor.update_metrics(
                camera_id,
                thread_alive=True,
                connection_healthy=True
            )

    def unregister_stream(self, camera_id: str):
        """Unregister a stream from monitoring."""
        with self._lock:
            if camera_id in self.streams:
                del self.streams[camera_id]
                logger.info(f"Unregistered stream from monitoring: {camera_id}")

    def report_frame_received(self, camera_id: str, frame_size_bytes: int = 0):
        """Report that a frame was received."""
        current_time = time.time()

        with self._lock:
            if camera_id not in self.streams:
                logger.warning(f"Frame received for unregistered stream: {camera_id}")
                return

            stream = self.streams[camera_id]

            # Update frame metrics
            if stream.last_frame_time:
                interval = current_time - stream.last_frame_time
                # Calculate FPS as moving average
                if stream.frames_per_second == 0:
                    stream.frames_per_second = 1.0 / interval if interval > 0 else 0
                else:
                    new_fps = 1.0 / interval if interval > 0 else 0
                    stream.frames_per_second = (stream.frames_per_second * 0.9) + (new_fps * 0.1)

            stream.last_frame_time = current_time
            stream.frame_count += 1
            stream.bytes_received += frame_size_bytes

            # Report to health monitor
            health_monitor.report_frame_received(camera_id)
            health_monitor.update_metrics(
                camera_id,
                frame_count=stream.frame_count,
                avg_frame_interval=1.0 / stream.frames_per_second if stream.frames_per_second > 0 else 0,
                last_frame_time=current_time
            )

    def report_error(self, camera_id: str, error_message: str):
        """Report an error for a stream."""
        current_time = time.time()

        with self._lock:
            if camera_id not in self.streams:
                logger.warning(f"Error reported for unregistered stream: {camera_id}")
                return

            stream = self.streams[camera_id]
            stream.error_count += 1
            stream.last_error = error_message
            stream.last_error_time = current_time

            # Report to health monitor
            health_monitor.report_error(camera_id, "stream_error")
            health_monitor.update_metrics(
                camera_id,
                error_count=stream.error_count
            )

            logger.debug(f"Error reported for stream {camera_id}: {error_message}")

    def report_reconnect(self, camera_id: str, reason: str = "unknown"):
        """Report that a stream reconnected."""
        current_time = time.time()

        with self._lock:
            if camera_id not in self.streams:
                logger.warning(f"Reconnect reported for unregistered stream: {camera_id}")
                return

            stream = self.streams[camera_id]
            stream.reconnect_count += 1

            # Report to health monitor
            health_monitor.report_restart(camera_id)
            health_monitor.update_metrics(
                camera_id,
                restart_count=stream.reconnect_count
            )

            logger.info(f"Reconnect reported for stream {camera_id}: {reason}")

    def report_connection_attempt(self, camera_id: str, success: bool):
        """Report a connection attempt."""
        with self._lock:
            if camera_id not in self.streams:
                return

            stream = self.streams[camera_id]
            stream.connection_attempts += 1
            stream.connection_healthy = success

            # Report to health monitor
            health_monitor.update_metrics(
                camera_id,
                connection_healthy=success
            )

    def test_http_connection(self, camera_id: str, url: str) -> bool:
        """Test HTTP connection health for snapshot streams."""
        try:
            # Quick HEAD request to test connectivity
            response = requests.head(url, timeout=5, verify=False)
            success = response.status_code in [200, 404]  # 404 might be normal for some cameras

            self.report_connection_attempt(camera_id, success)

            if success:
                logger.debug(f"Connection test passed for {camera_id}")
            else:
                logger.warning(f"Connection test failed for {camera_id}: HTTP {response.status_code}")

            return success

        except Exception as e:
            logger.warning(f"Connection test failed for {camera_id}: {e}")
            self.report_connection_attempt(camera_id, False)
            return False

    def get_stream_metrics(self, camera_id: str) -> Optional[Dict[str, Any]]:
        """Get metrics for a specific stream."""
        with self._lock:
            if camera_id not in self.streams:
                return None

            stream = self.streams[camera_id]
            current_time = time.time()

            # Calculate derived metrics
            uptime = current_time - stream.start_time
            frame_age = current_time - stream.last_frame_time if stream.last_frame_time else None
            error_rate = stream.error_count / max(1, stream.frame_count)

            return {
                'camera_id': camera_id,
                'stream_type': stream.stream_type,
                'uptime_seconds': uptime,
                'frame_count': stream.frame_count,
                'frames_per_second': stream.frames_per_second,
                'bytes_received': stream.bytes_received,
                'error_count': stream.error_count,
                'error_rate': error_rate,
                'reconnect_count': stream.reconnect_count,
                'connection_attempts': stream.connection_attempts,
                'connection_healthy': stream.connection_healthy,
                'last_frame_age_seconds': frame_age,
                'last_error': stream.last_error,
                'last_error_time': stream.last_error_time
            }

    def get_all_metrics(self) -> Dict[str, Dict[str, Any]]:
        """Get metrics for all streams."""
        with self._lock:
            return {
                camera_id: self.get_stream_metrics(camera_id)
                for camera_id in self.streams.keys()
            }

    def _perform_health_checks(self) -> List[HealthCheck]:
        """Perform health checks for all streams."""
        checks = []
        current_time = time.time()

        with self._lock:
            for camera_id, stream in self.streams.items():
                checks.extend(self._check_stream_health(camera_id, stream, current_time))

        return checks

    def _check_stream_health(self, camera_id: str, stream: StreamMetrics, current_time: float) -> List[HealthCheck]:
        """Perform health checks for a single stream."""
        checks = []

        # Check frame freshness
        if stream.last_frame_time:
            frame_age = current_time - stream.last_frame_time

            if frame_age > self.frame_timeout_critical:
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_frames",
                    status=HealthStatus.CRITICAL,
                    message=f"No frames for {frame_age:.1f}s (critical threshold: {self.frame_timeout_critical}s)",
                    details={
                        'frame_age': frame_age,
                        'threshold': self.frame_timeout_critical,
                        'last_frame_time': stream.last_frame_time
                    },
                    recovery_action="restart_stream"
                ))
            elif frame_age > self.frame_timeout_warning:
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_frames",
                    status=HealthStatus.WARNING,
                    message=f"Frames aging: {frame_age:.1f}s (warning threshold: {self.frame_timeout_warning}s)",
                    details={
                        'frame_age': frame_age,
                        'threshold': self.frame_timeout_warning,
                        'last_frame_time': stream.last_frame_time
                    }
                ))
        else:
            # No frames received yet
            startup_time = current_time - stream.start_time
            if startup_time > 60:  # Allow 1 minute for initial connection
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_startup",
                    status=HealthStatus.CRITICAL,
                    message=f"No frames received since startup {startup_time:.1f}s ago",
                    details={
                        'startup_time': startup_time,
                        'start_time': stream.start_time
                    },
                    recovery_action="restart_stream"
                ))

        # Check error rate
        if stream.frame_count > 10:  # Need sufficient samples
            error_rate = stream.error_count / stream.frame_count
            if error_rate > self.error_rate_threshold:
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_errors",
                    status=HealthStatus.WARNING,
                    message=f"High error rate: {error_rate:.1%} ({stream.error_count}/{stream.frame_count})",
                    details={
                        'error_rate': error_rate,
                        'error_count': stream.error_count,
                        'frame_count': stream.frame_count,
                        'last_error': stream.last_error
                    }
                ))

        # Check connection health
        if not stream.connection_healthy:
            checks.append(HealthCheck(
                name=f"stream_{camera_id}_connection",
                status=HealthStatus.WARNING,
                message="Connection unhealthy (last test failed)",
                details={
                    'connection_attempts': stream.connection_attempts,
                    'last_connection_test': stream.last_connection_test
                }
            ))

        # Check excessive reconnects
        uptime_hours = (current_time - stream.start_time) / 3600
        if uptime_hours > 1 and stream.reconnect_count > 5:  # More than 5 reconnects per hour
            reconnect_rate = stream.reconnect_count / uptime_hours
            checks.append(HealthCheck(
                name=f"stream_{camera_id}_stability",
                status=HealthStatus.WARNING,
                message=f"Frequent reconnects: {reconnect_rate:.1f}/hour ({stream.reconnect_count} total)",
                details={
                    'reconnect_rate': reconnect_rate,
                    'reconnect_count': stream.reconnect_count,
                    'uptime_hours': uptime_hours
                }
            ))

        # Check frame rate health
        if stream.last_frame_time and stream.frames_per_second > 0:
            expected_fps = 6.0  # Expected FPS for streams
            if stream.frames_per_second < expected_fps * 0.5:  # Less than 50% of expected
                checks.append(HealthCheck(
                    name=f"stream_{camera_id}_framerate",
                    status=HealthStatus.WARNING,
                    message=f"Low frame rate: {stream.frames_per_second:.1f} fps (expected: ~{expected_fps} fps)",
                    details={
                        'current_fps': stream.frames_per_second,
                        'expected_fps': expected_fps
                    }
                ))

        return checks


# Global stream health tracker instance
stream_health_tracker = StreamHealthTracker()
core/monitoring/thread_health.py (new file, 381 lines added)
|
@ -0,0 +1,381 @@
|
|||
"""
|
||||
Thread health monitoring for detecting unresponsive and deadlocked threads.
|
||||
Provides thread liveness detection and responsiveness testing.
|
||||
"""
|
||||
import time
|
||||
import threading
|
||||
import logging
|
||||
import signal
|
||||
import traceback
|
||||
from typing import Dict, List, Optional, Any, Callable
|
||||
from dataclasses import dataclass
|
||||
from collections import defaultdict
|
||||
|
||||
from .health import HealthCheck, HealthStatus, health_monitor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ThreadInfo:
|
||||
"""Information about a monitored thread."""
|
||||
thread_id: int
|
||||
thread_name: str
|
||||
start_time: float
|
||||
last_heartbeat: float
|
||||
heartbeat_count: int = 0
|
||||
is_responsive: bool = True
|
||||
last_activity: Optional[str] = None
|
||||
stack_traces: List[str] = None
|
||||
|
||||
|
||||
class ThreadHealthMonitor:
|
||||
"""Monitors thread health and responsiveness."""
|
||||
|
||||
def __init__(self):
|
||||
self.monitored_threads: Dict[int, ThreadInfo] = {}
|
||||
self.heartbeat_callbacks: Dict[int, Callable[[], bool]] = {}
|
||||
self._lock = threading.RLock()
|
||||
|
||||
# Configuration
|
||||
self.heartbeat_timeout = 60.0 # 1 minute without heartbeat = unresponsive
|
||||
self.responsiveness_test_interval = 30.0 # Test responsiveness every 30 seconds
|
||||
self.stack_trace_count = 5 # Keep last 5 stack traces for analysis
|
||||
|
||||
# Register with health monitor
|
||||
health_monitor.register_health_checker(self._perform_health_checks)
|
||||
|
||||
# Enable periodic responsiveness testing
|
||||
self.test_thread = threading.Thread(target=self._responsiveness_test_loop, daemon=True)
|
||||
self.test_thread.start()
|
||||
|
||||
def register_thread(self, thread: threading.Thread, heartbeat_callback: Optional[Callable[[], bool]] = None):
|
||||
"""
|
||||
Register a thread for monitoring.
|
||||
|
||||
Args:
|
||||
thread: Thread to monitor
|
||||
heartbeat_callback: Optional callback to test thread responsiveness
|
||||
"""
|
||||
with self._lock:
|
||||
thread_info = ThreadInfo(
|
||||
thread_id=thread.ident,
|
||||
thread_name=thread.name,
|
||||
start_time=time.time(),
|
||||
last_heartbeat=time.time()
|
||||
)
|
||||
|
||||
self.monitored_threads[thread.ident] = thread_info
|
||||
|
||||
if heartbeat_callback:
|
||||
self.heartbeat_callbacks[thread.ident] = heartbeat_callback
|
||||
|
||||
logger.info(f"Registered thread for monitoring: {thread.name} (ID: {thread.ident})")
|
||||
|
||||
def unregister_thread(self, thread_id: int):
|
||||
"""Unregister a thread from monitoring."""
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
thread_name = self.monitored_threads[thread_id].thread_name
|
||||
del self.monitored_threads[thread_id]
|
||||
|
||||
if thread_id in self.heartbeat_callbacks:
|
||||
del self.heartbeat_callbacks[thread_id]
|
||||
|
||||
logger.info(f"Unregistered thread from monitoring: {thread_name} (ID: {thread_id})")
|
||||
|
||||
def heartbeat(self, thread_id: Optional[int] = None, activity: Optional[str] = None):
|
||||
"""
|
||||
Report thread heartbeat.
|
||||
|
||||
Args:
|
||||
thread_id: Thread ID (uses current thread if None)
|
||||
activity: Description of current activity
|
||||
"""
|
||||
if thread_id is None:
|
||||
thread_id = threading.current_thread().ident
|
||||
|
||||
current_time = time.time()
|
||||
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
thread_info = self.monitored_threads[thread_id]
|
||||
thread_info.last_heartbeat = current_time
|
||||
thread_info.heartbeat_count += 1
|
||||
thread_info.is_responsive = True
|
||||
|
||||
if activity:
|
||||
thread_info.last_activity = activity
|
||||
|
||||
# Report to health monitor
|
||||
health_monitor.update_metrics(
|
||||
f"thread_{thread_info.thread_name}",
|
||||
thread_alive=True,
|
||||
last_frame_time=current_time
|
||||
)
|
||||
|
||||
def get_thread_info(self, thread_id: int) -> Optional[Dict[str, Any]]:
|
||||
"""Get information about a monitored thread."""
|
||||
with self._lock:
|
||||
if thread_id not in self.monitored_threads:
|
||||
return None
|
||||
|
||||
thread_info = self.monitored_threads[thread_id]
|
||||
current_time = time.time()
|
||||
|
||||
return {
|
||||
'thread_id': thread_id,
|
||||
'thread_name': thread_info.thread_name,
|
||||
'uptime_seconds': current_time - thread_info.start_time,
|
||||
'last_heartbeat_age': current_time - thread_info.last_heartbeat,
|
||||
'heartbeat_count': thread_info.heartbeat_count,
|
||||
'is_responsive': thread_info.is_responsive,
|
||||
'last_activity': thread_info.last_activity,
|
||||
'stack_traces': thread_info.stack_traces or []
|
||||
}
|
||||
|
||||
def get_all_thread_info(self) -> Dict[int, Dict[str, Any]]:
|
||||
"""Get information about all monitored threads."""
|
||||
with self._lock:
|
||||
return {
|
||||
thread_id: self.get_thread_info(thread_id)
|
||||
for thread_id in self.monitored_threads.keys()
|
||||
}
|
||||
|
||||
def test_thread_responsiveness(self, thread_id: int) -> bool:
|
||||
"""
|
||||
Test if a thread is responsive by calling its heartbeat callback.
|
||||
|
||||
Args:
|
||||
thread_id: ID of thread to test
|
||||
|
||||
Returns:
|
||||
True if thread responds within timeout
|
||||
"""
|
||||
if thread_id not in self.heartbeat_callbacks:
|
||||
return True # Can't test if no callback provided
|
||||
|
||||
try:
|
||||
# Call the heartbeat callback with a timeout
|
||||
callback = self.heartbeat_callbacks[thread_id]
|
||||
|
||||
# This is a simple approach - in practice you might want to use
|
||||
# threading.Timer or asyncio for more sophisticated timeout handling
|
||||
start_time = time.time()
|
||||
result = callback()
|
||||
response_time = time.time() - start_time
|
||||
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
self.monitored_threads[thread_id].is_responsive = result
|
||||
|
||||
if response_time > 5.0: # Slow response
|
||||
logger.warning(f"Thread {thread_id} slow response: {response_time:.1f}s")
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error testing thread {thread_id} responsiveness: {e}")
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
self.monitored_threads[thread_id].is_responsive = False
|
||||
return False
|
||||
|
||||
def capture_stack_trace(self, thread_id: int) -> Optional[str]:
|
||||
"""
|
||||
Capture stack trace for a thread.
|
||||
|
||||
Args:
|
||||
thread_id: ID of thread to capture
|
||||
|
||||
Returns:
|
||||
Stack trace string or None if not available
|
||||
"""
|
||||
try:
|
||||
# Get all frames for all threads
|
||||
frames = dict(threading._current_frames())
|
||||
|
||||
if thread_id not in frames:
|
||||
return None
|
||||
|
||||
# Format stack trace
|
||||
frame = frames[thread_id]
|
||||
stack_trace = ''.join(traceback.format_stack(frame))
|
||||
|
||||
# Store in thread info
|
||||
with self._lock:
|
||||
if thread_id in self.monitored_threads:
|
||||
thread_info = self.monitored_threads[thread_id]
|
||||
if thread_info.stack_traces is None:
|
||||
thread_info.stack_traces = []
|
||||
|
||||
thread_info.stack_traces.append(f"{time.time()}: {stack_trace}")
|
||||
|
||||
# Keep only last N stack traces
|
||||
if len(thread_info.stack_traces) > self.stack_trace_count:
|
||||
thread_info.stack_traces = thread_info.stack_traces[-self.stack_trace_count:]
|
||||
|
||||
return stack_trace
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error capturing stack trace for thread {thread_id}: {e}")
|
||||
return None
|
||||
|
||||
def detect_deadlocks(self) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Attempt to detect potential deadlocks by analyzing thread states.
|
||||
|
||||
Returns:
|
||||
List of potential deadlock scenarios
|
||||
"""
|
||||
deadlocks = []
|
||||
current_time = time.time()
|
||||
|
||||
with self._lock:
|
||||
# Look for threads that haven't had heartbeats for a long time
|
||||
# and are supposedly alive
|
||||
for thread_id, thread_info in self.monitored_threads.items():
|
||||
heartbeat_age = current_time - thread_info.last_heartbeat
|
||||
|
||||
if heartbeat_age > self.heartbeat_timeout * 2: # Double the timeout
|
||||
# Check if thread still exists
|
||||
thread_exists = any(
|
||||
t.ident == thread_id and t.is_alive()
|
||||
for t in threading.enumerate()
|
||||
)
|
||||
|
||||
if thread_exists:
|
||||
# Thread exists but not responding - potential deadlock
|
||||
stack_trace = self.capture_stack_trace(thread_id)
|
||||
|
||||
deadlock_info = {
|
||||
                    'thread_id': thread_id,
                    'thread_name': thread_info.thread_name,
                    'heartbeat_age': heartbeat_age,
                    'last_activity': thread_info.last_activity,
                    'stack_trace': stack_trace,
                    'detection_time': current_time
                }

                deadlocks.append(deadlock_info)
                logger.warning(f"Potential deadlock detected in thread {thread_info.thread_name}")

        return deadlocks

    def _responsiveness_test_loop(self):
        """Background loop to test thread responsiveness."""
        logger.info("Thread responsiveness testing started")

        while True:
            try:
                time.sleep(self.responsiveness_test_interval)

                with self._lock:
                    thread_ids = list(self.monitored_threads.keys())

                for thread_id in thread_ids:
                    try:
                        self.test_thread_responsiveness(thread_id)
                    except Exception as e:
                        logger.error(f"Error testing thread {thread_id}: {e}")

            except Exception as e:
                logger.error(f"Error in responsiveness test loop: {e}")
                time.sleep(10.0)  # Fallback sleep

    def _perform_health_checks(self) -> List[HealthCheck]:
        """Perform health checks for all monitored threads."""
        checks = []
        current_time = time.time()

        with self._lock:
            for thread_id, thread_info in self.monitored_threads.items():
                checks.extend(self._check_thread_health(thread_id, thread_info, current_time))

        # Check for deadlocks
        deadlocks = self.detect_deadlocks()
        for deadlock in deadlocks:
            checks.append(HealthCheck(
                name=f"deadlock_detection_{deadlock['thread_id']}",
                status=HealthStatus.CRITICAL,
                message=f"Potential deadlock in thread {deadlock['thread_name']} "
                        f"(unresponsive for {deadlock['heartbeat_age']:.1f}s)",
                details=deadlock,
                recovery_action="restart_thread"
            ))

        return checks

    def _check_thread_health(self, thread_id: int, thread_info: ThreadInfo, current_time: float) -> List[HealthCheck]:
        """Perform health checks for a single thread."""
        checks = []

        # Check if thread still exists
        thread_exists = any(
            t.ident == thread_id and t.is_alive()
            for t in threading.enumerate()
        )

        if not thread_exists:
            checks.append(HealthCheck(
                name=f"thread_{thread_info.thread_name}_alive",
                status=HealthStatus.CRITICAL,
                message=f"Thread {thread_info.thread_name} is no longer alive",
                details={
                    'thread_id': thread_id,
                    'uptime': current_time - thread_info.start_time,
                    'last_heartbeat': thread_info.last_heartbeat
                },
                recovery_action="restart_thread"
            ))
            return checks

        # Check heartbeat freshness
        heartbeat_age = current_time - thread_info.last_heartbeat

        if heartbeat_age > self.heartbeat_timeout:
            checks.append(HealthCheck(
                name=f"thread_{thread_info.thread_name}_responsive",
                status=HealthStatus.CRITICAL,
                message=f"Thread {thread_info.thread_name} unresponsive for {heartbeat_age:.1f}s",
                details={
                    'thread_id': thread_id,
                    'heartbeat_age': heartbeat_age,
                    'heartbeat_count': thread_info.heartbeat_count,
                    'last_activity': thread_info.last_activity,
                    'is_responsive': thread_info.is_responsive
                },
                recovery_action="restart_thread"
            ))
        elif heartbeat_age > self.heartbeat_timeout * 0.5:  # Warning at 50% of timeout
            checks.append(HealthCheck(
                name=f"thread_{thread_info.thread_name}_responsive",
                status=HealthStatus.WARNING,
                message=f"Thread {thread_info.thread_name} slow heartbeat: {heartbeat_age:.1f}s",
                details={
                    'thread_id': thread_id,
                    'heartbeat_age': heartbeat_age,
                    'heartbeat_count': thread_info.heartbeat_count,
                    'last_activity': thread_info.last_activity,
                    'is_responsive': thread_info.is_responsive
                }
            ))

        # Check responsiveness test results
        if not thread_info.is_responsive:
            checks.append(HealthCheck(
                name=f"thread_{thread_info.thread_name}_callback",
                status=HealthStatus.WARNING,
                message=f"Thread {thread_info.thread_name} failed responsiveness test",
                details={
                    'thread_id': thread_id,
                    'last_activity': thread_info.last_activity
                }
            ))

        return checks


# Global thread health monitor instance
thread_health_monitor = ThreadHealthMonitor()
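For orientation, a minimal sketch of how a worker thread can plug into this monitor, using only the register_thread / heartbeat / unregister_thread calls that appear in this commit; the import path, loop body, and names are illustrative assumptions, not part of the change itself.

```python
import threading
import time

# Import path assumed from the package layout (core/monitoring/thread_health.py).
from core.monitoring.thread_health import thread_health_monitor

stop_event = threading.Event()

def worker_loop():
    while not stop_event.is_set():
        # Refresh this thread's heartbeat so the monitor keeps seeing it as responsive.
        thread_health_monitor.heartbeat(activity="processing_batch")
        time.sleep(1.0)  # placeholder for real work

worker = threading.Thread(target=worker_loop, name="example-worker", daemon=True)
worker.start()

# The second argument is the responsiveness probe; returning True means "healthy".
thread_health_monitor.register_thread(worker, lambda: worker.is_alive())

# ... later, on shutdown ...
stop_event.set()
worker.join(timeout=5.0)
thread_health_monitor.unregister_thread(worker.ident)
```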
@@ -1,5 +1,6 @@
"""
FFmpeg RTSP stream reader using subprocess piping frames directly to buffer.
Enhanced with comprehensive health monitoring and automatic recovery.
"""
import cv2
import time
@@ -7,10 +8,13 @@ import threading
import numpy as np
import subprocess
import struct
from typing import Optional, Callable, Dict, Any

from .base import VideoReader
from .utils import log_success, log_warning, log_error, log_info
from ..monitoring.stream_health import stream_health_tracker
from ..monitoring.thread_health import thread_health_monitor
from ..monitoring.recovery import recovery_manager, RecoveryAction


class FFmpegRTSPReader(VideoReader):

@@ -35,6 +39,21 @@ class FFmpegRTSPReader(VideoReader):
        self.first_start_timeout = 30.0  # 30s timeout on first start
        self.restart_timeout = 15.0  # 15s timeout after restart

        # Health monitoring setup
        self.last_heartbeat = time.time()
        self.consecutive_errors = 0
        self.ffmpeg_restart_count = 0

        # Register recovery handlers
        recovery_manager.register_recovery_handler(
            RecoveryAction.RESTART_STREAM,
            self._handle_restart_recovery
        )
        recovery_manager.register_recovery_handler(
            RecoveryAction.RECONNECT,
            self._handle_reconnect_recovery
        )

    @property
    def is_running(self) -> bool:
        """Check if the reader is currently running."""

@@ -58,21 +77,35 @@ class FFmpegRTSPReader(VideoReader):
        self.stop_event.clear()
        self.thread = threading.Thread(target=self._read_frames, daemon=True)
        self.thread.start()

        # Register with health monitoring
        stream_health_tracker.register_stream(self.camera_id, "rtsp_ffmpeg", self.rtsp_url)
        thread_health_monitor.register_thread(self.thread, self._heartbeat_callback)

        log_success(self.camera_id, "Stream started with health monitoring")

    def stop(self):
        """Stop the FFmpeg subprocess reader."""
        self.stop_event.set()

        # Unregister from health monitoring
        if self.thread:
            thread_health_monitor.unregister_thread(self.thread.ident)

        if self.process:
            self.process.terminate()
            try:
                self.process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                self.process.kill()

        if self.thread:
            self.thread.join(timeout=5.0)
        if self.stderr_thread:
            self.stderr_thread.join(timeout=2.0)

        stream_health_tracker.unregister_stream(self.camera_id)

        log_info(self.camera_id, "Stream stopped")

    def _start_ffmpeg_process(self):

@@ -249,6 +282,9 @@ class FFmpegRTSPReader(VideoReader):

        while not self.stop_event.is_set():
            try:
                # Send heartbeat for thread health monitoring
                self._send_heartbeat("reading_frames")

                # Check watchdog timeout if process is running
                if self.process and self.process.poll() is None:
                    if self._check_watchdog_timeout():

@@ -259,8 +295,17 @@ class FFmpegRTSPReader(VideoReader):
                if not self.process or self.process.poll() is not None:
                    if self.process and self.process.poll() is not None:
                        log_warning(self.camera_id, "Stream disconnected, reconnecting...")
                        stream_health_tracker.report_error(
                            self.camera_id,
                            "FFmpeg process disconnected"
                        )

                    if not self._start_ffmpeg_process():
                        self.consecutive_errors += 1
                        stream_health_tracker.report_error(
                            self.camera_id,
                            "Failed to start FFmpeg process"
                        )
                        time.sleep(5.0)
                        continue

@@ -275,9 +320,22 @@ class FFmpegRTSPReader(VideoReader):
                    # Update watchdog - we got a frame
                    self.last_frame_time = time.time()

                    # Reset error counter on successful frame
                    self.consecutive_errors = 0

                    # Report successful frame to health monitoring
                    frame_size = frame.nbytes
                    stream_health_tracker.report_frame_received(self.camera_id, frame_size)

                    # Call frame callback
                    if self.frame_callback:
                        try:
                            self.frame_callback(self.camera_id, frame)
                        except Exception as e:
                            stream_health_tracker.report_error(
                                self.camera_id,
                                f"Frame callback error: {e}"
                            )

                    frame_count += 1

@@ -287,16 +345,85 @@ class FFmpegRTSPReader(VideoReader):
                        log_success(self.camera_id, f"{frame_count} frames captured ({frame.shape[1]}x{frame.shape[0]})")
                        last_log_time = current_time

                except Exception as e:
                    # Process might have died, let it restart on next iteration
                    stream_health_tracker.report_error(
                        self.camera_id,
                        f"Frame reading error: {e}"
                    )
                    if self.process:
                        self.process.terminate()
                        self.process = None
                    time.sleep(1.0)

            except Exception as e:
                stream_health_tracker.report_error(
                    self.camera_id,
                    f"Main loop error: {e}"
                )
                time.sleep(1.0)

        # Cleanup
        if self.process:
            self.process.terminate()

    # Health monitoring methods
    def _send_heartbeat(self, activity: str = "running"):
        """Send heartbeat to thread health monitor."""
        self.last_heartbeat = time.time()
        thread_health_monitor.heartbeat(activity=activity)

    def _heartbeat_callback(self) -> bool:
        """Heartbeat callback for thread responsiveness testing."""
        try:
            # Check if thread is responsive by checking recent heartbeat
            current_time = time.time()
            age = current_time - self.last_heartbeat

            # Thread is responsive if heartbeat is recent
            return age < 30.0  # 30 second responsiveness threshold

        except Exception:
            return False

    def _handle_restart_recovery(self, component: str, details: Dict[str, Any]) -> bool:
        """Handle restart recovery action."""
        try:
            log_info(self.camera_id, "Restarting FFmpeg RTSP reader for health recovery")

            # Stop current instance
            self.stop()

            # Small delay
            time.sleep(2.0)

            # Restart
            self.start()

            # Report successful restart
            stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_restart")
            self.ffmpeg_restart_count += 1

            return True

        except Exception as e:
            log_error(self.camera_id, f"Failed to restart FFmpeg RTSP reader: {e}")
            return False

    def _handle_reconnect_recovery(self, component: str, details: Dict[str, Any]) -> bool:
        """Handle reconnect recovery action."""
        try:
            log_info(self.camera_id, "Reconnecting FFmpeg RTSP reader for health recovery")

            # Force restart FFmpeg process
            self._restart_ffmpeg_process()

            # Reset error counters
            self.consecutive_errors = 0
            stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_reconnect")

            return True

        except Exception as e:
            log_error(self.camera_id, f"Failed to reconnect FFmpeg RTSP reader: {e}")
            return False
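A hedged usage sketch of the enhanced reader follows; the module path, constructor arguments, and the way the frame callback is attached are assumptions inferred from the attributes used above (camera_id, rtsp_url, frame_callback), not a documented signature.

```python
import time

# Module path and constructor signature are assumed for illustration only.
from core.streaming.readers.ffmpeg_rtsp import FFmpegRTSPReader

def on_frame(camera_id, frame):
    # frame is the numpy array decoded from the FFmpeg pipe
    print(f"{camera_id}: {frame.shape[1]}x{frame.shape[0]}")

reader = FFmpegRTSPReader(camera_id="cam-01", rtsp_url="rtsp://camera.local/stream")
reader.frame_callback = on_frame   # assumed attribute, as used in _read_frames

reader.start()   # also registers with stream_health_tracker and thread_health_monitor
time.sleep(60)   # run for a while
reader.stop()    # unregisters from both monitors and terminates the FFmpeg subprocess
```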
@@ -1,5 +1,6 @@
"""
HTTP snapshot reader optimized for 2560x1440 (2K) high quality images.
Enhanced with comprehensive health monitoring and automatic recovery.
"""
import cv2
import logging
@@ -7,10 +8,13 @@ import time
import threading
import requests
import numpy as np
from typing import Optional, Callable, Dict, Any

from .base import VideoReader
from .utils import log_success, log_warning, log_error, log_info
from ..monitoring.stream_health import stream_health_tracker
from ..monitoring.thread_health import thread_health_monitor
from ..monitoring.recovery import recovery_manager, RecoveryAction

logger = logging.getLogger(__name__)

@@ -30,6 +34,22 @@ class HTTPSnapshotReader(VideoReader):
        self.expected_height = 1440
        self.max_file_size = 10 * 1024 * 1024  # 10MB max for 2K image

        # Health monitoring setup
        self.last_heartbeat = time.time()
        self.consecutive_errors = 0
        self.connection_test_interval = 300  # Test connection every 5 minutes
        self.last_connection_test = None

        # Register recovery handlers
        recovery_manager.register_recovery_handler(
            RecoveryAction.RESTART_STREAM,
            self._handle_restart_recovery
        )
        recovery_manager.register_recovery_handler(
            RecoveryAction.RECONNECT,
            self._handle_reconnect_recovery
        )

    @property
    def is_running(self) -> bool:
        """Check if the reader is currently running."""

@@ -53,13 +73,24 @@ class HTTPSnapshotReader(VideoReader):
        self.stop_event.clear()
        self.thread = threading.Thread(target=self._read_snapshots, daemon=True)
        self.thread.start()

        # Register with health monitoring
        stream_health_tracker.register_stream(self.camera_id, "http_snapshot", self.snapshot_url)
        thread_health_monitor.register_thread(self.thread, self._heartbeat_callback)

        logger.info(f"Started snapshot reader for camera {self.camera_id} with health monitoring")

    def stop(self):
        """Stop the snapshot reader thread."""
        self.stop_event.set()

        # Unregister from health monitoring
        if self.thread:
            thread_health_monitor.unregister_thread(self.thread.ident)
            self.thread.join(timeout=5.0)

        stream_health_tracker.unregister_stream(self.camera_id)

        logger.info(f"Stopped snapshot reader for camera {self.camera_id}")

    def _read_snapshots(self):

@@ -67,17 +98,29 @@ class HTTPSnapshotReader(VideoReader):
        retries = 0
        frame_count = 0
        last_log_time = time.time()
        last_connection_test = time.time()
        interval_seconds = self.interval_ms / 1000.0

        logger.info(f"Snapshot interval for camera {self.camera_id}: {interval_seconds}s")

        while not self.stop_event.is_set():
            try:
                # Send heartbeat for thread health monitoring
                self._send_heartbeat("fetching_snapshot")

                start_time = time.time()
                frame = self._fetch_snapshot()

                if frame is None:
                    retries += 1
                    self.consecutive_errors += 1

                    # Report error to health monitoring
                    stream_health_tracker.report_error(
                        self.camera_id,
                        f"Failed to fetch snapshot (retry {retries}/{self.max_retries})"
                    )

                    logger.warning(f"Failed to fetch snapshot for camera {self.camera_id}, retry {retries}/{self.max_retries}")

                    if self.max_retries != -1 and retries > self.max_retries:

@@ -90,21 +133,36 @@ class HTTPSnapshotReader(VideoReader):
                # Accept any valid image dimensions - don't force specific resolution
                if frame.shape[1] <= 0 or frame.shape[0] <= 0:
                    logger.warning(f"Camera {self.camera_id}: Invalid frame dimensions {frame.shape[1]}x{frame.shape[0]}")
                    stream_health_tracker.report_error(
                        self.camera_id,
                        f"Invalid frame dimensions: {frame.shape[1]}x{frame.shape[0]}"
                    )
                    continue

                # Reset retry counter on successful fetch
                retries = 0
                self.consecutive_errors = 0
                frame_count += 1

                # Report successful frame to health monitoring
                frame_size = frame.nbytes
                stream_health_tracker.report_frame_received(self.camera_id, frame_size)

                # Call frame callback
                if self.frame_callback:
                    try:
                        self.frame_callback(self.camera_id, frame)
                    except Exception as e:
                        logger.error(f"Camera {self.camera_id}: Frame callback error: {e}")
                        stream_health_tracker.report_error(self.camera_id, f"Frame callback error: {e}")

                # Periodic connection health test
                current_time = time.time()
                if current_time - last_connection_test >= self.connection_test_interval:
                    self._test_connection_health()
                    last_connection_test = current_time

                # Log progress every 30 seconds
                current_time = time.time()
                if current_time - last_log_time >= 30:
                    logger.info(f"Camera {self.camera_id}: {frame_count} snapshots processed")
                    last_log_time = current_time

@@ -117,6 +175,7 @@ class HTTPSnapshotReader(VideoReader):

            except Exception as e:
                logger.error(f"Error in snapshot loop for camera {self.camera_id}: {e}")
                stream_health_tracker.report_error(self.camera_id, f"Snapshot loop error: {e}")
                retries += 1
                if self.max_retries != -1 and retries > self.max_retries:
                    break

@@ -246,4 +305,74 @@ class HTTPSnapshotReader(VideoReader):
            right = target_width - new_width - left
            resized = cv2.copyMakeBorder(resized, top, bottom, left, right, cv2.BORDER_CONSTANT, value=[0, 0, 0])

        return resized

    # Health monitoring methods
    def _send_heartbeat(self, activity: str = "running"):
        """Send heartbeat to thread health monitor."""
        self.last_heartbeat = time.time()
        thread_health_monitor.heartbeat(activity=activity)

    def _heartbeat_callback(self) -> bool:
        """Heartbeat callback for thread responsiveness testing."""
        try:
            # Check if thread is responsive by checking recent heartbeat
            current_time = time.time()
            age = current_time - self.last_heartbeat

            # Thread is responsive if heartbeat is recent
            return age < 30.0  # 30 second responsiveness threshold

        except Exception:
            return False

    def _test_connection_health(self):
        """Test HTTP connection health."""
        try:
            stream_health_tracker.test_http_connection(self.camera_id, self.snapshot_url)
        except Exception as e:
            logger.error(f"Error testing connection health for {self.camera_id}: {e}")

    def _handle_restart_recovery(self, component: str, details: Dict[str, Any]) -> bool:
        """Handle restart recovery action."""
        try:
            logger.info(f"Restarting HTTP snapshot reader for {self.camera_id}")

            # Stop current instance
            self.stop()

            # Small delay
            time.sleep(2.0)

            # Restart
            self.start()

            # Report successful restart
            stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_restart")

            return True

        except Exception as e:
            logger.error(f"Failed to restart HTTP snapshot reader for {self.camera_id}: {e}")
            return False

    def _handle_reconnect_recovery(self, component: str, details: Dict[str, Any]) -> bool:
        """Handle reconnect recovery action."""
        try:
            logger.info(f"Reconnecting HTTP snapshot reader for {self.camera_id}")

            # Test connection first
            success = stream_health_tracker.test_http_connection(self.camera_id, self.snapshot_url)

            if success:
                # Reset error counters
                self.consecutive_errors = 0
                stream_health_tracker.report_reconnect(self.camera_id, "health_recovery_reconnect")
                return True
            else:
                logger.warning(f"Connection test failed during recovery for {self.camera_id}")
                return False

        except Exception as e:
            logger.error(f"Failed to reconnect HTTP snapshot reader for {self.camera_id}: {e}")
            return False
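Both readers register per-action callbacks with the global recovery_manager; the contract implied by the code above is a callable taking (component, details) and returning True on success. A minimal sketch of registering an additional handler, with the handler body and component name purely illustrative:

```python
from typing import Any, Dict

# Import path as used by the readers above (core/monitoring/recovery.py).
from core.monitoring.recovery import recovery_manager, RecoveryAction

def clear_decoder_state(component: str, details: Dict[str, Any]) -> bool:
    """Illustrative handler: return True only if the recovery action succeeded."""
    print(f"Recovery requested for {component}: {details}")
    return True

# RESTART_STREAM and RECONNECT are the actions exercised by the readers in this commit.
recovery_manager.register_recovery_handler(RecoveryAction.RECONNECT, clear_decoder_state)
```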