Implement comprehensive health monitoring for streams and threads
Some checks failed
Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
Build Worker Base and Application Images / deploy-stack (push) Has been cancelled
Some checks failed
Build Worker Base and Application Images / check-base-changes (push) Successful in 8s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 3m7s
Build Worker Base and Application Images / deploy-stack (push) Has been cancelled
- Added RecoveryManager for automatic handling of health issues, including circuit breaker patterns, automatic restarts, and graceful degradation.
- Introduced StreamHealthTracker to monitor video stream metrics, including frame production, connection health, and error rates.
- Developed ThreadHealthMonitor for detecting unresponsive and deadlocked threads, providing liveness detection and responsiveness testing.
- Integrated health checks for streams and threads, reporting metrics and recovery actions to the health monitor.
- Enhanced logging for recovery attempts, errors, and health checks to improve observability and debugging.
This commit is contained in:
parent
8c08c815ce
commit
b08ce27de2
9 changed files with 2173 additions and 11 deletions
385
core/monitoring/recovery.py
Normal file
385
core/monitoring/recovery.py
Normal file
|
@ -0,0 +1,385 @@
|
|||
"""
|
||||
Recovery manager for automatic handling of health issues.
|
||||
Provides automatic restarts, reconnects, and throttling, guarded by a per-component cooldown and an hourly attempt limit (no circuit breaker).
|
||||
"""
|
||||
import time
|
||||
import logging
|
||||
import threading
|
||||
from typing import Dict, List, Optional, Any, Callable
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from collections import defaultdict, deque
|
||||
|
||||
from .health import HealthCheck, HealthStatus, health_monitor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RecoveryAction(Enum):
    """Closed set of recovery actions a handler can be registered for.

    Each member's ``value`` is the string key under which
    ``RecoveryManager`` stores the corresponding handler.
    """

    # Stream-level actions.
    RESTART_STREAM = "restart_stream"  # frame-related and generic stream issues
    RECONNECT = "reconnect"            # connection issues
    THROTTLE = "throttle"              # high error rates

    # Thread-level action.
    RESTART_THREAD = "restart_thread"

    # Buffer-level action.
    CLEAR_BUFFER = "clear_buffer"

    # Not dispatched by any health callback visible in this module.
    DISABLE = "disable"
|
||||
|
||||
|
||||
@dataclass
class RecoveryAttempt:
    """Record of a single recovery attempt.

    Instances are appended to ``RecoveryManager.recovery_history`` and read
    back by ``RecoveryManager.get_recovery_stats``.
    """

    timestamp: float        # time.time() value captured when the attempt started
    component: str          # identifier of the component being recovered
    action: RecoveryAction  # which recovery action was executed
    reason: str             # human-readable trigger (e.g. the health check message)
    success: bool           # result reported by the recovery handler
    # Optional extra context; None when the caller supplied no details.
    details: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
@dataclass
class RecoveryState:
    """Per-component tally of recovery outcomes (no circuit-breaker state)."""

    # Running counters maintained by RecoveryManager._update_recovery_state.
    failure_count: int = 0
    success_count: int = 0

    # time.time() of the most recent failure / success; None until one occurs.
    last_failure_time: Optional[float] = None
    last_success_time: Optional[float] = None
|
||||
|
||||
|
||||
class RecoveryManager:
    """Manages automatic recovery actions for health issues.

    Recovery handlers are registered per ``RecoveryAction`` and invoked as
    ``handler(component, details)``. Automatic attempts are throttled per
    component by a cooldown (``recovery_cooldown`` seconds) and an hourly cap
    (``max_attempts_per_hour``). Attempt timestamps, the attempt history and
    the per-component state are only touched while holding ``self._lock``.
    """

    # Sliding window, in seconds, used for the hourly cap and the stats report.
    _RATE_LIMIT_WINDOW = 3600

    # Ordered (substring of the health check name, action) rules for stream
    # issues; the first match wins and anything else restarts the stream.
    _STREAM_ACTION_RULES = (
        ("frames", RecoveryAction.RESTART_STREAM),
        ("connection", RecoveryAction.RECONNECT),
        ("errors", RecoveryAction.THROTTLE),
    )

    def __init__(self):
        """Initialise bookkeeping and register callbacks with ``health_monitor``."""
        # Keyed by RecoveryAction.value; handlers receive (component, details).
        self.recovery_handlers: Dict[str, Callable[[str, Dict[str, Any]], bool]] = {}
        self.recovery_states: Dict[str, RecoveryState] = {}
        self.recovery_history: deque = deque(maxlen=1000)
        # Re-entrant so attempt_recovery can call can_attempt_recovery while
        # already holding the lock.
        self._lock = threading.RLock()

        # Configuration - simplified without circuit breaker
        self.recovery_cooldown = 30  # 30 seconds between recovery attempts
        self.max_attempts_per_hour = 20  # Still limit to prevent spam, but much higher

        # Track recovery attempt timestamps per component
        self.recovery_attempts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=50))

        # Register with health monitor
        health_monitor.register_recovery_callback("stream", self._handle_stream_recovery)
        health_monitor.register_recovery_callback("thread", self._handle_thread_recovery)
        health_monitor.register_recovery_callback("buffer", self._handle_buffer_recovery)

    def register_recovery_handler(self, action: RecoveryAction, handler: Callable[[str, Dict[str, Any]], bool]):
        """
        Register a recovery handler for a specific action.

        A later registration for the same action replaces the earlier one.

        Args:
            action: Type of recovery action
            handler: Function called as ``handler(component, details)`` that
                performs the recovery and returns True on success
        """
        self.recovery_handlers[action.value] = handler
        logger.info(f"Registered recovery handler for {action.value}")

    def can_attempt_recovery(self, component: str) -> bool:
        """
        Check if recovery can be attempted for a component.

        Args:
            component: Component identifier

        Returns:
            False when the component already reached ``max_attempts_per_hour``
            attempts in the last hour, or when its latest attempt is more
            recent than ``recovery_cooldown`` seconds; True otherwise.
        """
        with self._lock:
            current_time = time.time()

            # .get() rather than indexing: a pure query must not create an
            # empty deque in the defaultdict for every component asked about.
            timestamps = self.recovery_attempts.get(component, ())
            recent_attempts = [
                stamp for stamp in timestamps
                if current_time - stamp <= self._RATE_LIMIT_WINDOW
            ]

            # Only block if truly excessive attempts
            if len(recent_attempts) >= self.max_attempts_per_hour:
                logger.warning(f"Recovery rate limit exceeded for {component} "
                               f"({len(recent_attempts)} attempts in last hour)")
                return False

            # Check cooldown period
            if recent_attempts:
                last_attempt = max(recent_attempts)
                if current_time - last_attempt < self.recovery_cooldown:
                    logger.debug(f"Recovery cooldown active for {component} "
                                 f"(last attempt {current_time - last_attempt:.1f}s ago)")
                    return False

            return True

    def attempt_recovery(self, component: str, action: RecoveryAction, reason: str,
                         details: Optional[Dict[str, Any]] = None) -> bool:
        """
        Attempt recovery for a component.

        Args:
            component: Component identifier
            action: Recovery action to perform
            reason: Reason for recovery
            details: Additional details (the handler receives ``{}`` when None)

        Returns:
            True if recovery was successful; False if it failed, raised, or
            was rejected by rate limiting / cooldown.
        """
        current_time = time.time()

        # Check and record atomically: otherwise two threads could both pass
        # the cooldown check before either records its attempt.
        with self._lock:
            if not self.can_attempt_recovery(component):
                return False
            self.recovery_attempts[component].append(current_time)

        logger.info(f"Attempting recovery for {component}: {action.value} ({reason})")

        # The handler runs outside the lock so a slow recovery does not block
        # other components.
        try:
            success = self._execute_recovery_action(component, action, details or {})
        except Exception as e:
            logger.exception(f"Error during recovery for {component}: {e}")
            success = False

        self._record_result(RecoveryAttempt(
            timestamp=current_time,
            component=component,
            action=action,
            reason=reason,
            success=success,
            details=details,
        ))

        if success:
            logger.info(f"Recovery successful for {component}: {action.value}")
        else:
            logger.error(f"Recovery failed for {component}: {action.value}")

        return success

    def _record_result(self, attempt: RecoveryAttempt) -> None:
        """Append ``attempt`` to the history and update its component's state."""
        with self._lock:
            self.recovery_history.append(attempt)
        self._update_recovery_state(attempt.component, attempt.success)

    def _execute_recovery_action(self, component: str, action: RecoveryAction,
                                 details: Dict[str, Any]) -> bool:
        """Run the handler registered for ``action``.

        Returns False when no handler is registered or the handler raises;
        otherwise the handler's result coerced to ``bool``.
        """
        handler_key = action.value
        handler = self.recovery_handlers.get(handler_key)

        if handler is None:
            logger.error(f"No recovery handler registered for action: {handler_key}")
            return False

        try:
            # Coerce so a handler that forgets to return a value counts as a
            # failure rather than leaking None into the recorded attempt.
            return bool(handler(component, details))
        except Exception as e:
            logger.exception(f"Error executing recovery action {handler_key} for {component}: {e}")
            return False

    def _update_recovery_state(self, component: str, success: bool):
        """Update the per-component counters based on a recovery result."""
        current_time = time.time()

        with self._lock:
            state = self.recovery_states.setdefault(component, RecoveryState())

            if success:
                state.success_count += 1
                state.last_success_time = current_time
                # Decay (not reset) the failure count: each success cancels
                # one earlier failure, never dropping below zero.
                state.failure_count = max(0, state.failure_count - 1)
                logger.debug(f"Recovery success for {component} (total successes: {state.success_count})")
            else:
                state.failure_count += 1
                state.last_failure_time = current_time
                logger.debug(f"Recovery failure for {component} (total failures: {state.failure_count})")

    def _handle_stream_recovery(self, component: str, health_check: HealthCheck) -> bool:
        """Handle recovery for stream-related issues.

        The action is chosen from the health check name: "frames" restarts
        the stream, "connection" reconnects, "errors" throttles, and any
        other name restarts the stream.
        """
        chosen = RecoveryAction.RESTART_STREAM  # generic stream issue
        for keyword, action in self._STREAM_ACTION_RULES:
            if keyword in health_check.name:
                chosen = action
                break

        return self.attempt_recovery(
            component,
            chosen,
            health_check.message,
            health_check.details,
        )

    def _handle_thread_recovery(self, component: str, health_check: HealthCheck) -> bool:
        """Handle recovery for thread-related issues.

        Deadlocked, unresponsive and generic thread issues are all handled
        the same way: the thread is restarted.
        """
        return self.attempt_recovery(
            component,
            RecoveryAction.RESTART_THREAD,
            health_check.message,
            health_check.details,
        )

    def _handle_buffer_recovery(self, component: str, health_check: HealthCheck) -> bool:
        """Handle recovery for buffer-related issues by clearing the buffer."""
        return self.attempt_recovery(
            component,
            RecoveryAction.CLEAR_BUFFER,
            health_check.message,
            health_check.details,
        )

    def get_recovery_stats(self) -> Dict[str, Any]:
        """Get recovery statistics.

        Returns:
            A dict with the number of attempts in the last hour, per-component
            attempt/success/failure tallies for that hour, the counters held
            in ``recovery_states``, and the last 10 history entries.
        """
        current_time = time.time()

        with self._lock:
            history = list(self.recovery_history)

            recent_recoveries = [
                attempt for attempt in history
                if current_time - attempt.timestamp <= self._RATE_LIMIT_WINDOW
            ]

            stats_by_component = defaultdict(lambda: {
                'attempts': 0,
                'successes': 0,
                'failures': 0,
                'last_attempt': None,
                'last_success': None
            })

            for attempt in recent_recoveries:
                stats = stats_by_component[attempt.component]
                stats['attempts'] += 1

                # Compare against None explicitly so a 0.0 timestamp is not
                # mistaken for "no value yet".
                if attempt.success:
                    stats['successes'] += 1
                    if stats['last_success'] is None or attempt.timestamp > stats['last_success']:
                        stats['last_success'] = attempt.timestamp
                else:
                    stats['failures'] += 1

                if stats['last_attempt'] is None or attempt.timestamp > stats['last_attempt']:
                    stats['last_attempt'] = attempt.timestamp

            return {
                'total_recoveries_last_hour': len(recent_recoveries),
                'recovery_by_component': dict(stats_by_component),
                'recovery_states': {
                    component: {
                        'failure_count': state.failure_count,
                        'success_count': state.success_count,
                        'last_failure_time': state.last_failure_time,
                        'last_success_time': state.last_success_time
                    }
                    for component, state in self.recovery_states.items()
                },
                'recent_history': [
                    {
                        'timestamp': attempt.timestamp,
                        'component': attempt.component,
                        'action': attempt.action.value,
                        'reason': attempt.reason,
                        'success': attempt.success
                    }
                    for attempt in history[-10:]  # Last 10 attempts
                ]
            }

    def force_recovery(self, component: str, action: RecoveryAction, reason: str = "manual") -> bool:
        """
        Force recovery for a component, bypassing rate limiting.

        The forced attempt is still recorded, so it counts toward the
        cooldown and hourly cap applied to later automatic attempts.

        Args:
            component: Component identifier
            action: Recovery action to perform
            reason: Reason for forced recovery

        Returns:
            True if recovery was successful
        """
        logger.info(f"Forcing recovery for {component}: {action.value} ({reason})")

        current_time = time.time()

        # Execute recovery action directly, without can_attempt_recovery.
        try:
            success = self._execute_recovery_action(component, action, {})
        except Exception as e:
            logger.exception(f"Error during forced recovery for {component}: {e}")
            success = False

        with self._lock:
            self.recovery_attempts[component].append(current_time)

        # Recorded on failure too, so state and history stay consistent with
        # attempt_recovery.
        self._record_result(RecoveryAttempt(
            timestamp=current_time,
            component=component,
            action=action,
            reason=f"forced: {reason}",
            success=success,
            details={'forced': True},
        ))

        return success
|
||||
|
||||
|
||||
# Global recovery manager instance
|
||||
# Created at import time: RecoveryManager.__init__ registers the "stream",
# "thread" and "buffer" recovery callbacks with health_monitor.
recovery_manager = RecoveryManager()
|
Loading…
Add table
Add a link
Reference in a new issue