""" Recovery manager for automatic handling of health issues. Provides circuit breaker patterns, automatic restarts, and graceful degradation. """ import time import logging import threading from typing import Dict, List, Optional, Any, Callable from dataclasses import dataclass from enum import Enum from collections import defaultdict, deque from .health import HealthCheck, HealthStatus, health_monitor logger = logging.getLogger(__name__) class RecoveryAction(Enum): """Types of recovery actions.""" RESTART_STREAM = "restart_stream" RESTART_THREAD = "restart_thread" CLEAR_BUFFER = "clear_buffer" RECONNECT = "reconnect" THROTTLE = "throttle" DISABLE = "disable" @dataclass class RecoveryAttempt: """Record of a recovery attempt.""" timestamp: float component: str action: RecoveryAction reason: str success: bool details: Dict[str, Any] = None @dataclass class RecoveryState: """Recovery state for a component - simplified without circuit breaker.""" failure_count: int = 0 success_count: int = 0 last_failure_time: Optional[float] = None last_success_time: Optional[float] = None class RecoveryManager: """Manages automatic recovery actions for health issues.""" def __init__(self): self.recovery_handlers: Dict[str, Callable[[str, HealthCheck], bool]] = {} self.recovery_states: Dict[str, RecoveryState] = {} self.recovery_history: deque = deque(maxlen=1000) self._lock = threading.RLock() # Configuration - simplified without circuit breaker self.recovery_cooldown = 30 # 30 seconds between recovery attempts self.max_attempts_per_hour = 20 # Still limit to prevent spam, but much higher # Track recovery attempts per component self.recovery_attempts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=50)) # Register with health monitor health_monitor.register_recovery_callback("stream", self._handle_stream_recovery) health_monitor.register_recovery_callback("thread", self._handle_thread_recovery) health_monitor.register_recovery_callback("buffer", self._handle_buffer_recovery) def register_recovery_handler(self, action: RecoveryAction, handler: Callable[[str, Dict[str, Any]], bool]): """ Register a recovery handler for a specific action. Args: action: Type of recovery action handler: Function that performs the recovery """ self.recovery_handlers[action.value] = handler logger.info(f"Registered recovery handler for {action.value}") def can_attempt_recovery(self, component: str) -> bool: """ Check if recovery can be attempted for a component. Args: component: Component identifier Returns: True if recovery can be attempted (always allow with minimal throttling) """ with self._lock: current_time = time.time() # Check recovery attempt rate limiting (much more permissive) recent_attempts = [ attempt for attempt in self.recovery_attempts[component] if current_time - attempt <= 3600 # Last hour ] # Only block if truly excessive attempts if len(recent_attempts) >= self.max_attempts_per_hour: logger.warning(f"Recovery rate limit exceeded for {component} " f"({len(recent_attempts)} attempts in last hour)") return False # Check cooldown period (shorter cooldown) if recent_attempts: last_attempt = max(recent_attempts) if current_time - last_attempt < self.recovery_cooldown: logger.debug(f"Recovery cooldown active for {component} " f"(last attempt {current_time - last_attempt:.1f}s ago)") return False return True def attempt_recovery(self, component: str, action: RecoveryAction, reason: str, details: Optional[Dict[str, Any]] = None) -> bool: """ Attempt recovery for a component. Args: component: Component identifier action: Recovery action to perform reason: Reason for recovery details: Additional details Returns: True if recovery was successful """ if not self.can_attempt_recovery(component): return False current_time = time.time() logger.info(f"Attempting recovery for {component}: {action.value} ({reason})") try: # Record recovery attempt with self._lock: self.recovery_attempts[component].append(current_time) # Perform recovery action success = self._execute_recovery_action(component, action, details or {}) # Record recovery result attempt = RecoveryAttempt( timestamp=current_time, component=component, action=action, reason=reason, success=success, details=details ) with self._lock: self.recovery_history.append(attempt) # Update recovery state self._update_recovery_state(component, success) if success: logger.info(f"Recovery successful for {component}: {action.value}") else: logger.error(f"Recovery failed for {component}: {action.value}") return success except Exception as e: logger.error(f"Error during recovery for {component}: {e}") self._update_recovery_state(component, False) return False def _execute_recovery_action(self, component: str, action: RecoveryAction, details: Dict[str, Any]) -> bool: """Execute a specific recovery action.""" handler_key = action.value if handler_key not in self.recovery_handlers: logger.error(f"No recovery handler registered for action: {handler_key}") return False try: handler = self.recovery_handlers[handler_key] return handler(component, details) except Exception as e: logger.error(f"Error executing recovery action {handler_key} for {component}: {e}") return False def _update_recovery_state(self, component: str, success: bool): """Update recovery state based on recovery result.""" current_time = time.time() with self._lock: if component not in self.recovery_states: self.recovery_states[component] = RecoveryState() state = self.recovery_states[component] if success: state.success_count += 1 state.last_success_time = current_time # Reset failure count on success state.failure_count = max(0, state.failure_count - 1) logger.debug(f"Recovery success for {component} (total successes: {state.success_count})") else: state.failure_count += 1 state.last_failure_time = current_time logger.debug(f"Recovery failure for {component} (total failures: {state.failure_count})") def _handle_stream_recovery(self, component: str, health_check: HealthCheck) -> bool: """Handle recovery for stream-related issues.""" if "frames" in health_check.name: # Frame-related issue - restart stream return self.attempt_recovery( component, RecoveryAction.RESTART_STREAM, health_check.message, health_check.details ) elif "connection" in health_check.name: # Connection issue - reconnect return self.attempt_recovery( component, RecoveryAction.RECONNECT, health_check.message, health_check.details ) elif "errors" in health_check.name: # High error rate - throttle or restart return self.attempt_recovery( component, RecoveryAction.THROTTLE, health_check.message, health_check.details ) else: # Generic stream issue - restart return self.attempt_recovery( component, RecoveryAction.RESTART_STREAM, health_check.message, health_check.details ) def _handle_thread_recovery(self, component: str, health_check: HealthCheck) -> bool: """Handle recovery for thread-related issues.""" if "deadlock" in health_check.name: # Deadlock detected - restart thread return self.attempt_recovery( component, RecoveryAction.RESTART_THREAD, health_check.message, health_check.details ) elif "responsive" in health_check.name: # Thread unresponsive - restart return self.attempt_recovery( component, RecoveryAction.RESTART_THREAD, health_check.message, health_check.details ) else: # Generic thread issue - restart return self.attempt_recovery( component, RecoveryAction.RESTART_THREAD, health_check.message, health_check.details ) def _handle_buffer_recovery(self, component: str, health_check: HealthCheck) -> bool: """Handle recovery for buffer-related issues.""" # Buffer issues - clear buffer return self.attempt_recovery( component, RecoveryAction.CLEAR_BUFFER, health_check.message, health_check.details ) def get_recovery_stats(self) -> Dict[str, Any]: """Get recovery statistics.""" current_time = time.time() with self._lock: # Calculate stats from history recent_recoveries = [ attempt for attempt in self.recovery_history if current_time - attempt.timestamp <= 3600 # Last hour ] stats_by_component = defaultdict(lambda: { 'attempts': 0, 'successes': 0, 'failures': 0, 'last_attempt': None, 'last_success': None }) for attempt in recent_recoveries: stats = stats_by_component[attempt.component] stats['attempts'] += 1 if attempt.success: stats['successes'] += 1 if not stats['last_success'] or attempt.timestamp > stats['last_success']: stats['last_success'] = attempt.timestamp else: stats['failures'] += 1 if not stats['last_attempt'] or attempt.timestamp > stats['last_attempt']: stats['last_attempt'] = attempt.timestamp return { 'total_recoveries_last_hour': len(recent_recoveries), 'recovery_by_component': dict(stats_by_component), 'recovery_states': { component: { 'failure_count': state.failure_count, 'success_count': state.success_count, 'last_failure_time': state.last_failure_time, 'last_success_time': state.last_success_time } for component, state in self.recovery_states.items() }, 'recent_history': [ { 'timestamp': attempt.timestamp, 'component': attempt.component, 'action': attempt.action.value, 'reason': attempt.reason, 'success': attempt.success } for attempt in list(self.recovery_history)[-10:] # Last 10 attempts ] } def force_recovery(self, component: str, action: RecoveryAction, reason: str = "manual") -> bool: """ Force recovery for a component, bypassing rate limiting. Args: component: Component identifier action: Recovery action to perform reason: Reason for forced recovery Returns: True if recovery was successful """ logger.info(f"Forcing recovery for {component}: {action.value} ({reason})") current_time = time.time() try: # Execute recovery action directly success = self._execute_recovery_action(component, action, {}) # Record forced recovery attempt = RecoveryAttempt( timestamp=current_time, component=component, action=action, reason=f"forced: {reason}", success=success, details={'forced': True} ) with self._lock: self.recovery_history.append(attempt) self.recovery_attempts[component].append(current_time) # Update recovery state self._update_recovery_state(component, success) return success except Exception as e: logger.error(f"Error during forced recovery for {component}: {e}") return False # Global recovery manager instance recovery_manager = RecoveryManager()