python-detector-worker/core/streaming/process_manager.py
"""
Multiprocessing-based RTSP stream management for scalability.
Handles multiple camera streams using separate processes to bypass GIL limitations.
"""
import multiprocessing as mp
import time
import logging
import cv2
import numpy as np
import queue
import threading
import os
import psutil
from typing import Dict, Optional, Tuple, Any, Callable
from dataclasses import dataclass
from multiprocessing import Process, Queue, Lock, Value, Array, Manager
from multiprocessing.shared_memory import SharedMemory
import signal
import sys
# Ensure proper multiprocessing context for uvicorn compatibility
try:
mp.set_start_method('spawn', force=True)
except RuntimeError:
pass # Already set
logger = logging.getLogger("detector_worker.process_manager")
# Frame dimensions (1280x720 RGB)
FRAME_WIDTH = 1280
FRAME_HEIGHT = 720
FRAME_CHANNELS = 3
FRAME_SIZE = FRAME_WIDTH * FRAME_HEIGHT * FRAME_CHANNELS
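
# Each SharedFrameBuffer below allocates two byte buffers of FRAME_SIZE
# (1280 * 720 * 3 = 2,764,800 bytes, about 2.6 MiB each), so frame storage for a
# single camera costs roughly 5.3 MiB of shared memory.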


@dataclass
class ProcessConfig:
    """Configuration for camera process."""
    camera_id: str
    rtsp_url: str
    expected_fps: int = 6
    buffer_size: int = 3
    max_retries: int = 30
    reconnect_delay: float = 5.0


class SharedFrameBuffer:
    """Thread-safe shared memory frame buffer with double buffering."""

    def __init__(self, camera_id: str):
        self.camera_id = camera_id
        self.lock = mp.Lock()

        # Double buffering for lock-free reads
        self.buffer_a = mp.Array('B', FRAME_SIZE, lock=False)
        self.buffer_b = mp.Array('B', FRAME_SIZE, lock=False)

        # Atomic index for current read buffer (0 or 1)
        self.read_buffer_idx = mp.Value('i', 0)

        # Frame metadata (atomic access)
        self.timestamp = mp.Value('d', 0.0)
        self.frame_number = mp.Value('L', 0)
        self.is_valid = mp.Value('b', False)

        # Statistics
        self.frames_written = mp.Value('L', 0)
        self.frames_dropped = mp.Value('L', 0)

    def write_frame(self, frame: np.ndarray, timestamp: float) -> bool:
        """Write frame to buffer with atomic swap."""
        if frame is None or frame.size == 0:
            return False

        # Resize if needed
        if frame.shape != (FRAME_HEIGHT, FRAME_WIDTH, FRAME_CHANNELS):
            frame = cv2.resize(frame, (FRAME_WIDTH, FRAME_HEIGHT))

        # Get write buffer (opposite of read buffer)
        write_idx = 1 - self.read_buffer_idx.value
        write_buffer = self.buffer_a if write_idx == 0 else self.buffer_b

        try:
            # Write to buffer without lock (safe because of double buffering)
            frame_flat = frame.flatten()
            write_buffer[:] = frame_flat.astype(np.uint8)

            # Update metadata
            self.timestamp.value = timestamp
            self.frame_number.value += 1

            # Atomic swap of buffers
            with self.lock:
                self.read_buffer_idx.value = write_idx
                self.is_valid.value = True

            self.frames_written.value += 1
            return True

        except Exception as e:
            logger.error(f"Error writing frame for {self.camera_id}: {e}")
            self.frames_dropped.value += 1
            return False

    def read_frame(self) -> Optional[Tuple[np.ndarray, float]]:
        """Read frame from buffer without blocking writers."""
        if not self.is_valid.value:
            return None

        # Get current read buffer index (atomic read)
        read_idx = self.read_buffer_idx.value
        read_buffer = self.buffer_a if read_idx == 0 else self.buffer_b

        # Read timestamp (atomic)
        timestamp = self.timestamp.value

        # Copy frame data (no lock needed for read)
        try:
            frame_data = np.array(read_buffer, dtype=np.uint8)
            frame = frame_data.reshape((FRAME_HEIGHT, FRAME_WIDTH, FRAME_CHANNELS))
            return frame.copy(), timestamp
        except Exception as e:
            logger.error(f"Error reading frame for {self.camera_id}: {e}")
            return None

    def get_stats(self) -> Dict[str, int]:
        """Get buffer statistics."""
        return {
            'frames_written': self.frames_written.value,
            'frames_dropped': self.frames_dropped.value,
            'frame_number': self.frame_number.value,
            'is_valid': self.is_valid.value
        }
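

# Design note on SharedFrameBuffer: writes always go to the buffer readers are not
# currently pointed at, and read_buffer_idx is flipped under the lock only after the
# copy completes. Readers never take the lock, so a read that overlaps a write
# returns the previously published frame instead of blocking the capture process.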


def camera_worker_process(
    config: ProcessConfig,
    frame_buffer: SharedFrameBuffer,
    command_queue: Queue,
    status_queue: Queue,
    stop_event: mp.Event
):
    """
    Worker process for individual camera stream.
    Runs in separate process to bypass GIL.
    """
    # Set process name for debugging
    mp.current_process().name = f"Camera-{config.camera_id}"

    # Configure logging for subprocess
    logging.basicConfig(
        level=logging.INFO,
        format=f'%(asctime)s [%(levelname)s] Camera-{config.camera_id}: %(message)s'
    )

    logger.info(f"Starting camera worker for {config.camera_id}")

    cap = None
    consecutive_errors = 0
    frame_interval = 1.0 / config.expected_fps
    last_frame_time = 0
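
    # At the default expected_fps of 6, frame_interval works out to ~0.167 s; the
    # main loop below skips reads that would arrive faster than that.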

    def initialize_capture():
        """Initialize OpenCV capture with optimized settings."""
        nonlocal cap
        try:
            # Set RTSP transport to TCP for reliability
            os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'rtsp_transport;tcp'

            # Create capture
            cap = cv2.VideoCapture(config.rtsp_url, cv2.CAP_FFMPEG)
            if not cap.isOpened():
                logger.error("Failed to open RTSP stream")
                return False

            # Set capture properties
            cap.set(cv2.CAP_PROP_FRAME_WIDTH, FRAME_WIDTH)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT, FRAME_HEIGHT)
            cap.set(cv2.CAP_PROP_FPS, config.expected_fps)
            cap.set(cv2.CAP_PROP_BUFFERSIZE, config.buffer_size)

            # Read initial frames to stabilize
            for _ in range(3):
                ret, _ = cap.read()
                if not ret:
                    logger.warning("Failed to read initial frames")
                time.sleep(0.1)

            logger.info("Successfully initialized capture")
            return True

        except Exception as e:
            logger.error(f"Error initializing capture: {e}")
            return False

    # Main processing loop
    while not stop_event.is_set():
        try:
            # Check for commands (non-blocking)
            try:
                command = command_queue.get_nowait()
                if command == "reinit":
                    logger.info("Received reinit command")
                    if cap:
                        cap.release()
                        cap = None
                    consecutive_errors = 0
            except queue.Empty:
                pass

            # Initialize capture if needed
            if cap is None or not cap.isOpened():
                if not initialize_capture():
                    time.sleep(config.reconnect_delay)
                    consecutive_errors += 1
                    if consecutive_errors > config.max_retries and config.max_retries > 0:
                        logger.error("Max retries reached, exiting")
                        break
                    continue
                else:
                    consecutive_errors = 0

            # Read frame with timing control
            current_time = time.time()
            if current_time - last_frame_time < frame_interval:
                time.sleep(0.01)  # Small sleep to prevent busy waiting
                continue

            ret, frame = cap.read()
            if not ret or frame is None:
                consecutive_errors += 1

                if consecutive_errors >= config.max_retries:
                    logger.error(f"Too many consecutive errors ({consecutive_errors}), reinitializing")
                    if cap:
                        cap.release()
                        cap = None
                    consecutive_errors = 0
                    time.sleep(config.reconnect_delay)
                else:
                    if consecutive_errors <= 5:
                        logger.debug(f"Frame read failed (error {consecutive_errors})")
                    elif consecutive_errors % 10 == 0:
                        logger.warning(f"Continuing frame failures (error {consecutive_errors})")

                    # Exponential backoff
                    sleep_time = min(0.1 * (1.5 ** min(consecutive_errors, 10)), 1.0)
                    time.sleep(sleep_time)
                continue

            # Frame read successful
            consecutive_errors = 0
            last_frame_time = current_time

            # Write to shared buffer
            if frame_buffer.write_frame(frame, current_time):
                # Send status update periodically
                if frame_buffer.frame_number.value % 30 == 0:  # Every 30 frames
                    status_queue.put({
                        'camera_id': config.camera_id,
                        'status': 'running',
                        'frames': frame_buffer.frame_number.value,
                        'timestamp': current_time
                    })

        except KeyboardInterrupt:
            logger.info("Received interrupt signal")
            break
        except Exception as e:
            logger.error(f"Error in camera worker: {e}")
            consecutive_errors += 1
            time.sleep(1.0)

    # Cleanup
    if cap:
        cap.release()

    logger.info("Camera worker stopped")
    status_queue.put({
        'camera_id': config.camera_id,
        'status': 'stopped',
        'frames': frame_buffer.frame_number.value
    })
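

# The worker is defined at module level (rather than as a method) so that the
# 'spawn' start method set above can pickle a reference to it when each camera
# process is launched.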


class RTSPProcessManager:
    """
    Manages multiple camera processes with health monitoring and auto-restart.
    """

    def __init__(self, max_processes: int = None):
        # Leave two cores for the main application, but never drop below one process
        self.max_processes = max_processes or max(1, mp.cpu_count() - 2)
        self.processes: Dict[str, Process] = {}
        self.frame_buffers: Dict[str, SharedFrameBuffer] = {}
        self.command_queues: Dict[str, Queue] = {}
        self.status_queue = mp.Queue()
        self.stop_events: Dict[str, mp.Event] = {}
        self.configs: Dict[str, ProcessConfig] = {}

        # Manager for shared objects
        self.manager = Manager()
        self.process_stats = self.manager.dict()

        # Health monitoring thread
        self.monitor_thread = None
        self.monitor_stop = threading.Event()

        logger.info(f"RTSPProcessManager initialized with max_processes={self.max_processes}")

    def add_camera(self, config: ProcessConfig) -> bool:
        """Add a new camera stream."""
        if config.camera_id in self.processes:
            logger.warning(f"Camera {config.camera_id} already exists")
            return False

        if len(self.processes) >= self.max_processes:
            logger.error(f"Max processes ({self.max_processes}) reached")
            return False

        try:
            # Create shared resources
            frame_buffer = SharedFrameBuffer(config.camera_id)
            command_queue = mp.Queue()
            stop_event = mp.Event()

            # Store resources
            self.frame_buffers[config.camera_id] = frame_buffer
            self.command_queues[config.camera_id] = command_queue
            self.stop_events[config.camera_id] = stop_event
            self.configs[config.camera_id] = config

            # Start process
            process = mp.Process(
                target=camera_worker_process,
                args=(config, frame_buffer, command_queue, self.status_queue, stop_event),
                name=f"Camera-{config.camera_id}"
            )
            process.start()
            self.processes[config.camera_id] = process

            logger.info(f"Started process for camera {config.camera_id} (PID: {process.pid})")
            return True

        except Exception as e:
            logger.error(f"Error adding camera {config.camera_id}: {e}")
            self._cleanup_camera(config.camera_id)
            return False

    def remove_camera(self, camera_id: str) -> bool:
        """Remove a camera stream."""
        if camera_id not in self.processes:
            return False

        logger.info(f"Removing camera {camera_id}")

        # Signal stop
        if camera_id in self.stop_events:
            self.stop_events[camera_id].set()

        # Wait for process to stop
        process = self.processes.get(camera_id)
        if process and process.is_alive():
            process.join(timeout=5.0)
            if process.is_alive():
                logger.warning(f"Force terminating process for {camera_id}")
                process.terminate()
                process.join(timeout=2.0)

        # Cleanup
        self._cleanup_camera(camera_id)
        return True

    def _cleanup_camera(self, camera_id: str):
        """Clean up camera resources."""
        for collection in [self.processes, self.frame_buffers,
                           self.command_queues, self.stop_events, self.configs]:
            collection.pop(camera_id, None)

    def get_frame(self, camera_id: str) -> Optional[Tuple[np.ndarray, float]]:
        """Get latest frame from camera."""
        buffer = self.frame_buffers.get(camera_id)
        if buffer:
            return buffer.read_frame()
        return None

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics for all cameras."""
        stats = {}
        for camera_id, buffer in self.frame_buffers.items():
            process = self.processes.get(camera_id)
            stats[camera_id] = {
                'buffer_stats': buffer.get_stats(),
                'process_alive': process.is_alive() if process else False,
                'process_pid': process.pid if process else None
            }
        return stats

    def start_monitoring(self):
        """Start health monitoring thread."""
        if self.monitor_thread and self.monitor_thread.is_alive():
            return

        self.monitor_stop.clear()
        self.monitor_thread = threading.Thread(target=self._monitor_processes)
        self.monitor_thread.start()
        logger.info("Started process monitoring")

    def _monitor_processes(self):
        """Monitor process health and restart if needed."""
        while not self.monitor_stop.is_set():
            try:
                # Check status queue
                try:
                    while True:
                        status = self.status_queue.get_nowait()
                        self.process_stats[status['camera_id']] = status
                except queue.Empty:
                    pass

                # Check process health
                for camera_id in list(self.processes.keys()):
                    process = self.processes.get(camera_id)
                    if process and not process.is_alive():
                        logger.warning(f"Process for {camera_id} died, restarting")

                        config = self.configs.get(camera_id)
                        if config:
                            self.remove_camera(camera_id)
                            time.sleep(1.0)
                            self.add_camera(config)

                time.sleep(5.0)  # Check every 5 seconds

            except Exception as e:
                logger.error(f"Error in monitor thread: {e}")
                time.sleep(5.0)

    def stop_all(self):
        """Stop all camera processes."""
        logger.info("Stopping all camera processes")

        # Stop monitoring
        if self.monitor_thread:
            self.monitor_stop.set()
            self.monitor_thread.join(timeout=5.0)

        # Stop all cameras
        for camera_id in list(self.processes.keys()):
            self.remove_camera(camera_id)

        logger.info("All processes stopped")