refactor: replace threading with multiprocessing
All checks were successful
Build Worker Base and Application Images / check-base-changes (push) Successful in 10s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 2m52s
Build Worker Base and Application Images / deploy-stack (push) Successful in 8s
All checks were successful
Build Worker Base and Application Images / check-base-changes (push) Successful in 10s
Build Worker Base and Application Images / build-base (push) Has been skipped
Build Worker Base and Application Images / build-docker (push) Successful in 2m52s
Build Worker Base and Application Images / deploy-stack (push) Successful in 8s
This commit is contained in:
parent
e87ed4c056
commit
bfab574058
6 changed files with 682 additions and 58 deletions
453
core/streaming/process_manager.py
Normal file
453
core/streaming/process_manager.py
Normal file
|
@ -0,0 +1,453 @@
|
|||
"""
|
||||
Multiprocessing-based RTSP stream management for scalability.
|
||||
Handles multiple camera streams using separate processes to bypass GIL limitations.
|
||||
"""
|
||||
|
||||
import multiprocessing as mp
|
||||
import time
|
||||
import logging
|
||||
import cv2
|
||||
import numpy as np
|
||||
import queue
|
||||
import threading
|
||||
import os
|
||||
import psutil
|
||||
from typing import Dict, Optional, Tuple, Any, Callable
|
||||
from dataclasses import dataclass
|
||||
from multiprocessing import Process, Queue, Lock, Value, Array, Manager
|
||||
from multiprocessing.shared_memory import SharedMemory
|
||||
import signal
|
||||
import sys
|
||||
|
||||
# Force the 'spawn' start method so worker processes start from a fresh
# interpreter (required for uvicorn compatibility).
try:
    mp.set_start_method('spawn', force=True)
except RuntimeError:
    # Start method was already fixed for this interpreter.
    pass

logger = logging.getLogger("detector_worker.process_manager")

# Fixed geometry of every frame stored in shared memory: 1280x720 with three
# 8-bit channels. Frames are stored in whatever channel order the capture
# delivers them; nothing in this module converts colour order.
FRAME_WIDTH, FRAME_HEIGHT, FRAME_CHANNELS = 1280, 720, 3

# Number of bytes needed to hold one frame.
FRAME_SIZE = FRAME_WIDTH * FRAME_HEIGHT * FRAME_CHANNELS
|
||||
|
||||
@dataclass
class ProcessConfig:
    """Per-camera settings handed to a camera worker process."""

    # Identifier used to key the camera's process, buffers and log lines.
    camera_id: str
    # Source URL opened with OpenCV's FFmpeg backend.
    rtsp_url: str
    # Target capture rate; the worker reads at most one frame every
    # 1 / expected_fps seconds, so this must be non-zero.
    expected_fps: int = 6
    # Value requested for the capture's internal buffer (CAP_PROP_BUFFERSIZE).
    buffer_size: int = 3
    # Consecutive-failure threshold used by the worker for reconnect / exit.
    max_retries: int = 30
    # Seconds to wait before trying to reconnect.
    reconnect_delay: float = 5.0
|
||||
|
||||
|
||||
class SharedFrameBuffer:
    """
    Double-buffered frame store in shared memory for a single camera.

    The writer always fills the slot that is *not* currently published for
    reading and then flips ``read_buffer_idx`` to publish it. Readers copy the
    published slot without taking ``self.lock``.

    NOTE(review): the scheme assumes one writer per buffer and that a reader
    finishes copying a slot before the writer has published the other slot
    and started refilling this one. Nothing here enforces that; confirm it
    holds for the frame rates in use.
    """

    def __init__(self, camera_id: str):
        """Allocate the two frame slots and the shared metadata values."""
        self.camera_id = camera_id
        self.lock = mp.Lock()

        # Two raw (unsynchronised) byte arrays, one frame each.
        self.buffer_a = mp.Array('B', FRAME_SIZE, lock=False)
        self.buffer_b = mp.Array('B', FRAME_SIZE, lock=False)

        # Capture time of the frame held in each slot. Keeping it per slot
        # lets a reader pair the frame it copies with that frame's own
        # timestamp instead of whatever the writer stored most recently.
        self._slot_timestamps = mp.Array('d', 2, lock=False)

        # Index (0 or 1) of the slot readers should use.
        self.read_buffer_idx = mp.Value('i', 0)

        # Metadata of the most recently published frame.
        self.timestamp = mp.Value('d', 0.0)
        self.frame_number = mp.Value('L', 0)
        self.is_valid = mp.Value('b', False)

        # Statistics
        self.frames_written = mp.Value('L', 0)
        self.frames_dropped = mp.Value('L', 0)

    def _slot_view(self, idx: int) -> np.ndarray:
        """
        Return a flat uint8 NumPy view over slot ``idx`` (0 -> A, else B).

        The view is created on demand rather than cached on the instance:
        the instance is pickled when handed to a spawned process, and a
        cached ndarray would be copied instead of staying bound to the
        shared memory.
        """
        raw = self.buffer_a if idx == 0 else self.buffer_b
        return np.frombuffer(raw, dtype=np.uint8)

    def write_frame(self, frame: np.ndarray, timestamp: float) -> bool:
        """
        Store ``frame`` in the unpublished slot and publish it.

        Args:
            frame: Image to store; resized to FRAME_WIDTH x FRAME_HEIGHT when
                its shape differs from the buffer geometry.
            timestamp: Capture time recorded alongside the frame.

        Returns:
            True when the frame was published; False for an empty frame or
            when the copy failed (counted in ``frames_dropped``).
        """
        if frame is None or frame.size == 0:
            return False

        # Resize if needed
        if frame.shape != (FRAME_HEIGHT, FRAME_WIDTH, FRAME_CHANNELS):
            frame = cv2.resize(frame, (FRAME_WIDTH, FRAME_HEIGHT))

        # The write slot is always the opposite of the published slot.
        write_idx = 1 - self.read_buffer_idx.value

        try:
            # One vectorised copy into shared memory. Assigning through the
            # ctypes array slice would convert the frame item by item. A
            # frame whose size does not match the slot (e.g. a single-channel
            # image) raises ValueError here and is counted as dropped.
            np.copyto(self._slot_view(write_idx), frame.ravel(), casting="unsafe")
            self._slot_timestamps[write_idx] = timestamp

            # Publish the slot and its metadata together.
            with self.lock:
                self.read_buffer_idx.value = write_idx
                self.timestamp.value = timestamp
                self.frame_number.value += 1
                self.is_valid.value = True
                self.frames_written.value += 1

            return True

        except Exception as e:
            logger.error(f"Error writing frame for {self.camera_id}: {e}")
            self.frames_dropped.value += 1
            return False

    def read_frame(self) -> Optional[Tuple[np.ndarray, float]]:
        """
        Return a private copy of the published frame and its timestamp.

        Does not take ``self.lock``, so it never waits on the writer's
        publish step.

        Returns:
            ``(frame, timestamp)`` where ``frame`` has shape
            (FRAME_HEIGHT, FRAME_WIDTH, FRAME_CHANNELS), or None when no frame
            has been published yet or the copy failed.
        """
        if not self.is_valid.value:
            return None

        # Snapshot the published slot index once, then read that slot's own
        # timestamp so frame and timestamp belong together.
        read_idx = self.read_buffer_idx.value
        timestamp = self._slot_timestamps[read_idx]

        try:
            shaped = self._slot_view(read_idx).reshape(
                (FRAME_HEIGHT, FRAME_WIDTH, FRAME_CHANNELS)
            )
            # Single copy out of shared memory; the caller owns the result.
            return shaped.copy(), timestamp
        except Exception as e:
            logger.error(f"Error reading frame for {self.camera_id}: {e}")
            return None

    def get_stats(self) -> Dict[str, int]:
        """Return the current counters and validity flag as a plain dict."""
        return {
            'frames_written': self.frames_written.value,
            'frames_dropped': self.frames_dropped.value,
            'frame_number': self.frame_number.value,
            'is_valid': self.is_valid.value
        }
|
||||
|
||||
|
||||
def camera_worker_process(
    config: ProcessConfig,
    frame_buffer: SharedFrameBuffer,
    command_queue: Queue,
    status_queue: Queue,
    stop_event: mp.Event
):
    """
    Worker process for an individual camera stream.

    Runs in a separate process to bypass the GIL. Opens the RTSP stream,
    reads frames at no more than ``config.expected_fps`` and writes them to
    ``frame_buffer`` until ``stop_event`` is set or the retry budget for
    opening the stream is exhausted.

    Args:
        config: Camera settings (URL, frame rate, retry policy).
        frame_buffer: Shared buffer that receives each captured frame.
        command_queue: Control queue; the string "reinit" forces the capture
            to be released and reopened.
        status_queue: Receives a 'running' status dict every 30 frames and a
            final 'stopped' dict on exit.
        stop_event: Set by the parent to request shutdown. All longer waits
            in this function block on this event, so a stop request
            interrupts reconnect and backoff delays immediately.
    """
    # Set process name for debugging
    mp.current_process().name = f"Camera-{config.camera_id}"

    # Configure logging for subprocess
    logging.basicConfig(
        level=logging.INFO,
        format=f'%(asctime)s [%(levelname)s] Camera-{config.camera_id}: %(message)s'
    )

    logger.info(f"Starting camera worker for {config.camera_id}")

    cap = None
    consecutive_errors = 0
    frame_interval = 1.0 / config.expected_fps
    last_frame_time = 0

    def initialize_capture():
        """Open the RTSP stream and apply capture settings; True on success."""
        nonlocal cap

        try:
            # Set RTSP transport to TCP for reliability
            os.environ['OPENCV_FFMPEG_CAPTURE_OPTIONS'] = 'rtsp_transport;tcp'

            # Create capture
            cap = cv2.VideoCapture(config.rtsp_url, cv2.CAP_FFMPEG)

            if not cap.isOpened():
                logger.error("Failed to open RTSP stream")
                return False

            # Set capture properties
            cap.set(cv2.CAP_PROP_FRAME_WIDTH, FRAME_WIDTH)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT, FRAME_HEIGHT)
            cap.set(cv2.CAP_PROP_FPS, config.expected_fps)
            cap.set(cv2.CAP_PROP_BUFFERSIZE, config.buffer_size)

            # Read initial frames to stabilize
            for _ in range(3):
                ret, _ = cap.read()
                if not ret:
                    logger.warning("Failed to read initial frames")
                    time.sleep(0.1)

            logger.info("Successfully initialized capture")
            return True

        except Exception as e:
            logger.error(f"Error initializing capture: {e}")
            return False

    try:
        # Main processing loop
        while not stop_event.is_set():
            try:
                # Check for commands (non-blocking)
                try:
                    command = command_queue.get_nowait()
                    if command == "reinit":
                        logger.info("Received reinit command")
                        if cap:
                            cap.release()
                        cap = None
                        consecutive_errors = 0
                except queue.Empty:
                    pass

                # Initialize capture if needed
                if cap is None or not cap.isOpened():
                    if not initialize_capture():
                        # Event.wait instead of sleep: returns early on stop.
                        stop_event.wait(config.reconnect_delay)
                        consecutive_errors += 1
                        if consecutive_errors > config.max_retries and config.max_retries > 0:
                            logger.error("Max retries reached, exiting")
                            break
                        continue
                    else:
                        consecutive_errors = 0

                # Read frame with timing control
                current_time = time.time()
                if current_time - last_frame_time < frame_interval:
                    time.sleep(0.01)  # Small sleep to prevent busy waiting
                    continue

                ret, frame = cap.read()

                if not ret or frame is None:
                    consecutive_errors += 1

                    if consecutive_errors >= config.max_retries:
                        logger.error(f"Too many consecutive errors ({consecutive_errors}), reinitializing")
                        if cap:
                            cap.release()
                        cap = None
                        consecutive_errors = 0
                        stop_event.wait(config.reconnect_delay)
                    else:
                        if consecutive_errors <= 5:
                            logger.debug(f"Frame read failed (error {consecutive_errors})")
                        elif consecutive_errors % 10 == 0:
                            logger.warning(f"Continuing frame failures (error {consecutive_errors})")

                        # Exponential backoff, capped at one second
                        sleep_time = min(0.1 * (1.5 ** min(consecutive_errors, 10)), 1.0)
                        stop_event.wait(sleep_time)
                    continue

                # Frame read successful
                consecutive_errors = 0
                last_frame_time = current_time

                # Write to shared buffer
                if frame_buffer.write_frame(frame, current_time):
                    # Send status update periodically
                    if frame_buffer.frame_number.value % 30 == 0:  # Every 30 frames
                        status_queue.put({
                            'camera_id': config.camera_id,
                            'status': 'running',
                            'frames': frame_buffer.frame_number.value,
                            'timestamp': current_time
                        })

            except KeyboardInterrupt:
                logger.info("Received interrupt signal")
                break
            except Exception as e:
                logger.error(f"Error in camera worker: {e}")
                consecutive_errors += 1
                stop_event.wait(1.0)
    finally:
        # Release the capture however the loop ended.
        if cap:
            cap.release()

    logger.info("Camera worker stopped")
    status_queue.put({
        'camera_id': config.camera_id,
        'status': 'stopped',
        'frames': frame_buffer.frame_number.value
    })
|
||||
|
||||
|
||||
class RTSPProcessManager:
    """
    Manages multiple camera processes with health monitoring and auto-restart.

    Each camera gets its own worker process, shared frame buffer, command
    queue and stop event; all workers report to one shared status queue.

    NOTE(review): ``add_camera`` / ``remove_camera`` are invoked both by
    callers and by the monitor thread, and the bookkeeping dicts are not
    protected by a lock. Confirm callers do not add or remove cameras
    concurrently with the monitor.
    """

    def __init__(self, max_processes: Optional[int] = None):
        """
        Args:
            max_processes: Upper bound on simultaneously managed cameras.
                When omitted (or 0), defaults to ``cpu_count() - 2`` but never
                less than 1, so hosts with one or two cores can still run a
                camera.
        """
        self.max_processes = max_processes or max(1, mp.cpu_count() - 2)
        self.processes: Dict[str, Process] = {}
        self.frame_buffers: Dict[str, SharedFrameBuffer] = {}
        self.command_queues: Dict[str, Queue] = {}
        self.status_queue = mp.Queue()
        self.stop_events: Dict[str, mp.Event] = {}
        self.configs: Dict[str, ProcessConfig] = {}

        # Manager for shared objects
        self.manager = Manager()
        self.process_stats = self.manager.dict()

        # Health monitoring thread
        self.monitor_thread = None
        self.monitor_stop = threading.Event()

        logger.info(f"RTSPProcessManager initialized with max_processes={self.max_processes}")

    def add_camera(self, config: ProcessConfig) -> bool:
        """
        Create the shared resources for a camera and start its worker.

        Returns:
            True when the process was started; False when the camera already
            exists, the process limit is reached, or startup raised.
        """
        if config.camera_id in self.processes:
            logger.warning(f"Camera {config.camera_id} already exists")
            return False

        if len(self.processes) >= self.max_processes:
            logger.error(f"Max processes ({self.max_processes}) reached")
            return False

        try:
            # Create shared resources
            frame_buffer = SharedFrameBuffer(config.camera_id)
            command_queue = mp.Queue()
            stop_event = mp.Event()

            # Store resources
            self.frame_buffers[config.camera_id] = frame_buffer
            self.command_queues[config.camera_id] = command_queue
            self.stop_events[config.camera_id] = stop_event
            self.configs[config.camera_id] = config

            # Start process
            process = mp.Process(
                target=camera_worker_process,
                args=(config, frame_buffer, command_queue, self.status_queue, stop_event),
                name=f"Camera-{config.camera_id}"
            )
            process.start()
            self.processes[config.camera_id] = process

            logger.info(f"Started process for camera {config.camera_id} (PID: {process.pid})")
            return True

        except Exception as e:
            logger.error(f"Error adding camera {config.camera_id}: {e}")
            # Drop any partially registered resources.
            self._cleanup_camera(config.camera_id)
            return False

    def remove_camera(self, camera_id: str) -> bool:
        """
        Stop a camera's worker and forget its resources.

        Signals the stop event, waits up to 5 s for a clean exit, then
        terminates the process and waits up to 2 s more.

        Returns:
            False when the camera is unknown, True otherwise.
        """
        if camera_id not in self.processes:
            return False

        logger.info(f"Removing camera {camera_id}")

        # Signal stop
        if camera_id in self.stop_events:
            self.stop_events[camera_id].set()

        # Wait for process to stop
        process = self.processes.get(camera_id)
        if process and process.is_alive():
            process.join(timeout=5.0)
            if process.is_alive():
                logger.warning(f"Force terminating process for {camera_id}")
                process.terminate()
                process.join(timeout=2.0)

        # Cleanup
        self._cleanup_camera(camera_id)
        return True

    def _cleanup_camera(self, camera_id: str):
        """Remove every bookkeeping entry for ``camera_id`` (missing keys are ignored)."""
        for collection in [self.processes, self.frame_buffers,
                           self.command_queues, self.stop_events, self.configs]:
            collection.pop(camera_id, None)

    def get_frame(self, camera_id: str) -> Optional[Tuple[np.ndarray, float]]:
        """Return ``(frame, timestamp)`` for the camera's latest frame, or None."""
        buffer = self.frame_buffers.get(camera_id)
        if buffer:
            return buffer.read_frame()
        return None

    def get_stats(self) -> Dict[str, Any]:
        """Return buffer statistics and process liveness for every camera."""
        stats = {}
        # Iterate over a snapshot: the monitor thread may add or remove
        # cameras while this runs, which would otherwise raise RuntimeError.
        for camera_id, buffer in list(self.frame_buffers.items()):
            process = self.processes.get(camera_id)
            stats[camera_id] = {
                'buffer_stats': buffer.get_stats(),
                'process_alive': process.is_alive() if process else False,
                'process_pid': process.pid if process else None
            }
        return stats

    def start_monitoring(self):
        """Start the health monitoring thread unless it is already running."""
        if self.monitor_thread and self.monitor_thread.is_alive():
            return

        self.monitor_stop.clear()
        self.monitor_thread = threading.Thread(target=self._monitor_processes)
        self.monitor_thread.start()
        logger.info("Started process monitoring")

    def _monitor_processes(self):
        """
        Drain worker status messages and restart dead workers.

        Runs until ``monitor_stop`` is set. All waits block on that event so
        a shutdown request is honoured immediately rather than after a full
        sleep interval.
        """
        while not self.monitor_stop.is_set():
            try:
                # Check status queue
                try:
                    while True:
                        status = self.status_queue.get_nowait()
                        self.process_stats[status['camera_id']] = status
                except queue.Empty:
                    pass

                # Check process health
                for camera_id in list(self.processes.keys()):
                    if self.monitor_stop.is_set():
                        break
                    process = self.processes.get(camera_id)
                    if process and not process.is_alive():
                        logger.warning(f"Process for {camera_id} died, restarting")
                        config = self.configs.get(camera_id)
                        if config:
                            self.remove_camera(camera_id)
                            # Do not bring a camera back once shutdown has
                            # been requested.
                            if self.monitor_stop.wait(1.0):
                                break
                            self.add_camera(config)

                self.monitor_stop.wait(5.0)  # Check every 5 seconds

            except Exception as e:
                logger.error(f"Error in monitor thread: {e}")
                self.monitor_stop.wait(5.0)

    def stop_all(self):
        """Stop the monitor thread, then stop and remove every camera."""
        logger.info("Stopping all camera processes")

        # Stop monitoring
        if self.monitor_thread:
            self.monitor_stop.set()
            self.monitor_thread.join(timeout=5.0)

        # Stop all cameras
        for camera_id in list(self.processes.keys()):
            self.remove_camera(camera_id)

        logger.info("All processes stopped")
|
Loading…
Add table
Add a link
Reference in a new issue