python-rtsp-worker/services/ultralytics_model_controller.py

"""
Ultralytics Model Controller - YOLO inference with batched processing.
"""
import logging
from typing import Any, Callable, Dict, List, Optional
import torch
from .base_model_controller import BaseModelController, BatchFrame
logger = logging.getLogger(__name__)

class UltralyticsModelController(BaseModelController):
    """
    Model controller for Ultralytics YOLO inference.

    Uses UltralyticsEngine, which wraps the Ultralytics YOLO model with the
    native TensorRT backend for GPU-accelerated inference.
    """
    def __init__(
        self,
        inference_engine,
        model_id: str,
        batch_size: int = 16,
        max_queue_size: int = 100,
        preprocess_fn: Optional[Callable] = None,
        postprocess_fn: Optional[Callable] = None,
    ):
        # Auto-detect actual batch size from the YOLO engine
        print("[UltralyticsModelController] Detecting batch size from engine...")
        engine_batch_size = self._detect_engine_batch_size(inference_engine)
        print(
            f"[UltralyticsModelController] Detected engine_batch_size={engine_batch_size}"
        )

        # If engine has fixed batch size, use it. Otherwise use user's batch_size
        actual_batch_size = engine_batch_size if engine_batch_size > 0 else batch_size
        print(
            f"[UltralyticsModelController] Using actual_batch_size={actual_batch_size}"
        )

        super().__init__(
            model_id=model_id,
            batch_size=actual_batch_size,
            max_queue_size=max_queue_size,
            preprocess_fn=preprocess_fn,
            postprocess_fn=postprocess_fn,
        )
        self.inference_engine = inference_engine
        self.engine_batch_size = engine_batch_size  # Store for padding logic

        if engine_batch_size > 0:
            print(f"✓ Ultralytics engine has FIXED batch_size={engine_batch_size}")
            print(
                f"  Will pad/truncate all batches to exactly {engine_batch_size} frames"
            )
            logger.info(
                f"Ultralytics engine has fixed batch_size={engine_batch_size}, "
                f"will pad batches to match"
            )
            # CRITICAL: Override the parent's batch_size to match engine's fixed size.
            # This prevents buffer accumulation beyond the engine's capacity.
            self.batch_size = engine_batch_size
            print(f"  Controller self.batch_size is now: {self.batch_size}")
            print(f"  Buffer will swap when size >= {self.batch_size}")
        else:
            print(
                f"✓ Ultralytics engine supports DYNAMIC batching, max={actual_batch_size}"
            )
            logger.info(
                f"Ultralytics engine supports dynamic batching, "
                f"using max batch_size={actual_batch_size}"
            )

    def _detect_engine_batch_size(self, inference_engine) -> int:
        """
        Detect the batch size from the Ultralytics engine.

        Returns:
            Fixed batch size (e.g., 2, 4, 8), or -1 for dynamic batching.
        """
        try:
            # Get engine metadata
            metadata = inference_engine.get_metadata()
            logger.info(f"Detecting batch size from engine metadata: {metadata}")

            # Check input shape for batch dimension
            if "images" in metadata.input_shapes:
                input_shape = metadata.input_shapes["images"]
                batch_dim = input_shape[0]
                logger.info(f"Found batch dimension in metadata: {batch_dim}")
                if batch_dim > 0:
                    # Fixed batch size
                    logger.info(f"Using fixed batch size from engine: {batch_dim}")
                    return batch_dim
                else:
                    # Dynamic batch size (-1)
                    logger.info("Engine supports dynamic batching (batch_dim=-1)")
                    return -1

            # Fallback: try to get from model directly
            if (
                hasattr(inference_engine, "_model")
                and inference_engine._model is not None
            ):
                model = inference_engine._model

                # Try to get batch info from Ultralytics model
                if hasattr(model, "predictor") and model.predictor is not None:
                    predictor = model.predictor
                    if hasattr(predictor, "model") and hasattr(
                        predictor.model, "batch"
                    ):
                        return predictor.model.batch

                # Try to get from model.model (for .engine files)
                if hasattr(model, "model"):
                    # For TensorRT engines, check input shape
                    if hasattr(model.model, "get_input_details"):
                        details = model.model.get_input_details()
                        if details and len(details) > 0:
                            shape = details[0].get("shape")
                            if shape and len(shape) > 0:
                                return shape[0] if shape[0] > 0 else -1
        except Exception as e:
            logger.warning(f"Could not detect engine batch size: {e}")

        # Default: assume dynamic batching
        return -1

    def _run_batch_inference(self, batch: List[BatchFrame]) -> List[Dict[str, Any]]:
        """
        Run Ultralytics YOLO inference on a batch of frames.

        Ultralytics handles batching natively and returns Results objects.
        """
        # Preprocess frames
        preprocessed = []
        for batch_frame in batch:
            if self.preprocess_fn:
                processed = self.preprocess_fn(batch_frame.frame)
                # Ensure shape is (C, H, W) not (1, C, H, W)
                if processed.dim() == 4 and processed.shape[0] == 1:
                    processed = processed.squeeze(0)
            else:
                processed = batch_frame.frame
            preprocessed.append(processed)

        # Stack into batch tensor: (B, C, H, W)
        batch_tensor = torch.stack(preprocessed, dim=0)
        actual_batch_size = len(batch)

        # Handle fixed batch size engines (pad if needed)
        if self.engine_batch_size > 0:
            # Engine has fixed batch size
            if batch_tensor.shape[0] > self.engine_batch_size:
                # Truncate to engine's max batch size
                logger.warning(
                    f"Batch size {batch_tensor.shape[0]} exceeds engine max {self.engine_batch_size}, truncating"
                )
                batch_tensor = batch_tensor[: self.engine_batch_size]
                batch = batch[: self.engine_batch_size]
                actual_batch_size = self.engine_batch_size
            elif batch_tensor.shape[0] < self.engine_batch_size:
                # Pad to match engine's fixed batch size
                padding_size = self.engine_batch_size - batch_tensor.shape[0]
                # Replicate last frame to pad (cheaper than zeros)
                padding = batch_tensor[-1:].repeat(padding_size, 1, 1, 1)
                batch_tensor = torch.cat([batch_tensor, padding], dim=0)
                logger.debug(
                    f"Padded batch from {actual_batch_size} to {self.engine_batch_size} frames"
                )
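                # Worked example (illustrative numbers): with engine_batch_size=8
                # and 3 incoming frames, the last frame is replicated 5 more times
                # so the tensor becomes (8, C, H, W); the padded outputs are dropped
                # later because results are only collected for actual_batch_size frames.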
        else:
            # Dynamic batching - just limit to max
            if batch_tensor.shape[0] > self.batch_size:
                logger.warning(
                    f"Batch size {batch_tensor.shape[0]} exceeds configured max {self.batch_size}"
                )
                batch_tensor = batch_tensor[: self.batch_size]
                batch = batch[: self.batch_size]
                actual_batch_size = self.batch_size

        # Run Ultralytics inference
        # Input should be (B, 3, H, W) in range [0, 1], RGB format
        outputs = self.inference_engine.infer(
            inputs={"images": batch_tensor},
            conf=0.25,  # Confidence threshold
            iou=0.45,  # NMS IoU threshold
        )

        # Ultralytics returns Results objects in outputs["results"]
        yolo_results = outputs["results"]

        # Convert Results objects to our standard format.
        # Only process actual batch size (ignore padded results if any).
        results = []
        for i in range(actual_batch_size):
            batch_frame = batch[i]
            yolo_result = yolo_results[i]

            # Extract detections from YOLO Results object.
            # yolo_result.boxes.data has format: [x1, y1, x2, y2, conf, cls]
            if hasattr(yolo_result, "boxes") and yolo_result.boxes is not None:
                detections = yolo_result.boxes.data  # Already a tensor on GPU
            else:
                # No detections
                detections = torch.zeros((0, 6), device=batch_tensor.device)

            # NOTE: Skip postprocess_fn for Ultralytics backend!
            # Ultralytics already does confidence filtering, NMS, and format conversion.
            # The detections are already in final format: [x1, y1, x2, y2, conf, cls]
            # Any custom postprocess_fn would expect raw TensorRT output and will fail.
            result = {
                "stream_id": batch_frame.stream_id,
                "timestamp": batch_frame.timestamp,
                "detections": detections,
                "frame": batch_frame.frame,  # Include original frame tensor
                "metadata": batch_frame.metadata,
                "yolo_result": yolo_result,  # Keep original Results object for debugging
            }
            results.append(result)

        return results
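

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the production code path).
# FakeEngine is a hypothetical stand-in exposing just the two calls this
# controller relies on: get_metadata() with an input_shapes mapping, and
# infer() returning {"results": [...]}. It assumes BaseModelController's
# constructor needs no setup beyond the keyword arguments forwarded above;
# in a real deployment the project's UltralyticsEngine is passed instead.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    class FakeEngine:
        """Reports a fixed batch of 4 frames of 3x640x640 input."""

        def get_metadata(self):
            return SimpleNamespace(input_shapes={"images": [4, 3, 640, 640]})

        def infer(self, inputs, conf=0.25, iou=0.45):
            # One empty Results-like object (no boxes) per frame in the batch.
            batch = inputs["images"].shape[0]
            return {"results": [SimpleNamespace(boxes=None) for _ in range(batch)]}

    controller = UltralyticsModelController(
        inference_engine=FakeEngine(),
        model_id="yolo-demo",
        batch_size=16,  # ignored here: the fake engine reports a fixed batch of 4
    )
    print("Resolved controller.batch_size:", controller.batch_size)  # expected: 4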