python-rtsp-worker/services/ultralytics_model_controller.py

"""
Ultralytics Model Controller - YOLO inference with batched processing.
"""
import logging
from typing import Any, Callable, Dict, List, Optional
import torch
from .base_model_controller import BaseModelController, BatchFrame
logger = logging.getLogger(__name__)

class UltralyticsModelController(BaseModelController):
    """
    Model controller for Ultralytics YOLO inference.

    Uses UltralyticsEngine, which wraps the Ultralytics YOLO model with the
    native TensorRT backend for GPU-accelerated inference.
    """
    def __init__(
        self,
        inference_engine,
        model_id: str,
        batch_size: int = 16,
        max_queue_size: int = 100,
        preprocess_fn: Optional[Callable] = None,
        postprocess_fn: Optional[Callable] = None,
    ):
        # Auto-detect actual batch size from the YOLO engine
        print("[UltralyticsModelController] Detecting batch size from engine...")
        engine_batch_size = self._detect_engine_batch_size(inference_engine)
        print(
            f"[UltralyticsModelController] Detected engine_batch_size={engine_batch_size}"
        )

        # If engine has fixed batch size, use it. Otherwise use user's batch_size
        actual_batch_size = engine_batch_size if engine_batch_size > 0 else batch_size
        print(
            f"[UltralyticsModelController] Using actual_batch_size={actual_batch_size}"
        )

        super().__init__(
            model_id=model_id,
            batch_size=actual_batch_size,
            max_queue_size=max_queue_size,
            preprocess_fn=preprocess_fn,
            postprocess_fn=postprocess_fn,
        )
        self.inference_engine = inference_engine
        self.engine_batch_size = engine_batch_size  # Store for padding logic

        if engine_batch_size > 0:
            print(f"✓ Ultralytics engine has FIXED batch_size={engine_batch_size}")
            print(
                f"  Will pad/truncate all batches to exactly {engine_batch_size} frames"
            )
            logger.info(
                f"Ultralytics engine has fixed batch_size={engine_batch_size}, "
                f"will pad batches to match"
            )
            # CRITICAL: Override the parent's batch_size to match engine's fixed size.
            # This prevents buffer accumulation beyond the engine's capacity.
            self.batch_size = engine_batch_size
            print(f"  Controller self.batch_size is now: {self.batch_size}")
            print(f"  Buffer will swap when size >= {self.batch_size}")
        else:
            print(
                f"✓ Ultralytics engine supports DYNAMIC batching, max={actual_batch_size}"
            )
            logger.info(
                f"Ultralytics engine supports dynamic batching, "
                f"using max batch_size={actual_batch_size}"
            )

    def _detect_engine_batch_size(self, inference_engine) -> int:
        """
        Detect the batch size from the Ultralytics engine.

        Returns:
            Fixed batch size (e.g., 2, 4, 8), or -1 for dynamic batching.
        """
        try:
            # Get engine metadata
            metadata = inference_engine.get_metadata()
            logger.info(f"Detecting batch size from engine metadata: {metadata}")

            # Check input shape for batch dimension
            if "images" in metadata.input_shapes:
                input_shape = metadata.input_shapes["images"]
                batch_dim = input_shape[0]
                logger.info(f"Found batch dimension in metadata: {batch_dim}")
                if batch_dim > 0:
                    # Fixed batch size
                    logger.info(f"Using fixed batch size from engine: {batch_dim}")
                    return batch_dim
                else:
                    # Dynamic batch size (-1)
                    logger.info("Engine supports dynamic batching (batch_dim=-1)")
                    return -1

            # Fallback: try to get from model directly
            if (
                hasattr(inference_engine, "_model")
                and inference_engine._model is not None
            ):
                model = inference_engine._model

                # Try to get batch info from Ultralytics model
                if hasattr(model, "predictor") and model.predictor is not None:
                    predictor = model.predictor
                    if hasattr(predictor, "model") and hasattr(
                        predictor.model, "batch"
                    ):
                        return predictor.model.batch

                # Try to get from model.model (for .engine files)
                if hasattr(model, "model"):
                    # For TensorRT engines, check input shape
                    if hasattr(model.model, "get_input_details"):
                        details = model.model.get_input_details()
                        if details and len(details) > 0:
                            shape = details[0].get("shape")
                            if shape and len(shape) > 0:
                                return shape[0] if shape[0] > 0 else -1
        except Exception as e:
            logger.warning(f"Could not detect engine batch size: {e}")

        # Default: assume dynamic batching
        return -1

    def _run_batch_inference(self, batch: List[BatchFrame]) -> List[Dict[str, Any]]:
        """
        Run Ultralytics YOLO inference on a batch of frames.

        Ultralytics handles batching natively and returns Results objects.
        """
        # Preprocess frames
        preprocessed = []
        for batch_frame in batch:
            if self.preprocess_fn:
                processed = self.preprocess_fn(batch_frame.frame)
                # Ensure shape is (C, H, W) not (1, C, H, W)
                if processed.dim() == 4 and processed.shape[0] == 1:
                    processed = processed.squeeze(0)
            else:
                processed = batch_frame.frame
            preprocessed.append(processed)

        # Stack into batch tensor: (B, C, H, W)
        batch_tensor = torch.stack(preprocessed, dim=0)
        actual_batch_size = len(batch)

        # Handle fixed batch size engines (pad if needed)
        if self.engine_batch_size > 0:
            # Engine has fixed batch size
            if batch_tensor.shape[0] > self.engine_batch_size:
                # Truncate to engine's max batch size
                logger.warning(
                    f"Batch size {batch_tensor.shape[0]} exceeds engine max {self.engine_batch_size}, truncating"
                )
                batch_tensor = batch_tensor[: self.engine_batch_size]
                batch = batch[: self.engine_batch_size]
                actual_batch_size = self.engine_batch_size
            elif batch_tensor.shape[0] < self.engine_batch_size:
                # Pad to match engine's fixed batch size
                padding_size = self.engine_batch_size - batch_tensor.shape[0]
                # Replicate last frame to pad (cheaper than zeros)
                padding = batch_tensor[-1:].repeat(padding_size, 1, 1, 1)
                batch_tensor = torch.cat([batch_tensor, padding], dim=0)
                logger.debug(
                    f"Padded batch from {actual_batch_size} to {self.engine_batch_size} frames"
                )
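                # Worked example (illustrative numbers): with engine_batch_size=8
                # and 3 incoming frames, the last frame is replicated 5 more times
                # so the tensor becomes (8, C, H, W); the padded outputs are dropped
                # later because results are only collected for actual_batch_size frames.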
        else:
            # Dynamic batching - just limit to max
            if batch_tensor.shape[0] > self.batch_size:
                logger.warning(
                    f"Batch size {batch_tensor.shape[0]} exceeds configured max {self.batch_size}"
                )
                batch_tensor = batch_tensor[: self.batch_size]
                batch = batch[: self.batch_size]
                actual_batch_size = self.batch_size

        # Run Ultralytics inference
        # Input should be (B, 3, H, W) in range [0, 1], RGB format
        outputs = self.inference_engine.infer(
            inputs={"images": batch_tensor},
            conf=0.25,  # Confidence threshold
            iou=0.45,  # NMS IoU threshold
        )

        # Ultralytics returns Results objects in outputs["results"]
        yolo_results = outputs["results"]

        # Convert Results objects to our standard format.
        # Only process actual batch size (ignore padded results if any).
        results = []
        for i in range(actual_batch_size):
            batch_frame = batch[i]
            yolo_result = yolo_results[i]

            # Extract detections from YOLO Results object.
            # yolo_result.boxes.data has format: [x1, y1, x2, y2, conf, cls]
            if hasattr(yolo_result, "boxes") and yolo_result.boxes is not None:
                detections = yolo_result.boxes.data  # Already a tensor on GPU
            else:
                # No detections
                detections = torch.zeros((0, 6), device=batch_tensor.device)

            # NOTE: Skip postprocess_fn for Ultralytics backend!
            # Ultralytics already does confidence filtering, NMS, and format conversion.
            # The detections are already in final format: [x1, y1, x2, y2, conf, cls]
            # Any custom postprocess_fn would expect raw TensorRT output and will fail.
            result = {
                "stream_id": batch_frame.stream_id,
                "timestamp": batch_frame.timestamp,
                "detections": detections,
                "frame": batch_frame.frame,  # Include original frame tensor
                "metadata": batch_frame.metadata,
                "yolo_result": yolo_result,  # Keep original Results object for debugging
            }
            results.append(result)

        return results
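

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the production code path).
# FakeEngine is a hypothetical stand-in exposing just the two calls this
# controller relies on: get_metadata() with an input_shapes mapping, and
# infer() returning {"results": [...]}. It assumes BaseModelController's
# constructor needs no setup beyond the keyword arguments forwarded above;
# in a real deployment the project's UltralyticsEngine is passed instead.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    class FakeEngine:
        """Reports a fixed batch of 4 frames of 3x640x640 input."""

        def get_metadata(self):
            return SimpleNamespace(input_shapes={"images": [4, 3, 640, 640]})

        def infer(self, inputs, conf=0.25, iou=0.45):
            # One empty Results-like object (no boxes) per frame in the batch.
            batch = inputs["images"].shape[0]
            return {"results": [SimpleNamespace(boxes=None) for _ in range(batch)]}

    controller = UltralyticsModelController(
        inference_engine=FakeEngine(),
        model_id="yolo-demo",
        batch_size=16,  # ignored here: the fake engine reports a fixed batch of 4
    )
    print("Resolved controller.batch_size:", controller.batch_size)  # expected: 4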