""" YOLOv8 Model Utilities This module provides static utility functions for YOLOv8 model preprocessing and postprocessing, compatible with TensorRT inference. Features: - Preprocessing: Resize and normalize frames for YOLOv8 inference - Postprocessing: Parse YOLOv8 output format to detection boxes - Format conversion: (cx, cy, w, h) to (x1, y1, x2, y2) - Confidence filtering and NMS handled separately Usage: from services.yolo import YOLOv8Utils # Preprocess frame model_input = YOLOv8Utils.preprocess(frame_gpu, input_size=640) # Run inference outputs = model_repo.infer(model_id="yolov8", inputs={"images": model_input}) # Postprocess detections detections = YOLOv8Utils.postprocess(outputs, conf_threshold=0.25, nms_threshold=0.45) """ import torch from typing import Tuple, Optional class YOLOv8Utils: """Static utility class for YOLOv8 model operations.""" @staticmethod def preprocess( frame: torch.Tensor, input_size: int = 640 ) -> torch.Tensor: """ Preprocess frame for YOLOv8 inference. Args: frame: RGB frame as GPU tensor, shape (3, H, W) uint8 input_size: Model input size (default: 640 for YOLOv8) Returns: Preprocessed frame ready for model, shape (1, 3, input_size, input_size) float32 Example: >>> frame_gpu = decoder.get_latest_frame(rgb=True) # (3, 720, 1280) >>> model_input = YOLOv8Utils.preprocess(frame_gpu) # (1, 3, 640, 640) """ # Add batch dimension and convert to float frame_batch = frame.unsqueeze(0).float() # (1, 3, H, W) # Resize to model input size frame_resized = torch.nn.functional.interpolate( frame_batch, size=(input_size, input_size), mode='bilinear', align_corners=False ) # Normalize to [0, 1] (YOLOv8 expects normalized input) frame_normalized = frame_resized / 255.0 # NOTE: Don't track these tensors - they're short-lived inputs to TensorRT # that get automatically freed by PyTorch after inference completes. # Tracking them would show false "leaks" since we can't track when TensorRT consumes them. return frame_normalized @staticmethod def postprocess( outputs: dict, conf_threshold: float = 0.25, nms_threshold: float = 0.45 ) -> torch.Tensor: """ Postprocess YOLOv8 TensorRT output to detection format. YOLOv8 output format: (1, 84, 8400) - 84 channels = 4 bbox coords (cx, cy, w, h) + 80 class scores - 8400 anchor points Args: outputs: Dictionary of model outputs from TensorRT inference conf_threshold: Confidence threshold for filtering detections (default: 0.25) nms_threshold: IoU threshold for Non-Maximum Suppression (default: 0.45) Returns: Tensor of shape (N, 6): [x1, y1, x2, y2, conf, class_id] - Coordinates are in model input space (0-640 for default YOLOv8) - N is the number of detections after NMS Example: >>> outputs = model_repo.infer(model_id="yolov8", inputs={"images": frame}) >>> detections = YOLOv8Utils.postprocess(outputs, conf_threshold=0.5) >>> # detections: [[x1, y1, x2, y2, conf, class_id], ...] 
""" from torchvision.ops import nms # Get output tensor (first and only output) output_name = list(outputs.keys())[0] output = outputs[output_name] # (1, 84, 8400) # Transpose to (1, 8400, 84) for easier processing output = output.transpose(1, 2).squeeze(0) # (8400, 84) # Split bbox coordinates and class scores (vectorized) bboxes = output[:, :4] # (8400, 4) - (cx, cy, w, h) class_scores = output[:, 4:] # (8400, 80) # Get max class score and corresponding class ID for all anchors (vectorized) max_scores, class_ids = torch.max(class_scores, dim=1) # (8400,), (8400,) # Filter by confidence threshold (vectorized) mask = max_scores > conf_threshold filtered_bboxes = bboxes[mask] # (N, 4) filtered_scores = max_scores[mask] # (N,) filtered_class_ids = class_ids[mask] # (N,) # Return empty tensor if no detections if filtered_bboxes.shape[0] == 0: return torch.zeros((0, 6), device=output.device) # Convert from (cx, cy, w, h) to (x1, y1, x2, y2) (vectorized) cx, cy, w, h = filtered_bboxes[:, 0], filtered_bboxes[:, 1], filtered_bboxes[:, 2], filtered_bboxes[:, 3] x1 = cx - w / 2 y1 = cy - h / 2 x2 = cx + w / 2 y2 = cy + h / 2 # Stack into detections tensor: [x1, y1, x2, y2, conf, class_id] detections_tensor = torch.stack([ x1, y1, x2, y2, filtered_scores, filtered_class_ids.float() ], dim=1) # (N, 6) # Apply Non-Maximum Suppression (NMS) boxes = detections_tensor[:, :4] # (N, 4) scores = detections_tensor[:, 4] # (N,) # NMS returns indices of boxes to keep keep_indices = nms(boxes, scores, iou_threshold=nms_threshold) # Return filtered detections return detections_tensor[keep_indices] @staticmethod def scale_boxes( boxes: torch.Tensor, from_size: Tuple[int, int], to_size: Tuple[int, int] ) -> torch.Tensor: """ Scale bounding boxes from one coordinate space to another. 

    @staticmethod
    def scale_boxes(
        boxes: torch.Tensor,
        from_size: Tuple[int, int],
        to_size: Tuple[int, int]
    ) -> torch.Tensor:
        """
        Scale bounding boxes from one coordinate space to another.

        Args:
            boxes: Tensor of boxes, shape (N, 4) in format [x1, y1, x2, y2]
            from_size: Source size (width, height) - e.g., (640, 640) for model output
            to_size: Target size (width, height) - e.g., (1280, 720) for display

        Returns:
            Scaled boxes tensor, same shape as input

        Example:
            >>> detections = YOLOv8Utils.postprocess(outputs)  # boxes in 640x640 space
            >>> boxes = detections[:, :4]  # Extract boxes
            >>> scaled_boxes = YOLOv8Utils.scale_boxes(boxes, (640, 640), (1280, 720))
        """
        scale_x = to_size[0] / from_size[0]
        scale_y = to_size[1] / from_size[1]

        # Clone to avoid modifying the original
        scaled = boxes.clone()
        scaled[:, [0, 2]] *= scale_x  # Scale x coordinates
        scaled[:, [1, 3]] *= scale_y  # Scale y coordinates

        return scaled


# COCO class names for YOLOv8 (80 classes)
COCO_CLASSES = {
    0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane',
    5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light',
    10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench',
    14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow',
    20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack',
    25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee',
    30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite',
    34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard',
    38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork',
    43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple',
    48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog',
    53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch',
    58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet', 62: 'tv',
    63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard', 67: 'cell phone',
    68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink', 72: 'refrigerator',
    73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors', 77: 'teddy bear',
    78: 'hair drier', 79: 'toothbrush'
}
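
# End-to-end usage sketch tying the utilities together. `decoder` and
# `model_repo` are hypothetical objects from the surrounding application (as
# in the module docstring); this only illustrates the intended call order.
#
#     frame_gpu = decoder.get_latest_frame(rgb=True)      # (3, 720, 1280) uint8
#     model_input = YOLOv8Utils.preprocess(frame_gpu)     # (1, 3, 640, 640)
#     outputs = model_repo.infer(model_id="yolov8", inputs={"images": model_input})
#     detections = YOLOv8Utils.postprocess(outputs, conf_threshold=0.25)
#     detections[:, :4] = YOLOv8Utils.scale_boxes(
#         detections[:, :4], (640, 640), (1280, 720)      # model -> display space
#     )
#     for x1, y1, x2, y2, conf, cls in detections.tolist():
#         print(f"{COCO_CLASSES[int(cls)]}: {conf:.2f} "
#               f"({x1:.0f}, {y1:.0f}, {x2:.0f}, {y2:.0f})")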