"""
|
|
Inference Engine Abstraction Layer
|
|
|
|
Provides a unified interface for different inference backends:
|
|
- Native TensorRT: Direct TensorRT API with zero-copy GPU tensors
|
|
- Ultralytics: YOLO models with built-in pre/postprocessing
|
|
- Future: ONNX Runtime, OpenVINO, etc.
|
|
|
|
All engines support zero-copy GPU tensor inference where possible.
|
|
"""
|
|
|
|
import logging
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
import torch
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|


class BackendType(Enum):
    """Supported inference backend types"""

    TENSORRT = "tensorrt"
    ULTRALYTICS = "ultralytics"

    @classmethod
    def from_string(cls, backend: str) -> "BackendType":
        """Convert string to BackendType"""
        backend = backend.lower()
        for member in cls:
            if member.value == backend:
                return member
        raise ValueError(
            f"Unknown backend: {backend}. Available: {[m.value for m in cls]}"
        )


@dataclass
class EngineMetadata:
    """Metadata for an inference engine"""

    engine_type: str  # "tensorrt", "ultralytics", etc.
    model_path: str
    input_shapes: Dict[str, Tuple[int, ...]]
    output_shapes: Dict[str, Tuple[int, ...]]
    input_names: List[str]
    output_names: List[str]
    input_dtypes: Dict[str, torch.dtype]
    output_dtypes: Dict[str, torch.dtype]
    supports_batching: bool = True
    supports_dynamic_shapes: bool = False
    extra_info: Optional[Dict[str, Any]] = None  # Backend-specific info
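
# Illustrative only: roughly what a populated EngineMetadata could look like for a
# single-input detection engine. The tensor names, shapes, and dtypes below are
# assumptions for the sake of the example, not values read from a real model:
#
#   EngineMetadata(
#       engine_type="tensorrt",
#       model_path="model.engine",
#       input_shapes={"images": (1, 3, 640, 640)},
#       output_shapes={"output0": (1, 84, 8400)},
#       input_names=["images"],
#       output_names=["output0"],
#       input_dtypes={"images": torch.float16},
#       output_dtypes={"output0": torch.float16},
#       supports_batching=True,
#       supports_dynamic_shapes=False,
#   )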


class IInferenceEngine(ABC):
    """
    Abstract interface for inference engines.

    All implementations must support zero-copy GPU tensor inference:
    - Inputs: CUDA tensors on GPU
    - Outputs: CUDA tensors on GPU
    - No CPU transfers during inference
    """

    @abstractmethod
    def initialize(
        self, model_path: str, device: torch.device, **kwargs
    ) -> EngineMetadata:
        """
        Initialize the inference engine.

        Automatically detects model type and handles conversion if needed.

        Args:
            model_path: Path to model file (.pt, .engine, .trt)
            device: GPU device to use
            **kwargs: Optional parameters (batch_size, half, workspace, etc.)

        Returns:
            EngineMetadata with model information
        """
        pass

    @abstractmethod
    def infer(
        self, inputs: Dict[str, torch.Tensor], **kwargs
    ) -> Dict[str, torch.Tensor]:
        """
        Run inference on GPU tensors (zero-copy).

        Args:
            inputs: Dict of input_name -> CUDA tensor
            **kwargs: Backend-specific inference parameters

        Returns:
            Dict of output_name -> CUDA tensor

        Raises:
            ValueError: If inputs are not CUDA tensors or wrong shape
        """
        pass

    @abstractmethod
    def get_metadata(self) -> EngineMetadata:
        """Get engine metadata"""
        pass

    @abstractmethod
    def cleanup(self):
        """Cleanup resources"""
        pass

    @property
    @abstractmethod
    def is_initialized(self) -> bool:
        """Check if engine is initialized"""
        pass

    @property
    @abstractmethod
    def device(self) -> torch.device:
        """Get device the engine is running on"""
        pass
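
# A minimal sketch of the calling convention every IInferenceEngine implementation is
# expected to follow (zero-copy: CUDA tensors in, CUDA tensors out). The model path
# below is a placeholder:
#
#   engine: IInferenceEngine = NativeTensorRTEngine()
#   meta = engine.initialize("model.engine", torch.device("cuda:0"))
#   name = meta.input_names[0]
#   x = torch.rand(meta.input_shapes[name], device="cuda:0")
#   outputs = engine.infer({name: x})
#   engine.cleanup()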


class NativeTensorRTEngine(IInferenceEngine):
    """
    Native TensorRT inference engine with direct API access.

    Features:
    - Zero-copy GPU tensor inference
    - Execution context pooling for concurrent inference
    - Support for .trt, .engine files
    - Automatic Ultralytics .engine metadata stripping
    """

    def __init__(self):
        self._engine = None
        self._contexts = []
        self._metadata = None
        self._device = None
        self._trt_logger = None

    def initialize(
        self, model_path: str, device: torch.device, num_contexts: int = 1, **kwargs
    ) -> EngineMetadata:
        """
        Initialize TensorRT engine.

        Args:
            model_path: Path to .trt or .engine file
            device: GPU device
            num_contexts: Number of execution contexts for pooling

        Returns:
            EngineMetadata
        """
        import tensorrt as trt

        self._device = device
        self._trt_logger = trt.Logger(trt.Logger.WARNING)

        # Load engine
        runtime = trt.Runtime(self._trt_logger)

        # Read engine file (handle Ultralytics format)
        engine_data = self._load_engine_data(model_path)

        self._engine = runtime.deserialize_cuda_engine(engine_data)
        if self._engine is None:
            raise RuntimeError(f"Failed to load TensorRT engine from {model_path}")

        # Create execution contexts
        for i in range(num_contexts):
            ctx = self._engine.create_execution_context()
            if ctx is None:
                raise RuntimeError(f"Failed to create execution context {i}")
            self._contexts.append(ctx)

        # Extract metadata
        self._metadata = self._extract_metadata(model_path)

        return self._metadata

    def _load_engine_data(self, file_path: str) -> bytes:
        """Load engine data, stripping Ultralytics metadata if present"""
        import json

        with open(file_path, "rb") as f:
            # Try to read Ultralytics metadata header
            meta_len_bytes = f.read(4)
            if len(meta_len_bytes) == 4:
                meta_len = int.from_bytes(meta_len_bytes, byteorder="little")

                # Sanity check
                if 0 < meta_len < 100000:
                    try:
                        metadata_bytes = f.read(meta_len)
                        json.loads(metadata_bytes.decode("utf-8"))
                        # Valid Ultralytics metadata, rest is engine
                        return f.read()
                    except (UnicodeDecodeError, json.JSONDecodeError):
                        pass

            # Not Ultralytics format, read entire file
            f.seek(0)
            return f.read()
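
    # The file layout assumed above for Ultralytics-exported .engine files is:
    #
    #   [4-byte little-endian metadata length][UTF-8 JSON metadata][serialized TensorRT engine]
    #
    # Plain TensorRT engines carry no such header, which is why a failed length or JSON
    # check falls back to reading the whole file from offset 0.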

    def _extract_metadata(self, model_path: str) -> EngineMetadata:
        """Extract metadata from TensorRT engine"""
        import tensorrt as trt

        input_shapes = {}
        output_shapes = {}
        input_names = []
        output_names = []
        input_dtypes = {}
        output_dtypes = {}

        trt_to_torch_dtype = {
            trt.DataType.FLOAT: torch.float32,
            trt.DataType.HALF: torch.float16,
            trt.DataType.INT8: torch.int8,
            trt.DataType.INT32: torch.int32,
            trt.DataType.BOOL: torch.bool,
        }

        for i in range(self._engine.num_io_tensors):
            name = self._engine.get_tensor_name(i)
            shape = tuple(self._engine.get_tensor_shape(name))
            dtype = trt_to_torch_dtype.get(
                self._engine.get_tensor_dtype(name), torch.float32
            )
            mode = self._engine.get_tensor_mode(name)

            if mode == trt.TensorIOMode.INPUT:
                input_names.append(name)
                input_shapes[name] = shape
                input_dtypes[name] = dtype
            else:
                output_names.append(name)
                output_shapes[name] = shape
                output_dtypes[name] = dtype

        return EngineMetadata(
            engine_type="tensorrt",
            model_path=model_path,
            input_shapes=input_shapes,
            output_shapes=output_shapes,
            input_names=input_names,
            output_names=output_names,
            input_dtypes=input_dtypes,
            output_dtypes=output_dtypes,
            supports_batching=True,
            supports_dynamic_shapes=False,
        )

    def infer(
        self,
        inputs: Dict[str, torch.Tensor],
        context_id: int = 0,
        stream: Optional[torch.cuda.Stream] = None,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """
        Run TensorRT inference with zero-copy GPU tensors.

        Args:
            inputs: Dict of input_name -> CUDA tensor
            context_id: Which execution context to use
            stream: CUDA stream for async execution

        Returns:
            Dict of output_name -> CUDA tensor
        """
        if not self.is_initialized:
            raise RuntimeError("Engine not initialized")

        # Validate inputs
        for name in self._metadata.input_names:
            if name not in inputs:
                raise ValueError(f"Missing required input: {name}")
            if not inputs[name].is_cuda:
                raise ValueError(f"Input '{name}' must be a CUDA tensor")

        # Get execution context
        if context_id >= len(self._contexts):
            raise ValueError(
                f"Invalid context_id {context_id}, only {len(self._contexts)} contexts available"
            )

        context = self._contexts[context_id]

        # Prepare outputs
        outputs = {}

        # Set input tensor addresses. Keep references to the contiguous tensors so any
        # copies made by .contiguous() stay alive until execution has finished.
        contiguous_inputs = []
        for name in self._metadata.input_names:
            input_tensor = inputs[name].contiguous()
            contiguous_inputs.append(input_tensor)
            context.set_tensor_address(name, input_tensor.data_ptr())

        # Allocate and set output tensors
        for name in self._metadata.output_names:
            output_tensor = torch.empty(
                self._metadata.output_shapes[name],
                dtype=self._metadata.output_dtypes[name],
                device=self._device,
            )
            outputs[name] = output_tensor
            context.set_tensor_address(name, output_tensor.data_ptr())

        # Execute
        if stream is None:
            stream = torch.cuda.Stream(device=self._device)

        with torch.cuda.stream(stream):
            success = context.execute_async_v3(stream_handle=stream.cuda_stream)
            if not success:
                raise RuntimeError("TensorRT inference failed")

        stream.synchronize()

        return outputs

    def get_metadata(self) -> EngineMetadata:
        """Get engine metadata"""
        if self._metadata is None:
            raise RuntimeError("Engine not initialized")
        return self._metadata

    def cleanup(self):
        """Cleanup TensorRT resources"""
        # Drop context references before releasing the engine itself
        self._contexts.clear()

        if self._engine is not None:
            del self._engine
            self._engine = None

        self._metadata = None

    @property
    def is_initialized(self) -> bool:
        return self._engine is not None

    @property
    def device(self) -> torch.device:
        return self._device
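
# A hedged sketch of how the context pool is meant to be used concurrently: each worker
# owns one context_id and one CUDA stream so inferences can overlap. The engine path,
# worker count, and tensor name below are placeholders:
#
#   engine = NativeTensorRTEngine()
#   meta = engine.initialize("model.engine", torch.device("cuda:0"), num_contexts=2)
#   name = meta.input_names[0]
#   streams = [torch.cuda.Stream(device="cuda:0") for _ in range(2)]
#   for worker_id in range(2):
#       x = torch.zeros(meta.input_shapes[name], dtype=meta.input_dtypes[name], device="cuda:0")
#       outputs = engine.infer({name: x}, context_id=worker_id, stream=streams[worker_id])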


class UltralyticsEngine(IInferenceEngine):
    """
    Ultralytics YOLO inference engine.

    Features:
    - Zero-copy GPU tensor inference
    - Built-in preprocessing/postprocessing for YOLO models
    - Supports .pt and .engine formats
    - Automatic model export to TensorRT with caching
    """

    def __init__(self):
        self._model = None
        self._metadata = None
        self._device = None
        self._model_path = None
        self._exporter = None

    def initialize(
        self,
        model_path: str,
        device: torch.device,
        batch: int = 1,
        half: bool = False,
        imgsz: int = 640,
        cache_dir: str = ".ultralytics_cache",
        **kwargs,
    ) -> EngineMetadata:
        """
        Initialize Ultralytics YOLO model.

        Automatically exports .pt models to .engine format with caching.

        Args:
            model_path: Path to .pt or .engine file
            device: GPU device
            batch: Maximum batch size for inference
            half: Use FP16 precision
            imgsz: Input image size
            cache_dir: Directory for caching exported engines
            **kwargs: Additional export parameters

        Returns:
            EngineMetadata
        """
        from ultralytics import YOLO

        from .ultralytics_exporter import UltralyticsExporter

        self._device = device
        self._model_path = model_path

        # Check if we need to export
        model_file = Path(model_path)
        final_model_path = model_path

        if model_file.suffix == ".pt":
            # Use exporter with caching
            logger.info("Checking for cached TensorRT engine...")
            self._exporter = UltralyticsExporter(cache_dir=cache_dir)

            _, engine_path = self._exporter.export(
                model_path=str(model_path),
                device=device.index if device.type == "cuda" else 0,
                half=half,
                imgsz=imgsz,
                batch=batch,
                **kwargs,
            )

            final_model_path = engine_path
            logger.info(f"Using TensorRT engine: {engine_path}")

            # Point _model_path at the .engine file so metadata extraction sees it
            self._model_path = engine_path

        # Load model (Ultralytics handles .engine files natively)
        self._model = YOLO(final_model_path)

        logger.info(f"Loaded Ultralytics model: {type(self._model)}")
        if hasattr(self._model, "predictor"):
            logger.info(
                f"Model has predictor: {type(self._model.predictor) if self._model.predictor else None}"
            )

        # Move to device if needed (only for .pt models; .engine files are already device-specific)
        if hasattr(self._model, "model") and self._model.model is not None:
            # Check that it is actually a torch module (not a string path for .engine files)
            if hasattr(self._model.model, "to"):
                self._model.model = self._model.model.to(device)

        # Extract metadata
        self._metadata = self._extract_metadata()

        return self._metadata

    def _read_batch_size_from_engine_file(self, engine_path: str) -> int:
        """
        Read the batch size from the metadata JSON file saved next to the engine.

        Much simpler than parsing the TensorRT engine itself.
        """
        try:
            import json

            # The metadata file is named: <engine_path_without_extension>_metadata.json
            engine_file = Path(engine_path)
            metadata_file = engine_file.with_name(f"{engine_file.stem}_metadata.json")

            logger.debug(f"Looking for metadata file: {metadata_file}")

            if metadata_file.exists():
                with open(metadata_file, "r") as f:
                    metadata = json.load(f)
                batch_size = metadata.get("batch", -1)
                logger.debug(
                    f"Found metadata: batch={batch_size}, imgsz={metadata.get('imgsz')}"
                )
                return batch_size
            else:
                logger.debug(f"Metadata file not found: {metadata_file}")
        except Exception as e:
            logger.warning(f"Could not read batch size from metadata file: {e}")

        return -1  # Default to dynamic

    def _extract_metadata(self) -> EngineMetadata:
        """Extract metadata from Ultralytics model"""
        # Ultralytics models typically expect (B, 3, H, W) input
        # and return Results objects, not raw tensors

        # Default values
        batch_size = -1  # Dynamic batching by default
        imgsz = 640
        input_shape = (batch_size, 3, imgsz, imgsz)

        # For .engine files, read the batch size from the metadata file saved next to the engine
        logger.debug(f"_model_path={self._model_path}")
        if self._model_path.endswith(".engine"):
            batch_size = self._read_batch_size_from_engine_file(self._model_path)
            logger.debug(f"Read batch_size={batch_size} from .engine metadata")
            if batch_size > 0:
                input_shape = (batch_size, 3, imgsz, imgsz)
        else:
            logger.debug("Not an .engine file, skipping direct read")

        if hasattr(self._model, "model") and self._model.model is not None:
            # Try to get the actual input shape from the model
            try:
                # For .engine files, check the predictor model
                if (
                    hasattr(self._model, "predictor")
                    and self._model.predictor is not None
                ):
                    predictor = self._model.predictor

                    # Get image size
                    if hasattr(predictor, "args") and hasattr(predictor.args, "imgsz"):
                        imgsz_val = predictor.args.imgsz
                        if isinstance(imgsz_val, (list, tuple)):
                            h, w = (
                                imgsz_val[0],
                                imgsz_val[1] if len(imgsz_val) > 1 else imgsz_val[0],
                            )
                        else:
                            h = w = imgsz_val
                        imgsz = h  # Use height as reference

                    # Get batch size from model
                    if hasattr(predictor, "model"):
                        pred_model = predictor.model

                        # For TensorRT engines, check input bindings
                        if hasattr(pred_model, "bindings"):
                            # This is a TensorRT AutoBackend
                            try:
                                # Get first input binding shape
                                if hasattr(pred_model, "input_shape"):
                                    shape = pred_model.input_shape
                                    if shape and len(shape) >= 4:
                                        batch_size = shape[0] if shape[0] > 0 else -1
                            except Exception:
                                pass

                        # Try batch attribute
                        if batch_size == -1 and hasattr(pred_model, "batch"):
                            batch_size = (
                                pred_model.batch if pred_model.batch > 0 else -1
                            )

                # Fallback: check model args
                if hasattr(self._model.model, "args"):
                    imgsz_val = getattr(self._model.model.args, "imgsz", 640)
                    if isinstance(imgsz_val, (list, tuple)):
                        h, w = (
                            imgsz_val[0],
                            imgsz_val[1] if len(imgsz_val) > 1 else imgsz_val[0],
                        )
                    else:
                        h = w = imgsz_val
                    imgsz = h

                input_shape = (batch_size, 3, imgsz, imgsz)
            except Exception as e:
                logger.warning(f"Could not extract full metadata: {e}")

        logger.info(
            f"Extracted Ultralytics metadata: batch_size={batch_size}, imgsz={imgsz}, input_shape={input_shape}"
        )

        return EngineMetadata(
            engine_type="ultralytics",
            model_path=self._model_path,
            input_shapes={"images": input_shape},
            output_shapes={"results": (-1,)},  # Dynamic, depends on detections
            input_names=["images"],
            output_names=["results"],
            input_dtypes={"images": torch.float32},
            output_dtypes={"results": torch.float32},
            supports_batching=True,
            supports_dynamic_shapes=(batch_size == -1),
            extra_info={
                "is_yolo": True,
                "has_builtin_postprocess": True,
                "batch_size": batch_size,
                "imgsz": imgsz,
            },
        )

    def infer(
        self,
        inputs: Dict[str, torch.Tensor],
        return_raw: bool = False,
        conf: float = 0.25,
        iou: float = 0.45,
        **kwargs,
    ) -> Dict[str, torch.Tensor]:
        """
        Run Ultralytics inference with zero-copy GPU tensors.

        Args:
            inputs: Dict with "images" key -> CUDA tensor (B, 3, H, W), normalized [0, 1]
            return_raw: Reserved for returning raw tensor output instead of Results
                objects (currently unused; raw boxes are always included under
                "raw_predictions")
            conf: Confidence threshold
            iou: IoU threshold for NMS

        Returns:
            Dict with inference results

        Note:
            Input tensor should be normalized to the [0, 1] range.
            Format: (B, 3, H, W) in RGB color space.
        """
        if not self.is_initialized:
            raise RuntimeError("Engine not initialized")

        # Get input tensor
        if "images" not in inputs:
            raise ValueError("Input must contain 'images' key")

        images = inputs["images"]

        if not images.is_cuda:
            raise ValueError("Input must be a CUDA tensor")

        # Ensure tensor is on correct device
        if images.device != self._device:
            images = images.to(self._device)

        # Run inference
        results = self._model(images, conf=conf, iou=iou, verbose=False, **kwargs)

        # Ultralytics returns Results objects, not raw tensors;
        # wrap them in a dict for interface compatibility
        return {
            "results": results,
            "raw_predictions": results[0].boxes.data
            if len(results) > 0 and hasattr(results[0], "boxes")
            else None,
        }

    def get_metadata(self) -> EngineMetadata:
        """Get engine metadata"""
        if self._metadata is None:
            raise RuntimeError("Engine not initialized")
        return self._metadata

    def cleanup(self):
        """Cleanup Ultralytics model"""
        if self._model is not None:
            del self._model
            self._model = None
        self._metadata = None

    @property
    def is_initialized(self) -> bool:
        return self._model is not None

    @property
    def device(self) -> torch.device:
        return self._device
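
# A sketch of the expected call pattern for UltralyticsEngine: a batched RGB tensor in
# [0, 1] on the GPU. The .pt path below is a placeholder for whatever model you load:
#
#   engine = UltralyticsEngine()
#   meta = engine.initialize("yolov8n.pt", torch.device("cuda:0"), batch=1, half=True)
#   imgsz = meta.extra_info["imgsz"]
#   frame = torch.rand(1, 3, imgsz, imgsz, device="cuda:0")  # normalized RGB, (B, 3, H, W)
#   out = engine.infer({"images": frame}, conf=0.25, iou=0.45)
#   results = out["results"]            # Ultralytics Results objects
#   boxes = out["raw_predictions"]      # raw boxes tensor from the first result, or None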


def create_engine(backend: str | BackendType, **kwargs) -> IInferenceEngine:
    """
    Factory function to create inference engine.

    Args:
        backend: Backend type (BackendType enum or string: "tensorrt", "ultralytics")
        **kwargs: Engine-specific arguments

    Returns:
        IInferenceEngine instance

    Example:
        >>> from services import create_engine, BackendType
        >>> engine = create_engine(BackendType.TENSORRT)
        >>> engine = create_engine("ultralytics")
    """
    # Convert string to BackendType if needed
    if isinstance(backend, str):
        backend = BackendType.from_string(backend)

    engines = {
        BackendType.TENSORRT: NativeTensorRTEngine,
        BackendType.ULTRALYTICS: UltralyticsEngine,
    }

    if backend not in engines:
        raise ValueError(
            f"Unknown backend: {backend}. Available: {[b.value for b in BackendType]}"
        )

    return engines[backend]()
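

if __name__ == "__main__":
    # Minimal smoke test, illustrative only: assumes a CUDA device is available and that
    # a model file exists at the hypothetical path below.
    logging.basicConfig(level=logging.INFO)

    demo_engine = create_engine("ultralytics")
    demo_metadata = demo_engine.initialize("yolov8n.pt", torch.device("cuda:0"))
    print(demo_metadata)
    demo_engine.cleanup()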
|