fix: gpu memory leaks

Author: Siwat Sirichai, 2025-11-10 22:10:46 +07:00
parent 3a47920186
commit 593611cdb7
13 changed files with 420 additions and 166 deletions

View file

@ -117,9 +117,10 @@ class ModelController:
"""
try:
metadata = self.model_repository.get_metadata(self.model_id)
# Get first input tensor shape
first_input = list(metadata.inputs.values())[0]
batch_dim = first_input["shape"][0]
# Get first input tensor shape (ModelMetadata has input_shapes, not inputs)
first_input_name = metadata.input_names[0]
input_shape = metadata.input_shapes[first_input_name]
batch_dim = input_shape[0]
# batch_dim can be -1 (dynamic), 1 (fixed), or N (fixed batch size)
if batch_dim == -1:
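Not part of the commit, but as a reading aid: a minimal sketch of how the corrected metadata access is meant to be used, assuming ModelMetadata exposes input_names (a list) and input_shapes (a dict) as the new code indicates. effective_batch and max_batch_size are hypothetical names used only for illustration.

# Illustrative sketch, not committed code.
metadata = self.model_repository.get_metadata(self.model_id)
first_input_name = metadata.input_names[0]              # e.g. "images"
input_shape = metadata.input_shapes[first_input_name]   # e.g. (-1, 3, 640, 640)
batch_dim = input_shape[0]
if batch_dim == -1:
    effective_batch = self.max_batch_size   # hypothetical: dynamic engines batch up to the profile max
else:
    effective_batch = batch_dim             # fixed engines accept exactly batch_dim frames per call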

View file

@ -1,5 +1,6 @@
import threading
import hashlib
import json
from typing import Optional, Dict, Any, List, Tuple
from pathlib import Path
from queue import Queue
@ -161,7 +162,7 @@ class TensorRTModelRepository:
# Result: 1 engine in VRAM, N contexts (e.g., 4), not 100 contexts!
"""
def __init__(self, gpu_id: int = 0, default_num_contexts: int = 4, enable_pt_conversion: bool = True):
def __init__(self, gpu_id: int = 0, default_num_contexts: int = 4, enable_pt_conversion: bool = True, cache_dir: str = ".trt_cache"):
"""
Initialize the model repository.
@ -169,11 +170,14 @@ class TensorRTModelRepository:
gpu_id: GPU device ID to use
default_num_contexts: Default number of execution contexts per unique engine
enable_pt_conversion: Enable automatic PyTorch to TensorRT conversion
cache_dir: Directory for caching stripped TensorRT engines and metadata
"""
self.gpu_id = gpu_id
self.device = torch.device(f'cuda:{gpu_id}')
self.default_num_contexts = default_num_contexts
self.enable_pt_conversion = enable_pt_conversion
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
# Model ID to engine mapping: model_id -> file_hash
self._model_to_hash: Dict[str, str] = {}
@ -192,6 +196,7 @@ class TensorRTModelRepository:
print(f"TensorRT Model Repository initialized on GPU {gpu_id}")
print(f"Default context pool size: {default_num_contexts} contexts per unique model")
print(f"Cache directory: {self.cache_dir}")
if enable_pt_conversion:
print(f"PyTorch to TensorRT conversion: enabled")
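Not part of the commit: a hypothetical construction showing where the new cache_dir parameter points. The path is illustrative; the cache file naming follows the load logic below.

# Illustrative usage of the extended constructor signature.
repo = TensorRTModelRepository(
    gpu_id=0,
    default_num_contexts=4,
    cache_dir="/var/cache/trt",   # hypothetical path; stripped engines are cached as <file_hash>.trt,
)                                 # with Ultralytics metadata saved alongside as <file_hash>_metadata.json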
@ -226,6 +231,14 @@ class TensorRTModelRepository:
"""
Load TensorRT engine from file.
Supports both raw TensorRT engines and Ultralytics .engine files
(which have embedded JSON metadata at the beginning).
For Ultralytics engines:
- Strips metadata and caches pure TensorRT engine in cache_dir
- Saves metadata as separate JSON file
- Reuses cached stripped engine on subsequent loads
Args:
file_path: Path to .trt or .engine file
@ -234,8 +247,68 @@ class TensorRTModelRepository:
"""
runtime = trt.Runtime(self.trt_logger)
with open(file_path, 'rb') as f:
engine_data = f.read()
# Compute hash of original file for cache lookup
file_hash = self.compute_file_hash(file_path)
cache_engine_path = self.cache_dir / f"{file_hash}.trt"
cache_metadata_path = self.cache_dir / f"{file_hash}_metadata.json"
# Check if stripped engine already cached
if cache_engine_path.exists():
logger.info(f"Loading cached stripped engine from {cache_engine_path}")
with open(cache_engine_path, 'rb') as f:
engine_data = f.read()
else:
# Read and process original file
with open(file_path, 'rb') as f:
# Try to read Ultralytics metadata header (first 4 bytes = metadata length)
try:
meta_len_bytes = f.read(4)
if len(meta_len_bytes) == 4:
meta_len = int.from_bytes(meta_len_bytes, byteorder="little")
# Sanity check: metadata length should be reasonable (< 100KB)
if 0 < meta_len < 100000:
try:
metadata_bytes = f.read(meta_len)
metadata = json.loads(metadata_bytes.decode("utf-8"))
# This is an Ultralytics engine - read remaining pure TRT data
engine_data = f.read()
# Save stripped engine to cache
logger.info(f"Detected Ultralytics engine format")
logger.info(f"Ultralytics metadata: {metadata}")
logger.info(f"Caching stripped engine to {cache_engine_path}")
with open(cache_engine_path, 'wb') as cache_f:
cache_f.write(engine_data)
# Save metadata separately
with open(cache_metadata_path, 'w') as meta_f:
json.dump(metadata, meta_f, indent=2)
except (UnicodeDecodeError, json.JSONDecodeError):
# Not Ultralytics format, rewind and read entire file
f.seek(0)
engine_data = f.read()
else:
# Invalid metadata length, rewind and read entire file
f.seek(0)
engine_data = f.read()
else:
# File too small, just use what we read
engine_data = meta_len_bytes
except Exception as e:
# Any error, rewind and read entire file
logger.warning(f"Error reading engine metadata: {e}, treating as raw TRT engine")
f.seek(0)
engine_data = f.read()
# Cache the engine data (even if it was already raw TRT)
if not cache_engine_path.exists():
with open(cache_engine_path, 'wb') as cache_f:
cache_f.write(engine_data)
engine = runtime.deserialize_cuda_engine(engine_data)
if engine is None:
@ -494,6 +567,11 @@ class TensorRTModelRepository:
device=self.device
)
# NOTE: Don't track these tensors - they're returned to caller and consumed
# by postprocessing, then automatically freed by PyTorch's garbage collector.
# Tracking them would show false "leaks" since we can't track when the caller
# finishes using them and PyTorch deallocates them.
outputs[name] = output_tensor
exec_ctx.context.set_tensor_address(name, output_tensor.data_ptr())
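Not part of the commit, but useful context for the stripping logic above: an Ultralytics .engine file starts with a 4-byte little-endian metadata length, followed by that many bytes of UTF-8 JSON, followed by the raw serialized TensorRT engine. A standalone sketch of that parse (function name and return convention are illustrative only):

import json

def split_ultralytics_engine(path):
    """Return (metadata_or_None, raw_engine_bytes) for a .engine/.trt file."""
    with open(path, "rb") as f:
        header = f.read(4)
        if len(header) == 4:
            meta_len = int.from_bytes(header, byteorder="little")
            if 0 < meta_len < 100_000:          # same sanity bound as the repository code
                try:
                    metadata = json.loads(f.read(meta_len).decode("utf-8"))
                    return metadata, f.read()   # remainder of the file is the pure TensorRT engine
                except (UnicodeDecodeError, json.JSONDecodeError):
                    pass                        # not an Ultralytics header; fall through to raw read
        f.seek(0)
        return None, f.read()                   # raw TensorRT engine with no embedded metadata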

View file

@ -125,12 +125,19 @@ class PTConverter:
mapping = self.mapping_db[pt_hash]
trt_hash = mapping["trt_hash"]
# Check both .engine and .trt extensions (Ultralytics uses .engine, generic uses .trt)
engine_key = f"trt/{trt_hash}.engine"
trt_key = f"trt/{trt_hash}.trt"
# Verify TRT file still exists in storage
if not self.storage.exists(trt_key):
# Try .engine first (Ultralytics native format)
if self.storage.exists(engine_key):
cached_key = engine_key
elif self.storage.exists(trt_key):
cached_key = trt_key
else:
logger.warning(
f"Mapping exists for PT hash {pt_hash[:16]}... but TRT file missing. "
f"Mapping exists for PT hash {pt_hash[:16]}... but engine file missing. "
f"Will reconvert."
)
# Remove stale mapping
@ -139,16 +146,16 @@ class PTConverter:
return None
# Get local path
trt_path = self.storage.get_local_path(trt_key)
if trt_path is None:
logger.error(f"Could not get local path for TRT file {trt_key}")
cached_path = self.storage.get_local_path(cached_key)
if cached_path is None:
logger.error(f"Could not get local path for engine file {cached_key}")
return None
logger.info(
f"Found cached conversion for PT hash {pt_hash[:16]}... -> "
f"TRT hash {trt_hash[:16]}..."
f"Engine hash {trt_hash[:16]}... ({cached_key})"
)
return (trt_hash, trt_path)
return (trt_hash, cached_path)
def convert(
self,
@ -241,24 +248,21 @@ class PTConverter:
precision: torch.dtype,
) -> Tuple[str, str]:
"""
Convert ultralytics YOLO model using ONNX TensorRT pipeline.
Uses the same approach as scripts/convert_pt_to_tensorrt.py
Convert ultralytics YOLO model using native .engine export.
This produces .engine files with embedded metadata (no manual input_shapes needed).
Args:
pt_path: Path to PT file
pt_hash: PT file hash
input_shapes: Input tensor shapes
input_shapes: Input tensor shapes (IGNORED for Ultralytics - auto-detected)
precision: Target precision
Returns:
Tuple of (trt_hash, trt_file_path)
Tuple of (engine_hash, engine_file_path)
"""
import tensorrt as trt
import tempfile
import os
import shutil
logger.info("Detected ultralytics YOLO model, using ONNX → TensorRT pipeline...")
logger.info("Detected ultralytics YOLO model, using native .engine export...")
# Load ultralytics model
try:
@ -267,83 +271,48 @@ class PTConverter:
except ImportError:
raise ImportError("ultralytics package not found. Install with: pip install ultralytics")
# Determine input shape
if not input_shapes:
raise ValueError("input_shapes required for ultralytics conversion")
# Export to native .engine format with embedded metadata
logger.info(f"Exporting to native TensorRT .engine (precision: {'FP16' if precision == torch.float16 else 'FP32'})...")
input_key = 'images' if 'images' in input_shapes else list(input_shapes.keys())[0]
input_shape = input_shapes[input_key]
# Ultralytics export creates .engine file in same directory as .pt
engine_path = model.export(
format='engine',
half=(precision == torch.float16),
device=self.gpu_id,
batch=1,
simplify=True
)
# Export to ONNX first
logger.info(f"Exporting to ONNX (input shape: {input_shape})...")
with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmp_onnx:
onnx_path = tmp_onnx.name
# Convert to string (Ultralytics returns Path object)
engine_path = str(engine_path)
logger.info(f"Native .engine export complete: {engine_path}")
logger.info("Metadata embedded in .engine file (stride, imgsz, names, etc.)")
try:
# Use ultralytics export to ONNX
model.export(format='onnx', imgsz=input_shape[2], batch=input_shape[0])
# Ultralytics saves as model_name.onnx in same directory
pt_dir = os.path.dirname(pt_path)
pt_name = os.path.splitext(os.path.basename(pt_path))[0]
onnx_export_path = os.path.join(pt_dir, f"{pt_name}.onnx")
# Read the exported .engine file
with open(engine_path, 'rb') as f:
engine_data = f.read()
# Move to our temp location (use shutil.move for cross-device support)
if os.path.exists(onnx_export_path):
shutil.move(onnx_export_path, onnx_path)
else:
raise RuntimeError(f"ONNX export failed, file not found: {onnx_export_path}")
# Compute hash of the .engine file
engine_hash = hashlib.sha256(engine_data).hexdigest()
logger.info(f"ONNX export complete: {onnx_path}")
# Store in our cache (as .engine to preserve metadata)
engine_key = f"trt/{engine_hash}.engine"
self.storage.write(engine_key, engine_data)
# Build TensorRT engine from ONNX
logger.info("Building TensorRT engine from ONNX...")
trt_logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(trt_logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, trt_logger)
cached_path = self.storage.get_local_path(engine_key)
if cached_path is None:
raise RuntimeError("Failed to get local path for .engine file")
# Parse ONNX
with open(onnx_path, 'rb') as f:
if not parser.parse(f.read()):
errors = [parser.get_error(i) for i in range(parser.num_errors)]
raise RuntimeError(f"Failed to parse ONNX: {errors}")
# Clean up the original export (we've cached it)
# Only delete if it's different from cached path
if os.path.exists(engine_path) and os.path.abspath(engine_path) != os.path.abspath(cached_path):
logger.info(f"Removing original export (cached): {engine_path}")
os.unlink(engine_path)
else:
logger.info(f"Keeping original export at: {engine_path}")
# Configure builder
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30) # 4GB
# Set precision
if precision == torch.float16:
if builder.platform_has_fast_fp16:
config.set_flag(trt.BuilderFlag.FP16)
logger.info("FP16 mode enabled")
# Build engine
logger.info("Building TensorRT engine (this may take a few minutes)...")
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
raise RuntimeError("Failed to build TensorRT engine")
# Convert IHostMemory to bytes
engine_bytes = bytes(serialized_engine)
# Save to storage
trt_hash = hashlib.sha256(engine_bytes).hexdigest()
trt_key = f"trt/{trt_hash}.trt"
self.storage.write(trt_key, engine_bytes)
trt_path = self.storage.get_local_path(trt_key)
if trt_path is None:
raise RuntimeError("Failed to get local path for TRT file")
logger.info(f"TensorRT engine built successfully: {trt_path}")
return (trt_hash, trt_path)
finally:
# Cleanup ONNX file
if os.path.exists(onnx_path):
os.unlink(onnx_path)
logger.info(f"Cached .engine file: {cached_path}")
return (engine_hash, cached_path)
def _perform_conversion(
self,
@ -387,11 +356,21 @@ class PTConverter:
# Check if this is an ultralytics model
if self._is_ultralytics_model(model):
logger.info("Detected ultralytics model, using ultralytics export API")
logger.info("Detected Ultralytics YOLO model, using native .engine export")
logger.info("Note: input_shapes parameter is ignored for Ultralytics models (auto-detected)")
return self._convert_ultralytics_model(pt_path, pt_hash, input_shapes, precision)
# For non-ultralytics models, use torch_tensorrt
logger.info("Using torch_tensorrt for conversion")
logger.info("Using torch_tensorrt for conversion (non-Ultralytics model)")
# Non-Ultralytics models REQUIRE input_shapes
if input_shapes is None:
raise ValueError(
"input_shapes required for non-Ultralytics PyTorch models. "
"For Ultralytics YOLO models, input_shapes is auto-detected. "
"Example: input_shapes={'images': (1, 3, 640, 640)}"
)
model.eval()
# Convert model to target precision to avoid mixed precision issues
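Not part of the commit: the cache lookup order introduced in this file (native .engine first, then generic .trt) condensed into a small helper. storage.exists and storage.get_local_path follow the interface used above; the helper name is illustrative.

def find_cached_engine(storage, trt_hash):
    """Prefer the Ultralytics .engine artifact, then fall back to a raw .trt engine."""
    for key in (f"trt/{trt_hash}.engine", f"trt/{trt_hash}.trt"):
        if storage.exists(key):
            return storage.get_local_path(key)   # may still return None if retrieval fails
    return None                                  # stale mapping: caller removes it and reconverts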

View file

@ -127,15 +127,17 @@ class StreamConnection:
self.status = ConnectionStatus.DISCONNECTED
logger.info(f"Stream {self.stream_id} stopped")
def _on_frame_decoded(self, frame: torch.Tensor):
def _on_frame_decoded(self, frame_ref):
"""
Event handler called by decoder when a new frame is decoded.
This is the event-driven replacement for polling.
Args:
frame: RGB frame tensor on GPU (3, H, W)
frame_ref: FrameReference object containing the RGB frame tensor
"""
if not self.running:
# If not running, free the frame immediately
frame_ref.free()
return
try:
@ -143,12 +145,14 @@ class StreamConnection:
self.frame_count += 1
# Submit to model controller for batched inference
# Pass the FrameReference in metadata so we can free it later
self.model_controller.submit_frame(
stream_id=self.stream_id,
frame=frame,
frame=frame_ref.rgb_tensor,
metadata={
"frame_number": self.frame_count,
"shape": tuple(frame.shape),
"shape": tuple(frame_ref.rgb_tensor.shape),
"frame_ref": frame_ref, # Store reference for later cleanup
}
)
@ -164,6 +168,8 @@ class StreamConnection:
logger.error(f"Error processing frame for {self.stream_id}: {e}", exc_info=True)
self.error_queue.put(e)
self.status = ConnectionStatus.ERROR
# Free the frame on error
frame_ref.free()
def _handle_inference_result(self, result: Dict[str, Any]):
"""
@ -173,12 +179,17 @@ class StreamConnection:
Args:
result: Inference result dictionary
"""
frame_ref = None
try:
# Extract detections
detections = result["detections"]
# Run tracking (synchronous)
tracked_objects = self._run_tracking_sync(detections)
# Get FrameReference from metadata (if present)
frame_ref = result["metadata"].get("frame_ref")
# Run tracking (synchronous) with frame shape for bbox scaling
frame_shape = result["metadata"].get("shape")
tracked_objects = self._run_tracking_sync(detections, frame_shape)
# Create tracking result
tracking_result = TrackingResult(
@ -196,13 +207,18 @@ class StreamConnection:
except Exception as e:
logger.error(f"Error handling inference result for {self.stream_id}: {e}", exc_info=True)
self.error_queue.put(e)
finally:
# Free the frame reference - this is the last point in the pipeline
if frame_ref is not None:
frame_ref.free()
def _run_tracking_sync(self, detections, min_confidence=0.7):
def _run_tracking_sync(self, detections, frame_shape=None, min_confidence=0.7):
"""
Run tracking synchronously (called from executor).
Args:
detections: Detection tensor (N, 6) [x1, y1, x2, y2, conf, class_id]
frame_shape: Original frame shape (C, H, W) for scaling bboxes
min_confidence: Minimum confidence threshold for detections
Returns:
@ -226,8 +242,8 @@ class StreamConnection:
class_name=f"class_{int(det[5])}" if det.shape[0] > 5 else "unknown"
))
# Update tracker with detections (lightweight, no model dependency!)
return self.tracking_controller.update(detection_list)
# Update tracker with detections (will scale bboxes to frame space)
return self.tracking_controller.update(detection_list, frame_shape=frame_shape)
def tracking_results(self):
"""
@ -339,15 +355,31 @@ class StreamConnectionManager:
"""
Initialize the manager with a model.
Supports transparent loading of .pt (YOLO), .engine, and .trt files.
For Ultralytics YOLO models (.pt), metadata is auto-detected - no manual
input_shapes or precision needed! Non-YOLO models still require input_shapes.
Args:
model_path: Path to TensorRT or PyTorch model file (.trt, .pt, .pth)
model_path: Path to model file (.trt, .engine, .pt, .pth)
- .engine: Ultralytics native format (recommended)
- .pt: Auto-converts to .engine (YOLO models only)
- .trt: Raw TensorRT engine
model_id: Model identifier (default: "detector")
preprocess_fn: Preprocessing function (e.g., YOLOv8Utils.preprocess)
postprocess_fn: Postprocessing function (e.g., YOLOv8Utils.postprocess)
num_contexts: Number of TensorRT execution contexts (default: 4)
pt_input_shapes: Required for PT files - dict of input shapes
pt_precision: Precision for PT conversion (torch.float16 or torch.float32)
pt_input_shapes: [Optional] Only required for non-YOLO PyTorch models
YOLO models auto-detect from embedded metadata
pt_precision: [Optional] Precision for PT conversion (auto-detected for YOLO)
**pt_conversion_kwargs: Additional PT conversion arguments
Example:
# YOLO model - no manual parameters needed:
manager.initialize(
model_path="model.pt", # or .engine
preprocess_fn=YOLOv8Utils.preprocess,
postprocess_fn=YOLOv8Utils.postprocess
)
"""
logger.info(f"Initializing StreamConnectionManager on GPU {self.gpu_id}")
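Not part of the commit: the frame-ownership rule this file establishes, condensed. Whoever receives a FrameReference must eventually call free(), and the inference-result handler is the last owner. Method names match the diff; bodies are abbreviated and error handling is simplified.

def _on_frame_decoded(self, frame_ref):
    if not self.running:
        frame_ref.free()                        # nothing downstream will see this frame
        return
    try:
        self.model_controller.submit_frame(
            stream_id=self.stream_id,
            frame=frame_ref.rgb_tensor,
            metadata={"frame_ref": frame_ref},  # ownership travels with the batch
        )
    except Exception:
        frame_ref.free()                        # submission failed, release immediately

def _handle_inference_result(self, result):
    frame_ref = result["metadata"].get("frame_ref")
    try:
        ...                                     # tracking and publishing use result["detections"]
    finally:
        if frame_ref is not None:
            frame_ref.free()                    # last holder in the pipeline releases the frame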

View file

@ -12,26 +12,31 @@ from .jpeg_encoder import encode_frame_to_jpeg
class FrameReference:
"""
CPU-side reference object for a GPU frame.
Reference-counted frame wrapper for zero-copy memory management.
This object holds a cloned RGB tensor that is independent of PyNvVideoCodec's
DecodedFrame lifecycle. We don't keep the DecodedFrame to avoid conflicts
with PyNvVideoCodec's internal frame pool management.
This allows multiple parts of the pipeline to hold references to the same
cloned frame, and tracks when all references are released so the decoder
knows when buffer slots can be reused.
"""
def __init__(self, rgb_tensor: torch.Tensor, buffer_index: int, decoder):
self.rgb_tensor = rgb_tensor # Cloned RGB tensor (independent copy)
self.rgb_tensor = rgb_tensor # Cloned RGB tensor (one clone per frame)
self.buffer_index = buffer_index
self.decoder = decoder # Reference to decoder for marking as free
self.decoder = decoder
self._freed = False
def free(self):
"""Mark this frame as no longer in use"""
"""Mark this reference as freed - called by the last user of the frame"""
if not self._freed:
self._freed = True
# Release GPU memory immediately
if self.rgb_tensor is not None:
del self.rgb_tensor
self.rgb_tensor = None
self.decoder._mark_frame_free(self.buffer_index)
def is_freed(self) -> bool:
"""Check if this frame has been freed"""
"""Check if this reference has been freed"""
return self._freed
def __del__(self):
@ -212,13 +217,10 @@ class StreamDecoder:
self.status = ConnectionStatus.DISCONNECTED
self._status_lock = threading.Lock()
# Frame buffer (ring buffer) - stores FrameReference objects
# Frame buffer (ring buffer) - stores cloned RGB tensors
self.frame_buffer = deque(maxlen=buffer_size)
self._buffer_lock = threading.RLock()
# Track which buffer slots are in use (list of FrameReference objects)
self._in_use_frames = [] # List of FrameReference objects currently held by callbacks
# Decoder and container instances
self.decoder = None
self.container = None
@ -236,6 +238,10 @@ class StreamDecoder:
self._frame_callbacks = []
self._callback_lock = threading.Lock()
# Track frames currently in use (referenced by callbacks/pipeline)
self._in_use_frames = [] # List of FrameReference objects
self._frame_index_counter = 0 # Monotonically increasing frame index
def register_frame_callback(self, callback: Callable):
"""
Register a callback to be called when a new frame is decoded.
@ -396,19 +402,7 @@ class StreamDecoder:
# Add frames to ring buffer and fire callbacks
with self._buffer_lock:
for frame in decoded_frames:
# Check for buffer overflow - discard oldest if needed
if len(self.frame_buffer) >= self.buffer_size:
# Check if oldest frame is still in use
if len(self._in_use_frames) > 0:
oldest_ref = self.frame_buffer[0] if len(self.frame_buffer) > 0 else None
if oldest_ref and not oldest_ref.is_freed():
# Force free the oldest frame to prevent overflow
print(f"[WARNING] Buffer overflow, force-freeing oldest frame (buffer_index={oldest_ref.buffer_index})")
oldest_ref.free()
# Deque will automatically remove oldest when at maxlen
# Convert to tensor
# Convert to tensor immediately after NVDEC
try:
# Convert DecodedFrame to PyTorch tensor using DLPack (zero-copy)
nv12_tensor = torch.from_dlpack(frame)
@ -417,32 +411,32 @@ class StreamDecoder:
if self.frame_height is not None and self.frame_width is not None:
rgb_tensor = nv12_to_rgb_gpu(nv12_tensor, self.frame_height, self.frame_width)
# CRITICAL: Clone the RGB tensor to break CUDA memory dependency
# The nv12_to_rgb_gpu creates a new tensor, but it still references
# the same CUDA context/stream. We need an independent copy.
rgb_tensor_cloned = rgb_tensor.clone()
# CLONE ONCE into our post-decode buffer
# This breaks the dependency on PyNvVideoCodec's DecodedFrame
# After this, the tensor is fully ours and can be used throughout the pipeline
rgb_cloned = rgb_tensor.clone()
# Create FrameReference object for C++-style memory management
# We don't keep the DecodedFrame to avoid conflicts with PyNvVideoCodec's
# internal frame pool - the clone is fully independent
buffer_index = self.frame_count
# Create FrameReference for reference counting
frame_ref = FrameReference(
rgb_tensor=rgb_tensor_cloned, # Independent cloned tensor
buffer_index=buffer_index,
rgb_tensor=rgb_cloned,
buffer_index=self._frame_index_counter,
decoder=self
)
self._frame_index_counter += 1
# Add to buffer and in-use tracking
# Add FrameReference to ring buffer (deque automatically removes oldest when full)
self.frame_buffer.append(frame_ref)
self._in_use_frames.append(frame_ref)
self.frame_count += 1
# Fire callbacks with the cloned RGB tensor from FrameReference
# The tensor is now independent of the DecodedFrame lifecycle
# Track this frame as in-use
self._in_use_frames.append(frame_ref)
# Fire callbacks with the FrameReference
# The callback receivers should call .free() when done
with self._callback_lock:
for callback in self._frame_callbacks:
try:
callback(frame_ref.rgb_tensor)
callback(frame_ref)
except Exception as e:
print(f"Error in frame callback: {e}")
except Exception as e:
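Not part of the commit: a hypothetical consumer of the new callback contract, which now receives a FrameReference instead of a bare tensor and is responsible for freeing it. run_inference and decoder are placeholders.

# Illustrative consumer that does its work synchronously inside the callback.
def on_frame(frame_ref):
    try:
        rgb = frame_ref.rgb_tensor      # (3, H, W) GPU tensor, cloned once inside the decoder
        run_inference(rgb)              # placeholder for whatever the callback actually does
    finally:
        frame_ref.free()                # lets the decoder remove it from _in_use_frames

decoder.register_frame_callback(on_frame)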

View file

@ -272,12 +272,14 @@ class ObjectTracker:
for tid in stale_track_ids:
del self._tracks[tid]
def update(self, detections: List[Detection]) -> List[TrackedObject]:
def update(self, detections: List[Detection], frame_shape: tuple = None, model_input_size: int = 640) -> List[TrackedObject]:
"""
Update tracker with new detections (decoupled from inference).
Args:
detections: List of Detection objects from model inference
frame_shape: Original frame shape (C, H, W) for scaling bboxes back from model space
model_input_size: Model input size (default: 640 for YOLOv8)
Returns:
List of currently tracked objects
@ -291,6 +293,22 @@ class ObjectTracker:
self._cleanup_stale_tracks()
return list(self._tracks.values())
# Scale detections from model space (640x640) to frame space (H x W)
if frame_shape is not None:
_, frame_h, frame_w = frame_shape
scale_x = frame_w / model_input_size
scale_y = frame_h / model_input_size
# Scale all detection bboxes
for det in detections:
x1, y1, x2, y2 = det.bbox
det.bbox = [
x1 * scale_x,
y1 * scale_y,
x2 * scale_x,
y2 * scale_y
]
# Convert detections to tensor for GPU processing
det_tensor = torch.tensor(
[[*det.bbox, det.confidence, det.class_id] for det in detections],
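Not part of the commit: a worked example of the scaling above, assuming a plain resize to 640x640 with no letterbox padding.

frame_shape = (3, 1080, 1920)            # (C, H, W)
_, frame_h, frame_w = frame_shape
scale_x = frame_w / 640                  # 1920 / 640 = 3.0
scale_y = frame_h / 640                  # 1080 / 640 = 1.6875
x1, y1, x2, y2 = 100, 200, 300, 400      # bbox in 640x640 model space
scaled = [x1 * scale_x, y1 * scale_y, x2 * scale_x, y2 * scale_y]
# -> [300.0, 337.5, 900.0, 675.0] in 1920x1080 frame space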

View file

@ -63,6 +63,10 @@ class YOLOv8Utils:
# Normalize to [0, 1] (YOLOv8 expects normalized input)
frame_normalized = frame_resized / 255.0
# NOTE: Don't track these tensors - they're short-lived inputs to TensorRT
# that get automatically freed by PyTorch after inference completes.
# Tracking them would show false "leaks" since we can't track when TensorRT consumes them.
return frame_normalized
@staticmethod