fix: gpu memory leaks

Siwat Sirichai 2025-11-10 22:10:46 +07:00
parent 3a47920186
commit 593611cdb7
13 changed files with 420 additions and 166 deletions

1
.gitignore vendored
View file

@ -5,3 +5,4 @@ __pycache__/
.claude .claude
/models/ /models/
/tracked_objects.json /tracked_objects.json
.trt_cache

2
bangchak/models/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
*.onnx
*.engine

136
debug_trt_output.py Normal file
View file

@ -0,0 +1,136 @@
"""
Debug script to capture and compare raw PT vs TRT outputs on problematic frames.
"""
import torch
import time
from services import StreamDecoderFactory, YOLOv8Utils, TensorRTModelRepository
from ultralytics import YOLO
import os
from dotenv import load_dotenv
load_dotenv()
GPU_ID = 0
MODEL_PATH = "bangchak/models/frontal_detection_v5.pt"
STREAM_URL = os.getenv('CAMERA_URL_1')
# Load models
print("Loading models...")
pt_model = YOLO(MODEL_PATH)
pt_model.to(f'cuda:{GPU_ID}')
repo = TensorRTModelRepository(gpu_id=GPU_ID)
trt_path = "./models/trtptcache/trt/cda5e520441e12fe09a97ac2609da29b4cbac969cc2029ef1735f65697579121.trt"
repo.load_model("detector", trt_path, num_contexts=1)
# Start decoder
print("Starting decoder...")
decoder_factory = StreamDecoderFactory(gpu_id=GPU_ID)
decoder = decoder_factory.create_decoder(STREAM_URL, buffer_size=30)
decoder.start()
time.sleep(2)
torch.cuda.set_device(GPU_ID)
print("\nWaiting for frames with TRT false positives...\n")
frame_count = 0
found_issue = False
while frame_count < 50 and not found_issue:
    frame = decoder.get_frame()
    if frame is None:
        time.sleep(0.01)
        continue

    frame_count += 1

    # Preprocess
    preprocessed = YOLOv8Utils.preprocess(frame, input_size=640)

    # Run TRT inference
    trt_outputs = repo.infer("detector", {"images": preprocessed}, synchronize=True)
    trt_raw = trt_outputs['output0']  # (1, 5, 8400)

    # Check for the issue - transpose and check channel 4
    trt_transposed = trt_raw.transpose(1, 2).squeeze(0)  # (8400, 5)
    conf_channel = trt_transposed[:, 4]  # (8400,)
    num_high_conf = (conf_channel > 0.25).sum().item()

    if num_high_conf > 100:
        found_issue = True
        print(f"🔴 FOUND PROBLEMATIC FRAME {frame_count}!")
        print(f" TRT detections > 0.25 threshold: {num_high_conf}")

        # Now run PT model on same frame
        with torch.no_grad():
            pt_raw = pt_model.model(preprocessed)[0]  # (1, 5, 8400)

        print(f"\n=== RAW OUTPUT COMPARISON ===")
        print(f"PT output shape: {pt_raw.shape}")
        print(f"TRT output shape: {trt_raw.shape}")

        # Compare channel 4 (confidence)
        pt_conf = pt_raw.transpose(1, 2).squeeze(0)[:, 4]
        trt_conf = trt_transposed[:, 4]

        print(f"\n--- Confidence Channel (channel 4) ---")
        print(f"PT confidence stats:")
        print(f" Min: {pt_conf.min().item():.6e}")
        print(f" Max: {pt_conf.max().item():.6e}")
        print(f" Mean: {pt_conf.mean().item():.6e}")
        print(f" >0.25: {(pt_conf > 0.25).sum().item()}")
        print(f" >0.5: {(pt_conf > 0.5).sum().item()}")

        print(f"\nTRT confidence stats:")
        print(f" Min: {trt_conf.min().item():.6e}")
        print(f" Max: {trt_conf.max().item():.6e}")
        print(f" Mean: {trt_conf.mean().item():.6e}")
        print(f" >0.25: {(trt_conf > 0.25).sum().item()}")
        print(f" >0.5: {(trt_conf > 0.5).sum().item()}")

        # Check bbox coordinates too
        print(f"\n--- BBox Coordinates (channels 0-3) ---")
        pt_bbox = pt_raw.transpose(1, 2).squeeze(0)[:, :4]
        trt_bbox = trt_transposed[:, :4]

        print(f"PT bbox stats:")
        print(f" Min: {pt_bbox.min().item():.3f}")
        print(f" Max: {pt_bbox.max().item():.3f}")
        print(f" Mean: {pt_bbox.mean().item():.3f}")

        print(f"\nTRT bbox stats:")
        print(f" Min: {trt_bbox.min().item():.3f}")
        print(f" Max: {trt_bbox.max().item():.3f}")
        print(f" Mean: {trt_bbox.mean().item():.3f}")

        # Sample some values
        print(f"\n--- Sample Values (first 5 anchors) ---")
        for i in range(5):
            print(f"\nAnchor {i}:")
            print(f" PT [cx={pt_bbox[i,0]:.1f}, cy={pt_bbox[i,1]:.1f}, w={pt_bbox[i,2]:.1f}, h={pt_bbox[i,3]:.1f}, conf={pt_conf[i]:.6f}]")
            print(f" TRT [cx={trt_bbox[i,0]:.1f}, cy={trt_bbox[i,1]:.1f}, w={trt_bbox[i,2]:.1f}, h={trt_bbox[i,3]:.1f}, conf={trt_conf[i]:.6f}]")

        # Find indices with high confidence in TRT
        high_conf_idx = torch.where(trt_conf > 0.25)[0][:5]
        print(f"\n--- High Confidence Detections in TRT (first 5) ---")
        for idx in high_conf_idx:
            i = idx.item()
            print(f"\nAnchor {i}:")
            print(f" PT [cx={pt_bbox[i,0]:.1f}, cy={pt_bbox[i,1]:.1f}, w={pt_bbox[i,2]:.1f}, h={pt_bbox[i,3]:.1f}, conf={pt_conf[i]:.6f}]")
            print(f" TRT [cx={trt_bbox[i,0]:.1f}, cy={trt_bbox[i,1]:.1f}, w={trt_bbox[i,2]:.1f}, h={trt_bbox[i,3]:.1f}, conf={trt_conf[i]:.6f}]")

        break

    if frame_count % 10 == 0:
        print(f"Checked {frame_count} frames, no issues yet...")

if not found_issue:
    print(f"\n⚠️ No problematic frames found in {frame_count} frames")
# Cleanup
decoder.stop()
repo.unload_model("detector")
print("\n✓ Done")

9
new_buffer_design.txt Normal file
View file

@ -0,0 +1,9 @@
The Post-Decoded Buffer should just be the ping-pong ring buffer.
Let's get the relationships in order:
- the ping-pong ring is per model
- many cameras may use the same model
- this buffer is filled when we memcpy frames out of the decode buffer
Some more ground rules (see the sketch after these notes):
- the model buffer holds at most one frame per camera; if an older frame from the same camera is present, evict it. This is a real-time system, so the buffer should be as fresh as possible.
- the goal of batching is not to pool up processing for the same camera but to pool up multiple cameras.
- if every camera in the pool has already posted its frame, flush the buffer as well.
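
A minimal sketch of the eviction/flush policy described in these notes; the class and method names (PingPongModelBuffer, submit, flush_batch) are hypothetical, not taken from the codebase:

import threading
from typing import Callable, Dict, Set

import torch


class PingPongModelBuffer:
    """Hypothetical per-model buffer: keep only the newest frame per camera, flush when the pool is complete."""

    def __init__(self, camera_pool: Set[str], flush_batch: Callable[[Dict[str, torch.Tensor]], None]):
        self.camera_pool = camera_pool          # cameras that share this model
        self.flush_batch = flush_batch          # called with {camera_id: frame} when the buffer flushes
        self._slots: Dict[str, torch.Tensor] = {}
        self._lock = threading.Lock()

    def submit(self, camera_id: str, frame: torch.Tensor):
        with self._lock:
            # One frame per camera: a newer frame silently evicts the older one (freshness over completeness).
            self._slots[camera_id] = frame
            # Flush early once every camera in the pool has posted a frame.
            if set(self._slots) >= self.camera_pool:
                batch, self._slots = self._slots, {}
                self.flush_batch(batch)

A ping-pong variant would keep two such slot maps and swap them on flush, so inference can read one while the cameras fill the other.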

View file

@ -117,9 +117,10 @@ class ModelController:
""" """
try: try:
metadata = self.model_repository.get_metadata(self.model_id) metadata = self.model_repository.get_metadata(self.model_id)
# Get first input tensor shape # Get first input tensor shape (ModelMetadata has input_shapes, not inputs)
first_input = list(metadata.inputs.values())[0] first_input_name = metadata.input_names[0]
batch_dim = first_input["shape"][0] input_shape = metadata.input_shapes[first_input_name]
batch_dim = input_shape[0]
# batch_dim can be -1 (dynamic), 1 (fixed), or N (fixed batch size) # batch_dim can be -1 (dynamic), 1 (fixed), or N (fixed batch size)
if batch_dim == -1: if batch_dim == -1:
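
The hunk above ends right after the dynamic-batch check. A small hedged sketch of how the probed batch dimension might be consumed; the helper name and the branch bodies are assumptions, only the metadata access pattern follows the diff:

def resolve_batch_capacity(metadata, preferred_batch_size: int = 16) -> int:
    """Hypothetical helper: derive a usable batch size from TensorRT engine metadata."""
    first_input_name = metadata.input_names[0]               # e.g. "images"
    batch_dim = metadata.input_shapes[first_input_name][0]
    if batch_dim == -1:
        return preferred_batch_size   # dynamic batch dimension: batch up to a configured cap
    return batch_dim                  # fixed batch (1 or N): the engine dictates the size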

View file

@ -1,5 +1,6 @@
import threading import threading
import hashlib import hashlib
import json
from typing import Optional, Dict, Any, List, Tuple from typing import Optional, Dict, Any, List, Tuple
from pathlib import Path from pathlib import Path
from queue import Queue from queue import Queue
@ -161,7 +162,7 @@ class TensorRTModelRepository:
# Result: 1 engine in VRAM, N contexts (e.g., 4), not 100 contexts! # Result: 1 engine in VRAM, N contexts (e.g., 4), not 100 contexts!
""" """
def __init__(self, gpu_id: int = 0, default_num_contexts: int = 4, enable_pt_conversion: bool = True): def __init__(self, gpu_id: int = 0, default_num_contexts: int = 4, enable_pt_conversion: bool = True, cache_dir: str = ".trt_cache"):
""" """
Initialize the model repository. Initialize the model repository.
@ -169,11 +170,14 @@ class TensorRTModelRepository:
gpu_id: GPU device ID to use gpu_id: GPU device ID to use
default_num_contexts: Default number of execution contexts per unique engine default_num_contexts: Default number of execution contexts per unique engine
enable_pt_conversion: Enable automatic PyTorch to TensorRT conversion enable_pt_conversion: Enable automatic PyTorch to TensorRT conversion
cache_dir: Directory for caching stripped TensorRT engines and metadata
""" """
self.gpu_id = gpu_id self.gpu_id = gpu_id
self.device = torch.device(f'cuda:{gpu_id}') self.device = torch.device(f'cuda:{gpu_id}')
self.default_num_contexts = default_num_contexts self.default_num_contexts = default_num_contexts
self.enable_pt_conversion = enable_pt_conversion self.enable_pt_conversion = enable_pt_conversion
self.cache_dir = Path(cache_dir)
self.cache_dir.mkdir(parents=True, exist_ok=True)
# Model ID to engine mapping: model_id -> file_hash # Model ID to engine mapping: model_id -> file_hash
self._model_to_hash: Dict[str, str] = {} self._model_to_hash: Dict[str, str] = {}
@ -192,6 +196,7 @@ class TensorRTModelRepository:
print(f"TensorRT Model Repository initialized on GPU {gpu_id}") print(f"TensorRT Model Repository initialized on GPU {gpu_id}")
print(f"Default context pool size: {default_num_contexts} contexts per unique model") print(f"Default context pool size: {default_num_contexts} contexts per unique model")
print(f"Cache directory: {self.cache_dir}")
if enable_pt_conversion: if enable_pt_conversion:
print(f"PyTorch to TensorRT conversion: enabled") print(f"PyTorch to TensorRT conversion: enabled")
@ -226,6 +231,14 @@ class TensorRTModelRepository:
""" """
Load TensorRT engine from file. Load TensorRT engine from file.
Supports both raw TensorRT engines and Ultralytics .engine files
(which have embedded JSON metadata at the beginning).
For Ultralytics engines:
- Strips metadata and caches pure TensorRT engine in cache_dir
- Saves metadata as separate JSON file
- Reuses cached stripped engine on subsequent loads
Args: Args:
file_path: Path to .trt or .engine file file_path: Path to .trt or .engine file
@ -234,8 +247,68 @@ class TensorRTModelRepository:
""" """
runtime = trt.Runtime(self.trt_logger) runtime = trt.Runtime(self.trt_logger)
with open(file_path, 'rb') as f: # Compute hash of original file for cache lookup
file_hash = self.compute_file_hash(file_path)
cache_engine_path = self.cache_dir / f"{file_hash}.trt"
cache_metadata_path = self.cache_dir / f"{file_hash}_metadata.json"
# Check if stripped engine already cached
if cache_engine_path.exists():
logger.info(f"Loading cached stripped engine from {cache_engine_path}")
with open(cache_engine_path, 'rb') as f:
engine_data = f.read() engine_data = f.read()
else:
# Read and process original file
with open(file_path, 'rb') as f:
# Try to read Ultralytics metadata header (first 4 bytes = metadata length)
try:
meta_len_bytes = f.read(4)
if len(meta_len_bytes) == 4:
meta_len = int.from_bytes(meta_len_bytes, byteorder="little")
# Sanity check: metadata length should be reasonable (< 100KB)
if 0 < meta_len < 100000:
try:
metadata_bytes = f.read(meta_len)
metadata = json.loads(metadata_bytes.decode("utf-8"))
# This is an Ultralytics engine - read remaining pure TRT data
engine_data = f.read()
# Save stripped engine to cache
logger.info(f"Detected Ultralytics engine format")
logger.info(f"Ultralytics metadata: {metadata}")
logger.info(f"Caching stripped engine to {cache_engine_path}")
with open(cache_engine_path, 'wb') as cache_f:
cache_f.write(engine_data)
# Save metadata separately
with open(cache_metadata_path, 'w') as meta_f:
json.dump(metadata, meta_f, indent=2)
except (UnicodeDecodeError, json.JSONDecodeError):
# Not Ultralytics format, rewind and read entire file
f.seek(0)
engine_data = f.read()
else:
# Invalid metadata length, rewind and read entire file
f.seek(0)
engine_data = f.read()
else:
# File too small, just use what we read
engine_data = meta_len_bytes
except Exception as e:
# Any error, rewind and read entire file
logger.warning(f"Error reading engine metadata: {e}, treating as raw TRT engine")
f.seek(0)
engine_data = f.read()
# Cache the engine data (even if it was already raw TRT)
if not cache_engine_path.exists():
with open(cache_engine_path, 'wb') as cache_f:
cache_f.write(engine_data)
engine = runtime.deserialize_cuda_engine(engine_data) engine = runtime.deserialize_cuda_engine(engine_data)
if engine is None: if engine is None:
@ -494,6 +567,11 @@ class TensorRTModelRepository:
device=self.device device=self.device
) )
# NOTE: Don't track these tensors - they're returned to caller and consumed
# by postprocessing, then automatically freed by PyTorch's garbage collector.
# Tracking them would show false "leaks" since we can't track when the caller
# finishes using them and PyTorch deallocates them.
outputs[name] = output_tensor outputs[name] = output_tensor
exec_ctx.context.set_tensor_address(name, output_tensor.data_ptr()) exec_ctx.context.set_tensor_address(name, output_tensor.data_ptr())
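
For reference, a condensed standalone version of the header handling added above, handy for inspecting cached engines by hand. split_ultralytics_engine is a hypothetical helper, but the layout it parses — a 4-byte little-endian metadata length, a JSON blob, then the serialized engine — is exactly what the loader strips and caches:

import json
from pathlib import Path


def split_ultralytics_engine(path: str):
    """Return (metadata dict or None, raw TensorRT engine bytes) for a .engine/.trt file."""
    data = Path(path).read_bytes()
    if len(data) > 4:
        meta_len = int.from_bytes(data[:4], byteorder="little")
        # Same sanity bound as the loader: metadata should be small (< 100KB).
        if 0 < meta_len < 100_000 and len(data) > 4 + meta_len:
            try:
                metadata = json.loads(data[4:4 + meta_len].decode("utf-8"))
                return metadata, data[4 + meta_len:]
            except (UnicodeDecodeError, json.JSONDecodeError):
                pass  # no Ultralytics header; fall through to raw engine
    return None, data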

View file

@ -125,12 +125,19 @@ class PTConverter:
mapping = self.mapping_db[pt_hash] mapping = self.mapping_db[pt_hash]
trt_hash = mapping["trt_hash"] trt_hash = mapping["trt_hash"]
# Check both .engine and .trt extensions (Ultralytics uses .engine, generic uses .trt)
engine_key = f"trt/{trt_hash}.engine"
trt_key = f"trt/{trt_hash}.trt" trt_key = f"trt/{trt_hash}.trt"
# Verify TRT file still exists in storage # Try .engine first (Ultralytics native format)
if not self.storage.exists(trt_key): if self.storage.exists(engine_key):
cached_key = engine_key
elif self.storage.exists(trt_key):
cached_key = trt_key
else:
logger.warning( logger.warning(
f"Mapping exists for PT hash {pt_hash[:16]}... but TRT file missing. " f"Mapping exists for PT hash {pt_hash[:16]}... but engine file missing. "
f"Will reconvert." f"Will reconvert."
) )
# Remove stale mapping # Remove stale mapping
@ -139,16 +146,16 @@ class PTConverter:
return None return None
# Get local path # Get local path
trt_path = self.storage.get_local_path(trt_key) cached_path = self.storage.get_local_path(cached_key)
if trt_path is None: if cached_path is None:
logger.error(f"Could not get local path for TRT file {trt_key}") logger.error(f"Could not get local path for engine file {cached_key}")
return None return None
logger.info( logger.info(
f"Found cached conversion for PT hash {pt_hash[:16]}... -> " f"Found cached conversion for PT hash {pt_hash[:16]}... -> "
f"TRT hash {trt_hash[:16]}..." f"Engine hash {trt_hash[:16]}... ({cached_key})"
) )
return (trt_hash, trt_path) return (trt_hash, cached_path)
def convert( def convert(
self, self,
@ -241,24 +248,21 @@ class PTConverter:
precision: torch.dtype, precision: torch.dtype,
) -> Tuple[str, str]: ) -> Tuple[str, str]:
""" """
Convert ultralytics YOLO model using ONNX TensorRT pipeline. Convert ultralytics YOLO model using native .engine export.
Uses the same approach as scripts/convert_pt_to_tensorrt.py This produces .engine files with embedded metadata (no manual input_shapes needed).
Args: Args:
pt_path: Path to PT file pt_path: Path to PT file
pt_hash: PT file hash pt_hash: PT file hash
input_shapes: Input tensor shapes input_shapes: Input tensor shapes (IGNORED for Ultralytics - auto-detected)
precision: Target precision precision: Target precision
Returns: Returns:
Tuple of (trt_hash, trt_file_path) Tuple of (engine_hash, engine_file_path)
""" """
import tensorrt as trt
import tempfile
import os import os
import shutil
logger.info("Detected ultralytics YOLO model, using ONNX → TensorRT pipeline...") logger.info("Detected ultralytics YOLO model, using native .engine export...")
# Load ultralytics model # Load ultralytics model
try: try:
@ -267,83 +271,48 @@ class PTConverter:
except ImportError: except ImportError:
raise ImportError("ultralytics package not found. Install with: pip install ultralytics") raise ImportError("ultralytics package not found. Install with: pip install ultralytics")
# Determine input shape # Export to native .engine format with embedded metadata
if not input_shapes: logger.info(f"Exporting to native TensorRT .engine (precision: {'FP16' if precision == torch.float16 else 'FP32'})...")
raise ValueError("input_shapes required for ultralytics conversion")
input_key = 'images' if 'images' in input_shapes else list(input_shapes.keys())[0] # Ultralytics export creates .engine file in same directory as .pt
input_shape = input_shapes[input_key] engine_path = model.export(
format='engine',
half=(precision == torch.float16),
device=self.gpu_id,
batch=1,
simplify=True
)
# Export to ONNX first # Convert to string (Ultralytics returns Path object)
logger.info(f"Exporting to ONNX (input shape: {input_shape})...") engine_path = str(engine_path)
with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmp_onnx: logger.info(f"Native .engine export complete: {engine_path}")
onnx_path = tmp_onnx.name logger.info("Metadata embedded in .engine file (stride, imgsz, names, etc.)")
try: # Read the exported .engine file
# Use ultralytics export to ONNX with open(engine_path, 'rb') as f:
model.export(format='onnx', imgsz=input_shape[2], batch=input_shape[0]) engine_data = f.read()
# Ultralytics saves as model_name.onnx in same directory
pt_dir = os.path.dirname(pt_path)
pt_name = os.path.splitext(os.path.basename(pt_path))[0]
onnx_export_path = os.path.join(pt_dir, f"{pt_name}.onnx")
# Move to our temp location (use shutil.move for cross-device support) # Compute hash of the .engine file
if os.path.exists(onnx_export_path): engine_hash = hashlib.sha256(engine_data).hexdigest()
shutil.move(onnx_export_path, onnx_path)
# Store in our cache (as .engine to preserve metadata)
engine_key = f"trt/{engine_hash}.engine"
self.storage.write(engine_key, engine_data)
cached_path = self.storage.get_local_path(engine_key)
if cached_path is None:
raise RuntimeError("Failed to get local path for .engine file")
# Clean up the original export (we've cached it)
# Only delete if it's different from cached path
if os.path.exists(engine_path) and os.path.abspath(engine_path) != os.path.abspath(cached_path):
logger.info(f"Removing original export (cached): {engine_path}")
os.unlink(engine_path)
else: else:
raise RuntimeError(f"ONNX export failed, file not found: {onnx_export_path}") logger.info(f"Keeping original export at: {engine_path}")
logger.info(f"ONNX export complete: {onnx_path}") logger.info(f"Cached .engine file: {cached_path}")
return (engine_hash, cached_path)
# Build TensorRT engine from ONNX
logger.info("Building TensorRT engine from ONNX...")
trt_logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(trt_logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, trt_logger)
# Parse ONNX
with open(onnx_path, 'rb') as f:
if not parser.parse(f.read()):
errors = [parser.get_error(i) for i in range(parser.num_errors)]
raise RuntimeError(f"Failed to parse ONNX: {errors}")
# Configure builder
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30) # 4GB
# Set precision
if precision == torch.float16:
if builder.platform_has_fast_fp16:
config.set_flag(trt.BuilderFlag.FP16)
logger.info("FP16 mode enabled")
# Build engine
logger.info("Building TensorRT engine (this may take a few minutes)...")
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
raise RuntimeError("Failed to build TensorRT engine")
# Convert IHostMemory to bytes
engine_bytes = bytes(serialized_engine)
# Save to storage
trt_hash = hashlib.sha256(engine_bytes).hexdigest()
trt_key = f"trt/{trt_hash}.trt"
self.storage.write(trt_key, engine_bytes)
trt_path = self.storage.get_local_path(trt_key)
if trt_path is None:
raise RuntimeError("Failed to get local path for TRT file")
logger.info(f"TensorRT engine built successfully: {trt_path}")
return (trt_hash, trt_path)
finally:
# Cleanup ONNX file
if os.path.exists(onnx_path):
os.unlink(onnx_path)
def _perform_conversion( def _perform_conversion(
self, self,
@ -387,11 +356,21 @@ class PTConverter:
# Check if this is an ultralytics model # Check if this is an ultralytics model
if self._is_ultralytics_model(model): if self._is_ultralytics_model(model):
logger.info("Detected ultralytics model, using ultralytics export API") logger.info("Detected Ultralytics YOLO model, using native .engine export")
logger.info("Note: input_shapes parameter is ignored for Ultralytics models (auto-detected)")
return self._convert_ultralytics_model(pt_path, pt_hash, input_shapes, precision) return self._convert_ultralytics_model(pt_path, pt_hash, input_shapes, precision)
# For non-ultralytics models, use torch_tensorrt # For non-ultralytics models, use torch_tensorrt
logger.info("Using torch_tensorrt for conversion") logger.info("Using torch_tensorrt for conversion (non-Ultralytics model)")
# Non-Ultralytics models REQUIRE input_shapes
if input_shapes is None:
raise ValueError(
"input_shapes required for non-Ultralytics PyTorch models. "
"For Ultralytics YOLO models, input_shapes is auto-detected. "
"Example: input_shapes={'images': (1, 3, 640, 640)}"
)
model.eval() model.eval()
# Convert model to target precision to avoid mixed precision issues # Convert model to target precision to avoid mixed precision issues
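
A condensed sketch of the Ultralytics conversion path this file now uses; the wrapper function is illustrative only, while the real PTConverter adds hashing, storage caching, and cleanup around the same export call:

import torch
from ultralytics import YOLO


def convert_yolo_pt_to_engine(pt_path: str, gpu_id: int = 0, precision=torch.float16) -> str:
    """Export a YOLO .pt to a native TensorRT .engine with embedded metadata."""
    engine_path = YOLO(pt_path).export(
        format='engine',
        half=(precision == torch.float16),
        device=gpu_id,
        batch=1,
        simplify=True,
    )
    return str(engine_path)

Non-Ultralytics models still go through torch_tensorrt and must supply explicit shapes, e.g. input_shapes={'images': (1, 3, 640, 640)}.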

View file

@ -127,15 +127,17 @@ class StreamConnection:
self.status = ConnectionStatus.DISCONNECTED self.status = ConnectionStatus.DISCONNECTED
logger.info(f"Stream {self.stream_id} stopped") logger.info(f"Stream {self.stream_id} stopped")
def _on_frame_decoded(self, frame: torch.Tensor): def _on_frame_decoded(self, frame_ref):
""" """
Event handler called by decoder when a new frame is decoded. Event handler called by decoder when a new frame is decoded.
This is the event-driven replacement for polling. This is the event-driven replacement for polling.
Args: Args:
frame: RGB frame tensor on GPU (3, H, W) frame_ref: FrameReference object containing the RGB frame tensor
""" """
if not self.running: if not self.running:
# If not running, free the frame immediately
frame_ref.free()
return return
try: try:
@ -143,12 +145,14 @@ class StreamConnection:
self.frame_count += 1 self.frame_count += 1
# Submit to model controller for batched inference # Submit to model controller for batched inference
# Pass the FrameReference in metadata so we can free it later
self.model_controller.submit_frame( self.model_controller.submit_frame(
stream_id=self.stream_id, stream_id=self.stream_id,
frame=frame, frame=frame_ref.rgb_tensor,
metadata={ metadata={
"frame_number": self.frame_count, "frame_number": self.frame_count,
"shape": tuple(frame.shape), "shape": tuple(frame_ref.rgb_tensor.shape),
"frame_ref": frame_ref, # Store reference for later cleanup
} }
) )
@ -164,6 +168,8 @@ class StreamConnection:
logger.error(f"Error processing frame for {self.stream_id}: {e}", exc_info=True) logger.error(f"Error processing frame for {self.stream_id}: {e}", exc_info=True)
self.error_queue.put(e) self.error_queue.put(e)
self.status = ConnectionStatus.ERROR self.status = ConnectionStatus.ERROR
# Free the frame on error
frame_ref.free()
def _handle_inference_result(self, result: Dict[str, Any]): def _handle_inference_result(self, result: Dict[str, Any]):
""" """
@ -173,12 +179,17 @@ class StreamConnection:
Args: Args:
result: Inference result dictionary result: Inference result dictionary
""" """
frame_ref = None
try: try:
# Extract detections # Extract detections
detections = result["detections"] detections = result["detections"]
# Run tracking (synchronous) # Get FrameReference from metadata (if present)
tracked_objects = self._run_tracking_sync(detections) frame_ref = result["metadata"].get("frame_ref")
# Run tracking (synchronous) with frame shape for bbox scaling
frame_shape = result["metadata"].get("shape")
tracked_objects = self._run_tracking_sync(detections, frame_shape)
# Create tracking result # Create tracking result
tracking_result = TrackingResult( tracking_result = TrackingResult(
@ -196,13 +207,18 @@ class StreamConnection:
except Exception as e: except Exception as e:
logger.error(f"Error handling inference result for {self.stream_id}: {e}", exc_info=True) logger.error(f"Error handling inference result for {self.stream_id}: {e}", exc_info=True)
self.error_queue.put(e) self.error_queue.put(e)
finally:
# Free the frame reference - this is the last point in the pipeline
if frame_ref is not None:
frame_ref.free()
def _run_tracking_sync(self, detections, min_confidence=0.7): def _run_tracking_sync(self, detections, frame_shape=None, min_confidence=0.7):
""" """
Run tracking synchronously (called from executor). Run tracking synchronously (called from executor).
Args: Args:
detections: Detection tensor (N, 6) [x1, y1, x2, y2, conf, class_id] detections: Detection tensor (N, 6) [x1, y1, x2, y2, conf, class_id]
frame_shape: Original frame shape (C, H, W) for scaling bboxes
min_confidence: Minimum confidence threshold for detections min_confidence: Minimum confidence threshold for detections
Returns: Returns:
@ -226,8 +242,8 @@ class StreamConnection:
class_name=f"class_{int(det[5])}" if det.shape[0] > 5 else "unknown" class_name=f"class_{int(det[5])}" if det.shape[0] > 5 else "unknown"
)) ))
# Update tracker with detections (lightweight, no model dependency!) # Update tracker with detections (will scale bboxes to frame space)
return self.tracking_controller.update(detection_list) return self.tracking_controller.update(detection_list, frame_shape=frame_shape)
def tracking_results(self): def tracking_results(self):
""" """
@ -339,15 +355,31 @@ class StreamConnectionManager:
""" """
Initialize the manager with a model. Initialize the manager with a model.
Supports transparent loading of .pt (YOLO), .engine, and .trt files.
For Ultralytics YOLO models (.pt), metadata is auto-detected - no manual
input_shapes or precision needed! Non-YOLO models still require input_shapes.
Args: Args:
model_path: Path to TensorRT or PyTorch model file (.trt, .pt, .pth) model_path: Path to model file (.trt, .engine, .pt, .pth)
- .engine: Ultralytics native format (recommended)
- .pt: Auto-converts to .engine (YOLO models only)
- .trt: Raw TensorRT engine
model_id: Model identifier (default: "detector") model_id: Model identifier (default: "detector")
preprocess_fn: Preprocessing function (e.g., YOLOv8Utils.preprocess) preprocess_fn: Preprocessing function (e.g., YOLOv8Utils.preprocess)
postprocess_fn: Postprocessing function (e.g., YOLOv8Utils.postprocess) postprocess_fn: Postprocessing function (e.g., YOLOv8Utils.postprocess)
num_contexts: Number of TensorRT execution contexts (default: 4) num_contexts: Number of TensorRT execution contexts (default: 4)
pt_input_shapes: Required for PT files - dict of input shapes pt_input_shapes: [Optional] Only required for non-YOLO PyTorch models
pt_precision: Precision for PT conversion (torch.float16 or torch.float32) YOLO models auto-detect from embedded metadata
pt_precision: [Optional] Precision for PT conversion (auto-detected for YOLO)
**pt_conversion_kwargs: Additional PT conversion arguments **pt_conversion_kwargs: Additional PT conversion arguments
Example:
# YOLO model - no manual parameters needed:
manager.initialize(
model_path="model.pt", # or .engine
preprocess_fn=YOLOv8Utils.preprocess,
postprocess_fn=YOLOv8Utils.postprocess
)
""" """
logger.info(f"Initializing StreamConnectionManager on GPU {self.gpu_id}") logger.info(f"Initializing StreamConnectionManager on GPU {self.gpu_id}")

View file

@ -12,26 +12,31 @@ from .jpeg_encoder import encode_frame_to_jpeg
class FrameReference: class FrameReference:
""" """
CPU-side reference object for a GPU frame. Reference-counted frame wrapper for zero-copy memory management.
This object holds a cloned RGB tensor that is independent of PyNvVideoCodec's This allows multiple parts of the pipeline to hold references to the same
DecodedFrame lifecycle. We don't keep the DecodedFrame to avoid conflicts cloned frame, and tracks when all references are released so the decoder
with PyNvVideoCodec's internal frame pool management. knows when buffer slots can be reused.
""" """
def __init__(self, rgb_tensor: torch.Tensor, buffer_index: int, decoder): def __init__(self, rgb_tensor: torch.Tensor, buffer_index: int, decoder):
self.rgb_tensor = rgb_tensor # Cloned RGB tensor (independent copy) self.rgb_tensor = rgb_tensor # Cloned RGB tensor (one clone per frame)
self.buffer_index = buffer_index self.buffer_index = buffer_index
self.decoder = decoder # Reference to decoder for marking as free self.decoder = decoder
self._freed = False self._freed = False
def free(self): def free(self):
"""Mark this frame as no longer in use""" """Mark this reference as freed - called by the last user of the frame"""
if not self._freed: if not self._freed:
self._freed = True self._freed = True
# Release GPU memory immediately
if self.rgb_tensor is not None:
del self.rgb_tensor
self.rgb_tensor = None
self.decoder._mark_frame_free(self.buffer_index) self.decoder._mark_frame_free(self.buffer_index)
def is_freed(self) -> bool: def is_freed(self) -> bool:
"""Check if this frame has been freed""" """Check if this reference has been freed"""
return self._freed return self._freed
def __del__(self): def __del__(self):
@ -212,13 +217,10 @@ class StreamDecoder:
self.status = ConnectionStatus.DISCONNECTED self.status = ConnectionStatus.DISCONNECTED
self._status_lock = threading.Lock() self._status_lock = threading.Lock()
# Frame buffer (ring buffer) - stores FrameReference objects # Frame buffer (ring buffer) - stores cloned RGB tensors
self.frame_buffer = deque(maxlen=buffer_size) self.frame_buffer = deque(maxlen=buffer_size)
self._buffer_lock = threading.RLock() self._buffer_lock = threading.RLock()
# Track which buffer slots are in use (list of FrameReference objects)
self._in_use_frames = [] # List of FrameReference objects currently held by callbacks
# Decoder and container instances # Decoder and container instances
self.decoder = None self.decoder = None
self.container = None self.container = None
@ -236,6 +238,10 @@ class StreamDecoder:
self._frame_callbacks = [] self._frame_callbacks = []
self._callback_lock = threading.Lock() self._callback_lock = threading.Lock()
# Track frames currently in use (referenced by callbacks/pipeline)
self._in_use_frames = [] # List of FrameReference objects
self._frame_index_counter = 0 # Monotonically increasing frame index
def register_frame_callback(self, callback: Callable): def register_frame_callback(self, callback: Callable):
""" """
Register a callback to be called when a new frame is decoded. Register a callback to be called when a new frame is decoded.
@ -396,19 +402,7 @@ class StreamDecoder:
# Add frames to ring buffer and fire callbacks # Add frames to ring buffer and fire callbacks
with self._buffer_lock: with self._buffer_lock:
for frame in decoded_frames: for frame in decoded_frames:
# Check for buffer overflow - discard oldest if needed # Convert to tensor immediately after NVDEC
if len(self.frame_buffer) >= self.buffer_size:
# Check if oldest frame is still in use
if len(self._in_use_frames) > 0:
oldest_ref = self.frame_buffer[0] if len(self.frame_buffer) > 0 else None
if oldest_ref and not oldest_ref.is_freed():
# Force free the oldest frame to prevent overflow
print(f"[WARNING] Buffer overflow, force-freeing oldest frame (buffer_index={oldest_ref.buffer_index})")
oldest_ref.free()
# Deque will automatically remove oldest when at maxlen
# Convert to tensor
try: try:
# Convert DecodedFrame to PyTorch tensor using DLPack (zero-copy) # Convert DecodedFrame to PyTorch tensor using DLPack (zero-copy)
nv12_tensor = torch.from_dlpack(frame) nv12_tensor = torch.from_dlpack(frame)
@ -417,32 +411,32 @@ class StreamDecoder:
if self.frame_height is not None and self.frame_width is not None: if self.frame_height is not None and self.frame_width is not None:
rgb_tensor = nv12_to_rgb_gpu(nv12_tensor, self.frame_height, self.frame_width) rgb_tensor = nv12_to_rgb_gpu(nv12_tensor, self.frame_height, self.frame_width)
# CRITICAL: Clone the RGB tensor to break CUDA memory dependency # CLONE ONCE into our post-decode buffer
# The nv12_to_rgb_gpu creates a new tensor, but it still references # This breaks the dependency on PyNvVideoCodec's DecodedFrame
# the same CUDA context/stream. We need an independent copy. # After this, the tensor is fully ours and can be used throughout the pipeline
rgb_tensor_cloned = rgb_tensor.clone() rgb_cloned = rgb_tensor.clone()
# Create FrameReference object for C++-style memory management # Create FrameReference for reference counting
# We don't keep the DecodedFrame to avoid conflicts with PyNvVideoCodec's
# internal frame pool - the clone is fully independent
buffer_index = self.frame_count
frame_ref = FrameReference( frame_ref = FrameReference(
rgb_tensor=rgb_tensor_cloned, # Independent cloned tensor rgb_tensor=rgb_cloned,
buffer_index=buffer_index, buffer_index=self._frame_index_counter,
decoder=self decoder=self
) )
self._frame_index_counter += 1
# Add to buffer and in-use tracking # Add FrameReference to ring buffer (deque automatically removes oldest when full)
self.frame_buffer.append(frame_ref) self.frame_buffer.append(frame_ref)
self._in_use_frames.append(frame_ref)
self.frame_count += 1 self.frame_count += 1
# Fire callbacks with the cloned RGB tensor from FrameReference # Track this frame as in-use
# The tensor is now independent of the DecodedFrame lifecycle self._in_use_frames.append(frame_ref)
# Fire callbacks with the FrameReference
# The callback receivers should call .free() when done
with self._callback_lock: with self._callback_lock:
for callback in self._frame_callbacks: for callback in self._frame_callbacks:
try: try:
callback(frame_ref.rgb_tensor) callback(frame_ref)
except Exception as e: except Exception as e:
print(f"Error in frame callback: {e}") print(f"Error in frame callback: {e}")
except Exception as e: except Exception as e:
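
A minimal sketch of the FrameReference handoff contract shared by the decoder above and the StreamConnection handlers earlier in this commit: the decoder clones the RGB tensor once, wraps it, and the last consumer of the inference result calls free(). The two handler functions and the inference plumbing below are placeholders; FrameReference, submit_frame, and the frame_ref metadata key follow the diff:

def on_frame_decoded(frame_ref):                      # registered via register_frame_callback
    try:
        model_controller.submit_frame(
            stream_id="camera_1",                     # placeholder stream id
            frame=frame_ref.rgb_tensor,               # GPU tensor, already cloned by the decoder
            metadata={"frame_ref": frame_ref},        # carried through so the result handler can free it
        )
    except Exception:
        frame_ref.free()                              # error before handoff: release the slot immediately
        raise


def on_inference_result(result):
    frame_ref = result["metadata"].get("frame_ref")
    try:
        handle_detections(result["detections"])       # tracking, output callbacks, etc. (placeholder)
    finally:
        if frame_ref is not None:
            frame_ref.free()                          # last consumer releases the buffer slot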

View file

@ -272,12 +272,14 @@ class ObjectTracker:
for tid in stale_track_ids: for tid in stale_track_ids:
del self._tracks[tid] del self._tracks[tid]
def update(self, detections: List[Detection]) -> List[TrackedObject]: def update(self, detections: List[Detection], frame_shape: tuple = None, model_input_size: int = 640) -> List[TrackedObject]:
""" """
Update tracker with new detections (decoupled from inference). Update tracker with new detections (decoupled from inference).
Args: Args:
detections: List of Detection objects from model inference detections: List of Detection objects from model inference
frame_shape: Original frame shape (C, H, W) for scaling bboxes back from model space
model_input_size: Model input size (default: 640 for YOLOv8)
Returns: Returns:
List of currently tracked objects List of currently tracked objects
@ -291,6 +293,22 @@ class ObjectTracker:
self._cleanup_stale_tracks() self._cleanup_stale_tracks()
return list(self._tracks.values()) return list(self._tracks.values())
# Scale detections from model space (640x640) to frame space (H x W)
if frame_shape is not None:
_, frame_h, frame_w = frame_shape
scale_x = frame_w / model_input_size
scale_y = frame_h / model_input_size
# Scale all detection bboxes
for det in detections:
x1, y1, x2, y2 = det.bbox
det.bbox = [
x1 * scale_x,
y1 * scale_y,
x2 * scale_x,
y2 * scale_y
]
# Convert detections to tensor for GPU processing # Convert detections to tensor for GPU processing
det_tensor = torch.tensor( det_tensor = torch.tensor(
[[*det.bbox, det.confidence, det.class_id] for det in detections], [[*det.bbox, det.confidence, det.class_id] for det in detections],
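
A quick numeric check of the scaling above, assuming preprocessing is a plain resize to 640x640 (which the simple width/height ratios imply; letterboxed inputs would also need offset handling):

frame_w, frame_h, model_input_size = 1920, 1080, 640
scale_x, scale_y = frame_w / model_input_size, frame_h / model_input_size   # 3.0, 1.6875
x1, y1, x2, y2 = 100, 200, 300, 400                                         # bbox in model space
print([x1 * scale_x, y1 * scale_y, x2 * scale_x, y2 * scale_y])             # [300.0, 337.5, 900.0, 675.0]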

View file

@ -63,6 +63,10 @@ class YOLOv8Utils:
# Normalize to [0, 1] (YOLOv8 expects normalized input) # Normalize to [0, 1] (YOLOv8 expects normalized input)
frame_normalized = frame_resized / 255.0 frame_normalized = frame_resized / 255.0
# NOTE: Don't track these tensors - they're short-lived inputs to TensorRT
# that get automatically freed by PyTorch after inference completes.
# Tracking them would show false "leaks" since we can't track when TensorRT consumes them.
return frame_normalized return frame_normalized
@staticmethod @staticmethod

View file

@ -43,8 +43,8 @@ async def example_callback_pattern():
poll_interval=0.01, # 100 FPS poll_interval=0.01, # 100 FPS
) )
# Initialize with YOLOv8 model # Initialize with YOLOv8 model (transparent loading: .pt, .engine, or .trt)
model_path = "models/yolov8n.trt" # Adjust path as needed model_path = "models/yolov8n.trt" # Can also use .pt or .engine
if not os.path.exists(model_path): if not os.path.exists(model_path):
logger.error(f"Model file not found: {model_path}") logger.error(f"Model file not found: {model_path}")
return return
@ -53,7 +53,8 @@ async def example_callback_pattern():
model_path=model_path, model_path=model_path,
model_id="yolo", model_id="yolo",
preprocess_fn=YOLOv8Utils.preprocess, preprocess_fn=YOLOv8Utils.preprocess,
postprocess_fn=YOLOv8Utils.postprocess, postprocess_fn=YOLOv8Utils.postprocess
# Note: No manual parameters needed for YOLO models
) )
# Define callback for tracking results # Define callback for tracking results

View file

@ -24,7 +24,6 @@ from services import (
# Load environment variables # Load environment variables
load_dotenv() load_dotenv()
def main_single_stream(): def main_single_stream():
"""Single stream example with event-driven architecture.""" """Single stream example with event-driven architecture."""
print("=" * 80) print("=" * 80)
@ -33,7 +32,7 @@ def main_single_stream():
# Configuration # Configuration
GPU_ID = 0 GPU_ID = 0
MODEL_PATH = "bangchak/models/frontal_detection_v5.pt" # PT file will be auto-converted MODEL_PATH = "bangchak/models/frontal_detection_v5.pt" # Transparent loading: .pt, .engine, or .trt
STREAM_URL = os.getenv('CAMERA_URL_1', 'rtsp://localhost:8554/test') STREAM_URL = os.getenv('CAMERA_URL_1', 'rtsp://localhost:8554/test')
BATCH_SIZE = 4 BATCH_SIZE = 4
FORCE_TIMEOUT = 0.05 FORCE_TIMEOUT = 0.05
@ -59,10 +58,10 @@ def main_single_stream():
) )
print("✓ Manager created") print("✓ Manager created")
# Initialize with PT model (auto-conversion) # Initialize with model (transparent loading - no manual parameters needed)
print("\n[2/3] Initializing with PT model...") print("\n[2/3] Initializing model...")
print("Note: First load will convert PT to TensorRT (3-5 minutes)") print("Note: YOLO models auto-convert to native TensorRT .engine (first time only)")
print("Subsequent loads will use cached TensorRT engine\n") print("Metadata is auto-detected from model - no manual input_shapes needed!\n")
try: try:
manager.initialize( manager.initialize(
@ -70,11 +69,10 @@ def main_single_stream():
model_id="detector", model_id="detector",
preprocess_fn=YOLOv8Utils.preprocess, preprocess_fn=YOLOv8Utils.preprocess,
postprocess_fn=YOLOv8Utils.postprocess, postprocess_fn=YOLOv8Utils.postprocess,
num_contexts=4, num_contexts=4
pt_input_shapes={"images": (1, 3, 640, 640)}, # Note: No pt_input_shapes or pt_precision needed for YOLO models!
pt_precision=torch.float16
) )
print("✓ Manager initialized (PT converted to TensorRT)") print("✓ Manager initialized")
except Exception as e: except Exception as e:
print(f"✗ Failed to initialize: {e}") print(f"✗ Failed to initialize: {e}")
import traceback import traceback
@ -176,6 +174,7 @@ def main_single_stream():
class_counts[obj.class_name] = class_counts.get(obj.class_name, 0) + 1 class_counts[obj.class_name] = class_counts.get(obj.class_name, 0) + 1
print(f" Classes: {class_counts}") print(f" Classes: {class_counts}")
except KeyboardInterrupt: except KeyboardInterrupt:
print(f"\n✓ Interrupted by user") print(f"\n✓ Interrupted by user")
@ -206,7 +205,7 @@ def main_multi_stream():
# Configuration # Configuration
GPU_ID = 0 GPU_ID = 0
MODEL_PATH = "models/yolov8n.pt" # PT file will be auto-converted MODEL_PATH = "bangchak/models/frontal_detection_v5.pt" # Transparent loading: .pt, .engine, or .trt
BATCH_SIZE = 16 BATCH_SIZE = 16
FORCE_TIMEOUT = 0.05 FORCE_TIMEOUT = 0.05
@ -241,17 +240,16 @@ def main_multi_stream():
) )
print("✓ Manager created") print("✓ Manager created")
# Initialize with PT model # Initialize model (transparent loading)
print("\n[2/3] Initializing with PT model...") print("\n[2/3] Initializing model...")
try: try:
manager.initialize( manager.initialize(
model_path=MODEL_PATH, model_path=MODEL_PATH,
model_id="detector", model_id="detector",
preprocess_fn=YOLOv8Utils.preprocess, preprocess_fn=YOLOv8Utils.preprocess,
postprocess_fn=YOLOv8Utils.postprocess, postprocess_fn=YOLOv8Utils.postprocess,
num_contexts=8, num_contexts=8
pt_input_shapes={"images": (1, 3, 640, 640)}, # Note: No pt_input_shapes or pt_precision needed for YOLO models!
pt_precision=torch.float16
) )
print("✓ Manager initialized") print("✓ Manager initialized")
except Exception as e: except Exception as e:
@ -312,6 +310,7 @@ def main_multi_stream():
s_fps = stats['count'] / s_elapsed if s_elapsed > 0 else 0 s_fps = stats['count'] / s_elapsed if s_elapsed > 0 else 0
print(f" {sid}: {stats['count']} ({s_fps:.1f} FPS)") print(f" {sid}: {stats['count']} ({s_fps:.1f} FPS)")
except KeyboardInterrupt: except KeyboardInterrupt:
print(f"\n✓ Interrupted") print(f"\n✓ Interrupted")