nms optimization

This commit is contained in:
Siwat Sirichai 2025-11-09 11:47:18 +07:00
parent 81bbb0074e
commit 8e20496fa7
5 changed files with 907 additions and 26 deletions

218
test_profiling.py Normal file
View file

@ -0,0 +1,218 @@
"""
Detailed Profiling Script to Identify Performance Bottlenecks
This script profiles each component separately:
1. Video decoding (NVDEC)
2. Preprocessing
3. TensorRT inference
4. Postprocessing (including NMS)
5. Tracking (IOU matching)
"""
import time
import os
import torch
from dotenv import load_dotenv
from services import (
StreamDecoderFactory,
TensorRTModelRepository,
TrackingFactory,
YOLOv8Utils,
COCO_CLASSES,
)
# Load variables from a local .env file into the process environment before
# main() reads CAMERA_URL_1 via os.getenv().
load_dotenv()
def profile_component(name, iterations=100):
    """Build a decorator that times a callable over repeated invocations.

    The decorated function is called ``iterations`` times with the same
    arguments.  The wall-clock duration of every call is recorded in
    milliseconds and a summary (average, min, max, throughput) is printed.
    The wrapper returns the value produced by the last invocation.

    Args:
        name: Label printed above the timing summary.
        iterations: Number of timed calls to make; must be at least 1.

    Returns:
        A decorator that wraps a function with the timing loop.

    Raises:
        ValueError: If ``iterations`` is smaller than 1 (there would be
            nothing to average and no result to return).
    """
    from functools import wraps

    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    def decorator(func):
        @wraps(func)  # keep the wrapped function's name/docstring
        def wrapper(*args, **kwargs):
            times = []
            result = None
            # NOTE(review): only host-side wall time is measured. If the
            # profiled callable launches asynchronous GPU work without
            # blocking, the numbers will under-report - confirm per callable.
            for _ in range(iterations):
                # perf_counter() is monotonic and high-resolution, unlike
                # time.time(), which can jump and may have coarse granularity.
                start = time.perf_counter()
                result = func(*args, **kwargs)
                elapsed = time.perf_counter() - start
                times.append(elapsed * 1000)  # Convert to ms
            avg_time = sum(times) / len(times)
            min_time = min(times)
            max_time = max(times)
            # Guard against a zero average (calls faster than clock resolution).
            throughput = 1000 / avg_time if avg_time > 0 else float("inf")
            print(f"\n{name}:")
            print(f" Iterations: {iterations}")
            print(f" Average: {avg_time:.2f} ms")
            print(f" Min: {min_time:.2f} ms")
            print(f" Max: {max_time:.2f} ms")
            print(f" Throughput: {throughput:.2f} FPS")
            return result
        return wrapper
    return decorator
def _wait_for_stream(decoder, timeout_s=30):
    """Poll ``decoder.is_connected()`` once per second until it reports True.

    Prints a progress line every fifth second. Returns True as soon as the
    decoder reports a connection, or False after ``timeout_s`` seconds.
    """
    for i in range(timeout_s):
        time.sleep(1)
        if decoder.is_connected():
            print(f"✓ Stream connected after {i+1} seconds")
            return True
        if i % 5 == 0:
            print(f" Waiting... {i+1}/{timeout_s} seconds")
    return False


def _run_profiles(decoder, model_repo, controller):
    """Time each pipeline stage on a live stream and print the results.

    Returns False when no sample frame could be fetched (nothing is
    profiled in that case), True once every profile has run.
    """
    print("=" * 80)
    print("PROFILING RESULTS")
    print("=" * 80)
    # Wait for frames to buffer
    time.sleep(2)
    # Get a sample frame for testing
    frame_gpu = decoder.get_latest_frame(rgb=True)
    if frame_gpu is None:
        print("⚠ No frames available")
        return False
    print(f"\nFrame shape: {frame_gpu.shape}")
    print(f"Frame device: {frame_gpu.device}")
    print(f"Frame dtype: {frame_gpu.dtype}")

    # Profile 1: Video Decoding
    # NOTE(review): this times get_latest_frame(); it presumably reads an
    # already-decoded frame from the decoder's buffer - confirm that this
    # really reflects NVDEC decode cost.
    @profile_component("1. Video Decoding (NVDEC)", iterations=100)
    def profile_decoding():
        return decoder.get_latest_frame(rgb=True)

    profile_decoding()

    # Profile 2: Preprocessing
    @profile_component("2. Preprocessing (Resize + Normalize)", iterations=100)
    def profile_preprocessing():
        return YOLOv8Utils.preprocess(frame_gpu)

    preprocessed = profile_preprocessing()

    # Profile 3: TensorRT Inference
    @profile_component("3. TensorRT Inference", iterations=100)
    def profile_inference():
        return model_repo.infer(
            model_id="detector",
            inputs={"images": preprocessed},
            synchronize=True
        )

    outputs = profile_inference()

    # Profile 4: Postprocessing (including NMS)
    @profile_component("4. Postprocessing (NMS + Format Conversion)", iterations=100)
    def profile_postprocessing():
        return YOLOv8Utils.postprocess(outputs)

    detections = profile_postprocessing()
    print(f"\nDetections shape: {detections.shape}")
    print(f"Number of detections: {len(detections)}")

    # Profile 5: Full Pipeline (Tracking)
    @profile_component("5. Full Tracking Pipeline", iterations=50)
    def profile_full_pipeline():
        frame = decoder.get_latest_frame(rgb=True)
        if frame is None:
            return []
        return controller.track(
            frame,
            preprocess_fn=YOLOv8Utils.preprocess,
            postprocess_fn=YOLOv8Utils.postprocess
        )

    profile_full_pipeline()

    # Profile 6: Parallel inference (simulate multi-camera)
    print("\n" + "=" * 80)
    print("MULTI-CAMERA SIMULATION")
    print("=" * 80)
    num_cameras = 4
    print(f"\nSimulating {num_cameras} cameras processing sequentially...")

    @profile_component(f"Sequential Processing ({num_cameras} cameras)", iterations=20)
    def profile_sequential():
        # The same decoder/controller pair is reused for every simulated camera.
        for _ in range(num_cameras):
            frame = decoder.get_latest_frame(rgb=True)
            if frame is not None:
                controller.track(
                    frame,
                    preprocess_fn=YOLOv8Utils.preprocess,
                    postprocess_fn=YOLOv8Utils.postprocess
                )

    profile_sequential()
    return True


def main():
    """Profile each stage of the detection/tracking pipeline on a live stream.

    Loads the TensorRT model, creates an IOU tracking controller and a
    stream decoder for the URL in the ``CAMERA_URL_1`` environment variable,
    waits up to 30 seconds for the stream to connect, runs the per-stage
    profiles and finally prints a bottleneck interpretation guide.

    The decoder is always stopped once it has been started, including when
    the stream never connects, no frame is available, or profiling raises.
    """
    print("=" * 80)
    print("PERFORMANCE PROFILING - Component Breakdown")
    print("=" * 80)
    GPU_ID = 0
    MODEL_PATH = "models/yolov8n.trt"
    RTSP_URL = os.getenv('CAMERA_URL_1')
    if not RTSP_URL:
        # Fail early with a clear message instead of handing None to the decoder.
        print("⚠ CAMERA_URL_1 is not set; cannot open a stream")
        return
    # Initialize components
    print("\nInitializing components...")
    model_repo = TensorRTModelRepository(gpu_id=GPU_ID, default_num_contexts=4)
    model_repo.load_model("detector", MODEL_PATH, num_contexts=4)
    tracking_factory = TrackingFactory(gpu_id=GPU_ID)
    controller = tracking_factory.create_controller(
        model_repository=model_repo,
        model_id="detector",
        tracker_type="iou",
        max_age=30,
        min_confidence=0.5,
        iou_threshold=0.3,
        class_names=COCO_CLASSES
    )
    stream_factory = StreamDecoderFactory(gpu_id=GPU_ID)
    decoder = stream_factory.create_decoder(RTSP_URL, buffer_size=30)
    decoder.start()
    try:
        print("Waiting for stream connection...")
        if not _wait_for_stream(decoder, timeout_s=30):
            print("⚠ Stream not connected after 30 seconds")
            return
        print("✓ Stream connected\n")
        if not _run_profiles(decoder, model_repo, controller):
            return
    finally:
        # Cleanup: runs on every exit path so the started decoder is not leaked.
        decoder.stop()
    # Summary
    print("\n" + "=" * 80)
    print("BOTTLENECK ANALYSIS")
    print("=" * 80)
    print("""
Based on the profiling results above, identify the bottleneck:
1. If "TensorRT Inference" is the slowest:
GPU compute is the bottleneck
Solutions: Lower resolution, smaller model, batch processing
2. If "Postprocessing (NMS)" is slow:
CPU/GPU synchronization or NMS is slow
Solutions: Optimize NMS, reduce detections threshold
3. If "Video Decoding" is slow:
NVDEC is the bottleneck
Solutions: Lower resolution streams, fewer cameras per decoder
4. If "Sequential Processing" time (single pipeline time × num_cameras):
No parallelization, processing is sequential
Solutions: Async processing, CUDA streams, batching
Expected bottleneck: TensorRT Inference (most compute-intensive)
""")
# Run the profiling session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()