nms optimization
This commit is contained in:
parent
81bbb0074e
commit
8e20496fa7
5 changed files with 907 additions and 26 deletions
218
test_profiling.py
Normal file
218
test_profiling.py
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
"""
|
||||
Detailed Profiling Script to Identify Performance Bottlenecks
|
||||
|
||||
This script profiles each component separately:
|
||||
1. Video decoding (NVDEC)
|
||||
2. Preprocessing
|
||||
3. TensorRT inference
|
||||
4. Postprocessing (including NMS)
|
||||
5. Tracking (IOU matching)
|
||||
"""
|
||||
|
||||
import time
|
||||
import os
|
||||
import torch
|
||||
from dotenv import load_dotenv
|
||||
from services import (
|
||||
StreamDecoderFactory,
|
||||
TensorRTModelRepository,
|
||||
TrackingFactory,
|
||||
YOLOv8Utils,
|
||||
COCO_CLASSES,
|
||||
)
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def profile_component(name, iterations=100):
    """Build a decorator that times repeated calls of a function.

    The decorated function is invoked ``iterations`` times with the same
    arguments. Per-call latency is collected in milliseconds and a short
    report (average, min, max, throughput) is printed under ``name``.

    Args:
        name: Label printed as the heading of the timing report.
        iterations: Number of timed calls to make; must be at least 1.

    Returns:
        A decorator. The wrapped callable returns the result of the last
        timed invocation.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    # Local import keeps this block self-contained without touching the
    # module-level import list.
    import functools

    if iterations < 1:
        # Without this guard the report would fail later with a confusing
        # ZeroDivisionError when averaging an empty list.
        raise ValueError("iterations must be at least 1")

    def decorator(func):
        @functools.wraps(func)  # keep the profiled function's name/docstring
        def wrapper(*args, **kwargs):
            times = []
            result = None
            for _ in range(iterations):
                # perf_counter is monotonic and high-resolution; time.time()
                # is a wall clock that can jump and may be coarse.
                start = time.perf_counter()
                result = func(*args, **kwargs)
                elapsed = time.perf_counter() - start
                times.append(elapsed * 1000)  # Convert to ms

            avg_time = sum(times) / len(times)
            min_time = min(times)
            max_time = max(times)
            # A zero average is possible for trivially fast calls; report
            # infinite throughput instead of dividing by zero.
            throughput = 1000 / avg_time if avg_time > 0 else float("inf")

            print(f"\n{name}:")
            print(f" Iterations: {iterations}")
            print(f" Average: {avg_time:.2f} ms")
            print(f" Min: {min_time:.2f} ms")
            print(f" Max: {max_time:.2f} ms")
            print(f" Throughput: {throughput:.2f} FPS")

            return result
        return wrapper
    return decorator
|
||||
|
||||
|
||||
def _wait_for_connection(decoder, timeout_seconds=30):
    """Poll the decoder once per second until it reports a connection.

    Returns True as soon as ``decoder.is_connected()`` is truthy, or False
    when ``timeout_seconds`` polls have passed without a connection.
    """
    for elapsed in range(timeout_seconds):
        time.sleep(1)
        if decoder.is_connected():
            print(f"✓ Stream connected after {elapsed+1} seconds")
            return True
        if elapsed % 5 == 0:
            print(f" Waiting... {elapsed+1}/{timeout_seconds} seconds")
    return False


def _run_profiles(decoder, model_repo, controller, gpu_id):
    """Profile each pipeline stage against the live decoder.

    Returns True when all profiles ran, False when no frame was available.
    """

    def sync_gpu():
        # CUDA work is queued asynchronously, so a stage can return before
        # its kernels finish. Waiting here makes the measured time include
        # the queued work (assumes the stage runs on CUDA device ``gpu_id``
        # -- TODO confirm against YOLOv8Utils).
        if torch.cuda.is_available():
            torch.cuda.synchronize(gpu_id)

    # Wait for frames to buffer
    time.sleep(2)

    # Get a sample frame for testing
    frame_gpu = decoder.get_latest_frame(rgb=True)
    if frame_gpu is None:
        print("⚠ No frames available")
        return False

    print(f"\nFrame shape: {frame_gpu.shape}")
    print(f"Frame device: {frame_gpu.device}")
    print(f"Frame dtype: {frame_gpu.dtype}")

    # Profile 1: Video Decoding
    @profile_component("1. Video Decoding (NVDEC)", iterations=100)
    def profile_decoding():
        return decoder.get_latest_frame(rgb=True)

    profile_decoding()

    # Profile 2: Preprocessing
    @profile_component("2. Preprocessing (Resize + Normalize)", iterations=100)
    def profile_preprocessing():
        tensor = YOLOv8Utils.preprocess(frame_gpu)
        sync_gpu()
        return tensor

    preprocessed = profile_preprocessing()

    # Profile 3: TensorRT Inference
    @profile_component("3. TensorRT Inference", iterations=100)
    def profile_inference():
        return model_repo.infer(
            model_id="detector",
            inputs={"images": preprocessed},
            synchronize=True,
        )

    outputs = profile_inference()

    # Profile 4: Postprocessing (including NMS)
    @profile_component("4. Postprocessing (NMS + Format Conversion)", iterations=100)
    def profile_postprocessing():
        result = YOLOv8Utils.postprocess(outputs)
        sync_gpu()
        return result

    detections = profile_postprocessing()

    print(f"\nDetections shape: {detections.shape}")
    print(f"Number of detections: {len(detections)}")

    # Profile 5: Full Pipeline (Tracking)
    @profile_component("5. Full Tracking Pipeline", iterations=50)
    def profile_full_pipeline():
        frame = decoder.get_latest_frame(rgb=True)
        if frame is None:
            return []
        return controller.track(
            frame,
            preprocess_fn=YOLOv8Utils.preprocess,
            postprocess_fn=YOLOv8Utils.postprocess,
        )

    profile_full_pipeline()

    # Profile 6: Parallel inference (simulate multi-camera)
    print("\n" + "=" * 80)
    print("MULTI-CAMERA SIMULATION")
    print("=" * 80)

    num_cameras = 4
    print(f"\nSimulating {num_cameras} cameras processing sequentially...")

    @profile_component(f"Sequential Processing ({num_cameras} cameras)", iterations=20)
    def profile_sequential():
        for _ in range(num_cameras):
            frame = decoder.get_latest_frame(rgb=True)
            if frame is not None:
                controller.track(
                    frame,
                    preprocess_fn=YOLOv8Utils.preprocess,
                    postprocess_fn=YOLOv8Utils.postprocess,
                )

    profile_sequential()
    return True


def _print_bottleneck_guide():
    """Print the static guide for interpreting the profiling numbers."""
    print("\n" + "=" * 80)
    print("BOTTLENECK ANALYSIS")
    print("=" * 80)

    print("""
Based on the profiling results above, identify the bottleneck:

1. If "TensorRT Inference" is the slowest:
→ GPU compute is the bottleneck
→ Solutions: Lower resolution, smaller model, batch processing

2. If "Postprocessing (NMS)" is slow:
→ CPU/GPU synchronization or NMS is slow
→ Solutions: Optimize NMS, reduce detections threshold

3. If "Video Decoding" is slow:
→ NVDEC is the bottleneck
→ Solutions: Lower resolution streams, fewer cameras per decoder

4. If "Sequential Processing" time ≈ (single pipeline time × num_cameras):
→ No parallelization, processing is sequential
→ Solutions: Async processing, CUDA streams, batching

Expected bottleneck: TensorRT Inference (most compute-intensive)
""")


def main():
    """Profile decoding, preprocessing, inference, postprocessing and tracking.

    Reads the stream URL from the ``CAMERA_URL_1`` environment variable,
    loads the detector model, starts a stream decoder and prints a timing
    report for each pipeline stage, followed by a guide for reading the
    results. The decoder is always stopped, including on early exits and
    on errors raised while profiling.
    """
    print("=" * 80)
    print("PERFORMANCE PROFILING - Component Breakdown")
    print("=" * 80)

    gpu_id = 0
    model_path = "models/yolov8n.trt"
    rtsp_url = os.getenv('CAMERA_URL_1')
    if not rtsp_url:
        # os.getenv returns None when the variable is missing; stop before
        # loading the model or handing None to the decoder factory.
        print("⚠ CAMERA_URL_1 is not set; cannot open a stream")
        return

    # Initialize components
    print("\nInitializing components...")
    model_repo = TensorRTModelRepository(gpu_id=gpu_id, default_num_contexts=4)
    model_repo.load_model("detector", model_path, num_contexts=4)

    tracking_factory = TrackingFactory(gpu_id=gpu_id)
    controller = tracking_factory.create_controller(
        model_repository=model_repo,
        model_id="detector",
        tracker_type="iou",
        max_age=30,
        min_confidence=0.5,
        iou_threshold=0.3,
        class_names=COCO_CLASSES,
    )

    stream_factory = StreamDecoderFactory(gpu_id=gpu_id)
    decoder = stream_factory.create_decoder(rtsp_url, buffer_size=30)
    decoder.start()

    # From here on the decoder is running, so every exit path (timeout,
    # missing frames, exceptions) must stop it.
    try:
        print("Waiting for stream connection...")
        if not _wait_for_connection(decoder, timeout_seconds=30):
            print("⚠ Stream not connected after 30 seconds")
            return

        print("✓ Stream connected\n")
        print("=" * 80)
        print("PROFILING RESULTS")
        print("=" * 80)

        completed = _run_profiles(decoder, model_repo, controller, gpu_id)
    finally:
        # Cleanup
        decoder.stop()

    if completed:
        # Summary
        _print_bottleneck_guide()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the profiling session only when this file is
    # executed directly, not when it is imported.
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue