"""
|
||
Detailed Profiling Script to Identify Performance Bottlenecks
|
||
|
||
This script profiles each component separately:
|
||
1. Video decoding (NVDEC)
|
||
2. Preprocessing
|
||
3. TensorRT inference
|
||
4. Postprocessing (including NMS)
|
||
5. Tracking (IOU matching)
|
||
"""
|
||
|
||
import time
|
||
import os
|
||
import torch
|
||
from dotenv import load_dotenv
|
||
from services import (
|
||
StreamDecoderFactory,
|
||
TensorRTModelRepository,
|
||
TrackingFactory,
|
||
YOLOv8Utils,
|
||
COCO_CLASSES,
|
||
)
|
||
|
||
load_dotenv()
|
||
|
||
|
||
def profile_component(name, iterations=100):
    """Decorator for profiling a component."""
    def decorator(func):
        def wrapper(*args, **kwargs):
            times = []
            for _ in range(iterations):
                start = time.time()
                result = func(*args, **kwargs)
                elapsed = time.time() - start
                times.append(elapsed * 1000)  # Convert to ms

            avg_time = sum(times) / len(times)
            min_time = min(times)
            max_time = max(times)

            print(f"\n{name}:")
            print(f"  Iterations: {iterations}")
            print(f"  Average: {avg_time:.2f} ms")
            print(f"  Min: {min_time:.2f} ms")
            print(f"  Max: {max_time:.2f} ms")
            print(f"  Throughput: {1000 / avg_time:.2f} FPS")

            return result
        return wrapper
    return decorator

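
# Note: the decorator above measures wall-clock time, which is only accurate
# for GPU work when each profiled call synchronizes internally (as the
# inference profile below does via synchronize=True). A minimal alternative
# sketch using CUDA events; `time_on_gpu` is a hypothetical helper added for
# illustration and is not used by the profiles below.
def time_on_gpu(func, *args, **kwargs):
    """Return (result, elapsed_ms) for one call, timed with CUDA events."""
    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)
    start_evt.record()
    result = func(*args, **kwargs)
    end_evt.record()
    torch.cuda.synchronize()  # wait for all queued GPU work to finish
    return result, start_evt.elapsed_time(end_evt)
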
def main():
    print("=" * 80)
    print("PERFORMANCE PROFILING - Component Breakdown")
    print("=" * 80)

    GPU_ID = 0
    MODEL_PATH = "models/yolov8n.trt"
    RTSP_URL = os.getenv("CAMERA_URL_1")
    if not RTSP_URL:
        print("⚠ CAMERA_URL_1 is not set (check your .env file)")
        return

    # Initialize components
    print("\nInitializing components...")
    model_repo = TensorRTModelRepository(gpu_id=GPU_ID, default_num_contexts=4)
    model_repo.load_model("detector", MODEL_PATH, num_contexts=4)

    tracking_factory = TrackingFactory(gpu_id=GPU_ID)
    controller = tracking_factory.create_controller(
        model_repository=model_repo,
        model_id="detector",
        tracker_type="iou",
        max_age=30,
        min_confidence=0.5,
        iou_threshold=0.3,
        class_names=COCO_CLASSES,
    )

    stream_factory = StreamDecoderFactory(gpu_id=GPU_ID)
    decoder = stream_factory.create_decoder(RTSP_URL, buffer_size=30)
    decoder.start()

print("Waiting for stream connection...")
|
||
connected = False
|
||
for i in range(30):
|
||
time.sleep(1)
|
||
if decoder.is_connected():
|
||
connected = True
|
||
print(f"✓ Stream connected after {i+1} seconds")
|
||
break
|
||
if i % 5 == 0:
|
||
print(f" Waiting... {i+1}/30 seconds")
|
||
|
||
if not connected:
|
||
print("⚠ Stream not connected after 30 seconds")
|
||
return
|
||
|
||
print("✓ Stream connected\n")
|
||
print("=" * 80)
|
||
print("PROFILING RESULTS")
|
||
print("=" * 80)
|
||
|
||
    # Wait for frames to buffer
    time.sleep(2)

    # Get a sample frame for testing
    frame_gpu = decoder.get_latest_frame(rgb=True)
    if frame_gpu is None:
        print("⚠ No frames available")
        return

    print(f"\nFrame shape: {frame_gpu.shape}")
    print(f"Frame device: {frame_gpu.device}")
    print(f"Frame dtype: {frame_gpu.dtype}")

    # Profile 1: Video Decoding
    @profile_component("1. Video Decoding (NVDEC)", iterations=100)
    def profile_decoding():
        return decoder.get_latest_frame(rgb=True)

    profile_decoding()

    # Profile 2: Preprocessing
    @profile_component("2. Preprocessing (Resize + Normalize)", iterations=100)
    def profile_preprocessing():
        return YOLOv8Utils.preprocess(frame_gpu)

    preprocessed = profile_preprocessing()

    # Profile 3: TensorRT Inference
    @profile_component("3. TensorRT Inference", iterations=100)
    def profile_inference():
        return model_repo.infer(
            model_id="detector",
            inputs={"images": preprocessed},
            synchronize=True,
        )

    outputs = profile_inference()

    # Profile 4: Postprocessing (including NMS)
    @profile_component("4. Postprocessing (NMS + Format Conversion)", iterations=100)
    def profile_postprocessing():
        return YOLOv8Utils.postprocess(outputs)

    detections = profile_postprocessing()

    print(f"\nDetections shape: {detections.shape}")
    print(f"Number of detections: {len(detections)}")

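    # If postprocessing dominates, one remedy from the analysis printed below
    # is to keep NMS on the GPU. A minimal sketch using torchvision's NMS op;
    # `gpu_nms_sketch` is a hypothetical helper added for illustration (it
    # assumes xyxy boxes of shape (N, 4) and scores of shape (N,) on the same
    # device, plus torchvision installed) and is not called in this script.
    def gpu_nms_sketch(boxes, scores, iou_threshold=0.45):
        import torchvision  # local import: only needed if this sketch is used
        keep = torchvision.ops.nms(boxes, scores, iou_threshold)
        return boxes[keep], scores[keep]
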
    # Profile 5: Full Pipeline (Tracking)
    @profile_component("5. Full Tracking Pipeline", iterations=50)
    def profile_full_pipeline():
        frame = decoder.get_latest_frame(rgb=True)
        if frame is None:
            return []
        return controller.track(
            frame,
            preprocess_fn=YOLOv8Utils.preprocess,
            postprocess_fn=YOLOv8Utils.postprocess,
        )

    profile_full_pipeline()

    # Profile 6: Multi-camera simulation (sequential processing of one stream)
    print("\n" + "=" * 80)
    print("MULTI-CAMERA SIMULATION")
    print("=" * 80)

    num_cameras = 4
    print(f"\nSimulating {num_cameras} cameras processing sequentially...")

    @profile_component(f"Sequential Processing ({num_cameras} cameras)", iterations=20)
    def profile_sequential():
        for _ in range(num_cameras):
            frame = decoder.get_latest_frame(rgb=True)
            if frame is not None:
                controller.track(
                    frame,
                    preprocess_fn=YOLOv8Utils.preprocess,
                    postprocess_fn=YOLOv8Utils.postprocess,
                )

    profile_sequential()

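    # One remedy suggested in the analysis below is batching: stack the
    # per-camera frames and run a single inference call. A minimal sketch,
    # assuming YOLOv8Utils.preprocess returns a (1, 3, H, W) tensor and the
    # TensorRT engine was built with a dynamic batch dimension (neither is
    # verified here); `batched_inference_sketch` is hypothetical and not called.
    def batched_inference_sketch(frames):
        batch = torch.cat([YOLOv8Utils.preprocess(f) for f in frames], dim=0)
        return model_repo.infer(
            model_id="detector",
            inputs={"images": batch},
            synchronize=True,
        )
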
    # Cleanup
    decoder.stop()

    # Summary
    print("\n" + "=" * 80)
    print("BOTTLENECK ANALYSIS")
    print("=" * 80)

    print("""
Based on the profiling results above, identify the bottleneck:

1. If "TensorRT Inference" is the slowest:
   → GPU compute is the bottleneck
   → Solutions: Lower resolution, smaller model, batch processing

2. If "Postprocessing (NMS)" is slow:
   → CPU/GPU synchronization or NMS is the bottleneck
   → Solutions: Optimize NMS, raise the confidence threshold so fewer boxes reach NMS

3. If "Video Decoding" is slow:
   → NVDEC is the bottleneck
   → Solutions: Lower resolution streams, fewer cameras per decoder

4. If "Sequential Processing" time ≈ (single pipeline time × num_cameras):
   → No parallelization, processing is sequential
   → Solutions: Async processing, CUDA streams, batching

Expected bottleneck: TensorRT Inference (most compute-intensive)
""")


if __name__ == "__main__":
    main()