# python-rtsp-worker/test_profiling.py

"""
Detailed Profiling Script to Identify Performance Bottlenecks
This script profiles each component separately:
1. Video decoding (NVDEC)
2. Preprocessing
3. TensorRT inference
4. Postprocessing (including NMS)
5. Tracking (IOU matching)
"""
import time
import os
import torch
from dotenv import load_dotenv
from services import (
    StreamDecoderFactory,
    TensorRTModelRepository,
    TrackingFactory,
    YOLOv8Utils,
    COCO_CLASSES,
)
load_dotenv()

def profile_component(name, iterations=100):
    """Decorator for profiling a component."""
    def decorator(func):
        def wrapper(*args, **kwargs):
            times = []
            for _ in range(iterations):
                start = time.time()
                result = func(*args, **kwargs)
                elapsed = time.time() - start
                times.append(elapsed * 1000)  # Convert to ms
            avg_time = sum(times) / len(times)
            min_time = min(times)
            max_time = max(times)
            print(f"\n{name}:")
            print(f"  Iterations: {iterations}")
            print(f"  Average: {avg_time:.2f} ms")
            print(f"  Min: {min_time:.2f} ms")
            print(f"  Max: {max_time:.2f} ms")
            print(f"  Throughput: {1000/avg_time:.2f} FPS")
            return result
        return wrapper
    return decorator
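

# time.time() measures wall-clock latency, which works here because each
# profiled call either runs on the host or synchronizes internally (e.g.
# infer(..., synchronize=True)). For timing individual asynchronous GPU ops,
# CUDA events are more precise. A minimal alternative sketch, assuming torch
# with CUDA available; nothing below uses it, it is illustration only:
def profile_component_cuda(name, iterations=100):
    """Variant of profile_component that times GPU work with CUDA events."""
    def decorator(func):
        def wrapper(*args, **kwargs):
            times = []
            start_evt = torch.cuda.Event(enable_timing=True)
            end_evt = torch.cuda.Event(enable_timing=True)
            for _ in range(iterations):
                start_evt.record()
                result = func(*args, **kwargs)
                end_evt.record()
                torch.cuda.synchronize()  # ensure elapsed_time() is valid
                times.append(start_evt.elapsed_time(end_evt))  # already in ms
            avg_time = sum(times) / len(times)
            print(f"\n{name}: avg {avg_time:.2f} ms over {iterations} runs "
                  f"({1000 / avg_time:.2f} FPS)")
            return result
        return wrapper
    return decorator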


def main():
    print("=" * 80)
    print("PERFORMANCE PROFILING - Component Breakdown")
    print("=" * 80)

    GPU_ID = 0
    MODEL_PATH = "models/yolov8n.trt"
    RTSP_URL = os.getenv('CAMERA_URL_1')
    if not RTSP_URL:
        print("⚠ CAMERA_URL_1 is not set in the environment")
        return

    # Initialize components
    print("\nInitializing components...")
    model_repo = TensorRTModelRepository(gpu_id=GPU_ID, default_num_contexts=4)
    model_repo.load_model("detector", MODEL_PATH, num_contexts=4)

    tracking_factory = TrackingFactory(gpu_id=GPU_ID)
    controller = tracking_factory.create_controller(
        model_repository=model_repo,
        model_id="detector",
        tracker_type="iou",
        max_age=30,
        min_confidence=0.5,
        iou_threshold=0.3,
        class_names=COCO_CLASSES,
    )

    stream_factory = StreamDecoderFactory(gpu_id=GPU_ID)
    decoder = stream_factory.create_decoder(RTSP_URL, buffer_size=30)
    decoder.start()

    print("Waiting for stream connection...")
    connected = False
    for i in range(30):
        time.sleep(1)
        if decoder.is_connected():
            connected = True
            print(f"✓ Stream connected after {i+1} seconds")
            break
        if i % 5 == 0:
            print(f"  Waiting... {i+1}/30 seconds")
    if not connected:
        print("⚠ Stream not connected after 30 seconds")
        return
    print("✓ Stream connected\n")

    print("=" * 80)
    print("PROFILING RESULTS")
    print("=" * 80)

    # Wait for frames to buffer
    time.sleep(2)

    # Get a sample frame for testing
    frame_gpu = decoder.get_latest_frame(rgb=True)
    if frame_gpu is None:
        print("⚠ No frames available")
        return
    print(f"\nFrame shape: {frame_gpu.shape}")
    print(f"Frame device: {frame_gpu.device}")
    print(f"Frame dtype: {frame_gpu.dtype}")

    # Profile 1: Video Decoding
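    # Note (assumption): decoding runs continuously in the decoder's own
    # pipeline and get_latest_frame() returns the newest buffered frame, so
    # this profile measures frame-access latency rather than raw NVDEC
    # decode throughput.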
@profile_component("1. Video Decoding (NVDEC)", iterations=100)
def profile_decoding():
return decoder.get_latest_frame(rgb=True)
profile_decoding()
# Profile 2: Preprocessing
@profile_component("2. Preprocessing (Resize + Normalize)", iterations=100)
def profile_preprocessing():
return YOLOv8Utils.preprocess(frame_gpu)
preprocessed = profile_preprocessing()
# Profile 3: TensorRT Inference
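    # synchronize=True makes infer() block until GPU work completes, so the
    # wall-clock timing below reflects true end-to-end inference latency.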
@profile_component("3. TensorRT Inference", iterations=100)
def profile_inference():
return model_repo.infer(
model_id="detector",
inputs={"images": preprocessed},
synchronize=True
)
outputs = profile_inference()
# Profile 4: Postprocessing (including NMS)
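    # Assumption: YOLOv8Utils.postprocess applies confidence filtering and
    # NMS; any device-to-host copy it performs forces a GPU sync, which is
    # included in this measurement.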
@profile_component("4. Postprocessing (NMS + Format Conversion)", iterations=100)
def profile_postprocessing():
return YOLOv8Utils.postprocess(outputs)
detections = profile_postprocessing()
print(f"\nDetections shape: {detections.shape}")
print(f"Number of detections: {len(detections)}")
# Profile 5: Full Pipeline (Tracking)
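    # The full pipeline chains frame fetch, preprocessing, inference,
    # postprocessing, and IOU track association, so its time should be
    # roughly the sum of profiles 1-4 plus tracking overhead.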
@profile_component("5. Full Tracking Pipeline", iterations=50)
def profile_full_pipeline():
frame = decoder.get_latest_frame(rgb=True)
if frame is None:
return []
return controller.track(
frame,
preprocess_fn=YOLOv8Utils.preprocess,
postprocess_fn=YOLOv8Utils.postprocess
)
profile_full_pipeline()
# Profile 6: Parallel inference (simulate multi-camera)
print("\n" + "=" * 80)
print("MULTI-CAMERA SIMULATION")
print("=" * 80)
num_cameras = 4
print(f"\nSimulating {num_cameras} cameras processing sequentially...")
@profile_component(f"Sequential Processing ({num_cameras} cameras)", iterations=20)
def profile_sequential():
for _ in range(num_cameras):
frame = decoder.get_latest_frame(rgb=True)
if frame is not None:
controller.track(
frame,
preprocess_fn=YOLOv8Utils.preprocess,
postprocess_fn=YOLOv8Utils.postprocess
)
profile_sequential()
# Cleanup
decoder.stop()
# Summary
print("\n" + "=" * 80)
print("BOTTLENECK ANALYSIS")
print("=" * 80)
print("""
Based on the profiling results above, identify the bottleneck:
1. If "TensorRT Inference" is the slowest:
→ GPU compute is the bottleneck
→ Solutions: Lower resolution, smaller model, batch processing
2. If "Postprocessing (NMS)" is slow:
→ CPU/GPU synchronization or NMS is slow
→ Solutions: Optimize NMS, reduce detections threshold
3. If "Video Decoding" is slow:
→ NVDEC is the bottleneck
→ Solutions: Lower resolution streams, fewer cameras per decoder
4. If "Sequential Processing" time ≈ (single pipeline time × num_cameras):
→ No parallelization, processing is sequential
→ Solutions: Async processing, CUDA streams, batching
Expected bottleneck: TensorRT Inference (most compute-intensive)
""")


if __name__ == "__main__":
    main()