"""
|
||
Detailed Profiling Script to Identify Performance Bottlenecks
|
||
|
||
This script profiles each component separately:
|
||
1. Video decoding (NVDEC)
|
||
2. Preprocessing
|
||
3. TensorRT inference
|
||
4. Postprocessing (including NMS)
|
||
5. Tracking (IOU matching)
|
||
"""
|
||
|
||
import time
|
||
import os
|
||
import torch
|
||
from dotenv import load_dotenv
|
||
from services import (
|
||
StreamDecoderFactory,
|
||
TensorRTModelRepository,
|
||
TrackingFactory,
|
||
YOLOv8Utils,
|
||
COCO_CLASSES,
|
||
)
|
||
|
||
load_dotenv()
|
||
|
||
|
||
def profile_component(name, iterations=100):
    """Decorator for profiling a component."""
    def decorator(func):
        def wrapper(*args, **kwargs):
            times = []
            for _ in range(iterations):
                start = time.time()
                result = func(*args, **kwargs)
                elapsed = time.time() - start
                times.append(elapsed * 1000)  # Convert to ms

            avg_time = sum(times) / len(times)
            min_time = min(times)
            max_time = max(times)

            print(f"\n{name}:")
            print(f"  Iterations: {iterations}")
            print(f"  Average: {avg_time:.2f} ms")
            print(f"  Min: {min_time:.2f} ms")
            print(f"  Max: {max_time:.2f} ms")
            print(f"  Throughput: {1000 / avg_time:.2f} FPS")

            return result
        return wrapper
    return decorator

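
# Note: the decorator above measures wall-clock time, which is only accurate
# for GPU work when each profiled call synchronizes internally (as the
# inference profile below does via synchronize=True). A minimal alternative
# sketch using CUDA events; `time_on_gpu` is a hypothetical helper added for
# illustration and is not used by the profiles below.
def time_on_gpu(func, *args, **kwargs):
    """Return (result, elapsed_ms) for one call, timed with CUDA events."""
    start_evt = torch.cuda.Event(enable_timing=True)
    end_evt = torch.cuda.Event(enable_timing=True)
    start_evt.record()
    result = func(*args, **kwargs)
    end_evt.record()
    torch.cuda.synchronize()  # wait for all queued GPU work to finish
    return result, start_evt.elapsed_time(end_evt)
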
def main():
    print("=" * 80)
    print("PERFORMANCE PROFILING - Component Breakdown")
    print("=" * 80)

    GPU_ID = 0
    MODEL_PATH = "models/yolov8n.trt"
    RTSP_URL = os.getenv("CAMERA_URL_1")
    if not RTSP_URL:
        print("⚠ CAMERA_URL_1 is not set (check your .env file)")
        return

    # Initialize components
    print("\nInitializing components...")
    model_repo = TensorRTModelRepository(gpu_id=GPU_ID, default_num_contexts=4)
    model_repo.load_model("detector", MODEL_PATH, num_contexts=4)

    tracking_factory = TrackingFactory(gpu_id=GPU_ID)
    controller = tracking_factory.create_controller(
        model_repository=model_repo,
        model_id="detector",
        tracker_type="iou",
        max_age=30,
        min_confidence=0.5,
        iou_threshold=0.3,
        class_names=COCO_CLASSES,
    )

    stream_factory = StreamDecoderFactory(gpu_id=GPU_ID)
    decoder = stream_factory.create_decoder(RTSP_URL, buffer_size=30)
    decoder.start()

print("Waiting for stream connection...")
|
||
connected = False
|
||
for i in range(30):
|
||
time.sleep(1)
|
||
if decoder.is_connected():
|
||
connected = True
|
||
print(f"✓ Stream connected after {i+1} seconds")
|
||
break
|
||
if i % 5 == 0:
|
||
print(f" Waiting... {i+1}/30 seconds")
|
||
|
||
if not connected:
|
||
print("⚠ Stream not connected after 30 seconds")
|
||
return
|
||
|
||
print("✓ Stream connected\n")
|
||
print("=" * 80)
|
||
print("PROFILING RESULTS")
|
||
print("=" * 80)
|
||
|
||
    # Wait for frames to buffer
    time.sleep(2)

    # Get a sample frame for testing
    frame_gpu = decoder.get_latest_frame(rgb=True)
    if frame_gpu is None:
        print("⚠ No frames available")
        return

    print(f"\nFrame shape: {frame_gpu.shape}")
    print(f"Frame device: {frame_gpu.device}")
    print(f"Frame dtype: {frame_gpu.dtype}")

    # Profile 1: Video Decoding
    @profile_component("1. Video Decoding (NVDEC)", iterations=100)
    def profile_decoding():
        return decoder.get_latest_frame(rgb=True)

    profile_decoding()

    # Profile 2: Preprocessing
    @profile_component("2. Preprocessing (Resize + Normalize)", iterations=100)
    def profile_preprocessing():
        return YOLOv8Utils.preprocess(frame_gpu)

    preprocessed = profile_preprocessing()

    # Profile 3: TensorRT Inference
    @profile_component("3. TensorRT Inference", iterations=100)
    def profile_inference():
        return model_repo.infer(
            model_id="detector",
            inputs={"images": preprocessed},
            synchronize=True,
        )

    outputs = profile_inference()

    # Profile 4: Postprocessing (including NMS)
    @profile_component("4. Postprocessing (NMS + Format Conversion)", iterations=100)
    def profile_postprocessing():
        return YOLOv8Utils.postprocess(outputs)

    detections = profile_postprocessing()

    print(f"\nDetections shape: {detections.shape}")
    print(f"Number of detections: {len(detections)}")

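    # If postprocessing dominates, one remedy from the analysis printed below
    # is to keep NMS on the GPU. A minimal sketch using torchvision's NMS op;
    # `gpu_nms_sketch` is a hypothetical helper added for illustration (it
    # assumes xyxy boxes of shape (N, 4) and scores of shape (N,) on the same
    # device, plus torchvision installed) and is not called in this script.
    def gpu_nms_sketch(boxes, scores, iou_threshold=0.45):
        import torchvision  # local import: only needed if this sketch is used
        keep = torchvision.ops.nms(boxes, scores, iou_threshold)
        return boxes[keep], scores[keep]
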
    # Profile 5: Full Pipeline (Tracking)
    @profile_component("5. Full Tracking Pipeline", iterations=50)
    def profile_full_pipeline():
        frame = decoder.get_latest_frame(rgb=True)
        if frame is None:
            return []
        return controller.track(
            frame,
            preprocess_fn=YOLOv8Utils.preprocess,
            postprocess_fn=YOLOv8Utils.postprocess,
        )

    profile_full_pipeline()

    # Profile 6: Multi-camera simulation (sequential processing of one stream)
    print("\n" + "=" * 80)
    print("MULTI-CAMERA SIMULATION")
    print("=" * 80)

    num_cameras = 4
    print(f"\nSimulating {num_cameras} cameras processing sequentially...")

    @profile_component(f"Sequential Processing ({num_cameras} cameras)", iterations=20)
    def profile_sequential():
        for _ in range(num_cameras):
            frame = decoder.get_latest_frame(rgb=True)
            if frame is not None:
                controller.track(
                    frame,
                    preprocess_fn=YOLOv8Utils.preprocess,
                    postprocess_fn=YOLOv8Utils.postprocess,
                )

    profile_sequential()

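    # One remedy suggested in the analysis below is batching: stack the
    # per-camera frames and run a single inference call. A minimal sketch,
    # assuming YOLOv8Utils.preprocess returns a (1, 3, H, W) tensor and the
    # TensorRT engine was built with a dynamic batch dimension (neither is
    # verified here); `batched_inference_sketch` is hypothetical and not called.
    def batched_inference_sketch(frames):
        batch = torch.cat([YOLOv8Utils.preprocess(f) for f in frames], dim=0)
        return model_repo.infer(
            model_id="detector",
            inputs={"images": batch},
            synchronize=True,
        )
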
    # Cleanup
    decoder.stop()

    # Summary
    print("\n" + "=" * 80)
    print("BOTTLENECK ANALYSIS")
    print("=" * 80)

    print("""
Based on the profiling results above, identify the bottleneck:

1. If "TensorRT Inference" is the slowest:
   → GPU compute is the bottleneck
   → Solutions: Lower resolution, smaller model, batch processing

2. If "Postprocessing (NMS)" is slow:
   → CPU/GPU synchronization or NMS is the bottleneck
   → Solutions: Optimize NMS, raise the confidence threshold so fewer boxes reach NMS

3. If "Video Decoding" is slow:
   → NVDEC is the bottleneck
   → Solutions: Lower resolution streams, fewer cameras per decoder

4. If "Sequential Processing" time ≈ (single pipeline time × num_cameras):
   → No parallelization, processing is sequential
   → Solutions: Async processing, CUDA streams, batching

Expected bottleneck: TensorRT Inference (most compute-intensive)
""")


if __name__ == "__main__":
    main()