nms optimization

2025-11-09 11:47:18 +07:00 · 2025-11-09 11:47:18 +07:00 · 8e20496fa7
commit 8e20496fa7
parent 81bbb0074e
5 changed files with 907 additions and 26 deletions
--- a/test_profiling.py
+++ b/test_profiling.py
@ -0,0 +1,218 @@
+"""
+Detailed Profiling Script to Identify Performance Bottlenecks
+
+This script profiles each component separately:
+1. Video decoding (NVDEC)
+2. Preprocessing
+3. TensorRT inference
+4. Postprocessing (including NMS)
+5. Tracking (IOU matching)
+"""
+
+import functools
+import os
+import time
+
+import torch
+from dotenv import load_dotenv
+
+from services import (
+    StreamDecoderFactory,
+    TensorRTModelRepository,
+    TrackingFactory,
+    YOLOv8Utils,
+    COCO_CLASSES,
+)
+
+# Load variables from a .env file into the process environment at import
+# time, before main() reads CAMERA_URL_1 via os.getenv().
+load_dotenv()
+
+
+def profile_component(name, iterations=100):
+    """Build a decorator that times repeated calls of a function and prints stats.
+
+    Each call of the decorated function runs the original ``iterations``
+    times with the same arguments. The elapsed time of every run is collected
+    in milliseconds, and the average, minimum, maximum and the derived
+    throughput (runs per second, labelled FPS) are printed under ``name``.
+
+    Only host-side elapsed time is measured; no device synchronization is
+    performed here, so the profiled callable has to block until its own work
+    is finished for the numbers to reflect that work.
+
+    Args:
+        name: Heading printed above the statistics.
+        iterations: Number of timed runs per call; must be at least 1.
+
+    Returns:
+        A decorator. The wrapper it produces returns the result of the last
+        timed run.
+
+    Raises:
+        ValueError: If ``iterations`` is less than 1.
+    """
+    # Reject a non-positive count up front instead of failing later with a
+    # ZeroDivisionError when averaging an empty list of samples.
+    if iterations < 1:
+        raise ValueError(f"iterations must be >= 1, got {iterations}")
+
+    def decorator(func):
+        @functools.wraps(func)  # keep the profiled function's name/docstring
+        def wrapper(*args, **kwargs):
+            times = []
+            for _ in range(iterations):
+                # perf_counter() is monotonic and high-resolution, unlike
+                # time.time(), which can jump when the system clock changes.
+                start = time.perf_counter()
+                result = func(*args, **kwargs)
+                elapsed = time.perf_counter() - start
+                times.append(elapsed * 1000)  # Convert to ms
+
+            avg_time = sum(times) / len(times)
+            min_time = min(times)
+            max_time = max(times)
+            # A zero average (timer resolution coarser than the call) would
+            # otherwise crash the throughput division.
+            throughput = 1000 / avg_time if avg_time > 0 else float("inf")
+
+            print(f"\n{name}:")
+            print(f"  Iterations: {iterations}")
+            print(f"  Average:    {avg_time:.2f} ms")
+            print(f"  Min:        {min_time:.2f} ms")
+            print(f"  Max:        {max_time:.2f} ms")
+            print(f"  Throughput: {throughput:.2f} FPS")
+
+            return result
+        return wrapper
+    return decorator
+
+
+def main():
+    """Profile each stage of the detection/tracking pipeline on one live stream.
+
+    Reads the stream URL from the ``CAMERA_URL_1`` environment variable,
+    loads the TensorRT model, creates a tracking controller and starts a
+    stream decoder. It then times frame retrieval, preprocessing, inference,
+    postprocessing, the full tracking call and a sequential multi-camera
+    simulation, printing statistics for each stage followed by a static
+    interpretation guide.
+
+    Returns early after printing a warning when the URL is not set, when the
+    stream does not connect within the timeout, or when no frame is
+    available. Once the decoder has been started it is always stopped, also
+    on those early exits and on exceptions.
+    """
+    print("=" * 80)
+    print("PERFORMANCE PROFILING - Component Breakdown")
+    print("=" * 80)
+
+    GPU_ID = 0
+    MODEL_PATH = "models/yolov8n.trt"
+    CONNECT_TIMEOUT_S = 30  # seconds to wait for the stream to connect
+    RTSP_URL = os.getenv('CAMERA_URL_1')
+
+    # Fail fast with a clear message instead of handing None to the decoder
+    # factory when the environment variable is missing or empty.
+    if not RTSP_URL:
+        print("⚠ CAMERA_URL_1 is not set; cannot open a stream")
+        return
+
+    # Initialize components
+    print("\nInitializing components...")
+    model_repo = TensorRTModelRepository(gpu_id=GPU_ID, default_num_contexts=4)
+    model_repo.load_model("detector", MODEL_PATH, num_contexts=4)
+
+    tracking_factory = TrackingFactory(gpu_id=GPU_ID)
+    controller = tracking_factory.create_controller(
+        model_repository=model_repo,
+        model_id="detector",
+        tracker_type="iou",
+        max_age=30,
+        min_confidence=0.5,
+        iou_threshold=0.3,
+        class_names=COCO_CLASSES
+    )
+
+    stream_factory = StreamDecoderFactory(gpu_id=GPU_ID)
+    decoder = stream_factory.create_decoder(RTSP_URL, buffer_size=30)
+    decoder.start()
+
+    # Everything that runs while the decoder is active sits inside
+    # try/finally so the decoder is stopped on early returns and on errors.
+    try:
+        print("Waiting for stream connection...")
+        connected = False
+        for i in range(CONNECT_TIMEOUT_S):
+            time.sleep(1)
+            if decoder.is_connected():
+                connected = True
+                print(f"✓ Stream connected after {i+1} seconds")
+                break
+            if i % 5 == 0:
+                print(f"  Waiting... {i+1}/{CONNECT_TIMEOUT_S} seconds")
+
+        if not connected:
+            print(f"⚠ Stream not connected after {CONNECT_TIMEOUT_S} seconds")
+            return
+
+        print("✓ Stream connected\n")
+        print("=" * 80)
+        print("PROFILING RESULTS")
+        print("=" * 80)
+
+        # Wait for frames to buffer
+        time.sleep(2)
+
+        # Get a sample frame for testing; this one frame is reused as the
+        # input for every preprocessing iteration below.
+        frame_gpu = decoder.get_latest_frame(rgb=True)
+        if frame_gpu is None:
+            print("⚠ No frames available")
+            return
+
+        print(f"\nFrame shape: {frame_gpu.shape}")
+        print(f"Frame device: {frame_gpu.device}")
+        print(f"Frame dtype: {frame_gpu.dtype}")
+
+        # Profile 1: Video Decoding
+        # NOTE(review): this times decoder.get_latest_frame(); whether that
+        # includes decode work or only a buffer read depends on the decoder
+        # implementation -- confirm before reading it as NVDEC cost.
+        @profile_component("1. Video Decoding (NVDEC)", iterations=100)
+        def profile_decoding():
+            return decoder.get_latest_frame(rgb=True)
+
+        profile_decoding()
+
+        # Profile 2: Preprocessing
+        @profile_component("2. Preprocessing (Resize + Normalize)", iterations=100)
+        def profile_preprocessing():
+            return YOLOv8Utils.preprocess(frame_gpu)
+
+        # The result of the last timed run feeds the inference stage.
+        preprocessed = profile_preprocessing()
+
+        # Profile 3: TensorRT Inference
+        @profile_component("3. TensorRT Inference", iterations=100)
+        def profile_inference():
+            return model_repo.infer(
+                model_id="detector",
+                inputs={"images": preprocessed},
+                synchronize=True
+            )
+
+        outputs = profile_inference()
+
+        # Profile 4: Postprocessing (including NMS)
+        @profile_component("4. Postprocessing (NMS + Format Conversion)", iterations=100)
+        def profile_postprocessing():
+            return YOLOv8Utils.postprocess(outputs)
+
+        detections = profile_postprocessing()
+
+        print(f"\nDetections shape: {detections.shape}")
+        print(f"Number of detections: {len(detections)}")
+
+        # Profile 5: Full Pipeline (Tracking)
+        @profile_component("5. Full Tracking Pipeline", iterations=50)
+        def profile_full_pipeline():
+            frame = decoder.get_latest_frame(rgb=True)
+            if frame is None:
+                return []
+            return controller.track(
+                frame,
+                preprocess_fn=YOLOv8Utils.preprocess,
+                postprocess_fn=YOLOv8Utils.postprocess
+            )
+
+        profile_full_pipeline()
+
+        # Profile 6: Parallel inference (simulate multi-camera)
+        print("\n" + "=" * 80)
+        print("MULTI-CAMERA SIMULATION")
+        print("=" * 80)
+
+        num_cameras = 4
+        print(f"\nSimulating {num_cameras} cameras processing sequentially...")
+
+        # All simulated cameras read from the same decoder and share the same
+        # controller; one timed run covers num_cameras track() calls.
+        @profile_component(f"Sequential Processing ({num_cameras} cameras)", iterations=20)
+        def profile_sequential():
+            for _ in range(num_cameras):
+                frame = decoder.get_latest_frame(rgb=True)
+                if frame is not None:
+                    controller.track(
+                        frame,
+                        preprocess_fn=YOLOv8Utils.preprocess,
+                        postprocess_fn=YOLOv8Utils.postprocess
+                    )
+
+        profile_sequential()
+    finally:
+        # Cleanup
+        decoder.stop()
+
+    # Summary
+    print("\n" + "=" * 80)
+    print("BOTTLENECK ANALYSIS")
+    print("=" * 80)
+
+    print("""
+Based on the profiling results above, identify the bottleneck:
+
+1. If "TensorRT Inference" is the slowest:
+   → GPU compute is the bottleneck
+   → Solutions: Lower resolution, smaller model, batch processing
+
+2. If "Postprocessing (NMS)" is slow:
+   → CPU/GPU synchronization or NMS is slow
+   → Solutions: Optimize NMS, reduce detections threshold
+
+3. If "Video Decoding" is slow:
+   → NVDEC is the bottleneck
+   → Solutions: Lower resolution streams, fewer cameras per decoder
+
+4. If "Sequential Processing" time ≈ (single pipeline time × num_cameras):
+   → No parallelization, processing is sequential
+   → Solutions: Async processing, CUDA streams, batching
+
+Expected bottleneck: TensorRT Inference (most compute-intensive)
+    """)
+
+
+# Run the profiling session only when this file is executed as a script,
+# not when it is imported.
+if __name__ == "__main__":
+    main()