""" Detailed Profiling Script to Identify Performance Bottlenecks This script profiles each component separately: 1. Video decoding (NVDEC) 2. Preprocessing 3. TensorRT inference 4. Postprocessing (including NMS) 5. Tracking (IOU matching) """ import time import os import torch from dotenv import load_dotenv from services import ( StreamDecoderFactory, TensorRTModelRepository, TrackingFactory, YOLOv8Utils, COCO_CLASSES, ) load_dotenv() def profile_component(name, iterations=100): """Decorator for profiling a component.""" def decorator(func): def wrapper(*args, **kwargs): times = [] for _ in range(iterations): start = time.time() result = func(*args, **kwargs) elapsed = time.time() - start times.append(elapsed * 1000) # Convert to ms avg_time = sum(times) / len(times) min_time = min(times) max_time = max(times) print(f"\n{name}:") print(f" Iterations: {iterations}") print(f" Average: {avg_time:.2f} ms") print(f" Min: {min_time:.2f} ms") print(f" Max: {max_time:.2f} ms") print(f" Throughput: {1000/avg_time:.2f} FPS") return result return wrapper return decorator def main(): print("=" * 80) print("PERFORMANCE PROFILING - Component Breakdown") print("=" * 80) GPU_ID = 0 MODEL_PATH = "models/yolov8n.trt" RTSP_URL = os.getenv('CAMERA_URL_1') # Initialize components print("\nInitializing components...") model_repo = TensorRTModelRepository(gpu_id=GPU_ID, default_num_contexts=4) model_repo.load_model("detector", MODEL_PATH, num_contexts=4) tracking_factory = TrackingFactory(gpu_id=GPU_ID) controller = tracking_factory.create_controller( model_repository=model_repo, model_id="detector", tracker_type="iou", max_age=30, min_confidence=0.5, iou_threshold=0.3, class_names=COCO_CLASSES ) stream_factory = StreamDecoderFactory(gpu_id=GPU_ID) decoder = stream_factory.create_decoder(RTSP_URL, buffer_size=30) decoder.start() print("Waiting for stream connection...") connected = False for i in range(30): time.sleep(1) if decoder.is_connected(): connected = True print(f"✓ Stream connected after {i+1} seconds") break if i % 5 == 0: print(f" Waiting... {i+1}/30 seconds") if not connected: print("⚠ Stream not connected after 30 seconds") return print("✓ Stream connected\n") print("=" * 80) print("PROFILING RESULTS") print("=" * 80) # Wait for frames to buffer time.sleep(2) # Get a sample frame for testing frame_gpu = decoder.get_latest_frame(rgb=True) if frame_gpu is None: print("⚠ No frames available") return print(f"\nFrame shape: {frame_gpu.shape}") print(f"Frame device: {frame_gpu.device}") print(f"Frame dtype: {frame_gpu.dtype}") # Profile 1: Video Decoding @profile_component("1. Video Decoding (NVDEC)", iterations=100) def profile_decoding(): return decoder.get_latest_frame(rgb=True) profile_decoding() # Profile 2: Preprocessing @profile_component("2. Preprocessing (Resize + Normalize)", iterations=100) def profile_preprocessing(): return YOLOv8Utils.preprocess(frame_gpu) preprocessed = profile_preprocessing() # Profile 3: TensorRT Inference @profile_component("3. TensorRT Inference", iterations=100) def profile_inference(): return model_repo.infer( model_id="detector", inputs={"images": preprocessed}, synchronize=True ) outputs = profile_inference() # Profile 4: Postprocessing (including NMS) @profile_component("4. Postprocessing (NMS + Format Conversion)", iterations=100) def profile_postprocessing(): return YOLOv8Utils.postprocess(outputs) detections = profile_postprocessing() print(f"\nDetections shape: {detections.shape}") print(f"Number of detections: {len(detections)}") # Profile 5: Full Pipeline (Tracking) @profile_component("5. Full Tracking Pipeline", iterations=50) def profile_full_pipeline(): frame = decoder.get_latest_frame(rgb=True) if frame is None: return [] return controller.track( frame, preprocess_fn=YOLOv8Utils.preprocess, postprocess_fn=YOLOv8Utils.postprocess ) profile_full_pipeline() # Profile 6: Parallel inference (simulate multi-camera) print("\n" + "=" * 80) print("MULTI-CAMERA SIMULATION") print("=" * 80) num_cameras = 4 print(f"\nSimulating {num_cameras} cameras processing sequentially...") @profile_component(f"Sequential Processing ({num_cameras} cameras)", iterations=20) def profile_sequential(): for _ in range(num_cameras): frame = decoder.get_latest_frame(rgb=True) if frame is not None: controller.track( frame, preprocess_fn=YOLOv8Utils.preprocess, postprocess_fn=YOLOv8Utils.postprocess ) profile_sequential() # Cleanup decoder.stop() # Summary print("\n" + "=" * 80) print("BOTTLENECK ANALYSIS") print("=" * 80) print(""" Based on the profiling results above, identify the bottleneck: 1. If "TensorRT Inference" is the slowest: → GPU compute is the bottleneck → Solutions: Lower resolution, smaller model, batch processing 2. If "Postprocessing (NMS)" is slow: → CPU/GPU synchronization or NMS is slow → Solutions: Optimize NMS, reduce detections threshold 3. If "Video Decoding" is slow: → NVDEC is the bottleneck → Solutions: Lower resolution streams, fewer cameras per decoder 4. If "Sequential Processing" time ≈ (single pipeline time × num_cameras): → No parallelization, processing is sequential → Solutions: Async processing, CUDA streams, batching Expected bottleneck: TensorRT Inference (most compute-intensive) """) if __name__ == "__main__": main()