""" Test script for TensorRT Model Repository with multi-camera inference. This demonstrates: 1. Loading the same model for multiple cameras (deduplication) 2. Context pool load balancing 3. GPU-to-GPU inference from RTSP streams 4. Memory efficiency with shared engines """ import time import torch from services.model_repository import TensorRTModelRepository from services.stream_decoder import StreamDecoderFactory def test_multi_camera_inference(): """ Simulate multi-camera inference scenario. Example: 100 cameras, all using the same YOLOv8 model - Without pooling: 100 engines + 100 contexts in VRAM - With pooling: 1 engine + 4 contexts in VRAM (huge savings!) """ # Initialize model repository with context pooling repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4) # Camera configurations (simulated) camera_configs = [ {"id": "camera_1", "rtsp_url": "rtsp://camera1.local/stream"}, {"id": "camera_2", "rtsp_url": "rtsp://camera2.local/stream"}, {"id": "camera_3", "rtsp_url": "rtsp://camera3.local/stream"}, # ... imagine 100 cameras here ] # Load the same model for all cameras model_file = "models/yolov8n.trt" # Same file for all cameras print("=" * 80) print("LOADING MODELS FOR MULTIPLE CAMERAS") print("=" * 80) for config in camera_configs: try: # Each camera gets its own model_id, but shares the same engine! metadata = repo.load_model( model_id=config["id"], file_path=model_file, num_contexts=4 # 4 contexts shared across all cameras ) print(f"\n✓ Loaded model for {config['id']}") except Exception as e: print(f"\n✗ Failed to load model for {config['id']}: {e}") # Show repository stats print("\n" + "=" * 80) print("REPOSITORY STATISTICS") print("=" * 80) stats = repo.get_stats() print(f"Total model IDs: {stats['total_model_ids']}") print(f"Unique engines in VRAM: {stats['unique_engines']}") print(f"Total contexts: {stats['total_contexts']}") print(f"Memory efficiency: {stats['memory_efficiency']}") # Get detailed info for one camera print("\n" + "=" * 80) print("DETAILED MODEL INFO (camera_1)") print("=" * 80) info = repo.get_model_info("camera_1") if info: print(f"Model ID: {info['model_id']}") print(f"File: {info['file_path']}") print(f"File hash: {info['file_hash']}") print(f"Engine references: {info['engine_references']}") print(f"Context pool size: {info['context_pool_size']}") print(f"Shared with: {info['shared_with_model_ids']}") print(f"\nInputs:") for name, spec in info['inputs'].items(): print(f" {name}: {spec['shape']} ({spec['dtype']})") print(f"\nOutputs:") for name, spec in info['outputs'].items(): print(f" {name}: {spec['shape']} ({spec['dtype']})") # Simulate inference from multiple cameras print("\n" + "=" * 80) print("RUNNING INFERENCE (GPU-to-GPU)") print("=" * 80) # Create dummy input tensors (simulating frames from cameras) # In real scenario, these come from StreamDecoder.get_frame() batch_size = 1 channels = 3 height = 640 width = 640 for config in camera_configs: try: # Simulate getting frame from camera (already on GPU) input_tensor = torch.rand( batch_size, channels, height, width, dtype=torch.float32, device='cuda:0' ) # Run inference (stays in GPU) start = time.time() outputs = repo.infer( model_id=config["id"], inputs={"images": input_tensor}, # Adjust input name based on your model synchronize=True, timeout=5.0 ) elapsed = (time.time() - start) * 1000 # Convert to ms print(f"\n{config['id']}: Inference completed in {elapsed:.2f}ms") for name, tensor in outputs.items(): print(f" Output '{name}': {tensor.shape} on {tensor.device}") except Exception 
as e: print(f"\n{config['id']}: Inference failed: {e}") # Cleanup print("\n" + "=" * 80) print("CLEANUP") print("=" * 80) for config in camera_configs: repo.unload_model(config["id"]) print("\nAll models unloaded.") def test_rtsp_stream_with_inference(): """ Real-world example: Decode RTSP stream and run inference. Everything stays in GPU memory (zero CPU transfers). """ print("=" * 80) print("RTSP STREAM + TENSORRT INFERENCE (GPU-to-GPU)") print("=" * 80) # Initialize components decoder_factory = StreamDecoderFactory(gpu_id=0) model_repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4) # Setup camera stream rtsp_url = "rtsp://your-camera-ip/stream" decoder = decoder_factory.create_decoder(rtsp_url, buffer_size=30) decoder.start() # Load inference model try: model_repo.load_model( model_id="camera_main", file_path="models/yolov8n.trt" ) except FileNotFoundError: print("\n⚠ Model file not found. Please export your model to TensorRT:") print(" Example: yolo export model=yolov8n.pt format=engine device=0") return print("\nWaiting for stream to buffer frames...") time.sleep(3) # Process frames for i in range(10): # Get frame from decoder (already on GPU) frame_gpu = decoder.get_latest_frame(rgb=True) # Returns torch.Tensor on CUDA if frame_gpu is None: print(f"Frame {i}: No frame available") continue # Preprocess if needed (stays on GPU) # For YOLOv8: normalize, resize, etc. # Example preprocessing (adjust for your model): frame_gpu = frame_gpu.float() / 255.0 # Normalize to [0, 1] frame_gpu = frame_gpu.unsqueeze(0) # Add batch dimension: (1, 3, H, W) # Run inference (GPU-to-GPU, zero copy) try: outputs = model_repo.infer( model_id="camera_main", inputs={"images": frame_gpu}, synchronize=True ) print(f"\nFrame {i}: Inference successful") for name, tensor in outputs.items(): print(f" {name}: {tensor.shape} on {tensor.device}") # Post-process results (can stay on GPU or move to CPU as needed) # Example: NMS, bounding box extraction, etc. except Exception as e: print(f"\nFrame {i}: Inference failed: {e}") time.sleep(0.1) # Simulate processing interval # Cleanup decoder.stop() model_repo.unload_model("camera_main") print("\n✓ Test completed successfully") def test_concurrent_inference(): """ Test concurrent inference from multiple threads. Demonstrates context pool load balancing. """ import threading print("=" * 80) print("CONCURRENT INFERENCE TEST (Context Pool Load Balancing)") print("=" * 80) repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4) # Load model try: repo.load_model("shared_model", "models/yolov8n.trt", num_contexts=4) except Exception as e: print(f"Failed to load model: {e}") return def worker(worker_id: int, num_inferences: int): """Worker thread performing inference""" for i in range(num_inferences): try: # Create dummy input input_tensor = torch.rand(1, 3, 640, 640, device='cuda:0', dtype=torch.float32) # Acquire context from pool, run inference, release context outputs = repo.infer( model_id="shared_model", inputs={"images": input_tensor}, timeout=10.0 ) print(f"Worker {worker_id}, Inference {i}: SUCCESS") except Exception as e: print(f"Worker {worker_id}, Inference {i}: FAILED - {e}") time.sleep(0.01) # Small delay # Launch multiple worker threads (more workers than contexts!) 


def test_concurrent_inference():
    """
    Test concurrent inference from multiple threads.

    Demonstrates context pool load balancing.
    """
    import threading

    print("=" * 80)
    print("CONCURRENT INFERENCE TEST (Context Pool Load Balancing)")
    print("=" * 80)

    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Load model
    try:
        repo.load_model("shared_model", "models/yolov8n.trt", num_contexts=4)
    except Exception as e:
        print(f"Failed to load model: {e}")
        return

    def worker(worker_id: int, num_inferences: int):
        """Worker thread performing inference."""
        for i in range(num_inferences):
            try:
                # Create dummy input
                input_tensor = torch.rand(1, 3, 640, 640, device='cuda:0', dtype=torch.float32)

                # Acquire context from pool, run inference, release context
                outputs = repo.infer(
                    model_id="shared_model",
                    inputs={"images": input_tensor},
                    timeout=10.0
                )
                print(f"Worker {worker_id}, Inference {i}: SUCCESS")
            except Exception as e:
                print(f"Worker {worker_id}, Inference {i}: FAILED - {e}")

            time.sleep(0.01)  # Small delay

    # Launch multiple worker threads (more workers than contexts!)
    threads = []
    num_workers = 10  # 10 workers sharing 4 contexts
    inferences_per_worker = 5

    print(f"\nLaunching {num_workers} workers (only 4 contexts available)")
    print("Contexts will be borrowed/returned automatically\n")

    start_time = time.time()

    for worker_id in range(num_workers):
        t = threading.Thread(target=worker, args=(worker_id, inferences_per_worker))
        threads.append(t)
        t.start()

    # Wait for all workers
    for t in threads:
        t.join()

    elapsed = time.time() - start_time
    total_inferences = num_workers * inferences_per_worker

    print(f"\n✓ Completed {total_inferences} inferences in {elapsed:.2f}s")
    print(f"  Throughput: {total_inferences / elapsed:.2f} inferences/sec")
    print(f"  With only 4 contexts for {num_workers} workers!")

    repo.unload_model("shared_model")


if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("TENSORRT MODEL REPOSITORY - TEST SUITE")
    print("=" * 80)

    # Test 1: Multi-camera model loading
    print("\n\nTEST 1: Multi-Camera Model Loading with Deduplication")
    print("-" * 80)
    try:
        test_multi_camera_inference()
    except Exception as e:
        print(f"Test 1 failed: {e}")

    # Test 2: RTSP stream + inference (commented out by default)
    # Uncomment if you have a real RTSP stream
    # print("\n\nTEST 2: RTSP Stream + Inference")
    # print("-" * 80)
    # try:
    #     test_rtsp_stream_with_inference()
    # except Exception as e:
    #     print(f"Test 2 failed: {e}")

    # Test 3: Concurrent inference
    print("\n\nTEST 3: Concurrent Inference with Context Pooling")
    print("-" * 80)
    try:
        test_concurrent_inference()
    except Exception as e:
        print(f"Test 3 failed: {e}")

    print("\n" + "=" * 80)
    print("ALL TESTS COMPLETED")
    print("=" * 80)