feat: inference subsystem and decoder optimizations

Siwat Sirichai 2025-11-09 00:57:08 +07:00
commit 3c83a57e44
19 changed files with 3897 additions and 0 deletions

test_model_inference.py (new file, 310 lines)

@@ -0,0 +1,310 @@
"""
Test script for TensorRT Model Repository with multi-camera inference.
This demonstrates:
1. Loading the same model for multiple cameras (deduplication)
2. Context pool load balancing
3. GPU-to-GPU inference from RTSP streams
4. Memory efficiency with shared engines
"""
import time
import torch
from services.model_repository import TensorRTModelRepository
from services.stream_decoder import StreamDecoderFactory
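

# The tests below stress "GPU-to-GPU, zero copy": every tensor is expected to
# stay on the CUDA device end-to-end. A tiny illustrative helper for asserting
# that in your own checks (hypothetical; not part of the repository API):
def assert_on_gpu(tensor: torch.Tensor) -> None:
    """Fail loudly if a tensor has silently fallen back to the CPU."""
    assert tensor.is_cuda, f"expected a CUDA tensor, got device={tensor.device}"
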

def test_multi_camera_inference():
    """
    Simulate a multi-camera inference scenario.

    Example: 100 cameras, all using the same YOLOv8 model
    - Without pooling: 100 engines + 100 contexts in VRAM
    - With pooling: 1 engine + 4 contexts in VRAM (huge savings!)
    """
    # Initialize model repository with context pooling
    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Camera configurations (simulated)
    camera_configs = [
        {"id": "camera_1", "rtsp_url": "rtsp://camera1.local/stream"},
        {"id": "camera_2", "rtsp_url": "rtsp://camera2.local/stream"},
        {"id": "camera_3", "rtsp_url": "rtsp://camera3.local/stream"},
        # ... imagine 100 cameras here
    ]

    # Load the same model for all cameras
    model_file = "models/yolov8n.trt"  # Same file for all cameras

    print("=" * 80)
    print("LOADING MODELS FOR MULTIPLE CAMERAS")
    print("=" * 80)

    for config in camera_configs:
        try:
            # Each camera gets its own model_id, but shares the same engine!
            metadata = repo.load_model(
                model_id=config["id"],
                file_path=model_file,
                num_contexts=4  # 4 contexts shared across all cameras
            )
            print(f"\n✓ Loaded model for {config['id']}")
        except Exception as e:
            print(f"\n✗ Failed to load model for {config['id']}: {e}")

    # Show repository stats
    print("\n" + "=" * 80)
    print("REPOSITORY STATISTICS")
    print("=" * 80)
    stats = repo.get_stats()
    print(f"Total model IDs: {stats['total_model_ids']}")
    print(f"Unique engines in VRAM: {stats['unique_engines']}")
    print(f"Total contexts: {stats['total_contexts']}")
    print(f"Memory efficiency: {stats['memory_efficiency']}")

    # Get detailed info for one camera
    print("\n" + "=" * 80)
    print("DETAILED MODEL INFO (camera_1)")
    print("=" * 80)
    info = repo.get_model_info("camera_1")
    if info:
        print(f"Model ID: {info['model_id']}")
        print(f"File: {info['file_path']}")
        print(f"File hash: {info['file_hash']}")
        print(f"Engine references: {info['engine_references']}")
        print(f"Context pool size: {info['context_pool_size']}")
        print(f"Shared with: {info['shared_with_model_ids']}")

        print("\nInputs:")
        for name, spec in info['inputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")

        print("\nOutputs:")
        for name, spec in info['outputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")

    # Simulate inference from multiple cameras
    print("\n" + "=" * 80)
    print("RUNNING INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Create dummy input tensors (simulating frames from cameras)
    # In a real scenario, these come from StreamDecoder.get_frame()
    batch_size = 1
    channels = 3
    height = 640
    width = 640

    for config in camera_configs:
        try:
            # Simulate getting a frame from the camera (already on GPU)
            input_tensor = torch.rand(
                batch_size, channels, height, width,
                dtype=torch.float32,
                device='cuda:0'
            )

            # Run inference (stays on GPU)
            start = time.time()
            outputs = repo.infer(
                model_id=config["id"],
                inputs={"images": input_tensor},  # Adjust input name based on your model
                synchronize=True,
                timeout=5.0
            )
            elapsed = (time.time() - start) * 1000  # Convert to ms

            print(f"\n{config['id']}: Inference completed in {elapsed:.2f}ms")
            for name, tensor in outputs.items():
                print(f"  Output '{name}': {tensor.shape} on {tensor.device}")
        except Exception as e:
            print(f"\n{config['id']}: Inference failed: {e}")

    # Cleanup
    print("\n" + "=" * 80)
    print("CLEANUP")
    print("=" * 80)
    for config in camera_configs:
        repo.unload_model(config["id"])
    print("\nAll models unloaded.")

def test_rtsp_stream_with_inference():
    """
    Real-world example: decode an RTSP stream and run inference.
    Everything stays in GPU memory (zero CPU transfers).
    """
    print("=" * 80)
    print("RTSP STREAM + TENSORRT INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Initialize components
    decoder_factory = StreamDecoderFactory(gpu_id=0)
    model_repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Set up camera stream
    rtsp_url = "rtsp://your-camera-ip/stream"
    decoder = decoder_factory.create_decoder(rtsp_url, buffer_size=30)
    decoder.start()

    # Load inference model
    try:
        model_repo.load_model(
            model_id="camera_main",
            file_path="models/yolov8n.trt"
        )
    except FileNotFoundError:
        print("\n⚠ Model file not found. Please export your model to TensorRT:")
        print("  Example: yolo export model=yolov8n.pt format=engine device=0")
        decoder.stop()  # Don't leave the decode thread running when we bail out
        return

    print("\nWaiting for stream to buffer frames...")
    time.sleep(3)

    # Process frames
    for i in range(10):
        # Get frame from decoder (already on GPU)
        frame_gpu = decoder.get_latest_frame(rgb=True)  # Returns torch.Tensor on CUDA

        if frame_gpu is None:
            print(f"Frame {i}: No frame available")
            continue

        # Preprocess if needed (stays on GPU)
        # For YOLOv8: normalize, resize, etc.
        # Example preprocessing (adjust for your model):
        frame_gpu = frame_gpu.float() / 255.0  # Normalize to [0, 1]
        frame_gpu = frame_gpu.unsqueeze(0)  # Add batch dimension: (1, 3, H, W)

        # Run inference (GPU-to-GPU, zero copy)
        try:
            outputs = model_repo.infer(
                model_id="camera_main",
                inputs={"images": frame_gpu},
                synchronize=True
            )

            print(f"\nFrame {i}: Inference successful")
            for name, tensor in outputs.items():
                print(f"  {name}: {tensor.shape} on {tensor.device}")

            # Post-process results (can stay on GPU or move to CPU as needed)
            # Example: NMS, bounding box extraction, etc.
        except Exception as e:
            print(f"\nFrame {i}: Inference failed: {e}")

        time.sleep(0.1)  # Simulate processing interval

    # Cleanup
    decoder.stop()
    model_repo.unload_model("camera_main")
    print("\n✓ Test completed successfully")

def test_concurrent_inference():
    """
    Test concurrent inference from multiple threads.
    Demonstrates context pool load balancing.
    """
    import threading

    print("=" * 80)
    print("CONCURRENT INFERENCE TEST (Context Pool Load Balancing)")
    print("=" * 80)

    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Load model
    try:
        repo.load_model("shared_model", "models/yolov8n.trt", num_contexts=4)
    except Exception as e:
        print(f"Failed to load model: {e}")
        return

    def worker(worker_id: int, num_inferences: int):
        """Worker thread performing inference"""
        for i in range(num_inferences):
            try:
                # Create dummy input
                input_tensor = torch.rand(1, 3, 640, 640, device='cuda:0', dtype=torch.float32)

                # Acquire context from pool, run inference, release context
                outputs = repo.infer(
                    model_id="shared_model",
                    inputs={"images": input_tensor},
                    timeout=10.0
                )
                print(f"Worker {worker_id}, Inference {i}: SUCCESS")
            except Exception as e:
                print(f"Worker {worker_id}, Inference {i}: FAILED - {e}")

            time.sleep(0.01)  # Small delay

    # Launch multiple worker threads (more workers than contexts!)
    threads = []
    num_workers = 10  # 10 workers sharing 4 contexts
    inferences_per_worker = 5

    print(f"\nLaunching {num_workers} workers (only 4 contexts available)")
    print("Contexts will be borrowed/returned automatically\n")

    start_time = time.time()

    for worker_id in range(num_workers):
        t = threading.Thread(target=worker, args=(worker_id, inferences_per_worker))
        threads.append(t)
        t.start()

    # Wait for all workers
    for t in threads:
        t.join()

    elapsed = time.time() - start_time
    total_inferences = num_workers * inferences_per_worker

    print(f"\n✓ Completed {total_inferences} inferences in {elapsed:.2f}s")
    print(f"  Throughput: {total_inferences / elapsed:.2f} inferences/sec")
    print(f"  With only 4 contexts for {num_workers} workers!")

    repo.unload_model("shared_model")
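

# Context-pool load balancing, as exercised above, boils down to a bounded pool
# that workers block on, borrow from, and return to. A minimal sketch of the
# pattern using queue.Queue (illustrative only; the real repository may manage
# its execution contexts differently):
import queue

class SimpleContextPool:
    """Hand out execution contexts to at most len(contexts) threads at a time."""

    def __init__(self, contexts):
        self._q = queue.Queue()
        for ctx in contexts:
            self._q.put(ctx)

    def acquire(self, timeout=None):
        # Blocks until another worker returns a context (raises queue.Empty on timeout).
        return self._q.get(timeout=timeout)

    def release(self, ctx):
        self._q.put(ctx)
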

if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("TENSORRT MODEL REPOSITORY - TEST SUITE")
    print("=" * 80)

    # Test 1: Multi-camera model loading
    print("\n\nTEST 1: Multi-Camera Model Loading with Deduplication")
    print("-" * 80)
    try:
        test_multi_camera_inference()
    except Exception as e:
        print(f"Test 1 failed: {e}")

    # Test 2: RTSP stream + inference (commented out by default)
    # Uncomment if you have a real RTSP stream
    # print("\n\nTEST 2: RTSP Stream + Inference")
    # print("-" * 80)
    # try:
    #     test_rtsp_stream_with_inference()
    # except Exception as e:
    #     print(f"Test 2 failed: {e}")

    # Test 3: Concurrent inference
    print("\n\nTEST 3: Concurrent Inference with Context Pooling")
    print("-" * 80)
    try:
        test_concurrent_inference()
    except Exception as e:
        print(f"Test 3 failed: {e}")

    print("\n" + "=" * 80)
    print("ALL TESTS COMPLETED")
    print("=" * 80)