feat: inference subsystem and decoder optimizations

Siwat Sirichai 2025-11-09 00:57:08 +07:00
commit 3c83a57e44
19 changed files with 3897 additions and 0 deletions

test_model_inference.py (new file, 310 lines)

@@ -0,0 +1,310 @@
"""
Test script for TensorRT Model Repository with multi-camera inference.
This demonstrates:
1. Loading the same model for multiple cameras (deduplication)
2. Context pool load balancing
3. GPU-to-GPU inference from RTSP streams
4. Memory efficiency with shared engines
"""
import time
import torch
from services.model_repository import TensorRTModelRepository
from services.stream_decoder import StreamDecoderFactory
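

# The tests below stress "GPU-to-GPU, zero copy": every tensor is expected to
# stay on the CUDA device end-to-end. A tiny illustrative helper for asserting
# that in your own checks (hypothetical; not part of the repository API):
def assert_on_gpu(tensor: torch.Tensor) -> None:
    """Fail loudly if a tensor has silently fallen back to the CPU."""
    assert tensor.is_cuda, f"expected a CUDA tensor, got device={tensor.device}"
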

def test_multi_camera_inference():
    """
    Simulate a multi-camera inference scenario.

    Example: 100 cameras, all using the same YOLOv8 model
    - Without pooling: 100 engines + 100 contexts in VRAM
    - With pooling: 1 engine + 4 contexts in VRAM (huge savings!)
    """
    # Initialize model repository with context pooling
    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Camera configurations (simulated)
    camera_configs = [
        {"id": "camera_1", "rtsp_url": "rtsp://camera1.local/stream"},
        {"id": "camera_2", "rtsp_url": "rtsp://camera2.local/stream"},
        {"id": "camera_3", "rtsp_url": "rtsp://camera3.local/stream"},
        # ... imagine 100 cameras here
    ]

    # Load the same model for all cameras
    model_file = "models/yolov8n.trt"  # Same file for all cameras

    print("=" * 80)
    print("LOADING MODELS FOR MULTIPLE CAMERAS")
    print("=" * 80)

    for config in camera_configs:
        try:
            # Each camera gets its own model_id, but shares the same engine!
            metadata = repo.load_model(
                model_id=config["id"],
                file_path=model_file,
                num_contexts=4  # 4 contexts shared across all cameras
            )
            print(f"\n✓ Loaded model for {config['id']}")
        except Exception as e:
            print(f"\n✗ Failed to load model for {config['id']}: {e}")

    # Show repository stats
    print("\n" + "=" * 80)
    print("REPOSITORY STATISTICS")
    print("=" * 80)
    stats = repo.get_stats()
    print(f"Total model IDs: {stats['total_model_ids']}")
    print(f"Unique engines in VRAM: {stats['unique_engines']}")
    print(f"Total contexts: {stats['total_contexts']}")
    print(f"Memory efficiency: {stats['memory_efficiency']}")

    # Get detailed info for one camera
    print("\n" + "=" * 80)
    print("DETAILED MODEL INFO (camera_1)")
    print("=" * 80)
    info = repo.get_model_info("camera_1")
    if info:
        print(f"Model ID: {info['model_id']}")
        print(f"File: {info['file_path']}")
        print(f"File hash: {info['file_hash']}")
        print(f"Engine references: {info['engine_references']}")
        print(f"Context pool size: {info['context_pool_size']}")
        print(f"Shared with: {info['shared_with_model_ids']}")

        print("\nInputs:")
        for name, spec in info['inputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")

        print("\nOutputs:")
        for name, spec in info['outputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")

    # Simulate inference from multiple cameras
    print("\n" + "=" * 80)
    print("RUNNING INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Create dummy input tensors (simulating frames from cameras)
    # In a real scenario, these come from StreamDecoder.get_frame()
    batch_size = 1
    channels = 3
    height = 640
    width = 640

    for config in camera_configs:
        try:
            # Simulate getting a frame from the camera (already on GPU)
            input_tensor = torch.rand(
                batch_size, channels, height, width,
                dtype=torch.float32,
                device='cuda:0'
            )

            # Run inference (stays on GPU)
            start = time.time()
            outputs = repo.infer(
                model_id=config["id"],
                inputs={"images": input_tensor},  # Adjust input name based on your model
                synchronize=True,
                timeout=5.0
            )
            elapsed = (time.time() - start) * 1000  # Convert to ms

            print(f"\n{config['id']}: Inference completed in {elapsed:.2f}ms")
            for name, tensor in outputs.items():
                print(f"  Output '{name}': {tensor.shape} on {tensor.device}")
        except Exception as e:
            print(f"\n{config['id']}: Inference failed: {e}")

    # Cleanup
    print("\n" + "=" * 80)
    print("CLEANUP")
    print("=" * 80)
    for config in camera_configs:
        repo.unload_model(config["id"])
    print("\nAll models unloaded.")

def test_rtsp_stream_with_inference():
    """
    Real-world example: decode an RTSP stream and run inference.
    Everything stays in GPU memory (zero CPU transfers).
    """
    print("=" * 80)
    print("RTSP STREAM + TENSORRT INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Initialize components
    decoder_factory = StreamDecoderFactory(gpu_id=0)
    model_repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Set up camera stream
    rtsp_url = "rtsp://your-camera-ip/stream"
    decoder = decoder_factory.create_decoder(rtsp_url, buffer_size=30)
    decoder.start()

    # Load inference model
    try:
        model_repo.load_model(
            model_id="camera_main",
            file_path="models/yolov8n.trt"
        )
    except FileNotFoundError:
        print("\n⚠ Model file not found. Please export your model to TensorRT:")
        print("  Example: yolo export model=yolov8n.pt format=engine device=0")
        decoder.stop()  # Don't leave the decode thread running when we bail out
        return

    print("\nWaiting for stream to buffer frames...")
    time.sleep(3)

    # Process frames
    for i in range(10):
        # Get frame from decoder (already on GPU)
        frame_gpu = decoder.get_latest_frame(rgb=True)  # Returns torch.Tensor on CUDA

        if frame_gpu is None:
            print(f"Frame {i}: No frame available")
            continue

        # Preprocess if needed (stays on GPU)
        # For YOLOv8: normalize, resize, etc.
        # Example preprocessing (adjust for your model):
        frame_gpu = frame_gpu.float() / 255.0  # Normalize to [0, 1]
        frame_gpu = frame_gpu.unsqueeze(0)  # Add batch dimension: (1, 3, H, W)

        # Run inference (GPU-to-GPU, zero copy)
        try:
            outputs = model_repo.infer(
                model_id="camera_main",
                inputs={"images": frame_gpu},
                synchronize=True
            )

            print(f"\nFrame {i}: Inference successful")
            for name, tensor in outputs.items():
                print(f"  {name}: {tensor.shape} on {tensor.device}")

            # Post-process results (can stay on GPU or move to CPU as needed)
            # Example: NMS, bounding box extraction, etc.
        except Exception as e:
            print(f"\nFrame {i}: Inference failed: {e}")

        time.sleep(0.1)  # Simulate processing interval

    # Cleanup
    decoder.stop()
    model_repo.unload_model("camera_main")
    print("\n✓ Test completed successfully")

def test_concurrent_inference():
    """
    Test concurrent inference from multiple threads.
    Demonstrates context pool load balancing.
    """
    import threading

    print("=" * 80)
    print("CONCURRENT INFERENCE TEST (Context Pool Load Balancing)")
    print("=" * 80)

    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Load model
    try:
        repo.load_model("shared_model", "models/yolov8n.trt", num_contexts=4)
    except Exception as e:
        print(f"Failed to load model: {e}")
        return

    def worker(worker_id: int, num_inferences: int):
        """Worker thread performing inference"""
        for i in range(num_inferences):
            try:
                # Create dummy input
                input_tensor = torch.rand(1, 3, 640, 640, device='cuda:0', dtype=torch.float32)

                # Acquire context from pool, run inference, release context
                outputs = repo.infer(
                    model_id="shared_model",
                    inputs={"images": input_tensor},
                    timeout=10.0
                )
                print(f"Worker {worker_id}, Inference {i}: SUCCESS")
            except Exception as e:
                print(f"Worker {worker_id}, Inference {i}: FAILED - {e}")

            time.sleep(0.01)  # Small delay

    # Launch multiple worker threads (more workers than contexts!)
    threads = []
    num_workers = 10  # 10 workers sharing 4 contexts
    inferences_per_worker = 5

    print(f"\nLaunching {num_workers} workers (only 4 contexts available)")
    print("Contexts will be borrowed/returned automatically\n")

    start_time = time.time()

    for worker_id in range(num_workers):
        t = threading.Thread(target=worker, args=(worker_id, inferences_per_worker))
        threads.append(t)
        t.start()

    # Wait for all workers
    for t in threads:
        t.join()

    elapsed = time.time() - start_time
    total_inferences = num_workers * inferences_per_worker

    print(f"\n✓ Completed {total_inferences} inferences in {elapsed:.2f}s")
    print(f"  Throughput: {total_inferences / elapsed:.2f} inferences/sec")
    print(f"  With only 4 contexts for {num_workers} workers!")

    repo.unload_model("shared_model")
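

# Context-pool load balancing, as exercised above, boils down to a bounded pool
# that workers block on, borrow from, and return to. A minimal sketch of the
# pattern using queue.Queue (illustrative only; the real repository may manage
# its execution contexts differently):
import queue

class SimpleContextPool:
    """Hand out execution contexts to at most len(contexts) threads at a time."""

    def __init__(self, contexts):
        self._q = queue.Queue()
        for ctx in contexts:
            self._q.put(ctx)

    def acquire(self, timeout=None):
        # Blocks until another worker returns a context (raises queue.Empty on timeout).
        return self._q.get(timeout=timeout)

    def release(self, ctx):
        self._q.put(ctx)
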

if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("TENSORRT MODEL REPOSITORY - TEST SUITE")
    print("=" * 80)

    # Test 1: Multi-camera model loading
    print("\n\nTEST 1: Multi-Camera Model Loading with Deduplication")
    print("-" * 80)
    try:
        test_multi_camera_inference()
    except Exception as e:
        print(f"Test 1 failed: {e}")

    # Test 2: RTSP stream + inference (commented out by default)
    # Uncomment if you have a real RTSP stream
    # print("\n\nTEST 2: RTSP Stream + Inference")
    # print("-" * 80)
    # try:
    #     test_rtsp_stream_with_inference()
    # except Exception as e:
    #     print(f"Test 2 failed: {e}")

    # Test 3: Concurrent inference
    print("\n\nTEST 3: Concurrent Inference with Context Pooling")
    print("-" * 80)
    try:
        test_concurrent_inference()
    except Exception as e:
        print(f"Test 3 failed: {e}")

    print("\n" + "=" * 80)
    print("ALL TESTS COMPLETED")
    print("=" * 80)