feat: inference subsystem and decoder optimizations
commit 3c83a57e44
19 changed files with 3897 additions and 0 deletions
test_model_inference.py (new file, +310 lines)
@@ -0,0 +1,310 @@
"""
|
||||
Test script for TensorRT Model Repository with multi-camera inference.
|
||||
|
||||
This demonstrates:
|
||||
1. Loading the same model for multiple cameras (deduplication)
|
||||
2. Context pool load balancing
|
||||
3. GPU-to-GPU inference from RTSP streams
|
||||
4. Memory efficiency with shared engines
|
||||
"""
|
||||
|
||||
import time
|
||||
import torch
|
||||
from services.model_repository import TensorRTModelRepository
|
||||
from services.stream_decoder import StreamDecoderFactory
|
||||
|
||||
|
||||
def test_multi_camera_inference():
    """
    Simulate a multi-camera inference scenario.

    Example: 100 cameras, all using the same YOLOv8 model
    - Without pooling: 100 engines + 100 contexts in VRAM
    - With pooling: 1 engine + 4 contexts in VRAM (huge savings!)
    """
    # Initialize the model repository with context pooling
    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Camera configurations (simulated)
    camera_configs = [
        {"id": "camera_1", "rtsp_url": "rtsp://camera1.local/stream"},
        {"id": "camera_2", "rtsp_url": "rtsp://camera2.local/stream"},
        {"id": "camera_3", "rtsp_url": "rtsp://camera3.local/stream"},
        # ... imagine 100 cameras here
    ]

    # Load the same model for all cameras
    model_file = "models/yolov8n.trt"  # Same file for all cameras

    print("=" * 80)
    print("LOADING MODELS FOR MULTIPLE CAMERAS")
    print("=" * 80)

    for config in camera_configs:
        try:
            # Each camera gets its own model_id, but shares the same engine!
            metadata = repo.load_model(
                model_id=config["id"],
                file_path=model_file,
                num_contexts=4  # 4 contexts shared across all cameras
            )
            print(f"\n✓ Loaded model for {config['id']}")
        except Exception as e:
            print(f"\n✗ Failed to load model for {config['id']}: {e}")

    # Show repository stats
    print("\n" + "=" * 80)
    print("REPOSITORY STATISTICS")
    print("=" * 80)
    stats = repo.get_stats()
    print(f"Total model IDs: {stats['total_model_ids']}")
    print(f"Unique engines in VRAM: {stats['unique_engines']}")
    print(f"Total contexts: {stats['total_contexts']}")
    print(f"Memory efficiency: {stats['memory_efficiency']}")

    # Get detailed info for one camera
    print("\n" + "=" * 80)
    print("DETAILED MODEL INFO (camera_1)")
    print("=" * 80)
    info = repo.get_model_info("camera_1")
    if info:
        print(f"Model ID: {info['model_id']}")
        print(f"File: {info['file_path']}")
        print(f"File hash: {info['file_hash']}")
        print(f"Engine references: {info['engine_references']}")
        print(f"Context pool size: {info['context_pool_size']}")
        print(f"Shared with: {info['shared_with_model_ids']}")
        print("\nInputs:")
        for name, spec in info['inputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")
        print("\nOutputs:")
        for name, spec in info['outputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")

    # Simulate inference from multiple cameras
    print("\n" + "=" * 80)
    print("RUNNING INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Create dummy input tensors (simulating frames from cameras)
    # In a real scenario, these come from StreamDecoder.get_frame()
    batch_size = 1
    channels = 3
    height = 640
    width = 640

    for config in camera_configs:
        try:
            # Simulate getting a frame from the camera (already on the GPU)
            input_tensor = torch.rand(
                batch_size, channels, height, width,
                dtype=torch.float32,
                device='cuda:0'
            )

            # Run inference (stays on the GPU)
            start = time.time()
            outputs = repo.infer(
                model_id=config["id"],
                inputs={"images": input_tensor},  # Adjust the input name to match your model
                synchronize=True,
                timeout=5.0
            )
            elapsed = (time.time() - start) * 1000  # Convert to ms

            print(f"\n{config['id']}: Inference completed in {elapsed:.2f}ms")
            for name, tensor in outputs.items():
                print(f"  Output '{name}': {tensor.shape} on {tensor.device}")

        except Exception as e:
            print(f"\n{config['id']}: Inference failed: {e}")

    # Cleanup
    print("\n" + "=" * 80)
    print("CLEANUP")
    print("=" * 80)

    for config in camera_configs:
        repo.unload_model(config["id"])
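    # Note (an assumption based on the shared-engine design described above): unload_model
    # drops this model_id's reference, and the underlying engine is only freed once the
    # last model_id sharing it has been unloaded.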

    print("\nAll models unloaded.")


def test_rtsp_stream_with_inference():
    """
    Real-world example: decode an RTSP stream and run inference.
    Everything stays in GPU memory (zero CPU transfers).
    """
    print("=" * 80)
    print("RTSP STREAM + TENSORRT INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Initialize components
    decoder_factory = StreamDecoderFactory(gpu_id=0)
    model_repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Set up the camera stream
    rtsp_url = "rtsp://your-camera-ip/stream"
    decoder = decoder_factory.create_decoder(rtsp_url, buffer_size=30)
    decoder.start()

    # Load the inference model
    try:
        model_repo.load_model(
            model_id="camera_main",
            file_path="models/yolov8n.trt"
        )
    except FileNotFoundError:
        print("\n⚠ Model file not found. Please export your model to TensorRT:")
        print("  Example: yolo export model=yolov8n.pt format=engine device=0")
        return

    print("\nWaiting for stream to buffer frames...")
    time.sleep(3)

    # Process frames
    for i in range(10):
        # Get a frame from the decoder (already on the GPU)
        frame_gpu = decoder.get_latest_frame(rgb=True)  # Returns torch.Tensor on CUDA

        if frame_gpu is None:
            print(f"Frame {i}: No frame available")
            continue

        # Preprocess if needed (stays on the GPU)
        # For YOLOv8: normalize, resize, etc.
        # Example preprocessing (adjust for your model):
        frame_gpu = frame_gpu.float() / 255.0  # Normalize to [0, 1]
        frame_gpu = frame_gpu.unsqueeze(0)  # Add batch dimension: (1, 3, H, W)

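        # A minimal resize sketch (assumption: the engine expects 640x640 input while the
        # decoder may emit another resolution; this naive resize also ignores the letterbox
        # padding that YOLO-style pipelines usually apply). Uncomment if needed:
        # import torch.nn.functional as F
        # frame_gpu = F.interpolate(frame_gpu, size=(640, 640), mode='bilinear', align_corners=False)
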
        # Run inference (GPU-to-GPU, zero copy)
        try:
            outputs = model_repo.infer(
                model_id="camera_main",
                inputs={"images": frame_gpu},
                synchronize=True
            )

            print(f"\nFrame {i}: Inference successful")
            for name, tensor in outputs.items():
                print(f"  {name}: {tensor.shape} on {tensor.device}")

            # Post-process results (can stay on GPU or move to CPU as needed)
            # Example: NMS, bounding box extraction, etc.
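            # A hedged post-processing sketch (assumptions: a hypothetical decode step has
            # produced `boxes` as an (N, 4) xyxy tensor and `scores` as an (N,) tensor on
            # the GPU; the raw output layout depends on how the engine was exported):
            # from torchvision.ops import nms
            # keep = nms(boxes, scores, iou_threshold=0.45)
            # detections = boxes[keep]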

        except Exception as e:
            print(f"\nFrame {i}: Inference failed: {e}")

        time.sleep(0.1)  # Simulate processing interval

    # Cleanup
    decoder.stop()
    model_repo.unload_model("camera_main")
    print("\n✓ Test completed successfully")


def test_concurrent_inference():
    """
    Test concurrent inference from multiple threads.
    Demonstrates context pool load balancing.
    """
    import threading

    print("=" * 80)
    print("CONCURRENT INFERENCE TEST (Context Pool Load Balancing)")
    print("=" * 80)

    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Load the model
    try:
        repo.load_model("shared_model", "models/yolov8n.trt", num_contexts=4)
    except Exception as e:
        print(f"Failed to load model: {e}")
        return

    def worker(worker_id: int, num_inferences: int):
        """Worker thread performing inference."""
        for i in range(num_inferences):
            try:
                # Create a dummy input
                input_tensor = torch.rand(1, 3, 640, 640, device='cuda:0', dtype=torch.float32)

                # Acquire a context from the pool, run inference, release the context
                outputs = repo.infer(
                    model_id="shared_model",
                    inputs={"images": input_tensor},
                    timeout=10.0
                )

                print(f"Worker {worker_id}, Inference {i}: SUCCESS")

            except Exception as e:
                print(f"Worker {worker_id}, Inference {i}: FAILED - {e}")

            time.sleep(0.01)  # Small delay

    # Launch multiple worker threads (more workers than contexts!)
    threads = []
    num_workers = 10  # 10 workers sharing 4 contexts
    inferences_per_worker = 5

    print(f"\nLaunching {num_workers} workers (only 4 contexts available)")
    print("Contexts will be borrowed and returned automatically\n")

    start_time = time.time()

    for worker_id in range(num_workers):
        t = threading.Thread(target=worker, args=(worker_id, inferences_per_worker))
        threads.append(t)
        t.start()

    # Wait for all workers
    for t in threads:
        t.join()

    elapsed = time.time() - start_time
    total_inferences = num_workers * inferences_per_worker

    print(f"\n✓ Completed {total_inferences} inferences in {elapsed:.2f}s")
    print(f"  Throughput: {total_inferences / elapsed:.2f} inferences/sec")
    print(f"  With only 4 contexts for {num_workers} workers!")

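    # Optional sanity check (a sketch; reuses the same get_stats() call shown above):
    # stats = repo.get_stats()
    # print(f"Contexts still pooled after the run: {stats['total_contexts']}")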
    repo.unload_model("shared_model")


if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("TENSORRT MODEL REPOSITORY - TEST SUITE")
    print("=" * 80)

    # Test 1: Multi-camera model loading
    print("\n\nTEST 1: Multi-Camera Model Loading with Deduplication")
    print("-" * 80)
    try:
        test_multi_camera_inference()
    except Exception as e:
        print(f"Test 1 failed: {e}")

    # Test 2: RTSP stream + inference (commented out by default)
    # Uncomment if you have a real RTSP stream
    # print("\n\nTEST 2: RTSP Stream + Inference")
    # print("-" * 80)
    # try:
    #     test_rtsp_stream_with_inference()
    # except Exception as e:
    #     print(f"Test 2 failed: {e}")

    # Test 3: Concurrent inference
    print("\n\nTEST 3: Concurrent Inference with Context Pooling")
    print("-" * 80)
    try:
        test_concurrent_inference()
    except Exception as e:
        print(f"Test 3 failed: {e}")

    print("\n" + "=" * 80)
    print("ALL TESTS COMPLETED")
    print("=" * 80)