remove redundant examples
parent dd57b5a246
commit d3dbf9a580
8 changed files with 0 additions and 1648 deletions
@@ -1,340 +0,0 @@
"""
FPS Benchmark Test for Single vs Multi-Camera Tracking

This script benchmarks the FPS performance of:
1. Single camera tracking
2. Multi-camera tracking (2+ cameras)

Usage:
    python test_fps_benchmark.py
"""

import time
import os
from dotenv import load_dotenv
from services import (
    StreamDecoderFactory,
    TensorRTModelRepository,
    TrackingFactory,
    YOLOv8Utils,
    COCO_CLASSES,
)

load_dotenv()
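# The scripts in this commit read CAMERA_URL_1, CAMERA_URL_2, ... from .env.
# Example .env (hypothetical values, for illustration only):
#   CAMERA_URL_1=rtsp://user:pass@192.168.1.10:554/stream1
#   CAMERA_URL_2=rtsp://user:pass@192.168.1.11:554/stream1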

def benchmark_single_camera(duration=30):
    """
    Benchmark single camera tracking performance.

    Args:
        duration: Test duration in seconds

    Returns:
        Dictionary with FPS statistics
    """
    print("\n" + "=" * 80)
    print("SINGLE CAMERA BENCHMARK")
    print("=" * 80)

    GPU_ID = 0
    MODEL_PATH = "models/yolov8n.trt"
    RTSP_URL = os.getenv('CAMERA_URL_1', 'rtsp://localhost:8554/test')

    # Initialize components
    print("\nInitializing...")
    model_repo = TensorRTModelRepository(gpu_id=GPU_ID, default_num_contexts=4)
    model_repo.load_model("detector", MODEL_PATH, num_contexts=4)

    tracking_factory = TrackingFactory(gpu_id=GPU_ID)
    controller = tracking_factory.create_controller(
        model_repository=model_repo,
        model_id="detector",
        tracker_type="iou",
        max_age=30,
        min_confidence=0.5,
        iou_threshold=0.3,
        class_names=COCO_CLASSES
    )

    stream_factory = StreamDecoderFactory(gpu_id=GPU_ID)
    decoder = stream_factory.create_decoder(RTSP_URL, buffer_size=30)
    decoder.start()

    print("Waiting for stream connection...")
    time.sleep(5)

    if not decoder.is_connected():
        print("⚠ Stream not connected, results may be inaccurate")

    # Benchmark
    print(f"\nRunning benchmark for {duration} seconds...")
    frame_count = 0
    start_time = time.time()

    fps_samples = []
    sample_start = time.time()
    sample_frames = 0

    try:
        while time.time() - start_time < duration:
            frame_gpu = decoder.get_latest_frame(rgb=True)

            if frame_gpu is None:
                time.sleep(0.001)
                continue

            # Run tracking
            tracked_objects = controller.track(
                frame_gpu,
                preprocess_fn=YOLOv8Utils.preprocess,
                postprocess_fn=YOLOv8Utils.postprocess
            )

            frame_count += 1
            sample_frames += 1

            # Sample FPS every second
            if time.time() - sample_start >= 1.0:
                fps = sample_frames / (time.time() - sample_start)
                fps_samples.append(fps)
                sample_frames = 0
                sample_start = time.time()
                print(f"  Current FPS: {fps:.2f}")

    except KeyboardInterrupt:
        print("\nBenchmark interrupted")

    # Calculate statistics
    total_time = time.time() - start_time
    avg_fps = frame_count / total_time

    # Cleanup
    decoder.stop()

    stats = {
        'total_frames': frame_count,
        'total_time': total_time,
        'avg_fps': avg_fps,
        'min_fps': min(fps_samples) if fps_samples else 0,
        'max_fps': max(fps_samples) if fps_samples else 0,
        'samples': fps_samples
    }

    print("\n" + "-" * 80)
    print(f"Total Frames: {stats['total_frames']}")
    print(f"Total Time: {stats['total_time']:.2f} seconds")
    print(f"Average FPS: {stats['avg_fps']:.2f}")
    print(f"Min FPS: {stats['min_fps']:.2f}")
    print(f"Max FPS: {stats['max_fps']:.2f}")
    print("-" * 80)

    return stats


def benchmark_multi_camera(duration=30):
    """
    Benchmark multi-camera tracking performance.

    Args:
        duration: Test duration in seconds

    Returns:
        Dictionary with FPS statistics per camera
    """
    print("\n" + "=" * 80)
    print("MULTI-CAMERA BENCHMARK")
    print("=" * 80)

    GPU_ID = 0
    MODEL_PATH = "models/yolov8n.trt"

    # Load camera URLs
    camera_urls = []
    i = 1
    while True:
        url = os.getenv(f'CAMERA_URL_{i}')
        if url:
            camera_urls.append(url)
            i += 1
        else:
            break

    if len(camera_urls) < 2:
        print("⚠ Need at least 2 cameras for multi-camera test")
        print(f"  Found only {len(camera_urls)} camera(s) in .env")
        return None

    print(f"\nTesting with {len(camera_urls)} cameras")

    # Initialize components
    print("\nInitializing...")
    model_repo = TensorRTModelRepository(gpu_id=GPU_ID, default_num_contexts=8)
    model_repo.load_model("detector", MODEL_PATH, num_contexts=8)

    tracking_factory = TrackingFactory(gpu_id=GPU_ID)
    stream_factory = StreamDecoderFactory(gpu_id=GPU_ID)

    decoders = []
    controllers = []

    for i, url in enumerate(camera_urls):
        # Create decoder
        decoder = stream_factory.create_decoder(url, buffer_size=30)
        decoder.start()
        decoders.append(decoder)

        # Create controller
        controller = tracking_factory.create_controller(
            model_repository=model_repo,
            model_id="detector",
            tracker_type="iou",
            max_age=30,
            min_confidence=0.5,
            iou_threshold=0.3,
            class_names=COCO_CLASSES
        )
        controllers.append(controller)

        print(f"  Camera {i+1}: {url}")

    print("\nWaiting for streams to connect...")
    time.sleep(10)

    # Benchmark
    print(f"\nRunning benchmark for {duration} seconds...")

    frame_counts = [0] * len(camera_urls)
    fps_samples = [[] for _ in camera_urls]
    sample_starts = [time.time()] * len(camera_urls)
    sample_frames = [0] * len(camera_urls)

    start_time = time.time()

    try:
        while time.time() - start_time < duration:
            for i, (decoder, controller) in enumerate(zip(decoders, controllers)):
                frame_gpu = decoder.get_latest_frame(rgb=True)

                if frame_gpu is None:
                    continue

                # Run tracking
                tracked_objects = controller.track(
                    frame_gpu,
                    preprocess_fn=YOLOv8Utils.preprocess,
                    postprocess_fn=YOLOv8Utils.postprocess
                )

                frame_counts[i] += 1
                sample_frames[i] += 1

                # Sample FPS every second
                if time.time() - sample_starts[i] >= 1.0:
                    fps = sample_frames[i] / (time.time() - sample_starts[i])
                    fps_samples[i].append(fps)
                    sample_frames[i] = 0
                    sample_starts[i] = time.time()

    except KeyboardInterrupt:
        print("\nBenchmark interrupted")

    # Calculate statistics
    total_time = time.time() - start_time

    # Cleanup
    for decoder in decoders:
        decoder.stop()

    # Compile results
    results = {}
    total_frames = 0

    print("\n" + "-" * 80)
    for i in range(len(camera_urls)):
        avg_fps = frame_counts[i] / total_time if total_time > 0 else 0
        total_frames += frame_counts[i]

        cam_stats = {
            'total_frames': frame_counts[i],
            'avg_fps': avg_fps,
            'min_fps': min(fps_samples[i]) if fps_samples[i] else 0,
            'max_fps': max(fps_samples[i]) if fps_samples[i] else 0,
        }

        results[f'camera_{i+1}'] = cam_stats

        print(f"Camera {i+1}:")
        print(f"  Total Frames: {cam_stats['total_frames']}")
        print(f"  Average FPS: {cam_stats['avg_fps']:.2f}")
        print(f"  Min FPS: {cam_stats['min_fps']:.2f}")
        print(f"  Max FPS: {cam_stats['max_fps']:.2f}")
        print()

    # Combined stats
    combined_avg_fps = total_frames / total_time if total_time > 0 else 0

    print("-" * 80)
    print("COMBINED:")
    print(f"  Total Frames (all cameras): {total_frames}")
    print(f"  Total Time: {total_time:.2f} seconds")
    print(f"  Combined Throughput: {combined_avg_fps:.2f} FPS")
    print(f"  Per-Camera Average: {combined_avg_fps / len(camera_urls):.2f} FPS")
    print("-" * 80)

    results['combined'] = {
        'total_frames': total_frames,
        'total_time': total_time,
        'combined_fps': combined_avg_fps,
        'per_camera_avg': combined_avg_fps / len(camera_urls)
    }

    return results


def main():
    """Run both benchmarks and compare."""
    print("=" * 80)
    print("FPS BENCHMARK: Single vs Multi-Camera Tracking")
    print("=" * 80)

    # Run single camera benchmark
    single_stats = benchmark_single_camera(duration=30)

    # Run multi-camera benchmark
    multi_stats = benchmark_multi_camera(duration=30)

    # Comparison (guard against a zero-FPS single-camera run, which would
    # otherwise divide by zero below)
    if multi_stats and single_stats['avg_fps'] > 0:
        print("\n" + "=" * 80)
        print("COMPARISON")
        print("=" * 80)

        print("\nSingle Camera Performance:")
        print(f"  Average FPS: {single_stats['avg_fps']:.2f}")

        print("\nMulti-Camera Performance:")
        print(f"  Per-Camera Average: {multi_stats['combined']['per_camera_avg']:.2f} FPS")
        print(f"  Combined Throughput: {multi_stats['combined']['combined_fps']:.2f} FPS")

        # Calculate performance drop
        fps_drop = ((single_stats['avg_fps'] - multi_stats['combined']['per_camera_avg'])
                    / single_stats['avg_fps'] * 100)

        print("\nPerformance Analysis:")
        print(f"  FPS Drop per Camera: {fps_drop:.1f}%")

        if fps_drop < 10:
            print("  ✓ Excellent - Minimal performance impact")
        elif fps_drop < 25:
            print("  ✓ Good - Acceptable performance scaling")
        elif fps_drop < 50:
            print("  ⚠ Moderate - Some performance degradation")
        else:
            print("  ⚠ Significant - Consider optimizations")

        print("=" * 80)


if __name__ == "__main__":
    main()
@@ -1,189 +0,0 @@

import time
import torch
import os
from dotenv import load_dotenv
from services.model_repository import TensorRTModelRepository
from services.stream_decoder import StreamDecoderFactory
import numpy as np

# COCO class names for YOLOv8
COCO_CLASSES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
    'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
    'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
    'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

def postprocess(output, confidence_threshold=0.25, iou_threshold=0.45):
    """
    Post-processes the output of a YOLOv8 model to extract bounding boxes, scores, and class IDs.
    """
    # output shape: (batch_size, 84, 8400)
    # 84 = 4 (bbox) + 80 (classes)

    # Transpose the output to (batch_size, 8400, 84)
    output = output.transpose(1, 2)

    boxes = []
    scores = []
    class_ids = []

    for detection in output[0]:
        # First 4 values are bbox (cx, cy, w, h)
        # The rest are class scores
        class_scores = detection[4:]
        max_score, max_class_id = torch.max(class_scores, 0)

        if max_score > confidence_threshold:
            cx, cy, w, h = detection[:4]

            # Convert from center-width-height to x1-y1-x2-y2
            x1 = cx - w / 2
            y1 = cy - h / 2
            x2 = cx + w / 2
            y2 = cy + h / 2

            boxes.append([x1.item(), y1.item(), x2.item(), y2.item()])
            scores.append(max_score.item())
            class_ids.append(max_class_id.item())

    if not boxes:
        return [], [], []

    # Perform Non-Maximum Suppression (NMS)
    # This is a simplified version. For production, use a library like torchvision.ops.nms
    indices = []
    boxes_np = np.array(boxes)
    scores_np = np.array(scores)

    order = scores_np.argsort()[::-1]

    while order.size > 0:
        i = order[0]
        indices.append(i)

        xx1 = np.maximum(boxes_np[i, 0], boxes_np[order[1:], 0])
        yy1 = np.maximum(boxes_np[i, 1], boxes_np[order[1:], 1])
        xx2 = np.minimum(boxes_np[i, 2], boxes_np[order[1:], 2])
        yy2 = np.minimum(boxes_np[i, 3], boxes_np[order[1:], 3])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h

        # IoU = intersection / (area_i + area_others - intersection)
        ovr = inter / ((boxes_np[i, 2] - boxes_np[i, 0] + 1) * (boxes_np[i, 3] - boxes_np[i, 1] + 1) +
                       (boxes_np[order[1:], 2] - boxes_np[order[1:], 0] + 1) *
                       (boxes_np[order[1:], 3] - boxes_np[order[1:], 1] + 1) - inter)

        inds = np.where(ovr <= iou_threshold)[0]
        order = order[inds + 1]

    final_boxes = [boxes[i] for i in indices]
    final_scores = [scores[i] for i in indices]
    final_class_ids = [class_ids[i] for i in indices]

    return final_boxes, final_scores, final_class_ids
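# The same NMS via torchvision, as the comment above suggests (a sketch;
# assumes torchvision is installed and boxes/scores are kept as tensors
# named boxes_t/scores_t instead of Python lists):
#
#   import torchvision
#   keep = torchvision.ops.nms(boxes_t, scores_t, iou_threshold)  # boxes_t: (N, 4) in xyxy
#   final_boxes = boxes_t[keep]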

def test_rtsp_stream_with_inference():
    """
    Decodes an RTSP stream and runs inference, printing bounding boxes and class names.
    """
    load_dotenv()
    rtsp_url = os.getenv("CAMERA_URL_1")
    if not rtsp_url:
        print("Error: CAMERA_URL_1 not found in .env file.")
        return

    print("=" * 80)
    print("RTSP Stream + TensorRT Inference")
    print("=" * 80)

    # Initialize components
    decoder_factory = StreamDecoderFactory(gpu_id=0)
    model_repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=1)

    # Setup camera stream
    decoder = decoder_factory.create_decoder(rtsp_url, buffer_size=1)
    decoder.start()

    # Load inference model
    model_path = "models/yolov8n.trt"
    try:
        model_repo.load_model(
            model_id="camera_main",
            file_path=model_path
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        print(f"Please ensure '{model_path}' exists.")
        decoder.stop()
        return

    print("\nWaiting for stream to buffer frames...")
    time.sleep(3)

    try:
        while True:
            frame_gpu = decoder.get_latest_frame(rgb=True)

            if frame_gpu is None:
                time.sleep(0.1)
                continue

            # Preprocess frame for YOLOv8:
            # resize to 640x640, normalize to [0, 1], and add a batch dimension
            frame_float = frame_gpu.unsqueeze(0).float()  # interpolate() requires float input
            frame_resized = torch.nn.functional.interpolate(
                frame_float, size=(640, 640), mode='bilinear', align_corners=False
            )
            frame_normalized = frame_resized / 255.0

            # Run inference
            try:
                outputs = model_repo.infer(
                    model_id="camera_main",
                    inputs={"images": frame_normalized},
                    synchronize=True
                )

                # Post-process the output
                output_tensor = outputs['output0']
                boxes, scores, class_ids = postprocess(output_tensor)

                # Print results
                print(f"\n--- Frame at {time.time():.2f} ---")
                if boxes:
                    for box, score, class_id in zip(boxes, scores, class_ids):
                        class_name = COCO_CLASSES[class_id]
                        print(
                            f"  Detected: {class_name} "
                            f"(confidence: {score:.2f}) at "
                            f"bbox: [{box[0]:.0f}, {box[1]:.0f}, {box[2]:.0f}, {box[3]:.0f}]"
                        )
                else:
                    print("  No objects detected.")

            except Exception as e:
                print(f"Inference failed: {e}")

            time.sleep(0.03)  # ~30 FPS

    except KeyboardInterrupt:
        print("\nStopping...")
    finally:
        # Cleanup
        decoder.stop()
        model_repo.unload_model("camera_main")
        print("Stream and model unloaded.")

if __name__ == "__main__":
    test_rtsp_stream_with_inference()
@@ -1,174 +0,0 @@
#!/usr/bin/env python3
"""
Test script for JPEG encoding with nvImageCodec
Tests GPU-accelerated JPEG encoding from RTSP stream frames
"""

import argparse
import sys
import time
import os
from pathlib import Path
from dotenv import load_dotenv
from services import StreamDecoderFactory

# Load environment variables from .env file
load_dotenv()


def main():
    parser = argparse.ArgumentParser(description='Test JPEG encoding from RTSP stream')
    parser.add_argument(
        '--rtsp-url',
        type=str,
        default=None,
        help='RTSP stream URL (defaults to CAMERA_URL_1 from .env)'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='./snapshots',
        help='Output directory for JPEG files'
    )
    parser.add_argument(
        '--num-frames',
        type=int,
        default=10,
        help='Number of frames to capture'
    )
    parser.add_argument(
        '--interval',
        type=float,
        default=1.0,
        help='Interval between captures in seconds'
    )
    parser.add_argument(
        '--quality',
        type=int,
        default=95,
        help='JPEG quality (0-100)'
    )
    parser.add_argument(
        '--gpu-id',
        type=int,
        default=0,
        help='GPU device ID'
    )

    args = parser.parse_args()

    # Get RTSP URL from command line or environment
    rtsp_url = args.rtsp_url
    if not rtsp_url:
        rtsp_url = os.getenv('CAMERA_URL_1')
        if not rtsp_url:
            print("Error: No RTSP URL provided")
            print("Please either:")
            print("  1. Use --rtsp-url argument, or")
            print("  2. Add CAMERA_URL_1 to your .env file")
            sys.exit(1)

    # Create output directory
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("RTSP Stream JPEG Encoding Test")
    print("=" * 80)
    print(f"RTSP URL: {rtsp_url}")
    print(f"Output Directory: {output_dir}")
    print(f"Number of Frames: {args.num_frames}")
    print(f"Capture Interval: {args.interval}s")
    print(f"JPEG Quality: {args.quality}")
    print(f"GPU ID: {args.gpu_id}")
    print("=" * 80)
    print()

    try:
        # Initialize factory and decoder
        print("[1/3] Initializing StreamDecoderFactory...")
        factory = StreamDecoderFactory(gpu_id=args.gpu_id)
        print("✓ Factory initialized\n")

        print("[2/3] Creating and starting decoder...")
        decoder = factory.create_decoder(
            rtsp_url=rtsp_url,
            buffer_size=30
        )
        decoder.start()
        print("✓ Decoder started\n")

        # Wait for connection
        print("[3/3] Waiting for stream to connect...")
        max_wait = 10
        for i in range(max_wait):
            if decoder.is_connected():
                print("✓ Stream connected\n")
                break
            time.sleep(1)
            print(f"  Waiting... {i+1}/{max_wait}s")
        else:
            print("✗ Failed to connect to stream")
            sys.exit(1)

        # Capture frames
        print(f"Capturing {args.num_frames} frames...")
        print("-" * 80)

        captured = 0
        for i in range(args.num_frames):
            # Get frame as JPEG
            start_time = time.time()
            jpeg_bytes = decoder.get_frame_as_jpeg(quality=args.quality)
            encode_time = (time.time() - start_time) * 1000  # ms

            if jpeg_bytes:
                # Save to file
                filename = output_dir / f"frame_{i:04d}.jpg"
                with open(filename, 'wb') as f:
                    f.write(jpeg_bytes)

                size_kb = len(jpeg_bytes) / 1024
                print(f"[{i+1}/{args.num_frames}] Saved {filename.name} "
                      f"({size_kb:.1f} KB, encoded in {encode_time:.2f}ms)")
                captured += 1
            else:
                print(f"[{i+1}/{args.num_frames}] Failed to get frame")

            # Wait before next capture (except for last frame)
            if i < args.num_frames - 1:
                time.sleep(args.interval)

        print("-" * 80)

        # Summary
        print("\n" + "=" * 80)
        print("Capture Complete")
        print("=" * 80)
        print(f"Successfully captured: {captured}/{args.num_frames} frames")
        print(f"Output directory: {output_dir.absolute()}")
        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\n✗ Interrupted by user")
        sys.exit(1)

    except Exception as e:
        print(f"\n\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    finally:
        # Cleanup only; do not sys.exit(0) here, or the exit codes raised in
        # the except blocks above would be silently overridden
        if 'decoder' in locals():
            print("\nCleaning up...")
            decoder.stop()
            print("✓ Decoder stopped")

    print("\n✓ Test completed successfully")


if __name__ == '__main__':
    main()
@@ -1,310 +0,0 @@
"""
Test script for TensorRT Model Repository with multi-camera inference.

This demonstrates:
1. Loading the same model for multiple cameras (deduplication)
2. Context pool load balancing
3. GPU-to-GPU inference from RTSP streams
4. Memory efficiency with shared engines
"""

import time
import torch
from services.model_repository import TensorRTModelRepository
from services.stream_decoder import StreamDecoderFactory


def test_multi_camera_inference():
    """
    Simulate multi-camera inference scenario.

    Example: 100 cameras, all using the same YOLOv8 model
    - Without pooling: 100 engines + 100 contexts in VRAM
    - With pooling: 1 engine + 4 contexts in VRAM (huge savings!)
    """

    # Initialize model repository with context pooling
    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Camera configurations (simulated)
    camera_configs = [
        {"id": "camera_1", "rtsp_url": "rtsp://camera1.local/stream"},
        {"id": "camera_2", "rtsp_url": "rtsp://camera2.local/stream"},
        {"id": "camera_3", "rtsp_url": "rtsp://camera3.local/stream"},
        # ... imagine 100 cameras here
    ]

    # Load the same model for all cameras
    model_file = "models/yolov8n.trt"  # Same file for all cameras

    print("=" * 80)
    print("LOADING MODELS FOR MULTIPLE CAMERAS")
    print("=" * 80)

    for config in camera_configs:
        try:
            # Each camera gets its own model_id, but shares the same engine!
            metadata = repo.load_model(
                model_id=config["id"],
                file_path=model_file,
                num_contexts=4  # 4 contexts shared across all cameras
            )
            print(f"\n✓ Loaded model for {config['id']}")
        except Exception as e:
            print(f"\n✗ Failed to load model for {config['id']}: {e}")

    # Show repository stats
    print("\n" + "=" * 80)
    print("REPOSITORY STATISTICS")
    print("=" * 80)
    stats = repo.get_stats()
    print(f"Total model IDs: {stats['total_model_ids']}")
    print(f"Unique engines in VRAM: {stats['unique_engines']}")
    print(f"Total contexts: {stats['total_contexts']}")
    print(f"Memory efficiency: {stats['memory_efficiency']}")

    # Get detailed info for one camera
    print("\n" + "=" * 80)
    print("DETAILED MODEL INFO (camera_1)")
    print("=" * 80)
    info = repo.get_model_info("camera_1")
    if info:
        print(f"Model ID: {info['model_id']}")
        print(f"File: {info['file_path']}")
        print(f"File hash: {info['file_hash']}")
        print(f"Engine references: {info['engine_references']}")
        print(f"Context pool size: {info['context_pool_size']}")
        print(f"Shared with: {info['shared_with_model_ids']}")
        print("\nInputs:")
        for name, spec in info['inputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")
        print("\nOutputs:")
        for name, spec in info['outputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")

    # Simulate inference from multiple cameras
    print("\n" + "=" * 80)
    print("RUNNING INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Create dummy input tensors (simulating frames from cameras)
    # In a real scenario, these come from StreamDecoder.get_frame()
    batch_size = 1
    channels = 3
    height = 640
    width = 640

    for config in camera_configs:
        try:
            # Simulate getting a frame from the camera (already on GPU)
            input_tensor = torch.rand(
                batch_size, channels, height, width,
                dtype=torch.float32,
                device='cuda:0'
            )

            # Run inference (stays on GPU)
            start = time.time()
            outputs = repo.infer(
                model_id=config["id"],
                inputs={"images": input_tensor},  # Adjust input name based on your model
                synchronize=True,
                timeout=5.0
            )
            elapsed = (time.time() - start) * 1000  # Convert to ms

            print(f"\n{config['id']}: Inference completed in {elapsed:.2f}ms")
            for name, tensor in outputs.items():
                print(f"  Output '{name}': {tensor.shape} on {tensor.device}")

        except Exception as e:
            print(f"\n{config['id']}: Inference failed: {e}")

    # Cleanup
    print("\n" + "=" * 80)
    print("CLEANUP")
    print("=" * 80)

    for config in camera_configs:
        repo.unload_model(config["id"])

    print("\nAll models unloaded.")


def test_rtsp_stream_with_inference():
    """
    Real-world example: Decode RTSP stream and run inference.
    Everything stays in GPU memory (zero CPU transfers).
    """

    print("=" * 80)
    print("RTSP STREAM + TENSORRT INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Initialize components
    decoder_factory = StreamDecoderFactory(gpu_id=0)
    model_repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Setup camera stream
    rtsp_url = "rtsp://your-camera-ip/stream"
    decoder = decoder_factory.create_decoder(rtsp_url, buffer_size=30)
    decoder.start()

    # Load inference model
    try:
        model_repo.load_model(
            model_id="camera_main",
            file_path="models/yolov8n.trt"
        )
    except FileNotFoundError:
        print("\n⚠ Model file not found. Please export your model to TensorRT:")
        print("  Example: yolo export model=yolov8n.pt format=engine device=0")
        return

    print("\nWaiting for stream to buffer frames...")
    time.sleep(3)

    # Process frames
    for i in range(10):
        # Get frame from decoder (already on GPU)
        frame_gpu = decoder.get_latest_frame(rgb=True)  # Returns torch.Tensor on CUDA

        if frame_gpu is None:
            print(f"Frame {i}: No frame available")
            continue

        # Preprocess if needed (stays on GPU)
        # For YOLOv8: normalize, resize, etc.
        # Example preprocessing (adjust for your model):
        frame_gpu = frame_gpu.float() / 255.0  # Normalize to [0, 1]
        frame_gpu = frame_gpu.unsqueeze(0)  # Add batch dimension: (1, 3, H, W)

        # Run inference (GPU-to-GPU, zero copy)
        try:
            outputs = model_repo.infer(
                model_id="camera_main",
                inputs={"images": frame_gpu},
                synchronize=True
            )

            print(f"\nFrame {i}: Inference successful")
            for name, tensor in outputs.items():
                print(f"  {name}: {tensor.shape} on {tensor.device}")

            # Post-process results (can stay on GPU or move to CPU as needed)
            # Example: NMS, bounding box extraction, etc.

        except Exception as e:
            print(f"\nFrame {i}: Inference failed: {e}")

        time.sleep(0.1)  # Simulate processing interval

    # Cleanup
    decoder.stop()
    model_repo.unload_model("camera_main")
    print("\n✓ Test completed successfully")


def test_concurrent_inference():
    """
    Test concurrent inference from multiple threads.
    Demonstrates context pool load balancing.
    """
    import threading

    print("=" * 80)
    print("CONCURRENT INFERENCE TEST (Context Pool Load Balancing)")
    print("=" * 80)

    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Load model
    try:
        repo.load_model("shared_model", "models/yolov8n.trt", num_contexts=4)
    except Exception as e:
        print(f"Failed to load model: {e}")
        return

    def worker(worker_id: int, num_inferences: int):
        """Worker thread performing inference"""
        for i in range(num_inferences):
            try:
                # Create dummy input
                input_tensor = torch.rand(1, 3, 640, 640, device='cuda:0', dtype=torch.float32)

                # Acquire context from pool, run inference, release context
                outputs = repo.infer(
                    model_id="shared_model",
                    inputs={"images": input_tensor},
                    timeout=10.0
                )

                print(f"Worker {worker_id}, Inference {i}: SUCCESS")

            except Exception as e:
                print(f"Worker {worker_id}, Inference {i}: FAILED - {e}")

            time.sleep(0.01)  # Small delay

    # Launch multiple worker threads (more workers than contexts!)
    threads = []
    num_workers = 10  # 10 workers sharing 4 contexts
    inferences_per_worker = 5

    print(f"\nLaunching {num_workers} workers (only 4 contexts available)")
    print("Contexts will be borrowed/returned automatically\n")

    start_time = time.time()

    for worker_id in range(num_workers):
        t = threading.Thread(target=worker, args=(worker_id, inferences_per_worker))
        threads.append(t)
        t.start()

    # Wait for all workers
    for t in threads:
        t.join()

    elapsed = time.time() - start_time
    total_inferences = num_workers * inferences_per_worker

    print(f"\n✓ Completed {total_inferences} inferences in {elapsed:.2f}s")
    print(f"  Throughput: {total_inferences / elapsed:.2f} inferences/sec")
    print(f"  With only 4 contexts for {num_workers} workers!")

    repo.unload_model("shared_model")


if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("TENSORRT MODEL REPOSITORY - TEST SUITE")
    print("=" * 80)

    # Test 1: Multi-camera model loading
    print("\n\nTEST 1: Multi-Camera Model Loading with Deduplication")
    print("-" * 80)
    try:
        test_multi_camera_inference()
    except Exception as e:
        print(f"Test 1 failed: {e}")

    # Test 2: RTSP stream + inference (commented out by default)
    # Uncomment if you have a real RTSP stream
    # print("\n\nTEST 2: RTSP Stream + Inference")
    # print("-" * 80)
    # try:
    #     test_rtsp_stream_with_inference()
    # except Exception as e:
    #     print(f"Test 2 failed: {e}")

    # Test 3: Concurrent inference
    print("\n\nTEST 3: Concurrent Inference with Context Pooling")
    print("-" * 80)
    try:
        test_concurrent_inference()
    except Exception as e:
        print(f"Test 3 failed: {e}")

    print("\n" + "=" * 80)
    print("ALL TESTS COMPLETED")
    print("=" * 80)
@@ -1,255 +0,0 @@
#!/usr/bin/env python3
"""
Multi-stream test script to verify CUDA context sharing efficiency.
Tests multiple RTSP streams simultaneously and monitors VRAM usage.
"""

import argparse
import time
import sys
import subprocess
import os
from pathlib import Path
from dotenv import load_dotenv
from services import StreamDecoderFactory, ConnectionStatus

# Load environment variables from .env file
load_dotenv()


def get_gpu_memory_usage(gpu_id: int = 0) -> int:
    """Get current GPU memory usage in MB using nvidia-smi"""
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,noheader,nounits', f'--id={gpu_id}'],
            capture_output=True,
            text=True,
            check=True
        )
        return int(result.stdout.strip())
    except Exception as e:
        print(f"Warning: Could not get GPU memory usage: {e}")
        return 0


def main():
    parser = argparse.ArgumentParser(description='Test multi-stream decoding with context sharing')
    parser.add_argument(
        '--gpu-id',
        type=int,
        default=0,
        help='GPU device ID'
    )
    parser.add_argument(
        '--duration',
        type=int,
        default=20,
        help='Test duration in seconds'
    )
    parser.add_argument(
        '--capture-snapshots',
        action='store_true',
        help='Capture JPEG snapshots during test'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='./multi_stream_snapshots',
        help='Output directory for snapshots'
    )

    args = parser.parse_args()

    # Load camera URLs from environment
    camera_urls = []
    i = 1
    while True:
        url = os.getenv(f'CAMERA_URL_{i}')
        if url:
            camera_urls.append(url)
            i += 1
        else:
            break

    if not camera_urls:
        print("Error: No camera URLs found in .env file")
        print("Please add CAMERA_URL_1, CAMERA_URL_2, etc. to your .env file")
        sys.exit(1)

    # Create output directory if capturing snapshots
    if args.capture_snapshots:
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("Multi-Stream RTSP Decoder Test - Context Sharing Verification")
    print("=" * 80)
    print(f"Number of Streams: {len(camera_urls)}")
    print(f"GPU ID: {args.gpu_id}")
    print(f"Test Duration: {args.duration} seconds")
    print(f"Capture Snapshots: {args.capture_snapshots}")
    print("=" * 80)
    print()

    try:
        # Get baseline GPU memory
        print("[Baseline] Measuring initial GPU memory usage...")
        baseline_memory = get_gpu_memory_usage(args.gpu_id)
        print(f"✓ Baseline VRAM: {baseline_memory} MB\n")

        # Initialize factory (shared CUDA context)
        print("[1/4] Initializing StreamDecoderFactory with shared CUDA context...")
        factory = StreamDecoderFactory(gpu_id=args.gpu_id)

        factory_memory = get_gpu_memory_usage(args.gpu_id)
        factory_overhead = factory_memory - baseline_memory
        print("✓ Factory initialized")
        print(f"  VRAM after factory: {factory_memory} MB (+{factory_overhead} MB)\n")

        # Create all decoders
        print(f"[2/4] Creating {len(camera_urls)} StreamDecoder instances...")
        decoders = []
        for i, url in enumerate(camera_urls):
            decoder = factory.create_decoder(
                rtsp_url=url,
                buffer_size=30,
                codec='h264'
            )
            decoders.append(decoder)
            # split('@')[-1] keeps this from crashing on URLs without credentials
            host = url.split('@')[-1].split('/')[0]
            print(f"  ✓ Decoder {i+1} created for camera {host}")

        decoders_memory = get_gpu_memory_usage(args.gpu_id)
        decoders_overhead = decoders_memory - factory_memory
        print(f"\n  VRAM after creating {len(decoders)} decoders: {decoders_memory} MB (+{decoders_overhead} MB)")
        print(f"  Average per decoder: {decoders_overhead / len(decoders):.1f} MB\n")

        # Start all decoders
        print(f"[3/4] Starting all {len(decoders)} decoders...")
        for i, decoder in enumerate(decoders):
            decoder.start()
            print(f"  ✓ Decoder {i+1} started")

        started_memory = get_gpu_memory_usage(args.gpu_id)
        started_overhead = started_memory - decoders_memory
        print(f"\n  VRAM after starting decoders: {started_memory} MB (+{started_overhead} MB)")
        print(f"  Average per running decoder: {started_overhead / len(decoders):.1f} MB\n")

        # Wait for all streams to connect
        print("[4/4] Waiting for all streams to connect...")
        max_wait = 15
        for wait_time in range(max_wait):
            connected = sum(1 for d in decoders if d.is_connected())
            print(f"  Connected: {connected}/{len(decoders)} streams", end='\r')

            if connected == len(decoders):
                print(f"\n✓ All {len(decoders)} streams connected!\n")
                break

            time.sleep(1)
        else:
            connected = sum(1 for d in decoders if d.is_connected())
            print(f"\n⚠ Only {connected}/{len(decoders)} streams connected after {max_wait}s\n")

        connected_memory = get_gpu_memory_usage(args.gpu_id)
        connected_overhead = connected_memory - started_memory
        print(f"  VRAM after connection: {connected_memory} MB (+{connected_overhead} MB)\n")

        # Monitor streams (columns are built from the actual stream count
        # instead of assuming exactly 4 streams, which raised IndexError)
        print(f"Monitoring streams for {args.duration} seconds...")
        print("=" * 80)
        header = f"{'Time':<8} {'VRAM':<10} " + " ".join(f"{f'Stream {i+1}':<12}" for i in range(len(decoders)))
        print(header)
        print("-" * 80)

        start_time = time.time()
        snapshot_interval = args.duration // 3 if args.capture_snapshots else 0
        last_snapshot = 0

        while time.time() - start_time < args.duration:
            elapsed = time.time() - start_time
            current_memory = get_gpu_memory_usage(args.gpu_id)

            # Get stats for each decoder
            stats = []
            for decoder in decoders:
                status = decoder.get_status().value[:8]
                buffer = decoder.get_buffer_size()
                frames = decoder.frame_count
                stats.append(f"{status:8s} {buffer:2d}/30 {frames:4d}")

            print(f"{elapsed:6.1f}s  {current_memory:6d}MB  " + " ".join(f"{s:<12}" for s in stats))

            # Capture snapshots
            if args.capture_snapshots and snapshot_interval > 0:
                if elapsed - last_snapshot >= snapshot_interval:
                    print("\n  → Capturing snapshots from all streams...")
                    for i, decoder in enumerate(decoders):
                        jpeg_bytes = decoder.get_frame_as_jpeg(quality=85)
                        if jpeg_bytes:
                            filename = output_dir / f"camera_{i+1}_t{int(elapsed)}s.jpg"
                            with open(filename, 'wb') as f:
                                f.write(jpeg_bytes)
                            print(f"    Saved {filename.name} ({len(jpeg_bytes)/1024:.1f} KB)")
                    print()
                    last_snapshot = elapsed

            time.sleep(1)

        print("=" * 80)

        # Final memory analysis
        final_memory = get_gpu_memory_usage(args.gpu_id)
        total_overhead = final_memory - baseline_memory

        print("\n" + "=" * 80)
        print("Memory Usage Analysis")
        print("=" * 80)
        print(f"Baseline VRAM:                  {baseline_memory:6d} MB")
        print(f"After Factory Init:             {factory_memory:6d} MB (+{factory_overhead:4d} MB)")
        print(f"After Creating {len(decoders)} Decoders:     {decoders_memory:6d} MB (+{decoders_overhead:4d} MB)")
        print(f"After Starting Decoders:        {started_memory:6d} MB (+{started_overhead:4d} MB)")
        print(f"After Connection:               {connected_memory:6d} MB (+{connected_overhead:4d} MB)")
        print(f"Final (after {args.duration}s):          {final_memory:6d} MB (+{total_overhead:4d} MB total)")
        print("-" * 80)
        print(f"Average VRAM per stream: {total_overhead / len(decoders):6.1f} MB")
        print(f"Context sharing efficiency: {'EXCELLENT' if total_overhead < 500 else 'GOOD' if total_overhead < 800 else 'POOR'}")
        print("=" * 80)

        # Final stats
        print("\nFinal Stream Statistics:")
        print("-" * 80)
        for i, decoder in enumerate(decoders):
            status = decoder.get_status().value
            buffer = decoder.get_buffer_size()
            frames = decoder.frame_count
            fps = frames / args.duration if args.duration > 0 else 0
            print(f"Stream {i+1}: {status:12s} | Buffer: {buffer:2d}/{decoder.buffer_size} | "
                  f"Frames: {frames:5d} | Avg FPS: {fps:5.2f}")
        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\n✗ Interrupted by user")
        sys.exit(1)

    except Exception as e:
        print(f"\n\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    finally:
        # Cleanup only; exiting here would mask the failure exit codes above
        if 'decoders' in locals():
            print("\nCleaning up...")
            for i, decoder in enumerate(decoders):
                decoder.stop()
                print(f"  ✓ Decoder {i+1} stopped")

            cleanup_memory = get_gpu_memory_usage(args.gpu_id)
            print(f"\nVRAM after cleanup: {cleanup_memory} MB")

    print("\n✓ Multi-stream test completed successfully")


if __name__ == '__main__':
    main()
test_stream.py
@@ -1,152 +0,0 @@
#!/usr/bin/env python3
"""
CLI test script for StreamDecoder
Tests RTSP stream decoding with NVDEC hardware acceleration
"""

import argparse
import time
import sys
from services.stream_decoder import StreamDecoderFactory, ConnectionStatus


def main():
    parser = argparse.ArgumentParser(description='Test RTSP stream decoder with NVDEC')
    parser.add_argument(
        '--rtsp-url',
        type=str,
        required=True,
        help='RTSP stream URL (e.g., rtsp://user:pass@host/path)'
    )
    parser.add_argument(
        '--gpu-id',
        type=int,
        default=0,
        help='GPU device ID'
    )
    parser.add_argument(
        '--buffer-size',
        type=int,
        default=30,
        help='Frame buffer size'
    )
    parser.add_argument(
        '--duration',
        type=int,
        default=30,
        help='Test duration in seconds'
    )
    parser.add_argument(
        '--check-interval',
        type=float,
        default=1.0,
        help='Status check interval in seconds'
    )

    args = parser.parse_args()

    print("=" * 80)
    print("RTSP Stream Decoder Test")
    print("=" * 80)
    print(f"RTSP URL: {args.rtsp_url}")
    print(f"GPU ID: {args.gpu_id}")
    print(f"Buffer Size: {args.buffer_size} frames")
    print(f"Test Duration: {args.duration} seconds")
    print("=" * 80)
    print()

    try:
        # Create factory with shared CUDA context
        print("[1/4] Initializing StreamDecoderFactory...")
        factory = StreamDecoderFactory(gpu_id=args.gpu_id)
        print("✓ Factory initialized with shared CUDA context\n")

        # Create decoder
        print("[2/4] Creating StreamDecoder...")
        decoder = factory.create_decoder(
            rtsp_url=args.rtsp_url,
            buffer_size=args.buffer_size,
            codec='h264'
        )
        print(f"✓ Decoder created: {decoder}\n")

        # Start decoding
        print("[3/4] Starting decoder thread...")
        decoder.start()
        print("✓ Decoder thread started\n")

        # Monitor for specified duration
        print(f"[4/4] Monitoring stream for {args.duration} seconds...")
        print("-" * 80)

        start_time = time.time()
        last_frame_count = 0

        while time.time() - start_time < args.duration:
            time.sleep(args.check_interval)

            # Get status
            status = decoder.get_status()
            buffer_size = decoder.get_buffer_size()
            frame_count = decoder.frame_count
            fps = (frame_count - last_frame_count) / args.check_interval
            last_frame_count = frame_count

            # Print status
            elapsed = time.time() - start_time
            print(f"[{elapsed:6.1f}s] Status: {status.value:12s} | "
                  f"Buffer: {buffer_size:2d}/{args.buffer_size:2d} | "
                  f"Frames: {frame_count:5d} | "
                  f"FPS: {fps:5.1f}")

            # Try to get latest frame
            if status == ConnectionStatus.CONNECTED:
                frame = decoder.get_latest_frame()
                if frame is not None:
                    print(f"  Frame shape: {frame.shape}, dtype: {frame.dtype}, "
                          f"device: {frame.device}")

            # Check for errors
            if status == ConnectionStatus.ERROR:
                print("\n✗ ERROR: Stream connection failed!")
                break

        print("-" * 80)

        # Final statistics
        print("\n" + "=" * 80)
        print("Test Complete - Final Statistics")
        print("=" * 80)
        print(f"Total Frames Decoded: {decoder.frame_count}")
        print(f"Average FPS: {decoder.frame_count / args.duration:.2f}")
        print(f"Final Status: {decoder.get_status().value}")
        print(f"Buffer Utilization: {decoder.get_buffer_size()}/{args.buffer_size}")

        if decoder.frame_width and decoder.frame_height:
            print(f"Frame Resolution: {decoder.frame_width}x{decoder.frame_height}")

        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\n✗ Interrupted by user")
        sys.exit(1)

    except Exception as e:
        print(f"\n\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    finally:
        # Cleanup only; do not exit here, or the failure exit codes above
        # would be overridden
        if 'decoder' in locals():
            print("\nCleaning up...")
            decoder.stop()
            print("✓ Decoder stopped")

    print("\n✓ Test completed successfully")


if __name__ == '__main__':
    main()
@@ -1,143 +0,0 @@
#!/usr/bin/env python3
"""
VRAM scaling test - measures Python process memory usage for 1, 2, 3, and 4 streams.
"""

import os
import time
import subprocess
from dotenv import load_dotenv
from services import StreamDecoderFactory

# Load environment variables from .env file
load_dotenv()

# Load camera URLs from environment
camera_urls = []
i = 1
while True:
    url = os.getenv(f'CAMERA_URL_{i}')
    if url:
        camera_urls.append(url)
        i += 1
    else:
        break

if not camera_urls:
    print("Error: No camera URLs found in .env file")
    print("Please add CAMERA_URL_1, CAMERA_URL_2, etc. to your .env file")
    exit(1)


def get_python_gpu_memory():
    """Get Python process GPU memory usage in MB"""
    try:
        pid = os.getpid()
        result = subprocess.run(
            ['nvidia-smi', '--query-compute-apps=pid,used_memory', '--format=csv,noheader,nounits'],
            capture_output=True, text=True, check=True
        )
        for line in result.stdout.strip().split('\n'):
            if line:
                parts = line.split(',')
                if len(parts) >= 2 and int(parts[0].strip()) == pid:
                    return int(parts[1].strip())
        return 0
    except Exception:
        return 0


def test_n_streams(n, wait_time=15):
    """Test with n streams"""
    print(f"\n{'='*80}")
    print(f"Testing with {n} stream(s)")
    print('='*80)

    mem_before = get_python_gpu_memory()
    print(f"Python process VRAM before: {mem_before} MB")

    # Create factory
    factory = StreamDecoderFactory(gpu_id=0)
    time.sleep(1)
    mem_after_factory = get_python_gpu_memory()
    print(f"After factory: {mem_after_factory} MB (+{mem_after_factory - mem_before} MB)")

    # Create decoders
    decoders = []
    for i in range(n):
        decoder = factory.create_decoder(camera_urls[i], buffer_size=30)
        decoders.append(decoder)

    time.sleep(1)
    mem_after_create = get_python_gpu_memory()
    print(f"After creating {n} decoder(s): {mem_after_create} MB (+{mem_after_create - mem_after_factory} MB)")

    # Start decoders
    for decoder in decoders:
        decoder.start()

    time.sleep(2)
    mem_after_start = get_python_gpu_memory()
    print(f"After starting {n} decoder(s): {mem_after_start} MB (+{mem_after_start - mem_after_create} MB)")

    # Wait for connection
    print(f"Waiting {wait_time}s for streams to connect and stabilize...")
    time.sleep(wait_time)

    # Check connection status
    connected = sum(1 for d in decoders if d.is_connected())
    mem_stable = get_python_gpu_memory()

    print(f"Connected: {connected}/{n} streams")
    print(f"Python process VRAM (stable): {mem_stable} MB")

    # Get frame stats
    for i, decoder in enumerate(decoders):
        print(f"  Stream {i+1}: {decoder.get_status().value:10s} "
              f"Buffer: {decoder.get_buffer_size()}/30 "
              f"Frames: {decoder.frame_count}")

    # Cleanup
    for decoder in decoders:
        decoder.stop()

    time.sleep(2)
    mem_after_cleanup = get_python_gpu_memory()
    print(f"After cleanup: {mem_after_cleanup} MB")

    return mem_stable


if __name__ == '__main__':
    print("Python VRAM Scaling Test")
    print(f"PID: {os.getpid()}")

    baseline = get_python_gpu_memory()
    print(f"Baseline Python process VRAM: {baseline} MB\n")

    # Only test stream counts we actually have URLs for (avoids an IndexError
    # when .env defines fewer than 4 cameras)
    stream_counts = [n for n in (1, 2, 3, 4) if n <= len(camera_urls)]

    results = {}
    for n in stream_counts:
        mem = test_n_streams(n, wait_time=15)
        results[n] = mem
        print(f"\n→ {n} stream(s): {mem} MB (process total)")

        # Give time between tests
        if n < stream_counts[-1]:
            print("\nWaiting 5s before next test...")
            time.sleep(5)

    # Summary
    print("\n" + "="*80)
    print("Python Process VRAM Scaling Summary")
    print("="*80)
    print(f"Baseline: {baseline:4d} MB")
    for n in stream_counts:
        total = results[n]
        overhead = total - baseline
        per_stream = overhead / n if n > 0 else 0
        print(f"{n} stream(s): {total:4d} MB (+{overhead:3d} MB total, {per_stream:5.1f} MB per stream)")

    # Calculate marginal cost
    print("\nMarginal cost per additional stream:")
    for n in stream_counts[1:]:
        marginal = results[n] - results[n - 1]
        print(f"  Stream {n}: +{marginal} MB")

    print("="*80)
@@ -1,85 +0,0 @@
#!/usr/bin/env python3
"""
Quick verification script for TensorRT model
"""

import torch
from services.model_repository import TensorRTModelRepository


def verify_model():
    print("=" * 80)
    print("TensorRT Model Verification")
    print("=" * 80)

    # Initialize repository
    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=2)

    # Load the model
    print("\nLoading YOLOv8n TensorRT engine...")
    try:
        metadata = repo.load_model(
            model_id="yolov8n_test",
            file_path="models/yolov8n.trt",
            num_contexts=2
        )
        print("✓ Model loaded successfully!")
    except Exception as e:
        print(f"✗ Failed to load model: {e}")
        return

    # Get model info
    print("\n" + "=" * 80)
    print("Model Information")
    print("=" * 80)
    info = repo.get_model_info("yolov8n_test")
    if info:
        print(f"Model ID: {info['model_id']}")
        print(f"File: {info['file_path']}")
        print(f"File hash: {info['file_hash']}")
        print("\nInputs:")
        for name, spec in info['inputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")
        print("\nOutputs:")
        for name, spec in info['outputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")

    # Run test inference
    print("\n" + "=" * 80)
    print("Running Test Inference")
    print("=" * 80)

    try:
        # Create dummy input (simulating a 640x640 image)
        input_tensor = torch.rand(1, 3, 640, 640, dtype=torch.float32, device='cuda:0')
        print(f"Input tensor: {input_tensor.shape} on {input_tensor.device}")

        # Run inference
        outputs = repo.infer(
            model_id="yolov8n_test",
            inputs={"images": input_tensor},
            synchronize=True
        )

        print("\n✓ Inference successful!")
        print("\nOutputs:")
        for name, tensor in outputs.items():
            print(f"  {name}: {tensor.shape} on {tensor.device} ({tensor.dtype})")

    except Exception as e:
        print(f"\n✗ Inference failed: {e}")
        import traceback
        traceback.print_exc()

    # Cleanup
    print("\n" + "=" * 80)
    print("Cleanup")
    print("=" * 80)
    repo.unload_model("yolov8n_test")
    print("✓ Model unloaded")

    print("\n" + "=" * 80)
    print("Verification Complete!")
    print("=" * 80)


if __name__ == "__main__":
    verify_model()