"""
Test script for TensorRT Model Repository with multi-camera inference.

This demonstrates:
1. Loading the same model for multiple cameras (deduplication)
2. Context pool load balancing
3. GPU-to-GPU inference from RTSP streams
4. Memory efficiency with shared engines
"""

import time
import torch
from services.model_repository import TensorRTModelRepository
from services.stream_decoder import StreamDecoderFactory


def test_multi_camera_inference():
    """
    Simulate multi-camera inference scenario.

    Example: 100 cameras, all using the same YOLOv8 model
    - Without pooling: 100 engines + 100 contexts in VRAM
    - With pooling: 1 engine + 4 contexts in VRAM (huge savings!)
    """

    # Initialize model repository with context pooling
    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Camera configurations (simulated)
    camera_configs = [
        {"id": "camera_1", "rtsp_url": "rtsp://camera1.local/stream"},
        {"id": "camera_2", "rtsp_url": "rtsp://camera2.local/stream"},
        {"id": "camera_3", "rtsp_url": "rtsp://camera3.local/stream"},
        # ... imagine 100 cameras here
    ]

    # Load the same model for all cameras
    model_file = "models/yolov8n.trt"  # Same file for all cameras

    print("=" * 80)
    print("LOADING MODELS FOR MULTIPLE CAMERAS")
    print("=" * 80)

    for config in camera_configs:
        try:
            # Each camera gets its own model_id, but shares the same engine!
            metadata = repo.load_model(
                model_id=config["id"],
                file_path=model_file,
                num_contexts=4  # 4 contexts shared across all cameras
            )
            print(f"\n✓ Loaded model for {config['id']}")
        except Exception as e:
            print(f"\n✗ Failed to load model for {config['id']}: {e}")

    # Show repository stats
    print("\n" + "=" * 80)
    print("REPOSITORY STATISTICS")
    print("=" * 80)
    stats = repo.get_stats()
    print(f"Total model IDs: {stats['total_model_ids']}")
    print(f"Unique engines in VRAM: {stats['unique_engines']}")
    print(f"Total contexts: {stats['total_contexts']}")
    print(f"Memory efficiency: {stats['memory_efficiency']}")

    # Get detailed info for one camera
    print("\n" + "=" * 80)
    print("DETAILED MODEL INFO (camera_1)")
    print("=" * 80)
    info = repo.get_model_info("camera_1")
    if info:
        print(f"Model ID: {info['model_id']}")
        print(f"File: {info['file_path']}")
        print(f"File hash: {info['file_hash']}")
        print(f"Engine references: {info['engine_references']}")
        print(f"Context pool size: {info['context_pool_size']}")
        print(f"Shared with: {info['shared_with_model_ids']}")
        print(f"\nInputs:")
        for name, spec in info['inputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")
        print(f"\nOutputs:")
        for name, spec in info['outputs'].items():
            print(f"  {name}: {spec['shape']} ({spec['dtype']})")

    # Simulate inference from multiple cameras
    print("\n" + "=" * 80)
    print("RUNNING INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Create dummy input tensors (simulating frames from cameras)
    # In real scenario, these come from StreamDecoder.get_frame()
    batch_size = 1
    channels = 3
    height = 640
    width = 640

    for config in camera_configs:
        try:
            # Simulate getting frame from camera (already on GPU)
            input_tensor = torch.rand(
                batch_size, channels, height, width,
                dtype=torch.float32,
                device='cuda:0'
            )

            # Run inference (stays in GPU)
            start = time.time()
            outputs = repo.infer(
                model_id=config["id"],
                inputs={"images": input_tensor},  # Adjust input name based on your model
                synchronize=True,
                timeout=5.0
            )
            elapsed = (time.time() - start) * 1000  # Convert to ms

            print(f"\n{config['id']}: Inference completed in {elapsed:.2f}ms")
            for name, tensor in outputs.items():
                print(f"  Output '{name}': {tensor.shape} on {tensor.device}")

        except Exception as e:
            print(f"\n{config['id']}: Inference failed: {e}")

    # Cleanup
    print("\n" + "=" * 80)
    print("CLEANUP")
    print("=" * 80)

    for config in camera_configs:
        repo.unload_model(config["id"])

    print("\nAll models unloaded.")


def test_rtsp_stream_with_inference():
    """
    Real-world example: Decode RTSP stream and run inference.
    Everything stays in GPU memory (zero CPU transfers).
    """

    print("=" * 80)
    print("RTSP STREAM + TENSORRT INFERENCE (GPU-to-GPU)")
    print("=" * 80)

    # Initialize components
    decoder_factory = StreamDecoderFactory(gpu_id=0)
    model_repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Setup camera stream
    rtsp_url = "rtsp://your-camera-ip/stream"
    decoder = decoder_factory.create_decoder(rtsp_url, buffer_size=30)
    decoder.start()

    # Load inference model
    try:
        model_repo.load_model(
            model_id="camera_main",
            file_path="models/yolov8n.trt"
        )
    except FileNotFoundError:
        print("\n⚠ Model file not found. Please export your model to TensorRT:")
        print("   Example: yolo export model=yolov8n.pt format=engine device=0")
        return

    print("\nWaiting for stream to buffer frames...")
    time.sleep(3)

    # Process frames
    for i in range(10):
        # Get frame from decoder (already on GPU)
        frame_gpu = decoder.get_latest_frame(rgb=True)  # Returns torch.Tensor on CUDA

        if frame_gpu is None:
            print(f"Frame {i}: No frame available")
            continue

        # Preprocess if needed (stays on GPU)
        # For YOLOv8: normalize, resize, etc.
        # Example preprocessing (adjust for your model):
        frame_gpu = frame_gpu.float() / 255.0  # Normalize to [0, 1]
        frame_gpu = frame_gpu.unsqueeze(0)  # Add batch dimension: (1, 3, H, W)

        # Run inference (GPU-to-GPU, zero copy)
        try:
            outputs = model_repo.infer(
                model_id="camera_main",
                inputs={"images": frame_gpu},
                synchronize=True
            )

            print(f"\nFrame {i}: Inference successful")
            for name, tensor in outputs.items():
                print(f"  {name}: {tensor.shape} on {tensor.device}")

            # Post-process results (can stay on GPU or move to CPU as needed)
            # Example: NMS, bounding box extraction, etc.

        except Exception as e:
            print(f"\nFrame {i}: Inference failed: {e}")

        time.sleep(0.1)  # Simulate processing interval

    # Cleanup
    decoder.stop()
    model_repo.unload_model("camera_main")
    print("\n✓ Test completed successfully")


def test_concurrent_inference():
    """
    Test concurrent inference from multiple threads.
    Demonstrates context pool load balancing.
    """
    import threading

    print("=" * 80)
    print("CONCURRENT INFERENCE TEST (Context Pool Load Balancing)")
    print("=" * 80)

    repo = TensorRTModelRepository(gpu_id=0, default_num_contexts=4)

    # Load model
    try:
        repo.load_model("shared_model", "models/yolov8n.trt", num_contexts=4)
    except Exception as e:
        print(f"Failed to load model: {e}")
        return

    def worker(worker_id: int, num_inferences: int):
        """Worker thread performing inference"""
        for i in range(num_inferences):
            try:
                # Create dummy input
                input_tensor = torch.rand(1, 3, 640, 640, device='cuda:0', dtype=torch.float32)

                # Acquire context from pool, run inference, release context
                outputs = repo.infer(
                    model_id="shared_model",
                    inputs={"images": input_tensor},
                    timeout=10.0
                )

                print(f"Worker {worker_id}, Inference {i}: SUCCESS")

            except Exception as e:
                print(f"Worker {worker_id}, Inference {i}: FAILED - {e}")

            time.sleep(0.01)  # Small delay

    # Launch multiple worker threads (more workers than contexts!)
    threads = []
    num_workers = 10  # 10 workers sharing 4 contexts
    inferences_per_worker = 5

    print(f"\nLaunching {num_workers} workers (only 4 contexts available)")
    print("Contexts will be borrowed/returned automatically\n")

    start_time = time.time()

    for worker_id in range(num_workers):
        t = threading.Thread(target=worker, args=(worker_id, inferences_per_worker))
        threads.append(t)
        t.start()

    # Wait for all workers
    for t in threads:
        t.join()

    elapsed = time.time() - start_time
    total_inferences = num_workers * inferences_per_worker

    print(f"\n✓ Completed {total_inferences} inferences in {elapsed:.2f}s")
    print(f"  Throughput: {total_inferences / elapsed:.2f} inferences/sec")
    print(f"  With only 4 contexts for {num_workers} workers!")

    repo.unload_model("shared_model")


if __name__ == "__main__":
    print("\n" + "=" * 80)
    print("TENSORRT MODEL REPOSITORY - TEST SUITE")
    print("=" * 80)

    # Test 1: Multi-camera model loading
    print("\n\nTEST 1: Multi-Camera Model Loading with Deduplication")
    print("-" * 80)
    try:
        test_multi_camera_inference()
    except Exception as e:
        print(f"Test 1 failed: {e}")

    # Test 2: RTSP stream + inference (commented out by default)
    # Uncomment if you have a real RTSP stream
    # print("\n\nTEST 2: RTSP Stream + Inference")
    # print("-" * 80)
    # try:
    #     test_rtsp_stream_with_inference()
    # except Exception as e:
    #     print(f"Test 2 failed: {e}")

    # Test 3: Concurrent inference
    print("\n\nTEST 3: Concurrent Inference with Context Pooling")
    print("-" * 80)
    try:
        test_concurrent_inference()
    except Exception as e:
        print(f"Test 3 failed: {e}")

    print("\n" + "=" * 80)
    print("ALL TESTS COMPLETED")
    print("=" * 80)