feat: add inference subsystem and decoder optimizations

This commit is contained in:
Siwat Sirichai 2025-11-09 00:57:08 +07:00
commit 3c83a57e44
19 changed files with 3897 additions and 0 deletions

255
test_multi_stream.py Executable file
View file

@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""
Multi-stream test script to verify CUDA context sharing efficiency.
Tests multiple RTSP streams simultaneously and monitors VRAM usage.
"""
import argparse
import time
import sys
import subprocess
import os
from pathlib import Path
from dotenv import load_dotenv
from services import StreamDecoderFactory, ConnectionStatus
# Load environment variables from .env file
load_dotenv()
def get_gpu_memory_usage(gpu_id: int = 0) -> int:
    """Return the current VRAM usage of GPU ``gpu_id`` in MB via ``nvidia-smi``.

    Best-effort: returns 0 (after printing a warning) when ``nvidia-smi`` is
    missing, exits non-zero, or produces unparsable output, so callers never
    crash on a machine without an NVIDIA driver.
    """
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=memory.used',
             '--format=csv,noheader,nounits', f'--id={gpu_id}'],
            capture_output=True,
            text=True,
            check=True,
        )
        # Parse only the first line: some nvidia-smi builds append extra
        # output, which would make int() on the whole stdout blow up.
        return int(result.stdout.strip().splitlines()[0])
    except Exception as e:
        # Deliberate broad catch: this probe must never take the test down.
        print(f"Warning: Could not get GPU memory usage: {e}")
        return 0
def main():
    """Drive the multi-stream CUDA context-sharing test.

    Reads ``CAMERA_URL_1``, ``CAMERA_URL_2``, ... from the environment,
    creates one decoder per camera on a shared CUDA context, measures VRAM
    at each lifecycle stage (factory init, decoder creation, start,
    connection), monitors the streams for the requested duration (optionally
    saving JPEG snapshots), then prints a memory-efficiency summary and
    cleans up.

    Exits with status 0 on success, 1 on error or keyboard interrupt.
    """
    parser = argparse.ArgumentParser(description='Test multi-stream decoding with context sharing')
    parser.add_argument(
        '--gpu-id',
        type=int,
        default=0,
        help='GPU device ID'
    )
    parser.add_argument(
        '--duration',
        type=int,
        default=20,
        help='Test duration in seconds'
    )
    parser.add_argument(
        '--capture-snapshots',
        action='store_true',
        help='Capture JPEG snapshots during test'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='./multi_stream_snapshots',
        help='Output directory for snapshots'
    )
    args = parser.parse_args()

    # Collect CAMERA_URL_1, CAMERA_URL_2, ... stopping at the first gap.
    camera_urls = []
    i = 1
    while True:
        url = os.getenv(f'CAMERA_URL_{i}')
        if not url:
            break
        camera_urls.append(url)
        i += 1

    if not camera_urls:
        print("Error: No camera URLs found in .env file")
        print("Please add CAMERA_URL_1, CAMERA_URL_2, etc. to your .env file")
        sys.exit(1)

    # Create output directory if capturing snapshots
    if args.capture_snapshots:
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("Multi-Stream RTSP Decoder Test - Context Sharing Verification")
    print("=" * 80)
    print(f"Number of Streams: {len(camera_urls)}")
    print(f"GPU ID: {args.gpu_id}")
    print(f"Test Duration: {args.duration} seconds")
    print(f"Capture Snapshots: {args.capture_snapshots}")
    print("=" * 80)
    print()

    decoders = []   # pre-bound so the finally block can always reference it
    exit_code = 0
    try:
        # Get baseline GPU memory
        print("[Baseline] Measuring initial GPU memory usage...")
        baseline_memory = get_gpu_memory_usage(args.gpu_id)
        print(f"✓ Baseline VRAM: {baseline_memory} MB\n")

        # Initialize factory (shared CUDA context)
        print("[1/4] Initializing StreamDecoderFactory with shared CUDA context...")
        factory = StreamDecoderFactory(gpu_id=args.gpu_id)
        factory_memory = get_gpu_memory_usage(args.gpu_id)
        factory_overhead = factory_memory - baseline_memory
        print(f"✓ Factory initialized")
        print(f"  VRAM after factory: {factory_memory} MB (+{factory_overhead} MB)\n")

        # Create all decoders
        print(f"[2/4] Creating {len(camera_urls)} StreamDecoder instances...")
        for i, url in enumerate(camera_urls):
            decoder = factory.create_decoder(
                rtsp_url=url,
                buffer_size=30,
                codec='h264'
            )
            decoders.append(decoder)
            # Show only the host part when credentials are embedded in the
            # URL; fall back to the full URL so a credential-less URL
            # cannot raise IndexError here.
            host = url.split('@')[1].split('/')[0] if '@' in url else url
            print(f"  ✓ Decoder {i+1} created for camera {host}")

        decoders_memory = get_gpu_memory_usage(args.gpu_id)
        decoders_overhead = decoders_memory - factory_memory
        print(f"\n  VRAM after creating {len(decoders)} decoders: {decoders_memory} MB (+{decoders_overhead} MB)")
        print(f"  Average per decoder: {decoders_overhead / len(decoders):.1f} MB\n")

        # Start all decoders
        print(f"[3/4] Starting all {len(decoders)} decoders...")
        for i, decoder in enumerate(decoders):
            decoder.start()
            print(f"  ✓ Decoder {i+1} started")

        started_memory = get_gpu_memory_usage(args.gpu_id)
        started_overhead = started_memory - decoders_memory
        print(f"\n  VRAM after starting decoders: {started_memory} MB (+{started_overhead} MB)")
        print(f"  Average per running decoder: {started_overhead / len(decoders):.1f} MB\n")

        # Wait for all streams to connect (for/else: the else branch runs
        # only when the loop exhausts without a full-connection break).
        print("[4/4] Waiting for all streams to connect...")
        max_wait = 15
        for wait_time in range(max_wait):
            connected = sum(1 for d in decoders if d.is_connected())
            print(f"  Connected: {connected}/{len(decoders)} streams", end='\r')
            if connected == len(decoders):
                print(f"\n✓ All {len(decoders)} streams connected!\n")
                break
            time.sleep(1)
        else:
            connected = sum(1 for d in decoders if d.is_connected())
            print(f"\n⚠ Only {connected}/{len(decoders)} streams connected after {max_wait}s\n")

        connected_memory = get_gpu_memory_usage(args.gpu_id)
        connected_overhead = connected_memory - started_memory
        print(f"  VRAM after connection: {connected_memory} MB (+{connected_overhead} MB)\n")

        # Monitor streams. Header and rows are sized to the actual stream
        # count: the previous version hard-coded four columns and raised
        # IndexError for any other number of cameras.
        print(f"Monitoring streams for {args.duration} seconds...")
        print("=" * 80)
        header = f"{'Time':<8} {'VRAM':<10} " + " ".join(
            f"{f'Stream {i+1}':<12}" for i in range(len(decoders)))
        print(header)
        print("-" * 80)

        start_time = time.time()
        snapshot_interval = args.duration // 3 if args.capture_snapshots else 0
        last_snapshot = 0

        while time.time() - start_time < args.duration:
            elapsed = time.time() - start_time
            current_memory = get_gpu_memory_usage(args.gpu_id)

            # Get stats for each decoder
            stats = []
            for decoder in decoders:
                status = decoder.get_status().value[:8]
                buffer = decoder.get_buffer_size()
                frames = decoder.frame_count
                stats.append(f"{status:8s} {buffer:2d}/30 {frames:4d}")
            print(f"{elapsed:6.1f}s {current_memory:6d}MB " +
                  " ".join(f"{s:<12}" for s in stats))

            # Capture snapshots
            if args.capture_snapshots and snapshot_interval > 0:
                if elapsed - last_snapshot >= snapshot_interval:
                    print("\n  → Capturing snapshots from all streams...")
                    for i, decoder in enumerate(decoders):
                        jpeg_bytes = decoder.get_frame_as_jpeg(quality=85)
                        if jpeg_bytes:
                            filename = output_dir / f"camera_{i+1}_t{int(elapsed)}s.jpg"
                            with open(filename, 'wb') as f:
                                f.write(jpeg_bytes)
                            print(f"    Saved {filename.name} ({len(jpeg_bytes)/1024:.1f} KB)")
                    print()
                    last_snapshot = elapsed

            time.sleep(1)

        print("=" * 80)

        # Final memory analysis
        final_memory = get_gpu_memory_usage(args.gpu_id)
        total_overhead = final_memory - baseline_memory

        print("\n" + "=" * 80)
        print("Memory Usage Analysis")
        print("=" * 80)
        print(f"Baseline VRAM:                {baseline_memory:6d} MB")
        print(f"After Factory Init:           {factory_memory:6d} MB (+{factory_overhead:4d} MB)")
        print(f"After Creating {len(decoders)} Decoders:     {decoders_memory:6d} MB (+{decoders_overhead:4d} MB)")
        print(f"After Starting Decoders:      {started_memory:6d} MB (+{started_overhead:4d} MB)")
        print(f"After Connection:             {connected_memory:6d} MB (+{connected_overhead:4d} MB)")
        print(f"Final (after {args.duration}s):          {final_memory:6d} MB (+{total_overhead:4d} MB total)")
        print("-" * 80)
        print(f"Average VRAM per stream: {total_overhead / len(decoders):6.1f} MB")
        print(f"Context sharing efficiency: {'EXCELLENT' if total_overhead < 500 else 'GOOD' if total_overhead < 800 else 'POOR'}")
        print("=" * 80)

        # Final stats
        print("\nFinal Stream Statistics:")
        print("-" * 80)
        for i, decoder in enumerate(decoders):
            status = decoder.get_status().value
            buffer = decoder.get_buffer_size()
            frames = decoder.frame_count
            fps = frames / args.duration if args.duration > 0 else 0
            print(f"Stream {i+1}: {status:12s} | Buffer: {buffer:2d}/{decoder.buffer_size} | "
                  f"Frames: {frames:5d} | Avg FPS: {fps:5.2f}")
        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\n✗ Interrupted by user")
        exit_code = 1
    except Exception as e:
        print(f"\n\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        exit_code = 1
    finally:
        # Cleanup runs on success, failure, and interrupt alike — but must
        # not decide the exit status. The previous version called
        # sys.exit(0) here, which swallowed the SystemExit(1) raised in the
        # except handlers and reported success on every error path.
        if decoders:
            print("\nCleaning up...")
            for i, decoder in enumerate(decoders):
                decoder.stop()
                print(f"  ✓ Decoder {i+1} stopped")
            cleanup_memory = get_gpu_memory_usage(args.gpu_id)
            print(f"\nVRAM after cleanup: {cleanup_memory} MB")

    if exit_code == 0:
        print("\n✓ Multi-stream test completed successfully")
    sys.exit(exit_code)
# Script entry point: run the multi-stream test when executed directly.
if __name__ == '__main__':
    main()