feat: add inference subsystem and decoder optimizations

This commit is contained in:
Siwat Sirichai 2025-11-09 00:57:08 +07:00
commit 3c83a57e44
19 changed files with 3897 additions and 0 deletions

255
test_multi_stream.py Executable file
View file

@@ -0,0 +1,255 @@
#!/usr/bin/env python3
"""
Multi-stream test script to verify CUDA context sharing efficiency.
Tests multiple RTSP streams simultaneously and monitors VRAM usage.
"""
import argparse
import time
import sys
import subprocess
import os
from pathlib import Path
from dotenv import load_dotenv
from services import StreamDecoderFactory, ConnectionStatus
# Load environment variables from .env file
load_dotenv()
def get_gpu_memory_usage(gpu_id: int = 0) -> int:
    """Return the current VRAM usage of GPU ``gpu_id`` in MB via ``nvidia-smi``.

    Best-effort: returns 0 (after printing a warning) when ``nvidia-smi`` is
    missing, exits non-zero, or produces unparsable output, so callers never
    crash on a machine without an NVIDIA driver.
    """
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=memory.used',
             '--format=csv,noheader,nounits', f'--id={gpu_id}'],
            capture_output=True,
            text=True,
            check=True,
        )
        # Parse only the first line: some nvidia-smi builds append extra
        # output, which would make int() on the whole stdout blow up.
        return int(result.stdout.strip().splitlines()[0])
    except Exception as e:
        # Deliberate broad catch: this probe must never take the test down.
        print(f"Warning: Could not get GPU memory usage: {e}")
        return 0
def main():
    """Drive the multi-stream CUDA context-sharing test.

    Reads ``CAMERA_URL_1``, ``CAMERA_URL_2``, ... from the environment,
    creates one decoder per camera on a shared CUDA context, measures VRAM
    at each lifecycle stage (factory init, decoder creation, start,
    connection), monitors the streams for the requested duration (optionally
    saving JPEG snapshots), then prints a memory-efficiency summary and
    cleans up.

    Exits with status 0 on success, 1 on error or keyboard interrupt.
    """
    parser = argparse.ArgumentParser(description='Test multi-stream decoding with context sharing')
    parser.add_argument(
        '--gpu-id',
        type=int,
        default=0,
        help='GPU device ID'
    )
    parser.add_argument(
        '--duration',
        type=int,
        default=20,
        help='Test duration in seconds'
    )
    parser.add_argument(
        '--capture-snapshots',
        action='store_true',
        help='Capture JPEG snapshots during test'
    )
    parser.add_argument(
        '--output-dir',
        type=str,
        default='./multi_stream_snapshots',
        help='Output directory for snapshots'
    )
    args = parser.parse_args()

    # Collect CAMERA_URL_1, CAMERA_URL_2, ... stopping at the first gap.
    camera_urls = []
    i = 1
    while True:
        url = os.getenv(f'CAMERA_URL_{i}')
        if not url:
            break
        camera_urls.append(url)
        i += 1

    if not camera_urls:
        print("Error: No camera URLs found in .env file")
        print("Please add CAMERA_URL_1, CAMERA_URL_2, etc. to your .env file")
        sys.exit(1)

    # Create output directory if capturing snapshots
    if args.capture_snapshots:
        output_dir = Path(args.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    print("=" * 80)
    print("Multi-Stream RTSP Decoder Test - Context Sharing Verification")
    print("=" * 80)
    print(f"Number of Streams: {len(camera_urls)}")
    print(f"GPU ID: {args.gpu_id}")
    print(f"Test Duration: {args.duration} seconds")
    print(f"Capture Snapshots: {args.capture_snapshots}")
    print("=" * 80)
    print()

    decoders = []   # pre-bound so the finally block can always reference it
    exit_code = 0
    try:
        # Get baseline GPU memory
        print("[Baseline] Measuring initial GPU memory usage...")
        baseline_memory = get_gpu_memory_usage(args.gpu_id)
        print(f"✓ Baseline VRAM: {baseline_memory} MB\n")

        # Initialize factory (shared CUDA context)
        print("[1/4] Initializing StreamDecoderFactory with shared CUDA context...")
        factory = StreamDecoderFactory(gpu_id=args.gpu_id)
        factory_memory = get_gpu_memory_usage(args.gpu_id)
        factory_overhead = factory_memory - baseline_memory
        print(f"✓ Factory initialized")
        print(f"  VRAM after factory: {factory_memory} MB (+{factory_overhead} MB)\n")

        # Create all decoders
        print(f"[2/4] Creating {len(camera_urls)} StreamDecoder instances...")
        for i, url in enumerate(camera_urls):
            decoder = factory.create_decoder(
                rtsp_url=url,
                buffer_size=30,
                codec='h264'
            )
            decoders.append(decoder)
            # Show only the host part when credentials are embedded in the
            # URL; fall back to the full URL so a credential-less URL
            # cannot raise IndexError here.
            host = url.split('@')[1].split('/')[0] if '@' in url else url
            print(f"  ✓ Decoder {i+1} created for camera {host}")

        decoders_memory = get_gpu_memory_usage(args.gpu_id)
        decoders_overhead = decoders_memory - factory_memory
        print(f"\n  VRAM after creating {len(decoders)} decoders: {decoders_memory} MB (+{decoders_overhead} MB)")
        print(f"  Average per decoder: {decoders_overhead / len(decoders):.1f} MB\n")

        # Start all decoders
        print(f"[3/4] Starting all {len(decoders)} decoders...")
        for i, decoder in enumerate(decoders):
            decoder.start()
            print(f"  ✓ Decoder {i+1} started")

        started_memory = get_gpu_memory_usage(args.gpu_id)
        started_overhead = started_memory - decoders_memory
        print(f"\n  VRAM after starting decoders: {started_memory} MB (+{started_overhead} MB)")
        print(f"  Average per running decoder: {started_overhead / len(decoders):.1f} MB\n")

        # Wait for all streams to connect (for/else: the else branch runs
        # only when the loop exhausts without a full-connection break).
        print("[4/4] Waiting for all streams to connect...")
        max_wait = 15
        for wait_time in range(max_wait):
            connected = sum(1 for d in decoders if d.is_connected())
            print(f"  Connected: {connected}/{len(decoders)} streams", end='\r')
            if connected == len(decoders):
                print(f"\n✓ All {len(decoders)} streams connected!\n")
                break
            time.sleep(1)
        else:
            connected = sum(1 for d in decoders if d.is_connected())
            print(f"\n⚠ Only {connected}/{len(decoders)} streams connected after {max_wait}s\n")

        connected_memory = get_gpu_memory_usage(args.gpu_id)
        connected_overhead = connected_memory - started_memory
        print(f"  VRAM after connection: {connected_memory} MB (+{connected_overhead} MB)\n")

        # Monitor streams. Header and rows are sized to the actual stream
        # count: the previous version hard-coded four columns and raised
        # IndexError for any other number of cameras.
        print(f"Monitoring streams for {args.duration} seconds...")
        print("=" * 80)
        header = f"{'Time':<8} {'VRAM':<10} " + " ".join(
            f"{f'Stream {i+1}':<12}" for i in range(len(decoders)))
        print(header)
        print("-" * 80)

        start_time = time.time()
        snapshot_interval = args.duration // 3 if args.capture_snapshots else 0
        last_snapshot = 0

        while time.time() - start_time < args.duration:
            elapsed = time.time() - start_time
            current_memory = get_gpu_memory_usage(args.gpu_id)

            # Get stats for each decoder
            stats = []
            for decoder in decoders:
                status = decoder.get_status().value[:8]
                buffer = decoder.get_buffer_size()
                frames = decoder.frame_count
                stats.append(f"{status:8s} {buffer:2d}/30 {frames:4d}")
            print(f"{elapsed:6.1f}s {current_memory:6d}MB " +
                  " ".join(f"{s:<12}" for s in stats))

            # Capture snapshots
            if args.capture_snapshots and snapshot_interval > 0:
                if elapsed - last_snapshot >= snapshot_interval:
                    print("\n  → Capturing snapshots from all streams...")
                    for i, decoder in enumerate(decoders):
                        jpeg_bytes = decoder.get_frame_as_jpeg(quality=85)
                        if jpeg_bytes:
                            filename = output_dir / f"camera_{i+1}_t{int(elapsed)}s.jpg"
                            with open(filename, 'wb') as f:
                                f.write(jpeg_bytes)
                            print(f"    Saved {filename.name} ({len(jpeg_bytes)/1024:.1f} KB)")
                    print()
                    last_snapshot = elapsed

            time.sleep(1)

        print("=" * 80)

        # Final memory analysis
        final_memory = get_gpu_memory_usage(args.gpu_id)
        total_overhead = final_memory - baseline_memory

        print("\n" + "=" * 80)
        print("Memory Usage Analysis")
        print("=" * 80)
        print(f"Baseline VRAM:                {baseline_memory:6d} MB")
        print(f"After Factory Init:           {factory_memory:6d} MB (+{factory_overhead:4d} MB)")
        print(f"After Creating {len(decoders)} Decoders:     {decoders_memory:6d} MB (+{decoders_overhead:4d} MB)")
        print(f"After Starting Decoders:      {started_memory:6d} MB (+{started_overhead:4d} MB)")
        print(f"After Connection:             {connected_memory:6d} MB (+{connected_overhead:4d} MB)")
        print(f"Final (after {args.duration}s):          {final_memory:6d} MB (+{total_overhead:4d} MB total)")
        print("-" * 80)
        print(f"Average VRAM per stream: {total_overhead / len(decoders):6.1f} MB")
        print(f"Context sharing efficiency: {'EXCELLENT' if total_overhead < 500 else 'GOOD' if total_overhead < 800 else 'POOR'}")
        print("=" * 80)

        # Final stats
        print("\nFinal Stream Statistics:")
        print("-" * 80)
        for i, decoder in enumerate(decoders):
            status = decoder.get_status().value
            buffer = decoder.get_buffer_size()
            frames = decoder.frame_count
            fps = frames / args.duration if args.duration > 0 else 0
            print(f"Stream {i+1}: {status:12s} | Buffer: {buffer:2d}/{decoder.buffer_size} | "
                  f"Frames: {frames:5d} | Avg FPS: {fps:5.2f}")
        print("=" * 80)

    except KeyboardInterrupt:
        print("\n\n✗ Interrupted by user")
        exit_code = 1
    except Exception as e:
        print(f"\n\n✗ Error: {e}")
        import traceback
        traceback.print_exc()
        exit_code = 1
    finally:
        # Cleanup runs on success, failure, and interrupt alike — but must
        # not decide the exit status. The previous version called
        # sys.exit(0) here, which swallowed the SystemExit(1) raised in the
        # except handlers and reported success on every error path.
        if decoders:
            print("\nCleaning up...")
            for i, decoder in enumerate(decoders):
                decoder.stop()
                print(f"  ✓ Decoder {i+1} stopped")
            cleanup_memory = get_gpu_memory_usage(args.gpu_id)
            print(f"\nVRAM after cleanup: {cleanup_memory} MB")

    if exit_code == 0:
        print("\n✓ Multi-stream test completed successfully")
    sys.exit(exit_code)
# Script entry point: run the multi-stream test when executed directly.
if __name__ == '__main__':
    main()