fix: gpu memory leaks

This commit is contained in:
Siwat Sirichai 2025-11-10 22:10:46 +07:00
parent 3a47920186
commit 593611cdb7
13 changed files with 420 additions and 166 deletions

View file

@ -12,26 +12,31 @@ from .jpeg_encoder import encode_frame_to_jpeg
class FrameReference:
"""
CPU-side reference object for a GPU frame.
Reference-counted frame wrapper for zero-copy memory management.
This object holds a cloned RGB tensor that is independent of PyNvVideoCodec's
DecodedFrame lifecycle. We don't keep the DecodedFrame to avoid conflicts
with PyNvVideoCodec's internal frame pool management.
This allows multiple parts of the pipeline to hold references to the same
cloned frame, and tracks when all references are released so the decoder
knows when buffer slots can be reused.
"""
def __init__(self, rgb_tensor: torch.Tensor, buffer_index: int, decoder):
self.rgb_tensor = rgb_tensor # Cloned RGB tensor (independent copy)
self.rgb_tensor = rgb_tensor # Cloned RGB tensor (one clone per frame)
self.buffer_index = buffer_index
self.decoder = decoder # Reference to decoder for marking as free
self.decoder = decoder
self._freed = False
def free(self):
    """Mark this reference as freed; meant to be called by the last user.

    The first call sets the freed flag, drops this object's hold on
    ``rgb_tensor``, and reports ``buffer_index`` to the decoder through
    ``decoder._mark_frame_free``. Every later call is a no-op, so calling
    ``free()`` more than once is safe.
    """
    if self._freed:
        return
    self._freed = True
    # Rebinding to None drops this object's reference to the tensor so its
    # memory can be reclaimed once no other references remain. A separate
    # `del` before the rebind is redundant.
    self.rgb_tensor = None
    self.decoder._mark_frame_free(self.buffer_index)
def is_freed(self) -> bool:
    """Return True once free() has marked this reference as freed."""
    return self._freed
def __del__(self):
@ -212,13 +217,10 @@ class StreamDecoder:
self.status = ConnectionStatus.DISCONNECTED
self._status_lock = threading.Lock()
# Frame buffer (ring buffer) - stores FrameReference objects
# Frame buffer (ring buffer) - stores cloned RGB tensors
self.frame_buffer = deque(maxlen=buffer_size)
self._buffer_lock = threading.RLock()
# Track which buffer slots are in use (list of FrameReference objects)
self._in_use_frames = [] # List of FrameReference objects currently held by callbacks
# Decoder and container instances
self.decoder = None
self.container = None
@ -236,6 +238,10 @@ class StreamDecoder:
self._frame_callbacks = []
self._callback_lock = threading.Lock()
# Track frames currently in use (referenced by callbacks/pipeline)
self._in_use_frames = [] # List of FrameReference objects
self._frame_index_counter = 0 # Monotonically increasing frame index
def register_frame_callback(self, callback: Callable):
"""
Register a callback to be called when a new frame is decoded.
@ -396,19 +402,7 @@ class StreamDecoder:
# Add frames to ring buffer and fire callbacks
with self._buffer_lock:
for frame in decoded_frames:
# Check for buffer overflow - discard oldest if needed
if len(self.frame_buffer) >= self.buffer_size:
# Check if oldest frame is still in use
if len(self._in_use_frames) > 0:
oldest_ref = self.frame_buffer[0] if len(self.frame_buffer) > 0 else None
if oldest_ref and not oldest_ref.is_freed():
# Force free the oldest frame to prevent overflow
print(f"[WARNING] Buffer overflow, force-freeing oldest frame (buffer_index={oldest_ref.buffer_index})")
oldest_ref.free()
# Deque will automatically remove oldest when at maxlen
# Convert to tensor
# Convert to tensor immediately after NVDEC
try:
# Convert DecodedFrame to PyTorch tensor using DLPack (zero-copy)
nv12_tensor = torch.from_dlpack(frame)
@ -417,32 +411,32 @@ class StreamDecoder:
if self.frame_height is not None and self.frame_width is not None:
rgb_tensor = nv12_to_rgb_gpu(nv12_tensor, self.frame_height, self.frame_width)
# CRITICAL: Clone the RGB tensor to break CUDA memory dependency
# The nv12_to_rgb_gpu creates a new tensor, but it still references
# the same CUDA context/stream. We need an independent copy.
rgb_tensor_cloned = rgb_tensor.clone()
# CLONE ONCE into our post-decode buffer
# This breaks the dependency on PyNvVideoCodec's DecodedFrame
# After this, the tensor is fully ours and can be used throughout the pipeline
rgb_cloned = rgb_tensor.clone()
# Create FrameReference object for C++-style memory management
# We don't keep the DecodedFrame to avoid conflicts with PyNvVideoCodec's
# internal frame pool - the clone is fully independent
buffer_index = self.frame_count
# Create FrameReference for reference counting
frame_ref = FrameReference(
rgb_tensor=rgb_tensor_cloned, # Independent cloned tensor
buffer_index=buffer_index,
rgb_tensor=rgb_cloned,
buffer_index=self._frame_index_counter,
decoder=self
)
self._frame_index_counter += 1
# Add to buffer and in-use tracking
# Add FrameReference to ring buffer (deque automatically removes oldest when full)
self.frame_buffer.append(frame_ref)
self._in_use_frames.append(frame_ref)
self.frame_count += 1
# Fire callbacks with the cloned RGB tensor from FrameReference
# The tensor is now independent of the DecodedFrame lifecycle
# Track this frame as in-use
self._in_use_frames.append(frame_ref)
# Fire callbacks with the FrameReference
# The callback receivers should call .free() when done
with self._callback_lock:
for callback in self._frame_callbacks:
try:
callback(frame_ref.rgb_tensor)
callback(frame_ref)
except Exception as e:
print(f"Error in frame callback: {e}")
except Exception as e: