fix: gpu memory leaks

2025-11-10 22:10:46 +07:00 · 2025-11-10 22:10:46 +07:00 · 593611cdb7
commit 593611cdb7
parent 3a47920186
13 changed files with 420 additions and 166 deletions
--- a/services/pt_converter.py
+++ b/services/pt_converter.py
@@ -125,12 +125,19 @@ class PTConverter:

        mapping = self.mapping_db[pt_hash]
        trt_hash = mapping["trt_hash"]
+
+        # Check both .engine and .trt extensions (Ultralytics uses .engine, generic uses .trt)
+        engine_key = f"trt/{trt_hash}.engine"
        trt_key = f"trt/{trt_hash}.trt"

-        # Verify TRT file still exists in storage
-        if not self.storage.exists(trt_key):
+        # Try .engine first (Ultralytics native format)
+        if self.storage.exists(engine_key):
+            cached_key = engine_key
+        elif self.storage.exists(trt_key):
+            cached_key = trt_key
+        else:
            logger.warning(
-                f"Mapping exists for PT hash {pt_hash[:16]}... but TRT file missing. "
+                f"Mapping exists for PT hash {pt_hash[:16]}... but engine file missing. "
                f"Will reconvert."
            )
            # Remove stale mapping
@@ -139,16 +146,16 @@ class PTConverter:
            return None

        # Get local path
-        trt_path = self.storage.get_local_path(trt_key)
-        if trt_path is None:
-            logger.error(f"Could not get local path for TRT file {trt_key}")
+        cached_path = self.storage.get_local_path(cached_key)
+        if cached_path is None:
+            logger.error(f"Could not get local path for engine file {cached_key}")
            return None

        logger.info(
            f"Found cached conversion for PT hash {pt_hash[:16]}... -> "
-            f"TRT hash {trt_hash[:16]}..."
+            f"Engine hash {trt_hash[:16]}... ({cached_key})"
        )
-        return (trt_hash, trt_path)
+        return (trt_hash, cached_path)

    def convert(
        self,
@@ -241,24 +248,21 @@ class PTConverter:
        precision: torch.dtype,
    ) -> Tuple[str, str]:
        """
-        Convert ultralytics YOLO model using ONNX → TensorRT pipeline.
-        Uses the same approach as scripts/convert_pt_to_tensorrt.py
+        Convert ultralytics YOLO model using native .engine export.
+        This produces .engine files with embedded metadata (no manual input_shapes needed).

        Args:
            pt_path: Path to PT file
            pt_hash: PT file hash
-            input_shapes: Input tensor shapes
+            input_shapes: Input tensor shapes (IGNORED for Ultralytics - auto-detected)
            precision: Target precision

        Returns:
-            Tuple of (trt_hash, trt_file_path)
+            Tuple of (engine_hash, engine_file_path)
        """
-        import tensorrt as trt
-        import tempfile
        import os
-        import shutil

-        logger.info("Detected ultralytics YOLO model, using ONNX → TensorRT pipeline...")
+        logger.info("Detected ultralytics YOLO model, using native .engine export...")

        # Load ultralytics model
        try:
@@ -267,83 +271,48 @@ class PTConverter:
        except ImportError:
            raise ImportError("ultralytics package not found. Install with: pip install ultralytics")

-        # Determine input shape
-        if not input_shapes:
-            raise ValueError("input_shapes required for ultralytics conversion")
+        # Export to native .engine format with embedded metadata
+        logger.info(f"Exporting to native TensorRT .engine (precision: {'FP16' if precision == torch.float16 else 'FP32'})...")

-        input_key = 'images' if 'images' in input_shapes else list(input_shapes.keys())[0]
-        input_shape = input_shapes[input_key]
+        # Ultralytics export creates .engine file in same directory as .pt
+        engine_path = model.export(
+            format='engine',
+            half=(precision == torch.float16),
+            device=self.gpu_id,
+            batch=1,
+            simplify=True
+        )

-        # Export to ONNX first
-        logger.info(f"Exporting to ONNX (input shape: {input_shape})...")
-        with tempfile.NamedTemporaryFile(suffix='.onnx', delete=False) as tmp_onnx:
-            onnx_path = tmp_onnx.name
+        # Convert to string (Ultralytics returns Path object)
+        engine_path = str(engine_path)
+        logger.info(f"Native .engine export complete: {engine_path}")
+        logger.info("Metadata embedded in .engine file (stride, imgsz, names, etc.)")

-        try:
-            # Use ultralytics export to ONNX
-            model.export(format='onnx', imgsz=input_shape[2], batch=input_shape[0])
-            # Ultralytics saves as model_name.onnx in same directory
-            pt_dir = os.path.dirname(pt_path)
-            pt_name = os.path.splitext(os.path.basename(pt_path))[0]
-            onnx_export_path = os.path.join(pt_dir, f"{pt_name}.onnx")
+        # Read the exported .engine file
+        with open(engine_path, 'rb') as f:
+            engine_data = f.read()

-            # Move to our temp location (use shutil.move for cross-device support)
-            if os.path.exists(onnx_export_path):
-                shutil.move(onnx_export_path, onnx_path)
-            else:
-                raise RuntimeError(f"ONNX export failed, file not found: {onnx_export_path}")
+        # Compute hash of the .engine file
+        engine_hash = hashlib.sha256(engine_data).hexdigest()

-            logger.info(f"ONNX export complete: {onnx_path}")
+        # Store in our cache (as .engine to preserve metadata)
+        engine_key = f"trt/{engine_hash}.engine"
+        self.storage.write(engine_key, engine_data)

-            # Build TensorRT engine from ONNX
-            logger.info("Building TensorRT engine from ONNX...")
-            trt_logger = trt.Logger(trt.Logger.WARNING)
-            builder = trt.Builder(trt_logger)
-            network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
-            parser = trt.OnnxParser(network, trt_logger)
+        cached_path = self.storage.get_local_path(engine_key)
+        if cached_path is None:
+            raise RuntimeError("Failed to get local path for .engine file")

-            # Parse ONNX
-            with open(onnx_path, 'rb') as f:
-                if not parser.parse(f.read()):
-                    errors = [parser.get_error(i) for i in range(parser.num_errors)]
-                    raise RuntimeError(f"Failed to parse ONNX: {errors}")
+        # Clean up the original export (we've cached it)
+        # Only delete if it's different from cached path
+        if os.path.exists(engine_path) and os.path.abspath(engine_path) != os.path.abspath(cached_path):
+            logger.info(f"Removing original export (cached): {engine_path}")
+            os.unlink(engine_path)
+        else:
+            logger.info(f"Keeping original export at: {engine_path}")

-            # Configure builder
-            config = builder.create_builder_config()
-            config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30)  # 4GB
-
-            # Set precision
-            if precision == torch.float16:
-                if builder.platform_has_fast_fp16:
-                    config.set_flag(trt.BuilderFlag.FP16)
-                    logger.info("FP16 mode enabled")
-
-            # Build engine
-            logger.info("Building TensorRT engine (this may take a few minutes)...")
-            serialized_engine = builder.build_serialized_network(network, config)
-
-            if serialized_engine is None:
-                raise RuntimeError("Failed to build TensorRT engine")
-
-            # Convert IHostMemory to bytes
-            engine_bytes = bytes(serialized_engine)
-
-            # Save to storage
-            trt_hash = hashlib.sha256(engine_bytes).hexdigest()
-            trt_key = f"trt/{trt_hash}.trt"
-            self.storage.write(trt_key, engine_bytes)
-
-            trt_path = self.storage.get_local_path(trt_key)
-            if trt_path is None:
-                raise RuntimeError("Failed to get local path for TRT file")
-
-            logger.info(f"TensorRT engine built successfully: {trt_path}")
-            return (trt_hash, trt_path)
-
-        finally:
-            # Cleanup ONNX file
-            if os.path.exists(onnx_path):
-                os.unlink(onnx_path)
+        logger.info(f"Cached .engine file: {cached_path}")
+        return (engine_hash, cached_path)

    def _perform_conversion(
        self,
@@ -387,11 +356,21 @@ class PTConverter:

            # Check if this is an ultralytics model
            if self._is_ultralytics_model(model):
-                logger.info("Detected ultralytics model, using ultralytics export API")
+                logger.info("Detected Ultralytics YOLO model, using native .engine export")
+                logger.info("Note: input_shapes parameter is ignored for Ultralytics models (auto-detected)")
                return self._convert_ultralytics_model(pt_path, pt_hash, input_shapes, precision)

            # For non-ultralytics models, use torch_tensorrt
-            logger.info("Using torch_tensorrt for conversion")
+            logger.info("Using torch_tensorrt for conversion (non-Ultralytics model)")
+
+            # Non-Ultralytics models REQUIRE input_shapes
+            if input_shapes is None:
+                raise ValueError(
+                    "input_shapes required for non-Ultralytics PyTorch models. "
+                    "For Ultralytics YOLO models, input_shapes is auto-detected. "
+                    "Example: input_shapes={'images': (1, 3, 640, 640)}"
+                )
+
            model.eval()

            # Convert model to target precision to avoid mixed precision issues