From f7a0475e767b122fb076b95d56c9a71a16273f00 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Sun, 1 Feb 2026 22:30:31 +0000 Subject: [PATCH 1/8] Integrate Automated QDQ placement tool - part 3.1 Signed-off-by: Will Guo --- .../onnx/quantization/autotune/__init__.py | 63 ++ .../onnx/quantization/autotune/benchmark.py | 684 ++++++++++++++++++ 2 files changed, 747 insertions(+) create mode 100644 modelopt/onnx/quantization/autotune/__init__.py create mode 100644 modelopt/onnx/quantization/autotune/benchmark.py diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py new file mode 100644 index 000000000..c1b2ef5f5 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pattern-Based Q/DQ Autotuning for ONNX Models. + +This package provides automated optimization of Quantize/Dequantize (Q/DQ) node placement +in ONNX computation graphs to minimize TensorRT inference latency. It uses pattern-based +region analysis to efficiently explore and optimize Q/DQ insertion strategies. +""" + +# Core data structures +from .benchmark import TensorRTPyBenchmark, TrtExecBenchmark +from .common import ( + AutotunerError, + AutotunerNotInitializedError, + Config, + InsertionScheme, + InvalidSchemeError, + PatternCache, + PatternSchemes, + Region, + RegionType, +) +from .insertion_points import ( + ChildRegionInputInsertionPoint, + ChildRegionOutputInsertionPoint, + NodeInputInsertionPoint, + ResolvedInsertionPoint, +) +from .region_pattern import RegionPattern +from .region_search import CombinedRegionSearch + +__all__ = [ + "AutotunerError", + "AutotunerNotInitializedError", + "ChildRegionInputInsertionPoint", + "ChildRegionOutputInsertionPoint", + "CombinedRegionSearch", + "Config", + "InsertionScheme", + "InvalidSchemeError", + "NodeInputInsertionPoint", + "PatternCache", + "PatternSchemes", + "Region", + "RegionPattern", + "RegionType", + "ResolvedInsertionPoint", + "TensorRTPyBenchmark", + "TrtExecBenchmark", +] diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py new file mode 100644 index 000000000..f1bd33cb9 --- /dev/null +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -0,0 +1,684 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""TensorRT Utilities and Benchmark Module. + +This module provides comprehensive TensorRT utilities including: +- Benchmark framework for measuring TensorRT engine performance +- Graph utilities for tensor analysis + +**Benchmark Classes:** +- Benchmark: Abstract base class defining the benchmarking interface +- TrtExecBenchmark: Uses trtexec command-line tool for benchmarking +- TensorRTPyBenchmark: Uses TensorRT Python API for direct engine profiling +""" + +import ctypes +import os +import re +import shutil +import subprocess # nosec B404 +import tempfile +import time +from abc import ABC, abstractmethod +from pathlib import Path + +import numpy as np + +# Optional dependencies - gracefully handle missing packages +try: + import tensorrt as trt + + TRT_AVAILABLE = True +except ImportError: + TRT_AVAILABLE = False + +try: + import pycuda.autoinit # noqa: F401 # Automatically initializes CUDA (side-effect import) + import pycuda.driver as cuda + + PYCUDA_AVAILABLE = True +except ImportError: + PYCUDA_AVAILABLE = False + +from modelopt.onnx.logging_config import logger + + +class Benchmark(ABC): + """Abstract base class for TensorRT model benchmarking. + + This class defines the interface that all benchmark implementations must follow. + It provides a consistent API for measuring inference latency of ONNX models + when converted to TensorRT engines. + + Attributes: + timing_cache_file: Path to the TensorRT timing cache file. + warmup_runs: Number of warmup iterations before timing. + timing_runs: Number of iterations for latency measurement. + plugin_libraries: List of paths to plugin libraries. + logger: Logger instance for this benchmark. + + Subclasses must implement: + run(): Execute the benchmark and return latency in milliseconds. + """ + + def __init__( + self, + timing_cache_file: str | None = None, + warmup_runs: int = 5, + timing_runs: int = 10, + plugin_libraries: list[str] | None = None, + ): + """Initialize the benchmark. + + Args: + timing_cache_file: Path to timing cache file to accelerate engine builds. + If None, uses '/tmp/trtexec_timing.cache' as default. + warmup_runs: Number of warmup iterations before timing measurements. + timing_runs: Number of iterations for latency measurement. Results + are averaged across these runs. + plugin_libraries: List of paths to TensorRT plugin shared libraries (.so files). + These plugins will be loaded during engine building. + If None, no custom plugins are loaded. + """ + global logger + self.timing_cache_file = timing_cache_file or "/tmp/trtexec_timing.cache" # nosec B108 + self.warmup_runs = warmup_runs + self.timing_runs = timing_runs + self.plugin_libraries = plugin_libraries or [] + self.logger = logger + + @abstractmethod + def run(self, path_or_bytes: str | bytes, log_file: str | None = None) -> float: + """Run benchmark on the given ONNX model. + + Args: + path_or_bytes: Path to the ONNX model (str) or raw model data (bytes) + log_file: Optional path to save benchmark logs + + Returns: + Measured latency in milliseconds, or float("inf") on failure + """ + raise NotImplementedError("Subclasses must implement this method") + + def __call__(self, path_or_bytes: str | bytes, log_file: str | None = None) -> float: + """Convenience method to call benchmark as a function. + + Args: + path_or_bytes: Path to the ONNX model (str) or raw model data (bytes) + log_file: Optional path to save benchmark logs + + Returns: + Measured latency in milliseconds + """ + return self.run(path_or_bytes, log_file) + + +class TrtExecBenchmark(Benchmark): + """TensorRT benchmark using trtexec command-line tool. + + This implementation uses the trtexec binary to build engines and measure + inference latency. It is the most straightforward method and closely + mirrors standard TensorRT workflows. + """ + + def __init__( + self, + timing_cache_file: str | None = None, + warmup_runs: int = 5, + timing_runs: int = 10, + plugin_libraries: list[str] | None = None, + trtexec_path: str = "trtexec", + trtexec_args: list | None = None, + ): + """Initialize the trtexec benchmark. + + Args: + timing_cache_file: Path to TensorRT timing cache file for faster + subsequent builds. Defaults to '/tmp/trtexec_timing.cache'. + warmup_runs: Number of warmup iterations before timing measurements. + timing_runs: Number of iterations for latency measurement. Results + are averaged across these runs. + plugin_libraries: List of paths to TensorRT plugin shared libraries (.so files). + These plugins will be loaded by trtexec during engine building. + If None, no custom plugins are loaded. + trtexec_path: Path to trtexec binary. Defaults to 'trtexec' which + looks for the binary in PATH. + trtexec_args: Additional command-line arguments to pass to trtexec. + These are appended after the standard arguments. + Example: ['--fp16', '--workspace=4096', '--verbose'] + """ + super().__init__(timing_cache_file, warmup_runs, timing_runs, plugin_libraries) + self.trtexec_path = trtexec_path + self.trtexec_args = trtexec_args if trtexec_args is not None else [] + self._temp_dir = tempfile.mkdtemp(prefix="trtexec_benchmark_") + self.engine_dir = self._temp_dir + self.engine_path = os.path.join(self.engine_dir, "engine.trt") + self.temp_model_path = os.path.join(self.engine_dir, "temp_model.onnx") + self.logger.debug(f"Created temporary engine directory: {self.engine_dir}") + self.logger.debug(f"Temporary model path: {self.temp_model_path}") + self.latency_pattern = r"\[I\]\s+Latency:.*?median\s*=\s*([\d.]+)\s*ms" + + self._base_cmd = [ + self.trtexec_path, + f"--avgRuns={self.timing_runs}", + f"--iterations={self.timing_runs}", + f"--warmUp={self.warmup_runs}", + "--stronglyTyped", + f"--saveEngine={self.engine_path}", + f"--timingCacheFile={self.timing_cache_file}", + ] + + for plugin_lib in self.plugin_libraries: + plugin_path = Path(plugin_lib).resolve() + if not plugin_path.exists(): + self.logger.warning(f"Plugin library not found: {plugin_path}") + continue + self._base_cmd.append(f"--staticPlugins={plugin_path}") + self.logger.debug(f"Added plugin library: {plugin_path}") + + if self.trtexec_args: + self._base_cmd.extend(self.trtexec_args) + + self.logger.debug(f"Base command template: {' '.join(self._base_cmd)}") + + def __del__(self): + """Cleanup temporary directory.""" + if hasattr(self, "_temp_dir"): + try: + shutil.rmtree(self._temp_dir, ignore_errors=True) + self.logger.debug(f"Cleaned up temporary directory: {self._temp_dir}") + except Exception as e: + self.logger.warning(f"Failed to cleanup temporary directory: {e}") + + def run( + self, + path_or_bytes: str | bytes, + log_file: str | None = None, + ) -> float: + """Run benchmark using trtexec. + + Args: + path_or_bytes: Path to the ONNX model (str) or raw model data (bytes) + log_file: Optional path to save trtexec logs + + Returns: + Measured median latency in milliseconds + """ + if not os.path.exists(self.timing_cache_file): + self.logger.debug(f"Will create timing cache: {self.timing_cache_file}") + + try: + model_path = path_or_bytes + if isinstance(model_path, bytes): + with open(self.temp_model_path, "wb") as f: + f.write(model_path) + model_path = self.temp_model_path + self.logger.debug(f"Wrote model bytes to temporary file: {model_path}") + + cmd = [*self._base_cmd, f"--onnx={model_path}"] + self.logger.debug(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, capture_output=True, text=True) # nosec B603 + if log_file is not None: + try: + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + with open(log_path, "w") as f: + output = "" + output += f"Command: {' '.join(cmd)}\n" + output += f"Return code: {result.returncode}\n" + output += "=" * 80 + "\n" + output += "STDOUT:\n" + output += "=" * 80 + "\n" + output += result.stdout + output += "\n" + "=" * 80 + "\n" + output += "STDERR:\n" + output += "=" * 80 + "\n" + output += result.stderr + f.write(output) + self.logger.debug(f"Saved trtexec logs to: {log_file}") + except Exception as e: + self.logger.warning(f"Failed to save logs to {log_file}: {e}") + + if result.returncode != 0: + self.logger.error(f"trtexec failed with return code {result.returncode}") + self.logger.error(f"stderr: {result.stderr}") + return float("inf") + + match = re.search(self.latency_pattern, result.stdout, re.IGNORECASE) + if not match: + self.logger.warning("Could not parse median latency from trtexec output") + self.logger.debug(f"trtexec stdout:\n{result.stdout}") + return float("inf") + latency = float(match.group(1)) + self.logger.info(f"TrtExec benchmark (median): {latency:.2f} ms") + return latency + except FileNotFoundError: + self.logger.error(f"trtexec binary not found: {self.trtexec_path}") + self.logger.error("Please ensure TensorRT is installed and trtexec path is correct") + return float("inf") + except Exception as e: + self.logger.error(f"Benchmark failed: {e}") + return float("inf") + + +class TensorRTPyBenchmark(Benchmark): + """TensorRT benchmark using Python API with plugin support. + + This implementation directly uses the TensorRT Python API to build engines + and measure inference latency. It provides more control than trtexec and + can be faster for certain workflows as it avoids subprocess overhead. + """ + + def __init__( + self, + timing_cache_file: str | None = None, + warmup_runs: int = 5, + timing_runs: int = 20, + plugin_libraries: list[str] | None = None, + ): + """Initialize the TensorRT Python API benchmark. + + Creates persistent TensorRT objects (Logger, Builder, Runtime) and + loads the timing cache from disk if available. Optionally loads custom + TensorRT plugin libraries for models with custom operations. + + Args: + timing_cache_file: Path to TensorRT timing cache file. If None, + defaults to '/tmp/trtexec_timing.cache'. + warmup_runs: Number of warmup iterations before timing measurements. + timing_runs: Number of iterations for latency measurement. + plugin_libraries: List of paths to TensorRT plugin shared libraries (.so files). + These plugins will be loaded and registered for use during + engine building. If None, no custom plugins are loaded. + + Raises: + ImportError: If tensorrt or pycuda packages are not available. + FileNotFoundError: If a specified plugin library file does not exist. + RuntimeError: If plugin library loading fails. + """ + super().__init__(timing_cache_file, warmup_runs, timing_runs, plugin_libraries) + + if not TRT_AVAILABLE: + raise ImportError("TensorRT Python API not available. Please install tensorrt package.") + if not PYCUDA_AVAILABLE: + raise ImportError("PyCUDA not available. Please install pycuda package.") + + self.trt_logger = trt.Logger(trt.Logger.WARNING) + self.builder = trt.Builder(self.trt_logger) + self.runtime = trt.Runtime(self.trt_logger) + self._loaded_plugin_handles = [] + if self.plugin_libraries: + self._load_plugin_libraries() + trt.init_libnvinfer_plugins(self.trt_logger, "") + self._plugin_registry = trt.get_plugin_registry() + + self.network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) + self.network_flags |= 1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED) + + # Load timing cache from disk or create new one + self._timing_cache = None + self._load_timing_cache() + + # Storage for user-defined shape configurations + # Format: {input_name: (min_shape, opt_shape, max_shape)} + self._shape_configs = {} + + def _load_plugin_libraries(self): + """Load custom TensorRT plugin libraries from shared object files. + + This method loads plugin libraries using ctypes and initializes them + with the TensorRT plugin registry. Plugins must export the + initLibNvInferPlugins function to register their implementations. + + The loaded library handles are stored to prevent them from being + garbage collected during the benchmark's lifetime. + + Raises: + FileNotFoundError: If a plugin library file does not exist. + RuntimeError: If plugin initialization fails. + """ + for plugin_lib in self.plugin_libraries: + plugin_path = Path(plugin_lib).resolve() + + if not plugin_path.exists(): + raise FileNotFoundError(f"Plugin library not found: {plugin_path}") + + self.logger.info(f"Loading TensorRT plugin: {plugin_path}") + + try: + if hasattr(os, "RTLD_LAZY") and hasattr(os, "RTLD_GLOBAL"): + plugin_handle = ctypes.CDLL( + str(plugin_path), mode=os.RTLD_LAZY | os.RTLD_GLOBAL + ) + else: + # Fallback for platforms without RTLD flags (e.g., Windows) + plugin_handle = ctypes.CDLL(str(plugin_path)) + + # Store handle to prevent garbage collection + self._loaded_plugin_handles.append(plugin_handle) + + # Try to initialize plugin with TensorRT registry + # Most TensorRT plugins export initLibNvInferPlugins function + if hasattr(plugin_handle, "initLibNvInferPlugins"): + init_func = plugin_handle.initLibNvInferPlugins + # Function signature: bool initLibNvInferPlugins(void* logger, const char* namespace) + init_func.argtypes = [ctypes.c_void_p, ctypes.c_char_p] + init_func.restype = ctypes.c_bool + + # Initialize with the TensorRT logger and default namespace + success = init_func(None, b"") + if not success: + self.logger.warning( + f"Plugin initialization returned false for: {plugin_path}" + ) + else: + self.logger.info(f"Successfully initialized plugin: {plugin_path.name}") + else: + self.logger.info( + f"Plugin loaded (no initLibNvInferPlugins function): {plugin_path.name}" + ) + + except Exception as e: + raise RuntimeError(f"Failed to load plugin library {plugin_path}: {e}") from e + + def set_shapes(self, input_name: str, min_shape: list, opt_shape: list, max_shape: list): + """Set custom min/opt/max shapes for a dynamic input. + + This method allows you to specify custom shape ranges for dynamic inputs + (inputs with -1 dimensions). If not specified, the benchmark will use + default shapes (all -1 dimensions become 1). + + Args: + input_name: Name of the input tensor to configure. + min_shape: Minimum shape for this input. List of integers. + opt_shape: Optimal/default shape for this input. List of integers. + max_shape: Maximum shape for this input. List of integers. + """ + if len(min_shape) != len(opt_shape) or len(opt_shape) != len(max_shape): + raise ValueError("min_shape, opt_shape, and max_shape must have the same length") + + for i, (min_dim, opt_dim, max_dim) in enumerate(zip(min_shape, opt_shape, max_shape)): + if not (min_dim <= opt_dim <= max_dim): + raise ValueError( + f"Invalid shape range at dimension {i}: " + f"min={min_dim}, opt={opt_dim}, max={max_dim}. " + f"Must satisfy min <= opt <= max" + ) + + self._shape_configs[input_name] = (min_shape, opt_shape, max_shape) + self.logger.debug( + f"Set shapes for input '{input_name}': " + f"min={min_shape}, opt={opt_shape}, max={max_shape}" + ) + + def run( + self, + path_or_bytes: str | bytes, + log_file: str | None = None, + flush_timing_cache: bool = False, + ) -> float: + """Run benchmark using TensorRT Python API. + + Args: + path_or_bytes: Path to the ONNX model (str) or raw model data (bytes) + log_file: Optional path to save benchmark logs + + Returns: + Measured median latency in milliseconds + """ + config = None + network = None + parser = None + serialized_engine = None + engine = None + context = None + inputs = [] + outputs = [] + stream = None + + try: + self.logger.debug("Creating TensorRT builder...") + config = self.builder.create_builder_config() + config.set_flag(trt.BuilderFlag.DIRECT_IO) + if not config.set_timing_cache(self._timing_cache, ignore_mismatch=True): + self.logger.warning("Failed to set timing cache to builder config") + network = self.builder.create_network(self.network_flags) + parser = trt.OnnxParser(network, self.trt_logger) + if isinstance(path_or_bytes, bytes): + self.logger.debug(f"Parsing ONNX model from bytes (size: {len(path_or_bytes)})") + model_data = path_or_bytes + else: + self.logger.debug(f"Parsing ONNX model: {path_or_bytes}") + with open(path_or_bytes, "rb") as f: + model_data = f.read() + + if not parser.parse(model_data): + self.logger.error("Failed to parse ONNX model") + for error_idx in range(parser.num_errors): + self.logger.error(f" {parser.get_error(error_idx)}") + return float("inf") + + has_dynamic_shapes = False + for i in range(network.num_inputs): + input_tensor = network.get_input(i) + shape = input_tensor.shape + if any(dim == -1 for dim in shape): + has_dynamic_shapes = True + break + + if has_dynamic_shapes: + profile = self.builder.create_optimization_profile() + for i in range(network.num_inputs): + input_tensor = network.get_input(i) + input_name = input_tensor.name + shape = list(input_tensor.shape) + + if input_name in self._shape_configs: + min_shape, opt_shape, max_shape = self._shape_configs[input_name] + self.logger.debug( + f"Using custom shapes for input '{input_name}': " + f"min={min_shape}, opt={opt_shape}, max={max_shape}" + ) + else: + min_shape = [1 if dim == -1 else dim for dim in shape] + opt_shape = [1 if dim == -1 else dim for dim in shape] + max_shape = [1 if dim == -1 else dim for dim in shape] + self.logger.debug( + f"Using default shapes for input '{input_name}': {opt_shape}" + ) + + profile.set_shape(input_name, min_shape, opt_shape, max_shape) + + config.add_optimization_profile(profile) + + self.logger.debug("Building TensorRT engine...") + build_start = time.perf_counter() + serialized_engine = self.builder.build_serialized_network(network, config) + build_time = time.perf_counter() - build_start + + if serialized_engine is None: + self.logger.error("Failed to build TensorRT engine") + return float("inf") + + self.logger.debug(f"Engine built successfully in {build_time:.2f}s") + + if flush_timing_cache: + self._save_timing_cache() + + engine = self.runtime.deserialize_cuda_engine(serialized_engine) + + if engine is None: + self.logger.error("Failed to deserialize engine") + return float("inf") + + context = engine.create_execution_context() + + inputs = [] + outputs = [] + + for i in range(engine.num_io_tensors): + tensor_name = engine.get_tensor_name(i) + dtype = trt.nptype(engine.get_tensor_dtype(tensor_name)) + shape = context.get_tensor_shape(tensor_name) + + size = trt.volume(shape) + host_mem = cuda.pagelocked_empty(size, dtype) + device_mem = cuda.mem_alloc(host_mem.nbytes) + + if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT: + np.copyto(host_mem, np.random.randn(size).astype(dtype)) + inputs.append({"host": host_mem, "device": device_mem, "name": tensor_name}) + else: + outputs.append({"host": host_mem, "device": device_mem, "name": tensor_name}) + + context.set_tensor_address(tensor_name, int(device_mem)) + + stream = cuda.Stream() + + self.logger.debug(f"Running {self.warmup_runs} warmup iterations...") + for _ in range(self.warmup_runs): + for inp in inputs: + cuda.memcpy_htod_async(inp["device"], inp["host"], stream) + context.execute_async_v3(stream_handle=stream.handle) + for out in outputs: + cuda.memcpy_dtoh_async(out["host"], out["device"], stream) + stream.synchronize() + + self.logger.debug(f"Running {self.timing_runs} timing iterations...") + latencies = [] + + for _ in range(self.timing_runs): + for inp in inputs: + cuda.memcpy_htod_async(inp["device"], inp["host"], stream) + + stream.synchronize() + start = time.perf_counter() + context.execute_async_v3(stream_handle=stream.handle) + stream.synchronize() + end = time.perf_counter() + + latency_ms = (end - start) * 1000.0 + latencies.append(latency_ms) + + for out in outputs: + cuda.memcpy_dtoh_async(out["host"], out["device"], stream) + + latencies = np.array(latencies) + median_latency = float(np.median(latencies)) + mean_latency = float(np.mean(latencies)) + std_latency = float(np.std(latencies)) + min_latency = float(np.min(latencies)) + max_latency = float(np.max(latencies)) + + self.logger.info("TensorRT Python API benchmark:") + self.logger.info( + f" min={min_latency:.3f}ms, max={max_latency:.3f}ms, " + f"mean={mean_latency:.3f}ms, std={std_latency:.3f}ms, median={median_latency:.3f}ms" + ) + + if log_file is not None: + try: + log_path = Path(log_file) + log_path.parent.mkdir(parents=True, exist_ok=True) + model_info = ( + f"" + if isinstance(path_or_bytes, bytes) + else path_or_bytes + ) + with open(log_path, "w") as f: + output = "" + output += "TensorRT Python API Benchmark\n" + output += f"Model: {model_info}\n" + output += f"Build time: {build_time:.2f}s\n" + output += f"Warmup runs: {self.warmup_runs}\n" + output += f"Timing runs: {self.timing_runs}\n" + output += "Latency Statistics:\n" + output += f" Min: {min_latency:.3f} ms\n" + output += f" Max: {max_latency:.3f} ms\n" + output += f" Mean: {mean_latency:.3f} ms\n" + output += f" Std: {std_latency:.3f} ms\n" + output += f" Median: {median_latency:.3f} ms\n" + output += f"All latencies: {latencies.tolist()}\n" + f.write(output) + self.logger.debug(f"Saved benchmark logs to: {log_file}") + except Exception as e: + self.logger.warning(f"Failed to save logs to {log_file}: {e}") + return median_latency + except Exception as e: + self.logger.error(f"Benchmark failed: {e}", exc_info=True) + return float("inf") + finally: + try: + for inp in inputs: + if "device" in inp: + inp["device"].free() + if "host" in inp: + del inp["host"] + for out in outputs: + if "device" in out: + out["device"].free() + if "host" in out: + del out["host"] + inputs.clear() + outputs.clear() + + if context is not None: + del context + if stream is not None: + del stream + if engine is not None: + del engine + if serialized_engine is not None: + del serialized_engine + if parser is not None: + del parser + if network is not None: + del network + if config is not None: + del config + except Exception as cleanup_error: + self.logger.warning(f"Error during cleanup: {cleanup_error}") + + def _load_timing_cache(self): + """Load timing cache from file or create a new one.""" + config = self.builder.create_builder_config() + if os.path.exists(self.timing_cache_file): + try: + with open(self.timing_cache_file, "rb") as f: + timing_cache_data = f.read() + self._timing_cache = config.create_timing_cache(timing_cache_data) + self.logger.debug(f"Loaded timing cache from: {self.timing_cache_file}") + except Exception as e: + self.logger.warning(f"Failed to load timing cache: {e}") + self.logger.debug("Creating new timing cache") + self._timing_cache = None + + if self._timing_cache is None: + self._timing_cache = config.create_timing_cache(b"") + self.logger.debug("Created new timing cache") + del config + + def _save_timing_cache(self): + """Save timing cache to file.""" + try: + if self._timing_cache is not None: + timing_cache_data = self._timing_cache.serialize() + with open(self.timing_cache_file, "wb") as f: + f.write(timing_cache_data) + self.logger.debug(f"Saved timing cache to: {self.timing_cache_file}") + except Exception as e: + self.logger.warning(f"Failed to save timing cache: {e}") From fcdb871ae0ac8bf21c6b6b8b33b2986af4a20d0c Mon Sep 17 00:00:00 2001 From: Will Guo Date: Wed, 4 Feb 2026 03:20:10 +0000 Subject: [PATCH 2/8] add warning when remote autotuner is not available Signed-off-by: Will Guo --- .../onnx/quantization/autotune/benchmark.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index f1bd33cb9..c5738ccd2 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -55,6 +55,7 @@ PYCUDA_AVAILABLE = False from modelopt.onnx.logging_config import logger +from modelopt.onnx.quantization.ort_utils import _check_for_tensorrt class Benchmark(ABC): @@ -190,8 +191,23 @@ def __init__( self._base_cmd.append(f"--staticPlugins={plugin_path}") self.logger.debug(f"Added plugin library: {plugin_path}") - if self.trtexec_args: - self._base_cmd.extend(self.trtexec_args) + trtexec_args = self.trtexec_args or [] + has_remote_config = any("--remoteAutoTuningConfig" in arg for arg in trtexec_args) + + if has_remote_config: + try: + _check_for_tensorrt(min_version="10.16") + self.logger.debug("TensorRT Python API version >= 10.16 detected") + return + except ImportError: + self.logger.warning( + "Remote autotuning is not supported with TensorRT version < 10.16" + "Removing --remoteAutoTuningConfig from trtexec arguments" + ) + trtexec_args = [ + arg for arg in trtexec_args if "--remoteAutoTuningConfig" not in arg + ] + self._base_cmd.extend(trtexec_args) self.logger.debug(f"Base command template: {' '.join(self._base_cmd)}") From 4815983b1145e763b169ef1a97fa218b1be8842a Mon Sep 17 00:00:00 2001 From: Will Guo Date: Mon, 9 Feb 2026 07:29:10 +0000 Subject: [PATCH 3/8] simplify benchmark code Signed-off-by: Will Guo --- .../onnx/quantization/autotune/benchmark.py | 51 ++++++------------- 1 file changed, 15 insertions(+), 36 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index c5738ccd2..71357f175 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python3 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -38,7 +37,6 @@ import numpy as np -# Optional dependencies - gracefully handle missing packages try: import tensorrt as trt @@ -165,11 +163,10 @@ def __init__( super().__init__(timing_cache_file, warmup_runs, timing_runs, plugin_libraries) self.trtexec_path = trtexec_path self.trtexec_args = trtexec_args if trtexec_args is not None else [] - self._temp_dir = tempfile.mkdtemp(prefix="trtexec_benchmark_") - self.engine_dir = self._temp_dir - self.engine_path = os.path.join(self.engine_dir, "engine.trt") - self.temp_model_path = os.path.join(self.engine_dir, "temp_model.onnx") - self.logger.debug(f"Created temporary engine directory: {self.engine_dir}") + self.temp_dir = tempfile.mkdtemp(prefix="trtexec_benchmark_") + self.engine_path = os.path.join(self.temp_dir, "engine.trt") + self.temp_model_path = os.path.join(self.temp_dir, "temp_model.onnx") + self.logger.debug(f"Created temporary engine directory: {self.temp_dir}") self.logger.debug(f"Temporary model path: {self.temp_model_path}") self.latency_pattern = r"\[I\]\s+Latency:.*?median\s*=\s*([\d.]+)\s*ms" @@ -213,10 +210,10 @@ def __init__( def __del__(self): """Cleanup temporary directory.""" - if hasattr(self, "_temp_dir"): + if hasattr(self, "temp_dir"): try: - shutil.rmtree(self._temp_dir, ignore_errors=True) - self.logger.debug(f"Cleaned up temporary directory: {self._temp_dir}") + shutil.rmtree(self.temp_dir, ignore_errors=True) + self.logger.debug(f"Cleaned up temporary directory: {self.temp_dir}") except Exception as e: self.logger.warning(f"Failed to cleanup temporary directory: {e}") @@ -344,13 +341,8 @@ def __init__( self.network_flags = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) self.network_flags |= 1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED) - - # Load timing cache from disk or create new one self._timing_cache = None self._load_timing_cache() - - # Storage for user-defined shape configurations - # Format: {input_name: (min_shape, opt_shape, max_shape)} self._shape_configs = {} def _load_plugin_libraries(self): @@ -600,9 +592,8 @@ def run( min_latency = float(np.min(latencies)) max_latency = float(np.max(latencies)) - self.logger.info("TensorRT Python API benchmark:") self.logger.info( - f" min={min_latency:.3f}ms, max={max_latency:.3f}ms, " + f"TensorRT Python API benchmark: min={min_latency:.3f}ms, max={max_latency:.3f}ms, " f"mean={mean_latency:.3f}ms, std={std_latency:.3f}ms, median={median_latency:.3f}ms" ) @@ -639,33 +630,21 @@ def run( return float("inf") finally: try: + [inp["device"].free() for inp in inputs if "device" in inp] + [out["device"].free() for out in outputs if "device" in out] for inp in inputs: - if "device" in inp: - inp["device"].free() if "host" in inp: del inp["host"] for out in outputs: - if "device" in out: - out["device"].free() if "host" in out: del out["host"] inputs.clear() outputs.clear() - - if context is not None: - del context - if stream is not None: - del stream - if engine is not None: - del engine - if serialized_engine is not None: - del serialized_engine - if parser is not None: - del parser - if network is not None: - del network - if config is not None: - del config + resources = [context, stream, engine, serialized_engine, parser, network, config] + for resource in resources: + if resource is not None: + del resource + resources.clear() except Exception as cleanup_error: self.logger.warning(f"Error during cleanup: {cleanup_error}") From 3fae6cb50ed49947605342e3a72f2af4fbd4e679 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Tue, 10 Feb 2026 09:33:29 +0000 Subject: [PATCH 4/8] add qdq-placement deps group Signed-off-by: Will Guo --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index b2419d908..63df71d11 100644 --- a/setup.py +++ b/setup.py @@ -107,6 +107,7 @@ } # create "compound" optional dependencies +optional_deps["qdq-placement"] = [*optional_deps["onnx"], "pycuda>=2026.01"] optional_deps["all"] = [ deps for k in optional_deps if not k.startswith("dev") for deps in optional_deps[k] ] From f536f26febe16ed0716d18e53ee46be927d0673e Mon Sep 17 00:00:00 2001 From: Will Guo Date: Wed, 11 Feb 2026 14:00:57 +0000 Subject: [PATCH 5/8] remove qdq-placement deps group Signed-off-by: Will Guo --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 63df71d11..b2419d908 100644 --- a/setup.py +++ b/setup.py @@ -107,7 +107,6 @@ } # create "compound" optional dependencies -optional_deps["qdq-placement"] = [*optional_deps["onnx"], "pycuda>=2026.01"] optional_deps["all"] = [ deps for k in optional_deps if not k.startswith("dev") for deps in optional_deps[k] ] From 1cb66ede8f8e17213a73263a3bbfb4a7e51351bb Mon Sep 17 00:00:00 2001 From: Will Guo Date: Thu, 12 Feb 2026 23:48:23 +0000 Subject: [PATCH 6/8] resolve comments Signed-off-by: Will Guo --- .../onnx/quantization/autotune/__init__.py | 6 - .../onnx/quantization/autotune/benchmark.py | 230 +++++++++++------- 2 files changed, 137 insertions(+), 99 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/__init__.py b/modelopt/onnx/quantization/autotune/__init__.py index c1b2ef5f5..91e86889f 100644 --- a/modelopt/onnx/quantization/autotune/__init__.py +++ b/modelopt/onnx/quantization/autotune/__init__.py @@ -25,11 +25,8 @@ from .common import ( AutotunerError, AutotunerNotInitializedError, - Config, InsertionScheme, InvalidSchemeError, - PatternCache, - PatternSchemes, Region, RegionType, ) @@ -48,12 +45,9 @@ "ChildRegionInputInsertionPoint", "ChildRegionOutputInsertionPoint", "CombinedRegionSearch", - "Config", "InsertionScheme", "InvalidSchemeError", "NodeInputInsertionPoint", - "PatternCache", - "PatternSchemes", "Region", "RegionPattern", "RegionType", diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index 71357f175..6fdf3883e 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -25,7 +25,9 @@ - TensorRTPyBenchmark: Uses TensorRT Python API for direct engine profiling """ +import contextlib import ctypes +import importlib.util import os import re import shutil @@ -37,24 +39,21 @@ import numpy as np -try: - import tensorrt as trt - - TRT_AVAILABLE = True -except ImportError: - TRT_AVAILABLE = False - -try: - import pycuda.autoinit # noqa: F401 # Automatically initializes CUDA (side-effect import) - import pycuda.driver as cuda - - PYCUDA_AVAILABLE = True -except ImportError: - PYCUDA_AVAILABLE = False - from modelopt.onnx.logging_config import logger from modelopt.onnx.quantization.ort_utils import _check_for_tensorrt +TRT_AVAILABLE = importlib.util.find_spec("tensorrt") is not None +if TRT_AVAILABLE: + import tensorrt as trt + +CUDART_AVAILABLE = importlib.util.find_spec("cuda") is not None +if CUDART_AVAILABLE: + try: + from cuda import cudart + except ImportError: + with contextlib.suppress(ImportError): + from cuda.bindings import runtime as cudart + class Benchmark(ABC): """Abstract base class for TensorRT model benchmarking. @@ -93,7 +92,6 @@ def __init__( These plugins will be loaded during engine building. If None, no custom plugins are loaded. """ - global logger self.timing_cache_file = timing_cache_file or "/tmp/trtexec_timing.cache" # nosec B108 self.warmup_runs = warmup_runs self.timing_runs = timing_runs @@ -204,7 +202,7 @@ def __init__( trtexec_args = [ arg for arg in trtexec_args if "--remoteAutoTuningConfig" not in arg ] - self._base_cmd.extend(trtexec_args) + self._base_cmd.extend(trtexec_args) self.logger.debug(f"Base command template: {' '.join(self._base_cmd)}") @@ -250,17 +248,21 @@ def run( log_path = Path(log_file) log_path.parent.mkdir(parents=True, exist_ok=True) with open(log_path, "w") as f: - output = "" - output += f"Command: {' '.join(cmd)}\n" - output += f"Return code: {result.returncode}\n" - output += "=" * 80 + "\n" - output += "STDOUT:\n" - output += "=" * 80 + "\n" - output += result.stdout - output += "\n" + "=" * 80 + "\n" - output += "STDERR:\n" - output += "=" * 80 + "\n" - output += result.stderr + output = "\n".join( + [ + f"Command: {' '.join(cmd)}", + f"Return code: {result.returncode}", + "=" * 80, + "STDOUT:", + "=" * 80, + result.stdout, + "\n" + "=" * 80, + "STDERR:", + "=" * 80, + result.stderr, + "\n" + "=" * 80, + ] + ) f.write(output) self.logger.debug(f"Saved trtexec logs to: {log_file}") except Exception as e: @@ -271,8 +273,7 @@ def run( self.logger.error(f"stderr: {result.stderr}") return float("inf") - match = re.search(self.latency_pattern, result.stdout, re.IGNORECASE) - if not match: + if not (match := re.search(self.latency_pattern, result.stdout, re.IGNORECASE)): self.logger.warning("Could not parse median latency from trtexec output") self.logger.debug(f"trtexec stdout:\n{result.stdout}") return float("inf") @@ -319,7 +320,7 @@ def __init__( engine building. If None, no custom plugins are loaded. Raises: - ImportError: If tensorrt or pycuda packages are not available. + ImportError: If tensorrt or cuda-python (cudart) packages are not available. FileNotFoundError: If a specified plugin library file does not exist. RuntimeError: If plugin library loading fails. """ @@ -327,8 +328,10 @@ def __init__( if not TRT_AVAILABLE: raise ImportError("TensorRT Python API not available. Please install tensorrt package.") - if not PYCUDA_AVAILABLE: - raise ImportError("PyCUDA not available. Please install pycuda package.") + if not CUDART_AVAILABLE or cudart is None: + raise ImportError( + "CUDA Runtime (cudart) not available. Please install cuda-python package: pip install cuda-python" + ) self.trt_logger = trt.Logger(trt.Logger.WARNING) self.builder = trt.Builder(self.trt_logger) @@ -448,15 +451,8 @@ def run( Returns: Measured median latency in milliseconds """ - config = None - network = None - parser = None - serialized_engine = None - engine = None - context = None - inputs = [] - outputs = [] - stream = None + config = network = parser = serialized_engine = engine = context = stream_handle = None + inputs, outputs = [], [] try: self.logger.debug("Creating TensorRT builder...") @@ -480,13 +476,10 @@ def run( self.logger.error(f" {parser.get_error(error_idx)}") return float("inf") - has_dynamic_shapes = False - for i in range(network.num_inputs): - input_tensor = network.get_input(i) - shape = input_tensor.shape - if any(dim == -1 for dim in shape): - has_dynamic_shapes = True - break + has_dynamic_shapes = any( + any(dim == -1 for dim in input_tensor.shape) + for input_tensor in network.get_inputs() + ) if has_dynamic_shapes: profile = self.builder.create_optimization_profile() @@ -537,6 +530,17 @@ def run( inputs = [] outputs = [] + stream_handle = None + + def _alloc_pinned_host(size: int, dtype: np.dtype): + nbytes = size * np.dtype(dtype).itemsize + err, host_ptr = cudart.cudaMallocHost(nbytes) + if err != cudart.cudaError_t.cudaSuccess: + raise RuntimeError(f"cudaMallocHost failed: {err}") + addr = int(host_ptr) if hasattr(host_ptr, "__int__") else host_ptr + ctype = np.ctypeslib.as_ctypes_type(dtype) + arr = np.ctypeslib.as_array((ctype * size).from_address(addr)) + return host_ptr, arr for i in range(engine.num_io_tensors): tensor_name = engine.get_tensor_name(i) @@ -544,46 +548,80 @@ def run( shape = context.get_tensor_shape(tensor_name) size = trt.volume(shape) - host_mem = cuda.pagelocked_empty(size, dtype) - device_mem = cuda.mem_alloc(host_mem.nbytes) + nbytes = size * np.dtype(dtype).itemsize + + err, device_ptr = cudart.cudaMalloc(nbytes) + if err != cudart.cudaError_t.cudaSuccess: + raise RuntimeError(f"cudaMalloc failed: {err}") + + host_ptr, host_mem = _alloc_pinned_host(size, dtype) if engine.get_tensor_mode(tensor_name) == trt.TensorIOMode.INPUT: np.copyto(host_mem, np.random.randn(size).astype(dtype)) - inputs.append({"host": host_mem, "device": device_mem, "name": tensor_name}) + inputs.append( + { + "host_ptr": host_ptr, + "host": host_mem, + "device_ptr": device_ptr, + "nbytes": nbytes, + "name": tensor_name, + } + ) else: - outputs.append({"host": host_mem, "device": device_mem, "name": tensor_name}) + outputs.append( + { + "host_ptr": host_ptr, + "host": host_mem, + "device_ptr": device_ptr, + "nbytes": nbytes, + "name": tensor_name, + } + ) + + context.set_tensor_address(tensor_name, int(device_ptr)) - context.set_tensor_address(tensor_name, int(device_mem)) + err, stream_handle = cudart.cudaStreamCreate() + if err != cudart.cudaError_t.cudaSuccess: + raise RuntimeError(f"cudaStreamCreate failed: {err}") - stream = cuda.Stream() + h2d = cudart.cudaMemcpyKind.cudaMemcpyHostToDevice + d2h = cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost self.logger.debug(f"Running {self.warmup_runs} warmup iterations...") for _ in range(self.warmup_runs): for inp in inputs: - cuda.memcpy_htod_async(inp["device"], inp["host"], stream) - context.execute_async_v3(stream_handle=stream.handle) + cudart.cudaMemcpyAsync( + inp["device_ptr"], inp["host_ptr"], inp["nbytes"], h2d, stream_handle + ) + context.execute_async_v3(stream_handle) for out in outputs: - cuda.memcpy_dtoh_async(out["host"], out["device"], stream) - stream.synchronize() + cudart.cudaMemcpyAsync( + out["host_ptr"], out["device_ptr"], out["nbytes"], d2h, stream_handle + ) + cudart.cudaStreamSynchronize(stream_handle) self.logger.debug(f"Running {self.timing_runs} timing iterations...") latencies = [] for _ in range(self.timing_runs): for inp in inputs: - cuda.memcpy_htod_async(inp["device"], inp["host"], stream) + cudart.cudaMemcpyAsync( + inp["device_ptr"], inp["host_ptr"], inp["nbytes"], h2d, stream_handle + ) - stream.synchronize() + cudart.cudaStreamSynchronize(stream_handle) start = time.perf_counter() - context.execute_async_v3(stream_handle=stream.handle) - stream.synchronize() + context.execute_async_v3(stream_handle) + cudart.cudaStreamSynchronize(stream_handle) end = time.perf_counter() latency_ms = (end - start) * 1000.0 latencies.append(latency_ms) for out in outputs: - cuda.memcpy_dtoh_async(out["host"], out["device"], stream) + cudart.cudaMemcpyAsync( + out["host_ptr"], out["device_ptr"], out["nbytes"], d2h, stream_handle + ) latencies = np.array(latencies) median_latency = float(np.median(latencies)) @@ -607,20 +645,23 @@ def run( else path_or_bytes ) with open(log_path, "w") as f: - output = "" - output += "TensorRT Python API Benchmark\n" - output += f"Model: {model_info}\n" - output += f"Build time: {build_time:.2f}s\n" - output += f"Warmup runs: {self.warmup_runs}\n" - output += f"Timing runs: {self.timing_runs}\n" - output += "Latency Statistics:\n" - output += f" Min: {min_latency:.3f} ms\n" - output += f" Max: {max_latency:.3f} ms\n" - output += f" Mean: {mean_latency:.3f} ms\n" - output += f" Std: {std_latency:.3f} ms\n" - output += f" Median: {median_latency:.3f} ms\n" - output += f"All latencies: {latencies.tolist()}\n" - f.write(output) + output = "\n".join( + [ + "TensorRT Python API Benchmark", + f"Model: {model_info}", + f"Build time: {build_time:.2f}s", + f"Warmup runs: {self.warmup_runs}", + f"Timing runs: {self.timing_runs}", + "Latency Statistics:", + f" Min: {min_latency:.3f} ms", + f" Max: {max_latency:.3f} ms", + f" Mean: {mean_latency:.3f} ms", + f" Std: {std_latency:.3f} ms", + f" Median: {median_latency:.3f} ms", + f"All latencies: {latencies.tolist()}", + ] + ) + f.write(output) # type: ignore[arg-type] self.logger.debug(f"Saved benchmark logs to: {log_file}") except Exception as e: self.logger.warning(f"Failed to save logs to {log_file}: {e}") @@ -630,21 +671,24 @@ def run( return float("inf") finally: try: - [inp["device"].free() for inp in inputs if "device" in inp] - [out["device"].free() for out in outputs if "device" in out] - for inp in inputs: - if "host" in inp: - del inp["host"] - for out in outputs: - if "host" in out: - del out["host"] - inputs.clear() - outputs.clear() - resources = [context, stream, engine, serialized_engine, parser, network, config] - for resource in resources: - if resource is not None: - del resource - resources.clear() + for buf in inputs + outputs: + if "host_ptr" in buf: + cudart.cudaFreeHost(buf["host_ptr"]) + if "device_ptr" in buf: + cudart.cudaFree(buf["device_ptr"]) + if stream_handle is not None: + cudart.cudaStreamDestroy(stream_handle) + del ( + inputs, + outputs, + stream_handle, + context, + engine, + serialized_engine, + parser, + network, + config, + ) except Exception as cleanup_error: self.logger.warning(f"Error during cleanup: {cleanup_error}") From b6506b39fd23bc1c51e5fb5cbb54acbe7077094d Mon Sep 17 00:00:00 2001 From: Will Guo Date: Mon, 16 Feb 2026 22:59:02 +0000 Subject: [PATCH 7/8] resolve comments Signed-off-by: Will Guo --- .../onnx/quantization/autotune/benchmark.py | 109 +++++++++--------- 1 file changed, 54 insertions(+), 55 deletions(-) diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index 6fdf3883e..1213121f7 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -123,6 +123,18 @@ def __call__(self, path_or_bytes: str | bytes, log_file: str | None = None) -> f """ return self.run(path_or_bytes, log_file) + def _write_log_file(self, file: Path | str | None, content: str) -> None: + if file is None: + return + if isinstance(file, str): + file = Path(file) + try: + file.parent.mkdir(parents=True, exist_ok=True) + file.write_text(content) + self.logger.debug(f"Saved logs to: {file}") + except Exception as e: + self.logger.warning(f"Failed to save logs to {file}: {e}") + class TrtExecBenchmark(Benchmark): """TensorRT benchmark using trtexec command-line tool. @@ -243,31 +255,24 @@ def run( cmd = [*self._base_cmd, f"--onnx={model_path}"] self.logger.debug(f"Running: {' '.join(cmd)}") result = subprocess.run(cmd, capture_output=True, text=True) # nosec B603 - if log_file is not None: - try: - log_path = Path(log_file) - log_path.parent.mkdir(parents=True, exist_ok=True) - with open(log_path, "w") as f: - output = "\n".join( - [ - f"Command: {' '.join(cmd)}", - f"Return code: {result.returncode}", - "=" * 80, - "STDOUT:", - "=" * 80, - result.stdout, - "\n" + "=" * 80, - "STDERR:", - "=" * 80, - result.stderr, - "\n" + "=" * 80, - ] - ) - f.write(output) - self.logger.debug(f"Saved trtexec logs to: {log_file}") - except Exception as e: - self.logger.warning(f"Failed to save logs to {log_file}: {e}") - + self._write_log_file( + log_file, + "\n".join( + [ + f"Command: {' '.join(cmd)}", + f"Return code: {result.returncode}", + "=" * 80, + "STDOUT:", + "=" * 80, + result.stdout, + "\n" + "=" * 80, + "STDERR:", + "=" * 80, + result.stderr, + "\n" + "=" * 80, + ] + ), + ) if result.returncode != 0: self.logger.error(f"trtexec failed with return code {result.returncode}") self.logger.error(f"stderr: {result.stderr}") @@ -635,36 +640,30 @@ def _alloc_pinned_host(size: int, dtype: np.dtype): f"mean={mean_latency:.3f}ms, std={std_latency:.3f}ms, median={median_latency:.3f}ms" ) - if log_file is not None: - try: - log_path = Path(log_file) - log_path.parent.mkdir(parents=True, exist_ok=True) - model_info = ( - f"" - if isinstance(path_or_bytes, bytes) - else path_or_bytes - ) - with open(log_path, "w") as f: - output = "\n".join( - [ - "TensorRT Python API Benchmark", - f"Model: {model_info}", - f"Build time: {build_time:.2f}s", - f"Warmup runs: {self.warmup_runs}", - f"Timing runs: {self.timing_runs}", - "Latency Statistics:", - f" Min: {min_latency:.3f} ms", - f" Max: {max_latency:.3f} ms", - f" Mean: {mean_latency:.3f} ms", - f" Std: {std_latency:.3f} ms", - f" Median: {median_latency:.3f} ms", - f"All latencies: {latencies.tolist()}", - ] - ) - f.write(output) # type: ignore[arg-type] - self.logger.debug(f"Saved benchmark logs to: {log_file}") - except Exception as e: - self.logger.warning(f"Failed to save logs to {log_file}: {e}") + model_info = ( + f"" + if isinstance(path_or_bytes, bytes) + else path_or_bytes + ) + self._write_log_file( + log_file, + "\n".join( + [ + "TensorRT Python API Benchmark", + f"Model: {model_info}", + f"Build time: {build_time:.2f}s", + f"Warmup runs: {self.warmup_runs}", + f"Timing runs: {self.timing_runs}", + "Latency Statistics:", + f" Min: {min_latency:.3f} ms", + f" Max: {max_latency:.3f} ms", + f" Mean: {mean_latency:.3f} ms", + f" Std: {std_latency:.3f} ms", + f" Median: {median_latency:.3f} ms", + f"All latencies: {latencies.tolist()}", + ] + ), + ) return median_latency except Exception as e: self.logger.error(f"Benchmark failed: {e}", exc_info=True) From c941acaaf2a05e88aeac63f38faaa55746a53342 Mon Sep 17 00:00:00 2001 From: Will Guo Date: Mon, 23 Feb 2026 02:51:59 +0000 Subject: [PATCH 8/8] add benchmark unittests Signed-off-by: Will Guo --- .../onnx/quantization/autotune/benchmark.py | 6 +- .../onnx/quantization/autotune/models.py | 47 +++++++ .../quantization/autotune/test_benchmark.py | 124 ++++++++++++++++++ 3 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 tests/_test_utils/onnx/quantization/autotune/models.py create mode 100644 tests/gpu/onnx/quantization/autotune/test_benchmark.py diff --git a/modelopt/onnx/quantization/autotune/benchmark.py b/modelopt/onnx/quantization/autotune/benchmark.py index 1213121f7..c0850765f 100644 --- a/modelopt/onnx/quantization/autotune/benchmark.py +++ b/modelopt/onnx/quantization/autotune/benchmark.py @@ -151,7 +151,7 @@ def __init__( timing_runs: int = 10, plugin_libraries: list[str] | None = None, trtexec_path: str = "trtexec", - trtexec_args: list | None = None, + trtexec_args: list[str] | None = None, ): """Initialize the trtexec benchmark. @@ -482,8 +482,8 @@ def run( return float("inf") has_dynamic_shapes = any( - any(dim == -1 for dim in input_tensor.shape) - for input_tensor in network.get_inputs() + any(dim == -1 for dim in network.get_input(i).shape) + for i in range(network.num_inputs) ) if has_dynamic_shapes: diff --git a/tests/_test_utils/onnx/quantization/autotune/models.py b/tests/_test_utils/onnx/quantization/autotune/models.py new file mode 100644 index 000000000..4090cfef3 --- /dev/null +++ b/tests/_test_utils/onnx/quantization/autotune/models.py @@ -0,0 +1,47 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Shared test ONNX models for autotuner unit tests. + +Model creation functions live here; tests import and call them directly. +""" + +import onnx +from onnx import helper + + +def _create_simple_conv_onnx_model(): + """Build ONNX model: Input -> Conv -> Relu -> Output (minimal for autotuner tests).""" + input_tensor = helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 3, 224, 224]) + output_tensor = helper.make_tensor_value_info( + "output", onnx.TensorProto.FLOAT, [1, 64, 224, 224] + ) + conv_node = helper.make_node( + "Conv", inputs=["input", "conv_weight"], outputs=["conv_out"], name="conv" + ) + relu_node = helper.make_node("Relu", inputs=["conv_out"], outputs=["output"], name="relu") + graph = helper.make_graph( + [conv_node, relu_node], + "simple_conv", + [input_tensor], + [output_tensor], + initializer=[ + helper.make_tensor( + "conv_weight", onnx.TensorProto.FLOAT, [64, 3, 3, 3], [0.1] * (64 * 3 * 3 * 3) + ) + ], + ) + return helper.make_model(graph, producer_name="test") diff --git a/tests/gpu/onnx/quantization/autotune/test_benchmark.py b/tests/gpu/onnx/quantization/autotune/test_benchmark.py new file mode 100644 index 000000000..a089d99d9 --- /dev/null +++ b/tests/gpu/onnx/quantization/autotune/test_benchmark.py @@ -0,0 +1,124 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GPU tests for autotune Benchmark (TrtExecBenchmark, TensorRTPyBenchmark).""" + +import contextlib +import os +import shutil +import tempfile + +import pytest +from _test_utils.onnx.quantization.autotune.models import _create_simple_conv_onnx_model + +from modelopt.onnx.quantization.autotune import TensorRTPyBenchmark, TrtExecBenchmark + + +@pytest.fixture +def simple_conv_model_bytes(): + """ONNX model bytes: Input -> Conv -> Relu -> Output (from _test_utils).""" + model = _create_simple_conv_onnx_model() + return model.SerializeToString() + + +@pytest.fixture +def simple_conv_model_path(simple_conv_model_bytes, tmp_path): + """Path to ONNX model file (same graph as simple_conv_model_bytes).""" + path = tmp_path / "simple_conv.onnx" + path.write_bytes(simple_conv_model_bytes) + return str(path) + + +class TestTensorRTPyBenchmark: + """Tests for TensorRTPyBenchmark (TensorRT Python API + cudart).""" + + @pytest.fixture(autouse=True) + def _require_tensorrt_and_cudart(self): + pytest.importorskip("tensorrt") + try: + from cuda import cudart # noqa: F401 + except ImportError: + try: + from cuda.bindings import runtime # noqa: F401 + except ImportError: + pytest.skip("cuda-python (cudart) not available", allow_module_level=False) + + def test_run_with_bytes(self, simple_conv_model_bytes): + """TensorRTPyBenchmark accepts model bytes and returns finite latency.""" + benchmark = TensorRTPyBenchmark(warmup_runs=1, timing_runs=2) + latency_ms = benchmark.run(simple_conv_model_bytes) + assert isinstance(latency_ms, float) + assert latency_ms > 0 + assert latency_ms != float("inf") + + def test_run_with_path(self, simple_conv_model_path): + """TensorRTPyBenchmark accepts model path and returns finite latency.""" + benchmark = TensorRTPyBenchmark(warmup_runs=1, timing_runs=2) + latency_ms = benchmark.run(simple_conv_model_path) + assert isinstance(latency_ms, float) + assert latency_ms > 0 + assert latency_ms != float("inf") + + def test_callable(self, simple_conv_model_bytes): + """Benchmark is callable and returns same as run().""" + benchmark = TensorRTPyBenchmark(warmup_runs=1, timing_runs=2) + latency_ms = benchmark(simple_conv_model_bytes) + assert isinstance(latency_ms, float) + assert latency_ms > 0 + + +class TestTrtExecBenchmark: + """Tests for TrtExecBenchmark (trtexec CLI).""" + + @pytest.fixture(autouse=True) + def _require_trtexec(self): + if shutil.which("trtexec") is None: + pytest.skip("trtexec not found in PATH", allow_module_level=False) + + def test_run_with_path(self, simple_conv_model_path): + """TrtExecBenchmark accepts model path and returns finite latency.""" + with tempfile.NamedTemporaryFile(suffix=".cache", delete=False) as f: + cache_path = f.name + try: + benchmark = TrtExecBenchmark( + timing_cache_file=cache_path, + warmup_runs=1, + timing_runs=2, + ) + latency_ms = benchmark.run(simple_conv_model_path) + assert isinstance(latency_ms, float) + assert latency_ms > 0 + assert latency_ms != float("inf") + finally: + with contextlib.suppress(OSError): + os.unlink(cache_path) + + def test_run_with_bytes(self, simple_conv_model_bytes): + """TrtExecBenchmark accepts model bytes (writes temp file) and returns finite latency.""" + with tempfile.NamedTemporaryFile(suffix=".cache", delete=False) as f: + cache_path = f.name + try: + benchmark = TrtExecBenchmark( + timing_cache_file=cache_path, + warmup_runs=1, + timing_runs=2, + ) + latency_ms = benchmark.run(simple_conv_model_bytes) + assert isinstance(latency_ms, float) + assert latency_ms > 0 + assert latency_ms != float("inf") + finally: + with contextlib.suppress(OSError): + os.unlink(cache_path)