Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -807,6 +807,12 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
endif()

# TensorRT examples (benchmark, etc.) need extension_module and extension_tensor,
# so they must be included after those targets are defined above.
# Guarded so that builds without -DEXECUTORCH_BUILD_TENSORRT=ON never descend
# into examples/nvidia at all.
if(EXECUTORCH_BUILD_TENSORRT)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/examples/nvidia)
endif()

if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
if(NOT TARGET cpuinfo)
message(
Expand Down
41 changes: 41 additions & 0 deletions examples/nvidia/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# NVIDIA Backend Examples
#
# This CMakeLists.txt includes the TensorRT examples subdirectory.
#
# Supported platforms:
# - Linux x86_64 with NVIDIA GPU (devgpu, workstations)
# - NVIDIA Jetson (Orin Nano, AGX Orin, etc.)
#
# Build instructions:
# cmake .. -DEXECUTORCH_BUILD_TENSORRT=ON
# cmake --build . --target benchmark

cmake_minimum_required(VERSION 3.19)

# Only C++ is compiled in this subtree; listing LANGUAGES avoids probing for
# a C compiler that is never used.
project(nvidia_examples LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Prefer -std=c++17 over -std=gnu++17 so the examples stay portable.
set(CMAKE_CXX_EXTENSIONS OFF)

# Ensure compile_commands.json is generated for tooling
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Source root directory for executorch. When this directory is entered via
# add_subdirectory() from the top-level build, EXECUTORCH_ROOT is normally
# already set; the fallback supports configuring this directory standalone.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
endif()

# Include utility CMake scripts from ExecuTorch
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Find CUDA. The tensorrt/ subdirectory links CUDA::cudart unconditionally
# whenever EXECUTORCH_BUILD_TENSORRT is ON, so require the toolkit up front
# and fail with a clear message instead of a late "target CUDA::cudart not
# found" error at generate time. Without the TensorRT examples enabled the
# lookup stays optional and quiet.
if(EXECUTORCH_BUILD_TENSORRT)
  find_package(CUDAToolkit REQUIRED)
else()
  find_package(CUDAToolkit QUIET)
endif()

# Add TensorRT examples subdirectory
add_subdirectory(tensorrt)
80 changes: 80 additions & 0 deletions examples/nvidia/tensorrt/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# TensorRT Examples - Benchmark runner
#
# Build:
# cmake -DEXECUTORCH_BUILD_TENSORRT=ON ...
# cmake --build . --target benchmark
#
# Usage:
# ./benchmark                     # all .pte/.onnx in current dir
# ./benchmark -m mv3              # mv3 .pte and .onnx in current dir
# ./benchmark -d /tmp/trt -n 200  # all models in /tmp/trt, 200 iterations

cmake_minimum_required(VERSION 3.19)

# Fallback for standalone configuration; the top-level build normally sets
# EXECUTORCH_ROOT before entering this directory.
if(NOT EXECUTORCH_ROOT)
  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
endif()

# Parent of the executorch checkout, so that #include <executorch/...>
# resolves from the examples' sources.
set(COMMON_INCLUDE_DIRS ${EXECUTORCH_ROOT}/..)

if(EXECUTORCH_BUILD_TENSORRT)
  # Defines CMAKE_INSTALL_BINDIR used by install() below; without this the
  # variable may be empty and the install destination would be invalid.
  include(GNUInstallDirs)

  # CUDA runtime is mandatory for the TensorRT benchmark. Locate it here if
  # the parent scope has not already provided the imported target, so the
  # unconditional CUDA::cudart link below cannot fail cryptically.
  if(NOT TARGET CUDA::cudart)
    find_package(CUDAToolkit REQUIRED)
  endif()

  # nvonnxparser is optional (only linked when found); search the TensorRT
  # install first, then the common system library directories.
  find_library(NVONNXPARSER_LIBRARY nvonnxparser
    HINTS ${TENSORRT_HOME}/lib ${TENSORRT_HOME}/lib64
          /usr/lib /usr/lib/x86_64-linux-gnu /usr/lib/aarch64-linux-gnu
  )

  add_executable(benchmark ${CMAKE_CURRENT_SOURCE_DIR}/benchmark.cpp)

  # PRIVATE: an executable has no consumers, so its include paths are purely
  # a build requirement of this target.
  target_include_directories(
    benchmark
    PRIVATE $<BUILD_INTERFACE:${EXECUTORCH_ROOT}>
            $<BUILD_INTERFACE:${COMMON_INCLUDE_DIRS}>
  )

  # extension_module builds as extension_module_static in OSS CMake; accept
  # either target name and fail with actionable guidance when neither exists.
  if(TARGET extension_module_static)
    set(_extension_module extension_module_static)
  elseif(TARGET extension_module)
    set(_extension_module extension_module)
  else()
    message(FATAL_ERROR
      "extension_module not found. Build with -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON")
  endif()

  if(NOT TARGET extension_tensor)
    message(FATAL_ERROR
      "extension_tensor not found. Build with -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON")
  endif()

  target_link_libraries(
    benchmark
    PRIVATE executorch
            ${_extension_module}
            extension_tensor
            portable_kernels
  )

  # Force every object of the backend archive into the binary so its static
  # backend registration runs, even though nothing references it directly.
  target_link_options(
    benchmark
    PRIVATE
    "SHELL:LINKER:--whole-archive $<TARGET_FILE:tensorrt_backend> LINKER:--no-whole-archive"
  )
  target_link_libraries(benchmark PRIVATE CUDA::cudart)
  # TENSORRT_LIBRARY is presumably set by the TensorRT backend's own CMake —
  # TODO(review): confirm; linked only when available, as is nvonnxparser.
  if(TENSORRT_LIBRARY)
    target_link_libraries(benchmark PRIVATE ${TENSORRT_LIBRARY})
  endif()
  if(NVONNXPARSER_LIBRARY)
    target_link_libraries(benchmark PRIVATE ${NVONNXPARSER_LIBRARY})
  endif()
  # Needed because $<TARGET_FILE:...> inside link options does not create a
  # build-order dependency edge the way target_link_libraries() would.
  add_dependencies(benchmark tensorrt_backend)

  # The benchmark uses RTTI and exceptions even if the surrounding build
  # disables them globally.
  target_compile_options(benchmark PRIVATE -frtti -fexceptions)

  install(TARGETS benchmark DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
34 changes: 28 additions & 6 deletions examples/nvidia/tensorrt/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ Export a supported model to ExecuTorch format with TensorRT delegation:
# Export the add model
python -m executorch.examples.nvidia.tensorrt.export -m add

# Export with validation test
python -m executorch.examples.nvidia.tensorrt.export -m add --test
# Export all supported models to a directory
python -m executorch.examples.nvidia.tensorrt.export -o /tmp/trt

# Export to a specific directory
python -m executorch.examples.nvidia.tensorrt.export -m add -o ./output
Expand Down Expand Up @@ -59,6 +59,7 @@ python -m executorch.examples.nvidia.tensorrt.export --help

- `export.py` - Main export script for converting models to TensorRT format
- `runner.py` - Python utilities for running and testing exported models
- `benchmark.cpp` - C++ benchmark runner for performance measurement
- `tensorrt_executor_runner.cpp` - C++ executor runner for TensorRT models
- `__init__.py` - Package initialization

Expand All @@ -85,13 +86,31 @@ python -m executorch.examples.nvidia.tensorrt.export -m add
--help Show help message
```

### Validation Testing
## Benchmarking

The `--test` flag runs the exported model through the ExecuTorch runtime
and compares outputs against the PyTorch reference model:
Export models then benchmark with the C++ runner:

```bash
python -m executorch.examples.nvidia.tensorrt.export -m add --test
# Step 1: Export models
python -m executorch.examples.nvidia.tensorrt.export -o /tmp/trt

# Step 2: Benchmark all exported models
./benchmark -d /tmp/trt

# Benchmark a specific model
./benchmark -d /tmp/trt -m mv3

# Benchmark with custom timed-iteration and warmup counts
./benchmark -d /tmp/trt -n 200 -w 5
```

**Benchmark Options:**
```
-d, --model_dir DIR Directory with .pte files (default: current dir)
-m, --model_name NAME Run only NAME_tensorrt.pte from the directory
-n, --num_executions N Number of timed iterations (default: 100)
-w, --warmup N Number of warmup runs (default: 3)
-v, --verbose Enable verbose logging
```

## Adding New Models
Expand All @@ -109,7 +128,10 @@ To add support for a new model:
examples/nvidia/tensorrt/
├── export.py # CLI export script using MODEL_NAME_TO_MODEL registry
├── runner.py # Python runtime utilities for testing
├── benchmark.cpp # C++ benchmark runner binary
├── tensorrt_executor_runner.cpp # C++ executor runner binary
├── tests/ # Correctness tests
│ └── test_export.py # Export + inference verification
├── __init__.py # Package exports
└── README.md # This file
```
Expand Down
Loading
Loading