diff --git a/backends/nvidia/tensorrt/README.md b/backends/nvidia/tensorrt/README.md index 38e9c52de86..0f8ccc01f73 100644 --- a/backends/nvidia/tensorrt/README.md +++ b/backends/nvidia/tensorrt/README.md @@ -66,3 +66,43 @@ Download and install from the | `device_id` | int | 0 | CUDA device ID | | `dla_core` | int | -1 | DLA core ID (-1 = disabled) | | `allow_gpu_fallback` | bool | True | Allow GPU fallback when using DLA | + +## Version Compatibility + +- **Minimum TensorRT version**: 10.3 (required for serialization format + compatibility) +- **Recommended**: TensorRT 10.6 or later for best performance and feature + support + +## Blob Format + +The TensorRT delegate uses a custom binary blob format: + +``` +┌─────────────────────────────────────┐ +│ Header (32 bytes) │ +│ - magic: "TR01" (4 bytes) │ +│ - metadata_offset (4 bytes) │ +│ - metadata_size (4 bytes) │ +│ - engine_offset (4 bytes) │ +│ - engine_size (8 bytes) │ +│ - reserved (8 bytes) │ +├─────────────────────────────────────┤ +│ Metadata JSON (variable) │ +│ - I/O binding information │ +│ - Tensor names, dtypes, shapes │ +├─────────────────────────────────────┤ +│ Padding (16-byte alignment) │ +├─────────────────────────────────────┤ +│ TensorRT Engine (variable) │ +│ - Serialized TensorRT engine │ +└─────────────────────────────────────┘ +``` + +## Requirements + +- NVIDIA GPU with CUDA Compute Capability 5.0+ +- **TensorRT 10.3+** (required for serialization compatibility) +- CUDA Toolkit 11.x or 12.x +- cuDNN 8.x +- PyTorch 2.x with CUDA support (for export) diff --git a/backends/nvidia/tensorrt/serialization.py b/backends/nvidia/tensorrt/serialization.py new file mode 100644 index 00000000000..16eb9a361ba --- /dev/null +++ b/backends/nvidia/tensorrt/serialization.py @@ -0,0 +1,237 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
"""Serialization format for TensorRT backend.

Defines the binary blob format for storing TensorRT engines in .pte files.
The format is designed to be:
- Simple and self-contained
- 16-byte aligned for efficient memory access
- Forward compatible with reserved header space

Blob Layout:
    [Header: 32 bytes]
    [I/O Metadata JSON: variable]
    [Padding to 16-byte alignment]
    [Engine Data: variable]
"""

import json
import struct
from dataclasses import asdict, dataclass, field
from typing import List, Optional

# Magic bytes identifying a TensorRT blob.
# "TR01" = TensorRT version 1 format with I/O metadata.
# Must match kTensorRTMagic in TensorRTBlobHeader.h for C++ runtime compatibility.
TENSORRT_MAGIC = b"TR01"

# Header is 32 bytes, 16-byte aligned
# Layout:
#   magic (4 bytes)           - "TR01"
#   metadata_offset (4 bytes) - offset to metadata JSON from start
#   metadata_size (4 bytes)   - size of metadata JSON in bytes
#   engine_offset (4 bytes)   - offset to engine data from start
#   engine_size (8 bytes)     - size of engine data in bytes
#   reserved (8 bytes)        - for future use
HEADER_SIZE = 32
HEADER_FORMAT = "<4sIIIQ8s"  # little-endian

# Pre-compiled header codec so the format string is parsed only once.
_HEADER_STRUCT = struct.Struct(HEADER_FORMAT)

# Content written into the 8 reserved header bytes.
_RESERVED = b"\x00" * 8


@dataclass
class TensorRTIOBinding:
    """I/O binding metadata for a TensorRT engine tensor.

    Attributes:
        name: Name of the tensor binding.
        dtype: Data type as string (e.g., "float32", "float16", "int32").
        shape: Shape of the tensor as list of dimensions.
        is_input: True if this is an input binding, False for output.
    """

    name: str
    dtype: str
    shape: List[int]
    is_input: bool

    def to_dict(self) -> dict:
        """Convert to dictionary for JSON serialization."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: dict) -> "TensorRTIOBinding":
        """Create a binding from a dictionary.

        Args:
            data: Mapping with the keys "name", "dtype", "shape" and
                "is_input".

        Returns:
            A new TensorRTIOBinding.

        Raises:
            KeyError: If a required key is missing.
            TypeError: If "shape" is not iterable.
        """
        return cls(
            name=data["name"],
            dtype=data["dtype"],
            # Copy into a fresh list so the binding never aliases the caller's
            # object and a tuple shape still yields the declared List[int].
            shape=list(data["shape"]),
            is_input=data["is_input"],
        )


@dataclass
class TensorRTBlobMetadata:
    """Metadata stored in TensorRT blob.

    Attributes:
        io_bindings: List of input/output tensor bindings.
    """

    io_bindings: List[TensorRTIOBinding] = field(default_factory=list)

    def to_json(self) -> bytes:
        """Serialize metadata to compact, UTF-8 encoded JSON bytes."""
        data = {
            "io_bindings": [b.to_dict() for b in self.io_bindings],
        }
        return json.dumps(data, separators=(",", ":")).encode("utf-8")

    @classmethod
    def from_json(cls, data: bytes) -> "TensorRTBlobMetadata":
        """Deserialize metadata from JSON bytes.

        Args:
            data: UTF-8 encoded JSON object as produced by to_json().

        Returns:
            The decoded metadata. A missing "io_bindings" key yields an empty
            binding list.

        Raises:
            ValueError: If the bytes are not valid UTF-8 / JSON, or the
                top-level JSON value is not an object.
            KeyError: If a binding entry lacks a required key.
            TypeError: If a binding entry has the wrong structure.
        """
        parsed = json.loads(data.decode("utf-8"))
        if not isinstance(parsed, dict):
            # Without this check a top-level list or scalar would surface as an
            # unrelated AttributeError from ``parsed.get``.
            raise ValueError("TensorRT blob metadata must be a JSON object")
        io_bindings = [
            TensorRTIOBinding.from_dict(b) for b in parsed.get("io_bindings", [])
        ]
        return cls(io_bindings=io_bindings)


@dataclass
class TensorRTBlobHeader:
    """Header for TensorRT serialized blob.

    Attributes:
        magic: Magic bytes identifying blob type (b"TR01").
        metadata_offset: Offset in bytes from start to metadata.
        metadata_size: Size of metadata JSON in bytes.
        engine_offset: Offset in bytes from start to engine data.
        engine_size: Size of engine data in bytes.
    """

    magic: bytes
    metadata_offset: int
    metadata_size: int
    engine_offset: int
    engine_size: int

    def is_valid(self) -> bool:
        """Return True if the magic bytes identify a TensorRT blob.

        Only the magic is compared; offsets and sizes are not inspected here.
        """
        return self.magic == TENSORRT_MAGIC


def _align_to_16(offset: int) -> int:
    """Round offset up to the next multiple of 16."""
    return (offset + 15) & ~15


def _extract_section(data: bytes, offset: int, size: int) -> Optional[bytes]:
    """Return data[offset:offset + size], or None if the range is implausible.

    A section must start at or after the end of the header and end within
    data; anything else indicates a truncated or corrupt blob.
    """
    end = offset + size
    if offset < HEADER_SIZE or end > len(data):
        return None
    return data[offset:end]


def serialize_blob(
    engine_bytes: bytes,
    metadata: Optional[TensorRTBlobMetadata] = None,
) -> bytes:
    """Serialize TensorRT engine to blob format with metadata.

    Args:
        engine_bytes: Serialized TensorRT engine bytes.
        metadata: Optional metadata including I/O bindings. When omitted, an
            empty metadata object is written.

    Returns:
        Complete blob with header, metadata, padding, and engine data.

    Raises:
        struct.error: If a size or offset does not fit its header field
            (32-bit for metadata size / engine offset, 64-bit for engine size).
    """
    if metadata is None:
        metadata = TensorRTBlobMetadata()

    metadata_json = metadata.to_json()

    # Metadata directly follows the header; the engine starts at the next
    # 16-byte boundary after the metadata.
    metadata_offset = HEADER_SIZE
    metadata_end = metadata_offset + len(metadata_json)
    engine_offset = _align_to_16(metadata_end)

    header = _HEADER_STRUCT.pack(
        TENSORRT_MAGIC,
        metadata_offset,
        len(metadata_json),
        engine_offset,
        len(engine_bytes),
        _RESERVED,
    )

    # Zero padding bridges the gap between metadata end and engine start.
    padding = b"\x00" * (engine_offset - metadata_end)

    return b"".join((header, metadata_json, padding, engine_bytes))


def deserialize_blob_header(data: bytes) -> Optional[TensorRTBlobHeader]:
    """Deserialize blob header from binary data.

    The magic is NOT checked here; call TensorRTBlobHeader.is_valid() on the
    result to confirm the data is a TensorRT blob.

    Args:
        data: Binary data containing at least the header.

    Returns:
        The parsed TensorRTBlobHeader, or None if data is shorter than the
        header.
    """
    if len(data) < HEADER_SIZE:
        return None

    # unpack_from reads in place, avoiding a temporary slice of the blob.
    magic, metadata_offset, metadata_size, engine_offset, engine_size, _ = (
        _HEADER_STRUCT.unpack_from(data)
    )

    return TensorRTBlobHeader(
        magic=magic,
        metadata_offset=metadata_offset,
        metadata_size=metadata_size,
        engine_offset=engine_offset,
        engine_size=engine_size,
    )


def get_metadata_from_blob(data: bytes) -> Optional[TensorRTBlobMetadata]:
    """Extract metadata from blob.

    Args:
        data: Complete blob data.

    Returns:
        TensorRTBlobMetadata if valid blob, None otherwise. "Otherwise" covers
        a short or wrong-magic header, a metadata range that overlaps the
        header or runs past the end of data, and metadata bytes that cannot be
        decoded.
    """
    header = deserialize_blob_header(data)
    if header is None or not header.is_valid():
        return None

    metadata_json = _extract_section(
        data, header.metadata_offset, header.metadata_size
    )
    if metadata_json is None:
        return None

    try:
        return TensorRTBlobMetadata.from_json(metadata_json)
    except (ValueError, KeyError, TypeError):
        # Corrupt metadata is reported the same way as any other invalid blob
        # so callers only have to handle None. JSON and UTF-8 decode errors
        # are ValueError subclasses.
        return None


def get_engine_from_blob(data: bytes) -> Optional[bytes]:
    """Extract TensorRT engine bytes from blob.

    Args:
        data: Complete blob data.

    Returns:
        Engine bytes if valid blob, None otherwise. "Otherwise" covers a short
        or wrong-magic header and an engine range that overlaps the header or
        runs past the end of data.
    """
    header = deserialize_blob_header(data)
    if header is None or not header.is_valid():
        return None

    return _extract_section(data, header.engine_offset, header.engine_size)


# ---------------------------------------------------------------------------
# The source span this module was recovered from is part of a patch and also
# carried the following non-Python hunks. They are reproduced verbatim (line
# breaks restored) so no patch content is lost; they are not part of this
# module.
#
# diff --git a/backends/nvidia/tensorrt/targets.bzl b/backends/nvidia/tensorrt/targets.bzl
# index 49332ddbc25..ad42dca0152 100644
# --- a/backends/nvidia/tensorrt/targets.bzl
# +++ b/backends/nvidia/tensorrt/targets.bzl
# @@ -70,3 +70,11 @@ def define_common_targets():
#              "//executorch/exir/backend:compile_spec_schema",
#          ],
#      )
# +
# +    runtime.python_library(
# +        name = "serialization",
# +        srcs = [
# +            "serialization.py",
# +        ],
# +        visibility = ["PUBLIC"],
# +    )
# diff --git a/backends/nvidia/tensorrt/test/targets.bzl b/backends/nvidia/tensorrt/test/targets.bzl
# index e82fe69c900..2d216b9bd86 100644
# --- a/backends/nvidia/tensorrt/test/targets.bzl
# +++ b/backends/nvidia/tensorrt/test/targets.bzl
# @@ -16,3 +16,13 @@ def define_common_targets():
#              "//executorch/backends/nvidia/tensorrt:compile_spec",
#          ],
#      )
# +
# +    runtime.python_test(
# +        name = "test_serialization",
# +        srcs = [
# +            "test_serialization.py",
# +        ],
# +        deps = [
# +            "//executorch/backends/nvidia/tensorrt:serialization",
# +        ],
# +    )
# diff --git a/backends/nvidia/tensorrt/test/test_serialization.py b/backends/nvidia/tensorrt/test/test_serialization.py
# new file mode 100644
# index 00000000000..148d9835a8c
# --- /dev/null
# +++ b/backends/nvidia/tensorrt/test/test_serialization.py
# @@ -0,0 +1,203 @@
# +# Copyright (c) Meta Platforms, Inc. and affiliates.
# +# All rights reserved.
# +#
# +# This source code is licensed under the BSD-style license found in the
# +# LICENSE file in the root directory of this source tree.
# ---------------------------------------------------------------------------
"""Tests for TensorRT blob serialization format."""

import struct
import unittest


class SerializationTest(unittest.TestCase):
    """Tests for TensorRT blob serialization format.

    The module under test is imported inside each test method, as in the
    original file, so importing this test module does not itself require the
    backend package.
    """

    def test_serialize_and_deserialize_header(self) -> None:
        """Test that serialize/deserialize round-trips correctly."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            deserialize_blob_header,
            serialize_blob,
            TENSORRT_MAGIC,
        )

        engine_bytes = b"fake_engine_data_12345"
        blob = serialize_blob(engine_bytes)
        header = deserialize_blob_header(blob)

        self.assertIsNotNone(header)
        self.assertEqual(header.magic, TENSORRT_MAGIC)
        self.assertTrue(header.is_valid())
        self.assertEqual(header.engine_size, len(engine_bytes))

    def test_header_size(self) -> None:
        """Test that header size constant is correct."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            deserialize_blob_header,
            HEADER_FORMAT,
            HEADER_SIZE,
            serialize_blob,
        )

        # Verify HEADER_SIZE matches the struct format
        self.assertEqual(struct.calcsize(HEADER_FORMAT), HEADER_SIZE)

        # Verify blob layout: header starts at 0, metadata at HEADER_SIZE
        engine_bytes = b"test"
        blob = serialize_blob(engine_bytes)
        header = deserialize_blob_header(blob)

        # Fail with a clear assertion instead of an AttributeError on None.
        self.assertIsNotNone(header)
        self.assertEqual(header.metadata_offset, HEADER_SIZE)

    def test_get_engine_from_blob(self) -> None:
        """Test engine extraction from blob."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            get_engine_from_blob,
            serialize_blob,
        )

        engine_bytes = b"test_engine_bytes_here"
        blob = serialize_blob(engine_bytes)
        extracted = get_engine_from_blob(blob)

        self.assertEqual(extracted, engine_bytes)

    def test_invalid_blob_header(self) -> None:
        """Test that data shorter than the header returns None."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            deserialize_blob_header,
        )

        result = deserialize_blob_header(b"short")
        self.assertIsNone(result)

    def test_invalid_magic(self) -> None:
        """Test that invalid magic returns False for is_valid."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            TensorRTBlobHeader,
        )

        header = TensorRTBlobHeader(
            magic=b"XXXX",
            metadata_offset=32,
            metadata_size=0,
            engine_offset=32,
            engine_size=100,
        )
        self.assertFalse(header.is_valid())

    def test_invalid_magic_blob_returns_none(self) -> None:
        """Test that extraction rejects a blob whose magic was overwritten."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            get_engine_from_blob,
            get_metadata_from_blob,
            serialize_blob,
        )

        blob = serialize_blob(b"engine")
        tampered = b"XXXX" + blob[4:]

        self.assertIsNone(get_engine_from_blob(tampered))
        self.assertIsNone(get_metadata_from_blob(tampered))

    def test_truncated_blob_returns_none(self) -> None:
        """Test that extraction rejects blobs cut short of a section's end."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            get_engine_from_blob,
            get_metadata_from_blob,
            HEADER_SIZE,
            serialize_blob,
        )

        blob = serialize_blob(b"engine")

        # One byte short of the engine's end.
        self.assertIsNone(get_engine_from_blob(blob[:-1]))
        # Header intact, but only one byte of metadata present.
        self.assertIsNone(get_metadata_from_blob(blob[: HEADER_SIZE + 1]))

    def test_io_binding_to_dict(self) -> None:
        """Test TensorRTIOBinding serialization to dict."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            TensorRTIOBinding,
        )

        binding = TensorRTIOBinding(
            name="input_0",
            dtype="float32",
            shape=[1, 3, 224, 224],
            is_input=True,
        )
        d = binding.to_dict()

        self.assertEqual(d["name"], "input_0")
        self.assertEqual(d["dtype"], "float32")
        self.assertEqual(d["shape"], [1, 3, 224, 224])
        self.assertTrue(d["is_input"])

    def test_io_binding_from_dict(self) -> None:
        """Test TensorRTIOBinding deserialization from dict."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            TensorRTIOBinding,
        )

        d = {
            "name": "output_0",
            "dtype": "float16",
            "shape": [1, 1000],
            "is_input": False,
        }
        binding = TensorRTIOBinding.from_dict(d)

        self.assertEqual(binding.name, "output_0")
        self.assertEqual(binding.dtype, "float16")
        self.assertEqual(binding.shape, [1, 1000])
        self.assertFalse(binding.is_input)

    def test_metadata_roundtrip(self) -> None:
        """Test TensorRTBlobMetadata JSON round-trip."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            TensorRTBlobMetadata,
            TensorRTIOBinding,
        )

        original = TensorRTBlobMetadata(
            io_bindings=[
                TensorRTIOBinding("x", "float32", [2, 3], True),
                TensorRTIOBinding("y", "float32", [2, 3], True),
                TensorRTIOBinding("output", "float32", [2, 3], False),
            ]
        )

        json_bytes = original.to_json()
        restored = TensorRTBlobMetadata.from_json(json_bytes)

        self.assertEqual(len(restored.io_bindings), 3)
        self.assertEqual(restored.io_bindings[0].name, "x")
        self.assertEqual(restored.io_bindings[1].name, "y")
        self.assertEqual(restored.io_bindings[2].name, "output")
        self.assertTrue(restored.io_bindings[0].is_input)
        self.assertFalse(restored.io_bindings[2].is_input)

    def test_blob_with_metadata(self) -> None:
        """Test full blob serialization with metadata."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            get_engine_from_blob,
            get_metadata_from_blob,
            serialize_blob,
            TensorRTBlobMetadata,
            TensorRTIOBinding,
        )

        engine_bytes = b"fake_tensorrt_engine"
        metadata = TensorRTBlobMetadata(
            io_bindings=[
                TensorRTIOBinding("input", "float32", [1, 3, 224, 224], True),
                TensorRTIOBinding("output", "float32", [1, 1000], False),
            ]
        )

        blob = serialize_blob(engine_bytes, metadata)

        # Verify engine extraction
        extracted_engine = get_engine_from_blob(blob)
        self.assertEqual(extracted_engine, engine_bytes)

        # Verify metadata extraction
        extracted_metadata = get_metadata_from_blob(blob)
        self.assertIsNotNone(extracted_metadata)
        self.assertEqual(len(extracted_metadata.io_bindings), 2)
        self.assertEqual(extracted_metadata.io_bindings[0].name, "input")
        self.assertEqual(extracted_metadata.io_bindings[1].name, "output")

    def test_blob_alignment(self) -> None:
        """Test that engine data is 16-byte aligned."""
        from executorch.backends.nvidia.tensorrt.serialization import (
            deserialize_blob_header,
            serialize_blob,
            TensorRTBlobMetadata,
            TensorRTIOBinding,
        )

        # Create metadata of varying sizes
        for num_bindings in [1, 2, 5, 10]:
            # subTest reports which metadata size broke instead of stopping at
            # the first failing iteration.
            with self.subTest(num_bindings=num_bindings):
                bindings = [
                    TensorRTIOBinding(f"tensor_{i}", "float32", [1, i + 1], i == 0)
                    for i in range(num_bindings)
                ]
                metadata = TensorRTBlobMetadata(io_bindings=bindings)
                blob = serialize_blob(b"engine", metadata)
                header = deserialize_blob_header(blob)

                self.assertIsNotNone(header)
                # Engine offset must be 16-byte aligned
                self.assertEqual(header.engine_offset % 16, 0)
                # The engine must start at or after the end of the metadata.
                self.assertGreaterEqual(
                    header.engine_offset,
                    header.metadata_offset + header.metadata_size,
                )