From 85fa1eb54f675f3feeae7bbbce1f853b813c5d71 Mon Sep 17 00:00:00 2001 From: shoumikhin Date: Sun, 21 Jun 2026 00:26:33 +0000 Subject: [PATCH] feat(dynamo): add target_executorch setting to keep output-allocator ops in PyTorch Some converters require a TensorRT output allocator because their output shape is data-dependent (for example aten.nonzero). A TensorRT engine that needs an output allocator cannot be consumed by every downstream runtime that executes the compiled program. This adds a target_executorch compile setting (default False). When enabled, every operator whose converter sets requires_output_allocator is routed to torch_executed_ops and runs in PyTorch instead of being lowered into a TensorRT engine. When disabled (the default), behavior is unchanged. Details: - Discovery and routing live in two small helpers (_output_allocator_ops and _route_output_allocator_ops) so both are unit-testable without a GPU. The registry walk handles a single converter, a list/tuple, or a priority-keyed dict, and is conservative: if any converter for a target needs an allocator, the whole target is routed to PyTorch so an allocator engine is never emitted. - Wired through compile() and cross_compile_for_windows(); the routing runs in compile_module(), which both entry points funnel through. It is intentionally not exposed on convert_exported_program_to_serialized_trt_engine(), where a single serialized engine cannot contain PyTorch fallbacks. - Combining target_executorch with require_full_compilation raises a clear error, since routing ops to PyTorch contradicts full compilation. - CompilationSettings.__setstate__ defaults the new field so older pickles load. The name is deliberate: it gates ExecuTorch-targeted routing, and further ExecuTorch-specific behavior can accrete under the same flag. 
Tests (tests/py/dynamo/models/test_target_executorch.py): default value; old-pickle compatibility; output-allocator op discovery; routing is a no-op when disabled; routing adds the op when enabled (CPU only); the require_full_compilation conflict; and an end to end GPU test that a data-dependent op falls back to PyTorch. Signed-off-by: shoumikhin --- py/torch_tensorrt/dynamo/_compiler.py | 47 +++++++++++- py/torch_tensorrt/dynamo/_defaults.py | 1 + py/torch_tensorrt/dynamo/_settings.py | 4 + .../dynamo/models/test_target_executorch.py | 76 +++++++++++++++++++ 4 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 tests/py/dynamo/models/test_target_executorch.py diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 673dcfe128..8016a072ea 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -5,7 +5,7 @@ import os import platform import warnings -from typing import Any, Collection, List, Optional, Sequence, Union +from typing import Any, Collection, List, Optional, Sequence, Set, Union import torch from torch.export import ExportedProgram @@ -109,6 +109,7 @@ def cross_compile_for_windows( dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES, decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION, attn_bias_is_causal: bool = _defaults.ATTN_BIAS_IS_CAUSAL, + target_executorch: bool = _defaults.TARGET_EXECUTORCH, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows @@ -186,6 +187,7 @@ def cross_compile_for_windows( dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution. decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True. 
attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False. + target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False. **kwargs: Any, Returns: torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT @@ -338,6 +340,7 @@ def cross_compile_for_windows( "dynamically_allocate_resources": dynamically_allocate_resources, "decompose_attention": decompose_attention, "attn_bias_is_causal": attn_bias_is_causal, + "target_executorch": target_executorch, } # disable the following settings is not supported for cross compilation for windows feature @@ -460,6 +463,7 @@ def compile( dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES, decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION, attn_bias_is_causal: bool = _defaults.ATTN_BIAS_IS_CAUSAL, + target_executorch: bool = _defaults.TARGET_EXECUTORCH, **kwargs: Any, ) -> torch.fx.GraphModule: """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT @@ -547,6 +551,7 @@ def compile( dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution. decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True. attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. 
# NOTE: these helpers are intentionally NOT decorated. The
# `@fn_supports_debugger  # type: ignore[misc]` line that appears as hunk context
# is `compile_module`'s decorator and must stay directly above
# `def compile_module(`; inserting the helpers between the two would re-target the
# decorator onto `_output_allocator_ops` and strip it from `compile_module`.
def _output_allocator_ops(
    registries: Optional[Sequence[Any]] = None,
) -> "Set[Target]":
    """Collect every converter target that needs a TensorRT output allocator.

    Args:
        registries: Target-to-converter mappings to scan. Each mapping value may
            be a single converter, a list/tuple of converters, or a dict whose
            values are converters. Defaults to ``CONVERTERS.registries``.

    Returns:
        The set of targets for which at least one candidate converter has a
        truthy ``requires_output_allocator`` attribute. Converters without that
        attribute are treated as not requiring an allocator.
    """
    if registries is None:
        registries = CONVERTERS.registries

    ops = set()
    for registry in registries:
        for target, converters in registry.items():
            # A registry value may be one converter, a list/tuple, or a dict.
            if isinstance(converters, dict):
                candidates = converters.values()
            elif isinstance(converters, (list, tuple)):
                candidates = converters
            else:
                candidates = (converters,)
            # Conservative: if any converter for the target needs an allocator,
            # route the whole target so an allocator engine is never emitted.
            if any(
                getattr(conv, "requires_output_allocator", False)
                for conv in candidates
            ):
                ops.add(target)
    return ops


def _route_output_allocator_ops(settings: "CompilationSettings") -> None:
    """Add output-allocator ops to ``settings.torch_executed_ops`` when enabled.

    Does nothing unless ``settings.target_executorch`` is truthy. Otherwise every
    target reported by ``_output_allocator_ops()`` is unioned into
    ``settings.torch_executed_ops`` (the attribute is rebound to a new set).

    Args:
        settings: Compilation settings; updated in place.

    Raises:
        ValueError: If both ``target_executorch`` and ``require_full_compilation``
            are enabled.
    """
    if not settings.target_executorch:
        return

    # Validate the flag combination before consulting the registry so the error
    # does not depend on which converters happen to be registered.
    if settings.require_full_compilation:
        raise ValueError(
            "target_executorch routes output-allocator ops to PyTorch, which "
            "is incompatible with require_full_compilation=True; enable only one."
        )

    ops = _output_allocator_ops()
    if not ops:
        return

    # Build a fresh set instead of mutating the caller-provided collection.
    settings.torch_executed_ops = set(settings.torch_executed_ops) | ops
    logger.debug(
        "target_executorch: routing output-allocator ops to torch_executed_ops: %s",
        sorted(str(t) for t in ops),
    )
"""Tests for the ``target_executorch`` compilation setting."""

import pytest
import torch
import torch_tensorrt
from torch_tensorrt.dynamo._compiler import (
    _output_allocator_ops,
    _route_output_allocator_ops,
)
from torch_tensorrt.dynamo._settings import CompilationSettings


def test_target_executorch_setting_default():
    # Off by default; opt-in only.
    assert CompilationSettings().target_executorch is False
    assert CompilationSettings(target_executorch=True).target_executorch is True


def test_old_pickle_without_field_defaults_to_false():
    # A settings object serialized before target_executorch existed (no such key
    # in the state) must restore to the default instead of raising AttributeError.
    state = CompilationSettings().__dict__.copy()
    state.pop("target_executorch", None)
    restored = CompilationSettings.__new__(CompilationSettings)
    restored.__setstate__(state)
    assert restored.target_executorch is False


def test_output_allocator_converters_are_discoverable():
    # The routing relies on converters tagging requires_output_allocator; the
    # canonical data-dependent op (nonzero) must be discoverable that way.
    assert torch.ops.aten.nonzero.default in _output_allocator_ops()


def test_route_is_noop_when_disabled():
    # The default (off) must leave torch_executed_ops untouched.
    settings = CompilationSettings()
    _route_output_allocator_ops(settings)
    assert settings.torch_executed_ops == set()


def test_route_adds_output_allocator_ops_when_enabled():
    # CPU only: enabling the flag routes the data-dependent op into
    # torch_executed_ops. This guards the routing without a GPU.
    settings = CompilationSettings(target_executorch=True)
    _route_output_allocator_ops(settings)
    assert torch.ops.aten.nonzero.default in settings.torch_executed_ops


def test_route_preserves_user_torch_executed_ops():
    # Routing unions into the existing collection; ops the user already asked to
    # run in PyTorch must not be dropped.
    user_op = torch.ops.aten.add.Tensor
    settings = CompilationSettings(
        target_executorch=True, torch_executed_ops={user_op}
    )
    _route_output_allocator_ops(settings)
    assert user_op in settings.torch_executed_ops
    assert torch.ops.aten.nonzero.default in settings.torch_executed_ops


def test_route_incompatible_with_require_full_compilation():
    # Routing ops to PyTorch contradicts require_full_compilation, so it must
    # raise; pin the message so an unrelated ValueError cannot satisfy the test.
    settings = CompilationSettings(
        target_executorch=True, require_full_compilation=True
    )
    with pytest.raises(ValueError, match="require_full_compilation"):
        _route_output_allocator_ops(settings)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
def test_target_executorch_routes_output_allocator_op_to_torch():
    # End to end on GPU: with target_executorch=True an output-allocator op
    # (nonzero) stays a PyTorch call instead of being absorbed into a TensorRT
    # engine.
    class Model(torch.nn.Module):
        def forward(self, x):
            return torch.nonzero(x)

    inputs = (torch.tensor([0, 3, 0, 5, 7], dtype=torch.int32, device="cuda"),)
    ep = torch.export.export(Model().cuda(), inputs)
    gm = torch_tensorrt.dynamo.compile(
        ep,
        arg_inputs=list(inputs),
        min_block_size=1,
        target_executorch=True,
        use_python_runtime=True,
    )
    # The fallback op may sit in a nested GraphModule rather than the top-level
    # graph (NOTE(review): depends on how the partitioner wraps PyTorch
    # segments), so search every nested FX graph, including gm itself.
    targets = {
        node.target
        for module in gm.modules()
        if isinstance(module, torch.fx.GraphModule)
        for node in module.graph.nodes
        if node.op == "call_function"
    }
    assert torch.ops.aten.nonzero.default in targets