diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index 673dcfe128..8016a072ea 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -5,7 +5,7 @@
 import os
 import platform
 import warnings
-from typing import Any, Collection, List, Optional, Sequence, Union
+from typing import Any, Collection, List, Optional, Sequence, Set, Union
 
 import torch
 from torch.export import ExportedProgram
@@ -109,6 +109,7 @@ def cross_compile_for_windows(
     dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
     decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION,
     attn_bias_is_causal: bool = _defaults.ATTN_BIAS_IS_CAUSAL,
+    target_executorch: bool = _defaults.TARGET_EXECUTORCH,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -186,6 +187,7 @@ def cross_compile_for_windows(
         dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
         decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
         attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False.
+        target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -338,6 +340,7 @@ def cross_compile_for_windows(
         "dynamically_allocate_resources": dynamically_allocate_resources,
         "decompose_attention": decompose_attention,
         "attn_bias_is_causal": attn_bias_is_causal,
+        "target_executorch": target_executorch,
     }
 
     # disable the following settings is not supported for cross compilation for windows feature
@@ -460,6 +463,7 @@ def compile(
     dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
     decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION,
     attn_bias_is_causal: bool = _defaults.ATTN_BIAS_IS_CAUSAL,
+    target_executorch: bool = _defaults.TARGET_EXECUTORCH,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -547,6 +551,7 @@ def compile(
         dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
         decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
         attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False.
+        target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -732,6 +737,7 @@ def compile(
         "dynamically_allocate_resources": dynamically_allocate_resources,
         "decompose_attention": decompose_attention,
         "attn_bias_is_causal": attn_bias_is_causal,
+        "target_executorch": target_executorch,
     }
     logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB")
     settings = CompilationSettings(**compilation_options)
@@ -879,6 +885,67 @@ def _insert_complex_io_adapters(
 
 
+def _output_allocator_ops() -> Set[Target]:
+    """Return every converter target that may need a TensorRT output allocator.
+
+    Each registry in ``CONVERTERS.registries`` is scanned. A target is reported
+    when at least one converter registered for it has a truthy
+    ``requires_output_allocator`` attribute; a converter lacking the attribute
+    counts as not requiring one.
+
+    Returns:
+        The set of matching targets; empty when no converter is flagged.
+    """
+    ops: Set[Target] = set()
+    for registry in CONVERTERS.registries:
+        for target, converters in registry.items():
+            # a registry value may be one converter, a list/tuple, or a priority dict
+            if isinstance(converters, (list, tuple)):
+                candidates = list(converters)
+            elif isinstance(converters, dict):
+                candidates = list(converters.values())
+            else:
+                candidates = [converters]
+            # if any converter needs an allocator, route the whole target so an
+            # allocator engine is never emitted
+            if any(
+                getattr(conv, "requires_output_allocator", False) for conv in candidates
+            ):
+                ops.add(target)
+    return ops
+
+
+def _route_output_allocator_ops(settings: CompilationSettings) -> None:
+    """Send output-allocator ops to PyTorch when ``target_executorch`` is enabled.
+
+    Does nothing if ``settings.target_executorch`` is falsy or if no converter is
+    flagged. Otherwise ``settings.torch_executed_ops`` is replaced, in place on
+    ``settings``, by its union with the flagged targets.
+
+    Args:
+        settings: Compilation settings to update.
+
+    Raises:
+        ValueError: If flagged ops exist and ``settings.require_full_compilation``
+            is set, because the two options contradict each other.
+    """
+    if not settings.target_executorch:
+        return
+    ops = _output_allocator_ops()
+    if not ops:
+        return
+    if settings.require_full_compilation:
+        raise ValueError(
+            "target_executorch routes output-allocator ops to PyTorch, which "
+            "is incompatible with require_full_compilation=True; enable only one."
+        )
+    settings.torch_executed_ops = set(settings.torch_executed_ops) | ops
+    logger.debug(
+        "target_executorch: routing output-allocator ops to torch_executed_ops: %s",
+        sorted(str(t) for t in ops),
+    )
+
+
 @fn_supports_debugger  # type: ignore[misc]
 def compile_module(
     gm: torch.fx.GraphModule,
     sample_arg_inputs: Sequence[Input],
@@ -910,6 +977,7 @@ def compile_module(
         sample_kwarg_inputs = {}
 
     # Configure user compilation settings to converters.
+    _route_output_allocator_ops(settings)
     CONVERTERS.set_compilation_settings(settings)
 
     # Check the number of supported operations in the graph
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
index 6b8d32e3c7..43190344e0 100644
--- a/py/torch_tensorrt/dynamo/_defaults.py
+++ b/py/torch_tensorrt/dynamo/_defaults.py
@@ -66,6 +66,7 @@
 DYNAMICALLY_ALLOCATE_RESOURCES = False
 DECOMPOSE_ATTENTION = False
 ATTN_BIAS_IS_CAUSAL = True
+TARGET_EXECUTORCH = False
 
 if platform.system() == "Linux":
     import pwd
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
index 8c079eb973..976f5c1983 100644
--- a/py/torch_tensorrt/dynamo/_settings.py
+++ b/py/torch_tensorrt/dynamo/_settings.py
@@ -45,6 +45,7 @@
     REUSE_CACHED_ENGINES,
     SPARSE_WEIGHTS,
     STRIP_ENGINE_WEIGHTS,
+    TARGET_EXECUTORCH,
     TILING_OPTIMIZATION_LEVEL,
     TIMING_CACHE_PATH,
     TRUNCATE_DOUBLE,
@@ -113,6 +114,7 @@ class CompilationSettings:
         dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines
         decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
         attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False.
+        target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False.
     """
 
     workspace_size: int = WORKSPACE_SIZE
@@ -171,6 +173,7 @@ class CompilationSettings:
     dynamically_allocate_resources: bool = DYNAMICALLY_ALLOCATE_RESOURCES
     decompose_attention: bool = DECOMPOSE_ATTENTION
     attn_bias_is_causal: bool = ATTN_BIAS_IS_CAUSAL
+    target_executorch: bool = TARGET_EXECUTORCH
 
     def __getstate__(self) -> dict[str, Any]:
         from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
@@ -186,6 +189,7 @@ def __getstate__(self) -> dict[str, Any]:
 
     def __setstate__(self, state: dict[str, Any]) -> None:
         state.pop("use_python_runtime", None)
+        state.setdefault("target_executorch", TARGET_EXECUTORCH)
         self.__dict__.update(state)
 
 
diff --git a/tests/py/dynamo/models/test_target_executorch.py b/tests/py/dynamo/models/test_target_executorch.py
new file mode 100644
index 0000000000..582e3b2530
--- /dev/null
+++ b/tests/py/dynamo/models/test_target_executorch.py
@@ -0,0 +1,76 @@
+import pytest
+import torch
+import torch_tensorrt
+from torch_tensorrt.dynamo._compiler import (
+    _output_allocator_ops,
+    _route_output_allocator_ops,
+)
+from torch_tensorrt.dynamo._settings import CompilationSettings
+
+
+def test_target_executorch_setting_default():
+    # Off by default; opt-in only.
+    assert CompilationSettings().target_executorch is False
+    assert CompilationSettings(target_executorch=True).target_executorch is True
+
+
+def test_old_pickle_without_field_defaults_to_false():
+    # A settings object serialized before target_executorch existed (no such key in the
+    # state) must restore to the default instead of raising AttributeError.
+    state = CompilationSettings().__dict__.copy()
+    state.pop("target_executorch", None)
+    restored = CompilationSettings.__new__(CompilationSettings)
+    restored.__setstate__(state)
+    assert restored.target_executorch is False
+
+
+def test_output_allocator_converters_are_discoverable():
+    # The routing relies on converters tagging requires_output_allocator; the canonical
+    # data-dependent op (nonzero) must be discoverable that way. Exercises the production
+    # discovery helper directly.
+    assert torch.ops.aten.nonzero.default in _output_allocator_ops()
+
+
+def test_route_is_noop_when_disabled():
+    # The default (off) must leave torch_executed_ops untouched.
+    settings = CompilationSettings()
+    _route_output_allocator_ops(settings)
+    assert settings.torch_executed_ops == set()
+
+
+def test_route_adds_output_allocator_ops_when_enabled():
+    # CPU only: enabling the flag routes the data-dependent op into torch_executed_ops
+    # (so the partitioner runs it in PyTorch). This guards the routing without a GPU.
+    settings = CompilationSettings(target_executorch=True)
+    _route_output_allocator_ops(settings)
+    assert torch.ops.aten.nonzero.default in settings.torch_executed_ops
+
+
+def test_route_incompatible_with_require_full_compilation():
+    # Routing ops to PyTorch contradicts require_full_compilation, so it must error early.
+    settings = CompilationSettings(
+        target_executorch=True, require_full_compilation=True
+    )
+    with pytest.raises(ValueError):
+        _route_output_allocator_ops(settings)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
+def test_target_executorch_routes_output_allocator_op_to_torch():
+    # End to end on GPU: with target_executorch=True an output-allocator op (nonzero)
+    # falls back to PyTorch instead of being absorbed into a TensorRT engine.
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            return torch.nonzero(x)
+
+    inputs = (torch.tensor([0, 3, 0, 5, 7], dtype=torch.int32, device="cuda"),)
+    ep = torch.export.export(Model().cuda(), inputs)
+    gm = torch_tensorrt.dynamo.compile(
+        ep,
+        arg_inputs=list(inputs),
+        min_block_size=1,
+        target_executorch=True,
+        use_python_runtime=True,
+    )
+    targets = {n.target for n in gm.graph.nodes if n.op == "call_function"}
+    assert torch.ops.aten.nonzero.default in targets