From 85fa1eb54f675f3feeae7bbbce1f853b813c5d71 Mon Sep 17 00:00:00 2001
From: shoumikhin <shoumikhin@meta.com>
Date: Sun, 21 Jun 2026 00:26:33 +0000
Subject: [PATCH] feat(dynamo): add target_executorch setting to keep
 output-allocator ops in PyTorch

Some converters require a TensorRT output allocator because their output shape is
data-dependent (for example aten.nonzero). A TensorRT engine that needs an output
allocator cannot be consumed by every downstream runtime that executes the compiled
program.

This adds a target_executorch compile setting (default False). When enabled, every
operator whose converter sets requires_output_allocator is routed to
torch_executed_ops and runs in PyTorch instead of being lowered into a TensorRT
engine. When disabled (the default), behavior is unchanged.

Details:
- Discovery and routing live in two small helpers (_output_allocator_ops and
  _route_output_allocator_ops) so both are unit-testable without a GPU. The registry
  walk handles a single converter, a list/tuple, or a priority-keyed dict, and is
  conservative: if any converter for a target needs an allocator, the whole target is
  routed to PyTorch so an allocator engine is never emitted.
- Wired through compile() and cross_compile_for_windows(); the routing runs in
  compile_module(), which both entry points funnel through. It is intentionally not
  exposed on convert_exported_program_to_serialized_trt_engine(), where a single
  serialized engine cannot contain PyTorch fallbacks.
- Combining target_executorch with require_full_compilation raises a clear error,
  since routing ops to PyTorch contradicts full compilation.
- CompilationSettings.__setstate__ defaults the new field so older pickles load.

The name is deliberate: it gates ExecuTorch-targeted routing, and further
ExecuTorch-specific behavior can accrete under the same flag.

Tests (tests/py/dynamo/models/test_target_executorch.py): default value; old-pickle
compatibility; output-allocator op discovery; routing is a no-op when disabled; routing
adds the op when enabled (CPU only); the require_full_compilation conflict; and an
end-to-end GPU test that a data-dependent op falls back to PyTorch.

Signed-off-by: shoumikhin <shoumikhin@meta.com>
---
 py/torch_tensorrt/dynamo/_compiler.py         | 47 +++++++++++-
 py/torch_tensorrt/dynamo/_defaults.py         |  1 +
 py/torch_tensorrt/dynamo/_settings.py         |  4 +
 .../dynamo/models/test_target_executorch.py   | 76 +++++++++++++++++++
 4 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100644 tests/py/dynamo/models/test_target_executorch.py

diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py
index 673dcfe128..8016a072ea 100644
--- a/py/torch_tensorrt/dynamo/_compiler.py
+++ b/py/torch_tensorrt/dynamo/_compiler.py
@@ -5,7 +5,7 @@
 import os
 import platform
 import warnings
-from typing import Any, Collection, List, Optional, Sequence, Union
+from typing import Any, Collection, List, Optional, Sequence, Set, Union
 
 import torch
 from torch.export import ExportedProgram
@@ -109,6 +109,7 @@ def cross_compile_for_windows(
     dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
     decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION,
     attn_bias_is_causal: bool = _defaults.ATTN_BIAS_IS_CAUSAL,
+    target_executorch: bool = _defaults.TARGET_EXECUTORCH,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
@@ -186,6 +187,7 @@ def cross_compile_for_windows(
         dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
         decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
         attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False.
+        target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -338,6 +340,7 @@ def cross_compile_for_windows(
         "dynamically_allocate_resources": dynamically_allocate_resources,
         "decompose_attention": decompose_attention,
         "attn_bias_is_causal": attn_bias_is_causal,
+        "target_executorch": target_executorch,
     }
 
     # disable the following settings is not supported for cross compilation for windows feature
@@ -460,6 +463,7 @@ def compile(
     dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
     decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION,
     attn_bias_is_causal: bool = _defaults.ATTN_BIAS_IS_CAUSAL,
+    target_executorch: bool = _defaults.TARGET_EXECUTORCH,
     **kwargs: Any,
 ) -> torch.fx.GraphModule:
     """Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
@@ -547,6 +551,7 @@ def compile(
         dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
         decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
         attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False.
+        target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False.
         **kwargs: Any,
     Returns:
         torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
@@ -732,6 +737,7 @@ def compile(
         "dynamically_allocate_resources": dynamically_allocate_resources,
         "decompose_attention": decompose_attention,
         "attn_bias_is_causal": attn_bias_is_causal,
+        "target_executorch": target_executorch,
     }
     logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB")
     settings = CompilationSettings(**compilation_options)
@@ -879,6 +885,44 @@ def _insert_complex_io_adapters(
 
 
 @fn_supports_debugger  # type: ignore[misc]
+def _output_allocator_ops() -> Set[Target]:
+    ops: Set[Target] = set()
+    for registry in CONVERTERS.registries:
+        for target, converters in registry.items():
+            # a registry value may be one converter, a list/tuple, or a priority dict
+            if isinstance(converters, (list, tuple)):
+                candidates = list(converters)
+            elif isinstance(converters, dict):
+                candidates = list(converters.values())
+            else:
+                candidates = [converters]
+            # if any converter needs an allocator, route the whole target so an
+            # allocator engine is never emitted
+            if any(
+                getattr(conv, "requires_output_allocator", False) for conv in candidates
+            ):
+                ops.add(target)
+    return ops
+
+
+def _route_output_allocator_ops(settings: CompilationSettings) -> None:
+    if not settings.target_executorch:
+        return
+    ops = _output_allocator_ops()
+    if not ops:
+        return
+    if settings.require_full_compilation:
+        raise ValueError(
+            "target_executorch routes output-allocator ops to PyTorch, which "
+            "is incompatible with require_full_compilation=True; enable only one."
+        )
+    settings.torch_executed_ops = set(settings.torch_executed_ops) | ops
+    logger.debug(
+        "target_executorch: routing output-allocator ops to " "torch_executed_ops: %s",
+        sorted(str(t) for t in ops),
+    )
+
+
 def compile_module(
     gm: torch.fx.GraphModule,
     sample_arg_inputs: Sequence[Input],
@@ -910,6 +954,7 @@ def compile_module(
         sample_kwarg_inputs = {}
 
     # Configure user compilation settings to converters.
+    _route_output_allocator_ops(settings)
     CONVERTERS.set_compilation_settings(settings)
 
     # Check the number of supported operations in the graph
diff --git a/py/torch_tensorrt/dynamo/_defaults.py b/py/torch_tensorrt/dynamo/_defaults.py
index 6b8d32e3c7..43190344e0 100644
--- a/py/torch_tensorrt/dynamo/_defaults.py
+++ b/py/torch_tensorrt/dynamo/_defaults.py
@@ -66,6 +66,7 @@
 DYNAMICALLY_ALLOCATE_RESOURCES = False
 DECOMPOSE_ATTENTION = False
 ATTN_BIAS_IS_CAUSAL = True
+TARGET_EXECUTORCH = False
 
 if platform.system() == "Linux":
     import pwd
diff --git a/py/torch_tensorrt/dynamo/_settings.py b/py/torch_tensorrt/dynamo/_settings.py
index 8c079eb973..976f5c1983 100644
--- a/py/torch_tensorrt/dynamo/_settings.py
+++ b/py/torch_tensorrt/dynamo/_settings.py
@@ -45,6 +45,7 @@
     REUSE_CACHED_ENGINES,
     SPARSE_WEIGHTS,
     STRIP_ENGINE_WEIGHTS,
+    TARGET_EXECUTORCH,
     TILING_OPTIMIZATION_LEVEL,
     TIMING_CACHE_PATH,
     TRUNCATE_DOUBLE,
@@ -113,6 +114,7 @@ class CompilationSettings:
         dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines
         decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
         attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False.
+        target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False.
     """
 
     workspace_size: int = WORKSPACE_SIZE
@@ -171,6 +173,7 @@ class CompilationSettings:
     dynamically_allocate_resources: bool = DYNAMICALLY_ALLOCATE_RESOURCES
     decompose_attention: bool = DECOMPOSE_ATTENTION
     attn_bias_is_causal: bool = ATTN_BIAS_IS_CAUSAL
+    target_executorch: bool = TARGET_EXECUTORCH
 
     def __getstate__(self) -> dict[str, Any]:
         from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
@@ -186,6 +189,7 @@ def __getstate__(self) -> dict[str, Any]:
 
     def __setstate__(self, state: dict[str, Any]) -> None:
         state.pop("use_python_runtime", None)
+        state.setdefault("target_executorch", TARGET_EXECUTORCH)
         self.__dict__.update(state)
 
 
diff --git a/tests/py/dynamo/models/test_target_executorch.py b/tests/py/dynamo/models/test_target_executorch.py
new file mode 100644
index 0000000000..582e3b2530
--- /dev/null
+++ b/tests/py/dynamo/models/test_target_executorch.py
@@ -0,0 +1,76 @@
+import pytest
+import torch
+import torch_tensorrt
+from torch_tensorrt.dynamo._compiler import (
+    _output_allocator_ops,
+    _route_output_allocator_ops,
+)
+from torch_tensorrt.dynamo._settings import CompilationSettings
+
+
+def test_target_executorch_setting_default():
+    # Off by default; opt-in only.
+    assert CompilationSettings().target_executorch is False
+    assert CompilationSettings(target_executorch=True).target_executorch is True
+
+
+def test_old_pickle_without_field_defaults_to_false():
+    # A settings object serialized before target_executorch existed (no such key in the
+    # state) must restore to the default instead of raising AttributeError.
+    state = CompilationSettings().__dict__.copy()
+    state.pop("target_executorch", None)
+    restored = CompilationSettings.__new__(CompilationSettings)
+    restored.__setstate__(state)
+    assert restored.target_executorch is False
+
+
+def test_output_allocator_converters_are_discoverable():
+    # The routing relies on converters tagging requires_output_allocator; the canonical
+    # data-dependent op (nonzero) must be discoverable that way. Exercises the production
+    # discovery helper directly.
+    assert torch.ops.aten.nonzero.default in _output_allocator_ops()
+
+
+def test_route_is_noop_when_disabled():
+    # The default (off) must leave torch_executed_ops untouched.
+    settings = CompilationSettings()
+    _route_output_allocator_ops(settings)
+    assert settings.torch_executed_ops == set()
+
+
+def test_route_adds_output_allocator_ops_when_enabled():
+    # CPU only: enabling the flag routes the data-dependent op into torch_executed_ops
+    # (so the partitioner runs it in PyTorch). This guards the routing without a GPU.
+    settings = CompilationSettings(target_executorch=True)
+    _route_output_allocator_ops(settings)
+    assert torch.ops.aten.nonzero.default in settings.torch_executed_ops
+
+
+def test_route_incompatible_with_require_full_compilation():
+    # Routing ops to PyTorch contradicts require_full_compilation, so it must error early.
+    settings = CompilationSettings(
+        target_executorch=True, require_full_compilation=True
+    )
+    with pytest.raises(ValueError, match="require_full_compilation"):
+        _route_output_allocator_ops(settings)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
+def test_target_executorch_routes_output_allocator_op_to_torch():
+    # End to end on GPU: with target_executorch=True an output-allocator op (nonzero)
+    # falls back to PyTorch instead of being absorbed into a TensorRT engine.
+    class Model(torch.nn.Module):
+        def forward(self, x):
+            return torch.nonzero(x)
+
+    inputs = (torch.tensor([0, 3, 0, 5, 7], dtype=torch.int32, device="cuda"),)
+    ep = torch.export.export(Model().cuda(), inputs)
+    gm = torch_tensorrt.dynamo.compile(
+        ep,
+        arg_inputs=list(inputs),
+        min_block_size=1,
+        target_executorch=True,
+        use_python_runtime=True,
+    )
+    targets = {n.target for n in gm.graph.nodes if n.op == "call_function"}
+    assert torch.ops.aten.nonzero.default in targets