Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion py/torch_tensorrt/dynamo/_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import os
import platform
import warnings
from typing import Any, Collection, List, Optional, Sequence, Union
from typing import Any, Collection, List, Optional, Sequence, Set, Union

import torch
from torch.export import ExportedProgram
Expand Down Expand Up @@ -109,6 +109,7 @@ def cross_compile_for_windows(
dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION,
attn_bias_is_causal: bool = _defaults.ATTN_BIAS_IS_CAUSAL,
target_executorch: bool = _defaults.TARGET_EXECUTORCH,
**kwargs: Any,
) -> torch.fx.GraphModule:
"""Compile an ExportedProgram module using TensorRT in Linux for Inference in Windows
Expand Down Expand Up @@ -186,6 +187,7 @@ def cross_compile_for_windows(
dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False.
target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False.
**kwargs: Any,
Returns:
torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
Expand Down Expand Up @@ -338,6 +340,7 @@ def cross_compile_for_windows(
"dynamically_allocate_resources": dynamically_allocate_resources,
"decompose_attention": decompose_attention,
"attn_bias_is_causal": attn_bias_is_causal,
"target_executorch": target_executorch,
}

# Disable the following settings, which are not supported when cross-compiling for Windows
Expand Down Expand Up @@ -460,6 +463,7 @@ def compile(
dynamically_allocate_resources: bool = _defaults.DYNAMICALLY_ALLOCATE_RESOURCES,
decompose_attention: bool = _defaults.DECOMPOSE_ATTENTION,
attn_bias_is_causal: bool = _defaults.ATTN_BIAS_IS_CAUSAL,
target_executorch: bool = _defaults.TARGET_EXECUTORCH,
**kwargs: Any,
) -> torch.fx.GraphModule:
"""Compile an ExportedProgram module for NVIDIA GPUs using TensorRT
Expand Down Expand Up @@ -547,6 +551,7 @@ def compile(
dynamically_allocate_resources (bool): Dynamically allocate resources during engine execution.
decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False.
target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False.
**kwargs: Any,
Returns:
torch.fx.GraphModule: Compiled FX Module, when run it will execute via TensorRT
Expand Down Expand Up @@ -732,6 +737,7 @@ def compile(
"dynamically_allocate_resources": dynamically_allocate_resources,
"decompose_attention": decompose_attention,
"attn_bias_is_causal": attn_bias_is_causal,
"target_executorch": target_executorch,
}
logger.debug(f"CPU memory usage before lowering: {get_cpu_memory_usage()} MB")
settings = CompilationSettings(**compilation_options)
Expand Down Expand Up @@ -879,6 +885,44 @@ def _insert_complex_io_adapters(


@fn_supports_debugger # type: ignore[misc]
def _output_allocator_ops() -> Set[Target]:
ops: Set[Target] = set()
for registry in CONVERTERS.registries:
for target, converters in registry.items():
# a registry value may be one converter, a list/tuple, or a priority dict
if isinstance(converters, (list, tuple)):
candidates = list(converters)
elif isinstance(converters, dict):
candidates = list(converters.values())
else:
candidates = [converters]
# if any converter needs an allocator, route the whole target so an
# allocator engine is never emitted
if any(
getattr(conv, "requires_output_allocator", False) for conv in candidates
):
ops.add(target)
return ops


def _route_output_allocator_ops(settings: CompilationSettings) -> None:
if not settings.target_executorch:
return
ops = _output_allocator_ops()
if not ops:
return
if settings.require_full_compilation:
raise ValueError(
"target_executorch routes output-allocator ops to PyTorch, which "
"is incompatible with require_full_compilation=True; enable only one."
)
settings.torch_executed_ops = set(settings.torch_executed_ops) | ops
logger.debug(
"target_executorch: routing output-allocator ops to " "torch_executed_ops: %s",
sorted(str(t) for t in ops),
)


def compile_module(
gm: torch.fx.GraphModule,
sample_arg_inputs: Sequence[Input],
Expand Down Expand Up @@ -910,6 +954,7 @@ def compile_module(
sample_kwarg_inputs = {}

# Configure user compilation settings to converters.
_route_output_allocator_ops(settings)
CONVERTERS.set_compilation_settings(settings)

# Check the number of supported operations in the graph
Expand Down
1 change: 1 addition & 0 deletions py/torch_tensorrt/dynamo/_defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
DYNAMICALLY_ALLOCATE_RESOURCES = False
DECOMPOSE_ATTENTION = False
ATTN_BIAS_IS_CAUSAL = True
TARGET_EXECUTORCH = False

if platform.system() == "Linux":
import pwd
Expand Down
4 changes: 4 additions & 0 deletions py/torch_tensorrt/dynamo/_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
REUSE_CACHED_ENGINES,
SPARSE_WEIGHTS,
STRIP_ENGINE_WEIGHTS,
TARGET_EXECUTORCH,
TILING_OPTIMIZATION_LEVEL,
TIMING_CACHE_PATH,
TRUNCATE_DOUBLE,
Expand Down Expand Up @@ -113,6 +114,7 @@ class CompilationSettings:
dynamically_allocate_resources (bool): Dynamically allocate resources for TensorRT engines
decompose_attention (bool): Whether to decompose attention layers. We have converters for handling attention ops, but if you want to decompose them into smaller ops, you can set this to True.
attn_bias_is_causal (bool): Whether the attn_bias in efficient SDPA is causal. Default is True. This can accelerate models from HF because attn_bias is always a causal mask in HF. If you want to use non-causal attn_bias, you can set this to False.
target_executorch (bool): If True, operators whose converters require a TensorRT output allocator (i.e. data-dependent output shapes, such as nonzero) are added to torch_executed_ops and run in PyTorch instead of being lowered into a TensorRT engine. This is useful when targeting runtimes that cannot consume a TensorRT output allocator. Default is False.
"""

workspace_size: int = WORKSPACE_SIZE
Expand Down Expand Up @@ -171,6 +173,7 @@ class CompilationSettings:
dynamically_allocate_resources: bool = DYNAMICALLY_ALLOCATE_RESOURCES
decompose_attention: bool = DECOMPOSE_ATTENTION
attn_bias_is_causal: bool = ATTN_BIAS_IS_CAUSAL
target_executorch: bool = TARGET_EXECUTORCH

def __getstate__(self) -> dict[str, Any]:
from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
Expand All @@ -186,6 +189,7 @@ def __getstate__(self) -> dict[str, Any]:

def __setstate__(self, state: dict[str, Any]) -> None:
    """Restore the instance from a pickled ``state`` mapping.

    ``state`` is modified in place before being merged into ``__dict__``:
    any ``use_python_runtime`` entry is discarded, and a missing
    ``target_executorch`` entry is filled with ``TARGET_EXECUTORCH`` so that
    state saved before the field existed still restores cleanly.
    """
    # NOTE(review): presumably ``use_python_runtime`` is a legacy key that is
    # no longer stored on the settings object -- confirm against __getstate__.
    if "use_python_runtime" in state:
        del state["use_python_runtime"]
    if "target_executorch" not in state:
        state["target_executorch"] = TARGET_EXECUTORCH
    self.__dict__.update(state)


Expand Down
76 changes: 76 additions & 0 deletions tests/py/dynamo/models/test_target_executorch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import pytest
import torch
import torch_tensorrt
from torch_tensorrt.dynamo._compiler import (
_output_allocator_ops,
_route_output_allocator_ops,
)
from torch_tensorrt.dynamo._settings import CompilationSettings


def test_target_executorch_setting_default():
    """The flag is opt-in: it is False by default and True only when requested."""
    default_settings = CompilationSettings()
    assert default_settings.target_executorch is False

    enabled_settings = CompilationSettings(target_executorch=True)
    assert enabled_settings.target_executorch is True


def test_old_pickle_without_field_defaults_to_false():
    """State serialized before ``target_executorch`` existed restores to the default.

    The legacy state is simulated by dropping the key from a fresh instance's
    ``__dict__``; restoring it must not raise AttributeError on access.
    """
    legacy_state = {
        key: value
        for key, value in CompilationSettings().__dict__.items()
        if key != "target_executorch"
    }
    # Bypass __init__ so that only __setstate__ populates the instance.
    revived = CompilationSettings.__new__(CompilationSettings)
    revived.__setstate__(legacy_state)
    assert revived.target_executorch is False


def test_output_allocator_converters_are_discoverable():
    """The production discovery helper must report the canonical data-dependent op.

    Routing depends on converters being tagged with ``requires_output_allocator``,
    so ``aten.nonzero`` has to show up in the discovered set.
    """
    discovered = _output_allocator_ops()
    assert torch.ops.aten.nonzero.default in discovered


def test_route_is_noop_when_disabled():
    """With the flag off (the default), routing leaves ``torch_executed_ops`` empty."""
    default_settings = CompilationSettings()
    _route_output_allocator_ops(default_settings)
    nothing_routed = set()
    assert default_settings.torch_executed_ops == nothing_routed


def test_route_adds_output_allocator_ops_when_enabled():
    """Enabling the flag adds the data-dependent op to ``torch_executed_ops``.

    Runs on CPU only, so the routing is guarded even without a GPU.
    """
    enabled_settings = CompilationSettings(target_executorch=True)
    _route_output_allocator_ops(enabled_settings)
    routed = enabled_settings.torch_executed_ops
    assert torch.ops.aten.nonzero.default in routed


def test_route_incompatible_with_require_full_compilation():
    """Combining target_executorch with require_full_compilation must fail early.

    Routing ops to PyTorch contradicts require_full_compilation. The ``match``
    pattern pins the error to that specific conflict, so an unrelated
    ValueError cannot make this test pass.
    """
    settings = CompilationSettings(
        target_executorch=True, require_full_compilation=True
    )
    with pytest.raises(ValueError, match="require_full_compilation"):
        _route_output_allocator_ops(settings)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
def test_target_executorch_routes_output_allocator_op_to_torch():
    """End to end on GPU: ``nonzero`` stays a PyTorch call under target_executorch.

    After compiling with ``target_executorch=True``, the output-allocator op
    must still appear as a ``call_function`` node in the resulting graph
    rather than being absorbed into a TensorRT engine.
    """

    class Model(torch.nn.Module):
        def forward(self, x):
            return torch.nonzero(x)

    sample = torch.tensor([0, 3, 0, 5, 7], dtype=torch.int32, device="cuda")
    example_args = (sample,)
    exported = torch.export.export(Model().cuda(), example_args)
    compiled = torch_tensorrt.dynamo.compile(
        exported,
        arg_inputs=list(example_args),
        min_block_size=1,
        target_executorch=True,
        use_python_runtime=True,
    )
    call_targets = {
        node.target for node in compiled.graph.nodes if node.op == "call_function"
    }
    assert torch.ops.aten.nonzero.default in call_targets
Loading