pytorch · narendasan · Jun 18, 2026 · Jun 18, 2026
diff --git a/.github/workflows/build-test-linux-x86_64.yml b/.github/workflows/build-test-linux-x86_64.yml
diff --git a/.github/workflows/build-test-linux-x86_64_rtx.yml b/.github/workflows/build-test-linux-x86_64_rtx.yml
@@ -105,9 +105,12 @@
       use-rtx: true
       script: |
         set -euo pipefail
+        # Pull in trt_pytest (reruns + reproduce hint). One source of truth
+        # for the rerun regex; see tests/py/ci_helpers.sh.
+        source tests/py/ci_helpers.sh
         pushd .
         cd tests/py/dynamo
-        python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_converter_tests_results.xml  --maxfail=20 conversion/
+        trt_pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_converter_tests_results.xml  --maxfail=20 conversion/
         popd
 
   L0-dynamo-core-tests:
@@ -136,12 +139,15 @@
       use-rtx: true
       script: |
         set -euo pipefail
+        # Pull in trt_pytest (reruns + reproduce hint). One source of truth
+        # for the rerun regex; see tests/py/ci_helpers.sh.
+        source tests/py/ci_helpers.sh
         pushd .
         cd tests/py
         cd dynamo
-        python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_runtime_tests_results.xml runtime/test_000_*
-        python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_partitioning_tests_results.xml partitioning/
-        python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_lowering_tests_results.xml lowering/
+        trt_pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_runtime_tests_results.xml runtime/test_000_*
+        trt_pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_partitioning_tests_results.xml partitioning/
+        trt_pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_dynamo_core_lowering_tests_results.xml lowering/
         popd
 
   L0-py-core-tests:
@@ -170,9 +176,12 @@
       use-rtx: true
       script: |
         set -euo pipefail
+        # Pull in trt_pytest (reruns + reproduce hint). One source of truth
+        # for the rerun regex; see tests/py/ci_helpers.sh.
+        source tests/py/ci_helpers.sh
         pushd .
         cd tests/py/core
-        python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_py_core_tests_results.xml  .
+        trt_pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l0_py_core_tests_results.xml  .
         popd
 
   L1-dynamo-core-tests:
@@ -201,16 +210,19 @@
       use-rtx: true
       script: |
         set -euo pipefail
+        # Pull in trt_pytest (reruns + reproduce hint). One source of truth
+        # for the rerun regex; see tests/py/ci_helpers.sh.
+        source tests/py/ci_helpers.sh
         pushd .
         cd tests/py/dynamo
-        python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_tests_results.xml  runtime/test_001_*
-        python -m pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_hlo_tests_results.xml hlo/
+        trt_pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_core_tests_results.xml  runtime/test_001_*
+        trt_pytest -ra -n 8 --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_hlo_tests_results.xml hlo/
         popd
 
   L1-dynamo-compile-tests:
     name: ${{ matrix.display-name }}
     needs: [filter-matrix, build, L0-dynamo-converter-tests, L0-dynamo-core-tests, L0-py-core-tests]
-    if: ${{ (github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success() }}
+    if: "${{ !contains(github.event.pull_request.labels.*.name, 'ci: only-l0') && ((github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success()) }}"
     strategy:
       fail-fast: false
       matrix:
@@ -233,15 +245,18 @@
       use-rtx: true
       script: |
         set -euo pipefail
+        # Pull in trt_pytest (reruns + reproduce hint). One source of truth
+        # for the rerun regex; see tests/py/ci_helpers.sh.
+        source tests/py/ci_helpers.sh
         pushd .
         cd tests/py/dynamo/
-        python -m pytest -m critical -ra  --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_compile_tests_results.xml models/
+        trt_pytest -m critical -ra  --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_dynamo_compile_tests_results.xml models/
         popd
 
   L1-torch-compile-tests:
     name: ${{ matrix.display-name }}
     needs: [filter-matrix, build, L0-dynamo-converter-tests, L0-dynamo-core-tests, L0-py-core-tests]
-    if: ${{ (github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success() }}
+    if: "${{ !contains(github.event.pull_request.labels.*.name, 'ci: only-l0') && ((github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success()) }}"
     strategy:
       fail-fast: false
       matrix:
@@ -264,18 +279,21 @@
       use-rtx: true
       script: |
         set -euo pipefail
+        # Pull in trt_pytest (reruns + reproduce hint). One source of truth
+        # for the rerun regex; see tests/py/ci_helpers.sh.
+        source tests/py/ci_helpers.sh
         pushd .
         cd tests/py/dynamo/
-        python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_torch_compile_be_tests_results.xml  backend/
-        python -m pytest -m critical -ra  --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_torch_compile_models_tests_results.xml --ir torch_compile models/test_models.py
-        python -m pytest -m critical -ra  --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_torch_compile_dyn_models_tests_results.xml --ir torch_compile models/test_dyn_models.py
+        trt_pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_torch_compile_be_tests_results.xml  backend/
+        trt_pytest -m critical -ra  --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_torch_compile_models_tests_results.xml --ir torch_compile models/test_models.py
+        trt_pytest -m critical -ra  --junitxml=${RUNNER_TEST_RESULTS_DIR}/l1_torch_compile_dyn_models_tests_results.xml --ir torch_compile models/test_dyn_models.py
         popd
 
 
   L2-torch-compile-tests:
     name: ${{ matrix.display-name }}
     needs: [filter-matrix, build, L1-torch-compile-tests, L1-dynamo-compile-tests, L1-dynamo-core-tests]
-    if: ${{ (github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success() }}
+    if: "${{ !contains(github.event.pull_request.labels.*.name, 'ci: only-l0') && !contains(github.event.pull_request.labels.*.name, 'ci: skip-l2') && ((github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success()) }}"
     strategy:
       fail-fast: false
       matrix:
@@ -307,7 +325,7 @@
   L2-dynamo-compile-tests:
     name: ${{ matrix.display-name }}
     needs: [filter-matrix, build, L1-dynamo-compile-tests, L1-dynamo-core-tests, L1-torch-compile-tests]
-    if: ${{ (github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success() }}
+    if: "${{ !contains(github.event.pull_request.labels.*.name, 'ci: only-l0') && !contains(github.event.pull_request.labels.*.name, 'ci: skip-l2') && ((github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success()) }}"
     strategy:
       fail-fast: false
       matrix:
@@ -339,7 +357,7 @@
   L2-dynamo-core-tests:
     name: ${{ matrix.display-name }}
     needs: [filter-matrix, build, L1-dynamo-core-tests, L1-dynamo-compile-tests, L1-torch-compile-tests]
-    if: ${{ (github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success() }}
+    if: "${{ !contains(github.event.pull_request.labels.*.name, 'ci: only-l0') && !contains(github.event.pull_request.labels.*.name, 'ci: skip-l2') && ((github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success()) }}"
     strategy:
       fail-fast: false
       matrix:
@@ -370,7 +388,7 @@
   L2-dynamo-plugin-tests:
     name: ${{ matrix.display-name }}
     needs: [filter-matrix, build, L1-dynamo-core-tests, L1-dynamo-compile-tests, L1-torch-compile-tests]
-    if: ${{ (github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success() }}
+    if: "${{ !contains(github.event.pull_request.labels.*.name, 'ci: only-l0') && !contains(github.event.pull_request.labels.*.name, 'ci: skip-l2') && ((github.ref_name == 'main' || github.ref_name == 'nightly' || startsWith(github.ref_name, 'release/') || (startsWith(github.ref, 'refs/tags/v') && contains(github.ref_name, '-rc')) || contains(github.event.pull_request.labels.*.name, 'Force All Tests[L0+L1+L2]')) && always() || success()) }}"
     strategy:
       fail-fast: false
       matrix:
@@ -398,7 +416,84 @@
         python -m pytest -ra --junitxml=${RUNNER_TEST_RESULTS_DIR}/l2_dynamo_plugins_tests_results.xml automatic_plugin/
         popd
 
+  # Single rollup status for the RTX matrix; mirror the non-RTX workflow's
+  # ci-rollup so branch protection can require one check per workflow.
+  ci-rollup:
+    name: CI / Linux x86_64 (RTX)
+    if: ${{ always() }}
+    needs:
+      - build
+      - L0-dynamo-converter-tests
+      - L0-dynamo-core-tests
+      - L0-py-core-tests
+      - L1-dynamo-core-tests
+      - L1-dynamo-compile-tests
+      - L1-torch-compile-tests
+      - L2-torch-compile-tests
+      - L2-dynamo-compile-tests
+      - L2-dynamo-core-tests
+      - L2-dynamo-plugin-tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Aggregate job results
+        env:
+          RESULTS: ${{ toJSON(needs) }}
+          # Surface a label so the markdown summary disambiguates RTX vs standard.
+          WORKFLOW_LABEL: "Linux x86_64 (RTX)"
+        run: |
+          set -euo pipefail
+          # stdout: rollup status; $GITHUB_STEP_SUMMARY: reviewer-facing markdown table.
+          # NOTE(review): keep this logic in sync with the non-RTX workflow's ci-rollup.
+          python3 - <<'PY'
+          import json, os, sys
+          needs = json.loads(os.environ["RESULTS"])
+          label = os.environ.get("WORKFLOW_LABEL", "Linux x86_64")
+          by_result = {"success": [], "failure": [], "skipped": [], "cancelled": []}
+          for name, info in needs.items():
+              by_result.setdefault(info.get("result") or "unknown", []).append(name)
+          failed = sorted(by_result["failure"])
+          passed = sorted(by_result["success"])
+          skipped = sorted(by_result["skipped"])
+          cancelled = sorted(by_result["cancelled"])
+          # A cancelled or unrecognised result must not turn the required check green.
+          blocking = sorted(n for r, ns in by_result.items() if r not in ("success", "skipped") for n in ns)
+
+          print(f"PASS:      {len(passed)}")
+          print(f"FAIL:      {len(failed)}")
+          print(f"SKIPPED:   {len(skipped)} (label-gated or never started)")
+          print(f"CANCELLED: {len(cancelled)}")
+          if blocking:
+              print()
+              print("Blocking jobs (failed, cancelled or unknown result):")
+              for name in blocking:
+                  print(f"  - {name}")
+
+          summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+          if summary_path:
+              icon = {"success": "✅", "failure": "❌", "skipped": "⏭️", "cancelled": "🚫"}
+              with open(summary_path, "a", encoding="utf-8") as f:
+                  f.write(f"# CI / {label} — rollup\n\n")
+                  f.write(
+                      f"**{len(passed)}** passed · "
+                      f"**{len(failed)}** failed · "
+                      f"**{len(skipped)}** skipped · "
+                      f"**{len(cancelled)}** cancelled\n\n"
+                  )
+                  f.write("| Result | Job |\n|---|---|\n")
+                  for status in [s for s in by_result if s not in icon] + ["failure", "cancelled", "skipped", "success"]:
+                      for name in sorted(by_result.get(status, [])):
+                          f.write(f"| {icon.get(status, '?')} {status} | `{name}` |\n")
+                  if failed:
+                      f.write(
+                          "\n> Click into a failed job above to see the "
+                          "rendered test table (via `pytest-results-action`) "
+                          "and the `::warning::Reproduce locally with: ...` "
+                          "hint near the bottom of the log.\n"
+                      )
 
+          if blocking:
+              sys.exit(1)
+          PY
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-tensorrt-rtx-${{ inputs.repository }}-${{ github.event_name == 'workflow_dispatch' }}-${{ inputs.job-name }}

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -51,6 +51,18 @@ We use the PyTorch Slack for communication about core development, integration w
 
 - Document hacks, we can discuss it only if we can find it
 
+### Controlling CI scope via PR labels
+
+A full CI run is ~150 jobs across {Python 3.10–3.13} × {CUDA 13.0, 13.2} × {build, L0, L1, L2}. To keep PR feedback fast we let you shape what runs via labels — apply them in the PR's right sidebar and re-push (or close/reopen) to re-trigger:
+
+| Label | Effect |
+|---|---|
+| `ci: only-l0` | Skip L1 and L2 jobs. Useful for docs / build-system changes where only smoke tests matter. |
+| `ci: skip-l2` | Run L0 + L1, skip L2 (the slow model-level suites). |
+| `Force All Tests[L0+L1+L2]` | Pre-existing — force every tier to run even if an earlier tier failed. Used when investigating cascading failures. |
+
+PRs without any of these labels run the default set: build + L0 + L1 + L2, with L1/L2 gated on the previous tier's success so a fundamental build break doesn't waste 30 min of test capacity.
+
 ### Commits and PRs
 
 - Try to keep pull requests focused (multiple pull requests are okay). Typically PRs should focus on a single issue or a small collection of closely related issue.

diff --git a/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py b/py/torch_tensorrt/dynamo/runtime/_TRTEngine.py
@@ -700,10 +700,13 @@ def device_memory_budget(self) -> Any:
     def device_memory_budget(self, budget_bytes: int) -> None:
         if budget_bytes < 0:
             budget_bytes = self.streamable_device_memory_budget
+        # TRT 11+ rejects setWeightStreamingBudgetV2 while an IExecutionContext
+        # is alive (use_count must be 1). Drop the context BEFORE setting the
+        # budget — matches the C++ runtime's TRTEngine::set_device_memory_budget.
+        self.invalidate_context()
         self.cuda_engine.weight_streaming_budget_v2 = budget_bytes
         if self.cuda_engine.weight_streaming_budget_v2 != budget_bytes:
             logger.error(f"Failed to set weight streaming budget to {budget_bytes}")
-        self.invalidate_context()
         self.runtime_states.context_changed = True
 
     def reset_captured_graph(self) -> None:

diff --git a/py/torch_tensorrt/runtime/_runtime_cache.py b/py/torch_tensorrt/runtime/_runtime_cache.py
@@ -99,6 +99,20 @@ def __init__(self, path: str = "") -> None:
         self._pending_warm_bytes: Optional[bytes] = None
         self._lock = threading.Lock()
 
+    def __getstate__(self) -> dict:
+        # Return a copy of ``__dict__`` without ``_lock``: ``threading.Lock`` is
+        # not picklable, which breaks ``copy.deepcopy`` on any GraphModule that
+        # has us in its state (the cross-runtime export path deepcopies the gm
+        # before re-tracing). The lock guards in-process mutations only.
+        state = self.__dict__.copy()
+        state.pop("_lock", None)
+        return state
+
+    def __setstate__(self, state: dict) -> None:
+        # Restore attributes, then create the fresh lock a deserialized cache needs.
+        self.__dict__.update(state)
+        self._lock = threading.Lock()
+
     def serialize(self) -> torch.Tensor:
         with self._lock:
             if self._cache is None:

diff --git a/pyproject.toml b/pyproject.toml
@@ -82,6 +82,15 @@ test = [
     "parameterized>=0.2.0",
     "pytest>=8.2.1",
     "pytest-forked>=1.6.0",
+    # Emits ``::error file=...,line=...::`` annotations on test failure so
+    # GitHub renders them inline on the PR's Files Changed tab. Inert when
+    # ``GITHUB_ACTIONS`` is unset (local runs are unaffected).
+    "pytest-github-actions-annotate-failures>=0.2.0",
+    # pytest-rerunfailures lets CI retry tests that hit known transient
+    # CUDA / cudagraphs / engine-deserialization failures (see the
+    # ``--only-rerun`` patterns in tests/py/ci_helpers.sh) without papering
+    # over real bugs in numerical / model-accuracy suites where it is omitted.
+    "pytest-rerunfailures>=14.0",
     "pytest-xdist>=3.6.1",
     "pyyaml",
     "setuptools",
@@ -110,8 +119,10 @@ quantization = [
 ]
 
 # Optional runtime deps for the torch_tensorrt.kernels QDP-plugin layer,
-# which compiles user-supplied CUDA C++ kernels via NVRTC.
-kernels = ["cuda-python"]
+# which compiles user-supplied CUDA C++ kernels via NVRTC. The high-level
+# launch/compile API (``cuda.core``) lives in cuda-core; cuda-python's
+# bindings are still pulled in for the lower-level driver/runtime shims.
+kernels = ["cuda-python", "cuda-core"]
 
 [project.urls]
 Homepage = "https://pytorch.org/tensorrt"

diff --git a/tests/py/ci_helpers.sh b/tests/py/ci_helpers.sh
@@ -0,0 +1,26 @@
+# Shared shell helpers for Torch-TensorRT CI test scripts.
+# Sourced from L0/L1 script blocks in .github/workflows/build-test-linux-x86_64*.yml.
+#
+# Update this file (not the YAMLs) when adjusting the pytest rerun policy or
+# the reproduce-locally hint. Tested only via running CI; if you change a
+# function signature, audit every ``source tests/py/ci_helpers.sh`` site.
+
+# trt_pytest wraps ``python -m pytest`` and returns pytest's exit status. It adds:
+#   * --reruns 1: retry once on known transient cudagraphs/TRT-driver flakes.
+#                 Expand the patterns below only with concrete evidence; broad
+#                 patterns hide real bugs.
+#   * an inline ``::warning::`` reproduce hint on failure so reviewers can
+#     copy-paste the local repro command.
+# Usage (inside an L0/L1 script: | block; bash only, arrays are used):
+#     source tests/py/ci_helpers.sh
+#     cd tests/py/dynamo
+#     trt_pytest -ra -n 8 --junitxml="$RUNNER_TEST_RESULTS_DIR/foo.xml" runtime/test_001_*
+trt_pytest() {
+    # Arrays keep "Stream capture invalidated" one argv entry; a string would be word-split.
+    local -a rerun=(--reruns 1 --reruns-delay 5)
+    local -a only_rerun=(--only-rerun cudaErrorStreamCaptureInvalidated --only-rerun "Stream capture invalidated")
+    local rc=0
+    python -m pytest "${rerun[@]}" "${only_rerun[@]}" "$@" || rc=$?
+    [ "$rc" -eq 0 ] || echo "::warning::pytest failed. Reproduce locally with: cd $(pwd) && uv run pytest $*"
+    return "$rc"
+}
diff --git a/tests/py/dynamo/conversion/test_cumsum_aten.py b/tests/py/dynamo/conversion/test_cumsum_aten.py
@@ -1,4 +1,3 @@
-import sys
 import unittest
 
 import torch
@@ -11,8 +10,8 @@
 
 
 @unittest.skipIf(
-    torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx and sys.platform == "win32",
-    "cumsum errors out on TensorRT-RTX on Windows",
+    torch_tensorrt.ENABLED_FEATURES.tensorrt_rtx,
+    "cumsum is not supported on TensorRT-RTX (build_serialized_network returns None on Linux as well as Windows)",
 )
 class TestCumsumConverter(DispatchTestCase):
     @parameterized.expand(