From d95e3c85bdca62fe654d3649f8b9d507e322c1f3 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 12:06:01 -0300
Subject: [PATCH 01/10] ci: run tests on GPU server

---
 .github/workflows/gpu-tests.yml | 300 ++++++++++++++++++++++++++++++++
 Makefile                        |   8 +-
 scripts/gpu_test.sh             |  67 +++++++
 3 files changed, 374 insertions(+), 1 deletion(-)
 create mode 100644 .github/workflows/gpu-tests.yml
 create mode 100755 scripts/gpu_test.sh

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
new file mode 100644
index 000000000..a05de8f58
--- /dev/null
+++ b/.github/workflows/gpu-tests.yml
@@ -0,0 +1,300 @@
+name: GPU Tests (merge queue)
+
+# Run the CUDA-only test groups (which CPU CI can't, since GitHub runners have no GPU) on a
+# rented Vast.ai RTX 5090 when a PR is in the merge queue, and block the merge if they fail.
+# Groups (see scripts/gpu_test.sh): math-cuda kernel parity, cuda_path_integration (GPU proof
+# verifies), cuda_fallback (CPU fallback verifies). Orchestration runs on a GitHub-hosted
+# runner; all GPU work happens on the rented box (provisioned by the template onstart). The
+# box is ALWAYS destroyed at the end.
+#
+# Triggered on `merge_group` (one rental per merge, not per push) + `workflow_dispatch` for
+# manual runs. To gate merges, add the job name `gpu-tests` to the branch-protection required
+# status checks for `main` (GitHub UI).
+#
+# Requires repo secrets:
+#   VAST_API_KEY        — https://cloud.vast.ai/manage-keys/
+#   VAST_TEMPLATE_HASH  — hash of the "NVIDIA CUDA Lambda VM 64GB" template
+
+on:
+  merge_group:
+  workflow_dispatch:
+  # TEMP(testing): run on pushes to this branch pre-merge (no merge queue needed to test the
+  # rent -> test -> destroy path). REMOVE before merging.
+  push:
+    branches: [ci_run_tests_gpu]
+
+permissions:
+  contents: read
+
+concurrency:
+  group: gpu-tests-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  # Vast offer search: RTX 5090, >=16 cores, >=32GB RAM (workloads are small), >=64GB disk,
+  # verified + rentable, Blackwell-capable driver, <= cap.
+  GPU_NAME: RTX_5090
+  PRICE_CAP: "1"
+  VAST_IMAGE_DISK: "64"
+  # Unique per-run label set on the instance, for identification + leak-proof teardown.
+  RUN_LABEL: "gpu-tests-${{ github.run_id }}-${{ github.run_attempt }}"
+  # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit
+  # hash can't) — avoids pulling untrusted code at run time.
+  VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874"
+
+jobs:
+  gpu-tests:
+    runs-on: ubuntu-latest
+    # Provisioning + dual-feature cuda build (~25 min) + the three test groups (~15 min) +
+    # image-pull slack. Generous ceiling; teardown still always destroys the box.
+    timeout-minutes: 120
+    steps:
+      - name: Install Vast CLI
+        # No secrets in this step's env: install-time code can't read the API key during pip
+        # install. Pinned to an immutable commit (see VAST_CLI_COMMIT) for the same reason.
+        # --break-system-packages: the ephemeral runner's Python may be PEP-668 "externally
+        # managed"; safe to override on a disposable runner.
+        run: pip install --quiet --break-system-packages "git+https://github.com/vast-ai/vast-cli.git@${VAST_CLI_COMMIT}"
+
+      - name: Authenticate Vast CLI
+        env:
+          VAST_API_KEY: ${{ secrets.VAST_API_KEY }}
+        run: vastai set api-key "$VAST_API_KEY"
+
+      - name: Generate ephemeral SSH key
+        id: sshkey
+        run: |
+          mkdir -p "$HOME/.ssh"
+          KEY="$HOME/.ssh/vast_gpu_tests"
+          ssh-keygen -t ed25519 -N "" -f "$KEY" -C "gh-actions-gpu-tests-${GITHUB_RUN_ID}" >/dev/null
+          echo "key_path=$KEY" >> "$GITHUB_OUTPUT"
+
+      - name: Pick a Vast offer
+        id: offer
+        env:
+          # Retry the same query to ride out transient scarcity (RTX 5090s are a small,
+          # fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
+          OFFER_ATTEMPTS: "10"
+          OFFER_INTERVAL: "30"
+          # Require driver >= this major so cudarc matches the runtime driver (older drivers
+          # lack newer symbols and the GPU path falls back to CPU). Filtered client-side in jq
+          # because vast can't numerically compare the driver_version string server-side.
+          MIN_DRIVER: "580"
+        run: |
+          # cpu_ram filter is in GB.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
+          # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within
+          # the cap) — premium hosts have faster disks/network and better reliability; cheapest
+          # boxes were flaky. `try ... catch 0` so a malformed/null driver_version on one offer
+          # is treated as 0 (filtered out) rather than erroring the whole jq.
+          SELECT="map(select((try (.driver_version|split(\".\")[0]|tonumber) catch 0) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse"
+          OFFER_ID=""; OFFER_PRICE=""
+          for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
+            vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
+            OFFER_ID=$(jq -r "$SELECT | .[0].id // empty" offers.json || true)  # non-JSON reply => empty => retry, not a `bash -e` abort
+            OFFER_PRICE=$(jq -r "$SELECT | .[0].dph_total // empty" offers.json 2>/dev/null || true)
+            if [ -n "$OFFER_ID" ]; then
+              echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"
+              break
+            fi
+            echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..."
+            sleep "$OFFER_INTERVAL"
+          done
+          if [ -z "$OFFER_ID" ]; then
+            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=32GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
+            exit 1
+          fi
+          echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
+          echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"
+
+      - name: Create instance
+        id: instance
+        env:
+          VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }}
+          OFFER_ID: ${{ steps.offer.outputs.id }}
+        run: |
+          vastai create instance "$OFFER_ID" \
+            --template_hash "$VAST_TEMPLATE_HASH" \
+            --disk "$VAST_IMAGE_DISK" \
+            --label "$RUN_LABEL" \
+            --ssh --direct --raw > create.json
+          # Log only the fields we need (the full --raw response could carry a sensitive field).
+          jq '{success, new_contract: (.new_contract // .instances.new_contract)}' create.json
+          IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json)
+          if [ -z "$IID" ]; then
+            echo "::error::Failed to create Vast instance"
+            exit 1
+          fi
+          # Persist immediately so teardown runs even if later steps fail.
+          echo "$IID" > "$RUNNER_TEMP/vast_instance_id"
+          echo "id=$IID" >> "$GITHUB_OUTPUT"
+          echo "Created instance $IID (label $RUN_LABEL)"
+
+      - name: Attach SSH key to instance
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          # Attach the ephemeral pubkey to THIS instance only (added to its authorized_keys);
+          # removed when the instance is destroyed, so no account-level key to clean up.
+          # Retry: the instance may not accept the attach immediately after create.
+          PUB="$(cat "$KEY.pub")"
+          for attempt in $(seq 1 12); do
+            if vastai attach ssh "$IID" "$PUB"; then
+              echo "Attached ssh key (attempt $attempt)"; exit 0
+            fi
+            echo "attach failed (attempt $attempt/12); retrying in 10s..."
+            sleep 10
+          done
+          echo "::error::Failed to attach ssh key to instance $IID"
+          exit 1
+
+      - name: Wait for SSH
+        id: ssh
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          echo "Waiting for instance $IID to reach 'running' with SSH endpoint..."
+          STATUS=""; HOST=""; PORT=""
+          # The base CUDA image is large; some hosts sit in 'loading' (image pull) a while.
+          for _ in $(seq 1 180); do   # ~30 min
+            vastai show instance "$IID" --raw > inst.json || true
+            STATUS=$(jq -r '.actual_status // empty' inst.json || true)  # non-JSON reply => empty => keep polling, not a `bash -e` abort
+            # We create with --direct, so SSH straight to the public IP + the host port mapped
+            # to container port 22 (the .ssh_host/.ssh_port proxy fields are unreliable).
+            HOST=$(jq -r '.public_ipaddr // empty' inst.json 2>/dev/null || true)
+            PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json 2>/dev/null || true)
+            echo "  status=$STATUS ssh=$HOST:$PORT"
+            if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then
+              break
+            fi
+            sleep 10
+          done
+          if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then
+            echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)"
+            exit 1
+          fi
+          { echo "host=$HOST"; echo "port=$PORT"; } >> "$GITHUB_OUTPUT"
+
+          # Wait for sshd to accept our key (KEY comes from env, like the later SSH steps).
+          for _ in $(seq 1 30); do
+            if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
+                 -i "$KEY" -p "$PORT" "root@$HOST" true 2>/dev/null; then
+              echo "sshd reachable"; exit 0
+            fi
+            sleep 10
+          done
+          echo "::error::sshd did not accept connections in time"
+          exit 1
+
+      - name: Wait for onstart provisioning
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..."
+          # The bootstrap's final stdout line is "=== done ===". Vast captures onstart output to
+          # /var/log/onstart.log; fall back to checking the artifacts it leaves.
+          for _ in $(seq 1 120); do   # ~20 min
+            if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then
+              echo "onstart reported done"; exit 0
+            fi
+            # shellcheck disable=SC2016  # $HOME must expand on the remote box, not the runner
+            if $SSH 'test -x "$HOME/.cargo/bin/cargo" \
+                  && test -f /opt/lambda-vm-sysroot/include/stdlib.h \
+                  && test -d /workspace/lambda_vm/.git'; then
+              echo "provisioning artifacts present"; exit 0
+            fi
+            sleep 10
+          done
+          echo "::error::onstart provisioning did not complete in time"
+          exit 1
+
+      - name: Run GPU tests
+        id: tests
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+          # merge_group: refs/heads/gh-readonly-queue/main/pr-… (the merge commit = PR + main),
+          # so we test exactly what will land. workflow_dispatch: the chosen branch ref.
+          REF: ${{ github.ref }}
+        run: |
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          # Defense-in-depth: never interpolate an unvalidated ref into the remote `bash -lc`.
+          case "$REF" in
+            ''|*[!A-Za-z0-9._/-]*) echo "::error::invalid ref: '$REF'"; exit 1 ;;
+          esac
+          # Check out the ref under test on the box, then run the CUDA test groups.
+          REMOTE="set -e; cd /workspace/lambda_vm; \
+            git fetch --force origin '$REF'; \
+            git checkout -f FETCH_HEAD; \
+            CUDARC_PIN=cuda-12080 SYSROOT_DIR=/opt/lambda-vm-sysroot bash scripts/gpu_test.sh"
+
+          # pipefail so a test failure on the box propagates through the tee pipe and FAILS this
+          # step (which fails the job and blocks the merge), instead of being masked by tee.
+          # 2>&1 so remote stderr (build errors, panics) is captured too — both into the live
+          # step log and the file the run-summary step tails.
+          set -o pipefail
+          $SSH "bash -lc \"$REMOTE\"" 2>&1 | tee "$RUNNER_TEMP/gpu_test_out.txt"
+
+      - name: Write run summary
+        if: always() && (steps.tests.outcome == 'success' || steps.tests.outcome == 'failure')
+        env:
+          OUTCOME: ${{ steps.tests.outcome }}
+        run: |
+          {
+            echo "## GPU tests (CUDA-only suite)"
+            echo "Outcome: **${OUTCOME}**"
+            # On failure, surface the failing-group markers explicitly, then the log tail.
+            if [ "$OUTCOME" != "success" ]; then
+              FAILED=$(grep -F '::error::GPU test group failed:' "$RUNNER_TEMP/gpu_test_out.txt" 2>/dev/null \
+                       | sed 's/.*failed: /- /' || true)
+              [ -n "$FAILED" ] && { echo; echo "Failed groups:"; echo "$FAILED"; }
+            fi
+            echo '```'
+            tail -n 80 "$RUNNER_TEMP/gpu_test_out.txt" 2>/dev/null || echo "(no output captured)"
+            echo '```'
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      # --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
+      - name: Destroy instance
+        if: always()
+        run: |
+          # Retry transient failures (network/auth) so a paid box isn't stranded.
+          # --yes: skip the interactive [y/N] confirm (CI has no tty).
+          destroy() {
+            iid="$1"; destroyed=""
+            for attempt in 1 2 3; do
+              if vastai destroy instance "$iid" --yes; then destroyed=1; break; fi
+              echo "destroy attempt $attempt failed; retrying in 10s..."
+              sleep 10
+            done
+            [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $iid after 3 attempts — check the Vast console (label $RUN_LABEL)"
+          }
+          if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
+            IID=$(cat "$RUNNER_TEMP/vast_instance_id")
+            echo "Destroying instance $IID"
+            destroy "$IID"
+          else
+            # The id file is written only AFTER create succeeds AND its JSON parses, so a box can
+            # exist unrecorded if the run was cancelled in that window or the parse failed. Fall
+            # back to destroying by our unique RUN_LABEL so the box can't leak (bill indefinitely).
+            echo "No instance id recorded; searching Vast for any box labelled $RUN_LABEL..."
+            vastai show instances --raw > all_inst.json 2>/dev/null || echo '[]' > all_inst.json
+            LEAKED=$(jq -r --arg L "$RUN_LABEL" \
+              '(if type=="array" then . else (.instances // []) end) | .[] | select(.label == $L) | .id' \
+              all_inst.json 2>/dev/null || true)
+            if [ -z "$LEAKED" ]; then
+              echo "No instance labelled $RUN_LABEL found; nothing to destroy."
+            else
+              for IID in $LEAKED; do
+                echo "Destroying leaked instance $IID (label $RUN_LABEL)"
+                destroy "$IID"
+              done
+            fi
+          fi
diff --git a/Makefile b/Makefile
index 81bc03a8c..32a735c79 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 .PHONY: deps deps-linux deps-macos compile-programs-asm compile-programs-rust compile-bench \
 compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm \
 test-rust test-executor test-flamegraph flamegraph-prover \
-test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \
+test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
 bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
 update-ethrex-fixture-checksums check-ethrex-fixture-checksums
 
@@ -248,6 +248,12 @@ test-cuda-integration:
 	cargo test -p lambda-vm-prover --release --features cuda \
 	    --test cuda_path_integration -- --ignored --nocapture
 
+# GPU error-path coverage (requires NVIDIA GPU + nvcc).
+# Forces cuda dispatch errors and asserts the CPU fallback still produces a verifying proof.
+test-cuda-fallback:
+	cargo test -p lambda-vm-prover --release --features test-cuda-faults \
+	    --test cuda_fallback_tests -- --ignored --nocapture
+
 # math-cuda quick microbench (median of 10 runs)
 bench-math-cuda:
 	cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture
diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh
new file mode 100755
index 000000000..e3608d81a
--- /dev/null
+++ b/scripts/gpu_test.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+#
+# gpu_test.sh — run the CUDA-only test groups on a GPU box.
+#
+# These groups can't run in CPU CI (GitHub runners have no GPU):
+#   1. math-cuda kernel parity        (make test-math-cuda)
+#   2. end-to-end GPU dispatch + proof (make test-cuda-integration)
+#   3. GPU error-path / CPU fallback   (make test-cuda-fallback)
+#
+# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All three groups
+# run even if one fails (so the log shows every failure); the script exits non-zero if ANY
+# group failed, which fails the workflow job and blocks the merge.
+#
+# Env:
+#   CUDARC_PIN   cudarc CUDA-version feature to pin (default cuda-12080). See the sed below.
+#   SYSROOT_DIR  rv64 sysroot (default /opt/lambda-vm-sysroot, provisioned by the template).
+
+set -euo pipefail
+
+CUDARC_PIN="${CUDARC_PIN:-cuda-12080}"
+export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}"
+
+log() { printf '\n=== %s ===\n' "$*"; }
+
+# --- GPU toolchain sanity (fail loudly rather than silently falling back to CPU) ---
+log "GPU toolchain"
+if ! command -v nvcc >/dev/null 2>&1; then
+    for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do
+        [ -x "$d/nvcc" ] && export PATH="$d:$PATH" && break
+    done
+fi
+command -v nvcc >/dev/null 2>&1 || { echo "ERROR: nvcc not found — CUDA toolkit missing" >&2; exit 1; }
+nvcc --version | tail -n 2
+nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader
+
+# --- Pin cudarc so it binds a fixed driver-symbol set --------------------------
+# crypto/math-cuda/Cargo.toml uses `cuda-version-from-build-system` + `fallback-latest`;
+# when detection falls back to "latest", cudarc requests symbols some boxes' driver doesn't
+# export (e.g. cuDevSmResourceSplit / cuCtxGetDevice_v2) -> runtime panic. Pinning to a fixed
+# CUDA version (12.8, matching the cuda_max_good>=12.8 offer floor) avoids that.
+log "pinning cudarc to $CUDARC_PIN"
+sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \
+    crypto/math-cuda/Cargo.toml
+
+# --- Build the asm guest ELFs used by Groups 2 & 3 (clang on .s; fast) ----------
+# (math-cuda parity tests need no ELF; cuda_path_integration / cuda_fallback prove an asm ELF.)
+log "compiling asm guest programs"
+make compile-programs-asm
+
+# --- Run the three CUDA test groups via the Makefile targets --------------------
+fail=0
+run() {  # $1 = make target
+    log "make $1"
+    if ! make "$1"; then
+        echo "::error::GPU test group failed: $1"
+        fail=1
+    fi
+}
+run test-math-cuda         # Group 1: kernel parity
+run test-cuda-integration  # Group 2: end-to-end GPU dispatch + proof verifies
+run test-cuda-fallback     # Group 3: GPU error -> CPU fallback still verifies
+
+if [ "$fail" -ne 0 ]; then
+    log "FAILED — one or more GPU test groups failed"
+    exit 1
+fi
+log "all GPU test groups passed"

From 70754b384f743537e95fb5cf0243fd878e48330d Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 12:17:54 -0300
Subject: [PATCH 02/10] keep instance running for debug

---
 .github/workflows/gpu-tests.yml | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index a05de8f58..ba0e42411 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -41,6 +41,10 @@ env:
   # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit
   # hash can't) — avoids pulling untrusted code at run time.
   VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874"
+  # TEMP(debugging): "1" skips teardown so the box stays up for SSH debugging. The
+  # "Connection info" step prints how to connect and how to destroy it manually.
+  # SET BACK TO "0" (or remove) so the box is destroyed again.
+  KEEP_INSTANCE: "1"
 
 jobs:
   gpu-tests:
@@ -261,9 +265,31 @@ jobs:
             echo '```'
           } >> "$GITHUB_STEP_SUMMARY"
 
+      # TEMP(debugging): when KEEP_INSTANCE=1, leave the box up and print how to reach it.
+      - name: Connection info (instance kept for debugging)
+        if: always() && env.KEEP_INSTANCE == '1'
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          IID: ${{ steps.instance.outputs.id }}
+        run: |
+          {
+            echo "## ⚠️ Instance KEPT for debugging (KEEP_INSTANCE=1)"
+            echo "SSH in with your team key (baked into the box by the template onstart):"
+            echo '```'
+            echo "ssh -o StrictHostKeyChecking=accept-new -p ${PORT:-?} root@${HOST:-?}"
+            echo "cd /workspace/lambda_vm   # the failing tests live here"
+            echo '```'
+            echo "Destroy it when done (it bills hourly):"
+            echo '```'
+            echo "vastai destroy instance ${IID:-?} --yes   # label: $RUN_LABEL"
+            echo '```'
+          } | tee -a "$GITHUB_STEP_SUMMARY"
+          echo "::warning::Instance ${IID:-?} kept for debugging — destroy it manually: vastai destroy instance ${IID:-?} --yes"
+
       # --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
       - name: Destroy instance
-        if: always()
+        if: always() && env.KEEP_INSTANCE != '1'
         run: |
           # Retry transient failures (network/auth) so a paid box isn't stranded.
           # --yes: skip the interactive [y/N] confirm (CI has no tty).

From 0c3b895f89d730a49e0b79f8f54694cbd364110d Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 13:00:56 -0300
Subject: [PATCH 03/10] ci: require CUDA 13.1

---
 .github/workflows/gpu-tests.yml |  7 +++++--
 README.md                       | 19 ++++++++++++++++++-
 crypto/math-cuda/build.rs       |  5 +++++
 3 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index ba0e42411..45e596bc8 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -85,8 +85,11 @@ jobs:
           # because vast can't numerically compare the driver_version string server-side.
           MIN_DRIVER: "580"
         run: |
-          # cpu_ram filter is in GB.
-          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          # cpu_ram filter is in GB. cuda_max_good>=13.1: the box's driver must support CUDA
+          # 13.1 because the template's nvcc is 13.1 and build.rs JIT-compiles its PTX at load —
+          # a 13.0 driver rejects 13.1 PTX (CUDA_ERROR_UNSUPPORTED_PTX_VERSION). Bump this in
+          # lockstep if the base image's CUDA toolkit changes.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
           # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within
           # the cap) — premium hosts have faster disks/network and better reliability; cheapest
diff --git a/README.md b/README.md
index 151934433..0f00979ab 100644
--- a/README.md
+++ b/README.md
@@ -185,7 +185,9 @@ See [`spec/README.md`](./spec/README.md) for full setup instructions.
 | `make test-asm` | Compile and run ASM tests |
 | `make test-rust` | Compile and run Rust tests |
 | `make test-executor` | Compile all programs and run executor tests |
-| `make test-math-cuda` | math-cuda parity tests (requires NVIDIA GPU + nvcc) |
+| `make test-math-cuda` | math-cuda GPU kernel parity tests (requires NVIDIA GPU + nvcc; see GPU Tests) |
+| `make test-cuda-integration` | End-to-end GPU dispatch + proof verification (requires NVIDIA GPU + nvcc) |
+| `make test-cuda-fallback` | GPU error-path / CPU-fallback tests (requires NVIDIA GPU + nvcc) |
 | `make build` | Build all workspace crates |
 | `make check` | Check all crates (faster than build, no codegen) |
 | `make clippy` | Run clippy on all crates |
@@ -219,6 +221,21 @@ You can run it with
 
 `make test-rust`
 
+### GPU Tests
+
+The CUDA test groups run only on a machine with an NVIDIA GPU and `nvcc`:
+
+- `make test-math-cuda` — GPU-vs-CPU kernel parity (NTT, LDE, barycentric, FRI, …)
+- `make test-cuda-integration` — proves a guest on GPU and checks every dispatch fired + the proof verifies
+- `make test-cuda-fallback` — forces GPU dispatch errors and checks the CPU fallback still verifies
+
+**Requirement: an NVIDIA driver supporting CUDA ≥ 13.1.** The kernels are compiled with the
+toolkit's `nvcc` (currently CUDA 13.1) into PTX that the driver JIT-compiles at load; a driver
+older than the toolkit rejects it with `CUDA_ERROR_UNSUPPORTED_PTX_VERSION`. Keep the driver/CUDA
+floor in step with the installed toolkit (e.g. the `cuda_max_good>=13.1` filter in
+`.github/workflows/gpu-tests.yml`). These groups run automatically on a rented GPU in the merge
+queue via that workflow.
+
 ## Benchmarking & Profiling
 
 You can create a flamegraph for proof generation using the following target:
diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs
index b2f61f9a2..6888d2a72 100644
--- a/crypto/math-cuda/build.rs
+++ b/crypto/math-cuda/build.rs
@@ -72,6 +72,11 @@ fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) {
     // compute capability. If unset, try `nvidia-smi` to match the host GPU
     // (avoids JIT failures like nvcc-13.0 PTX rejected on Blackwell drivers);
     // fall back to compute_89 (Ada) when detection fails.
+    //
+    // NOTE: this `-arch` only sets the *virtual arch*, not the PTX ISA version, which is
+    // fixed by this nvcc's CUDA toolkit. The runtime driver must support that toolkit's CUDA
+    // version or it rejects the PTX with CUDA_ERROR_UNSUPPORTED_PTX_VERSION — i.e. the box's
+    // driver CUDA must be >= the build toolkit's CUDA (currently 13.1). See README "GPU Tests".
     let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| detect_arch());
 
     let status = Command::new(nvcc_path())

From 9abe6f01f70d9f17624dfc0daffe0590cc06e8fa Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 14:17:06 -0300
Subject: [PATCH 04/10] ci: test prover on CUDA

---
 .github/workflows/gpu-tests.yml | 23 ++++++++++++-----------
 Makefile                        | 15 +++++++++++++++
 README.md                       |  4 ++++
 scripts/gpu_test.sh             | 28 +++++++++++++++++-----------
 4 files changed, 48 insertions(+), 22 deletions(-)

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index 45e596bc8..a84613291 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -1,11 +1,12 @@
 name: GPU Tests (merge queue)
 
-# Run the CUDA-only test groups (which CPU CI can't, since GitHub runners have no GPU) on a
-# rented Vast.ai RTX 5090 when a PR is in the merge queue, and block the merge if they fail.
+# Run the GPU test suite (which CPU CI can't, since GitHub runners have no GPU) on a rented
+# Vast.ai RTX 5090 when a PR is in the merge queue, and block the merge if it fails.
 # Groups (see scripts/gpu_test.sh): math-cuda kernel parity, cuda_path_integration (GPU proof
-# verifies), cuda_fallback (CPU fallback verifies). Orchestration runs on a GitHub-hosted
-# runner; all GPU work happens on the rented box (provisioned by the template onstart). The
-# box is ALWAYS destroyed at the end.
+# verifies), cuda_fallback (CPU fallback verifies), the prover/stark/crypto/ecsm suite on the
+# GPU path, and the comprehensive all-instructions prove. Orchestration runs on a GitHub-hosted
+# runner; all GPU work happens on the rented box (provisioned by the template onstart). The box
+# is ALWAYS destroyed at the end.
 #
 # Triggered on `merge_group` (one rental per merge, not per push) + `workflow_dispatch` for
 # manual runs. To gate merges, add the job name `gpu-tests` to the branch-protection required
@@ -31,8 +32,8 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  # Vast offer search: RTX 5090, >=16 cores, >=32GB RAM (workloads are small), >=64GB disk,
-  # verified + rentable, Blackwell-capable driver, <= cap.
+  # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM (the prover suite proves real ELFs,
+  # so allow headroom), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap.
   GPU_NAME: RTX_5090
   PRICE_CAP: "1"
   VAST_IMAGE_DISK: "64"
@@ -49,9 +50,9 @@ env:
 jobs:
   gpu-tests:
     runs-on: ubuntu-latest
-    # Provisioning + dual-feature cuda build (~25 min) + the three test groups (~15 min) +
-    # image-pull slack. Generous ceiling; teardown still always destroys the box.
-    timeout-minutes: 120
+    # Provisioning + cuda builds + 5 test groups; the prover suite (single-threaded, real
+    # ELF proves) dominates. Generous ceiling; teardown still always destroys the box.
+    timeout-minutes: 240
     steps:
       - name: Install Vast CLI
         # No secrets in this step's env: install-time code can't read the API key during pip
@@ -89,7 +90,7 @@ jobs:
           # 13.1 because the template's nvcc is 13.1 and build.rs JIT-compiles its PTX at load —
           # a 13.0 driver rejects 13.1 PTX (CUDA_ERROR_UNSUPPORTED_PTX_VERSION). Bump this in
           # lockstep if the base image's CUDA toolkit changes.
-          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}"
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
           # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within
           # the cap) — premium hosts have faster disks/network and better reliability; cheapest
diff --git a/Makefile b/Makefile
index 32a735c79..45518ed6d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,6 +2,7 @@
 compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm \
 test-rust test-executor test-flamegraph flamegraph-prover \
 test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
+test-prover-cuda test-prover-comprehensive-cuda \
 bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
 update-ethrex-fixture-checksums check-ethrex-fixture-checksums
 
@@ -254,6 +255,20 @@ test-cuda-fallback:
 	cargo test -p lambda-vm-prover --release --features test-cuda-faults \
 	    --test cuda_fallback_tests -- --ignored --nocapture
 
+# The prover/stark/crypto/ecsm test suite with the GPU (cuda) path enabled (requires NVIDIA
+# GPU + nvcc). The GPU CI counterpart of CPU CI's sharded prover tests. Single-threaded: the
+# GPU serializes proves and the dispatch counters are process-global. cuda on prover cascades
+# to stark; crypto/ecsm build without it (they have no GPU path).
+test-prover-cuda:
+	cargo test --release -p lambda-vm-prover -p stark -p crypto -p ecsm \
+	    --features lambda-vm-prover/cuda -- --test-threads=1
+
+# The comprehensive all-instructions prove (ignored by default) on the GPU path (requires
+# NVIDIA GPU + nvcc). GPU counterpart of CPU CI's merge-queue-only comprehensive job.
+test-prover-comprehensive-cuda:
+	cargo test --release -p lambda-vm-prover --features cuda \
+	    test_prove_elfs_all_instructions_64_full -- --ignored --test-threads=1 --nocapture
+
 # math-cuda quick microbench (median of 10 runs)
 bench-math-cuda:
 	cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture
diff --git a/README.md b/README.md
index 0f00979ab..820a97857 100644
--- a/README.md
+++ b/README.md
@@ -188,6 +188,8 @@ See [`spec/README.md`](./spec/README.md) for full setup instructions.
 | `make test-math-cuda` | math-cuda GPU kernel parity tests (requires NVIDIA GPU + nvcc; see GPU Tests) |
 | `make test-cuda-integration` | End-to-end GPU dispatch + proof verification (requires NVIDIA GPU + nvcc) |
 | `make test-cuda-fallback` | GPU error-path / CPU-fallback tests (requires NVIDIA GPU + nvcc) |
+| `make test-prover-cuda` | Prover/stark/crypto/ecsm suite on the GPU path (requires NVIDIA GPU + nvcc) |
+| `make test-prover-comprehensive-cuda` | Comprehensive all-instructions prove on the GPU path (requires NVIDIA GPU + nvcc) |
 | `make build` | Build all workspace crates |
 | `make check` | Check all crates (faster than build, no codegen) |
 | `make clippy` | Run clippy on all crates |
@@ -228,6 +230,8 @@ The CUDA test groups run only on a machine with an NVIDIA GPU and `nvcc`:
 - `make test-math-cuda` — GPU-vs-CPU kernel parity (NTT, LDE, barycentric, FRI, …)
 - `make test-cuda-integration` — proves a guest on GPU and checks every dispatch fired + the proof verifies
 - `make test-cuda-fallback` — forces GPU dispatch errors and checks the CPU fallback still verifies
+- `make test-prover-cuda` — the prover/stark/crypto/ecsm suite with the GPU path enabled
+- `make test-prover-comprehensive-cuda` — the comprehensive all-instructions prove on the GPU path
 
 **Requirement: an NVIDIA driver supporting CUDA ≥ 13.1.** The kernels are compiled with the
 toolkit's `nvcc` (currently CUDA 13.1) into PTX that the driver JIT-compiles at load; a driver
diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh
index e3608d81a..e6c974986 100755
--- a/scripts/gpu_test.sh
+++ b/scripts/gpu_test.sh
@@ -2,10 +2,12 @@
 #
 # gpu_test.sh — run the CUDA-only test groups on a GPU box.
 #
-# These groups can't run in CPU CI (GitHub runners have no GPU):
-#   1. math-cuda kernel parity        (make test-math-cuda)
-#   2. end-to-end GPU dispatch + proof (make test-cuda-integration)
-#   3. GPU error-path / CPU fallback   (make test-cuda-fallback)
+# Exercises the CUDA path, which CPU CI can't (GitHub runners have no GPU):
+#   1. math-cuda kernel parity         (make test-math-cuda)
+#   2. end-to-end GPU dispatch + proof  (make test-cuda-integration)
+#   3. GPU error-path / CPU fallback    (make test-cuda-fallback)
+#   4. prover/stark/crypto/ecsm suite   (make test-prover-cuda) — CPU CI's prover tests on GPU
+#   5. comprehensive all-instructions   (make test-prover-comprehensive-cuda)
 #
 # Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All three groups
 # run even if one fails (so the log shows every failure); the script exits non-zero if ANY
@@ -42,12 +44,14 @@ log "pinning cudarc to $CUDARC_PIN"
 sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \
     crypto/math-cuda/Cargo.toml
 
-# --- Build the asm guest ELFs used by Groups 2 & 3 (clang on .s; fast) ----------
-# (math-cuda parity tests need no ELF; cuda_path_integration / cuda_fallback prove an asm ELF.)
-log "compiling asm guest programs"
+# --- Build the guest ELFs the tests prove ---------------------------------------
+# math-cuda parity needs none; cuda_path_integration / cuda_fallback prove an asm ELF; the
+# prover suite (Groups 4 & 5) proves asm AND rust guests. Build both up front.
+log "compiling guest programs (asm + rust)"
 make compile-programs-asm
+make compile-programs-rust
 
-# --- Run the three CUDA test groups via the Makefile targets --------------------
+# --- Run the CUDA test groups via the Makefile targets --------------------------
 fail=0
 run() {  # $1 = make target
     log "make $1"
@@ -56,9 +60,11 @@ run() {  # $1 = make target
         fail=1
     fi
 }
-run test-math-cuda         # Group 1: kernel parity
-run test-cuda-integration  # Group 2: end-to-end GPU dispatch + proof verifies
-run test-cuda-fallback     # Group 3: GPU error -> CPU fallback still verifies
+run test-math-cuda                  # Group 1: kernel parity
+run test-cuda-integration           # Group 2: end-to-end GPU dispatch + proof verifies
+run test-cuda-fallback              # Group 3: GPU error -> CPU fallback still verifies
+run test-prover-cuda                # Group 4: prover/stark/crypto/ecsm suite on the GPU path
+run test-prover-comprehensive-cuda  # Group 5: comprehensive all-instructions prove on GPU
 
 if [ "$fail" -ne 0 ]; then
     log "FAILED — one or more GPU test groups failed"

From cd83966edf8ec2a5c813df992fae7df09ac125ee Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 15:07:44 -0300
Subject: [PATCH 05/10] ci: improve gpu failed tests summaries

---
 .github/workflows/gpu-tests.yml | 52 ++++++++++++++++++++++++---------
 1 file changed, 39 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index a84613291..12188ecc9 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -32,7 +32,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM (the prover suite proves real ELFs,
+  # Vast offer search: RTX 5090, >=16 cores, >=96GB RAM (the prover suite proves real ELFs,
   # so allow headroom), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap.
   GPU_NAME: RTX_5090
   PRICE_CAP: "1"
@@ -90,7 +90,7 @@ jobs:
           # 13.1 because the template's nvcc is 13.1 and build.rs JIT-compiles its PTX at load —
           # a 13.0 driver rejects 13.1 PTX (CUDA_ERROR_UNSUPPORTED_PTX_VERSION). Bump this in
           # lockstep if the base image's CUDA toolkit changes.
-          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}"
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
           # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within
           # the cap) — premium hosts have faster disks/network and better reliability; cheapest
@@ -110,7 +110,7 @@ jobs:
             sleep "$OFFER_INTERVAL"
           done
           if [ -z "$OFFER_ID" ]; then
-            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=32GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
+            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
             exit 1
           fi
           echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
@@ -255,18 +255,44 @@ jobs:
         env:
           OUTCOME: ${{ steps.tests.outcome }}
         run: |
+          OUT="$RUNNER_TEMP/gpu_test_out.txt"
           {
-            echo "## GPU tests (CUDA-only suite)"
-            echo "Outcome: **${OUTCOME}**"
-            # On failure, surface the failing-group markers explicitly, then the log tail.
-            if [ "$OUTCOME" != "success" ]; then
-              FAILED=$(grep -F '::error::GPU test group failed:' "$RUNNER_TEMP/gpu_test_out.txt" 2>/dev/null \
-                       | sed 's/.*failed: /- /' || true)
-              [ -n "$FAILED" ] && { echo; echo "Failed groups:"; echo "$FAILED"; }
+            echo "## GPU tests (CUDA suite) — ${OUTCOME}"
+            if [ "$OUTCOME" = "success" ]; then
+              echo "All GPU test groups passed."
+            else
+              # Group failed tests by make target, in run order (awk for-in order is unspecified):
+              # gpu_test.sh prints "=== make <target> ===" per group; cargo prints "test <name> ... FAILED".
+              report=$(awk '
+                /^=== make / { grp=$3; next }
+                / \.\.\. FAILED/ { if (!(grp in n)) order[++k]=grp; fails[grp]=fails[grp] "\n    - " $2; n[grp]++ }
+                END { for (i=1; i<=k; i++) { g=order[i]; printf "- **%s** (%d failed):%s\n", g, n[g], fails[g] } }
+              ' "$OUT" 2>/dev/null || true)
+              # Per-test panic/assertion messages: each "thread '…' panicked at …:" block plus its
+              # message lines, capped per block; a new panic header flushes any block still open.
+              details=$(awk '
+                /^thread .* panicked at / { if (cap) printf "%s\n\n", buf; cap=1; lines=0; buf=$0; next }
+                cap {
+                  if ($0 ~ /^note: run with/ || $0 ~ /^----/ || $0 ~ /^test / || $0 ~ /^=== / || $0 ~ /^[[:space:]]*$/) { printf "%s\n\n", buf; cap=0; next }
+                  if (lines < 14) { buf=buf "\n" $0; lines++ } else if (lines==14) { buf=buf "\n    ...(truncated)"; lines++ }
+                }
+                END { if (cap) printf "%s\n", buf }
+              ' "$OUT" 2>/dev/null || true)
+              if [ -n "$report" ]; then
+                echo; echo "### Failed tests by group"; echo "$report"
+                if [ -n "$details" ]; then
+                  echo; echo "### Failure details"; echo '```'; echo "$details"; echo '```'
+                fi
+              else
+                # No per-test failures parsed (likely a build/infra error) — fall back to the
+                # failed-group markers plus a short log tail.
+                grps=$(grep -F '::error::GPU test group failed:' "$OUT" 2>/dev/null | sed 's/.*failed: /- /' | sort -u || true)
+                [ -n "$grps" ] && { echo; echo "### Failed groups"; echo "$grps"; }
+                echo; echo "No individual test failures parsed (build/infra error?). Last lines:"
+                echo '```'; tail -n 40 "$OUT" 2>/dev/null || echo "(no output captured)"; echo '```'
+              fi
+              echo; echo "<sub>Full output is in the \"Run GPU tests\" step log.</sub>"
             fi
-            echo '```'
-            tail -n 80 "$RUNNER_TEMP/gpu_test_out.txt" 2>/dev/null || echo "(no output captured)"
-            echo '```'
           } >> "$GITHUB_STEP_SUMMARY"
 
       # TEMP(debugging): when KEEP_INSTANCE=1, leave the box up and print how to reach it.

From 25904c93b0380b064ca6dfb1908c786ce8808b03 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 15:37:12 -0300
Subject: [PATCH 06/10] ci: test-threads=1

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 45518ed6d..e5ef2633f 100644
--- a/Makefile
+++ b/Makefile
@@ -253,7 +253,7 @@ test-cuda-integration:
 # Forces cuda dispatch errors and asserts the CPU fallback still produces a verifying proof.
 test-cuda-fallback:
 	cargo test -p lambda-vm-prover --release --features test-cuda-faults \
-	    --test cuda_fallback_tests -- --ignored --nocapture
+	    --test cuda_fallback_tests -- --ignored --nocapture --test-threads=1
 
 # The prover/stark/crypto/ecsm test suite with the GPU (cuda) path enabled (requires NVIDIA
 # GPU + nvcc). The GPU CI counterpart of CPU CI's sharded prover tests. Single-threaded: the

From e862a66dd16fcdc4b27a938102d9c384cf43edc2 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 16:16:25 -0300
Subject: [PATCH 07/10] remove temporary code

---
 .github/workflows/gpu-tests.yml | 32 +-------------------------------
 1 file changed, 1 insertion(+), 31 deletions(-)

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index 12188ecc9..61db99acd 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -19,10 +19,6 @@ name: GPU Tests (merge queue)
 on:
   merge_group:
   workflow_dispatch:
-  # TEMP(testing): run on pushes to this branch pre-merge (no merge queue needed to test the
-  # rent -> test -> destroy path). REMOVE before merging.
-  push:
-    branches: [ci_run_tests_gpu]
 
 permissions:
   contents: read
@@ -42,10 +38,6 @@ env:
   # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit
   # hash can't) — avoids pulling untrusted code at run time.
   VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874"
-  # TEMP(debugging): "1" skips teardown so the box stays up for SSH debugging. The
-  # "Connection info" step prints how to connect and how to destroy it manually.
-  # SET BACK TO "0" (or remove) so the box is destroyed again.
-  KEEP_INSTANCE: "1"
 
 jobs:
   gpu-tests:
@@ -295,31 +287,9 @@ jobs:
             fi
           } >> "$GITHUB_STEP_SUMMARY"
 
-      # TEMP(debugging): when KEEP_INSTANCE=1, leave the box up and print how to reach it.
-      - name: Connection info (instance kept for debugging)
-        if: always() && env.KEEP_INSTANCE == '1'
-        env:
-          HOST: ${{ steps.ssh.outputs.host }}
-          PORT: ${{ steps.ssh.outputs.port }}
-          IID: ${{ steps.instance.outputs.id }}
-        run: |
-          {
-            echo "## ⚠️ Instance KEPT for debugging (KEEP_INSTANCE=1)"
-            echo "SSH in with your team key (baked into the box by the template onstart):"
-            echo '```'
-            echo "ssh -o StrictHostKeyChecking=accept-new -p ${PORT:-?} root@${HOST:-?}"
-            echo "cd /workspace/lambda_vm   # the failing tests live here"
-            echo '```'
-            echo "Destroy it when done (it bills hourly):"
-            echo '```'
-            echo "vastai destroy instance ${IID:-?} --yes   # label: $RUN_LABEL"
-            echo '```'
-          } | tee -a "$GITHUB_STEP_SUMMARY"
-          echo "::warning::Instance $IID kept for debugging — destroy it manually: vastai destroy instance $IID --yes"
-
       # --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
       - name: Destroy instance
-        if: always() && env.KEEP_INSTANCE != '1'
+        if: always()
         run: |
           # Retry transient failures (network/auth) so a paid box isn't stranded.
           # --yes: skip the interactive [y/N] confirm (CI has no tty).

From 57e324156d4e733cbcc9552d24e2b9de3f736b91 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 17:00:25 -0300
Subject: [PATCH 08/10] fix: set cuda_max_good>=12.8

---
 .github/workflows/gpu-tests.yml | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index 61db99acd..b2df4379d 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -19,6 +19,9 @@ name: GPU Tests (merge queue)
 on:
   merge_group:
   workflow_dispatch:
+  # TEMP(testing): run on pushes to this branch pre-merge. REMOVE before merging.
+  push:
+    branches: [ci_run_tests_gpu]
 
 permissions:
   contents: read
@@ -78,11 +81,12 @@ jobs:
           # because vast can't numerically compare the driver_version string server-side.
           MIN_DRIVER: "580"
         run: |
-          # cpu_ram filter is in GB. cuda_max_good>=13.1: the box's driver must support CUDA
-          # 13.1 because the template's nvcc is 13.1 and build.rs JIT-compiles its PTX at load —
-          # a 13.0 driver rejects 13.1 PTX (CUDA_ERROR_UNSUPPORTED_PTX_VERSION). Bump this in
-          # lockstep if the base image's CUDA toolkit changes.
-          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}"
+          # cpu_ram filter is in GB.
+          # EXPERIMENT: cuda_max_good>=12.8 (was 13.1). The template's nvcc is 13.1, so its PTX
+          # JITs only on drivers with CUDA >= 13.1 — a 12.8/13.0 box will fail with
+          # CUDA_ERROR_UNSUPPORTED_PTX_VERSION. Testing whether the most-expensive selection
+          # still lands on 13.1-capable boxes in practice. Revert to >=13.1 if it flakes.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
           # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within
           # the cap) — premium hosts have faster disks/network and better reliability; cheapest

From 7d8241c9f35b4615d5e224d0b7ddb6a11013c0f0 Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 17:37:16 -0300
Subject: [PATCH 09/10] comments

---
 .github/workflows/gpu-tests.yml |  8 +-------
 README.md                       | 10 ++++------
 crypto/math-cuda/build.rs       |  2 +-
 3 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index b2df4379d..00ecbdcf7 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -82,16 +82,10 @@ jobs:
           MIN_DRIVER: "580"
         run: |
           # cpu_ram filter is in GB.
-          # EXPERIMENT: cuda_max_good>=12.8 (was 13.1). The template's nvcc is 13.1, so its PTX
-          # JITs only on drivers with CUDA >= 13.1 — a 12.8/13.0 box will fail with
-          # CUDA_ERROR_UNSUPPORTED_PTX_VERSION. Testing whether the most-expensive selection
-          # still lands on 13.1-capable boxes in practice. Revert to >=13.1 if it flakes.
           QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
           echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
           # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within
-          # the cap) — premium hosts have faster disks/network and better reliability; cheapest
-          # boxes were flaky. `try ... catch 0` so a malformed/null driver_version on one offer
-          # is treated as 0 (filtered out) rather than erroring the whole jq.
+          # the cap) — premium hosts have faster disks/network and better reliability.
           SELECT="map(select((try (.driver_version|split(\".\")[0]|tonumber) catch 0) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse"
           OFFER_ID=""
           for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
diff --git a/README.md b/README.md
index 820a97857..e07967037 100644
--- a/README.md
+++ b/README.md
@@ -233,12 +233,10 @@ The CUDA test groups run only on a machine with an NVIDIA GPU and `nvcc`:
 - `make test-prover-cuda` — the prover/stark/crypto/ecsm suite with the GPU path enabled
 - `make test-prover-comprehensive-cuda` — the comprehensive all-instructions prove on the GPU path
 
-**Requirement: an NVIDIA driver supporting CUDA ≥ 13.1.** The kernels are compiled with the
-toolkit's `nvcc` (currently CUDA 13.1) into PTX that the driver JIT-compiles at load; a driver
-older than the toolkit rejects it with `CUDA_ERROR_UNSUPPORTED_PTX_VERSION`. Keep the driver/CUDA
-floor in step with the installed toolkit (e.g. the `cuda_max_good>=13.1` filter in
-`.github/workflows/gpu-tests.yml`). These groups run automatically on a rented GPU in the merge
-queue via that workflow.
+The kernels are compiled by `nvcc` into PTX that the driver JIT-compiles at load, so the GPU's
+driver must be new enough for the toolkit — an older driver rejects the PTX with
+`CUDA_ERROR_UNSUPPORTED_PTX_VERSION`. These groups run automatically on a rented GPU in the merge
+queue via `.github/workflows/gpu-tests.yml` (which filters offers on `cuda_max_good`).
 
 ## Benchmarking & Profiling
 
diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs
index 6888d2a72..73cc10d3a 100644
--- a/crypto/math-cuda/build.rs
+++ b/crypto/math-cuda/build.rs
@@ -76,7 +76,7 @@ fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) {
     // NOTE: this `-arch` only sets the *virtual arch*, not the PTX ISA version, which is
     // fixed by this nvcc's CUDA toolkit. The runtime driver must support that toolkit's CUDA
     // version or it rejects the PTX with CUDA_ERROR_UNSUPPORTED_PTX_VERSION — i.e. the box's
-    // driver CUDA must be >= the build toolkit's CUDA (currently 13.1). See README "GPU Tests".
+    // driver CUDA must be >= the build toolkit's CUDA. See README "GPU Tests".
     let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| detect_arch());
 
     let status = Command::new(nvcc_path())

From 764aadeab5027f85163e6d448c3388862d57259d Mon Sep 17 00:00:00 2001
From: Julian Arce <52429267+JuArce@users.noreply.github.com>
Date: Tue, 30 Jun 2026 17:59:28 -0300
Subject: [PATCH 10/10] apply code review

---
 .github/workflows/gpu-tests.yml | 3 ++-
 scripts/gpu_test.sh             | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
index 00ecbdcf7..984f6c35f 100644
--- a/.github/workflows/gpu-tests.yml
+++ b/.github/workflows/gpu-tests.yml
@@ -228,10 +228,11 @@ jobs:
             ''|*[!A-Za-z0-9._/-]*) echo "::error::invalid ref: '$REF'"; exit 1 ;;
           esac
           # Check out the ref under test on the box, then run the CUDA test groups.
+          # gpu_test.sh owns the CUDARC_PIN / SYSROOT_DIR defaults — don't duplicate them here.
           REMOTE="set -e; cd /workspace/lambda_vm; \
             git fetch --force origin '$REF'; \
             git checkout -f FETCH_HEAD; \
-            CUDARC_PIN=cuda-12080 SYSROOT_DIR=/opt/lambda-vm-sysroot bash scripts/gpu_test.sh"
+            bash scripts/gpu_test.sh"
 
           # pipefail so a test failure on the box propagates through the tee pipe and FAILS this
           # step (which fails the job and blocks the merge), instead of being masked by tee.
diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh
index e6c974986..942a09620 100755
--- a/scripts/gpu_test.sh
+++ b/scripts/gpu_test.sh
@@ -9,7 +9,7 @@
 #   4. prover/stark/crypto/ecsm suite   (make test-prover-cuda) — CPU CI's prover tests on GPU
 #   5. comprehensive all-instructions   (make test-prover-comprehensive-cuda)
 #
-# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All three groups
+# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All groups
 # run even if one fails (so the log shows every failure); the script exits non-zero if ANY
 # group failed, which fails the workflow job and blocks the merge.
 #
@@ -38,8 +38,9 @@ nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader
 # --- Pin cudarc so it binds a fixed driver-symbol set --------------------------
 # crypto/math-cuda/Cargo.toml uses `cuda-version-from-build-system` + `fallback-latest`;
 # when detection falls back to "latest", cudarc requests symbols some boxes' driver doesn't
-# export (e.g. cuDevSmResourceSplit / cuCtxGetDevice_v2) -> runtime panic. Pinning to a fixed
-# CUDA version (12.8, matching the cuda_max_good>=12.8 offer floor) avoids that.
+# export (e.g. cuDevSmResourceSplit / cuCtxGetDevice_v2) -> runtime panic. Pinning to a fixed,
+# conservative CUDA version binds a known driver-symbol set instead. (This is cudarc's
+# host-side driver-API floor — independent of the PTX/driver version the offer filter targets.)
 log "pinning cudarc to $CUDARC_PIN"
 sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \
     crypto/math-cuda/Cargo.toml