diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
new file mode 100644
index 000000000..984f6c35f
--- /dev/null
+++ b/.github/workflows/gpu-tests.yml
@@ -0,0 +1,325 @@
+name: GPU Tests (merge queue)
+
+# Run the GPU test suite (which CPU CI can't, since GitHub runners have no GPU) on a rented
+# Vast.ai RTX 5090 when a PR is in the merge queue, and block the merge if it fails.
+# Groups (see scripts/gpu_test.sh): math-cuda kernel parity, cuda_path_integration (GPU proof
+# verifies), cuda_fallback (CPU fallback verifies), the prover/stark/crypto/ecsm suite on the
+# GPU path, and the comprehensive all-instructions prove. Orchestration runs on a GitHub-hosted
+# runner; all GPU work happens on the rented box (provisioned by the template onstart). The box
+# is ALWAYS destroyed at the end.
+#
+# Triggered on `merge_group` (one rental per merge, not per push) + `workflow_dispatch` for
+# manual runs. To gate merges, add the job name `gpu-tests` to the branch-protection required
+# status checks for `main` (GitHub UI).
+#
+# Requires repo secrets:
+#   VAST_API_KEY        — https://cloud.vast.ai/manage-keys/
+#   VAST_TEMPLATE_HASH  — hash of the "NVIDIA CUDA Lambda VM 64GB" template
+
+on:
+  merge_group:  # one run (and one rental) per merge-queue entry
+  workflow_dispatch:  # manual runs against a chosen branch
+  # TEMP(testing): push trigger so this workflow can be exercised from its own branch. REMOVE before merging.
+  push:
+    branches: [ci_run_tests_gpu]
+
+permissions:
+  contents: read  # read-only token; no step below writes to the repository
+
+concurrency:
+  group: gpu-tests-${{ github.ref }}  # one active run per ref
+  cancel-in-progress: true  # a newer run for the same ref cancels the older one
+
+env:
+  # Vast offer search: RTX 5090, >=16 cores, >=96GB RAM (the prover suite proves real ELFs,
+  # so allow headroom), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap.
+  GPU_NAME: RTX_5090
+  PRICE_CAP: "1"  # USD per hour ceiling; applied as dph_total<=PRICE_CAP in the offer query
+  VAST_IMAGE_DISK: "64"  # GB of disk requested via --disk when the instance is created
+  # Unique per-run label set on the instance, for identification + leak-proof teardown.
+  RUN_LABEL: "gpu-tests-${{ github.run_id }}-${{ github.run_attempt }}"
+  # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit
+  # hash can't) — avoids pulling untrusted code at run time.
+  VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874"
+
+jobs:
+  gpu-tests:  # this job name is what branch protection lists as the required status check
+    runs-on: ubuntu-latest  # orchestration only; the GPU work runs on the rented box over SSH
+    # Provisioning + cuda builds + 5 test groups; the prover suite (single-threaded, real
+    # ELF proves) dominates. Generous ceiling; teardown still always destroys the box.
+    timeout-minutes: 240
+    steps:
+      - name: Install Vast CLI
+        # Deliberately has no `env:` block: the API key is only introduced in the next step, so
+        # code executed by pip at install time cannot read it. The CLI is installed from the
+        # immutable commit in the workflow-level VAST_CLI_COMMIT. --break-system-packages overrides
+        # PEP 668 "externally managed" protection, which is acceptable on a disposable runner.
+        run: pip install --quiet --break-system-packages "git+https://github.com/vast-ai/vast-cli.git@${VAST_CLI_COMMIT}"
+
+      - name: Authenticate Vast CLI  # later steps call vastai without VAST_API_KEY, so this presumably persists the key
+        env:
+          VAST_API_KEY: ${{ secrets.VAST_API_KEY }}  # the secret is placed in this step's env only
+        run: vastai set api-key "$VAST_API_KEY"  # read from env rather than interpolated into the script text
+
+      - name: Generate ephemeral SSH key
+        id: sshkey  # publishes outputs.key_path (path of the private key) for later steps
+        run: |
+          key_file="$HOME/.ssh/vast_gpu_tests"
+          mkdir -p "$(dirname "$key_file")"
+          ssh-keygen -t ed25519 -N "" -C "gh-actions-gpu-tests-${GITHUB_RUN_ID}" -f "$key_file" >/dev/null
+          printf 'key_path=%s\n' "$key_file" >> "$GITHUB_OUTPUT"
+
+      - name: Pick a Vast offer
+        id: offer
+        env:
+          # Retry the same query to ride out transient scarcity (RTX 5090s are a small,
+          # fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
+          OFFER_ATTEMPTS: "10"
+          OFFER_INTERVAL: "30"
+          # Require driver >= this major so cudarc matches the runtime driver (older drivers
+          # lack newer symbols and the GPU path falls back to CPU). Filtered client-side in jq
+          # because vast can't numerically compare the driver_version string server-side.
+          MIN_DRIVER: "580"
+        run: |
+          # cpu_ram is in GB. The disk floor tracks VAST_IMAGE_DISK so the search can't drift from `--disk`.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=${VAST_IMAGE_DISK} verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
+          # Keep only offers whose driver major >= MIN_DRIVER, most expensive first (within the cap):
+          # premium hosts have faster disks/network and better reliability.
+          SELECT="map(select((try (.driver_version|split(\".\")[0]|tonumber) catch 0) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse"
+          OFFER_ID=""; OFFER_PRICE=""
+          for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
+            vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
+            OFFER_ID=$(jq -r "$SELECT | .[0].id // empty" offers.json) || true  # non-JSON reply => "no offer", retried (not a bash -e abort)
+            OFFER_PRICE=$(jq -r "$SELECT | .[0].dph_total // empty" offers.json) || true
+            if [ -n "$OFFER_ID" ]; then
+              echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"
+              break
+            fi
+            echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..."
+            sleep "$OFFER_INTERVAL"
+          done
+          if [ -z "$OFFER_ID" ]; then
+            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=${VAST_IMAGE_DISK}GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
+            exit 1
+          fi
+          echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
+          echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"
+
+      - name: Create instance
+        id: instance
+        env:
+          VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }}
+          OFFER_ID: ${{ steps.offer.outputs.id }}
+        run: |
+          # Rent the chosen offer from the template, tagged with this run's unique label.
+          vastai create instance "$OFFER_ID" --template_hash "$VAST_TEMPLATE_HASH" \
+            --disk "$VAST_IMAGE_DISK" --label "$RUN_LABEL" \
+            --ssh --direct --raw > create.json
+          # Echo just the two fields of interest; the raw response may hold something sensitive.
+          jq '{success, new_contract: (.new_contract // .instances.new_contract)}' create.json
+          # The contract id is read from the top level or, failing that, from under .instances.
+          contract=$(jq -r '.new_contract // .instances.new_contract // empty' create.json)
+          if [ -z "$contract" ]; then
+            echo "::error::Failed to create Vast instance"
+            exit 1
+          fi
+          # Record the id on disk first, so the teardown step finds it even if a later step fails.
+          printf '%s\n' "$contract" > "$RUNNER_TEMP/vast_instance_id"
+          echo "id=$contract" >> "$GITHUB_OUTPUT"
+          echo "Created instance $contract (label $RUN_LABEL)"
+
+      - name: Attach SSH key to instance
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          # The public half of the per-run key goes into this one instance's authorized_keys, so
+          # it disappears with the instance and nothing account-wide needs cleaning up.
+          # Attaching right after create can be refused, hence up to 12 tries, 10s apart.
+          pubkey="$(cat "$KEY.pub")"
+          n=0
+          until [ "$n" -ge 12 ]; do
+            n=$((n + 1))
+            vastai attach ssh "$IID" "$pubkey" && { echo "Attached ssh key (attempt $n)"; exit 0; }
+            echo "attach failed (attempt $n/12); retrying in 10s..."
+            sleep 10
+          done
+          echo "::error::Failed to attach ssh key to instance $IID"
+          exit 1
+
+      - name: Wait for SSH
+        id: ssh
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          echo "Waiting for instance $IID to reach 'running' with SSH endpoint..."
+          STATUS=""; HOST=""; PORT=""
+          # The base CUDA image is large; some hosts sit in 'loading' (image pull) a while.
+          for _ in $(seq 1 180); do   # ~30 min
+            vastai show instance "$IID" --raw > inst.json || true
+            # `|| true` on each jq: a reply that is not JSON must lead to another poll, not a `bash -e` abort.
+            STATUS=$(jq -r '.actual_status // empty' inst.json) || true
+            # --direct: SSH to the public IP + the host port mapped to container port 22 (the .ssh_host/.ssh_port proxy fields are unreliable).
+            HOST=$(jq -r '.public_ipaddr // empty' inst.json) || true
+            PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json) || true
+            echo "  status=$STATUS ssh=$HOST:$PORT"
+            if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then
+              break
+            fi
+            sleep 10
+          done
+          if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then
+            echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)"
+            exit 1
+          fi
+          echo "host=$HOST" >> "$GITHUB_OUTPUT"
+          echo "port=$PORT" >> "$GITHUB_OUTPUT"
+
+          # Wait for sshd to accept our key (path passed via env KEY, as in the other SSH steps).
+          for _ in $(seq 1 30); do
+            if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i "$KEY" -p "$PORT" "root@$HOST" true 2>/dev/null; then
+              echo "sshd reachable"; exit 0
+            fi
+            sleep 10
+          done
+          echo "::error::sshd did not accept connections in time"
+          exit 1
+
+      - name: Wait for onstart provisioning
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..."
+          # Primary signal: the bootstrap's last stdout line "=== done ===" in /var/log/onstart.log (Vast's
+          # onstart capture). Fallback: the artifacts it leaves. NOTE(review): confirm those only exist once it is done.
+          for _ in $(seq 1 120); do   # ~20 min
+            if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then
+              echo "onstart reported done"; exit 0
+            fi
+            # shellcheck disable=SC2016  # $HOME must expand on the remote box, not the runner
+            if $SSH 'test -x "$HOME/.cargo/bin/cargo" \
+                  && test -f /opt/lambda-vm-sysroot/include/stdlib.h \
+                  && test -d /workspace/lambda_vm/.git'; then
+              echo "provisioning artifacts present"; exit 0
+            fi
+            sleep 10
+          done
+          echo "::error::onstart provisioning did not complete in time"
+          exit 1
+
+      - name: Run GPU tests
+        id: tests
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+          # merge_group: refs/heads/gh-readonly-queue/main/pr-… (the merge commit = PR + main), so we test
+          # exactly what will land. workflow_dispatch / the TEMP push trigger: the branch ref itself.
+          REF: ${{ github.ref }}
+        run: |
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          # Defense-in-depth: reject an empty ref, or one with any character outside [A-Za-z0-9._/-], before it reaches the remote `bash -lc`.
+          case "$REF" in
+            ''|*[!A-Za-z0-9._/-]*) echo "::error::invalid ref: '$REF'"; exit 1 ;;
+          esac
+          # Check out the ref under test on the box, then run the CUDA test groups.
+          # gpu_test.sh owns the CUDARC_PIN / SYSROOT_DIR defaults — don't duplicate them here.
+          REMOTE="set -e; cd /workspace/lambda_vm; \
+            git fetch --force origin '$REF'; \
+            git checkout -f FETCH_HEAD; \
+            bash scripts/gpu_test.sh"
+
+          # pipefail so a test failure on the box propagates through the tee pipe and FAILS this
+          # step (which fails the job and blocks the merge), instead of being masked by tee.
+          # 2>&1 so remote stderr (build errors, panics) is captured too — both into the live
+          # step log and the file the run-summary step tails.
+          set -o pipefail
+          $SSH "bash -lc \"$REMOTE\"" 2>&1 | tee "$RUNNER_TEMP/gpu_test_out.txt"
+
+      - name: Write run summary
+        if: always() && (steps.tests.outcome == 'success' || steps.tests.outcome == 'failure')  # i.e. only when the tests step actually ran
+        env:
+          OUTCOME: ${{ steps.tests.outcome }}
+        run: |
+          OUT="$RUNNER_TEMP/gpu_test_out.txt"
+          {
+            echo "## GPU tests (CUDA suite) — ${OUTCOME}"
+            if [ "$OUTCOME" = "success" ]; then
+              echo "All GPU test groups passed."
+            else
+              # Attribute each failed test to the make target that ran it: gpu_test.sh prints a
+              # "=== make <target> ===" banner per group, and cargo prints "test <name> ... FAILED".
+              report=$(awk '
+                /^=== make / { grp=$3; next }
+                / \.\.\. FAILED/ { fails[grp]=fails[grp] "\n    - " $2; n[grp]++ }
+                END { for (g in fails) printf "- **%s** (%d failed):%s\n", g, n[g], fails[g] }
+              ' "$OUT" 2>/dev/null || true)
+              # Panic/assertion text per test: every "thread '…' panicked at …" line plus the message
+              # lines after it (assertion, left/right), up to 14 lines per block before truncating.
+              details=$(awk '
+                /^thread .* panicked at / { cap=1; lines=0; buf=$0; next }
+                cap {
+                  if ($0 ~ /^note: run with/ || $0 ~ /^----/ || $0 ~ /^test / || $0 ~ /^=== / || $0 ~ /^[[:space:]]*$/) { printf "%s\n\n", buf; cap=0; next }
+                  if (lines < 14) { buf=buf "\n" $0; lines++ } else if (lines==14) { buf=buf "\n    ...(truncated)"; lines++ }
+                }
+                END { if (cap) printf "%s\n", buf }
+              ' "$OUT" 2>/dev/null || true)
+              if [ -n "$report" ]; then
+                echo; echo "### Failed tests by group"; echo "$report"
+                if [ -n "$details" ]; then
+                  echo; echo "### Failure details"; echo '```'; echo "$details"; echo '```'
+                fi
+              else
+                # Nothing matched "... FAILED" (likely a build/infra error): fall back to the
+                # "::error::GPU test group failed:" markers from gpu_test.sh plus the last 40 log lines.
+                grps=$(grep -F '::error::GPU test group failed:' "$OUT" 2>/dev/null | sed 's/.*failed: /- /' | sort -u || true)
+                [ -n "$grps" ] && { echo; echo "### Failed groups"; echo "$grps"; }
+                echo; echo "No individual test failures parsed (build/infra error?). Last lines:"
+                echo '```'; tail -n 40 "$OUT" 2>/dev/null || echo "(no output captured)"; echo '```'
+              fi
+              echo; echo "<sub>Full output is in the \"Run GPU tests\" step log.</sub>"
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      # --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
+      - name: Destroy instance
+        if: always()
+        run: |
+          # Retry transient failures (network/auth) so a paid box isn't stranded; --yes skips the interactive
+          # [y/N] confirm (CI has no tty). Note: after 3 failed attempts destroy() only warns and still returns 0.
+          destroy() {
+            iid="$1"; destroyed=""
+            for attempt in 1 2 3; do
+              if vastai destroy instance "$iid" --yes; then destroyed=1; break; fi
+              echo "destroy attempt $attempt failed; retrying in 10s..."
+              sleep 10
+            done
+            [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $iid after 3 attempts — check the Vast console (label $RUN_LABEL)"
+          }
+          if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
+            IID=$(cat "$RUNNER_TEMP/vast_instance_id")
+            echo "Destroying instance $IID"
+            destroy "$IID"
+          else
+            # The id file is written only AFTER create succeeds AND its JSON parses, so a box can
+            # exist unrecorded if the run was cancelled in that window or the parse failed. Fall
+            # back to destroying every instance carrying our unique RUN_LABEL so it can't leak (bill indefinitely).
+            echo "No instance id recorded; searching Vast for any box labelled $RUN_LABEL..."
+            vastai show instances --raw > all_inst.json 2>/dev/null || echo '[]' > all_inst.json
+            LEAKED=$(jq -r --arg L "$RUN_LABEL" \
+              '(if type=="array" then . else (.instances // []) end) | .[] | select(.label == $L) | .id' \
+              all_inst.json 2>/dev/null || true)
+            if [ -z "$LEAKED" ]; then
+              echo "No instance labelled $RUN_LABEL found; nothing to destroy."
+            else
+              for IID in $LEAKED; do
+                echo "Destroying leaked instance $IID (label $RUN_LABEL)"
+                destroy "$IID"
+              done
+            fi
+          fi
diff --git a/Makefile b/Makefile
index 454eff098..6592eee97 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,8 @@
 compile-programs compile-recursion-elfs clean-asm clean-rust clean-bench clean-shared \
 clean-recursion-elfs clean test test-asm \
 test-rust test-executor test-flamegraph flamegraph-prover \
-test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \
+test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
+test-prover-cuda test-prover-comprehensive-cuda \
 bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
 update-ethrex-fixture-checksums check-ethrex-fixture-checksums
 
@@ -284,6 +285,26 @@ test-cuda-integration:
 	cargo test -p lambda-vm-prover --release --features cuda \
 	    --test cuda_path_integration -- --ignored --nocapture
 
+# GPU error-path coverage (requires NVIDIA GPU + nvcc): forces cuda dispatch errors and asserts
+# the CPU fallback still produces a verifying proof. Runs only the ignored tests, one at a time.
+test-cuda-fallback:
+	cargo test -p lambda-vm-prover --release --features test-cuda-faults \
+	    --test cuda_fallback_tests -- --ignored --nocapture --test-threads=1
+
+# The prover/stark/crypto/ecsm test suite with the GPU (cuda) path enabled (requires NVIDIA
+# GPU + nvcc). The GPU CI counterpart of CPU CI's sharded prover tests. Single-threaded: the
+# GPU serializes proves and the dispatch counters are process-global. cuda on prover cascades
+# to stark; crypto/ecsm build without it (they have no GPU path). No --ignored: ignored tests are skipped here.
+test-prover-cuda:
+	cargo test --release -p lambda-vm-prover -p stark -p crypto -p ecsm \
+	    --features lambda-vm-prover/cuda -- --test-threads=1
+
+# The comprehensive all-instructions prove (ignored by default, hence --ignored) on the GPU path
+# (requires NVIDIA GPU + nvcc). GPU counterpart of CPU CI's merge-queue-only comprehensive job.
+test-prover-comprehensive-cuda:
+	cargo test --release -p lambda-vm-prover --features cuda \
+	    test_prove_elfs_all_instructions_64_full -- --ignored --test-threads=1 --nocapture
+
 # math-cuda quick microbench (median of 10 runs)
 bench-math-cuda:
 	cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture
diff --git a/README.md b/README.md
index 151934433..e07967037 100644
--- a/README.md
+++ b/README.md
@@ -185,7 +185,11 @@ See [`spec/README.md`](./spec/README.md) for full setup instructions.
 | `make test-asm` | Compile and run ASM tests |
 | `make test-rust` | Compile and run Rust tests |
 | `make test-executor` | Compile all programs and run executor tests |
-| `make test-math-cuda` | math-cuda parity tests (requires NVIDIA GPU + nvcc) |
+| `make test-math-cuda` | math-cuda GPU kernel parity tests (requires NVIDIA GPU + nvcc; see GPU Tests) |
+| `make test-cuda-integration` | End-to-end GPU dispatch + proof verification (requires NVIDIA GPU + nvcc) |
+| `make test-cuda-fallback` | GPU error-path / CPU-fallback tests (requires NVIDIA GPU + nvcc) |
+| `make test-prover-cuda` | Prover/stark/crypto/ecsm suite on the GPU path (requires NVIDIA GPU + nvcc) |
+| `make test-prover-comprehensive-cuda` | Comprehensive all-instructions prove on the GPU path (requires NVIDIA GPU + nvcc) |
 | `make build` | Build all workspace crates |
 | `make check` | Check all crates (faster than build, no codegen) |
 | `make clippy` | Run clippy on all crates |
@@ -219,6 +223,21 @@ You can run it with
 
 `make test-rust`
 
+### GPU Tests
+
+The CUDA test groups run only on a machine with an NVIDIA GPU and `nvcc`:
+
+- `make test-math-cuda` — GPU-vs-CPU kernel parity (NTT, LDE, barycentric, FRI, …)
+- `make test-cuda-integration` — proves a guest on GPU and checks every dispatch fired + the proof verifies
+- `make test-cuda-fallback` — forces GPU dispatch errors and checks the CPU fallback still verifies
+- `make test-prover-cuda` — the prover/stark/crypto/ecsm suite with the GPU path enabled
+- `make test-prover-comprehensive-cuda` — the comprehensive all-instructions prove on the GPU path
+
+The kernels are compiled by `nvcc` into PTX that the driver JIT-compiles at load, so the GPU's
+driver must be new enough for the toolkit — an older driver rejects the PTX with
+`CUDA_ERROR_UNSUPPORTED_PTX_VERSION`. These groups run automatically on a rented GPU in the merge
+queue via `.github/workflows/gpu-tests.yml` (which filters offers on `cuda_max_good` and on a minimum driver major version).
+
 ## Benchmarking & Profiling
 
 You can create a flamegraph for proof generation using the following target:
diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs
index b2f61f9a2..73cc10d3a 100644
--- a/crypto/math-cuda/build.rs
+++ b/crypto/math-cuda/build.rs
@@ -72,6 +72,11 @@ fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) {
     // compute capability. If unset, try `nvidia-smi` to match the host GPU
     // (avoids JIT failures like nvcc-13.0 PTX rejected on Blackwell drivers);
     // fall back to compute_89 (Ada) when detection fails.
+    //
+    // NOTE: this `-arch` only sets the *virtual arch*, not the PTX ISA version, which is
+    // fixed by this nvcc's CUDA toolkit. The runtime driver must support that toolkit's CUDA
+    // version or it rejects the PTX with CUDA_ERROR_UNSUPPORTED_PTX_VERSION — i.e. the box's
+    // driver CUDA must be >= the build toolkit's CUDA. See README "GPU Tests".
     let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| detect_arch());
 
     let status = Command::new(nvcc_path())
diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh
new file mode 100755
index 000000000..942a09620
--- /dev/null
+++ b/scripts/gpu_test.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+#
+# gpu_test.sh — run the CUDA-only test groups on a GPU box.
+#
+# Exercises the CUDA path, which CPU CI can't (GitHub runners have no GPU):
+#   1. math-cuda kernel parity         (make test-math-cuda)
+#   2. end-to-end GPU dispatch + proof  (make test-cuda-integration)
+#   3. GPU error-path / CPU fallback    (make test-cuda-fallback)
+#   4. prover/stark/crypto/ecsm suite   (make test-prover-cuda) — CPU CI's prover tests on GPU
+#   5. comprehensive all-instructions   (make test-prover-comprehensive-cuda)
+#
+# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All groups
+# run even if one fails (so the log shows every failure); the script exits non-zero if ANY
+# group failed, which fails the workflow job and blocks the merge.
+#
+# Env:
+#   CUDARC_PIN   cudarc CUDA-version feature to pin (default cuda-12080). See the sed below.
+#   SYSROOT_DIR  rv64 sysroot (default /opt/lambda-vm-sysroot, provisioned by the template).
+
+set -euo pipefail
+
+CUDARC_PIN="${CUDARC_PIN:-cuda-12080}"  # consumed only by the sed on crypto/math-cuda/Cargo.toml, so not exported
+export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}"
+
+log() { printf '\n=== %s ===\n' "$*"; }  # section banner; the workflow's run summary keys on "=== make <target> ==="
+
+# --- GPU toolchain sanity (fail loudly rather than silently falling back to CPU) ---
+log "GPU toolchain"
+if ! command -v nvcc >/dev/null 2>&1; then  # not on PATH: probe the usual CUDA install dirs, first hit wins
+    for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do
+        [ -x "$d/nvcc" ] && export PATH="$d:$PATH" && break
+    done
+fi
+command -v nvcc >/dev/null 2>&1 || { echo "ERROR: nvcc not found — CUDA toolkit missing" >&2; exit 1; }
+nvcc --version | tail -n 2
+nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader  # a failing nvidia-smi aborts the script (set -e)
+
+# --- Pin cudarc so it binds a fixed driver-symbol set --------------------------
+# crypto/math-cuda/Cargo.toml uses `cuda-version-from-build-system` + `fallback-latest`;
+# when detection falls back to "latest", cudarc requests symbols some boxes' driver doesn't
+# export (e.g. cuDevSmResourceSplit / cuCtxGetDevice_v2) -> runtime panic. Pinning to a fixed,
+# conservative CUDA version binds a known driver-symbol set instead. (This is cudarc's
+# host-side driver-API floor — independent of the PTX/driver version the offer filter targets.)
+log "pinning cudarc to $CUDARC_PIN"
+sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \
+    crypto/math-cuda/Cargo.toml
+
+# --- Build the guest ELFs the tests prove ---------------------------------------
+# math-cuda parity needs none; cuda_path_integration / cuda_fallback prove an asm ELF; the
+# prover suite (Groups 4 & 5) proves asm AND rust guests. Build both up front.
+log "compiling guest programs (asm + rust)"
+make compile-programs-asm  # under `set -e`, a failed build aborts here, before any test group runs
+make compile-programs-rust
+
+# --- Run the CUDA test groups via the Makefile targets --------------------------
+fail=0
+run() {  # $1 = make target
+    log "make $1"
+    if ! make "$1"; then
+        echo "::error::GPU test group failed: $1"
+        fail=1
+    fi
+}
+run test-math-cuda                  # Group 1: kernel parity
+run test-cuda-integration           # Group 2: end-to-end GPU dispatch + proof verifies
+run test-cuda-fallback              # Group 3: GPU error -> CPU fallback still verifies
+run test-prover-cuda                # Group 4: prover/stark/crypto/ecsm suite on the GPU path
+run test-prover-comprehensive-cuda  # Group 5: comprehensive all-instructions prove on GPU
+
+if [ "$fail" -ne 0 ]; then
+    log "FAILED — one or more GPU test groups failed"
+    exit 1
+fi
+log "all GPU test groups passed"