From d95e3c85bdca62fe654d3649f8b9d507e322c1f3 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 12:06:01 -0300 Subject: [PATCH 01/10] ci: run tests on GPU server --- .github/workflows/gpu-tests.yml | 300 ++++++++++++++++++++++++++++++++ Makefile | 8 +- scripts/gpu_test.sh | 67 +++++++ 3 files changed, 374 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/gpu-tests.yml create mode 100755 scripts/gpu_test.sh diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml new file mode 100644 index 000000000..a05de8f58 --- /dev/null +++ b/.github/workflows/gpu-tests.yml @@ -0,0 +1,300 @@ +name: GPU Tests (merge queue) + +# Run the CUDA-only test groups (which CPU CI can't, since GitHub runners have no GPU) on a +# rented Vast.ai RTX 5090 when a PR is in the merge queue, and block the merge if they fail. +# Groups (see scripts/gpu_test.sh): math-cuda kernel parity, cuda_path_integration (GPU proof +# verifies), cuda_fallback (CPU fallback verifies). Orchestration runs on a GitHub-hosted +# runner; all GPU work happens on the rented box (provisioned by the template onstart). The +# box is ALWAYS destroyed at the end. +# +# Triggered on `merge_group` (one rental per merge, not per push) + `workflow_dispatch` for +# manual runs. To gate merges, add the job name `gpu-tests` to the branch-protection required +# status checks for `main` (GitHub UI). +# +# Requires repo secrets: +# VAST_API_KEY — https://cloud.vast.ai/manage-keys/ +# VAST_TEMPLATE_HASH — hash of the "NVIDIA CUDA Lambda VM 64GB" template + +on: + merge_group: + workflow_dispatch: + # TEMP(testing): run on pushes to this branch pre-merge (no merge queue needed to test the + # rent -> test -> destroy path). REMOVE before merging. 
+ push: + branches: [ci_run_tests_gpu] + +permissions: + contents: read + +concurrency: + group: gpu-tests-${{ github.ref }} + cancel-in-progress: true + +env: + # Vast offer search: RTX 5090, >=16 cores, >=32GB RAM (workloads are small), >=64GB disk, + # verified + rentable, Blackwell-capable driver, <= cap. + GPU_NAME: RTX_5090 + PRICE_CAP: "1" + VAST_IMAGE_DISK: "64" + # Unique per-run label set on the instance, for identification + leak-proof teardown. + RUN_LABEL: "gpu-tests-${{ github.run_id }}-${{ github.run_attempt }}" + # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit + # hash can't) — avoids pulling untrusted code at run time. + VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874" + +jobs: + gpu-tests: + runs-on: ubuntu-latest + # Provisioning + dual-feature cuda build (~25 min) + the three test groups (~15 min) + + # image-pull slack. Generous ceiling; teardown still always destroys the box. + timeout-minutes: 120 + steps: + - name: Install Vast CLI + # No secrets in this step's env: install-time code can't read the API key during pip + # install. Pinned to an immutable commit (see VAST_CLI_COMMIT) for the same reason. + # --break-system-packages: the ephemeral runner's Python may be PEP-668 "externally + # managed"; safe to override on a disposable runner. + run: pip install --quiet --break-system-packages "git+https://github.com/vast-ai/vast-cli.git@${VAST_CLI_COMMIT}" + + - name: Authenticate Vast CLI + env: + VAST_API_KEY: ${{ secrets.VAST_API_KEY }} + run: vastai set api-key "$VAST_API_KEY" + + - name: Generate ephemeral SSH key + id: sshkey + run: | + mkdir -p "$HOME/.ssh" + KEY="$HOME/.ssh/vast_gpu_tests" + ssh-keygen -t ed25519 -N "" -f "$KEY" -C "gh-actions-gpu-tests-${GITHUB_RUN_ID}" >/dev/null + echo "key_path=$KEY" >> "$GITHUB_OUTPUT" + + - name: Pick a Vast offer + id: offer + env: + # Retry the same query to ride out transient scarcity (RTX 5090s are a small, + # fast-churning pool). 
Total wait ~= ATTEMPTS * INTERVAL. + OFFER_ATTEMPTS: "10" + OFFER_INTERVAL: "30" + # Require driver >= this major so cudarc matches the runtime driver (older drivers + # lack newer symbols and the GPU path falls back to CPU). Filtered client-side in jq + # because vast can't numerically compare the driver_version string server-side. + MIN_DRIVER: "580" + run: | + # cpu_ram filter is in GB. + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" + echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)" + # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within + # the cap) — premium hosts have faster disks/network and better reliability; cheapest + # boxes were flaky. `try ... catch 0` so a malformed/null driver_version on one offer + # is treated as 0 (filtered out) rather than erroring the whole jq. + SELECT="map(select((try (.driver_version|split(\".\")[0]|tonumber) catch 0) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse" + OFFER_ID="" + for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do + vastai search offers "$QUERY" --raw -o dph_total > offers.json || true + OFFER_ID=$(jq -r "$SELECT | .[0].id // empty" offers.json || true) # tolerate non-JSON output: retry instead of aborting (bash -e) + OFFER_PRICE=$(jq -r "$SELECT | .[0].dph_total // empty" offers.json || true) + if [ -n "$OFFER_ID" ]; then + echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)" + break + fi + echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..." 
+ sleep "$OFFER_INTERVAL" + done + if [ -z "$OFFER_ID" ]; then + echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=32GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)" + exit 1 + fi + echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT" + echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT" + + - name: Create instance + id: instance + env: + VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }} + OFFER_ID: ${{ steps.offer.outputs.id }} + run: | + vastai create instance "$OFFER_ID" \ + --template_hash "$VAST_TEMPLATE_HASH" \ + --disk "$VAST_IMAGE_DISK" \ + --label "$RUN_LABEL" \ + --ssh --direct --raw > create.json + # Log only the fields we need (the full --raw response could carry a sensitive field). + jq '{success, new_contract: (.new_contract // .instances.new_contract)}' create.json + IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json) + if [ -z "$IID" ]; then + echo "::error::Failed to create Vast instance" + exit 1 + fi + # Persist immediately so teardown runs even if later steps fail. + echo "$IID" > "$RUNNER_TEMP/vast_instance_id" + echo "id=$IID" >> "$GITHUB_OUTPUT" + echo "Created instance $IID (label $RUN_LABEL)" + + - name: Attach SSH key to instance + env: + IID: ${{ steps.instance.outputs.id }} + KEY: ${{ steps.sshkey.outputs.key_path }} + run: | + # Attach the ephemeral pubkey to THIS instance only (added to its authorized_keys); + # removed when the instance is destroyed, so no account-level key to clean up. + # Retry: the instance may not accept the attach immediately after create. + PUB="$(cat "$KEY.pub")" + for attempt in $(seq 1 12); do + if vastai attach ssh "$IID" "$PUB"; then + echo "Attached ssh key (attempt $attempt)"; exit 0 + fi + echo "attach failed (attempt $attempt/12); retrying in 10s..." 
+ sleep 10 + done + echo "::error::Failed to attach ssh key to instance $IID" + exit 1 + + - name: Wait for SSH + id: ssh + env: + IID: ${{ steps.instance.outputs.id }} + run: | + echo "Waiting for instance $IID to reach 'running' with SSH endpoint..." + HOST=""; PORT="" + # The base CUDA image is large; some hosts sit in 'loading' (image pull) a while. + for _ in $(seq 1 180); do # ~30 min + vastai show instance "$IID" --raw > inst.json || true + STATUS=$(jq -r '.actual_status // empty' inst.json || true) # tolerate non-JSON output: keep polling instead of aborting (bash -e) + # We create with --direct, so SSH straight to the public IP + the host port mapped + # to container port 22 (the .ssh_host/.ssh_port proxy fields are unreliable). + HOST=$(jq -r '.public_ipaddr // empty' inst.json || true) + PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json || true) + echo " status=$STATUS ssh=$HOST:$PORT" + if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then + break + fi + sleep 10 + done + if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then + echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)" + exit 1 + fi + echo "host=$HOST" >> "$GITHUB_OUTPUT" + echo "port=$PORT" >> "$GITHUB_OUTPUT" + + # Wait for sshd to accept our key. + for _ in $(seq 1 30); do + if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \ + -i "${{ steps.sshkey.outputs.key_path }}" -p "$PORT" "root@$HOST" true 2>/dev/null; then + echo "sshd reachable"; exit 0 + fi + sleep 10 + done + echo "::error::sshd did not accept connections in time" + exit 1 + + - name: Wait for onstart provisioning + env: + HOST: ${{ steps.ssh.outputs.host }} + PORT: ${{ steps.ssh.outputs.port }} + KEY: ${{ steps.sshkey.outputs.key_path }} + run: | + SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST" + echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..." + # The bootstrap's final stdout line is "=== done ===". 
Vast captures onstart output to + # /var/log/onstart.log; fall back to checking the artifacts it leaves. + for _ in $(seq 1 120); do # ~20 min + if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then + echo "onstart reported done"; exit 0 + fi + # shellcheck disable=SC2016 # $HOME must expand on the remote box, not the runner + if $SSH 'test -x "$HOME/.cargo/bin/cargo" \ + && test -f /opt/lambda-vm-sysroot/include/stdlib.h \ + && test -d /workspace/lambda_vm/.git'; then + echo "provisioning artifacts present"; exit 0 + fi + sleep 10 + done + echo "::error::onstart provisioning did not complete in time" + exit 1 + + - name: Run GPU tests + id: tests + env: + HOST: ${{ steps.ssh.outputs.host }} + PORT: ${{ steps.ssh.outputs.port }} + KEY: ${{ steps.sshkey.outputs.key_path }} + # merge_group: refs/heads/gh-readonly-queue/main/pr-… (the merge commit = PR + main), + # so we test exactly what will land. workflow_dispatch: the chosen branch ref. + REF: ${{ github.ref }} + run: | + SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST" + # Defense-in-depth: never interpolate an unvalidated ref into the remote `bash -lc`. + case "$REF" in + ''|*[!A-Za-z0-9._/-]*) echo "::error::invalid ref: '$REF'"; exit 1 ;; + esac + # Check out the ref under test on the box, then run the CUDA test groups. + REMOTE="set -e; cd /workspace/lambda_vm; \ + git fetch --force origin '$REF'; \ + git checkout -f FETCH_HEAD; \ + CUDARC_PIN=cuda-12080 SYSROOT_DIR=/opt/lambda-vm-sysroot bash scripts/gpu_test.sh" + + # pipefail so a test failure on the box propagates through the tee pipe and FAILS this + # step (which fails the job and blocks the merge), instead of being masked by tee. + # 2>&1 so remote stderr (build errors, panics) is captured too — both into the live + # step log and the file the run-summary step tails. 
+ set -o pipefail + $SSH "bash -lc \"$REMOTE\"" 2>&1 | tee "$RUNNER_TEMP/gpu_test_out.txt" + + - name: Write run summary + if: always() && (steps.tests.outcome == 'success' || steps.tests.outcome == 'failure') + env: + OUTCOME: ${{ steps.tests.outcome }} + run: | + { + echo "## GPU tests (CUDA-only suite)" + echo "Outcome: **${OUTCOME}**" + # On failure, surface the failing-group markers explicitly, then the log tail. + if [ "$OUTCOME" != "success" ]; then + FAILED=$(grep -F '::error::GPU test group failed:' "$RUNNER_TEMP/gpu_test_out.txt" 2>/dev/null \ + | sed 's/.*failed: /- /' || true) + [ -n "$FAILED" ] && { echo; echo "Failed groups:"; echo "$FAILED"; } + fi + echo '```' + tail -n 80 "$RUNNER_TEMP/gpu_test_out.txt" 2>/dev/null || echo "(no output captured)" + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + + # --- Teardown: ALWAYS destroy the instance (cost guardrail) --- + - name: Destroy instance + if: always() + run: | + # Retry transient failures (network/auth) so a paid box isn't stranded. + # --yes: skip the interactive [y/N] confirm (CI has no tty). + destroy() { + iid="$1"; destroyed="" + for attempt in 1 2 3; do + if vastai destroy instance "$iid" --yes; then destroyed=1; break; fi + echo "destroy attempt $attempt failed; retrying in 10s..." + sleep 10 + done + [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $iid after 3 attempts — check the Vast console (label $RUN_LABEL)" + } + if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then + IID=$(cat "$RUNNER_TEMP/vast_instance_id") + echo "Destroying instance $IID" + destroy "$IID" + else + # The id file is written only AFTER create succeeds AND its JSON parses, so a box can + # exist unrecorded if the run was cancelled in that window or the parse failed. Fall + # back to destroying by our unique RUN_LABEL so the box can't leak (bill indefinitely). + echo "No instance id recorded; searching Vast for any box labelled $RUN_LABEL..." 
+ vastai show instances --raw > all_inst.json 2>/dev/null || echo '[]' > all_inst.json + LEAKED=$(jq -r --arg L "$RUN_LABEL" \ + '(if type=="array" then . else (.instances // []) end) | .[] | select(.label == $L) | .id' \ + all_inst.json 2>/dev/null || true) + if [ -z "$LEAKED" ]; then + echo "No instance labelled $RUN_LABEL found; nothing to destroy." + else + for IID in $LEAKED; do + echo "Destroying leaked instance $IID (label $RUN_LABEL)" + destroy "$IID" + done + fi + fi diff --git a/Makefile b/Makefile index 81bc03a8c..32a735c79 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: deps deps-linux deps-macos compile-programs-asm compile-programs-rust compile-bench \ compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm \ test-rust test-executor test-flamegraph flamegraph-prover \ -test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \ +test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \ bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \ update-ethrex-fixture-checksums check-ethrex-fixture-checksums @@ -248,6 +248,12 @@ test-cuda-integration: cargo test -p lambda-vm-prover --release --features cuda \ --test cuda_path_integration -- --ignored --nocapture +# GPU error-path coverage (requires NVIDIA GPU + nvcc). +# Forces cuda dispatch errors and asserts the CPU fallback still produces a verifying proof. 
+test-cuda-fallback: + cargo test -p lambda-vm-prover --release --features test-cuda-faults \ + --test cuda_fallback_tests -- --ignored --nocapture + # math-cuda quick microbench (median of 10 runs) bench-math-cuda: cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh new file mode 100755 index 000000000..e3608d81a --- /dev/null +++ b/scripts/gpu_test.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# +# gpu_test.sh — run the CUDA-only test groups on a GPU box. +# +# These groups can't run in CPU CI (GitHub runners have no GPU): +# 1. math-cuda kernel parity (make test-math-cuda) +# 2. end-to-end GPU dispatch + proof (make test-cuda-integration) +# 3. GPU error-path / CPU fallback (make test-cuda-fallback) +# +# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All three groups +# run even if one fails (so the log shows every failure); the script exits non-zero if ANY +# group failed, which fails the workflow job and blocks the merge. +# +# Env: +# CUDARC_PIN cudarc CUDA-version feature to pin (default cuda-12080). See the sed below. +# SYSROOT_DIR rv64 sysroot (default /opt/lambda-vm-sysroot, provisioned by the template). + +set -euo pipefail + +CUDARC_PIN="${CUDARC_PIN:-cuda-12080}" +export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}" + +log() { printf '\n=== %s ===\n' "$*"; } + +# --- GPU toolchain sanity (fail loudly rather than silently falling back to CPU) --- +log "GPU toolchain" +if ! 
command -v nvcc >/dev/null 2>&1; then + for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do + [ -x "$d/nvcc" ] && export PATH="$d:$PATH" && break + done +fi +command -v nvcc >/dev/null 2>&1 || { echo "ERROR: nvcc not found — CUDA toolkit missing" >&2; exit 1; } +nvcc --version | tail -n 2 +nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader + +# --- Pin cudarc so it binds a fixed driver-symbol set -------------------------- +# crypto/math-cuda/Cargo.toml uses `cuda-version-from-build-system` + `fallback-latest`; +# when detection falls back to "latest", cudarc requests symbols some boxes' driver doesn't +# export (e.g. cuDevSmResourceSplit / cuCtxGetDevice_v2) -> runtime panic. Pinning to a fixed +# CUDA version (12.8, matching the cuda_max_good>=12.8 offer floor) avoids that. +log "pinning cudarc to $CUDARC_PIN" +sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \ + crypto/math-cuda/Cargo.toml + +# --- Build the asm guest ELFs used by Groups 2 & 3 (clang on .s; fast) ---------- +# (math-cuda parity tests need no ELF; cuda_path_integration / cuda_fallback prove an asm ELF.) +log "compiling asm guest programs" +make compile-programs-asm + +# --- Run the three CUDA test groups via the Makefile targets -------------------- +fail=0 +run() { # $1 = make target + log "make $1" + if ! 
make "$1"; then + echo "::error::GPU test group failed: $1" + fail=1 + fi +} +run test-math-cuda # Group 1: kernel parity +run test-cuda-integration # Group 2: end-to-end GPU dispatch + proof verifies +run test-cuda-fallback # Group 3: GPU error -> CPU fallback still verifies + +if [ "$fail" -ne 0 ]; then + log "FAILED — one or more GPU test groups failed" + exit 1 +fi +log "all GPU test groups passed" From 70754b384f743537e95fb5cf0243fd878e48330d Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 12:17:54 -0300 Subject: [PATCH 02/10] keep instance running for debug --- .github/workflows/gpu-tests.yml | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index a05de8f58..ba0e42411 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -41,6 +41,10 @@ env: # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit # hash can't) — avoids pulling untrusted code at run time. VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874" + # TEMP(debugging): "1" skips teardown so the box stays up for SSH debugging. The + # "Connection info" step prints how to connect and how to destroy it manually. + # SET BACK TO "0" (or remove) so the box is destroyed again. + KEEP_INSTANCE: "1" jobs: gpu-tests: @@ -261,9 +265,31 @@ jobs: echo '```' } >> "$GITHUB_STEP_SUMMARY" + # TEMP(debugging): when KEEP_INSTANCE=1, leave the box up and print how to reach it. 
+ - name: Connection info (instance kept for debugging) + if: always() && env.KEEP_INSTANCE == '1' + env: + HOST: ${{ steps.ssh.outputs.host }} + PORT: ${{ steps.ssh.outputs.port }} + IID: ${{ steps.instance.outputs.id }} + run: | + { + echo "## ⚠️ Instance KEPT for debugging (KEEP_INSTANCE=1)" + echo "SSH in with your team key (baked into the box by the template onstart):" + echo '```' + echo "ssh -o StrictHostKeyChecking=accept-new -p ${PORT:-?} root@${HOST:-?}" + echo "cd /workspace/lambda_vm # the failing tests live here" + echo '```' + echo "Destroy it when done (it bills hourly):" + echo '```' + echo "vastai destroy instance ${IID:-?} --yes # label: $RUN_LABEL" + echo '```' + } | tee -a "$GITHUB_STEP_SUMMARY" + echo "::warning::Instance $IID kept for debugging — destroy it manually: vastai destroy instance $IID --yes" + # --- Teardown: ALWAYS destroy the instance (cost guardrail) --- - name: Destroy instance - if: always() + if: always() && env.KEEP_INSTANCE != '1' run: | # Retry transient failures (network/auth) so a paid box isn't stranded. # --yes: skip the interactive [y/N] confirm (CI has no tty). From 0c3b895f89d730a49e0b79f8f54694cbd364110d Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 13:00:56 -0300 Subject: [PATCH 03/10] ci: require CUDA 13.1 --- .github/workflows/gpu-tests.yml | 7 +++++-- README.md | 19 ++++++++++++++++++- crypto/math-cuda/build.rs | 5 +++++ 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index ba0e42411..45e596bc8 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -85,8 +85,11 @@ jobs: # because vast can't numerically compare the driver_version string server-side. MIN_DRIVER: "580" run: | - # cpu_ram filter is in GB. 
- QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" + # cpu_ram filter is in GB. cuda_max_good>=13.1: the box's driver must support CUDA + # 13.1 because the template's nvcc is 13.1 and build.rs JIT-compiles its PTX at load — + # a 13.0 driver rejects 13.1 PTX (CUDA_ERROR_UNSUPPORTED_PTX_VERSION). Bump this in + # lockstep if the base image's CUDA toolkit changes. + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}" echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)" # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within # the cap) — premium hosts have faster disks/network and better reliability; cheapest diff --git a/README.md b/README.md index 151934433..0f00979ab 100644 --- a/README.md +++ b/README.md @@ -185,7 +185,9 @@ See [`spec/README.md`](./spec/README.md) for full setup instructions. 
| `make test-asm` | Compile and run ASM tests | | `make test-rust` | Compile and run Rust tests | | `make test-executor` | Compile all programs and run executor tests | -| `make test-math-cuda` | math-cuda parity tests (requires NVIDIA GPU + nvcc) | +| `make test-math-cuda` | math-cuda GPU kernel parity tests (requires NVIDIA GPU + nvcc; see GPU Tests) | +| `make test-cuda-integration` | End-to-end GPU dispatch + proof verification (requires NVIDIA GPU + nvcc) | +| `make test-cuda-fallback` | GPU error-path / CPU-fallback tests (requires NVIDIA GPU + nvcc) | | `make build` | Build all workspace crates | | `make check` | Check all crates (faster than build, no codegen) | | `make clippy` | Run clippy on all crates | @@ -219,6 +221,21 @@ You can run it with `make test-rust` +### GPU Tests + +The CUDA test groups run only on a machine with an NVIDIA GPU and `nvcc`: + +- `make test-math-cuda` — GPU-vs-CPU kernel parity (NTT, LDE, barycentric, FRI, …) +- `make test-cuda-integration` — proves a guest on GPU and checks every dispatch fired + the proof verifies +- `make test-cuda-fallback` — forces GPU dispatch errors and checks the CPU fallback still verifies + +**Requirement: an NVIDIA driver supporting CUDA ≥ 13.1.** The kernels are compiled with the +toolkit's `nvcc` (currently CUDA 13.1) into PTX that the driver JIT-compiles at load; a driver +older than the toolkit rejects it with `CUDA_ERROR_UNSUPPORTED_PTX_VERSION`. Keep the driver/CUDA +floor in step with the installed toolkit (e.g. the `cuda_max_good>=13.1` filter in +`.github/workflows/gpu-tests.yml`). These groups run automatically on a rented GPU in the merge +queue via that workflow. 
+ ## Benchmarking & Profiling You can create a flamegraph for proof generation using the following target: diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs index b2f61f9a2..6888d2a72 100644 --- a/crypto/math-cuda/build.rs +++ b/crypto/math-cuda/build.rs @@ -72,6 +72,11 @@ fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) { // compute capability. If unset, try `nvidia-smi` to match the host GPU // (avoids JIT failures like nvcc-13.0 PTX rejected on Blackwell drivers); // fall back to compute_89 (Ada) when detection fails. + // + // NOTE: this `-arch` only sets the *virtual arch*, not the PTX ISA version, which is + // fixed by this nvcc's CUDA toolkit. The runtime driver must support that toolkit's CUDA + // version or it rejects the PTX with CUDA_ERROR_UNSUPPORTED_PTX_VERSION — i.e. the box's + // driver CUDA must be >= the build toolkit's CUDA (currently 13.1). See README "GPU Tests". let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| detect_arch()); let status = Command::new(nvcc_path()) From 9abe6f01f70d9f17624dfc0daffe0590cc06e8fa Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 14:17:06 -0300 Subject: [PATCH 04/10] ci: test prover on CUDA --- .github/workflows/gpu-tests.yml | 23 ++++++++++++----------- Makefile | 15 +++++++++++++++ README.md | 4 ++++ scripts/gpu_test.sh | 28 +++++++++++++++++----------- 4 files changed, 48 insertions(+), 22 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 45e596bc8..a84613291 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -1,11 +1,12 @@ name: GPU Tests (merge queue) -# Run the CUDA-only test groups (which CPU CI can't, since GitHub runners have no GPU) on a -# rented Vast.ai RTX 5090 when a PR is in the merge queue, and block the merge if they fail. 
+# Run the GPU test suite (which CPU CI can't, since GitHub runners have no GPU) on a rented +# Vast.ai RTX 5090 when a PR is in the merge queue, and block the merge if it fails. # Groups (see scripts/gpu_test.sh): math-cuda kernel parity, cuda_path_integration (GPU proof -# verifies), cuda_fallback (CPU fallback verifies). Orchestration runs on a GitHub-hosted -# runner; all GPU work happens on the rented box (provisioned by the template onstart). The -# box is ALWAYS destroyed at the end. +# verifies), cuda_fallback (CPU fallback verifies), the prover/stark/crypto/ecsm suite on the +# GPU path, and the comprehensive all-instructions prove. Orchestration runs on a GitHub-hosted +# runner; all GPU work happens on the rented box (provisioned by the template onstart). The box +# is ALWAYS destroyed at the end. # # Triggered on `merge_group` (one rental per merge, not per push) + `workflow_dispatch` for # manual runs. To gate merges, add the job name `gpu-tests` to the branch-protection required @@ -31,8 +32,8 @@ concurrency: cancel-in-progress: true env: - # Vast offer search: RTX 5090, >=16 cores, >=32GB RAM (workloads are small), >=64GB disk, - # verified + rentable, Blackwell-capable driver, <= cap. + # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM (the prover suite proves real ELFs, + # so allow headroom), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap. GPU_NAME: RTX_5090 PRICE_CAP: "1" VAST_IMAGE_DISK: "64" @@ -49,9 +50,9 @@ env: jobs: gpu-tests: runs-on: ubuntu-latest - # Provisioning + dual-feature cuda build (~25 min) + the three test groups (~15 min) + - # image-pull slack. Generous ceiling; teardown still always destroys the box. - timeout-minutes: 120 + # Provisioning + cuda builds + 5 test groups; the prover suite (single-threaded, real + # ELF proves) dominates. Generous ceiling; teardown still always destroys the box. 
+ timeout-minutes: 240 steps: - name: Install Vast CLI # No secrets in this step's env: install-time code can't read the API key during pip @@ -89,7 +90,7 @@ jobs: # 13.1 because the template's nvcc is 13.1 and build.rs JIT-compiles its PTX at load — # a 13.0 driver rejects 13.1 PTX (CUDA_ERROR_UNSUPPORTED_PTX_VERSION). Bump this in # lockstep if the base image's CUDA toolkit changes. - QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=32 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}" + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}" echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)" # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within # the cap) — premium hosts have faster disks/network and better reliability; cheapest diff --git a/Makefile b/Makefile index 32a735c79..45518ed6d 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ compile-programs clean-asm clean-rust clean-bench clean-shared clean test test-asm \ test-rust test-executor test-flamegraph flamegraph-prover \ test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \ +test-prover-cuda test-prover-comprehensive-cuda \ bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \ update-ethrex-fixture-checksums check-ethrex-fixture-checksums @@ -254,6 +255,20 @@ test-cuda-fallback: cargo test -p lambda-vm-prover --release --features test-cuda-faults \ --test cuda_fallback_tests -- --ignored --nocapture +# The prover/stark/crypto/ecsm test suite with the GPU (cuda) path enabled (requires NVIDIA +# GPU + nvcc). The GPU CI counterpart of CPU CI's sharded prover tests. Single-threaded: the +# GPU serializes proves and the dispatch counters are process-global. 
cuda on prover cascades +# to stark; crypto/ecsm build without it (they have no GPU path). +test-prover-cuda: + cargo test --release -p lambda-vm-prover -p stark -p crypto -p ecsm \ + --features lambda-vm-prover/cuda -- --test-threads=1 + +# The comprehensive all-instructions prove (ignored by default) on the GPU path (requires +# NVIDIA GPU + nvcc). GPU counterpart of CPU CI's merge-queue-only comprehensive job. +test-prover-comprehensive-cuda: + cargo test --release -p lambda-vm-prover --features cuda \ + test_prove_elfs_all_instructions_64_full -- --ignored --test-threads=1 --nocapture + # math-cuda quick microbench (median of 10 runs) bench-math-cuda: cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture diff --git a/README.md b/README.md index 0f00979ab..820a97857 100644 --- a/README.md +++ b/README.md @@ -188,6 +188,8 @@ See [`spec/README.md`](./spec/README.md) for full setup instructions. | `make test-math-cuda` | math-cuda GPU kernel parity tests (requires NVIDIA GPU + nvcc; see GPU Tests) | | `make test-cuda-integration` | End-to-end GPU dispatch + proof verification (requires NVIDIA GPU + nvcc) | | `make test-cuda-fallback` | GPU error-path / CPU-fallback tests (requires NVIDIA GPU + nvcc) | +| `make test-prover-cuda` | Prover/stark/crypto/ecsm suite on the GPU path (requires NVIDIA GPU + nvcc) | +| `make test-prover-comprehensive-cuda` | Comprehensive all-instructions prove on the GPU path (requires NVIDIA GPU + nvcc) | | `make build` | Build all workspace crates | | `make check` | Check all crates (faster than build, no codegen) | | `make clippy` | Run clippy on all crates | @@ -228,6 +230,8 @@ The CUDA test groups run only on a machine with an NVIDIA GPU and `nvcc`: - `make test-math-cuda` — GPU-vs-CPU kernel parity (NTT, LDE, barycentric, FRI, …) - `make test-cuda-integration` — proves a guest on GPU and checks every dispatch fired + the proof verifies - `make test-cuda-fallback` — forces GPU dispatch errors and checks the CPU 
fallback still verifies +- `make test-prover-cuda` — the prover/stark/crypto/ecsm suite with the GPU path enabled +- `make test-prover-comprehensive-cuda` — the comprehensive all-instructions prove on the GPU path **Requirement: an NVIDIA driver supporting CUDA ≥ 13.1.** The kernels are compiled with the toolkit's `nvcc` (currently CUDA 13.1) into PTX that the driver JIT-compiles at load; a driver diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh index e3608d81a..e6c974986 100755 --- a/scripts/gpu_test.sh +++ b/scripts/gpu_test.sh @@ -2,10 +2,12 @@ # # gpu_test.sh — run the CUDA-only test groups on a GPU box. # -# These groups can't run in CPU CI (GitHub runners have no GPU): -# 1. math-cuda kernel parity (make test-math-cuda) -# 2. end-to-end GPU dispatch + proof (make test-cuda-integration) -# 3. GPU error-path / CPU fallback (make test-cuda-fallback) +# Exercises the CUDA path, which CPU CI can't (GitHub runners have no GPU): +# 1. math-cuda kernel parity (make test-math-cuda) +# 2. end-to-end GPU dispatch + proof (make test-cuda-integration) +# 3. GPU error-path / CPU fallback (make test-cuda-fallback) +# 4. prover/stark/crypto/ecsm suite (make test-prover-cuda) — CPU CI's prover tests on GPU +# 5. comprehensive all-instructions (make test-prover-comprehensive-cuda) # -# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All three groups +# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All groups # run even if one fails (so the log shows every failure); the script exits non-zero if ANY @@ -42,12 +44,14 @@ log "pinning cudarc to $CUDARC_PIN" sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \ crypto/math-cuda/Cargo.toml -# --- Build the asm guest ELFs used by Groups 2 & 3 (clang on .s; fast) ---------- -# (math-cuda parity tests need no ELF; cuda_path_integration / cuda_fallback prove an asm ELF.) 
-log "compiling asm guest programs" +# --- Build the guest ELFs the tests prove --------------------------------------- +# math-cuda parity needs none; cuda_path_integration / cuda_fallback prove an asm ELF; the +# prover suite (Groups 4 & 5) proves asm AND rust guests. Build both up front. +log "compiling guest programs (asm + rust)" make compile-programs-asm +make compile-programs-rust -# --- Run the three CUDA test groups via the Makefile targets -------------------- +# --- Run the CUDA test groups via the Makefile targets -------------------------- fail=0 run() { # $1 = make target log "make $1" @@ -56,9 +60,11 @@ run() { # $1 = make target fail=1 fi } -run test-math-cuda # Group 1: kernel parity -run test-cuda-integration # Group 2: end-to-end GPU dispatch + proof verifies -run test-cuda-fallback # Group 3: GPU error -> CPU fallback still verifies +run test-math-cuda # Group 1: kernel parity +run test-cuda-integration # Group 2: end-to-end GPU dispatch + proof verifies +run test-cuda-fallback # Group 3: GPU error -> CPU fallback still verifies +run test-prover-cuda # Group 4: prover/stark/crypto/ecsm suite on the GPU path +run test-prover-comprehensive-cuda # Group 5: comprehensive all-instructions prove on GPU if [ "$fail" -ne 0 ]; then log "FAILED — one or more GPU test groups failed" From cd83966edf8ec2a5c813df992fae7df09ac125ee Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 15:07:44 -0300 Subject: [PATCH 05/10] ci: improve gpu failed tests summaries --- .github/workflows/gpu-tests.yml | 52 ++++++++++++++++++++++++--------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index a84613291..12188ecc9 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -32,7 +32,7 @@ concurrency: cancel-in-progress: true env: - # Vast offer search: RTX 5090, >=16 cores, >=64GB RAM (the prover suite 
proves real ELFs, + # Vast offer search: RTX 5090, >=16 cores, >=96GB RAM (the prover suite proves real ELFs, # so allow headroom), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap. GPU_NAME: RTX_5090 PRICE_CAP: "1" @@ -90,7 +90,7 @@ jobs: # 13.1 because the template's nvcc is 13.1 and build.rs JIT-compiles its PTX at load — # a 13.0 driver rejects 13.1 PTX (CUDA_ERROR_UNSUPPORTED_PTX_VERSION). Bump this in # lockstep if the base image's CUDA toolkit changes. - QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=64 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}" + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}" echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)" # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within # the cap) — premium hosts have faster disks/network and better reliability; cheapest @@ -110,7 +110,7 @@ jobs: sleep "$OFFER_INTERVAL" done if [ -z "$OFFER_ID" ]; then - echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=32GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)" + echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)" exit 1 fi echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT" @@ -255,18 +255,44 @@ jobs: env: OUTCOME: ${{ steps.tests.outcome }} run: | + OUT="$RUNNER_TEMP/gpu_test_out.txt" { - echo "## GPU tests (CUDA-only suite)" - echo "Outcome: **${OUTCOME}**" - # On failure, surface the failing-group markers explicitly, then the log tail. 
- if [ "$OUTCOME" != "success" ]; then - FAILED=$(grep -F '::error::GPU test group failed:' "$RUNNER_TEMP/gpu_test_out.txt" 2>/dev/null \ - | sed 's/.*failed: /- /' || true) - [ -n "$FAILED" ] && { echo; echo "Failed groups:"; echo "$FAILED"; } + echo "## GPU tests (CUDA suite) — ${OUTCOME}" + if [ "$OUTCOME" = "success" ]; then + echo "All GPU test groups passed." + else + # Group the failed tests under the make target that ran them: gpu_test.sh prints + # "=== make TARGET ===" before each group, and cargo prints "test NAME ... FAILED". + report=$(awk ' + /^=== make / { grp=$3; next } + / \.\.\. FAILED/ { fails[grp]=fails[grp] "\n - " $2; n[grp]++ } + END { for (g in fails) printf "- **%s** (%d failed):%s\n", g, n[g], fails[g] } + ' "$OUT" 2>/dev/null || true) + # Per-test panic/assertion messages: each "thread '…' panicked at …:" block plus + # its following message lines (assertion, left/right), capped per block. + details=$(awk ' + /^thread .* panicked at / { cap=1; lines=0; buf=$0; next } + cap { + if ($0 ~ /^note: run with/ || $0 ~ /^----/ || $0 ~ /^test / || $0 ~ /^=== / || $0 ~ /^[[:space:]]*$/) { printf "%s\n\n", buf; cap=0; next } + if (lines < 14) { buf=buf "\n" $0; lines++ } else if (lines==14) { buf=buf "\n ...(truncated)"; lines++ } + } + END { if (cap) printf "%s\n", buf } + ' "$OUT" 2>/dev/null || true) + if [ -n "$report" ]; then + echo; echo "### Failed tests by group"; echo "$report" + if [ -n "$details" ]; then + echo; echo "### Failure details"; echo '```'; echo "$details"; echo '```' + fi + else + # No per-test failures parsed (likely a build/infra error) — fall back to the + # failed-group markers plus a short log tail. + grps=$(grep -F '::error::GPU test group failed:' "$OUT" 2>/dev/null | sed 's/.*failed: /- /' | sort -u || true) + [ -n "$grps" ] && { echo; echo "### Failed groups"; echo "$grps"; } + echo; echo "No individual test failures parsed (build/infra error?). 
Last lines:" + echo '```'; tail -n 40 "$OUT" 2>/dev/null || echo "(no output captured)"; echo '```' + fi + echo; echo "Full output is in the \"Run GPU tests\" step log." fi - echo '```' - tail -n 80 "$RUNNER_TEMP/gpu_test_out.txt" 2>/dev/null || echo "(no output captured)" - echo '```' } >> "$GITHUB_STEP_SUMMARY" # TEMP(debugging): when KEEP_INSTANCE=1, leave the box up and print how to reach it. From 25904c93b0380b064ca6dfb1908c786ce8808b03 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 15:37:12 -0300 Subject: [PATCH 06/10] ci: test-threads=1 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 45518ed6d..e5ef2633f 100644 --- a/Makefile +++ b/Makefile @@ -253,7 +253,7 @@ test-cuda-integration: # Forces cuda dispatch errors and asserts the CPU fallback still produces a verifying proof. test-cuda-fallback: cargo test -p lambda-vm-prover --release --features test-cuda-faults \ - --test cuda_fallback_tests -- --ignored --nocapture + --test cuda_fallback_tests -- --ignored --nocapture --test-threads=1 # The prover/stark/crypto/ecsm test suite with the GPU (cuda) path enabled (requires NVIDIA # GPU + nvcc). The GPU CI counterpart of CPU CI's sharded prover tests. 
Single-threaded: the From e862a66dd16fcdc4b27a938102d9c384cf43edc2 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 16:16:25 -0300 Subject: [PATCH 07/10] remove temporary code --- .github/workflows/gpu-tests.yml | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 12188ecc9..61db99acd 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -19,10 +19,6 @@ name: GPU Tests (merge queue) on: merge_group: workflow_dispatch: - # TEMP(testing): run on pushes to this branch pre-merge (no merge queue needed to test the - # rent -> test -> destroy path). REMOVE before merging. - push: - branches: [ci_run_tests_gpu] permissions: contents: read @@ -42,10 +38,6 @@ env: # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit # hash can't) — avoids pulling untrusted code at run time. VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874" - # TEMP(debugging): "1" skips teardown so the box stays up for SSH debugging. The - # "Connection info" step prints how to connect and how to destroy it manually. - # SET BACK TO "0" (or remove) so the box is destroyed again. - KEEP_INSTANCE: "1" jobs: gpu-tests: @@ -295,31 +287,9 @@ jobs: fi } >> "$GITHUB_STEP_SUMMARY" - # TEMP(debugging): when KEEP_INSTANCE=1, leave the box up and print how to reach it. 
- - name: Connection info (instance kept for debugging) - if: always() && env.KEEP_INSTANCE == '1' - env: - HOST: ${{ steps.ssh.outputs.host }} - PORT: ${{ steps.ssh.outputs.port }} - IID: ${{ steps.instance.outputs.id }} - run: | - { - echo "## ⚠️ Instance KEPT for debugging (KEEP_INSTANCE=1)" - echo "SSH in with your team key (baked into the box by the template onstart):" - echo '```' - echo "ssh -o StrictHostKeyChecking=accept-new -p ${PORT:-?} root@${HOST:-?}" - echo "cd /workspace/lambda_vm # the failing tests live here" - echo '```' - echo "Destroy it when done (it bills hourly):" - echo '```' - echo "vastai destroy instance ${IID:-?} --yes # label: $RUN_LABEL" - echo '```' - } | tee -a "$GITHUB_STEP_SUMMARY" - echo "::warning::Instance $IID kept for debugging — destroy it manually: vastai destroy instance $IID --yes" - # --- Teardown: ALWAYS destroy the instance (cost guardrail) --- - name: Destroy instance - if: always() && env.KEEP_INSTANCE != '1' + if: always() run: | # Retry transient failures (network/auth) so a paid box isn't stranded. # --yes: skip the interactive [y/N] confirm (CI has no tty). From 57e324156d4e733cbcc9552d24e2b9de3f736b91 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 17:00:25 -0300 Subject: [PATCH 08/10] fix: set cuda_max_good>=12.8 --- .github/workflows/gpu-tests.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 61db99acd..b2df4379d 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -19,6 +19,9 @@ name: GPU Tests (merge queue) on: merge_group: workflow_dispatch: + # TEMP(testing): run on pushes to this branch pre-merge. REMOVE before merging. + push: + branches: [ci_run_tests_gpu] permissions: contents: read @@ -78,11 +81,12 @@ jobs: # because vast can't numerically compare the driver_version string server-side. 
MIN_DRIVER: "580" run: | - # cpu_ram filter is in GB. cuda_max_good>=13.1: the box's driver must support CUDA - # 13.1 because the template's nvcc is 13.1 and build.rs JIT-compiles its PTX at load — - # a 13.0 driver rejects 13.1 PTX (CUDA_ERROR_UNSUPPORTED_PTX_VERSION). Bump this in - # lockstep if the base image's CUDA toolkit changes. - QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=13.1 dph_total<=${PRICE_CAP}" + # cpu_ram filter is in GB. + # EXPERIMENT: cuda_max_good>=12.8 (was 13.1). The template's nvcc is 13.1, so its PTX + # JITs only on drivers with CUDA >= 13.1 — a 12.8/13.0 box will fail with + # CUDA_ERROR_UNSUPPORTED_PTX_VERSION. Testing whether the most-expensive selection + # still lands on 13.1-capable boxes in practice. Revert to >=13.1 if it flakes. + QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)" # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within # the cap) — premium hosts have faster disks/network and better reliability; cheapest From 7d8241c9f35b4615d5e224d0b7ddb6a11013c0f0 Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 17:37:16 -0300 Subject: [PATCH 09/10] comments --- .github/workflows/gpu-tests.yml | 8 +------- README.md | 10 ++++------ crypto/math-cuda/build.rs | 2 +- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index b2df4379d..00ecbdcf7 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -82,16 +82,10 @@ jobs: MIN_DRIVER: "580" run: | # cpu_ram filter is in GB. - # EXPERIMENT: cuda_max_good>=12.8 (was 13.1). 
The template's nvcc is 13.1, so its PTX - # JITs only on drivers with CUDA >= 13.1 — a 12.8/13.0 box will fail with - # CUDA_ERROR_UNSUPPORTED_PTX_VERSION. Testing whether the most-expensive selection - # still lands on 13.1-capable boxes in practice. Revert to >=13.1 if it flakes. QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}" echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)" # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within - # the cap) — premium hosts have faster disks/network and better reliability; cheapest - # boxes were flaky. `try ... catch 0` so a malformed/null driver_version on one offer - # is treated as 0 (filtered out) rather than erroring the whole jq. + # the cap) — premium hosts have faster disks/network and better reliability. SELECT="map(select((try (.driver_version|split(\".\")[0]|tonumber) catch 0) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse" OFFER_ID="" for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do diff --git a/README.md b/README.md index 820a97857..e07967037 100644 --- a/README.md +++ b/README.md @@ -233,12 +233,10 @@ The CUDA test groups run only on a machine with an NVIDIA GPU and `nvcc`: - `make test-prover-cuda` — the prover/stark/crypto/ecsm suite with the GPU path enabled - `make test-prover-comprehensive-cuda` — the comprehensive all-instructions prove on the GPU path -**Requirement: an NVIDIA driver supporting CUDA ≥ 13.1.** The kernels are compiled with the -toolkit's `nvcc` (currently CUDA 13.1) into PTX that the driver JIT-compiles at load; a driver -older than the toolkit rejects it with `CUDA_ERROR_UNSUPPORTED_PTX_VERSION`. Keep the driver/CUDA -floor in step with the installed toolkit (e.g. the `cuda_max_good>=13.1` filter in -`.github/workflows/gpu-tests.yml`). These groups run automatically on a rented GPU in the merge -queue via that workflow. 
+The kernels are compiled by `nvcc` into PTX that the driver JIT-compiles at load, so the GPU's +driver must be new enough for the toolkit — an older driver rejects the PTX with +`CUDA_ERROR_UNSUPPORTED_PTX_VERSION`. These groups run automatically on a rented GPU in the merge +queue via `.github/workflows/gpu-tests.yml` (which filters offers on `cuda_max_good`). ## Benchmarking & Profiling diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs index 6888d2a72..73cc10d3a 100644 --- a/crypto/math-cuda/build.rs +++ b/crypto/math-cuda/build.rs @@ -76,7 +76,7 @@ fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) { // NOTE: this `-arch` only sets the *virtual arch*, not the PTX ISA version, which is // fixed by this nvcc's CUDA toolkit. The runtime driver must support that toolkit's CUDA // version or it rejects the PTX with CUDA_ERROR_UNSUPPORTED_PTX_VERSION — i.e. the box's - // driver CUDA must be >= the build toolkit's CUDA (currently 13.1). See README "GPU Tests". + // driver CUDA must be >= the build toolkit's CUDA. See README "GPU Tests". let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| detect_arch()); let status = Command::new(nvcc_path()) From 764aadeab5027f85163e6d448c3388862d57259d Mon Sep 17 00:00:00 2001 From: Julian Arce <52429267+JuArce@users.noreply.github.com> Date: Tue, 30 Jun 2026 17:59:28 -0300 Subject: [PATCH 10/10] apply code review --- .github/workflows/gpu-tests.yml | 3 ++- scripts/gpu_test.sh | 7 ++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 00ecbdcf7..984f6c35f 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -228,10 +228,11 @@ jobs: ''|*[!A-Za-z0-9._/-]*) echo "::error::invalid ref: '$REF'"; exit 1 ;; esac # Check out the ref under test on the box, then run the CUDA test groups. + # gpu_test.sh owns the CUDARC_PIN / SYSROOT_DIR defaults — don't duplicate them here. 
REMOTE="set -e; cd /workspace/lambda_vm; \ git fetch --force origin '$REF'; \ git checkout -f FETCH_HEAD; \ - CUDARC_PIN=cuda-12080 SYSROOT_DIR=/opt/lambda-vm-sysroot bash scripts/gpu_test.sh" + bash scripts/gpu_test.sh" # pipefail so a test failure on the box propagates through the tee pipe and FAILS this # step (which fails the job and blocks the merge), instead of being masked by tee. diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh index e6c974986..942a09620 100755 --- a/scripts/gpu_test.sh +++ b/scripts/gpu_test.sh @@ -9,7 +9,7 @@ # 4. prover/stark/crypto/ecsm suite (make test-prover-cuda) — CPU CI's prover tests on GPU # 5. comprehensive all-instructions (make test-prover-comprehensive-cuda) # -# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All three groups +# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All groups # run even if one fails (so the log shows every failure); the script exits non-zero if ANY # group failed, which fails the workflow job and blocks the merge. # @@ -38,8 +38,9 @@ nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader # --- Pin cudarc so it binds a fixed driver-symbol set -------------------------- # crypto/math-cuda/Cargo.toml uses `cuda-version-from-build-system` + `fallback-latest`; # when detection falls back to "latest", cudarc requests symbols some boxes' driver doesn't -# export (e.g. cuDevSmResourceSplit / cuCtxGetDevice_v2) -> runtime panic. Pinning to a fixed -# CUDA version (12.8, matching the cuda_max_good>=12.8 offer floor) avoids that. +# export (e.g. cuDevSmResourceSplit / cuCtxGetDevice_v2) -> runtime panic. Pinning to a fixed, +# conservative CUDA version binds a known driver-symbol set instead. (This is cudarc's +# host-side driver-API floor — independent of the PTX/driver version the offer filter targets.) 
log "pinning cudarc to $CUDARC_PIN" sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" \ crypto/math-cuda/Cargo.toml