diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml
new file mode 100644
index 000000000..984f6c35f
--- /dev/null
+++ b/.github/workflows/gpu-tests.yml
@@ -0,0 +1,322 @@
+name: GPU Tests (merge queue)
+
+# Run the GPU test suite (which CPU CI can't, since GitHub runners have no GPU) on a rented
+# Vast.ai RTX 5090 when a PR is in the merge queue, and block the merge if it fails.
+# Groups (see scripts/gpu_test.sh): math-cuda kernel parity, cuda_path_integration (GPU proof
+# verifies), cuda_fallback (CPU fallback verifies), the prover/stark/crypto/ecsm suite on the
+# GPU path, and the comprehensive all-instructions prove. Orchestration runs on a GitHub-hosted
+# runner; all GPU work happens on the rented box (provisioned by the template onstart). The box
+# is ALWAYS destroyed at the end.
+#
+# Triggered on `merge_group` (one rental per merge, not per push) + `workflow_dispatch` for
+# manual runs. To gate merges, add the job name `gpu-tests` to the branch-protection required
+# status checks for `main` (GitHub UI).
+#
+# Requires repo secrets:
+#   VAST_API_KEY       — https://cloud.vast.ai/manage-keys/
+#   VAST_TEMPLATE_HASH — hash of the "NVIDIA CUDA Lambda VM 64GB" template
+
+on:
+  merge_group:
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: gpu-tests-${{ github.ref }}
+  cancel-in-progress: true  # a newer run on the same ref cancels this one; the `if: always()` teardown step still runs
+
+env:
+  # Vast offer search: RTX 5090, >=16 cores, >=96GB RAM (the prover suite proves real ELFs,
+  # so allow headroom), >=64GB disk, verified + rentable, Blackwell-capable driver, <= cap.
+  GPU_NAME: RTX_5090
+  PRICE_CAP: "1"
+  VAST_IMAGE_DISK: "64"
+  # Unique per-run label set on the instance, for identification + leak-proof teardown.
+  RUN_LABEL: "gpu-tests-${{ github.run_id }}-${{ github.run_attempt }}"
+  # Pin the Vast CLI to an immutable commit (a PyPI version can be re-published; a commit
+  # hash can't) — avoids pulling untrusted code at run time.
+  VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874"
+
+jobs:
+  gpu-tests:
+    runs-on: ubuntu-latest
+    # Provisioning + cuda builds + 5 test groups; the prover suite (single-threaded, real
+    # ELF proves) dominates. Generous ceiling; teardown still always destroys the box.
+    timeout-minutes: 240
+    steps:
+      - name: Install Vast CLI
+        # No secrets in this step's env: install-time code can't read the API key during pip
+        # install. Pinned to an immutable commit (see VAST_CLI_COMMIT) for the same reason.
+        # --break-system-packages: the ephemeral runner's Python may be PEP-668 "externally
+        # managed"; safe to override on a disposable runner.
+        run: pip install --quiet --break-system-packages "git+https://github.com/vast-ai/vast-cli.git@${VAST_CLI_COMMIT}"
+
+      - name: Authenticate Vast CLI
+        env:
+          VAST_API_KEY: ${{ secrets.VAST_API_KEY }}
+        run: vastai set api-key "$VAST_API_KEY"
+
+      - name: Generate ephemeral SSH key
+        id: sshkey
+        run: |
+          mkdir -p "$HOME/.ssh"
+          KEY="$HOME/.ssh/vast_gpu_tests"
+          ssh-keygen -t ed25519 -N "" -f "$KEY" -C "gh-actions-gpu-tests-${GITHUB_RUN_ID}" >/dev/null
+          echo "key_path=$KEY" >> "$GITHUB_OUTPUT"  # read by later steps as steps.sshkey.outputs.key_path
+
+      - name: Pick a Vast offer
+        id: offer
+        env:
+          # Retry the same query to ride out transient scarcity (RTX 5090s are a small,
+          # fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
+          OFFER_ATTEMPTS: "10"
+          OFFER_INTERVAL: "30"
+          # Require driver >= this major so cudarc matches the runtime driver (older drivers
+          # lack newer symbols and the GPU path falls back to CPU). Filtered client-side in jq
+          # because vast can't numerically compare the driver_version string server-side.
+          MIN_DRIVER: "580"
+        run: |
+          # cpu_ram filter is in GB.
+          QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
+          echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
+          # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within
+          # the cap) — premium hosts have faster disks/network and better reliability.
+          SELECT="map(select((try (.driver_version|split(\".\")[0]|tonumber) catch 0) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse"
+          OFFER_ID=""
+          for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
+            vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
+            OFFER_ID=$(jq -r "$SELECT | .[0].id // empty" offers.json || true)  # non-JSON reply (API error): treat as "no offer" and retry, don't let `bash -e` abort the step
+            OFFER_PRICE=$(jq -r "$SELECT | .[0].dph_total // empty" offers.json 2>/dev/null || true)  # parse error already reported by the line above
+            if [ -n "$OFFER_ID" ]; then
+              echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"
+              break
+            fi
+            echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..."
+            sleep "$OFFER_INTERVAL"
+          done
+          if [ -z "$OFFER_ID" ]; then
+            echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
+            exit 1
+          fi
+          echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
+          echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"
+
+      - name: Create instance
+        id: instance
+        env:
+          VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }}
+          OFFER_ID: ${{ steps.offer.outputs.id }}
+        run: |
+          vastai create instance "$OFFER_ID" \
+            --template_hash "$VAST_TEMPLATE_HASH" \
+            --disk "$VAST_IMAGE_DISK" \
+            --label "$RUN_LABEL" \
+            --ssh --direct --raw > create.json
+          # Log only the fields we need (the full --raw response could carry a sensitive field).
+          jq '{success, new_contract: (.new_contract // .instances.new_contract)}' create.json || echo "::warning::Could not parse the create response as JSON"
+          IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json 2>/dev/null || true)  # unparsable reply: fall through to the explicit error below
+          if [ -z "$IID" ]; then
+            echo "::error::Failed to create Vast instance"
+            exit 1
+          fi
+          # Persist immediately so teardown runs even if later steps fail.
+          echo "$IID" > "$RUNNER_TEMP/vast_instance_id"
+          echo "id=$IID" >> "$GITHUB_OUTPUT"
+          echo "Created instance $IID (label $RUN_LABEL)"
+
+      - name: Attach SSH key to instance
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          # Attach the ephemeral pubkey to THIS instance only (added to its authorized_keys);
+          # removed when the instance is destroyed, so no account-level key to clean up.
+          # Retry: the instance may not accept the attach immediately after create.
+          PUB="$(cat "$KEY.pub")"
+          for attempt in $(seq 1 12); do
+            if vastai attach ssh "$IID" "$PUB"; then
+              echo "Attached ssh key (attempt $attempt)"; exit 0
+            fi
+            echo "attach failed (attempt $attempt/12); retrying in 10s..."
+            sleep 10
+          done
+          echo "::error::Failed to attach ssh key to instance $IID"
+          exit 1
+
+      - name: Wait for SSH
+        id: ssh
+        env:
+          IID: ${{ steps.instance.outputs.id }}
+        run: |
+          echo "Waiting for instance $IID to reach 'running' with SSH endpoint..."
+          HOST=""; PORT=""
+          # The base CUDA image is large; some hosts sit in 'loading' (image pull) a while.
+          for _ in $(seq 1 180); do # ~30 min
+            vastai show instance "$IID" --raw > inst.json || true
+            STATUS=$(jq -r '.actual_status // empty' inst.json || true)  # a non-JSON reply must not abort the poll under `bash -e`; empty => keep waiting
+            # We create with --direct, so SSH straight to the public IP + the host port mapped
+            # to container port 22 (the .ssh_host/.ssh_port proxy fields are unreliable).
+            HOST=$(jq -r '.public_ipaddr // empty' inst.json 2>/dev/null || true)
+            PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json 2>/dev/null || true)
+            echo " status=$STATUS ssh=$HOST:$PORT"
+            if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then
+              break
+            fi
+            sleep 10
+          done
+          if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then
+            echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)"
+            exit 1
+          fi
+          echo "host=$HOST" >> "$GITHUB_OUTPUT"
+          echo "port=$PORT" >> "$GITHUB_OUTPUT"
+
+          # Wait for sshd to accept our key.
+          for _ in $(seq 1 30); do
+            if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
+                -i "${{ steps.sshkey.outputs.key_path }}" -p "$PORT" "root@$HOST" true 2>/dev/null; then
+              echo "sshd reachable"; exit 0
+            fi
+            sleep 10
+          done
+          echo "::error::sshd did not accept connections in time"
+          exit 1
+
+      - name: Wait for onstart provisioning
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+        run: |
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..."
+          # The bootstrap's final stdout line is "=== done ===". Vast captures onstart output to
+          # /var/log/onstart.log; fall back to checking the artifacts it leaves.
+          for _ in $(seq 1 120); do # ~20 min
+            if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then
+              echo "onstart reported done"; exit 0
+            fi
+            # shellcheck disable=SC2016 # $HOME must expand on the remote box, not the runner
+            if $SSH 'test -x "$HOME/.cargo/bin/cargo" \
+                && test -f /opt/lambda-vm-sysroot/include/stdlib.h \
+                && test -d /workspace/lambda_vm/.git'; then
+              echo "provisioning artifacts present"; exit 0
+            fi
+            sleep 10
+          done
+          echo "::error::onstart provisioning did not complete in time"
+          exit 1
+
+      - name: Run GPU tests
+        id: tests
+        env:
+          HOST: ${{ steps.ssh.outputs.host }}
+          PORT: ${{ steps.ssh.outputs.port }}
+          KEY: ${{ steps.sshkey.outputs.key_path }}
+          # merge_group: refs/heads/gh-readonly-queue/main/pr-… (the merge commit = PR + main),
+          # so we test exactly what will land. workflow_dispatch: the chosen branch ref.
+          REF: ${{ github.ref }}
+        run: |
+          SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
+          # Defense-in-depth: never interpolate an unvalidated ref into the remote `bash -lc`.
+          case "$REF" in  # reject an empty ref, or one containing any character outside A-Za-z0-9._/-
+            ''|*[!A-Za-z0-9._/-]*) echo "::error::invalid ref: '$REF'"; exit 1 ;;
+          esac
+          # Check out the ref under test on the box, then run the CUDA test groups.
+          # gpu_test.sh owns the CUDARC_PIN / SYSROOT_DIR defaults — don't duplicate them here.
+          REMOTE="set -e; cd /workspace/lambda_vm; \
+            git fetch --force origin '$REF'; \
+            git checkout -f FETCH_HEAD; \
+            bash scripts/gpu_test.sh"
+
+          # pipefail so a test failure on the box propagates through the tee pipe and FAILS this
+          # step (which fails the job and blocks the merge), instead of being masked by tee.
+          # 2>&1 so remote stderr (build errors, panics) is captured too — both into the live
+          # step log and the file the run-summary step tails.
+          set -o pipefail
+          $SSH "bash -lc \"$REMOTE\"" 2>&1 | tee "$RUNNER_TEMP/gpu_test_out.txt"
+
+      - name: Write run summary
+        if: always() && (steps.tests.outcome == 'success' || steps.tests.outcome == 'failure')
+        env:
+          OUTCOME: ${{ steps.tests.outcome }}
+        run: |
+          OUT="$RUNNER_TEMP/gpu_test_out.txt"
+          {
+            echo "## GPU tests (CUDA suite) — ${OUTCOME}"
+            if [ "$OUTCOME" = "success" ]; then
+              echo "All GPU test groups passed."
+            else
+              # Group the failed tests under the make target that ran them: gpu_test.sh prints
+              # "=== make <target> ===" before each group, and cargo prints "test <name> ... FAILED".
+              report=$(awk '
+                /^=== make / { grp=$3; next }
+                / \.\.\. FAILED/ { fails[grp]=fails[grp] "\n  - " $2; n[grp]++ }
+                END { for (g in fails) printf "- **%s** (%d failed):%s\n", g, n[g], fails[g] }
+              ' "$OUT" 2>/dev/null || true)
+              # Per-test panic/assertion messages: each "thread '…' panicked at …:" block plus
+              # its following message lines (assertion, left/right), capped per block.
+              details=$(awk '
+                /^thread .* panicked at / { if (cap) printf "%s\n\n", buf; cap=1; lines=0; buf=$0; next }
+                cap {
+                  if ($0 ~ /^note: run with/ || $0 ~ /^----/ || $0 ~ /^test / || $0 ~ /^=== / || $0 ~ /^[[:space:]]*$/) { printf "%s\n\n", buf; cap=0; next }
+                  if (lines < 14) { buf=buf "\n" $0; lines++ } else if (lines==14) { buf=buf "\n ...(truncated)"; lines++ }
+                }
+                END { if (cap) printf "%s\n", buf }
+              ' "$OUT" 2>/dev/null || true)
+              if [ -n "$report" ]; then
+                echo; echo "### Failed tests by group"; echo "$report"
+                if [ -n "$details" ]; then
+                  echo; echo "### Failure details"; echo '```'; echo "$details"; echo '```'
+                fi
+              else
+                # No per-test failures parsed (likely a build/infra error) — fall back to the
+                # failed-group markers plus a short log tail.
+                grps=$(grep -F '::error::GPU test group failed:' "$OUT" 2>/dev/null | sed 's/.*failed: /- /' | sort -u || true)
+                [ -n "$grps" ] && { echo; echo "### Failed groups"; echo "$grps"; }
+                echo; echo "No individual test failures parsed (build/infra error?). Last lines:"
+                echo '```'; tail -n 40 "$OUT" 2>/dev/null || echo "(no output captured)"; echo '```'
+              fi
+              echo; echo "Full output is in the \"Run GPU tests\" step log."
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      # --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
+      - name: Destroy instance
+        if: always()
+        run: |
+          # Retry transient failures (network/auth) so a paid box isn't stranded.
+          # --yes: skip the interactive [y/N] confirm (CI has no tty).
+          destroy() {
+            iid="$1"; destroyed=""
+            for attempt in 1 2 3; do
+              if vastai destroy instance "$iid" --yes; then destroyed=1; break; fi
+              echo "destroy attempt $attempt failed; retrying in 10s..."
+              sleep 10
+            done
+            [ -n "$destroyed" ] || echo "::warning::Failed to destroy instance $iid after 3 attempts — check the Vast console (label $RUN_LABEL)"
+          }
+          if [ -f "$RUNNER_TEMP/vast_instance_id" ]; then
+            IID=$(cat "$RUNNER_TEMP/vast_instance_id")
+            echo "Destroying instance $IID"
+            destroy "$IID"
+          else
+            # The id file is written only AFTER create succeeds AND its JSON parses, so a box can
+            # exist unrecorded if the run was cancelled in that window or the parse failed. Fall
+            # back to destroying by our unique RUN_LABEL so the box can't leak (bill indefinitely).
+            echo "No instance id recorded; searching Vast for any box labelled $RUN_LABEL..."
+            vastai show instances --raw > all_inst.json 2>/dev/null || echo '[]' > all_inst.json
+            LEAKED=$(jq -r --arg L "$RUN_LABEL" \
+              '(if type=="array" then . else (.instances // []) end) | .[] | select(.label == $L) | .id' \
+              all_inst.json 2>/dev/null || true)
+            if [ -z "$LEAKED" ]; then
+              echo "No instance labelled $RUN_LABEL found; nothing to destroy."
+            else
+              for IID in $LEAKED; do  # unquoted on purpose: jq printed one id per line, word-splitting yields one id per iteration
+                echo "Destroying leaked instance $IID (label $RUN_LABEL)"
+                destroy "$IID"
+              done
+            fi
+          fi
diff --git a/Makefile b/Makefile
index 454eff098..6592eee97 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,8 @@
 compile-programs compile-recursion-elfs clean-asm clean-rust clean-bench clean-shared \
 clean-recursion-elfs clean test test-asm \
 test-rust test-executor test-flamegraph flamegraph-prover \
-test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \
+test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
+test-prover-cuda test-prover-comprehensive-cuda \
 bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
 update-ethrex-fixture-checksums check-ethrex-fixture-checksums
 
@@ -284,6 +285,26 @@ test-cuda-integration:
 	cargo test -p lambda-vm-prover --release --features cuda \
 		--test cuda_path_integration -- --ignored --nocapture
 
+# GPU error-path coverage (requires NVIDIA GPU + nvcc).
+# Forces cuda dispatch errors and asserts the CPU fallback still produces a verifying proof.
+test-cuda-fallback:
+	cargo test -p lambda-vm-prover --release --features test-cuda-faults \
+		--test cuda_fallback_tests -- --ignored --nocapture --test-threads=1
+
+# The prover/stark/crypto/ecsm test suite with the GPU (cuda) path enabled (requires NVIDIA
+# GPU + nvcc). The GPU CI counterpart of CPU CI's sharded prover tests. Single-threaded: the
+# GPU serializes proves and the dispatch counters are process-global. cuda on prover cascades
+# to stark; crypto/ecsm build without it (they have no GPU path).
+test-prover-cuda:
+	cargo test --release -p lambda-vm-prover -p stark -p crypto -p ecsm \
+		--features lambda-vm-prover/cuda -- --test-threads=1
+
+# The comprehensive all-instructions prove (ignored by default) on the GPU path (requires
+# NVIDIA GPU + nvcc). GPU counterpart of CPU CI's merge-queue-only comprehensive job.
+test-prover-comprehensive-cuda:
+	cargo test --release -p lambda-vm-prover --features cuda \
+		test_prove_elfs_all_instructions_64_full -- --ignored --test-threads=1 --nocapture
+
 # math-cuda quick microbench (median of 10 runs)
 bench-math-cuda:
 	cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture
diff --git a/README.md b/README.md
index 151934433..e07967037 100644
--- a/README.md
+++ b/README.md
@@ -185,7 +185,11 @@ See [`spec/README.md`](./spec/README.md) for full setup instructions.
 | `make test-asm` | Compile and run ASM tests |
 | `make test-rust` | Compile and run Rust tests |
 | `make test-executor` | Compile all programs and run executor tests |
-| `make test-math-cuda` | math-cuda parity tests (requires NVIDIA GPU + nvcc) |
+| `make test-math-cuda` | math-cuda GPU kernel parity tests (requires NVIDIA GPU + nvcc; see GPU Tests) |
+| `make test-cuda-integration` | End-to-end GPU dispatch + proof verification (requires NVIDIA GPU + nvcc) |
+| `make test-cuda-fallback` | GPU error-path / CPU-fallback tests (requires NVIDIA GPU + nvcc) |
+| `make test-prover-cuda` | Prover/stark/crypto/ecsm suite on the GPU path (requires NVIDIA GPU + nvcc) |
+| `make test-prover-comprehensive-cuda` | Comprehensive all-instructions prove on the GPU path (requires NVIDIA GPU + nvcc) |
 | `make build` | Build all workspace crates |
 | `make check` | Check all crates (faster than build, no codegen) |
 | `make clippy` | Run clippy on all crates |
@@ -219,6 +223,21 @@ You can run it with `make test-rust`
 
 
 
+### GPU Tests
+
+The CUDA test groups run only on a machine with an NVIDIA GPU and `nvcc`:
+
+- `make test-math-cuda` — GPU-vs-CPU kernel parity (NTT, LDE, barycentric, FRI, …)
+- `make test-cuda-integration` — proves a guest on GPU and checks every dispatch fired + the proof verifies
+- `make test-cuda-fallback` — forces GPU dispatch errors and checks the CPU fallback still verifies
+- `make test-prover-cuda` — the prover/stark/crypto/ecsm suite with the GPU path enabled
+- `make test-prover-comprehensive-cuda` — the comprehensive all-instructions prove on the GPU path
+
+The kernels are compiled by `nvcc` into PTX that the driver JIT-compiles at load, so the GPU's
+driver must be new enough for the toolkit — an older driver rejects the PTX with
+`CUDA_ERROR_UNSUPPORTED_PTX_VERSION`. These groups run automatically on a rented GPU in the merge
+queue via `.github/workflows/gpu-tests.yml` (which filters offers on `cuda_max_good`).
+
 ## Benchmarking & Profiling
 
 You can create a flamegraph for proof generation using the following target:
diff --git a/crypto/math-cuda/build.rs b/crypto/math-cuda/build.rs
index b2f61f9a2..73cc10d3a 100644
--- a/crypto/math-cuda/build.rs
+++ b/crypto/math-cuda/build.rs
@@ -72,6 +72,11 @@ fn compile_ptx(src: &str, out_name: &str, have_nvcc: bool) {
     // compute capability. If unset, try `nvidia-smi` to match the host GPU
     // (avoids JIT failures like nvcc-13.0 PTX rejected on Blackwell drivers);
     // fall back to compute_89 (Ada) when detection fails.
+    //
+    // NOTE: this `-arch` only sets the *virtual arch*, not the PTX ISA version, which is
+    // fixed by this nvcc's CUDA toolkit. The runtime driver must support that toolkit's CUDA
+    // version or it rejects the PTX with CUDA_ERROR_UNSUPPORTED_PTX_VERSION — i.e. the box's
+    // driver CUDA must be >= the build toolkit's CUDA. See README "GPU Tests".
     let arch = env::var("CUDARC_NVCC_ARCH").unwrap_or_else(|_| detect_arch());
 
     let status = Command::new(nvcc_path())
diff --git a/scripts/gpu_test.sh b/scripts/gpu_test.sh
new file mode 100755
index 000000000..942a09620
--- /dev/null
+++ b/scripts/gpu_test.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+#
+# gpu_test.sh — run the CUDA-only test groups on a GPU box.
+#
+# Exercises the CUDA path, which CPU CI can't (GitHub runners have no GPU):
+#   1. math-cuda kernel parity (make test-math-cuda)
+#   2. end-to-end GPU dispatch + proof (make test-cuda-integration)
+#   3. GPU error-path / CPU fallback (make test-cuda-fallback)
+#   4. prover/stark/crypto/ecsm suite (make test-prover-cuda) — CPU CI's prover tests on GPU
+#   5. comprehensive all-instructions (make test-prover-comprehensive-cuda)
+#
+# Runs on the rented Vast box from the gpu-tests.yml merge-queue workflow. All groups
+# run even if one fails (so the log shows every failure); the script exits non-zero if ANY
+# group failed, which fails the workflow job and blocks the merge.
+#
+# Env:
+#   CUDARC_PIN   cudarc CUDA-version feature to pin (default cuda-12080). See the sed below.
+#   SYSROOT_DIR  rv64 sysroot (default /opt/lambda-vm-sysroot, provisioned by the template).
+
+set -euo pipefail
+
+CUDARC_PIN="${CUDARC_PIN:-cuda-12080}"
+export SYSROOT_DIR="${SYSROOT_DIR:-/opt/lambda-vm-sysroot}"
+
+log() { printf '\n=== %s ===\n' "$*"; }  # banner format is parsed by the workflow's run-summary awk (/^=== make /) — keep in sync
+
+# --- GPU toolchain sanity (fail loudly rather than silently falling back to CPU) ---
+log "GPU toolchain"
+if ! command -v nvcc >/dev/null 2>&1; then
+  for d in /usr/local/cuda/bin /usr/local/cuda-*/bin; do
+    [ -x "$d/nvcc" ] && export PATH="$d:$PATH" && break
+  done
+fi
+command -v nvcc >/dev/null 2>&1 || { echo "ERROR: nvcc not found — CUDA toolkit missing" >&2; exit 1; }
+nvcc --version | tail -n 2
+nvidia-smi --query-gpu=name,driver_version,compute_cap --format=csv,noheader
+
+# --- Pin cudarc so it binds a fixed driver-symbol set --------------------------
+# crypto/math-cuda/Cargo.toml uses `cuda-version-from-build-system` + `fallback-latest`;
+# when detection falls back to "latest", cudarc requests symbols some boxes' driver doesn't
+# export (e.g. cuDevSmResourceSplit / cuCtxGetDevice_v2) -> runtime panic. Pinning to a fixed,
+# conservative CUDA version binds a known driver-symbol set instead. (This is cudarc's
+# host-side driver-API floor — independent of the PTX/driver version the offer filter targets.)
+log "pinning cudarc to $CUDARC_PIN"
+sed -i "s/\"cuda-version-from-build-system\"/\"${CUDARC_PIN}\"/; /\"fallback-latest\"/d" crypto/math-cuda/Cargo.toml
+grep -q "\"${CUDARC_PIN}\"" crypto/math-cuda/Cargo.toml || { echo "ERROR: cudarc pin ${CUDARC_PIN} not applied — feature name changed in crypto/math-cuda/Cargo.toml?" >&2; exit 1; }
+
+# --- Build the guest ELFs the tests prove ---------------------------------------
+# math-cuda parity needs none; cuda_path_integration / cuda_fallback prove an asm ELF; the
+# prover suite (Groups 4 & 5) proves asm AND rust guests. Build both up front.
+log "compiling guest programs (asm + rust)"
+make compile-programs-asm
+make compile-programs-rust
+
+# --- Run the CUDA test groups via the Makefile targets --------------------------
+fail=0
+run() { # $1 = make target; records a failure in $fail instead of exiting so every group still runs
+  log "make $1"
+  if ! make "$1"; then
+    echo "::error::GPU test group failed: $1"
+    fail=1
+  fi
+}
+run test-math-cuda # Group 1: kernel parity
+run test-cuda-integration # Group 2: end-to-end GPU dispatch + proof verifies
+run test-cuda-fallback # Group 3: GPU error -> CPU fallback still verifies
+run test-prover-cuda # Group 4: prover/stark/crypto/ecsm suite on the GPU path
+run test-prover-comprehensive-cuda # Group 5: comprehensive all-instructions prove on GPU
+
+if [ "$fail" -ne 0 ]; then
+  log "FAILED — one or more GPU test groups failed"
+  exit 1
+fi
+log "all GPU test groups passed"