Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
324 changes: 324 additions & 0 deletions .github/workflows/gpu-tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,324 @@
name: GPU Tests (merge queue)

# Run the GPU test suite (which CPU CI can't, since GitHub runners have no GPU) on a rented
# Vast.ai RTX 5090 when a PR is in the merge queue, and block the merge if it fails.
# Groups (see scripts/gpu_test.sh): math-cuda kernel parity, cuda_path_integration (GPU proof
# verifies), cuda_fallback (CPU fallback verifies), the prover/stark/crypto/ecsm suite on the
# GPU path, and the comprehensive all-instructions prove. Orchestration runs on a GitHub-hosted
# runner; all GPU work happens on the rented box (provisioned by the template onstart). The box
# is ALWAYS destroyed at the end.
#
# Triggered on `merge_group` (one rental per merge, not per push) + `workflow_dispatch` for
# manual runs. To gate merges, add the job name `gpu-tests` to the branch-protection required
# status checks for `main` (GitHub UI).
#
# Requires repo secrets:
# VAST_API_KEY — https://cloud.vast.ai/manage-keys/
# VAST_TEMPLATE_HASH — hash of the "NVIDIA CUDA Lambda VM 64GB" template

on:
  # Merge-queue entry: the suite runs once per queued merge.
  merge_group:
  # Manual runs from the Actions tab.
  workflow_dispatch:
  # TEMP(testing): run on pushes to this branch pre-merge. REMOVE before merging.
  push:
    branches:
      - ci_run_tests_gpu

# The workflow token is limited to read-only access to repository contents.
permissions:
  contents: read

# One active run per ref: a newer run for the same ref cancels the one in progress.
concurrency:
  group: "gpu-tests-${{ github.ref }}"
  cancel-in-progress: true

env:
  # Inputs to the Vast offer search. The full filter (RTX 5090, >=16 cores, >=96GB RAM since
  # the prover suite proves real ELFs and needs headroom, >=64GB disk, verified + rentable,
  # Blackwell-capable driver, price <= cap) is assembled in the "Pick a Vast offer" step.
  GPU_NAME: "RTX_5090"
  PRICE_CAP: "1"
  VAST_IMAGE_DISK: "64"
  # Label stamped on the rented instance; unique per run and attempt, so teardown can find
  # the box by label even when its id was never recorded.
  RUN_LABEL: "gpu-tests-${{ github.run_id }}-${{ github.run_attempt }}"
  # The Vast CLI is installed from this exact commit: a commit hash is immutable, whereas a
  # PyPI version can be re-published, so this avoids pulling untrusted code at run time.
  VAST_CLI_COMMIT: "28494d92c6c03d887f8375085243c22eb68c5874"

jobs:
  gpu-tests:
    runs-on: ubuntu-latest
    # Budget covers provisioning, the cuda builds and all 5 test groups; the prover suite
    # (single-threaded, real ELF proves) is the dominant cost. The ceiling is deliberately
    # generous, and the teardown step destroys the box regardless of how the job ends.
    timeout-minutes: 240
    steps:
- name: Install Vast CLI
  # This step's env carries no secrets, so code executed during `pip install` cannot read
  # the API key. The CLI is installed from the immutable commit in VAST_CLI_COMMIT for the
  # same reason.
  # --break-system-packages: the ephemeral runner's Python may be PEP-668 "externally
  # managed"; overriding that is safe on a disposable runner.
  run: |
    pip install --quiet --break-system-packages \
      "git+https://github.com/vast-ai/vast-cli.git@${VAST_CLI_COMMIT}"

- name: Authenticate Vast CLI
  # Hands the API key to the CLI once; later steps call `vastai` without the secret in
  # their env.
  env:
    VAST_API_KEY: ${{ secrets.VAST_API_KEY }}
  run: |
    vastai set api-key "$VAST_API_KEY"

- name: Generate ephemeral SSH key
  id: sshkey
  run: |
    # Create a passphrase-less ed25519 keypair for this run and publish the private-key
    # path as the `key_path` step output.
    key_file="$HOME/.ssh/vast_gpu_tests"
    mkdir -p "$(dirname "$key_file")"
    ssh-keygen -t ed25519 -N "" -C "gh-actions-gpu-tests-${GITHUB_RUN_ID}" -f "$key_file" >/dev/null
    printf 'key_path=%s\n' "$key_file" >> "$GITHUB_OUTPUT"

- name: Pick a Vast offer
  id: offer
  env:
    # Retry the same query to ride out transient scarcity (RTX 5090s are a small,
    # fast-churning pool). Total wait ~= ATTEMPTS * INTERVAL.
    OFFER_ATTEMPTS: "10"
    OFFER_INTERVAL: "30"
    # Require driver >= this major so cudarc matches the runtime driver (older drivers
    # lack newer symbols and the GPU path falls back to CPU). Filtered client-side in jq
    # because vast can't numerically compare the driver_version string server-side.
    MIN_DRIVER: "580"
  run: |
    # Searches Vast for a matching offer and exports its `id` and `price` as step outputs;
    # fails the step if nothing matches after OFFER_ATTEMPTS tries.
    # cpu_ram filter is in GB.
    QUERY="gpu_name=${GPU_NAME} num_gpus=1 cpu_cores_effective>=16 cpu_ram>=96 disk_space>=64 verified=true rentable=true cuda_max_good>=12.8 dph_total<=${PRICE_CAP}"
    echo "Query: $QUERY (+ client-side driver_version major >= $MIN_DRIVER)"
    # Keep only offers whose driver major >= MIN_DRIVER, then most expensive first (within
    # the cap) — premium hosts have faster disks/network and better reliability.
    SELECT="map(select((try (.driver_version|split(\".\")[0]|tonumber) catch 0) >= ${MIN_DRIVER})) | sort_by(.dph_total) | reverse"
    OFFER_ID=""
    OFFER_PRICE=""
    for attempt in $(seq 1 "$OFFER_ATTEMPTS"); do
      # A failed search is tolerated (`|| true`), but it can leave offers.json holding
      # something jq cannot process (non-JSON text, an error object). The step shell runs
      # with `-e`, so an unguarded `VAR=$(jq ...)` would abort the whole step on the first
      # bad response and defeat the retry loop. Guard jq so a parse error just means
      # "no offer this attempt".
      vastai search offers "$QUERY" --raw -o dph_total > offers.json || true
      OFFER_ID=$(jq -r "$SELECT | .[0].id // empty" offers.json 2>/dev/null || true)
      OFFER_PRICE=$(jq -r "$SELECT | .[0].dph_total // empty" offers.json 2>/dev/null || true)
      if [ -n "$OFFER_ID" ]; then
        echo "Selected offer $OFFER_ID at \$${OFFER_PRICE}/hr (attempt $attempt)"
        break
      fi
      echo "No matching offer (attempt $attempt/$OFFER_ATTEMPTS); retrying in ${OFFER_INTERVAL}s..."
      sleep "$OFFER_INTERVAL"
    done
    if [ -z "$OFFER_ID" ]; then
      echo "::error::No RTX 5090 offer matched after $OFFER_ATTEMPTS attempts (>=16 cores, >=96GB RAM, >=64GB disk, driver>=${MIN_DRIVER}, <= \$${PRICE_CAP}/hr)"
      exit 1
    fi
    echo "id=$OFFER_ID" >> "$GITHUB_OUTPUT"
    echo "price=$OFFER_PRICE" >> "$GITHUB_OUTPUT"

- name: Create instance
  id: instance
  env:
    OFFER_ID: ${{ steps.offer.outputs.id }}
    VAST_TEMPLATE_HASH: ${{ secrets.VAST_TEMPLATE_HASH }}
  run: |
    # Rent the selected offer from the template, tagged with this run's label.
    create_args=(
      "$OFFER_ID"
      --template_hash "$VAST_TEMPLATE_HASH"
      --disk "$VAST_IMAGE_DISK"
      --label "$RUN_LABEL"
      --ssh --direct --raw
    )
    vastai create instance "${create_args[@]}" > create.json
    # Print just the two fields of interest; the complete --raw response might include
    # something sensitive, so it is never echoed wholesale.
    jq '{success, new_contract: (.new_contract // .instances.new_contract)}' create.json
    IID=$(jq -r '.new_contract // .instances.new_contract // empty' create.json)
    if [ -z "$IID" ]; then
      echo "::error::Failed to create Vast instance"
      exit 1
    fi
    # Write the id to disk before anything else can fail, so the teardown step can always
    # find the box to destroy.
    printf '%s\n' "$IID" > "$RUNNER_TEMP/vast_instance_id"
    printf 'id=%s\n' "$IID" >> "$GITHUB_OUTPUT"
    echo "Created instance $IID (label $RUN_LABEL)"

- name: Attach SSH key to instance
  env:
    IID: ${{ steps.instance.outputs.id }}
    KEY: ${{ steps.sshkey.outputs.key_path }}
  run: |
    # The ephemeral public key is attached to this one instance only (it lands in the
    # instance's authorized_keys) and disappears when the instance is destroyed, so there
    # is no account-level key to clean up afterwards.
    # The attach can be rejected right after create, hence the retry loop.
    pubkey="$(cat "$KEY.pub")"
    tries=12
    for ((n = 1; n <= tries; n++)); do
      if vastai attach ssh "$IID" "$pubkey"; then
        echo "Attached ssh key (attempt $n)"
        exit 0
      fi
      echo "attach failed (attempt $n/12); retrying in 10s..."
      sleep 10
    done
    echo "::error::Failed to attach ssh key to instance $IID"
    exit 1

- name: Wait for SSH
  id: ssh
  env:
    IID: ${{ steps.instance.outputs.id }}
    # Passed through env (like the other steps) rather than interpolated into the script.
    KEY: ${{ steps.sshkey.outputs.key_path }}
  run: |
    # Phase 1: poll Vast until the instance is 'running' and exposes a direct SSH endpoint,
    # then export it as the `host` / `port` step outputs.
    # Phase 2: poll until sshd on that endpoint accepts the ephemeral key.
    echo "Waiting for instance $IID to reach 'running' with SSH endpoint..."
    STATUS=""; HOST=""; PORT=""
    # The base CUDA image is large; some hosts sit in 'loading' (image pull) a while.
    for _ in $(seq 1 180); do # ~30 min
      # A failed `show instance` is tolerated, but may leave inst.json unparseable. The
      # step shell runs with `-e`, so each jq call is guarded: a parse error must read as
      # "not ready yet" and keep polling, not abort the step mid-wait.
      vastai show instance "$IID" --raw > inst.json || true
      STATUS=$(jq -r '.actual_status // empty' inst.json 2>/dev/null || true)
      # We create with --direct, so SSH straight to the public IP + the host port mapped
      # to container port 22 (the .ssh_host/.ssh_port proxy fields are unreliable).
      HOST=$(jq -r '.public_ipaddr // empty' inst.json 2>/dev/null || true)
      PORT=$(jq -r '.ports["22/tcp"][0].HostPort // empty' inst.json 2>/dev/null || true)
      echo " status=$STATUS ssh=$HOST:$PORT"
      if [ "$STATUS" = "running" ] && [ -n "$HOST" ] && [ -n "$PORT" ]; then
        break
      fi
      sleep 10
    done
    if [ "$STATUS" != "running" ] || [ -z "$HOST" ] || [ -z "$PORT" ]; then
      echo "::error::Instance never became reachable (status=$STATUS host=$HOST port=$PORT)"
      exit 1
    fi
    echo "host=$HOST" >> "$GITHUB_OUTPUT"
    echo "port=$PORT" >> "$GITHUB_OUTPUT"

    # Wait for sshd to accept our key.
    for _ in $(seq 1 30); do
      if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes \
        -i "$KEY" -p "$PORT" "root@$HOST" true 2>/dev/null; then
        echo "sshd reachable"; exit 0
      fi
      sleep 10
    done
    echo "::error::sshd did not accept connections in time"
    exit 1

- name: Wait for onstart provisioning
  env:
    HOST: ${{ steps.ssh.outputs.host }}
    PORT: ${{ steps.ssh.outputs.port }}
    KEY: ${{ steps.sshkey.outputs.key_path }}
  run: |
    SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
    echo "Waiting for the template onstart script to finish (Rust + LLVM + sysroot + clone)..."
    # Two completion signals are polled each round, in this order:
    #  1. the bootstrap's last stdout line, "=== done ===", in /var/log/onstart.log (where
    #     Vast captures onstart output);
    #  2. failing that, the artifacts the bootstrap leaves behind (cargo, sysroot, repo clone).
    # shellcheck disable=SC2016 # $HOME must expand on the remote box, not the runner
    ARTIFACT_CHECK='test -x "$HOME/.cargo/bin/cargo" && test -f /opt/lambda-vm-sysroot/include/stdlib.h && test -d /workspace/lambda_vm/.git'
    polls=0
    while [ "$polls" -lt 120 ]; do # ~20 min
      if $SSH 'grep -q "=== done ===" /var/log/onstart.log 2>/dev/null'; then
        echo "onstart reported done"
        exit 0
      fi
      if $SSH "$ARTIFACT_CHECK"; then
        echo "provisioning artifacts present"
        exit 0
      fi
      polls=$((polls + 1))
      sleep 10
    done
    echo "::error::onstart provisioning did not complete in time"
    exit 1

# Checks out the ref under test on the rented box and runs scripts/gpu_test.sh there over SSH.
# The combined remote output is streamed to the step log and saved to
# $RUNNER_TEMP/gpu_test_out.txt; the step fails when the remote command fails.
- name: Run GPU tests
id: tests
env:
HOST: ${{ steps.ssh.outputs.host }}
PORT: ${{ steps.ssh.outputs.port }}
KEY: ${{ steps.sshkey.outputs.key_path }}
# merge_group: refs/heads/gh-readonly-queue/main/pr-… (the merge commit = PR + main),
# so we test exactly what will land. workflow_dispatch: the chosen branch ref.
REF: ${{ github.ref }}
run: |
# Base ssh command line. It is expanded unquoted below, so it word-splits into arguments.
SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -o BatchMode=yes -i $KEY -p $PORT root@$HOST"
# Defense-in-depth: never interpolate an unvalidated ref into the remote `bash -lc`.
# Rejects an empty ref and any ref containing a character outside A-Z a-z 0-9 . _ / -
case "$REF" in
''|*[!A-Za-z0-9._/-]*) echo "::error::invalid ref: '$REF'"; exit 1 ;;
esac
# Check out the ref under test on the box, then run the CUDA test groups.
# The trailing backslashes are inside double quotes, so the runner shell joins these lines
# into one command string, with $REF expanded here on the runner. `set -e` makes the remote
# shell stop at the first failing command.
REMOTE="set -e; cd /workspace/lambda_vm; \
git fetch --force origin '$REF'; \
git checkout -f FETCH_HEAD; \
CUDARC_PIN=cuda-12080 SYSROOT_DIR=/opt/lambda-vm-sysroot bash scripts/gpu_test.sh"

# pipefail so a test failure on the box propagates through the tee pipe and FAILS this
# step (which fails the job and blocks the merge), instead of being masked by tee.
# 2>&1 so remote stderr (build errors, panics) is captured too — both into the live
# step log and the file the run-summary step tails.
set -o pipefail
# NOTE(review): `bash -lc` presumably uses a login shell so the box's profile puts cargo on
# PATH — confirm against the template onstart script.
$SSH "bash -lc \"$REMOTE\"" 2>&1 | tee "$RUNNER_TEMP/gpu_test_out.txt"

# Appends a Markdown report of the GPU test run to the job summary ($GITHUB_STEP_SUMMARY).
# Runs only when the tests step actually finished with success or failure (not when it was
# skipped or cancelled), even if the job as a whole is failing.
- name: Write run summary
if: always() && (steps.tests.outcome == 'success' || steps.tests.outcome == 'failure')
env:
OUTCOME: ${{ steps.tests.outcome }}
run: |
# Captured output of the "Run GPU tests" step (written there via tee).
OUT="$RUNNER_TEMP/gpu_test_out.txt"
{
echo "## GPU tests (CUDA suite) — ${OUTCOME}"
if [ "$OUTCOME" = "success" ]; then
echo "All GPU test groups passed."
else
# Group the failed tests under the make target that ran them: gpu_test.sh prints
# "=== make <target> ===" before each group, and cargo prints "test <name> ... FAILED".
# Parse errors are swallowed (2>/dev/null || true) so a missing log cannot fail this step.
report=$(awk '
/^=== make / { grp=$3; next }
/ \.\.\. FAILED/ { fails[grp]=fails[grp] "\n - " $2; n[grp]++ }
END { for (g in fails) printf "- **%s** (%d failed):%s\n", g, n[g], fails[g] }
' "$OUT" 2>/dev/null || true)
# Per-test panic/assertion messages: each "thread '…' panicked at …:" block plus
# its following message lines (assertion, left/right), capped per block.
# A block ends at a "note: run with" line, a "----" rule, a "test " line, an "=== " marker
# or a blank line; at most 14 message lines are kept before a truncation marker is added.
details=$(awk '
/^thread .* panicked at / { cap=1; lines=0; buf=$0; next }
cap {
if ($0 ~ /^note: run with/ || $0 ~ /^----/ || $0 ~ /^test / || $0 ~ /^=== / || $0 ~ /^[[:space:]]*$/) { printf "%s\n\n", buf; cap=0; next }
if (lines < 14) { buf=buf "\n" $0; lines++ } else if (lines==14) { buf=buf "\n ...(truncated)"; lines++ }
}
END { if (cap) printf "%s\n", buf }
' "$OUT" 2>/dev/null || true)
# Panic details are shown only alongside a parsed per-test failure list.
if [ -n "$report" ]; then
echo; echo "### Failed tests by group"; echo "$report"
if [ -n "$details" ]; then
echo; echo "### Failure details"; echo '```'; echo "$details"; echo '```'
fi
else
# No per-test failures parsed (likely a build/infra error) — fall back to the
# failed-group markers plus a short log tail.
grps=$(grep -F '::error::GPU test group failed:' "$OUT" 2>/dev/null | sed 's/.*failed: /- /' | sort -u || true)
[ -n "$grps" ] && { echo; echo "### Failed groups"; echo "$grps"; }
echo; echo "No individual test failures parsed (build/infra error?). Last lines:"
echo '```'; tail -n 40 "$OUT" 2>/dev/null || echo "(no output captured)"; echo '```'
fi
echo; echo "<sub>Full output is in the \"Run GPU tests\" step log.</sub>"
fi
} >> "$GITHUB_STEP_SUMMARY"

# --- Teardown: ALWAYS destroy the instance (cost guardrail) ---
- name: Destroy instance
  if: always()
  run: |
    # destroy <instance-id>: up to 3 attempts, 10s apart, so a transient network/auth
    # failure doesn't strand a paid box. Emits a warning (not an error) if all attempts fail.
    # --yes: skip the interactive [y/N] confirm (CI has no tty).
    destroy() {
      local target="$1" try
      for try in 1 2 3; do
        vastai destroy instance "$target" --yes && return 0
        echo "destroy attempt $try failed; retrying in 10s..."
        sleep 10
      done
      echo "::warning::Failed to destroy instance $target after 3 attempts — check the Vast console (label $RUN_LABEL)"
    }

    id_file="$RUNNER_TEMP/vast_instance_id"
    if [ ! -f "$id_file" ]; then
      # The id file only exists once create has succeeded AND its JSON has been parsed, so a
      # cancelled run or a failed parse can leave a box running with no recorded id. In that
      # case look the box up by this run's unique RUN_LABEL, so it cannot leak and keep billing.
      echo "No instance id recorded; searching Vast for any box labelled $RUN_LABEL..."
      vastai show instances --raw > all_inst.json 2>/dev/null || echo '[]' > all_inst.json
      LEAKED=$(jq -r --arg L "$RUN_LABEL" \
        '(if type=="array" then . else (.instances // []) end) | .[] | select(.label == $L) | .id' \
        all_inst.json 2>/dev/null || true)
      if [ -n "$LEAKED" ]; then
        for IID in $LEAKED; do
          echo "Destroying leaked instance $IID (label $RUN_LABEL)"
          destroy "$IID"
        done
      else
        echo "No instance labelled $RUN_LABEL found; nothing to destroy."
      fi
    else
      IID=$(cat "$id_file")
      echo "Destroying instance $IID"
      destroy "$IID"
    fi
23 changes: 22 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
compile-programs compile-recursion-elfs clean-asm clean-rust clean-bench clean-shared \
clean-recursion-elfs clean test test-asm \
test-rust test-executor test-flamegraph flamegraph-prover \
test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \
test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration test-cuda-fallback \
test-prover-cuda test-prover-comprehensive-cuda \
bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
update-ethrex-fixture-checksums check-ethrex-fixture-checksums

Expand Down Expand Up @@ -284,6 +285,26 @@ test-cuda-integration:
cargo test -p lambda-vm-prover --release --features cuda \
--test cuda_path_integration -- --ignored --nocapture

# Exercises the GPU error path (needs an NVIDIA GPU and nvcc): cuda dispatch errors are
# forced, and the tests assert that the CPU fallback still yields a verifying proof.
test-cuda-fallback:
	cargo test --release -p lambda-vm-prover --features test-cuda-faults \
		--test cuda_fallback_tests -- --ignored --nocapture --test-threads=1

# Runs the prover/stark/crypto/ecsm tests with the GPU (cuda) path switched on; needs an
# NVIDIA GPU and nvcc. This is the GPU CI counterpart of the sharded prover tests in CPU CI.
# Tests run on a single thread because the GPU serializes proves and the dispatch counters
# are process-global. Enabling cuda on the prover cascades to stark, while crypto and ecsm
# are built without it (they have no GPU path).
test-prover-cuda:
	cargo test --release \
		-p lambda-vm-prover -p stark -p crypto -p ecsm \
		--features lambda-vm-prover/cuda \
		-- --test-threads=1

# Runs the comprehensive all-instructions prove on the GPU path; the test is ignored by
# default, so it is selected by name with --ignored. Needs an NVIDIA GPU and nvcc.
# GPU counterpart of the merge-queue-only comprehensive job in CPU CI.
test-prover-comprehensive-cuda:
	cargo test --release -p lambda-vm-prover --features cuda \
		test_prove_elfs_all_instructions_64_full \
		-- --ignored --test-threads=1 --nocapture

# math-cuda quick microbench (median of 10 runs)
bench-math-cuda:
cargo test -p math-cuda --release --test bench_quick -- --ignored --nocapture
Expand Down
Loading
Loading