From ae9e2006fd9348b7a7aaf6dae6433e6b57de3c55 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Fri, 26 Jun 2026 18:00:39 -0300 Subject: [PATCH 01/16] feat: recursion profiling + measurement programs Add the measurement/profiling harness for the in-VM STARK verifier: - `empty`-proof and `deserialize-only` bench guests + `sp1/verifier` cross-prover comparison, all exercising the no_std verifier. - Expand the recursion smoke test with PC-histogram, sampled-flamegraph, page-count, cycle-count and per-step-breakdown diagnostics, plus the `make test-profile-recursion` targets and the histogram-aggregation CI script/workflow. - Expose read-only `Executor::memory()`, `Memory::cells()` and `SymbolTable::functions()` accessors and make `flamegraph::demangle` public so the diagnostics can resolve guest PCs to functions. --- .../scripts/aggregate_recursion_histogram.py | 126 +++ .github/workflows/profile-recursion.yml | 175 ++++ Makefile | 10 +- .../deserialize-only/.cargo/config.toml | 6 + bench_vs/lambda/deserialize-only/Cargo.toml | 13 + bench_vs/lambda/deserialize-only/src/main.rs | 93 ++ bench_vs/sp1/verifier/Cargo.toml | 3 + bench_vs/sp1/verifier/program/Cargo.toml | 10 + bench_vs/sp1/verifier/program/src/main.rs | 34 + bench_vs/sp1/verifier/script/Cargo.toml | 13 + bench_vs/sp1/verifier/script/build.rs | 5 + bench_vs/sp1/verifier/script/src/main.rs | 83 ++ executor/src/elf.rs | 5 + executor/src/flamegraph.rs | 2 +- executor/src/vm/memory.rs | 7 + prover/src/tests/recursion_smoke_test.rs | 936 +++++++++++++++++- 16 files changed, 1510 insertions(+), 11 deletions(-) create mode 100755 .github/scripts/aggregate_recursion_histogram.py create mode 100644 .github/workflows/profile-recursion.yml create mode 100644 bench_vs/lambda/deserialize-only/.cargo/config.toml create mode 100644 bench_vs/lambda/deserialize-only/Cargo.toml create mode 100644 bench_vs/lambda/deserialize-only/src/main.rs create mode 100644 bench_vs/sp1/verifier/Cargo.toml create mode 100644 
bench_vs/sp1/verifier/program/Cargo.toml create mode 100644 bench_vs/sp1/verifier/program/src/main.rs create mode 100644 bench_vs/sp1/verifier/script/Cargo.toml create mode 100644 bench_vs/sp1/verifier/script/build.rs create mode 100644 bench_vs/sp1/verifier/script/src/main.rs diff --git a/.github/scripts/aggregate_recursion_histogram.py b/.github/scripts/aggregate_recursion_histogram.py new file mode 100755 index 000000000..8a12dc05e --- /dev/null +++ b/.github/scripts/aggregate_recursion_histogram.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python3 +"""Format the recursion-guest per-function profile as a Markdown PR comment. + +`test_recursion_pc_histogram` prints a per-function summary table (cycles folded +over each function's PCs, computed across the *full* histogram) followed by a +per-address detail table. We extract the per-function table — the view that +shows where the cycles actually go — and render it as Markdown. + + Top 25 functions by cycle count (aggregated over their PCs): + rank cycles % cum % PCs function (file:line) + 1 5335072 24.95% 24.95% 72 <...>::visit_seq::<...> + +Reads the test's captured output from argv[1]; writes the Markdown body to +argv[2] (or stdout). +""" + +import re +import sys + +# A per-function summary row: rank, cycles, pct%, cum%, pcs, function. +# Distinguished from the per-PC detail rows by the absence of a 0x column. 
+FN_ROW = re.compile( + r"^\s*\d+\s+(\d+)\s+([\d.]+)%\s+([\d.]+)%\s+(\d+)\s+(.*\S)\s*$" +) +FN_TABLE_START = re.compile(r"Top \d+ functions by cycle count") +PC_TABLE_START = re.compile(r"Top \d+ PCs by cycle count") +TOTAL_CYCLES = re.compile(r"Total cycles\s*:\s*(\d+)") +UNIQUE_PCS = re.compile(r"Unique PCs\s*:\s*(\d+)") +EXEC_TIME = re.compile(r"Exec time\s*:\s*(\S+)") + + +def parse(text): + total_cycles = unique_pcs = exec_time = None + rows = [] + in_fn_table = False + for line in text.splitlines(): + if total_cycles is None and (m := TOTAL_CYCLES.search(line)): + total_cycles = int(m.group(1)) + if unique_pcs is None and (m := UNIQUE_PCS.search(line)): + unique_pcs = int(m.group(1)) + if exec_time is None and (m := EXEC_TIME.search(line)): + exec_time = m.group(1) + if FN_TABLE_START.search(line): + in_fn_table = True + continue + if PC_TABLE_START.search(line): + in_fn_table = False + continue + if in_fn_table and (m := FN_ROW.match(line)): + rows.append( + { + "cycles": int(m.group(1)), + "pct": m.group(2), + "cum": m.group(3), + "pcs": int(m.group(4)), + "fn": m.group(5), + } + ) + return total_cycles, unique_pcs, exec_time, rows + + +def short(name, width=90): + return name if len(name) <= width else name[: width - 1] + "…" + + +def render(total_cycles, unique_pcs, exec_time, rows, title="Recursion guest profile"): + if not rows: + return ( + f"### {title}\n\n" + "> ⚠️ No per-function rows found in the test output — the run may " + "have failed before printing the table. 
Check the workflow logs.\n" + ) + + body = f"### {title}\n\n" + if total_cycles is not None: + body += f"**Total cycles:** {total_cycles:,}" + if unique_pcs is not None: + body += f" · **Unique PCs:** {unique_pcs:,}" + if exec_time: + body += f" · **Exec time:** {exec_time}" + body += "\n\n" + + body += f"#### Top {len(rows)} functions by cycles (folded over their PCs)\n\n" + body += "| Rank | Cycles | % | Cum % | PCs | Function |\n" + body += "|-----:|-------:|--:|------:|----:|----------|\n" + for i, r in enumerate(rows, 1): + body += ( + f"| {i} | {r['cycles']:,} | {r['pct']}% | {r['cum']}% | " + f"{r['pcs']} | `{short(r['fn'])}` |\n" + ) + + last_cum = rows[-1]["cum"] + body += ( + f"\nEach function's cycles are summed over all its program counters " + f"across the full histogram; the top {len(rows)} cover {last_cum}% of total " + f"cycles. Percentages are of total cycles.\n" + ) + return body + + +def main(): + import argparse + + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("log", help="captured test output to parse") + ap.add_argument("-o", "--out", help="write Markdown here instead of stdout") + ap.add_argument( + "-t", + "--title", + default="Recursion guest profile", + help="section heading (e.g. the test/config name)", + ) + args = ap.parse_args() + + with open(args.log, "r", errors="replace") as f: + text = f.read() + body = render(*parse(text), title=args.title) + if args.out: + with open(args.out, "w") as f: + f.write(body) + else: + sys.stdout.write(body) + + +if __name__ == "__main__": + main() diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml new file mode 100644 index 000000000..420cebfcb --- /dev/null +++ b/.github/workflows/profile-recursion.yml @@ -0,0 +1,175 @@ +name: Profile Recursion (PR) + +# Runs the recursion-guest PC histogram diagnostics (single-query and +# multi-query, in parallel via a matrix) and posts a combined per-function +# profile as a PR comment. 
Triggered by a `/profile_recursion` comment from a
+# repo member, or manually via workflow_dispatch.
+
+on:
+  workflow_dispatch:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  pull-requests: write
+
+concurrency:
+  group: profile-recursion-${{ github.event.issue.number || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  # One job per configuration; they run in parallel and each uploads a Markdown
+  # fragment artifact. The `comment` job stitches them into one PR comment.
+  profile:
+    # Skip unless: workflow_dispatch, or "/profile_recursion" comment on a PR by a member.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'issue_comment' &&
+      github.event.issue.pull_request &&
+      startsWith(github.event.comment.body, '/profile_recursion') &&
+      contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
+    runs-on: [self-hosted, bench]
+    timeout-minutes: 90
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: single-query
+            test: single
+            title: "Single query (blowup=2, 1 query)"
+          - name: multi-query
+            test: multi
+            title: "Multi query (blowup=8, 128-bit)"
+    steps:
+      - name: React to comment
+        if: github.event_name == 'issue_comment' && matrix.name == 'single-query'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.reactions.createForIssueComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: context.payload.comment.id,
+              content: 'eyes'
+            });
+
+      - name: Get PR head ref
+        id: pr-ref
+        if: github.event_name == 'issue_comment'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+        run: |
+          SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ steps.pr-ref.outputs.sha || github.sha }}
+
+      - name: Setup Rust Environment
+        uses:
./.github/actions/setup-rust
+
+      - name: Add cargo to PATH
+        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Run recursion PC histogram (${{ matrix.name }})
+        env:
+          TEST: ${{ matrix.test }}
+        run: |
+          # Self-provision the RISC-V sysroot in a user-writable dir (the default
+          # /opt path on the bench runner is root-owned); the guest ELF build the
+          # test triggers picks this up via the Makefile's `SYSROOT_DIR ?=`.
+          export SYSROOT_DIR="$HOME/.lambda-vm-sysroot"
+          set -o pipefail
+          make test-profile-recursion-$TEST 2>&1 | tee /tmp/hist.log
+
+      - name: Aggregate into a per-function fragment
+        if: always()
+        env:
+          TITLE: ${{ matrix.title }}
+        run: |
+          python3 .github/scripts/aggregate_recursion_histogram.py \
+            /tmp/hist.log --title "$TITLE" --out "/tmp/fragment-${{ matrix.name }}.md"
+          cat "/tmp/fragment-${{ matrix.name }}.md" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload fragment
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: profile-fragment-${{ matrix.name }}
+          path: /tmp/fragment-${{ matrix.name }}.md
+          retention-days: 7
+
+  # Stitch the matrix fragments into a single PR comment.
+  comment:
+    needs: profile
+    if: always() && github.event_name == 'issue_comment'
+    runs-on: [self-hosted, bench]
+    steps:
+      - name: Get PR head ref
+        id: pr-ref
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+        run: |
+          SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+
+      - name: Download fragments
+        uses: actions/download-artifact@v4
+        with:
+          path: fragments
+          pattern: profile-fragment-*
+          merge-multiple: true
+
+      - name: Assemble comment body
+        env:
+          COMMIT_SHA: ${{ steps.pr-ref.outputs.sha }}
+        run: |
+          {
+            echo "## Recursion guest profile"
+            echo
+            # Single-query first, then multi-query, then any others.
+ for frag in fragments/fragment-single-query.md \ + fragments/fragment-multi-query.md; do + [ -f "$frag" ] && { cat "$frag"; echo; } + done + echo "Commit: ${COMMIT_SHA:0:8} · Runner: self-hosted bench" + } > /tmp/profile_comment.md + cat /tmp/profile_comment.md + + - name: Comment on PR + uses: actions/github-script@v7 + with: + script: | + const fs = require('fs'); + const body = fs.readFileSync('/tmp/profile_comment.md', 'utf8'); + + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + }); + // Reuse our own marker comment so repeated /profile_recursion runs update in place. + const existing = comments.find(c => + c.user.type === 'Bot' && + c.body.includes('Recursion guest profile') + ); + if (existing) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existing.id, + body, + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + }); + } diff --git a/Makefile b/Makefile index 454eff098..30e3029da 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ .PHONY: deps deps-linux deps-macos compile-programs-asm compile-programs-rust compile-bench \ compile-programs compile-recursion-elfs clean-asm clean-rust clean-bench clean-shared \ clean-recursion-elfs clean test test-asm \ -test-rust test-executor test-flamegraph flamegraph-prover \ +test-rust test-executor test-flamegraph flamegraph-prover test-profile-recursion test-profile-recursion-single test-profile-recursion-multi \ test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \ bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \ update-ethrex-fixture-checksums check-ethrex-fixture-checksums @@ -232,6 +232,14 @@ test-rust: compile-programs-rust 
test-flamegraph: cargo test -p executor --test flamegraph +test-profile-recursion: test-profile-recursion-single test-profile-recursion-multi + +test-profile-recursion-single: compile-programs-rust + cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_1query -- --ignored --nocapture + +test-profile-recursion-multi: compile-programs-rust + cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_multiquery -- --ignored --nocapture + # Regenerate the committed ethrex block fixtures (see tooling/ethrex-fixtures). # Run after bumping the ethrex rev; README checksums are refreshed automatically. regen-ethrex-fixtures: diff --git a/bench_vs/lambda/deserialize-only/.cargo/config.toml b/bench_vs/lambda/deserialize-only/.cargo/config.toml new file mode 100644 index 000000000..be730c3ec --- /dev/null +++ b/bench_vs/lambda/deserialize-only/.cargo/config.toml @@ -0,0 +1,6 @@ +[target.riscv64im-lambda-vm-elf] +rustflags = [ + "-C", "link-arg=-e", + "-C", "link-arg=main", + "-C", "passes=lower-atomic" +] diff --git a/bench_vs/lambda/deserialize-only/Cargo.toml b/bench_vs/lambda/deserialize-only/Cargo.toml new file mode 100644 index 000000000..b4a4616f4 --- /dev/null +++ b/bench_vs/lambda/deserialize-only/Cargo.toml @@ -0,0 +1,13 @@ +[workspace] + +[package] +name = "deserialize-only-bench" +version = "0.1.0" +edition = "2024" + +[dependencies] +lambda-vm-prover = { path = "../../../prover", default-features = false } +embedded-alloc = "0.6" +riscv = { version = "0.15", features = ["critical-section-single-hart"] } +serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] } +postcard = { version = "1.0", default-features = false, features = ["alloc"] } diff --git a/bench_vs/lambda/deserialize-only/src/main.rs b/bench_vs/lambda/deserialize-only/src/main.rs new file mode 100644 index 000000000..8627776a1 --- /dev/null +++ b/bench_vs/lambda/deserialize-only/src/main.rs @@ -0,0 +1,93 @@ +//! 
Deserialize-only counterpart to the recursion guest. +//! +//! Reads the same private-input blob as `recursion-bench`, postcard-decodes +//! `(VmProof, Vec, ProofOptions)`, then commits success +//! and halts — without ever calling `verify_with_options`. The cycle delta +//! between this guest and `recursion-bench` is the actual cost of the STARK +//! verifier inside the VM (everything else being equal). + +#![no_std] +#![no_main] + +extern crate alloc; + +use alloc::vec::Vec; +use core::arch::asm; +use core::panic::PanicInfo; + +use embedded_alloc::TlsfHeap as Heap; +use lambda_vm_prover::{ProofOptions, VmProof}; +// Required to pull in the riscv crate's critical-section implementation. +use riscv as _; + +const PRIVATE_INPUT_START: usize = 0xFF000000; +const SYSCALL_COMMIT: u64 = 64; +const SYSCALL_HALT: u64 = 93; +const MAX_MEMORY_SIZE: usize = 0xC000_0000; + +#[global_allocator] +static HEAP: Heap = Heap::empty(); + +#[panic_handler] +fn panic(_info: &PanicInfo) -> ! { + loop {} +} + +fn init_allocator() { + unsafe extern "C" { + static _end: u8; + } + let heap_pos = (&raw const _end) as usize; + unsafe { HEAP.init(heap_pos, MAX_MEMORY_SIZE - heap_pos) } +} + +fn read_private_input() -> &'static [u8] { + let len = unsafe { core::ptr::read_volatile(PRIVATE_INPUT_START as *const u32) } as usize; + let data = (PRIVATE_INPUT_START + 4) as *const u8; + unsafe { core::slice::from_raw_parts(data, len) } +} + +fn commit(bytes: &[u8]) { + unsafe { + asm!( + "ecall", + in("a0") 1u64, + in("a1") bytes.as_ptr(), + in("a2") bytes.len(), + in("a7") SYSCALL_COMMIT, + ); + } +} + +fn halt() -> ! { + unsafe { + asm!( + "ecall", + in("a0") 0u64, + in("a7") SYSCALL_HALT, + options(noreturn), + ); + } +} + +#[unsafe(no_mangle)] +pub fn main() -> ! { + init_allocator(); + + let blob = read_private_input(); + let decoded: (VmProof, Vec, ProofOptions) = + postcard::from_bytes(blob).expect("failed to deserialize"); + + // Force the commit byte to depend on the actually-decoded value. 
Without + // this, LLVM at -O3 was eliding the postcard decode entirely — the only + // sinks for `decoded` were `black_box(&decoded)` (which only forces the + // *reference* to materialize, not the pointee) and `Drop`, neither of + // which require the decoded bytes to be real. With the commit byte tied + // to a deep field of the decoded value, the decode has to run. + let proof_options_byte = decoded.2.blowup_factor; + let inner_elf_byte = *decoded.1.first().unwrap_or(&0); + let marker = proof_options_byte ^ inner_elf_byte; + + commit(&[marker]); + halt() +} diff --git a/bench_vs/sp1/verifier/Cargo.toml b/bench_vs/sp1/verifier/Cargo.toml new file mode 100644 index 000000000..fc24039c2 --- /dev/null +++ b/bench_vs/sp1/verifier/Cargo.toml @@ -0,0 +1,3 @@ +[workspace] +members = ["program", "script"] +resolver = "2" diff --git a/bench_vs/sp1/verifier/program/Cargo.toml b/bench_vs/sp1/verifier/program/Cargo.toml new file mode 100644 index 000000000..7fbc9c5ce --- /dev/null +++ b/bench_vs/sp1/verifier/program/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "verifier-program" +version = "0.1.0" +edition = "2024" + +[dependencies] +sp1-zkvm = "6.0.1" +lambda-vm-prover = { path = "../../../../prover", default-features = false } +serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] } +postcard = { version = "1.0", default-features = false, features = ["alloc"] } diff --git a/bench_vs/sp1/verifier/program/src/main.rs b/bench_vs/sp1/verifier/program/src/main.rs new file mode 100644 index 000000000..c63bb67ca --- /dev/null +++ b/bench_vs/sp1/verifier/program/src/main.rs @@ -0,0 +1,34 @@ +//! SP1 guest that runs lambda-vm's `verify_with_options` on a single proof. +//! +//! Input layout (postcard-encoded `Vec` written via `SP1Stdin::write_vec`): +//! `(VmProof, Vec, ProofOptions)` +//! where the inner `Vec` is the inner program's ELF bytes. +//! +//! Output: commits `[1u8]` on successful verify; the guest panics otherwise. +//! +//! 
Caveats:
+//! - The verifier hashes through the `keccak` crate. SP1 has a Keccak
+//!   precompile but it patches `tiny-keccak`, not `keccak`. We don't patch
+//!   here, so Keccak runs as software inside the guest. Cycle counts will be
+//!   inflated by that overhead. Worth keeping in mind when interpreting the
+//!   number relative to lambda-vm's in-VM count.
+
+#![no_main]
+
+extern crate alloc;
+
+use alloc::vec::Vec;
+
+use lambda_vm_prover::{ProofOptions, VmProof};
+
+sp1_zkvm::entrypoint!(main);
+
+pub fn main() {
+    let blob = sp1_zkvm::io::read_vec();
+    let (vm_proof, inner_elf, options): (VmProof, Vec<u8>, ProofOptions) =
+        postcard::from_bytes(&blob).expect("failed to deserialize input");
+    let ok = lambda_vm_prover::verify_with_options(&vm_proof, &inner_elf, &options, None, None)
+        .expect("verify errored");
+    assert!(ok, "inner proof failed verification");
+    sp1_zkvm::io::commit_slice(&[1u8]);
+}
diff --git a/bench_vs/sp1/verifier/script/Cargo.toml b/bench_vs/sp1/verifier/script/Cargo.toml
new file mode 100644
index 000000000..3198059bd
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "verifier-script"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+sp1-sdk = { version = "6.0.1", features = ["blocking", "profiling"] }
+lambda-vm-prover = { path = "../../../../prover" }
+stark = { path = "../../../../crypto/stark" }
+postcard = { version = "1.0", features = ["alloc"] }
+
+[build-dependencies]
+sp1-build = "6.0.1"
diff --git a/bench_vs/sp1/verifier/script/build.rs b/bench_vs/sp1/verifier/script/build.rs
new file mode 100644
index 000000000..d6cf925d6
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/build.rs
@@ -0,0 +1,5 @@
+use sp1_build::build_program_with_args;
+
+fn main() {
+    build_program_with_args("../program", Default::default());
+}
diff --git a/bench_vs/sp1/verifier/script/src/main.rs b/bench_vs/sp1/verifier/script/src/main.rs
new file mode 100644
index 000000000..86e46a710
--- /dev/null
+++
b/bench_vs/sp1/verifier/script/src/main.rs @@ -0,0 +1,83 @@ +//! Host driver: prove an inner empty program on lambda-vm, then execute the +//! lambda-vm verifier inside SP1's executor, printing the cycle count. +//! +//! Set `TRACE_FILE=profiles/verifier.json` to capture a DWARF-attributed +//! profile (1 sample = 1 cycle). The output can be opened with +//! `samply load profiles/verifier.json`. + +use std::path::PathBuf; + +use sp1_sdk::blocking::{Prover, ProverClient}; +use sp1_sdk::{SP1Stdin, include_elf}; + +const VERIFIER_ELF: sp1_sdk::Elf = include_elf!("verifier-program"); + +fn workspace_root() -> PathBuf { + // CARGO_MANIFEST_DIR for this crate is `/bench_vs/sp1/verifier/script`. + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .ancestors() + .nth(4) + .expect("workspace root") + .to_path_buf() +} + +fn main() { + sp1_sdk::utils::setup_logger(); + + let root = workspace_root(); + let empty_elf_path = root + .join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench"); + assert!( + empty_elf_path.exists(), + "empty-bench ELF not found at {} — run `bash bench_vs/build_recursion_elfs.sh` first", + empty_elf_path.display(), + ); + let inner_elf = std::fs::read(&empty_elf_path).expect("read empty-bench"); + + let options = stark::proof::options::ProofOptions { + blowup_factor: 2, + fri_number_of_queries: 1, + coset_offset: 3, + grinding_factor: 1, + }; + + println!("[sp1-verifier] proving inner (empty, blowup=2, 1 query) ..."); + let inner_proof = lambda_vm_prover::prove_with_options_and_inputs( + &inner_elf, + &[], + &options, + &lambda_vm_prover::MaxRowsConfig::default(), + ) + .expect("inner prove should succeed"); + + let blob = postcard::to_allocvec(&(&inner_proof, &inner_elf, &options)) + .expect("postcard encode failed"); + println!("[sp1-verifier] postcard blob: {} bytes", blob.len()); + + let client = ProverClient::from_env(); + let mut stdin = SP1Stdin::new(); + stdin.write_vec(blob); + + println!("[sp1-verifier] executing verifier 
in SP1 ..."); + let (_, report) = client + .execute(VERIFIER_ELF.clone(), stdin) + .run() + .expect("execute failed"); + + let cycles = report.total_instruction_count(); + println!(); + println!("============================================================"); + println!(" SP1 EXECUTION SUMMARY — lambda-vm verifier inside SP1"); + println!("============================================================"); + println!(" Total cycles : {cycles}"); + println!(); + println!(" Compare against lambda-vm in-VM count (~40.5B for the same"); + println!(" proof). Both VMs target riscv64im, so word width is symmetric."); + println!(" Main remaining asymmetry: lambda-vm's KeccakPermute precompile"); + println!(" is patched on its guests but SP1 does not patch `keccak` (only"); + println!(" `tiny-keccak`), so Keccak rounds run as software in SP1 here."); + println!(); + println!(" If TRACE_FILE was set, the profile was written there."); + println!(" Render with: samply load "); + println!("============================================================"); +} diff --git a/executor/src/elf.rs b/executor/src/elf.rs index ed79fb983..da38cbbf1 100644 --- a/executor/src/elf.rs +++ b/executor/src/elf.rs @@ -557,4 +557,9 @@ impl SymbolTable { pub fn len(&self) -> usize { self.functions.len() } + + /// Borrow the full function list (sorted by address). + pub fn functions(&self) -> &[FunctionSymbol] { + &self.functions + } } diff --git a/executor/src/flamegraph.rs b/executor/src/flamegraph.rs index f9b447d19..4764d71a2 100644 --- a/executor/src/flamegraph.rs +++ b/executor/src/flamegraph.rs @@ -154,7 +154,7 @@ impl FlamegraphGenerator { /// Demangle a Rust symbol name using the official rustc-demangle crate. /// /// Uses the alternate format (`{:#}`) to omit the hash suffix for cleaner output. 
-pub(crate) fn demangle(name: &str) -> String { +pub fn demangle(name: &str) -> String { // Use rustc-demangle with alternate format to omit hash format!("{:#}", rustc_demangle(name)) } diff --git a/executor/src/vm/memory.rs b/executor/src/vm/memory.rs index f349eeae6..f3a3e622c 100644 --- a/executor/src/vm/memory.rs +++ b/executor/src/vm/memory.rs @@ -218,6 +218,13 @@ impl Memory { Ok(self.public_output.clone()) } + /// Read-only access to the underlying 4-byte cell map. Exposed for + /// diagnostic tooling (e.g. counting the distinct 4 KB memory pages a + /// program touches) — not part of the normal execution interface. + pub fn cells(&self) -> &U64HashMap<[u8; 4]> { + &self.cells + } + /// Pre-loads private input bytes at `PRIVATE_INPUT_START_INDEX` as a /// 4-byte LE length prefix followed by the raw data. The guest reads these /// bytes directly via normal RISC-V loads (ZisK-style memory-mapped input). diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index 7bcd4bd3d..a39bcfc90 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -2,7 +2,7 @@ //! //! Each test: //! 1. Proves an inner program on the host. -//! 2. Serializes `(VmProof, inner_elf, opts)` with postcard. +//! 2. Serializes `(VmProof, inner_elf)` with postcard. //! 3. Hands that as private input to the recursion guest. //! 4. Either **proves** the recursion guest's execution (memory-bounded via //! continuations) and verifies the outer proof (`OuterMode::Prove`), or @@ -12,6 +12,7 @@ //! //! The guest ELFs are assumed built by `make compile-recursion-elfs`. +use std::ops::ControlFlow; use std::path::PathBuf; fn workspace_root() -> PathBuf { @@ -33,10 +34,10 @@ fn read_guest_elf(root: &std::path::Path, name: &str) -> Vec { } /// Minimum-security FRI parameters: blowup=2, a single FRI query. 
Security is -/// intentionally terrible — used by the capacity-probing test, where the goal -/// is the smallest possible inner proof, not a sound one. -/// (`GoldilocksCubicProofOptions::with_blowup` derives a query count from a -/// 128-bit target, far more than we want here.) +/// intentionally terrible — used by the capacity-probing test and every cheap +/// diagnostic below, where the goal is the smallest possible inner proof, not +/// a sound one. (`GoldilocksCubicProofOptions::with_blowup` derives a query +/// count from a 128-bit target, far more than we want here.) const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions = stark::proof::options::ProofOptions { blowup_factor: 2, @@ -46,10 +47,10 @@ const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions = }; /// Prove `inner_elf` (fed `inner_input`) under `opts`, then package -/// `(proof, elf, opts)` into the postcard blob the recursion guest consumes as -/// its private input. `tag` prefixes the progress lines. Returns the inner -/// proof — callers that re-verify it on the host need it — next to the encoded -/// blob. +/// `(proof, elf, opts)` into the postcard blob the recursion and +/// deserialize-only guests consume as their private input. `tag` prefixes the +/// progress lines. Returns the inner proof — callers that re-verify it on the +/// host need it — next to the encoded blob. fn prove_inner_and_encode_blob( tag: &str, inner_elf: &[u8], @@ -148,6 +149,132 @@ fn prove_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]) committed } +/// Stream a guest's execution via `Executor::resume()`, calling `on_log` for +/// every `Log` without ever buffering the full log stream (`Executor::run` +/// would accumulate tens of millions of `Log`s and OOM even a 125 GB box). +/// `on_log` returns `ControlFlow::Break(())` to stop the run early (e.g. once a +/// cycle budget is hit); `Continue(())` to keep going. 
`on_progress(chunks, +/// total_cycles, elapsed)` fires once per resumed chunk; callers throttle and +/// format their own progress lines. Returns `(total_cycles, wall_time)` — +/// `total_cycles` counts logs actually visited, so it is exact even when a run +/// breaks mid-chunk. +fn drive_executor( + executor: &mut executor::vm::execution::Executor, + mut on_log: impl FnMut(&executor::vm::logs::Log) -> ControlFlow<()>, + mut on_progress: impl FnMut(usize, u64, std::time::Duration), +) -> (u64, std::time::Duration) { + let start = std::time::Instant::now(); + let mut total_cycles: u64 = 0; + let mut chunks: usize = 0; + while let Some(logs) = executor.resume().expect("executor resume failed") { + let mut stop = false; + for log in logs { + total_cycles += 1; + if on_log(log).is_break() { + stop = true; + break; + } + } + chunks += 1; + on_progress(chunks, total_cycles, start.elapsed()); + if stop { + break; + } + } + (total_cycles, start.elapsed()) +} + +/// Resolve a guest PC to its (demangled) enclosing function name using the +/// ELF's own symbol table — the same data `executor::flamegraph` resolves +/// against. `` when no function symbol covers the PC (e.g. PLT stubs +/// or a release build that dropped symbols). No file:line: the symbol table +/// carries function ranges only, not DWARF line info. +fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String { + symbols.lookup(pc).map_or_else( + || "".to_string(), + |s| executor::flamegraph::demangle(&s.name), + ) +} + +/// Print a PC histogram as two tables: a per-function summary (the cycles each +/// resolved function accounts for, folded over all its PCs) followed by the +/// top-100 per-address detail. `pc_hist` maps program counter → cycle count. +/// +/// The per-function view is the one that matters: an inlined kernel is spread +/// across dozens of PCs, so the raw per-address table scatters its true cost. 
+fn print_pc_histogram(
+    title: &str,
+    symbols: &executor::elf::SymbolTable,
+    pc_hist: std::collections::HashMap<u64, u64>,
+    total_cycles: u64,
+    exec_time: std::time::Duration,
+) {
+    let mut entries: Vec<(u64, u64)> = pc_hist.into_iter().collect();
+    entries.sort_unstable_by_key(|(_pc, count)| std::cmp::Reverse(*count));
+
+    // Aggregate the full histogram by resolved function, resolving each PC once.
+    let mut by_function: std::collections::HashMap<String, (u64, u64)> =
+        std::collections::HashMap::new();
+    for (pc, count) in &entries {
+        let entry = by_function
+            .entry(resolve_pc(symbols, *pc))
+            .or_insert((0, 0));
+        entry.0 += *count; // cycles
+        entry.1 += 1; // distinct PCs folded into this function
+    }
+    let mut fn_entries: Vec<(String, (u64, u64))> = by_function.into_iter().collect();
+    fn_entries.sort_unstable_by_key(|(_name, (cycles, _pcs))| std::cmp::Reverse(*cycles));
+
+    let pct = |n: u64| 100.0 * (n as f64) / (total_cycles as f64);
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!(" {title}");
+    eprintln!("============================================================");
+    eprintln!(" Total cycles : {total_cycles}");
+    eprintln!(" Unique PCs : {}", entries.len());
+    eprintln!(" Exec time : {exec_time:?}");
+    eprintln!();
+    eprintln!(" Top 25 functions by cycle count (aggregated over their PCs):");
+    eprintln!(
+        " {:>4} {:>14} {:>7} {:>7} {:>5} {}",
+        "rank", "cycles", "%", "cum %", "PCs", "function"
+    );
+    let mut fn_cumulative: u64 = 0;
+    for (rank, (name, (cycles, pcs))) in fn_entries.iter().take(25).enumerate() {
+        fn_cumulative += cycles;
+        eprintln!(
+            " {:>4} {:>14} {:>6.2}% {:>6.2}% {:>5} {}",
+            rank + 1,
+            cycles,
+            pct(*cycles),
+            pct(fn_cumulative),
+            pcs,
+            name,
+        );
+    }
+    eprintln!();
+    eprintln!(" Top 100 PCs by cycle count (per-address detail):");
+    eprintln!(
+        " {:>4} {:>18} {:>14} {:>7} {:>7} {}",
+        "rank", "pc", "cycles", "%", "cum %", "function"
+    );
+    let mut cumulative: u64 = 0;
+    for (rank, (pc,
count)) in entries.iter().take(100).enumerate() { + cumulative += count; + eprintln!( + " {:>4} {:#018x} {:>14} {:>6.2}% {:>6.2}% {}", + rank + 1, + pc, + count, + pct(*count), + pct(cumulative), + resolve_pc(symbols, *pc), + ); + } + eprintln!("============================================================"); +} + /// Core pipeline: prove an inner program with the given options, hand the /// proof+ELF+options to the recursion guest, then take the guest to `mode` /// (execute-only or full prove) and assert it committed the `[1]` success @@ -336,6 +463,797 @@ fn test_recursion_prove_1query() { ); } +/// Diagnostic: build the inner proof and dump the recursion guest's private-input +/// blob to `/tmp/recursion_input.bin` so the CLI's `execute --flamegraph` can +/// consume it. +/// +/// Usage after running this test: +/// ``` +/// cargo run -p cli --release -- execute \ +/// bench_vs/lambda/recursion/target/riscv64im-lambda-vm-elf/release/recursion-bench \ +/// --private-input /tmp/recursion_input.bin \ +/// --flamegraph /tmp/recursion_folded.txt +/// cat /tmp/recursion_folded.txt | inferno-flamegraph > /tmp/recursion_flamegraph.svg +/// ``` +#[test] +#[ignore = "diagnostic: writes recursion private input to /tmp/recursion_input.bin"] +fn test_dump_recursion_input() { + let root = workspace_root(); + let empty_elf_bytes = read_guest_elf(&root, "empty"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("dump-input", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + let path = "/tmp/recursion_input.bin"; + std::fs::write(path, &blob).expect("write blob"); + eprintln!("[dump-input] wrote {} bytes to {path}", blob.len()); +} + +/// Diagnostic: build the inner proof + recursion guest input, then **execute +/// only** the recursion guest (no STARK proving) and report cycle counts + +/// trace size estimates. 
+/// +/// This is the cheap way to find out how many RISC-V instructions the +/// verifier actually executes inside the guest — a much faster signal than +/// running the full outer prove (which can OOM on a 125 GB machine). +#[test] +#[ignore = "diagnostic: runs the executor only, prints cycle counts"] +fn test_recursion_cycle_count() { + use executor::elf::Elf; + use executor::vm::execution::Executor; + + let root = workspace_root(); + let empty_elf_bytes = read_guest_elf(&root, "empty"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion"); + + // Build the inner proof exactly as the smoke test does, with the + // absolute-minimum FRI params so the inner is as small as possible. + let (_inner_proof, blob) = + prove_inner_and_encode_blob("cycle-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + // Execute (NOT prove) the recursion guest. `drive_executor` streams chunks + // and never accumulates logs in memory — this avoids the Vec blow-up + // that OOMs even a 125 GB server (one Log is 40 B; a few billion of them is + // hundreds of GB). + eprintln!("[cycle-count] executing recursion guest (streaming counter only) ..."); + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |_log| ControlFlow::Continue(()), + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(50) { + eprintln!( + "[cycle-count] ... 
{chunks} chunks, {cycles} cycles, {elapsed:?} elapsed" + ); + } + }, + ); + let cycle_count = total_cycles as usize; + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" RECURSION GUEST EXECUTION SUMMARY"); + eprintln!("============================================================"); + eprintln!(" Cycle count : {cycle_count}"); + eprintln!(" Executor wall time : {exec_time:?}"); + eprintln!(); + eprintln!(" Rough memory estimate for outer prove:"); + let bytes_per_field = 8usize; + let approx_columns = 250usize; // CPU + MEMW + DECODE + bus columns combined + let main_trace_bytes = cycle_count * approx_columns * bytes_per_field; + let blowup = 2usize; + let lde_main_bytes = main_trace_bytes * blowup; + eprintln!( + " main trace : ~{:.2} GB ({} cycles × ~{} cols × 8 B)", + main_trace_bytes as f64 / 1e9, + cycle_count, + approx_columns + ); + eprintln!( + " main LDE (blowup={}) : ~{:.2} GB", + blowup, + lde_main_bytes as f64 / 1e9 + ); + eprintln!(" (aux trace adds roughly 50% more, so peak peak ≈ 2-3× LDE)"); + eprintln!("============================================================"); +} + +/// Diagnostic: count the distinct 4 KB memory pages the recursion guest +/// touches when verifying a small inner proof. +/// +/// We suspect the outer prover's 125 GB OOM wall is dominated by per-page +/// PAGE-table overhead. The number of PAGE tables the prover would build +/// equals the number of distinct 4 KB pages the executor touches — code, +/// heap, private input, and stack. This test surfaces that count without +/// running the prover. +/// +/// Layout (per `executor::constants` + `bench_vs/lambda/recursion/src/main.rs`): +/// - Code/static: whatever PT_LOAD segments the recursion ELF carries. +/// - Heap: `_end .. 0xC000_0000` (`MAX_MEMORY_SIZE`); `TlsfHeap` scatters +/// allocations across this region. +/// - Private input: starts at `PRIVATE_INPUT_START_INDEX = 0xFF000000`. 
+/// - Stack: top of address space (down from `STACK_TOP = 0xFFFFFFFFFFFFFFF0`). +/// +/// Interpretation (rough): +/// - <1,000 pages: PAGE-table overhead is not the bottleneck. +/// - 10k-100k pages: TLSF heap fragmentation; design a tighter bump allocator +/// and re-measure. +/// - >100k pages: postcard decode dominates; consider streaming decode. +#[test] +#[ignore = "diagnostic: counts distinct 4 KB memory pages touched by the recursion guest"] +fn test_recursion_page_count() { + use executor::elf::Elf; + use executor::vm::execution::Executor; + use executor::vm::memory::PRIVATE_INPUT_START_INDEX; + use std::collections::HashSet; + + let root = workspace_root(); + let empty_elf_bytes = read_guest_elf(&root, "empty"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("page-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + // Precompute the recursion ELF's PT_LOAD ranges so we can bucket code/ + // static pages separately from heap. `Elf::load` already expands BSS + // (memsz > filesz) into zero-valued words, so these ranges cover + // .text + .rodata + .data + .bss. + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let segment_ranges: Vec<(u64, u64)> = program + .data + .iter() + .map(|seg| (seg.base_addr, seg.base_addr + (seg.values.len() as u64 * 4))) + .collect(); + eprintln!( + "[page-count] recursion ELF: {} PT_LOAD segment(s)", + segment_ranges.len(), + ); + for (i, (lo, hi)) in segment_ranges.iter().enumerate() { + eprintln!( + "[page-count] segment[{i}]: 0x{lo:016x} .. 0x{hi:016x} ({} bytes)", + hi - lo, + ); + } + + // Stream through execution — running to completion via `Executor::run` + // would accumulate ~67 M `Log` records (~2.7 GB) we don't need. We only + // care about the *final* memory state. 
+ eprintln!("[page-count] executing recursion guest (streaming) ..."); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |_log| ControlFlow::Continue(()), + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(50) { + eprintln!( + "[page-count] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed" + ); + } + }, + ); + + // Collect the set of distinct 4 KB pages from every cell touched during + // (a) program loading, (b) private-input loading, (c) execution. + const PAGE_MASK: u64 = !0xFFFu64; + let cells = executor.memory().cells(); + let total_cells = cells.len(); + let pages: HashSet = cells.keys().map(|&a| a & PAGE_MASK).collect(); + + // Bucket by region. A "code/static" page is any page that overlaps a + // PT_LOAD segment. Stack lives near the top of the 64-bit address + // space; private input lives in the [0xFF000000, ...) window above the + // 3 GB heap ceiling. + const HEAP_CEILING: u64 = 0xC000_0000; + const STACK_FLOOR: u64 = 0xFFFF_FFFF_0000_0000; + + let mut code_pages = 0usize; + let mut heap_pages = 0usize; + let mut private_input_pages = 0usize; + let mut stack_pages = 0usize; + let mut other_pages = 0usize; + + for &page in &pages { + let page_end = page.saturating_add(0x1000); + let in_code = segment_ranges + .iter() + .any(|&(lo, hi)| page < hi && lo < page_end); + if in_code { + code_pages += 1; + } else if page >= STACK_FLOOR { + stack_pages += 1; + } else if page >= PRIVATE_INPUT_START_INDEX { + private_input_pages += 1; + } else if page < HEAP_CEILING { + heap_pages += 1; + } else { + other_pages += 1; + } + } + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" RECURSION GUEST PAGE-COUNT SUMMARY"); + eprintln!("============================================================"); + eprintln!(" Total cycles : {total_cycles}"); + eprintln!(" Executor wall time : {exec_time:?}"); + 
eprintln!(" Memory cells touched (4 B ea) : {total_cells}"); + eprintln!(" Distinct 4 KB pages touched : {}", pages.len()); + eprintln!(); + eprintln!(" Pages per region:"); + eprintln!(" code/static (ELF segments) : {code_pages}"); + eprintln!(" heap (0..0xC000_0000) : {heap_pages}"); + eprintln!(" private input (0xFF000000..) : {private_input_pages}"); + eprintln!(" stack (>= 0xFFFFFFFF_00000000) : {stack_pages}"); + if other_pages > 0 { + eprintln!(" other (unclassified) : {other_pages}"); + } + eprintln!(); + eprintln!(" Interpretation (PAGE-table overhead):"); + eprintln!(" <1k pages → PAGE overhead is not the bottleneck."); + eprintln!(" 10k-100k → TLSF heap fragmentation; try a bump alloc."); + eprintln!(" >100k → postcard decode dominates; stream-decode?"); + eprintln!("============================================================"); +} + +/// Build a PC histogram of the recursion guest verifying an `empty`-program +/// inner proof produced with `inner_proof_options`, and print it via +/// [`print_pc_histogram`] under `title`. +/// +/// `blowup_factor` and `fri_number_of_queries` are coupled (the query count is +/// derived from blowup for a fixed security target), so each `#[test]` below is +/// just this runner with a different `ProofOptions` — a single query at low +/// blowup, vs. the security-derived multi-query count at a higher blowup. +/// +/// Streams chunks of logs via `Executor::resume()` so memory stays bounded to +/// the histogram itself. Each PC is resolved to its enclosing function via the +/// in-house `executor::elf::SymbolTable` (reading the recursion ELF's symbol +/// table directly — no external tool, no DWARF dependency). 
+fn run_recursion_pc_histogram( + title: &str, + inner_proof_options: stark::proof::options::ProofOptions, +) { + use executor::elf::Elf; + use executor::vm::execution::Executor; + use std::collections::HashMap; + + let root = workspace_root(); + let empty_elf_bytes = read_guest_elf(&root, "empty"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("pc-hist", &empty_elf_bytes, &[], &inner_proof_options); + + eprintln!("[pc-hist] executing recursion guest (building PC histogram) ..."); + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + let mut pc_hist: HashMap = HashMap::with_capacity(300_000); + let unique = std::cell::Cell::new(0usize); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |log| { + *pc_hist.entry(log.current_pc).or_insert(0) += 1; + unique.set(pc_hist.len()); + ControlFlow::Continue(()) + }, + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(500) { + eprintln!( + "[pc-hist] ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}", + unique.get() + ); + } + }, + ); + + // Resolve PCs to functions directly from the ELF's symbol table. + let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes); + print_pc_histogram(title, &symbols, pc_hist, total_cycles, exec_time); +} + +/// Diagnostic: PC histogram of the recursion guest with a **single** FRI query +/// at blowup=2 — the cheapest verifier run, dominated by fixed setup cost +/// (decode, allocator, postcard) rather than per-query FRI/Merkle work. 
+#[test] +#[ignore = "diagnostic: ~8 minutes; prints PC histogram of the verifier-in-VM"] +fn test_recursion_pc_histogram_1query() { + run_recursion_pc_histogram( + "RECURSION GUEST PC HISTOGRAM (blowup=2, 1 query)", + MIN_PROOF_OPTIONS, + ); +} + +/// Diagnostic: PC histogram of the recursion guest at **128-bit security** +/// (blowup=8, FRI query count derived by the Johnson Bound Regime — tens of +/// queries). Compared against the single-query runs, weight shifts toward the +/// verifier's per-query FRI-layer / Merkle-opening and field arithmetic. +#[test] +#[ignore = "diagnostic: heavy; PC histogram of the multi-query verifier-in-VM"] +fn test_recursion_pc_histogram_multiquery() { + let inner_proof_options = + crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid"); + run_recursion_pc_histogram( + &format!( + "RECURSION GUEST PC HISTOGRAM (blowup=8, {} queries, 128-bit)", + inner_proof_options.fri_number_of_queries + ), + inner_proof_options, + ); +} + +/// Diagnostic: build a **sampled** call-stack histogram of the recursion guest. +/// +/// Like `test_recursion_pc_histogram` but groups by full call stack (not PC). +/// To stay fast, only every `SAMPLE_RATE`-th log is recorded into the histogram. +/// The call stack itself is updated on every log (skipping would corrupt it). +/// +/// Output is written to `/tmp/recursion_folded_sampled.txt` in +/// inferno-flamegraph "folded stacks" format. Pipe it through: +/// +/// cat /tmp/recursion_folded_sampled.txt | inferno-flamegraph > svg.svg +/// +/// Expect ~10-20 minutes for SAMPLE_RATE=100 on a 40B-cycle guest. +#[test] +#[ignore = "diagnostic: sampled flamegraph for the verifier-in-VM"] +fn test_recursion_sampled_flamegraph() { + use executor::elf::Elf; + use executor::flamegraph::FlamegraphGenerator; + use executor::vm::execution::Executor; + use std::io::BufWriter; + + /// 1 in N logs is fed to `process_logs`, which both updates the call + /// stack and records a sample. 
At 1, every cycle goes through — the call + /// stack stays exactly in sync with execution so frame widths are + /// trustworthy, but the per-cycle cost (~57µs) limits how many cycles + /// we can cover within a wall-clock budget. + /// + /// At SAMPLE_RATE > 1, every CALL/RETURN that lands on a skipped cycle + /// silently desyncs the stack, producing the "stuck-in-visit_seq" effect + /// we saw at 1:1000. Use values > 1 only when stack accuracy is + /// expendable. + const SAMPLE_RATE: usize = 1; + + /// Stop the executor early once we've covered this many cycles. + /// Set to 0 to run to completion (40B+ cycles, hours at SAMPLE_RATE=1). + /// At SAMPLE_RATE=1, ~57µs per cycle means 5M cycles ≈ 5 min wall time. + const CYCLE_BUDGET: u64 = 5_000_000; + + let root = workspace_root(); + let empty_elf_bytes = read_guest_elf(&root, "empty"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("sampled-fg", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + eprintln!("[sampled-fg] executing recursion guest (sampling 1-in-{SAMPLE_RATE}) ...",); + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes); + let entry_point = program.entry_point; + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + // Build our own instruction cache from the same segments `Executor::new` + // decodes internally. Owning it (rather than reading `executor.instructions` + // mid-loop) is what lets the per-log closure call `process_logs` without + // borrowing `executor`, which `drive_executor` holds mutably for `resume()`. 
+ let instructions = executor::vm::execution::InstructionCache::new(&program.data) + .expect("instruction cache build failed"); + + // RefCell so the per-log closure (`process_logs`, &mut self) and the + // progress closure (`write_folded`, &self) can both reach the generator — + // their calls never overlap, so the runtime borrow check never trips. + let generator = std::cell::RefCell::new(FlamegraphGenerator::new(symbols, entry_point)); + + // Path is defined here (not after the loop) so the periodic checkpoint + // writes below can target it. The final write at the end still happens. + let path = "/tmp/recursion_folded_sampled.txt"; + + let mut i = 0usize; + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |log| { + // 1-in-SAMPLE_RATE logs are fed to `process_logs`. At SAMPLE_RATE==1 + // this is the identity filter (`_ % 1 == 0`); the `#[allow]` keeps + // the general form so SAMPLE_RATE can be bumped without touching the + // body. Skipped logs lose stack accuracy — acceptable diagnostic + // quality at higher rates. + #[allow(clippy::modulo_one)] + let take = i % SAMPLE_RATE == 0; + if take { + generator + .borrow_mut() + .process_logs(std::slice::from_ref(log), &instructions) + .expect("flamegraph process_logs"); + } + i += 1; + + // Early exit once we've covered the cycle budget. The dominant hot + // kernels are ~uniform across the verifier's runtime, so a partial + // run still surfaces them. `#[allow]` lets CYCLE_BUDGET be const-0 + // (full run) without tripping clippy. + #[allow(clippy::absurd_extreme_comparisons)] + if CYCLE_BUDGET > 0 && i as u64 >= CYCLE_BUDGET { + eprintln!("[sampled-fg] hit cycle budget ({CYCLE_BUDGET} cycles), stopping early"); + ControlFlow::Break(()) + } else { + ControlFlow::Continue(()) + } + }, + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(500) { + eprintln!( + "[sampled-fg] ... 
{chunks} chunks, {cycles} cycles, {elapsed:?} elapsed" + ); + // Checkpoint: re-write the folded file in place so a killed run + // still leaves a usable (if partial) flamegraph on disk. + let file = std::fs::File::create(path).expect("create output file"); + let mut writer = BufWriter::new(file); + generator + .borrow() + .write_folded(&mut writer) + .expect("write folded output"); + } + }, + ); + + let file = std::fs::File::create(path).expect("create output file"); + let mut writer = BufWriter::new(file); + generator + .borrow() + .write_folded(&mut writer) + .expect("write folded output"); + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" SAMPLED FLAMEGRAPH SUMMARY"); + eprintln!("============================================================"); + eprintln!(" Total cycles : {total_cycles}"); + eprintln!(" Sample rate : 1 in {SAMPLE_RATE}"); + eprintln!(" Exec time : {exec_time:?}"); + eprintln!(" Output file : {path}"); + eprintln!("============================================================"); + eprintln!(); + eprintln!(" To render SVG (requires inferno):"); + eprintln!(" cat {path} | inferno-flamegraph > /tmp/recursion_flamegraph_sampled.svg"); + eprintln!("============================================================"); +} + +/// Diagnostic: host-side per-step timings for the verifier. +/// +/// Runs an inner prove (empty guest, blowup=2, 1 query) and then verifies it +/// on the host. When built with `--features stark/instruments`, the verifier +/// prints `Time spent: ...` for each of the four steps (replay challenges, +/// composition polynomial, FRI, DEEP openings) plus the step-1-replay it +/// does before step 2. Lets us see the host-side split in seconds, without +/// running anything inside the VM. 
+/// +/// Usage: +/// ``` +/// cargo test --release -p lambda-vm-prover --features stark/instruments \ +/// --lib test_host_verify_step_timings -- --ignored --nocapture +/// ``` +#[test] +#[ignore = "diagnostic: prints host-side verifier step timings"] +fn test_host_verify_step_timings() { + let root = workspace_root(); + let empty_path = + root.join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench"); + let empty_elf_bytes = std::fs::read(&empty_path).expect("read empty-bench"); + + let inner_proof_options = MIN_PROOF_OPTIONS; + + eprintln!("[host-verify] proving empty (blowup=2, fri_queries=1) ..."); + let inner_proof = crate::prove_with_options_and_inputs( + &empty_elf_bytes, + &[], + &inner_proof_options, + &crate::MaxRowsConfig::default(), + ) + .expect("inner prove should succeed"); + + eprintln!("[host-verify] verifying on host (with instruments) ..."); + let ok = crate::verify_with_options( + &inner_proof, + &empty_elf_bytes, + &inner_proof_options, + None, + None, + ) + .expect("verify errored"); + assert!(ok, "proof must verify"); + eprintln!("[host-verify] verified OK"); +} + +/// Diagnostic: cycle count for the **deserialize-only** counterpart of the +/// recursion guest. Same input layout +/// (`(VmProof, Vec, ProofOptions)`) and same proof, but +/// the guest just postcard-decodes the blob and halts — it never calls +/// `verify_with_options`. +/// +/// The cycle delta between this and `test_recursion_cycle_count` is the +/// actual cost of the STARK verifier inside the VM. Historically (40.5 B-cycle +/// recursion guest) postcard decode was ~15.6 M cycles — negligible. Now that +/// the recursion guest is ~67 M cycles, the same absolute cost would be ~23% +/// of total; this test re-measures it. 
+#[test] +#[ignore = "diagnostic: runs the deserialize-only guest, prints cycle count"] +fn test_deserialize_only_cycle_count() { + use executor::elf::Elf; + use executor::vm::execution::Executor; + + let root = workspace_root(); + let empty_elf_bytes = read_guest_elf(&root, "empty"); + let deser_elf_bytes = read_guest_elf(&root, "deserialize-only"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("deser-only", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + eprintln!("[deser-only] executing deserialize-only guest (streaming) ..."); + let program = Elf::load(&deser_elf_bytes).expect("ELF load failed"); + eprintln!( + "[deser-only] ELF: {} bytes, entry_point=0x{:x}", + deser_elf_bytes.len(), + program.entry_point, + ); + assert_ne!( + program.entry_point, 0, + "deserialize-only ELF has entry_point=0 — build artifact is malformed" + ); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |_log| ControlFlow::Continue(()), + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(50) { + eprintln!( + "[deser-only] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed" + ); + } + }, + ); + let cycle_count = total_cycles; + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" DESERIALIZE-ONLY GUEST EXECUTION SUMMARY"); + eprintln!("============================================================"); + eprintln!(" Cycle count : {cycle_count}"); + eprintln!(" Executor wall time : {exec_time:?}"); + eprintln!(); + eprintln!(" Compare against test_recursion_cycle_count (~40.5B cycles"); + eprintln!(" with the same proof). Delta = verifier-in-VM cost."); + eprintln!("============================================================"); +} + +/// Diagnostic: PC histogram for the **deserialize-only** guest. 
+/// +/// Sibling of `test_recursion_pc_histogram`, but targeting the +/// deserialize-only control guest so we can locate the hot kernel inside the +/// 15.7 M-cycle postcard decode itself. Every cycle goes through the +/// histogram (no sampling), so attribution is exact — the previous sampled +/// flamegraph at 1:1000 had broken stack reconstruction on skipped +/// CALL/RETURNs, which made it unreliable for a workload this small. +/// +/// Each top PC is resolved to its enclosing function via the in-house +/// `executor::elf::SymbolTable`, reading the guest ELF's symbol table directly +/// (no external tool, no DWARF dependency). +#[test] +#[ignore = "diagnostic: ~1 min; PC histogram for the deserialize-only guest"] +fn test_deserialize_only_pc_histogram() { + use executor::elf::Elf; + use executor::vm::execution::Executor; + use std::collections::HashMap; + + let root = workspace_root(); + let empty_elf_bytes = read_guest_elf(&root, "empty"); + let deser_elf_bytes = read_guest_elf(&root, "deserialize-only"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("deser-pc-hist", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + eprintln!("[deser-pc-hist] executing deserialize-only guest (building PC histogram) ..."); + let program = Elf::load(&deser_elf_bytes).expect("ELF load failed"); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + // ~50k unique PCs is plenty: the deserialize-only guest is ~74 KB of ELF + // (~18k 4-byte instructions); the hot inner loop is much smaller still. + let mut pc_hist: HashMap = HashMap::with_capacity(50_000); + let unique = std::cell::Cell::new(0usize); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |log| { + *pc_hist.entry(log.current_pc).or_insert(0) += 1; + unique.set(pc_hist.len()); + ControlFlow::Continue(()) + }, + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(50) { + eprintln!( + "[deser-pc-hist] ... 
{chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}", + unique.get() + ); + } + }, + ); + + // Resolve PCs to functions directly from the ELF's symbol table. + let symbols = executor::elf::SymbolTable::parse(&deser_elf_bytes); + print_pc_histogram( + "DESERIALIZE-ONLY GUEST PC HISTOGRAM", + &symbols, + pc_hist, + total_cycles, + exec_time, + ); +} + +/// Diagnostic: bucket the recursion guest's cycles by which verifier step +/// is currently executing. +/// +/// The verifier's hot path is `verify_rounds_2_to_4`, which calls four +/// sub-routines in a fixed order: +/// 1. `replay_rounds_after_round_1` (recover challenges) +/// 2. `step_2_verify_claimed_composition_polynomial` +/// 3. `step_3_verify_fri` +/// 4. `step_4_verify_trace_and_composition_openings` +/// +/// We resolve each sub-routine's entry PC from the recursion ELF's symbol +/// table, then run a monotonic state machine over the execution stream: +/// the active bucket only advances 0 → 1 → 2 → 3 → 4 (never backwards), +/// so cycles inside a step's callees stay attributed to that step. +/// +/// Bucket 0 ("setup") captures everything before step 1 is entered — the +/// allocator init, postcard decode, and `VmAirs::new` (which contains the +/// expensive preprocessed-commitment FFTs). +/// +/// Streams chunks via `Executor::resume()` so memory stays bounded. +#[test] +#[ignore = "diagnostic: ~13 min; buckets the 40B cycles by verifier step"] +fn test_recursion_step_breakdown() { + use executor::elf::{Elf, SymbolTable}; + use executor::vm::execution::Executor; + + let root = workspace_root(); + let empty_elf_bytes = read_guest_elf(&root, "empty"); + let recursion_elf_bytes = read_guest_elf(&root, "recursion"); + + let (_inner_proof, blob) = + prove_inner_and_encode_blob("step-bkd", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + + // Build a per-step "advance bucket to N" lookup. 
The verifier's step + // functions get inlined by LLVM in release mode, so we can't rely on + // matching their entry PCs directly. Instead we anchor on closures the + // compiler emits *inside* each step's body — iterator combinators like + // `.fold(|...|)` keep the step's method name as a substring in their + // mangled symbol. Any PC that resolves to a symbol containing step N's + // keyword advances the bucket to N (monotonically). + // + // If step N has no matching symbol at all (e.g. step 4 is fully inlined + // with no closure children of its own), its cycles get attributed to the + // previous bucket. We report that explicitly in the summary. + let symbols = SymbolTable::parse(&recursion_elf_bytes); + assert!( + !symbols.is_empty(), + "recursion ELF has no symbol table — was it stripped?" + ); + + let step_keywords = [ + "replay_rounds_after_round_1", + "step_2_verify_claimed_composition_polynomial", + "step_3_verify_fri", + "step_4_verify_trace_and_composition_openings", + ]; + let step_found: [bool; 4] = std::array::from_fn(|i| { + symbols + .functions() + .iter() + .any(|f| f.name.contains(step_keywords[i])) + }); + for (i, found) in step_found.iter().enumerate() { + let n_matches = symbols + .functions() + .iter() + .filter(|f| f.name.contains(step_keywords[i])) + .count(); + eprintln!( + "[step-bkd] step {}: keyword={:?} -> {} symbol(s) {}", + i + 1, + step_keywords[i], + n_matches, + if *found { + "" + } else { + "(fully inlined; will merge into the previous bucket)" + } + ); + } + + // Monotonic state machine: 0=setup, 1..=4=inside step N (or its callees / + // inlined-step-N-cycles attributed here because step N+1 is missing). + // `bucket` lives in a Cell so the per-log closure can advance it while the + // progress closure reads it for its live readout. 
+ let bucket = std::cell::Cell::new(0u8); + let mut buckets = [0u64; 5]; + + eprintln!("[step-bkd] executing recursion guest (streaming) ..."); + let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); + let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + + // Cache the last symbol-table hit so we only do a binary search on + // function transitions, not on every cycle. Functions are typically + // long-running (>>1 instruction), so this cache hits ~all of the time. + let mut last_range: Option<(u64, u64)> = None; + let mut last_advance: u8 = 0; + + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |log| { + let pc = log.current_pc; + let in_cached = matches!(last_range, Some((s, e)) if pc >= s && pc < e); + if !in_cached { + // Slow path: refresh the cache from the symbol table. + if let Some(sym) = symbols.lookup(pc) { + // SymbolTable accepts size=0 symbols as "any address >="; for + // those we'd need the next symbol's start for a real upper + // bound. Cheapest workaround: set a tiny range so we re-resolve + // soon enough that wrong attribution is bounded. + let end = sym.address + sym.size.max(1); + last_range = Some((sym.address, end)); + last_advance = 0; + for (i, kw) in step_keywords.iter().enumerate() { + if sym.name.contains(kw) { + last_advance = (i + 1) as u8; + } + } + } else { + last_range = None; + last_advance = 0; + } + } + if bucket.get() < last_advance { + bucket.set(last_advance); + } + buckets[bucket.get() as usize] += 1; + ControlFlow::Continue(()) + }, + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(500) { + eprintln!( + "[step-bkd] ... {chunks} chunks, {cycles} cycles, bucket={}, {elapsed:?}", + bucket.get() + ); + } + }, + ); + + let labels = [ + "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)", + "1. step 1: replay_rounds_after_round_1", + "2. step 2: verify_claimed_composition_polynomial", + "3. step 3: verify_fri", + "4. 
step 4: verify_trace_and_composition_openings (+ wrap-up)", + ]; + + eprintln!(); + eprintln!("============================================================"); + eprintln!(" RECURSION GUEST PER-STEP CYCLE BREAKDOWN"); + eprintln!("============================================================"); + eprintln!(" Total cycles : {total_cycles}"); + eprintln!(" Exec time : {exec_time:?}"); + eprintln!(); + eprintln!(" {:<60} {:>14} {:>7}", "bucket", "cycles", "%"); + for (label, cycles) in labels.iter().zip(buckets.iter()) { + let pct = if total_cycles > 0 { + 100.0 * (*cycles as f64) / (total_cycles as f64) + } else { + 0.0 + }; + eprintln!(" {:<60} {:>14} {:>6.2}%", label, cycles, pct); + } + eprintln!("============================================================"); +} + /// Inner program: fibonacci(10). #[test] #[ignore = "slow: memory-bounded continuation prove of the verifier-in-VM"] From 1d470670c45c79147ac2f8dd7626d1f89f4d3820 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 15:31:45 -0300 Subject: [PATCH 02/16] refactor(prover): drop per-address PC table from recursion profile The top-100 per-address table carried bare PCs with no file:line, so it was not actionable for optimization and the CI aggregator already discarded it. Keep the per-function fold (the view that matters); terminate the aggregator's function-table parse on the trailing rule instead of the removed PC header. --- .../scripts/aggregate_recursion_histogram.py | 16 ++++----- prover/src/tests/recursion_smoke_test.rs | 34 +++++-------------- 2 files changed, 16 insertions(+), 34 deletions(-) diff --git a/.github/scripts/aggregate_recursion_histogram.py b/.github/scripts/aggregate_recursion_histogram.py index 8a12dc05e..1ae34ff70 100755 --- a/.github/scripts/aggregate_recursion_histogram.py +++ b/.github/scripts/aggregate_recursion_histogram.py @@ -1,13 +1,13 @@ #!/usr/bin/env python3 """Format the recursion-guest per-function profile as a Markdown PR comment. 
-`test_recursion_pc_histogram` prints a per-function summary table (cycles folded -over each function's PCs, computed across the *full* histogram) followed by a -per-address detail table. We extract the per-function table — the view that -shows where the cycles actually go — and render it as Markdown. +`test_recursion_pc_histogram` prints a per-function summary table: the cycles +folded over each function's PCs, computed across the *full* histogram — the view +that shows where the cycles actually go. We parse that table and render it as +Markdown. Top 25 functions by cycle count (aggregated over their PCs): - rank cycles % cum % PCs function (file:line) + rank cycles % cum % PCs function 1 5335072 24.95% 24.95% 72 <...>::visit_seq::<...> Reads the test's captured output from argv[1]; writes the Markdown body to @@ -18,12 +18,12 @@ import sys # A per-function summary row: rank, cycles, pct%, cum%, pcs, function. -# Distinguished from the per-PC detail rows by the absence of a 0x column. FN_ROW = re.compile( r"^\s*\d+\s+(\d+)\s+([\d.]+)%\s+([\d.]+)%\s+(\d+)\s+(.*\S)\s*$" ) FN_TABLE_START = re.compile(r"Top \d+ functions by cycle count") -PC_TABLE_START = re.compile(r"Top \d+ PCs by cycle count") +# The "====" rule the test prints right after the (now sole) function table. 
+TABLE_END = re.compile(r"^=+\s*$") TOTAL_CYCLES = re.compile(r"Total cycles\s*:\s*(\d+)") UNIQUE_PCS = re.compile(r"Unique PCs\s*:\s*(\d+)") EXEC_TIME = re.compile(r"Exec time\s*:\s*(\S+)") @@ -43,7 +43,7 @@ def parse(text): if FN_TABLE_START.search(line): in_fn_table = True continue - if PC_TABLE_START.search(line): + if in_fn_table and TABLE_END.match(line): in_fn_table = False continue if in_fn_table and (m := FN_ROW.match(line)): diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index a39bcfc90..b437bed72 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -196,12 +196,14 @@ fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String { ) } -/// Print a PC histogram as two tables: a per-function summary (the cycles each -/// resolved function accounts for, folded over all its PCs) followed by the -/// top-100 per-address detail. `pc_hist` maps program counter → cycle count. +/// Print a per-function PC-histogram summary: the cycles each resolved function +/// accounts for, folded over all its PCs. `pc_hist` maps program counter → +/// cycle count. /// -/// The per-function view is the one that matters: an inlined kernel is spread -/// across dozens of PCs, so the raw per-address table scatters its true cost. +/// We fold by function deliberately: an inlined kernel is spread across dozens +/// of PCs, so a raw per-address table scatters its true cost — and without +/// file:line resolution a bare PC isn't actionable for optimization anyway, so +/// there is no per-address detail table. 
fn print_pc_histogram( title: &str, symbols: &executor::elf::SymbolTable, @@ -209,8 +211,7 @@ fn print_pc_histogram( total_cycles: u64, exec_time: std::time::Duration, ) { - let mut entries: Vec<(u64, u64)> = pc_hist.into_iter().collect(); - entries.sort_unstable_by_key(|(_pc, count)| std::cmp::Reverse(*count)); + let entries: Vec<(u64, u64)> = pc_hist.into_iter().collect(); // Aggregate the full histogram by resolved function, resolving each PC once. let mut by_function: std::collections::HashMap = @@ -253,25 +254,6 @@ fn print_pc_histogram( name, ); } - eprintln!(); - eprintln!(" Top 100 PCs by cycle count (per-address detail):"); - eprintln!( - " {:>4} {:>18} {:>14} {:>7} {:>7} {}", - "rank", "pc", "cycles", "%", "cum %", "function" - ); - let mut cumulative: u64 = 0; - for (rank, (pc, count)) in entries.iter().take(100).enumerate() { - cumulative += count; - eprintln!( - " {:>4} {:#018x} {:>14} {:>6.2}% {:>6.2}% {}", - rank + 1, - pc, - count, - pct(*count), - pct(cumulative), - resolve_pc(symbols, *pc), - ); - } eprintln!("============================================================"); } From a395f275a16c0c0adfbaa0be7934e8be73ca5a63 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 15:40:41 -0300 Subject: [PATCH 03/16] refactor(prover): share setup/progress across recursion diagnostics Extract setup_guest_run (blob build + ELF load + Executor::new) and a log_progress throttled-readout factory, used by the cycle-count, page-count, PC-histogram, sampled-flamegraph and step-breakdown diagnostics. Generalize the PC-histogram runner over guest name + progress stride so the deserialize-only histogram is a one-line caller instead of a near-duplicate. 
--- prover/src/tests/recursion_smoke_test.rs | 231 +++++++++-------------- 1 file changed, 88 insertions(+), 143 deletions(-) diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index b437bed72..da885a15e 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -184,6 +184,47 @@ fn drive_executor( (total_cycles, start.elapsed()) } +/// Shared preamble for every execute-only diagnostic below: build the standard +/// recursion private-input blob (an `empty`-program inner proof produced under +/// `opts`), load guest `guest_name`, and stand up an executor over it. Returns +/// the guest's raw ELF bytes (callers that resolve PCs pass them to +/// [`executor::elf::SymbolTable::parse`]), the loaded program, and the +/// ready-to-drive executor. +fn setup_guest_run( + label: &str, + guest_name: &str, + opts: &stark::proof::options::ProofOptions, +) -> ( + Vec, + executor::elf::Elf, + executor::vm::execution::Executor, +) { + let root = workspace_root(); + let empty_elf_bytes = read_guest_elf(&root, "empty"); + let guest_elf_bytes = read_guest_elf(&root, guest_name); + + let (_inner_proof, blob) = prove_inner_and_encode_blob(label, &empty_elf_bytes, &[], opts); + + let program = executor::elf::Elf::load(&guest_elf_bytes).expect("ELF load failed"); + let executor = executor::vm::execution::Executor::new(&program, blob).expect("Executor::new failed"); + (guest_elf_bytes, program, executor) +} + +/// A `drive_executor` progress callback that prints the throttled +/// `[label] ... N chunks, M cycles, T elapsed` line every `stride` chunks — +/// the readout every counting diagnostic shares. Tests that need extra live +/// state (unique PC count, active step bucket) keep their own closure instead. 
+fn log_progress( + label: &'static str, + stride: usize, +) -> impl FnMut(usize, u64, std::time::Duration) { + move |chunks, cycles, elapsed| { + if chunks.is_multiple_of(stride) { + eprintln!("[{label}] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"); + } + } +} + /// Resolve a guest PC to its (demangled) enclosing function name using the /// ELF's own symbol table — the same data `executor::flamegraph` resolves /// against. `` when no function symbol covers the PC (e.g. PLT stubs @@ -481,35 +522,20 @@ fn test_dump_recursion_input() { #[test] #[ignore = "diagnostic: runs the executor only, prints cycle counts"] fn test_recursion_cycle_count() { - use executor::elf::Elf; - use executor::vm::execution::Executor; - - let root = workspace_root(); - let empty_elf_bytes = read_guest_elf(&root, "empty"); - let recursion_elf_bytes = read_guest_elf(&root, "recursion"); - - // Build the inner proof exactly as the smoke test does, with the - // absolute-minimum FRI params so the inner is as small as possible. - let (_inner_proof, blob) = - prove_inner_and_encode_blob("cycle-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + // Build the inner proof with the absolute-minimum FRI params (smallest + // possible inner) and stand up the recursion guest over it. + let (_bytes, _program, mut executor) = + setup_guest_run("cycle-count", "recursion", &MIN_PROOF_OPTIONS); // Execute (NOT prove) the recursion guest. `drive_executor` streams chunks // and never accumulates logs in memory — this avoids the Vec blow-up // that OOMs even a 125 GB server (one Log is 40 B; a few billion of them is // hundreds of GB). 
eprintln!("[cycle-count] executing recursion guest (streaming counter only) ..."); - let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); - let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); let (total_cycles, exec_time) = drive_executor( &mut executor, |_log| ControlFlow::Continue(()), - |chunks, cycles, elapsed| { - if chunks.is_multiple_of(50) { - eprintln!( - "[cycle-count] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed" - ); - } - }, + log_progress("cycle-count", 50), ); let cycle_count = total_cycles as usize; @@ -565,23 +591,16 @@ fn test_recursion_cycle_count() { #[test] #[ignore = "diagnostic: counts distinct 4 KB memory pages touched by the recursion guest"] fn test_recursion_page_count() { - use executor::elf::Elf; - use executor::vm::execution::Executor; use executor::vm::memory::PRIVATE_INPUT_START_INDEX; use std::collections::HashSet; - let root = workspace_root(); - let empty_elf_bytes = read_guest_elf(&root, "empty"); - let recursion_elf_bytes = read_guest_elf(&root, "recursion"); - - let (_inner_proof, blob) = - prove_inner_and_encode_blob("page-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + let (_bytes, program, mut executor) = + setup_guest_run("page-count", "recursion", &MIN_PROOF_OPTIONS); // Precompute the recursion ELF's PT_LOAD ranges so we can bucket code/ // static pages separately from heap. `Elf::load` already expands BSS // (memsz > filesz) into zero-valued words, so these ranges cover // .text + .rodata + .data + .bss. - let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); let segment_ranges: Vec<(u64, u64)> = program .data .iter() @@ -602,17 +621,10 @@ fn test_recursion_page_count() { // would accumulate ~67 M `Log` records (~2.7 GB) we don't need. We only // care about the *final* memory state. 
eprintln!("[page-count] executing recursion guest (streaming) ..."); - let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); let (total_cycles, exec_time) = drive_executor( &mut executor, |_log| ControlFlow::Continue(()), - |chunks, cycles, elapsed| { - if chunks.is_multiple_of(50) { - eprintln!( - "[page-count] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed" - ); - } - }, + log_progress("page-count", 50), ); // Collect the set of distinct 4 KB pages from every cell touched during @@ -678,38 +690,33 @@ fn test_recursion_page_count() { eprintln!("============================================================"); } -/// Build a PC histogram of the recursion guest verifying an `empty`-program +/// Build a PC histogram of guest `guest_name` verifying an `empty`-program /// inner proof produced with `inner_proof_options`, and print it via /// [`print_pc_histogram`] under `title`. /// -/// `blowup_factor` and `fri_number_of_queries` are coupled (the query count is -/// derived from blowup for a fixed security target), so each `#[test]` below is -/// just this runner with a different `ProofOptions` — a single query at low -/// blowup, vs. the security-derived multi-query count at a higher blowup. +/// For the recursion guest, `blowup_factor` and `fri_number_of_queries` are +/// coupled (the query count is derived from blowup for a fixed security +/// target), so each recursion `#[test]` is just this runner with a different +/// `ProofOptions` — a single query at low blowup, vs. the security-derived +/// multi-query count at a higher blowup. The deserialize-only control guest +/// reuses the same runner with its own ELF name. /// /// Streams chunks of logs via `Executor::resume()` so memory stays bounded to /// the histogram itself. Each PC is resolved to its enclosing function via the -/// in-house `executor::elf::SymbolTable` (reading the recursion ELF's symbol -/// table directly — no external tool, no DWARF dependency). 
-fn run_recursion_pc_histogram( +/// in-house `executor::elf::SymbolTable` (reading the guest ELF's symbol table +/// directly — no external tool, no DWARF dependency). +fn run_pc_histogram( title: &str, + guest_name: &str, + progress_stride: usize, inner_proof_options: stark::proof::options::ProofOptions, ) { - use executor::elf::Elf; - use executor::vm::execution::Executor; use std::collections::HashMap; - let root = workspace_root(); - let empty_elf_bytes = read_guest_elf(&root, "empty"); - let recursion_elf_bytes = read_guest_elf(&root, "recursion"); - - let (_inner_proof, blob) = - prove_inner_and_encode_blob("pc-hist", &empty_elf_bytes, &[], &inner_proof_options); - - eprintln!("[pc-hist] executing recursion guest (building PC histogram) ..."); - let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); - let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); + let (guest_elf_bytes, _program, mut executor) = + setup_guest_run("pc-hist", guest_name, &inner_proof_options); + eprintln!("[pc-hist] executing {guest_name} guest (building PC histogram) ..."); let mut pc_hist: HashMap = HashMap::with_capacity(300_000); let unique = std::cell::Cell::new(0usize); let (total_cycles, exec_time) = drive_executor( @@ -720,7 +727,7 @@ fn run_recursion_pc_histogram( ControlFlow::Continue(()) }, |chunks, cycles, elapsed| { - if chunks.is_multiple_of(500) { + if chunks.is_multiple_of(progress_stride) { eprintln!( "[pc-hist] ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}", unique.get() @@ -730,7 +737,7 @@ fn run_recursion_pc_histogram( ); // Resolve PCs to functions directly from the ELF's symbol table. 
- let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes); + let symbols = executor::elf::SymbolTable::parse(&guest_elf_bytes); print_pc_histogram(title, &symbols, pc_hist, total_cycles, exec_time); } @@ -740,8 +747,10 @@ fn run_recursion_pc_histogram( #[test] #[ignore = "diagnostic: ~8 minutes; prints PC histogram of the verifier-in-VM"] fn test_recursion_pc_histogram_1query() { - run_recursion_pc_histogram( + run_pc_histogram( "RECURSION GUEST PC HISTOGRAM (blowup=2, 1 query)", + "recursion", + 500, MIN_PROOF_OPTIONS, ); } @@ -755,11 +764,13 @@ fn test_recursion_pc_histogram_1query() { fn test_recursion_pc_histogram_multiquery() { let inner_proof_options = crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid"); - run_recursion_pc_histogram( + run_pc_histogram( &format!( "RECURSION GUEST PC HISTOGRAM (blowup=8, {} queries, 128-bit)", inner_proof_options.fri_number_of_queries ), + "recursion", + 500, inner_proof_options, ); } @@ -779,9 +790,7 @@ fn test_recursion_pc_histogram_multiquery() { #[test] #[ignore = "diagnostic: sampled flamegraph for the verifier-in-VM"] fn test_recursion_sampled_flamegraph() { - use executor::elf::Elf; use executor::flamegraph::FlamegraphGenerator; - use executor::vm::execution::Executor; use std::io::BufWriter; /// 1 in N logs is fed to `process_logs`, which both updates the call @@ -801,18 +810,12 @@ fn test_recursion_sampled_flamegraph() { /// At SAMPLE_RATE=1, ~57µs per cycle means 5M cycles ≈ 5 min wall time. 
const CYCLE_BUDGET: u64 = 5_000_000; - let root = workspace_root(); - let empty_elf_bytes = read_guest_elf(&root, "empty"); - let recursion_elf_bytes = read_guest_elf(&root, "recursion"); - - let (_inner_proof, blob) = - prove_inner_and_encode_blob("sampled-fg", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + let (recursion_elf_bytes, program, mut executor) = + setup_guest_run("sampled-fg", "recursion", &MIN_PROOF_OPTIONS); eprintln!("[sampled-fg] executing recursion guest (sampling 1-in-{SAMPLE_RATE}) ...",); - let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes); let entry_point = program.entry_point; - let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); // Build our own instruction cache from the same segments `Executor::new` // decodes internally. Owning it (rather than reading `executor.instructions` @@ -960,18 +963,9 @@ fn test_host_verify_step_timings() { #[test] #[ignore = "diagnostic: runs the deserialize-only guest, prints cycle count"] fn test_deserialize_only_cycle_count() { - use executor::elf::Elf; - use executor::vm::execution::Executor; + let (deser_elf_bytes, program, mut executor) = + setup_guest_run("deser-only", "deserialize-only", &MIN_PROOF_OPTIONS); - let root = workspace_root(); - let empty_elf_bytes = read_guest_elf(&root, "empty"); - let deser_elf_bytes = read_guest_elf(&root, "deserialize-only"); - - let (_inner_proof, blob) = - prove_inner_and_encode_blob("deser-only", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); - - eprintln!("[deser-only] executing deserialize-only guest (streaming) ..."); - let program = Elf::load(&deser_elf_bytes).expect("ELF load failed"); eprintln!( "[deser-only] ELF: {} bytes, entry_point=0x{:x}", deser_elf_bytes.len(), @@ -981,18 +975,12 @@ fn test_deserialize_only_cycle_count() { program.entry_point, 0, "deserialize-only ELF has entry_point=0 — build artifact is malformed" ); - let mut executor = 
Executor::new(&program, blob).expect("Executor::new failed"); + eprintln!("[deser-only] executing deserialize-only guest (streaming) ..."); let (total_cycles, exec_time) = drive_executor( &mut executor, |_log| ControlFlow::Continue(()), - |chunks, cycles, elapsed| { - if chunks.is_multiple_of(50) { - eprintln!( - "[deser-only] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed" - ); - } - }, + log_progress("deser-only", 50), ); let cycle_count = total_cycles; @@ -1023,50 +1011,14 @@ fn test_deserialize_only_cycle_count() { #[test] #[ignore = "diagnostic: ~1 min; PC histogram for the deserialize-only guest"] fn test_deserialize_only_pc_histogram() { - use executor::elf::Elf; - use executor::vm::execution::Executor; - use std::collections::HashMap; - - let root = workspace_root(); - let empty_elf_bytes = read_guest_elf(&root, "empty"); - let deser_elf_bytes = read_guest_elf(&root, "deserialize-only"); - - let (_inner_proof, blob) = - prove_inner_and_encode_blob("deser-pc-hist", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); - - eprintln!("[deser-pc-hist] executing deserialize-only guest (building PC histogram) ..."); - let program = Elf::load(&deser_elf_bytes).expect("ELF load failed"); - let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); - - // ~50k unique PCs is plenty: the deserialize-only guest is ~74 KB of ELF - // (~18k 4-byte instructions); the hot inner loop is much smaller still. - let mut pc_hist: HashMap = HashMap::with_capacity(50_000); - let unique = std::cell::Cell::new(0usize); - let (total_cycles, exec_time) = drive_executor( - &mut executor, - |log| { - *pc_hist.entry(log.current_pc).or_insert(0) += 1; - unique.set(pc_hist.len()); - ControlFlow::Continue(()) - }, - |chunks, cycles, elapsed| { - if chunks.is_multiple_of(50) { - eprintln!( - "[deser-pc-hist] ... 
{chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}", - unique.get() - ); - } - }, - ); - - // Resolve PCs to functions directly from the ELF's symbol table. - let symbols = executor::elf::SymbolTable::parse(&deser_elf_bytes); - print_pc_histogram( + // Same runner as the recursion PC histograms, pointed at the deserialize-only + // control guest. Smaller workload (~16 M cycles, far fewer chunks), so use a + // tighter progress stride to still get periodic readouts. + run_pc_histogram( "DESERIALIZE-ONLY GUEST PC HISTOGRAM", - &symbols, - pc_hist, - total_cycles, - exec_time, + "deserialize-only", + 50, + MIN_PROOF_OPTIONS, ); } @@ -1093,15 +1045,10 @@ fn test_deserialize_only_pc_histogram() { #[test] #[ignore = "diagnostic: ~13 min; buckets the 40B cycles by verifier step"] fn test_recursion_step_breakdown() { - use executor::elf::{Elf, SymbolTable}; - use executor::vm::execution::Executor; - - let root = workspace_root(); - let empty_elf_bytes = read_guest_elf(&root, "empty"); - let recursion_elf_bytes = read_guest_elf(&root, "recursion"); + use executor::elf::SymbolTable; - let (_inner_proof, blob) = - prove_inner_and_encode_blob("step-bkd", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS); + let (recursion_elf_bytes, _program, mut executor) = + setup_guest_run("step-bkd", "recursion", &MIN_PROOF_OPTIONS); // Build a per-step "advance bucket to N" lookup. The verifier's step // functions get inlined by LLVM in release mode, so we can't rely on @@ -1159,8 +1106,6 @@ fn test_recursion_step_breakdown() { let mut buckets = [0u64; 5]; eprintln!("[step-bkd] executing recursion guest (streaming) ..."); - let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed"); - let mut executor = Executor::new(&program, blob).expect("Executor::new failed"); // Cache the last symbol-table hit so we only do a binary search on // function transitions, not on every cycle. 
Functions are typically From 66f24433a626541bc4448ae255d0e4a8ce4910f3 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 15:49:08 -0300 Subject: [PATCH 04/16] cargo fmt --- prover/src/tests/recursion_smoke_test.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index da885a15e..90baab4a6 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -206,7 +206,8 @@ fn setup_guest_run( let (_inner_proof, blob) = prove_inner_and_encode_blob(label, &empty_elf_bytes, &[], opts); let program = executor::elf::Elf::load(&guest_elf_bytes).expect("ELF load failed"); - let executor = executor::vm::execution::Executor::new(&program, blob).expect("Executor::new failed"); + let executor = + executor::vm::execution::Executor::new(&program, blob).expect("Executor::new failed"); (guest_elf_bytes, program, executor) } @@ -214,10 +215,7 @@ fn setup_guest_run( /// `[label] ... N chunks, M cycles, T elapsed` line every `stride` chunks — /// the readout every counting diagnostic shares. Tests that need extra live /// state (unique PC count, active step bucket) keep their own closure instead. -fn log_progress( - label: &'static str, - stride: usize, -) -> impl FnMut(usize, u64, std::time::Duration) { +fn log_progress(label: &'static str, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) { move |chunks, cycles, elapsed| { if chunks.is_multiple_of(stride) { eprintln!("[{label}] ... 
{chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"); From acd6783bf78733a6f084d8c2b19fde925164d5e4 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 17:12:59 -0300 Subject: [PATCH 05/16] refactor(prover): unify recursion execute-only diagnostics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Collapse the cycle-count, PC-histogram and step-breakdown diagnostics into one parameterized run_profile(guest, stride, opts, detailed): total cycles print unconditionally, the top-25 functions + per-step breakdown gate on detailed (they share one streamed pass over the same PC stream). Every variant now comes in 1query and multiquery flavours for both recursion and the deserialize-only control. Route execute_outer_and_commit through drive_executor too — the rebased streaming finish() makes its hand-rolled drain loop redundant. --- prover/src/tests/recursion_smoke_test.rs | 623 ++++++++--------------- 1 file changed, 224 insertions(+), 399 deletions(-) diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index 90baab4a6..5077610e7 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -103,12 +103,8 @@ fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8] let program = Elf::load(recursion_elf_bytes).expect("load recursion elf"); let mut executor = Executor::new(&program, blob.to_vec()).expect("executor new"); - // Drain chunks to completion without retaining logs or building a trace. 
- while executor - .resume() - .expect("recursion guest execution failed (verify panicked in-VM?)") - .is_some() - {} + let (total_cycles, exec_time) = + drive_executor(&mut executor, |_log| ControlFlow::Continue(()), |_, _, _| {}); let committed = executor .finish() @@ -116,7 +112,7 @@ fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8] .memory_values; eprintln!( - "[{label}] committed {} bytes: {:?} (as str: {:?})", + "[{label}] {total_cycles} cycles in {exec_time:?}; committed {} bytes: {:?} (as str: {:?})", committed.len(), committed, String::from_utf8_lossy(&committed), @@ -166,7 +162,7 @@ fn drive_executor( let start = std::time::Instant::now(); let mut total_cycles: u64 = 0; let mut chunks: usize = 0; - while let Some(logs) = executor.resume().expect("executor resume failed") { + while let Some(logs) = executor.resume().expect("executor resume failed (guest panicked in-VM?)") { let mut stop = false; for log in logs { total_cycles += 1; @@ -206,6 +202,10 @@ fn setup_guest_run( let (_inner_proof, blob) = prove_inner_and_encode_blob(label, &empty_elf_bytes, &[], opts); let program = executor::elf::Elf::load(&guest_elf_bytes).expect("ELF load failed"); + assert_ne!( + program.entry_point, 0, + "{guest_name} ELF has entry_point=0 — build artifact is malformed" + ); let executor = executor::vm::execution::Executor::new(&program, blob).expect("Executor::new failed"); (guest_elf_bytes, program, executor) @@ -213,9 +213,12 @@ fn setup_guest_run( /// A `drive_executor` progress callback that prints the throttled /// `[label] ... N chunks, M cycles, T elapsed` line every `stride` chunks — -/// the readout every counting diagnostic shares. Tests that need extra live -/// state (unique PC count, active step bucket) keep their own closure instead. -fn log_progress(label: &'static str, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) { +/// the readout the counting diagnostics share. 
Tests that need extra live state +/// (unique PC count, active step bucket) keep their own closure instead. Takes +/// `impl Into` so it works with both `&'static` tags and a run's +/// dynamic `label`. +fn log_progress(label: impl Into, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) { + let label = label.into(); move |chunks, cycles, elapsed| { if chunks.is_multiple_of(stride) { eprintln!("[{label}] ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"); @@ -235,30 +238,35 @@ fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String { ) } -/// Print a per-function PC-histogram summary: the cycles each resolved function -/// accounts for, folded over all its PCs. `pc_hist` maps program counter → -/// cycle count. -/// -/// We fold by function deliberately: an inlined kernel is spread across dozens -/// of PCs, so a raw per-address table scatters its true cost — and without -/// file:line resolution a bare PC isn't actionable for optimization anyway, so -/// there is no per-address detail table. -fn print_pc_histogram( - title: &str, +/// Verifier sub-routines in execution order. LLVM inlines the step bodies, but +/// closures inside each keep the method name in their mangled symbol, so +/// `run_profile` advances the step bucket by substring-matching the enclosing +/// symbol. A step with no matching symbol merges into the previous bucket. +const VERIFIER_STEP_KEYWORDS: [&str; 4] = [ + "replay_rounds_after_round_1", + "step_2_verify_claimed_composition_polynomial", + "step_3_verify_fri", + "step_4_verify_trace_and_composition_openings", +]; + +/// `blowup=8` inner-proof options: the security-derived multi-query count (tens +/// of queries, 128-bit) used by every `multiquery` profiling variant. +fn blowup8() -> stark::proof::options::ProofOptions { + crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid") +} + +/// Fold the PC histogram by enclosing function and print the top-25 by cycles. 
+/// Folded because an inlined kernel spreads across many PCs; no per-address +/// table since a bare PC isn't actionable without file:line. +fn print_function_table( symbols: &executor::elf::SymbolTable, pc_hist: std::collections::HashMap, total_cycles: u64, - exec_time: std::time::Duration, ) { - let entries: Vec<(u64, u64)> = pc_hist.into_iter().collect(); - - // Aggregate the full histogram by resolved function, resolving each PC once. let mut by_function: std::collections::HashMap = std::collections::HashMap::new(); - for (pc, count) in &entries { - let entry = by_function - .entry(resolve_pc(symbols, *pc)) - .or_insert((0, 0)); + for (pc, count) in &pc_hist { + let entry = by_function.entry(resolve_pc(symbols, *pc)).or_insert((0, 0)); entry.0 += *count; // cycles entry.1 += 1; // distinct PCs folded into this function } @@ -266,20 +274,10 @@ fn print_pc_histogram( fn_entries.sort_unstable_by_key(|(_name, (cycles, _pcs))| std::cmp::Reverse(*cycles)); let pct = |n: u64| 100.0 * (n as f64) / (total_cycles as f64); - - eprintln!(); - eprintln!("============================================================"); - eprintln!(" {title}"); - eprintln!("============================================================"); - eprintln!(" Total cycles : {total_cycles}"); - eprintln!(" Unique PCs : {}", entries.len()); - eprintln!(" Exec time : {exec_time:?}"); + eprintln!(" Unique PCs : {}", pc_hist.len()); eprintln!(); eprintln!(" Top 25 functions by cycle count (aggregated over their PCs):"); - eprintln!( - " {:>4} {:>14} {:>7} {:>7} {:>5} {}", - "rank", "cycles", "%", "cum %", "PCs", "function" - ); + eprintln!(" rank cycles % cum % PCs function"); let mut fn_cumulative: u64 = 0; for (rank, (name, (cycles, pcs))) in fn_entries.iter().take(25).enumerate() { fn_cumulative += cycles; @@ -293,6 +291,149 @@ fn print_pc_histogram( name, ); } +} + +/// Print the monotonic per-verifier-step cycle bucketing. 
`buckets[0]` is +/// pre-step-1 setup (alloc + postcard decode + `VmAirs::new`); `buckets[i]` is +/// verifier step i (with a missing step's cycles merged into the previous one). +fn print_step_breakdown(buckets: &[u64; 5], total_cycles: u64) { + let labels = [ + "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)", + "1. step 1: replay_rounds_after_round_1", + "2. step 2: verify_claimed_composition_polynomial", + "3. step 3: verify_fri", + "4. step 4: verify_trace_and_composition_openings (+ wrap-up)", + ]; + eprintln!(); + eprintln!(" Per-step cycle breakdown (monotonic state machine):"); + eprintln!(" {:<60} {:>14} {:>7}", "bucket", "cycles", "%"); + for (label, cycles) in labels.iter().zip(buckets.iter()) { + let pct = if total_cycles > 0 { + 100.0 * (*cycles as f64) / (total_cycles as f64) + } else { + 0.0 + }; + eprintln!(" {:<60} {:>14} {:>6.2}%", label, cycles, pct); + } +} + +/// Single-pass execute-only profiler. Always prints total cycles + wall time + +/// a rough trace/LDE size estimate. With `detailed`, the same pass also builds +/// the PC histogram and verifier-step bucketing and prints the top-25 functions +/// and the per-step breakdown (the two always come together); `!detailed` does +/// no per-log work, so it's just a fast cycle counter. `progress_stride` +/// throttles the readout (recursion large, the deserialize-only control small). 
+fn run_profile( + guest_name: &str, + progress_stride: usize, + opts: stark::proof::options::ProofOptions, + detailed: bool, +) { + use std::collections::HashMap; + + let (guest_elf_bytes, _program, mut executor) = setup_guest_run("profile", guest_name, &opts); + let symbols = executor::elf::SymbolTable::parse(&guest_elf_bytes); + + let mut pc_hist: HashMap = HashMap::new(); + let mut buckets = [0u64; 5]; + let mut last_range: Option<(u64, u64)> = None; + let mut last_advance: u8 = 0; + let bucket = std::cell::Cell::new(0u8); + let unique = std::cell::Cell::new(0usize); + + if detailed { + assert!( + !symbols.is_empty(), + "{guest_name} ELF has no symbol table — was it stripped?" + ); + for (i, kw) in VERIFIER_STEP_KEYWORDS.iter().enumerate() { + let n = symbols.functions().iter().filter(|f| f.name.contains(kw)).count(); + eprintln!( + "[profile] step {}: keyword={kw:?} -> {n} symbol(s) {}", + i + 1, + if n > 0 { "" } else { "(no match; merges into previous bucket)" }, + ); + } + } + + eprintln!( + "[profile] executing {guest_name} guest ({}) ...", + if detailed { "histogram + steps" } else { "cycle counter" } + ); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |log| { + if detailed { + let pc = log.current_pc; + *pc_hist.entry(pc).or_insert(0) += 1; + unique.set(pc_hist.len()); + + let in_cached = matches!(last_range, Some((s, e)) if pc >= s && pc < e); + if !in_cached { + if let Some(sym) = symbols.lookup(pc) { + last_range = Some((sym.address, sym.address + sym.size.max(1))); + last_advance = 0; + for (i, kw) in VERIFIER_STEP_KEYWORDS.iter().enumerate() { + if sym.name.contains(kw) { + last_advance = (i + 1) as u8; + } + } + } else { + last_range = None; + last_advance = 0; + } + } + if bucket.get() < last_advance { + bucket.set(last_advance); + } + buckets[bucket.get() as usize] += 1; + } + ControlFlow::Continue(()) + }, + |chunks, cycles, elapsed| { + if chunks.is_multiple_of(progress_stride) { + if detailed { + eprintln!( + "[profile] 
... {chunks} chunks, {cycles} cycles, {} unique PCs, bucket={}, {elapsed:?}", + unique.get(), + bucket.get(), + ); + } else { + eprintln!("[profile] ... {chunks} chunks, {cycles} cycles, {elapsed:?}"); + } + } + }, + ); + + eprintln!(); + eprintln!("============================================================"); + eprintln!( + " {} GUEST PROFILE (blowup={}, {} queries)", + guest_name.to_uppercase(), + opts.blowup_factor, + opts.fri_number_of_queries, + ); + eprintln!("============================================================"); + eprintln!(" Total cycles : {total_cycles}"); + eprintln!(" Exec time : {exec_time:?}"); + eprintln!(); + eprintln!(" Rough trace/LDE size if this guest were proven:"); + let approx_columns = 250u64; + let main_trace_bytes = total_cycles * approx_columns * 8; + eprintln!( + " main trace : ~{:.2} GB ({total_cycles} cycles × ~{approx_columns} cols × 8 B)", + main_trace_bytes as f64 / 1e9, + ); + eprintln!( + " main LDE (blowup=2) : ~{:.2} GB (+aux ≈ 50% more → peak ≈ 2-3× LDE)", + (main_trace_bytes * 2) as f64 / 1e9, + ); + + if detailed { + eprintln!(); + print_function_table(&symbols, pc_hist, total_cycles); + print_step_breakdown(&buckets, total_cycles); + } eprintln!("============================================================"); } @@ -510,59 +651,33 @@ fn test_dump_recursion_input() { eprintln!("[dump-input] wrote {} bytes to {path}", blob.len()); } -/// Diagnostic: build the inner proof + recursion guest input, then **execute -/// only** the recursion guest (no STARK proving) and report cycle counts + -/// trace size estimates. -/// -/// This is the cheap way to find out how many RISC-V instructions the -/// verifier actually executes inside the guest — a much faster signal than -/// running the full outer prove (which can OOM on a 125 GB machine). +/// Cycle count only of the recursion guest verifying a 1-query inner proof. 
#[test] -#[ignore = "diagnostic: runs the executor only, prints cycle counts"] -fn test_recursion_cycle_count() { - // Build the inner proof with the absolute-minimum FRI params (smallest - // possible inner) and stand up the recursion guest over it. - let (_bytes, _program, mut executor) = - setup_guest_run("cycle-count", "recursion", &MIN_PROOF_OPTIONS); - - // Execute (NOT prove) the recursion guest. `drive_executor` streams chunks - // and never accumulates logs in memory — this avoids the Vec blow-up - // that OOMs even a 125 GB server (one Log is 40 B; a few billion of them is - // hundreds of GB). - eprintln!("[cycle-count] executing recursion guest (streaming counter only) ..."); - let (total_cycles, exec_time) = drive_executor( - &mut executor, - |_log| ControlFlow::Continue(()), - log_progress("cycle-count", 50), - ); - let cycle_count = total_cycles as usize; +#[ignore = "diagnostic: fast; recursion guest cycle count (1 query)"] +fn test_recursion_cycles_1query() { + run_profile("recursion", 500, MIN_PROOF_OPTIONS, false); +} - eprintln!(); - eprintln!("============================================================"); - eprintln!(" RECURSION GUEST EXECUTION SUMMARY"); - eprintln!("============================================================"); - eprintln!(" Cycle count : {cycle_count}"); - eprintln!(" Executor wall time : {exec_time:?}"); - eprintln!(); - eprintln!(" Rough memory estimate for outer prove:"); - let bytes_per_field = 8usize; - let approx_columns = 250usize; // CPU + MEMW + DECODE + bus columns combined - let main_trace_bytes = cycle_count * approx_columns * bytes_per_field; - let blowup = 2usize; - let lde_main_bytes = main_trace_bytes * blowup; - eprintln!( - " main trace : ~{:.2} GB ({} cycles × ~{} cols × 8 B)", - main_trace_bytes as f64 / 1e9, - cycle_count, - approx_columns - ); - eprintln!( - " main LDE (blowup={}) : ~{:.2} GB", - blowup, - lde_main_bytes as f64 / 1e9 - ); - eprintln!(" (aux trace adds roughly 50% more, so peak peak ≈ 
2-3× LDE)"); - eprintln!("============================================================"); +/// Cycle count only at 128-bit security: more FRI queries → more verifier cycles. +#[test] +#[ignore = "diagnostic: fast; recursion guest cycle count (multi-query)"] +fn test_recursion_cycles_multiquery() { + run_profile("recursion", 500, blowup8(), false); +} + +/// Full profile (top-25 functions + per-step breakdown) of the 1-query run — +/// the cheapest verifier run, dominated by fixed setup. +#[test] +#[ignore = "diagnostic: ~8 min; recursion guest histogram + steps (1 query)"] +fn test_recursion_profile_1query() { + run_profile("recursion", 500, MIN_PROOF_OPTIONS, true); +} + +/// Full profile at 128-bit security: weight shifts toward per-query FRI/Merkle. +#[test] +#[ignore = "diagnostic: heavy; recursion guest histogram + steps (multi-query)"] +fn test_recursion_profile_multiquery() { + run_profile("recursion", 500, blowup8(), true); } /// Diagnostic: count the distinct 4 KB memory pages the recursion guest @@ -688,91 +803,6 @@ fn test_recursion_page_count() { eprintln!("============================================================"); } -/// Build a PC histogram of guest `guest_name` verifying an `empty`-program -/// inner proof produced with `inner_proof_options`, and print it via -/// [`print_pc_histogram`] under `title`. -/// -/// For the recursion guest, `blowup_factor` and `fri_number_of_queries` are -/// coupled (the query count is derived from blowup for a fixed security -/// target), so each recursion `#[test]` is just this runner with a different -/// `ProofOptions` — a single query at low blowup, vs. the security-derived -/// multi-query count at a higher blowup. The deserialize-only control guest -/// reuses the same runner with its own ELF name. -/// -/// Streams chunks of logs via `Executor::resume()` so memory stays bounded to -/// the histogram itself. 
Each PC is resolved to its enclosing function via the -/// in-house `executor::elf::SymbolTable` (reading the guest ELF's symbol table -/// directly — no external tool, no DWARF dependency). -fn run_pc_histogram( - title: &str, - guest_name: &str, - progress_stride: usize, - inner_proof_options: stark::proof::options::ProofOptions, -) { - use std::collections::HashMap; - - let (guest_elf_bytes, _program, mut executor) = - setup_guest_run("pc-hist", guest_name, &inner_proof_options); - - eprintln!("[pc-hist] executing {guest_name} guest (building PC histogram) ..."); - let mut pc_hist: HashMap = HashMap::with_capacity(300_000); - let unique = std::cell::Cell::new(0usize); - let (total_cycles, exec_time) = drive_executor( - &mut executor, - |log| { - *pc_hist.entry(log.current_pc).or_insert(0) += 1; - unique.set(pc_hist.len()); - ControlFlow::Continue(()) - }, - |chunks, cycles, elapsed| { - if chunks.is_multiple_of(progress_stride) { - eprintln!( - "[pc-hist] ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}", - unique.get() - ); - } - }, - ); - - // Resolve PCs to functions directly from the ELF's symbol table. - let symbols = executor::elf::SymbolTable::parse(&guest_elf_bytes); - print_pc_histogram(title, &symbols, pc_hist, total_cycles, exec_time); -} - -/// Diagnostic: PC histogram of the recursion guest with a **single** FRI query -/// at blowup=2 — the cheapest verifier run, dominated by fixed setup cost -/// (decode, allocator, postcard) rather than per-query FRI/Merkle work. -#[test] -#[ignore = "diagnostic: ~8 minutes; prints PC histogram of the verifier-in-VM"] -fn test_recursion_pc_histogram_1query() { - run_pc_histogram( - "RECURSION GUEST PC HISTOGRAM (blowup=2, 1 query)", - "recursion", - 500, - MIN_PROOF_OPTIONS, - ); -} - -/// Diagnostic: PC histogram of the recursion guest at **128-bit security** -/// (blowup=8, FRI query count derived by the Johnson Bound Regime — tens of -/// queries). 
Compared against the single-query runs, weight shifts toward the -/// verifier's per-query FRI-layer / Merkle-opening and field arithmetic. -#[test] -#[ignore = "diagnostic: heavy; PC histogram of the multi-query verifier-in-VM"] -fn test_recursion_pc_histogram_multiquery() { - let inner_proof_options = - crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid"); - run_pc_histogram( - &format!( - "RECURSION GUEST PC HISTOGRAM (blowup=8, {} queries, 128-bit)", - inner_proof_options.fri_number_of_queries - ), - "recursion", - 500, - inner_proof_options, - ); -} - /// Diagnostic: build a **sampled** call-stack histogram of the recursion guest. /// /// Like `test_recursion_pc_histogram` but groups by full call stack (not PC). @@ -947,236 +977,31 @@ fn test_host_verify_step_timings() { eprintln!("[host-verify] verified OK"); } -/// Diagnostic: cycle count for the **deserialize-only** counterpart of the -/// recursion guest. Same input layout -/// (`(VmProof, Vec, ProofOptions)`) and same proof, but -/// the guest just postcard-decodes the blob and halts — it never calls -/// `verify_with_options`. -/// -/// The cycle delta between this and `test_recursion_cycle_count` is the -/// actual cost of the STARK verifier inside the VM. Historically (40.5 B-cycle -/// recursion guest) postcard decode was ~15.6 M cycles — negligible. Now that -/// the recursion guest is ~67 M cycles, the same absolute cost would be ~23% -/// of total; this test re-measures it. 
-#[test] -#[ignore = "diagnostic: runs the deserialize-only guest, prints cycle count"] -fn test_deserialize_only_cycle_count() { - let (deser_elf_bytes, program, mut executor) = - setup_guest_run("deser-only", "deserialize-only", &MIN_PROOF_OPTIONS); - - eprintln!( - "[deser-only] ELF: {} bytes, entry_point=0x{:x}", - deser_elf_bytes.len(), - program.entry_point, - ); - assert_ne!( - program.entry_point, 0, - "deserialize-only ELF has entry_point=0 — build artifact is malformed" - ); +// Control guest: decodes the blob and halts. Its cycle count subtracted from +// the matching recursion run isolates the in-VM verifier cost. - eprintln!("[deser-only] executing deserialize-only guest (streaming) ..."); - let (total_cycles, exec_time) = drive_executor( - &mut executor, - |_log| ControlFlow::Continue(()), - log_progress("deser-only", 50), - ); - let cycle_count = total_cycles; - - eprintln!(); - eprintln!("============================================================"); - eprintln!(" DESERIALIZE-ONLY GUEST EXECUTION SUMMARY"); - eprintln!("============================================================"); - eprintln!(" Cycle count : {cycle_count}"); - eprintln!(" Executor wall time : {exec_time:?}"); - eprintln!(); - eprintln!(" Compare against test_recursion_cycle_count (~40.5B cycles"); - eprintln!(" with the same proof). Delta = verifier-in-VM cost."); - eprintln!("============================================================"); +#[test] +#[ignore = "diagnostic: fast; deserialize-only guest cycle count (1 query)"] +fn test_deserialize_only_cycles_1query() { + run_profile("deserialize-only", 50, MIN_PROOF_OPTIONS, false); } -/// Diagnostic: PC histogram for the **deserialize-only** guest. -/// -/// Sibling of `test_recursion_pc_histogram`, but targeting the -/// deserialize-only control guest so we can locate the hot kernel inside the -/// 15.7 M-cycle postcard decode itself. 
Every cycle goes through the -/// histogram (no sampling), so attribution is exact — the previous sampled -/// flamegraph at 1:1000 had broken stack reconstruction on skipped -/// CALL/RETURNs, which made it unreliable for a workload this small. -/// -/// Each top PC is resolved to its enclosing function via the in-house -/// `executor::elf::SymbolTable`, reading the guest ELF's symbol table directly -/// (no external tool, no DWARF dependency). #[test] -#[ignore = "diagnostic: ~1 min; PC histogram for the deserialize-only guest"] -fn test_deserialize_only_pc_histogram() { - // Same runner as the recursion PC histograms, pointed at the deserialize-only - // control guest. Smaller workload (~16 M cycles, far fewer chunks), so use a - // tighter progress stride to still get periodic readouts. - run_pc_histogram( - "DESERIALIZE-ONLY GUEST PC HISTOGRAM", - "deserialize-only", - 50, - MIN_PROOF_OPTIONS, - ); +#[ignore = "diagnostic: fast; deserialize-only guest cycle count (multi-query)"] +fn test_deserialize_only_cycles_multiquery() { + run_profile("deserialize-only", 50, blowup8(), false); } -/// Diagnostic: bucket the recursion guest's cycles by which verifier step -/// is currently executing. -/// -/// The verifier's hot path is `verify_rounds_2_to_4`, which calls four -/// sub-routines in a fixed order: -/// 1. `replay_rounds_after_round_1` (recover challenges) -/// 2. `step_2_verify_claimed_composition_polynomial` -/// 3. `step_3_verify_fri` -/// 4. `step_4_verify_trace_and_composition_openings` -/// -/// We resolve each sub-routine's entry PC from the recursion ELF's symbol -/// table, then run a monotonic state machine over the execution stream: -/// the active bucket only advances 0 → 1 → 2 → 3 → 4 (never backwards), -/// so cycles inside a step's callees stay attributed to that step. 
-/// -/// Bucket 0 ("setup") captures everything before step 1 is entered — the -/// allocator init, postcard decode, and `VmAirs::new` (which contains the -/// expensive preprocessed-commitment FFTs). -/// -/// Streams chunks via `Executor::resume()` so memory stays bounded. #[test] -#[ignore = "diagnostic: ~13 min; buckets the 40B cycles by verifier step"] -fn test_recursion_step_breakdown() { - use executor::elf::SymbolTable; - - let (recursion_elf_bytes, _program, mut executor) = - setup_guest_run("step-bkd", "recursion", &MIN_PROOF_OPTIONS); - - // Build a per-step "advance bucket to N" lookup. The verifier's step - // functions get inlined by LLVM in release mode, so we can't rely on - // matching their entry PCs directly. Instead we anchor on closures the - // compiler emits *inside* each step's body — iterator combinators like - // `.fold(|...|)` keep the step's method name as a substring in their - // mangled symbol. Any PC that resolves to a symbol containing step N's - // keyword advances the bucket to N (monotonically). - // - // If step N has no matching symbol at all (e.g. step 4 is fully inlined - // with no closure children of its own), its cycles get attributed to the - // previous bucket. We report that explicitly in the summary. - let symbols = SymbolTable::parse(&recursion_elf_bytes); - assert!( - !symbols.is_empty(), - "recursion ELF has no symbol table — was it stripped?" 
- ); - - let step_keywords = [ - "replay_rounds_after_round_1", - "step_2_verify_claimed_composition_polynomial", - "step_3_verify_fri", - "step_4_verify_trace_and_composition_openings", - ]; - let step_found: [bool; 4] = std::array::from_fn(|i| { - symbols - .functions() - .iter() - .any(|f| f.name.contains(step_keywords[i])) - }); - for (i, found) in step_found.iter().enumerate() { - let n_matches = symbols - .functions() - .iter() - .filter(|f| f.name.contains(step_keywords[i])) - .count(); - eprintln!( - "[step-bkd] step {}: keyword={:?} -> {} symbol(s) {}", - i + 1, - step_keywords[i], - n_matches, - if *found { - "" - } else { - "(fully inlined; will merge into the previous bucket)" - } - ); - } - - // Monotonic state machine: 0=setup, 1..=4=inside step N (or its callees / - // inlined-step-N-cycles attributed here because step N+1 is missing). - // `bucket` lives in a Cell so the per-log closure can advance it while the - // progress closure reads it for its live readout. - let bucket = std::cell::Cell::new(0u8); - let mut buckets = [0u64; 5]; - - eprintln!("[step-bkd] executing recursion guest (streaming) ..."); - - // Cache the last symbol-table hit so we only do a binary search on - // function transitions, not on every cycle. Functions are typically - // long-running (>>1 instruction), so this cache hits ~all of the time. - let mut last_range: Option<(u64, u64)> = None; - let mut last_advance: u8 = 0; - - let (total_cycles, exec_time) = drive_executor( - &mut executor, - |log| { - let pc = log.current_pc; - let in_cached = matches!(last_range, Some((s, e)) if pc >= s && pc < e); - if !in_cached { - // Slow path: refresh the cache from the symbol table. - if let Some(sym) = symbols.lookup(pc) { - // SymbolTable accepts size=0 symbols as "any address >="; for - // those we'd need the next symbol's start for a real upper - // bound. Cheapest workaround: set a tiny range so we re-resolve - // soon enough that wrong attribution is bounded. 
- let end = sym.address + sym.size.max(1); - last_range = Some((sym.address, end)); - last_advance = 0; - for (i, kw) in step_keywords.iter().enumerate() { - if sym.name.contains(kw) { - last_advance = (i + 1) as u8; - } - } - } else { - last_range = None; - last_advance = 0; - } - } - if bucket.get() < last_advance { - bucket.set(last_advance); - } - buckets[bucket.get() as usize] += 1; - ControlFlow::Continue(()) - }, - |chunks, cycles, elapsed| { - if chunks.is_multiple_of(500) { - eprintln!( - "[step-bkd] ... {chunks} chunks, {cycles} cycles, bucket={}, {elapsed:?}", - bucket.get() - ); - } - }, - ); - - let labels = [ - "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)", - "1. step 1: replay_rounds_after_round_1", - "2. step 2: verify_claimed_composition_polynomial", - "3. step 3: verify_fri", - "4. step 4: verify_trace_and_composition_openings (+ wrap-up)", - ]; +#[ignore = "diagnostic: ~1 min; deserialize-only guest histogram (1 query)"] +fn test_deserialize_only_profile_1query() { + run_profile("deserialize-only", 50, MIN_PROOF_OPTIONS, true); +} - eprintln!(); - eprintln!("============================================================"); - eprintln!(" RECURSION GUEST PER-STEP CYCLE BREAKDOWN"); - eprintln!("============================================================"); - eprintln!(" Total cycles : {total_cycles}"); - eprintln!(" Exec time : {exec_time:?}"); - eprintln!(); - eprintln!(" {:<60} {:>14} {:>7}", "bucket", "cycles", "%"); - for (label, cycles) in labels.iter().zip(buckets.iter()) { - let pct = if total_cycles > 0 { - 100.0 * (*cycles as f64) / (total_cycles as f64) - } else { - 0.0 - }; - eprintln!(" {:<60} {:>14} {:>6.2}%", label, cycles, pct); - } - eprintln!("============================================================"); +#[test] +#[ignore = "diagnostic: deserialize-only guest histogram (multi-query)"] +fn test_deserialize_only_profile_multiquery() { + run_profile("deserialize-only", 50, blowup8(), true); } /// Inner program: 
fibonacci(10). From 1b143b393814c01dbb3040c32d6fe8e8891656c4 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 17:29:33 -0300 Subject: [PATCH 06/16] build: enable the deserialize-only recursion guest Add deserialize-only to RECURSION_GUESTS and migrate the guest to the recursion guest's std shape (lambda_vm_syscalls + build-std std), since the old no_std panic handler collided with std. Add getrandom_backend="custom" to its cargo config (transitive getrandom 0.3 needs it) and track its Cargo.lock. The deser control guest now builds and its profile tests run. --- Makefile | 2 +- .../deserialize-only/.cargo/config.toml | 1 + bench_vs/lambda/deserialize-only/Cargo.lock | 1199 +++++++++++++++++ bench_vs/lambda/deserialize-only/Cargo.toml | 6 +- bench_vs/lambda/deserialize-only/src/main.rs | 99 +- 5 files changed, 1222 insertions(+), 85 deletions(-) create mode 100644 bench_vs/lambda/deserialize-only/Cargo.lock diff --git a/Makefile b/Makefile index 30e3029da..60bb8a0c5 100644 --- a/Makefile +++ b/Makefile @@ -51,7 +51,7 @@ BENCH_ARTIFACTS := $(addprefix $(BENCH_ARTIFACTS_DIR)/, $(addsuffix .elf, $(BENC # rather than executor/programs/. The recursion guest is the in-VM STARK verifier. RECURSION_GUESTS_DIR=./bench_vs/lambda RECURSION_ARTIFACTS_DIR=./executor/program_artifacts/recursion -RECURSION_GUESTS := empty fibonacci recursion +RECURSION_GUESTS := empty fibonacci recursion deserialize-only RECURSION_ARTIFACTS := $(addprefix $(RECURSION_ARTIFACTS_DIR)/, $(addsuffix .elf, $(RECURSION_GUESTS))) # Override with: make ... 
SYSROOT_DIR=$HOME/.lambda-vm-sysroot diff --git a/bench_vs/lambda/deserialize-only/.cargo/config.toml b/bench_vs/lambda/deserialize-only/.cargo/config.toml index be730c3ec..f5ea686ff 100644 --- a/bench_vs/lambda/deserialize-only/.cargo/config.toml +++ b/bench_vs/lambda/deserialize-only/.cargo/config.toml @@ -2,5 +2,6 @@ rustflags = [ "-C", "link-arg=-e", "-C", "link-arg=main", + "--cfg", "getrandom_backend=\"custom\"", "-C", "passes=lower-atomic" ] diff --git a/bench_vs/lambda/deserialize-only/Cargo.lock b/bench_vs/lambda/deserialize-only/Cargo.lock new file mode 100644 index 000000000..9433fadb3 --- /dev/null +++ b/bench_vs/lambda/deserialize-only/Cargo.lock @@ -0,0 +1,1199 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "atomic-polyfill" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4" +dependencies = [ + "critical-section", +] + +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cobs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1" +dependencies = [ + "thiserror 2.0.18", +] + +[[package]] +name = "const-default" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b396d1f76d455557e1218ec8066ae14bba60b4b36ecd55577ba979f5db7ecaa" + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "critical-section" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + 
"crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crypto" +version = "0.1.0" +dependencies = [ + "digest", + "math", + "rand 0.8.6", + "rand_chacha 0.3.1", + "serde", + "sha3", +] + +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "zeroize", +] + +[[package]] +name = "deserialize-only-bench" +version = "0.1.0" +dependencies = [ + "lambda-vm-prover", + "lambda-vm-syscalls", + "postcard", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "ecsm" +version = "0.1.0" +dependencies = [ + "k256", + "num-bigint", + "num-traits", +] + +[[package]] +name = "either" +version = 
"1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct", + "crypto-bigint", + "ff", + "generic-array", + "group", + "rand_core 0.6.4", + "sec1", + "subtle", + "zeroize", +] + +[[package]] +name = "embedded-alloc" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f2de9133f68db0d4627ad69db767726c99ff8585272716708227008d3f1bddd" +dependencies = [ + "const-default", + "critical-section", + "linked_list_allocator", + "rlsf", +] + +[[package]] +name = "embedded-hal" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "361a90feb7004eca4019fb28352a9465666b24f840f5c3cddf0ff13920590b89" + +[[package]] +name = "embedded-io" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced" + +[[package]] +name = "embedded-io" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" + +[[package]] +name = "executor" +version = "0.1.0" +dependencies = [ + "ecsm", + "rustc-demangle", + "thiserror 1.0.69", +] + +[[package]] +name = "ff" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + 
+[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-core", + "futures-task", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" +dependencies = [ + "typenum", + "version_check", + "zeroize", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasip2", +] + +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "half" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b43ede17f21864e81be2fa654110bf1e793774238d86ef8555c37e6519c0403" + +[[package]] +name = "hash32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + 
+[[package]] +name = "heapless" +version = "0.7.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version", + "serde", + "spin", + "stable_deref_trait", +] + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "js-sys" +version = "0.3.103" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53b44bfcdb3f8d5837a46dae1ca9660a837176eee74a28b229bc626816589102" +dependencies = [ + "cfg-if", + "futures-util", + "wasm-bindgen", +] + +[[package]] +name = "k256" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b" +dependencies = [ + "cfg-if", + "elliptic-curve", +] + +[[package]] +name = "keccak" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" +dependencies = [ + "cpufeatures", +] + +[[package]] +name = "lambda-vm-prover" +version = "0.1.0" +dependencies = [ + "crypto", + "ecsm", + "executor", + "log", + "math", + "serde", + "sha3", + "stark", + "sysinfo", +] + +[[package]] +name = "lambda-vm-syscalls" +version = "0.1.0" +dependencies = [ + "embedded-alloc", + "getrandom 0.2.17", + "getrandom 0.3.4", + "lazy_static", + "rand 0.9.4", + "riscv", + "thiserror 1.0.69", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "linked_list_allocator" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b23ac50abb8261cb38c6e2a7192d3302e0836dac1628f6a93b82b4fad185897" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ceec5bc11778974d1bcb055b18002eba7f4b3518b6a0081b3af5f21666da9ad" + +[[package]] +name = "math" +version = "0.1.0" +dependencies = [ + "getrandom 0.2.17", + "num-bigint", + "num-traits", + "rand 0.8.6", + "rayon", + "serde", + "serde_json", +] + +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "postcard" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24" +dependencies = [ + "cobs", + "embedded-io 0.4.0", + "embedded-io 0.6.1", + "heapless", + "serde", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = 
"5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "rand" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" +dependencies = [ + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "riscv" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05cfa3f7b30c84536a9025150d44d26b8e1cc20ddf436448d74cd9591eefb25" +dependencies = [ + "critical-section", + "embedded-hal", + "paste", + "riscv-macros", + "riscv-pac", +] + +[[package]] +name = "riscv-macros" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d323d13972c1b104aa036bc692cd08b822c8bbf23d79a27c526095856499799" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", +] + +[[package]] +name = "riscv-pac" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8188909339ccc0c68cfb5a04648313f09621e8b87dc03095454f1a11f6c5d436" + +[[package]] +name = "rlsf" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1646a59a9734b8b7a0ac51689388a60fe1625d4b956348e9de07591a1478457a" +dependencies = [ + "cfg-if", + "const-default", + "libc", + "rustversion", + "svgbobdoc", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct", + "der", + "generic-array", + "subtle", + "zeroize", +] + +[[package]] +name = "semver" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_cbor" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5" +dependencies = [ + "half", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", +] + +[[package]] +name = "serde_json" +version = "1.0.143" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "sha3" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77fd7028345d415a4034cf8777cd4f8ab1851274233b45f84e3d955502d93874" +dependencies = [ + "digest", + "keccak", +] 
+ +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "stark" +version = "0.1.0" +dependencies = [ + "crypto", + "itertools", + "log", + "math", + "serde", + "serde_cbor", + "sha3", + "thiserror 1.0.69", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "svgbobdoc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2c04b93fc15d79b39c63218f15e3fdffaa4c227830686e3b7c5f41244eb3e50" +dependencies = [ + "base64", + "proc-macro2", + "quote", + "syn 1.0.109", + "unicode-width", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sysinfo" +version = "0.31.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"355dbe4f8799b304b05e1b0f05fc59b2a18d36645cf169607da45bde2f69a1be" +dependencies = [ + "core-foundation-sys", + "libc", + "memchr", + "ntapi", + "windows", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", +] + +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.4+wasi-0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b067c0c11094aef6b7a801c1e34a26affafdf3d051dba08456b868789aaf9a4" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167ce5e579f6bcf889c4f7175a8a5a585de84e8ff93976ce393efa5f2837aab1" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3997c7839262f4ef12cf90b818d6340c18e80f263f1a94bf157d0ec4420380e" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn 2.0.118", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.126" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1b4cb0cc549fcf58d7dfc081778139b3d283a081644e833e84682ad71cea24" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name 
= "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" +dependencies = [ + "windows-core", + "windows-targets", +] + +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result", + "windows-targets", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", +] + +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ 
+ "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "zerocopy" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.118", +] + +[[package]] +name = "zeroize" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" diff --git a/bench_vs/lambda/deserialize-only/Cargo.toml b/bench_vs/lambda/deserialize-only/Cargo.toml index b4a4616f4..fac6a7628 100644 --- a/bench_vs/lambda/deserialize-only/Cargo.toml +++ b/bench_vs/lambda/deserialize-only/Cargo.toml @@ -7,7 +7,5 @@ edition = "2024" [dependencies] lambda-vm-prover = { path = "../../../prover", default-features = false } -embedded-alloc = "0.6" -riscv = { version = "0.15", features = ["critical-section-single-hart"] } -serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] } -postcard = { version = "1.0", default-features = false, features = ["alloc"] } +lambda-vm-syscalls = { path = "../../../syscalls" } +postcard = { version = "1.0", features = ["alloc"] } diff --git a/bench_vs/lambda/deserialize-only/src/main.rs b/bench_vs/lambda/deserialize-only/src/main.rs index 8627776a1..7ba9a9d93 100644 --- a/bench_vs/lambda/deserialize-only/src/main.rs +++ b/bench_vs/lambda/deserialize-only/src/main.rs @@ -1,93 +1,32 @@ //! Deserialize-only counterpart to the recursion guest. //! //! Reads the same private-input blob as `recursion-bench`, postcard-decodes -//! 
`(VmProof, Vec, ProofOptions)`, then commits success -//! and halts — without ever calling `verify_with_options`. The cycle delta -//! between this guest and `recursion-bench` is the actual cost of the STARK -//! verifier inside the VM (everything else being equal). +//! `(VmProof, Vec, ProofOptions)`, then commits and halts — without ever +//! calling `verify_with_options`. The cycle delta between this guest and +//! `recursion-bench` is the actual cost of the STARK verifier inside the VM. +//! +//! Mirrors the recursion guest's std setup (build-std + `lambda_vm_syscalls`) +//! so the two differ only in the verify call. -#![no_std] #![no_main] -extern crate alloc; - -use alloc::vec::Vec; -use core::arch::asm; -use core::panic::PanicInfo; - -use embedded_alloc::TlsfHeap as Heap; use lambda_vm_prover::{ProofOptions, VmProof}; -// Required to pull in the riscv crate's critical-section implementation. -use riscv as _; - -const PRIVATE_INPUT_START: usize = 0xFF000000; -const SYSCALL_COMMIT: u64 = 64; -const SYSCALL_HALT: u64 = 93; -const MAX_MEMORY_SIZE: usize = 0xC000_0000; - -#[global_allocator] -static HEAP: Heap = Heap::empty(); - -#[panic_handler] -fn panic(_info: &PanicInfo) -> ! { - loop {} -} - -fn init_allocator() { - unsafe extern "C" { - static _end: u8; - } - let heap_pos = (&raw const _end) as usize; - unsafe { HEAP.init(heap_pos, MAX_MEMORY_SIZE - heap_pos) } -} -fn read_private_input() -> &'static [u8] { - let len = unsafe { core::ptr::read_volatile(PRIVATE_INPUT_START as *const u32) } as usize; - let data = (PRIVATE_INPUT_START + 4) as *const u8; - unsafe { core::slice::from_raw_parts(data, len) } -} - -fn commit(bytes: &[u8]) { - unsafe { - asm!( - "ecall", - in("a0") 1u64, - in("a1") bytes.as_ptr(), - in("a2") bytes.len(), - in("a7") SYSCALL_COMMIT, - ); - } -} - -fn halt() -> ! 
{ - unsafe { - asm!( - "ecall", - in("a0") 0u64, - in("a7") SYSCALL_HALT, - options(noreturn), - ); - } -} - -#[unsafe(no_mangle)] +#[unsafe(export_name = "main")] pub fn main() -> ! { - init_allocator(); + lambda_vm_syscalls::allocator::init_allocator(); - let blob = read_private_input(); - let decoded: (VmProof, Vec, ProofOptions) = - postcard::from_bytes(blob).expect("failed to deserialize"); + const PANIC_MSG: &str = "PANICKED"; + std::panic::set_hook(Box::new(|_| unsafe { + lambda_vm_syscalls::syscalls::sys_panic(PANIC_MSG.as_ptr(), PANIC_MSG.len()) + })); - // Force the commit byte to depend on the actually-decoded value. Without - // this, LLVM at -O3 was eliding the postcard decode entirely — the only - // sinks for `decoded` were `black_box(&decoded)` (which only forces the - // *reference* to materialize, not the pointee) and `Drop`, neither of - // which require the decoded bytes to be real. With the commit byte tied - // to a deep field of the decoded value, the decode has to run. - let proof_options_byte = decoded.2.blowup_factor; - let inner_elf_byte = *decoded.1.first().unwrap_or(&0); - let marker = proof_options_byte ^ inner_elf_byte; + let blob = lambda_vm_syscalls::syscalls::get_private_input(); + let decoded: (VmProof, Vec, ProofOptions) = + postcard::from_bytes(&blob).expect("failed to deserialize recursion input"); - commit(&[marker]); - halt() + // Tie the committed byte to the decoded value so LLVM can't elide the decode. 
+ let marker = decoded.2.blowup_factor ^ *decoded.1.first().unwrap_or(&0); + lambda_vm_syscalls::syscalls::commit(&[marker]); + lambda_vm_syscalls::syscalls::sys_halt(); } From 35f4741270855e8b1b9533edc14f6417978e7e2b Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 18:11:58 -0300 Subject: [PATCH 07/16] build: point profile-recursion make targets at renamed tests --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 60bb8a0c5..801845534 100644 --- a/Makefile +++ b/Makefile @@ -234,11 +234,11 @@ test-flamegraph: test-profile-recursion: test-profile-recursion-single test-profile-recursion-multi -test-profile-recursion-single: compile-programs-rust - cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_1query -- --ignored --nocapture +test-profile-recursion-single: compile-recursion-elfs + cargo test --package lambda-vm-prover --lib test_recursion_profile_1query -- --ignored --nocapture -test-profile-recursion-multi: compile-programs-rust - cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_multiquery -- --ignored --nocapture +test-profile-recursion-multi: compile-recursion-elfs + cargo test --package lambda-vm-prover --lib test_recursion_profile_multiquery -- --ignored --nocapture # Regenerate the committed ethrex block fixtures (see tooling/ethrex-fixtures). # Run after bumping the ethrex rev; README checksums are refreshed automatically. 
From 89d46dcd2deddf15cc0a5dde6150aae249685c4f Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 18:20:16 -0300 Subject: [PATCH 08/16] docs: trim recursion smoke-test doc comments --- prover/src/tests/recursion_smoke_test.rs | 230 ++++++----------------- 1 file changed, 53 insertions(+), 177 deletions(-) diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index 5077610e7..3b3814adc 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -1,16 +1,7 @@ -//! End-to-end naive recursion pipeline smoke tests. -//! -//! Each test: -//! 1. Proves an inner program on the host. -//! 2. Serializes `(VmProof, inner_elf)` with postcard. -//! 3. Hands that as private input to the recursion guest. -//! 4. Either **proves** the recursion guest's execution (memory-bounded via -//! continuations) and verifies the outer proof (`OuterMode::Prove`), or -//! merely **executes** the guest in-VM and reads the committed marker -//! straight off the executor's memory (`OuterMode::ExecuteOnly`) — a cheaper -//! tier that skips the LDE/FRI that dominate the full pipeline. -//! -//! The guest ELFs are assumed built by `make compile-recursion-elfs`. +//! End-to-end naive recursion pipeline smoke tests: prove an inner program, +//! hand `(VmProof, elf, opts)` to the in-VM verifier guest, then either prove +//! the guest's execution (`OuterMode::Prove`) or just execute it +//! (`OuterMode::ExecuteOnly`). Guest ELFs come from `make compile-recursion-elfs`. use std::ops::ControlFlow; use std::path::PathBuf; @@ -33,11 +24,8 @@ fn read_guest_elf(root: &std::path::Path, name: &str) -> Vec { }) } -/// Minimum-security FRI parameters: blowup=2, a single FRI query. Security is -/// intentionally terrible — used by the capacity-probing test and every cheap -/// diagnostic below, where the goal is the smallest possible inner proof, not -/// a sound one. 
(`GoldilocksCubicProofOptions::with_blowup` derives a query -/// count from a 128-bit target, far more than we want here.) +/// Smallest possible inner proof (blowup=2, 1 query). Intentionally insecure — +/// for the cheap diagnostics, not soundness. const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions = stark::proof::options::ProofOptions { blowup_factor: 2, @@ -46,11 +34,8 @@ const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions = grinding_factor: 1, }; -/// Prove `inner_elf` (fed `inner_input`) under `opts`, then package -/// `(proof, elf, opts)` into the postcard blob the recursion and -/// deserialize-only guests consume as their private input. `tag` prefixes the -/// progress lines. Returns the inner proof — callers that re-verify it on the -/// host need it — next to the encoded blob. +/// Prove `inner_elf` under `opts` and postcard-encode `(proof, elf, opts)` into +/// the guest's private-input blob. Returns the proof and the blob. fn prove_inner_and_encode_blob( tag: &str, inner_elf: &[u8], @@ -75,26 +60,17 @@ fn prove_inner_and_encode_blob( (inner_proof, blob) } -/// How far to take the recursion guest after it has been handed the inner -/// proof. The guest under test is the verifier either way — this only chooses -/// whether we also prove the guest's own execution. +/// Whether to also prove the guest's own execution after handing it the proof. #[derive(Clone, Copy, Debug)] enum OuterMode { - /// Execute the guest in-VM and read the committed marker straight off the - /// executor's memory. Streams logs via `Executor::resume()` and never - /// builds a `Traces`, so footprint stays bounded to the VM's touched - /// memory + instruction cache. Skips the LDE/FRI of the full pipeline entirely. + /// Execute in-VM, read the committed marker off memory; no LDE/FRI. ExecuteOnly, - /// Prove the guest's execution memory-bounded via continuations, then - /// verify the outer proof on the host. Peak RAM is a single epoch's proof. 
+ /// Prove the execution (memory-bounded via continuations) and verify on host. Prove, } -/// Execute the recursion guest in-VM on `blob` and return the bytes it -/// committed (the success marker the in-VM verifier emits). -/// -/// Streams execution via `Executor::resume()`. The committed marker is -/// read directly off the executor's memory. This avoids OOMs. +/// Execute the recursion guest in-VM on `blob` and return its committed bytes, +/// read straight off the executor's memory after a streamed run. fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]) -> Vec { use executor::elf::Elf; use executor::vm::execution::Executor; @@ -123,9 +99,8 @@ fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8] /// Epoch size for the outer prove: 2^20 ≈ 1M cycles per epoch. const OUTER_EPOCH_SIZE_LOG2: u32 = 20; -/// Prove the recursion guest's execution on `blob` memory-bounded via -/// continuations and verify the bundle on the host, returning the bytes the -/// guest committed. +/// Prove the guest's execution via continuations, verify on host, return the +/// committed bytes. fn prove_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]) -> Vec { let opts = crate::GoldilocksCubicProofOptions::with_blowup(2).expect("blowup=2 is always valid"); @@ -145,15 +120,9 @@ fn prove_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]) committed } -/// Stream a guest's execution via `Executor::resume()`, calling `on_log` for -/// every `Log` without ever buffering the full log stream (`Executor::run` -/// would accumulate tens of millions of `Log`s and OOM even a 125 GB box). -/// `on_log` returns `ControlFlow::Break(())` to stop the run early (e.g. once a -/// cycle budget is hit); `Continue(())` to keep going. `on_progress(chunks, -/// total_cycles, elapsed)` fires once per resumed chunk; callers throttle and -/// format their own progress lines. 
Returns `(total_cycles, wall_time)` — -/// `total_cycles` counts logs actually visited, so it is exact even when a run -/// breaks mid-chunk. +/// Stream a guest's execution via `Executor::resume()` without buffering the log +/// stream. `on_log` returns `Break` to stop early; `on_progress` fires per chunk. +/// Returns `(total_cycles, wall_time)`, exact even on an early break. fn drive_executor( executor: &mut executor::vm::execution::Executor, mut on_log: impl FnMut(&executor::vm::logs::Log) -> ControlFlow<()>, @@ -180,12 +149,8 @@ fn drive_executor( (total_cycles, start.elapsed()) } -/// Shared preamble for every execute-only diagnostic below: build the standard -/// recursion private-input blob (an `empty`-program inner proof produced under -/// `opts`), load guest `guest_name`, and stand up an executor over it. Returns -/// the guest's raw ELF bytes (callers that resolve PCs pass them to -/// [`executor::elf::SymbolTable::parse`]), the loaded program, and the -/// ready-to-drive executor. +/// Shared preamble: build the blob (an `empty` inner proof under `opts`), load +/// `guest_name`, and stand up an executor. Returns `(elf_bytes, program, executor)`. fn setup_guest_run( label: &str, guest_name: &str, @@ -211,12 +176,7 @@ fn setup_guest_run( (guest_elf_bytes, program, executor) } -/// A `drive_executor` progress callback that prints the throttled -/// `[label] ... N chunks, M cycles, T elapsed` line every `stride` chunks — -/// the readout the counting diagnostics share. Tests that need extra live state -/// (unique PC count, active step bucket) keep their own closure instead. Takes -/// `impl Into` so it works with both `&'static` tags and a run's -/// dynamic `label`. +/// A `drive_executor` progress callback printing one line every `stride` chunks. 
fn log_progress(label: impl Into, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) { let label = label.into(); move |chunks, cycles, elapsed| { @@ -226,11 +186,8 @@ fn log_progress(label: impl Into, stride: usize) -> impl FnMut(usize, u6 } } -/// Resolve a guest PC to its (demangled) enclosing function name using the -/// ELF's own symbol table — the same data `executor::flamegraph` resolves -/// against. `` when no function symbol covers the PC (e.g. PLT stubs -/// or a release build that dropped symbols). No file:line: the symbol table -/// carries function ranges only, not DWARF line info. +/// Demangled enclosing-function name for a PC via the ELF symbol table; +/// `` if none covers it. No file:line (symtab has no DWARF). fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String { symbols.lookup(pc).map_or_else( || "".to_string(), @@ -238,10 +195,8 @@ fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String { ) } -/// Verifier sub-routines in execution order. LLVM inlines the step bodies, but -/// closures inside each keep the method name in their mangled symbol, so -/// `run_profile` advances the step bucket by substring-matching the enclosing -/// symbol. A step with no matching symbol merges into the previous bucket. +/// Verifier sub-routines in execution order; `run_profile` buckets cycles by +/// substring-matching the enclosing symbol (a missing step merges into the prior). const VERIFIER_STEP_KEYWORDS: [&str; 4] = [ "replay_rounds_after_round_1", "step_2_verify_claimed_composition_polynomial", @@ -249,15 +204,12 @@ const VERIFIER_STEP_KEYWORDS: [&str; 4] = [ "step_4_verify_trace_and_composition_openings", ]; -/// `blowup=8` inner-proof options: the security-derived multi-query count (tens -/// of queries, 128-bit) used by every `multiquery` profiling variant. +/// `blowup=8` (128-bit, multi-query) options for the `multiquery` variants. 
fn blowup8() -> stark::proof::options::ProofOptions { crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid") } -/// Fold the PC histogram by enclosing function and print the top-25 by cycles. -/// Folded because an inlined kernel spreads across many PCs; no per-address -/// table since a bare PC isn't actionable without file:line. +/// Print the top-25 functions by cycles, folding the PC histogram by symbol. fn print_function_table( symbols: &executor::elf::SymbolTable, pc_hist: std::collections::HashMap, @@ -293,9 +245,7 @@ fn print_function_table( } } -/// Print the monotonic per-verifier-step cycle bucketing. `buckets[0]` is -/// pre-step-1 setup (alloc + postcard decode + `VmAirs::new`); `buckets[i]` is -/// verifier step i (with a missing step's cycles merged into the previous one). +/// Print the monotonic per-verifier-step cycle bucketing (`buckets[0]` = setup). fn print_step_breakdown(buckets: &[u64; 5], total_cycles: u64) { let labels = [ "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)", @@ -317,12 +267,9 @@ fn print_step_breakdown(buckets: &[u64; 5], total_cycles: u64) { } } -/// Single-pass execute-only profiler. Always prints total cycles + wall time + -/// a rough trace/LDE size estimate. With `detailed`, the same pass also builds -/// the PC histogram and verifier-step bucketing and prints the top-25 functions -/// and the per-step breakdown (the two always come together); `!detailed` does -/// no per-log work, so it's just a fast cycle counter. `progress_stride` -/// throttles the readout (recursion large, the deserialize-only control small). +/// Single-pass execute-only profiler. Always prints total cycles + a rough +/// trace/LDE estimate; with `detailed`, also the top-25 functions + per-step +/// breakdown (one streamed pass). `!detailed` does no per-log work. 
fn run_profile( guest_name: &str, progress_stride: usize, @@ -437,10 +384,8 @@ fn run_profile( eprintln!("============================================================"); } -/// Core pipeline: prove an inner program with the given options, hand the -/// proof+ELF+options to the recursion guest, then take the guest to `mode` -/// (execute-only or full prove) and assert it committed the `[1]` success -/// marker — i.e. the in-VM verifier accepted the inner proof. +/// Core pipeline: prove the inner program, run the guest to `mode`, assert it +/// committed `[1]` (the in-VM verifier accepted the proof). fn run_recursion_pipeline_with_options( label: &str, inner_elf_bytes: &[u8], @@ -487,8 +432,7 @@ fn run_recursion_pipeline_with_options( eprintln!("[{label}] guest committed [1]: in-VM verify accepted ✓"); } -/// Convenience wrapper using `blowup=8` for the inner proof — the default for -/// the `empty` and `fibonacci` cases, chosen to keep outer-prove memory tractable. +/// `run_recursion_pipeline_with_options` with `blowup=8` (the `empty`/`fibonacci` default). fn run_recursion_pipeline( label: &str, inner_elf_bytes: &[u8], @@ -506,9 +450,8 @@ fn run_recursion_pipeline( ); } -/// Reproduce the recursion guest's EXACT path on the host — decode the postcard -/// blob into `(VmProof, Vec, ProofOptions)` and call `verify_with_options`. -/// Cheap regression guard. +/// Decode the blob on the host and verify — a cheap guard on the encode/decode +/// contract without running the VM. #[test] #[ignore = "needs prebuilt guest ELF (make compile-recursion-elfs)"] fn test_recursion_blob_decodes_and_verifies_on_host() { @@ -541,8 +484,7 @@ fn test_recursion_blob_decodes_and_verifies_on_host() { // === Execute-only tier ======================================================== -/// Execute-only mirror of `test_recursion_prove_empty`: verify a `blowup=8` -/// proof of the empty program in-VM. +/// Execute-only: verify a `blowup=8` proof of the empty program in-VM. 
#[test] #[ignore = "slow: runs the in-VM STARK verifier (minutes on CI)"] fn test_recursion_execute_empty() { @@ -556,8 +498,7 @@ fn test_recursion_execute_empty() { ); } -/// Execute-only mirror of `test_recursion_prove_1query`: smallest possible -/// inner proof (blowup=2, 1 query) → least guest work. +/// Execute-only: smallest inner proof (blowup=2, 1 query) → least guest work. #[test] #[ignore = "slow: runs the in-VM STARK verifier (minutes on CI)"] fn test_recursion_execute_1query() { @@ -572,8 +513,7 @@ fn test_recursion_execute_1query() { ); } -/// Execute-only mirror of `test_recursion_prove`: verify a `blowup=8` proof of -/// fibonacci(10) in-VM. +/// Execute-only: verify a `blowup=8` proof of fibonacci(10) in-VM. #[test] #[ignore = "slow: runs the in-VM STARK verifier (minutes on CI)"] fn test_recursion_execute() { @@ -593,8 +533,7 @@ fn test_recursion_execute() { // === Full-prove tier ========================================================== -/// Inner program: empty (halt immediately). Useful for measuring the -/// verifier's intrinsic recursion overhead. +/// Inner program: empty — the verifier's intrinsic recursion overhead. #[test] #[ignore = "slow: memory-bounded continuation prove of the verifier-in-VM"] fn test_recursion_prove_empty() { @@ -608,8 +547,7 @@ fn test_recursion_prove_empty() { ); } -/// Inner program: empty, but with the absolute-minimum FRI parameters -/// (blowup=2, **fri_number_of_queries=1**). For quick profiling only. +/// Inner program: empty, blowup=2/1-query. Quick profiling only. #[test] #[ignore = "slow: memory-bounded continuation prove of the verifier-in-VM"] fn test_recursion_prove_1query() { @@ -625,18 +563,8 @@ fn test_recursion_prove_1query() { ); } -/// Diagnostic: build the inner proof and dump the recursion guest's private-input -/// blob to `/tmp/recursion_input.bin` so the CLI's `execute --flamegraph` can -/// consume it. 
-/// -/// Usage after running this test: -/// ``` -/// cargo run -p cli --release -- execute \ -/// bench_vs/lambda/recursion/target/riscv64im-lambda-vm-elf/release/recursion-bench \ -/// --private-input /tmp/recursion_input.bin \ -/// --flamegraph /tmp/recursion_folded.txt -/// cat /tmp/recursion_folded.txt | inferno-flamegraph > /tmp/recursion_flamegraph.svg -/// ``` +/// Dump the guest's private-input blob to `/tmp/recursion_input.bin` for the +/// CLI's `execute --flamegraph`. #[test] #[ignore = "diagnostic: writes recursion private input to /tmp/recursion_input.bin"] fn test_dump_recursion_input() { @@ -665,8 +593,7 @@ fn test_recursion_cycles_multiquery() { run_profile("recursion", 500, blowup8(), false); } -/// Full profile (top-25 functions + per-step breakdown) of the 1-query run — -/// the cheapest verifier run, dominated by fixed setup. +/// Full profile (top-25 + per-step) of the 1-query run. #[test] #[ignore = "diagnostic: ~8 min; recursion guest histogram + steps (1 query)"] fn test_recursion_profile_1query() { @@ -680,27 +607,8 @@ fn test_recursion_profile_multiquery() { run_profile("recursion", 500, blowup8(), true); } -/// Diagnostic: count the distinct 4 KB memory pages the recursion guest -/// touches when verifying a small inner proof. -/// -/// We suspect the outer prover's 125 GB OOM wall is dominated by per-page -/// PAGE-table overhead. The number of PAGE tables the prover would build -/// equals the number of distinct 4 KB pages the executor touches — code, -/// heap, private input, and stack. This test surfaces that count without -/// running the prover. -/// -/// Layout (per `executor::constants` + `bench_vs/lambda/recursion/src/main.rs`): -/// - Code/static: whatever PT_LOAD segments the recursion ELF carries. -/// - Heap: `_end .. 0xC000_0000` (`MAX_MEMORY_SIZE`); `TlsfHeap` scatters -/// allocations across this region. -/// - Private input: starts at `PRIVATE_INPUT_START_INDEX = 0xFF000000`. 
-/// - Stack: top of address space (down from `STACK_TOP = 0xFFFFFFFFFFFFFFF0`). -/// -/// Interpretation (rough): -/// - <1,000 pages: PAGE-table overhead is not the bottleneck. -/// - 10k-100k pages: TLSF heap fragmentation; design a tighter bump allocator -/// and re-measure. -/// - >100k pages: postcard decode dominates; consider streaming decode. +/// Count the distinct 4 KB pages the guest touches (code/heap/input/stack) — a +/// proxy for the prover's per-page PAGE-table overhead, without running it. #[test] #[ignore = "diagnostic: counts distinct 4 KB memory pages touched by the recursion guest"] fn test_recursion_page_count() { @@ -803,39 +711,19 @@ fn test_recursion_page_count() { eprintln!("============================================================"); } -/// Diagnostic: build a **sampled** call-stack histogram of the recursion guest. -/// -/// Like `test_recursion_pc_histogram` but groups by full call stack (not PC). -/// To stay fast, only every `SAMPLE_RATE`-th log is recorded into the histogram. -/// The call stack itself is updated on every log (skipping would corrupt it). -/// -/// Output is written to `/tmp/recursion_folded_sampled.txt` in -/// inferno-flamegraph "folded stacks" format. Pipe it through: -/// -/// cat /tmp/recursion_folded_sampled.txt | inferno-flamegraph > svg.svg -/// -/// Expect ~10-20 minutes for SAMPLE_RATE=100 on a 40B-cycle guest. +/// Sampled call-stack flamegraph of the recursion guest, written to +/// `/tmp/recursion_folded_sampled.txt` (inferno "folded stacks" format). #[test] #[ignore = "diagnostic: sampled flamegraph for the verifier-in-VM"] fn test_recursion_sampled_flamegraph() { use executor::flamegraph::FlamegraphGenerator; use std::io::BufWriter; - /// 1 in N logs is fed to `process_logs`, which both updates the call - /// stack and records a sample. 
At 1, every cycle goes through — the call - /// stack stays exactly in sync with execution so frame widths are - /// trustworthy, but the per-cycle cost (~57µs) limits how many cycles - /// we can cover within a wall-clock budget. - /// - /// At SAMPLE_RATE > 1, every CALL/RETURN that lands on a skipped cycle - /// silently desyncs the stack, producing the "stuck-in-visit_seq" effect - /// we saw at 1:1000. Use values > 1 only when stack accuracy is - /// expendable. + /// 1-in-N logs sampled. >1 desyncs the call stack on skipped CALL/RETURNs, + /// so keep at 1 unless stack accuracy is expendable. const SAMPLE_RATE: usize = 1; - /// Stop the executor early once we've covered this many cycles. - /// Set to 0 to run to completion (40B+ cycles, hours at SAMPLE_RATE=1). - /// At SAMPLE_RATE=1, ~57µs per cycle means 5M cycles ≈ 5 min wall time. + /// Stop after this many cycles (0 = run to completion). const CYCLE_BUDGET: u64 = 5_000_000; let (recursion_elf_bytes, program, mut executor) = @@ -931,20 +819,8 @@ fn test_recursion_sampled_flamegraph() { eprintln!("============================================================"); } -/// Diagnostic: host-side per-step timings for the verifier. -/// -/// Runs an inner prove (empty guest, blowup=2, 1 query) and then verifies it -/// on the host. When built with `--features stark/instruments`, the verifier -/// prints `Time spent: ...` for each of the four steps (replay challenges, -/// composition polynomial, FRI, DEEP openings) plus the step-1-replay it -/// does before step 2. Lets us see the host-side split in seconds, without -/// running anything inside the VM. -/// -/// Usage: -/// ``` -/// cargo test --release -p lambda-vm-prover --features stark/instruments \ -/// --lib test_host_verify_step_timings -- --ignored --nocapture -/// ``` +/// Host-side per-step verifier timings (build with `--features stark/instruments` +/// for the `Time spent:` lines). No VM execution. 
#[test] #[ignore = "diagnostic: prints host-side verifier step timings"] fn test_host_verify_step_timings() { From 53145fc50f48f8b82ff4c5ce675154f1d9d25749 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 18:23:07 -0300 Subject: [PATCH 09/16] refactor(prover): drop test_host_verify_step_timings The smoke pipelines already host-verify the inner proof, so building with --features stark/instruments surfaces the per-step timings; the dedicated test was just that verify minus the guest run. Documented the flag in the module doc. --- prover/src/tests/recursion_smoke_test.rs | 38 +++--------------------- 1 file changed, 4 insertions(+), 34 deletions(-) diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index 3b3814adc..67c6a5818 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -2,6 +2,10 @@ //! hand `(VmProof, elf, opts)` to the in-VM verifier guest, then either prove //! the guest's execution (`OuterMode::Prove`) or just execute it //! (`OuterMode::ExecuteOnly`). Guest ELFs come from `make compile-recursion-elfs`. +//! +//! Every pipeline host-verifies the inner proof, so building with +//! `--features stark/instruments` makes any of these tests print the verifier's +//! per-step `Time spent:` timings. use std::ops::ControlFlow; use std::path::PathBuf; @@ -819,40 +823,6 @@ fn test_recursion_sampled_flamegraph() { eprintln!("============================================================"); } -/// Host-side per-step verifier timings (build with `--features stark/instruments` -/// for the `Time spent:` lines). No VM execution. 
-#[test] -#[ignore = "diagnostic: prints host-side verifier step timings"] -fn test_host_verify_step_timings() { - let root = workspace_root(); - let empty_path = - root.join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench"); - let empty_elf_bytes = std::fs::read(&empty_path).expect("read empty-bench"); - - let inner_proof_options = MIN_PROOF_OPTIONS; - - eprintln!("[host-verify] proving empty (blowup=2, fri_queries=1) ..."); - let inner_proof = crate::prove_with_options_and_inputs( - &empty_elf_bytes, - &[], - &inner_proof_options, - &crate::MaxRowsConfig::default(), - ) - .expect("inner prove should succeed"); - - eprintln!("[host-verify] verifying on host (with instruments) ..."); - let ok = crate::verify_with_options( - &inner_proof, - &empty_elf_bytes, - &inner_proof_options, - None, - None, - ) - .expect("verify errored"); - assert!(ok, "proof must verify"); - eprintln!("[host-verify] verified OK"); -} - // Control guest: decodes the blob and halts. Its cycle count subtracted from // the matching recursion run isolates the in-VM verifier cost. From da41a237cb2a41fa344a0437e7e4529e064ca317 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 18:29:58 -0300 Subject: [PATCH 10/16] Remove the unused SP1 verifier bench program It was never wired into the bench harness or CI (run.sh uses sp1/fibonacci), and its in-VM verifier-cost comparison is superseded by the recursion profile tests in this PR. 
--- bench_vs/sp1/verifier/Cargo.toml | 3 - bench_vs/sp1/verifier/program/Cargo.toml | 10 --- bench_vs/sp1/verifier/program/src/main.rs | 34 ---------- bench_vs/sp1/verifier/script/Cargo.toml | 13 ---- bench_vs/sp1/verifier/script/build.rs | 5 -- bench_vs/sp1/verifier/script/src/main.rs | 83 ----------------------- 6 files changed, 148 deletions(-) delete mode 100644 bench_vs/sp1/verifier/Cargo.toml delete mode 100644 bench_vs/sp1/verifier/program/Cargo.toml delete mode 100644 bench_vs/sp1/verifier/program/src/main.rs delete mode 100644 bench_vs/sp1/verifier/script/Cargo.toml delete mode 100644 bench_vs/sp1/verifier/script/build.rs delete mode 100644 bench_vs/sp1/verifier/script/src/main.rs diff --git a/bench_vs/sp1/verifier/Cargo.toml b/bench_vs/sp1/verifier/Cargo.toml deleted file mode 100644 index fc24039c2..000000000 --- a/bench_vs/sp1/verifier/Cargo.toml +++ /dev/null @@ -1,3 +0,0 @@ -[workspace] -members = ["program", "script"] -resolver = "2" diff --git a/bench_vs/sp1/verifier/program/Cargo.toml b/bench_vs/sp1/verifier/program/Cargo.toml deleted file mode 100644 index 7fbc9c5ce..000000000 --- a/bench_vs/sp1/verifier/program/Cargo.toml +++ /dev/null @@ -1,10 +0,0 @@ -[package] -name = "verifier-program" -version = "0.1.0" -edition = "2024" - -[dependencies] -sp1-zkvm = "6.0.1" -lambda-vm-prover = { path = "../../../../prover", default-features = false } -serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] } -postcard = { version = "1.0", default-features = false, features = ["alloc"] } diff --git a/bench_vs/sp1/verifier/program/src/main.rs b/bench_vs/sp1/verifier/program/src/main.rs deleted file mode 100644 index c63bb67ca..000000000 --- a/bench_vs/sp1/verifier/program/src/main.rs +++ /dev/null @@ -1,34 +0,0 @@ -//! SP1 guest that runs lambda-vm's `verify_with_options` on a single proof. -//! -//! Input layout (postcard-encoded `Vec` written via `SP1Stdin::write_vec`): -//! `(VmProof, Vec, ProofOptions)` -//! 
where the inner `Vec` is the inner program's ELF bytes. -//! -//! Output: commits `[1u8]` on successful verify; the guest panics otherwise. -//! -//! Caveats: -//! - The verifier hashes through the `keccak` crate. SP1 has a Keccak -//! precompile but it patches `tiny-keccak`, not `keccak`. We don't patch -//! here, so Keccak runs as software inside the guest. Cycle counts will be -//! inflated by that overhead. Worth keeping in mind when interpreting the -//! number relative to lambda-vm's in-VM count. - -#![no_main] - -extern crate alloc; - -use alloc::vec::Vec; - -use lambda_vm_prover::{ProofOptions, VmProof}; - -sp1_zkvm::entrypoint!(main); - -pub fn main() { - let blob = sp1_zkvm::io::read_vec(); - let (vm_proof, inner_elf, options): (VmProof, Vec, ProofOptions) = - postcard::from_bytes(&blob).expect("failed to deserialize input"); - let ok = lambda_vm_prover::verify_with_options(&vm_proof, &inner_elf, &options, None, None) - .expect("verify errored"); - assert!(ok, "inner proof failed verification"); - sp1_zkvm::io::commit_slice(&[1u8]); -} diff --git a/bench_vs/sp1/verifier/script/Cargo.toml b/bench_vs/sp1/verifier/script/Cargo.toml deleted file mode 100644 index 3198059bd..000000000 --- a/bench_vs/sp1/verifier/script/Cargo.toml +++ /dev/null @@ -1,13 +0,0 @@ -[package] -name = "verifier-script" -version = "0.1.0" -edition = "2024" - -[dependencies] -sp1-sdk = { version = "6.0.1", features = ["blocking", "profiling"] } -lambda-vm-prover = { path = "../../../../prover" } -stark = { path = "../../../../crypto/stark" } -postcard = { version = "1.0", features = ["alloc"] } - -[build-dependencies] -sp1-build = "6.0.1" diff --git a/bench_vs/sp1/verifier/script/build.rs b/bench_vs/sp1/verifier/script/build.rs deleted file mode 100644 index d6cf925d6..000000000 --- a/bench_vs/sp1/verifier/script/build.rs +++ /dev/null @@ -1,5 +0,0 @@ -use sp1_build::build_program_with_args; - -fn main() { - build_program_with_args("../program", Default::default()); -} diff --git 
a/bench_vs/sp1/verifier/script/src/main.rs b/bench_vs/sp1/verifier/script/src/main.rs deleted file mode 100644 index 86e46a710..000000000 --- a/bench_vs/sp1/verifier/script/src/main.rs +++ /dev/null @@ -1,83 +0,0 @@ -//! Host driver: prove an inner empty program on lambda-vm, then execute the -//! lambda-vm verifier inside SP1's executor, printing the cycle count. -//! -//! Set `TRACE_FILE=profiles/verifier.json` to capture a DWARF-attributed -//! profile (1 sample = 1 cycle). The output can be opened with -//! `samply load profiles/verifier.json`. - -use std::path::PathBuf; - -use sp1_sdk::blocking::{Prover, ProverClient}; -use sp1_sdk::{SP1Stdin, include_elf}; - -const VERIFIER_ELF: sp1_sdk::Elf = include_elf!("verifier-program"); - -fn workspace_root() -> PathBuf { - // CARGO_MANIFEST_DIR for this crate is `/bench_vs/sp1/verifier/script`. - PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .ancestors() - .nth(4) - .expect("workspace root") - .to_path_buf() -} - -fn main() { - sp1_sdk::utils::setup_logger(); - - let root = workspace_root(); - let empty_elf_path = root - .join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench"); - assert!( - empty_elf_path.exists(), - "empty-bench ELF not found at {} — run `bash bench_vs/build_recursion_elfs.sh` first", - empty_elf_path.display(), - ); - let inner_elf = std::fs::read(&empty_elf_path).expect("read empty-bench"); - - let options = stark::proof::options::ProofOptions { - blowup_factor: 2, - fri_number_of_queries: 1, - coset_offset: 3, - grinding_factor: 1, - }; - - println!("[sp1-verifier] proving inner (empty, blowup=2, 1 query) ..."); - let inner_proof = lambda_vm_prover::prove_with_options_and_inputs( - &inner_elf, - &[], - &options, - &lambda_vm_prover::MaxRowsConfig::default(), - ) - .expect("inner prove should succeed"); - - let blob = postcard::to_allocvec(&(&inner_proof, &inner_elf, &options)) - .expect("postcard encode failed"); - println!("[sp1-verifier] postcard blob: {} bytes", 
blob.len()); - - let client = ProverClient::from_env(); - let mut stdin = SP1Stdin::new(); - stdin.write_vec(blob); - - println!("[sp1-verifier] executing verifier in SP1 ..."); - let (_, report) = client - .execute(VERIFIER_ELF.clone(), stdin) - .run() - .expect("execute failed"); - - let cycles = report.total_instruction_count(); - println!(); - println!("============================================================"); - println!(" SP1 EXECUTION SUMMARY — lambda-vm verifier inside SP1"); - println!("============================================================"); - println!(" Total cycles : {cycles}"); - println!(); - println!(" Compare against lambda-vm in-VM count (~40.5B for the same"); - println!(" proof). Both VMs target riscv64im, so word width is symmetric."); - println!(" Main remaining asymmetry: lambda-vm's KeccakPermute precompile"); - println!(" is patched on its guests but SP1 does not patch `keccak` (only"); - println!(" `tiny-keccak`), so Keccak rounds run as software in SP1 here."); - println!(); - println!(" If TRACE_FILE was set, the profile was written there."); - println!(" Render with: samply load "); - println!("============================================================"); -} From acd2c67f1b5bd93eec3290a6ed97dcc67f88a743 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 18:33:55 -0300 Subject: [PATCH 11/16] fix(ci): run the multi test for the multi-query matrix entry Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> --- .github/workflows/profile-recursion.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml index 420cebfcb..d7fb12447 100644 --- a/.github/workflows/profile-recursion.yml +++ b/.github/workflows/profile-recursion.yml @@ -39,7 +39,7 @@ jobs: test: single title: "Single query (blowup=2, 1 query)" - name: multi-query - test: single + test: multi title: "Multi query (blowup=8, 128-bit)" steps: - name: React to comment From
e52cd9db000b870fdc494689364a99b402be5aee Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 18:34:29 -0300 Subject: [PATCH 12/16] fix(ci): tee the profile test output to /tmp/hist.log Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> --- .github/workflows/profile-recursion.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml index d7fb12447..1e2f2ae0c 100644 --- a/.github/workflows/profile-recursion.yml +++ b/.github/workflows/profile-recursion.yml @@ -84,7 +84,7 @@ jobs: # test triggers picks this up via the Makefile's `SYSROOT_DIR ?=`. export SYSROOT_DIR="$HOME/.lambda-vm-sysroot" set -o pipefail - make test-profile-recursion-$TEST + make test-profile-recursion-$TEST 2>&1 | tee /tmp/hist.log - name: Aggregate into a per-function fragment if: always() From b70789a938c8bb5469147a0bff95e4fbaed22d8b Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 18:41:40 -0300 Subject: [PATCH 13/16] cargo fmt --- prover/src/tests/recursion_smoke_test.rs | 39 +++++++++++++++++++----- 1 file changed, 31 insertions(+), 8 deletions(-) diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index 67c6a5818..e3e44bfa3 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -83,8 +83,11 @@ fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8] let program = Elf::load(recursion_elf_bytes).expect("load recursion elf"); let mut executor = Executor::new(&program, blob.to_vec()).expect("executor new"); - let (total_cycles, exec_time) = - drive_executor(&mut executor, |_log| ControlFlow::Continue(()), |_, _, _| {}); + let (total_cycles, exec_time) = drive_executor( + &mut executor, + |_log| ControlFlow::Continue(()), + |_, _, _| {}, + ); let committed = executor .finish() @@ -135,7 +138,10 @@ fn drive_executor( let start = std::time::Instant::now(); let mut
total_cycles: u64 = 0; let mut chunks: usize = 0; - while let Some(logs) = executor.resume().expect("executor resume failed (guest panicked in-VM?)") { + while let Some(logs) = executor + .resume() + .expect("executor resume failed (guest panicked in-VM?)") + { let mut stop = false; for log in logs { total_cycles += 1; @@ -181,7 +187,10 @@ fn setup_guest_run( } /// A `drive_executor` progress callback printing one line every `stride` chunks. -fn log_progress(label: impl Into, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) { +fn log_progress( + label: impl Into, + stride: usize, +) -> impl FnMut(usize, u64, std::time::Duration) { let label = label.into(); move |chunks, cycles, elapsed| { if chunks.is_multiple_of(stride) { @@ -222,7 +231,9 @@ fn print_function_table( let mut by_function: std::collections::HashMap = std::collections::HashMap::new(); for (pc, count) in &pc_hist { - let entry = by_function.entry(resolve_pc(symbols, *pc)).or_insert((0, 0)); + let entry = by_function + .entry(resolve_pc(symbols, *pc)) + .or_insert((0, 0)); entry.0 += *count; // cycles entry.1 += 1; // distinct PCs folded into this function } @@ -298,18 +309,30 @@ fn run_profile( "{guest_name} ELF has no symbol table — was it stripped?" 
); for (i, kw) in VERIFIER_STEP_KEYWORDS.iter().enumerate() { - let n = symbols.functions().iter().filter(|f| f.name.contains(kw)).count(); + let n = symbols + .functions() + .iter() + .filter(|f| f.name.contains(kw)) + .count(); eprintln!( "[profile] step {}: keyword={kw:?} -> {n} symbol(s) {}", i + 1, - if n > 0 { "" } else { "(no match; merges into previous bucket)" }, + if n > 0 { + "" + } else { + "(no match; merges into previous bucket)" + }, ); } } eprintln!( "[profile] executing {guest_name} guest ({}) ...", - if detailed { "histogram + steps" } else { "cycle counter" } + if detailed { + "histogram + steps" + } else { + "cycle counter" + } ); let (total_cycles, exec_time) = drive_executor( &mut executor, From 8cb31e550d63cd85b3fbfced932f7a2e8719dcf7 Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 18:43:55 -0300 Subject: [PATCH 14/16] ci: gate recursion-profile comment job on profile not being skipped --- .github/workflows/profile-recursion.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml index 1e2f2ae0c..0e614fcd8 100644 --- a/.github/workflows/profile-recursion.yml +++ b/.github/workflows/profile-recursion.yml @@ -106,7 +106,10 @@ jobs: # Stitch the matrix fragments into a single PR comment. comment: needs: profile - if: always() && github.event_name == 'issue_comment' + # always() so partial-matrix failures still post; skip when `profile` was + # skipped (non-/profile_recursion or non-member comment) so this job — and + # the self-hosted bench runner it spins up — doesn't fire on every comment. 
+ if: always() && github.event_name == 'issue_comment' && needs.profile.result != 'skipped' runs-on: [self-hosted, bench] steps: - name: Get PR head ref From cd0d61541ae8b432831011ce01f9431da425104c Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 18:49:04 -0300 Subject: [PATCH 15/16] lint --- prover/src/tests/recursion_smoke_test.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs index e3e44bfa3..d34e7cf4d 100644 --- a/prover/src/tests/recursion_smoke_test.rs +++ b/prover/src/tests/recursion_smoke_test.rs @@ -786,7 +786,7 @@ fn test_recursion_sampled_flamegraph() { // body. Skipped logs lose stack accuracy — acceptable diagnostic // quality at higher rates. #[allow(clippy::modulo_one)] - let take = i % SAMPLE_RATE == 0; + let take = i.is_multiple_of(SAMPLE_RATE); if take { generator .borrow_mut() From 0a58f0fbf910dfc9e3028ed7f526db11006df26c Mon Sep 17 00:00:00 2001 From: Mario Rugiero Date: Tue, 30 Jun 2026 21:11:40 -0300 Subject: [PATCH 16/16] inline(never) for high-level steps to avoid missing symbols --- crypto/stark/src/verifier.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs index 03119f617..d4186e563 100644 --- a/crypto/stark/src/verifier.rs +++ b/crypto/stark/src/verifier.rs @@ -97,6 +97,7 @@ pub trait IsStarkVerifier< /// Checks whether the purported evaluations of the composition polynomial parts and the trace /// polynomials at the out-of-domain challenge are consistent. 
/// See https://lambdaclass.github.io/lambdaworks/starks/protocol.html#step-2-verify-claimed-composition-polynomial + #[inline(never)] fn step_2_verify_claimed_composition_polynomial( air: &dyn AIR, proof: &StarkProof, @@ -241,6 +242,7 @@ pub trait IsStarkVerifier< /// Reconstructs the Deep composition polynomial evaluations at the challenge indices values using the provided /// openings of the trace polynomials and the composition polynomial parts. It then uses these to verify that the /// FRI decommitments are valid and correspond to the Deep composition polynomial. + #[inline(never)] fn step_3_verify_fri( proof: &StarkProof, domain: &VerifierDomain, @@ -396,6 +398,7 @@ pub trait IsStarkVerifier< /// Verifies the validity of the purported values of the trace polynomials and the composition polynomial /// parts at the domain elements and their symmetric counterparts corresponding to all the FRI query /// index challenges. + #[inline(never)] fn step_4_verify_trace_and_composition_openings( proof: &StarkProof, challenges: &Challenges, @@ -903,6 +906,7 @@ pub trait IsStarkVerifier< /// Replays rounds 2, 3 and 4 of the protocol for a given proof, assuming round 1 has /// already been replayed and the RAP challenges are known. + #[inline(never)] fn replay_rounds_after_round_1( air: &dyn AIR, proof: &StarkProof,