From ae9e2006fd9348b7a7aaf6dae6433e6b57de3c55 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Fri, 26 Jun 2026 18:00:39 -0300
Subject: [PATCH 01/16] feat: recursion profiling + measurement programs

Add the measurement/profiling harness for the in-VM STARK verifier:

- `empty`-proof and `deserialize-only` bench guests + `sp1/verifier`
  cross-prover comparison, all exercising the no_std verifier.
- Expand the recursion smoke test with PC-histogram, sampled-flamegraph,
  page-count, cycle-count and per-step-breakdown diagnostics, plus the
  `make test-profile-recursion` targets and the histogram-aggregation
  CI script/workflow.
- Expose read-only `Executor::memory()`, `Memory::cells()` and
  `SymbolTable::functions()` accessors and make `flamegraph::demangle`
  public so the diagnostics can resolve guest PCs to functions.
---
 .../scripts/aggregate_recursion_histogram.py  | 126 +++
 .github/workflows/profile-recursion.yml       | 175 ++++
 Makefile                                      |  10 +-
 .../deserialize-only/.cargo/config.toml       |   6 +
 bench_vs/lambda/deserialize-only/Cargo.toml   |  13 +
 bench_vs/lambda/deserialize-only/src/main.rs  |  93 ++
 bench_vs/sp1/verifier/Cargo.toml              |   3 +
 bench_vs/sp1/verifier/program/Cargo.toml      |  10 +
 bench_vs/sp1/verifier/program/src/main.rs     |  34 +
 bench_vs/sp1/verifier/script/Cargo.toml       |  13 +
 bench_vs/sp1/verifier/script/build.rs         |   5 +
 bench_vs/sp1/verifier/script/src/main.rs      |  83 ++
 executor/src/elf.rs                           |   5 +
 executor/src/flamegraph.rs                    |   2 +-
 executor/src/vm/memory.rs                     |   7 +
 prover/src/tests/recursion_smoke_test.rs      | 936 +++++++++++++++++-
 16 files changed, 1510 insertions(+), 11 deletions(-)
 create mode 100755 .github/scripts/aggregate_recursion_histogram.py
 create mode 100644 .github/workflows/profile-recursion.yml
 create mode 100644 bench_vs/lambda/deserialize-only/.cargo/config.toml
 create mode 100644 bench_vs/lambda/deserialize-only/Cargo.toml
 create mode 100644 bench_vs/lambda/deserialize-only/src/main.rs
 create mode 100644 bench_vs/sp1/verifier/Cargo.toml
 create mode 100644 bench_vs/sp1/verifier/program/Cargo.toml
 create mode 100644 bench_vs/sp1/verifier/program/src/main.rs
 create mode 100644 bench_vs/sp1/verifier/script/Cargo.toml
 create mode 100644 bench_vs/sp1/verifier/script/build.rs
 create mode 100644 bench_vs/sp1/verifier/script/src/main.rs
diff --git a/.github/scripts/aggregate_recursion_histogram.py b/.github/scripts/aggregate_recursion_histogram.py
new file mode 100755
index 000000000..8a12dc05e
--- /dev/null
+++ b/.github/scripts/aggregate_recursion_histogram.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""Format the recursion-guest per-function profile as a Markdown PR comment.
+
+`test_recursion_pc_histogram` prints a per-function summary table (cycles folded
+over each function's PCs, computed across the *full* histogram) followed by a
+per-address detail table. We extract the per-function table — the view that
+shows where the cycles actually go — and render it as Markdown.
+
+    Top 25 functions by cycle count (aggregated over their PCs):
+    rank          cycles        %    cum %    PCs  function (file:line)
+       1         5335072   24.95%   24.95%     72  <...>::visit_seq::<...>
+
+Reads the test's captured output from the positional `log` argument; writes
+the Markdown body to the `-o/--out` path (or stdout when it is omitted).
+"""
+
+import re
+import sys
+
+# A per-function summary row: rank, cycles, pct%, cum%, pcs, function.
+# Distinguished from the per-PC detail rows by the absence of a 0x<pc> column.
+FN_ROW = re.compile(
+    r"^\s*\d+\s+(\d+)\s+([\d.]+)%\s+([\d.]+)%\s+(\d+)\s+(.*\S)\s*$"
+)
+FN_TABLE_START = re.compile(r"Top \d+ functions by cycle count")
+PC_TABLE_START = re.compile(r"Top \d+ PCs by cycle count")
+TOTAL_CYCLES = re.compile(r"Total cycles\s*:\s*(\d+)")
+UNIQUE_PCS = re.compile(r"Unique PCs\s*:\s*(\d+)")
+EXEC_TIME = re.compile(r"Exec time\s*:\s*(\S+)")
+
+
+def parse(text):
+    total_cycles = unique_pcs = exec_time = None
+    rows = []
+    in_fn_table = False
+    for line in text.splitlines():
+        if total_cycles is None and (m := TOTAL_CYCLES.search(line)):
+            total_cycles = int(m.group(1))
+        if unique_pcs is None and (m := UNIQUE_PCS.search(line)):
+            unique_pcs = int(m.group(1))
+        if exec_time is None and (m := EXEC_TIME.search(line)):
+            exec_time = m.group(1)
+        if FN_TABLE_START.search(line):
+            in_fn_table = True
+            continue
+        if PC_TABLE_START.search(line):
+            in_fn_table = False
+            continue
+        if in_fn_table and (m := FN_ROW.match(line)):
+            rows.append(
+                {
+                    "cycles": int(m.group(1)),
+                    "pct": m.group(2),
+                    "cum": m.group(3),
+                    "pcs": int(m.group(4)),
+                    "fn": m.group(5),
+                }
+            )
+    return total_cycles, unique_pcs, exec_time, rows
+
+
+def short(name, width=90):
+    return name if len(name) <= width else name[: width - 1] + "…"
+
+
+def render(total_cycles, unique_pcs, exec_time, rows, title="Recursion guest profile"):
+    if not rows:
+        return (
+            f"### {title}\n\n"
+            "> ⚠️ No per-function rows found in the test output — the run may "
+            "have failed before printing the table. Check the workflow logs.\n"
+        )
+
+    body = f"### {title}\n\n"
+    if total_cycles is not None:
+        body += f"**Total cycles:** {total_cycles:,}"
+        if unique_pcs is not None:
+            body += f" · **Unique PCs:** {unique_pcs:,}"
+        if exec_time:
+            body += f" · **Exec time:** {exec_time}"
+        body += "\n\n"
+
+    body += f"#### Top {len(rows)} functions by cycles (folded over their PCs)\n\n"
+    body += "| Rank | Cycles | % | Cum % | PCs | Function |\n"
+    body += "|-----:|-------:|--:|------:|----:|----------|\n"
+    for i, r in enumerate(rows, 1):
+        body += (
+            f"| {i} | {r['cycles']:,} | {r['pct']}% | {r['cum']}% | "
+            f"{r['pcs']} | `{short(r['fn'])}` |\n"
+        )
+
+    last_cum = rows[-1]["cum"]
+    body += (
+        f"\n<sub>Each function's cycles are summed over all its program counters "
+        f"across the full histogram; the top {len(rows)} cover {last_cum}% of total "
+        f"cycles. Percentages are of total cycles.</sub>\n"
+    )
+    return body
+
+
+def main():
+    import argparse
+
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("log", help="captured test output to parse")
+    ap.add_argument("-o", "--out", help="write Markdown here instead of stdout")
+    ap.add_argument(
+        "-t",
+        "--title",
+        default="Recursion guest profile",
+        help="section heading (e.g. the test/config name)",
+    )
+    args = ap.parse_args()
+
+    with open(args.log, "r", errors="replace") as f:
+        text = f.read()
+    body = render(*parse(text), title=args.title)
+    if args.out:
+        with open(args.out, "w") as f:
+            f.write(body)
+    else:
+        sys.stdout.write(body)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml
new file mode 100644
index 000000000..420cebfcb
--- /dev/null
+++ b/.github/workflows/profile-recursion.yml
@@ -0,0 +1,175 @@
+name: Profile Recursion (PR)
+
+# Runs the recursion-guest PC histogram diagnostics (single-query and
+# multi-query, in parallel via a matrix) and posts a combined per-function
+# profile as a PR comment. Triggered by a `/profile_recursion` comment from a
+# repo member, or manually via workflow_dispatch.
+
+on:
+  workflow_dispatch:
+  issue_comment:
+    types: [created]
+
+permissions:
+  contents: read
+  pull-requests: write
+
+concurrency: # Only /profile_recursion comments share the per-PR group; other events get a per-run group so they cannot cancel a profile in flight.
+  group: profile-recursion-${{ (github.event_name == 'issue_comment' && startsWith(github.event.comment.body, '/profile_recursion') && github.event.issue.number) || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  # One job per configuration; they run in parallel and each uploads a Markdown
+  # fragment artifact. The `comment` job stitches them into one PR comment.
+  profile:
+    # Skip unless: workflow_dispatch, or "/profile_recursion" comment on a PR by a member.
+    if: >-
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'issue_comment' &&
+       github.event.issue.pull_request &&
+       startsWith(github.event.comment.body, '/profile_recursion') &&
+       contains(fromJSON('["MEMBER","OWNER","COLLABORATOR"]'), github.event.comment.author_association))
+    runs-on: [self-hosted, bench]
+    timeout-minutes: 90
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: single-query
+            test: single # suffix of the `make test-profile-recursion-<test>` target
+            title: "Single query (blowup=2, 1 query)"
+          - name: multi-query
+            test: multi
+            title: "Multi query (blowup=8, 128-bit)"
+    steps:
+      - name: React to comment
+        if: github.event_name == 'issue_comment' && matrix.name == 'single-query'
+        uses: actions/github-script@v7
+        with:
+          script: |
+            await github.rest.reactions.createForIssueComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              comment_id: context.payload.comment.id,
+              content: 'eyes'
+            });
+
+      - name: Get PR head ref
+        id: pr-ref
+        if: github.event_name == 'issue_comment'
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+        run: |
+          SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ steps.pr-ref.outputs.sha || github.sha }}
+
+      - name: Setup Rust Environment
+        uses: ./.github/actions/setup-rust
+
+      - name: Add cargo to PATH
+        run: echo "$HOME/.cargo/bin" >> "$GITHUB_PATH"
+
+      - name: Run recursion PC histogram (${{ matrix.name }})
+        env:
+          TEST: ${{ matrix.test }}
+        run: |
+          # Self-provision the RISC-V sysroot in a user-writable dir (the default
+          # /opt path on the bench runner is root-owned); the guest ELF build the
+          # test triggers picks this up via the Makefile's `SYSROOT_DIR ?=`.
+          export SYSROOT_DIR="$HOME/.lambda-vm-sysroot"
+          set -o pipefail  # keep make's exit status through the tee below
+          make test-profile-recursion-$TEST 2>&1 | tee "/tmp/hist-${{ matrix.name }}.log"
+
+      - name: Aggregate into a per-function fragment
+        if: always()
+        env:
+          TITLE: ${{ matrix.title }}
+        run: |
+          python3 .github/scripts/aggregate_recursion_histogram.py \
+            "/tmp/hist-${{ matrix.name }}.log" --title "$TITLE" --out "/tmp/fragment-${{ matrix.name }}.md"
+          cat "/tmp/fragment-${{ matrix.name }}.md" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Upload fragment
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: profile-fragment-${{ matrix.name }}
+          path: /tmp/fragment-${{ matrix.name }}.md
+          retention-days: 7
+
+  # Stitch the matrix fragments into a single PR comment.
+  comment:
+    needs: profile
+    if: always() && github.event_name == 'issue_comment' && needs.profile.result != 'skipped'
+    runs-on: [self-hosted, bench]
+    steps:
+      - name: Get PR head ref
+        id: pr-ref
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUM: ${{ github.event.issue.number }}
+        run: |
+          SHA=$(gh pr view "$PR_NUM" --repo "$GITHUB_REPOSITORY" --json headRefOid -q .headRefOid)
+          echo "sha=$SHA" >> "$GITHUB_OUTPUT"
+
+      - name: Download fragments
+        uses: actions/download-artifact@v4
+        with:
+          path: fragments
+          pattern: profile-fragment-*
+          merge-multiple: true
+
+      - name: Assemble comment body
+        env:
+          COMMIT_SHA: ${{ steps.pr-ref.outputs.sha }}
+        run: |
+          {
+            echo "## Recursion guest profile"
+            echo
+            # Single-query first, then multi-query; a missing fragment is skipped.
+            for frag in fragments/fragment-single-query.md \
+                        fragments/fragment-multi-query.md; do
+              [ -f "$frag" ] && { cat "$frag"; echo; }
+            done
+            echo "<sub>Commit: ${COMMIT_SHA:0:8} · Runner: self-hosted bench</sub>"
+          } > /tmp/profile_comment.md
+          cat /tmp/profile_comment.md
+
+      - name: Comment on PR
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const fs = require('fs');
+            const body = fs.readFileSync('/tmp/profile_comment.md', 'utf8');
+
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+            });
+            // Reuse our own marker comment so repeated /profile_recursion runs update in place.
+            const existing = comments.find(c =>
+              c.user.type === 'Bot' &&
+              c.body.includes('Recursion guest profile')
+            );
+            if (existing) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existing.id,
+                body,
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body,
+              });
+            }
diff --git a/Makefile b/Makefile
index 454eff098..30e3029da 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 .PHONY: deps deps-linux deps-macos compile-programs-asm compile-programs-rust compile-bench \
 compile-programs compile-recursion-elfs clean-asm clean-rust clean-bench clean-shared \
 clean-recursion-elfs clean test test-asm \
-test-rust test-executor test-flamegraph flamegraph-prover \
+test-rust test-executor test-flamegraph flamegraph-prover test-profile-recursion test-profile-recursion-single test-profile-recursion-multi \
 test-fast test-prover test-prover-all test-disk-spill test-math-cuda test-cuda-integration \
 bench-math-cuda bench-prover bench-prover-cuda build check clippy fmt lint regen-ethrex-fixtures \
 update-ethrex-fixture-checksums check-ethrex-fixture-checksums
@@ -232,6 +232,14 @@ test-rust: compile-programs-rust
 test-flamegraph:
 	cargo test -p executor --test flamegraph
 
+test-profile-recursion: test-profile-recursion-single test-profile-recursion-multi
+
+test-profile-recursion-single: compile-programs-rust
+	cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_1query -- --ignored --nocapture
+
+test-profile-recursion-multi: compile-programs-rust
+	cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_multiquery -- --ignored --nocapture
+
 # Regenerate the committed ethrex block fixtures (see tooling/ethrex-fixtures).
 # Run after bumping the ethrex rev; README checksums are refreshed automatically.
 regen-ethrex-fixtures:
diff --git a/bench_vs/lambda/deserialize-only/.cargo/config.toml b/bench_vs/lambda/deserialize-only/.cargo/config.toml
new file mode 100644
index 000000000..be730c3ec
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/.cargo/config.toml
@@ -0,0 +1,6 @@
+[target.riscv64im-lambda-vm-elf]
+rustflags = [
+  "-C", "link-arg=-e",
+  "-C", "link-arg=main",
+  "-C", "passes=lower-atomic"
+]
diff --git a/bench_vs/lambda/deserialize-only/Cargo.toml b/bench_vs/lambda/deserialize-only/Cargo.toml
new file mode 100644
index 000000000..b4a4616f4
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/Cargo.toml
@@ -0,0 +1,13 @@
+[workspace]
+
+[package]
+name = "deserialize-only-bench"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+lambda-vm-prover = { path = "../../../prover", default-features = false }
+embedded-alloc = "0.6"
+riscv = { version = "0.15", features = ["critical-section-single-hart"] }
+serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] }
+postcard = { version = "1.0", default-features = false, features = ["alloc"] }
diff --git a/bench_vs/lambda/deserialize-only/src/main.rs b/bench_vs/lambda/deserialize-only/src/main.rs
new file mode 100644
index 000000000..8627776a1
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/src/main.rs
@@ -0,0 +1,93 @@
+//! Deserialize-only counterpart to the recursion guest.
+//!
+//! Reads the same private-input blob as `recursion-bench`, postcard-decodes
+//! `(VmProof, Vec<u8>, ProofOptions)`, then commits success
+//! and halts — without ever calling `verify_with_options`. The cycle delta
+//! between this guest and `recursion-bench` is the actual cost of the STARK
+//! verifier inside the VM (everything else being equal).
+
+#![no_std]
+#![no_main]
+
+extern crate alloc;
+
+use alloc::vec::Vec;
+use core::arch::asm;
+use core::panic::PanicInfo;
+
+use embedded_alloc::TlsfHeap as Heap;
+use lambda_vm_prover::{ProofOptions, VmProof};
+// Required to pull in the riscv crate's critical-section implementation.
+use riscv as _;
+
+const PRIVATE_INPUT_START: usize = 0xFF000000;
+const SYSCALL_COMMIT: u64 = 64;
+const SYSCALL_HALT: u64 = 93;
+const MAX_MEMORY_SIZE: usize = 0xC000_0000;
+
+#[global_allocator]
+static HEAP: Heap = Heap::empty();
+
+#[panic_handler]
+fn panic(_info: &PanicInfo) -> ! {
+    loop {}
+}
+
+fn init_allocator() {
+    unsafe extern "C" {
+        static _end: u8;
+    }
+    let heap_pos = (&raw const _end) as usize;
+    unsafe { HEAP.init(heap_pos, MAX_MEMORY_SIZE - heap_pos) }
+}
+
+fn read_private_input() -> &'static [u8] {
+    let len = unsafe { core::ptr::read_volatile(PRIVATE_INPUT_START as *const u32) } as usize;
+    let data = (PRIVATE_INPUT_START + 4) as *const u8;
+    unsafe { core::slice::from_raw_parts(data, len) }
+}
+
+fn commit(bytes: &[u8]) {
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") 1u64,
+            in("a1") bytes.as_ptr(),
+            in("a2") bytes.len(),
+            in("a7") SYSCALL_COMMIT,
+        );
+    }
+}
+
+fn halt() -> ! {
+    unsafe {
+        asm!(
+            "ecall",
+            in("a0") 0u64,
+            in("a7") SYSCALL_HALT,
+            options(noreturn),
+        );
+    }
+}
+
+#[unsafe(no_mangle)]
+pub fn main() -> ! {
+    init_allocator();
+
+    let blob = read_private_input();
+    let decoded: (VmProof, Vec<u8>, ProofOptions) =
+        postcard::from_bytes(blob).expect("failed to deserialize");
+
+    // Force the commit byte to depend on the actually-decoded value. Without
+    // this, LLVM at -O3 was eliding the postcard decode entirely — the only
+    // sinks for `decoded` were `black_box(&decoded)` (which only forces the
+    // *reference* to materialize, not the pointee) and `Drop`, neither of
+    // which require the decoded bytes to be real. With the commit byte tied
+    // to a deep field of the decoded value, the decode has to run.
+    let proof_options_byte = decoded.2.blowup_factor;
+    let inner_elf_byte = *decoded.1.first().unwrap_or(&0);
+    let marker = proof_options_byte ^ inner_elf_byte;
+
+    commit(&[marker]);
+    halt()
+}
diff --git a/bench_vs/sp1/verifier/Cargo.toml b/bench_vs/sp1/verifier/Cargo.toml
new file mode 100644
index 000000000..fc24039c2
--- /dev/null
+++ b/bench_vs/sp1/verifier/Cargo.toml
@@ -0,0 +1,3 @@
+[workspace]
+members = ["program", "script"]
+resolver = "2"
diff --git a/bench_vs/sp1/verifier/program/Cargo.toml b/bench_vs/sp1/verifier/program/Cargo.toml
new file mode 100644
index 000000000..7fbc9c5ce
--- /dev/null
+++ b/bench_vs/sp1/verifier/program/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "verifier-program"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+sp1-zkvm = "6.0.1"
+lambda-vm-prover = { path = "../../../../prover", default-features = false }
+serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] }
+postcard = { version = "1.0", default-features = false, features = ["alloc"] }
diff --git a/bench_vs/sp1/verifier/program/src/main.rs b/bench_vs/sp1/verifier/program/src/main.rs
new file mode 100644
index 000000000..c63bb67ca
--- /dev/null
+++ b/bench_vs/sp1/verifier/program/src/main.rs
@@ -0,0 +1,34 @@
+//! SP1 guest that runs lambda-vm's `verify_with_options` on a single proof.
+//!
+//! Input layout (postcard-encoded `Vec<u8>` written via `SP1Stdin::write_vec`):
+//!   `(VmProof, Vec<u8>, ProofOptions)`
+//! where the inner `Vec<u8>` is the inner program's ELF bytes.
+//!
+//! Output: commits `[1u8]` on successful verify; the guest panics otherwise.
+//!
+//! Caveats:
+//! - The verifier hashes through the `keccak` crate. SP1 has a Keccak
+//!   precompile but it patches `tiny-keccak`, not `keccak`. We don't patch
+//!   here, so Keccak runs as software inside the guest. Cycle counts will be
+//!   inflated by that overhead. Worth keeping in mind when interpreting the
+//!   number relative to lambda-vm's in-VM count.
+
+#![no_main]
+
+extern crate alloc;
+
+use alloc::vec::Vec;
+
+use lambda_vm_prover::{ProofOptions, VmProof};
+
+sp1_zkvm::entrypoint!(main);
+
+pub fn main() {
+    let blob = sp1_zkvm::io::read_vec();
+    let (vm_proof, inner_elf, options): (VmProof, Vec<u8>, ProofOptions) =
+        postcard::from_bytes(&blob).expect("failed to deserialize input");
+    let ok = lambda_vm_prover::verify_with_options(&vm_proof, &inner_elf, &options, None, None)
+        .expect("verify errored");
+    assert!(ok, "inner proof failed verification");
+    sp1_zkvm::io::commit_slice(&[1u8]);
+}
diff --git a/bench_vs/sp1/verifier/script/Cargo.toml b/bench_vs/sp1/verifier/script/Cargo.toml
new file mode 100644
index 000000000..3198059bd
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "verifier-script"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+sp1-sdk = { version = "6.0.1", features = ["blocking", "profiling"] }
+lambda-vm-prover = { path = "../../../../prover" }
+stark = { path = "../../../../crypto/stark" }
+postcard = { version = "1.0", features = ["alloc"] }
+
+[build-dependencies]
+sp1-build = "6.0.1"
diff --git a/bench_vs/sp1/verifier/script/build.rs b/bench_vs/sp1/verifier/script/build.rs
new file mode 100644
index 000000000..d6cf925d6
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/build.rs
@@ -0,0 +1,5 @@
+use sp1_build::build_program_with_args;
+
+fn main() {
+    build_program_with_args("../program", Default::default());
+}
diff --git a/bench_vs/sp1/verifier/script/src/main.rs b/bench_vs/sp1/verifier/script/src/main.rs
new file mode 100644
index 000000000..86e46a710
--- /dev/null
+++ b/bench_vs/sp1/verifier/script/src/main.rs
@@ -0,0 +1,83 @@
+//! Host driver: prove an inner empty program on lambda-vm, then execute the
+//! lambda-vm verifier inside SP1's executor, printing the cycle count.
+//!
+//! Set `TRACE_FILE=profiles/verifier.json` to capture a DWARF-attributed
+//! profile (1 sample = 1 cycle). The output can be opened with
+//! `samply load profiles/verifier.json`.
+
+use std::path::PathBuf;
+
+use sp1_sdk::blocking::{Prover, ProverClient};
+use sp1_sdk::{SP1Stdin, include_elf};
+
+const VERIFIER_ELF: sp1_sdk::Elf = include_elf!("verifier-program");
+
+fn workspace_root() -> PathBuf {
+    // CARGO_MANIFEST_DIR for this crate is `<root>/bench_vs/sp1/verifier/script`.
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .ancestors()
+        .nth(4)
+        .expect("workspace root")
+        .to_path_buf()
+}
+
+fn main() {
+    sp1_sdk::utils::setup_logger();
+
+    let root = workspace_root();
+    let empty_elf_path = root
+        .join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench");
+    assert!(
+        empty_elf_path.exists(),
+        "empty-bench ELF not found at {} — run `bash bench_vs/build_recursion_elfs.sh` first",
+        empty_elf_path.display(),
+    );
+    let inner_elf = std::fs::read(&empty_elf_path).expect("read empty-bench");
+
+    let options = stark::proof::options::ProofOptions {
+        blowup_factor: 2,
+        fri_number_of_queries: 1,
+        coset_offset: 3,
+        grinding_factor: 1,
+    };
+
+    println!("[sp1-verifier] proving inner (empty, blowup=2, 1 query) ...");
+    let inner_proof = lambda_vm_prover::prove_with_options_and_inputs(
+        &inner_elf,
+        &[],
+        &options,
+        &lambda_vm_prover::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    let blob = postcard::to_allocvec(&(&inner_proof, &inner_elf, &options))
+        .expect("postcard encode failed");
+    println!("[sp1-verifier] postcard blob: {} bytes", blob.len());
+
+    let client = ProverClient::from_env();
+    let mut stdin = SP1Stdin::new();
+    stdin.write_vec(blob);
+
+    println!("[sp1-verifier] executing verifier in SP1 ...");
+    let (_, report) = client
+        .execute(VERIFIER_ELF.clone(), stdin)
+        .run()
+        .expect("execute failed");
+
+    let cycles = report.total_instruction_count();
+    println!();
+    println!("============================================================");
+    println!("  SP1 EXECUTION SUMMARY — lambda-vm verifier inside SP1");
+    println!("============================================================");
+    println!("  Total cycles : {cycles}");
+    println!();
+    println!("  Compare against lambda-vm in-VM count (~40.5B for the same");
+    println!("  proof). Both VMs target riscv64im, so word width is symmetric.");
+    println!("  Main remaining asymmetry: lambda-vm's KeccakPermute precompile");
+    println!("  is patched on its guests but SP1 does not patch `keccak` (only");
+    println!("  `tiny-keccak`), so Keccak rounds run as software in SP1 here.");
+    println!();
+    println!("  If TRACE_FILE was set, the profile was written there.");
+    println!("  Render with: samply load <trace>");
+    println!("============================================================");
+}
diff --git a/executor/src/elf.rs b/executor/src/elf.rs
index ed79fb983..da38cbbf1 100644
--- a/executor/src/elf.rs
+++ b/executor/src/elf.rs
@@ -557,4 +557,9 @@ impl SymbolTable {
     pub fn len(&self) -> usize {
         self.functions.len()
     }
+
+    /// Borrow the full function list (sorted by address).
+    pub fn functions(&self) -> &[FunctionSymbol] {
+        &self.functions
+    }
 }
diff --git a/executor/src/flamegraph.rs b/executor/src/flamegraph.rs
index f9b447d19..4764d71a2 100644
--- a/executor/src/flamegraph.rs
+++ b/executor/src/flamegraph.rs
@@ -154,7 +154,7 @@ impl FlamegraphGenerator {
 /// Demangle a Rust symbol name using the official rustc-demangle crate.
 ///
 /// Uses the alternate format (`{:#}`) to omit the hash suffix for cleaner output.
-pub(crate) fn demangle(name: &str) -> String {
+pub fn demangle(name: &str) -> String {
     // Use rustc-demangle with alternate format to omit hash
     format!("{:#}", rustc_demangle(name))
 }
diff --git a/executor/src/vm/memory.rs b/executor/src/vm/memory.rs
index f349eeae6..f3a3e622c 100644
--- a/executor/src/vm/memory.rs
+++ b/executor/src/vm/memory.rs
@@ -218,6 +218,13 @@ impl Memory {
         Ok(self.public_output.clone())
     }
 
+    /// Read-only access to the underlying 4-byte cell map. Exposed for
+    /// diagnostic tooling (e.g. counting the distinct 4 KB memory pages a
+    /// program touches) — not part of the normal execution interface.
+    pub fn cells(&self) -> &U64HashMap<[u8; 4]> {
+        &self.cells
+    }
+
     /// Pre-loads private input bytes at `PRIVATE_INPUT_START_INDEX` as a
     /// 4-byte LE length prefix followed by the raw data. The guest reads these
     /// bytes directly via normal RISC-V loads (ZisK-style memory-mapped input).
diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index 7bcd4bd3d..a39bcfc90 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -2,7 +2,7 @@
 //!
 //! Each test:
 //! 1. Proves an inner program on the host.
-//! 2. Serializes `(VmProof, inner_elf, opts)` with postcard.
+//! 2. Serializes `(VmProof, inner_elf, opts)` with postcard.
 //! 3. Hands that as private input to the recursion guest.
 //! 4. Either **proves** the recursion guest's execution (memory-bounded via
 //!    continuations) and verifies the outer proof (`OuterMode::Prove`), or
@@ -12,6 +12,7 @@
 //!
 //! The guest ELFs are assumed built by `make compile-recursion-elfs`.
 
+use std::ops::ControlFlow;
 use std::path::PathBuf;
 
 fn workspace_root() -> PathBuf {
@@ -33,10 +34,10 @@ fn read_guest_elf(root: &std::path::Path, name: &str) -> Vec<u8> {
 }
 
 /// Minimum-security FRI parameters: blowup=2, a single FRI query. Security is
-/// intentionally terrible — used by the capacity-probing test, where the goal
-/// is the smallest possible inner proof, not a sound one.
-/// (`GoldilocksCubicProofOptions::with_blowup` derives a query count from a
-/// 128-bit target, far more than we want here.)
+/// intentionally terrible — used by the capacity-probing test and every cheap
+/// diagnostic below, where the goal is the smallest possible inner proof, not
+/// a sound one. (`GoldilocksCubicProofOptions::with_blowup` derives a query
+/// count from a 128-bit target, far more than we want here.)
 const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions =
     stark::proof::options::ProofOptions {
         blowup_factor: 2,
@@ -46,10 +47,10 @@ const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions =
     };
 
 /// Prove `inner_elf` (fed `inner_input`) under `opts`, then package
-/// `(proof, elf, opts)` into the postcard blob the recursion guest consumes as
-/// its private input. `tag` prefixes the progress lines. Returns the inner
-/// proof — callers that re-verify it on the host need it — next to the encoded
-/// blob.
+/// `(proof, elf, opts)` into the postcard blob the recursion and
+/// deserialize-only guests consume as their private input. `tag` prefixes the
+/// progress lines. Returns the inner proof — callers that re-verify it on the
+/// host need it — next to the encoded blob.
 fn prove_inner_and_encode_blob(
     tag: &str,
     inner_elf: &[u8],
@@ -148,6 +149,132 @@ fn prove_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8])
     committed
 }
 
+/// Drive `executor` to completion (or an early stop) one `Executor::resume()`
+/// chunk at a time, handing each `Log` to `on_log` as it is produced. Nothing
+/// is buffered beyond the current chunk — `Executor::run` would accumulate tens
+/// of millions of `Log`s and OOM even a 125 GB box. `on_log` answers
+/// `ControlFlow::Break(())` to end the run early (e.g. once a cycle budget is
+/// hit) and `Continue(())` otherwise. `on_progress(chunks, total_cycles,
+/// elapsed)` is invoked once per resumed chunk; throttling and formatting are
+/// the caller's job. Returns `(total_cycles, wall_time)`, where `total_cycles`
+/// counts the logs actually visited and so stays exact on a mid-chunk break.
+fn drive_executor(
+    executor: &mut executor::vm::execution::Executor,
+    mut on_log: impl FnMut(&executor::vm::logs::Log) -> ControlFlow<()>,
+    mut on_progress: impl FnMut(usize, u64, std::time::Duration),
+) -> (u64, std::time::Duration) {
+    let started = std::time::Instant::now();
+    let mut visited: u64 = 0;
+    let mut chunk_index: usize = 0;
+    while let Some(logs) = executor.resume().expect("executor resume failed") {
+        let mut flow = ControlFlow::Continue(());
+        for log in logs {
+            visited += 1;
+            flow = on_log(log);
+            if flow.is_break() {
+                break;
+            }
+        }
+        chunk_index += 1;
+        on_progress(chunk_index, visited, started.elapsed());
+        if flow.is_break() {
+            break;
+        }
+    }
+    (visited, started.elapsed())
+}
+
+/// Map a guest PC to the demangled name of the function that contains it,
+/// looked up in the ELF's own symbol table — the same data
+/// `executor::flamegraph` resolves against. Yields `<unknown>` when no function
+/// symbol covers the PC (e.g. PLT stubs, or a release build that dropped
+/// symbols). Function names only: the table has ranges, not DWARF line info.
+fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String {
+    match symbols.lookup(pc) {
+        Some(symbol) => executor::flamegraph::demangle(&symbol.name),
+        None => "<unknown>".to_string(),
+    }
+}
+
+/// Print a PC histogram as two tables: a per-function summary (cycles folded
+/// over every PC of each resolved function) and the top-100 per-address
+/// detail. `pc_hist` maps program counter → cycle count. Equal counts are
+/// ordered by name / address so repeated runs print identical tables. The
+/// per-function view is the one that matters: an inlined kernel is spread
+/// across dozens of PCs, so the raw per-address table scatters its true cost.
+fn print_pc_histogram(
+    title: &str,
+    symbols: &executor::elf::SymbolTable,
+    pc_hist: std::collections::HashMap<u64, u64>,
+    total_cycles: u64,
+    exec_time: std::time::Duration,
+) {
+    let mut entries: Vec<(u64, u64)> = pc_hist.into_iter().collect();
+    entries.sort_unstable_by_key(|&(pc, count)| (std::cmp::Reverse(count), pc));
+
+    // Aggregate the full histogram by resolved function, resolving each PC once.
+    let mut by_function: std::collections::HashMap<String, (u64, u64)> =
+        std::collections::HashMap::new();
+    for (pc, count) in &entries {
+        let entry = by_function
+            .entry(resolve_pc(symbols, *pc))
+            .or_insert((0, 0));
+        entry.0 += *count; // cycles
+        entry.1 += 1; // distinct PCs folded into this function
+    }
+    let mut fn_entries: Vec<(String, (u64, u64))> = by_function.into_iter().collect();
+    fn_entries.sort_unstable_by(|(a, (x, _)), (b, (y, _))| y.cmp(x).then_with(|| a.cmp(b)));
+
+    let pct = |n: u64| 100.0 * (n as f64) / (total_cycles as f64);
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  {title}");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Unique PCs   : {}", entries.len());
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  Top 25 functions by cycle count (aggregated over their PCs):");
+    eprintln!(
+        "  {:>4}  {:>14}  {:>7}  {:>7}  {:>5}  {}",
+        "rank", "cycles", "%", "cum %", "PCs", "function"
+    );
+    let mut fn_cumulative: u64 = 0;
+    for (rank, (name, (cycles, pcs))) in fn_entries.iter().take(25).enumerate() {
+        fn_cumulative += cycles;
+        eprintln!(
+            "  {:>4}  {:>14}  {:>6.2}%  {:>6.2}%  {:>5}  {}",
+            rank + 1,
+            cycles,
+            pct(*cycles),
+            pct(fn_cumulative),
+            pcs,
+            name,
+        );
+    }
+    eprintln!();
+    eprintln!("  Top 100 PCs by cycle count (per-address detail):");
+    eprintln!(
+        "  {:>4}  {:>18}  {:>14}  {:>7}  {:>7}  {}",
+        "rank", "pc", "cycles", "%", "cum %", "function"
+    );
+    let mut cumulative: u64 = 0;
+    for (rank, (pc, count)) in entries.iter().take(100).enumerate() {
+        cumulative += count;
+        eprintln!(
+            "  {:>4}  {:#018x}  {:>14}  {:>6.2}%  {:>6.2}%  {}",
+            rank + 1,
+            pc,
+            count,
+            pct(*count),
+            pct(cumulative),
+            resolve_pc(symbols, *pc),
+        );
+    }
+    eprintln!("============================================================");
+}
+
 /// Core pipeline: prove an inner program with the given options, hand the
 /// proof+ELF+options to the recursion guest, then take the guest to `mode`
 /// (execute-only or full prove) and assert it committed the `[1]` success
@@ -336,6 +463,797 @@ fn test_recursion_prove_1query() {
     );
 }
 
+/// Diagnostic: prove the `empty` guest and save the resulting recursion-guest
+/// private-input blob at `/tmp/recursion_input.bin`, where the CLI's
+/// `execute --flamegraph` can pick it up.
+///
+/// Usage after running this test:
+/// ```sh
+/// cargo run -p cli --release -- execute \
+///     bench_vs/lambda/recursion/target/riscv64im-lambda-vm-elf/release/recursion-bench \
+///     --private-input /tmp/recursion_input.bin \
+///     --flamegraph /tmp/recursion_folded.txt
+/// cat /tmp/recursion_folded.txt | inferno-flamegraph > /tmp/recursion_flamegraph.svg
+/// ```
+#[test]
+#[ignore = "diagnostic: writes recursion private input to /tmp/recursion_input.bin"]
+fn test_dump_recursion_input() {
+    let workspace = workspace_root();
+    let inner_elf = read_guest_elf(&workspace, "empty");
+
+    let (_proof, private_input) =
+        prove_inner_and_encode_blob("dump-input", &inner_elf, &[], &MIN_PROOF_OPTIONS);
+
+    let path = "/tmp/recursion_input.bin";
+    std::fs::write(path, &private_input).expect("write blob");
+    eprintln!("[dump-input] wrote {} bytes to {path}", private_input.len());
+}
+
+/// Diagnostic: build the inner proof + recursion guest input, then **execute
+/// only** the recursion guest (no STARK proving) and report cycle counts +
+/// trace size estimates.
+///
+/// This is the cheap way to find out how many RISC-V instructions the
+/// verifier actually executes inside the guest — a much faster signal than
+/// running the full outer prove (which can OOM on a 125 GB machine).
+#[test]
+#[ignore = "diagnostic: runs the executor only, prints cycle counts"]
+fn test_recursion_cycle_count() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+
+    let root = workspace_root();
+    let empty_elf_bytes = read_guest_elf(&root, "empty");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
+
+    // Build the inner proof exactly as the smoke test does, with the
+    // absolute-minimum FRI params so the inner is as small as possible.
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("cycle-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    // Execute (NOT prove) the recursion guest. `drive_executor` streams chunks
+    // and never accumulates logs in memory — this avoids the Vec<Log> blow-up
+    // that OOMs even a 125 GB server (one Log is 40 B; a few billion of them is
+    // hundreds of GB).
+    eprintln!("[cycle-count] executing recursion guest (streaming counter only) ...");
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |_log| ControlFlow::Continue(()),
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(50) {
+                eprintln!(
+                    "[cycle-count]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"
+                );
+            }
+        },
+    );
+    let cycle_count = total_cycles; // stays u64: `as usize` would truncate on 32-bit hosts
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  RECURSION GUEST EXECUTION SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Cycle count           : {cycle_count}");
+    eprintln!("  Executor wall time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  Rough memory estimate for outer prove:");
+    let bytes_per_field = 8u64;
+    let approx_columns = 250u64; // CPU + MEMW + DECODE + bus columns combined
+    let main_trace_bytes = cycle_count * approx_columns * bytes_per_field;
+    let blowup = 2u64;
+    let lde_main_bytes = main_trace_bytes * blowup;
+    eprintln!(
+        "    main trace            : ~{:.2} GB ({} cycles × ~{} cols × 8 B)",
+        main_trace_bytes as f64 / 1e9,
+        cycle_count,
+        approx_columns
+    );
+    eprintln!(
+        "    main LDE (blowup={})   : ~{:.2} GB",
+        blowup,
+        lde_main_bytes as f64 / 1e9
+    );
+    eprintln!("  (aux trace adds roughly 50% more, so peak ≈ 2-3× LDE)");
+    eprintln!("============================================================");
+}
+
+/// Diagnostic: count the distinct 4 KB memory pages the recursion guest
+/// touches when verifying a small inner proof.
+///
+/// We suspect the outer prover's 125 GB OOM wall is dominated by per-page
+/// PAGE-table overhead. The number of PAGE tables the prover would build
+/// equals the number of distinct 4 KB pages the executor touches — code,
+/// heap, private input, and stack. This test surfaces that count without
+/// running the prover.
+///
+/// Layout (per `executor::constants` + `bench_vs/lambda/recursion/src/main.rs`):
+/// - Code/static: whatever PT_LOAD segments the recursion ELF carries.
+/// - Heap: `_end .. 0xC000_0000` (`MAX_MEMORY_SIZE`); `TlsfHeap` scatters
+///   allocations across this region.
+/// - Private input: starts at `PRIVATE_INPUT_START_INDEX = 0xFF000000`.
+/// - Stack: top of address space (down from `STACK_TOP = 0xFFFFFFFFFFFFFFF0`).
+///
+/// Interpretation (rough):
+/// - <1,000 pages: PAGE-table overhead is not the bottleneck.
+/// - 10k-100k pages: TLSF heap fragmentation; design a tighter bump allocator
+///   and re-measure.
+/// - >100k pages: postcard decode dominates; consider streaming decode.
+#[test]
+#[ignore = "diagnostic: counts distinct 4 KB memory pages touched by the recursion guest"]
+fn test_recursion_page_count() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+    use executor::vm::memory::PRIVATE_INPUT_START_INDEX;
+    use std::collections::HashSet;
+
+    let root = workspace_root();
+    let empty_elf_bytes = read_guest_elf(&root, "empty");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("page-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    // Precompute the recursion ELF's PT_LOAD ranges so we can bucket code/
+    // static pages separately from heap. `Elf::load` already expands BSS
+    // (memsz > filesz) into zero-valued words, so these ranges cover
+    // .text + .rodata + .data + .bss.
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let segment_ranges: Vec<(u64, u64)> = program
+        .data
+        .iter()
+        .map(|seg| (seg.base_addr, seg.base_addr + (seg.values.len() as u64 * 4)))
+        .collect();
+    eprintln!(
+        "[page-count] recursion ELF: {} PT_LOAD segment(s)",
+        segment_ranges.len(),
+    );
+    for (i, (lo, hi)) in segment_ranges.iter().enumerate() {
+        eprintln!(
+            "[page-count]   segment[{i}]: 0x{lo:016x} .. 0x{hi:016x} ({} bytes)",
+            hi - lo,
+        );
+    }
+
+    // Stream through execution — running to completion via `Executor::run`
+    // would accumulate ~67 M `Log` records (~2.7 GB) we don't need. We only
+    // care about the *final* memory state.
+    eprintln!("[page-count] executing recursion guest (streaming) ...");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |_log| ControlFlow::Continue(()),
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(50) {
+                eprintln!(
+                    "[page-count]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"
+                );
+            }
+        },
+    );
+
+    // Collect the set of distinct 4 KB pages from every cell touched during
+    // (a) program loading, (b) private-input loading, (c) execution.
+    const PAGE_MASK: u64 = !0xFFFu64; // clears the low 12 bits: address → page base
+    let cells = executor.memory().cells();
+    let total_cells = cells.len();
+    let pages: HashSet<u64> = cells.keys().map(|&a| a & PAGE_MASK).collect();
+
+    // Bucket by region; the first matching rule below wins. A "code/static"
+    // page is any page that overlaps a PT_LOAD segment. Stack lives near the
+    // top of the 64-bit address space; private input lives in the
+    // [0xFF000000, ...) window above the 3 GB heap ceiling.
+    const HEAP_CEILING: u64 = 0xC000_0000;
+    const STACK_FLOOR: u64 = 0xFFFF_FFFF_0000_0000;
+
+    let mut code_pages = 0usize;
+    let mut heap_pages = 0usize;
+    let mut private_input_pages = 0usize;
+    let mut stack_pages = 0usize;
+    let mut other_pages = 0usize;
+
+    for &page in &pages {
+        let page_end = page.saturating_add(0x1000); // exclusive end of this page
+        let in_code = segment_ranges
+            .iter()
+            .any(|&(lo, hi)| page < hi && lo < page_end);
+        if in_code {
+            code_pages += 1;
+        } else if page >= STACK_FLOOR {
+            stack_pages += 1;
+        } else if page >= PRIVATE_INPUT_START_INDEX {
+            private_input_pages += 1;
+        } else if page < HEAP_CEILING {
+            heap_pages += 1;
+        } else {
+            other_pages += 1;
+        }
+    }
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  RECURSION GUEST PAGE-COUNT SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles                  : {total_cycles}");
+    eprintln!("  Executor wall time            : {exec_time:?}");
+    eprintln!("  Memory cells touched (4 B ea) : {total_cells}");
+    eprintln!("  Distinct 4 KB pages touched   : {}", pages.len());
+    eprintln!();
+    eprintln!("  Pages per region:");
+    eprintln!("    code/static (ELF segments)     : {code_pages}");
+    eprintln!("    heap (0..0xC000_0000)          : {heap_pages}");
+    eprintln!("    private input (0xFF000000..)   : {private_input_pages}");
+    eprintln!("    stack (>= 0xFFFFFFFF_00000000) : {stack_pages}");
+    if other_pages > 0 {
+        eprintln!("    other (unclassified)           : {other_pages}");
+    }
+    eprintln!();
+    eprintln!("  Interpretation (PAGE-table overhead):");
+    eprintln!("    <1k pages     → PAGE overhead is not the bottleneck.");
+    eprintln!("    10k-100k      → TLSF heap fragmentation; try a bump alloc.");
+    eprintln!("    >100k         → postcard decode dominates; stream-decode?");
+    eprintln!("============================================================");
+}
+
+/// Build a PC histogram of the recursion guest verifying an `empty`-program
+/// inner proof produced with `inner_proof_options`, and print it via
+/// [`print_pc_histogram`] under `title`.
+///
+/// `blowup_factor` and `fri_number_of_queries` are coupled (the query count is
+/// derived from blowup for a fixed security target), so each `#[test]` below is
+/// just this runner with a different `ProofOptions` — a single query at low
+/// blowup, vs. the security-derived multi-query count at a higher blowup.
+///
+/// Streams chunks of logs via `Executor::resume()` so memory stays bounded to
+/// the histogram itself. Each PC is resolved to its enclosing function via the
+/// in-house `executor::elf::SymbolTable` (reading the recursion ELF's symbol
+/// table directly — no external tool, no DWARF dependency).
+fn run_recursion_pc_histogram(
+    title: &str,
+    inner_proof_options: stark::proof::options::ProofOptions,
+) {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+    use std::collections::HashMap;
+
+    let root = workspace_root();
+    let empty_elf_bytes = read_guest_elf(&root, "empty");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("pc-hist", &empty_elf_bytes, &[], &inner_proof_options);
+
+    eprintln!("[pc-hist] executing recursion guest (building PC histogram) ...");
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    let mut pc_hist: HashMap<u64, u64> = HashMap::with_capacity(300_000);
+    let unique = std::cell::Cell::new(0usize); // mirror of `pc_hist.len()` for the progress closure
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |log| {
+            *pc_hist.entry(log.current_pc).or_insert(0) += 1;
+            unique.set(pc_hist.len());
+            ControlFlow::Continue(())
+        },
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(500) {
+                eprintln!(
+                    "[pc-hist]   ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}",
+                    unique.get()
+                );
+            }
+        },
+    );
+
+    // Resolve PCs to functions directly from the ELF's symbol table.
+    let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes);
+    print_pc_histogram(title, &symbols, pc_hist, total_cycles, exec_time);
+}
+
+/// Diagnostic: PC histogram of the recursion guest verifying a proof made
+/// with a **single** FRI query at blowup=2 (`MIN_PROOF_OPTIONS`).
+///
+/// This is the cheapest verifier run, dominated by fixed setup cost (decode,
+/// allocator, postcard) rather than per-query FRI/Merkle work.
+#[test]
+#[ignore = "diagnostic: ~8 minutes; prints PC histogram of the verifier-in-VM"]
+fn test_recursion_pc_histogram_1query() {
+    let title = "RECURSION GUEST PC HISTOGRAM (blowup=2, 1 query)";
+    run_recursion_pc_histogram(title, MIN_PROOF_OPTIONS);
+}
+
+/// Diagnostic: PC histogram of the recursion guest at **128-bit security**
+/// (blowup=8, FRI query count derived by the Johnson Bound Regime — tens of
+/// queries).
+///
+/// Compared against the single-query runs, weight shifts toward the verifier's
+/// per-query FRI-layer / Merkle-opening and field arithmetic.
+#[test]
+#[ignore = "diagnostic: heavy; PC histogram of the multi-query verifier-in-VM"]
+fn test_recursion_pc_histogram_multiquery() {
+    let inner_proof_options =
+        crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid");
+    let title = format!(
+        "RECURSION GUEST PC HISTOGRAM (blowup=8, {} queries, 128-bit)",
+        inner_proof_options.fri_number_of_queries
+    );
+    run_recursion_pc_histogram(&title, inner_proof_options);
+}
+
+/// Diagnostic: build a **sampled** call-stack histogram of the recursion guest.
+///
+/// Like the `test_recursion_pc_histogram_*` tests but grouped by full call
+/// stack (not PC). Only every `SAMPLE_RATE`-th log is fed to the flamegraph
+/// generator; skipped logs update neither the histogram nor the call stack,
+/// so any rate above 1 trades stack accuracy for speed (see `SAMPLE_RATE`).
+/// The run stops after `CYCLE_BUDGET` cycles unless that constant is 0.
+///
+/// Output is written to `/tmp/recursion_folded_sampled.txt` in
+/// inferno-flamegraph "folded stacks" format. Pipe it through:
+///
+///     cat /tmp/recursion_folded_sampled.txt | inferno-flamegraph > svg.svg
+#[test]
+#[ignore = "diagnostic: sampled flamegraph for the verifier-in-VM"]
+fn test_recursion_sampled_flamegraph() {
+    use executor::elf::Elf;
+    use executor::flamegraph::FlamegraphGenerator;
+    use executor::vm::execution::Executor;
+    use std::io::{BufWriter, Write};
+
+    /// 1 in N logs is fed to `process_logs`, which both updates the call
+    /// stack and records a sample. At 1, every cycle goes through — the call
+    /// stack stays exactly in sync with execution so frame widths are
+    /// trustworthy, but the per-cycle cost (~57µs) limits how many cycles
+    /// we can cover within a wall-clock budget.
+    ///
+    /// At SAMPLE_RATE > 1, every CALL/RETURN that lands on a skipped cycle
+    /// silently desyncs the stack, producing the "stuck-in-visit_seq" effect
+    /// we saw at 1:1000. Use values > 1 only when stack accuracy is
+    /// expendable.
+    const SAMPLE_RATE: usize = 1;
+
+    /// Stop the executor early once we've covered this many cycles.
+    /// Set to 0 to run to completion (40B+ cycles, hours at SAMPLE_RATE=1).
+    /// At SAMPLE_RATE=1, ~57µs per cycle means 5M cycles ≈ 5 min wall time.
+    const CYCLE_BUDGET: u64 = 5_000_000;
+
+    let root = workspace_root();
+    let empty_elf_bytes = read_guest_elf(&root, "empty");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("sampled-fg", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    eprintln!("[sampled-fg] executing recursion guest (sampling 1-in-{SAMPLE_RATE}) ...",);
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes);
+    let entry_point = program.entry_point;
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    // Build our own instruction cache from the same segments `Executor::new`
+    // decodes internally. Owning it (rather than reading `executor.instructions`
+    // mid-loop) is what lets the per-log closure call `process_logs` without
+    // borrowing `executor`, which `drive_executor` holds mutably for `resume()`.
+    let instructions = executor::vm::execution::InstructionCache::new(&program.data)
+        .expect("instruction cache build failed");
+
+    // RefCell so the per-log closure (`process_logs`, &mut self) and the
+    // progress closure (`write_folded`, &self) can both reach the generator —
+    // their calls never overlap, so the runtime borrow check never trips.
+    let generator = std::cell::RefCell::new(FlamegraphGenerator::new(symbols, entry_point));
+
+    // Path is defined here (not after the loop) so the periodic checkpoint
+    // writes below can target it. The final write at the end still happens.
+    let path = "/tmp/recursion_folded_sampled.txt";
+
+    // (Re)writes the folded stacks to `path`. The explicit flush surfaces I/O
+    // errors that `BufWriter`'s `Drop` would otherwise silently discard.
+    let write_folded_file = || {
+        let file = std::fs::File::create(path).expect("create output file");
+        let mut writer = BufWriter::new(file);
+        generator
+            .borrow()
+            .write_folded(&mut writer)
+            .expect("write folded output");
+        writer.flush().expect("flush folded output");
+    };
+
+    let mut i = 0usize;
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |log| {
+            // 1-in-SAMPLE_RATE logs are fed to `process_logs`. At SAMPLE_RATE==1
+            // this is the identity filter (`_ % 1 == 0`); the `#[allow]` keeps
+            // the general form so SAMPLE_RATE can be bumped without touching the
+            // body. Skipped logs lose stack accuracy at higher rates.
+            #[allow(clippy::modulo_one)]
+            let take = i % SAMPLE_RATE == 0;
+            if take {
+                generator
+                    .borrow_mut()
+                    .process_logs(std::slice::from_ref(log), &instructions)
+                    .expect("flamegraph process_logs");
+            }
+            i += 1;
+
+            // Early exit once the cycle budget is covered; a partial run still
+            // surfaces kernels that are hot throughout. `#[allow]` lets
+            // CYCLE_BUDGET be const-0 (full run) without tripping clippy.
+            #[allow(clippy::absurd_extreme_comparisons)]
+            if CYCLE_BUDGET > 0 && i as u64 >= CYCLE_BUDGET {
+                eprintln!("[sampled-fg] hit cycle budget ({CYCLE_BUDGET} cycles), stopping early");
+                ControlFlow::Break(())
+            } else {
+                ControlFlow::Continue(())
+            }
+        },
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(500) {
+                eprintln!(
+                    "[sampled-fg]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"
+                );
+                // Checkpoint: re-write the folded file in place so a killed run
+                // still leaves a usable (if partial) flamegraph on disk.
+                write_folded_file();
+            }
+        },
+    );
+
+    write_folded_file();
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  SAMPLED FLAMEGRAPH SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Sample rate  : 1 in {SAMPLE_RATE}");
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!("  Output file  : {path}");
+    eprintln!("============================================================");
+    eprintln!();
+    eprintln!("  To render SVG (requires inferno):");
+    eprintln!("    cat {path} | inferno-flamegraph > /tmp/recursion_flamegraph_sampled.svg");
+    eprintln!("============================================================");
+}
+
+/// Diagnostic: host-side per-step timings for the verifier.
+///
+/// Runs an inner prove (empty guest, blowup=2, 1 query) and then verifies it
+/// on the host. When built with `--features stark/instruments`, the verifier
+/// prints `Time spent: ...` for each of the four steps (replay challenges,
+/// composition polynomial, FRI, DEEP openings) plus the step-1-replay it
+/// does before step 2. Lets us see the host-side split in seconds, without
+/// running anything inside the VM.
+///
+/// Usage:
+/// ```sh
+/// cargo test --release -p lambda-vm-prover --features stark/instruments \
+///   --lib test_host_verify_step_timings -- --ignored --nocapture
+/// ```
+#[test]
+#[ignore = "diagnostic: prints host-side verifier step timings"]
+fn test_host_verify_step_timings() {
+    let root = workspace_root();
+    let empty_path =
+        root.join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench");
+    let empty_elf_bytes = std::fs::read(&empty_path).expect("read empty-bench");
+
+    let inner_proof_options = MIN_PROOF_OPTIONS;
+
+    eprintln!("[host-verify] proving empty (blowup=2, fri_queries=1) ...");
+    let inner_proof = crate::prove_with_options_and_inputs(
+        &empty_elf_bytes,
+        &[],
+        &inner_proof_options,
+        &crate::MaxRowsConfig::default(),
+    )
+    .expect("inner prove should succeed");
+
+    eprintln!("[host-verify] verifying on host (with instruments) ...");
+    let ok = crate::verify_with_options(
+        &inner_proof,
+        &empty_elf_bytes,
+        &inner_proof_options,
+        None,
+        None,
+    )
+    .expect("verify errored");
+    assert!(ok, "proof must verify");
+    eprintln!("[host-verify] verified OK");
+}
+
+/// Diagnostic: cycle count for the **deserialize-only** counterpart of the
+/// recursion guest. Same input layout
+/// (`(VmProof, Vec<u8>, ProofOptions)`) and same proof, but
+/// the guest just postcard-decodes the blob and halts — it never calls
+/// `verify_with_options`.
+///
+/// The cycle delta between this and `test_recursion_cycle_count` is the
+/// actual cost of the STARK verifier inside the VM. Historically (40.5 B-cycle
+/// recursion guest) postcard decode was ~15.6 M cycles — negligible. Now that
+/// the recursion guest is ~67 M cycles, the same absolute cost would be ~23%
+/// of total; this test re-measures it.
+#[test]
+#[ignore = "diagnostic: runs the deserialize-only guest, prints cycle count"]
+fn test_deserialize_only_cycle_count() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+
+    let root = workspace_root();
+    let empty_elf_bytes = read_guest_elf(&root, "empty");
+    let deser_elf_bytes = read_guest_elf(&root, "deserialize-only");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("deser-only", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    eprintln!("[deser-only] executing deserialize-only guest (streaming) ...");
+    let program = Elf::load(&deser_elf_bytes).expect("ELF load failed");
+    eprintln!(
+        "[deser-only] ELF: {} bytes, entry_point=0x{:x}",
+        deser_elf_bytes.len(),
+        program.entry_point,
+    );
+    assert_ne!(
+        program.entry_point, 0,
+        "deserialize-only ELF has entry_point=0 — build artifact is malformed"
+    );
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |_log| ControlFlow::Continue(()),
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(50) {
+                eprintln!(
+                    "[deser-only]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"
+                );
+            }
+        },
+    );
+    let cycle_count = total_cycles;
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  DESERIALIZE-ONLY GUEST EXECUTION SUMMARY");
+    eprintln!("============================================================");
+    eprintln!("  Cycle count           : {cycle_count}");
+    eprintln!("  Executor wall time    : {exec_time:?}");
+    eprintln!();
+    // No hard-coded recursion-guest figure here: it goes stale whenever the
+    eprintln!("  Compare against test_recursion_cycle_count run with the");
+    eprintln!("  same proof. Delta = verifier-in-VM cost.");
+    eprintln!("============================================================");
+}
+
+/// Diagnostic: PC histogram for the **deserialize-only** guest.
+///
+/// Sibling of the `test_recursion_pc_histogram_*` tests, but targeting the
+/// deserialize-only control guest so we can locate the hot kernel inside the
+/// 15.7 M-cycle postcard decode itself. Every cycle goes through the
+/// histogram (no sampling), so attribution is exact — the previous sampled
+/// flamegraph at 1:1000 had broken stack reconstruction on skipped
+/// CALL/RETURNs, which made it unreliable for a workload this small.
+///
+/// Each top PC is resolved to its enclosing function via the in-house
+/// `executor::elf::SymbolTable`, reading the guest ELF's symbol table directly
+/// (no external tool, no DWARF dependency).
+#[test]
+#[ignore = "diagnostic: ~1 min; PC histogram for the deserialize-only guest"]
+fn test_deserialize_only_pc_histogram() {
+    use executor::elf::Elf;
+    use executor::vm::execution::Executor;
+    use std::collections::HashMap;
+
+    let root = workspace_root();
+    let empty_elf_bytes = read_guest_elf(&root, "empty");
+    let deser_elf_bytes = read_guest_elf(&root, "deserialize-only");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("deser-pc-hist", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    eprintln!("[deser-pc-hist] executing deserialize-only guest (building PC histogram) ...");
+    let program = Elf::load(&deser_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    // ~50k unique PCs is plenty: the deserialize-only guest is ~74 KB of ELF
+    // (~18k 4-byte instructions); the hot inner loop is much smaller still.
+    let mut pc_hist: HashMap<u64, u64> = HashMap::with_capacity(50_000);
+    let unique = std::cell::Cell::new(0usize); // mirror of `pc_hist.len()` for the progress closure
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |log| {
+            *pc_hist.entry(log.current_pc).or_insert(0) += 1;
+            unique.set(pc_hist.len());
+            ControlFlow::Continue(())
+        },
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(50) {
+                eprintln!(
+                    "[deser-pc-hist]   ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}",
+                    unique.get()
+                );
+            }
+        },
+    );
+
+    // Resolve PCs to functions directly from the ELF's symbol table.
+    let symbols = executor::elf::SymbolTable::parse(&deser_elf_bytes);
+    print_pc_histogram(
+        "DESERIALIZE-ONLY GUEST PC HISTOGRAM",
+        &symbols,
+        pc_hist,
+        total_cycles,
+        exec_time,
+    );
+}
+
+/// Diagnostic: bucket the recursion guest's cycles by which verifier step
+/// is currently executing.
+///
+/// The verifier's hot path is `verify_rounds_2_to_4`, which calls four
+/// sub-routines in a fixed order:
+///   1. `replay_rounds_after_round_1`               (recover challenges)
+///   2. `step_2_verify_claimed_composition_polynomial`
+///   3. `step_3_verify_fri`
+///   4. `step_4_verify_trace_and_composition_openings`
+///
+/// We resolve each executed PC against the recursion ELF's symbol table and,
+/// whenever the symbol's name contains a step's keyword, advance a monotonic
+/// state machine: the bucket only moves 0 → 1 → 2 → 3 → 4 (never backwards),
+/// so cycles inside a step's callees stay attributed to that step.
+///
+/// Bucket 0 ("setup") captures everything before step 1 is entered — the
+/// allocator init, postcard decode, and `VmAirs::new` (which contains the
+/// expensive preprocessed-commitment FFTs).
+///
+/// Streams chunks via `Executor::resume()` so memory stays bounded.
+#[test]
+#[ignore = "diagnostic: ~13 min; buckets the 40B cycles by verifier step"]
+fn test_recursion_step_breakdown() {
+    use executor::elf::{Elf, SymbolTable};
+    use executor::vm::execution::Executor;
+
+    let root = workspace_root();
+    let empty_elf_bytes = read_guest_elf(&root, "empty");
+    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
+
+    let (_inner_proof, blob) =
+        prove_inner_and_encode_blob("step-bkd", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+
+    // Build a per-step "advance bucket to N" lookup. The verifier's step
+    // functions get inlined by LLVM in release mode, so we can't rely on
+    // matching their entry PCs directly. Instead we anchor on closures the
+    // compiler emits *inside* each step's body — iterator combinators like
+    // `.fold(|...|)` keep the step's method name as a substring in their
+    // mangled symbol. Any PC that resolves to a symbol containing step N's
+    // keyword advances the bucket to N (monotonically).
+    //
+    // If step N has no matching symbol at all (e.g. step 4 is fully inlined
+    // with no closure children of its own), its cycles get attributed to the
+    // previous bucket. We report that explicitly in the summary.
+    let symbols = SymbolTable::parse(&recursion_elf_bytes);
+    assert!(
+        !symbols.is_empty(),
+        "recursion ELF has no symbol table — was it stripped?"
+    );
+
+    let step_keywords = [
+        "replay_rounds_after_round_1",
+        "step_2_verify_claimed_composition_polynomial",
+        "step_3_verify_fri",
+        "step_4_verify_trace_and_composition_openings",
+    ];
+    let step_found: [bool; 4] = std::array::from_fn(|i| {
+        symbols
+            .functions()
+            .iter()
+            .any(|f| f.name.contains(step_keywords[i]))
+    });
+    for (i, found) in step_found.iter().enumerate() {
+        let n_matches = symbols
+            .functions()
+            .iter()
+            .filter(|f| f.name.contains(step_keywords[i]))
+            .count();
+        eprintln!(
+            "[step-bkd] step {}: keyword={:?} -> {} symbol(s) {}",
+            i + 1,
+            step_keywords[i],
+            n_matches,
+            if *found {
+                ""
+            } else {
+                "(fully inlined; will merge into the previous bucket)"
+            }
+        );
+    }
+
+    // Monotonic state machine: 0=setup, 1..=4=inside step N or its callees
+    // (plus step N+1's cycles whenever step N+1 has no matching symbol).
+    // `bucket` lives in a Cell so the per-log closure can advance it while the
+    // progress closure reads it for its live readout.
+    let bucket = std::cell::Cell::new(0u8);
+    let mut buckets = [0u64; 5];
+
+    eprintln!("[step-bkd] executing recursion guest (streaming) ...");
+    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
+    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+
+    // Cache the last symbol-table hit so we only do a binary search on
+    // function transitions, not on every cycle. Functions are typically
+    // long-running (>>1 instruction), so this cache hits ~all of the time.
+    let mut last_range: Option<(u64, u64)> = None;
+    let mut last_advance: u8 = 0;
+
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |log| {
+            let pc = log.current_pc;
+            let in_cached = matches!(last_range, Some((s, e)) if pc >= s && pc < e);
+            if !in_cached {
+                // Slow path: refresh the cache from the symbol table.
+                if let Some(sym) = symbols.lookup(pc) {
+                    // SymbolTable accepts size=0 symbols as "any address >="; for
+                    // those we'd need the next symbol's start for a real upper
+                    // bound. Cheapest workaround: set a tiny range so we re-resolve
+                    // soon enough that wrong attribution is bounded.
+                    let end = sym.address + sym.size.max(1);
+                    last_range = Some((sym.address, end));
+                    last_advance = 0;
+                    for (i, kw) in step_keywords.iter().enumerate() {
+                        if sym.name.contains(kw) {
+                            last_advance = (i + 1) as u8;
+                        }
+                    }
+                } else {
+                    last_range = None;
+                    last_advance = 0;
+                }
+            }
+            if bucket.get() < last_advance {
+                bucket.set(last_advance);
+            }
+            buckets[bucket.get() as usize] += 1;
+            ControlFlow::Continue(())
+        },
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(500) {
+                eprintln!(
+                    "[step-bkd]   ... {chunks} chunks, {cycles} cycles, bucket={}, {elapsed:?}",
+                    bucket.get()
+                );
+            }
+        },
+    );
+
+    let labels = [
+        "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)",
+        "1. step 1: replay_rounds_after_round_1",
+        "2. step 2: verify_claimed_composition_polynomial",
+        "3. step 3: verify_fri",
+        "4. step 4: verify_trace_and_composition_openings (+ wrap-up)",
+    ];
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!("  RECURSION GUEST PER-STEP CYCLE BREAKDOWN");
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  {:<60}  {:>14}  {:>7}", "bucket", "cycles", "%");
+    for (label, cycles) in labels.iter().zip(buckets.iter()) {
+        let pct = if total_cycles > 0 {
+            100.0 * (*cycles as f64) / (total_cycles as f64)
+        } else {
+            0.0
+        };
+        eprintln!("  {:<60}  {:>14}  {:>6.2}%", label, cycles, pct);
+    }
+    eprintln!("============================================================");
+}
+
 /// Inner program: fibonacci(10).
 #[test]
 #[ignore = "slow: memory-bounded continuation prove of the verifier-in-VM"]

From 1d470670c45c79147ac2f8dd7626d1f89f4d3820 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 15:31:45 -0300
Subject: [PATCH 02/16] refactor(prover): drop per-address PC table from
 recursion profile

The top-100 per-address table carried bare PCs with no file:line, so it was
not actionable for optimization and the CI aggregator already discarded it.
Keep the per-function fold (the view that matters); terminate the aggregator's
function-table parse on the trailing rule instead of the removed PC header.
---
 .../scripts/aggregate_recursion_histogram.py  | 16 ++++-----
 prover/src/tests/recursion_smoke_test.rs      | 34 +++++--------------
 2 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/.github/scripts/aggregate_recursion_histogram.py b/.github/scripts/aggregate_recursion_histogram.py
index 8a12dc05e..1ae34ff70 100755
--- a/.github/scripts/aggregate_recursion_histogram.py
+++ b/.github/scripts/aggregate_recursion_histogram.py
@@ -1,13 +1,13 @@
 #!/usr/bin/env python3
 """Format the recursion-guest per-function profile as a Markdown PR comment.
 
-`test_recursion_pc_histogram` prints a per-function summary table (cycles folded
-over each function's PCs, computed across the *full* histogram) followed by a
-per-address detail table. We extract the per-function table — the view that
-shows where the cycles actually go — and render it as Markdown.
+`test_recursion_pc_histogram` prints a per-function summary table: the cycles
+folded over each function's PCs, computed across the *full* histogram — the view
+that shows where the cycles actually go. We parse that table and render it as
+Markdown.
 
     Top 25 functions by cycle count (aggregated over their PCs):
-    rank          cycles        %    cum %    PCs  function (file:line)
+    rank          cycles        %    cum %    PCs  function
        1         5335072   24.95%   24.95%     72  <...>::visit_seq::<...>
 
 Reads the test's captured output from argv[1]; writes the Markdown body to
@@ -18,12 +18,12 @@
 import sys
 
 # A per-function summary row: rank, cycles, pct%, cum%, pcs, function.
-# Distinguished from the per-PC detail rows by the absence of a 0x<pc> column.
 FN_ROW = re.compile(
     r"^\s*\d+\s+(\d+)\s+([\d.]+)%\s+([\d.]+)%\s+(\d+)\s+(.*\S)\s*$"
 )
 FN_TABLE_START = re.compile(r"Top \d+ functions by cycle count")
-PC_TABLE_START = re.compile(r"Top \d+ PCs by cycle count")
+# The "====" rule the test prints right after the (now sole) function table.
+TABLE_END = re.compile(r"^=+\s*$")
 TOTAL_CYCLES = re.compile(r"Total cycles\s*:\s*(\d+)")
 UNIQUE_PCS = re.compile(r"Unique PCs\s*:\s*(\d+)")
 EXEC_TIME = re.compile(r"Exec time\s*:\s*(\S+)")
@@ -43,7 +43,7 @@ def parse(text):
         if FN_TABLE_START.search(line):
             in_fn_table = True
             continue
-        if PC_TABLE_START.search(line):
+        if in_fn_table and TABLE_END.match(line):
             in_fn_table = False
             continue
         if in_fn_table and (m := FN_ROW.match(line)):
diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index a39bcfc90..b437bed72 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -196,12 +196,14 @@ fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String {
     )
 }
 
-/// Print a PC histogram as two tables: a per-function summary (the cycles each
-/// resolved function accounts for, folded over all its PCs) followed by the
-/// top-100 per-address detail. `pc_hist` maps program counter → cycle count.
+/// Print a per-function PC-histogram summary: the cycles each resolved function
+/// accounts for, folded over all its PCs. `pc_hist` maps program counter →
+/// cycle count.
 ///
-/// The per-function view is the one that matters: an inlined kernel is spread
-/// across dozens of PCs, so the raw per-address table scatters its true cost.
+/// We fold by function deliberately: an inlined kernel is spread across dozens
+/// of PCs, so a raw per-address table scatters its true cost — and without
+/// file:line resolution a bare PC isn't actionable for optimization anyway, so
+/// there is no per-address detail table.
 fn print_pc_histogram(
     title: &str,
     symbols: &executor::elf::SymbolTable,
@@ -209,8 +211,7 @@ fn print_pc_histogram(
     total_cycles: u64,
     exec_time: std::time::Duration,
 ) {
-    let mut entries: Vec<(u64, u64)> = pc_hist.into_iter().collect();
-    entries.sort_unstable_by_key(|(_pc, count)| std::cmp::Reverse(*count));
+    let entries: Vec<(u64, u64)> = pc_hist.into_iter().collect();
 
     // Aggregate the full histogram by resolved function, resolving each PC once.
     let mut by_function: std::collections::HashMap<String, (u64, u64)> =
@@ -253,25 +254,6 @@ fn print_pc_histogram(
             name,
         );
     }
-    eprintln!();
-    eprintln!("  Top 100 PCs by cycle count (per-address detail):");
-    eprintln!(
-        "  {:>4}  {:>18}  {:>14}  {:>7}  {:>7}  {}",
-        "rank", "pc", "cycles", "%", "cum %", "function"
-    );
-    let mut cumulative: u64 = 0;
-    for (rank, (pc, count)) in entries.iter().take(100).enumerate() {
-        cumulative += count;
-        eprintln!(
-            "  {:>4}  {:#018x}  {:>14}  {:>6.2}%  {:>6.2}%  {}",
-            rank + 1,
-            pc,
-            count,
-            pct(*count),
-            pct(cumulative),
-            resolve_pc(symbols, *pc),
-        );
-    }
     eprintln!("============================================================");
 }
 

From a395f275a16c0c0adfbaa0be7934e8be73ca5a63 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 15:40:41 -0300
Subject: [PATCH 03/16] refactor(prover): share setup/progress across recursion
 diagnostics

Extract setup_guest_run (blob build + ELF load + Executor::new) and a
log_progress throttled-readout factory, used by the cycle-count, page-count,
PC-histogram, sampled-flamegraph and step-breakdown diagnostics. Generalize
the PC-histogram runner over guest name + progress stride so the
deserialize-only histogram is a one-line caller instead of a near-duplicate.
---
 prover/src/tests/recursion_smoke_test.rs | 231 +++++++++--------------
 1 file changed, 88 insertions(+), 143 deletions(-)

diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index b437bed72..da885a15e 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -184,6 +184,47 @@ fn drive_executor(
     (total_cycles, start.elapsed())
 }
 
+/// Shared preamble for every execute-only diagnostic below: build the standard
+/// recursion private-input blob (an `empty`-program inner proof produced under
+/// `opts`), load guest `guest_name`, and stand up an executor over it. Returns
+/// the guest's raw ELF bytes (callers that resolve PCs pass them to
+/// [`executor::elf::SymbolTable::parse`]), the loaded program, and the
+/// ready-to-drive executor.
+fn setup_guest_run(
+    label: &str,
+    guest_name: &str,
+    opts: &stark::proof::options::ProofOptions,
+) -> (
+    Vec<u8>,
+    executor::elf::Elf,
+    executor::vm::execution::Executor,
+) {
+    let root = workspace_root();
+    let empty_elf_bytes = read_guest_elf(&root, "empty");
+    let guest_elf_bytes = read_guest_elf(&root, guest_name);
+
+    let (_inner_proof, blob) = prove_inner_and_encode_blob(label, &empty_elf_bytes, &[], opts);
+
+    let program = executor::elf::Elf::load(&guest_elf_bytes).expect("ELF load failed");
+    let executor = executor::vm::execution::Executor::new(&program, blob).expect("Executor::new failed");
+    (guest_elf_bytes, program, executor)
+}
+
+/// A `drive_executor` progress callback that prints the throttled
+/// `[label]   ... N chunks, M cycles, T elapsed` line every `stride` chunks —
+/// the readout every counting diagnostic shares. Tests that need extra live
+/// state (unique PC count, active step bucket) keep their own closure instead.
+fn log_progress(
+    label: &'static str,
+    stride: usize,
+) -> impl FnMut(usize, u64, std::time::Duration) {
+    move |chunks, cycles, elapsed| {
+        if chunks.is_multiple_of(stride) {
+            eprintln!("[{label}]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed");
+        }
+    }
+}
+
 /// Resolve a guest PC to its (demangled) enclosing function name using the
 /// ELF's own symbol table — the same data `executor::flamegraph` resolves
 /// against. `<unknown>` when no function symbol covers the PC (e.g. PLT stubs
@@ -481,35 +522,20 @@ fn test_dump_recursion_input() {
 #[test]
 #[ignore = "diagnostic: runs the executor only, prints cycle counts"]
 fn test_recursion_cycle_count() {
-    use executor::elf::Elf;
-    use executor::vm::execution::Executor;
-
-    let root = workspace_root();
-    let empty_elf_bytes = read_guest_elf(&root, "empty");
-    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
-
-    // Build the inner proof exactly as the smoke test does, with the
-    // absolute-minimum FRI params so the inner is as small as possible.
-    let (_inner_proof, blob) =
-        prove_inner_and_encode_blob("cycle-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+    // Build the inner proof with the absolute-minimum FRI params (smallest
+    // possible inner) and stand up the recursion guest over it.
+    let (_bytes, _program, mut executor) =
+        setup_guest_run("cycle-count", "recursion", &MIN_PROOF_OPTIONS);
 
     // Execute (NOT prove) the recursion guest. `drive_executor` streams chunks
     // and never accumulates logs in memory — this avoids the Vec<Log> blow-up
     // that OOMs even a 125 GB server (one Log is 40 B; a few billion of them is
     // hundreds of GB).
     eprintln!("[cycle-count] executing recursion guest (streaming counter only) ...");
-    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
-    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
     let (total_cycles, exec_time) = drive_executor(
         &mut executor,
         |_log| ControlFlow::Continue(()),
-        |chunks, cycles, elapsed| {
-            if chunks.is_multiple_of(50) {
-                eprintln!(
-                    "[cycle-count]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"
-                );
-            }
-        },
+        log_progress("cycle-count", 50),
     );
     let cycle_count = total_cycles as usize;
 
@@ -565,23 +591,16 @@ fn test_recursion_cycle_count() {
 #[test]
 #[ignore = "diagnostic: counts distinct 4 KB memory pages touched by the recursion guest"]
 fn test_recursion_page_count() {
-    use executor::elf::Elf;
-    use executor::vm::execution::Executor;
     use executor::vm::memory::PRIVATE_INPUT_START_INDEX;
     use std::collections::HashSet;
 
-    let root = workspace_root();
-    let empty_elf_bytes = read_guest_elf(&root, "empty");
-    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
-
-    let (_inner_proof, blob) =
-        prove_inner_and_encode_blob("page-count", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+    let (_bytes, program, mut executor) =
+        setup_guest_run("page-count", "recursion", &MIN_PROOF_OPTIONS);
 
     // Precompute the recursion ELF's PT_LOAD ranges so we can bucket code/
     // static pages separately from heap. `Elf::load` already expands BSS
     // (memsz > filesz) into zero-valued words, so these ranges cover
     // .text + .rodata + .data + .bss.
-    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
     let segment_ranges: Vec<(u64, u64)> = program
         .data
         .iter()
@@ -602,17 +621,10 @@ fn test_recursion_page_count() {
     // would accumulate ~67 M `Log` records (~2.7 GB) we don't need. We only
     // care about the *final* memory state.
     eprintln!("[page-count] executing recursion guest (streaming) ...");
-    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
     let (total_cycles, exec_time) = drive_executor(
         &mut executor,
         |_log| ControlFlow::Continue(()),
-        |chunks, cycles, elapsed| {
-            if chunks.is_multiple_of(50) {
-                eprintln!(
-                    "[page-count]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"
-                );
-            }
-        },
+        log_progress("page-count", 50),
     );
 
     // Collect the set of distinct 4 KB pages from every cell touched during
@@ -678,38 +690,33 @@ fn test_recursion_page_count() {
     eprintln!("============================================================");
 }
 
-/// Build a PC histogram of the recursion guest verifying an `empty`-program
+/// Build a PC histogram of guest `guest_name` verifying an `empty`-program
 /// inner proof produced with `inner_proof_options`, and print it via
 /// [`print_pc_histogram`] under `title`.
 ///
-/// `blowup_factor` and `fri_number_of_queries` are coupled (the query count is
-/// derived from blowup for a fixed security target), so each `#[test]` below is
-/// just this runner with a different `ProofOptions` — a single query at low
-/// blowup, vs. the security-derived multi-query count at a higher blowup.
+/// For the recursion guest, `blowup_factor` and `fri_number_of_queries` are
+/// coupled (the query count is derived from blowup for a fixed security
+/// target), so each recursion `#[test]` is just this runner with a different
+/// `ProofOptions` — a single query at low blowup, vs. the security-derived
+/// multi-query count at a higher blowup. The deserialize-only control guest
+/// reuses the same runner with its own ELF name.
 ///
 /// Streams chunks of logs via `Executor::resume()` so memory stays bounded to
 /// the histogram itself. Each PC is resolved to its enclosing function via the
-/// in-house `executor::elf::SymbolTable` (reading the recursion ELF's symbol
-/// table directly — no external tool, no DWARF dependency).
-fn run_recursion_pc_histogram(
+/// in-house `executor::elf::SymbolTable` (reading the guest ELF's symbol table
+/// directly — no external tool, no DWARF dependency).
+fn run_pc_histogram(
     title: &str,
+    guest_name: &str,
+    progress_stride: usize,
     inner_proof_options: stark::proof::options::ProofOptions,
 ) {
-    use executor::elf::Elf;
-    use executor::vm::execution::Executor;
     use std::collections::HashMap;
 
-    let root = workspace_root();
-    let empty_elf_bytes = read_guest_elf(&root, "empty");
-    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
-
-    let (_inner_proof, blob) =
-        prove_inner_and_encode_blob("pc-hist", &empty_elf_bytes, &[], &inner_proof_options);
-
-    eprintln!("[pc-hist] executing recursion guest (building PC histogram) ...");
-    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
-    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
+    let (guest_elf_bytes, _program, mut executor) =
+        setup_guest_run("pc-hist", guest_name, &inner_proof_options);
 
+    eprintln!("[pc-hist] executing {guest_name} guest (building PC histogram) ...");
     let mut pc_hist: HashMap<u64, u64> = HashMap::with_capacity(300_000);
     let unique = std::cell::Cell::new(0usize);
     let (total_cycles, exec_time) = drive_executor(
@@ -720,7 +727,7 @@ fn run_recursion_pc_histogram(
             ControlFlow::Continue(())
         },
         |chunks, cycles, elapsed| {
-            if chunks.is_multiple_of(500) {
+            if chunks.is_multiple_of(progress_stride) {
                 eprintln!(
                     "[pc-hist]   ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}",
                     unique.get()
@@ -730,7 +737,7 @@ fn run_recursion_pc_histogram(
     );
 
     // Resolve PCs to functions directly from the ELF's symbol table.
-    let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes);
+    let symbols = executor::elf::SymbolTable::parse(&guest_elf_bytes);
     print_pc_histogram(title, &symbols, pc_hist, total_cycles, exec_time);
 }
 
@@ -740,8 +747,10 @@ fn run_recursion_pc_histogram(
 #[test]
 #[ignore = "diagnostic: ~8 minutes; prints PC histogram of the verifier-in-VM"]
 fn test_recursion_pc_histogram_1query() {
-    run_recursion_pc_histogram(
+    run_pc_histogram(
         "RECURSION GUEST PC HISTOGRAM (blowup=2, 1 query)",
+        "recursion",
+        500,
         MIN_PROOF_OPTIONS,
     );
 }
@@ -755,11 +764,13 @@ fn test_recursion_pc_histogram_1query() {
 fn test_recursion_pc_histogram_multiquery() {
     let inner_proof_options =
         crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid");
-    run_recursion_pc_histogram(
+    run_pc_histogram(
         &format!(
             "RECURSION GUEST PC HISTOGRAM (blowup=8, {} queries, 128-bit)",
             inner_proof_options.fri_number_of_queries
         ),
+        "recursion",
+        500,
         inner_proof_options,
     );
 }
@@ -779,9 +790,7 @@ fn test_recursion_pc_histogram_multiquery() {
 #[test]
 #[ignore = "diagnostic: sampled flamegraph for the verifier-in-VM"]
 fn test_recursion_sampled_flamegraph() {
-    use executor::elf::Elf;
     use executor::flamegraph::FlamegraphGenerator;
-    use executor::vm::execution::Executor;
     use std::io::BufWriter;
 
     /// 1 in N logs is fed to `process_logs`, which both updates the call
@@ -801,18 +810,12 @@ fn test_recursion_sampled_flamegraph() {
     /// At SAMPLE_RATE=1, ~57µs per cycle means 5M cycles ≈ 5 min wall time.
     const CYCLE_BUDGET: u64 = 5_000_000;
 
-    let root = workspace_root();
-    let empty_elf_bytes = read_guest_elf(&root, "empty");
-    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
-
-    let (_inner_proof, blob) =
-        prove_inner_and_encode_blob("sampled-fg", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+    let (recursion_elf_bytes, program, mut executor) =
+        setup_guest_run("sampled-fg", "recursion", &MIN_PROOF_OPTIONS);
 
     eprintln!("[sampled-fg] executing recursion guest (sampling 1-in-{SAMPLE_RATE}) ...",);
-    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
     let symbols = executor::elf::SymbolTable::parse(&recursion_elf_bytes);
     let entry_point = program.entry_point;
-    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
 
     // Build our own instruction cache from the same segments `Executor::new`
     // decodes internally. Owning it (rather than reading `executor.instructions`
@@ -960,18 +963,9 @@ fn test_host_verify_step_timings() {
 #[test]
 #[ignore = "diagnostic: runs the deserialize-only guest, prints cycle count"]
 fn test_deserialize_only_cycle_count() {
-    use executor::elf::Elf;
-    use executor::vm::execution::Executor;
+    let (deser_elf_bytes, program, mut executor) =
+        setup_guest_run("deser-only", "deserialize-only", &MIN_PROOF_OPTIONS);
 
-    let root = workspace_root();
-    let empty_elf_bytes = read_guest_elf(&root, "empty");
-    let deser_elf_bytes = read_guest_elf(&root, "deserialize-only");
-
-    let (_inner_proof, blob) =
-        prove_inner_and_encode_blob("deser-only", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
-
-    eprintln!("[deser-only] executing deserialize-only guest (streaming) ...");
-    let program = Elf::load(&deser_elf_bytes).expect("ELF load failed");
     eprintln!(
         "[deser-only] ELF: {} bytes, entry_point=0x{:x}",
         deser_elf_bytes.len(),
@@ -981,18 +975,12 @@ fn test_deserialize_only_cycle_count() {
         program.entry_point, 0,
         "deserialize-only ELF has entry_point=0 — build artifact is malformed"
     );
-    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
 
+    eprintln!("[deser-only] executing deserialize-only guest (streaming) ...");
     let (total_cycles, exec_time) = drive_executor(
         &mut executor,
         |_log| ControlFlow::Continue(()),
-        |chunks, cycles, elapsed| {
-            if chunks.is_multiple_of(50) {
-                eprintln!(
-                    "[deser-only]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed"
-                );
-            }
-        },
+        log_progress("deser-only", 50),
     );
     let cycle_count = total_cycles;
 
@@ -1023,50 +1011,14 @@ fn test_deserialize_only_cycle_count() {
 #[test]
 #[ignore = "diagnostic: ~1 min; PC histogram for the deserialize-only guest"]
 fn test_deserialize_only_pc_histogram() {
-    use executor::elf::Elf;
-    use executor::vm::execution::Executor;
-    use std::collections::HashMap;
-
-    let root = workspace_root();
-    let empty_elf_bytes = read_guest_elf(&root, "empty");
-    let deser_elf_bytes = read_guest_elf(&root, "deserialize-only");
-
-    let (_inner_proof, blob) =
-        prove_inner_and_encode_blob("deser-pc-hist", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
-
-    eprintln!("[deser-pc-hist] executing deserialize-only guest (building PC histogram) ...");
-    let program = Elf::load(&deser_elf_bytes).expect("ELF load failed");
-    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
-
-    // ~50k unique PCs is plenty: the deserialize-only guest is ~74 KB of ELF
-    // (~18k 4-byte instructions); the hot inner loop is much smaller still.
-    let mut pc_hist: HashMap<u64, u64> = HashMap::with_capacity(50_000);
-    let unique = std::cell::Cell::new(0usize);
-    let (total_cycles, exec_time) = drive_executor(
-        &mut executor,
-        |log| {
-            *pc_hist.entry(log.current_pc).or_insert(0) += 1;
-            unique.set(pc_hist.len());
-            ControlFlow::Continue(())
-        },
-        |chunks, cycles, elapsed| {
-            if chunks.is_multiple_of(50) {
-                eprintln!(
-                    "[deser-pc-hist]   ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}",
-                    unique.get()
-                );
-            }
-        },
-    );
-
-    // Resolve PCs to functions directly from the ELF's symbol table.
-    let symbols = executor::elf::SymbolTable::parse(&deser_elf_bytes);
-    print_pc_histogram(
+    // Same runner as the recursion PC histograms, pointed at the deserialize-only
+    // control guest. Smaller workload (~16 M cycles, far fewer chunks), so use a
+    // tighter progress stride to still get periodic readouts.
+    run_pc_histogram(
         "DESERIALIZE-ONLY GUEST PC HISTOGRAM",
-        &symbols,
-        pc_hist,
-        total_cycles,
-        exec_time,
+        "deserialize-only",
+        50,
+        MIN_PROOF_OPTIONS,
     );
 }
 
@@ -1093,15 +1045,10 @@ fn test_deserialize_only_pc_histogram() {
 #[test]
 #[ignore = "diagnostic: ~13 min; buckets the 40B cycles by verifier step"]
 fn test_recursion_step_breakdown() {
-    use executor::elf::{Elf, SymbolTable};
-    use executor::vm::execution::Executor;
-
-    let root = workspace_root();
-    let empty_elf_bytes = read_guest_elf(&root, "empty");
-    let recursion_elf_bytes = read_guest_elf(&root, "recursion");
+    use executor::elf::SymbolTable;
 
-    let (_inner_proof, blob) =
-        prove_inner_and_encode_blob("step-bkd", &empty_elf_bytes, &[], &MIN_PROOF_OPTIONS);
+    let (recursion_elf_bytes, _program, mut executor) =
+        setup_guest_run("step-bkd", "recursion", &MIN_PROOF_OPTIONS);
 
     // Build a per-step "advance bucket to N" lookup. The verifier's step
     // functions get inlined by LLVM in release mode, so we can't rely on
@@ -1159,8 +1106,6 @@ fn test_recursion_step_breakdown() {
     let mut buckets = [0u64; 5];
 
     eprintln!("[step-bkd] executing recursion guest (streaming) ...");
-    let program = Elf::load(&recursion_elf_bytes).expect("ELF load failed");
-    let mut executor = Executor::new(&program, blob).expect("Executor::new failed");
 
     // Cache the last symbol-table hit so we only do a binary search on
     // function transitions, not on every cycle. Functions are typically

From 66f24433a626541bc4448ae255d0e4a8ce4910f3 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 15:49:08 -0300
Subject: [PATCH 04/16] cargo fmt

---
 prover/src/tests/recursion_smoke_test.rs | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index da885a15e..90baab4a6 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -206,7 +206,8 @@ fn setup_guest_run(
     let (_inner_proof, blob) = prove_inner_and_encode_blob(label, &empty_elf_bytes, &[], opts);
 
     let program = executor::elf::Elf::load(&guest_elf_bytes).expect("ELF load failed");
-    let executor = executor::vm::execution::Executor::new(&program, blob).expect("Executor::new failed");
+    let executor =
+        executor::vm::execution::Executor::new(&program, blob).expect("Executor::new failed");
     (guest_elf_bytes, program, executor)
 }
 
@@ -214,10 +215,7 @@ fn setup_guest_run(
 /// `[label]   ... N chunks, M cycles, T elapsed` line every `stride` chunks —
 /// the readout every counting diagnostic shares. Tests that need extra live
 /// state (unique PC count, active step bucket) keep their own closure instead.
-fn log_progress(
-    label: &'static str,
-    stride: usize,
-) -> impl FnMut(usize, u64, std::time::Duration) {
+fn log_progress(label: &'static str, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) {
     move |chunks, cycles, elapsed| {
         if chunks.is_multiple_of(stride) {
             eprintln!("[{label}]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed");

From acd6783bf78733a6f084d8c2b19fde925164d5e4 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 17:12:59 -0300
Subject: [PATCH 05/16] refactor(prover): unify recursion execute-only
 diagnostics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Collapse the cycle-count, PC-histogram and step-breakdown diagnostics into one
parameterized `run_profile(guest_name, progress_stride, opts, detailed)`. Total
cycles are always printed; the top-25 function table and the per-step breakdown
are printed only when `detailed` is set (both are built in the same streamed
pass over the PC stream). Every variant now comes in 1-query and multi-query
flavours for both the recursion guest and the deserialize-only control guest.
Also route `execute_outer_and_commit` through `drive_executor`: the rebased
streaming `finish()` makes its hand-rolled drain loop redundant.
---
 prover/src/tests/recursion_smoke_test.rs | 623 ++++++++---------------
 1 file changed, 224 insertions(+), 399 deletions(-)

diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index 90baab4a6..5077610e7 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -103,12 +103,8 @@ fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]
     let program = Elf::load(recursion_elf_bytes).expect("load recursion elf");
     let mut executor = Executor::new(&program, blob.to_vec()).expect("executor new");
 
-    // Drain chunks to completion without retaining logs or building a trace.
-    while executor
-        .resume()
-        .expect("recursion guest execution failed (verify panicked in-VM?)")
-        .is_some()
-    {}
+    let (total_cycles, exec_time) =
+        drive_executor(&mut executor, |_log| ControlFlow::Continue(()), |_, _, _| {});
 
     let committed = executor
         .finish()
@@ -116,7 +112,7 @@ fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]
         .memory_values;
 
     eprintln!(
-        "[{label}] committed {} bytes: {:?} (as str: {:?})",
+        "[{label}] {total_cycles} cycles in {exec_time:?}; committed {} bytes: {:?} (as str: {:?})",
         committed.len(),
         committed,
         String::from_utf8_lossy(&committed),
@@ -166,7 +162,7 @@ fn drive_executor(
     let start = std::time::Instant::now();
     let mut total_cycles: u64 = 0;
     let mut chunks: usize = 0;
-    while let Some(logs) = executor.resume().expect("executor resume failed") {
+    while let Some(logs) = executor.resume().expect("executor resume failed (guest panicked in-VM?)") {
         let mut stop = false;
         for log in logs {
             total_cycles += 1;
@@ -206,6 +202,10 @@ fn setup_guest_run(
     let (_inner_proof, blob) = prove_inner_and_encode_blob(label, &empty_elf_bytes, &[], opts);
 
     let program = executor::elf::Elf::load(&guest_elf_bytes).expect("ELF load failed");
+    assert_ne!(
+        program.entry_point, 0,
+        "{guest_name} ELF has entry_point=0 — build artifact is malformed"
+    );
     let executor =
         executor::vm::execution::Executor::new(&program, blob).expect("Executor::new failed");
     (guest_elf_bytes, program, executor)
@@ -213,9 +213,12 @@ fn setup_guest_run(
 
 /// A `drive_executor` progress callback that prints the throttled
 /// `[label]   ... N chunks, M cycles, T elapsed` line every `stride` chunks —
-/// the readout every counting diagnostic shares. Tests that need extra live
-/// state (unique PC count, active step bucket) keep their own closure instead.
-fn log_progress(label: &'static str, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) {
+/// the readout the counting diagnostics share. Tests that need extra live state
+/// (unique PC count, active step bucket) keep their own closure instead. Takes
+/// `impl Into<String>` so it works with both `&'static` tags and a run's
+/// dynamic `label`.
+fn log_progress(label: impl Into<String>, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) {
+    let label = label.into();
     move |chunks, cycles, elapsed| {
         if chunks.is_multiple_of(stride) {
             eprintln!("[{label}]   ... {chunks} chunks, {cycles} cycles, {elapsed:?} elapsed");
@@ -235,30 +238,35 @@ fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String {
     )
 }
 
-/// Print a per-function PC-histogram summary: the cycles each resolved function
-/// accounts for, folded over all its PCs. `pc_hist` maps program counter →
-/// cycle count.
-///
-/// We fold by function deliberately: an inlined kernel is spread across dozens
-/// of PCs, so a raw per-address table scatters its true cost — and without
-/// file:line resolution a bare PC isn't actionable for optimization anyway, so
-/// there is no per-address detail table.
-fn print_pc_histogram(
-    title: &str,
+/// Verifier sub-routines in execution order. LLVM inlines the step bodies, but
+/// closures inside each keep the method name in their mangled symbol, so
+/// `run_profile` advances the step bucket by substring-matching the enclosing
+/// symbol. A step with no matching symbol merges into the previous bucket.
+const VERIFIER_STEP_KEYWORDS: [&str; 4] = [
+    "replay_rounds_after_round_1",
+    "step_2_verify_claimed_composition_polynomial",
+    "step_3_verify_fri",
+    "step_4_verify_trace_and_composition_openings",
+];
+
+/// `blowup=8` inner-proof options: the security-derived multi-query count (tens
+/// of queries, 128-bit) used by every `multiquery` profiling variant.
+fn blowup8() -> stark::proof::options::ProofOptions {
+    crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid")
+}
+
+/// Fold the PC histogram by enclosing function and print the top-25 by cycles.
+/// Folded because an inlined kernel spreads across many PCs; no per-address
+/// table since a bare PC isn't actionable without file:line.
+fn print_function_table(
     symbols: &executor::elf::SymbolTable,
     pc_hist: std::collections::HashMap<u64, u64>,
     total_cycles: u64,
-    exec_time: std::time::Duration,
 ) {
-    let entries: Vec<(u64, u64)> = pc_hist.into_iter().collect();
-
-    // Aggregate the full histogram by resolved function, resolving each PC once.
     let mut by_function: std::collections::HashMap<String, (u64, u64)> =
         std::collections::HashMap::new();
-    for (pc, count) in &entries {
-        let entry = by_function
-            .entry(resolve_pc(symbols, *pc))
-            .or_insert((0, 0));
+    for (pc, count) in &pc_hist {
+        let entry = by_function.entry(resolve_pc(symbols, *pc)).or_insert((0, 0));
         entry.0 += *count; // cycles
         entry.1 += 1; // distinct PCs folded into this function
     }
@@ -266,20 +274,10 @@ fn print_pc_histogram(
     fn_entries.sort_unstable_by_key(|(_name, (cycles, _pcs))| std::cmp::Reverse(*cycles));
 
     let pct = |n: u64| 100.0 * (n as f64) / (total_cycles as f64);
-
-    eprintln!();
-    eprintln!("============================================================");
-    eprintln!("  {title}");
-    eprintln!("============================================================");
-    eprintln!("  Total cycles : {total_cycles}");
-    eprintln!("  Unique PCs   : {}", entries.len());
-    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!("  Unique PCs   : {}", pc_hist.len());
     eprintln!();
     eprintln!("  Top 25 functions by cycle count (aggregated over their PCs):");
-    eprintln!(
-        "  {:>4}  {:>14}  {:>7}  {:>7}  {:>5}  {}",
-        "rank", "cycles", "%", "cum %", "PCs", "function"
-    );
+    eprintln!("  rank          cycles        %    cum %    PCs  function");
     let mut fn_cumulative: u64 = 0;
     for (rank, (name, (cycles, pcs))) in fn_entries.iter().take(25).enumerate() {
         fn_cumulative += cycles;
@@ -293,6 +291,149 @@ fn print_pc_histogram(
             name,
         );
     }
+}
+
+/// Print the monotonic per-verifier-step cycle bucketing. `buckets[0]` is
+/// pre-step-1 setup (alloc + postcard decode + `VmAirs::new`); `buckets[i]` is
+/// verifier step i (with a missing step's cycles merged into the previous one).
+fn print_step_breakdown(buckets: &[u64; 5], total_cycles: u64) {
+    let labels = [
+        "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)",
+        "1. step 1: replay_rounds_after_round_1",
+        "2. step 2: verify_claimed_composition_polynomial",
+        "3. step 3: verify_fri",
+        "4. step 4: verify_trace_and_composition_openings (+ wrap-up)",
+    ];
+    eprintln!();
+    eprintln!("  Per-step cycle breakdown (monotonic state machine):");
+    eprintln!("  {:<60}  {:>14}  {:>7}", "bucket", "cycles", "%");
+    for (label, cycles) in labels.iter().zip(buckets.iter()) {
+        let pct = if total_cycles > 0 {
+            100.0 * (*cycles as f64) / (total_cycles as f64)
+        } else {
+            0.0
+        };
+        eprintln!("  {:<60}  {:>14}  {:>6.2}%", label, cycles, pct);
+    }
+}
+
+/// Single-pass execute-only profiler. Always prints total cycles + wall time +
+/// a rough trace/LDE size estimate. With `detailed`, the same pass also builds
+/// the PC histogram and verifier-step bucketing and prints the top-25 functions
+/// and the per-step breakdown (the two always come together); `!detailed` does
+/// no per-log work, so it's just a fast cycle counter. `progress_stride`
+/// throttles the readout (recursion large, the deserialize-only control small).
+fn run_profile(
+    guest_name: &str,
+    progress_stride: usize,
+    opts: stark::proof::options::ProofOptions,
+    detailed: bool,
+) {
+    use std::collections::HashMap;
+
+    let (guest_elf_bytes, _program, mut executor) = setup_guest_run("profile", guest_name, &opts);
+    let symbols = executor::elf::SymbolTable::parse(&guest_elf_bytes);
+
+    let mut pc_hist: HashMap<u64, u64> = HashMap::new();
+    let mut buckets = [0u64; 5];
+    let mut last_range: Option<(u64, u64)> = None;
+    let mut last_advance: u8 = 0;
+    let bucket = std::cell::Cell::new(0u8);
+    let unique = std::cell::Cell::new(0usize);
+
+    if detailed {
+        assert!(
+            !symbols.is_empty(),
+            "{guest_name} ELF has no symbol table — was it stripped?"
+        );
+        for (i, kw) in VERIFIER_STEP_KEYWORDS.iter().enumerate() {
+            let n = symbols.functions().iter().filter(|f| f.name.contains(kw)).count();
+            eprintln!(
+                "[profile] step {}: keyword={kw:?} -> {n} symbol(s) {}",
+                i + 1,
+                if n > 0 { "" } else { "(no match; merges into previous bucket)" },
+            );
+        }
+    }
+
+    eprintln!(
+        "[profile] executing {guest_name} guest ({}) ...",
+        if detailed { "histogram + steps" } else { "cycle counter" }
+    );
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |log| {
+            if detailed {
+                let pc = log.current_pc;
+                *pc_hist.entry(pc).or_insert(0) += 1;
+                unique.set(pc_hist.len());
+
+                let in_cached = matches!(last_range, Some((s, e)) if pc >= s && pc < e);
+                if !in_cached {
+                    if let Some(sym) = symbols.lookup(pc) {
+                        last_range = Some((sym.address, sym.address + sym.size.max(1)));
+                        last_advance = 0;
+                        for (i, kw) in VERIFIER_STEP_KEYWORDS.iter().enumerate() {
+                            if sym.name.contains(kw) {
+                                last_advance = (i + 1) as u8;
+                            }
+                        }
+                    } else {
+                        last_range = None;
+                        last_advance = 0;
+                    }
+                }
+                if bucket.get() < last_advance {
+                    bucket.set(last_advance);
+                }
+                buckets[bucket.get() as usize] += 1;
+            }
+            ControlFlow::Continue(())
+        },
+        |chunks, cycles, elapsed| {
+            if chunks.is_multiple_of(progress_stride) {
+                if detailed {
+                    eprintln!(
+                        "[profile]   ... {chunks} chunks, {cycles} cycles, {} unique PCs, bucket={}, {elapsed:?}",
+                        unique.get(),
+                        bucket.get(),
+                    );
+                } else {
+                    eprintln!("[profile]   ... {chunks} chunks, {cycles} cycles, {elapsed:?}");
+                }
+            }
+        },
+    );
+
+    eprintln!();
+    eprintln!("============================================================");
+    eprintln!(
+        "  {} GUEST PROFILE (blowup={}, {} queries)",
+        guest_name.to_uppercase(),
+        opts.blowup_factor,
+        opts.fri_number_of_queries,
+    );
+    eprintln!("============================================================");
+    eprintln!("  Total cycles : {total_cycles}");
+    eprintln!("  Exec time    : {exec_time:?}");
+    eprintln!();
+    eprintln!("  Rough trace/LDE size if this guest were proven:");
+    let approx_columns = 250u64;
+    let main_trace_bytes = total_cycles * approx_columns * 8;
+    eprintln!(
+        "    main trace          : ~{:.2} GB ({total_cycles} cycles × ~{approx_columns} cols × 8 B)",
+        main_trace_bytes as f64 / 1e9,
+    );
+    eprintln!(
+        "    main LDE (blowup=2) : ~{:.2} GB  (+aux ≈ 50% more → peak ≈ 2-3× LDE)",
+        (main_trace_bytes * 2) as f64 / 1e9,
+    );
+
+    if detailed {
+        eprintln!();
+        print_function_table(&symbols, pc_hist, total_cycles);
+        print_step_breakdown(&buckets, total_cycles);
+    }
     eprintln!("============================================================");
 }
 
@@ -510,59 +651,33 @@ fn test_dump_recursion_input() {
     eprintln!("[dump-input] wrote {} bytes to {path}", blob.len());
 }
 
-/// Diagnostic: build the inner proof + recursion guest input, then **execute
-/// only** the recursion guest (no STARK proving) and report cycle counts +
-/// trace size estimates.
-///
-/// This is the cheap way to find out how many RISC-V instructions the
-/// verifier actually executes inside the guest — a much faster signal than
-/// running the full outer prove (which can OOM on a 125 GB machine).
+/// Cycle count only of the recursion guest verifying a 1-query inner proof.
 #[test]
-#[ignore = "diagnostic: runs the executor only, prints cycle counts"]
-fn test_recursion_cycle_count() {
-    // Build the inner proof with the absolute-minimum FRI params (smallest
-    // possible inner) and stand up the recursion guest over it.
-    let (_bytes, _program, mut executor) =
-        setup_guest_run("cycle-count", "recursion", &MIN_PROOF_OPTIONS);
-
-    // Execute (NOT prove) the recursion guest. `drive_executor` streams chunks
-    // and never accumulates logs in memory — this avoids the Vec<Log> blow-up
-    // that OOMs even a 125 GB server (one Log is 40 B; a few billion of them is
-    // hundreds of GB).
-    eprintln!("[cycle-count] executing recursion guest (streaming counter only) ...");
-    let (total_cycles, exec_time) = drive_executor(
-        &mut executor,
-        |_log| ControlFlow::Continue(()),
-        log_progress("cycle-count", 50),
-    );
-    let cycle_count = total_cycles as usize;
+#[ignore = "diagnostic: fast; recursion guest cycle count (1 query)"]
+fn test_recursion_cycles_1query() {
+    run_profile("recursion", 500, MIN_PROOF_OPTIONS, false);
+}
 
-    eprintln!();
-    eprintln!("============================================================");
-    eprintln!("  RECURSION GUEST EXECUTION SUMMARY");
-    eprintln!("============================================================");
-    eprintln!("  Cycle count           : {cycle_count}");
-    eprintln!("  Executor wall time    : {exec_time:?}");
-    eprintln!();
-    eprintln!("  Rough memory estimate for outer prove:");
-    let bytes_per_field = 8usize;
-    let approx_columns = 250usize; // CPU + MEMW + DECODE + bus columns combined
-    let main_trace_bytes = cycle_count * approx_columns * bytes_per_field;
-    let blowup = 2usize;
-    let lde_main_bytes = main_trace_bytes * blowup;
-    eprintln!(
-        "    main trace            : ~{:.2} GB ({} cycles × ~{} cols × 8 B)",
-        main_trace_bytes as f64 / 1e9,
-        cycle_count,
-        approx_columns
-    );
-    eprintln!(
-        "    main LDE (blowup={})   : ~{:.2} GB",
-        blowup,
-        lde_main_bytes as f64 / 1e9
-    );
-    eprintln!("  (aux trace adds roughly 50% more, so peak peak ≈ 2-3× LDE)");
-    eprintln!("============================================================");
+/// Cycle count only at 128-bit security: more FRI queries → more verifier cycles.
+#[test]
+#[ignore = "diagnostic: fast; recursion guest cycle count (multi-query)"]
+fn test_recursion_cycles_multiquery() {
+    run_profile("recursion", 500, blowup8(), false);
+}
+
+/// Full profile (top-25 functions + per-step breakdown) of the 1-query run —
+/// the cheapest verifier run, dominated by fixed setup.
+#[test]
+#[ignore = "diagnostic: ~8 min; recursion guest histogram + steps (1 query)"]
+fn test_recursion_profile_1query() {
+    run_profile("recursion", 500, MIN_PROOF_OPTIONS, true);
+}
+
+/// Full profile at 128-bit security: weight shifts toward per-query FRI/Merkle.
+#[test]
+#[ignore = "diagnostic: heavy; recursion guest histogram + steps (multi-query)"]
+fn test_recursion_profile_multiquery() {
+    run_profile("recursion", 500, blowup8(), true);
 }
 
 /// Diagnostic: count the distinct 4 KB memory pages the recursion guest
@@ -688,91 +803,6 @@ fn test_recursion_page_count() {
     eprintln!("============================================================");
 }
 
-/// Build a PC histogram of guest `guest_name` verifying an `empty`-program
-/// inner proof produced with `inner_proof_options`, and print it via
-/// [`print_pc_histogram`] under `title`.
-///
-/// For the recursion guest, `blowup_factor` and `fri_number_of_queries` are
-/// coupled (the query count is derived from blowup for a fixed security
-/// target), so each recursion `#[test]` is just this runner with a different
-/// `ProofOptions` — a single query at low blowup, vs. the security-derived
-/// multi-query count at a higher blowup. The deserialize-only control guest
-/// reuses the same runner with its own ELF name.
-///
-/// Streams chunks of logs via `Executor::resume()` so memory stays bounded to
-/// the histogram itself. Each PC is resolved to its enclosing function via the
-/// in-house `executor::elf::SymbolTable` (reading the guest ELF's symbol table
-/// directly — no external tool, no DWARF dependency).
-fn run_pc_histogram(
-    title: &str,
-    guest_name: &str,
-    progress_stride: usize,
-    inner_proof_options: stark::proof::options::ProofOptions,
-) {
-    use std::collections::HashMap;
-
-    let (guest_elf_bytes, _program, mut executor) =
-        setup_guest_run("pc-hist", guest_name, &inner_proof_options);
-
-    eprintln!("[pc-hist] executing {guest_name} guest (building PC histogram) ...");
-    let mut pc_hist: HashMap<u64, u64> = HashMap::with_capacity(300_000);
-    let unique = std::cell::Cell::new(0usize);
-    let (total_cycles, exec_time) = drive_executor(
-        &mut executor,
-        |log| {
-            *pc_hist.entry(log.current_pc).or_insert(0) += 1;
-            unique.set(pc_hist.len());
-            ControlFlow::Continue(())
-        },
-        |chunks, cycles, elapsed| {
-            if chunks.is_multiple_of(progress_stride) {
-                eprintln!(
-                    "[pc-hist]   ... {chunks} chunks, {cycles} cycles, {} unique PCs, {elapsed:?}",
-                    unique.get()
-                );
-            }
-        },
-    );
-
-    // Resolve PCs to functions directly from the ELF's symbol table.
-    let symbols = executor::elf::SymbolTable::parse(&guest_elf_bytes);
-    print_pc_histogram(title, &symbols, pc_hist, total_cycles, exec_time);
-}
-
-/// Diagnostic: PC histogram of the recursion guest with a **single** FRI query
-/// at blowup=2 — the cheapest verifier run, dominated by fixed setup cost
-/// (decode, allocator, postcard) rather than per-query FRI/Merkle work.
-#[test]
-#[ignore = "diagnostic: ~8 minutes; prints PC histogram of the verifier-in-VM"]
-fn test_recursion_pc_histogram_1query() {
-    run_pc_histogram(
-        "RECURSION GUEST PC HISTOGRAM (blowup=2, 1 query)",
-        "recursion",
-        500,
-        MIN_PROOF_OPTIONS,
-    );
-}
-
-/// Diagnostic: PC histogram of the recursion guest at **128-bit security**
-/// (blowup=8, FRI query count derived by the Johnson Bound Regime — tens of
-/// queries). Compared against the single-query runs, weight shifts toward the
-/// verifier's per-query FRI-layer / Merkle-opening and field arithmetic.
-#[test]
-#[ignore = "diagnostic: heavy; PC histogram of the multi-query verifier-in-VM"]
-fn test_recursion_pc_histogram_multiquery() {
-    let inner_proof_options =
-        crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid");
-    run_pc_histogram(
-        &format!(
-            "RECURSION GUEST PC HISTOGRAM (blowup=8, {} queries, 128-bit)",
-            inner_proof_options.fri_number_of_queries
-        ),
-        "recursion",
-        500,
-        inner_proof_options,
-    );
-}
-
 /// Diagnostic: build a **sampled** call-stack histogram of the recursion guest.
 ///
 /// Like `test_recursion_pc_histogram` but groups by full call stack (not PC).
@@ -947,236 +977,31 @@ fn test_host_verify_step_timings() {
     eprintln!("[host-verify] verified OK");
 }
 
-/// Diagnostic: cycle count for the **deserialize-only** counterpart of the
-/// recursion guest. Same input layout
-/// (`(VmProof, Vec<u8>, ProofOptions)`) and same proof, but
-/// the guest just postcard-decodes the blob and halts — it never calls
-/// `verify_with_options`.
-///
-/// The cycle delta between this and `test_recursion_cycle_count` is the
-/// actual cost of the STARK verifier inside the VM. Historically (40.5 B-cycle
-/// recursion guest) postcard decode was ~15.6 M cycles — negligible. Now that
-/// the recursion guest is ~67 M cycles, the same absolute cost would be ~23%
-/// of total; this test re-measures it.
-#[test]
-#[ignore = "diagnostic: runs the deserialize-only guest, prints cycle count"]
-fn test_deserialize_only_cycle_count() {
-    let (deser_elf_bytes, program, mut executor) =
-        setup_guest_run("deser-only", "deserialize-only", &MIN_PROOF_OPTIONS);
-
-    eprintln!(
-        "[deser-only] ELF: {} bytes, entry_point=0x{:x}",
-        deser_elf_bytes.len(),
-        program.entry_point,
-    );
-    assert_ne!(
-        program.entry_point, 0,
-        "deserialize-only ELF has entry_point=0 — build artifact is malformed"
-    );
+// Control guest: decodes the blob and halts. Its cycle count subtracted from
+// the matching recursion run isolates the in-VM verifier cost.
 
-    eprintln!("[deser-only] executing deserialize-only guest (streaming) ...");
-    let (total_cycles, exec_time) = drive_executor(
-        &mut executor,
-        |_log| ControlFlow::Continue(()),
-        log_progress("deser-only", 50),
-    );
-    let cycle_count = total_cycles;
-
-    eprintln!();
-    eprintln!("============================================================");
-    eprintln!("  DESERIALIZE-ONLY GUEST EXECUTION SUMMARY");
-    eprintln!("============================================================");
-    eprintln!("  Cycle count           : {cycle_count}");
-    eprintln!("  Executor wall time    : {exec_time:?}");
-    eprintln!();
-    eprintln!("  Compare against test_recursion_cycle_count (~40.5B cycles");
-    eprintln!("  with the same proof). Delta = verifier-in-VM cost.");
-    eprintln!("============================================================");
+#[test]
+#[ignore = "diagnostic: fast; deserialize-only guest cycle count (1 query)"]
+fn test_deserialize_only_cycles_1query() {
+    run_profile("deserialize-only", 50, MIN_PROOF_OPTIONS, false);
 }
 
-/// Diagnostic: PC histogram for the **deserialize-only** guest.
-///
-/// Sibling of `test_recursion_pc_histogram`, but targeting the
-/// deserialize-only control guest so we can locate the hot kernel inside the
-/// 15.7 M-cycle postcard decode itself. Every cycle goes through the
-/// histogram (no sampling), so attribution is exact — the previous sampled
-/// flamegraph at 1:1000 had broken stack reconstruction on skipped
-/// CALL/RETURNs, which made it unreliable for a workload this small.
-///
-/// Each top PC is resolved to its enclosing function via the in-house
-/// `executor::elf::SymbolTable`, reading the guest ELF's symbol table directly
-/// (no external tool, no DWARF dependency).
 #[test]
-#[ignore = "diagnostic: ~1 min; PC histogram for the deserialize-only guest"]
-fn test_deserialize_only_pc_histogram() {
-    // Same runner as the recursion PC histograms, pointed at the deserialize-only
-    // control guest. Smaller workload (~16 M cycles, far fewer chunks), so use a
-    // tighter progress stride to still get periodic readouts.
-    run_pc_histogram(
-        "DESERIALIZE-ONLY GUEST PC HISTOGRAM",
-        "deserialize-only",
-        50,
-        MIN_PROOF_OPTIONS,
-    );
+#[ignore = "diagnostic: fast; deserialize-only guest cycle count (multi-query)"]
+fn test_deserialize_only_cycles_multiquery() {
+    run_profile("deserialize-only", 50, blowup8(), false);
 }
 
-/// Diagnostic: bucket the recursion guest's cycles by which verifier step
-/// is currently executing.
-///
-/// The verifier's hot path is `verify_rounds_2_to_4`, which calls four
-/// sub-routines in a fixed order:
-///   1. `replay_rounds_after_round_1`               (recover challenges)
-///   2. `step_2_verify_claimed_composition_polynomial`
-///   3. `step_3_verify_fri`
-///   4. `step_4_verify_trace_and_composition_openings`
-///
-/// We resolve each sub-routine's entry PC from the recursion ELF's symbol
-/// table, then run a monotonic state machine over the execution stream:
-/// the active bucket only advances 0 → 1 → 2 → 3 → 4 (never backwards),
-/// so cycles inside a step's callees stay attributed to that step.
-///
-/// Bucket 0 ("setup") captures everything before step 1 is entered — the
-/// allocator init, postcard decode, and `VmAirs::new` (which contains the
-/// expensive preprocessed-commitment FFTs).
-///
-/// Streams chunks via `Executor::resume()` so memory stays bounded.
 #[test]
-#[ignore = "diagnostic: ~13 min; buckets the 40B cycles by verifier step"]
-fn test_recursion_step_breakdown() {
-    use executor::elf::SymbolTable;
-
-    let (recursion_elf_bytes, _program, mut executor) =
-        setup_guest_run("step-bkd", "recursion", &MIN_PROOF_OPTIONS);
-
-    // Build a per-step "advance bucket to N" lookup. The verifier's step
-    // functions get inlined by LLVM in release mode, so we can't rely on
-    // matching their entry PCs directly. Instead we anchor on closures the
-    // compiler emits *inside* each step's body — iterator combinators like
-    // `.fold(|...|)` keep the step's method name as a substring in their
-    // mangled symbol. Any PC that resolves to a symbol containing step N's
-    // keyword advances the bucket to N (monotonically).
-    //
-    // If step N has no matching symbol at all (e.g. step 4 is fully inlined
-    // with no closure children of its own), its cycles get attributed to the
-    // previous bucket. We report that explicitly in the summary.
-    let symbols = SymbolTable::parse(&recursion_elf_bytes);
-    assert!(
-        !symbols.is_empty(),
-        "recursion ELF has no symbol table — was it stripped?"
-    );
-
-    let step_keywords = [
-        "replay_rounds_after_round_1",
-        "step_2_verify_claimed_composition_polynomial",
-        "step_3_verify_fri",
-        "step_4_verify_trace_and_composition_openings",
-    ];
-    let step_found: [bool; 4] = std::array::from_fn(|i| {
-        symbols
-            .functions()
-            .iter()
-            .any(|f| f.name.contains(step_keywords[i]))
-    });
-    for (i, found) in step_found.iter().enumerate() {
-        let n_matches = symbols
-            .functions()
-            .iter()
-            .filter(|f| f.name.contains(step_keywords[i]))
-            .count();
-        eprintln!(
-            "[step-bkd] step {}: keyword={:?} -> {} symbol(s) {}",
-            i + 1,
-            step_keywords[i],
-            n_matches,
-            if *found {
-                ""
-            } else {
-                "(fully inlined; will merge into the previous bucket)"
-            }
-        );
-    }
-
-    // Monotonic state machine: 0=setup, 1..=4=inside step N (or its callees /
-    // inlined-step-N-cycles attributed here because step N+1 is missing).
-    // `bucket` lives in a Cell so the per-log closure can advance it while the
-    // progress closure reads it for its live readout.
-    let bucket = std::cell::Cell::new(0u8);
-    let mut buckets = [0u64; 5];
-
-    eprintln!("[step-bkd] executing recursion guest (streaming) ...");
-
-    // Cache the last symbol-table hit so we only do a binary search on
-    // function transitions, not on every cycle. Functions are typically
-    // long-running (>>1 instruction), so this cache hits ~all of the time.
-    let mut last_range: Option<(u64, u64)> = None;
-    let mut last_advance: u8 = 0;
-
-    let (total_cycles, exec_time) = drive_executor(
-        &mut executor,
-        |log| {
-            let pc = log.current_pc;
-            let in_cached = matches!(last_range, Some((s, e)) if pc >= s && pc < e);
-            if !in_cached {
-                // Slow path: refresh the cache from the symbol table.
-                if let Some(sym) = symbols.lookup(pc) {
-                    // SymbolTable accepts size=0 symbols as "any address >="; for
-                    // those we'd need the next symbol's start for a real upper
-                    // bound. Cheapest workaround: set a tiny range so we re-resolve
-                    // soon enough that wrong attribution is bounded.
-                    let end = sym.address + sym.size.max(1);
-                    last_range = Some((sym.address, end));
-                    last_advance = 0;
-                    for (i, kw) in step_keywords.iter().enumerate() {
-                        if sym.name.contains(kw) {
-                            last_advance = (i + 1) as u8;
-                        }
-                    }
-                } else {
-                    last_range = None;
-                    last_advance = 0;
-                }
-            }
-            if bucket.get() < last_advance {
-                bucket.set(last_advance);
-            }
-            buckets[bucket.get() as usize] += 1;
-            ControlFlow::Continue(())
-        },
-        |chunks, cycles, elapsed| {
-            if chunks.is_multiple_of(500) {
-                eprintln!(
-                    "[step-bkd]   ... {chunks} chunks, {cycles} cycles, bucket={}, {elapsed:?}",
-                    bucket.get()
-                );
-            }
-        },
-    );
-
-    let labels = [
-        "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)",
-        "1. step 1: replay_rounds_after_round_1",
-        "2. step 2: verify_claimed_composition_polynomial",
-        "3. step 3: verify_fri",
-        "4. step 4: verify_trace_and_composition_openings (+ wrap-up)",
-    ];
+#[ignore = "diagnostic: ~1 min; deserialize-only guest histogram (1 query)"]
+fn test_deserialize_only_profile_1query() {
+    run_profile("deserialize-only", 50, MIN_PROOF_OPTIONS, true);
+}
 
-    eprintln!();
-    eprintln!("============================================================");
-    eprintln!("  RECURSION GUEST PER-STEP CYCLE BREAKDOWN");
-    eprintln!("============================================================");
-    eprintln!("  Total cycles : {total_cycles}");
-    eprintln!("  Exec time    : {exec_time:?}");
-    eprintln!();
-    eprintln!("  {:<60}  {:>14}  {:>7}", "bucket", "cycles", "%");
-    for (label, cycles) in labels.iter().zip(buckets.iter()) {
-        let pct = if total_cycles > 0 {
-            100.0 * (*cycles as f64) / (total_cycles as f64)
-        } else {
-            0.0
-        };
-        eprintln!("  {:<60}  {:>14}  {:>6.2}%", label, cycles, pct);
-    }
-    eprintln!("============================================================");
+#[test]
+#[ignore = "diagnostic: deserialize-only guest histogram (multi-query)"]
+fn test_deserialize_only_profile_multiquery() {
+    run_profile("deserialize-only", 50, blowup8(), true);
 }
 
 /// Inner program: fibonacci(10).

From 1b143b393814c01dbb3040c32d6fe8e8891656c4 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 17:29:33 -0300
Subject: [PATCH 06/16] build: enable the deserialize-only recursion guest

Add deserialize-only to RECURSION_GUESTS and migrate the guest to the recursion
guest's std shape (lambda_vm_syscalls + build-std std), since the old no_std
panic handler collided with std. Add getrandom_backend="custom" to its cargo
config (transitive getrandom 0.3 needs it) and track its Cargo.lock. The
deserialize-only control guest now builds and its profile tests run.
---
 Makefile                                      |    2 +-
 .../deserialize-only/.cargo/config.toml       |    1 +
 bench_vs/lambda/deserialize-only/Cargo.lock   | 1199 +++++++++++++++++
 bench_vs/lambda/deserialize-only/Cargo.toml   |    6 +-
 bench_vs/lambda/deserialize-only/src/main.rs  |   99 +-
 5 files changed, 1222 insertions(+), 85 deletions(-)
 create mode 100644 bench_vs/lambda/deserialize-only/Cargo.lock

diff --git a/Makefile b/Makefile
index 30e3029da..60bb8a0c5 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ BENCH_ARTIFACTS := $(addprefix $(BENCH_ARTIFACTS_DIR)/, $(addsuffix .elf, $(BENC
 # rather than executor/programs/. The recursion guest is the in-VM STARK verifier.
 RECURSION_GUESTS_DIR=./bench_vs/lambda
 RECURSION_ARTIFACTS_DIR=./executor/program_artifacts/recursion
-RECURSION_GUESTS := empty fibonacci recursion
+RECURSION_GUESTS := empty fibonacci recursion deserialize-only
 RECURSION_ARTIFACTS := $(addprefix $(RECURSION_ARTIFACTS_DIR)/, $(addsuffix .elf, $(RECURSION_GUESTS)))
 
 # Override with: make ... SYSROOT_DIR=$HOME/.lambda-vm-sysroot
diff --git a/bench_vs/lambda/deserialize-only/.cargo/config.toml b/bench_vs/lambda/deserialize-only/.cargo/config.toml
index be730c3ec..f5ea686ff 100644
--- a/bench_vs/lambda/deserialize-only/.cargo/config.toml
+++ b/bench_vs/lambda/deserialize-only/.cargo/config.toml
@@ -2,5 +2,6 @@
 rustflags = [
   "-C", "link-arg=-e",
   "-C", "link-arg=main",
+  "--cfg", "getrandom_backend=\"custom\"",
   "-C", "passes=lower-atomic"
 ]
diff --git a/bench_vs/lambda/deserialize-only/Cargo.lock b/bench_vs/lambda/deserialize-only/Cargo.lock
new file mode 100644
index 000000000..9433fadb3
--- /dev/null
+++ b/bench_vs/lambda/deserialize-only/Cargo.lock
@@ -0,0 +1,1199 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 4
+
+[[package]]
+name = "atomic-polyfill"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4"
+dependencies = [
+ "critical-section",
+]
+
+[[package]]
+name = "autocfg"
+version = "1.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53"
+
+[[package]]
+name = "base16ct"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf"
+
+[[package]]
+name = "base64"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
+
+[[package]]
+name = "block-buffer"
+version = "0.10.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
+dependencies = [
+ "generic-array",
+]
+
+[[package]]
+name = "bumpalo"
+version = "3.20.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649"
+
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
+
+[[package]]
+name = "cobs"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1"
+dependencies = [
+ "thiserror 2.0.18",
+]
+
+[[package]]
+name = "const-default"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b396d1f76d455557e1218ec8066ae14bba60b4b36ecd55577ba979f5db7ecaa"
+
+[[package]]
+name = "const-oid"
+version = "0.9.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
+
+[[package]]
+name = "core-foundation-sys"
+version = "0.8.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
+
+[[package]]
+name = "cpufeatures"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
+dependencies = [
+ "libc",
+]
+
+[[package]]
+name = "critical-section"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b"
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crypto"
+version = "0.1.0"
+dependencies = [
+ "digest",
+ "math",
+ "rand 0.8.6",
+ "rand_chacha 0.3.1",
+ "serde",
+ "sha3",
+]
+
+[[package]]
+name = "crypto-bigint"
+version = "0.5.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76"
+dependencies = [
+ "generic-array",
+ "rand_core 0.6.4",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "crypto-common"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
+dependencies = [
+ "generic-array",
+ "typenum",
+]
+
+[[package]]
+name = "der"
+version = "0.7.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb"
+dependencies = [
+ "const-oid",
+ "zeroize",
+]
+
+[[package]]
+name = "deserialize-only-bench"
+version = "0.1.0"
+dependencies = [
+ "lambda-vm-prover",
+ "lambda-vm-syscalls",
+ "postcard",
+]
+
+[[package]]
+name = "digest"
+version = "0.10.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
+dependencies = [
+ "block-buffer",
+ "crypto-common",
+]
+
+[[package]]
+name = "ecsm"
+version = "0.1.0"
+dependencies = [
+ "k256",
+ "num-bigint",
+ "num-traits",
+]
+
+[[package]]
+name = "either"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e"
+
+[[package]]
+name = "elliptic-curve"
+version = "0.13.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47"
+dependencies = [
+ "base16ct",
+ "crypto-bigint",
+ "ff",
+ "generic-array",
+ "group",
+ "rand_core 0.6.4",
+ "sec1",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "embedded-alloc"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f2de9133f68db0d4627ad69db767726c99ff8585272716708227008d3f1bddd"
+dependencies = [
+ "const-default",
+ "critical-section",
+ "linked_list_allocator",
+ "rlsf",
+]
+
+[[package]]
+name = "embedded-hal"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "361a90feb7004eca4019fb28352a9465666b24f840f5c3cddf0ff13920590b89"
+
+[[package]]
+name = "embedded-io"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced"
+
+[[package]]
+name = "embedded-io"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d"
+
+[[package]]
+name = "executor"
+version = "0.1.0"
+dependencies = [
+ "ecsm",
+ "rustc-demangle",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "ff"
+version = "0.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393"
+dependencies = [
+ "rand_core 0.6.4",
+ "subtle",
+]
+
+[[package]]
+name = "futures-core"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d"
+
+[[package]]
+name = "futures-task"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393"
+
+[[package]]
+name = "futures-util"
+version = "0.3.32"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "pin-project-lite",
+ "slab",
+]
+
+[[package]]
+name = "generic-array"
+version = "0.14.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2"
+dependencies = [
+ "typenum",
+ "version_check",
+ "zeroize",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0"
+dependencies = [
+ "cfg-if",
+ "js-sys",
+ "libc",
+ "wasi",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "getrandom"
+version = "0.3.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasip2",
+]
+
+[[package]]
+name = "group"
+version = "0.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63"
+dependencies = [
+ "ff",
+ "rand_core 0.6.4",
+ "subtle",
+]
+
+[[package]]
+name = "half"
+version = "1.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b43ede17f21864e81be2fa654110bf1e793774238d86ef8555c37e6519c0403"
+
+[[package]]
+name = "hash32"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67"
+dependencies = [
+ "byteorder",
+]
+
+[[package]]
+name = "heapless"
+version = "0.7.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f"
+dependencies = [
+ "atomic-polyfill",
+ "hash32",
+ "rustc_version",
+ "serde",
+ "spin",
+ "stable_deref_trait",
+]
+
+[[package]]
+name = "itertools"
+version = "0.11.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682"
+
+[[package]]
+name = "js-sys"
+version = "0.3.103"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "53b44bfcdb3f8d5837a46dae1ca9660a837176eee74a28b229bc626816589102"
+dependencies = [
+ "cfg-if",
+ "futures-util",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "k256"
+version = "0.13.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f6e3919bbaa2945715f0bb6d3934a173d1e9a59ac23767fbaaef277265a7411b"
+dependencies = [
+ "cfg-if",
+ "elliptic-curve",
+]
+
+[[package]]
+name = "keccak"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653"
+dependencies = [
+ "cpufeatures",
+]
+
+[[package]]
+name = "lambda-vm-prover"
+version = "0.1.0"
+dependencies = [
+ "crypto",
+ "ecsm",
+ "executor",
+ "log",
+ "math",
+ "serde",
+ "sha3",
+ "stark",
+ "sysinfo",
+]
+
+[[package]]
+name = "lambda-vm-syscalls"
+version = "0.1.0"
+dependencies = [
+ "embedded-alloc",
+ "getrandom 0.2.17",
+ "getrandom 0.3.4",
+ "lazy_static",
+ "rand 0.9.4",
+ "riscv",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
+[[package]]
+name = "libc"
+version = "0.2.186"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
+
+[[package]]
+name = "linked_list_allocator"
+version = "0.10.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b23ac50abb8261cb38c6e2a7192d3302e0836dac1628f6a93b82b4fad185897"
+
+[[package]]
+name = "lock_api"
+version = "0.4.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965"
+dependencies = [
+ "scopeguard",
+]
+
+[[package]]
+name = "log"
+version = "0.4.33"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0ceec5bc11778974d1bcb055b18002eba7f4b3518b6a0081b3af5f21666da9ad"
+
+[[package]]
+name = "math"
+version = "0.1.0"
+dependencies = [
+ "getrandom 0.2.17",
+ "num-bigint",
+ "num-traits",
+ "rand 0.8.6",
+ "rayon",
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "memchr"
+version = "2.8.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4"
+
+[[package]]
+name = "ntapi"
+version = "0.4.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae"
+dependencies = [
+ "winapi",
+]
+
+[[package]]
+name = "num-bigint"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
+dependencies = [
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-integer"
+version = "0.1.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50"
+
+[[package]]
+name = "paste"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
+
+[[package]]
+name = "pin-project-lite"
+version = "0.2.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
+
+[[package]]
+name = "postcard"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24"
+dependencies = [
+ "cobs",
+ "embedded-io 0.4.0",
+ "embedded-io 0.6.1",
+ "heapless",
+ "serde",
+]
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
+dependencies = [
+ "zerocopy",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.106"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.46"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dfbc457d0c7a0759a614551b11a6409e5951f6c7537be1f1b7682b9ae9230368"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
+[[package]]
+name = "rand"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a"
+dependencies = [
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.6.4",
+]
+
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.5",
+]
+
+[[package]]
+name = "rand_core"
+version = "0.6.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
+
+[[package]]
+name = "rand_core"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c"
+dependencies = [
+ "getrandom 0.3.4",
+]
+
+[[package]]
+name = "rayon"
+version = "1.12.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.13.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "riscv"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b05cfa3f7b30c84536a9025150d44d26b8e1cc20ddf436448d74cd9591eefb25"
+dependencies = [
+ "critical-section",
+ "embedded-hal",
+ "paste",
+ "riscv-macros",
+ "riscv-pac",
+]
+
+[[package]]
+name = "riscv-macros"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d323d13972c1b104aa036bc692cd08b822c8bbf23d79a27c526095856499799"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+]
+
+[[package]]
+name = "riscv-pac"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8188909339ccc0c68cfb5a04648313f09621e8b87dc03095454f1a11f6c5d436"
+
+[[package]]
+name = "rlsf"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1646a59a9734b8b7a0ac51689388a60fe1625d4b956348e9de07591a1478457a"
+dependencies = [
+ "cfg-if",
+ "const-default",
+ "libc",
+ "rustversion",
+ "svgbobdoc",
+]
+
+[[package]]
+name = "rustc-demangle"
+version = "0.1.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d"
+
+[[package]]
+name = "rustc_version"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
+dependencies = [
+ "semver",
+]
+
+[[package]]
+name = "rustversion"
+version = "1.0.22"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d"
+
+[[package]]
+name = "ryu"
+version = "1.0.23"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f"
+
+[[package]]
+name = "scopeguard"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
+
+[[package]]
+name = "sec1"
+version = "0.7.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc"
+dependencies = [
+ "base16ct",
+ "der",
+ "generic-array",
+ "subtle",
+ "zeroize",
+]
+
+[[package]]
+name = "semver"
+version = "1.0.28"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8a7852d02fc848982e0c167ef163aaff9cd91dc640ba85e263cb1ce46fae51cd"
+
+[[package]]
+name = "serde"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_cbor"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2bef2ebfde456fb76bbcf9f59315333decc4fda0b2b44b420243c11e0f5ec1f5"
+dependencies = [
+ "half",
+ "serde",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.143"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "sha3"
+version = "0.10.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77fd7028345d415a4034cf8777cd4f8ab1851274233b45f84e3d955502d93874"
+dependencies = [
+ "digest",
+ "keccak",
+]
+
+[[package]]
+name = "slab"
+version = "0.4.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5"
+
+[[package]]
+name = "spin"
+version = "0.9.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
+dependencies = [
+ "lock_api",
+]
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
+
+[[package]]
+name = "stark"
+version = "0.1.0"
+dependencies = [
+ "crypto",
+ "itertools",
+ "log",
+ "math",
+ "serde",
+ "serde_cbor",
+ "sha3",
+ "thiserror 1.0.69",
+]
+
+[[package]]
+name = "subtle"
+version = "2.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+
+[[package]]
+name = "svgbobdoc"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2c04b93fc15d79b39c63218f15e3fdffaa4c227830686e3b7c5f41244eb3e50"
+dependencies = [
+ "base64",
+ "proc-macro2",
+ "quote",
+ "syn 1.0.109",
+ "unicode-width",
+]
+
+[[package]]
+name = "syn"
+version = "1.0.109"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "syn"
+version = "2.0.118"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "sysinfo"
+version = "0.31.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "355dbe4f8799b304b05e1b0f05fc59b2a18d36645cf169607da45bde2f69a1be"
+dependencies = [
+ "core-foundation-sys",
+ "libc",
+ "memchr",
+ "ntapi",
+ "windows",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl 1.0.69",
+]
+
+[[package]]
+name = "thiserror"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4"
+dependencies = [
+ "thiserror-impl 2.0.18",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "2.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+]
+
+[[package]]
+name = "typenum"
+version = "1.20.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75"
+
+[[package]]
+name = "unicode-width"
+version = "0.1.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
+
+[[package]]
+name = "version_check"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
+
+[[package]]
+name = "wasi"
+version = "0.11.1+wasi-snapshot-preview1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b"
+
+[[package]]
+name = "wasip2"
+version = "1.0.4+wasi-0.2.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487"
+dependencies = [
+ "wit-bindgen",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.126"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b067c0c11094aef6b7a801c1e34a26affafdf3d051dba08456b868789aaf9a4"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.126"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "167ce5e579f6bcf889c4f7175a8a5a585de84e8ff93976ce393efa5f2837aab1"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.126"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f3997c7839262f4ef12cf90b818d6340c18e80f263f1a94bf157d0ec4420380e"
+dependencies = [
+ "bumpalo",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.126"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc1b4cb0cc549fcf58d7dfc081778139b3d283a081644e833e84682ad71cea24"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143"
+dependencies = [
+ "windows-core",
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-core"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d"
+dependencies = [
+ "windows-implement",
+ "windows-interface",
+ "windows-result",
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-implement"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+]
+
+[[package]]
+name = "windows-interface"
+version = "0.57.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+]
+
+[[package]]
+name = "windows-result"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "wit-bindgen"
+version = "0.57.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e"
+
+[[package]]
+name = "zerocopy"
+version = "0.8.52"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f"
+dependencies = [
+ "zerocopy-derive",
+]
+
+[[package]]
+name = "zerocopy-derive"
+version = "0.8.52"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.118",
+]
+
+[[package]]
+name = "zeroize"
+version = "1.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e"
diff --git a/bench_vs/lambda/deserialize-only/Cargo.toml b/bench_vs/lambda/deserialize-only/Cargo.toml
index b4a4616f4..fac6a7628 100644
--- a/bench_vs/lambda/deserialize-only/Cargo.toml
+++ b/bench_vs/lambda/deserialize-only/Cargo.toml
@@ -7,7 +7,5 @@ edition = "2024"
 
 [dependencies]
 lambda-vm-prover = { path = "../../../prover", default-features = false }
-embedded-alloc = "0.6"
-riscv = { version = "0.15", features = ["critical-section-single-hart"] }
-serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] }
-postcard = { version = "1.0", default-features = false, features = ["alloc"] }
+lambda-vm-syscalls = { path = "../../../syscalls" }
+postcard = { version = "1.0", features = ["alloc"] }
diff --git a/bench_vs/lambda/deserialize-only/src/main.rs b/bench_vs/lambda/deserialize-only/src/main.rs
index 8627776a1..7ba9a9d93 100644
--- a/bench_vs/lambda/deserialize-only/src/main.rs
+++ b/bench_vs/lambda/deserialize-only/src/main.rs
@@ -1,93 +1,32 @@
 //! Deserialize-only counterpart to the recursion guest.
 //!
 //! Reads the same private-input blob as `recursion-bench`, postcard-decodes
-//! `(VmProof, Vec<u8>, ProofOptions)`, then commits success
-//! and halts — without ever calling `verify_with_options`. The cycle delta
-//! between this guest and `recursion-bench` is the actual cost of the STARK
-//! verifier inside the VM (everything else being equal).
+//! `(VmProof, Vec<u8>, ProofOptions)`, then commits and halts — without ever
+//! calling `verify_with_options`. The cycle delta between this guest and
+//! `recursion-bench` is the actual cost of the STARK verifier inside the VM.
+//!
+//! Mirrors the recursion guest's std setup (build-std + `lambda_vm_syscalls`)
+//! so the two differ only in the verify call.
 
-#![no_std]
 #![no_main]
 
-extern crate alloc;
-
-use alloc::vec::Vec;
-use core::arch::asm;
-use core::panic::PanicInfo;
-
-use embedded_alloc::TlsfHeap as Heap;
 use lambda_vm_prover::{ProofOptions, VmProof};
-// Required to pull in the riscv crate's critical-section implementation.
-use riscv as _;
-
-const PRIVATE_INPUT_START: usize = 0xFF000000;
-const SYSCALL_COMMIT: u64 = 64;
-const SYSCALL_HALT: u64 = 93;
-const MAX_MEMORY_SIZE: usize = 0xC000_0000;
-
-#[global_allocator]
-static HEAP: Heap = Heap::empty();
-
-#[panic_handler]
-fn panic(_info: &PanicInfo) -> ! {
-    loop {}
-}
-
-fn init_allocator() {
-    unsafe extern "C" {
-        static _end: u8;
-    }
-    let heap_pos = (&raw const _end) as usize;
-    unsafe { HEAP.init(heap_pos, MAX_MEMORY_SIZE - heap_pos) }
-}
 
-fn read_private_input() -> &'static [u8] {
-    let len = unsafe { core::ptr::read_volatile(PRIVATE_INPUT_START as *const u32) } as usize;
-    let data = (PRIVATE_INPUT_START + 4) as *const u8;
-    unsafe { core::slice::from_raw_parts(data, len) }
-}
-
-fn commit(bytes: &[u8]) {
-    unsafe {
-        asm!(
-            "ecall",
-            in("a0") 1u64,
-            in("a1") bytes.as_ptr(),
-            in("a2") bytes.len(),
-            in("a7") SYSCALL_COMMIT,
-        );
-    }
-}
-
-fn halt() -> ! {
-    unsafe {
-        asm!(
-            "ecall",
-            in("a0") 0u64,
-            in("a7") SYSCALL_HALT,
-            options(noreturn),
-        );
-    }
-}
-
-#[unsafe(no_mangle)]
+#[unsafe(export_name = "main")]
 pub fn main() -> ! {
-    init_allocator();
+    lambda_vm_syscalls::allocator::init_allocator();
 
-    let blob = read_private_input();
-    let decoded: (VmProof, Vec<u8>, ProofOptions) =
-        postcard::from_bytes(blob).expect("failed to deserialize");
+    const PANIC_MSG: &str = "PANICKED";
+    std::panic::set_hook(Box::new(|_| unsafe {
+        lambda_vm_syscalls::syscalls::sys_panic(PANIC_MSG.as_ptr(), PANIC_MSG.len())
+    }));
 
-    // Force the commit byte to depend on the actually-decoded value. Without
-    // this, LLVM at -O3 was eliding the postcard decode entirely — the only
-    // sinks for `decoded` were `black_box(&decoded)` (which only forces the
-    // *reference* to materialize, not the pointee) and `Drop`, neither of
-    // which require the decoded bytes to be real. With the commit byte tied
-    // to a deep field of the decoded value, the decode has to run.
-    let proof_options_byte = decoded.2.blowup_factor;
-    let inner_elf_byte = *decoded.1.first().unwrap_or(&0);
-    let marker = proof_options_byte ^ inner_elf_byte;
+    let blob = lambda_vm_syscalls::syscalls::get_private_input();
+    let decoded: (VmProof, Vec<u8>, ProofOptions) =
+        postcard::from_bytes(&blob).expect("failed to deserialize recursion input");
 
-    commit(&[marker]);
-    halt()
+    // Tie the committed byte to the decoded value so LLVM can't elide the decode.
+    let marker = decoded.2.blowup_factor ^ *decoded.1.first().unwrap_or(&0);
+    lambda_vm_syscalls::syscalls::commit(&[marker]);
+    lambda_vm_syscalls::syscalls::sys_halt();
 }

From 35f4741270855e8b1b9533edc14f6417978e7e2b Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 18:11:58 -0300
Subject: [PATCH 07/16] build: point profile-recursion make targets at renamed
 tests

---
 Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index 60bb8a0c5..801845534 100644
--- a/Makefile
+++ b/Makefile
@@ -234,11 +234,11 @@ test-flamegraph:
 
 test-profile-recursion: test-profile-recursion-single test-profile-recursion-multi
 
-test-profile-recursion-single: compile-programs-rust
-	cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_1query -- --ignored --nocapture
+test-profile-recursion-single: compile-recursion-elfs
+	cargo test --package lambda-vm-prover --lib test_recursion_profile_1query -- --ignored --nocapture
 
-test-profile-recursion-multi: compile-programs-rust
-	cargo test --package lambda-vm-prover --lib test_recursion_pc_histogram_multiquery -- --ignored --nocapture
+test-profile-recursion-multi: compile-recursion-elfs
+	cargo test --package lambda-vm-prover --lib test_recursion_profile_multiquery -- --ignored --nocapture
 
 # Regenerate the committed ethrex block fixtures (see tooling/ethrex-fixtures).
 # Run after bumping the ethrex rev; README checksums are refreshed automatically.

From 89d46dcd2deddf15cc0a5dde6150aae249685c4f Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 18:20:16 -0300
Subject: [PATCH 08/16] docs: trim recursion smoke-test doc comments

---
 prover/src/tests/recursion_smoke_test.rs | 230 ++++++-----------------
 1 file changed, 53 insertions(+), 177 deletions(-)

diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index 5077610e7..3b3814adc 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -1,16 +1,7 @@
-//! End-to-end naive recursion pipeline smoke tests.
-//!
-//! Each test:
-//! 1. Proves an inner program on the host.
-//! 2. Serializes `(VmProof, inner_elf)` with postcard.
-//! 3. Hands that as private input to the recursion guest.
-//! 4. Either **proves** the recursion guest's execution (memory-bounded via
-//!    continuations) and verifies the outer proof (`OuterMode::Prove`), or
-//!    merely **executes** the guest in-VM and reads the committed marker
-//!    straight off the executor's memory (`OuterMode::ExecuteOnly`) — a cheaper
-//!    tier that skips the LDE/FRI that dominate the full pipeline.
-//!
-//! The guest ELFs are assumed built by `make compile-recursion-elfs`.
+//! End-to-end naive recursion pipeline smoke tests: prove an inner program,
+//! hand `(VmProof, elf, opts)` to the in-VM verifier guest, then either prove
+//! the guest's execution (`OuterMode::Prove`) or just execute it
+//! (`OuterMode::ExecuteOnly`). Guest ELFs come from `make compile-recursion-elfs`.
 
 use std::ops::ControlFlow;
 use std::path::PathBuf;
@@ -33,11 +24,8 @@ fn read_guest_elf(root: &std::path::Path, name: &str) -> Vec<u8> {
     })
 }
 
-/// Minimum-security FRI parameters: blowup=2, a single FRI query. Security is
-/// intentionally terrible — used by the capacity-probing test and every cheap
-/// diagnostic below, where the goal is the smallest possible inner proof, not
-/// a sound one. (`GoldilocksCubicProofOptions::with_blowup` derives a query
-/// count from a 128-bit target, far more than we want here.)
+/// Smallest possible inner proof (blowup=2, 1 query). Intentionally insecure —
+/// for the cheap diagnostics, not soundness.
 const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions =
     stark::proof::options::ProofOptions {
         blowup_factor: 2,
@@ -46,11 +34,8 @@ const MIN_PROOF_OPTIONS: stark::proof::options::ProofOptions =
         grinding_factor: 1,
     };
 
-/// Prove `inner_elf` (fed `inner_input`) under `opts`, then package
-/// `(proof, elf, opts)` into the postcard blob the recursion and
-/// deserialize-only guests consume as their private input. `tag` prefixes the
-/// progress lines. Returns the inner proof — callers that re-verify it on the
-/// host need it — next to the encoded blob.
+/// Prove `inner_elf` under `opts` and postcard-encode `(proof, elf, opts)` into
+/// the guest's private-input blob. Returns the proof and the blob.
 fn prove_inner_and_encode_blob(
     tag: &str,
     inner_elf: &[u8],
@@ -75,26 +60,17 @@ fn prove_inner_and_encode_blob(
     (inner_proof, blob)
 }
 
-/// How far to take the recursion guest after it has been handed the inner
-/// proof. The guest under test is the verifier either way — this only chooses
-/// whether we also prove the guest's own execution.
+/// Whether to also prove the guest's own execution after handing it the proof.
 #[derive(Clone, Copy, Debug)]
 enum OuterMode {
-    /// Execute the guest in-VM and read the committed marker straight off the
-    /// executor's memory. Streams logs via `Executor::resume()` and never
-    /// builds a `Traces`, so footprint stays bounded to the VM's touched
-    /// memory + instruction cache. Skips the LDE/FRI of the full pipeline entirely.
+    /// Execute in-VM, read the committed marker off memory; no LDE/FRI.
     ExecuteOnly,
-    /// Prove the guest's execution memory-bounded via continuations, then
-    /// verify the outer proof on the host. Peak RAM is a single epoch's proof.
+    /// Prove the execution (memory-bounded via continuations) and verify on host.
     Prove,
 }
 
-/// Execute the recursion guest in-VM on `blob` and return the bytes it
-/// committed (the success marker the in-VM verifier emits).
-///
-/// Streams execution via `Executor::resume()`. The committed marker is
-/// read directly off the executor's memory. This avoids OOMs.
+/// Execute the recursion guest in-VM on `blob` and return its committed bytes,
+/// read straight off the executor's memory after a streamed run.
 fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]) -> Vec<u8> {
     use executor::elf::Elf;
     use executor::vm::execution::Executor;
@@ -123,9 +99,8 @@ fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]
 /// Epoch size for the outer prove: 2^20 ≈ 1M cycles per epoch.
 const OUTER_EPOCH_SIZE_LOG2: u32 = 20;
 
-/// Prove the recursion guest's execution on `blob` memory-bounded via
-/// continuations and verify the bundle on the host, returning the bytes the
-/// guest committed.
+/// Prove the guest's execution via continuations, verify on host, return the
+/// committed bytes.
 fn prove_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]) -> Vec<u8> {
     let opts =
         crate::GoldilocksCubicProofOptions::with_blowup(2).expect("blowup=2 is always valid");
@@ -145,15 +120,9 @@ fn prove_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8])
     committed
 }
 
-/// Stream a guest's execution via `Executor::resume()`, calling `on_log` for
-/// every `Log` without ever buffering the full log stream (`Executor::run`
-/// would accumulate tens of millions of `Log`s and OOM even a 125 GB box).
-/// `on_log` returns `ControlFlow::Break(())` to stop the run early (e.g. once a
-/// cycle budget is hit); `Continue(())` to keep going. `on_progress(chunks,
-/// total_cycles, elapsed)` fires once per resumed chunk; callers throttle and
-/// format their own progress lines. Returns `(total_cycles, wall_time)` —
-/// `total_cycles` counts logs actually visited, so it is exact even when a run
-/// breaks mid-chunk.
+/// Stream a guest's execution via `Executor::resume()` without buffering the log
+/// stream. `on_log` returns `Break` to stop early; `on_progress` fires per chunk.
+/// Returns `(total_cycles, wall_time)`, exact even on an early break.
 fn drive_executor(
     executor: &mut executor::vm::execution::Executor,
     mut on_log: impl FnMut(&executor::vm::logs::Log) -> ControlFlow<()>,
@@ -180,12 +149,8 @@ fn drive_executor(
     (total_cycles, start.elapsed())
 }
 
-/// Shared preamble for every execute-only diagnostic below: build the standard
-/// recursion private-input blob (an `empty`-program inner proof produced under
-/// `opts`), load guest `guest_name`, and stand up an executor over it. Returns
-/// the guest's raw ELF bytes (callers that resolve PCs pass them to
-/// [`executor::elf::SymbolTable::parse`]), the loaded program, and the
-/// ready-to-drive executor.
+/// Shared preamble: build the blob (an `empty` inner proof under `opts`), load
+/// `guest_name`, and stand up an executor. Returns `(elf_bytes, program, executor)`.
 fn setup_guest_run(
     label: &str,
     guest_name: &str,
@@ -211,12 +176,7 @@ fn setup_guest_run(
     (guest_elf_bytes, program, executor)
 }
 
-/// A `drive_executor` progress callback that prints the throttled
-/// `[label]   ... N chunks, M cycles, T elapsed` line every `stride` chunks —
-/// the readout the counting diagnostics share. Tests that need extra live state
-/// (unique PC count, active step bucket) keep their own closure instead. Takes
-/// `impl Into<String>` so it works with both `&'static` tags and a run's
-/// dynamic `label`.
+/// A `drive_executor` progress callback printing one line every `stride` chunks.
 fn log_progress(label: impl Into<String>, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) {
     let label = label.into();
     move |chunks, cycles, elapsed| {
@@ -226,11 +186,8 @@ fn log_progress(label: impl Into<String>, stride: usize) -> impl FnMut(usize, u6
     }
 }
 
-/// Resolve a guest PC to its (demangled) enclosing function name using the
-/// ELF's own symbol table — the same data `executor::flamegraph` resolves
-/// against. `<unknown>` when no function symbol covers the PC (e.g. PLT stubs
-/// or a release build that dropped symbols). No file:line: the symbol table
-/// carries function ranges only, not DWARF line info.
+/// Demangled enclosing-function name for a PC via the ELF symbol table;
+/// `<unknown>` if none covers it. No file:line (symtab has no DWARF).
 fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String {
     symbols.lookup(pc).map_or_else(
         || "<unknown>".to_string(),
@@ -238,10 +195,8 @@ fn resolve_pc(symbols: &executor::elf::SymbolTable, pc: u64) -> String {
     )
 }
 
-/// Verifier sub-routines in execution order. LLVM inlines the step bodies, but
-/// closures inside each keep the method name in their mangled symbol, so
-/// `run_profile` advances the step bucket by substring-matching the enclosing
-/// symbol. A step with no matching symbol merges into the previous bucket.
+/// Verifier sub-routines in execution order; `run_profile` buckets cycles by
+/// substring-matching the enclosing symbol (an unmatched step merges into the prior bucket).
 const VERIFIER_STEP_KEYWORDS: [&str; 4] = [
     "replay_rounds_after_round_1",
     "step_2_verify_claimed_composition_polynomial",
@@ -249,15 +204,12 @@ const VERIFIER_STEP_KEYWORDS: [&str; 4] = [
     "step_4_verify_trace_and_composition_openings",
 ];
 
-/// `blowup=8` inner-proof options: the security-derived multi-query count (tens
-/// of queries, 128-bit) used by every `multiquery` profiling variant.
+/// `blowup=8` (128-bit, multi-query) options for the `multiquery` variants.
 fn blowup8() -> stark::proof::options::ProofOptions {
     crate::GoldilocksCubicProofOptions::with_blowup(8).expect("blowup=8 is always valid")
 }
 
-/// Fold the PC histogram by enclosing function and print the top-25 by cycles.
-/// Folded because an inlined kernel spreads across many PCs; no per-address
-/// table since a bare PC isn't actionable without file:line.
+/// Print the top-25 functions by cycles, folding the PC histogram by symbol.
 fn print_function_table(
     symbols: &executor::elf::SymbolTable,
     pc_hist: std::collections::HashMap<u64, u64>,
@@ -293,9 +245,7 @@ fn print_function_table(
     }
 }
 
-/// Print the monotonic per-verifier-step cycle bucketing. `buckets[0]` is
-/// pre-step-1 setup (alloc + postcard decode + `VmAirs::new`); `buckets[i]` is
-/// verifier step i (with a missing step's cycles merged into the previous one).
+/// Print the monotonic per-verifier-step cycle bucketing (`buckets[0]` = setup).
 fn print_step_breakdown(buckets: &[u64; 5], total_cycles: u64) {
     let labels = [
         "0. setup (alloc + postcard decode + VmAirs::new + pre-step-1)",
@@ -317,12 +267,9 @@ fn print_step_breakdown(buckets: &[u64; 5], total_cycles: u64) {
     }
 }
 
-/// Single-pass execute-only profiler. Always prints total cycles + wall time +
-/// a rough trace/LDE size estimate. With `detailed`, the same pass also builds
-/// the PC histogram and verifier-step bucketing and prints the top-25 functions
-/// and the per-step breakdown (the two always come together); `!detailed` does
-/// no per-log work, so it's just a fast cycle counter. `progress_stride`
-/// throttles the readout (recursion large, the deserialize-only control small).
+/// Single-pass execute-only profiler. Always prints total cycles + a rough
+/// trace/LDE estimate; with `detailed`, also the top-25 functions + per-step
+/// breakdown (one streamed pass). `!detailed` does no per-log work.
 fn run_profile(
     guest_name: &str,
     progress_stride: usize,
@@ -437,10 +384,8 @@ fn run_profile(
     eprintln!("============================================================");
 }
 
-/// Core pipeline: prove an inner program with the given options, hand the
-/// proof+ELF+options to the recursion guest, then take the guest to `mode`
-/// (execute-only or full prove) and assert it committed the `[1]` success
-/// marker — i.e. the in-VM verifier accepted the inner proof.
+/// Core pipeline: prove the inner program, run the guest to `mode`, assert it
+/// committed `[1]` (the in-VM verifier accepted the proof).
 fn run_recursion_pipeline_with_options(
     label: &str,
     inner_elf_bytes: &[u8],
@@ -487,8 +432,7 @@ fn run_recursion_pipeline_with_options(
     eprintln!("[{label}] guest committed [1]: in-VM verify accepted ✓");
 }
 
-/// Convenience wrapper using `blowup=8` for the inner proof — the default for
-/// the `empty` and `fibonacci` cases, chosen to keep outer-prove memory tractable.
+/// `run_recursion_pipeline_with_options` with `blowup=8` (the `empty`/`fibonacci` default).
 fn run_recursion_pipeline(
     label: &str,
     inner_elf_bytes: &[u8],
@@ -506,9 +450,8 @@ fn run_recursion_pipeline(
     );
 }
 
-/// Reproduce the recursion guest's EXACT path on the host — decode the postcard
-/// blob into `(VmProof, Vec<u8>, ProofOptions)` and call `verify_with_options`.
-/// Cheap regression guard.
+/// Decode the blob on the host and verify — a cheap guard on the encode/decode
+/// contract without running the VM.
 #[test]
 #[ignore = "needs prebuilt guest ELF (make compile-recursion-elfs)"]
 fn test_recursion_blob_decodes_and_verifies_on_host() {
@@ -541,8 +484,7 @@ fn test_recursion_blob_decodes_and_verifies_on_host() {
 
 // === Execute-only tier ========================================================
 
-/// Execute-only mirror of `test_recursion_prove_empty`: verify a `blowup=8`
-/// proof of the empty program in-VM.
+/// Execute-only: verify a `blowup=8` proof of the empty program in-VM.
 #[test]
 #[ignore = "slow: runs the in-VM STARK verifier (minutes on CI)"]
 fn test_recursion_execute_empty() {
@@ -556,8 +498,7 @@ fn test_recursion_execute_empty() {
     );
 }
 
-/// Execute-only mirror of `test_recursion_prove_1query`: smallest possible
-/// inner proof (blowup=2, 1 query) → least guest work.
+/// Execute-only: smallest inner proof (blowup=2, 1 query) → least guest work.
 #[test]
 #[ignore = "slow: runs the in-VM STARK verifier (minutes on CI)"]
 fn test_recursion_execute_1query() {
@@ -572,8 +513,7 @@ fn test_recursion_execute_1query() {
     );
 }
 
-/// Execute-only mirror of `test_recursion_prove`: verify a `blowup=8` proof of
-/// fibonacci(10) in-VM.
+/// Execute-only: verify a `blowup=8` proof of fibonacci(10) in-VM.
 #[test]
 #[ignore = "slow: runs the in-VM STARK verifier (minutes on CI)"]
 fn test_recursion_execute() {
@@ -593,8 +533,7 @@ fn test_recursion_execute() {
 
 // === Full-prove tier ==========================================================
 
-/// Inner program: empty (halt immediately). Useful for measuring the
-/// verifier's intrinsic recursion overhead.
+/// Inner program: empty — the verifier's intrinsic recursion overhead.
 #[test]
 #[ignore = "slow: memory-bounded continuation prove of the verifier-in-VM"]
 fn test_recursion_prove_empty() {
@@ -608,8 +547,7 @@ fn test_recursion_prove_empty() {
     );
 }
 
-/// Inner program: empty, but with the absolute-minimum FRI parameters
-/// (blowup=2, **fri_number_of_queries=1**). For quick profiling only.
+/// Inner program: empty, blowup=2/1-query. Quick profiling only.
 #[test]
 #[ignore = "slow: memory-bounded continuation prove of the verifier-in-VM"]
 fn test_recursion_prove_1query() {
@@ -625,18 +563,8 @@ fn test_recursion_prove_1query() {
     );
 }
 
-/// Diagnostic: build the inner proof and dump the recursion guest's private-input
-/// blob to `/tmp/recursion_input.bin` so the CLI's `execute --flamegraph` can
-/// consume it.
-///
-/// Usage after running this test:
-/// ```
-/// cargo run -p cli --release -- execute \
-///     bench_vs/lambda/recursion/target/riscv64im-lambda-vm-elf/release/recursion-bench \
-///     --private-input /tmp/recursion_input.bin \
-///     --flamegraph /tmp/recursion_folded.txt
-/// cat /tmp/recursion_folded.txt | inferno-flamegraph > /tmp/recursion_flamegraph.svg
-/// ```
+/// Dump the guest's private-input blob to `/tmp/recursion_input.bin` for the
+/// CLI's `execute --flamegraph`.
 #[test]
 #[ignore = "diagnostic: writes recursion private input to /tmp/recursion_input.bin"]
 fn test_dump_recursion_input() {
@@ -665,8 +593,7 @@ fn test_recursion_cycles_multiquery() {
     run_profile("recursion", 500, blowup8(), false);
 }
 
-/// Full profile (top-25 functions + per-step breakdown) of the 1-query run —
-/// the cheapest verifier run, dominated by fixed setup.
+/// Full profile (top-25 + per-step) of the 1-query run.
 #[test]
 #[ignore = "diagnostic: ~8 min; recursion guest histogram + steps (1 query)"]
 fn test_recursion_profile_1query() {
@@ -680,27 +607,8 @@ fn test_recursion_profile_multiquery() {
     run_profile("recursion", 500, blowup8(), true);
 }
 
-/// Diagnostic: count the distinct 4 KB memory pages the recursion guest
-/// touches when verifying a small inner proof.
-///
-/// We suspect the outer prover's 125 GB OOM wall is dominated by per-page
-/// PAGE-table overhead. The number of PAGE tables the prover would build
-/// equals the number of distinct 4 KB pages the executor touches — code,
-/// heap, private input, and stack. This test surfaces that count without
-/// running the prover.
-///
-/// Layout (per `executor::constants` + `bench_vs/lambda/recursion/src/main.rs`):
-/// - Code/static: whatever PT_LOAD segments the recursion ELF carries.
-/// - Heap: `_end .. 0xC000_0000` (`MAX_MEMORY_SIZE`); `TlsfHeap` scatters
-///   allocations across this region.
-/// - Private input: starts at `PRIVATE_INPUT_START_INDEX = 0xFF000000`.
-/// - Stack: top of address space (down from `STACK_TOP = 0xFFFFFFFFFFFFFFF0`).
-///
-/// Interpretation (rough):
-/// - <1,000 pages: PAGE-table overhead is not the bottleneck.
-/// - 10k-100k pages: TLSF heap fragmentation; design a tighter bump allocator
-///   and re-measure.
-/// - >100k pages: postcard decode dominates; consider streaming decode.
+/// Count the distinct 4 KB pages the guest touches (code/heap/input/stack) — a
+/// proxy for the prover's per-page PAGE-table overhead, without running the prover.
 #[test]
 #[ignore = "diagnostic: counts distinct 4 KB memory pages touched by the recursion guest"]
 fn test_recursion_page_count() {
@@ -803,39 +711,19 @@ fn test_recursion_page_count() {
     eprintln!("============================================================");
 }
 
-/// Diagnostic: build a **sampled** call-stack histogram of the recursion guest.
-///
-/// Like `test_recursion_pc_histogram` but groups by full call stack (not PC).
-/// To stay fast, only every `SAMPLE_RATE`-th log is recorded into the histogram.
-/// The call stack itself is updated on every log (skipping would corrupt it).
-///
-/// Output is written to `/tmp/recursion_folded_sampled.txt` in
-/// inferno-flamegraph "folded stacks" format. Pipe it through:
-///
-///     cat /tmp/recursion_folded_sampled.txt | inferno-flamegraph > svg.svg
-///
-/// Expect ~10-20 minutes for SAMPLE_RATE=100 on a 40B-cycle guest.
+/// Sampled call-stack flamegraph of the recursion guest, written to
+/// `/tmp/recursion_folded_sampled.txt` (inferno "folded stacks" format).
 #[test]
 #[ignore = "diagnostic: sampled flamegraph for the verifier-in-VM"]
 fn test_recursion_sampled_flamegraph() {
     use executor::flamegraph::FlamegraphGenerator;
     use std::io::BufWriter;
 
-    /// 1 in N logs is fed to `process_logs`, which both updates the call
-    /// stack and records a sample. At 1, every cycle goes through — the call
-    /// stack stays exactly in sync with execution so frame widths are
-    /// trustworthy, but the per-cycle cost (~57µs) limits how many cycles
-    /// we can cover within a wall-clock budget.
-    ///
-    /// At SAMPLE_RATE > 1, every CALL/RETURN that lands on a skipped cycle
-    /// silently desyncs the stack, producing the "stuck-in-visit_seq" effect
-    /// we saw at 1:1000. Use values > 1 only when stack accuracy is
-    /// expendable.
+    /// Record 1 in every N logs. Values > 1 desync the call stack whenever a CALL/RETURN
+    /// lands on a skipped log, so keep this at 1 unless stack accuracy is expendable.
     const SAMPLE_RATE: usize = 1;
 
-    /// Stop the executor early once we've covered this many cycles.
-    /// Set to 0 to run to completion (40B+ cycles, hours at SAMPLE_RATE=1).
-    /// At SAMPLE_RATE=1, ~57µs per cycle means 5M cycles ≈ 5 min wall time.
+    /// Stop after this many cycles (0 = run to completion).
     const CYCLE_BUDGET: u64 = 5_000_000;
 
     let (recursion_elf_bytes, program, mut executor) =
@@ -931,20 +819,8 @@ fn test_recursion_sampled_flamegraph() {
     eprintln!("============================================================");
 }
 
-/// Diagnostic: host-side per-step timings for the verifier.
-///
-/// Runs an inner prove (empty guest, blowup=2, 1 query) and then verifies it
-/// on the host. When built with `--features stark/instruments`, the verifier
-/// prints `Time spent: ...` for each of the four steps (replay challenges,
-/// composition polynomial, FRI, DEEP openings) plus the step-1-replay it
-/// does before step 2. Lets us see the host-side split in seconds, without
-/// running anything inside the VM.
-///
-/// Usage:
-/// ```
-/// cargo test --release -p lambda-vm-prover --features stark/instruments \
-///   --lib test_host_verify_step_timings -- --ignored --nocapture
-/// ```
+/// Host-side per-step verifier timings (build with `--features stark/instruments`
+/// for the `Time spent:` lines). No VM execution.
 #[test]
 #[ignore = "diagnostic: prints host-side verifier step timings"]
 fn test_host_verify_step_timings() {

From 53145fc50f48f8b82ff4c5ce675154f1d9d25749 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 18:23:07 -0300
Subject: [PATCH 09/16] refactor(prover): drop test_host_verify_step_timings

The smoke pipelines already host-verify the inner proof, so building with
--features stark/instruments surfaces the per-step timings; the dedicated test
was just that verify minus the guest run. Documented the flag in the module doc.
---
 prover/src/tests/recursion_smoke_test.rs | 38 +++---------------------
 1 file changed, 4 insertions(+), 34 deletions(-)

diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index 3b3814adc..67c6a5818 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -2,6 +2,10 @@
 //! hand `(VmProof, elf, opts)` to the in-VM verifier guest, then either prove
 //! the guest's execution (`OuterMode::Prove`) or just execute it
 //! (`OuterMode::ExecuteOnly`). Guest ELFs come from `make compile-recursion-elfs`.
+//!
+//! Every pipeline host-verifies the inner proof, so building with
+//! `--features stark/instruments` makes any of these tests print the verifier's
+//! per-step `Time spent:` timings.
 
 use std::ops::ControlFlow;
 use std::path::PathBuf;
@@ -819,40 +823,6 @@ fn test_recursion_sampled_flamegraph() {
     eprintln!("============================================================");
 }
 
-/// Host-side per-step verifier timings (build with `--features stark/instruments`
-/// for the `Time spent:` lines). No VM execution.
-#[test]
-#[ignore = "diagnostic: prints host-side verifier step timings"]
-fn test_host_verify_step_timings() {
-    let root = workspace_root();
-    let empty_path =
-        root.join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench");
-    let empty_elf_bytes = std::fs::read(&empty_path).expect("read empty-bench");
-
-    let inner_proof_options = MIN_PROOF_OPTIONS;
-
-    eprintln!("[host-verify] proving empty (blowup=2, fri_queries=1) ...");
-    let inner_proof = crate::prove_with_options_and_inputs(
-        &empty_elf_bytes,
-        &[],
-        &inner_proof_options,
-        &crate::MaxRowsConfig::default(),
-    )
-    .expect("inner prove should succeed");
-
-    eprintln!("[host-verify] verifying on host (with instruments) ...");
-    let ok = crate::verify_with_options(
-        &inner_proof,
-        &empty_elf_bytes,
-        &inner_proof_options,
-        None,
-        None,
-    )
-    .expect("verify errored");
-    assert!(ok, "proof must verify");
-    eprintln!("[host-verify] verified OK");
-}
-
 // Control guest: decodes the blob and halts. Its cycle count subtracted from
 // the matching recursion run isolates the in-VM verifier cost.
 

From da41a237cb2a41fa344a0437e7e4529e064ca317 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 18:29:58 -0300
Subject: [PATCH 10/16] Remove the unused SP1 verifier bench program

It was never wired into the bench harness or CI (run.sh uses sp1/fibonacci),
and its in-VM verifier-cost comparison is superseded by the recursion profile
tests in this PR.
---
 bench_vs/sp1/verifier/Cargo.toml          |  3 -
 bench_vs/sp1/verifier/program/Cargo.toml  | 10 ---
 bench_vs/sp1/verifier/program/src/main.rs | 34 ----------
 bench_vs/sp1/verifier/script/Cargo.toml   | 13 ----
 bench_vs/sp1/verifier/script/build.rs     |  5 --
 bench_vs/sp1/verifier/script/src/main.rs  | 83 -----------------------
 6 files changed, 148 deletions(-)
 delete mode 100644 bench_vs/sp1/verifier/Cargo.toml
 delete mode 100644 bench_vs/sp1/verifier/program/Cargo.toml
 delete mode 100644 bench_vs/sp1/verifier/program/src/main.rs
 delete mode 100644 bench_vs/sp1/verifier/script/Cargo.toml
 delete mode 100644 bench_vs/sp1/verifier/script/build.rs
 delete mode 100644 bench_vs/sp1/verifier/script/src/main.rs

diff --git a/bench_vs/sp1/verifier/Cargo.toml b/bench_vs/sp1/verifier/Cargo.toml
deleted file mode 100644
index fc24039c2..000000000
--- a/bench_vs/sp1/verifier/Cargo.toml
+++ /dev/null
@@ -1,3 +0,0 @@
-[workspace]
-members = ["program", "script"]
-resolver = "2"
diff --git a/bench_vs/sp1/verifier/program/Cargo.toml b/bench_vs/sp1/verifier/program/Cargo.toml
deleted file mode 100644
index 7fbc9c5ce..000000000
--- a/bench_vs/sp1/verifier/program/Cargo.toml
+++ /dev/null
@@ -1,10 +0,0 @@
-[package]
-name = "verifier-program"
-version = "0.1.0"
-edition = "2024"
-
-[dependencies]
-sp1-zkvm = "6.0.1"
-lambda-vm-prover = { path = "../../../../prover", default-features = false }
-serde = { version = "=1.0.219", default-features = false, features = ["derive", "alloc"] }
-postcard = { version = "1.0", default-features = false, features = ["alloc"] }
diff --git a/bench_vs/sp1/verifier/program/src/main.rs b/bench_vs/sp1/verifier/program/src/main.rs
deleted file mode 100644
index c63bb67ca..000000000
--- a/bench_vs/sp1/verifier/program/src/main.rs
+++ /dev/null
@@ -1,34 +0,0 @@
-//! SP1 guest that runs lambda-vm's `verify_with_options` on a single proof.
-//!
-//! Input layout (postcard-encoded `Vec<u8>` written via `SP1Stdin::write_vec`):
-//!   `(VmProof, Vec<u8>, ProofOptions)`
-//! where the inner `Vec<u8>` is the inner program's ELF bytes.
-//!
-//! Output: commits `[1u8]` on successful verify; the guest panics otherwise.
-//!
-//! Caveats:
-//! - The verifier hashes through the `keccak` crate. SP1 has a Keccak
-//!   precompile but it patches `tiny-keccak`, not `keccak`. We don't patch
-//!   here, so Keccak runs as software inside the guest. Cycle counts will be
-//!   inflated by that overhead. Worth keeping in mind when interpreting the
-//!   number relative to lambda-vm's in-VM count.
-
-#![no_main]
-
-extern crate alloc;
-
-use alloc::vec::Vec;
-
-use lambda_vm_prover::{ProofOptions, VmProof};
-
-sp1_zkvm::entrypoint!(main);
-
-pub fn main() {
-    let blob = sp1_zkvm::io::read_vec();
-    let (vm_proof, inner_elf, options): (VmProof, Vec<u8>, ProofOptions) =
-        postcard::from_bytes(&blob).expect("failed to deserialize input");
-    let ok = lambda_vm_prover::verify_with_options(&vm_proof, &inner_elf, &options, None, None)
-        .expect("verify errored");
-    assert!(ok, "inner proof failed verification");
-    sp1_zkvm::io::commit_slice(&[1u8]);
-}
diff --git a/bench_vs/sp1/verifier/script/Cargo.toml b/bench_vs/sp1/verifier/script/Cargo.toml
deleted file mode 100644
index 3198059bd..000000000
--- a/bench_vs/sp1/verifier/script/Cargo.toml
+++ /dev/null
@@ -1,13 +0,0 @@
-[package]
-name = "verifier-script"
-version = "0.1.0"
-edition = "2024"
-
-[dependencies]
-sp1-sdk = { version = "6.0.1", features = ["blocking", "profiling"] }
-lambda-vm-prover = { path = "../../../../prover" }
-stark = { path = "../../../../crypto/stark" }
-postcard = { version = "1.0", features = ["alloc"] }
-
-[build-dependencies]
-sp1-build = "6.0.1"
diff --git a/bench_vs/sp1/verifier/script/build.rs b/bench_vs/sp1/verifier/script/build.rs
deleted file mode 100644
index d6cf925d6..000000000
--- a/bench_vs/sp1/verifier/script/build.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-use sp1_build::build_program_with_args;
-
-fn main() {
-    build_program_with_args("../program", Default::default());
-}
diff --git a/bench_vs/sp1/verifier/script/src/main.rs b/bench_vs/sp1/verifier/script/src/main.rs
deleted file mode 100644
index 86e46a710..000000000
--- a/bench_vs/sp1/verifier/script/src/main.rs
+++ /dev/null
@@ -1,83 +0,0 @@
-//! Host driver: prove an inner empty program on lambda-vm, then execute the
-//! lambda-vm verifier inside SP1's executor, printing the cycle count.
-//!
-//! Set `TRACE_FILE=profiles/verifier.json` to capture a DWARF-attributed
-//! profile (1 sample = 1 cycle). The output can be opened with
-//! `samply load profiles/verifier.json`.
-
-use std::path::PathBuf;
-
-use sp1_sdk::blocking::{Prover, ProverClient};
-use sp1_sdk::{SP1Stdin, include_elf};
-
-const VERIFIER_ELF: sp1_sdk::Elf = include_elf!("verifier-program");
-
-fn workspace_root() -> PathBuf {
-    // CARGO_MANIFEST_DIR for this crate is `<root>/bench_vs/sp1/verifier/script`.
-    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
-        .ancestors()
-        .nth(4)
-        .expect("workspace root")
-        .to_path_buf()
-}
-
-fn main() {
-    sp1_sdk::utils::setup_logger();
-
-    let root = workspace_root();
-    let empty_elf_path = root
-        .join("bench_vs/lambda/empty/target/riscv64im-lambda-vm-elf/release/empty-bench");
-    assert!(
-        empty_elf_path.exists(),
-        "empty-bench ELF not found at {} — run `bash bench_vs/build_recursion_elfs.sh` first",
-        empty_elf_path.display(),
-    );
-    let inner_elf = std::fs::read(&empty_elf_path).expect("read empty-bench");
-
-    let options = stark::proof::options::ProofOptions {
-        blowup_factor: 2,
-        fri_number_of_queries: 1,
-        coset_offset: 3,
-        grinding_factor: 1,
-    };
-
-    println!("[sp1-verifier] proving inner (empty, blowup=2, 1 query) ...");
-    let inner_proof = lambda_vm_prover::prove_with_options_and_inputs(
-        &inner_elf,
-        &[],
-        &options,
-        &lambda_vm_prover::MaxRowsConfig::default(),
-    )
-    .expect("inner prove should succeed");
-
-    let blob = postcard::to_allocvec(&(&inner_proof, &inner_elf, &options))
-        .expect("postcard encode failed");
-    println!("[sp1-verifier] postcard blob: {} bytes", blob.len());
-
-    let client = ProverClient::from_env();
-    let mut stdin = SP1Stdin::new();
-    stdin.write_vec(blob);
-
-    println!("[sp1-verifier] executing verifier in SP1 ...");
-    let (_, report) = client
-        .execute(VERIFIER_ELF.clone(), stdin)
-        .run()
-        .expect("execute failed");
-
-    let cycles = report.total_instruction_count();
-    println!();
-    println!("============================================================");
-    println!("  SP1 EXECUTION SUMMARY — lambda-vm verifier inside SP1");
-    println!("============================================================");
-    println!("  Total cycles : {cycles}");
-    println!();
-    println!("  Compare against lambda-vm in-VM count (~40.5B for the same");
-    println!("  proof). Both VMs target riscv64im, so word width is symmetric.");
-    println!("  Main remaining asymmetry: lambda-vm's KeccakPermute precompile");
-    println!("  is patched on its guests but SP1 does not patch `keccak` (only");
-    println!("  `tiny-keccak`), so Keccak rounds run as software in SP1 here.");
-    println!();
-    println!("  If TRACE_FILE was set, the profile was written there.");
-    println!("  Render with: samply load <trace>");
-    println!("============================================================");
-}

From acd2c67f1b5bd93eec3290a6ed97dcc67f88a743 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 18:33:55 -0300
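Subject: [PATCH 11/16] ci: run multi target in multi-query profile job

The `multi-query` matrix entry had `test: single`, duplicating the
single-query entry's value; point it at `multi` to match its title.

Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>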
---
 .github/workflows/profile-recursion.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml
index 420cebfcb..d7fb12447 100644
--- a/.github/workflows/profile-recursion.yml
+++ b/.github/workflows/profile-recursion.yml
@@ -39,7 +39,7 @@ jobs:
             test: single
             title: "Single query (blowup=2, 1 query)"
           - name: multi-query
-            test: single
+            test: multi
             title: "Multi query (blowup=8, 128-bit)"
     steps:
       - name: React to comment

From e52cd9db000b870fdc494689364a99b402be5aee Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 18:34:29 -0300
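Subject: [PATCH 12/16] ci: tee recursion profile output to /tmp/hist.log

Capture stdout and stderr of `make test-profile-recursion-$TEST` in
/tmp/hist.log while still streaming them to the job log. The preceding
`set -o pipefail` keeps a failing make fatal despite the pipe.

Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>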
---
 .github/workflows/profile-recursion.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml
index d7fb12447..1e2f2ae0c 100644
--- a/.github/workflows/profile-recursion.yml
+++ b/.github/workflows/profile-recursion.yml
@@ -84,7 +84,7 @@ jobs:
           # test triggers picks this up via the Makefile's `SYSROOT_DIR ?=`.
           export SYSROOT_DIR="$HOME/.lambda-vm-sysroot"
           set -o pipefail
-          make test-profile-recursion-$TEST
+          make test-profile-recursion-$TEST 2>&1 | tee /tmp/hist.log
 
       - name: Aggregate into a per-function fragment
         if: always()

From b70789a938c8bb5469147a0bff95e4fbaed22d8b Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 18:41:40 -0300
Subject: [PATCH 13/16] cargo fmt

---
 prover/src/tests/recursion_smoke_test.rs | 39 +++++++++++++++++++-----
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index 67c6a5818..e3e44bfa3 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -83,8 +83,11 @@ fn execute_outer_and_commit(label: &str, recursion_elf_bytes: &[u8], blob: &[u8]
     let program = Elf::load(recursion_elf_bytes).expect("load recursion elf");
     let mut executor = Executor::new(&program, blob.to_vec()).expect("executor new");
 
-    let (total_cycles, exec_time) =
-        drive_executor(&mut executor, |_log| ControlFlow::Continue(()), |_, _, _| {});
+    let (total_cycles, exec_time) = drive_executor(
+        &mut executor,
+        |_log| ControlFlow::Continue(()),
+        |_, _, _| {},
+    );
 
     let committed = executor
         .finish()
@@ -135,7 +138,10 @@ fn drive_executor(
     let start = std::time::Instant::now();
     let mut total_cycles: u64 = 0;
     let mut chunks: usize = 0;
-    while let Some(logs) = executor.resume().expect("executor resume failed (guest panicked in-VM?)") {
+    while let Some(logs) = executor
+        .resume()
+        .expect("executor resume failed (guest panicked in-VM?)")
+    {
         let mut stop = false;
         for log in logs {
             total_cycles += 1;
@@ -181,7 +187,10 @@ fn setup_guest_run(
 }
 
 /// A `drive_executor` progress callback printing one line every `stride` chunks.
-fn log_progress(label: impl Into<String>, stride: usize) -> impl FnMut(usize, u64, std::time::Duration) {
+fn log_progress(
+    label: impl Into<String>,
+    stride: usize,
+) -> impl FnMut(usize, u64, std::time::Duration) {
     let label = label.into();
     move |chunks, cycles, elapsed| {
         if chunks.is_multiple_of(stride) {
@@ -222,7 +231,9 @@ fn print_function_table(
     let mut by_function: std::collections::HashMap<String, (u64, u64)> =
         std::collections::HashMap::new();
     for (pc, count) in &pc_hist {
-        let entry = by_function.entry(resolve_pc(symbols, *pc)).or_insert((0, 0));
+        let entry = by_function
+            .entry(resolve_pc(symbols, *pc))
+            .or_insert((0, 0));
         entry.0 += *count; // cycles
         entry.1 += 1; // distinct PCs folded into this function
     }
@@ -298,18 +309,30 @@ fn run_profile(
             "{guest_name} ELF has no symbol table — was it stripped?"
         );
         for (i, kw) in VERIFIER_STEP_KEYWORDS.iter().enumerate() {
-            let n = symbols.functions().iter().filter(|f| f.name.contains(kw)).count();
+            let n = symbols
+                .functions()
+                .iter()
+                .filter(|f| f.name.contains(kw))
+                .count();
             eprintln!(
                 "[profile] step {}: keyword={kw:?} -> {n} symbol(s) {}",
                 i + 1,
-                if n > 0 { "" } else { "(no match; merges into previous bucket)" },
+                if n > 0 {
+                    ""
+                } else {
+                    "(no match; merges into previous bucket)"
+                },
             );
         }
     }
 
     eprintln!(
         "[profile] executing {guest_name} guest ({}) ...",
-        if detailed { "histogram + steps" } else { "cycle counter" }
+        if detailed {
+            "histogram + steps"
+        } else {
+            "cycle counter"
+        }
     );
     let (total_cycles, exec_time) = drive_executor(
         &mut executor,

From 8cb31e550d63cd85b3fbfced932f7a2e8719dcf7 Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 18:43:55 -0300
Subject: [PATCH 14/16] ci: gate recursion-profile comment job on profile not
 being skipped

---
 .github/workflows/profile-recursion.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/profile-recursion.yml b/.github/workflows/profile-recursion.yml
index 1e2f2ae0c..0e614fcd8 100644
--- a/.github/workflows/profile-recursion.yml
+++ b/.github/workflows/profile-recursion.yml
@@ -106,7 +106,10 @@ jobs:
   # Stitch the matrix fragments into a single PR comment.
   comment:
     needs: profile
-    if: always() && github.event_name == 'issue_comment'
+    # always() so partial-matrix failures still post; skip when `profile` was
+    # skipped (non-/profile_recursion or non-member comment) so this job — and
+    # the self-hosted bench runner it spins up — doesn't fire on every comment.
+    if: always() && github.event_name == 'issue_comment' && needs.profile.result != 'skipped'
     runs-on: [self-hosted, bench]
     steps:
       - name: Get PR head ref

From cd0d61541ae8b432831011ce01f9431da425104c Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 18:49:04 -0300
Subject: [PATCH 15/16] lint: use is_multiple_of for the sampling check

---
Note: also drops `#[allow(clippy::modulo_one)]`. Once the `%` expression is
replaced by `is_multiple_of`, there is no modulo left for it to suppress.

 prover/src/tests/recursion_smoke_test.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/prover/src/tests/recursion_smoke_test.rs b/prover/src/tests/recursion_smoke_test.rs
index e3e44bfa3..d34e7cf4d 100644
--- a/prover/src/tests/recursion_smoke_test.rs
+++ b/prover/src/tests/recursion_smoke_test.rs
@@ -786,7 +786,6 @@ fn test_recursion_sampled_flamegraph() {
             // body. Skipped logs lose stack accuracy — acceptable diagnostic
             // quality at higher rates.
-            #[allow(clippy::modulo_one)]
-            let take = i % SAMPLE_RATE == 0;
+            let take = i.is_multiple_of(SAMPLE_RATE);
             if take {
                 generator
                     .borrow_mut()

From 0a58f0fbf910dfc9e3028ed7f526db11006df26c Mon Sep 17 00:00:00 2001
From: Mario Rugiero <mrugiero@gmail.com>
Date: Tue, 30 Jun 2026 21:11:40 -0300
Subject: [PATCH 16/16] inline(never) for high-level steps to avoid missing
 symbols

---
 crypto/stark/src/verifier.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/crypto/stark/src/verifier.rs b/crypto/stark/src/verifier.rs
index 03119f617..d4186e563 100644
--- a/crypto/stark/src/verifier.rs
+++ b/crypto/stark/src/verifier.rs
@@ -97,6 +97,7 @@ pub trait IsStarkVerifier<
     /// Checks whether the purported evaluations of the composition polynomial parts and the trace
     /// polynomials at the out-of-domain challenge are consistent.
     /// See https://lambdaclass.github.io/lambdaworks/starks/protocol.html#step-2-verify-claimed-composition-polynomial
+    #[inline(never)]
     fn step_2_verify_claimed_composition_polynomial(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         proof: &StarkProof<Field, FieldExtension, PI>,
@@ -241,6 +242,7 @@ pub trait IsStarkVerifier<
     /// Reconstructs the Deep composition polynomial evaluations at the challenge indices values using the provided
     /// openings of the trace polynomials and the composition polynomial parts. It then uses these to verify that the
     /// FRI decommitments are valid and correspond to the Deep composition polynomial.
+    #[inline(never)]
     fn step_3_verify_fri(
         proof: &StarkProof<Field, FieldExtension, PI>,
         domain: &VerifierDomain<Field>,
@@ -396,6 +398,7 @@ pub trait IsStarkVerifier<
     /// Verifies the validity of the purported values of the trace polynomials and the composition polynomial
     /// parts at the domain elements and their symmetric counterparts corresponding to all the FRI query
     /// index challenges.
+    #[inline(never)]
     fn step_4_verify_trace_and_composition_openings(
         proof: &StarkProof<Field, FieldExtension, PI>,
         challenges: &Challenges<FieldExtension>,
@@ -903,6 +906,7 @@ pub trait IsStarkVerifier<
 
     /// Replays rounds 2, 3 and 4 of the protocol for a given proof, assuming round 1 has
     /// already been replayed and the RAP challenges are known.
+    #[inline(never)]
     fn replay_rounds_after_round_1(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         proof: &StarkProof<Field, FieldExtension, PI>,