From 5f369f08f78ca2271db404f2e35f3580ff1c9314 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 15:08:10 -0300
Subject: [PATCH 01/17] spike

---
 crypto/math-cuda/src/device.rs     |  41 ++++++++
 crypto/stark/src/instruments.rs    | 149 ++++++++++++++++++++++++++++-
 crypto/stark/src/lib.rs            |  10 ++
 crypto/stark/src/prover.rs         |  20 ++++
 prover/src/lib.rs                  |  28 ++++++
 prover/src/tables/trace_builder.rs |  33 ++++++-
 6 files changed, 278 insertions(+), 3 deletions(-)
diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs
index 17e2f9f82..4c12000fd 100644
--- a/crypto/math-cuda/src/device.rs
+++ b/crypto/math-cuda/src/device.rs
@@ -174,6 +174,34 @@ pub struct Backend {
     inv_twiddles: Mutex<Vec<Option<Arc<CudaSlice<u64>>>>>,
 }
 
+/// Raise the device default memory pool's release threshold so freed
+/// stream-ordered allocations are kept for reuse instead of returned to the OS
+/// at each sync. Best-effort: any failure (e.g. a device/driver without
+/// stream-ordered allocator support) leaves the default behaviour untouched.
+fn retain_default_mempool(ctx: &CudaContext) {
+    use cudarc::driver::sys;
+    // SAFETY: raw CUDA driver calls. `ctx.cu_device()` is a valid device for
+    // the just-created context; the out-pointers are valid stack slots; the
+    // threshold is read as a u64 by the driver. Errors are swallowed.
+    unsafe {
+        let dev = ctx.cu_device();
+        let mut pool: sys::CUmemoryPool = std::ptr::null_mut();
+        if sys::cuDeviceGetDefaultMemPool(&mut pool as *mut _, dev)
+            .result()
+            .is_err()
+        {
+            return;
+        }
+        let threshold: u64 = u64::MAX;
+        let _ = sys::cuMemPoolSetAttribute(
+            pool,
+            sys::CUmemPool_attribute_enum::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+            &threshold as *const u64 as *mut core::ffi::c_void,
+        )
+        .result();
+    }
+}
+
 impl Backend {
     fn init() -> Result<Self> {
         let ctx = CudaContext::new(0)?;
@@ -183,6 +211,19 @@ impl Backend {
         // before returning), so the tracking is pure overhead. Disable it.
         unsafe { ctx.disable_event_tracking() };
 
+        // Retain freed device memory in the stream-ordered pool for reuse.
+        //
+        // cudarc routes `CudaStream::alloc*` through `cuMemAllocAsync`, which
+        // draws from the device's default memory pool. That pool's release
+        // threshold defaults to 0, so every freed buffer is handed back to the
+        // OS at the next sync — meaning the prover's large, repeatedly-shaped
+        // LDE / FRI buffers are re-malloc'd from scratch each op. Raising the
+        // threshold to "unbounded" keeps freed blocks resident in the pool so
+        // subsequent allocations of the same size are satisfied without a real
+        // driver allocation. Best-effort: on any error (no pool support, sync
+        // allocator) we silently keep the current behaviour.
+        retain_default_mempool(&ctx);
+
         let arith = ctx.load_module(Ptx::from_src(ARITH_PTX))?;
         let ntt = ctx.load_module(Ptx::from_src(NTT_PTX))?;
         let keccak = ctx.load_module(Ptx::from_src(KECCAK_PTX))?;
diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs
index 16ff95082..83e14545b 100644
--- a/crypto/stark/src/instruments.rs
+++ b/crypto/stark/src/instruments.rs
@@ -1,7 +1,154 @@
 use std::cell::RefCell;
+use std::sync::Mutex;
 use std::sync::OnceLock;
 use std::sync::atomic::{AtomicU64, Ordering};
-use std::time::Duration;
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
+
+// =========================================================================
+// Wall-clock span timeline (the trustworthy per-step measurement)
+// =========================================================================
+//
+// Nested wall-clock spans opened/closed on the driving (main) thread at phase
+// boundaries. Unlike the `accum_*` / thread-local sub-timers below — which sum
+// per-worker CPU time across rayon threads and over-count (percentages > 100%) —
+// these spans are non-overlapping and sum to their parent, so the tree is a true
+// latency breakdown. Parallel regions are measured as a single span around the
+// blocking call (that IS their latency); their internal split is reported
+// separately as CPU-time, never mixed into the wall tree.
+//
+//     let _s = instruments::span("trace_build");   // RAII, stops on drop
+//
+// `Instant::now()` is ~20 ns — fine at phase granularity; never inside per-op loops.
+
+#[derive(Clone, Debug)]
+pub struct SpanRecord {
+    pub label: &'static str,
+    pub depth: u16,
+    pub wall: Duration,
+    /// Open-order, so the tree reconstructs in start-order (records push on close).
+    pub order: u32,
+    /// Wall-clock epoch (ns) when the span opened — for aligning with external
+    /// samplers (e.g. nvidia-smi GPU-util) to attribute device-busy time per step.
+    pub start_ns: u128,
+}
+
+static TIMELINE: Mutex<Vec<SpanRecord>> = Mutex::new(Vec::new());
+static SPAN_ORDER: AtomicU64 = AtomicU64::new(0);
+
+thread_local! {
+    static SPAN_DEPTH: std::cell::Cell<u16> = const { std::cell::Cell::new(0) };
+}
+
+#[must_use]
+pub struct SpanGuard {
+    label: &'static str,
+    depth: u16,
+    order: u32,
+    start: Instant,
+    start_ns: u128,
+}
+
+/// Open a wall-clock span; records elapsed time when the guard drops.
+pub fn span(label: &'static str) -> SpanGuard {
+    let depth = SPAN_DEPTH.with(|d| {
+        let v = d.get();
+        d.set(v + 1);
+        v
+    });
+    let order = SPAN_ORDER.fetch_add(1, Ordering::Relaxed) as u32;
+    let start_ns = SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .unwrap_or_default()
+        .as_nanos();
+    SpanGuard {
+        label,
+        depth,
+        order,
+        start: Instant::now(),
+        start_ns,
+    }
+}
+
+impl Drop for SpanGuard {
+    fn drop(&mut self) {
+        let wall = self.start.elapsed();
+        SPAN_DEPTH.with(|d| d.set(d.get().saturating_sub(1)));
+        if let Ok(mut t) = TIMELINE.lock() {
+            t.push(SpanRecord {
+                label: self.label,
+                depth: self.depth,
+                wall,
+                order: self.order,
+                start_ns: self.start_ns,
+            });
+        }
+    }
+}
+
+/// Clear recorded spans. Call at the start of a measured prove.
+pub fn reset_timeline() {
+    SPAN_ORDER.store(0, Ordering::Relaxed);
+    SPAN_DEPTH.with(|d| d.set(0));
+    if let Ok(mut t) = TIMELINE.lock() {
+        t.clear();
+    }
+}
+
+/// Drain recorded spans, sorted in start-order (ready for the tree).
+pub fn take_timeline() -> Vec<SpanRecord> {
+    let mut spans = TIMELINE
+        .lock()
+        .map(|mut t| std::mem::take(&mut *t))
+        .unwrap_or_default();
+    spans.sort_by_key(|s| s.order);
+    spans
+}
+
+/// Indented wall-clock tree with % of the root span.
+pub fn format_timeline(spans: &[SpanRecord]) -> String {
+    use std::fmt::Write;
+    if spans.is_empty() {
+        return String::new();
+    }
+    let total_s = spans
+        .first()
+        .map(|s| s.wall.as_secs_f64())
+        .unwrap_or(1e-9)
+        .max(1e-9);
+    let mut out = String::from("=== TIMELINE (wall-clock) ===\n");
+    for s in spans {
+        let indent = "  ".repeat(s.depth as usize);
+        let pct = 100.0 * s.wall.as_secs_f64() / total_s;
+        let _ = writeln!(
+            out,
+            "{:<42} {:>10.3?} {:>6.1}%",
+            format!("{indent}{}", s.label),
+            s.wall,
+            pct
+        );
+    }
+    out
+}
+
+/// JSON array of `{label, depth, wall_ns, order}` for diffing / plotting.
+pub fn timeline_json(spans: &[SpanRecord]) -> String {
+    let mut out = String::from("[");
+    for (i, s) in spans.iter().enumerate() {
+        if i > 0 {
+            out.push(',');
+        }
+        out.push_str(&format!(
+            "{{\"label\":\"{}\",\"depth\":{},\"wall_ns\":{},\"order\":{},\"start_ns\":{}}}",
+            s.label,
+            s.depth,
+            s.wall.as_nanos(),
+            s.order,
+            s.start_ns
+        ));
+    }
+    out.push(']');
+    out
+}
 
 static HEAP_READER: OnceLock<fn() -> Option<usize>> = OnceLock::new();
 
diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs
index e9f6a1cda..25ce91569 100644
--- a/crypto/stark/src/lib.rs
+++ b/crypto/stark/src/lib.rs
@@ -3,6 +3,16 @@
 #[cfg(all(target_arch = "wasm32", feature = "disk-spill"))]
 compile_error!("the `disk-spill` feature requires memmap2, which does not compile on wasm32");
 
+/// Open a wall-clock profiling span (no-op unless the `instruments` feature is on).
+/// RAII: records elapsed wall time when the binding drops at end of scope.
+#[macro_export]
+macro_rules! prof_span {
+    ($label:expr) => {
+        #[cfg(feature = "instruments")]
+        let _prof_span = $crate::instruments::span($label);
+    };
+}
+
 #[cfg(feature = "debug-checks")]
 pub mod bus_debug;
 pub mod constraints;
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 4da57559c..eec131be7 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -1682,6 +1682,8 @@ pub trait IsStarkProver<
 
         #[cfg(feature = "instruments")]
         let phase_start = Instant::now();
+        #[cfg(feature = "instruments")]
+        let __sp = crate::instruments::span("r1_prepass");
 
         // Deduplicate Domain + LdeTwiddles by (trace_length, blowup_factor, coset_offset).
         // Many tables share the same domain size (e.g., 7+ tables at 2^20).
@@ -1739,6 +1741,8 @@ pub trait IsStarkProver<
             })?;
         }
 
+        #[cfg(feature = "instruments")]
+        drop(__sp);
         #[cfg(feature = "instruments")]
         let prepass_elapsed = phase_start.elapsed();
         #[cfg(feature = "instruments")]
@@ -1754,6 +1758,8 @@ pub trait IsStarkProver<
 
         #[cfg(feature = "instruments")]
         let phase_start = Instant::now();
+        #[cfg(feature = "instruments")]
+        let __sp = crate::instruments::span("r1_main_commit");
 
         let mut main_commits: Vec<TableCommit<Field>> = Vec::with_capacity(num_airs);
         let mut main_ldes: Vec<Vec<Vec<FieldElement<Field>>>> = Vec::with_capacity(num_airs);
@@ -1810,6 +1816,8 @@ pub trait IsStarkProver<
             }
         }
 
+        #[cfg(feature = "instruments")]
+        drop(__sp);
         #[cfg(feature = "instruments")]
         let main_commits_elapsed = phase_start.elapsed();
         #[cfg(feature = "instruments")]
@@ -1845,6 +1853,8 @@ pub trait IsStarkProver<
         // but outer parallelism over 12 tables also helps on high-core-count machines.
         #[cfg(feature = "instruments")]
         let phase_start = Instant::now();
+        #[cfg(feature = "instruments")]
+        let __sp = crate::instruments::span("r1_aux_build");
 
         #[cfg(feature = "parallel")]
         let aux_iter = air_trace_pairs.par_iter_mut();
@@ -1877,6 +1887,8 @@ pub trait IsStarkProver<
             })?;
         }
 
+        #[cfg(feature = "instruments")]
+        drop(__sp);
         #[cfg(feature = "instruments")]
         let aux_build_elapsed = phase_start.elapsed();
         #[cfg(feature = "instruments")]
@@ -1888,6 +1900,8 @@ pub trait IsStarkProver<
         // Each table gets its own transcript fork.
         #[cfg(feature = "instruments")]
         let phase_start = Instant::now();
+        #[cfg(feature = "instruments")]
+        let __sp = crate::instruments::span("r1_aux_commit");
 
         // Pre-fork all transcripts (cheap, sequential — must match verifier ordering)
         let mut table_transcripts: Vec<_> = (0..num_airs)
@@ -2066,6 +2080,8 @@ pub trait IsStarkProver<
             });
         }
 
+        #[cfg(feature = "instruments")]
+        drop(__sp);
         #[cfg(feature = "instruments")]
         let aux_commit_elapsed = phase_start.elapsed();
         #[cfg(feature = "instruments")]
@@ -2086,6 +2102,8 @@ pub trait IsStarkProver<
         #[cfg(feature = "instruments")]
         let phase_start = Instant::now();
         #[cfg(feature = "instruments")]
+        let __sp = crate::instruments::span("rounds_2to4");
+        #[cfg(feature = "instruments")]
         let mut table_timings: Vec<(
             String,
             usize,
@@ -2173,6 +2191,8 @@ pub trait IsStarkProver<
             }
         }
 
+        #[cfg(feature = "instruments")]
+        drop(__sp);
         #[cfg(feature = "instruments")]
         {
             // Store timing data for the top-level report in prove_with_options.
diff --git a/prover/src/lib.rs b/prover/src/lib.rs
index 81233d39f..589145d09 100644
--- a/prover/src/lib.rs
+++ b/prover/src/lib.rs
@@ -713,11 +713,17 @@ pub fn prove_with_options_and_inputs(
     #[cfg(feature = "instruments")]
     let total_start = std::time::Instant::now();
     #[cfg(feature = "instruments")]
+    stark::instruments::reset_timeline();
+    #[cfg(feature = "instruments")]
+    let __root = stark::instruments::span("prove_total");
+    #[cfg(feature = "instruments")]
     let heap_before = stark::instruments::heap_bytes();
 
     // Phase 1: Execute (ELF load + run)
     #[cfg(feature = "instruments")]
     let phase_start = std::time::Instant::now();
+    #[cfg(feature = "instruments")]
+    let __sp = stark::instruments::span("execute");
 
     let program = Elf::load(elf_bytes).map_err(|e| Error::ElfLoad(format!("{e}")))?;
     let executor = Executor::new(&program, private_inputs.to_vec())
@@ -726,6 +732,8 @@ pub fn prove_with_options_and_inputs(
         .run()
         .map_err(|e| Error::Execution(format!("{e}")))?;
 
+    #[cfg(feature = "instruments")]
+    drop(__sp);
     #[cfg(feature = "instruments")]
     let execute_elapsed = phase_start.elapsed();
     #[cfg(feature = "instruments")]
@@ -734,6 +742,8 @@ pub fn prove_with_options_and_inputs(
     // Phase 2: Trace build
     #[cfg(feature = "instruments")]
     let phase_start = std::time::Instant::now();
+    #[cfg(feature = "instruments")]
+    let __sp = stark::instruments::span("trace_build");
 
     #[cfg(feature = "disk-spill")]
     let storage_mode = {
@@ -755,6 +765,8 @@ pub fn prove_with_options_and_inputs(
     );
     drop(result);
 
+    #[cfg(feature = "instruments")]
+    drop(__sp);
     #[cfg(feature = "instruments")]
     let trace_build_elapsed = phase_start.elapsed();
     #[cfg(feature = "instruments")]
@@ -763,6 +775,8 @@ pub fn prove_with_options_and_inputs(
     // Phase 3: AIR construction
     #[cfg(feature = "instruments")]
     let phase_start = std::time::Instant::now();
+    #[cfg(feature = "instruments")]
+    let __sp = stark::instruments::span("air_construction");
 
     let table_counts = traces.table_counts();
     let airs = VmAirs::new(
@@ -775,6 +789,8 @@ pub fn prove_with_options_and_inputs(
         None,
     );
 
+    #[cfg(feature = "instruments")]
+    drop(__sp);
     #[cfg(feature = "instruments")]
     let air_elapsed = phase_start.elapsed();
     #[cfg(feature = "instruments")]
@@ -801,6 +817,8 @@ pub fn prove_with_options_and_inputs(
     );
 
     // Phase 4: Prove (multi_prove)
+    #[cfg(feature = "instruments")]
+    let __sp = stark::instruments::span("proving");
     let proof = Prover::multi_prove(
         airs.air_trace_pairs(&mut traces),
         &mut transcript,
@@ -808,6 +826,8 @@ pub fn prove_with_options_and_inputs(
         storage_mode,
     )
     .map_err(|e| Error::Prover(format!("{e:?}")))?;
+    #[cfg(feature = "instruments")]
+    drop(__sp);
 
     #[cfg(feature = "instruments")]
     {
@@ -823,6 +843,14 @@ pub fn prove_with_options_and_inputs(
                 after_air: heap_after_air,
             },
         );
+        // Accurate wall-clock span tree (the trustworthy per-step breakdown).
+        drop(__root);
+        let spans = stark::instruments::take_timeline();
+        print!("{}", stark::instruments::format_timeline(&spans));
+        if let Ok(path) = std::env::var("LAMBDA_VM_TIMELINE_JSON") {
+            let _ = std::fs::write(&path, stark::instruments::timeline_json(&spans));
+            println!("[timeline] wrote {path}");
+        }
     }
 
     Ok(VmProof {
diff --git a/prover/src/tables/trace_builder.rs b/prover/src/tables/trace_builder.rs
index 02371c1a0..048334356 100644
--- a/prover/src/tables/trace_builder.rs
+++ b/prover/src/tables/trace_builder.rs
@@ -2750,6 +2750,8 @@ fn build_traces(
     // =====================================================================
     // PHASE 4: All → Bitwise lookups
     // =====================================================================
+    #[cfg(feature = "instruments")]
+    let __sp = stark::instruments::span("p4_bitwise_collect");
     bitwise_ops.extend(collect_bitwise_from_lt(&lt_ops));
     // MUL/DVRM dedup their per-unique bit-gated lookups PER CHIP INSTANCE, so pass
     // the same chunk size used to split them into instances (see chunk_and_generate
@@ -2795,10 +2797,14 @@ fn build_traces(
         .map(|chunk| chunk.len().next_power_of_two().max(4) - chunk.len())
         .sum();
     bitwise_ops.extend(collect_byte_check_ops_for_padding(num_padding_rows));
+    #[cfg(feature = "instruments")]
+    drop(__sp);
 
     // =====================================================================
     // PHASE 5: Generate final traces (parallelized)
     // =====================================================================
+    #[cfg(feature = "instruments")]
+    let __sp = stark::instruments::span("p5_generate_tables");
 
     // Extract halt timestamp from the last ECALL instruction
     let halt_op = cpu_ops
@@ -3027,6 +3033,8 @@ fn build_traces(
         }
     }
 
+    #[cfg(feature = "instruments")]
+    drop(__sp);
     Ok(Traces {
         cpus,
         bitwise,
@@ -3692,17 +3700,27 @@ impl Traces {
         // Phase 0: ELF → DECODE + instructions
         // IMPORTANT: Use generate_decode_trace (same as compute_precomputed_commitment)
         // so the DECODE trace row ordering matches the AIR's hardcoded commitment.
+        #[cfg(feature = "instruments")]
+        let __sp = stark::instruments::span("p0_decode");
         let instructions = decode::instructions_from_elf(elf)
             .map_err(|e| Error::Execution(format!("Failed to parse instructions: {e}")))?;
         let (decode_trace, decode_pc_to_row) = decode::generate_decode_trace(&instructions);
+        #[cfg(feature = "instruments")]
+        drop(__sp);
 
         // Phase 1: Logs → CPU operations
+        #[cfg(feature = "instruments")]
+        let __sp = stark::instruments::span("p1_cpu_ops");
         let cpu_ops = collect_cpu_ops(logs, &instructions)?;
+        #[cfg(feature = "instruments")]
+        drop(__sp);
 
         // Phase 2: Collect + route all ops
         let mut memory_state = MemoryState::from_elf(elf);
         memory_state.add_private_input(private_input);
         let mut register_state = RegisterState::new(elf.entry_point);
+        #[cfg(feature = "instruments")]
+        let __sp = stark::instruments::span("p2a_collect_cpu");
         let (
             memw_ops,
             load_ops,
@@ -3716,7 +3734,11 @@ impl Traces {
             ec_scalar_ops,
             ecdas_ops,
         ) = collect_ops_from_cpu(&cpu_ops, &mut memory_state, &mut register_state);
+        #[cfg(feature = "instruments")]
+        drop(__sp);
 
+        #[cfg(feature = "instruments")]
+        let __sp = stark::instruments::span("p2b_collect_all");
         let ops = collect_all_ops(
             cpu_ops,
             memw_ops,
@@ -3732,9 +3754,13 @@ impl Traces {
             ecdas_ops,
             &mut register_state,
         );
+        #[cfg(feature = "instruments")]
+        drop(__sp);
 
         // Phases 3-5
-        build_traces(
+        #[cfg(feature = "instruments")]
+        let __sp = stark::instruments::span("p3to5_build_traces");
+        let result = build_traces(
             ops,
             Some(elf),
             &memory_state,
@@ -3746,7 +3772,10 @@ impl Traces {
             #[cfg(feature = "disk-spill")]
             storage_mode,
             private_input,
-        )
+        );
+        #[cfg(feature = "instruments")]
+        drop(__sp);
+        result
     }
 
     /// Generates all traces from execution logs (legacy API).

From 0fa7f32ab9371efac32f99a1b44a84c3bdf9448f Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 15:22:04 -0300
Subject: [PATCH 02/17]  resident-data scaffolding

---
 crypto/stark/src/trace.rs | 88 +++++++++++++++++++++++++++++++--------
 1 file changed, 71 insertions(+), 17 deletions(-)

diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs
index 405ce89f8..39e968c7f 100644
--- a/crypto/stark/src/trace.rs
+++ b/crypto/stark/src/trace.rs
@@ -193,17 +193,58 @@ where
     pub(crate) aux_columns: Vec<Vec<FieldElement<E>>>,
     pub(crate) lde_step_size: usize,
     pub(crate) blowup_factor: usize,
-    /// If the main trace was LDE'd on the GPU via the fused pipeline,
-    /// the device buffer is retained here so downstream GPU rounds can
-    /// read the LDE without a re-H2D. `None` when the GPU LDE didn't run
-    /// for this table (below the size threshold or any CPU fallback:
-    /// preprocessed main, non-Goldilocks, or GPU error).
+    /// Per-table GPU residency session: owns the device-resident trace LDE
+    /// buffers (main + aux) and tracks, per buffer, whether a host mirror is
+    /// live. Created on the CPU path too (all buffers `None`, mirrors present);
+    /// populated when the R1 fused GPU pipeline runs. Threaded R1→R4 because
+    /// `LDETraceTable` is borrowed through every round.
     #[cfg(feature = "cuda")]
-    pub(crate) gpu_main: Option<math_cuda::lde::GpuLdeBase>,
-    /// Same as `gpu_main` but for the aux trace (ext3 de-interleaved
-    /// layout on device).
-    #[cfg(feature = "cuda")]
-    pub(crate) gpu_aux: Option<math_cuda::lde::GpuLdeExt3>,
+    pub(crate) gpu_session: GpuTableSession,
+}
+
+/// Per-table GPU residency session.
+///
+/// Owns the device-resident buffers for a single trace table and tracks, per
+/// buffer, whether a host mirror is currently materialised. Today every buffer
+/// produced on the GPU is *also* copied to host columns, because the remaining
+/// CPU consumers (R2 constraint evaluation, R3 barycentric OOD, R4 query
+/// openings) read from host. The `*_host_mirror` flags are the seam those later
+/// steps flip: once a consumer reads from the device buffer instead, the
+/// corresponding mirror is dropped and the H2D/D2H copy elided.
+///
+/// Scope: this owns the main/aux trace LDE (resident R1→R4). The composition
+/// parts (`Round2`, R2→R4) and the bound stream are folded in by the control-
+/// plane commit, where the R2→R4 borrow chain becomes mutable. The R4-local
+/// `inv_denoms`/FRI state are created and consumed within R4 and stay local.
+#[cfg(feature = "cuda")]
+pub(crate) struct GpuTableSession {
+    /// Main-trace LDE, resident from the R1 fused pipeline through R4. `None`
+    /// when the GPU LDE didn't run for this table (below the size threshold or
+    /// any CPU fallback: preprocessed main, non-Goldilocks, or GPU error).
+    main_lde: Option<math_cuda::lde::GpuLdeBase>,
+    /// Aux-trace LDE (ext3 de-interleaved layout on device), resident R1→R4.
+    aux_lde: Option<math_cuda::lde::GpuLdeExt3>,
+    /// Whether the main-trace host columns currently mirror `main_lde`.
+    /// Always `true` today; CPU consumers depend on it.
+    main_host_mirror: bool,
+    /// Whether the aux-trace host columns currently mirror `aux_lde`.
+    /// Always `true` today; CPU consumers depend on it.
+    aux_host_mirror: bool,
+}
+
+#[cfg(feature = "cuda")]
+impl GpuTableSession {
+    fn new() -> Self {
+        Self {
+            main_lde: None,
+            aux_lde: None,
+            // Host columns are always materialised today; the CPU consumers
+            // (constraint eval, OOD, openings) read them. Steps 5/6 flip these
+            // off as each consumer moves to reading the device buffer.
+            main_host_mirror: true,
+            aux_host_mirror: true,
+        }
+    }
 }
 
 impl<F, E> LDETraceTable<F, E>
@@ -227,9 +268,7 @@ where
             lde_step_size,
             blowup_factor,
             #[cfg(feature = "cuda")]
-            gpu_main: None,
-            #[cfg(feature = "cuda")]
-            gpu_aux: None,
+            gpu_session: GpuTableSession::new(),
         }
     }
 
@@ -238,23 +277,38 @@ where
     /// ran the CPU path should leave this alone.
     #[cfg(feature = "cuda")]
     pub fn set_gpu_main(&mut self, h: math_cuda::lde::GpuLdeBase) {
-        self.gpu_main = Some(h);
+        self.gpu_session.main_lde = Some(h);
     }
 
     /// Attach an already-populated device LDE handle for the aux columns.
     #[cfg(feature = "cuda")]
     pub fn set_gpu_aux(&mut self, h: math_cuda::lde::GpuLdeExt3) {
-        self.gpu_aux = Some(h);
+        self.gpu_session.aux_lde = Some(h);
     }
 
     #[cfg(feature = "cuda")]
     pub fn gpu_main(&self) -> Option<&math_cuda::lde::GpuLdeBase> {
-        self.gpu_main.as_ref()
+        self.gpu_session.main_lde.as_ref()
     }
 
     #[cfg(feature = "cuda")]
     pub fn gpu_aux(&self) -> Option<&math_cuda::lde::GpuLdeExt3> {
-        self.gpu_aux.as_ref()
+        self.gpu_session.aux_lde.as_ref()
+    }
+
+    /// Whether the main-trace host columns currently mirror the device LDE.
+    /// Always `true` today (CPU consumers read host); the source-agnostic seam
+    /// steps 5/6 flip when the main consumer reads the device buffer instead.
+    #[cfg(feature = "cuda")]
+    pub fn main_host_mirror(&self) -> bool {
+        self.gpu_session.main_host_mirror
+    }
+
+    /// Whether the aux-trace host columns currently mirror the device LDE.
+    /// Always `true` today; see [`Self::main_host_mirror`].
+    #[cfg(feature = "cuda")]
+    pub fn aux_host_mirror(&self) -> bool {
+        self.gpu_session.aux_host_mirror
     }
 
     /// Consume self and return the owned column vectors.

From bdea00e1e3c1663e81220d14c46ef98ad94c972a Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 15:34:53 -0300
Subject: [PATCH 03/17] part 2

---
 crypto/math-cuda/src/device.rs |  44 +++++++++++++
 crypto/stark/src/prover.rs     | 110 +++++++++++++++++++++++++++++++--
 2 files changed, 148 insertions(+), 6 deletions(-)

diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs
index 4c12000fd..4e0dde5bb 100644
--- a/crypto/math-cuda/src/device.rs
+++ b/crypto/math-cuda/src/device.rs
@@ -118,6 +118,9 @@ pub struct Backend {
     pinned_hashes: Vec<Mutex<PinnedStaging>>,
     util_stream: Arc<CudaStream>,
     next: AtomicUsize,
+    /// VRAM budget (bytes) for table-session admission control. See
+    /// [`detect_vram_budget_bytes`].
+    vram_budget_bytes: u64,
 
     // arith.ptx
     pub vector_add_u64: CudaFunction,
@@ -202,6 +205,38 @@ fn retain_default_mempool(ctx: &CudaContext) {
     }
 }
 
+/// Device VRAM budget (bytes) for table-session admission control.
+///
+/// `LAMBDA_VM_VRAM_BUDGET_MB` overrides it explicitly — used to force-exercise
+/// the throttle in tests/benchmarks. Otherwise it is 80% of total device
+/// memory, leaving headroom for the context, module code, and the pool's
+/// retained blocks. On any query failure it returns `u64::MAX`, which disables
+/// budgeting: admission then falls back to the core-bound chunk size alone.
+fn detect_vram_budget_bytes(ctx: &CudaContext) -> u64 {
+    if let Ok(mb) = std::env::var("LAMBDA_VM_VRAM_BUDGET_MB") {
+        if let Ok(mb) = mb.parse::<u64>() {
+            return mb.saturating_mul(1024 * 1024);
+        }
+    }
+    use cudarc::driver::sys;
+    // SAFETY: raw driver query writing into two stack slots. The caller's
+    // context is already current (it was just created in `init`). Any error
+    // falls through to the budgeting-disabled sentinel.
+    unsafe {
+        let _ = ctx;
+        let mut free: usize = 0;
+        let mut total: usize = 0;
+        if sys::cuMemGetInfo_v2(&mut free as *mut usize, &mut total as *mut usize)
+            .result()
+            .is_err()
+        {
+            return u64::MAX;
+        }
+        // 80% of total, computed to avoid intermediate overflow.
+        (total as u64) / 5 * 4
+    }
+}
+
 impl Backend {
     fn init() -> Result<Self> {
         let ctx = CudaContext::new(0)?;
@@ -259,6 +294,8 @@ impl Backend {
         // Length = TWO_ADICITY + 1 to allow indexing at log_n = TWO_ADICITY.
         let max_log = GoldilocksField::TWO_ADICITY as usize + 1;
 
+        let vram_budget_bytes = detect_vram_budget_bytes(&ctx);
+
         Ok(Self {
             vector_add_u64: arith.load_function("vector_add_u64")?,
             gl_add: arith.load_function("gl_add_kernel")?,
@@ -308,9 +345,16 @@ impl Backend {
             pinned_hashes,
             util_stream,
             next: AtomicUsize::new(0),
+            vram_budget_bytes,
         })
     }
 
+    /// VRAM budget in bytes for table-session admission control. `u64::MAX`
+    /// when budgeting is disabled (query failed). See the field docs.
+    pub fn vram_budget_bytes(&self) -> u64 {
+        self.vram_budget_bytes
+    }
+
     /// Round-robin over the stream pool. Concurrent callers get different
     /// streams so their kernel launches overlap on the GPU.
     pub fn next_stream(&self) -> Arc<CudaStream> {
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index eec131be7..6b3e345b8 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -323,6 +323,59 @@ pub fn table_parallelism() -> usize {
     }
 }
 
+/// Heuristic peak device working-set for one table, in bytes.
+///
+/// Counts the LDE columns that are co-resident on the GPU — `main` in the base
+/// field (8 B) and `aux` in the ext3 field (24 B) — times a scratch multiplier
+/// for the Merkle / NTT / composition transients allocated alongside them. It
+/// is deliberately a conservative over-estimate: it gates a safety ceiling, not
+/// a precise allocator. Pass `aux_cols == 0` for phases where the aux LDE is
+/// not yet resident (the R1 main commit).
+fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) -> u64 {
+    const BYTES_PER_BASE: u64 = 8;
+    const EXT3_BYTES: u64 = 24;
+    const SCRATCH_FACTOR: u64 = 2;
+    let per_row = (main_cols as u64).saturating_mul(BYTES_PER_BASE)
+        + (aux_cols as u64).saturating_mul(EXT3_BYTES);
+    (lde_size as u64)
+        .saturating_mul(per_row)
+        .saturating_mul(SCRATCH_FACTOR)
+}
+
+/// Plan contiguous table chunks for parallel proving.
+///
+/// A chunk grows until it reaches `k` tables (the core/RAM-bound limit) **or**
+/// its summed VRAM estimate would exceed `budget` — whichever comes first. A
+/// single table larger than `budget` forms its own chunk (it runs solo rather
+/// than being excluded). With `budget == u64::MAX` this degrades exactly to
+/// fixed chunks of `k`, identical to the previous `step_by(k)` scheme — so on
+/// non-cuda builds and when VRAM isn't the binding constraint, scheduling (and
+/// therefore the proof) is unchanged. Returns `(start, end)` half-open ranges
+/// covering `0..estimates.len()` in order.
+fn plan_table_chunks(estimates: &[u64], k: usize, budget: u64) -> Vec<(usize, usize)> {
+    let n = estimates.len();
+    let k = k.max(1);
+    let budget = budget as u128;
+    let mut chunks = Vec::new();
+    let mut start = 0;
+    while start < n {
+        let mut end = start;
+        let mut acc: u128 = 0;
+        while end < n {
+            let next = estimates[end] as u128;
+            // Always admit at least one table per chunk (oversized → solo).
+            if end > start && (end - start >= k || acc + next > budget) {
+                break;
+            }
+            acc += next;
+            end += 1;
+        }
+        chunks.push((start, end));
+        start = end;
+    }
+    chunks
+}
+
 /// A container for the results of the second round of the STARK Prove protocol.
 pub(crate) struct Round2<F>
 where
@@ -1726,6 +1779,33 @@ pub trait IsStarkProver<
 
         let k = table_parallelism().min(num_airs).max(1);
 
+        // VRAM-budgeted admission. The budget caps the summed device working-set
+        // of the tables proved concurrently so large blocks don't exhaust VRAM.
+        // It is an *additional* ceiling on top of `k` (it never raises
+        // concurrency): on non-cuda builds, or when the budget can't be queried,
+        // it is `u64::MAX` and chunking falls back to fixed size `k`.
+        #[cfg(feature = "cuda")]
+        let vram_budget = math_cuda::device::backend()
+            .map(|b| b.vram_budget_bytes())
+            .unwrap_or(u64::MAX);
+        #[cfg(not(feature = "cuda"))]
+        let vram_budget = u64::MAX;
+
+        // R1 main commit: only the main LDE (+ its Merkle scratch) is resident,
+        // so the aux columns contribute nothing to this phase's working-set.
+        let main_chunks = {
+            let estimates: Vec<u64> = air_trace_pairs
+                .iter()
+                .enumerate()
+                .map(|(idx, (_, trace, _))| {
+                    let lde_size = domains[idx].interpolation_domain_size
+                        * domains[idx].blowup_factor;
+                    estimate_table_vram_bytes(trace.num_main_columns, 0, lde_size)
+                })
+                .collect();
+            plan_table_chunks(&estimates, k, vram_budget)
+        };
+
         // Spill main traces to mmap before Round 1 LDE.
         #[cfg(feature = "disk-spill")]
         if storage_mode == StorageMode::Disk {
@@ -1770,8 +1850,7 @@ pub trait IsStarkProver<
         let mut main_gpu_handles: Vec<Option<math_cuda::lde::GpuLdeBase>> =
             Vec::with_capacity(num_airs);
 
-        for chunk_start in (0..num_airs).step_by(k) {
-            let chunk_end = (chunk_start + k).min(num_airs);
+        for &(chunk_start, chunk_end) in &main_chunks {
             let chunk_range = chunk_start..chunk_end;
 
             #[cfg(feature = "parallel")]
@@ -1929,8 +2008,28 @@ pub trait IsStarkProver<
         #[allow(clippy::type_complexity)]
         let mut aux_results: Vec<AuxResult<FieldExtension>> = Vec::with_capacity(num_airs);
 
-        for chunk_start in (0..num_airs).step_by(k) {
-            let chunk_end = (chunk_start + k).min(num_airs);
+        // R1 aux commit and rounds 2–4 share the peak working-set: the main and
+        // aux LDEs are co-resident, plus the composition / Merkle transients
+        // (folded into the scratch factor). `num_aux_columns` is now populated
+        // by the aux build above, so this estimate is accurate for both phases.
+        let peak_chunks = {
+            let estimates: Vec<u64> = air_trace_pairs
+                .iter()
+                .enumerate()
+                .map(|(idx, (_, trace, _))| {
+                    let lde_size = domains[idx].interpolation_domain_size
+                        * domains[idx].blowup_factor;
+                    estimate_table_vram_bytes(
+                        trace.num_main_columns,
+                        trace.num_aux_columns,
+                        lde_size,
+                    )
+                })
+                .collect();
+            plan_table_chunks(&estimates, k, vram_budget)
+        };
+
+        for &(chunk_start, chunk_end) in &peak_chunks {
             let chunk_range = chunk_start..chunk_end;
 
             #[cfg(feature = "parallel")]
@@ -2113,8 +2212,7 @@ pub trait IsStarkProver<
 
         let mut proofs = Vec::with_capacity(num_airs);
         let mut lde_drain = cached_ldes.into_iter();
-        for chunk_start in (0..num_airs).step_by(k) {
-            let chunk_end = (chunk_start + k).min(num_airs);
+        for &(chunk_start, chunk_end) in &peak_chunks {
             let chunk_size = chunk_end - chunk_start;
 
             let chunk_ldes: Vec<Lde<Field, FieldExtension>> =

From e9dcda8c7cbed9a11ed71f3278871c3d5c528687 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 16:00:17 -0300
Subject: [PATCH 04/17] composition parts fold

---
 crypto/stark/src/gpu_lde.rs | 18 ++++++++---
 crypto/stark/src/prover.rs  | 30 +++++++++----------
 crypto/stark/src/trace.rs   | 59 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 87 insertions(+), 20 deletions(-)

diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 36756b40b..6c6e725a1 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -1421,13 +1421,18 @@ pub(crate) fn try_inv_denoms_dev_with_stream<F, E>(
     coset_base: &[FieldElement<F>],
     z_scalars: &[FieldElement<E>],
     sign: math_cuda::inverse::DenomSign,
+    bound_stream: Option<Arc<CudaStream>>,
 ) -> Option<(CudaSlice<u64>, Arc<CudaStream>)>
 where
     F: IsField + 'static,
     E: IsField + 'static,
 {
-    let be = math_cuda::device::backend().ok()?;
-    let stream = be.next_stream();
+    // Use the caller's per-table session stream when provided, so this table's
+    // R3/R4 device chain serialises on one queue; otherwise grab a pool stream.
+    let stream = match bound_stream {
+        Some(s) => s,
+        None => math_cuda::device::backend().ok()?.next_stream(),
+    };
     let handle =
         try_compute_and_invert_inv_denoms_dev::<F, E>(coset_base, z_scalars, sign, &stream)?;
     Some((handle, stream))
@@ -1456,6 +1461,7 @@ pub(crate) struct R3DevContext {
 pub(crate) fn try_prep_r3_dev_context<F, E>(
     coset_base: &[FieldElement<F>],
     z_scalars: &[FieldElement<E>],
+    bound_stream: Option<Arc<CudaStream>>,
 ) -> Option<R3DevContext>
 where
     F: IsField + 'static,
@@ -1477,8 +1483,12 @@ where
         return None;
     }
 
-    let be = math_cuda::device::backend().ok()?;
-    let stream = be.next_stream();
+    // Per-table session stream when provided (shares the queue with R4 DEEP for
+    // this table); otherwise a pool stream.
+    let stream = match bound_stream {
+        Some(s) => s,
+        None => math_cuda::device::backend().ok()?.next_stream(),
+    };
 
     // SAFETY: F == Goldilocks per TypeId check; FieldElement<F> is
     // #[repr(transparent)] over u64.
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 6b3e345b8..2e1987b07 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -388,13 +388,6 @@ where
     pub(crate) composition_poly_merkle_tree: BatchedMerkleTree<F>,
     /// The commitment to the composition polynomial parts.
     pub(crate) composition_poly_root: Commitment,
-    /// Device-resident de-interleaved LDE handle from the R2 fused GPU path
-    /// (`try_evaluate_parts_on_lde_gpu_keep`). When present, R4 DEEP skips
-    /// the `num_parts * 3 * lde_size * 8` byte H2D and reads parts on
-    /// device. `None` when the GPU R2 path didn't run (number_of_parts <= 2,
-    /// below threshold, or any CPU fallback).
-    #[cfg(feature = "cuda")]
-    pub(crate) gpu_composition_parts: Option<math_cuda::lde::GpuLdeExt3>,
 }
 
 /// A container for the results of the third round of the STARK Prove protocol.
@@ -1037,7 +1030,7 @@ pub trait IsStarkProver<
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         pub_inputs: &PI,
         domain: &Domain<Field>,
-        round_1_result: &Round1<Field, FieldExtension>,
+        round_1_result: &mut Round1<Field, FieldExtension>,
         transition_coefficients: &[FieldElement<FieldExtension>],
         boundary_coefficients: &[FieldElement<FieldExtension>],
     ) -> Result<Round2<FieldExtension>, ProvingError>
@@ -1160,12 +1153,18 @@ pub trait IsStarkProver<
         #[cfg(feature = "instruments")]
         crate::instruments::store_r2_sub(constraints_dur, fft_dur, merkle_dur);
 
+        // Fold the R2 device composition-parts handle into the per-table session
+        // (resident R2→R4). The host evaluations remain in `Round2` as the
+        // mirror R4 openings still read; `composition_host_mirror` flags that.
+        #[cfg(feature = "cuda")]
+        if let Some(handle) = gpu_composition_parts {
+            round_1_result.lde_trace.set_gpu_composition_parts(handle);
+        }
+
         Ok(Round2 {
             lde_composition_poly_evaluations: lde_composition_poly_parts_evaluations,
             composition_poly_merkle_tree,
             composition_poly_root,
-            #[cfg(feature = "cuda")]
-            gpu_composition_parts,
         })
     }
 
@@ -1425,11 +1424,12 @@ pub trait IsStarkProver<
                     &domain.lde_roots_of_unity_coset,
                     &z_scalars,
                     math_cuda::inverse::DenomSign::XMinusZ,
+                    lde_trace.bound_stream(),
                 )
                 && let Some(deep_evals) =
                     crate::gpu_lde::try_deep_composition_gpu::<Field, FieldExtension>(
                         lde_trace,
-                        round_2_result.gpu_composition_parts.as_ref(),
+                        lde_trace.gpu_composition_parts(),
                         &round_2_result.lde_composition_poly_evaluations,
                         h_ood,
                         &trace_ood_columns,
@@ -1465,7 +1465,7 @@ pub trait IsStarkProver<
             if let Some(deep_evals) =
                 crate::gpu_lde::try_deep_composition_gpu::<Field, FieldExtension>(
                     lde_trace,
-                    round_2_result.gpu_composition_parts.as_ref(),
+                    lde_trace.gpu_composition_parts(),
                     &round_2_result.lde_composition_poly_evaluations,
                     h_ood,
                     &trace_ood_columns,
@@ -2244,7 +2244,7 @@ pub trait IsStarkProver<
                     let table_start = Instant::now();
 
                     // Build Round1 from cached LDE (consumed by value, no recomputation).
-                    let round_1_result =
+                    let mut round_1_result =
                         commitment.build_round1(lde, air.step_size(), domain.blowup_factor);
 
                     if let Some(ref bpi) = round_1_result.bus_public_inputs {
@@ -2254,7 +2254,7 @@ pub trait IsStarkProver<
                     let proof = Self::prove_rounds_2_to_4(
                         *air,
                         *pub_inputs,
-                        &round_1_result,
+                        &mut round_1_result,
                         table_transcript,
                         domain,
                     )?;
@@ -2343,7 +2343,7 @@ pub trait IsStarkProver<
     fn prove_rounds_2_to_4(
         air: &dyn AIR<Field = Field, FieldExtension = FieldExtension, PublicInputs = PI>,
         pub_inputs: &PI,
-        round_1_result: &Round1<Field, FieldExtension>,
+        round_1_result: &mut Round1<Field, FieldExtension>,
         transcript: &mut (impl IsStarkTranscript<FieldExtension, Field> + Clone),
         domain: &Domain<Field>,
     ) -> Result<StarkProof<Field, FieldExtension, PI>, ProvingError>
diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs
index 39e968c7f..d70a7e664 100644
--- a/crypto/stark/src/trace.rs
+++ b/crypto/stark/src/trace.rs
@@ -7,6 +7,8 @@ use math::polynomial::barycentric_inv_denoms;
 use math::spill_safe::SpillSafe;
 #[cfg(feature = "parallel")]
 use rayon::prelude::{IntoParallelIterator, ParallelIterator};
+#[cfg(feature = "cuda")]
+use std::sync::{Arc, OnceLock};
 
 /// A two-dimensional representation of an execution trace of the STARK
 /// protocol.
@@ -224,12 +226,29 @@ pub(crate) struct GpuTableSession {
     main_lde: Option<math_cuda::lde::GpuLdeBase>,
     /// Aux-trace LDE (ext3 de-interleaved layout on device), resident R1→R4.
     aux_lde: Option<math_cuda::lde::GpuLdeExt3>,
+    /// Composition-poly parts LDE (ext3 de-interleaved on device), produced in
+    /// R2 and resident R2→R4 so R4 DEEP reads the parts on-device instead of a
+    /// `num_parts * 3 * lde_size * 8` byte H2D. `None` when the R2 GPU path
+    /// didn't run (number_of_parts <= 2, below threshold, or CPU fallback).
+    composition_parts: Option<math_cuda::lde::GpuLdeExt3>,
     /// Whether the main-trace host columns currently mirror `main_lde`.
     /// Always `true` today; CPU consumers depend on it.
     main_host_mirror: bool,
     /// Whether the aux-trace host columns currently mirror `aux_lde`.
     /// Always `true` today; CPU consumers depend on it.
     aux_host_mirror: bool,
+    /// Whether the host composition-parts evaluations (`Round2`) mirror
+    /// `composition_parts`. Always `true` today; R4 openings read host.
+    composition_host_mirror: bool,
+    /// Stream bound to this table's GPU work, acquired lazily from the backend
+    /// pool on first use and cached for the session's lifetime. The R3/R4
+    /// device-resident chain (inv_denoms → barycentric/OOD → DEEP) runs on it
+    /// today; the heavy LDE/Merkle ops join once they thread a stream. Binding
+    /// one stream per table serialises a table's kernels on a single queue and
+    /// gives distinct tables distinct streams — the prerequisite for cross-table
+    /// overlap once the host mirrors drop (Steps 4–5). `None` is cached if the
+    /// backend is unavailable, so callers fall back to the CPU path.
+    stream: OnceLock<Option<Arc<math_cuda::CudaStream>>>,
 }
 
 #[cfg(feature = "cuda")]
@@ -238,11 +257,14 @@ impl GpuTableSession {
         Self {
             main_lde: None,
             aux_lde: None,
+            composition_parts: None,
             // Host columns are always materialised today; the CPU consumers
             // (constraint eval, OOD, openings) read them. Steps 5/6 flip these
             // off as each consumer moves to reading the device buffer.
             main_host_mirror: true,
             aux_host_mirror: true,
+            composition_host_mirror: true,
+            stream: OnceLock::new(),
         }
     }
 }
@@ -311,6 +333,37 @@ where
         self.gpu_session.aux_host_mirror
     }
 
+    /// Attach the device-resident composition-poly parts LDE produced in R2.
+    /// Read by R4 DEEP so the parts aren't re-uploaded H2D.
+    #[cfg(feature = "cuda")]
+    pub fn set_gpu_composition_parts(&mut self, h: math_cuda::lde::GpuLdeExt3) {
+        self.gpu_session.composition_parts = Some(h);
+    }
+
+    #[cfg(feature = "cuda")]
+    pub fn gpu_composition_parts(&self) -> Option<&math_cuda::lde::GpuLdeExt3> {
+        self.gpu_session.composition_parts.as_ref()
+    }
+
+    /// Whether the host composition-parts evaluations mirror the device buffer.
+    /// Always `true` today; see [`Self::main_host_mirror`].
+    #[cfg(feature = "cuda")]
+    pub fn composition_host_mirror(&self) -> bool {
+        self.gpu_session.composition_host_mirror
+    }
+
+    /// The stream bound to this table's GPU work. Acquired lazily from the
+    /// backend pool on first call and cached for the session's lifetime, so all
+    /// of a table's stream-threaded ops share one queue. Returns `None` (cached)
+    /// when the backend is unavailable; callers then fall back to the CPU path.
+    #[cfg(feature = "cuda")]
+    pub fn bound_stream(&self) -> Option<Arc<math_cuda::CudaStream>> {
+        self.gpu_session
+            .stream
+            .get_or_init(|| math_cuda::device::backend().ok().map(|b| b.next_stream()))
+            .clone()
+    }
+
     /// Consume self and return the owned column vectors.
     #[allow(clippy::type_complexity)]
     pub fn into_columns(self) -> (Vec<Vec<FieldElement<F>>>, Vec<Vec<FieldElement<E>>>) {
@@ -438,7 +491,11 @@ where
     // both via offset, with no per-eval-point or per-{main,aux} H2D.
     #[cfg(feature = "cuda")]
     let r3_ctx: Option<crate::gpu_lde::R3DevContext> =
-        crate::gpu_lde::try_prep_r3_dev_context::<F, E>(&dc.points, &evaluation_points);
+        crate::gpu_lde::try_prep_r3_dev_context::<F, E>(
+            &dc.points,
+            &evaluation_points,
+            lde_trace.bound_stream(),
+        );
     #[allow(unused_variables)]
     #[cfg(not(feature = "cuda"))]
     let r3_ctx: Option<()> = None;

From b08ef819675ff10e04060ecd2c15834e168c0115 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 16:10:16 -0300
Subject: [PATCH 05/17] fix

---
 crypto/math-cuda/src/device.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs
index 4e0dde5bb..9ea480b9f 100644
--- a/crypto/math-cuda/src/device.rs
+++ b/crypto/math-cuda/src/device.rs
@@ -213,10 +213,10 @@ fn retain_default_mempool(ctx: &CudaContext) {
 /// retained blocks. On any query failure it returns `u64::MAX`, which disables
 /// budgeting: admission then falls back to the core-bound chunk size alone.
 fn detect_vram_budget_bytes(ctx: &CudaContext) -> u64 {
-    if let Ok(mb) = std::env::var("LAMBDA_VM_VRAM_BUDGET_MB") {
-        if let Ok(mb) = mb.parse::<u64>() {
-            return mb.saturating_mul(1024 * 1024);
-        }
+    if let Ok(mb) = std::env::var("LAMBDA_VM_VRAM_BUDGET_MB")
+        && let Ok(mb) = mb.parse::<u64>()
+    {
+        return mb.saturating_mul(1024 * 1024);
     }
     use cudarc::driver::sys;
     // SAFETY: raw driver query writing into two stack slots. The caller's

From 8ea8a0af1b1f9e445e1ad662768885a97b31a8a8 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 16:11:38 -0300
Subject: [PATCH 06/17] fix_doc

---
 crypto/stark/src/trace.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs
index d70a7e664..c7a9a941a 100644
--- a/crypto/stark/src/trace.rs
+++ b/crypto/stark/src/trace.rs
@@ -214,9 +214,8 @@ where
 /// steps flip: once a consumer reads from the device buffer instead, the
 /// corresponding mirror is dropped and the H2D/D2H copy elided.
 ///
-/// Scope: this owns the main/aux trace LDE (resident R1→R4). The composition
-/// parts (`Round2`, R2→R4) and the bound stream are folded in by the control-
-/// plane commit, where the R2→R4 borrow chain becomes mutable. The R4-local
+/// Scope: this owns the main/aux trace LDE (resident R1→R4), the composition
+/// parts (`Round2`, R2→R4), and the per-table bound stream. The R4-local
 /// `inv_denoms`/FRI state are created and consumed within R4 and stay local.
 #[cfg(feature = "cuda")]
 pub(crate) struct GpuTableSession {

From caa7efdd02218a651a166caf8a7e713fb270a2c9 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 17:10:46 -0300
Subject: [PATCH 07/17]  move merkle tree to gpu

---
 crypto/math-cuda/kernels/keccak.cu            | 34 ++++++++
 crypto/math-cuda/src/device.rs                |  2 +
 crypto/math-cuda/src/lde.rs                   | 40 +++++++++
 crypto/math-cuda/src/merkle.rs                | 56 +++++++++++++
 crypto/math-cuda/tests/barycentric_strided.rs |  1 +
 crypto/math-cuda/tests/deep.rs                |  1 +
 crypto/math-cuda/tests/merkle_gather.rs       | 84 +++++++++++++++++++
 crypto/stark/src/gpu_lde.rs                   | 41 ++++++++-
 crypto/stark/src/prover.rs                    | 80 +++++++++++++++++-
 9 files changed, 334 insertions(+), 5 deletions(-)
 create mode 100644 crypto/math-cuda/tests/merkle_gather.rs

diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu
index c22bc4d05..bcaf519f4 100644
--- a/crypto/math-cuda/kernels/keccak.cu
+++ b/crypto/math-cuda/kernels/keccak.cu
@@ -347,3 +347,37 @@ extern "C" __global__ void keccak_merkle_level(
 
     finalize_keccak256(st, rate_pos, nodes + (parent_begin + tid) * 32);
 }
+
+// Gather Merkle authentication paths for a batch of leaf positions, reading the
+// resident tree `nodes` (32-byte nodes; layout: inner nodes [0..leaves_len-1],
+// root at 0, leaves at [leaves_len-1..]). One thread per query walks leaf->root,
+// writing each sibling node into the output. This mirrors the CPU
+// `build_merkle_path` exactly (sibling_index / parent_index in
+// crypto/crypto/src/merkle_tree/utils.rs):
+//   leaf node = pos + leaves_len - 1
+//   sibling   = node even ? node-1 : node+1
+//   parent    = node even ? (node-1)/2 : node/2
+// so `out[(q*depth + level)*32 .. +32]` is the level-th sibling for query q.
+extern "C" __global__ void merkle_gather_paths(
+    const uint8_t *nodes,
+    const uint32_t *positions,   // leaf positions, length num_queries
+    uint32_t num_queries,
+    uint64_t leaves_len,
+    uint32_t depth,              // = log2(leaves_len)
+    uint8_t *out) {              // num_queries * depth * 32 bytes
+    uint32_t q = blockIdx.x * blockDim.x + threadIdx.x;
+    if (q >= num_queries) return;
+
+    uint64_t node = (uint64_t)positions[q] + leaves_len - 1;
+    for (uint32_t level = 0; level < depth; ++level) {
+        uint64_t sib = (node & 1ull) ? (node + 1ull) : (node - 1ull);
+        // 32-byte nodes at 32-byte-aligned offsets (cuMemAlloc 256-aligned),
+        // so the u64 copy is safe.
+        const uint64_t *src = reinterpret_cast<const uint64_t *>(nodes + sib * 32);
+        uint64_t *dst = reinterpret_cast<uint64_t *>(
+            out + ((uint64_t)q * depth + level) * 32);
+        #pragma unroll
+        for (int i = 0; i < 4; ++i) dst[i] = src[i];
+        node = (node & 1ull) ? (node >> 1) : ((node - 1ull) >> 1);
+    }
+}
diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs
index 9ea480b9f..6b6094d45 100644
--- a/crypto/math-cuda/src/device.rs
+++ b/crypto/math-cuda/src/device.rs
@@ -150,6 +150,7 @@ pub struct Backend {
     pub keccak_comp_poly_leaves_ext3: CudaFunction,
     pub keccak_fri_leaves_ext3: CudaFunction,
     pub keccak_merkle_level: CudaFunction,
+    pub merkle_gather_paths: CudaFunction,
 
     // barycentric.ptx
     pub barycentric_base_batched: CudaFunction,
@@ -320,6 +321,7 @@ impl Backend {
             keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?,
             keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?,
             keccak_merkle_level: keccak.load_function("keccak_merkle_level")?,
+            merkle_gather_paths: keccak.load_function("merkle_gather_paths")?,
             barycentric_base_batched: bary.load_function("barycentric_base_batched")?,
             barycentric_ext3_batched: bary.load_function("barycentric_ext3_batched")?,
             barycentric_base_batched_strided: bary
diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index ee5dc3fce..f65d17a35 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -219,11 +219,17 @@ fn launch_pointwise_mul_batched(
 /// Handle to a base-field LDE kept live on device after R1 commit.
 /// Layout: `m` columns, each `lde_size` u64s, column `c` at byte offset
 /// `c * lde_size * 8` within `buf`. Freed when `buf` Arc drops.
+///
+/// `tree` optionally carries the main-trace Merkle tree kept resident on device
+/// (populated by the keep path), so R4 query openings can gather authentication
+/// paths on device instead of D2H'ing the whole tree to host. `None` on the CPU
+/// path or when the tree wasn't retained.
 #[derive(Clone)]
 pub struct GpuLdeBase {
     pub buf: Arc<CudaSlice<u64>>,
     pub m: usize,
     pub lde_size: usize,
+    pub tree: Option<GpuMerkleTree>,
 }
 
 /// Handle to an ext3 LDE kept live on device, de-interleaved into 3 base
@@ -236,6 +242,18 @@ pub struct GpuLdeExt3 {
     pub lde_size: usize,
 }
 
+/// Handle to a Merkle tree kept live on device after a commit, so query
+/// openings can gather authentication paths on device instead of D2H'ing the
+/// whole tree to host. Node layout matches the CPU tree
+/// (`crypto/crypto/src/merkle_tree`): `nodes[0..leaves_len-1]` are inner nodes
+/// (root at index 0), `nodes[leaves_len-1..]` are the leaves; each node is 32
+/// bytes. Freed when the `nodes` Arc drops.
+#[derive(Clone)]
+pub struct GpuMerkleTree {
+    pub nodes: Arc<CudaSlice<u8>>,
+    pub leaves_len: usize,
+}
+
 pub fn coset_lde_base(evals: &[u64], blowup_factor: usize, weights: &[u64]) -> Result<Vec<u64>> {
     let n = evals.len();
     // Empty input must short-circuit before the power-of-two assert
@@ -613,6 +631,7 @@ pub fn coset_lde_batch_base_into_with_leaf_hash(
         hashed_leaves_out,
         KeccakCommit::LeavesOnly,
         false,
+        false,
     )
     .map(|_| ())
 }
@@ -640,6 +659,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         false,
+        false,
     )
     .map(|_| ())
 }
@@ -654,6 +674,10 @@ pub fn coset_lde_batch_base_into_with_merkle_tree_keep(
     outputs: &mut [&mut [u64]],
     merkle_nodes_out: &mut [u8],
 ) -> Result<GpuLdeBase> {
+    // `keep_tree = true`: the main-trace tree nodes are retained on device
+    // (inside the returned `GpuLdeBase.tree`) so R4 openings can gather paths on
+    // device. The host `merkle_nodes_out` is still filled, so host-tree openings
+    // keep working until a later step drops the host tree.
     let opt = coset_lde_batch_base_into_with_merkle_tree_inner(
         columns,
         blowup_factor,
@@ -662,11 +686,13 @@ pub fn coset_lde_batch_base_into_with_merkle_tree_keep(
         merkle_nodes_out,
         KeccakCommit::FullTree,
         true,
+        true,
     )?;
     let handle = opt.expect("keep_device_buf=true must return Some");
     Ok(handle)
 }
 
+#[allow(clippy::too_many_arguments)]
 fn coset_lde_batch_base_into_with_merkle_tree_inner(
     columns: &[&[u64]],
     blowup_factor: usize,
@@ -675,6 +701,7 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     nodes_out: &mut [u8],
     commit: KeccakCommit,
     keep_device_buf: bool,
+    keep_tree: bool,
 ) -> Result<Option<GpuLdeBase>> {
     if columns.is_empty() {
         assert_eq!(outputs.len(), 0);
@@ -817,13 +844,26 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     drop(staging);
 
     if keep_device_buf {
+        // Retain the device tree nodes for on-device opening (FullTree only; in
+        // LeavesOnly mode there are no inner nodes to gather paths from).
+        let tree = if keep_tree && commit == KeccakCommit::FullTree {
+            Some(GpuMerkleTree {
+                nodes: Arc::new(nodes_dev),
+                leaves_len: lde_size,
+            })
+        } else {
+            drop(nodes_dev);
+            None
+        };
         Ok(Some(GpuLdeBase {
             buf: Arc::new(buf),
             m,
             lde_size,
+            tree,
         }))
     } else {
         drop(buf);
+        drop(nodes_dev);
         Ok(None)
     }
 }
diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index 932e81325..1c1e54093 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -17,6 +17,7 @@
 //! to match `FieldElement::<Ext3>::write_bytes_be`.
 
 use cudarc::driver::{CudaSlice, CudaStream, CudaViewMut, LaunchConfig, PushKernelArg};
+use std::sync::Arc;
 
 use crate::Result;
 use crate::device::{Backend, backend};
@@ -217,6 +218,61 @@ pub fn build_merkle_tree_on_device(hashed_leaves: &[u8]) -> Result<Vec<u8>> {
     Ok(out)
 }
 
+/// Gather Merkle authentication paths on device for `positions` (leaf indices)
+/// against the resident tree `nodes_dev` (standard layout, `2*leaves_len-1`
+/// nodes of 32 bytes). Returns `positions.len() * depth * 32` bytes, where
+/// `depth = log2(leaves_len)`: query `q`'s path occupies bytes
+/// `[q*depth*32 .. (q+1)*depth*32]`, each 32-byte node a sibling from leaf to
+/// root — byte-for-byte the same nodes the CPU `MerkleTree::get_proof_by_pos`
+/// (`build_merkle_path`) collects. Runs on the caller's `stream` (pass the
+/// table's session stream so it shares the queue with the rest of R4).
+pub fn gather_merkle_paths_dev(
+    nodes_dev: &CudaSlice<u8>,
+    leaves_len: usize,
+    positions: &[u32],
+    stream: &Arc<CudaStream>,
+) -> Result<Vec<u8>> {
+    let num_queries = positions.len();
+    if num_queries == 0 {
+        return Ok(Vec::new());
+    }
+    assert!(
+        leaves_len.is_power_of_two() && leaves_len >= 2,
+        "leaves_len must be a power of two >= 2"
+    );
+    let depth = leaves_len.trailing_zeros() as usize;
+    let be = backend()?;
+
+    let pos_dev = stream.clone_htod(positions)?;
+    // SAFETY: every byte of `out` is written by the kernel below (one 32-byte
+    // node per (query, level)) before the D2H reads it back.
+    let mut out = unsafe { stream.alloc::<u8>(num_queries * depth * 32) }?;
+
+    let grid = (num_queries as u32).div_ceil(KECCAK_BLOCK_DIM);
+    let cfg = LaunchConfig {
+        grid_dim: (grid, 1, 1),
+        block_dim: (KECCAK_BLOCK_DIM, 1, 1),
+        shared_mem_bytes: 0,
+    };
+    let num_queries_u32 = num_queries as u32;
+    let leaves_len_u64 = leaves_len as u64;
+    let depth_u32 = depth as u32;
+    unsafe {
+        stream
+            .launch_builder(&be.merkle_gather_paths)
+            .arg(nodes_dev)
+            .arg(&pos_dev)
+            .arg(&num_queries_u32)
+            .arg(&leaves_len_u64)
+            .arg(&depth_u32)
+            .arg(&mut out)
+            .launch(cfg)?;
+    }
+    let host = stream.clone_dtoh(&out)?;
+    stream.synchronize()?;
+    Ok(host)
+}
+
 /// Row-pair Keccak leaf + Merkle tree build for R2 composition-polynomial
 /// commit. `parts_interleaved` is `num_parts` slices, each holding an ext3
 /// LDE column interleaved as `[a0,a1,a2, b0,b1,b2, ...]` of length `3*lde_size`.
diff --git a/crypto/math-cuda/tests/barycentric_strided.rs b/crypto/math-cuda/tests/barycentric_strided.rs
index 653ef4e38..b1963522f 100644
--- a/crypto/math-cuda/tests/barycentric_strided.rs
+++ b/crypto/math-cuda/tests/barycentric_strided.rs
@@ -49,6 +49,7 @@ fn run_base(log_trace: u32, blowup: usize, num_cols: usize, seed: u64) {
         buf: Arc::new(lde_dev),
         m: num_cols,
         lde_size,
+        tree: None,
     };
 
     // Pre-strided buffer for non-strided reference: trace-size picks of each col.
diff --git a/crypto/math-cuda/tests/deep.rs b/crypto/math-cuda/tests/deep.rs
index 8499cd04a..57ea14d3c 100644
--- a/crypto/math-cuda/tests/deep.rs
+++ b/crypto/math-cuda/tests/deep.rs
@@ -177,6 +177,7 @@ fn run_parity(
         buf: Arc::new(main_dev),
         m: num_main,
         lde_size,
+        tree: None,
     };
     let aux_handle = if num_aux > 0 {
         Some(GpuLdeExt3 {
diff --git a/crypto/math-cuda/tests/merkle_gather.rs b/crypto/math-cuda/tests/merkle_gather.rs
new file mode 100644
index 000000000..b2ba24c1d
--- /dev/null
+++ b/crypto/math-cuda/tests/merkle_gather.rs
@@ -0,0 +1,84 @@
+//! Parity: GPU `gather_merkle_paths_dev` must produce, for each leaf position,
+//! the exact `merkle_path` the CPU `MerkleTree::get_proof_by_pos` returns —
+//! same sibling-from-leaf-to-root order, byte-for-byte. This is the gate for
+//! gathering R4 query openings on device instead of D2H'ing the whole tree.
+
+use crypto::merkle_tree::backends::field_element_vector::FieldElementVectorBackend;
+use crypto::merkle_tree::merkle::MerkleTree;
+use math::field::goldilocks::GoldilocksField;
+use rand::{Rng, SeedableRng};
+use rand_chacha::ChaCha8Rng;
+use sha3::Keccak256;
+
+type CpuTree = MerkleTree<FieldElementVectorBackend<GoldilocksField, Keccak256, 32>>;
+
+fn run_gather_parity(log_n: u32, seed: u64) {
+    let leaves_len = 1usize << log_n;
+    let mut rng = ChaCha8Rng::seed_from_u64(seed);
+    let leaves: Vec<[u8; 32]> = (0..leaves_len)
+        .map(|_| {
+            let mut arr = [0u8; 32];
+            rng.fill(&mut arr[..]);
+            arr
+        })
+        .collect();
+
+    let mut flat = Vec::with_capacity(leaves_len * 32);
+    for l in &leaves {
+        flat.extend_from_slice(l);
+    }
+
+    // Build the tree on device, then upload its nodes back as the resident
+    // buffer the gather reads (build_merkle_tree_on_device returns host bytes).
+    let gpu_nodes_bytes = math_cuda::merkle::build_merkle_tree_on_device(&flat).unwrap();
+
+    // CPU reference tree over the same backend as the prover.
+    let cpu_tree = CpuTree::build_from_hashed_leaves(leaves).unwrap();
+
+    // Query a spread of positions: first, last, and random interior ones.
+    let mut positions: Vec<u32> = vec![0, (leaves_len - 1) as u32];
+    let mut r = ChaCha8Rng::seed_from_u64(seed ^ 0xabcd);
+    for _ in 0..16usize.min(leaves_len) {
+        positions.push(r.gen_range(0..leaves_len) as u32);
+    }
+
+    let be = math_cuda::device::backend().unwrap();
+    let stream = be.next_stream();
+    let nodes_dev = stream.clone_htod(&gpu_nodes_bytes).unwrap();
+    stream.synchronize().unwrap();
+
+    let depth = log_n as usize;
+    let paths =
+        math_cuda::merkle::gather_merkle_paths_dev(&nodes_dev, leaves_len, &positions, &stream)
+            .unwrap();
+    assert_eq!(paths.len(), positions.len() * depth * 32);
+
+    for (q, &pos) in positions.iter().enumerate() {
+        let cpu_proof = cpu_tree.get_proof_by_pos(pos as usize).unwrap();
+        assert_eq!(
+            cpu_proof.merkle_path.len(),
+            depth,
+            "depth mismatch at log_n={log_n} pos={pos}"
+        );
+        for (level, cpu_node) in cpu_proof.merkle_path.iter().enumerate() {
+            let g = &paths[(q * depth + level) * 32..(q * depth + level + 1) * 32];
+            assert_eq!(
+                g,
+                &cpu_node[..],
+                "path node mismatch: log_n={log_n} pos={pos} level={level}"
+            );
+        }
+    }
+}
+
+#[test]
+fn merkle_gather_small() {
+    for log_n in 1u32..=6 {
+        run_gather_parity(log_n, 200 + log_n as u64);
+    }
+}
+
+#[test]
+fn merkle_gather_large() {
+    run_gather_parity(18, 7777);
+}
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 6c6e725a1..f6c85553e 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -16,6 +16,7 @@ use math_cuda::{CudaSlice, CudaStream};
 
 use crypto::fiat_shamir::is_transcript::IsStarkTranscript;
 use crypto::merkle_tree::merkle::MerkleTree;
+use crypto::merkle_tree::proof::Proof;
 use crypto::merkle_tree::traits::IsMerkleTreeBackend;
 use math::field::element::FieldElement;
 use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField;
@@ -23,7 +24,7 @@ use math::field::goldilocks::GoldilocksField;
 use math::field::traits::{IsFFTField, IsField, IsSubFieldOf};
 use math::traits::AsBytes;
 
-use crate::config::FriLayerMerkleTreeBackend;
+use crate::config::{Commitment, FriLayerMerkleTreeBackend};
 use crate::domain::Domain;
 use crate::fri::fri_commitment::FriLayer;
 use crate::fri::fri_functions::compute_coset_twiddles_inv;
@@ -1438,6 +1439,44 @@ where
     Some((handle, stream))
 }
 
+/// Gather Merkle authentication paths on device for `positions` (leaf indices),
+/// returning one [`Proof`] per position in the same order. Byte-identical to
+/// the host `MerkleTree::get_proof_by_pos` (guarded by the `merkle_gather`
+/// parity test), so R4 query openings can source proofs from the resident
+/// device tree instead of the host tree. Returns `None` on any cudarc error
+/// (the caller then falls back to the host tree).
+pub(crate) fn gather_proofs_dev(
+    tree: &math_cuda::lde::GpuMerkleTree,
+    positions: &[usize],
+    stream: &Arc<CudaStream>,
+) -> Option<Vec<Proof<Commitment>>> {
+    if positions.is_empty() {
+        return Some(Vec::new());
+    }
+    let positions_u32: Vec<u32> = positions.iter().map(|&p| p as u32).collect();
+    let bytes = math_cuda::merkle::gather_merkle_paths_dev(
+        &tree.nodes,
+        tree.leaves_len,
+        &positions_u32,
+        stream,
+    )
+    .ok()?;
+    let depth = tree.leaves_len.trailing_zeros() as usize;
+    debug_assert_eq!(bytes.len(), positions.len() * depth * 32);
+    let mut proofs = Vec::with_capacity(positions.len());
+    for q in 0..positions.len() {
+        let mut merkle_path = Vec::with_capacity(depth);
+        for level in 0..depth {
+            let off = (q * depth + level) * 32;
+            let mut node: Commitment = [0u8; 32];
+            node.copy_from_slice(&bytes[off..off + 32]);
+            merkle_path.push(node);
+        }
+        proofs.push(Proof { merkle_path });
+    }
+    Some(proofs)
+}
+
 /// R3 OOD device-side context: bundles the inverted denominators, the
 /// coset_points upload (used by every barycentric kernel for this batch),
 /// and the stream so producer + consumers serialize naturally. Hoisting
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 2e1987b07..25d63cd2d 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -35,6 +35,8 @@ use crate::trace::LDETraceTable;
 
 use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
 use super::constraints::evaluator::ConstraintEvaluator;
+#[cfg(feature = "cuda")]
+use crypto::merkle_tree::proof::Proof;
 use super::domain::{Domain, DomainConstants};
 use super::fri::fri_decommit::FriDecommitment;
 use super::grinding;
@@ -1619,6 +1621,36 @@ pub trait IsStarkProver<
         }
     }
 
+    /// Like [`Self::open_polys_with`], but uses Merkle proofs already gathered
+    /// from the resident device tree (see [`crate::gpu_lde::gather_proofs_dev`])
+    /// instead of walking a host tree. The evaluations are still gathered from
+    /// the host-resident LDE columns via `gather`. `proof` is for leaf position
+    /// `challenge * 2`, `proof_sym` for `challenge * 2 + 1` — the same positions
+    /// `open_polys_with` opens.
+    #[cfg(feature = "cuda")]
+    fn open_polys_with_proofs<C, G>(
+        domain: &Domain<Field>,
+        proof: Proof<Commitment>,
+        proof_sym: Proof<Commitment>,
+        challenge: usize,
+        gather: G,
+    ) -> PolynomialOpenings<C>
+    where
+        C: IsField,
+        FieldElement<C>: AsBytes + Sync + Send,
+        G: Fn(usize) -> Vec<FieldElement<C>>,
+    {
+        let domain_size = domain.lde_roots_of_unity_coset.len() as u64;
+        let index = challenge * 2;
+        let index_sym = challenge * 2 + 1;
+        PolynomialOpenings {
+            proof,
+            proof_sym,
+            evaluations: gather(reverse_index(index, domain_size)),
+            evaluations_sym: gather(reverse_index(index_sym, domain_size)),
+        }
+    }
+
     /// Open the deep composition polynomial on a list of indexes and their symmetric elements.
     fn open_deep_composition_poly(
         domain: &Domain<Field>,
@@ -1638,7 +1670,28 @@ pub trait IsStarkProver<
         let num_precomputed_cols = main_commit.num_precomputed_cols;
         let total_cols = lde_trace.num_main_cols();
 
-        for index in indexes_to_open.iter() {
+        // R4 main-trace proofs from the resident device tree, when present:
+        // gathered in one batch over all query positions (`c*2`, `c*2+1` per
+        // query) instead of walking the host tree. Byte-identical to the host
+        // proofs (guarded by the `merkle_gather` parity test). Only the
+        // non-preprocessed main carries a device tree today; on any miss this is
+        // `None` and openings fall back to the host tree below.
+        #[cfg(feature = "cuda")]
+        let main_dev_proofs: Option<Vec<Proof<Commitment>>> = (!is_preprocessed)
+            .then(|| {
+                let tree = lde_trace.gpu_main()?.tree.as_ref()?;
+                let stream = lde_trace.bound_stream()?;
+                let positions: Vec<usize> = indexes_to_open
+                    .iter()
+                    .flat_map(|&c| [c * 2, c * 2 + 1])
+                    .collect();
+                crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream)
+            })
+            .flatten();
+
+        for (qi, index) in indexes_to_open.iter().enumerate() {
+            #[cfg(not(feature = "cuda"))]
+            let _ = qi;
             // For preprocessed tables, open the main split (multiplicities only);
             // for normal tables, open all main columns.
             let main_trace_opening = if is_preprocessed {
@@ -1646,9 +1699,28 @@ pub trait IsStarkProver<
                     lde_trace.gather_main_row_range(row, num_precomputed_cols, total_cols)
                 })
             } else {
-                Self::open_polys_with(domain, &main_commit.tree, *index, |row| {
-                    lde_trace.gather_main_row(row)
-                })
+                #[cfg(feature = "cuda")]
+                {
+                    if let Some(proofs) = &main_dev_proofs {
+                        Self::open_polys_with_proofs(
+                            domain,
+                            proofs[qi * 2].clone(),
+                            proofs[qi * 2 + 1].clone(),
+                            *index,
+                            |row| lde_trace.gather_main_row(row),
+                        )
+                    } else {
+                        Self::open_polys_with(domain, &main_commit.tree, *index, |row| {
+                            lde_trace.gather_main_row(row)
+                        })
+                    }
+                }
+                #[cfg(not(feature = "cuda"))]
+                {
+                    Self::open_polys_with(domain, &main_commit.tree, *index, |row| {
+                        lde_trace.gather_main_row(row)
+                    })
+                }
             };
 
             // For preprocessed tables, also open the precomputed-columns tree.

From b8113370fffbf3393e09ced900139f778655231b Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 17:52:23 -0300
Subject: [PATCH 08/17] merkle tree

---
 crypto/crypto/src/merkle_tree/merkle.rs       |  14 ++
 crypto/math-cuda/src/lde.rs                   | 104 ++++++++---
 crypto/math-cuda/src/merkle.rs                |  37 +++-
 crypto/math-cuda/tests/barycentric_strided.rs |   1 +
 crypto/math-cuda/tests/deep.rs                |   1 +
 crypto/stark/src/gpu_lde.rs                   |  68 +++-----
 crypto/stark/src/prover.rs                    | 164 +++++++++++++++---
 7 files changed, 296 insertions(+), 93 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs
index f00985d39..2060a0f18 100644
--- a/crypto/crypto/src/merkle_tree/merkle.rs
+++ b/crypto/crypto/src/merkle_tree/merkle.rs
@@ -168,6 +168,20 @@ where
         })
     }
 
+    /// Create a root-only Merkle tree placeholder: stores the commitment root
+    /// but no nodes. Used when the tree's authentication paths are gathered from
+    /// a device-resident copy (GPU) instead of this host tree, so the host nodes
+    /// are never materialised (saving the full-tree Device→Host copy).
+    /// [`get_proof_by_pos`](Self::get_proof_by_pos) must NOT be called on it.
+    pub fn from_root(root: B::Node) -> Self {
+        MerkleTree {
+            root,
+            nodes: Vec::new(),
+            #[cfg(feature = "disk-spill")]
+            mmap_backing: None,
+        }
+    }
+
     /// Create a Merkle tree from pre-hashed leaf nodes.
     ///
     /// This skips the `hash_leaves` step, useful when leaves have already been
diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index f65d17a35..8fc309ae5 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -240,6 +240,10 @@ pub struct GpuLdeExt3 {
     pub buf: Arc<CudaSlice<u64>>,
     pub m: usize,
     pub lde_size: usize,
+    /// Optionally the aux/composition Merkle tree kept resident on device (the
+    /// keep path), so R4 openings gather paths on device instead of D2H'ing the
+    /// whole tree. `None` on the CPU path or when not retained.
+    pub tree: Option<GpuMerkleTree>,
 }
 
 /// Handle to a Merkle tree kept live on device after a commit, so query
@@ -252,6 +256,9 @@ pub struct GpuLdeExt3 {
 pub struct GpuMerkleTree {
     pub nodes: Arc<CudaSlice<u8>>,
     pub leaves_len: usize,
+    /// The Merkle root (node 0), copied to host at build time (32 bytes) so the
+    /// commitment is available without D2H'ing the whole tree.
+    pub root: [u8; 32],
 }
 
 pub fn coset_lde_base(evals: &[u64], blowup_factor: usize, weights: &[u64]) -> Result<Vec<u64>> {
@@ -628,7 +635,7 @@ pub fn coset_lde_batch_base_into_with_leaf_hash(
         blowup_factor,
         weights,
         outputs,
-        hashed_leaves_out,
+        Some(hashed_leaves_out),
         KeccakCommit::LeavesOnly,
         false,
         false,
@@ -656,7 +663,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree(
         blowup_factor,
         weights,
         outputs,
-        merkle_nodes_out,
+        Some(merkle_nodes_out),
         KeccakCommit::FullTree,
         false,
         false,
@@ -664,26 +671,25 @@ pub fn coset_lde_batch_base_into_with_merkle_tree(
     .map(|_| ())
 }
 
-/// Fused LDE + leaf-hash + Merkle tree build. If `keep_device_buf` is true,
-/// returns an `Arc<CudaSlice<u64>>` wrapping the LDE device buffer so callers
-/// (R2–R4 GPU paths) can reuse the LDE without a re-H2D.
+/// Fused LDE + leaf-hash + Merkle tree build, keeping **both** the LDE buffer
+/// and the Merkle tree resident on device (returned in `GpuLdeBase` + its
+/// `tree`, including the root). The host tree is **not** materialised — there is
+/// no `merkle_nodes_out` — so the whole-tree Device→Host copy is eliminated.
+/// Callers gather query authentication paths from the device tree instead
+/// (`crate::merkle::gather_merkle_paths_dev`) and use a root-only host tree for
+/// the commitment.
 pub fn coset_lde_batch_base_into_with_merkle_tree_keep(
     columns: &[&[u64]],
     blowup_factor: usize,
     weights: &[u64],
     outputs: &mut [&mut [u64]],
-    merkle_nodes_out: &mut [u8],
 ) -> Result<GpuLdeBase> {
-    // `keep_tree = true`: the main-trace tree nodes are retained on device
-    // (inside the returned `GpuLdeBase.tree`) so R4 openings can gather paths on
-    // device. The host `merkle_nodes_out` is still filled, so host-tree openings
-    // keep working until a later step drops the host tree.
     let opt = coset_lde_batch_base_into_with_merkle_tree_inner(
         columns,
         blowup_factor,
         weights,
         outputs,
-        merkle_nodes_out,
+        None,
         KeccakCommit::FullTree,
         true,
         true,
@@ -698,7 +704,7 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     blowup_factor: usize,
     weights: &[u64],
     outputs: &mut [&mut [u64]],
-    nodes_out: &mut [u8],
+    nodes_out: Option<&mut [u8]>,
     commit: KeccakCommit,
     keep_device_buf: bool,
     keep_tree: bool,
@@ -726,7 +732,9 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
         assert_eq!(o.len(), lde_size);
     }
     let nodes_dev_bytes = commit.total_nodes_bytes(lde_size);
-    assert_eq!(nodes_out.len(), nodes_dev_bytes);
+    if let Some(no) = &nodes_out {
+        assert_eq!(no.len(), nodes_dev_bytes);
+    }
     let log_n = n.trailing_zeros() as u64;
     let log_lde = lde_size.trailing_zeros() as u64;
 
@@ -832,9 +840,18 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
         crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
     }
 
-    // D2H the LDE and the tree/leaves nodes via pinned staging.
+    // D2H the LDE columns via pinned staging (still read on host by constraint
+    // eval / openings). The full tree nodes are D2H'd only when the caller asks
+    // for a host tree (`nodes_out`); the device-resident path passes `None` and
+    // keeps the tree on device, eliminating the whole-tree D2H.
     stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?;
-    d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, nodes_out)?;
+    if let Some(no) = nodes_out {
+        d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, no)?;
+    }
+    // Ensure the (async) LDE D2H above has landed before reading `pinned`.
+    // Previously the tree D2H's `synchronize` covered this; the device-resident
+    // path skips that D2H, so synchronize explicitly.
+    stream.synchronize()?;
 
     // Copy pinned into caller outputs. Runs under the pinned-staging lock,
     // where rayon can deadlock. See `Backend::pinned_staging`.
@@ -847,9 +864,15 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
         // Retain the device tree nodes for on-device opening (FullTree only; in
         // LeavesOnly mode there are no inner nodes to gather paths from).
         let tree = if keep_tree && commit == KeccakCommit::FullTree {
+            // Copy just the 32-byte root (node 0) so the commitment is available
+            // without D2H'ing the whole tree.
+            let mut root = [0u8; 32];
+            stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?;
+            stream.synchronize()?;
             Some(GpuMerkleTree {
                 nodes: Arc::new(nodes_dev),
                 leaves_len: lde_size,
+                root,
             })
         } else {
             drop(nodes_dev);
@@ -885,9 +908,10 @@ pub fn coset_lde_batch_ext3_into_with_leaf_hash(
         blowup_factor,
         weights,
         outputs,
-        hashed_leaves_out,
+        Some(hashed_leaves_out),
         KeccakCommit::LeavesOnly,
         false,
+        false,
     )
     .map(|_| ())
 }
@@ -909,23 +933,24 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree(
         blowup_factor,
         weights,
         outputs,
-        merkle_nodes_out,
+        Some(merkle_nodes_out),
         KeccakCommit::FullTree,
         false,
+        false,
     )
     .map(|_| ())
 }
 
-/// Ext3 variant of [`coset_lde_batch_base_into_with_merkle_tree_keep`] —
-/// returns an `Arc<CudaSlice<u64>>` handle to the de-interleaved LDE device
-/// buffer.
+/// Ext3 variant of [`coset_lde_batch_base_into_with_merkle_tree_keep`]: keeps
+/// both the de-interleaved LDE buffer and the Merkle tree (with root) resident
+/// on device. The host tree is **not** materialised (no `merkle_nodes_out`), so
+/// the whole-tree D2H is eliminated; openings gather paths from the device tree.
 pub fn coset_lde_batch_ext3_into_with_merkle_tree_keep(
     columns: &[&[u64]],
     n: usize,
     blowup_factor: usize,
     weights: &[u64],
     outputs: &mut [&mut [u64]],
-    merkle_nodes_out: &mut [u8],
 ) -> Result<GpuLdeExt3> {
     let opt = coset_lde_batch_ext3_into_with_merkle_tree_inner(
         columns,
@@ -933,9 +958,10 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree_keep(
         blowup_factor,
         weights,
         outputs,
-        merkle_nodes_out,
+        None,
         KeccakCommit::FullTree,
         true,
+        true,
     )?;
     Ok(opt.expect("keep_device_buf=true must return Some"))
 }
@@ -947,9 +973,10 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
     blowup_factor: usize,
     weights: &[u64],
     outputs: &mut [&mut [u64]],
-    nodes_out: &mut [u8],
+    nodes_out: Option<&mut [u8]>,
     commit: KeccakCommit,
     keep_device_buf: bool,
+    keep_tree: bool,
 ) -> Result<Option<GpuLdeExt3>> {
     if columns.is_empty() {
         assert_eq!(outputs.len(), 0);
@@ -976,7 +1003,9 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
         assert_eq!(o.len(), 3 * lde_size);
     }
     let nodes_dev_bytes = commit.total_nodes_bytes(lde_size);
-    assert_eq!(nodes_out.len(), nodes_dev_bytes);
+    if let Some(no) = &nodes_out {
+        assert_eq!(no.len(), nodes_dev_bytes);
+    }
     let log_n = n.trailing_zeros() as u64;
     let log_lde = lde_size.trailing_zeros() as u64;
 
@@ -1074,21 +1103,43 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
         crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
     }
 
-    // D2H LDE (mb * lde_size u64) and tree/leaves nodes.
+    // D2H the LDE columns. The full tree nodes are D2H'd only when the caller
+    // asks for a host tree (`nodes_out`); the device-resident path passes `None`
+    // and keeps the tree on device, eliminating the whole-tree D2H.
     stream.memcpy_dtoh(&buf, &mut pinned[..mb * lde_size])?;
-    d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, nodes_out)?;
+    if let Some(no) = nodes_out {
+        d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, no)?;
+    }
+    // Ensure the (async) LDE D2H has landed before reading `pinned` (the tree
+    // D2H's synchronize used to cover this; the device-resident path skips it).
+    stream.synchronize()?;
 
     unpack_pinned_slabs_to_ext3(pinned, outputs, lde_size);
     drop(staging);
 
     if keep_device_buf {
+        let tree = if keep_tree && commit == KeccakCommit::FullTree {
+            let mut root = [0u8; 32];
+            stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?;
+            stream.synchronize()?;
+            Some(GpuMerkleTree {
+                nodes: Arc::new(nodes_dev),
+                leaves_len: lde_size,
+                root,
+            })
+        } else {
+            drop(nodes_dev);
+            None
+        };
         Ok(Some(GpuLdeExt3 {
             buf: Arc::new(buf),
             m,
             lde_size,
+            tree,
         }))
     } else {
         drop(buf);
+        drop(nodes_dev);
         Ok(None)
     }
 }
@@ -1276,6 +1327,7 @@ fn evaluate_poly_coset_batch_ext3_into_inner(
             buf: std::sync::Arc::new(buf),
             m,
             lde_size,
+            tree: None,
         }))
     } else {
         drop(buf);
diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index 1c1e54093..cb3758ff6 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -279,7 +279,13 @@ pub fn gather_merkle_paths_dev(
 ///
 /// Returns `(2*(lde_size/2) - 1) * 32` bytes of tree nodes in the standard
 /// layout (root at byte offset 0, leaves in the tail).
-pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Result<Vec<u8>> {
+/// Build the composition-poly Merkle tree on device (leaves hash row-pairs, so
+/// `num_leaves = lde_size / 2`). Returns the device node buffer, the leaf count,
+/// and the stream it was built on. Shared by the host-D2H and device-keep
+/// wrappers below.
+fn build_comp_poly_tree_nodes_dev(
+    parts_interleaved: &[&[u64]],
+) -> Result<(CudaSlice<u8>, usize, Arc<CudaStream>)> {
     assert!(!parts_interleaved.is_empty());
     let m = parts_interleaved.len();
     let ext3_elems = parts_interleaved[0].len() / 3;
@@ -308,9 +314,13 @@ pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Res
 
     pack_ext3_to_pinned_slabs(parts_interleaved, pinned, lde_size);
 
-    // H2D the de-interleaved parts.
+    // H2D the de-interleaved parts, then release the staging lock (the kernels
+    // below read the device `buf`, not `pinned`). Synchronize first so the
+    // async H2D has consumed `pinned` before it is freed/reused.
     let mut buf = stream.alloc_zeros::<u64>(mb * lde_size)?;
     stream.memcpy_htod(&pinned[..mb * lde_size], &mut buf)?;
+    stream.synchronize()?;
+    drop(staging);
 
     // Leaves into tail of a tight node buffer.
     let mut nodes_dev = unsafe { stream.alloc::<u8>(tight_total_nodes * 32) }?;
@@ -337,13 +347,34 @@ pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Res
     }
 
     build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?;
+    Ok((nodes_dev, num_leaves, stream))
+}
 
+pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Result<Vec<u8>> {
+    let (nodes_dev, _num_leaves, stream) = build_comp_poly_tree_nodes_dev(parts_interleaved)?;
     let out = stream.clone_dtoh(&nodes_dev)?;
     stream.synchronize()?;
-    drop(staging);
     Ok(out)
 }
 
+/// Like [`build_comp_poly_tree_from_evals_ext3`] but keeps the tree nodes on
+/// device (returned as a [`crate::lde::GpuMerkleTree`] with its root), so R4
+/// composition openings gather authentication paths on device instead of
+/// D2H'ing the whole tree. `leaves_len = lde_size / 2` (row-pair leaves).
+pub fn build_comp_poly_tree_from_evals_ext3_keep(
+    parts_interleaved: &[&[u64]],
+) -> Result<crate::lde::GpuMerkleTree> {
+    let (nodes_dev, num_leaves, stream) = build_comp_poly_tree_nodes_dev(parts_interleaved)?;
+    let mut root = [0u8; 32];
+    stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?;
+    stream.synchronize()?;
+    Ok(crate::lde::GpuMerkleTree {
+        nodes: Arc::new(nodes_dev),
+        leaves_len: num_leaves,
+        root,
+    })
+}
+
 /// Build a FRI-layer Merkle tree on device from an interleaved ext3 eval
 /// vector. Each leaf hashes two consecutive ext3 values. `num_leaves =
 /// evals.len() / 6` (since each ext3 is 3 u64s).
diff --git a/crypto/math-cuda/tests/barycentric_strided.rs b/crypto/math-cuda/tests/barycentric_strided.rs
index b1963522f..377a2b531 100644
--- a/crypto/math-cuda/tests/barycentric_strided.rs
+++ b/crypto/math-cuda/tests/barycentric_strided.rs
@@ -106,6 +106,7 @@ fn run_ext3(log_trace: u32, blowup: usize, num_cols: usize, seed: u64) {
         buf: Arc::new(lde_dev),
         m: num_cols,
         lde_size,
+        tree: None,
     };
 
     // Pre-strided buffer for non-strided reference.
diff --git a/crypto/math-cuda/tests/deep.rs b/crypto/math-cuda/tests/deep.rs
index 57ea14d3c..6ab63be10 100644
--- a/crypto/math-cuda/tests/deep.rs
+++ b/crypto/math-cuda/tests/deep.rs
@@ -184,6 +184,7 @@ fn run_parity(
             buf: Arc::new(aux_dev),
             m: num_aux,
             lde_size,
+            tree: None,
         })
     } else {
         drop(aux_dev);
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index f6c85553e..635ff815f 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -452,11 +452,13 @@ pub fn gpu_leaf_hash_calls() -> u64 {
     GPU_LEAF_HASH_CALLS.load(Ordering::Relaxed)
 }
 
-/// Fused base-field path: LDE + Keccak-256 leaf hash + Merkle tree build,
-/// all on device, with the LDE buffer retained for R2–R4 GPU reuse. On
-/// success: `columns[c]` is resized to `lde_size` with the LDE output, and
-/// the returned `(tree, GpuLdeBase)` pair is the host-side tree plus a
-/// device-resident handle to the LDE buffer.
+/// Fused base-field path: LDE + Keccak-256 leaf hash + Merkle tree build, all
+/// on device, keeping **both** the LDE buffer and the Merkle tree resident on
+/// device. On success: `columns[c]` is resized to `lde_size` with the LDE
+/// output, and the returned `GpuLdeBase` carries the device LDE buffer plus the
+/// device tree (`.tree`). The returned `MerkleTree` is **root-only** — the host
+/// tree nodes are never materialised (no whole-tree D2H); query openings gather
+/// authentication paths from the device tree via [`gather_proofs_dev`].
 pub(crate) fn try_expand_leaf_and_tree_batched_keep<F, E, B>(
     columns: &mut [Vec<FieldElement<E>>],
     blowup_factor: usize,
@@ -471,11 +473,8 @@ where
         LayoutDispatch::Empty | LayoutDispatch::Skip => return None,
         LayoutDispatch::Run { n, lde_size } => (n, lde_size),
     };
+    let _ = lde_size;
     let num_columns = columns.len();
-    let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?;
-    let node_byte_len = total_nodes
-        .checked_mul(32)
-        .expect("node byte length overflow");
 
     // SAFETY: layout-checked above.
     let raw_columns = unsafe { columns_to_u64_base::<E>(columns) };
@@ -488,14 +487,11 @@ where
 
     let handle_result = {
         let mut raw_outputs = unsafe { presize_and_view_base::<E>(columns, lde_size) };
-        let nodes_bytes: &mut [u8] =
-            unsafe { from_raw_parts_mut(nodes.as_mut_ptr() as *mut u8, node_byte_len) };
         math_cuda::lde::coset_lde_batch_base_into_with_merkle_tree_keep(
             &slices,
             blowup_factor,
             &weights_u64,
             &mut raw_outputs,
-            nodes_bytes,
         )
     };
     let handle = match handle_result {
@@ -506,7 +502,10 @@ where
         }
     };
 
-    let tree = MerkleTree::<B>::from_precomputed_nodes(nodes)?;
+    // Root-only host tree: the device tree (`handle.tree`) holds the nodes and
+    // serves openings; only the commitment root lives on host.
+    let root = handle.tree.as_ref()?.root;
+    let tree = MerkleTree::<B>::from_root(root);
     Some((tree, handle))
 }
 
@@ -528,11 +527,8 @@ where
         LayoutDispatch::Empty | LayoutDispatch::Skip => return None,
         LayoutDispatch::Run { n, lde_size } => (n, lde_size),
     };
+    let _ = lde_size;
     let num_columns = columns.len();
-    let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?;
-    let node_byte_len = total_nodes
-        .checked_mul(32)
-        .expect("node byte length overflow");
 
     // SAFETY: layout-checked above.
     let raw_columns = unsafe { columns_to_u64_ext3::<E>(columns) };
@@ -545,15 +541,12 @@ where
 
     let handle_result = {
         let mut raw_outputs = unsafe { presize_and_view_ext3::<E>(columns, lde_size) };
-        let nodes_bytes: &mut [u8] =
-            unsafe { from_raw_parts_mut(nodes.as_mut_ptr() as *mut u8, node_byte_len) };
         math_cuda::lde::coset_lde_batch_ext3_into_with_merkle_tree_keep(
             &slices,
             n,
             blowup_factor,
             &weights_u64,
             &mut raw_outputs,
-            nodes_bytes,
         )
     };
     let handle = match handle_result {
@@ -564,7 +557,10 @@ where
         }
     };
 
-    let tree = MerkleTree::<B>::from_precomputed_nodes(nodes)?;
+    // Root-only host tree: the device tree (`handle.tree`) holds the nodes and
+    // serves openings; only the commitment root lives on host.
+    let root = handle.tree.as_ref()?.root;
+    let tree = MerkleTree::<B>::from_root(root);
     Some((tree, handle))
 }
 
@@ -742,7 +738,7 @@ where
 /// recomputes on CPU.
 pub(crate) fn try_build_comp_poly_tree_gpu<E, B>(
     lde_parts: &[Vec<FieldElement<E>>],
-) -> Option<MerkleTree<B>>
+) -> Option<(MerkleTree<B>, math_cuda::lde::GpuMerkleTree)>
 where
     E: IsField + 'static,
     B: IsMerkleTreeBackend<Node = [u8; 32]>,
@@ -775,29 +771,17 @@ where
         })
         .collect();
 
-    let nodes_bytes = match math_cuda::merkle::build_comp_poly_tree_from_evals_ext3(&raw_parts) {
-        Ok(v) => v,
+    // Keep the composition tree resident on device; the whole-tree D2H is
+    // eliminated. R4 composition openings gather paths from the device tree
+    // (`gather_proofs_dev`); the returned host tree is root-only.
+    let dev_tree = match math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&raw_parts) {
+        Ok(t) => t,
         Err(_) => return None,
     };
-
-    // lde_size is an even power of two >= 2, so 2*num_leaves == lde_size and
-    // tight_total_nodes = lde_size - 1 >= 1. No overflow or underflow possible.
-    let tight_total_nodes = lde_size - 1;
-    let expected_byte_len = tight_total_nodes
-        .checked_mul(32)
-        .expect("comp-poly node byte length overflow");
-    debug_assert_eq!(nodes_bytes.len(), expected_byte_len);
-
-    let nodes: Vec<[u8; 32]> = nodes_bytes
-        .chunks_exact(32)
-        .map(|c| {
-            c.try_into()
-                .expect("chunks_exact(32) yields exactly 32 bytes")
-        })
-        .collect();
+    debug_assert_eq!(dev_tree.leaves_len, lde_size / 2);
     GPU_COMP_POLY_TREE_CALLS.fetch_add(1, Ordering::Relaxed);
-    // Falls back to CPU on `None`, matching the R1 paths (lines 496, 557).
-    MerkleTree::<B>::from_precomputed_nodes(nodes)
+    let host = MerkleTree::<B>::from_root(dev_tree.root);
+    Some((host, dev_tree))
 }
 
 /// R3 GPU dispatch: batched strided barycentric OOD evaluation over the main
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 25d63cd2d..c2702fd00 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -390,6 +390,12 @@ where
     pub(crate) composition_poly_merkle_tree: BatchedMerkleTree<F>,
     /// The commitment to the composition polynomial parts.
     pub(crate) composition_poly_root: Commitment,
+    /// The composition-poly Merkle tree kept resident on device (when the R2
+    /// GPU tree path ran), so R4 openings gather authentication paths on device
+    /// instead of walking a host tree. When set, `composition_poly_merkle_tree`
+    /// is a root-only placeholder. `None` on the CPU path.
+    #[cfg(feature = "cuda")]
+    pub(crate) gpu_composition_tree: Option<math_cuda::lde::GpuMerkleTree>,
 }
 
 /// A container for the results of the third round of the STARK Prove protocol.
@@ -1133,22 +1139,31 @@ pub trait IsStarkProver<
         let t_sub = Instant::now();
         // GPU fast path for the comp-poly Merkle commit: row-pair Keccak
         // leaves + device-side inner tree, both wrapping the host eval Vecs.
+        // GPU path keeps the composition tree resident on device (no whole-tree
+        // D2H) and returns a root-only host tree; the device tree is threaded to
+        // R4 in `Round2.gpu_composition_tree`.
         #[cfg(feature = "cuda")]
-        let gpu_tree = crate::gpu_lde::try_build_comp_poly_tree_gpu::<
-            FieldExtension,
-            BatchedMerkleTreeBackend<FieldExtension>,
-        >(&lde_composition_poly_parts_evaluations);
+        let (composition_poly_merkle_tree, composition_poly_root, gpu_composition_tree) =
+            match crate::gpu_lde::try_build_comp_poly_tree_gpu::<
+                FieldExtension,
+                BatchedMerkleTreeBackend<FieldExtension>,
+            >(&lde_composition_poly_parts_evaluations)
+            {
+                Some((host_tree, dev_tree)) => {
+                    let root = host_tree.root;
+                    (host_tree, root, Some(dev_tree))
+                }
+                None => {
+                    let (tree, root) =
+                        Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations)
+                            .ok_or(ProvingError::EmptyCommitment)?;
+                    (tree, root, None)
+                }
+            };
         #[cfg(not(feature = "cuda"))]
-        let gpu_tree: Option<BatchedMerkleTree<FieldExtension>> = None;
-
-        let (composition_poly_merkle_tree, composition_poly_root) = match gpu_tree {
-            Some(tree) => {
-                let root = tree.root;
-                (tree, root)
-            }
-            None => Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations)
-                .ok_or(ProvingError::EmptyCommitment)?,
-        };
+        let (composition_poly_merkle_tree, composition_poly_root) =
+            Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations)
+                .ok_or(ProvingError::EmptyCommitment)?;
         #[cfg(feature = "instruments")]
         let merkle_dur = t_sub.elapsed();
 
@@ -1167,6 +1182,8 @@ pub trait IsStarkProver<
             lde_composition_poly_evaluations: lde_composition_poly_parts_evaluations,
             composition_poly_merkle_tree,
             composition_poly_root,
+            #[cfg(feature = "cuda")]
+            gpu_composition_tree,
         })
     }
 
@@ -1595,6 +1612,47 @@ pub trait IsStarkProver<
         }
     }
 
+    /// Like [`Self::open_composition_poly`] but uses a Merkle proof already
+    /// gathered from the resident device composition tree
+    /// ([`crate::gpu_lde::gather_proofs_dev`]) instead of walking a host tree.
+    /// Same single-position opening (`proof_sym == proof`); evaluations come
+    /// from the host eval Vecs as before.
+    #[cfg(feature = "cuda")]
+    fn open_composition_poly_with_proof(
+        proof: Proof<Commitment>,
+        lde_composition_poly_evaluations: &[Vec<FieldElement<FieldExtension>>],
+        index: usize,
+    ) -> PolynomialOpenings<FieldExtension>
+    where
+        FieldElement<Field>: AsBytes + Sync + Send,
+        FieldElement<FieldExtension>: AsBytes + Sync + Send,
+    {
+        let lde_composition_poly_parts_evaluation: Vec<_> = lde_composition_poly_evaluations
+            .iter()
+            .flat_map(|part| {
+                vec![
+                    part[reverse_index(index * 2, part.len() as u64)].clone(),
+                    part[reverse_index(index * 2 + 1, part.len() as u64)].clone(),
+                ]
+            })
+            .collect();
+
+        PolynomialOpenings {
+            proof: proof.clone(),
+            proof_sym: proof,
+            evaluations: lde_composition_poly_parts_evaluation
+                .clone()
+                .into_iter()
+                .step_by(2)
+                .collect(),
+            evaluations_sym: lde_composition_poly_parts_evaluation
+                .into_iter()
+                .skip(1)
+                .step_by(2)
+                .collect(),
+        }
+    }
+
     /// Computes values and validity proofs of the evaluations of trace polynomials at
     /// the FRI query challenge `challenge` and its symmetric counterpart. The caller
     /// supplies a `gather` closure that pulls the row data from the column-major LDE
@@ -1689,6 +1747,28 @@ pub trait IsStarkProver<
             })
             .flatten();
 
+        // Same for the aux-trace tree, when it is device-resident.
+        #[cfg(feature = "cuda")]
+        let aux_dev_proofs: Option<Vec<Proof<Commitment>>> =
+            round_1_result.aux.as_ref().and_then(|_aux| {
+                let tree = lde_trace.gpu_aux()?.tree.as_ref()?;
+                let stream = lde_trace.bound_stream()?;
+                let positions: Vec<usize> = indexes_to_open
+                    .iter()
+                    .flat_map(|&c| [c * 2, c * 2 + 1])
+                    .collect();
+                crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream)
+            });
+
+        // Composition tree: openings open a single position `index` (row-pair
+        // leaf), so gather one proof per query challenge from the device tree.
+        #[cfg(feature = "cuda")]
+        let comp_dev_proofs: Option<Vec<Proof<Commitment>>> =
+            round_2_result.gpu_composition_tree.as_ref().and_then(|tree| {
+                let stream = lde_trace.bound_stream()?;
+                crate::gpu_lde::gather_proofs_dev(tree, indexes_to_open, &stream)
+            });
+
         for (qi, index) in indexes_to_open.iter().enumerate() {
             #[cfg(not(feature = "cuda"))]
             let _ = qi;
@@ -1730,16 +1810,56 @@ pub trait IsStarkProver<
                 })
             });
 
-            let composition_openings = Self::open_composition_poly(
-                &round_2_result.composition_poly_merkle_tree,
-                &round_2_result.lde_composition_poly_evaluations,
-                *index,
-            );
+            let composition_openings = {
+                #[cfg(feature = "cuda")]
+                {
+                    if let Some(proofs) = &comp_dev_proofs {
+                        Self::open_composition_poly_with_proof(
+                            proofs[qi].clone(),
+                            &round_2_result.lde_composition_poly_evaluations,
+                            *index,
+                        )
+                    } else {
+                        Self::open_composition_poly(
+                            &round_2_result.composition_poly_merkle_tree,
+                            &round_2_result.lde_composition_poly_evaluations,
+                            *index,
+                        )
+                    }
+                }
+                #[cfg(not(feature = "cuda"))]
+                {
+                    Self::open_composition_poly(
+                        &round_2_result.composition_poly_merkle_tree,
+                        &round_2_result.lde_composition_poly_evaluations,
+                        *index,
+                    )
+                }
+            };
 
             let aux_trace_polys = round_1_result.aux.as_ref().map(|aux| {
-                Self::open_polys_with(domain, &aux.tree, *index, |row| {
-                    lde_trace.gather_aux_row(row)
-                })
+                #[cfg(feature = "cuda")]
+                {
+                    if let Some(proofs) = &aux_dev_proofs {
+                        Self::open_polys_with_proofs(
+                            domain,
+                            proofs[qi * 2].clone(),
+                            proofs[qi * 2 + 1].clone(),
+                            *index,
+                            |row| lde_trace.gather_aux_row(row),
+                        )
+                    } else {
+                        Self::open_polys_with(domain, &aux.tree, *index, |row| {
+                            lde_trace.gather_aux_row(row)
+                        })
+                    }
+                }
+                #[cfg(not(feature = "cuda"))]
+                {
+                    Self::open_polys_with(domain, &aux.tree, *index, |row| {
+                        lde_trace.gather_aux_row(row)
+                    })
+                }
             });
 
             openings.push(DeepPolynomialOpening {

From 819ad015e1b8fba4c12a981c9e4df541988938f4 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 18:16:12 -0300
Subject: [PATCH 09/17] finish merkle

---
 crypto/math-cuda/src/fri.rs            | 20 ++++---
 crypto/stark/src/fri/fri_commitment.rs |  7 +++
 crypto/stark/src/fri/mod.rs            |  8 +++
 crypto/stark/src/gpu_lde.rs            | 78 +++++++++++++++++++++-----
 4 files changed, 90 insertions(+), 23 deletions(-)

diff --git a/crypto/math-cuda/src/fri.rs b/crypto/math-cuda/src/fri.rs
index edd359b1b..e4c60a8a8 100644
--- a/crypto/math-cuda/src/fri.rs
+++ b/crypto/math-cuda/src/fri.rs
@@ -98,7 +98,7 @@ impl FriCommitState {
     pub fn fold_and_commit_layer(
         &mut self,
         zeta_raw: [u64; 3],
-    ) -> Result<(Vec<u8>, Vec<u64>, Vec<u8>)> {
+    ) -> Result<(Vec<u64>, crate::lde::GpuMerkleTree)> {
         #[cfg(feature = "test-faults")]
         check_fault_injection()?;
         let be = backend()?;
@@ -214,17 +214,21 @@ impl FriCommitState {
             self.stream.clone_dtoh(&view)?
         };
 
-        // Tree nodes.
-        let nodes_bytes: Vec<u8> = self.stream.clone_dtoh(&nodes_dev)?;
-        debug_assert_eq!(nodes_bytes.len(), tight_total_nodes * 32);
-
-        let mut root = vec![0u8; 32];
-        root.copy_from_slice(&nodes_bytes[0..32]);
+        // Keep the layer tree resident on device; copy only the 32-byte root so
+        // R4 query openings gather paths on device instead of D2H'ing the tree.
+        let mut root = [0u8; 32];
+        self.stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?;
+        self.stream.synchronize()?;
 
         self.a_is_input = !self.a_is_input;
         self.current_n = n_out;
 
-        Ok((root, layer_evals, nodes_bytes))
+        let tree = crate::lde::GpuMerkleTree {
+            nodes: std::sync::Arc::new(nodes_dev),
+            leaves_len: num_leaves,
+            root,
+        };
+        Ok((layer_evals, tree))
     }
 
     /// Final fold, no Merkle commit. Returns the single ext3 output
diff --git a/crypto/stark/src/fri/fri_commitment.rs b/crypto/stark/src/fri/fri_commitment.rs
index 831471761..9f950c27e 100644
--- a/crypto/stark/src/fri/fri_commitment.rs
+++ b/crypto/stark/src/fri/fri_commitment.rs
@@ -13,6 +13,11 @@ where
 {
     pub evaluation: Vec<FieldElement<F>>,
     pub merkle_tree: MerkleTree<B>,
+    /// The layer's Merkle tree kept resident on device (GPU FRI commit path),
+    /// so R4 query openings gather authentication paths on device. When set,
+    /// `merkle_tree` is a root-only placeholder. `None` on the CPU path.
+    #[cfg(feature = "cuda")]
+    pub gpu_tree: Option<math_cuda::lde::GpuMerkleTree>,
 }
 
 impl<F, B> FriLayer<F, B>
@@ -25,6 +30,8 @@ where
         Self {
             evaluation: evaluation.to_vec(),
             merkle_tree,
+            #[cfg(feature = "cuda")]
+            gpu_tree: None,
         }
     }
 }
diff --git a/crypto/stark/src/fri/mod.rs b/crypto/stark/src/fri/mod.rs
index 60ad2a398..181c27380 100644
--- a/crypto/stark/src/fri/mod.rs
+++ b/crypto/stark/src/fri/mod.rs
@@ -117,6 +117,14 @@ pub fn query_phase<F: IsField>(
 where
     FieldElement<F>: AsBytes + Sync + Send,
 {
+    // GPU fast path: gather every layer's authentication paths on device (the
+    // layer trees stay resident from the GPU commit). Falls back to the host
+    // walk below if any layer lacks a device tree.
+    #[cfg(feature = "cuda")]
+    if let Some(decommits) = crate::gpu_lde::try_fri_query_phase_gpu::<F>(fri_layers, iotas) {
+        return decommits;
+    }
+
     if !fri_layers.is_empty() {
         let num_layers = fri_layers.len();
         iotas
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 635ff815f..0dd83ab2d 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -27,6 +27,7 @@ use math::traits::AsBytes;
 use crate::config::{Commitment, FriLayerMerkleTreeBackend};
 use crate::domain::Domain;
 use crate::fri::fri_commitment::FriLayer;
+use crate::fri::fri_decommit::FriDecommitment;
 use crate::fri::fri_functions::compute_coset_twiddles_inv;
 use crate::trace::LDETraceTable;
 
@@ -1620,7 +1621,7 @@ where
         let zeta_ptr = &zeta as *const FieldElement<E> as *const u64;
         let zeta_raw: [u64; 3] = unsafe { [*zeta_ptr, *zeta_ptr.add(1), *zeta_ptr.add(2)] };
 
-        let (root, layer_evals_u64, nodes_bytes) = match state.fold_and_commit_layer(zeta_raw) {
+        let (layer_evals_u64, dev_tree) = match state.fold_and_commit_layer(zeta_raw) {
             Ok(v) => v,
             Err(_) => {
                 *transcript = transcript_snapshot.clone();
@@ -1628,23 +1629,18 @@ where
             }
         };
 
-        // Build the FriLayer: ext3 evals + Merkle tree from precomputed nodes.
+        // Build the FriLayer: ext3 evals + a root-only host tree (the layer tree
+        // stays resident on device in `gpu_tree`; query openings gather paths
+        // from it via `gather_proofs_dev`).
         let evaluation = u64_to_ext3_vec::<E>(&layer_evals_u64);
-
-        debug_assert!(nodes_bytes.len().is_multiple_of(32));
-        let nodes: Vec<[u8; 32]> = nodes_bytes
-            .chunks_exact(32)
-            .map(|c| c.try_into().expect("chunks_exact(32) yields 32 bytes"))
-            .collect();
-        let merkle_tree = MerkleTree::<FriLayerMerkleTreeBackend<E>>::from_precomputed_nodes(nodes)
-            .expect("FRI commit: precomputed nodes form a valid tree");
-
-        fri_layer_list.push(FriLayer::new(&evaluation, merkle_tree));
+        let root = dev_tree.root;
+        let merkle_tree = MerkleTree::<FriLayerMerkleTreeBackend<E>>::from_root(root);
+        let mut layer = FriLayer::new(&evaluation, merkle_tree);
+        layer.gpu_tree = Some(dev_tree);
+        fri_layer_list.push(layer);
 
         // >>>> Send commitment: [p_k]
-        let mut root_arr = [0u8; 32];
-        root_arr.copy_from_slice(&root);
-        transcript.append_bytes(&root_arr);
+        transcript.append_bytes(&root);
     }
 
     // <<<< Receive challenge zeta_{n-1}
@@ -1671,3 +1667,55 @@ where
     GPU_FRI_CALLS.fetch_add(1, Ordering::Relaxed);
     Some((last_value, fri_layer_list))
 }
+
+/// GPU FRI query phase: gather each layer's authentication paths on device
+/// instead of walking host trees. For layer `l` and query `iota`, the opened
+/// position is `(iota >> l) >> 1` — matching [`crate::fri::query_phase`]. Paths
+/// for all queries are gathered in one batched call per layer. The layer
+/// evaluations (`evaluation[index ^ 1]`) are read from the host Vecs as before.
+///
+/// Returns `None` if there are no layers or any layer lacks a device tree (a
+/// CPU-committed layer), so the caller falls back to the host walk.
+pub(crate) fn try_fri_query_phase_gpu<E>(
+    fri_layers: &[FriLayer<E, FriLayerMerkleTreeBackend<E>>],
+    iotas: &[usize],
+) -> Option<Vec<FriDecommitment<E>>>
+where
+    E: IsField,
+    FieldElement<E>: AsBytes + Sync + Send,
+{
+    if fri_layers.is_empty() {
+        return None;
+    }
+    let stream = math_cuda::device::backend().ok()?.next_stream();
+    let num_layers = fri_layers.len();
+
+    // Batched gather: one call per layer over all queries.
+    let mut per_layer_proofs: Vec<Vec<Proof<Commitment>>> = Vec::with_capacity(num_layers);
+    for (l, layer) in fri_layers.iter().enumerate() {
+        let tree = layer.gpu_tree.as_ref()?;
+        let positions: Vec<usize> = iotas.iter().map(|&iota| (iota >> l) >> 1).collect();
+        per_layer_proofs.push(gather_proofs_dev(tree, &positions, &stream)?);
+    }
+
+    // Reassemble per-query decommitments, matching the host walk's order.
+    let decommits = iotas
+        .iter()
+        .enumerate()
+        .map(|(q, &iota)| {
+            let mut layers_evaluations_sym = Vec::with_capacity(num_layers);
+            let mut layers_auth_paths = Vec::with_capacity(num_layers);
+            let mut index = iota;
+            for (l, layer) in fri_layers.iter().enumerate() {
+                layers_evaluations_sym.push(layer.evaluation[index ^ 1].clone());
+                layers_auth_paths.push(per_layer_proofs[l][q].clone());
+                index >>= 1;
+            }
+            FriDecommitment {
+                layers_auth_paths,
+                layers_evaluations_sym,
+            }
+        })
+        .collect();
+    Some(decommits)
+}

From 0fb8df6d26d399fafa25caa4223504e6a7739542 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 18:30:55 -0300
Subject: [PATCH 10/17] fix

---
 crypto/math-cuda/src/device.rs | 10 ++++-
 crypto/math-cuda/src/merkle.rs |  7 +++
 crypto/stark/src/gpu_lde.rs    | 21 +++++++--
 crypto/stark/src/lib.rs        | 10 -----
 crypto/stark/src/prover.rs     | 82 +++++++++++++++++++++++-----------
 5 files changed, 90 insertions(+), 40 deletions(-)

diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs
index 6b6094d45..ef99f05af 100644
--- a/crypto/math-cuda/src/device.rs
+++ b/crypto/math-cuda/src/device.rs
@@ -196,7 +196,15 @@ fn retain_default_mempool(ctx: &CudaContext) {
         {
             return;
         }
-        let threshold: u64 = u64::MAX;
+        // Default: retain freed stream-ordered blocks indefinitely (u64::MAX)
+        // for reuse. `LAMBDA_VM_MEMPOOL_RELEASE_MB` overrides the cap (bytes the
+        // pool keeps before returning memory to the OS) when retained-pool
+        // growth needs bounding.
+        let threshold: u64 = std::env::var("LAMBDA_VM_MEMPOOL_RELEASE_MB")
+            .ok()
+            .and_then(|s| s.parse::<u64>().ok())
+            .map(|mb| mb.saturating_mul(1024 * 1024))
+            .unwrap_or(u64::MAX);
         let _ = sys::cuMemPoolSetAttribute(
             pool,
             sys::CUmemPool_attribute_enum::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index cb3758ff6..000f6daa0 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -241,6 +241,13 @@ pub fn gather_merkle_paths_dev(
         "leaves_len must be a power of two >= 2"
     );
     let depth = leaves_len.trailing_zeros() as usize;
+    // Guard the kernel's device reads: an out-of-range position would walk off
+    // the node buffer (OOB device read). Positions are valid by construction;
+    // this catches any caller bug before it becomes UB.
+    assert!(
+        positions.iter().all(|&p| (p as usize) < leaves_len),
+        "gather_merkle_paths_dev: leaf position >= leaves_len"
+    );
     let be = backend()?;
 
     let pos_dev = stream.clone_htod(positions)?;
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 0dd83ab2d..115b9462a 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -1687,15 +1687,30 @@ where
     if fri_layers.is_empty() {
         return None;
     }
-    let stream = math_cuda::device::backend().ok()?.next_stream();
+    // The GPU FRI commit sets `gpu_tree` on every layer as a group; the CPU
+    // commit sets none. If the layers are host trees, fall back to the host
+    // walk. If they're device-resident, the host trees are root-only — so the
+    // gather below MUST succeed (a failure is a hard abort, not a silent walk).
+    if fri_layers[0].gpu_tree.is_none() {
+        return None;
+    }
+    let stream = math_cuda::device::backend()
+        .expect("cuda backend for device-resident FRI query")
+        .next_stream();
     let num_layers = fri_layers.len();
 
     // Batched gather: one call per layer over all queries.
     let mut per_layer_proofs: Vec<Vec<Proof<Commitment>>> = Vec::with_capacity(num_layers);
     for (l, layer) in fri_layers.iter().enumerate() {
-        let tree = layer.gpu_tree.as_ref()?;
+        let tree = layer
+            .gpu_tree
+            .as_ref()
+            .expect("FRI layers are device-resident as a group");
         let positions: Vec<usize> = iotas.iter().map(|&iota| (iota >> l) >> 1).collect();
-        per_layer_proofs.push(gather_proofs_dev(tree, &positions, &stream)?);
+        per_layer_proofs.push(
+            gather_proofs_dev(tree, &positions, &stream)
+                .expect("device FRI-layer gather failed; resident tree has no host fallback"),
+        );
     }
 
     // Reassemble per-query decommitments, matching the host walk's order.
diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs
index 25ce91569..e9f6a1cda 100644
--- a/crypto/stark/src/lib.rs
+++ b/crypto/stark/src/lib.rs
@@ -3,16 +3,6 @@
 #[cfg(all(target_arch = "wasm32", feature = "disk-spill"))]
 compile_error!("the `disk-spill` feature requires memmap2, which does not compile on wasm32");
 
-/// Open a wall-clock profiling span (no-op unless the `instruments` feature is on).
-/// RAII: records elapsed wall time when the binding drops at end of scope.
-#[macro_export]
-macro_rules! prof_span {
-    ($label:expr) => {
-        #[cfg(feature = "instruments")]
-        let _prof_span = $crate::instruments::span($label);
-    };
-}
-
 #[cfg(feature = "debug-checks")]
 pub mod bus_debug;
 pub mod constraints;
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index c2702fd00..4bc0653fc 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -327,21 +327,29 @@ pub fn table_parallelism() -> usize {
 
 /// Heuristic peak device working-set for one table, in bytes.
 ///
-/// Counts the LDE columns that are co-resident on the GPU — `main` in the base
-/// field (8 B) and `aux` in the ext3 field (24 B) — times a scratch multiplier
-/// for the Merkle / NTT / composition transients allocated alongside them. It
-/// is deliberately a conservative over-estimate: it gates a safety ceiling, not
-/// a precise allocator. Pass `aux_cols == 0` for phases where the aux LDE is
-/// not yet resident (the R1 main commit).
+/// Two contributions:
+/// 1. **LDE columns** co-resident on the GPU — `main` in the base field (8 B)
+///    and `aux` in the ext3 field (24 B) — times a scratch multiplier for the
+///    NTT / leaf-hash transients allocated alongside them.
+/// 2. **Resident Merkle trees** — main, aux, composition, and FRI-layer trees
+///    are now kept on device R1→R4 (no whole-tree D2H). Each full tree is
+///    `~2*lde_size` nodes × 32 B = `64*lde_size`; co-resident at the R4 peak
+///    they sum to a few × that, so `~256 B × lde_size` covers them conservatively.
+///
+/// It is deliberately a conservative over-estimate: it gates a safety ceiling,
+/// not a precise allocator. Pass `aux_cols == 0` for phases where the aux LDE
+/// is not yet resident (the R1 main commit).
 fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) -> u64 {
     const BYTES_PER_BASE: u64 = 8;
     const EXT3_BYTES: u64 = 24;
     const SCRATCH_FACTOR: u64 = 2;
+    const RESIDENT_TREE_BYTES_PER_LDE: u64 = 256;
+    let lde = lde_size as u64;
     let per_row = (main_cols as u64).saturating_mul(BYTES_PER_BASE)
         + (aux_cols as u64).saturating_mul(EXT3_BYTES);
-    (lde_size as u64)
-        .saturating_mul(per_row)
-        .saturating_mul(SCRATCH_FACTOR)
+    let lde_term = lde.saturating_mul(per_row).saturating_mul(SCRATCH_FACTOR);
+    let tree_term = lde.saturating_mul(RESIDENT_TREE_BYTES_PER_LDE);
+    lde_term.saturating_add(tree_term)
 }
 
 /// Plan contiguous table chunks for parallel proving.
@@ -1734,39 +1742,61 @@ pub trait IsStarkProver<
         // proofs (guarded by the `merkle_gather` parity test). Only the
         // non-preprocessed main carries a device tree today; on any miss this is
         // `None` and openings fall back to the host tree below.
+        // `*_dev_proofs` is `Some` exactly when the corresponding tree is
+        // device-resident (so the host tree is a root-only placeholder). In that
+        // case the gather MUST succeed — there is no host tree to fall back to,
+        // so a gather error is a hard abort (not a silent walk of an empty
+        // tree). When the tree is *not* device-resident the value is `None` and
+        // the openings below walk the full host tree as usual.
         #[cfg(feature = "cuda")]
-        let main_dev_proofs: Option<Vec<Proof<Commitment>>> = (!is_preprocessed)
-            .then(|| {
-                let tree = lde_trace.gpu_main()?.tree.as_ref()?;
-                let stream = lde_trace.bound_stream()?;
-                let positions: Vec<usize> = indexes_to_open
-                    .iter()
-                    .flat_map(|&c| [c * 2, c * 2 + 1])
-                    .collect();
-                crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream)
-            })
-            .flatten();
+        let main_dev_proofs: Option<Vec<Proof<Commitment>>> = if is_preprocessed {
+            None
+        } else {
+            lde_trace
+                .gpu_main()
+                .and_then(|h| h.tree.as_ref())
+                .map(|tree| {
+                    let stream = lde_trace
+                        .bound_stream()
+                        .expect("bound stream for device-resident main-tree opening");
+                    let positions: Vec<usize> = indexes_to_open
+                        .iter()
+                        .flat_map(|&c| [c * 2, c * 2 + 1])
+                        .collect();
+                    crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream).expect(
+                        "device main-tree gather failed; resident tree has no host fallback",
+                    )
+                })
+        };
 
         // Same for the aux-trace tree, when it is device-resident.
         #[cfg(feature = "cuda")]
-        let aux_dev_proofs: Option<Vec<Proof<Commitment>>> =
-            round_1_result.aux.as_ref().and_then(|_aux| {
-                let tree = lde_trace.gpu_aux()?.tree.as_ref()?;
-                let stream = lde_trace.bound_stream()?;
+        let aux_dev_proofs: Option<Vec<Proof<Commitment>>> = round_1_result
+            .aux
+            .as_ref()
+            .and_then(|_aux| lde_trace.gpu_aux().and_then(|h| h.tree.as_ref()))
+            .map(|tree| {
+                let stream = lde_trace
+                    .bound_stream()
+                    .expect("bound stream for device-resident aux-tree opening");
                 let positions: Vec<usize> = indexes_to_open
                     .iter()
                     .flat_map(|&c| [c * 2, c * 2 + 1])
                     .collect();
                 crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream)
+                    .expect("device aux-tree gather failed; resident tree has no host fallback")
             });
 
         // Composition tree: openings open a single position `index` (row-pair
         // leaf), so gather one proof per query challenge from the device tree.
         #[cfg(feature = "cuda")]
         let comp_dev_proofs: Option<Vec<Proof<Commitment>>> =
-            round_2_result.gpu_composition_tree.as_ref().and_then(|tree| {
-                let stream = lde_trace.bound_stream()?;
+            round_2_result.gpu_composition_tree.as_ref().map(|tree| {
+                let stream = lde_trace
+                    .bound_stream()
+                    .expect("bound stream for device-resident composition-tree opening");
                 crate::gpu_lde::gather_proofs_dev(tree, indexes_to_open, &stream)
+                    .expect("device composition-tree gather failed; resident tree has no host fallback")
             });
 
         for (qi, index) in indexes_to_open.iter().enumerate() {

From 8ab633ff8d64614cc1f69b0d87fc9d19d48ced2a Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 18:44:14 -0300
Subject: [PATCH 11/17] rm unused functions

---
 crypto/crypto/src/merkle_tree/merkle.rs |  7 +++++++
 crypto/stark/src/gpu_lde.rs             | 23 -----------------------
 crypto/stark/src/prover.rs              | 11 ++++++-----
 3 files changed, 13 insertions(+), 28 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs
index 2060a0f18..1ae40a714 100644
--- a/crypto/crypto/src/merkle_tree/merkle.rs
+++ b/crypto/crypto/src/merkle_tree/merkle.rs
@@ -255,6 +255,13 @@ where
     /// For example, give me an inclusion proof for the 3rd element in the
     /// Merkle tree
     pub fn get_proof_by_pos(&self, pos: usize) -> Option<Proof<B::Node>> {
+        // A root-only tree (from `from_root`) has no nodes to walk — callers
+        // must gather paths from the device-resident copy instead. Catch the
+        // misuse early in debug builds rather than returning a misleading `None`.
+        debug_assert!(
+            !self.nodes.is_empty(),
+            "get_proof_by_pos called on a root-only MerkleTree (no nodes)"
+        );
         let pos = pos + self.node_count() / 2;
         let Ok(merkle_path) = self.build_merkle_path(pos) else {
             return None;
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 115b9462a..1a0acafd5 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -271,29 +271,6 @@ fn restore_columns_on_err<E: IsField>(columns: &mut [Vec<FieldElement<E>>], n: u
     }
 }
 
-/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `lde_size` leaves
-/// and return the node `Vec` (length-initialised, contents undefined) together
-/// with its node count `total_nodes` (`2 * lde_size - 1`). Returns `None` if
-/// the layout would be invalid (`lde_size < 2` or `total_nodes * 32` overflows
-/// `usize`). The caller builds the `&mut [u8]` byte view of length
-/// `total_nodes * 32` and must overwrite every byte via the GPU D2H.
-fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> {
-    if lde_size < 2 {
-        return None;
-    }
-    let total_nodes = 2usize.saturating_mul(lde_size).checked_sub(1)?;
-    let _byte_len = total_nodes.checked_mul(32)?;
-    let mut nodes: Vec<[u8; 32]> = Vec::with_capacity(total_nodes);
-    // SAFETY: every byte will be overwritten via the GPU D2H before the
-    // contents are read. The caller computes the byte-length view from the
-    // returned `nodes` Vec using `total_nodes.checked_mul(32)`.
-    #[allow(clippy::uninit_vec)]
-    unsafe {
-        nodes.set_len(total_nodes)
-    };
-    Some((nodes, total_nodes))
-}
-
 /// Try to GPU-batch all columns in one pass.
 ///
 /// Engaged for Goldilocks-base and ext3 tables whose LDE size is above the
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 4bc0653fc..b668e2364 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -357,11 +357,12 @@ fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize)
 /// A chunk grows until it reaches `k` tables (the core/RAM-bound limit) **or**
 /// its summed VRAM estimate would exceed `budget` — whichever comes first. A
 /// single table larger than `budget` forms its own chunk (it runs solo rather
-/// than being excluded). With `budget == u64::MAX` this degrades exactly to
-/// fixed chunks of `k`, identical to the previous `step_by(k)` scheme — so on
-/// non-cuda builds and when VRAM isn't the binding constraint, scheduling (and
-/// therefore the proof) is unchanged. Returns `(start, end)` half-open ranges
-/// covering `0..estimates.len()` in order.
+/// than being excluded). With `budget == u64::MAX` the VRAM constraint is never
+/// binding for any realistic estimate (a chunk's summed estimate can't approach
+/// `u64::MAX`), so chunks fall back to fixed size `k` — identical to the previous
+/// `step_by(k)` scheme. So on non-cuda builds and when VRAM isn't the binding
+/// constraint, scheduling (and therefore the proof) is unchanged. Returns
+/// `(start, end)` half-open ranges covering `0..estimates.len()` in order.
 fn plan_table_chunks(estimates: &[u64], k: usize, budget: u64) -> Vec<(usize, usize)> {
     let n = estimates.len();
     let k = k.max(1);

From fd31fc9d9d346bfea6882c034994610d84baaf18 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 29 Jun 2026 19:09:29 -0300
Subject: [PATCH 12/17] cleanup

---
 crypto/crypto/src/merkle_tree/merkle.rs |  13 +-
 crypto/math-cuda/src/device.rs          |  30 +++--
 crypto/math-cuda/src/fri.rs             |   2 +-
 crypto/math-cuda/src/lde.rs             | 155 ++++++------------------
 crypto/math-cuda/src/merkle.rs          |  25 ++--
 crypto/math-cuda/tests/merkle_gather.rs |   6 +-
 crypto/stark/src/fri/fri_commitment.rs  |   2 +-
 crypto/stark/src/gpu_lde.rs             |  45 ++++---
 crypto/stark/src/instruments.rs         |  25 ++--
 crypto/stark/src/prover.rs              | 106 +++++++---------
 crypto/stark/src/trace.rs               |  97 +++------------
 11 files changed, 175 insertions(+), 331 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs
index 1ae40a714..a32f12978 100644
--- a/crypto/crypto/src/merkle_tree/merkle.rs
+++ b/crypto/crypto/src/merkle_tree/merkle.rs
@@ -168,10 +168,9 @@ where
         })
     }
 
-    /// Create a root-only Merkle tree placeholder: stores the commitment root
-    /// but no nodes. Used when the tree's authentication paths are gathered from
-    /// a device-resident copy (GPU) instead of this host tree, so the host nodes
-    /// are never materialised (saving the full-tree Device→Host copy).
+    /// Create a root only Merkle tree placeholder: stores the commitment root
+    /// but no nodes. Used when paths are gathered from a device resident copy
+    /// (GPU) instead of this host tree, so the host nodes are never built.
     /// [`get_proof_by_pos`](Self::get_proof_by_pos) must NOT be called on it.
     pub fn from_root(root: B::Node) -> Self {
         MerkleTree {
@@ -255,9 +254,9 @@ where
     /// For example, give me an inclusion proof for the 3rd element in the
     /// Merkle tree
     pub fn get_proof_by_pos(&self, pos: usize) -> Option<Proof<B::Node>> {
-        // A root-only tree (from `from_root`) has no nodes to walk — callers
-        // must gather paths from the device-resident copy instead. Catch the
-        // misuse early in debug builds rather than returning a misleading `None`.
+        // A root only tree (from `from_root`) has no nodes to walk. Callers must
+        // gather paths from the device resident copy instead. Catch the misuse
+        // early in debug builds rather than returning a misleading None.
         debug_assert!(
             !self.nodes.is_empty(),
             "get_proof_by_pos called on a root-only MerkleTree (no nodes)"
diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs
index ef99f05af..3dfaf629d 100644
--- a/crypto/math-cuda/src/device.rs
+++ b/crypto/math-cuda/src/device.rs
@@ -214,13 +214,13 @@ fn retain_default_mempool(ctx: &CudaContext) {
     }
 }
 
-/// Device VRAM budget (bytes) for table-session admission control.
+/// Device VRAM budget in bytes for table session admission control.
 ///
-/// `LAMBDA_VM_VRAM_BUDGET_MB` overrides it explicitly — used to force-exercise
-/// the throttle in tests/benchmarks. Otherwise it is 80% of total device
-/// memory, leaving headroom for the context, module code, and the pool's
-/// retained blocks. On any query failure it returns `u64::MAX`, which disables
-/// budgeting: admission then falls back to the core-bound chunk size alone.
+/// LAMBDA_VM_VRAM_BUDGET_MB overrides it (used to force the throttle in tests).
+/// Otherwise it is 80% of total device memory, leaving headroom for the
+/// context, module code, and retained pool blocks. Returns u64::MAX on any
+/// query failure, which disables budgeting (chunks fall back to the core bound
+/// size alone).
 fn detect_vram_budget_bytes(ctx: &CudaContext) -> u64 {
     if let Ok(mb) = std::env::var("LAMBDA_VM_VRAM_BUDGET_MB")
         && let Ok(mb) = mb.parse::<u64>()
@@ -255,17 +255,15 @@ impl Backend {
         // before returning), so the tracking is pure overhead. Disable it.
         unsafe { ctx.disable_event_tracking() };
 
-        // Retain freed device memory in the stream-ordered pool for reuse.
+        // Retain freed device memory in the stream ordered pool for reuse.
         //
-        // cudarc routes `CudaStream::alloc*` through `cuMemAllocAsync`, which
-        // draws from the device's default memory pool. That pool's release
-        // threshold defaults to 0, so every freed buffer is handed back to the
-        // OS at the next sync — meaning the prover's large, repeatedly-shaped
-        // LDE / FRI buffers are re-malloc'd from scratch each op. Raising the
-        // threshold to "unbounded" keeps freed blocks resident in the pool so
-        // subsequent allocations of the same size are satisfied without a real
-        // driver allocation. Best-effort: on any error (no pool support, sync
-        // allocator) we silently keep the current behaviour.
+        // cudarc routes CudaStream::alloc* through cuMemAllocAsync, drawing from
+        // the device default memory pool. Its release threshold defaults to 0,
+        // so every freed buffer goes back to the OS at the next sync and the
+        // prover's large LDE/FRI buffers are rebuilt from scratch each op.
+        // Raising the threshold keeps freed blocks in the pool so a same size
+        // allocation skips a real driver allocation. Best effort: on any error
+        // we keep the current behaviour.
         retain_default_mempool(&ctx);
 
         let arith = ctx.load_module(Ptx::from_src(ARITH_PTX))?;
diff --git a/crypto/math-cuda/src/fri.rs b/crypto/math-cuda/src/fri.rs
index e4c60a8a8..8fe6fcdb1 100644
--- a/crypto/math-cuda/src/fri.rs
+++ b/crypto/math-cuda/src/fri.rs
@@ -215,7 +215,7 @@ impl FriCommitState {
         };
 
         // Keep the layer tree resident on device; copy only the 32-byte root so
-        // R4 query openings gather paths on device instead of D2H'ing the tree.
+        // R4 query openings gather paths on device instead of copying the tree.
         let mut root = [0u8; 32];
         self.stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?;
         self.stream.synchronize()?;
diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs
index 8fc309ae5..ad0fd9152 100644
--- a/crypto/math-cuda/src/lde.rs
+++ b/crypto/math-cuda/src/lde.rs
@@ -220,10 +220,9 @@ fn launch_pointwise_mul_batched(
 /// Layout: `m` columns, each `lde_size` u64s, column `c` at byte offset
 /// `c * lde_size * 8` within `buf`. Freed when `buf` Arc drops.
 ///
-/// `tree` optionally carries the main-trace Merkle tree kept resident on device
-/// (populated by the keep path), so R4 query openings can gather authentication
-/// paths on device instead of D2H'ing the whole tree to host. `None` on the CPU
-/// path or when the tree wasn't retained.
+/// `tree` optionally carries the main trace Merkle tree kept resident on device
+/// (the keep path), so R4 query openings gather paths on device instead of
+/// copying the whole tree to host. None on the CPU path.
 #[derive(Clone)]
 pub struct GpuLdeBase {
     pub buf: Arc<CudaSlice<u64>>,
@@ -240,24 +239,22 @@ pub struct GpuLdeExt3 {
     pub buf: Arc<CudaSlice<u64>>,
     pub m: usize,
     pub lde_size: usize,
-    /// Optionally the aux/composition Merkle tree kept resident on device (the
-    /// keep path), so R4 openings gather paths on device instead of D2H'ing the
-    /// whole tree. `None` on the CPU path or when not retained.
+    /// Optionally the aux or composition Merkle tree kept resident on device
+    /// (the keep path), so R4 openings gather paths on device. None otherwise.
     pub tree: Option<GpuMerkleTree>,
 }
 
-/// Handle to a Merkle tree kept live on device after a commit, so query
-/// openings can gather authentication paths on device instead of D2H'ing the
-/// whole tree to host. Node layout matches the CPU tree
-/// (`crypto/crypto/src/merkle_tree`): `nodes[0..leaves_len-1]` are inner nodes
-/// (root at index 0), `nodes[leaves_len-1..]` are the leaves; each node is 32
+/// Merkle tree kept resident on device after a commit, so query openings gather
+/// paths on device instead of copying the whole tree to host. Node layout
+/// matches the CPU tree (`crypto/crypto/src/merkle_tree`): `nodes[0..leaves_len-1]`
+/// are inner nodes (root at 0), `nodes[leaves_len-1..]` are the leaves, each 32
 /// bytes. Freed when the `nodes` Arc drops.
 #[derive(Clone)]
 pub struct GpuMerkleTree {
     pub nodes: Arc<CudaSlice<u8>>,
     pub leaves_len: usize,
-    /// The Merkle root (node 0), copied to host at build time (32 bytes) so the
-    /// commitment is available without D2H'ing the whole tree.
+    /// The Merkle root (node 0), copied to host at build time so the commitment
+    /// is available without copying the whole tree.
     pub root: [u8; 32],
 }
 
@@ -643,41 +640,11 @@ pub fn coset_lde_batch_base_into_with_leaf_hash(
     .map(|_| ())
 }
 
-/// Like `coset_lde_batch_base_into_with_leaf_hash`, but also builds the full
-/// Merkle tree on device and returns the `2*lde_size - 1` node buffer back
-/// to the caller in `merkle_nodes_out` (byte length `(2*lde_size - 1) * 32`).
-///
-/// The leaf hashes are never exposed to the caller — they stay on device and
-/// feed straight into the pair-hash tree kernel, avoiding the
-/// pinned→pageable→pinned round-trip that the separate-step GPU tree build
-/// would pay.
-pub fn coset_lde_batch_base_into_with_merkle_tree(
-    columns: &[&[u64]],
-    blowup_factor: usize,
-    weights: &[u64],
-    outputs: &mut [&mut [u64]],
-    merkle_nodes_out: &mut [u8],
-) -> Result<()> {
-    coset_lde_batch_base_into_with_merkle_tree_inner(
-        columns,
-        blowup_factor,
-        weights,
-        outputs,
-        Some(merkle_nodes_out),
-        KeccakCommit::FullTree,
-        false,
-        false,
-    )
-    .map(|_| ())
-}
-
-/// Fused LDE + leaf-hash + Merkle tree build, keeping **both** the LDE buffer
-/// and the Merkle tree resident on device (returned in `GpuLdeBase` + its
-/// `tree`, including the root). The host tree is **not** materialised — there is
-/// no `merkle_nodes_out` — so the whole-tree Device→Host copy is eliminated.
-/// Callers gather query authentication paths from the device tree instead
-/// (`crate::merkle::gather_merkle_paths_dev`) and use a root-only host tree for
-/// the commitment.
+/// Fused LDE, leaf hash, and Merkle tree build that keeps both the LDE buffer
+/// and the Merkle tree (with root) resident on device, returned in `GpuLdeBase`.
+/// The host tree is not built (no `merkle_nodes_out`), so the whole tree copy to
+/// host is eliminated. Callers gather query paths from the device tree
+/// (`crate::merkle::gather_merkle_paths_dev`) and use a root only host tree.
 pub fn coset_lde_batch_base_into_with_merkle_tree_keep(
     columns: &[&[u64]],
     blowup_factor: usize,
@@ -840,20 +807,20 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
         crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
     }
 
-    // D2H the LDE columns via pinned staging (still read on host by constraint
-    // eval / openings). The full tree nodes are D2H'd only when the caller asks
-    // for a host tree (`nodes_out`); the device-resident path passes `None` and
-    // keeps the tree on device, eliminating the whole-tree D2H.
+    // Copy the LDE columns to host via pinned staging (constraint eval and
+    // openings still read them). The full tree is copied only when the caller
+    // wants a host tree (`nodes_out`); the resident path passes None and keeps
+    // the tree on device.
     stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?;
     if let Some(no) = nodes_out {
         d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, no)?;
     }
-    // Ensure the (async) LDE D2H above has landed before reading `pinned`.
-    // Previously the tree D2H's `synchronize` covered this; the device-resident
-    // path skips that D2H, so synchronize explicitly.
+    // Make sure the async LDE copy above landed before reading `pinned`. The
+    // tree copy used to provide this sync; the resident path skips it, so sync
+    // here.
     stream.synchronize()?;
 
-    // Copy pinned into caller outputs. Runs under the pinned-staging lock,
+    // Copy pinned into caller outputs. Runs under the pinned staging lock,
     // where rayon can deadlock. See `Backend::pinned_staging`.
     for (c, dst) in outputs.iter_mut().enumerate() {
         dst.copy_from_slice(&pinned[c * lde_size..c * lde_size + lde_size]);
@@ -861,11 +828,11 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     drop(staging);
 
     if keep_device_buf {
-        // Retain the device tree nodes for on-device opening (FullTree only; in
-        // LeavesOnly mode there are no inner nodes to gather paths from).
+        // Retain the device tree for on device opening (FullTree only; LeavesOnly
+        // has no inner nodes to gather paths from).
         let tree = if keep_tree && commit == KeccakCommit::FullTree {
-            // Copy just the 32-byte root (node 0) so the commitment is available
-            // without D2H'ing the whole tree.
+            // Copy just the 32 byte root (node 0) so the commitment is available
+            // without copying the whole tree.
             let mut root = [0u8; 32];
             stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?;
             stream.synchronize()?;
@@ -891,60 +858,10 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner(
     }
 }
 
-/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused
-/// LDE + Keccak-256 leaf hashing over ext3 columns. Thin wrapper over
-/// `coset_lde_batch_ext3_into_with_merkle_tree_inner` with `LeavesOnly`.
-pub fn coset_lde_batch_ext3_into_with_leaf_hash(
-    columns: &[&[u64]],
-    n: usize,
-    blowup_factor: usize,
-    weights: &[u64],
-    outputs: &mut [&mut [u64]],
-    hashed_leaves_out: &mut [u8],
-) -> Result<()> {
-    coset_lde_batch_ext3_into_with_merkle_tree_inner(
-        columns,
-        n,
-        blowup_factor,
-        weights,
-        outputs,
-        Some(hashed_leaves_out),
-        KeccakCommit::LeavesOnly,
-        false,
-        false,
-    )
-    .map(|_| ())
-}
-
-/// Ext3 variant of the fused `coset_lde_batch_base_into_with_merkle_tree`.
-/// LDE + leaf hashing + inner-tree build, all on device; D2Hs only the LDE
-/// evaluations and the full `2*lde_size - 1` node buffer.
-pub fn coset_lde_batch_ext3_into_with_merkle_tree(
-    columns: &[&[u64]],
-    n: usize,
-    blowup_factor: usize,
-    weights: &[u64],
-    outputs: &mut [&mut [u64]],
-    merkle_nodes_out: &mut [u8],
-) -> Result<()> {
-    coset_lde_batch_ext3_into_with_merkle_tree_inner(
-        columns,
-        n,
-        blowup_factor,
-        weights,
-        outputs,
-        Some(merkle_nodes_out),
-        KeccakCommit::FullTree,
-        false,
-        false,
-    )
-    .map(|_| ())
-}
-
 /// Ext3 variant of [`coset_lde_batch_base_into_with_merkle_tree_keep`]: keeps
-/// both the de-interleaved LDE buffer and the Merkle tree (with root) resident
-/// on device. The host tree is **not** materialised (no `merkle_nodes_out`), so
-/// the whole-tree D2H is eliminated; openings gather paths from the device tree.
+/// both the deinterleaved LDE buffer and the Merkle tree (with root) resident on
+/// device. No host tree is built, so the whole tree copy to host is eliminated;
+/// openings gather paths from the device tree.
 pub fn coset_lde_batch_ext3_into_with_merkle_tree_keep(
     columns: &[&[u64]],
     n: usize,
@@ -1103,15 +1020,15 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner(
         crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?;
     }
 
-    // D2H the LDE columns. The full tree nodes are D2H'd only when the caller
-    // asks for a host tree (`nodes_out`); the device-resident path passes `None`
-    // and keeps the tree on device, eliminating the whole-tree D2H.
+    // Copy the LDE columns to host. The full tree is copied only when the caller
+    // wants a host tree (`nodes_out`); the resident path passes None and keeps
+    // the tree on device.
     stream.memcpy_dtoh(&buf, &mut pinned[..mb * lde_size])?;
     if let Some(no) = nodes_out {
         d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, no)?;
     }
-    // Ensure the (async) LDE D2H has landed before reading `pinned` (the tree
-    // D2H's synchronize used to cover this; the device-resident path skips it).
+    // Make sure the async LDE copy landed before reading `pinned` (the tree copy
+    // used to provide this sync; the resident path skips it).
     stream.synchronize()?;
 
     unpack_pinned_slabs_to_ext3(pinned, outputs, lde_size);
diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index 000f6daa0..2ca5d25ec 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -221,11 +221,10 @@ pub fn build_merkle_tree_on_device(hashed_leaves: &[u8]) -> Result<Vec<u8>> {
 /// Gather Merkle authentication paths on device for `positions` (leaf indices)
 /// against the resident tree `nodes_dev` (standard layout, `2*leaves_len-1`
 /// nodes of 32 bytes). Returns `positions.len() * depth * 32` bytes, where
-/// `depth = log2(leaves_len)`: query `q`'s path occupies bytes
-/// `[q*depth*32 .. (q+1)*depth*32]`, each 32-byte node a sibling from leaf to
-/// root — byte-for-byte the same nodes the CPU `MerkleTree::get_proof_by_pos`
-/// (`build_merkle_path`) collects. Runs on the caller's `stream` (pass the
-/// table's session stream so it shares the queue with the rest of R4).
+/// `depth = log2(leaves_len)`. Query `q`'s path is `[q*depth*32 ..
+/// (q+1)*depth*32]`, each 32 byte node a sibling from leaf to root. These are
+/// the same nodes the CPU `MerkleTree::get_proof_by_pos` collects. Runs on the
+/// caller's `stream` (pass the table's session stream).
 pub fn gather_merkle_paths_dev(
     nodes_dev: &CudaSlice<u8>,
     leaves_len: usize,
@@ -241,9 +240,9 @@ pub fn gather_merkle_paths_dev(
         "leaves_len must be a power of two >= 2"
     );
     let depth = leaves_len.trailing_zeros() as usize;
-    // Guard the kernel's device reads: an out-of-range position would walk off
-    // the node buffer (OOB device read). Positions are valid by construction;
-    // this catches any caller bug before it becomes UB.
+    // Guard the kernel's device reads: a position past leaves_len would walk
+    // off the node buffer. Positions are valid by construction; this catches a
+    // caller bug before it becomes an out of bounds device read.
     assert!(
         positions.iter().all(|&p| (p as usize) < leaves_len),
         "gather_merkle_paths_dev: leaf position >= leaves_len"
@@ -286,9 +285,9 @@ pub fn gather_merkle_paths_dev(
 ///
 /// Returns `(2*(lde_size/2) - 1) * 32` bytes of tree nodes in the standard
 /// layout (root at byte offset 0, leaves in the tail).
-/// Build the composition-poly Merkle tree on device (leaves hash row-pairs, so
-/// `num_leaves = lde_size / 2`). Returns the device node buffer, the leaf count,
-/// and the stream it was built on. Shared by the host-D2H and device-keep
+/// Build the composition Merkle tree on device. Leaves hash row pairs, so
+/// `num_leaves = lde_size / 2`. Returns the device node buffer, the leaf count,
+/// and the stream it was built on. Shared by the host copy and device keep
 /// wrappers below.
 fn build_comp_poly_tree_nodes_dev(
     parts_interleaved: &[&[u64]],
@@ -366,8 +365,8 @@ pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Res
 
 /// Like [`build_comp_poly_tree_from_evals_ext3`] but keeps the tree nodes on
 /// device (returned as a [`crate::lde::GpuMerkleTree`] with its root), so R4
-/// composition openings gather authentication paths on device instead of
-/// D2H'ing the whole tree. `leaves_len = lde_size / 2` (row-pair leaves).
+/// composition openings gather paths on device instead of copying the whole
+/// tree to host. `leaves_len = lde_size / 2` (row pair leaves).
 pub fn build_comp_poly_tree_from_evals_ext3_keep(
     parts_interleaved: &[&[u64]],
 ) -> Result<crate::lde::GpuMerkleTree> {
diff --git a/crypto/math-cuda/tests/merkle_gather.rs b/crypto/math-cuda/tests/merkle_gather.rs
index b2ba24c1d..36e05a719 100644
--- a/crypto/math-cuda/tests/merkle_gather.rs
+++ b/crypto/math-cuda/tests/merkle_gather.rs
@@ -1,7 +1,7 @@
 //! Parity: GPU `gather_merkle_paths_dev` must produce, for each leaf position,
-//! the exact `merkle_path` the CPU `MerkleTree::get_proof_by_pos` returns —
-//! same sibling-from-leaf-to-root order, byte-for-byte. This is the gate for
-//! gathering R4 query openings on device instead of D2H'ing the whole tree.
+//! the exact `merkle_path` the CPU `MerkleTree::get_proof_by_pos` returns: the
+//! same sibling order from leaf to root, byte for byte. This is the gate for
+//! gathering R4 query openings on device instead of copying the whole tree.
 
 use crypto::merkle_tree::backends::field_element_vector::FieldElementVectorBackend;
 use crypto::merkle_tree::merkle::MerkleTree;
diff --git a/crypto/stark/src/fri/fri_commitment.rs b/crypto/stark/src/fri/fri_commitment.rs
index 9f950c27e..58c9eed77 100644
--- a/crypto/stark/src/fri/fri_commitment.rs
+++ b/crypto/stark/src/fri/fri_commitment.rs
@@ -15,7 +15,7 @@ where
     pub merkle_tree: MerkleTree<B>,
     /// The layer's Merkle tree kept resident on device (GPU FRI commit path),
     /// so R4 query openings gather authentication paths on device. When set,
-    /// `merkle_tree` is a root-only placeholder. `None` on the CPU path.
+    /// `merkle_tree` is a root only placeholder. `None` on the CPU path.
     #[cfg(feature = "cuda")]
     pub gpu_tree: Option<math_cuda::lde::GpuMerkleTree>,
 }
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 1a0acafd5..3f2debb5e 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -430,13 +430,12 @@ pub fn gpu_leaf_hash_calls() -> u64 {
     GPU_LEAF_HASH_CALLS.load(Ordering::Relaxed)
 }
 
-/// Fused base-field path: LDE + Keccak-256 leaf hash + Merkle tree build, all
-/// on device, keeping **both** the LDE buffer and the Merkle tree resident on
-/// device. On success: `columns[c]` is resized to `lde_size` with the LDE
-/// output, and the returned `GpuLdeBase` carries the device LDE buffer plus the
-/// device tree (`.tree`). The returned `MerkleTree` is **root-only** — the host
-/// tree nodes are never materialised (no whole-tree D2H); query openings gather
-/// authentication paths from the device tree via [`gather_proofs_dev`].
+/// Fused base field path: LDE, leaf hash, and Merkle tree build, all on device,
+/// keeping both the LDE buffer and the tree resident. On success `columns[c]` is
+/// resized to `lde_size` with the LDE output, and the returned `GpuLdeBase`
+/// carries the device LDE buffer plus the device tree (`.tree`). The returned
+/// `MerkleTree` is root only (no host tree, no whole tree copy); query openings
+/// gather paths from the device tree via [`gather_proofs_dev`].
 pub(crate) fn try_expand_leaf_and_tree_batched_keep<F, E, B>(
     columns: &mut [Vec<FieldElement<E>>],
     blowup_factor: usize,
@@ -749,9 +748,9 @@ where
         })
         .collect();
 
-    // Keep the composition tree resident on device; the whole-tree D2H is
-    // eliminated. R4 composition openings gather paths from the device tree
-    // (`gather_proofs_dev`); the returned host tree is root-only.
+    // Keep the composition tree resident on device, so the whole tree copy to
+    // host is eliminated. R4 composition openings gather paths from the device
+    // tree (`gather_proofs_dev`); the returned host tree is root only.
     let dev_tree = match math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&raw_parts) {
         Ok(t) => t,
         Err(_) => return None,
@@ -1606,9 +1605,9 @@ where
             }
         };
 
-        // Build the FriLayer: ext3 evals + a root-only host tree (the layer tree
-        // stays resident on device in `gpu_tree`; query openings gather paths
-        // from it via `gather_proofs_dev`).
+        // Build the FriLayer: ext3 evals and a root only host tree. The layer
+        // tree stays resident on device in `gpu_tree`; query openings gather
+        // paths from it via `gather_proofs_dev`.
         let evaluation = u64_to_ext3_vec::<E>(&layer_evals_u64);
         let root = dev_tree.root;
         let merkle_tree = MerkleTree::<FriLayerMerkleTreeBackend<E>>::from_root(root);
@@ -1645,14 +1644,14 @@ where
     Some((last_value, fri_layer_list))
 }
 
-/// GPU FRI query phase: gather each layer's authentication paths on device
-/// instead of walking host trees. For layer `l` and query `iota`, the opened
-/// position is `(iota >> l) >> 1` — matching [`crate::fri::query_phase`]. Paths
-/// for all queries are gathered in one batched call per layer. The layer
-/// evaluations (`evaluation[index ^ 1]`) are read from the host Vecs as before.
+/// GPU FRI query phase: gather each layer's paths on device instead of walking
+/// host trees. For layer `l` and query `iota` the opened position is
+/// `(iota >> l) >> 1`, matching [`crate::fri::query_phase`]. Paths for all
+/// queries are gathered in one batched call per layer. The layer evaluations
+/// (`evaluation[index ^ 1]`) are read from the host Vecs as before.
 ///
-/// Returns `None` if there are no layers or any layer lacks a device tree (a
-/// CPU-committed layer), so the caller falls back to the host walk.
+/// Returns None when there are no layers or the layers are host trees (CPU
+/// commit), so the caller falls back to the host walk.
 pub(crate) fn try_fri_query_phase_gpu<E>(
     fri_layers: &[FriLayer<E, FriLayerMerkleTreeBackend<E>>],
     iotas: &[usize],
@@ -1665,9 +1664,9 @@ where
         return None;
     }
     // The GPU FRI commit sets `gpu_tree` on every layer as a group; the CPU
-    // commit sets none. If the layers are host trees, fall back to the host
-    // walk. If they're device-resident, the host trees are root-only — so the
-    // gather below MUST succeed (a failure is a hard abort, not a silent walk).
+    // commit sets none. Host trees fall back to the host walk. When the layers
+    // are device resident the host trees are root only, so the gather below must
+    // succeed (a failure is a hard abort, not a silent walk).
     if fri_layers[0].gpu_tree.is_none() {
         return None;
     }
diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs
index 83e14545b..784e5e1ad 100644
--- a/crypto/stark/src/instruments.rs
+++ b/crypto/stark/src/instruments.rs
@@ -8,17 +8,18 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 // Wall-clock span timeline (the trustworthy per-step measurement)
 // =========================================================================
 //
-// Nested wall-clock spans opened/closed on the driving (main) thread at phase
-// boundaries. Unlike the `accum_*` / thread-local sub-timers below — which sum
-// per-worker CPU time across rayon threads and over-count (percentages > 100%) —
-// these spans are non-overlapping and sum to their parent, so the tree is a true
-// latency breakdown. Parallel regions are measured as a single span around the
-// blocking call (that IS their latency); their internal split is reported
-// separately as CPU-time, never mixed into the wall tree.
+// Nested wall clock spans opened and closed on the driving (main) thread at
+// phase boundaries. Unlike the `accum_*` thread local sub timers below (which
+// sum per worker CPU time across rayon threads and over count, so percentages
+// exceed 100%), these spans do not overlap and sum to their parent, so the tree
+// is a true latency breakdown. Parallel regions are measured as a single span
+// around the blocking call (that is their latency); their internal split is
+// reported separately as CPU time, never mixed into the wall tree.
 //
 //     let _s = instruments::span("trace_build");   // RAII, stops on drop
 //
-// `Instant::now()` is ~20 ns — fine at phase granularity; never inside per-op loops.
+// `Instant::now()` is about 20 ns, fine at phase granularity; never put it
+// inside per op loops.
 
 #[derive(Clone, Debug)]
 pub struct SpanRecord {
@@ -27,8 +28,8 @@ pub struct SpanRecord {
     pub wall: Duration,
     /// Open-order, so the tree reconstructs in start-order (records push on close).
     pub order: u32,
-    /// Wall-clock epoch (ns) when the span opened — for aligning with external
-    /// samplers (e.g. nvidia-smi GPU-util) to attribute device-busy time per step.
+    /// Wall clock epoch (ns) when the span opened, for aligning with external
+    /// samplers (e.g. nvidia-smi GPU util) to attribute device busy time per step.
     pub start_ns: u128,
 }
 
@@ -269,8 +270,8 @@ pub fn take_r1_sub() -> Round1SubOps {
 /// Reset all instrument state. Call at the start of `multi_prove` to avoid
 /// stale data from a previous run in the same process.
 ///
-/// Note: thread-local stores (R2_SUB, R4_SUB, ROUND_SUB_OPS) are only cleared
-/// for the calling thread. Rayon worker threads are not reset — stale data is
+/// Note: thread local stores (R2_SUB, R4_SUB, ROUND_SUB_OPS) are only cleared
+/// for the calling thread. Rayon worker threads are not reset, so stale data is
 /// possible if a previous run panicked without consuming stored values.
 /// In practice this is safe because store/take pairs always execute within the
 /// same rayon task closure.
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index b668e2364..1e4c40bfc 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -325,20 +325,14 @@ pub fn table_parallelism() -> usize {
     }
 }
 
-/// Heuristic peak device working-set for one table, in bytes.
-///
-/// Two contributions:
-/// 1. **LDE columns** co-resident on the GPU — `main` in the base field (8 B)
-///    and `aux` in the ext3 field (24 B) — times a scratch multiplier for the
-///    NTT / leaf-hash transients allocated alongside them.
-/// 2. **Resident Merkle trees** — main, aux, composition, and FRI-layer trees
-///    are now kept on device R1→R4 (no whole-tree D2H). Each full tree is
-///    `~2*lde_size` nodes × 32 B = `64*lde_size`; co-resident at the R4 peak
-///    they sum to a few × that, so `~256 B × lde_size` covers them conservatively.
-///
-/// It is deliberately a conservative over-estimate: it gates a safety ceiling,
-/// not a precise allocator. Pass `aux_cols == 0` for phases where the aux LDE
-/// is not yet resident (the R1 main commit).
+/// Heuristic peak device working set for one table, in bytes. Two parts: the
+/// LDE columns co-resident on the GPU (main base field at 8 B, aux ext3 at
+/// 24 B) times a scratch factor for the NTT and leaf hash transients, plus the
+/// resident Merkle trees (main, aux, composition, FRI layers) kept on device R1
+/// to R4. Each full tree is about `2*lde_size` nodes of 32 B (`64*lde_size`);
+/// together at the R4 peak about 256 B per `lde_size` covers them. A deliberate
+/// over estimate that gates a safety ceiling, not a precise allocator. Pass
+/// `aux_cols == 0` where the aux LDE is not yet resident (R1 main commit).
 fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) -> u64 {
     const BYTES_PER_BASE: u64 = 8;
     const EXT3_BYTES: u64 = 24;
@@ -354,15 +348,14 @@ fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize)
 
 /// Plan contiguous table chunks for parallel proving.
 ///
-/// A chunk grows until it reaches `k` tables (the core/RAM-bound limit) **or**
-/// its summed VRAM estimate would exceed `budget` — whichever comes first. A
+/// A chunk grows until it reaches `k` tables (the core and RAM bound limit) or
+/// its summed VRAM estimate would exceed `budget`, whichever comes first. A
 /// single table larger than `budget` forms its own chunk (it runs solo rather
 /// than being excluded). With `budget == u64::MAX` the VRAM constraint is never
-/// binding for any realistic estimate (a chunk's summed estimate can't approach
-/// `u64::MAX`), so chunks fall back to fixed size `k` — identical to the previous
-/// `step_by(k)` scheme. So on non-cuda builds and when VRAM isn't the binding
-/// constraint, scheduling (and therefore the proof) is unchanged. Returns
-/// `(start, end)` half-open ranges covering `0..estimates.len()` in order.
+/// binding for any realistic estimate, so chunks fall back to fixed size `k`,
+/// the same as the previous `step_by(k)` scheme. So on non-cuda builds and when
+/// VRAM is not binding, scheduling (and therefore the proof) is unchanged.
+/// Returns `(start, end)` half open ranges covering `0..estimates.len()` in order.
 fn plan_table_chunks(estimates: &[u64], k: usize, budget: u64) -> Vec<(usize, usize)> {
     let n = estimates.len();
     let k = k.max(1);
@@ -399,10 +392,10 @@ where
     pub(crate) composition_poly_merkle_tree: BatchedMerkleTree<F>,
     /// The commitment to the composition polynomial parts.
     pub(crate) composition_poly_root: Commitment,
-    /// The composition-poly Merkle tree kept resident on device (when the R2
-    /// GPU tree path ran), so R4 openings gather authentication paths on device
-    /// instead of walking a host tree. When set, `composition_poly_merkle_tree`
-    /// is a root-only placeholder. `None` on the CPU path.
+    /// The composition Merkle tree kept resident on device (when the R2 GPU tree
+    /// path ran), so R4 openings gather paths on device instead of walking a host
+    /// tree. When set, `composition_poly_merkle_tree` is a root only placeholder.
+    /// `None` on the CPU path.
     #[cfg(feature = "cuda")]
     pub(crate) gpu_composition_tree: Option<math_cuda::lde::GpuMerkleTree>,
 }
@@ -1148,9 +1141,9 @@ pub trait IsStarkProver<
         let t_sub = Instant::now();
         // GPU fast path for the comp-poly Merkle commit: row-pair Keccak
         // leaves + device-side inner tree, both wrapping the host eval Vecs.
-        // GPU path keeps the composition tree resident on device (no whole-tree
-        // D2H) and returns a root-only host tree; the device tree is threaded to
-        // R4 in `Round2.gpu_composition_tree`.
+        // GPU path keeps the composition tree resident on device (no whole tree
+        // copy) and returns a root only host tree. The device tree is threaded
+        // to R4 in `Round2.gpu_composition_tree`.
         #[cfg(feature = "cuda")]
         let (composition_poly_merkle_tree, composition_poly_root, gpu_composition_tree) =
             match crate::gpu_lde::try_build_comp_poly_tree_gpu::<
@@ -1179,9 +1172,8 @@ pub trait IsStarkProver<
         #[cfg(feature = "instruments")]
         crate::instruments::store_r2_sub(constraints_dur, fft_dur, merkle_dur);
 
-        // Fold the R2 device composition-parts handle into the per-table session
-        // (resident R2→R4). The host evaluations remain in `Round2` as the
-        // mirror R4 openings still read; `composition_host_mirror` flags that.
+        // Fold the R2 device composition parts handle into the session (resident
+        // R2 to R4). The host evaluations stay in `Round2` for R4 openings.
         #[cfg(feature = "cuda")]
         if let Some(handle) = gpu_composition_parts {
             round_1_result.lde_trace.set_gpu_composition_parts(handle);
@@ -1690,10 +1682,10 @@ pub trait IsStarkProver<
 
     /// Like [`Self::open_polys_with`], but uses Merkle proofs already gathered
     /// from the resident device tree (see [`crate::gpu_lde::gather_proofs_dev`])
-    /// instead of walking a host tree. The evaluations are still gathered from
-    /// the host-resident LDE columns via `gather`. `proof` is for leaf position
-    /// `challenge * 2`, `proof_sym` for `challenge * 2 + 1` — the same positions
-    /// `open_polys_with` opens.
+    /// instead of walking a host tree. Evaluations still come from the host LDE
+    /// columns via `gather`. `proof` is for leaf position `challenge * 2`,
+    /// `proof_sym` for `challenge * 2 + 1`, the same positions `open_polys_with`
+    /// opens.
     #[cfg(feature = "cuda")]
     fn open_polys_with_proofs<C, G>(
         domain: &Domain<Field>,
@@ -1737,18 +1729,14 @@ pub trait IsStarkProver<
         let num_precomputed_cols = main_commit.num_precomputed_cols;
         let total_cols = lde_trace.num_main_cols();
 
-        // R4 main-trace proofs from the resident device tree, when present:
-        // gathered in one batch over all query positions (`c*2`, `c*2+1` per
-        // query) instead of walking the host tree. Byte-identical to the host
-        // proofs (guarded by the `merkle_gather` parity test). Only the
-        // non-preprocessed main carries a device tree today; on any miss this is
-        // `None` and openings fall back to the host tree below.
-        // `*_dev_proofs` is `Some` exactly when the corresponding tree is
-        // device-resident (so the host tree is a root-only placeholder). In that
-        // case the gather MUST succeed — there is no host tree to fall back to,
-        // so a gather error is a hard abort (not a silent walk of an empty
-        // tree). When the tree is *not* device-resident the value is `None` and
-        // the openings below walk the full host tree as usual.
+        // R4 trace proofs from the resident device trees, gathered in one batch
+        // over all query positions instead of walking the host trees (byte
+        // identical to the host proofs, guarded by the `merkle_gather` test).
+        // `*_dev_proofs` is `Some` exactly when the tree is device resident (so
+        // the host tree is a root only placeholder). In that case the gather
+        // must succeed: there is no host tree to fall back to, so a gather error
+        // is a hard abort. When the tree is not device resident the value is
+        // `None` and the openings below walk the full host tree.
         #[cfg(feature = "cuda")]
         let main_dev_proofs: Option<Vec<Proof<Commitment>>> = if is_preprocessed {
             None
@@ -1770,7 +1758,7 @@ pub trait IsStarkProver<
                 })
         };
 
-        // Same for the aux-trace tree, when it is device-resident.
+        // Same for the aux trace tree, when it is device resident.
         #[cfg(feature = "cuda")]
         let aux_dev_proofs: Option<Vec<Proof<Commitment>>> = round_1_result
             .aux
@@ -1788,7 +1776,7 @@ pub trait IsStarkProver<
                     .expect("device aux-tree gather failed; resident tree has no host fallback")
             });
 
-        // Composition tree: openings open a single position `index` (row-pair
+        // Composition tree: openings open a single position `index` (row pair
         // leaf), so gather one proof per query challenge from the device tree.
         #[cfg(feature = "cuda")]
         let comp_dev_proofs: Option<Vec<Proof<Commitment>>> =
@@ -2002,11 +1990,11 @@ pub trait IsStarkProver<
 
         let k = table_parallelism().min(num_airs).max(1);
 
-        // VRAM-budgeted admission. The budget caps the summed device working-set
+        // VRAM budgeted admission. The budget caps the summed device working set
         // of the tables proved concurrently so large blocks don't exhaust VRAM.
-        // It is an *additional* ceiling on top of `k` (it never raises
-        // concurrency): on non-cuda builds, or when the budget can't be queried,
-        // it is `u64::MAX` and chunking falls back to fixed size `k`.
+        // It is an extra ceiling on top of `k` (it never raises concurrency). On
+        // non-cuda builds, or when the budget can't be queried, it is `u64::MAX`
+        // and chunking falls back to fixed size `k`.
         #[cfg(feature = "cuda")]
         let vram_budget = math_cuda::device::backend()
             .map(|b| b.vram_budget_bytes())
@@ -2014,8 +2002,8 @@ pub trait IsStarkProver<
         #[cfg(not(feature = "cuda"))]
         let vram_budget = u64::MAX;
 
-        // R1 main commit: only the main LDE (+ its Merkle scratch) is resident,
-        // so the aux columns contribute nothing to this phase's working-set.
+        // R1 main commit: only the main LDE and its Merkle scratch are resident,
+        // so the aux columns add nothing to this phase's working set.
         let main_chunks = {
             let estimates: Vec<u64> = air_trace_pairs
                 .iter()
@@ -2231,10 +2219,10 @@ pub trait IsStarkProver<
         #[allow(clippy::type_complexity)]
         let mut aux_results: Vec<AuxResult<FieldExtension>> = Vec::with_capacity(num_airs);
 
-        // R1 aux commit and rounds 2–4 share the peak working-set: the main and
-        // aux LDEs are co-resident, plus the composition / Merkle transients
-        // (folded into the scratch factor). `num_aux_columns` is now populated
-        // by the aux build above, so this estimate is accurate for both phases.
+        // R1 aux commit and rounds 2 to 4 share the peak working set: the main
+        // and aux LDEs are co-resident, plus the composition and Merkle
+        // transients (in the scratch factor). `num_aux_columns` is populated by
+        // the aux build above, so this estimate is accurate for both phases.
         let peak_chunks = {
             let estimates: Vec<u64> = air_trace_pairs
                 .iter()
diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs
index c7a9a941a..292f5be2e 100644
--- a/crypto/stark/src/trace.rs
+++ b/crypto/stark/src/trace.rs
@@ -195,58 +195,31 @@ where
     pub(crate) aux_columns: Vec<Vec<FieldElement<E>>>,
     pub(crate) lde_step_size: usize,
     pub(crate) blowup_factor: usize,
-    /// Per-table GPU residency session: owns the device-resident trace LDE
-    /// buffers (main + aux) and tracks, per buffer, whether a host mirror is
-    /// live. Created on the CPU path too (all buffers `None`, mirrors present);
-    /// populated when the R1 fused GPU pipeline runs. Threaded R1→R4 because
-    /// `LDETraceTable` is borrowed through every round.
+    /// Per table GPU residency session: owns this table's device LDE buffers
+    /// and bound stream. Threaded R1 to R4. Empty on the CPU path.
     #[cfg(feature = "cuda")]
     pub(crate) gpu_session: GpuTableSession,
 }
 
-/// Per-table GPU residency session.
+/// Per table GPU residency session.
 ///
-/// Owns the device-resident buffers for a single trace table and tracks, per
-/// buffer, whether a host mirror is currently materialised. Today every buffer
-/// produced on the GPU is *also* copied to host columns, because the remaining
-/// CPU consumers (R2 constraint evaluation, R3 barycentric OOD, R4 query
-/// openings) read from host. The `*_host_mirror` flags are the seam those later
-/// steps flip: once a consumer reads from the device buffer instead, the
-/// corresponding mirror is dropped and the H2D/D2H copy elided.
-///
-/// Scope: this owns the main/aux trace LDE (resident R1→R4), the composition
-/// parts (`Round2`, R2→R4), and the per-table bound stream. The R4-local
-/// `inv_denoms`/FRI state are created and consumed within R4 and stay local.
+/// Owns the device buffers for one trace table: the main and aux trace LDE
+/// (resident R1 to R4), the composition parts LDE (R2 to R4), and a bound
+/// stream. The R4 local inv_denoms and FRI state stay local to R4.
 #[cfg(feature = "cuda")]
 pub(crate) struct GpuTableSession {
-    /// Main-trace LDE, resident from the R1 fused pipeline through R4. `None`
-    /// when the GPU LDE didn't run for this table (below the size threshold or
-    /// any CPU fallback: preprocessed main, non-Goldilocks, or GPU error).
+    /// Main trace LDE, resident from the R1 fused pipeline through R4. None
+    /// when the GPU LDE did not run (below threshold, preprocessed main, not
+    /// Goldilocks, or a GPU error).
     main_lde: Option<math_cuda::lde::GpuLdeBase>,
-    /// Aux-trace LDE (ext3 de-interleaved layout on device), resident R1→R4.
+    /// Aux trace LDE (ext3, deinterleaved on device), resident R1 to R4.
     aux_lde: Option<math_cuda::lde::GpuLdeExt3>,
-    /// Composition-poly parts LDE (ext3 de-interleaved on device), produced in
-    /// R2 and resident R2→R4 so R4 DEEP reads the parts on-device instead of a
-    /// `num_parts * 3 * lde_size * 8` byte H2D. `None` when the R2 GPU path
-    /// didn't run (number_of_parts <= 2, below threshold, or CPU fallback).
+    /// Composition parts LDE (ext3, deinterleaved on device), produced in R2
+    /// and resident R2 to R4 so R4 DEEP reads them on device. None when the R2
+    /// GPU path did not run.
     composition_parts: Option<math_cuda::lde::GpuLdeExt3>,
-    /// Whether the main-trace host columns currently mirror `main_lde`.
-    /// Always `true` today; CPU consumers depend on it.
-    main_host_mirror: bool,
-    /// Whether the aux-trace host columns currently mirror `aux_lde`.
-    /// Always `true` today; CPU consumers depend on it.
-    aux_host_mirror: bool,
-    /// Whether the host composition-parts evaluations (`Round2`) mirror
-    /// `composition_parts`. Always `true` today; R4 openings read host.
-    composition_host_mirror: bool,
     /// Stream bound to this table's GPU work, acquired lazily from the backend
-    /// pool on first use and cached for the session's lifetime. The R3/R4
-    /// device-resident chain (inv_denoms → barycentric/OOD → DEEP) runs on it
-    /// today; the heavy LDE/Merkle ops join once they thread a stream. Binding
-    /// one stream per table serialises a table's kernels on a single queue and
-    /// gives distinct tables distinct streams — the prerequisite for cross-table
-    /// overlap once the host mirrors drop (Steps 4–5). `None` is cached if the
-    /// backend is unavailable, so callers fall back to the CPU path.
+    /// pool and cached. None is cached when the backend is unavailable.
     stream: OnceLock<Option<Arc<math_cuda::CudaStream>>>,
 }
 
@@ -257,12 +230,6 @@ impl GpuTableSession {
             main_lde: None,
             aux_lde: None,
             composition_parts: None,
-            // Host columns are always materialised today; the CPU consumers
-            // (constraint eval, OOD, openings) read them. Steps 5/6 flip these
-            // off as each consumer moves to reading the device buffer.
-            main_host_mirror: true,
-            aux_host_mirror: true,
-            composition_host_mirror: true,
             stream: OnceLock::new(),
         }
     }
@@ -293,9 +260,8 @@ where
         }
     }
 
-    /// Attach an already-populated device LDE handle for the main columns.
-    /// Only set when the GPU fused pipeline produced the LDE. Callers that
-    /// ran the CPU path should leave this alone.
+    /// Attach the device LDE handle for the main columns, produced by the GPU
+    /// fused pipeline. Leave unset on the CPU path.
     #[cfg(feature = "cuda")]
     pub fn set_gpu_main(&mut self, h: math_cuda::lde::GpuLdeBase) {
         self.gpu_session.main_lde = Some(h);
@@ -317,23 +283,8 @@ where
         self.gpu_session.aux_lde.as_ref()
     }
 
-    /// Whether the main-trace host columns currently mirror the device LDE.
-    /// Always `true` today (CPU consumers read host); the source-agnostic seam
-    /// steps 5/6 flip when the main consumer reads the device buffer instead.
-    #[cfg(feature = "cuda")]
-    pub fn main_host_mirror(&self) -> bool {
-        self.gpu_session.main_host_mirror
-    }
-
-    /// Whether the aux-trace host columns currently mirror the device LDE.
-    /// Always `true` today; see [`Self::main_host_mirror`].
-    #[cfg(feature = "cuda")]
-    pub fn aux_host_mirror(&self) -> bool {
-        self.gpu_session.aux_host_mirror
-    }
-
-    /// Attach the device-resident composition-poly parts LDE produced in R2.
-    /// Read by R4 DEEP so the parts aren't re-uploaded H2D.
+    /// Attach the composition parts LDE produced in R2. Read by R4 DEEP so the
+    /// parts are not re-uploaded.
     #[cfg(feature = "cuda")]
     pub fn set_gpu_composition_parts(&mut self, h: math_cuda::lde::GpuLdeExt3) {
         self.gpu_session.composition_parts = Some(h);
@@ -344,17 +295,9 @@ where
         self.gpu_session.composition_parts.as_ref()
     }
 
-    /// Whether the host composition-parts evaluations mirror the device buffer.
-    /// Always `true` today; see [`Self::main_host_mirror`].
-    #[cfg(feature = "cuda")]
-    pub fn composition_host_mirror(&self) -> bool {
-        self.gpu_session.composition_host_mirror
-    }
-
     /// The stream bound to this table's GPU work. Acquired lazily from the
-    /// backend pool on first call and cached for the session's lifetime, so all
-    /// of a table's stream-threaded ops share one queue. Returns `None` (cached)
-    /// when the backend is unavailable; callers then fall back to the CPU path.
+    /// backend pool on first call and cached, so all of a table's stream ops
+    /// share one queue. Returns None (cached) when the backend is unavailable.
     #[cfg(feature = "cuda")]
     pub fn bound_stream(&self) -> Option<Arc<math_cuda::CudaStream>> {
         self.gpu_session

From b5fbdef6db80e783afdfe51cb202545eea940fe8 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 30 Jun 2026 12:24:21 -0300
Subject: [PATCH 13/17] fix

---
 crypto/math-cuda/tests/merkle_root_parity.rs | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/crypto/math-cuda/tests/merkle_root_parity.rs b/crypto/math-cuda/tests/merkle_root_parity.rs
index 0cbe016b6..b4f96bc4a 100644
--- a/crypto/math-cuda/tests/merkle_root_parity.rs
+++ b/crypto/math-cuda/tests/merkle_root_parity.rs
@@ -295,7 +295,7 @@ fn new_row_major_pipeline_base_root_matches_cpu() {
                 let fwd_tw =
                     TwoHalfTwiddles::<GoldilocksField>::new(log_lde, false).expect("fwd twiddles");
 
-                let (nodes, _handle, _lde) =
+                let (handle, _lde) =
                     math_cuda::lde::coset_lde_row_major_with_merkle_tree_keep(
                         &row_major,
                         n,
@@ -304,8 +304,7 @@ fn new_row_major_pipeline_base_root_matches_cpu() {
                         &weights_u64,
                     )
                     .expect("new row-major GPU pipeline");
-                let mut gpu_root = [0u8; 32];
-                gpu_root.copy_from_slice(&nodes[0..32]);
+                let gpu_root = handle.tree.as_ref().expect("resident merkle tree").root;
 
                 let cpu_root = cpu_row_major_merkle_root(
                     &(0..num_cols)
@@ -359,7 +358,7 @@ fn new_row_major_pipeline_ext3_root_matches_cpu() {
                 let fwd_tw =
                     TwoHalfTwiddles::<GoldilocksField>::new(log_lde, false).expect("fwd twiddles");
 
-                let (nodes, _handle, _lde) =
+                let (handle, _lde) =
                     math_cuda::lde::coset_lde_ext3_row_major_with_merkle_tree_keep(
                         &row_major,
                         n,
@@ -368,8 +367,7 @@ fn new_row_major_pipeline_ext3_root_matches_cpu() {
                         &weights_u64,
                     )
                     .expect("new ext3 row-major GPU pipeline");
-                let mut gpu_root = [0u8; 32];
-                gpu_root.copy_from_slice(&nodes[0..32]);
+                let gpu_root = handle.tree.as_ref().expect("resident merkle tree").root;
 
                 let cpu_root =
                     cpu_ext3_row_major_merkle_root(&columns, blowup, &weights_fp, &inv_tw, &fwd_tw);

From b37b6497d3f52c07d5ee3ba965c13a7cce0416c6 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 30 Jun 2026 13:23:30 -0300
Subject: [PATCH 14/17] fix

---
 crypto/math-cuda/src/merkle.rs          | 22 +++++++------------
 crypto/math-cuda/tests/keccak_leaves.rs | 10 +++++++--
 crypto/stark/src/instruments.rs         | 24 ++++++++++-----------
 crypto/stark/src/prover.rs              | 28 +++++++++----------------
 4 files changed, 37 insertions(+), 47 deletions(-)

diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index cf6823c24..3d9cd260c 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -455,15 +455,8 @@ fn build_comp_poly_tree_nodes_dev(
     Ok((nodes_dev, num_leaves, stream))
 }
 
-pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Result<Vec<u8>> {
-    let (nodes_dev, _num_leaves, stream) = build_comp_poly_tree_nodes_dev(parts_interleaved)?;
-    let out = stream.clone_dtoh(&nodes_dev)?;
-    stream.synchronize()?;
-    Ok(out)
-}
-
-/// Like [`build_comp_poly_tree_from_evals_ext3`] but keeps the tree nodes on
-/// device (returned as a [`crate::lde::GpuMerkleTree`] with its root), so R4
+/// Build the comp poly Merkle tree on device and keep the nodes resident
+/// (returned as a [`crate::lde::GpuMerkleTree`] with its root), so R4
 /// composition openings gather paths on device instead of copying the whole
 /// tree to host. `leaves_len = lde_size / 2` (row pair leaves).
 pub fn build_comp_poly_tree_from_evals_ext3_keep(
@@ -480,11 +473,12 @@ pub fn build_comp_poly_tree_from_evals_ext3_keep(
     })
 }
 
-/// Build a FRI-layer Merkle tree on device from an interleaved ext3 eval
-/// vector. Each leaf hashes two consecutive ext3 values. `num_leaves =
-/// evals.len() / 6` (since each ext3 is 3 u64s).
-///
-/// Returns the `(2*num_leaves - 1) * 32`-byte node buffer in standard layout.
+/// Test-only parity harness: build a FRI layer Merkle tree on device from an
+/// interleaved ext3 eval vector and return the full host node buffer so tests
+/// can compare it byte for byte against the CPU. Production folds and commits
+/// via [`crate::fri::FriLayer::fold_and_commit_layer`]. Each leaf hashes two
+/// consecutive ext3 values; `num_leaves = evals.len() / 6`. Returns the
+/// `(2*num_leaves - 1) * 32`-byte node buffer in standard layout.
 pub fn build_fri_layer_tree_from_evals_ext3(evals: &[u64]) -> Result<Vec<u8>> {
     assert!(
         evals.len().is_multiple_of(6),
diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs
index 61a861f32..eca6a5c0b 100644
--- a/crypto/math-cuda/tests/keccak_leaves.rs
+++ b/crypto/math-cuda/tests/keccak_leaves.rs
@@ -217,8 +217,14 @@ fn keccak_comp_poly_leaves_matches_cpu() {
             let parts_slices: Vec<&[u64]> =
                 parts_interleaved.iter().map(|v| v.as_slice()).collect();
 
-            let nodes =
-                math_cuda::merkle::build_comp_poly_tree_from_evals_ext3(&parts_slices).unwrap();
+            // Exercise the production keep path, then read the resident nodes
+            // back to host to check the leaf bytes.
+            let tree =
+                math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&parts_slices)
+                    .unwrap();
+            let be = math_cuda::device::backend().unwrap();
+            let stream = be.next_stream();
+            let nodes: Vec<u8> = stream.clone_dtoh(&*tree.nodes).unwrap();
             let num_leaves = lde_size / 2;
             let leaves_offset = (num_leaves - 1) * 32;
             for i in 0..num_leaves {
diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs
index 20cde52fa..f263558aa 100644
--- a/crypto/stark/src/instruments.rs
+++ b/crypto/stark/src/instruments.rs
@@ -4,22 +4,18 @@ use std::sync::OnceLock;
 use std::sync::atomic::{AtomicU64, Ordering};
 use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
-// =========================================================================
-// Wall-clock span timeline (the trustworthy per-step measurement)
-// =========================================================================
+// Wall clock span timeline: the trustworthy per step latency breakdown.
 //
-// Nested wall clock spans opened and closed on the driving (main) thread at
-// phase boundaries. Unlike the `accum_*` thread local sub timers below (which
-// sum per worker CPU time across rayon threads and over count, so percentages
-// exceed 100%), these spans do not overlap and sum to their parent, so the tree
-// is a true latency breakdown. Parallel regions are measured as a single span
-// around the blocking call (that is their latency); their internal split is
-// reported separately as CPU time, never mixed into the wall tree.
+// Spans open and close on the main thread at phase boundaries. They do not
+// overlap and sum to their parent, so the tree is a true latency breakdown
+// (unlike the accum_* thread local sub timers below, which sum per worker CPU
+// time across rayon threads and can exceed 100%). A parallel region is one span
+// around the blocking call; its internal split is reported separately as CPU
+// time, never mixed into the wall tree.
 //
 //     let _s = instruments::span("trace_build");   // RAII, stops on drop
 //
-// `Instant::now()` is about 20 ns, fine at phase granularity; never put it
-// inside per op loops.
+// Instant::now() is about 20 ns, fine at phase granularity, not in per op loops.
 
 #[derive(Clone, Debug)]
 pub struct SpanRecord {
@@ -138,9 +134,11 @@ pub fn timeline_json(spans: &[SpanRecord]) -> String {
         if i > 0 {
             out.push(',');
         }
+        // Escape the label so a quote or backslash cannot break the JSON.
+        let label = s.label.replace('\\', "\\\\").replace('"', "\\\"");
         out.push_str(&format!(
             "{{\"label\":\"{}\",\"depth\":{},\"wall_ns\":{},\"order\":{},\"start_ns\":{}}}",
-            s.label,
+            label,
             s.depth,
             s.wall.as_nanos(),
             s.order,
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index f89a3fb24..8c04b7d1d 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -424,14 +424,10 @@ pub fn table_parallelism() -> usize {
     }
 }
 
-/// Heuristic peak device working set for one table, in bytes. Two parts: the
-/// LDE columns co-resident on the GPU (main base field at 8 B, aux ext3 at
-/// 24 B) times a scratch factor for the NTT and leaf hash transients, plus the
-/// resident Merkle trees (main, aux, composition, FRI layers) kept on device R1
-/// to R4. Each full tree is about `2*lde_size` nodes of 32 B (`64*lde_size`);
-/// together at the R4 peak about 256 B per `lde_size` covers them. A deliberate
-/// over estimate that gates a safety ceiling, not a precise allocator. Pass
-/// `aux_cols == 0` where the aux LDE is not yet resident (R1 main commit).
+/// Heuristic peak device bytes for one table: co-resident LDE columns plus the
+/// resident Merkle trees, with a scratch factor for NTT and leaf transients. A
+/// deliberate over estimate for a safety ceiling, not a precise allocator. Pass
+/// aux_cols == 0 when the aux LDE is not yet resident (R1 main commit).
 fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) -> u64 {
     const BYTES_PER_BASE: u64 = 8;
     const EXT3_BYTES: u64 = 24;
@@ -445,16 +441,12 @@ fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize)
     lde_term.saturating_add(tree_term)
 }
 
-/// Plan contiguous table chunks for parallel proving.
-///
-/// A chunk grows until it reaches `k` tables (the core and RAM bound limit) or
-/// its summed VRAM estimate would exceed `budget`, whichever comes first. A
-/// single table larger than `budget` forms its own chunk (it runs solo rather
-/// than being excluded). With `budget == u64::MAX` the VRAM constraint is never
-/// binding for any realistic estimate, so chunks fall back to fixed size `k`,
-/// the same as the previous `step_by(k)` scheme. So on non-cuda builds and when
-/// VRAM is not binding, scheduling (and therefore the proof) is unchanged.
-/// Returns `(start, end)` half open ranges covering `0..estimates.len()` in order.
+/// Plan contiguous table chunks for parallel proving. A chunk grows until it
+/// hits `k` tables or its summed VRAM estimate would exceed `budget`; a single
+/// table larger than `budget` runs solo. With `budget == u64::MAX` (non-cuda,
+/// or VRAM not binding) chunks fall back to fixed size `k`, identical to the
+/// old `step_by(k)`, so scheduling and the proof are unchanged. Returns
+/// `(start, end)` half open ranges covering `0..estimates.len()` in order.
 fn plan_table_chunks(estimates: &[u64], k: usize, budget: u64) -> Vec<(usize, usize)> {
     let n = estimates.len();
     let k = k.max(1);

From 873491ecc83577a576505783963f1969c5c88f5e Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 30 Jun 2026 13:46:08 -0300
Subject: [PATCH 15/17] fix clippy

---
 crypto/math-cuda/src/fri.rs                  |  3 +-
 crypto/math-cuda/tests/keccak_leaves.rs      |  5 ++-
 crypto/math-cuda/tests/merkle_root_parity.rs | 17 +++++-----
 crypto/stark/src/gpu_lde.rs                  | 12 +++++--
 crypto/stark/src/prover.rs                   | 35 +++++++++-----------
 5 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/crypto/math-cuda/src/fri.rs b/crypto/math-cuda/src/fri.rs
index 8fe6fcdb1..a2f96c07a 100644
--- a/crypto/math-cuda/src/fri.rs
+++ b/crypto/math-cuda/src/fri.rs
@@ -217,7 +217,8 @@ impl FriCommitState {
         // Keep the layer tree resident on device; copy only the 32-byte root so
         // R4 query openings gather paths on device instead of copying the tree.
         let mut root = [0u8; 32];
-        self.stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?;
+        self.stream
+            .memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?;
         self.stream.synchronize()?;
 
         self.a_is_input = !self.a_is_input;
diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs
index eca6a5c0b..087ccde14 100644
--- a/crypto/math-cuda/tests/keccak_leaves.rs
+++ b/crypto/math-cuda/tests/keccak_leaves.rs
@@ -219,9 +219,8 @@ fn keccak_comp_poly_leaves_matches_cpu() {
 
             // Exercise the production keep path, then read the resident nodes
             // back to host to check the leaf bytes.
-            let tree =
-                math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&parts_slices)
-                    .unwrap();
+            let tree = math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&parts_slices)
+                .unwrap();
             let be = math_cuda::device::backend().unwrap();
             let stream = be.next_stream();
             let nodes: Vec<u8> = stream.clone_dtoh(&*tree.nodes).unwrap();
diff --git a/crypto/math-cuda/tests/merkle_root_parity.rs b/crypto/math-cuda/tests/merkle_root_parity.rs
index 573747a65..fcc9d226e 100644
--- a/crypto/math-cuda/tests/merkle_root_parity.rs
+++ b/crypto/math-cuda/tests/merkle_root_parity.rs
@@ -299,15 +299,14 @@ fn new_row_major_pipeline_base_root_matches_cpu() {
                 let fwd_tw =
                     TwoHalfTwiddles::<GoldilocksField>::new(log_lde, false).expect("fwd twiddles");
 
-                let (handle, _lde) =
-                    math_cuda::lde::coset_lde_row_major_with_merkle_tree_keep(
-                        &row_major,
-                        n,
-                        num_cols,
-                        blowup,
-                        &weights_u64,
-                    )
-                    .expect("new row-major GPU pipeline");
+                let (handle, _lde) = math_cuda::lde::coset_lde_row_major_with_merkle_tree_keep(
+                    &row_major,
+                    n,
+                    num_cols,
+                    blowup,
+                    &weights_u64,
+                )
+                .expect("new row-major GPU pipeline");
                 let gpu_root = handle.tree.as_ref().expect("resident merkle tree").root;
 
                 let cpu_root = cpu_row_major_merkle_root(
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 8da2b707a..29285d34f 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -485,7 +485,11 @@ where
     // Transmute Vec<u64> → Vec<FieldElement<E>> (zero-copy, E == GoldilocksField).
     let lde_out: Vec<FieldElement<E>> = unsafe {
         let mut v = std::mem::ManuallyDrop::new(lde_u64);
-        Vec::from_raw_parts(v.as_mut_ptr() as *mut FieldElement<E>, v.len(), v.capacity())
+        Vec::from_raw_parts(
+            v.as_mut_ptr() as *mut FieldElement<E>,
+            v.len(),
+            v.capacity(),
+        )
     };
 
     // Root-only host tree: the device tree (`handle.tree`) holds the nodes and
@@ -554,7 +558,11 @@ where
             v.len() % 3 == 0 && v.capacity() % 3 == 0,
             "lde_u64 len/capacity must be a multiple of 3 for Fp3 reinterpret"
         );
-        Vec::from_raw_parts(v.as_mut_ptr() as *mut FieldElement<E>, v.len() / 3, v.capacity() / 3)
+        Vec::from_raw_parts(
+            v.as_mut_ptr() as *mut FieldElement<E>,
+            v.len() / 3,
+            v.capacity() / 3,
+        )
     };
 
     // Root-only host tree: the device tree (`handle.tree`) holds the nodes and
diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs
index 8c04b7d1d..cdf1cd1b2 100644
--- a/crypto/stark/src/prover.rs
+++ b/crypto/stark/src/prover.rs
@@ -36,8 +36,6 @@ use crate::trace::LDETraceTable;
 
 use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment};
 use super::constraints::evaluator::ConstraintEvaluator;
-#[cfg(feature = "cuda")]
-use crypto::merkle_tree::proof::Proof;
 use super::domain::{Domain, DomainConstants};
 use super::fri::fri_decommit::FriDecommitment;
 use super::grinding;
@@ -45,6 +43,8 @@ use super::lookup::BusPublicInputs;
 use super::proof::stark::{DeepPolynomialOpening, MultiProof, StarkProof};
 use super::trace::TraceTable;
 use super::traits::AIR;
+#[cfg(feature = "cuda")]
+use crypto::merkle_tree::proof::Proof;
 
 pub use crate::commitment::{keccak_leaves_bit_reversed, keccak_leaves_row_pair_bit_reversed};
 
@@ -1875,8 +1875,9 @@ pub trait IsStarkProver<
                 let stream = lde_trace
                     .bound_stream()
                     .expect("bound stream for device-resident composition-tree opening");
-                crate::gpu_lde::gather_proofs_dev(tree, indexes_to_open, &stream)
-                    .expect("device composition-tree gather failed; resident tree has no host fallback")
+                crate::gpu_lde::gather_proofs_dev(tree, indexes_to_open, &stream).expect(
+                    "device composition-tree gather failed; resident tree has no host fallback",
+                )
             });
 
         for (qi, index) in indexes_to_open.iter().enumerate() {
@@ -1892,12 +1893,9 @@ pub trait IsStarkProver<
                 #[cfg(feature = "cuda")]
                 {
                     if let Some(proofs) = &main_dev_proofs {
-                        Self::open_polys_with_proofs(
-                            domain,
-                            proofs[qi].clone(),
-                            *index,
-                            |row| lde_trace.gather_main_row(row),
-                        )
+                        Self::open_polys_with_proofs(domain, proofs[qi].clone(), *index, |row| {
+                            lde_trace.gather_main_row(row)
+                        })
                     } else {
                         Self::open_polys_with(domain, &main_commit.tree, *index, |row| {
                             lde_trace.gather_main_row(row)
@@ -1950,12 +1948,9 @@ pub trait IsStarkProver<
                 #[cfg(feature = "cuda")]
                 {
                     if let Some(proofs) = &aux_dev_proofs {
-                        Self::open_polys_with_proofs(
-                            domain,
-                            proofs[qi].clone(),
-                            *index,
-                            |row| lde_trace.gather_aux_row(row),
-                        )
+                        Self::open_polys_with_proofs(domain, proofs[qi].clone(), *index, |row| {
+                            lde_trace.gather_aux_row(row)
+                        })
                     } else {
                         Self::open_polys_with(domain, &aux.tree, *index, |row| {
                             lde_trace.gather_aux_row(row)
@@ -2098,8 +2093,8 @@ pub trait IsStarkProver<
                 .iter()
                 .enumerate()
                 .map(|(idx, (_, trace, _))| {
-                    let lde_size = domains[idx].interpolation_domain_size
-                        * domains[idx].blowup_factor;
+                    let lde_size =
+                        domains[idx].interpolation_domain_size * domains[idx].blowup_factor;
                     estimate_table_vram_bytes(trace.num_main_columns, 0, lde_size)
                 })
                 .collect();
@@ -2303,8 +2298,8 @@ pub trait IsStarkProver<
                 .iter()
                 .enumerate()
                 .map(|(idx, (_, trace, _))| {
-                    let lde_size = domains[idx].interpolation_domain_size
-                        * domains[idx].blowup_factor;
+                    let lde_size =
+                        domains[idx].interpolation_domain_size * domains[idx].blowup_factor;
                     estimate_table_vram_bytes(
                         trace.num_main_columns,
                         trace.num_aux_columns,

From e4ada642938788d29cfcd998ba8b42146fc6e6a4 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 30 Jun 2026 15:02:26 -0300
Subject: [PATCH 16/17] fix comments

---
 crypto/crypto/src/merkle_tree/merkle.rs |  6 +++---
 crypto/math-cuda/src/merkle.rs          | 13 ++++---------
 crypto/stark/src/gpu_lde.rs             | 19 +++++++++++++++++--
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs
index a32f12978..e9e9469fd 100644
--- a/crypto/crypto/src/merkle_tree/merkle.rs
+++ b/crypto/crypto/src/merkle_tree/merkle.rs
@@ -255,9 +255,9 @@ where
     /// Merkle tree
     pub fn get_proof_by_pos(&self, pos: usize) -> Option<Proof<B::Node>> {
         // A root only tree (from `from_root`) has no nodes to walk. Callers must
-        // gather paths from the device resident copy instead. Catch the misuse
-        // early in debug builds rather than returning a misleading None.
-        debug_assert!(
+        // gather paths from the device resident copy instead. Fail loudly in all
+        // builds rather than returning a misleading empty path.
+        assert!(
             !self.nodes.is_empty(),
             "get_proof_by_pos called on a root-only MerkleTree (no nodes)"
         );
diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs
index 3d9cd260c..fb1125ea4 100644
--- a/crypto/math-cuda/src/merkle.rs
+++ b/crypto/math-cuda/src/merkle.rs
@@ -378,16 +378,11 @@ pub fn gather_merkle_paths_dev(
     Ok(host)
 }
 
-/// Row-pair Keccak leaf + Merkle tree build for R2 composition-polynomial
-/// commit. `parts_interleaved` is `num_parts` slices, each holding an ext3
-/// LDE column interleaved as `[a0,a1,a2, b0,b1,b2, ...]` of length `3*lde_size`.
-///
-/// Returns `(2*(lde_size/2) - 1) * 32` bytes of tree nodes in the standard
-/// layout (root at byte offset 0, leaves in the tail).
-/// Build the composition Merkle tree on device. Leaves hash row pairs, so
+/// Build the composition Merkle tree on device. `parts_interleaved` is
+/// `num_parts` slices, each an ext3 LDE column interleaved as
+/// `[a0,a1,a2, b0,b1,b2, ...]` of length `3*lde_size`. Leaves hash row pairs, so
 /// `num_leaves = lde_size / 2`. Returns the device node buffer, the leaf count,
-/// and the stream it was built on. Shared by the host copy and device keep
-/// wrappers below.
+/// and the stream it was built on. Used by the device keep wrapper below.
 fn build_comp_poly_tree_nodes_dev(
     parts_interleaved: &[&[u64]],
 ) -> Result<(CudaSlice<u8>, usize, Arc<CudaStream>)> {
diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs
index 29285d34f..3f1d81846 100644
--- a/crypto/stark/src/gpu_lde.rs
+++ b/crypto/stark/src/gpu_lde.rs
@@ -1446,6 +1446,12 @@ pub(crate) fn gather_proofs_dev(
     if positions.is_empty() {
         return Some(Vec::new());
     }
+    // Positions index an LDE that `assert_u32_domain` keeps within u32; guard the
+    // cast so any future relaxation fails loudly instead of wrapping silently.
+    debug_assert!(
+        positions.iter().all(|&p| p <= u32::MAX as usize),
+        "gather_proofs_dev: position exceeds u32 range"
+    );
     let positions_u32: Vec<u32> = positions.iter().map(|&p| p as u32).collect();
     let bytes = math_cuda::merkle::gather_merkle_paths_dev(
         &tree.nodes,
@@ -1698,8 +1704,17 @@ where
     // The GPU FRI commit sets `gpu_tree` on every layer as a group; the CPU
     // commit sets none. Host trees fall back to the host walk. When the layers
     // are device resident the host trees are root only, so the gather below must
-    // succeed (a failure is a hard abort, not a silent walk).
-    if fri_layers[0].gpu_tree.is_none() {
+    // succeed (a failure is a hard abort, not a silent walk). The residency is
+    // all or nothing; assert it so a future partial-build can never route a
+    // root-only layer through the host walk and ship empty proofs.
+    let first_resident = fri_layers[0].gpu_tree.is_some();
+    debug_assert!(
+        fri_layers
+            .iter()
+            .all(|l| l.gpu_tree.is_some() == first_resident),
+        "FRI layer residency must be all or nothing"
+    );
+    if !first_resident {
         return None;
     }
     let stream = math_cuda::device::backend()

From 4ca240ce304e7a3f42ef8388449d5f5f0fb4aefc Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 30 Jun 2026 16:29:23 -0300
Subject: [PATCH 17/17] fix

---
 crypto/crypto/src/merkle_tree/merkle.rs | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs
index e9e9469fd..d53f06f10 100644
--- a/crypto/crypto/src/merkle_tree/merkle.rs
+++ b/crypto/crypto/src/merkle_tree/merkle.rs
@@ -254,13 +254,6 @@ where
     /// For example, give me an inclusion proof for the 3rd element in the
     /// Merkle tree
     pub fn get_proof_by_pos(&self, pos: usize) -> Option<Proof<B::Node>> {
-        // A root only tree (from `from_root`) has no nodes to walk. Callers must
-        // gather paths from the device resident copy instead. Fail loudly in all
-        // builds rather than returning a misleading empty path.
-        assert!(
-            !self.nodes.is_empty(),
-            "get_proof_by_pos called on a root-only MerkleTree (no nodes)"
-        );
         let pos = pos + self.node_count() / 2;
         let Ok(merkle_path) = self.build_merkle_path(pos) else {
             return None;