From 5f369f08f78ca2271db404f2e35f3580ff1c9314 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 15:08:10 -0300 Subject: [PATCH 01/17] spike --- crypto/math-cuda/src/device.rs | 41 ++++++++ crypto/stark/src/instruments.rs | 149 ++++++++++++++++++++++++++++- crypto/stark/src/lib.rs | 10 ++ crypto/stark/src/prover.rs | 20 ++++ prover/src/lib.rs | 28 ++++++ prover/src/tables/trace_builder.rs | 33 ++++++- 6 files changed, 278 insertions(+), 3 deletions(-) diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 17e2f9f82..4c12000fd 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -174,6 +174,34 @@ pub struct Backend { inv_twiddles: Mutex>>>>, } +/// Raise the device default memory pool's release threshold so freed +/// stream-ordered allocations are kept for reuse instead of returned to the OS +/// at each sync. Best-effort: any failure (e.g. a device/driver without +/// stream-ordered allocator support) leaves the default behaviour untouched. +fn retain_default_mempool(ctx: &CudaContext) { + use cudarc::driver::sys; + // SAFETY: raw CUDA driver calls. `ctx.cu_device()` is a valid device for + // the just-created context; the out-pointers are valid stack slots; the + // threshold is read as a u64 by the driver. Errors are swallowed. + unsafe { + let dev = ctx.cu_device(); + let mut pool: sys::CUmemoryPool = std::ptr::null_mut(); + if sys::cuDeviceGetDefaultMemPool(&mut pool as *mut _, dev) + .result() + .is_err() + { + return; + } + let threshold: u64 = u64::MAX; + let _ = sys::cuMemPoolSetAttribute( + pool, + sys::CUmemPool_attribute_enum::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + &threshold as *const u64 as *mut core::ffi::c_void, + ) + .result(); + } +} + impl Backend { fn init() -> Result { let ctx = CudaContext::new(0)?; @@ -183,6 +211,19 @@ impl Backend { // before returning), so the tracking is pure overhead. Disable it. unsafe { ctx.disable_event_tracking() }; + // Retain freed device memory in the stream-ordered pool for reuse. + // + // cudarc routes `CudaStream::alloc*` through `cuMemAllocAsync`, which + // draws from the device's default memory pool. That pool's release + // threshold defaults to 0, so every freed buffer is handed back to the + // OS at the next sync — meaning the prover's large, repeatedly-shaped + // LDE / FRI buffers are re-malloc'd from scratch each op. Raising the + // threshold to "unbounded" keeps freed blocks resident in the pool so + // subsequent allocations of the same size are satisfied without a real + // driver allocation. Best-effort: on any error (no pool support, sync + // allocator) we silently keep the current behaviour. + retain_default_mempool(&ctx); + let arith = ctx.load_module(Ptx::from_src(ARITH_PTX))?; let ntt = ctx.load_module(Ptx::from_src(NTT_PTX))?; let keccak = ctx.load_module(Ptx::from_src(KECCAK_PTX))?; diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs index 16ff95082..83e14545b 100644 --- a/crypto/stark/src/instruments.rs +++ b/crypto/stark/src/instruments.rs @@ -1,7 +1,154 @@ use std::cell::RefCell; +use std::sync::Mutex; use std::sync::OnceLock; use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::Duration; +use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; + +// ========================================================================= +// Wall-clock span timeline (the trustworthy per-step measurement) +// ========================================================================= +// +// Nested wall-clock spans opened/closed on the driving (main) thread at phase +// boundaries. Unlike the `accum_*` / thread-local sub-timers below — which sum +// per-worker CPU time across rayon threads and over-count (percentages > 100%) — +// these spans are non-overlapping and sum to their parent, so the tree is a true +// latency breakdown. Parallel regions are measured as a single span around the +// blocking call (that IS their latency); their internal split is reported +// separately as CPU-time, never mixed into the wall tree. +// +// let _s = instruments::span("trace_build"); // RAII, stops on drop +// +// `Instant::now()` is ~20 ns — fine at phase granularity; never inside per-op loops. + +#[derive(Clone, Debug)] +pub struct SpanRecord { + pub label: &'static str, + pub depth: u16, + pub wall: Duration, + /// Open-order, so the tree reconstructs in start-order (records push on close). + pub order: u32, + /// Wall-clock epoch (ns) when the span opened — for aligning with external + /// samplers (e.g. nvidia-smi GPU-util) to attribute device-busy time per step. + pub start_ns: u128, +} + +static TIMELINE: Mutex> = Mutex::new(Vec::new()); +static SPAN_ORDER: AtomicU64 = AtomicU64::new(0); + +thread_local! { + static SPAN_DEPTH: std::cell::Cell = const { std::cell::Cell::new(0) }; +} + +#[must_use] +pub struct SpanGuard { + label: &'static str, + depth: u16, + order: u32, + start: Instant, + start_ns: u128, +} + +/// Open a wall-clock span; records elapsed time when the guard drops. +pub fn span(label: &'static str) -> SpanGuard { + let depth = SPAN_DEPTH.with(|d| { + let v = d.get(); + d.set(v + 1); + v + }); + let order = SPAN_ORDER.fetch_add(1, Ordering::Relaxed) as u32; + let start_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + SpanGuard { + label, + depth, + order, + start: Instant::now(), + start_ns, + } +} + +impl Drop for SpanGuard { + fn drop(&mut self) { + let wall = self.start.elapsed(); + SPAN_DEPTH.with(|d| d.set(d.get().saturating_sub(1))); + if let Ok(mut t) = TIMELINE.lock() { + t.push(SpanRecord { + label: self.label, + depth: self.depth, + wall, + order: self.order, + start_ns: self.start_ns, + }); + } + } +} + +/// Clear recorded spans. Call at the start of a measured prove. +pub fn reset_timeline() { + SPAN_ORDER.store(0, Ordering::Relaxed); + SPAN_DEPTH.with(|d| d.set(0)); + if let Ok(mut t) = TIMELINE.lock() { + t.clear(); + } +} + +/// Drain recorded spans, sorted in start-order (ready for the tree). +pub fn take_timeline() -> Vec { + let mut spans = TIMELINE + .lock() + .map(|mut t| std::mem::take(&mut *t)) + .unwrap_or_default(); + spans.sort_by_key(|s| s.order); + spans +} + +/// Indented wall-clock tree with % of the root span. +pub fn format_timeline(spans: &[SpanRecord]) -> String { + use std::fmt::Write; + if spans.is_empty() { + return String::new(); + } + let total_s = spans + .first() + .map(|s| s.wall.as_secs_f64()) + .unwrap_or(1e-9) + .max(1e-9); + let mut out = String::from("=== TIMELINE (wall-clock) ===\n"); + for s in spans { + let indent = " ".repeat(s.depth as usize); + let pct = 100.0 * s.wall.as_secs_f64() / total_s; + let _ = writeln!( + out, + "{:<42} {:>10.3?} {:>6.1}%", + format!("{indent}{}", s.label), + s.wall, + pct + ); + } + out +} + +/// JSON array of `{label, depth, wall_ns, order}` for diffing / plotting. +pub fn timeline_json(spans: &[SpanRecord]) -> String { + let mut out = String::from("["); + for (i, s) in spans.iter().enumerate() { + if i > 0 { + out.push(','); + } + out.push_str(&format!( + "{{\"label\":\"{}\",\"depth\":{},\"wall_ns\":{},\"order\":{},\"start_ns\":{}}}", + s.label, + s.depth, + s.wall.as_nanos(), + s.order, + s.start_ns + )); + } + out.push(']'); + out +} static HEAP_READER: OnceLock Option> = OnceLock::new(); diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs index e9f6a1cda..25ce91569 100644 --- a/crypto/stark/src/lib.rs +++ b/crypto/stark/src/lib.rs @@ -3,6 +3,16 @@ #[cfg(all(target_arch = "wasm32", feature = "disk-spill"))] compile_error!("the `disk-spill` feature requires memmap2, which does not compile on wasm32"); +/// Open a wall-clock profiling span (no-op unless the `instruments` feature is on). +/// RAII: records elapsed wall time when the binding drops at end of scope. +#[macro_export] +macro_rules! prof_span { + ($label:expr) => { + #[cfg(feature = "instruments")] + let _prof_span = $crate::instruments::span($label); + }; +} + #[cfg(feature = "debug-checks")] pub mod bus_debug; pub mod constraints; diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 4da57559c..eec131be7 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -1682,6 +1682,8 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] let phase_start = Instant::now(); + #[cfg(feature = "instruments")] + let __sp = crate::instruments::span("r1_prepass"); // Deduplicate Domain + LdeTwiddles by (trace_length, blowup_factor, coset_offset). // Many tables share the same domain size (e.g., 7+ tables at 2^20). @@ -1739,6 +1741,8 @@ pub trait IsStarkProver< })?; } + #[cfg(feature = "instruments")] + drop(__sp); #[cfg(feature = "instruments")] let prepass_elapsed = phase_start.elapsed(); #[cfg(feature = "instruments")] @@ -1754,6 +1758,8 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] let phase_start = Instant::now(); + #[cfg(feature = "instruments")] + let __sp = crate::instruments::span("r1_main_commit"); let mut main_commits: Vec> = Vec::with_capacity(num_airs); let mut main_ldes: Vec>>> = Vec::with_capacity(num_airs); @@ -1810,6 +1816,8 @@ pub trait IsStarkProver< } } + #[cfg(feature = "instruments")] + drop(__sp); #[cfg(feature = "instruments")] let main_commits_elapsed = phase_start.elapsed(); #[cfg(feature = "instruments")] @@ -1845,6 +1853,8 @@ pub trait IsStarkProver< // but outer parallelism over 12 tables also helps on high-core-count machines. #[cfg(feature = "instruments")] let phase_start = Instant::now(); + #[cfg(feature = "instruments")] + let __sp = crate::instruments::span("r1_aux_build"); #[cfg(feature = "parallel")] let aux_iter = air_trace_pairs.par_iter_mut(); @@ -1877,6 +1887,8 @@ pub trait IsStarkProver< })?; } + #[cfg(feature = "instruments")] + drop(__sp); #[cfg(feature = "instruments")] let aux_build_elapsed = phase_start.elapsed(); #[cfg(feature = "instruments")] @@ -1888,6 +1900,8 @@ pub trait IsStarkProver< // Each table gets its own transcript fork. #[cfg(feature = "instruments")] let phase_start = Instant::now(); + #[cfg(feature = "instruments")] + let __sp = crate::instruments::span("r1_aux_commit"); // Pre-fork all transcripts (cheap, sequential — must match verifier ordering) let mut table_transcripts: Vec<_> = (0..num_airs) @@ -2066,6 +2080,8 @@ pub trait IsStarkProver< }); } + #[cfg(feature = "instruments")] + drop(__sp); #[cfg(feature = "instruments")] let aux_commit_elapsed = phase_start.elapsed(); #[cfg(feature = "instruments")] @@ -2086,6 +2102,8 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] let phase_start = Instant::now(); #[cfg(feature = "instruments")] + let __sp = crate::instruments::span("rounds_2to4"); + #[cfg(feature = "instruments")] let mut table_timings: Vec<( String, usize, @@ -2173,6 +2191,8 @@ pub trait IsStarkProver< } } + #[cfg(feature = "instruments")] + drop(__sp); #[cfg(feature = "instruments")] { // Store timing data for the top-level report in prove_with_options. diff --git a/prover/src/lib.rs b/prover/src/lib.rs index 81233d39f..589145d09 100644 --- a/prover/src/lib.rs +++ b/prover/src/lib.rs @@ -713,11 +713,17 @@ pub fn prove_with_options_and_inputs( #[cfg(feature = "instruments")] let total_start = std::time::Instant::now(); #[cfg(feature = "instruments")] + stark::instruments::reset_timeline(); + #[cfg(feature = "instruments")] + let __root = stark::instruments::span("prove_total"); + #[cfg(feature = "instruments")] let heap_before = stark::instruments::heap_bytes(); // Phase 1: Execute (ELF load + run) #[cfg(feature = "instruments")] let phase_start = std::time::Instant::now(); + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("execute"); let program = Elf::load(elf_bytes).map_err(|e| Error::ElfLoad(format!("{e}")))?; let executor = Executor::new(&program, private_inputs.to_vec()) @@ -726,6 +732,8 @@ pub fn prove_with_options_and_inputs( .run() .map_err(|e| Error::Execution(format!("{e}")))?; + #[cfg(feature = "instruments")] + drop(__sp); #[cfg(feature = "instruments")] let execute_elapsed = phase_start.elapsed(); #[cfg(feature = "instruments")] @@ -734,6 +742,8 @@ pub fn prove_with_options_and_inputs( // Phase 2: Trace build #[cfg(feature = "instruments")] let phase_start = std::time::Instant::now(); + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("trace_build"); #[cfg(feature = "disk-spill")] let storage_mode = { @@ -755,6 +765,8 @@ pub fn prove_with_options_and_inputs( ); drop(result); + #[cfg(feature = "instruments")] + drop(__sp); #[cfg(feature = "instruments")] let trace_build_elapsed = phase_start.elapsed(); #[cfg(feature = "instruments")] @@ -763,6 +775,8 @@ pub fn prove_with_options_and_inputs( // Phase 3: AIR construction #[cfg(feature = "instruments")] let phase_start = std::time::Instant::now(); + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("air_construction"); let table_counts = traces.table_counts(); let airs = VmAirs::new( @@ -775,6 +789,8 @@ pub fn prove_with_options_and_inputs( None, ); + #[cfg(feature = "instruments")] + drop(__sp); #[cfg(feature = "instruments")] let air_elapsed = phase_start.elapsed(); #[cfg(feature = "instruments")] @@ -801,6 +817,8 @@ pub fn prove_with_options_and_inputs( ); // Phase 4: Prove (multi_prove) + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("proving"); let proof = Prover::multi_prove( airs.air_trace_pairs(&mut traces), &mut transcript, @@ -808,6 +826,8 @@ pub fn prove_with_options_and_inputs( storage_mode, ) .map_err(|e| Error::Prover(format!("{e:?}")))?; + #[cfg(feature = "instruments")] + drop(__sp); #[cfg(feature = "instruments")] { @@ -823,6 +843,14 @@ pub fn prove_with_options_and_inputs( after_air: heap_after_air, }, ); + // Accurate wall-clock span tree (the trustworthy per-step breakdown). + drop(__root); + let spans = stark::instruments::take_timeline(); + print!("{}", stark::instruments::format_timeline(&spans)); + if let Ok(path) = std::env::var("LAMBDA_VM_TIMELINE_JSON") { + let _ = std::fs::write(&path, stark::instruments::timeline_json(&spans)); + println!("[timeline] wrote {path}"); + } } Ok(VmProof { diff --git a/prover/src/tables/trace_builder.rs b/prover/src/tables/trace_builder.rs index 02371c1a0..048334356 100644 --- a/prover/src/tables/trace_builder.rs +++ b/prover/src/tables/trace_builder.rs @@ -2750,6 +2750,8 @@ fn build_traces( // ===================================================================== // PHASE 4: All → Bitwise lookups // ===================================================================== + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("p4_bitwise_collect"); bitwise_ops.extend(collect_bitwise_from_lt(<_ops)); // MUL/DVRM dedup their per-unique bit-gated lookups PER CHIP INSTANCE, so pass // the same chunk size used to split them into instances (see chunk_and_generate @@ -2795,10 +2797,14 @@ fn build_traces( .map(|chunk| chunk.len().next_power_of_two().max(4) - chunk.len()) .sum(); bitwise_ops.extend(collect_byte_check_ops_for_padding(num_padding_rows)); + #[cfg(feature = "instruments")] + drop(__sp); // ===================================================================== // PHASE 5: Generate final traces (parallelized) // ===================================================================== + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("p5_generate_tables"); // Extract halt timestamp from the last ECALL instruction let halt_op = cpu_ops @@ -3027,6 +3033,8 @@ fn build_traces( } } + #[cfg(feature = "instruments")] + drop(__sp); Ok(Traces { cpus, bitwise, @@ -3692,17 +3700,27 @@ impl Traces { // Phase 0: ELF → DECODE + instructions // IMPORTANT: Use generate_decode_trace (same as compute_precomputed_commitment) // so the DECODE trace row ordering matches the AIR's hardcoded commitment. + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("p0_decode"); let instructions = decode::instructions_from_elf(elf) .map_err(|e| Error::Execution(format!("Failed to parse instructions: {e}")))?; let (decode_trace, decode_pc_to_row) = decode::generate_decode_trace(&instructions); + #[cfg(feature = "instruments")] + drop(__sp); // Phase 1: Logs → CPU operations + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("p1_cpu_ops"); let cpu_ops = collect_cpu_ops(logs, &instructions)?; + #[cfg(feature = "instruments")] + drop(__sp); // Phase 2: Collect + route all ops let mut memory_state = MemoryState::from_elf(elf); memory_state.add_private_input(private_input); let mut register_state = RegisterState::new(elf.entry_point); + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("p2a_collect_cpu"); let ( memw_ops, load_ops, @@ -3716,7 +3734,11 @@ impl Traces { ec_scalar_ops, ecdas_ops, ) = collect_ops_from_cpu(&cpu_ops, &mut memory_state, &mut register_state); + #[cfg(feature = "instruments")] + drop(__sp); + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("p2b_collect_all"); let ops = collect_all_ops( cpu_ops, memw_ops, @@ -3732,9 +3754,13 @@ impl Traces { ecdas_ops, &mut register_state, ); + #[cfg(feature = "instruments")] + drop(__sp); // Phases 3-5 - build_traces( + #[cfg(feature = "instruments")] + let __sp = stark::instruments::span("p3to5_build_traces"); + let result = build_traces( ops, Some(elf), &memory_state, @@ -3746,7 +3772,10 @@ impl Traces { #[cfg(feature = "disk-spill")] storage_mode, private_input, - ) + ); + #[cfg(feature = "instruments")] + drop(__sp); + result } /// Generates all traces from execution logs (legacy API). From 0fa7f32ab9371efac32f99a1b44a84c3bdf9448f Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 15:22:04 -0300 Subject: [PATCH 02/17] resident-data scaffolding --- crypto/stark/src/trace.rs | 88 +++++++++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs index 405ce89f8..39e968c7f 100644 --- a/crypto/stark/src/trace.rs +++ b/crypto/stark/src/trace.rs @@ -193,17 +193,58 @@ where pub(crate) aux_columns: Vec>>, pub(crate) lde_step_size: usize, pub(crate) blowup_factor: usize, - /// If the main trace was LDE'd on the GPU via the fused pipeline, - /// the device buffer is retained here so downstream GPU rounds can - /// read the LDE without a re-H2D. `None` when the GPU LDE didn't run - /// for this table (below the size threshold or any CPU fallback: - /// preprocessed main, non-Goldilocks, or GPU error). + /// Per-table GPU residency session: owns the device-resident trace LDE + /// buffers (main + aux) and tracks, per buffer, whether a host mirror is + /// live. Created on the CPU path too (all buffers `None`, mirrors present); + /// populated when the R1 fused GPU pipeline runs. Threaded R1→R4 because + /// `LDETraceTable` is borrowed through every round. #[cfg(feature = "cuda")] - pub(crate) gpu_main: Option, - /// Same as `gpu_main` but for the aux trace (ext3 de-interleaved - /// layout on device). - #[cfg(feature = "cuda")] - pub(crate) gpu_aux: Option, + pub(crate) gpu_session: GpuTableSession, +} + +/// Per-table GPU residency session. +/// +/// Owns the device-resident buffers for a single trace table and tracks, per +/// buffer, whether a host mirror is currently materialised. Today every buffer +/// produced on the GPU is *also* copied to host columns, because the remaining +/// CPU consumers (R2 constraint evaluation, R3 barycentric OOD, R4 query +/// openings) read from host. The `*_host_mirror` flags are the seam those later +/// steps flip: once a consumer reads from the device buffer instead, the +/// corresponding mirror is dropped and the H2D/D2H copy elided. +/// +/// Scope: this owns the main/aux trace LDE (resident R1→R4). The composition +/// parts (`Round2`, R2→R4) and the bound stream are folded in by the control- +/// plane commit, where the R2→R4 borrow chain becomes mutable. The R4-local +/// `inv_denoms`/FRI state are created and consumed within R4 and stay local. +#[cfg(feature = "cuda")] +pub(crate) struct GpuTableSession { + /// Main-trace LDE, resident from the R1 fused pipeline through R4. `None` + /// when the GPU LDE didn't run for this table (below the size threshold or + /// any CPU fallback: preprocessed main, non-Goldilocks, or GPU error). + main_lde: Option, + /// Aux-trace LDE (ext3 de-interleaved layout on device), resident R1→R4. + aux_lde: Option, + /// Whether the main-trace host columns currently mirror `main_lde`. + /// Always `true` today; CPU consumers depend on it. + main_host_mirror: bool, + /// Whether the aux-trace host columns currently mirror `aux_lde`. + /// Always `true` today; CPU consumers depend on it. + aux_host_mirror: bool, +} + +#[cfg(feature = "cuda")] +impl GpuTableSession { + fn new() -> Self { + Self { + main_lde: None, + aux_lde: None, + // Host columns are always materialised today; the CPU consumers + // (constraint eval, OOD, openings) read them. Steps 5/6 flip these + // off as each consumer moves to reading the device buffer. + main_host_mirror: true, + aux_host_mirror: true, + } + } } impl LDETraceTable @@ -227,9 +268,7 @@ where lde_step_size, blowup_factor, #[cfg(feature = "cuda")] - gpu_main: None, - #[cfg(feature = "cuda")] - gpu_aux: None, + gpu_session: GpuTableSession::new(), } } @@ -238,23 +277,38 @@ where /// ran the CPU path should leave this alone. #[cfg(feature = "cuda")] pub fn set_gpu_main(&mut self, h: math_cuda::lde::GpuLdeBase) { - self.gpu_main = Some(h); + self.gpu_session.main_lde = Some(h); } /// Attach an already-populated device LDE handle for the aux columns. #[cfg(feature = "cuda")] pub fn set_gpu_aux(&mut self, h: math_cuda::lde::GpuLdeExt3) { - self.gpu_aux = Some(h); + self.gpu_session.aux_lde = Some(h); } #[cfg(feature = "cuda")] pub fn gpu_main(&self) -> Option<&math_cuda::lde::GpuLdeBase> { - self.gpu_main.as_ref() + self.gpu_session.main_lde.as_ref() } #[cfg(feature = "cuda")] pub fn gpu_aux(&self) -> Option<&math_cuda::lde::GpuLdeExt3> { - self.gpu_aux.as_ref() + self.gpu_session.aux_lde.as_ref() + } + + /// Whether the main-trace host columns currently mirror the device LDE. + /// Always `true` today (CPU consumers read host); the source-agnostic seam + /// steps 5/6 flip when the main consumer reads the device buffer instead. + #[cfg(feature = "cuda")] + pub fn main_host_mirror(&self) -> bool { + self.gpu_session.main_host_mirror + } + + /// Whether the aux-trace host columns currently mirror the device LDE. + /// Always `true` today; see [`Self::main_host_mirror`]. + #[cfg(feature = "cuda")] + pub fn aux_host_mirror(&self) -> bool { + self.gpu_session.aux_host_mirror } /// Consume self and return the owned column vectors. From bdea00e1e3c1663e81220d14c46ef98ad94c972a Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 15:34:53 -0300 Subject: [PATCH 03/17] part 2 --- crypto/math-cuda/src/device.rs | 44 +++++++++++++ crypto/stark/src/prover.rs | 110 +++++++++++++++++++++++++++++++-- 2 files changed, 148 insertions(+), 6 deletions(-) diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 4c12000fd..4e0dde5bb 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -118,6 +118,9 @@ pub struct Backend { pinned_hashes: Vec>, util_stream: Arc, next: AtomicUsize, + /// VRAM budget (bytes) for table-session admission control. See + /// [`detect_vram_budget_bytes`]. + vram_budget_bytes: u64, // arith.ptx pub vector_add_u64: CudaFunction, @@ -202,6 +205,38 @@ fn retain_default_mempool(ctx: &CudaContext) { } } +/// Device VRAM budget (bytes) for table-session admission control. +/// +/// `LAMBDA_VM_VRAM_BUDGET_MB` overrides it explicitly — used to force-exercise +/// the throttle in tests/benchmarks. Otherwise it is 80% of total device +/// memory, leaving headroom for the context, module code, and the pool's +/// retained blocks. On any query failure it returns `u64::MAX`, which disables +/// budgeting: admission then falls back to the core-bound chunk size alone. +fn detect_vram_budget_bytes(ctx: &CudaContext) -> u64 { + if let Ok(mb) = std::env::var("LAMBDA_VM_VRAM_BUDGET_MB") { + if let Ok(mb) = mb.parse::() { + return mb.saturating_mul(1024 * 1024); + } + } + use cudarc::driver::sys; + // SAFETY: raw driver query writing into two stack slots. The caller's + // context is already current (it was just created in `init`). Any error + // falls through to the budgeting-disabled sentinel. + unsafe { + let _ = ctx; + let mut free: usize = 0; + let mut total: usize = 0; + if sys::cuMemGetInfo_v2(&mut free as *mut usize, &mut total as *mut usize) + .result() + .is_err() + { + return u64::MAX; + } + // 80% of total, computed to avoid intermediate overflow. + (total as u64) / 5 * 4 + } +} + impl Backend { fn init() -> Result { let ctx = CudaContext::new(0)?; @@ -259,6 +294,8 @@ impl Backend { // Length = TWO_ADICITY + 1 to allow indexing at log_n = TWO_ADICITY. let max_log = GoldilocksField::TWO_ADICITY as usize + 1; + let vram_budget_bytes = detect_vram_budget_bytes(&ctx); + Ok(Self { vector_add_u64: arith.load_function("vector_add_u64")?, gl_add: arith.load_function("gl_add_kernel")?, @@ -308,9 +345,16 @@ impl Backend { pinned_hashes, util_stream, next: AtomicUsize::new(0), + vram_budget_bytes, }) } + /// VRAM budget in bytes for table-session admission control. `u64::MAX` + /// when budgeting is disabled (query failed). See the field docs. + pub fn vram_budget_bytes(&self) -> u64 { + self.vram_budget_bytes + } + /// Round-robin over the stream pool. Concurrent callers get different /// streams so their kernel launches overlap on the GPU. pub fn next_stream(&self) -> Arc { diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index eec131be7..6b3e345b8 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -323,6 +323,59 @@ pub fn table_parallelism() -> usize { } } +/// Heuristic peak device working-set for one table, in bytes. +/// +/// Counts the LDE columns that are co-resident on the GPU — `main` in the base +/// field (8 B) and `aux` in the ext3 field (24 B) — times a scratch multiplier +/// for the Merkle / NTT / composition transients allocated alongside them. It +/// is deliberately a conservative over-estimate: it gates a safety ceiling, not +/// a precise allocator. Pass `aux_cols == 0` for phases where the aux LDE is +/// not yet resident (the R1 main commit). +fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) -> u64 { + const BYTES_PER_BASE: u64 = 8; + const EXT3_BYTES: u64 = 24; + const SCRATCH_FACTOR: u64 = 2; + let per_row = (main_cols as u64).saturating_mul(BYTES_PER_BASE) + + (aux_cols as u64).saturating_mul(EXT3_BYTES); + (lde_size as u64) + .saturating_mul(per_row) + .saturating_mul(SCRATCH_FACTOR) +} + +/// Plan contiguous table chunks for parallel proving. +/// +/// A chunk grows until it reaches `k` tables (the core/RAM-bound limit) **or** +/// its summed VRAM estimate would exceed `budget` — whichever comes first. A +/// single table larger than `budget` forms its own chunk (it runs solo rather +/// than being excluded). With `budget == u64::MAX` this degrades exactly to +/// fixed chunks of `k`, identical to the previous `step_by(k)` scheme — so on +/// non-cuda builds and when VRAM isn't the binding constraint, scheduling (and +/// therefore the proof) is unchanged. Returns `(start, end)` half-open ranges +/// covering `0..estimates.len()` in order. +fn plan_table_chunks(estimates: &[u64], k: usize, budget: u64) -> Vec<(usize, usize)> { + let n = estimates.len(); + let k = k.max(1); + let budget = budget as u128; + let mut chunks = Vec::new(); + let mut start = 0; + while start < n { + let mut end = start; + let mut acc: u128 = 0; + while end < n { + let next = estimates[end] as u128; + // Always admit at least one table per chunk (oversized → solo). + if end > start && (end - start >= k || acc + next > budget) { + break; + } + acc += next; + end += 1; + } + chunks.push((start, end)); + start = end; + } + chunks +} + /// A container for the results of the second round of the STARK Prove protocol. pub(crate) struct Round2 where @@ -1726,6 +1779,33 @@ pub trait IsStarkProver< let k = table_parallelism().min(num_airs).max(1); + // VRAM-budgeted admission. The budget caps the summed device working-set + // of the tables proved concurrently so large blocks don't exhaust VRAM. + // It is an *additional* ceiling on top of `k` (it never raises + // concurrency): on non-cuda builds, or when the budget can't be queried, + // it is `u64::MAX` and chunking falls back to fixed size `k`. + #[cfg(feature = "cuda")] + let vram_budget = math_cuda::device::backend() + .map(|b| b.vram_budget_bytes()) + .unwrap_or(u64::MAX); + #[cfg(not(feature = "cuda"))] + let vram_budget = u64::MAX; + + // R1 main commit: only the main LDE (+ its Merkle scratch) is resident, + // so the aux columns contribute nothing to this phase's working-set. + let main_chunks = { + let estimates: Vec = air_trace_pairs + .iter() + .enumerate() + .map(|(idx, (_, trace, _))| { + let lde_size = domains[idx].interpolation_domain_size + * domains[idx].blowup_factor; + estimate_table_vram_bytes(trace.num_main_columns, 0, lde_size) + }) + .collect(); + plan_table_chunks(&estimates, k, vram_budget) + }; + // Spill main traces to mmap before Round 1 LDE. #[cfg(feature = "disk-spill")] if storage_mode == StorageMode::Disk { @@ -1770,8 +1850,7 @@ pub trait IsStarkProver< let mut main_gpu_handles: Vec> = Vec::with_capacity(num_airs); - for chunk_start in (0..num_airs).step_by(k) { - let chunk_end = (chunk_start + k).min(num_airs); + for &(chunk_start, chunk_end) in &main_chunks { let chunk_range = chunk_start..chunk_end; #[cfg(feature = "parallel")] @@ -1929,8 +2008,28 @@ pub trait IsStarkProver< #[allow(clippy::type_complexity)] let mut aux_results: Vec> = Vec::with_capacity(num_airs); - for chunk_start in (0..num_airs).step_by(k) { - let chunk_end = (chunk_start + k).min(num_airs); + // R1 aux commit and rounds 2–4 share the peak working-set: the main and + // aux LDEs are co-resident, plus the composition / Merkle transients + // (folded into the scratch factor). `num_aux_columns` is now populated + // by the aux build above, so this estimate is accurate for both phases. + let peak_chunks = { + let estimates: Vec = air_trace_pairs + .iter() + .enumerate() + .map(|(idx, (_, trace, _))| { + let lde_size = domains[idx].interpolation_domain_size + * domains[idx].blowup_factor; + estimate_table_vram_bytes( + trace.num_main_columns, + trace.num_aux_columns, + lde_size, + ) + }) + .collect(); + plan_table_chunks(&estimates, k, vram_budget) + }; + + for &(chunk_start, chunk_end) in &peak_chunks { let chunk_range = chunk_start..chunk_end; #[cfg(feature = "parallel")] @@ -2113,8 +2212,7 @@ pub trait IsStarkProver< let mut proofs = Vec::with_capacity(num_airs); let mut lde_drain = cached_ldes.into_iter(); - for chunk_start in (0..num_airs).step_by(k) { - let chunk_end = (chunk_start + k).min(num_airs); + for &(chunk_start, chunk_end) in &peak_chunks { let chunk_size = chunk_end - chunk_start; let chunk_ldes: Vec> = From e9dcda8c7cbed9a11ed71f3278871c3d5c528687 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 16:00:17 -0300 Subject: [PATCH 04/17] composition parts fold --- crypto/stark/src/gpu_lde.rs | 18 ++++++++--- crypto/stark/src/prover.rs | 30 +++++++++---------- crypto/stark/src/trace.rs | 59 ++++++++++++++++++++++++++++++++++++- 3 files changed, 87 insertions(+), 20 deletions(-) diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 36756b40b..6c6e725a1 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -1421,13 +1421,18 @@ pub(crate) fn try_inv_denoms_dev_with_stream( coset_base: &[FieldElement], z_scalars: &[FieldElement], sign: math_cuda::inverse::DenomSign, + bound_stream: Option>, ) -> Option<(CudaSlice, Arc)> where F: IsField + 'static, E: IsField + 'static, { - let be = math_cuda::device::backend().ok()?; - let stream = be.next_stream(); + // Use the caller's per-table session stream when provided, so this table's + // R3/R4 device chain serialises on one queue; otherwise grab a pool stream. + let stream = match bound_stream { + Some(s) => s, + None => math_cuda::device::backend().ok()?.next_stream(), + }; let handle = try_compute_and_invert_inv_denoms_dev::(coset_base, z_scalars, sign, &stream)?; Some((handle, stream)) @@ -1456,6 +1461,7 @@ pub(crate) struct R3DevContext { pub(crate) fn try_prep_r3_dev_context( coset_base: &[FieldElement], z_scalars: &[FieldElement], + bound_stream: Option>, ) -> Option where F: IsField + 'static, @@ -1477,8 +1483,12 @@ where return None; } - let be = math_cuda::device::backend().ok()?; - let stream = be.next_stream(); + // Per-table session stream when provided (shares the queue with R4 DEEP for + // this table); otherwise a pool stream. + let stream = match bound_stream { + Some(s) => s, + None => math_cuda::device::backend().ok()?.next_stream(), + }; // SAFETY: F == Goldilocks per TypeId check; FieldElement is // #[repr(transparent)] over u64. diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 6b3e345b8..2e1987b07 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -388,13 +388,6 @@ where pub(crate) composition_poly_merkle_tree: BatchedMerkleTree, /// The commitment to the composition polynomial parts. pub(crate) composition_poly_root: Commitment, - /// Device-resident de-interleaved LDE handle from the R2 fused GPU path - /// (`try_evaluate_parts_on_lde_gpu_keep`). When present, R4 DEEP skips - /// the `num_parts * 3 * lde_size * 8` byte H2D and reads parts on - /// device. `None` when the GPU R2 path didn't run (number_of_parts <= 2, - /// below threshold, or any CPU fallback). - #[cfg(feature = "cuda")] - pub(crate) gpu_composition_parts: Option, } /// A container for the results of the third round of the STARK Prove protocol. @@ -1037,7 +1030,7 @@ pub trait IsStarkProver< air: &dyn AIR, pub_inputs: &PI, domain: &Domain, - round_1_result: &Round1, + round_1_result: &mut Round1, transition_coefficients: &[FieldElement], boundary_coefficients: &[FieldElement], ) -> Result, ProvingError> @@ -1160,12 +1153,18 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] crate::instruments::store_r2_sub(constraints_dur, fft_dur, merkle_dur); + // Fold the R2 device composition-parts handle into the per-table session + // (resident R2→R4). The host evaluations remain in `Round2` as the + // mirror R4 openings still read; `composition_host_mirror` flags that. + #[cfg(feature = "cuda")] + if let Some(handle) = gpu_composition_parts { + round_1_result.lde_trace.set_gpu_composition_parts(handle); + } + Ok(Round2 { lde_composition_poly_evaluations: lde_composition_poly_parts_evaluations, composition_poly_merkle_tree, composition_poly_root, - #[cfg(feature = "cuda")] - gpu_composition_parts, }) } @@ -1425,11 +1424,12 @@ pub trait IsStarkProver< &domain.lde_roots_of_unity_coset, &z_scalars, math_cuda::inverse::DenomSign::XMinusZ, + lde_trace.bound_stream(), ) && let Some(deep_evals) = crate::gpu_lde::try_deep_composition_gpu::( lde_trace, - round_2_result.gpu_composition_parts.as_ref(), + lde_trace.gpu_composition_parts(), &round_2_result.lde_composition_poly_evaluations, h_ood, &trace_ood_columns, @@ -1465,7 +1465,7 @@ pub trait IsStarkProver< if let Some(deep_evals) = crate::gpu_lde::try_deep_composition_gpu::( lde_trace, - round_2_result.gpu_composition_parts.as_ref(), + lde_trace.gpu_composition_parts(), &round_2_result.lde_composition_poly_evaluations, h_ood, &trace_ood_columns, @@ -2244,7 +2244,7 @@ pub trait IsStarkProver< let table_start = Instant::now(); // Build Round1 from cached LDE (consumed by value, no recomputation). - let round_1_result = + let mut round_1_result = commitment.build_round1(lde, air.step_size(), domain.blowup_factor); if let Some(ref bpi) = round_1_result.bus_public_inputs { @@ -2254,7 +2254,7 @@ pub trait IsStarkProver< let proof = Self::prove_rounds_2_to_4( *air, *pub_inputs, - &round_1_result, + &mut round_1_result, table_transcript, domain, )?; @@ -2343,7 +2343,7 @@ pub trait IsStarkProver< fn prove_rounds_2_to_4( air: &dyn AIR, pub_inputs: &PI, - round_1_result: &Round1, + round_1_result: &mut Round1, transcript: &mut (impl IsStarkTranscript + Clone), domain: &Domain, ) -> Result, ProvingError> diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs index 39e968c7f..d70a7e664 100644 --- a/crypto/stark/src/trace.rs +++ b/crypto/stark/src/trace.rs @@ -7,6 +7,8 @@ use math::polynomial::barycentric_inv_denoms; use math::spill_safe::SpillSafe; #[cfg(feature = "parallel")] use rayon::prelude::{IntoParallelIterator, ParallelIterator}; +#[cfg(feature = "cuda")] +use std::sync::{Arc, OnceLock}; /// A two-dimensional representation of an execution trace of the STARK /// protocol. @@ -224,12 +226,29 @@ pub(crate) struct GpuTableSession { main_lde: Option, /// Aux-trace LDE (ext3 de-interleaved layout on device), resident R1→R4. aux_lde: Option, + /// Composition-poly parts LDE (ext3 de-interleaved on device), produced in + /// R2 and resident R2→R4 so R4 DEEP reads the parts on-device instead of a + /// `num_parts * 3 * lde_size * 8` byte H2D. `None` when the R2 GPU path + /// didn't run (number_of_parts <= 2, below threshold, or CPU fallback). + composition_parts: Option, /// Whether the main-trace host columns currently mirror `main_lde`. /// Always `true` today; CPU consumers depend on it. main_host_mirror: bool, /// Whether the aux-trace host columns currently mirror `aux_lde`. /// Always `true` today; CPU consumers depend on it. aux_host_mirror: bool, + /// Whether the host composition-parts evaluations (`Round2`) mirror + /// `composition_parts`. Always `true` today; R4 openings read host. + composition_host_mirror: bool, + /// Stream bound to this table's GPU work, acquired lazily from the backend + /// pool on first use and cached for the session's lifetime. The R3/R4 + /// device-resident chain (inv_denoms → barycentric/OOD → DEEP) runs on it + /// today; the heavy LDE/Merkle ops join once they thread a stream. Binding + /// one stream per table serialises a table's kernels on a single queue and + /// gives distinct tables distinct streams — the prerequisite for cross-table + /// overlap once the host mirrors drop (Steps 4–5). `None` is cached if the + /// backend is unavailable, so callers fall back to the CPU path. + stream: OnceLock>>, } #[cfg(feature = "cuda")] @@ -238,11 +257,14 @@ impl GpuTableSession { Self { main_lde: None, aux_lde: None, + composition_parts: None, // Host columns are always materialised today; the CPU consumers // (constraint eval, OOD, openings) read them. Steps 5/6 flip these // off as each consumer moves to reading the device buffer. main_host_mirror: true, aux_host_mirror: true, + composition_host_mirror: true, + stream: OnceLock::new(), } } } @@ -311,6 +333,37 @@ where self.gpu_session.aux_host_mirror } + /// Attach the device-resident composition-poly parts LDE produced in R2. + /// Read by R4 DEEP so the parts aren't re-uploaded H2D. + #[cfg(feature = "cuda")] + pub fn set_gpu_composition_parts(&mut self, h: math_cuda::lde::GpuLdeExt3) { + self.gpu_session.composition_parts = Some(h); + } + + #[cfg(feature = "cuda")] + pub fn gpu_composition_parts(&self) -> Option<&math_cuda::lde::GpuLdeExt3> { + self.gpu_session.composition_parts.as_ref() + } + + /// Whether the host composition-parts evaluations mirror the device buffer. + /// Always `true` today; see [`Self::main_host_mirror`]. + #[cfg(feature = "cuda")] + pub fn composition_host_mirror(&self) -> bool { + self.gpu_session.composition_host_mirror + } + + /// The stream bound to this table's GPU work. Acquired lazily from the + /// backend pool on first call and cached for the session's lifetime, so all + /// of a table's stream-threaded ops share one queue. Returns `None` (cached) + /// when the backend is unavailable; callers then fall back to the CPU path. + #[cfg(feature = "cuda")] + pub fn bound_stream(&self) -> Option> { + self.gpu_session + .stream + .get_or_init(|| math_cuda::device::backend().ok().map(|b| b.next_stream())) + .clone() + } + /// Consume self and return the owned column vectors. #[allow(clippy::type_complexity)] pub fn into_columns(self) -> (Vec>>, Vec>>) { @@ -438,7 +491,11 @@ where // both via offset, with no per-eval-point or per-{main,aux} H2D. #[cfg(feature = "cuda")] let r3_ctx: Option = - crate::gpu_lde::try_prep_r3_dev_context::(&dc.points, &evaluation_points); + crate::gpu_lde::try_prep_r3_dev_context::( + &dc.points, + &evaluation_points, + lde_trace.bound_stream(), + ); #[allow(unused_variables)] #[cfg(not(feature = "cuda"))] let r3_ctx: Option<()> = None; From b08ef819675ff10e04060ecd2c15834e168c0115 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 16:10:16 -0300 Subject: [PATCH 05/17] fix --- crypto/math-cuda/src/device.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 4e0dde5bb..9ea480b9f 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -213,10 +213,10 @@ fn retain_default_mempool(ctx: &CudaContext) { /// retained blocks. On any query failure it returns `u64::MAX`, which disables /// budgeting: admission then falls back to the core-bound chunk size alone. fn detect_vram_budget_bytes(ctx: &CudaContext) -> u64 { - if let Ok(mb) = std::env::var("LAMBDA_VM_VRAM_BUDGET_MB") { - if let Ok(mb) = mb.parse::() { - return mb.saturating_mul(1024 * 1024); - } + if let Ok(mb) = std::env::var("LAMBDA_VM_VRAM_BUDGET_MB") + && let Ok(mb) = mb.parse::() + { + return mb.saturating_mul(1024 * 1024); } use cudarc::driver::sys; // SAFETY: raw driver query writing into two stack slots. The caller's From 8ea8a0af1b1f9e445e1ad662768885a97b31a8a8 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 16:11:38 -0300 Subject: [PATCH 06/17] fix_doc --- crypto/stark/src/trace.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs index d70a7e664..c7a9a941a 100644 --- a/crypto/stark/src/trace.rs +++ b/crypto/stark/src/trace.rs @@ -214,9 +214,8 @@ where /// steps flip: once a consumer reads from the device buffer instead, the /// corresponding mirror is dropped and the H2D/D2H copy elided. /// -/// Scope: this owns the main/aux trace LDE (resident R1→R4). The composition -/// parts (`Round2`, R2→R4) and the bound stream are folded in by the control- -/// plane commit, where the R2→R4 borrow chain becomes mutable. The R4-local +/// Scope: this owns the main/aux trace LDE (resident R1→R4), the composition +/// parts (`Round2`, R2→R4), and the per-table bound stream. The R4-local /// `inv_denoms`/FRI state are created and consumed within R4 and stay local. #[cfg(feature = "cuda")] pub(crate) struct GpuTableSession { From caa7efdd02218a651a166caf8a7e713fb270a2c9 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 17:10:46 -0300 Subject: [PATCH 07/17] move merkle tree to gpu --- crypto/math-cuda/kernels/keccak.cu | 34 ++++++++ crypto/math-cuda/src/device.rs | 2 + crypto/math-cuda/src/lde.rs | 40 +++++++++ crypto/math-cuda/src/merkle.rs | 56 +++++++++++++ crypto/math-cuda/tests/barycentric_strided.rs | 1 + crypto/math-cuda/tests/deep.rs | 1 + crypto/math-cuda/tests/merkle_gather.rs | 84 +++++++++++++++++++ crypto/stark/src/gpu_lde.rs | 41 ++++++++- crypto/stark/src/prover.rs | 80 +++++++++++++++++- 9 files changed, 334 insertions(+), 5 deletions(-) create mode 100644 crypto/math-cuda/tests/merkle_gather.rs diff --git a/crypto/math-cuda/kernels/keccak.cu b/crypto/math-cuda/kernels/keccak.cu index c22bc4d05..bcaf519f4 100644 --- a/crypto/math-cuda/kernels/keccak.cu +++ b/crypto/math-cuda/kernels/keccak.cu @@ -347,3 +347,37 @@ extern "C" __global__ void keccak_merkle_level( finalize_keccak256(st, rate_pos, nodes + (parent_begin + tid) * 32); } + +// Gather Merkle authentication paths for a batch of leaf positions, reading the +// resident tree `nodes` (32-byte nodes; layout: inner nodes [0..leaves_len-1], +// root at 0, leaves at [leaves_len-1..]). One thread per query walks leaf->root, +// writing each sibling node into the output. This mirrors the CPU +// `build_merkle_path` exactly (sibling_index / parent_index in +// crypto/crypto/src/merkle_tree/utils.rs): +// leaf node = pos + leaves_len - 1 +// sibling = node even ? node-1 : node+1 +// parent = node even ? (node-1)/2 : node/2 +// so `out[(q*depth + level)*32 .. +32]` is the level-th sibling for query q. +extern "C" __global__ void merkle_gather_paths( + const uint8_t *nodes, + const uint32_t *positions, // leaf positions, length num_queries + uint32_t num_queries, + uint64_t leaves_len, + uint32_t depth, // = log2(leaves_len) + uint8_t *out) { // num_queries * depth * 32 bytes + uint32_t q = blockIdx.x * blockDim.x + threadIdx.x; + if (q >= num_queries) return; + + uint64_t node = (uint64_t)positions[q] + leaves_len - 1; + for (uint32_t level = 0; level < depth; ++level) { + uint64_t sib = (node & 1ull) ? (node + 1ull) : (node - 1ull); + // 32-byte nodes at 32-byte-aligned offsets (cuMemAlloc 256-aligned), + // so the u64 copy is safe. + const uint64_t *src = reinterpret_cast(nodes + sib * 32); + uint64_t *dst = reinterpret_cast( + out + ((uint64_t)q * depth + level) * 32); + #pragma unroll + for (int i = 0; i < 4; ++i) dst[i] = src[i]; + node = (node & 1ull) ? (node >> 1) : ((node - 1ull) >> 1); + } +} diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 9ea480b9f..6b6094d45 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -150,6 +150,7 @@ pub struct Backend { pub keccak_comp_poly_leaves_ext3: CudaFunction, pub keccak_fri_leaves_ext3: CudaFunction, pub keccak_merkle_level: CudaFunction, + pub merkle_gather_paths: CudaFunction, // barycentric.ptx pub barycentric_base_batched: CudaFunction, @@ -320,6 +321,7 @@ impl Backend { keccak_comp_poly_leaves_ext3: keccak.load_function("keccak_comp_poly_leaves_ext3")?, keccak_fri_leaves_ext3: keccak.load_function("keccak_fri_leaves_ext3")?, keccak_merkle_level: keccak.load_function("keccak_merkle_level")?, + merkle_gather_paths: keccak.load_function("merkle_gather_paths")?, barycentric_base_batched: bary.load_function("barycentric_base_batched")?, barycentric_ext3_batched: bary.load_function("barycentric_ext3_batched")?, barycentric_base_batched_strided: bary diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index ee5dc3fce..f65d17a35 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -219,11 +219,17 @@ fn launch_pointwise_mul_batched( /// Handle to a base-field LDE kept live on device after R1 commit. /// Layout: `m` columns, each `lde_size` u64s, column `c` at byte offset /// `c * lde_size * 8` within `buf`. Freed when `buf` Arc drops. +/// +/// `tree` optionally carries the main-trace Merkle tree kept resident on device +/// (populated by the keep path), so R4 query openings can gather authentication +/// paths on device instead of D2H'ing the whole tree to host. `None` on the CPU +/// path or when the tree wasn't retained. #[derive(Clone)] pub struct GpuLdeBase { pub buf: Arc>, pub m: usize, pub lde_size: usize, + pub tree: Option, } /// Handle to an ext3 LDE kept live on device, de-interleaved into 3 base @@ -236,6 +242,18 @@ pub struct GpuLdeExt3 { pub lde_size: usize, } +/// Handle to a Merkle tree kept live on device after a commit, so query +/// openings can gather authentication paths on device instead of D2H'ing the +/// whole tree to host. Node layout matches the CPU tree +/// (`crypto/crypto/src/merkle_tree`): `nodes[0..leaves_len-1]` are inner nodes +/// (root at index 0), `nodes[leaves_len-1..]` are the leaves; each node is 32 +/// bytes. Freed when the `nodes` Arc drops. +#[derive(Clone)] +pub struct GpuMerkleTree { + pub nodes: Arc>, + pub leaves_len: usize, +} + pub fn coset_lde_base(evals: &[u64], blowup_factor: usize, weights: &[u64]) -> Result> { let n = evals.len(); // Empty input must short-circuit before the power-of-two assert @@ -613,6 +631,7 @@ pub fn coset_lde_batch_base_into_with_leaf_hash( hashed_leaves_out, KeccakCommit::LeavesOnly, false, + false, ) .map(|_| ()) } @@ -640,6 +659,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree( merkle_nodes_out, KeccakCommit::FullTree, false, + false, ) .map(|_| ()) } @@ -654,6 +674,10 @@ pub fn coset_lde_batch_base_into_with_merkle_tree_keep( outputs: &mut [&mut [u64]], merkle_nodes_out: &mut [u8], ) -> Result { + // `keep_tree = true`: the main-trace tree nodes are retained on device + // (inside the returned `GpuLdeBase.tree`) so R4 openings can gather paths on + // device. The host `merkle_nodes_out` is still filled, so host-tree openings + // keep working until a later step drops the host tree. let opt = coset_lde_batch_base_into_with_merkle_tree_inner( columns, blowup_factor, @@ -662,11 +686,13 @@ pub fn coset_lde_batch_base_into_with_merkle_tree_keep( merkle_nodes_out, KeccakCommit::FullTree, true, + true, )?; let handle = opt.expect("keep_device_buf=true must return Some"); Ok(handle) } +#[allow(clippy::too_many_arguments)] fn coset_lde_batch_base_into_with_merkle_tree_inner( columns: &[&[u64]], blowup_factor: usize, @@ -675,6 +701,7 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( nodes_out: &mut [u8], commit: KeccakCommit, keep_device_buf: bool, + keep_tree: bool, ) -> Result> { if columns.is_empty() { assert_eq!(outputs.len(), 0); @@ -817,13 +844,26 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( drop(staging); if keep_device_buf { + // Retain the device tree nodes for on-device opening (FullTree only; in + // LeavesOnly mode there are no inner nodes to gather paths from). + let tree = if keep_tree && commit == KeccakCommit::FullTree { + Some(GpuMerkleTree { + nodes: Arc::new(nodes_dev), + leaves_len: lde_size, + }) + } else { + drop(nodes_dev); + None + }; Ok(Some(GpuLdeBase { buf: Arc::new(buf), m, lde_size, + tree, })) } else { drop(buf); + drop(nodes_dev); Ok(None) } } diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index 932e81325..1c1e54093 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -17,6 +17,7 @@ //! to match `FieldElement::::write_bytes_be`. use cudarc::driver::{CudaSlice, CudaStream, CudaViewMut, LaunchConfig, PushKernelArg}; +use std::sync::Arc; use crate::Result; use crate::device::{Backend, backend}; @@ -217,6 +218,61 @@ pub fn build_merkle_tree_on_device(hashed_leaves: &[u8]) -> Result> { Ok(out) } +/// Gather Merkle authentication paths on device for `positions` (leaf indices) +/// against the resident tree `nodes_dev` (standard layout, `2*leaves_len-1` +/// nodes of 32 bytes). Returns `positions.len() * depth * 32` bytes, where +/// `depth = log2(leaves_len)`: query `q`'s path occupies bytes +/// `[q*depth*32 .. (q+1)*depth*32]`, each 32-byte node a sibling from leaf to +/// root — byte-for-byte the same nodes the CPU `MerkleTree::get_proof_by_pos` +/// (`build_merkle_path`) collects. Runs on the caller's `stream` (pass the +/// table's session stream so it shares the queue with the rest of R4). +pub fn gather_merkle_paths_dev( + nodes_dev: &CudaSlice, + leaves_len: usize, + positions: &[u32], + stream: &Arc, +) -> Result> { + let num_queries = positions.len(); + if num_queries == 0 { + return Ok(Vec::new()); + } + assert!( + leaves_len.is_power_of_two() && leaves_len >= 2, + "leaves_len must be a power of two >= 2" + ); + let depth = leaves_len.trailing_zeros() as usize; + let be = backend()?; + + let pos_dev = stream.clone_htod(positions)?; + // SAFETY: every byte of `out` is written by the kernel below (one 32-byte + // node per (query, level)) before the D2H reads it back. + let mut out = unsafe { stream.alloc::(num_queries * depth * 32) }?; + + let grid = (num_queries as u32).div_ceil(KECCAK_BLOCK_DIM); + let cfg = LaunchConfig { + grid_dim: (grid, 1, 1), + block_dim: (KECCAK_BLOCK_DIM, 1, 1), + shared_mem_bytes: 0, + }; + let num_queries_u32 = num_queries as u32; + let leaves_len_u64 = leaves_len as u64; + let depth_u32 = depth as u32; + unsafe { + stream + .launch_builder(&be.merkle_gather_paths) + .arg(nodes_dev) + .arg(&pos_dev) + .arg(&num_queries_u32) + .arg(&leaves_len_u64) + .arg(&depth_u32) + .arg(&mut out) + .launch(cfg)?; + } + let host = stream.clone_dtoh(&out)?; + stream.synchronize()?; + Ok(host) +} + /// Row-pair Keccak leaf + Merkle tree build for R2 composition-polynomial /// commit. `parts_interleaved` is `num_parts` slices, each holding an ext3 /// LDE column interleaved as `[a0,a1,a2, b0,b1,b2, ...]` of length `3*lde_size`. diff --git a/crypto/math-cuda/tests/barycentric_strided.rs b/crypto/math-cuda/tests/barycentric_strided.rs index 653ef4e38..b1963522f 100644 --- a/crypto/math-cuda/tests/barycentric_strided.rs +++ b/crypto/math-cuda/tests/barycentric_strided.rs @@ -49,6 +49,7 @@ fn run_base(log_trace: u32, blowup: usize, num_cols: usize, seed: u64) { buf: Arc::new(lde_dev), m: num_cols, lde_size, + tree: None, }; // Pre-strided buffer for non-strided reference: trace-size picks of each col. diff --git a/crypto/math-cuda/tests/deep.rs b/crypto/math-cuda/tests/deep.rs index 8499cd04a..57ea14d3c 100644 --- a/crypto/math-cuda/tests/deep.rs +++ b/crypto/math-cuda/tests/deep.rs @@ -177,6 +177,7 @@ fn run_parity( buf: Arc::new(main_dev), m: num_main, lde_size, + tree: None, }; let aux_handle = if num_aux > 0 { Some(GpuLdeExt3 { diff --git a/crypto/math-cuda/tests/merkle_gather.rs b/crypto/math-cuda/tests/merkle_gather.rs new file mode 100644 index 000000000..b2ba24c1d --- /dev/null +++ b/crypto/math-cuda/tests/merkle_gather.rs @@ -0,0 +1,84 @@ +//! Parity: GPU `gather_merkle_paths_dev` must produce, for each leaf position, +//! the exact `merkle_path` the CPU `MerkleTree::get_proof_by_pos` returns — +//! same sibling-from-leaf-to-root order, byte-for-byte. This is the gate for +//! gathering R4 query openings on device instead of D2H'ing the whole tree. + +use crypto::merkle_tree::backends::field_element_vector::FieldElementVectorBackend; +use crypto::merkle_tree::merkle::MerkleTree; +use math::field::goldilocks::GoldilocksField; +use rand::{Rng, SeedableRng}; +use rand_chacha::ChaCha8Rng; +use sha3::Keccak256; + +type CpuTree = MerkleTree>; + +fn run_gather_parity(log_n: u32, seed: u64) { + let leaves_len = 1usize << log_n; + let mut rng = ChaCha8Rng::seed_from_u64(seed); + let leaves: Vec<[u8; 32]> = (0..leaves_len) + .map(|_| { + let mut arr = [0u8; 32]; + rng.fill(&mut arr[..]); + arr + }) + .collect(); + + let mut flat = Vec::with_capacity(leaves_len * 32); + for l in &leaves { + flat.extend_from_slice(l); + } + + // Build the tree on device, then upload its nodes back as the resident + // buffer the gather reads (build_merkle_tree_on_device returns host bytes). + let gpu_nodes_bytes = math_cuda::merkle::build_merkle_tree_on_device(&flat).unwrap(); + + // CPU reference tree over the same backend as the prover. + let cpu_tree = CpuTree::build_from_hashed_leaves(leaves).unwrap(); + + // Query a spread of positions: first, last, and random interior ones. + let mut positions: Vec = vec![0, (leaves_len - 1) as u32]; + let mut r = ChaCha8Rng::seed_from_u64(seed ^ 0xabcd); + for _ in 0..16usize.min(leaves_len) { + positions.push(r.gen_range(0..leaves_len) as u32); + } + + let be = math_cuda::device::backend().unwrap(); + let stream = be.next_stream(); + let nodes_dev = stream.clone_htod(&gpu_nodes_bytes).unwrap(); + stream.synchronize().unwrap(); + + let depth = log_n as usize; + let paths = + math_cuda::merkle::gather_merkle_paths_dev(&nodes_dev, leaves_len, &positions, &stream) + .unwrap(); + assert_eq!(paths.len(), positions.len() * depth * 32); + + for (q, &pos) in positions.iter().enumerate() { + let cpu_proof = cpu_tree.get_proof_by_pos(pos as usize).unwrap(); + assert_eq!( + cpu_proof.merkle_path.len(), + depth, + "depth mismatch at log_n={log_n} pos={pos}" + ); + for (level, cpu_node) in cpu_proof.merkle_path.iter().enumerate() { + let g = &paths[(q * depth + level) * 32..(q * depth + level + 1) * 32]; + assert_eq!( + g, + &cpu_node[..], + "path node mismatch: log_n={log_n} pos={pos} level={level}" + ); + } + } +} + +#[test] +fn merkle_gather_small() { + for log_n in 1u32..=6 { + run_gather_parity(log_n, 200 + log_n as u64); + } +} + +#[test] +fn merkle_gather_large() { + run_gather_parity(18, 7777); +} diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 6c6e725a1..f6c85553e 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -16,6 +16,7 @@ use math_cuda::{CudaSlice, CudaStream}; use crypto::fiat_shamir::is_transcript::IsStarkTranscript; use crypto::merkle_tree::merkle::MerkleTree; +use crypto::merkle_tree::proof::Proof; use crypto::merkle_tree::traits::IsMerkleTreeBackend; use math::field::element::FieldElement; use math::field::extensions_goldilocks::Degree3GoldilocksExtensionField; @@ -23,7 +24,7 @@ use math::field::goldilocks::GoldilocksField; use math::field::traits::{IsFFTField, IsField, IsSubFieldOf}; use math::traits::AsBytes; -use crate::config::FriLayerMerkleTreeBackend; +use crate::config::{Commitment, FriLayerMerkleTreeBackend}; use crate::domain::Domain; use crate::fri::fri_commitment::FriLayer; use crate::fri::fri_functions::compute_coset_twiddles_inv; @@ -1438,6 +1439,44 @@ where Some((handle, stream)) } +/// Gather Merkle authentication paths on device for `positions` (leaf indices), +/// returning one [`Proof`] per position in the same order. Byte-identical to +/// the host `MerkleTree::get_proof_by_pos` (guarded by the `merkle_gather` +/// parity test), so R4 query openings can source proofs from the resident +/// device tree instead of the host tree. Returns `None` on any cudarc error +/// (the caller then falls back to the host tree). +pub(crate) fn gather_proofs_dev( + tree: &math_cuda::lde::GpuMerkleTree, + positions: &[usize], + stream: &Arc, +) -> Option>> { + if positions.is_empty() { + return Some(Vec::new()); + } + let positions_u32: Vec = positions.iter().map(|&p| p as u32).collect(); + let bytes = math_cuda::merkle::gather_merkle_paths_dev( + &tree.nodes, + tree.leaves_len, + &positions_u32, + stream, + ) + .ok()?; + let depth = tree.leaves_len.trailing_zeros() as usize; + debug_assert_eq!(bytes.len(), positions.len() * depth * 32); + let mut proofs = Vec::with_capacity(positions.len()); + for q in 0..positions.len() { + let mut merkle_path = Vec::with_capacity(depth); + for level in 0..depth { + let off = (q * depth + level) * 32; + let mut node: Commitment = [0u8; 32]; + node.copy_from_slice(&bytes[off..off + 32]); + merkle_path.push(node); + } + proofs.push(Proof { merkle_path }); + } + Some(proofs) +} + /// R3 OOD device-side context: bundles the inverted denominators, the /// coset_points upload (used by every barycentric kernel for this batch), /// and the stream so producer + consumers serialize naturally. Hoisting diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 2e1987b07..25d63cd2d 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -35,6 +35,8 @@ use crate::trace::LDETraceTable; use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; use super::constraints::evaluator::ConstraintEvaluator; +#[cfg(feature = "cuda")] +use crypto::merkle_tree::proof::Proof; use super::domain::{Domain, DomainConstants}; use super::fri::fri_decommit::FriDecommitment; use super::grinding; @@ -1619,6 +1621,36 @@ pub trait IsStarkProver< } } + /// Like [`Self::open_polys_with`], but uses Merkle proofs already gathered + /// from the resident device tree (see [`crate::gpu_lde::gather_proofs_dev`]) + /// instead of walking a host tree. The evaluations are still gathered from + /// the host-resident LDE columns via `gather`. `proof` is for leaf position + /// `challenge * 2`, `proof_sym` for `challenge * 2 + 1` — the same positions + /// `open_polys_with` opens. + #[cfg(feature = "cuda")] + fn open_polys_with_proofs( + domain: &Domain, + proof: Proof, + proof_sym: Proof, + challenge: usize, + gather: G, + ) -> PolynomialOpenings + where + C: IsField, + FieldElement: AsBytes + Sync + Send, + G: Fn(usize) -> Vec>, + { + let domain_size = domain.lde_roots_of_unity_coset.len() as u64; + let index = challenge * 2; + let index_sym = challenge * 2 + 1; + PolynomialOpenings { + proof, + proof_sym, + evaluations: gather(reverse_index(index, domain_size)), + evaluations_sym: gather(reverse_index(index_sym, domain_size)), + } + } + /// Open the deep composition polynomial on a list of indexes and their symmetric elements. fn open_deep_composition_poly( domain: &Domain, @@ -1638,7 +1670,28 @@ pub trait IsStarkProver< let num_precomputed_cols = main_commit.num_precomputed_cols; let total_cols = lde_trace.num_main_cols(); - for index in indexes_to_open.iter() { + // R4 main-trace proofs from the resident device tree, when present: + // gathered in one batch over all query positions (`c*2`, `c*2+1` per + // query) instead of walking the host tree. Byte-identical to the host + // proofs (guarded by the `merkle_gather` parity test). Only the + // non-preprocessed main carries a device tree today; on any miss this is + // `None` and openings fall back to the host tree below. + #[cfg(feature = "cuda")] + let main_dev_proofs: Option>> = (!is_preprocessed) + .then(|| { + let tree = lde_trace.gpu_main()?.tree.as_ref()?; + let stream = lde_trace.bound_stream()?; + let positions: Vec = indexes_to_open + .iter() + .flat_map(|&c| [c * 2, c * 2 + 1]) + .collect(); + crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream) + }) + .flatten(); + + for (qi, index) in indexes_to_open.iter().enumerate() { + #[cfg(not(feature = "cuda"))] + let _ = qi; // For preprocessed tables, open the main split (multiplicities only); // for normal tables, open all main columns. let main_trace_opening = if is_preprocessed { @@ -1646,9 +1699,28 @@ pub trait IsStarkProver< lde_trace.gather_main_row_range(row, num_precomputed_cols, total_cols) }) } else { - Self::open_polys_with(domain, &main_commit.tree, *index, |row| { - lde_trace.gather_main_row(row) - }) + #[cfg(feature = "cuda")] + { + if let Some(proofs) = &main_dev_proofs { + Self::open_polys_with_proofs( + domain, + proofs[qi * 2].clone(), + proofs[qi * 2 + 1].clone(), + *index, + |row| lde_trace.gather_main_row(row), + ) + } else { + Self::open_polys_with(domain, &main_commit.tree, *index, |row| { + lde_trace.gather_main_row(row) + }) + } + } + #[cfg(not(feature = "cuda"))] + { + Self::open_polys_with(domain, &main_commit.tree, *index, |row| { + lde_trace.gather_main_row(row) + }) + } }; // For preprocessed tables, also open the precomputed-columns tree. From b8113370fffbf3393e09ced900139f778655231b Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 17:52:23 -0300 Subject: [PATCH 08/17] merkle tree --- crypto/crypto/src/merkle_tree/merkle.rs | 14 ++ crypto/math-cuda/src/lde.rs | 104 ++++++++--- crypto/math-cuda/src/merkle.rs | 37 +++- crypto/math-cuda/tests/barycentric_strided.rs | 1 + crypto/math-cuda/tests/deep.rs | 1 + crypto/stark/src/gpu_lde.rs | 68 +++----- crypto/stark/src/prover.rs | 164 +++++++++++++++--- 7 files changed, 296 insertions(+), 93 deletions(-) diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs index f00985d39..2060a0f18 100644 --- a/crypto/crypto/src/merkle_tree/merkle.rs +++ b/crypto/crypto/src/merkle_tree/merkle.rs @@ -168,6 +168,20 @@ where }) } + /// Create a root-only Merkle tree placeholder: stores the commitment root + /// but no nodes. Used when the tree's authentication paths are gathered from + /// a device-resident copy (GPU) instead of this host tree, so the host nodes + /// are never materialised (saving the full-tree Device→Host copy). + /// [`get_proof_by_pos`](Self::get_proof_by_pos) must NOT be called on it. + pub fn from_root(root: B::Node) -> Self { + MerkleTree { + root, + nodes: Vec::new(), + #[cfg(feature = "disk-spill")] + mmap_backing: None, + } + } + /// Create a Merkle tree from pre-hashed leaf nodes. /// /// This skips the `hash_leaves` step, useful when leaves have already been diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index f65d17a35..8fc309ae5 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -240,6 +240,10 @@ pub struct GpuLdeExt3 { pub buf: Arc>, pub m: usize, pub lde_size: usize, + /// Optionally the aux/composition Merkle tree kept resident on device (the + /// keep path), so R4 openings gather paths on device instead of D2H'ing the + /// whole tree. `None` on the CPU path or when not retained. + pub tree: Option, } /// Handle to a Merkle tree kept live on device after a commit, so query @@ -252,6 +256,9 @@ pub struct GpuLdeExt3 { pub struct GpuMerkleTree { pub nodes: Arc>, pub leaves_len: usize, + /// The Merkle root (node 0), copied to host at build time (32 bytes) so the + /// commitment is available without D2H'ing the whole tree. + pub root: [u8; 32], } pub fn coset_lde_base(evals: &[u64], blowup_factor: usize, weights: &[u64]) -> Result> { @@ -628,7 +635,7 @@ pub fn coset_lde_batch_base_into_with_leaf_hash( blowup_factor, weights, outputs, - hashed_leaves_out, + Some(hashed_leaves_out), KeccakCommit::LeavesOnly, false, false, @@ -656,7 +663,7 @@ pub fn coset_lde_batch_base_into_with_merkle_tree( blowup_factor, weights, outputs, - merkle_nodes_out, + Some(merkle_nodes_out), KeccakCommit::FullTree, false, false, @@ -664,26 +671,25 @@ pub fn coset_lde_batch_base_into_with_merkle_tree( .map(|_| ()) } -/// Fused LDE + leaf-hash + Merkle tree build. If `keep_device_buf` is true, -/// returns an `Arc>` wrapping the LDE device buffer so callers -/// (R2–R4 GPU paths) can reuse the LDE without a re-H2D. +/// Fused LDE + leaf-hash + Merkle tree build, keeping **both** the LDE buffer +/// and the Merkle tree resident on device (returned in `GpuLdeBase` + its +/// `tree`, including the root). The host tree is **not** materialised — there is +/// no `merkle_nodes_out` — so the whole-tree Device→Host copy is eliminated. +/// Callers gather query authentication paths from the device tree instead +/// (`crate::merkle::gather_merkle_paths_dev`) and use a root-only host tree for +/// the commitment. pub fn coset_lde_batch_base_into_with_merkle_tree_keep( columns: &[&[u64]], blowup_factor: usize, weights: &[u64], outputs: &mut [&mut [u64]], - merkle_nodes_out: &mut [u8], ) -> Result { - // `keep_tree = true`: the main-trace tree nodes are retained on device - // (inside the returned `GpuLdeBase.tree`) so R4 openings can gather paths on - // device. The host `merkle_nodes_out` is still filled, so host-tree openings - // keep working until a later step drops the host tree. let opt = coset_lde_batch_base_into_with_merkle_tree_inner( columns, blowup_factor, weights, outputs, - merkle_nodes_out, + None, KeccakCommit::FullTree, true, true, @@ -698,7 +704,7 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( blowup_factor: usize, weights: &[u64], outputs: &mut [&mut [u64]], - nodes_out: &mut [u8], + nodes_out: Option<&mut [u8]>, commit: KeccakCommit, keep_device_buf: bool, keep_tree: bool, @@ -726,7 +732,9 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( assert_eq!(o.len(), lde_size); } let nodes_dev_bytes = commit.total_nodes_bytes(lde_size); - assert_eq!(nodes_out.len(), nodes_dev_bytes); + if let Some(no) = &nodes_out { + assert_eq!(no.len(), nodes_dev_bytes); + } let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; @@ -832,9 +840,18 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; } - // D2H the LDE and the tree/leaves nodes via pinned staging. + // D2H the LDE columns via pinned staging (still read on host by constraint + // eval / openings). The full tree nodes are D2H'd only when the caller asks + // for a host tree (`nodes_out`); the device-resident path passes `None` and + // keeps the tree on device, eliminating the whole-tree D2H. stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?; - d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, nodes_out)?; + if let Some(no) = nodes_out { + d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, no)?; + } + // Ensure the (async) LDE D2H above has landed before reading `pinned`. + // Previously the tree D2H's `synchronize` covered this; the device-resident + // path skips that D2H, so synchronize explicitly. + stream.synchronize()?; // Copy pinned into caller outputs. Runs under the pinned-staging lock, // where rayon can deadlock. See `Backend::pinned_staging`. @@ -847,9 +864,15 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( // Retain the device tree nodes for on-device opening (FullTree only; in // LeavesOnly mode there are no inner nodes to gather paths from). let tree = if keep_tree && commit == KeccakCommit::FullTree { + // Copy just the 32-byte root (node 0) so the commitment is available + // without D2H'ing the whole tree. + let mut root = [0u8; 32]; + stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?; + stream.synchronize()?; Some(GpuMerkleTree { nodes: Arc::new(nodes_dev), leaves_len: lde_size, + root, }) } else { drop(nodes_dev); @@ -885,9 +908,10 @@ pub fn coset_lde_batch_ext3_into_with_leaf_hash( blowup_factor, weights, outputs, - hashed_leaves_out, + Some(hashed_leaves_out), KeccakCommit::LeavesOnly, false, + false, ) .map(|_| ()) } @@ -909,23 +933,24 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree( blowup_factor, weights, outputs, - merkle_nodes_out, + Some(merkle_nodes_out), KeccakCommit::FullTree, false, + false, ) .map(|_| ()) } -/// Ext3 variant of [`coset_lde_batch_base_into_with_merkle_tree_keep`] — -/// returns an `Arc>` handle to the de-interleaved LDE device -/// buffer. +/// Ext3 variant of [`coset_lde_batch_base_into_with_merkle_tree_keep`]: keeps +/// both the de-interleaved LDE buffer and the Merkle tree (with root) resident +/// on device. The host tree is **not** materialised (no `merkle_nodes_out`), so +/// the whole-tree D2H is eliminated; openings gather paths from the device tree. pub fn coset_lde_batch_ext3_into_with_merkle_tree_keep( columns: &[&[u64]], n: usize, blowup_factor: usize, weights: &[u64], outputs: &mut [&mut [u64]], - merkle_nodes_out: &mut [u8], ) -> Result { let opt = coset_lde_batch_ext3_into_with_merkle_tree_inner( columns, @@ -933,9 +958,10 @@ pub fn coset_lde_batch_ext3_into_with_merkle_tree_keep( blowup_factor, weights, outputs, - merkle_nodes_out, + None, KeccakCommit::FullTree, true, + true, )?; Ok(opt.expect("keep_device_buf=true must return Some")) } @@ -947,9 +973,10 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( blowup_factor: usize, weights: &[u64], outputs: &mut [&mut [u64]], - nodes_out: &mut [u8], + nodes_out: Option<&mut [u8]>, commit: KeccakCommit, keep_device_buf: bool, + keep_tree: bool, ) -> Result> { if columns.is_empty() { assert_eq!(outputs.len(), 0); @@ -976,7 +1003,9 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( assert_eq!(o.len(), 3 * lde_size); } let nodes_dev_bytes = commit.total_nodes_bytes(lde_size); - assert_eq!(nodes_out.len(), nodes_dev_bytes); + if let Some(no) = &nodes_out { + assert_eq!(no.len(), nodes_dev_bytes); + } let log_n = n.trailing_zeros() as u64; let log_lde = lde_size.trailing_zeros() as u64; @@ -1074,21 +1103,43 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; } - // D2H LDE (mb * lde_size u64) and tree/leaves nodes. + // D2H the LDE columns. The full tree nodes are D2H'd only when the caller + // asks for a host tree (`nodes_out`); the device-resident path passes `None` + // and keeps the tree on device, eliminating the whole-tree D2H. stream.memcpy_dtoh(&buf, &mut pinned[..mb * lde_size])?; - d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, nodes_out)?; + if let Some(no) = nodes_out { + d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, no)?; + } + // Ensure the (async) LDE D2H has landed before reading `pinned` (the tree + // D2H's synchronize used to cover this; the device-resident path skips it). + stream.synchronize()?; unpack_pinned_slabs_to_ext3(pinned, outputs, lde_size); drop(staging); if keep_device_buf { + let tree = if keep_tree && commit == KeccakCommit::FullTree { + let mut root = [0u8; 32]; + stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?; + stream.synchronize()?; + Some(GpuMerkleTree { + nodes: Arc::new(nodes_dev), + leaves_len: lde_size, + root, + }) + } else { + drop(nodes_dev); + None + }; Ok(Some(GpuLdeExt3 { buf: Arc::new(buf), m, lde_size, + tree, })) } else { drop(buf); + drop(nodes_dev); Ok(None) } } @@ -1276,6 +1327,7 @@ fn evaluate_poly_coset_batch_ext3_into_inner( buf: std::sync::Arc::new(buf), m, lde_size, + tree: None, })) } else { drop(buf); diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index 1c1e54093..cb3758ff6 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -279,7 +279,13 @@ pub fn gather_merkle_paths_dev( /// /// Returns `(2*(lde_size/2) - 1) * 32` bytes of tree nodes in the standard /// layout (root at byte offset 0, leaves in the tail). -pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Result> { +/// Build the composition-poly Merkle tree on device (leaves hash row-pairs, so +/// `num_leaves = lde_size / 2`). Returns the device node buffer, the leaf count, +/// and the stream it was built on. Shared by the host-D2H and device-keep +/// wrappers below. +fn build_comp_poly_tree_nodes_dev( + parts_interleaved: &[&[u64]], +) -> Result<(CudaSlice, usize, Arc)> { assert!(!parts_interleaved.is_empty()); let m = parts_interleaved.len(); let ext3_elems = parts_interleaved[0].len() / 3; @@ -308,9 +314,13 @@ pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Res pack_ext3_to_pinned_slabs(parts_interleaved, pinned, lde_size); - // H2D the de-interleaved parts. + // H2D the de-interleaved parts, then release the staging lock (the kernels + // below read the device `buf`, not `pinned`). Synchronize first so the + // async H2D has consumed `pinned` before it is freed/reused. let mut buf = stream.alloc_zeros::(mb * lde_size)?; stream.memcpy_htod(&pinned[..mb * lde_size], &mut buf)?; + stream.synchronize()?; + drop(staging); // Leaves into tail of a tight node buffer. let mut nodes_dev = unsafe { stream.alloc::(tight_total_nodes * 32) }?; @@ -337,13 +347,34 @@ pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Res } build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, num_leaves)?; + Ok((nodes_dev, num_leaves, stream)) +} +pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Result> { + let (nodes_dev, _num_leaves, stream) = build_comp_poly_tree_nodes_dev(parts_interleaved)?; let out = stream.clone_dtoh(&nodes_dev)?; stream.synchronize()?; - drop(staging); Ok(out) } +/// Like [`build_comp_poly_tree_from_evals_ext3`] but keeps the tree nodes on +/// device (returned as a [`crate::lde::GpuMerkleTree`] with its root), so R4 +/// composition openings gather authentication paths on device instead of +/// D2H'ing the whole tree. `leaves_len = lde_size / 2` (row-pair leaves). +pub fn build_comp_poly_tree_from_evals_ext3_keep( + parts_interleaved: &[&[u64]], +) -> Result { + let (nodes_dev, num_leaves, stream) = build_comp_poly_tree_nodes_dev(parts_interleaved)?; + let mut root = [0u8; 32]; + stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?; + stream.synchronize()?; + Ok(crate::lde::GpuMerkleTree { + nodes: Arc::new(nodes_dev), + leaves_len: num_leaves, + root, + }) +} + /// Build a FRI-layer Merkle tree on device from an interleaved ext3 eval /// vector. Each leaf hashes two consecutive ext3 values. `num_leaves = /// evals.len() / 6` (since each ext3 is 3 u64s). diff --git a/crypto/math-cuda/tests/barycentric_strided.rs b/crypto/math-cuda/tests/barycentric_strided.rs index b1963522f..377a2b531 100644 --- a/crypto/math-cuda/tests/barycentric_strided.rs +++ b/crypto/math-cuda/tests/barycentric_strided.rs @@ -106,6 +106,7 @@ fn run_ext3(log_trace: u32, blowup: usize, num_cols: usize, seed: u64) { buf: Arc::new(lde_dev), m: num_cols, lde_size, + tree: None, }; // Pre-strided buffer for non-strided reference. diff --git a/crypto/math-cuda/tests/deep.rs b/crypto/math-cuda/tests/deep.rs index 57ea14d3c..6ab63be10 100644 --- a/crypto/math-cuda/tests/deep.rs +++ b/crypto/math-cuda/tests/deep.rs @@ -184,6 +184,7 @@ fn run_parity( buf: Arc::new(aux_dev), m: num_aux, lde_size, + tree: None, }) } else { drop(aux_dev); diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index f6c85553e..635ff815f 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -452,11 +452,13 @@ pub fn gpu_leaf_hash_calls() -> u64 { GPU_LEAF_HASH_CALLS.load(Ordering::Relaxed) } -/// Fused base-field path: LDE + Keccak-256 leaf hash + Merkle tree build, -/// all on device, with the LDE buffer retained for R2–R4 GPU reuse. On -/// success: `columns[c]` is resized to `lde_size` with the LDE output, and -/// the returned `(tree, GpuLdeBase)` pair is the host-side tree plus a -/// device-resident handle to the LDE buffer. +/// Fused base-field path: LDE + Keccak-256 leaf hash + Merkle tree build, all +/// on device, keeping **both** the LDE buffer and the Merkle tree resident on +/// device. On success: `columns[c]` is resized to `lde_size` with the LDE +/// output, and the returned `GpuLdeBase` carries the device LDE buffer plus the +/// device tree (`.tree`). The returned `MerkleTree` is **root-only** — the host +/// tree nodes are never materialised (no whole-tree D2H); query openings gather +/// authentication paths from the device tree via [`gather_proofs_dev`]. pub(crate) fn try_expand_leaf_and_tree_batched_keep( columns: &mut [Vec>], blowup_factor: usize, @@ -471,11 +473,8 @@ where LayoutDispatch::Empty | LayoutDispatch::Skip => return None, LayoutDispatch::Run { n, lde_size } => (n, lde_size), }; + let _ = lde_size; let num_columns = columns.len(); - let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?; - let node_byte_len = total_nodes - .checked_mul(32) - .expect("node byte length overflow"); // SAFETY: layout-checked above. let raw_columns = unsafe { columns_to_u64_base::(columns) }; @@ -488,14 +487,11 @@ where let handle_result = { let mut raw_outputs = unsafe { presize_and_view_base::(columns, lde_size) }; - let nodes_bytes: &mut [u8] = - unsafe { from_raw_parts_mut(nodes.as_mut_ptr() as *mut u8, node_byte_len) }; math_cuda::lde::coset_lde_batch_base_into_with_merkle_tree_keep( &slices, blowup_factor, &weights_u64, &mut raw_outputs, - nodes_bytes, ) }; let handle = match handle_result { @@ -506,7 +502,10 @@ where } }; - let tree = MerkleTree::::from_precomputed_nodes(nodes)?; + // Root-only host tree: the device tree (`handle.tree`) holds the nodes and + // serves openings; only the commitment root lives on host. + let root = handle.tree.as_ref()?.root; + let tree = MerkleTree::::from_root(root); Some((tree, handle)) } @@ -528,11 +527,8 @@ where LayoutDispatch::Empty | LayoutDispatch::Skip => return None, LayoutDispatch::Run { n, lde_size } => (n, lde_size), }; + let _ = lde_size; let num_columns = columns.len(); - let (mut nodes, total_nodes) = alloc_merkle_nodes(lde_size)?; - let node_byte_len = total_nodes - .checked_mul(32) - .expect("node byte length overflow"); // SAFETY: layout-checked above. let raw_columns = unsafe { columns_to_u64_ext3::(columns) }; @@ -545,15 +541,12 @@ where let handle_result = { let mut raw_outputs = unsafe { presize_and_view_ext3::(columns, lde_size) }; - let nodes_bytes: &mut [u8] = - unsafe { from_raw_parts_mut(nodes.as_mut_ptr() as *mut u8, node_byte_len) }; math_cuda::lde::coset_lde_batch_ext3_into_with_merkle_tree_keep( &slices, n, blowup_factor, &weights_u64, &mut raw_outputs, - nodes_bytes, ) }; let handle = match handle_result { @@ -564,7 +557,10 @@ where } }; - let tree = MerkleTree::::from_precomputed_nodes(nodes)?; + // Root-only host tree: the device tree (`handle.tree`) holds the nodes and + // serves openings; only the commitment root lives on host. + let root = handle.tree.as_ref()?.root; + let tree = MerkleTree::::from_root(root); Some((tree, handle)) } @@ -742,7 +738,7 @@ where /// recomputes on CPU. pub(crate) fn try_build_comp_poly_tree_gpu( lde_parts: &[Vec>], -) -> Option> +) -> Option<(MerkleTree, math_cuda::lde::GpuMerkleTree)> where E: IsField + 'static, B: IsMerkleTreeBackend, @@ -775,29 +771,17 @@ where }) .collect(); - let nodes_bytes = match math_cuda::merkle::build_comp_poly_tree_from_evals_ext3(&raw_parts) { - Ok(v) => v, + // Keep the composition tree resident on device; the whole-tree D2H is + // eliminated. R4 composition openings gather paths from the device tree + // (`gather_proofs_dev`); the returned host tree is root-only. + let dev_tree = match math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&raw_parts) { + Ok(t) => t, Err(_) => return None, }; - - // lde_size is an even power of two >= 2, so 2*num_leaves == lde_size and - // tight_total_nodes = lde_size - 1 >= 1. No overflow or underflow possible. - let tight_total_nodes = lde_size - 1; - let expected_byte_len = tight_total_nodes - .checked_mul(32) - .expect("comp-poly node byte length overflow"); - debug_assert_eq!(nodes_bytes.len(), expected_byte_len); - - let nodes: Vec<[u8; 32]> = nodes_bytes - .chunks_exact(32) - .map(|c| { - c.try_into() - .expect("chunks_exact(32) yields exactly 32 bytes") - }) - .collect(); + debug_assert_eq!(dev_tree.leaves_len, lde_size / 2); GPU_COMP_POLY_TREE_CALLS.fetch_add(1, Ordering::Relaxed); - // Falls back to CPU on `None`, matching the R1 paths (lines 496, 557). - MerkleTree::::from_precomputed_nodes(nodes) + let host = MerkleTree::::from_root(dev_tree.root); + Some((host, dev_tree)) } /// R3 GPU dispatch: batched strided barycentric OOD evaluation over the main diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 25d63cd2d..c2702fd00 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -390,6 +390,12 @@ where pub(crate) composition_poly_merkle_tree: BatchedMerkleTree, /// The commitment to the composition polynomial parts. pub(crate) composition_poly_root: Commitment, + /// The composition-poly Merkle tree kept resident on device (when the R2 + /// GPU tree path ran), so R4 openings gather authentication paths on device + /// instead of walking a host tree. When set, `composition_poly_merkle_tree` + /// is a root-only placeholder. `None` on the CPU path. + #[cfg(feature = "cuda")] + pub(crate) gpu_composition_tree: Option, } /// A container for the results of the third round of the STARK Prove protocol. @@ -1133,22 +1139,31 @@ pub trait IsStarkProver< let t_sub = Instant::now(); // GPU fast path for the comp-poly Merkle commit: row-pair Keccak // leaves + device-side inner tree, both wrapping the host eval Vecs. + // GPU path keeps the composition tree resident on device (no whole-tree + // D2H) and returns a root-only host tree; the device tree is threaded to + // R4 in `Round2.gpu_composition_tree`. #[cfg(feature = "cuda")] - let gpu_tree = crate::gpu_lde::try_build_comp_poly_tree_gpu::< - FieldExtension, - BatchedMerkleTreeBackend, - >(&lde_composition_poly_parts_evaluations); + let (composition_poly_merkle_tree, composition_poly_root, gpu_composition_tree) = + match crate::gpu_lde::try_build_comp_poly_tree_gpu::< + FieldExtension, + BatchedMerkleTreeBackend, + >(&lde_composition_poly_parts_evaluations) + { + Some((host_tree, dev_tree)) => { + let root = host_tree.root; + (host_tree, root, Some(dev_tree)) + } + None => { + let (tree, root) = + Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations) + .ok_or(ProvingError::EmptyCommitment)?; + (tree, root, None) + } + }; #[cfg(not(feature = "cuda"))] - let gpu_tree: Option> = None; - - let (composition_poly_merkle_tree, composition_poly_root) = match gpu_tree { - Some(tree) => { - let root = tree.root; - (tree, root) - } - None => Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations) - .ok_or(ProvingError::EmptyCommitment)?, - }; + let (composition_poly_merkle_tree, composition_poly_root) = + Self::commit_composition_polynomial(&lde_composition_poly_parts_evaluations) + .ok_or(ProvingError::EmptyCommitment)?; #[cfg(feature = "instruments")] let merkle_dur = t_sub.elapsed(); @@ -1167,6 +1182,8 @@ pub trait IsStarkProver< lde_composition_poly_evaluations: lde_composition_poly_parts_evaluations, composition_poly_merkle_tree, composition_poly_root, + #[cfg(feature = "cuda")] + gpu_composition_tree, }) } @@ -1595,6 +1612,47 @@ pub trait IsStarkProver< } } + /// Like [`Self::open_composition_poly`] but uses a Merkle proof already + /// gathered from the resident device composition tree + /// ([`crate::gpu_lde::gather_proofs_dev`]) instead of walking a host tree. + /// Same single-position opening (`proof_sym == proof`); evaluations come + /// from the host eval Vecs as before. + #[cfg(feature = "cuda")] + fn open_composition_poly_with_proof( + proof: Proof, + lde_composition_poly_evaluations: &[Vec>], + index: usize, + ) -> PolynomialOpenings + where + FieldElement: AsBytes + Sync + Send, + FieldElement: AsBytes + Sync + Send, + { + let lde_composition_poly_parts_evaluation: Vec<_> = lde_composition_poly_evaluations + .iter() + .flat_map(|part| { + vec![ + part[reverse_index(index * 2, part.len() as u64)].clone(), + part[reverse_index(index * 2 + 1, part.len() as u64)].clone(), + ] + }) + .collect(); + + PolynomialOpenings { + proof: proof.clone(), + proof_sym: proof, + evaluations: lde_composition_poly_parts_evaluation + .clone() + .into_iter() + .step_by(2) + .collect(), + evaluations_sym: lde_composition_poly_parts_evaluation + .into_iter() + .skip(1) + .step_by(2) + .collect(), + } + } + /// Computes values and validity proofs of the evaluations of trace polynomials at /// the FRI query challenge `challenge` and its symmetric counterpart. The caller /// supplies a `gather` closure that pulls the row data from the column-major LDE @@ -1689,6 +1747,28 @@ pub trait IsStarkProver< }) .flatten(); + // Same for the aux-trace tree, when it is device-resident. + #[cfg(feature = "cuda")] + let aux_dev_proofs: Option>> = + round_1_result.aux.as_ref().and_then(|_aux| { + let tree = lde_trace.gpu_aux()?.tree.as_ref()?; + let stream = lde_trace.bound_stream()?; + let positions: Vec = indexes_to_open + .iter() + .flat_map(|&c| [c * 2, c * 2 + 1]) + .collect(); + crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream) + }); + + // Composition tree: openings open a single position `index` (row-pair + // leaf), so gather one proof per query challenge from the device tree. + #[cfg(feature = "cuda")] + let comp_dev_proofs: Option>> = + round_2_result.gpu_composition_tree.as_ref().and_then(|tree| { + let stream = lde_trace.bound_stream()?; + crate::gpu_lde::gather_proofs_dev(tree, indexes_to_open, &stream) + }); + for (qi, index) in indexes_to_open.iter().enumerate() { #[cfg(not(feature = "cuda"))] let _ = qi; @@ -1730,16 +1810,56 @@ pub trait IsStarkProver< }) }); - let composition_openings = Self::open_composition_poly( - &round_2_result.composition_poly_merkle_tree, - &round_2_result.lde_composition_poly_evaluations, - *index, - ); + let composition_openings = { + #[cfg(feature = "cuda")] + { + if let Some(proofs) = &comp_dev_proofs { + Self::open_composition_poly_with_proof( + proofs[qi].clone(), + &round_2_result.lde_composition_poly_evaluations, + *index, + ) + } else { + Self::open_composition_poly( + &round_2_result.composition_poly_merkle_tree, + &round_2_result.lde_composition_poly_evaluations, + *index, + ) + } + } + #[cfg(not(feature = "cuda"))] + { + Self::open_composition_poly( + &round_2_result.composition_poly_merkle_tree, + &round_2_result.lde_composition_poly_evaluations, + *index, + ) + } + }; let aux_trace_polys = round_1_result.aux.as_ref().map(|aux| { - Self::open_polys_with(domain, &aux.tree, *index, |row| { - lde_trace.gather_aux_row(row) - }) + #[cfg(feature = "cuda")] + { + if let Some(proofs) = &aux_dev_proofs { + Self::open_polys_with_proofs( + domain, + proofs[qi * 2].clone(), + proofs[qi * 2 + 1].clone(), + *index, + |row| lde_trace.gather_aux_row(row), + ) + } else { + Self::open_polys_with(domain, &aux.tree, *index, |row| { + lde_trace.gather_aux_row(row) + }) + } + } + #[cfg(not(feature = "cuda"))] + { + Self::open_polys_with(domain, &aux.tree, *index, |row| { + lde_trace.gather_aux_row(row) + }) + } }); openings.push(DeepPolynomialOpening { From 819ad015e1b8fba4c12a981c9e4df541988938f4 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 18:16:12 -0300 Subject: [PATCH 09/17] finish merkle --- crypto/math-cuda/src/fri.rs | 20 ++++--- crypto/stark/src/fri/fri_commitment.rs | 7 +++ crypto/stark/src/fri/mod.rs | 8 +++ crypto/stark/src/gpu_lde.rs | 78 +++++++++++++++++++++----- 4 files changed, 90 insertions(+), 23 deletions(-) diff --git a/crypto/math-cuda/src/fri.rs b/crypto/math-cuda/src/fri.rs index edd359b1b..e4c60a8a8 100644 --- a/crypto/math-cuda/src/fri.rs +++ b/crypto/math-cuda/src/fri.rs @@ -98,7 +98,7 @@ impl FriCommitState { pub fn fold_and_commit_layer( &mut self, zeta_raw: [u64; 3], - ) -> Result<(Vec, Vec, Vec)> { + ) -> Result<(Vec, crate::lde::GpuMerkleTree)> { #[cfg(feature = "test-faults")] check_fault_injection()?; let be = backend()?; @@ -214,17 +214,21 @@ impl FriCommitState { self.stream.clone_dtoh(&view)? }; - // Tree nodes. - let nodes_bytes: Vec = self.stream.clone_dtoh(&nodes_dev)?; - debug_assert_eq!(nodes_bytes.len(), tight_total_nodes * 32); - - let mut root = vec![0u8; 32]; - root.copy_from_slice(&nodes_bytes[0..32]); + // Keep the layer tree resident on device; copy only the 32-byte root so + // R4 query openings gather paths on device instead of D2H'ing the tree. + let mut root = [0u8; 32]; + self.stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?; + self.stream.synchronize()?; self.a_is_input = !self.a_is_input; self.current_n = n_out; - Ok((root, layer_evals, nodes_bytes)) + let tree = crate::lde::GpuMerkleTree { + nodes: std::sync::Arc::new(nodes_dev), + leaves_len: num_leaves, + root, + }; + Ok((layer_evals, tree)) } /// Final fold, no Merkle commit. Returns the single ext3 output diff --git a/crypto/stark/src/fri/fri_commitment.rs b/crypto/stark/src/fri/fri_commitment.rs index 831471761..9f950c27e 100644 --- a/crypto/stark/src/fri/fri_commitment.rs +++ b/crypto/stark/src/fri/fri_commitment.rs @@ -13,6 +13,11 @@ where { pub evaluation: Vec>, pub merkle_tree: MerkleTree, + /// The layer's Merkle tree kept resident on device (GPU FRI commit path), + /// so R4 query openings gather authentication paths on device. When set, + /// `merkle_tree` is a root-only placeholder. `None` on the CPU path. + #[cfg(feature = "cuda")] + pub gpu_tree: Option, } impl FriLayer @@ -25,6 +30,8 @@ where Self { evaluation: evaluation.to_vec(), merkle_tree, + #[cfg(feature = "cuda")] + gpu_tree: None, } } } diff --git a/crypto/stark/src/fri/mod.rs b/crypto/stark/src/fri/mod.rs index 60ad2a398..181c27380 100644 --- a/crypto/stark/src/fri/mod.rs +++ b/crypto/stark/src/fri/mod.rs @@ -117,6 +117,14 @@ pub fn query_phase( where FieldElement: AsBytes + Sync + Send, { + // GPU fast path: gather every layer's authentication paths on device (the + // layer trees stay resident from the GPU commit). Falls back to the host + // walk below if any layer lacks a device tree. + #[cfg(feature = "cuda")] + if let Some(decommits) = crate::gpu_lde::try_fri_query_phase_gpu::(fri_layers, iotas) { + return decommits; + } + if !fri_layers.is_empty() { let num_layers = fri_layers.len(); iotas diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 635ff815f..0dd83ab2d 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -27,6 +27,7 @@ use math::traits::AsBytes; use crate::config::{Commitment, FriLayerMerkleTreeBackend}; use crate::domain::Domain; use crate::fri::fri_commitment::FriLayer; +use crate::fri::fri_decommit::FriDecommitment; use crate::fri::fri_functions::compute_coset_twiddles_inv; use crate::trace::LDETraceTable; @@ -1620,7 +1621,7 @@ where let zeta_ptr = &zeta as *const FieldElement as *const u64; let zeta_raw: [u64; 3] = unsafe { [*zeta_ptr, *zeta_ptr.add(1), *zeta_ptr.add(2)] }; - let (root, layer_evals_u64, nodes_bytes) = match state.fold_and_commit_layer(zeta_raw) { + let (layer_evals_u64, dev_tree) = match state.fold_and_commit_layer(zeta_raw) { Ok(v) => v, Err(_) => { *transcript = transcript_snapshot.clone(); @@ -1628,23 +1629,18 @@ where } }; - // Build the FriLayer: ext3 evals + Merkle tree from precomputed nodes. + // Build the FriLayer: ext3 evals + a root-only host tree (the layer tree + // stays resident on device in `gpu_tree`; query openings gather paths + // from it via `gather_proofs_dev`). let evaluation = u64_to_ext3_vec::(&layer_evals_u64); - - debug_assert!(nodes_bytes.len().is_multiple_of(32)); - let nodes: Vec<[u8; 32]> = nodes_bytes - .chunks_exact(32) - .map(|c| c.try_into().expect("chunks_exact(32) yields 32 bytes")) - .collect(); - let merkle_tree = MerkleTree::>::from_precomputed_nodes(nodes) - .expect("FRI commit: precomputed nodes form a valid tree"); - - fri_layer_list.push(FriLayer::new(&evaluation, merkle_tree)); + let root = dev_tree.root; + let merkle_tree = MerkleTree::>::from_root(root); + let mut layer = FriLayer::new(&evaluation, merkle_tree); + layer.gpu_tree = Some(dev_tree); + fri_layer_list.push(layer); // >>>> Send commitment: [p_k] - let mut root_arr = [0u8; 32]; - root_arr.copy_from_slice(&root); - transcript.append_bytes(&root_arr); + transcript.append_bytes(&root); } // <<<< Receive challenge zeta_{n-1} @@ -1671,3 +1667,55 @@ where GPU_FRI_CALLS.fetch_add(1, Ordering::Relaxed); Some((last_value, fri_layer_list)) } + +/// GPU FRI query phase: gather each layer's authentication paths on device +/// instead of walking host trees. For layer `l` and query `iota`, the opened +/// position is `(iota >> l) >> 1` — matching [`crate::fri::query_phase`]. Paths +/// for all queries are gathered in one batched call per layer. The layer +/// evaluations (`evaluation[index ^ 1]`) are read from the host Vecs as before. +/// +/// Returns `None` if there are no layers or any layer lacks a device tree (a +/// CPU-committed layer), so the caller falls back to the host walk. +pub(crate) fn try_fri_query_phase_gpu( + fri_layers: &[FriLayer>], + iotas: &[usize], +) -> Option>> +where + E: IsField, + FieldElement: AsBytes + Sync + Send, +{ + if fri_layers.is_empty() { + return None; + } + let stream = math_cuda::device::backend().ok()?.next_stream(); + let num_layers = fri_layers.len(); + + // Batched gather: one call per layer over all queries. + let mut per_layer_proofs: Vec>> = Vec::with_capacity(num_layers); + for (l, layer) in fri_layers.iter().enumerate() { + let tree = layer.gpu_tree.as_ref()?; + let positions: Vec = iotas.iter().map(|&iota| (iota >> l) >> 1).collect(); + per_layer_proofs.push(gather_proofs_dev(tree, &positions, &stream)?); + } + + // Reassemble per-query decommitments, matching the host walk's order. + let decommits = iotas + .iter() + .enumerate() + .map(|(q, &iota)| { + let mut layers_evaluations_sym = Vec::with_capacity(num_layers); + let mut layers_auth_paths = Vec::with_capacity(num_layers); + let mut index = iota; + for (l, layer) in fri_layers.iter().enumerate() { + layers_evaluations_sym.push(layer.evaluation[index ^ 1].clone()); + layers_auth_paths.push(per_layer_proofs[l][q].clone()); + index >>= 1; + } + FriDecommitment { + layers_auth_paths, + layers_evaluations_sym, + } + }) + .collect(); + Some(decommits) +} From 0fb8df6d26d399fafa25caa4223504e6a7739542 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 18:30:55 -0300 Subject: [PATCH 10/17] fix --- crypto/math-cuda/src/device.rs | 10 ++++- crypto/math-cuda/src/merkle.rs | 7 +++ crypto/stark/src/gpu_lde.rs | 21 +++++++-- crypto/stark/src/lib.rs | 10 ----- crypto/stark/src/prover.rs | 82 +++++++++++++++++++++++----------- 5 files changed, 90 insertions(+), 40 deletions(-) diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index 6b6094d45..ef99f05af 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -196,7 +196,15 @@ fn retain_default_mempool(ctx: &CudaContext) { { return; } - let threshold: u64 = u64::MAX; + // Default: retain freed stream-ordered blocks indefinitely (u64::MAX) + // for reuse. `LAMBDA_VM_MEMPOOL_RELEASE_MB` overrides the cap (bytes the + // pool keeps before returning memory to the OS) when retained-pool + // growth needs bounding. + let threshold: u64 = std::env::var("LAMBDA_VM_MEMPOOL_RELEASE_MB") + .ok() + .and_then(|s| s.parse::().ok()) + .map(|mb| mb.saturating_mul(1024 * 1024)) + .unwrap_or(u64::MAX); let _ = sys::cuMemPoolSetAttribute( pool, sys::CUmemPool_attribute_enum::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index cb3758ff6..000f6daa0 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -241,6 +241,13 @@ pub fn gather_merkle_paths_dev( "leaves_len must be a power of two >= 2" ); let depth = leaves_len.trailing_zeros() as usize; + // Guard the kernel's device reads: an out-of-range position would walk off + // the node buffer (OOB device read). Positions are valid by construction; + // this catches any caller bug before it becomes UB. + assert!( + positions.iter().all(|&p| (p as usize) < leaves_len), + "gather_merkle_paths_dev: leaf position >= leaves_len" + ); let be = backend()?; let pos_dev = stream.clone_htod(positions)?; diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 0dd83ab2d..115b9462a 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -1687,15 +1687,30 @@ where if fri_layers.is_empty() { return None; } - let stream = math_cuda::device::backend().ok()?.next_stream(); + // The GPU FRI commit sets `gpu_tree` on every layer as a group; the CPU + // commit sets none. If the layers are host trees, fall back to the host + // walk. If they're device-resident, the host trees are root-only — so the + // gather below MUST succeed (a failure is a hard abort, not a silent walk). + if fri_layers[0].gpu_tree.is_none() { + return None; + } + let stream = math_cuda::device::backend() + .expect("cuda backend for device-resident FRI query") + .next_stream(); let num_layers = fri_layers.len(); // Batched gather: one call per layer over all queries. let mut per_layer_proofs: Vec>> = Vec::with_capacity(num_layers); for (l, layer) in fri_layers.iter().enumerate() { - let tree = layer.gpu_tree.as_ref()?; + let tree = layer + .gpu_tree + .as_ref() + .expect("FRI layers are device-resident as a group"); let positions: Vec = iotas.iter().map(|&iota| (iota >> l) >> 1).collect(); - per_layer_proofs.push(gather_proofs_dev(tree, &positions, &stream)?); + per_layer_proofs.push( + gather_proofs_dev(tree, &positions, &stream) + .expect("device FRI-layer gather failed; resident tree has no host fallback"), + ); } // Reassemble per-query decommitments, matching the host walk's order. diff --git a/crypto/stark/src/lib.rs b/crypto/stark/src/lib.rs index 25ce91569..e9f6a1cda 100644 --- a/crypto/stark/src/lib.rs +++ b/crypto/stark/src/lib.rs @@ -3,16 +3,6 @@ #[cfg(all(target_arch = "wasm32", feature = "disk-spill"))] compile_error!("the `disk-spill` feature requires memmap2, which does not compile on wasm32"); -/// Open a wall-clock profiling span (no-op unless the `instruments` feature is on). -/// RAII: records elapsed wall time when the binding drops at end of scope. -#[macro_export] -macro_rules! prof_span { - ($label:expr) => { - #[cfg(feature = "instruments")] - let _prof_span = $crate::instruments::span($label); - }; -} - #[cfg(feature = "debug-checks")] pub mod bus_debug; pub mod constraints; diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index c2702fd00..4bc0653fc 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -327,21 +327,29 @@ pub fn table_parallelism() -> usize { /// Heuristic peak device working-set for one table, in bytes. /// -/// Counts the LDE columns that are co-resident on the GPU — `main` in the base -/// field (8 B) and `aux` in the ext3 field (24 B) — times a scratch multiplier -/// for the Merkle / NTT / composition transients allocated alongside them. It -/// is deliberately a conservative over-estimate: it gates a safety ceiling, not -/// a precise allocator. Pass `aux_cols == 0` for phases where the aux LDE is -/// not yet resident (the R1 main commit). +/// Two contributions: +/// 1. **LDE columns** co-resident on the GPU — `main` in the base field (8 B) +/// and `aux` in the ext3 field (24 B) — times a scratch multiplier for the +/// NTT / leaf-hash transients allocated alongside them. +/// 2. **Resident Merkle trees** — main, aux, composition, and FRI-layer trees +/// are now kept on device R1→R4 (no whole-tree D2H). Each full tree is +/// `~2*lde_size` nodes × 32 B = `64*lde_size`; co-resident at the R4 peak +/// they sum to a few × that, so `~256 B × lde_size` covers them conservatively. +/// +/// It is deliberately a conservative over-estimate: it gates a safety ceiling, +/// not a precise allocator. Pass `aux_cols == 0` for phases where the aux LDE +/// is not yet resident (the R1 main commit). fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) -> u64 { const BYTES_PER_BASE: u64 = 8; const EXT3_BYTES: u64 = 24; const SCRATCH_FACTOR: u64 = 2; + const RESIDENT_TREE_BYTES_PER_LDE: u64 = 256; + let lde = lde_size as u64; let per_row = (main_cols as u64).saturating_mul(BYTES_PER_BASE) + (aux_cols as u64).saturating_mul(EXT3_BYTES); - (lde_size as u64) - .saturating_mul(per_row) - .saturating_mul(SCRATCH_FACTOR) + let lde_term = lde.saturating_mul(per_row).saturating_mul(SCRATCH_FACTOR); + let tree_term = lde.saturating_mul(RESIDENT_TREE_BYTES_PER_LDE); + lde_term.saturating_add(tree_term) } /// Plan contiguous table chunks for parallel proving. @@ -1734,39 +1742,61 @@ pub trait IsStarkProver< // proofs (guarded by the `merkle_gather` parity test). Only the // non-preprocessed main carries a device tree today; on any miss this is // `None` and openings fall back to the host tree below. + // `*_dev_proofs` is `Some` exactly when the corresponding tree is + // device-resident (so the host tree is a root-only placeholder). In that + // case the gather MUST succeed — there is no host tree to fall back to, + // so a gather error is a hard abort (not a silent walk of an empty + // tree). When the tree is *not* device-resident the value is `None` and + // the openings below walk the full host tree as usual. #[cfg(feature = "cuda")] - let main_dev_proofs: Option>> = (!is_preprocessed) - .then(|| { - let tree = lde_trace.gpu_main()?.tree.as_ref()?; - let stream = lde_trace.bound_stream()?; - let positions: Vec = indexes_to_open - .iter() - .flat_map(|&c| [c * 2, c * 2 + 1]) - .collect(); - crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream) - }) - .flatten(); + let main_dev_proofs: Option>> = if is_preprocessed { + None + } else { + lde_trace + .gpu_main() + .and_then(|h| h.tree.as_ref()) + .map(|tree| { + let stream = lde_trace + .bound_stream() + .expect("bound stream for device-resident main-tree opening"); + let positions: Vec = indexes_to_open + .iter() + .flat_map(|&c| [c * 2, c * 2 + 1]) + .collect(); + crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream).expect( + "device main-tree gather failed; resident tree has no host fallback", + ) + }) + }; // Same for the aux-trace tree, when it is device-resident. #[cfg(feature = "cuda")] - let aux_dev_proofs: Option>> = - round_1_result.aux.as_ref().and_then(|_aux| { - let tree = lde_trace.gpu_aux()?.tree.as_ref()?; - let stream = lde_trace.bound_stream()?; + let aux_dev_proofs: Option>> = round_1_result + .aux + .as_ref() + .and_then(|_aux| lde_trace.gpu_aux().and_then(|h| h.tree.as_ref())) + .map(|tree| { + let stream = lde_trace + .bound_stream() + .expect("bound stream for device-resident aux-tree opening"); let positions: Vec = indexes_to_open .iter() .flat_map(|&c| [c * 2, c * 2 + 1]) .collect(); crate::gpu_lde::gather_proofs_dev(tree, &positions, &stream) + .expect("device aux-tree gather failed; resident tree has no host fallback") }); // Composition tree: openings open a single position `index` (row-pair // leaf), so gather one proof per query challenge from the device tree. #[cfg(feature = "cuda")] let comp_dev_proofs: Option>> = - round_2_result.gpu_composition_tree.as_ref().and_then(|tree| { - let stream = lde_trace.bound_stream()?; + round_2_result.gpu_composition_tree.as_ref().map(|tree| { + let stream = lde_trace + .bound_stream() + .expect("bound stream for device-resident composition-tree opening"); crate::gpu_lde::gather_proofs_dev(tree, indexes_to_open, &stream) + .expect("device composition-tree gather failed; resident tree has no host fallback") }); for (qi, index) in indexes_to_open.iter().enumerate() { From 8ab633ff8d64614cc1f69b0d87fc9d19d48ced2a Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 18:44:14 -0300 Subject: [PATCH 11/17] rm unused functions --- crypto/crypto/src/merkle_tree/merkle.rs | 7 +++++++ crypto/stark/src/gpu_lde.rs | 23 ----------------------- crypto/stark/src/prover.rs | 11 ++++++----- 3 files changed, 13 insertions(+), 28 deletions(-) diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs index 2060a0f18..1ae40a714 100644 --- a/crypto/crypto/src/merkle_tree/merkle.rs +++ b/crypto/crypto/src/merkle_tree/merkle.rs @@ -255,6 +255,13 @@ where /// For example, give me an inclusion proof for the 3rd element in the /// Merkle tree pub fn get_proof_by_pos(&self, pos: usize) -> Option> { + // A root-only tree (from `from_root`) has no nodes to walk — callers + // must gather paths from the device-resident copy instead. Catch the + // misuse early in debug builds rather than returning a misleading `None`. + debug_assert!( + !self.nodes.is_empty(), + "get_proof_by_pos called on a root-only MerkleTree (no nodes)" + ); let pos = pos + self.node_count() / 2; let Ok(merkle_path) = self.build_merkle_path(pos) else { return None; diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 115b9462a..1a0acafd5 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -271,29 +271,6 @@ fn restore_columns_on_err(columns: &mut [Vec>], n: u } } -/// Allocate the `[u8; 32]` Merkle node buffer for a tree of `lde_size` leaves -/// and return the node `Vec` (length-initialised, contents undefined) together -/// with its node count `total_nodes` (`2 * lde_size - 1`). Returns `None` if -/// the layout would be invalid (`lde_size < 2` or `total_nodes * 32` overflows -/// `usize`). The caller builds the `&mut [u8]` byte view of length -/// `total_nodes * 32` and must overwrite every byte via the GPU D2H. -fn alloc_merkle_nodes(lde_size: usize) -> Option<(Vec<[u8; 32]>, usize)> { - if lde_size < 2 { - return None; - } - let total_nodes = 2usize.saturating_mul(lde_size).checked_sub(1)?; - let _byte_len = total_nodes.checked_mul(32)?; - let mut nodes: Vec<[u8; 32]> = Vec::with_capacity(total_nodes); - // SAFETY: every byte will be overwritten via the GPU D2H before the - // contents are read. The caller computes the byte-length view from the - // returned `nodes` Vec using `total_nodes.checked_mul(32)`. - #[allow(clippy::uninit_vec)] - unsafe { - nodes.set_len(total_nodes) - }; - Some((nodes, total_nodes)) -} - /// Try to GPU-batch all columns in one pass. /// /// Engaged for Goldilocks-base and ext3 tables whose LDE size is above the diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 4bc0653fc..b668e2364 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -357,11 +357,12 @@ fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) /// A chunk grows until it reaches `k` tables (the core/RAM-bound limit) **or** /// its summed VRAM estimate would exceed `budget` — whichever comes first. A /// single table larger than `budget` forms its own chunk (it runs solo rather -/// than being excluded). With `budget == u64::MAX` this degrades exactly to -/// fixed chunks of `k`, identical to the previous `step_by(k)` scheme — so on -/// non-cuda builds and when VRAM isn't the binding constraint, scheduling (and -/// therefore the proof) is unchanged. Returns `(start, end)` half-open ranges -/// covering `0..estimates.len()` in order. +/// than being excluded). With `budget == u64::MAX` the VRAM constraint is never +/// binding for any realistic estimate (a chunk's summed estimate can't approach +/// `u64::MAX`), so chunks fall back to fixed size `k` — identical to the previous +/// `step_by(k)` scheme. So on non-cuda builds and when VRAM isn't the binding +/// constraint, scheduling (and therefore the proof) is unchanged. Returns +/// `(start, end)` half-open ranges covering `0..estimates.len()` in order. fn plan_table_chunks(estimates: &[u64], k: usize, budget: u64) -> Vec<(usize, usize)> { let n = estimates.len(); let k = k.max(1); From fd31fc9d9d346bfea6882c034994610d84baaf18 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Mon, 29 Jun 2026 19:09:29 -0300 Subject: [PATCH 12/17] cleanup --- crypto/crypto/src/merkle_tree/merkle.rs | 13 +- crypto/math-cuda/src/device.rs | 30 +++-- crypto/math-cuda/src/fri.rs | 2 +- crypto/math-cuda/src/lde.rs | 155 ++++++------------------ crypto/math-cuda/src/merkle.rs | 25 ++-- crypto/math-cuda/tests/merkle_gather.rs | 6 +- crypto/stark/src/fri/fri_commitment.rs | 2 +- crypto/stark/src/gpu_lde.rs | 45 ++++--- crypto/stark/src/instruments.rs | 25 ++-- crypto/stark/src/prover.rs | 106 +++++++--------- crypto/stark/src/trace.rs | 97 +++------------ 11 files changed, 175 insertions(+), 331 deletions(-) diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs index 1ae40a714..a32f12978 100644 --- a/crypto/crypto/src/merkle_tree/merkle.rs +++ b/crypto/crypto/src/merkle_tree/merkle.rs @@ -168,10 +168,9 @@ where }) } - /// Create a root-only Merkle tree placeholder: stores the commitment root - /// but no nodes. Used when the tree's authentication paths are gathered from - /// a device-resident copy (GPU) instead of this host tree, so the host nodes - /// are never materialised (saving the full-tree Device→Host copy). + /// Create a root only Merkle tree placeholder: stores the commitment root + /// but no nodes. Used when paths are gathered from a device resident copy + /// (GPU) instead of this host tree, so the host nodes are never built. /// [`get_proof_by_pos`](Self::get_proof_by_pos) must NOT be called on it. pub fn from_root(root: B::Node) -> Self { MerkleTree { @@ -255,9 +254,9 @@ where /// For example, give me an inclusion proof for the 3rd element in the /// Merkle tree pub fn get_proof_by_pos(&self, pos: usize) -> Option> { - // A root-only tree (from `from_root`) has no nodes to walk — callers - // must gather paths from the device-resident copy instead. Catch the - // misuse early in debug builds rather than returning a misleading `None`. + // A root only tree (from `from_root`) has no nodes to walk. Callers must + // gather paths from the device resident copy instead. Catch the misuse + // early in debug builds rather than returning a misleading None. debug_assert!( !self.nodes.is_empty(), "get_proof_by_pos called on a root-only MerkleTree (no nodes)" diff --git a/crypto/math-cuda/src/device.rs b/crypto/math-cuda/src/device.rs index ef99f05af..3dfaf629d 100644 --- a/crypto/math-cuda/src/device.rs +++ b/crypto/math-cuda/src/device.rs @@ -214,13 +214,13 @@ fn retain_default_mempool(ctx: &CudaContext) { } } -/// Device VRAM budget (bytes) for table-session admission control. +/// Device VRAM budget in bytes for table session admission control. /// -/// `LAMBDA_VM_VRAM_BUDGET_MB` overrides it explicitly — used to force-exercise -/// the throttle in tests/benchmarks. Otherwise it is 80% of total device -/// memory, leaving headroom for the context, module code, and the pool's -/// retained blocks. On any query failure it returns `u64::MAX`, which disables -/// budgeting: admission then falls back to the core-bound chunk size alone. +/// LAMBDA_VM_VRAM_BUDGET_MB overrides it (used to force the throttle in tests). +/// Otherwise it is 80% of total device memory, leaving headroom for the +/// context, module code, and retained pool blocks. Returns u64::MAX on any +/// query failure, which disables budgeting (chunks fall back to the core bound +/// size alone). fn detect_vram_budget_bytes(ctx: &CudaContext) -> u64 { if let Ok(mb) = std::env::var("LAMBDA_VM_VRAM_BUDGET_MB") && let Ok(mb) = mb.parse::() @@ -255,17 +255,15 @@ impl Backend { // before returning), so the tracking is pure overhead. Disable it. unsafe { ctx.disable_event_tracking() }; - // Retain freed device memory in the stream-ordered pool for reuse. + // Retain freed device memory in the stream ordered pool for reuse. // - // cudarc routes `CudaStream::alloc*` through `cuMemAllocAsync`, which - // draws from the device's default memory pool. That pool's release - // threshold defaults to 0, so every freed buffer is handed back to the - // OS at the next sync — meaning the prover's large, repeatedly-shaped - // LDE / FRI buffers are re-malloc'd from scratch each op. Raising the - // threshold to "unbounded" keeps freed blocks resident in the pool so - // subsequent allocations of the same size are satisfied without a real - // driver allocation. Best-effort: on any error (no pool support, sync - // allocator) we silently keep the current behaviour. + // cudarc routes CudaStream::alloc* through cuMemAllocAsync, drawing from + // the device default memory pool. Its release threshold defaults to 0, + // so every freed buffer goes back to the OS at the next sync and the + // prover's large LDE/FRI buffers are rebuilt from scratch each op. + // Raising the threshold keeps freed blocks in the pool so a same size + // allocation skips a real driver allocation. Best effort: on any error + // we keep the current behaviour. retain_default_mempool(&ctx); let arith = ctx.load_module(Ptx::from_src(ARITH_PTX))?; diff --git a/crypto/math-cuda/src/fri.rs b/crypto/math-cuda/src/fri.rs index e4c60a8a8..8fe6fcdb1 100644 --- a/crypto/math-cuda/src/fri.rs +++ b/crypto/math-cuda/src/fri.rs @@ -215,7 +215,7 @@ impl FriCommitState { }; // Keep the layer tree resident on device; copy only the 32-byte root so - // R4 query openings gather paths on device instead of D2H'ing the tree. + // R4 query openings gather paths on device instead of copying the tree. let mut root = [0u8; 32]; self.stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?; self.stream.synchronize()?; diff --git a/crypto/math-cuda/src/lde.rs b/crypto/math-cuda/src/lde.rs index 8fc309ae5..ad0fd9152 100644 --- a/crypto/math-cuda/src/lde.rs +++ b/crypto/math-cuda/src/lde.rs @@ -220,10 +220,9 @@ fn launch_pointwise_mul_batched( /// Layout: `m` columns, each `lde_size` u64s, column `c` at byte offset /// `c * lde_size * 8` within `buf`. Freed when `buf` Arc drops. /// -/// `tree` optionally carries the main-trace Merkle tree kept resident on device -/// (populated by the keep path), so R4 query openings can gather authentication -/// paths on device instead of D2H'ing the whole tree to host. `None` on the CPU -/// path or when the tree wasn't retained. +/// `tree` optionally carries the main trace Merkle tree kept resident on device +/// (the keep path), so R4 query openings gather paths on device instead of +/// copying the whole tree to host. None on the CPU path. #[derive(Clone)] pub struct GpuLdeBase { pub buf: Arc>, @@ -240,24 +239,22 @@ pub struct GpuLdeExt3 { pub buf: Arc>, pub m: usize, pub lde_size: usize, - /// Optionally the aux/composition Merkle tree kept resident on device (the - /// keep path), so R4 openings gather paths on device instead of D2H'ing the - /// whole tree. `None` on the CPU path or when not retained. + /// Optionally the aux or composition Merkle tree kept resident on device + /// (the keep path), so R4 openings gather paths on device. None otherwise. pub tree: Option, } -/// Handle to a Merkle tree kept live on device after a commit, so query -/// openings can gather authentication paths on device instead of D2H'ing the -/// whole tree to host. Node layout matches the CPU tree -/// (`crypto/crypto/src/merkle_tree`): `nodes[0..leaves_len-1]` are inner nodes -/// (root at index 0), `nodes[leaves_len-1..]` are the leaves; each node is 32 +/// Merkle tree kept resident on device after a commit, so query openings gather +/// paths on device instead of copying the whole tree to host. Node layout +/// matches the CPU tree (`crypto/crypto/src/merkle_tree`): `nodes[0..leaves_len-1]` +/// are inner nodes (root at 0), `nodes[leaves_len-1..]` are the leaves, each 32 /// bytes. Freed when the `nodes` Arc drops. #[derive(Clone)] pub struct GpuMerkleTree { pub nodes: Arc>, pub leaves_len: usize, - /// The Merkle root (node 0), copied to host at build time (32 bytes) so the - /// commitment is available without D2H'ing the whole tree. + /// The Merkle root (node 0), copied to host at build time so the commitment + /// is available without copying the whole tree. pub root: [u8; 32], } @@ -643,41 +640,11 @@ pub fn coset_lde_batch_base_into_with_leaf_hash( .map(|_| ()) } -/// Like `coset_lde_batch_base_into_with_leaf_hash`, but also builds the full -/// Merkle tree on device and returns the `2*lde_size - 1` node buffer back -/// to the caller in `merkle_nodes_out` (byte length `(2*lde_size - 1) * 32`). -/// -/// The leaf hashes are never exposed to the caller — they stay on device and -/// feed straight into the pair-hash tree kernel, avoiding the -/// pinned→pageable→pinned round-trip that the separate-step GPU tree build -/// would pay. -pub fn coset_lde_batch_base_into_with_merkle_tree( - columns: &[&[u64]], - blowup_factor: usize, - weights: &[u64], - outputs: &mut [&mut [u64]], - merkle_nodes_out: &mut [u8], -) -> Result<()> { - coset_lde_batch_base_into_with_merkle_tree_inner( - columns, - blowup_factor, - weights, - outputs, - Some(merkle_nodes_out), - KeccakCommit::FullTree, - false, - false, - ) - .map(|_| ()) -} - -/// Fused LDE + leaf-hash + Merkle tree build, keeping **both** the LDE buffer -/// and the Merkle tree resident on device (returned in `GpuLdeBase` + its -/// `tree`, including the root). The host tree is **not** materialised — there is -/// no `merkle_nodes_out` — so the whole-tree Device→Host copy is eliminated. -/// Callers gather query authentication paths from the device tree instead -/// (`crate::merkle::gather_merkle_paths_dev`) and use a root-only host tree for -/// the commitment. +/// Fused LDE, leaf hash, and Merkle tree build that keeps both the LDE buffer +/// and the Merkle tree (with root) resident on device, returned in `GpuLdeBase`. +/// The host tree is not built (no `merkle_nodes_out`), so the whole tree copy to +/// host is eliminated. Callers gather query paths from the device tree +/// (`crate::merkle::gather_merkle_paths_dev`) and use a root only host tree. pub fn coset_lde_batch_base_into_with_merkle_tree_keep( columns: &[&[u64]], blowup_factor: usize, @@ -840,20 +807,20 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; } - // D2H the LDE columns via pinned staging (still read on host by constraint - // eval / openings). The full tree nodes are D2H'd only when the caller asks - // for a host tree (`nodes_out`); the device-resident path passes `None` and - // keeps the tree on device, eliminating the whole-tree D2H. + // Copy the LDE columns to host via pinned staging (constraint eval and + // openings still read them). The full tree is copied only when the caller + // wants a host tree (`nodes_out`); the resident path passes None and keeps + // the tree on device. stream.memcpy_dtoh(&buf, &mut pinned[..m * lde_size])?; if let Some(no) = nodes_out { d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, no)?; } - // Ensure the (async) LDE D2H above has landed before reading `pinned`. - // Previously the tree D2H's `synchronize` covered this; the device-resident - // path skips that D2H, so synchronize explicitly. + // Make sure the async LDE copy above landed before reading `pinned`. The + // tree copy used to provide this sync; the resident path skips it, so sync + // here. stream.synchronize()?; - // Copy pinned into caller outputs. Runs under the pinned-staging lock, + // Copy pinned into caller outputs. Runs under the pinned staging lock, // where rayon can deadlock. See `Backend::pinned_staging`. for (c, dst) in outputs.iter_mut().enumerate() { dst.copy_from_slice(&pinned[c * lde_size..c * lde_size + lde_size]); @@ -861,11 +828,11 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( drop(staging); if keep_device_buf { - // Retain the device tree nodes for on-device opening (FullTree only; in - // LeavesOnly mode there are no inner nodes to gather paths from). + // Retain the device tree for on device opening (FullTree only; LeavesOnly + // has no inner nodes to gather paths from). let tree = if keep_tree && commit == KeccakCommit::FullTree { - // Copy just the 32-byte root (node 0) so the commitment is available - // without D2H'ing the whole tree. + // Copy just the 32 byte root (node 0) so the commitment is available + // without copying the whole tree. let mut root = [0u8; 32]; stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?; stream.synchronize()?; @@ -891,60 +858,10 @@ fn coset_lde_batch_base_into_with_merkle_tree_inner( } } -/// Ext3 variant of `coset_lde_batch_base_into_with_leaf_hash`: fused -/// LDE + Keccak-256 leaf hashing over ext3 columns. Thin wrapper over -/// `coset_lde_batch_ext3_into_with_merkle_tree_inner` with `LeavesOnly`. -pub fn coset_lde_batch_ext3_into_with_leaf_hash( - columns: &[&[u64]], - n: usize, - blowup_factor: usize, - weights: &[u64], - outputs: &mut [&mut [u64]], - hashed_leaves_out: &mut [u8], -) -> Result<()> { - coset_lde_batch_ext3_into_with_merkle_tree_inner( - columns, - n, - blowup_factor, - weights, - outputs, - Some(hashed_leaves_out), - KeccakCommit::LeavesOnly, - false, - false, - ) - .map(|_| ()) -} - -/// Ext3 variant of the fused `coset_lde_batch_base_into_with_merkle_tree`. -/// LDE + leaf hashing + inner-tree build, all on device; D2Hs only the LDE -/// evaluations and the full `2*lde_size - 1` node buffer. -pub fn coset_lde_batch_ext3_into_with_merkle_tree( - columns: &[&[u64]], - n: usize, - blowup_factor: usize, - weights: &[u64], - outputs: &mut [&mut [u64]], - merkle_nodes_out: &mut [u8], -) -> Result<()> { - coset_lde_batch_ext3_into_with_merkle_tree_inner( - columns, - n, - blowup_factor, - weights, - outputs, - Some(merkle_nodes_out), - KeccakCommit::FullTree, - false, - false, - ) - .map(|_| ()) -} - /// Ext3 variant of [`coset_lde_batch_base_into_with_merkle_tree_keep`]: keeps -/// both the de-interleaved LDE buffer and the Merkle tree (with root) resident -/// on device. The host tree is **not** materialised (no `merkle_nodes_out`), so -/// the whole-tree D2H is eliminated; openings gather paths from the device tree. +/// both the deinterleaved LDE buffer and the Merkle tree (with root) resident on +/// device. No host tree is built, so the whole tree copy to host is eliminated; +/// openings gather paths from the device tree. pub fn coset_lde_batch_ext3_into_with_merkle_tree_keep( columns: &[&[u64]], n: usize, @@ -1103,15 +1020,15 @@ fn coset_lde_batch_ext3_into_with_merkle_tree_inner( crate::merkle::build_inner_tree_levels(stream.as_ref(), be, &mut nodes_dev, lde_size)?; } - // D2H the LDE columns. The full tree nodes are D2H'd only when the caller - // asks for a host tree (`nodes_out`); the device-resident path passes `None` - // and keeps the tree on device, eliminating the whole-tree D2H. + // Copy the LDE columns to host. The full tree is copied only when the caller + // wants a host tree (`nodes_out`); the resident path passes None and keeps + // the tree on device. stream.memcpy_dtoh(&buf, &mut pinned[..mb * lde_size])?; if let Some(no) = nodes_out { d2h_bytes_via_pinned_hashes(&stream, be, &nodes_dev, no)?; } - // Ensure the (async) LDE D2H has landed before reading `pinned` (the tree - // D2H's synchronize used to cover this; the device-resident path skips it). + // Make sure the async LDE copy landed before reading `pinned` (the tree copy + // used to provide this sync; the resident path skips it). stream.synchronize()?; unpack_pinned_slabs_to_ext3(pinned, outputs, lde_size); diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index 000f6daa0..2ca5d25ec 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -221,11 +221,10 @@ pub fn build_merkle_tree_on_device(hashed_leaves: &[u8]) -> Result> { /// Gather Merkle authentication paths on device for `positions` (leaf indices) /// against the resident tree `nodes_dev` (standard layout, `2*leaves_len-1` /// nodes of 32 bytes). Returns `positions.len() * depth * 32` bytes, where -/// `depth = log2(leaves_len)`: query `q`'s path occupies bytes -/// `[q*depth*32 .. (q+1)*depth*32]`, each 32-byte node a sibling from leaf to -/// root — byte-for-byte the same nodes the CPU `MerkleTree::get_proof_by_pos` -/// (`build_merkle_path`) collects. Runs on the caller's `stream` (pass the -/// table's session stream so it shares the queue with the rest of R4). +/// `depth = log2(leaves_len)`. Query `q`'s path is `[q*depth*32 .. +/// (q+1)*depth*32]`, each 32 byte node a sibling from leaf to root. These are +/// the same nodes the CPU `MerkleTree::get_proof_by_pos` collects. Runs on the +/// caller's `stream` (pass the table's session stream). pub fn gather_merkle_paths_dev( nodes_dev: &CudaSlice, leaves_len: usize, @@ -241,9 +240,9 @@ pub fn gather_merkle_paths_dev( "leaves_len must be a power of two >= 2" ); let depth = leaves_len.trailing_zeros() as usize; - // Guard the kernel's device reads: an out-of-range position would walk off - // the node buffer (OOB device read). Positions are valid by construction; - // this catches any caller bug before it becomes UB. + // Guard the kernel's device reads: a position past leaves_len would walk + // off the node buffer. Positions are valid by construction; this catches a + // caller bug before it becomes an out of bounds device read. assert!( positions.iter().all(|&p| (p as usize) < leaves_len), "gather_merkle_paths_dev: leaf position >= leaves_len" @@ -286,9 +285,9 @@ pub fn gather_merkle_paths_dev( /// /// Returns `(2*(lde_size/2) - 1) * 32` bytes of tree nodes in the standard /// layout (root at byte offset 0, leaves in the tail). -/// Build the composition-poly Merkle tree on device (leaves hash row-pairs, so -/// `num_leaves = lde_size / 2`). Returns the device node buffer, the leaf count, -/// and the stream it was built on. Shared by the host-D2H and device-keep +/// Build the composition Merkle tree on device. Leaves hash row pairs, so +/// `num_leaves = lde_size / 2`. Returns the device node buffer, the leaf count, +/// and the stream it was built on. Shared by the host copy and device keep /// wrappers below. fn build_comp_poly_tree_nodes_dev( parts_interleaved: &[&[u64]], @@ -366,8 +365,8 @@ pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Res /// Like [`build_comp_poly_tree_from_evals_ext3`] but keeps the tree nodes on /// device (returned as a [`crate::lde::GpuMerkleTree`] with its root), so R4 -/// composition openings gather authentication paths on device instead of -/// D2H'ing the whole tree. `leaves_len = lde_size / 2` (row-pair leaves). +/// composition openings gather paths on device instead of copying the whole +/// tree to host. `leaves_len = lde_size / 2` (row pair leaves). pub fn build_comp_poly_tree_from_evals_ext3_keep( parts_interleaved: &[&[u64]], ) -> Result { diff --git a/crypto/math-cuda/tests/merkle_gather.rs b/crypto/math-cuda/tests/merkle_gather.rs index b2ba24c1d..36e05a719 100644 --- a/crypto/math-cuda/tests/merkle_gather.rs +++ b/crypto/math-cuda/tests/merkle_gather.rs @@ -1,7 +1,7 @@ //! Parity: GPU `gather_merkle_paths_dev` must produce, for each leaf position, -//! the exact `merkle_path` the CPU `MerkleTree::get_proof_by_pos` returns — -//! same sibling-from-leaf-to-root order, byte-for-byte. This is the gate for -//! gathering R4 query openings on device instead of D2H'ing the whole tree. +//! the exact `merkle_path` the CPU `MerkleTree::get_proof_by_pos` returns: the +//! same sibling order from leaf to root, byte for byte. This is the gate for +//! gathering R4 query openings on device instead of copying the whole tree. use crypto::merkle_tree::backends::field_element_vector::FieldElementVectorBackend; use crypto::merkle_tree::merkle::MerkleTree; diff --git a/crypto/stark/src/fri/fri_commitment.rs b/crypto/stark/src/fri/fri_commitment.rs index 9f950c27e..58c9eed77 100644 --- a/crypto/stark/src/fri/fri_commitment.rs +++ b/crypto/stark/src/fri/fri_commitment.rs @@ -15,7 +15,7 @@ where pub merkle_tree: MerkleTree, /// The layer's Merkle tree kept resident on device (GPU FRI commit path), /// so R4 query openings gather authentication paths on device. When set, - /// `merkle_tree` is a root-only placeholder. `None` on the CPU path. + /// `merkle_tree` is a root only placeholder. `None` on the CPU path. #[cfg(feature = "cuda")] pub gpu_tree: Option, } diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 1a0acafd5..3f2debb5e 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -430,13 +430,12 @@ pub fn gpu_leaf_hash_calls() -> u64 { GPU_LEAF_HASH_CALLS.load(Ordering::Relaxed) } -/// Fused base-field path: LDE + Keccak-256 leaf hash + Merkle tree build, all -/// on device, keeping **both** the LDE buffer and the Merkle tree resident on -/// device. On success: `columns[c]` is resized to `lde_size` with the LDE -/// output, and the returned `GpuLdeBase` carries the device LDE buffer plus the -/// device tree (`.tree`). The returned `MerkleTree` is **root-only** — the host -/// tree nodes are never materialised (no whole-tree D2H); query openings gather -/// authentication paths from the device tree via [`gather_proofs_dev`]. +/// Fused base field path: LDE, leaf hash, and Merkle tree build, all on device, +/// keeping both the LDE buffer and the tree resident. On success `columns[c]` is +/// resized to `lde_size` with the LDE output, and the returned `GpuLdeBase` +/// carries the device LDE buffer plus the device tree (`.tree`). The returned +/// `MerkleTree` is root only (no host tree, no whole tree copy); query openings +/// gather paths from the device tree via [`gather_proofs_dev`]. pub(crate) fn try_expand_leaf_and_tree_batched_keep( columns: &mut [Vec>], blowup_factor: usize, @@ -749,9 +748,9 @@ where }) .collect(); - // Keep the composition tree resident on device; the whole-tree D2H is - // eliminated. R4 composition openings gather paths from the device tree - // (`gather_proofs_dev`); the returned host tree is root-only. + // Keep the composition tree resident on device, so the whole tree copy to + // host is eliminated. R4 composition openings gather paths from the device + // tree (`gather_proofs_dev`); the returned host tree is root only. let dev_tree = match math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&raw_parts) { Ok(t) => t, Err(_) => return None, @@ -1606,9 +1605,9 @@ where } }; - // Build the FriLayer: ext3 evals + a root-only host tree (the layer tree - // stays resident on device in `gpu_tree`; query openings gather paths - // from it via `gather_proofs_dev`). + // Build the FriLayer: ext3 evals and a root only host tree. The layer + // tree stays resident on device in `gpu_tree`; query openings gather + // paths from it via `gather_proofs_dev`. let evaluation = u64_to_ext3_vec::(&layer_evals_u64); let root = dev_tree.root; let merkle_tree = MerkleTree::>::from_root(root); @@ -1645,14 +1644,14 @@ where Some((last_value, fri_layer_list)) } -/// GPU FRI query phase: gather each layer's authentication paths on device -/// instead of walking host trees. For layer `l` and query `iota`, the opened -/// position is `(iota >> l) >> 1` — matching [`crate::fri::query_phase`]. Paths -/// for all queries are gathered in one batched call per layer. The layer -/// evaluations (`evaluation[index ^ 1]`) are read from the host Vecs as before. +/// GPU FRI query phase: gather each layer's paths on device instead of walking +/// host trees. For layer `l` and query `iota` the opened position is +/// `(iota >> l) >> 1`, matching [`crate::fri::query_phase`]. Paths for all +/// queries are gathered in one batched call per layer. The layer evaluations +/// (`evaluation[index ^ 1]`) are read from the host Vecs as before. /// -/// Returns `None` if there are no layers or any layer lacks a device tree (a -/// CPU-committed layer), so the caller falls back to the host walk. +/// Returns None when there are no layers or the layers are host trees (CPU +/// commit), so the caller falls back to the host walk. pub(crate) fn try_fri_query_phase_gpu( fri_layers: &[FriLayer>], iotas: &[usize], @@ -1665,9 +1664,9 @@ where return None; } // The GPU FRI commit sets `gpu_tree` on every layer as a group; the CPU - // commit sets none. If the layers are host trees, fall back to the host - // walk. If they're device-resident, the host trees are root-only — so the - // gather below MUST succeed (a failure is a hard abort, not a silent walk). + // commit sets none. Host trees fall back to the host walk. When the layers + // are device resident the host trees are root only, so the gather below must + // succeed (a failure is a hard abort, not a silent walk). if fri_layers[0].gpu_tree.is_none() { return None; } diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs index 83e14545b..784e5e1ad 100644 --- a/crypto/stark/src/instruments.rs +++ b/crypto/stark/src/instruments.rs @@ -8,17 +8,18 @@ use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; // Wall-clock span timeline (the trustworthy per-step measurement) // ========================================================================= // -// Nested wall-clock spans opened/closed on the driving (main) thread at phase -// boundaries. Unlike the `accum_*` / thread-local sub-timers below — which sum -// per-worker CPU time across rayon threads and over-count (percentages > 100%) — -// these spans are non-overlapping and sum to their parent, so the tree is a true -// latency breakdown. Parallel regions are measured as a single span around the -// blocking call (that IS their latency); their internal split is reported -// separately as CPU-time, never mixed into the wall tree. +// Nested wall clock spans opened and closed on the driving (main) thread at +// phase boundaries. Unlike the `accum_*` thread local sub timers below (which +// sum per worker CPU time across rayon threads and over count, so percentages +// exceed 100%), these spans do not overlap and sum to their parent, so the tree +// is a true latency breakdown. Parallel regions are measured as a single span +// around the blocking call (that is their latency); their internal split is +// reported separately as CPU time, never mixed into the wall tree. // // let _s = instruments::span("trace_build"); // RAII, stops on drop // -// `Instant::now()` is ~20 ns — fine at phase granularity; never inside per-op loops. +// `Instant::now()` is about 20 ns, fine at phase granularity; never put it +// inside per op loops. #[derive(Clone, Debug)] pub struct SpanRecord { @@ -27,8 +28,8 @@ pub struct SpanRecord { pub wall: Duration, /// Open-order, so the tree reconstructs in start-order (records push on close). pub order: u32, - /// Wall-clock epoch (ns) when the span opened — for aligning with external - /// samplers (e.g. nvidia-smi GPU-util) to attribute device-busy time per step. + /// Wall clock epoch (ns) when the span opened, for aligning with external + /// samplers (e.g. nvidia-smi GPU util) to attribute device busy time per step. pub start_ns: u128, } @@ -269,8 +270,8 @@ pub fn take_r1_sub() -> Round1SubOps { /// Reset all instrument state. Call at the start of `multi_prove` to avoid /// stale data from a previous run in the same process. /// -/// Note: thread-local stores (R2_SUB, R4_SUB, ROUND_SUB_OPS) are only cleared -/// for the calling thread. Rayon worker threads are not reset — stale data is +/// Note: thread local stores (R2_SUB, R4_SUB, ROUND_SUB_OPS) are only cleared +/// for the calling thread. Rayon worker threads are not reset, so stale data is /// possible if a previous run panicked without consuming stored values. /// In practice this is safe because store/take pairs always execute within the /// same rayon task closure. diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index b668e2364..1e4c40bfc 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -325,20 +325,14 @@ pub fn table_parallelism() -> usize { } } -/// Heuristic peak device working-set for one table, in bytes. -/// -/// Two contributions: -/// 1. **LDE columns** co-resident on the GPU — `main` in the base field (8 B) -/// and `aux` in the ext3 field (24 B) — times a scratch multiplier for the -/// NTT / leaf-hash transients allocated alongside them. -/// 2. **Resident Merkle trees** — main, aux, composition, and FRI-layer trees -/// are now kept on device R1→R4 (no whole-tree D2H). Each full tree is -/// `~2*lde_size` nodes × 32 B = `64*lde_size`; co-resident at the R4 peak -/// they sum to a few × that, so `~256 B × lde_size` covers them conservatively. -/// -/// It is deliberately a conservative over-estimate: it gates a safety ceiling, -/// not a precise allocator. Pass `aux_cols == 0` for phases where the aux LDE -/// is not yet resident (the R1 main commit). +/// Heuristic peak device working set for one table, in bytes. Two parts: the +/// LDE columns co-resident on the GPU (main base field at 8 B, aux ext3 at +/// 24 B) times a scratch factor for the NTT and leaf hash transients, plus the +/// resident Merkle trees (main, aux, composition, FRI layers) kept on device R1 +/// to R4. Each full tree is about `2*lde_size` nodes of 32 B (`64*lde_size`); +/// together at the R4 peak about 256 B per `lde_size` covers them. A deliberate +/// over estimate that gates a safety ceiling, not a precise allocator. Pass +/// `aux_cols == 0` where the aux LDE is not yet resident (R1 main commit). fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) -> u64 { const BYTES_PER_BASE: u64 = 8; const EXT3_BYTES: u64 = 24; @@ -354,15 +348,14 @@ fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) /// Plan contiguous table chunks for parallel proving. /// -/// A chunk grows until it reaches `k` tables (the core/RAM-bound limit) **or** -/// its summed VRAM estimate would exceed `budget` — whichever comes first. A +/// A chunk grows until it reaches `k` tables (the core and RAM bound limit) or +/// its summed VRAM estimate would exceed `budget`, whichever comes first. A /// single table larger than `budget` forms its own chunk (it runs solo rather /// than being excluded). With `budget == u64::MAX` the VRAM constraint is never -/// binding for any realistic estimate (a chunk's summed estimate can't approach -/// `u64::MAX`), so chunks fall back to fixed size `k` — identical to the previous -/// `step_by(k)` scheme. So on non-cuda builds and when VRAM isn't the binding -/// constraint, scheduling (and therefore the proof) is unchanged. Returns -/// `(start, end)` half-open ranges covering `0..estimates.len()` in order. +/// binding for any realistic estimate, so chunks fall back to fixed size `k`, +/// the same as the previous `step_by(k)` scheme. So on non-cuda builds and when +/// VRAM is not binding, scheduling (and therefore the proof) is unchanged. +/// Returns `(start, end)` half open ranges covering `0..estimates.len()` in order. fn plan_table_chunks(estimates: &[u64], k: usize, budget: u64) -> Vec<(usize, usize)> { let n = estimates.len(); let k = k.max(1); @@ -399,10 +392,10 @@ where pub(crate) composition_poly_merkle_tree: BatchedMerkleTree, /// The commitment to the composition polynomial parts. pub(crate) composition_poly_root: Commitment, - /// The composition-poly Merkle tree kept resident on device (when the R2 - /// GPU tree path ran), so R4 openings gather authentication paths on device - /// instead of walking a host tree. When set, `composition_poly_merkle_tree` - /// is a root-only placeholder. `None` on the CPU path. + /// The composition Merkle tree kept resident on device (when the R2 GPU tree + /// path ran), so R4 openings gather paths on device instead of walking a host + /// tree. When set, `composition_poly_merkle_tree` is a root only placeholder. + /// `None` on the CPU path. #[cfg(feature = "cuda")] pub(crate) gpu_composition_tree: Option, } @@ -1148,9 +1141,9 @@ pub trait IsStarkProver< let t_sub = Instant::now(); // GPU fast path for the comp-poly Merkle commit: row-pair Keccak // leaves + device-side inner tree, both wrapping the host eval Vecs. - // GPU path keeps the composition tree resident on device (no whole-tree - // D2H) and returns a root-only host tree; the device tree is threaded to - // R4 in `Round2.gpu_composition_tree`. + // GPU path keeps the composition tree resident on device (no whole tree + // copy) and returns a root only host tree. The device tree is threaded + // to R4 in `Round2.gpu_composition_tree`. #[cfg(feature = "cuda")] let (composition_poly_merkle_tree, composition_poly_root, gpu_composition_tree) = match crate::gpu_lde::try_build_comp_poly_tree_gpu::< @@ -1179,9 +1172,8 @@ pub trait IsStarkProver< #[cfg(feature = "instruments")] crate::instruments::store_r2_sub(constraints_dur, fft_dur, merkle_dur); - // Fold the R2 device composition-parts handle into the per-table session - // (resident R2→R4). The host evaluations remain in `Round2` as the - // mirror R4 openings still read; `composition_host_mirror` flags that. + // Fold the R2 device composition parts handle into the session (resident + // R2 to R4). The host evaluations stay in `Round2` for R4 openings. #[cfg(feature = "cuda")] if let Some(handle) = gpu_composition_parts { round_1_result.lde_trace.set_gpu_composition_parts(handle); @@ -1690,10 +1682,10 @@ pub trait IsStarkProver< /// Like [`Self::open_polys_with`], but uses Merkle proofs already gathered /// from the resident device tree (see [`crate::gpu_lde::gather_proofs_dev`]) - /// instead of walking a host tree. The evaluations are still gathered from - /// the host-resident LDE columns via `gather`. `proof` is for leaf position - /// `challenge * 2`, `proof_sym` for `challenge * 2 + 1` — the same positions - /// `open_polys_with` opens. + /// instead of walking a host tree. Evaluations still come from the host LDE + /// columns via `gather`. `proof` is for leaf position `challenge * 2`, + /// `proof_sym` for `challenge * 2 + 1`, the same positions `open_polys_with` + /// opens. #[cfg(feature = "cuda")] fn open_polys_with_proofs( domain: &Domain, @@ -1737,18 +1729,14 @@ pub trait IsStarkProver< let num_precomputed_cols = main_commit.num_precomputed_cols; let total_cols = lde_trace.num_main_cols(); - // R4 main-trace proofs from the resident device tree, when present: - // gathered in one batch over all query positions (`c*2`, `c*2+1` per - // query) instead of walking the host tree. Byte-identical to the host - // proofs (guarded by the `merkle_gather` parity test). Only the - // non-preprocessed main carries a device tree today; on any miss this is - // `None` and openings fall back to the host tree below. - // `*_dev_proofs` is `Some` exactly when the corresponding tree is - // device-resident (so the host tree is a root-only placeholder). In that - // case the gather MUST succeed — there is no host tree to fall back to, - // so a gather error is a hard abort (not a silent walk of an empty - // tree). When the tree is *not* device-resident the value is `None` and - // the openings below walk the full host tree as usual. + // R4 trace proofs from the resident device trees, gathered in one batch + // over all query positions instead of walking the host trees (byte + // identical to the host proofs, guarded by the `merkle_gather` test). + // `*_dev_proofs` is `Some` exactly when the tree is device resident (so + // the host tree is a root only placeholder). In that case the gather + // must succeed: there is no host tree to fall back to, so a gather error + // is a hard abort. When the tree is not device resident the value is + // `None` and the openings below walk the full host tree. #[cfg(feature = "cuda")] let main_dev_proofs: Option>> = if is_preprocessed { None @@ -1770,7 +1758,7 @@ pub trait IsStarkProver< }) }; - // Same for the aux-trace tree, when it is device-resident. + // Same for the aux trace tree, when it is device resident. #[cfg(feature = "cuda")] let aux_dev_proofs: Option>> = round_1_result .aux @@ -1788,7 +1776,7 @@ pub trait IsStarkProver< .expect("device aux-tree gather failed; resident tree has no host fallback") }); - // Composition tree: openings open a single position `index` (row-pair + // Composition tree: openings open a single position `index` (row pair // leaf), so gather one proof per query challenge from the device tree. #[cfg(feature = "cuda")] let comp_dev_proofs: Option>> = @@ -2002,11 +1990,11 @@ pub trait IsStarkProver< let k = table_parallelism().min(num_airs).max(1); - // VRAM-budgeted admission. The budget caps the summed device working-set + // VRAM budgeted admission. The budget caps the summed device working set // of the tables proved concurrently so large blocks don't exhaust VRAM. - // It is an *additional* ceiling on top of `k` (it never raises - // concurrency): on non-cuda builds, or when the budget can't be queried, - // it is `u64::MAX` and chunking falls back to fixed size `k`. + // It is an extra ceiling on top of `k` (it never raises concurrency). On + // non-cuda builds, or when the budget can't be queried, it is `u64::MAX` + // and chunking falls back to fixed size `k`. #[cfg(feature = "cuda")] let vram_budget = math_cuda::device::backend() .map(|b| b.vram_budget_bytes()) @@ -2014,8 +2002,8 @@ pub trait IsStarkProver< #[cfg(not(feature = "cuda"))] let vram_budget = u64::MAX; - // R1 main commit: only the main LDE (+ its Merkle scratch) is resident, - // so the aux columns contribute nothing to this phase's working-set. + // R1 main commit: only the main LDE and its Merkle scratch are resident, + // so the aux columns add nothing to this phase's working set. let main_chunks = { let estimates: Vec = air_trace_pairs .iter() @@ -2231,10 +2219,10 @@ pub trait IsStarkProver< #[allow(clippy::type_complexity)] let mut aux_results: Vec> = Vec::with_capacity(num_airs); - // R1 aux commit and rounds 2–4 share the peak working-set: the main and - // aux LDEs are co-resident, plus the composition / Merkle transients - // (folded into the scratch factor). `num_aux_columns` is now populated - // by the aux build above, so this estimate is accurate for both phases. + // R1 aux commit and rounds 2 to 4 share the peak working set: the main + // and aux LDEs are co-resident, plus the composition and Merkle + // transients (in the scratch factor). `num_aux_columns` is populated by + // the aux build above, so this estimate is accurate for both phases. let peak_chunks = { let estimates: Vec = air_trace_pairs .iter() diff --git a/crypto/stark/src/trace.rs b/crypto/stark/src/trace.rs index c7a9a941a..292f5be2e 100644 --- a/crypto/stark/src/trace.rs +++ b/crypto/stark/src/trace.rs @@ -195,58 +195,31 @@ where pub(crate) aux_columns: Vec>>, pub(crate) lde_step_size: usize, pub(crate) blowup_factor: usize, - /// Per-table GPU residency session: owns the device-resident trace LDE - /// buffers (main + aux) and tracks, per buffer, whether a host mirror is - /// live. Created on the CPU path too (all buffers `None`, mirrors present); - /// populated when the R1 fused GPU pipeline runs. Threaded R1→R4 because - /// `LDETraceTable` is borrowed through every round. + /// Per table GPU residency session: owns this table's device LDE buffers + /// and bound stream. Threaded R1 to R4. Empty on the CPU path. #[cfg(feature = "cuda")] pub(crate) gpu_session: GpuTableSession, } -/// Per-table GPU residency session. +/// Per table GPU residency session. /// -/// Owns the device-resident buffers for a single trace table and tracks, per -/// buffer, whether a host mirror is currently materialised. Today every buffer -/// produced on the GPU is *also* copied to host columns, because the remaining -/// CPU consumers (R2 constraint evaluation, R3 barycentric OOD, R4 query -/// openings) read from host. The `*_host_mirror` flags are the seam those later -/// steps flip: once a consumer reads from the device buffer instead, the -/// corresponding mirror is dropped and the H2D/D2H copy elided. -/// -/// Scope: this owns the main/aux trace LDE (resident R1→R4), the composition -/// parts (`Round2`, R2→R4), and the per-table bound stream. The R4-local -/// `inv_denoms`/FRI state are created and consumed within R4 and stay local. +/// Owns the device buffers for one trace table: the main and aux trace LDE +/// (resident R1 to R4), the composition parts LDE (R2 to R4), and a bound +/// stream. The R4 local inv_denoms and FRI state stay local to R4. #[cfg(feature = "cuda")] pub(crate) struct GpuTableSession { - /// Main-trace LDE, resident from the R1 fused pipeline through R4. `None` - /// when the GPU LDE didn't run for this table (below the size threshold or - /// any CPU fallback: preprocessed main, non-Goldilocks, or GPU error). + /// Main trace LDE, resident from the R1 fused pipeline through R4. None + /// when the GPU LDE did not run (below threshold, preprocessed main, not + /// Goldilocks, or a GPU error). main_lde: Option, - /// Aux-trace LDE (ext3 de-interleaved layout on device), resident R1→R4. + /// Aux trace LDE (ext3, deinterleaved on device), resident R1 to R4. aux_lde: Option, - /// Composition-poly parts LDE (ext3 de-interleaved on device), produced in - /// R2 and resident R2→R4 so R4 DEEP reads the parts on-device instead of a - /// `num_parts * 3 * lde_size * 8` byte H2D. `None` when the R2 GPU path - /// didn't run (number_of_parts <= 2, below threshold, or CPU fallback). + /// Composition parts LDE (ext3, deinterleaved on device), produced in R2 + /// and resident R2 to R4 so R4 DEEP reads them on device. None when the R2 + /// GPU path did not run. composition_parts: Option, - /// Whether the main-trace host columns currently mirror `main_lde`. - /// Always `true` today; CPU consumers depend on it. - main_host_mirror: bool, - /// Whether the aux-trace host columns currently mirror `aux_lde`. - /// Always `true` today; CPU consumers depend on it. - aux_host_mirror: bool, - /// Whether the host composition-parts evaluations (`Round2`) mirror - /// `composition_parts`. Always `true` today; R4 openings read host. - composition_host_mirror: bool, /// Stream bound to this table's GPU work, acquired lazily from the backend - /// pool on first use and cached for the session's lifetime. The R3/R4 - /// device-resident chain (inv_denoms → barycentric/OOD → DEEP) runs on it - /// today; the heavy LDE/Merkle ops join once they thread a stream. Binding - /// one stream per table serialises a table's kernels on a single queue and - /// gives distinct tables distinct streams — the prerequisite for cross-table - /// overlap once the host mirrors drop (Steps 4–5). `None` is cached if the - /// backend is unavailable, so callers fall back to the CPU path. + /// pool and cached. None is cached when the backend is unavailable. stream: OnceLock>>, } @@ -257,12 +230,6 @@ impl GpuTableSession { main_lde: None, aux_lde: None, composition_parts: None, - // Host columns are always materialised today; the CPU consumers - // (constraint eval, OOD, openings) read them. Steps 5/6 flip these - // off as each consumer moves to reading the device buffer. - main_host_mirror: true, - aux_host_mirror: true, - composition_host_mirror: true, stream: OnceLock::new(), } } @@ -293,9 +260,8 @@ where } } - /// Attach an already-populated device LDE handle for the main columns. - /// Only set when the GPU fused pipeline produced the LDE. Callers that - /// ran the CPU path should leave this alone. + /// Attach the device LDE handle for the main columns, produced by the GPU + /// fused pipeline. Leave unset on the CPU path. #[cfg(feature = "cuda")] pub fn set_gpu_main(&mut self, h: math_cuda::lde::GpuLdeBase) { self.gpu_session.main_lde = Some(h); @@ -317,23 +283,8 @@ where self.gpu_session.aux_lde.as_ref() } - /// Whether the main-trace host columns currently mirror the device LDE. - /// Always `true` today (CPU consumers read host); the source-agnostic seam - /// steps 5/6 flip when the main consumer reads the device buffer instead. - #[cfg(feature = "cuda")] - pub fn main_host_mirror(&self) -> bool { - self.gpu_session.main_host_mirror - } - - /// Whether the aux-trace host columns currently mirror the device LDE. - /// Always `true` today; see [`Self::main_host_mirror`]. - #[cfg(feature = "cuda")] - pub fn aux_host_mirror(&self) -> bool { - self.gpu_session.aux_host_mirror - } - - /// Attach the device-resident composition-poly parts LDE produced in R2. - /// Read by R4 DEEP so the parts aren't re-uploaded H2D. + /// Attach the composition parts LDE produced in R2. Read by R4 DEEP so the + /// parts are not re-uploaded. #[cfg(feature = "cuda")] pub fn set_gpu_composition_parts(&mut self, h: math_cuda::lde::GpuLdeExt3) { self.gpu_session.composition_parts = Some(h); @@ -344,17 +295,9 @@ where self.gpu_session.composition_parts.as_ref() } - /// Whether the host composition-parts evaluations mirror the device buffer. - /// Always `true` today; see [`Self::main_host_mirror`]. - #[cfg(feature = "cuda")] - pub fn composition_host_mirror(&self) -> bool { - self.gpu_session.composition_host_mirror - } - /// The stream bound to this table's GPU work. Acquired lazily from the - /// backend pool on first call and cached for the session's lifetime, so all - /// of a table's stream-threaded ops share one queue. Returns `None` (cached) - /// when the backend is unavailable; callers then fall back to the CPU path. + /// backend pool on first call and cached, so all of a table's stream ops + /// share one queue. Returns None (cached) when the backend is unavailable. #[cfg(feature = "cuda")] pub fn bound_stream(&self) -> Option> { self.gpu_session From b5fbdef6db80e783afdfe51cb202545eea940fe8 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Tue, 30 Jun 2026 12:24:21 -0300 Subject: [PATCH 13/17] fix --- crypto/math-cuda/tests/merkle_root_parity.rs | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/crypto/math-cuda/tests/merkle_root_parity.rs b/crypto/math-cuda/tests/merkle_root_parity.rs index 0cbe016b6..b4f96bc4a 100644 --- a/crypto/math-cuda/tests/merkle_root_parity.rs +++ b/crypto/math-cuda/tests/merkle_root_parity.rs @@ -295,7 +295,7 @@ fn new_row_major_pipeline_base_root_matches_cpu() { let fwd_tw = TwoHalfTwiddles::::new(log_lde, false).expect("fwd twiddles"); - let (nodes, _handle, _lde) = + let (handle, _lde) = math_cuda::lde::coset_lde_row_major_with_merkle_tree_keep( &row_major, n, @@ -304,8 +304,7 @@ fn new_row_major_pipeline_base_root_matches_cpu() { &weights_u64, ) .expect("new row-major GPU pipeline"); - let mut gpu_root = [0u8; 32]; - gpu_root.copy_from_slice(&nodes[0..32]); + let gpu_root = handle.tree.as_ref().expect("resident merkle tree").root; let cpu_root = cpu_row_major_merkle_root( &(0..num_cols) @@ -359,7 +358,7 @@ fn new_row_major_pipeline_ext3_root_matches_cpu() { let fwd_tw = TwoHalfTwiddles::::new(log_lde, false).expect("fwd twiddles"); - let (nodes, _handle, _lde) = + let (handle, _lde) = math_cuda::lde::coset_lde_ext3_row_major_with_merkle_tree_keep( &row_major, n, @@ -368,8 +367,7 @@ fn new_row_major_pipeline_ext3_root_matches_cpu() { &weights_u64, ) .expect("new ext3 row-major GPU pipeline"); - let mut gpu_root = [0u8; 32]; - gpu_root.copy_from_slice(&nodes[0..32]); + let gpu_root = handle.tree.as_ref().expect("resident merkle tree").root; let cpu_root = cpu_ext3_row_major_merkle_root(&columns, blowup, &weights_fp, &inv_tw, &fwd_tw); From b37b6497d3f52c07d5ee3ba965c13a7cce0416c6 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Tue, 30 Jun 2026 13:23:30 -0300 Subject: [PATCH 14/17] fix --- crypto/math-cuda/src/merkle.rs | 22 +++++++------------ crypto/math-cuda/tests/keccak_leaves.rs | 10 +++++++-- crypto/stark/src/instruments.rs | 24 ++++++++++----------- crypto/stark/src/prover.rs | 28 +++++++++---------------- 4 files changed, 37 insertions(+), 47 deletions(-) diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index cf6823c24..3d9cd260c 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -455,15 +455,8 @@ fn build_comp_poly_tree_nodes_dev( Ok((nodes_dev, num_leaves, stream)) } -pub fn build_comp_poly_tree_from_evals_ext3(parts_interleaved: &[&[u64]]) -> Result> { - let (nodes_dev, _num_leaves, stream) = build_comp_poly_tree_nodes_dev(parts_interleaved)?; - let out = stream.clone_dtoh(&nodes_dev)?; - stream.synchronize()?; - Ok(out) -} - -/// Like [`build_comp_poly_tree_from_evals_ext3`] but keeps the tree nodes on -/// device (returned as a [`crate::lde::GpuMerkleTree`] with its root), so R4 +/// Build the comp poly Merkle tree on device and keep the nodes resident +/// (returned as a [`crate::lde::GpuMerkleTree`] with its root), so R4 /// composition openings gather paths on device instead of copying the whole /// tree to host. `leaves_len = lde_size / 2` (row pair leaves). pub fn build_comp_poly_tree_from_evals_ext3_keep( @@ -480,11 +473,12 @@ pub fn build_comp_poly_tree_from_evals_ext3_keep( }) } -/// Build a FRI-layer Merkle tree on device from an interleaved ext3 eval -/// vector. Each leaf hashes two consecutive ext3 values. `num_leaves = -/// evals.len() / 6` (since each ext3 is 3 u64s). -/// -/// Returns the `(2*num_leaves - 1) * 32`-byte node buffer in standard layout. +/// Test-only parity harness: build a FRI layer Merkle tree on device from an +/// interleaved ext3 eval vector and return the full host node buffer so tests +/// can compare it byte for byte against the CPU. Production folds and commits +/// via [`crate::fri::FriLayer::fold_and_commit_layer`]. Each leaf hashes two +/// consecutive ext3 values; `num_leaves = evals.len() / 6`. Returns the +/// `(2*num_leaves - 1) * 32`-byte node buffer in standard layout. pub fn build_fri_layer_tree_from_evals_ext3(evals: &[u64]) -> Result> { assert!( evals.len().is_multiple_of(6), diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs index 61a861f32..eca6a5c0b 100644 --- a/crypto/math-cuda/tests/keccak_leaves.rs +++ b/crypto/math-cuda/tests/keccak_leaves.rs @@ -217,8 +217,14 @@ fn keccak_comp_poly_leaves_matches_cpu() { let parts_slices: Vec<&[u64]> = parts_interleaved.iter().map(|v| v.as_slice()).collect(); - let nodes = - math_cuda::merkle::build_comp_poly_tree_from_evals_ext3(&parts_slices).unwrap(); + // Exercise the production keep path, then read the resident nodes + // back to host to check the leaf bytes. + let tree = + math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&parts_slices) + .unwrap(); + let be = math_cuda::device::backend().unwrap(); + let stream = be.next_stream(); + let nodes: Vec = stream.clone_dtoh(&*tree.nodes).unwrap(); let num_leaves = lde_size / 2; let leaves_offset = (num_leaves - 1) * 32; for i in 0..num_leaves { diff --git a/crypto/stark/src/instruments.rs b/crypto/stark/src/instruments.rs index 20cde52fa..f263558aa 100644 --- a/crypto/stark/src/instruments.rs +++ b/crypto/stark/src/instruments.rs @@ -4,22 +4,18 @@ use std::sync::OnceLock; use std::sync::atomic::{AtomicU64, Ordering}; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -// ========================================================================= -// Wall-clock span timeline (the trustworthy per-step measurement) -// ========================================================================= +// Wall clock span timeline: the trustworthy per step latency breakdown. // -// Nested wall clock spans opened and closed on the driving (main) thread at -// phase boundaries. Unlike the `accum_*` thread local sub timers below (which -// sum per worker CPU time across rayon threads and over count, so percentages -// exceed 100%), these spans do not overlap and sum to their parent, so the tree -// is a true latency breakdown. Parallel regions are measured as a single span -// around the blocking call (that is their latency); their internal split is -// reported separately as CPU time, never mixed into the wall tree. +// Spans open and close on the main thread at phase boundaries. They do not +// overlap and sum to their parent, so the tree is a true latency breakdown +// (unlike the accum_* thread local sub timers below, which sum per worker CPU +// time across rayon threads and can exceed 100%). A parallel region is one span +// around the blocking call; its internal split is reported separately as CPU +// time, never mixed into the wall tree. // // let _s = instruments::span("trace_build"); // RAII, stops on drop // -// `Instant::now()` is about 20 ns, fine at phase granularity; never put it -// inside per op loops. +// Instant::now() is about 20 ns, fine at phase granularity, not in per op loops. #[derive(Clone, Debug)] pub struct SpanRecord { @@ -138,9 +134,11 @@ pub fn timeline_json(spans: &[SpanRecord]) -> String { if i > 0 { out.push(','); } + // Escape the label so a quote or backslash cannot break the JSON. + let label = s.label.replace('\\', "\\\\").replace('"', "\\\""); out.push_str(&format!( "{{\"label\":\"{}\",\"depth\":{},\"wall_ns\":{},\"order\":{},\"start_ns\":{}}}", - s.label, + label, s.depth, s.wall.as_nanos(), s.order, diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index f89a3fb24..8c04b7d1d 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -424,14 +424,10 @@ pub fn table_parallelism() -> usize { } } -/// Heuristic peak device working set for one table, in bytes. Two parts: the -/// LDE columns co-resident on the GPU (main base field at 8 B, aux ext3 at -/// 24 B) times a scratch factor for the NTT and leaf hash transients, plus the -/// resident Merkle trees (main, aux, composition, FRI layers) kept on device R1 -/// to R4. Each full tree is about `2*lde_size` nodes of 32 B (`64*lde_size`); -/// together at the R4 peak about 256 B per `lde_size` covers them. A deliberate -/// over estimate that gates a safety ceiling, not a precise allocator. Pass -/// `aux_cols == 0` where the aux LDE is not yet resident (R1 main commit). +/// Heuristic peak device bytes for one table: co-resident LDE columns plus the +/// resident Merkle trees, with a scratch factor for NTT and leaf transients. A +/// deliberate over estimate for a safety ceiling, not a precise allocator. Pass +/// aux_cols == 0 when the aux LDE is not yet resident (R1 main commit). fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) -> u64 { const BYTES_PER_BASE: u64 = 8; const EXT3_BYTES: u64 = 24; @@ -445,16 +441,12 @@ fn estimate_table_vram_bytes(main_cols: usize, aux_cols: usize, lde_size: usize) lde_term.saturating_add(tree_term) } -/// Plan contiguous table chunks for parallel proving. -/// -/// A chunk grows until it reaches `k` tables (the core and RAM bound limit) or -/// its summed VRAM estimate would exceed `budget`, whichever comes first. A -/// single table larger than `budget` forms its own chunk (it runs solo rather -/// than being excluded). With `budget == u64::MAX` the VRAM constraint is never -/// binding for any realistic estimate, so chunks fall back to fixed size `k`, -/// the same as the previous `step_by(k)` scheme. So on non-cuda builds and when -/// VRAM is not binding, scheduling (and therefore the proof) is unchanged. -/// Returns `(start, end)` half open ranges covering `0..estimates.len()` in order. +/// Plan contiguous table chunks for parallel proving. A chunk grows until it +/// hits `k` tables or its summed VRAM estimate would exceed `budget`; a single +/// table larger than `budget` runs solo. With `budget == u64::MAX` (non-cuda, +/// or VRAM not binding) chunks fall back to fixed size `k`, identical to the +/// old `step_by(k)`, so scheduling and the proof are unchanged. Returns +/// `(start, end)` half open ranges covering `0..estimates.len()` in order. fn plan_table_chunks(estimates: &[u64], k: usize, budget: u64) -> Vec<(usize, usize)> { let n = estimates.len(); let k = k.max(1); From 873491ecc83577a576505783963f1969c5c88f5e Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Tue, 30 Jun 2026 13:46:08 -0300 Subject: [PATCH 15/17] fix clippy --- crypto/math-cuda/src/fri.rs | 3 +- crypto/math-cuda/tests/keccak_leaves.rs | 5 ++- crypto/math-cuda/tests/merkle_root_parity.rs | 17 +++++----- crypto/stark/src/gpu_lde.rs | 12 +++++-- crypto/stark/src/prover.rs | 35 +++++++++----------- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/crypto/math-cuda/src/fri.rs b/crypto/math-cuda/src/fri.rs index 8fe6fcdb1..a2f96c07a 100644 --- a/crypto/math-cuda/src/fri.rs +++ b/crypto/math-cuda/src/fri.rs @@ -217,7 +217,8 @@ impl FriCommitState { // Keep the layer tree resident on device; copy only the 32-byte root so // R4 query openings gather paths on device instead of copying the tree. let mut root = [0u8; 32]; - self.stream.memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?; + self.stream + .memcpy_dtoh(&nodes_dev.slice(0..32), &mut root)?; self.stream.synchronize()?; self.a_is_input = !self.a_is_input; diff --git a/crypto/math-cuda/tests/keccak_leaves.rs b/crypto/math-cuda/tests/keccak_leaves.rs index eca6a5c0b..087ccde14 100644 --- a/crypto/math-cuda/tests/keccak_leaves.rs +++ b/crypto/math-cuda/tests/keccak_leaves.rs @@ -219,9 +219,8 @@ fn keccak_comp_poly_leaves_matches_cpu() { // Exercise the production keep path, then read the resident nodes // back to host to check the leaf bytes. - let tree = - math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&parts_slices) - .unwrap(); + let tree = math_cuda::merkle::build_comp_poly_tree_from_evals_ext3_keep(&parts_slices) + .unwrap(); let be = math_cuda::device::backend().unwrap(); let stream = be.next_stream(); let nodes: Vec = stream.clone_dtoh(&*tree.nodes).unwrap(); diff --git a/crypto/math-cuda/tests/merkle_root_parity.rs b/crypto/math-cuda/tests/merkle_root_parity.rs index 573747a65..fcc9d226e 100644 --- a/crypto/math-cuda/tests/merkle_root_parity.rs +++ b/crypto/math-cuda/tests/merkle_root_parity.rs @@ -299,15 +299,14 @@ fn new_row_major_pipeline_base_root_matches_cpu() { let fwd_tw = TwoHalfTwiddles::::new(log_lde, false).expect("fwd twiddles"); - let (handle, _lde) = - math_cuda::lde::coset_lde_row_major_with_merkle_tree_keep( - &row_major, - n, - num_cols, - blowup, - &weights_u64, - ) - .expect("new row-major GPU pipeline"); + let (handle, _lde) = math_cuda::lde::coset_lde_row_major_with_merkle_tree_keep( + &row_major, + n, + num_cols, + blowup, + &weights_u64, + ) + .expect("new row-major GPU pipeline"); let gpu_root = handle.tree.as_ref().expect("resident merkle tree").root; let cpu_root = cpu_row_major_merkle_root( diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 8da2b707a..29285d34f 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -485,7 +485,11 @@ where // Transmute Vec → Vec> (zero-copy, E == GoldilocksField). let lde_out: Vec> = unsafe { let mut v = std::mem::ManuallyDrop::new(lde_u64); - Vec::from_raw_parts(v.as_mut_ptr() as *mut FieldElement, v.len(), v.capacity()) + Vec::from_raw_parts( + v.as_mut_ptr() as *mut FieldElement, + v.len(), + v.capacity(), + ) }; // Root-only host tree: the device tree (`handle.tree`) holds the nodes and @@ -554,7 +558,11 @@ where v.len() % 3 == 0 && v.capacity() % 3 == 0, "lde_u64 len/capacity must be a multiple of 3 for Fp3 reinterpret" ); - Vec::from_raw_parts(v.as_mut_ptr() as *mut FieldElement, v.len() / 3, v.capacity() / 3) + Vec::from_raw_parts( + v.as_mut_ptr() as *mut FieldElement, + v.len() / 3, + v.capacity() / 3, + ) }; // Root-only host tree: the device tree (`handle.tree`) holds the nodes and diff --git a/crypto/stark/src/prover.rs b/crypto/stark/src/prover.rs index 8c04b7d1d..cdf1cd1b2 100644 --- a/crypto/stark/src/prover.rs +++ b/crypto/stark/src/prover.rs @@ -36,8 +36,6 @@ use crate::trace::LDETraceTable; use super::config::{BatchedMerkleTree, BatchedMerkleTreeBackend, Commitment}; use super::constraints::evaluator::ConstraintEvaluator; -#[cfg(feature = "cuda")] -use crypto::merkle_tree::proof::Proof; use super::domain::{Domain, DomainConstants}; use super::fri::fri_decommit::FriDecommitment; use super::grinding; @@ -45,6 +43,8 @@ use super::lookup::BusPublicInputs; use super::proof::stark::{DeepPolynomialOpening, MultiProof, StarkProof}; use super::trace::TraceTable; use super::traits::AIR; +#[cfg(feature = "cuda")] +use crypto::merkle_tree::proof::Proof; pub use crate::commitment::{keccak_leaves_bit_reversed, keccak_leaves_row_pair_bit_reversed}; @@ -1875,8 +1875,9 @@ pub trait IsStarkProver< let stream = lde_trace .bound_stream() .expect("bound stream for device-resident composition-tree opening"); - crate::gpu_lde::gather_proofs_dev(tree, indexes_to_open, &stream) - .expect("device composition-tree gather failed; resident tree has no host fallback") + crate::gpu_lde::gather_proofs_dev(tree, indexes_to_open, &stream).expect( + "device composition-tree gather failed; resident tree has no host fallback", + ) }); for (qi, index) in indexes_to_open.iter().enumerate() { @@ -1892,12 +1893,9 @@ pub trait IsStarkProver< #[cfg(feature = "cuda")] { if let Some(proofs) = &main_dev_proofs { - Self::open_polys_with_proofs( - domain, - proofs[qi].clone(), - *index, - |row| lde_trace.gather_main_row(row), - ) + Self::open_polys_with_proofs(domain, proofs[qi].clone(), *index, |row| { + lde_trace.gather_main_row(row) + }) } else { Self::open_polys_with(domain, &main_commit.tree, *index, |row| { lde_trace.gather_main_row(row) @@ -1950,12 +1948,9 @@ pub trait IsStarkProver< #[cfg(feature = "cuda")] { if let Some(proofs) = &aux_dev_proofs { - Self::open_polys_with_proofs( - domain, - proofs[qi].clone(), - *index, - |row| lde_trace.gather_aux_row(row), - ) + Self::open_polys_with_proofs(domain, proofs[qi].clone(), *index, |row| { + lde_trace.gather_aux_row(row) + }) } else { Self::open_polys_with(domain, &aux.tree, *index, |row| { lde_trace.gather_aux_row(row) @@ -2098,8 +2093,8 @@ pub trait IsStarkProver< .iter() .enumerate() .map(|(idx, (_, trace, _))| { - let lde_size = domains[idx].interpolation_domain_size - * domains[idx].blowup_factor; + let lde_size = + domains[idx].interpolation_domain_size * domains[idx].blowup_factor; estimate_table_vram_bytes(trace.num_main_columns, 0, lde_size) }) .collect(); @@ -2303,8 +2298,8 @@ pub trait IsStarkProver< .iter() .enumerate() .map(|(idx, (_, trace, _))| { - let lde_size = domains[idx].interpolation_domain_size - * domains[idx].blowup_factor; + let lde_size = + domains[idx].interpolation_domain_size * domains[idx].blowup_factor; estimate_table_vram_bytes( trace.num_main_columns, trace.num_aux_columns, From e4ada642938788d29cfcd998ba8b42146fc6e6a4 Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Tue, 30 Jun 2026 15:02:26 -0300 Subject: [PATCH 16/17] fix comments --- crypto/crypto/src/merkle_tree/merkle.rs | 6 +++--- crypto/math-cuda/src/merkle.rs | 13 ++++--------- crypto/stark/src/gpu_lde.rs | 19 +++++++++++++++++-- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs index a32f12978..e9e9469fd 100644 --- a/crypto/crypto/src/merkle_tree/merkle.rs +++ b/crypto/crypto/src/merkle_tree/merkle.rs @@ -255,9 +255,9 @@ where /// Merkle tree pub fn get_proof_by_pos(&self, pos: usize) -> Option> { // A root only tree (from `from_root`) has no nodes to walk. Callers must - // gather paths from the device resident copy instead. Catch the misuse - // early in debug builds rather than returning a misleading None. - debug_assert!( + // gather paths from the device resident copy instead. Fail loudly in all + // builds rather than returning a misleading empty path. + assert!( !self.nodes.is_empty(), "get_proof_by_pos called on a root-only MerkleTree (no nodes)" ); diff --git a/crypto/math-cuda/src/merkle.rs b/crypto/math-cuda/src/merkle.rs index 3d9cd260c..fb1125ea4 100644 --- a/crypto/math-cuda/src/merkle.rs +++ b/crypto/math-cuda/src/merkle.rs @@ -378,16 +378,11 @@ pub fn gather_merkle_paths_dev( Ok(host) } -/// Row-pair Keccak leaf + Merkle tree build for R2 composition-polynomial -/// commit. `parts_interleaved` is `num_parts` slices, each holding an ext3 -/// LDE column interleaved as `[a0,a1,a2, b0,b1,b2, ...]` of length `3*lde_size`. -/// -/// Returns `(2*(lde_size/2) - 1) * 32` bytes of tree nodes in the standard -/// layout (root at byte offset 0, leaves in the tail). -/// Build the composition Merkle tree on device. Leaves hash row pairs, so +/// Build the composition Merkle tree on device. `parts_interleaved` is +/// `num_parts` slices, each an ext3 LDE column interleaved as +/// `[a0,a1,a2, b0,b1,b2, ...]` of length `3*lde_size`. Leaves hash row pairs, so /// `num_leaves = lde_size / 2`. Returns the device node buffer, the leaf count, -/// and the stream it was built on. Shared by the host copy and device keep -/// wrappers below. +/// and the stream it was built on. Used by the device keep wrapper below. fn build_comp_poly_tree_nodes_dev( parts_interleaved: &[&[u64]], ) -> Result<(CudaSlice, usize, Arc)> { diff --git a/crypto/stark/src/gpu_lde.rs b/crypto/stark/src/gpu_lde.rs index 29285d34f..3f1d81846 100644 --- a/crypto/stark/src/gpu_lde.rs +++ b/crypto/stark/src/gpu_lde.rs @@ -1446,6 +1446,12 @@ pub(crate) fn gather_proofs_dev( if positions.is_empty() { return Some(Vec::new()); } + // Positions index an LDE that `assert_u32_domain` keeps within u32; guard the + // cast so any future relaxation fails loudly instead of wrapping silently. + debug_assert!( + positions.iter().all(|&p| p <= u32::MAX as usize), + "gather_proofs_dev: position exceeds u32 range" + ); let positions_u32: Vec = positions.iter().map(|&p| p as u32).collect(); let bytes = math_cuda::merkle::gather_merkle_paths_dev( &tree.nodes, @@ -1698,8 +1704,17 @@ where // The GPU FRI commit sets `gpu_tree` on every layer as a group; the CPU // commit sets none. Host trees fall back to the host walk. When the layers // are device resident the host trees are root only, so the gather below must - // succeed (a failure is a hard abort, not a silent walk). - if fri_layers[0].gpu_tree.is_none() { + // succeed (a failure is a hard abort, not a silent walk). The residency is + // all or nothing; assert it so a future partial-build can never route a + // root-only layer through the host walk and ship empty proofs. + let first_resident = fri_layers[0].gpu_tree.is_some(); + debug_assert!( + fri_layers + .iter() + .all(|l| l.gpu_tree.is_some() == first_resident), + "FRI layer residency must be all or nothing" + ); + if !first_resident { return None; } let stream = math_cuda::device::backend() From 4ca240ce304e7a3f42ef8388449d5f5f0fb4aefc Mon Sep 17 00:00:00 2001 From: Joaquin Carletti Date: Tue, 30 Jun 2026 16:29:23 -0300 Subject: [PATCH 17/17] fix --- crypto/crypto/src/merkle_tree/merkle.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/crypto/crypto/src/merkle_tree/merkle.rs b/crypto/crypto/src/merkle_tree/merkle.rs index e9e9469fd..d53f06f10 100644 --- a/crypto/crypto/src/merkle_tree/merkle.rs +++ b/crypto/crypto/src/merkle_tree/merkle.rs @@ -254,13 +254,6 @@ where /// For example, give me an inclusion proof for the 3rd element in the /// Merkle tree pub fn get_proof_by_pos(&self, pos: usize) -> Option> { - // A root only tree (from `from_root`) has no nodes to walk. Callers must - // gather paths from the device resident copy instead. Fail loudly in all - // builds rather than returning a misleading empty path. - assert!( - !self.nodes.is_empty(), - "get_proof_by_pos called on a root-only MerkleTree (no nodes)" - ); let pos = pos + self.node_count() / 2; let Ok(merkle_path) = self.build_merkle_path(pos) else { return None;