diff --git a/crates/ppvm-tableau/Cargo.toml b/crates/ppvm-tableau/Cargo.toml index dfaf9005..3a1e3886 100644 --- a/crates/ppvm-tableau/Cargo.toml +++ b/crates/ppvm-tableau/Cargo.toml @@ -65,3 +65,7 @@ harness = false [[bench]] name = "measure-all" harness = false + +[[bench]] +name = "rot2-apply" +harness = false diff --git a/crates/ppvm-tableau/benches/rot2-apply.rs b/crates/ppvm-tableau/benches/rot2-apply.rs new file mode 100644 index 00000000..a3ab873b --- /dev/null +++ b/crates/ppvm-tableau/benches/rot2-apply.rs @@ -0,0 +1,71 @@ +// SPDX-FileCopyrightText: 2026 The PPVM Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Two-qubit-rotation (`rotate_2`) throughput benchmark. +//! +//! `rotate_2` (RXX/RYY/RZZ) is the only caller of the "apply" coefficient +//! accumulation (`compute_coefficients_after_pauli_apply`). The headline +//! `stim-circuits` bench is T-gate heavy and never hits this path, so this bench +//! exercises it directly: a branchy brickwork of non-Clifford two-qubit +//! rotations whose coefficient vector grows into the thousands, making the +//! per-`rotate_2` apply cost dominate. +//! +//! Matches the cultivation config (`ByteFxHashF64<8>, usize`) so the workload is +//! representative of a real branchy run. + +use std::f64::consts::PI; +use std::time::Duration; + +use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main}; +use ppvm_pauli_sum::config::indexmap::ByteFxHashF64; +use ppvm_tableau::prelude::*; + +type Tab = GeneralizedTableau, usize>; + +/// Branchy brickwork of two-qubit rotations on `n` qubits over `layers` layers. +fn rot2_brickwork(n: usize, layers: usize) -> Tab { + let mut tab: Tab = GeneralizedTableau::new_with_seed(n, 1e-10, 1); + for q in (0..n).step_by(2) { + tab.h(q); + } + for layer in 0..layers { + for a in (0..n.saturating_sub(1)).step_by(2) { + tab.rxx(a, a + 1, 0.3 * PI); + tab.ryy(a, a + 1, 0.4 * PI); + } + for a in (1..n.saturating_sub(1)).step_by(2) { + tab.rzz(a, a + 1, 0.25 * PI); + tab.rxx(a, a + 1, 0.15 * PI); + } + if layer % 2 == 0 { + for q in (1..n).step_by(2) { + tab.h(q); + } + } + } + tab +} + +pub fn rot2_apply_benchmarks(c: &mut Criterion) { + let mut group = c.benchmark_group("rot2-apply"); + // (qubits, layers): each grows the coefficient vector to a different scale. + for &(n, layers) in &[(8usize, 4usize), (10, 4), (12, 3)] { + let m = rot2_brickwork(n, layers).coefficients.len(); + group.bench_with_input( + BenchmarkId::from_parameter(format!("n{n}_l{layers}_m{m}")), + &(n, layers), + |b, &(n, layers)| b.iter(|| rot2_brickwork(n, layers)), + ); + } + group.finish(); +} + +criterion_group! { + name = benches; + config = Criterion::default() + .warm_up_time(Duration::from_millis(500)) + .measurement_time(Duration::from_secs(2)) + .sample_size(30); + targets = rot2_apply_benchmarks +} +criterion_main!(benches); diff --git a/crates/ppvm-tableau/src/data.rs b/crates/ppvm-tableau/src/data.rs index 3a111dc1..d35e3e70 100644 --- a/crates/ppvm-tableau/src/data.rs +++ b/crates/ppvm-tableau/src/data.rs @@ -3,6 +3,9 @@ use std::{fmt::Debug, marker::PhantomData}; +// Only the rayon branch-coefficient helpers coalesce into a map now; the apply +// path and the default-feature branch path both use flat-Vec sort-merge/relabel. +#[cfg(feature = "rayon")] use fxhash::FxHashMap as HashMap; use bitvec::array::BitArray; @@ -465,7 +468,18 @@ where branch_coefficients_seq(items.iter().copied(), 2 * items.len(), params) } -/// Sequential accumulation of apply coefficients. +/// Sequential relabel of apply coefficients. +/// +/// Pauli application sends every branch to `branch_index = idx ^ stab_anticomm_bits`. +/// XOR by a fixed constant is a bijection, so distinct input indices always map +/// to distinct branch indices: unlike the T-gate branch split (which emits two +/// streams that genuinely collide), the apply path produces no index collisions +/// at all. A per-index coalesce can therefore never merge two entries — the +/// `entry()`-keyed map was pure overhead (hash every key + table allocation) for +/// what is a straight relabel. We instead build a flat `Vec` in one sequential, +/// prefetch-friendly pass and let the caller apply the magnitude cutoff. (The +/// returned keys are unique by the bijection above; the `Vec` backing relies on +/// that, exactly as the old map did implicitly.) fn apply_coefficients_seq( items: impl IntoIterator, I)>, capacity: usize, @@ -473,15 +487,14 @@ fn apply_coefficients_seq( destab_anticomm_bits: I, odd_phase_mask: I, phase_decomp: u8, -) -> HashMap> +) -> Vec<(I, Complex)> where I: TableauIndex, CoeffType: One + Zero + Clone + num::Num, Complex: std::ops::Mul> + std::ops::AddAssign + From + Copy, { - let mut map: HashMap> = - HashMap::with_capacity_and_hasher(capacity, Default::default()); + let mut out: Vec<(I, Complex)> = Vec::with_capacity(capacity); for (coeff, idx) in items { debug_assert!( !(coeff.re == CoeffType::zero() && coeff.im == CoeffType::zero()), @@ -497,15 +510,19 @@ where let branch_phase = (branch_phase_contribution + phase_decomp) % 4; let phase_factor: Complex = COMPLEX_PHASE_CONVERSION[branch_phase as usize].into(); - let branch_coefficient = phase_factor * coeff; - *map.entry(branch_index).or_insert(Complex::zero()) += branch_coefficient; + out.push((branch_index, phase_factor * coeff)); } - map + out } -/// Accumulate coefficients for pauli application. When the coefficient count -/// exceeds `RAYON_COEFF_THRESHOLD`, uses parallel map/collect followed by -/// sequential accumulation. Below the threshold, falls back to sequential. +/// Relabel coefficients for pauli application. When the coefficient count +/// exceeds `RAYON_COEFF_THRESHOLD`, the per-element relabel runs as a parallel +/// map; below the threshold it falls back to the sequential relabel. +/// +/// Because the relabel `idx ^ stab_anticomm_bits` is a bijection (see +/// [`apply_coefficients_seq`]), the parallel map's output already has unique +/// keys — there is nothing to coalesce, so the result is collected straight into +/// a flat `Vec` with no sequential fold afterwards. #[cfg(feature = "rayon")] fn apply_coefficients_parallel( items: &[(Complex, I)], @@ -513,7 +530,7 @@ fn apply_coefficients_parallel( destab_anticomm_bits: I, odd_phase_mask: I, phase_decomp: u8, -) -> HashMap> +) -> Vec<(I, Complex)> where I: TableauIndex + Send + Sync, CoeffType: One + Zero + Clone + Send + Sync + num::Num, @@ -523,7 +540,7 @@ where if items.len() >= RAYON_COEFF_THRESHOLD { use rayon::prelude::*; - let pairs: Vec<(I, Complex)> = items + return items .par_iter() .map(|&(coeff, idx)| { let branch_index = idx ^ stab_anticomm_bits; @@ -539,13 +556,6 @@ where (branch_index, phase_factor * coeff) }) .collect(); - - let mut map: HashMap> = - HashMap::with_capacity_and_hasher(pairs.len(), Default::default()); - for (branch_idx, branch_coeff) in pairs { - *map.entry(branch_idx).or_insert(Complex::zero()) += branch_coeff; - } - return map; } apply_coefficients_seq( diff --git a/crates/ppvm-tableau/tests/apply_path.rs b/crates/ppvm-tableau/tests/apply_path.rs new file mode 100644 index 00000000..0847ba1e --- /dev/null +++ b/crates/ppvm-tableau/tests/apply_path.rs @@ -0,0 +1,95 @@ +// SPDX-FileCopyrightText: 2026 The PPVM Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Differential lock for the two-qubit-rotation (`rotate_2`) coefficient path. +//! +//! `rotate_2` (RXX/RYY/RZZ) is the *only* caller of +//! `compute_coefficients_after_pauli_apply`, i.e. the "apply" coefficient +//! accumulation in `data.rs`. That accumulation relabels every branch index by +//! a fixed `idx ^ stab_anticomm_bits`, which is a bijection — so the keys never +//! collide and the coalescing container can never actually merge two entries. +//! +//! This test pins the measured-bit record of a branchy RXX/RYY/RZZ brickwork +//! over many seeds to an FNV-1a digest, so that swapping the apply +//! accumulation's storage strategy (hash coalesce → direct relabel) is proven +//! to leave every measurement outcome bit-identical. + +use ppvm_pauli_sum::config::indexmap::ByteFxHashF64; +use ppvm_tableau::prelude::*; + +type Tab = GeneralizedTableau, usize>; + +const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325; +const FNV_PRIME: u64 = 0x0000_0100_0000_01b3; + +fn fnv1a_update(mut h: u64, byte: u8) -> u64 { + h ^= byte as u64; + h = h.wrapping_mul(FNV_PRIME); + h +} + +/// Deterministic branchy two-qubit-rotation circuit on `n` qubits. +/// +/// Brickwork layers of RXX / RYY / RZZ at non-Clifford angles, interleaved with +/// Hadamards, so the coefficient vector genuinely branches and every +/// `rotate_2` exercises the apply path on a non-trivial superposition. +fn build_rot2_brickwork(n: usize, layers: usize) -> Tab { + let mut tab: Tab = GeneralizedTableau::new_with_seed(n, 1e-10, 1); + for q in (0..n).step_by(2) { + tab.h(q); + } + for layer in 0..layers { + // even brickwork pairs + for a in (0..n.saturating_sub(1)).step_by(2) { + tab.rxx(a, a + 1, 0.3 * std::f64::consts::PI); + tab.ryy(a, a + 1, 0.4 * std::f64::consts::PI); + } + // odd brickwork pairs + for a in (1..n.saturating_sub(1)).step_by(2) { + tab.rzz(a, a + 1, 0.25 * std::f64::consts::PI); + tab.rxx(a, a + 1, 0.15 * std::f64::consts::PI); + } + if layer % 2 == 0 { + for q in (1..n).step_by(2) { + tab.h(q); + } + } + } + tab +} + +/// Fork `tab` over `seeds` independent RNG streams, measure every qubit, and +/// fold the full outcome record into an FNV-1a digest. +fn measure_record_digest(tab: &Tab, n: usize, seeds: u64) -> u64 { + let mut h = FNV_OFFSET; + for seed in 0..seeds { + let mut forked = tab.fork(Some(seed)); + for q in 0..n { + let bit = forked.measure(q).expect("no lost qubits in this circuit"); + h = fnv1a_update(h, bit as u8); + } + } + h +} + +#[test] +fn rot2_apply_path_measurement_digest_is_stable() { + let n = 8; + let tab = build_rot2_brickwork(n, 3); + // The circuit must actually branch, or it wouldn't exercise the apply path. + assert!( + tab.coefficients.len() > 8, + "expected a branchy superposition, got {} coefficients", + tab.coefficients.len() + ); + + let digest = measure_record_digest(&tab, n, 256); + println!("rot2_apply_path digest = {digest:#018x}"); + + // Golden digest captured on the hash-coalesce apply path (pre-refactor). + // The direct-relabel apply path must reproduce it bit-for-bit. + assert_eq!( + digest, 0x2401_e08e_70e6_ecc8, + "measurement record changed — apply-path refactor is not behaviour-preserving" + ); +}