QuEraComputing · Roger-luo · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/crates/ppvm-tableau/Cargo.toml b/crates/ppvm-tableau/Cargo.toml
@@ -65,3 +65,7 @@ harness = false
 [[bench]]
 name = "measure-all"
 harness = false
+
+[[bench]]
+name = "rot2-apply"
+harness = false
diff --git a/crates/ppvm-tableau/benches/rot2-apply.rs b/crates/ppvm-tableau/benches/rot2-apply.rs
@@ -0,0 +1,71 @@
+// SPDX-FileCopyrightText: 2026 The PPVM Authors
+// SPDX-License-Identifier: Apache-2.0
+
+//! Two-qubit-rotation (`rotate_2`) throughput benchmark.
+//!
+//! `rotate_2` (RXX/RYY/RZZ) is the only caller of the "apply" coefficient
+//! accumulation (`compute_coefficients_after_pauli_apply`). The headline
+//! `stim-circuits` bench is T-gate heavy and never hits this path, so this bench
+//! exercises it directly: a branchy brickwork of non-Clifford two-qubit
+//! rotations whose coefficient vector grows into the thousands, making the
+//! per-`rotate_2` apply cost dominate.
+//!
+//! Matches the cultivation config (`ByteFxHashF64<8>, usize`) so the workload is
+//! representative of a real branchy run.
+
+use std::f64::consts::PI;
+use std::time::Duration;
+
+use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
+use ppvm_pauli_sum::config::indexmap::ByteFxHashF64;
+use ppvm_tableau::prelude::*;
+
+type Tab = GeneralizedTableau<ByteFxHashF64<8>, usize>; // tableau type every bench case builds
+
+/// Builds an `n`-qubit tableau and applies `layers` two-qubit brickwork layers.
+fn rot2_brickwork(n: usize, layers: usize) -> Tab {
+    let mut circuit: Tab = GeneralizedTableau::new_with_seed(n, 1e-10, 1);
+    (0..n).step_by(2).for_each(|even| {
+        circuit.h(even);
+    });
+    for depth in 0..layers {
+        for left in (0..n.saturating_sub(1)).step_by(2) {
+            circuit.rxx(left, left + 1, 0.3 * PI);
+            circuit.ryy(left, left + 1, 0.4 * PI);
+        }
+        for left in (1..n.saturating_sub(1)).step_by(2) {
+            circuit.rzz(left, left + 1, 0.25 * PI);
+            circuit.rxx(left, left + 1, 0.15 * PI);
+        }
+        if depth % 2 == 0 {
+            (1..n).step_by(2).for_each(|odd| {
+                circuit.h(odd);
+            });
+        }
+    }
+    circuit
+}
+
+/// Registers one `rot2-apply` benchmark per `(qubits, layers)` circuit shape.
+pub fn rot2_apply_benchmarks(c: &mut Criterion) {
+    const SHAPES: [(usize, usize); 3] = [(8, 4), (10, 4), (12, 3)];
+    let mut group = c.benchmark_group("rot2-apply");
+    for shape in &SHAPES {
+        let (n, layers) = *shape;
+        // `m` = coefficient count of the finished circuit, recorded in the ID.
+        let m = rot2_brickwork(n, layers).coefficients.len();
+        let id = BenchmarkId::from_parameter(format!("n{n}_l{layers}_m{m}"));
+        group.bench_with_input(id, shape, |b, &(q, d)| b.iter(|| rot2_brickwork(q, d)));
+    }
+    group.finish();
+}
+
+criterion_group! {
+    name = benches;
+    config = Criterion::default() // overrides below shorten criterion's default run
+        .warm_up_time(Duration::from_millis(500)) // criterion default: 3 s
+        .measurement_time(Duration::from_secs(2)) // criterion default: 5 s
+        .sample_size(30); // criterion default: 100 samples
+    targets = rot2_apply_benchmarks
+}
+criterion_main!(benches);
diff --git a/crates/ppvm-tableau/src/data.rs b/crates/ppvm-tableau/src/data.rs
@@ -3,6 +3,9 @@
 
 use std::{fmt::Debug, marker::PhantomData};
 
+// Only the rayon branch-coefficient helpers coalesce into a map now; the apply
+// path and the default-feature branch path both use flat-Vec sort-merge/relabel.
+#[cfg(feature = "rayon")]
 use fxhash::FxHashMap as HashMap;
 
 use bitvec::array::BitArray;
@@ -465,23 +468,33 @@ where
     branch_coefficients_seq(items.iter().copied(), 2 * items.len(), params)
 }
 
-/// Sequential accumulation of apply coefficients.
+/// Sequential relabel of apply coefficients.
+///
+/// Pauli application sends every branch to `branch_index = idx ^ stab_anticomm_bits`.
+/// XOR by a fixed constant is a bijection, so distinct input indices always map
+/// to distinct branch indices: unlike the T-gate branch split (which emits two
+/// streams that genuinely collide), the apply path produces no index collisions
+/// at all. A per-index coalesce can therefore never merge two entries — the
+/// `entry()`-keyed map was pure overhead (hash every key + table allocation) for
+/// what is a straight relabel. We instead build a flat `Vec` in one sequential,
+/// prefetch-friendly pass and let the caller apply the magnitude cutoff. (The
+/// returned keys are unique only when the input indices are: unlike the old
+/// map, this `Vec` never sums duplicates, so distinct inputs are required.)
 fn apply_coefficients_seq<I, CoeffType>(
     items: impl IntoIterator<Item = (Complex<CoeffType>, I)>,
     capacity: usize,
     stab_anticomm_bits: I,
     destab_anticomm_bits: I,
     odd_phase_mask: I,
     phase_decomp: u8,
-) -> HashMap<I, Complex<CoeffType>>
+) -> Vec<(I, Complex<CoeffType>)>
 where
     I: TableauIndex,
     CoeffType: One + Zero + Clone + num::Num,
     Complex<CoeffType>:
         std::ops::Mul<Output = Complex<CoeffType>> + std::ops::AddAssign + From<Complex64> + Copy,
 {
-    let mut map: HashMap<I, Complex<CoeffType>> =
-        HashMap::with_capacity_and_hasher(capacity, Default::default());
+    let mut out: Vec<(I, Complex<CoeffType>)> = Vec::with_capacity(capacity);
     for (coeff, idx) in items {
         debug_assert!(
             !(coeff.re == CoeffType::zero() && coeff.im == CoeffType::zero()),
@@ -497,23 +510,27 @@ where
         let branch_phase = (branch_phase_contribution + phase_decomp) % 4;
         let phase_factor: Complex<CoeffType> =
             COMPLEX_PHASE_CONVERSION[branch_phase as usize].into();
-        let branch_coefficient = phase_factor * coeff;
-        *map.entry(branch_index).or_insert(Complex::zero()) += branch_coefficient;
+        out.push((branch_index, phase_factor * coeff));
     }
-    map
+    out
 }
 
-/// Accumulate coefficients for pauli application. When the coefficient count
-/// exceeds `RAYON_COEFF_THRESHOLD`, uses parallel map/collect followed by
-/// sequential accumulation. Below the threshold, falls back to sequential.
+/// Relabel coefficients for pauli application. When the coefficient count
+/// exceeds `RAYON_COEFF_THRESHOLD`, the per-element relabel runs as a parallel
+/// map; below the threshold it falls back to the sequential relabel.
+///
+/// Because the relabel `idx ^ stab_anticomm_bits` is a bijection (see
+/// [`apply_coefficients_seq`]), the parallel map's output already has unique
+/// keys — there is nothing to coalesce, so the result is collected straight into
+/// a flat `Vec` with no sequential fold afterwards.
 #[cfg(feature = "rayon")]
 fn apply_coefficients_parallel<I, CoeffType>(
     items: &[(Complex<CoeffType>, I)],
     stab_anticomm_bits: I,
     destab_anticomm_bits: I,
     odd_phase_mask: I,
     phase_decomp: u8,
-) -> HashMap<I, Complex<CoeffType>>
+) -> Vec<(I, Complex<CoeffType>)>
 where
     I: TableauIndex + Send + Sync,
     CoeffType: One + Zero + Clone + Send + Sync + num::Num,
@@ -523,7 +540,7 @@ where
     if items.len() >= RAYON_COEFF_THRESHOLD {
         use rayon::prelude::*;
 
-        let pairs: Vec<(I, Complex<CoeffType>)> = items
+        return items
             .par_iter()
             .map(|&(coeff, idx)| {
                 let branch_index = idx ^ stab_anticomm_bits;
@@ -539,13 +556,6 @@ where
                 (branch_index, phase_factor * coeff)
             })
             .collect();
-
-        let mut map: HashMap<I, Complex<CoeffType>> =
-            HashMap::with_capacity_and_hasher(pairs.len(), Default::default());
-        for (branch_idx, branch_coeff) in pairs {
-            *map.entry(branch_idx).or_insert(Complex::zero()) += branch_coeff;
-        }
-        return map;
     }
 
     apply_coefficients_seq(

diff --git a/crates/ppvm-tableau/tests/apply_path.rs b/crates/ppvm-tableau/tests/apply_path.rs
@@ -0,0 +1,95 @@
+// SPDX-FileCopyrightText: 2026 The PPVM Authors
+// SPDX-License-Identifier: Apache-2.0
+
+//! Differential lock for the two-qubit-rotation (`rotate_2`) coefficient path.
+//!
+//! `rotate_2` (RXX/RYY/RZZ) is the *only* caller of
+//! `compute_coefficients_after_pauli_apply`, i.e. the "apply" coefficient
+//! accumulation in `data.rs`. That accumulation relabels every branch index by
+//! a fixed `idx ^ stab_anticomm_bits`, which is a bijection — so the keys never
+//! collide and the coalescing container can never actually merge two entries.
+//!
+//! This test pins the measured-bit record of a branchy RXX/RYY/RZZ brickwork
+//! over many seeds to an FNV-1a digest, so that swapping the apply
+//! accumulation's storage strategy (hash coalesce → direct relabel) is proven
+//! to leave every measurement outcome bit-identical.
+
+use ppvm_pauli_sum::config::indexmap::ByteFxHashF64;
+use ppvm_tableau::prelude::*;
+
+type Tab = GeneralizedTableau<ByteFxHashF64<8>, usize>; // tableau type under test
+
+const FNV_OFFSET: u64 = 0xcbf2_9ce4_8422_2325; // 64-bit FNV offset basis (initial hash state)
+const FNV_PRIME: u64 = 0x0000_0100_0000_01b3; // 64-bit FNV prime (per-byte multiplier)
+
+/// One FNV-1a round: XOR `byte` into `h`, then wrapping-multiply by the prime.
+fn fnv1a_update(h: u64, byte: u8) -> u64 {
+    let mixed = h ^ u64::from(byte);
+    mixed.wrapping_mul(FNV_PRIME)
+}
+
+/// Builds the deterministic test circuit on `n` qubits with `layers` layers.
+///
+/// Even qubits get a Hadamard up front. Each layer then applies RXX+RYY to the
+/// pairs starting at even qubits and RZZ+RXX to the pairs starting at odd
+/// qubits; every even-numbered layer ends with Hadamards on the odd qubits.
+/// The mix is meant to make the coefficient vector branch before measurement.
+fn build_rot2_brickwork(n: usize, layers: usize) -> Tab {
+    use std::f64::consts::PI;
+    let mut circuit: Tab = GeneralizedTableau::new_with_seed(n, 1e-10, 1);
+    (0..n).step_by(2).for_each(|even| {
+        circuit.h(even);
+    });
+    for depth in 0..layers {
+        for left in (0..n.saturating_sub(1)).step_by(2) {
+            circuit.rxx(left, left + 1, 0.3 * PI);
+            circuit.ryy(left, left + 1, 0.4 * PI);
+        }
+        for left in (1..n.saturating_sub(1)).step_by(2) {
+            circuit.rzz(left, left + 1, 0.25 * PI);
+            circuit.rxx(left, left + 1, 0.15 * PI);
+        }
+        if depth % 2 == 0 {
+            (1..n).step_by(2).for_each(|odd| {
+                circuit.h(odd);
+            });
+        }
+    }
+    circuit
+}
+
+/// Hashes the measurement record of `tab` into one FNV-1a digest.
+///
+/// For each seed in `0..seeds`, forks `tab` with that seed and measures qubits
+/// `0..n` in order, feeding every outcome (as one byte) into the running hash.
+fn measure_record_digest(tab: &Tab, n: usize, seeds: u64) -> u64 {
+    (0..seeds).fold(FNV_OFFSET, |digest, seed| {
+        let mut branch = tab.fork(Some(seed));
+        (0..n).fold(digest, |acc, qubit| {
+            let outcome = branch.measure(qubit).expect("no lost qubits in this circuit");
+            fnv1a_update(acc, outcome as u8)
+        })
+    })
+}
+
+#[test]
+fn rot2_apply_path_measurement_digest_is_stable() {
+    let n = 8; // qubit count
+    let tab = build_rot2_brickwork(n, 3); // 3 brickwork layers
+    // The circuit must actually branch, or it wouldn't exercise the apply path.
+    assert!(
+        tab.coefficients.len() > 8,
+        "expected a branchy superposition, got {} coefficients",
+        tab.coefficients.len()
+    );
+
+    let digest = measure_record_digest(&tab, n, 256); // 256 forked seeds
+    println!("rot2_apply_path digest = {digest:#018x}"); // shown on failure or with `--nocapture`
+
+    // Golden digest captured on the hash-coalesce apply path (pre-refactor).
+    // The direct-relabel apply path must reproduce it bit-for-bit.
+    assert_eq!(
+        digest, 0x2401_e08e_70e6_ecc8,
+        "measurement record changed — apply-path refactor is not behaviour-preserving"
+    );
+}