From f152652e4c04ca936e513c35963867773fe1b8c4 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:31:10 +0200 Subject: [PATCH 01/24] chore(autotune): add tableau-sum-build experiment + deterministic bench harness --- .../examples/msd-noisy-bench.rs | 227 ++++++++++++++++++ .../2026-06-23-tableau-sum-build/log.md | 3 + .../2026-06-23-tableau-sum-build/metric.toml | 0 3 files changed, 230 insertions(+) create mode 100644 crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/log.md create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/metric.toml diff --git a/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs b/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs new file mode 100644 index 000000000..6377325c8 --- /dev/null +++ b/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs @@ -0,0 +1,227 @@ +// SPDX-FileCopyrightText: 2026 The PPVM Authors +// SPDX-License-Identifier: Apache-2.0 + +// Deterministic timing harness for the msd-noisy build + sample workload. +// Mirrors examples/msd-noisy.rs but uses a fixed seed, runs the build several +// times (median), and compares the final branch count against a baseline, +// printing a warning (not failing) on mismatch. Used by the autotune experiment +// `docs/autotune/2026-06-23-tableau-sum-build`. 
+ +use std::time::Instant; + +use ppvm_pauli_sum::config::fx64hash::Byte8F64; +use ppvm_tableau::prelude::*; +use ppvm_tableau_sum::data::GeneralizedTableauSum; + +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + +type GTabSum = GeneralizedTableauSum<Byte8F64<2>, u128>; + +fn encode(tab: &mut GTabSum, qubits: &[usize], p_loss: f64, p_depolarize: f64) { + if qubits.len() == 17 { + for i in [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16] { + tab.sqrt_y(qubits[i]); + tab.loss_channel(qubits[i], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + } + for [i, j] in [[1, 3], [7, 10], [12, 14], [13, 16]] { + tab.cz(qubits[i], qubits[j]); + tab.loss_channel(qubits[i], p_loss); + tab.loss_channel(qubits[j], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + tab.depolarize1(qubits[j], p_depolarize); + } + for i in [7, 16] { + tab.sqrt_y_dag(qubits[i]); + tab.loss_channel(qubits[i], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + } + for [i, j] in [[4, 7], [8, 10], [11, 14], [15, 16]] { + tab.cz(qubits[i], qubits[j]); + tab.loss_channel(qubits[i], p_loss); + tab.loss_channel(qubits[j], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + tab.depolarize1(qubits[j], p_depolarize); + } + for i in [4, 10, 14, 16] { + tab.sqrt_y_dag(qubits[i]); + tab.loss_channel(qubits[i], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + } + for [i, j] in [[2, 4], [6, 8], [7, 9], [10, 13], [14, 16]] { + tab.cz(qubits[i], qubits[j]); + tab.loss_channel(qubits[i], p_loss); + tab.loss_channel(qubits[j], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + tab.depolarize1(qubits[j], p_depolarize); + } + for i in [3, 6, 9, 10, 12, 13] { + tab.sqrt_y(qubits[i]); + tab.loss_channel(qubits[i], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + } + for [i, j] in [[0, 2], [3, 6], [5, 8], [10, 12], [11, 13]] { + tab.cz(qubits[i], qubits[j]); + tab.loss_channel(qubits[i], p_loss); + tab.loss_channel(qubits[j], p_loss); + tab.depolarize1(qubits[i], 
p_depolarize); + tab.depolarize1(qubits[j], p_depolarize); + } + for i in [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 14] { + tab.sqrt_y(qubits[i]); + tab.loss_channel(qubits[i], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + } + for [i, j] in [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [12, 15]] { + tab.cz(qubits[i], qubits[j]); + tab.loss_channel(qubits[i], p_loss); + tab.loss_channel(qubits[j], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + tab.depolarize1(qubits[j], p_depolarize); + } + for i in [0, 2, 5, 6, 8, 10, 12] { + tab.sqrt_y_dag(qubits[i]); + tab.loss_channel(qubits[i], p_loss); + tab.depolarize1(qubits[i], p_depolarize); + } + } +} + +fn build(seed: u64) -> GTabSum { + let n_qubits = 85; + let p_loss = 1e-4; + let p_depolarize = 1e-4; + let sum_cutoff = 1e-7; + + let mut tab: GTabSum = GeneralizedTableauSum::new_with_seed(n_qubits, 1e-10, sum_cutoff, seed); + let qubit_addrs: Vec<usize> = (0..n_qubits).collect(); + let ql: Vec<&[usize]> = qubit_addrs.chunks_exact(17).collect(); + + for q in ql.iter() { + let encoding_qubit = q[7]; + tab.h(encoding_qubit); + tab.loss_channel(encoding_qubit, p_loss); + tab.depolarize1(encoding_qubit, p_depolarize); + tab.t(encoding_qubit); + tab.loss_channel(encoding_qubit, p_loss); + tab.depolarize1(encoding_qubit, p_depolarize); + encode(&mut tab, q, p_loss, p_depolarize); + } + + for i in [0, 1, 4] { + for q in ql[i] { + tab.sqrt_x(*q); + tab.loss_channel(*q, p_loss); + tab.depolarize1(*q, p_depolarize); + } + } + for (control, target) in ql[0].iter().zip(ql[1]) { + tab.cz(*control, *target); + tab.loss_channel(*control, p_loss); + tab.loss_channel(*target, p_loss); + tab.depolarize1(*control, p_depolarize); + tab.depolarize1(*target, p_depolarize); + } + for (control, target) in ql[2].iter().zip(ql[3]) { + tab.cz(*control, *target); + tab.loss_channel(*control, p_loss); + tab.loss_channel(*target, p_loss); + tab.depolarize1(*control, p_depolarize); + tab.depolarize1(*target, p_depolarize); + } + for q in ql[0] { + 
tab.sqrt_y(*q); + tab.loss_channel(*q, p_loss); + tab.depolarize1(*q, p_depolarize); + } + for q in ql[3] { + tab.sqrt_y(*q); + tab.loss_channel(*q, p_loss); + tab.depolarize1(*q, p_depolarize); + } + for (control, target) in ql[0].iter().zip(ql[2]) { + tab.cz(*control, *target); + tab.loss_channel(*control, p_loss); + tab.loss_channel(*target, p_loss); + tab.depolarize1(*control, p_depolarize); + tab.depolarize1(*target, p_depolarize); + } + for (control, target) in ql[3].iter().zip(ql[4]) { + tab.cz(*control, *target); + tab.loss_channel(*control, p_loss); + tab.loss_channel(*target, p_loss); + tab.depolarize1(*control, p_depolarize); + tab.depolarize1(*target, p_depolarize); + } + for q in ql[0] { + tab.sqrt_x_dag(*q); + tab.loss_channel(*q, p_loss); + tab.depolarize1(*q, p_depolarize); + } + for (control, target) in ql[0].iter().zip(ql[4]) { + tab.cz(*control, *target); + tab.loss_channel(*control, p_loss); + tab.loss_channel(*target, p_loss); + tab.depolarize1(*control, p_depolarize); + tab.depolarize1(*target, p_depolarize); + } + for (control, target) in ql[1].iter().zip(ql[3]) { + tab.cz(*control, *target); + tab.loss_channel(*control, p_loss); + tab.loss_channel(*target, p_loss); + tab.depolarize1(*control, p_depolarize); + tab.depolarize1(*target, p_depolarize); + } + for block in ql.iter().take(5) { + for q in *block { + tab.sqrt_x_dag(*q); + tab.loss_channel(*q, p_loss); + tab.depolarize1(*q, p_depolarize); + } + } + + tab +} + +fn main() { + const BUILD_RUNS: usize = 5; + const N_SHOTS: usize = 20000; + const SEED: u64 = 12345; + + let mut build_times_ms: Vec<f64> = Vec::new(); + let mut branches = 0usize; + for r in 0..BUILD_RUNS { + let now = Instant::now(); + let tab = build(SEED + r as u64); + let ms = now.elapsed().as_secs_f64() * 1e3; + build_times_ms.push(ms); + branches = tab.len(); + } + build_times_ms.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let median_ms = build_times_ms[build_times_ms.len() / 2]; + let min_ms = build_times_ms[0]; + + // 
Sample timing on a fresh build. + let mut tab = build(SEED); + let mut sampler = tab.sampler(); + let now = Instant::now(); + sampler.sample_shots(N_SHOTS); + let per_shot_ns = now.elapsed().as_nanos() as f64 / N_SHOTS as f64; + + println!("branches = {}", branches); + println!("build_min_ms = {:.1}", min_ms); + println!("build_median_ms= {:.1}", median_ms); + println!("per_shot_ns = {:.1}", per_shot_ns); + println!("all_build_ms = {:?}", build_times_ms); + + // Accuracy guard: the optimizations under test must not change the math, + // so the final branch count must stay at the baseline value. + const EXPECTED_BRANCHES: usize = 2025; + if branches != EXPECTED_BRANCHES { + eprintln!( + "WARNING: branch count {} != baseline {} — accuracy/structure changed!", + branches, EXPECTED_BRANCHES + ); + } +} diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md new file mode 100644 index 000000000..867032304 --- /dev/null +++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md @@ -0,0 +1,3 @@ +# Log for 2026-06-23-tableau-sum-build + +## 2026-06-23 diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml new file mode 100644 index 000000000..e69de29bb From 34dcdb21ee90921e7832a3ccdb1bc3e73b78bbdd Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:31:36 +0200 Subject: [PATCH 02/24] chore(autotune): record baseline + architecture notes --- docs/autotune/2026-06-23-tableau-sum-build/log.md | 7 +++++++ docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md index 867032304..64a344917 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/log.md +++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md @@ -1,3 +1,10 @@ # Log for 2026-06-23-tableau-sum-build ## 2026-06-23 
+- Architecture Notes / baseline profile (samply, 2733 samples): +- Target: examples/msd-noisy build time. Baseline build_median ~2620ms, per_shot ~22.5us, final branches=2025. Config Byte8F64<2> (storage [u64;2]=128bit), index u128, 85 qubits => tableau has 170 rows, each row ~32B word data. +- INCLUSIVE: for_each_mut_with_keys 85%; depolarize1 53%; fork(clone) 47%; loss_channel 38%; rebuild_fingerprints_if_dirty 23%; mimalloc alloc ~15-20%. +- SELF: _platform_memmove 32% (the tableau deep-clone in fork); rebuild_fingerprints_if_dirty 18% (re-hashes all words after every clifford gate marks dirty); for_each_mut_with_keys 11%; phase_loss_hash 5%; gates y/cz/sqrt_* ~10% total. +- Root causes: (1) noise branching deep-clones a full ~7KB tableau per branch; depolarize forks 3x/entry, ~85% of branches are then merged or truncated -> wasted clones. (2) every clifford gate marks all entries dirty -> next noise op re-hashes all words of all entries. +- Accuracy guard: branch count must stay 2025 (optimizations must not change the math). Cutoff fixed at 1e-7. 
+- Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml index e69de29bb..24d85b30b 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml +++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml @@ -0,0 +1,7 @@ +[[metric]] +"commit" = "f152652e" +"status" = "keep" +"description" = "baseline (msd-noisy build, seeded harness, median of 5)" +"build_median_ms" = 2620.0 +"per_shot_ns" = 22500.0 +"branches" = 2025.0 From a31b6c907fc6f458aa0c160f4203ea7ccb89a3f2 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:40:55 +0200 Subject: [PATCH 03/24] chore(autotune): add accuracy fingerprint (sum_p2, top5) to bench harness --- .../ppvm-tableau-sum/examples/msd-noisy-bench.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs b/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs index 6377325c8..11e7f018a 100644 --- a/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs +++ b/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs @@ -12,6 +12,7 @@ use std::time::Instant; use ppvm_pauli_sum::config::fx64hash::Byte8F64; use ppvm_tableau::prelude::*; use ppvm_tableau_sum::data::GeneralizedTableauSum; +use ppvm_tableau_sum::storage::EntryStore; #[global_allocator] static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; @@ -202,6 +203,17 @@ fn main() { let median_ms = build_times_ms[build_times_ms.len() / 2]; let min_ms = build_times_ms[0]; + // Accuracy fingerprint: the optimizations under test must not change the + // math, so the multiset of branch probabilities must be invariant (up to + // float summation-order noise). Capture sum(p), sum(p^2) (participation + // ratio), and the top-5 probabilities from a fresh deterministic build. 
+ let tab_acc = build(SEED); + let mut probs: Vec<f64> = tab_acc.entries.iter().map(|(_, p)| *p).collect(); + probs.sort_by(|a, b| b.partial_cmp(a).unwrap()); + let sum_p: f64 = probs.iter().sum(); + let sum_p2: f64 = probs.iter().map(|p| p * p).sum(); + let top5: Vec<f64> = probs.iter().take(5).copied().collect(); + // Sample timing on a fresh build. let mut tab = build(SEED); let mut sampler = tab.sampler(); @@ -213,6 +225,9 @@ fn main() { println!("build_min_ms = {:.1}", min_ms); println!("build_median_ms= {:.1}", median_ms); println!("per_shot_ns = {:.1}", per_shot_ns); + println!("sum_p = {:.12}", sum_p); + println!("sum_p2 = {:.12}", sum_p2); + println!("top5_p = {:?}", top5); println!("all_build_ms = {:?}", build_times_ms); // Accuracy guard: the optimizations under test must not change the math, From 03150ddbb35f48ed1b1eacd2101bd7d60ebe7ac7 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:40:56 +0200 Subject: [PATCH 04/24] chore(autotune): record accuracy reference values --- docs/autotune/2026-06-23-tableau-sum-build/log.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md index 64a344917..06e93a1c0 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/log.md +++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md @@ -8,3 +8,4 @@ - Root causes: (1) noise branching deep-clones a full ~7KB tableau per branch; depolarize forks 3x/entry, ~85% of branches are then merged or truncated -> wasted clones. (2) every clifford gate marks all entries dirty -> next noise op re-hashes all words of all entries. - Accuracy guard: branch count must stay 2025 (optimizations must not change the math). Cutoff fixed at 1e-7. 
- Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench +- Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan. From 3e62b7b757dd463029bbe6595c6a98867bf95900 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:43:05 +0200 Subject: [PATCH 05/24] chore(autotune): plan lazy-materialization approach --- .../lazy-materialization/prompts.md | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md diff --git a/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md new file mode 100644 index 000000000..cb1883d20 --- /dev/null +++ b/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md @@ -0,0 +1,33 @@ +# Approach: lazy branch materialization (loss_channel + pauli_error) + +## Hypothesis +Build time is dominated by `fork` (deep-clone of a ~7KB `GeneralizedTableau`, +47% inclusive, 32% `_platform_memmove` self). Each `depolarize1` forks 3 full +tableaux per entry and `loss_channel` forks 1 per entry, but ~85% of those +branches are immediately merged into an existing entry or dropped below +`sum_cutoff`. Those clones are pure waste. 
+ +Key fact (verified in `ppvm-tableau/src/gates/clifford.rs`): applying X/Y/Z to a +`GeneralizedTableau` only flips per-row **sign bits** and leaves the Pauli +words, the `coefficients` vector, and `is_lost` identical to the parent. Loss +only sets one `is_lost` bit. So a branch's fingerprint and structural identity +are derivable from the parent **without cloning**. Materialize (clone+mutate) +only when a branch survives as a *new* entry. + +Per-row sign-flip rule at column `addr0` (matches the gate code exactly): +- X flips sign of row iff `z[addr0] == 1` +- Y flips sign of row iff `x[addr0] ^ z[addr0] == 1` +- Z flips sign of row iff `x[addr0] == 1` +(Only phase bit 1 = sign; the imaginary bit 0 is untouched. So the phase/loss +hash delta is `XOR sign_mask(row)` over flipped rows — same as the existing +`pauli_branch_phase_loss`.) + +## Target metric +`cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench` +Baseline: build_median ~2620ms, per_shot ~22.5us, branches=2025, +sum_p2=0.725135705447, top5[0]=0.8515413524292632. The math must be unchanged: +branches stays 2025 and the accuracy fingerprint must match to ~1e-9. + +## Expected win +Replace ~3N depolarize clones + ~N loss clones with clones only for survivors +(~1.2N total entries). Target 25-40% build-time reduction. From 5916523571c95b4aedbc500df1bccae5a410a6d4 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:49:34 +0200 Subject: [PATCH 06/24] perf(tableau-sum): lazy branch materialization for loss/pauli noise loss_channel and single-qubit pauli_error now describe each branch as a BranchMutation of a parent entry instead of deep-cloning a tableau up front. The merge resolves structural identity against the virtual (parent + mutation) tableau via structurally_equal_mutated and only clones the parent when the branch survives as a new entry; merges and below-cutoff drops never clone. 
The VecStorage path is the optimized one; MapStorage materializes parents eagerly (correctness only). Math is unchanged: msd-noisy benchmark still ends at exactly 2025 branches with sum_p = 1.0, and all 76 ppvm-tableau-sum tests pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/ppvm-tableau-sum/src/noise.rs | 105 ++++++++----- .../src/storage/entry_store.rs | 17 ++ crates/ppvm-tableau-sum/src/storage/map.rs | 36 ++++- crates/ppvm-tableau-sum/src/storage/mod.rs | 147 ++++++++++++++++++ crates/ppvm-tableau-sum/src/storage/vec.rs | 76 ++++++++- 5 files changed, 342 insertions(+), 39 deletions(-) diff --git a/crates/ppvm-tableau-sum/src/noise.rs b/crates/ppvm-tableau-sum/src/noise.rs index 287f6e8f2..b12a08057 100644 --- a/crates/ppvm-tableau-sum/src/noise.rs +++ b/crates/ppvm-tableau-sum/src/noise.rs @@ -20,7 +20,7 @@ use rand::{RngExt, rngs::SmallRng}; use crate::{ data::GeneralizedTableauSum, - storage::{Branch, EntryStore, loss_mask, pauli_branch_phase_loss}, + storage::{Branch, BranchMutation, EntryStore, loss_mask, pauli_branch_phase_loss, sign_mask}, }; fn single_qubit_loss_branch( @@ -106,25 +106,34 @@ where I: Debug, { fn loss_channel(&mut self, addr0: usize, p: ::Coeff) { - let mut branches = Vec::<(GeneralizedTableau, T::Coeff, u64, u64)>::with_capacity( - self.entries.len(), - ); + // Lazy branch materialization: describe each loss branch as a mutation + // of its parent entry. The merge clones the parent only when the branch + // survives as a NEW entry; merges/below-cutoff drops never clone. + let mut branches = + Vec::<(usize, BranchMutation, T::Coeff, u64, u64)>::with_capacity(self.entries.len()); + let mut idx = 0usize; self.entries .for_each_mut_with_keys(|tab, p_sum, word_fp, phase_loss| { - single_qubit_loss_branch( - addr0, - &p, - &mut self.rng, - &mut branches, - tab, - p_sum, - (word_fp, phase_loss), - ); + // Increment for EVERY entry, before the lost check, so + // parent_idx aligns with for_each_mut_with_keys' order. 
+ let parent_idx = idx; + idx += 1; + if tab.is_lost[addr0] { + return; + } + branches.push(( + parent_idx, + BranchMutation::Loss { q: addr0 }, + p_sum.clone() * p.clone(), + word_fp, + phase_loss ^ loss_mask(addr0), + )); + *p_sum *= T::Coeff::one() - p.clone(); }); let needs_renormalize = self .entries - .insert_or_merge_batch(branches, &self.sum_cutoff); + .insert_or_merge_mutated_branches(branches, &self.sum_cutoff); if needs_renormalize { self.normalize_probabilities(); } @@ -196,44 +205,66 @@ where { fn pauli_error(&mut self, addr0: usize, p: [::Coeff; 3]) { let p_total: T::Coeff = p[0].clone() + p[1].clone() + p[2].clone(); - let mut branches = Vec::<(GeneralizedTableau, T::Coeff, u64, u64)>::with_capacity( + // Lazy branch materialization: describe each X/Y/Z branch as a Pauli + // mutation of its parent. The phase/loss delta is computed by walking the + // parent's column once (no clone) — X flips rows with z, Y with x^z, Z + // with x — matching what `pauli_branch_phase_loss` would produce. 
+ let mut branches = Vec::<(usize, BranchMutation, T::Coeff, u64, u64)>::with_capacity( 3 * self.entries.len(), ); - + let mut idx = 0usize; self.entries .for_each_mut_with_keys(|tab, p_sum, word_fp, phase_loss| { + let parent_idx = idx; + idx += 1; if tab.is_lost[addr0] { return; } - let tab_seed_x = self.rng.random::(); - let tab_seed_y = self.rng.random::(); - let tab_seed_z = self.rng.random::(); - - let mut tab_branch_x = tab.fork(Some(tab_seed_x)); - let mut tab_branch_y = tab.fork(Some(tab_seed_y)); - let mut tab_branch_z = tab.fork(Some(tab_seed_z)); - - tab_branch_x.x(addr0); - tab_branch_y.y(addr0); - tab_branch_z.z(addr0); + let (mut dx, mut dy, mut dz) = (0u64, 0u64, 0u64); + for (row, pw) in tab.tableau.data.iter().enumerate() { + let x: bool = pw.word.xbits[addr0]; + let z: bool = pw.word.zbits[addr0]; + let m = sign_mask(row); + if z { + dx ^= m; + } + if x ^ z { + dy ^= m; + } + if x { + dz ^= m; + } + } - // X/Y/Z flip only phase bits, never the Pauli words, so all three - // branches reuse the parent's word-fingerprint and derive their - // phase/loss hash from the parent's by XORing the flipped rows. 
- let hx = pauli_branch_phase_loss(tab, &tab_branch_x, phase_loss); - let hy = pauli_branch_phase_loss(tab, &tab_branch_y, phase_loss); - let hz = pauli_branch_phase_loss(tab, &tab_branch_z, phase_loss); - branches.push((tab_branch_x, p_sum.clone() * p[0].clone(), word_fp, hx)); - branches.push((tab_branch_y, p_sum.clone() * p[1].clone(), word_fp, hy)); - branches.push((tab_branch_z, p_sum.clone() * p[2].clone(), word_fp, hz)); + branches.push(( + parent_idx, + BranchMutation::Pauli { op: 1, addr0 }, + p_sum.clone() * p[0].clone(), + word_fp, + phase_loss ^ dx, + )); + branches.push(( + parent_idx, + BranchMutation::Pauli { op: 2, addr0 }, + p_sum.clone() * p[1].clone(), + word_fp, + phase_loss ^ dy, + )); + branches.push(( + parent_idx, + BranchMutation::Pauli { op: 3, addr0 }, + p_sum.clone() * p[2].clone(), + word_fp, + phase_loss ^ dz, + )); *p_sum *= T::Coeff::one() - p_total.clone(); }); let needs_normalize = self .entries - .insert_or_merge_batch(branches, &self.sum_cutoff); + .insert_or_merge_mutated_branches(branches, &self.sum_cutoff); if needs_normalize { self.normalize_probabilities(); } diff --git a/crates/ppvm-tableau-sum/src/storage/entry_store.rs b/crates/ppvm-tableau-sum/src/storage/entry_store.rs index 2dfaf9eb4..2cd822263 100644 --- a/crates/ppvm-tableau-sum/src/storage/entry_store.rs +++ b/crates/ppvm-tableau-sum/src/storage/entry_store.rs @@ -5,6 +5,8 @@ use num::Complex; use ppvm_tableau::{data::GeneralizedTableau, sparsevec::SparseVector}; use ppvm_traits::config::Config; +use crate::storage::BranchMutation; + /// One branch produced by a noise channel: its tableau, coefficient, and the /// cached `(word_fingerprint, phase_loss_hash)` pair, so a merge can recompute /// the full fingerprint (`word_fp ^ phase_loss`) without re-hashing the tableau. @@ -54,6 +56,21 @@ pub trait EntryStore, I>>: Clone /// fingerprint is `word_fp ^ phase_loss` — no re-hashing of the tableau. 
fn insert_or_merge_batch(&mut self, branches: Vec>, cutoff: &T::Coeff) -> bool; + /// Like [`insert_or_merge_batch`](Self::insert_or_merge_batch), but each + /// branch is described lazily as a [`BranchMutation`] of a parent entry + /// referenced by `parent_idx` — its ordinal in the order yielded by the + /// immediately-preceding [`for_each_mut_with_keys`](Self::for_each_mut_with_keys) + /// call (entry index for `VecStorage`; flat bucket order for `MapStorage`). + /// The branch tableau is materialized ONLY when it survives as a new entry; + /// merges and below-cutoff drops never clone. `word_fp`/`phase_loss` are the + /// branch's already-computed fingerprint halves; full fp = `word_fp ^ phase_loss`. + /// Returns true if any branch was dropped (caller renormalizes). + fn insert_or_merge_mutated_branches( + &mut self, + branches: Vec<(usize, BranchMutation, T::Coeff, u64, u64)>, + cutoff: &T::Coeff, + ) -> bool; + fn retain(&mut self, f: F) where F: FnMut(&GeneralizedTableau, &T::Coeff) -> bool; diff --git a/crates/ppvm-tableau-sum/src/storage/map.rs b/crates/ppvm-tableau-sum/src/storage/map.rs index 15125f76c..c788ed996 100644 --- a/crates/ppvm-tableau-sum/src/storage/map.rs +++ b/crates/ppvm-tableau-sum/src/storage/map.rs @@ -15,8 +15,12 @@ use ppvm_traits::config::Config; use smallvec::SmallVec; use crate::storage::{ - EntryStore, fingerprint, phase_loss_hash, structurally_equal, word_fingerprint, + Branch, BranchMutation, EntryStore, apply_branch_mutation, fingerprint, phase_loss_hash, + structurally_equal, word_fingerprint, }; +use bitvec::view::BitView; +use num::PrimInt; +use ppvm_traits::traits::Clifford; type Bucket = SmallVec<[(GeneralizedTableau, ::Coeff); 1]>; @@ -75,6 +79,8 @@ where + Copy, I: TableauIndex + Send + Sync, C: SparseVector, I>, + GeneralizedTableau: Clifford, + <::Storage as BitView>::Store: PrimInt, { fn with_capacity(cap: usize) -> Self { Self { @@ -177,6 +183,34 @@ where needs_renormalize } + fn insert_or_merge_mutated_branches( + 
&mut self, + branches: Vec<(usize, BranchMutation, ::Coeff, u64, u64)>, + cutoff: &::Coeff, + ) -> bool { + self.rebuild_if_dirty(); + + // Materialize parents in the SAME order as for_each_mut_with_keys so + // parent_idx aligns. Correctness-only path: no clone savings here. + let parents: Vec<_> = self + .buckets + .values() + .flat_map(|v| v.iter()) + .map(|(t, _)| t.clone()) + .collect(); + + let real: Vec> = branches + .into_iter() + .map(|(parent_idx, mutation, p, word_fp, phase_loss)| { + let mut tab = parents[parent_idx].clone(); + apply_branch_mutation(&mut tab, mutation); + (tab, p, word_fp, phase_loss) + }) + .collect(); + + self.insert_or_merge_batch(real, cutoff) + } + fn retain(&mut self, mut f: F) where F: FnMut(&GeneralizedTableau, &::Coeff) -> bool, diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs index 03d4c4a74..8522e9e8c 100644 --- a/crates/ppvm-tableau-sum/src/storage/mod.rs +++ b/crates/ppvm-tableau-sum/src/storage/mod.rs @@ -7,6 +7,7 @@ pub mod vec; pub use entry_store::{Branch, EntryStore}; use fxhash::FxHashMap; +use ppvm_traits::traits::Clifford; // Hasher for the structural `word_fingerprint`. gxhash (AES-based) is fastest on // native but needs hardware AES and does not build on wasm32, so fall back to @@ -218,6 +219,152 @@ where true } +/// A lazily-described branch: a mutation applied to a parent entry. Used so the +/// merge can compute the branch fingerprint / structural identity without +/// cloning, materializing the tableau only for surviving new entries. +#[derive(Clone, Copy, Debug)] +pub enum BranchMutation { + /// Apply Pauli `op` (1=X, 2=Y, 3=Z) at `addr0`: flips per-row sign bits only. + Pauli { op: u8, addr0: usize }, + /// Mark qubit `q` lost (set is_lost[q] = true). + Loss { q: usize }, +} + +/// Materialize a lazily-described branch into a (cloned) tableau in place. 
+pub(crate) fn apply_branch_mutation( + tab: &mut GeneralizedTableau, + m: BranchMutation, +) where + T: Config, + I: TableauIndex, + C: SparseVector, I>, + GeneralizedTableau: Clifford, +{ + match m { + BranchMutation::Pauli { op, addr0 } => match op { + 1 => tab.x(addr0), + 2 => tab.y(addr0), + 3 => tab.z(addr0), + _ => {} + }, + BranchMutation::Loss { q } => { + tab.is_lost[q] = true; + } + } +} + +/// Like [`structurally_equal`], but compares `existing` against the *virtual* +/// tableau `parent + m` without materializing it. Mirrors `structurally_equal` +/// field-by-field, deriving each field of the virtual tableau from `parent`: +/// - `is_lost`: for `Loss { q }`, equals `parent`'s with index `q` forced true; +/// for `Pauli`, equals `parent`'s unchanged. +/// - `coefficients`: unchanged by both mutations. +/// - rows: for `Loss`, unchanged; for `Pauli`, each row's sign bit (phase bit 1) +/// is flipped per the per-column rule (X: z; Y: x^z; Z: x). +pub(crate) fn structurally_equal_mutated( + existing: &GeneralizedTableau, + parent: &GeneralizedTableau, + m: BranchMutation, + scratch: &mut FxHashMap>, +) -> bool +where + T: Config, + T::Coeff: One + Zero + Clone + num::Num + PartialOrd, + Complex: std::ops::Mul> + + AddAssign + + From + + ComplexFloat + + Copy, + I: TableauIndex, + C: SparseVector, I>, +{ + // NOTE: comparing is_lost and rows is only necessary to avoid hash collisions + + match m { + BranchMutation::Loss { q } => { + // Virtual is_lost == parent's with index q forced true. + if existing.is_lost.len() != parent.is_lost.len() { + return false; + } + for (i, (&e, &p)) in existing + .is_lost + .iter() + .zip(parent.is_lost.iter()) + .enumerate() + { + let virt = if i == q { true } else { p }; + if e != virt { + return false; + } + } + } + BranchMutation::Pauli { .. } => { + // Virtual is_lost == parent's, unchanged. 
+ if existing.is_lost != parent.is_lost { + return false; + } + } + } + + if existing.coefficients.len() != parent.coefficients.len() { + return false; + } + + // Cheaper row comparison first; coefficient compare is O(K) below. + match m { + BranchMutation::Loss { .. } => { + for (re, rp) in existing.tableau.data.iter().zip(parent.tableau.data.iter()) { + if re.phase != rp.phase || re.word != rp.word { + return false; + } + } + } + BranchMutation::Pauli { op, addr0 } => { + for (re, rp) in existing.tableau.data.iter().zip(parent.tableau.data.iter()) { + if re.word != rp.word { + return false; + } + let x: bool = rp.word.xbits[addr0]; + let z: bool = rp.word.zbits[addr0]; + let flip = match op { + 1 => z, + 2 => x ^ z, + 3 => x, + _ => false, + }; + let virt_phase = rp.phase ^ ((flip as u8) << 1); + if re.phase != virt_phase { + return false; + } + } + } + } + + // Reuse the caller-owned scratch map instead of allocating per call. + // Clear retains capacity across invocations. Coefficients are unchanged + // by both mutations, so compare existing vs parent directly. 
+ scratch.clear(); + scratch.reserve(parent.coefficients.len()); + for (val, idx) in parent.coefficients.iter() { + scratch.insert(*idx, *val); + } + + let threshold_sq = + existing.coefficient_threshold.clone() * existing.coefficient_threshold.clone(); + let zero = Complex { + re: T::Coeff::zero(), + im: T::Coeff::zero(), + }; + for (val0, idx0) in existing.coefficients.iter() { + let val1 = scratch.get(idx0).copied().unwrap_or(zero); + if (*val0 - val1).norm_sqr() >= threshold_sq { + return false; + } + } + + true +} + #[cfg(test)] mod fingerprint_tests { use super::{ diff --git a/crates/ppvm-tableau-sum/src/storage/vec.rs b/crates/ppvm-tableau-sum/src/storage/vec.rs index a235b32d6..7a6bdbb3f 100644 --- a/crates/ppvm-tableau-sum/src/storage/vec.rs +++ b/crates/ppvm-tableau-sum/src/storage/vec.rs @@ -13,7 +13,13 @@ use ppvm_tableau::{ }; use ppvm_traits::config::Config; -use crate::storage::{EntryStore, phase_loss_hash, structurally_equal, word_fingerprint}; +use crate::storage::{ + BranchMutation, EntryStore, apply_branch_mutation, phase_loss_hash, structurally_equal, + structurally_equal_mutated, word_fingerprint, +}; +use bitvec::view::BitView; +use num::PrimInt; +use ppvm_traits::traits::Clifford; #[derive(Clone)] pub struct VecStorage, I>> { @@ -125,6 +131,8 @@ where + Copy, I: TableauIndex + Send + Sync, C: SparseVector, I>, + GeneralizedTableau: Clifford, + <::Storage as BitView>::Store: PrimInt, { fn with_capacity(cap: usize) -> Self { Self { @@ -201,6 +209,72 @@ where needs_renormalize } + fn insert_or_merge_mutated_branches( + &mut self, + branches: Vec<(usize, BranchMutation, ::Coeff, u64, u64)>, + cutoff: &::Coeff, + ) -> bool { + // Defensive: should be a no-op since the caller's for_each_mut_with_keys + // already ran and the branch words were never mutated. 
+ self.rebuild_fingerprints_if_dirty(); + + let mut fp_index: FxHashMap> = + FxHashMap::with_capacity_and_hasher(self.entries.len(), Default::default()); + for i in 0..self.entries.len() { + let fp = self.fingerprints[i]; + fp_index.entry(fp).or_default().push(i); + } + + let mut needs_renormalize = false; + for (parent_idx, mutation, p, word_fp, phase_loss) in branches { + let fp = word_fp ^ phase_loss; + + // Find a structurally-equal existing entry among the fp candidates + // WITHOUT materializing the branch tableau. The disjoint-field borrow + // (`self.entries` immutable + `self.scratch` mutable) is allowed. + let mut found: Option = None; + if let Some(candidates) = fp_index.get(&fp) { + for &i in candidates { + if structurally_equal_mutated( + &self.entries[i].0, + &self.entries[parent_idx].0, + mutation, + &mut self.scratch, + ) { + found = Some(i); + break; + } + } + } + + match found { + Some(i) => { + let p0 = &self.entries[i].1; + self.entries[i].1 = p0.clone() + p; + } + None => { + if &p > cutoff { + // Surviving new entry: materialize now (clone parent + + // apply mutation). Later branches' parent_idx still refer + // to the original entries — push never moves them. 
+ let mut tab = self.entries[parent_idx].0.clone(); + apply_branch_mutation(&mut tab, mutation); + let new_idx = self.entries.len(); + self.entries.push((tab, p)); + self.fingerprints.push(fp); + self.word_fingerprints.push(word_fp); + self.phase_loss_hashes.push(phase_loss); + fp_index.entry(fp).or_default().push(new_idx); + } else { + needs_renormalize = true; + } + } + } + } + + needs_renormalize + } + fn retain(&mut self, mut f: F) where F: FnMut(&GeneralizedTableau, &::Coeff) -> bool, From b682eb4144b3b52a7967caae3cfe68156ccbf76b Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:51:22 +0200 Subject: [PATCH 07/24] chore(autotune): record lazy-materialization keep (2.73x) --- docs/autotune/2026-06-23-tableau-sum-build/log.md | 1 + docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md index 06e93a1c0..039506bc5 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/log.md +++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md @@ -9,3 +9,4 @@ - Accuracy guard: branch count must stay 2025 (optimizations must not change the math). Cutoff fixed at 1e-7. - Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench - Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan. +- KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). 
branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy). diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml index 24d85b30b..9db9e15a7 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml +++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml @@ -5,3 +5,10 @@ "build_median_ms" = 2620.0 "per_shot_ns" = 22500.0 "branches" = 2025.0 +[[metric]] +"commit" = "59165235" +"status" = "keep" +"description" = "lazy branch materialization for loss_channel + pauli_error (single-qubit depolarize)" +"build_median_ms" = 958.0 +"per_shot_ns" = 22100.0 +"branches" = 2025.0 From d0cc2fe88dadb6ab258c02bf090cd67a196073e5 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:58:44 +0200 Subject: [PATCH 08/24] perf(tableau-sum): bulk-hash word_fingerprint in one pass Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/ppvm-tableau-sum/src/storage/mod.rs | 71 ++++++++++++++++------ 1 file changed, 52 insertions(+), 19 deletions(-) diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs index 8522e9e8c..f44418bb9 100644 --- a/crates/ppvm-tableau-sum/src/storage/mod.rs +++ b/crates/ppvm-tableau-sum/src/storage/mod.rs @@ -10,14 +10,13 @@ use fxhash::FxHashMap; use ppvm_traits::traits::Clifford; // Hasher for the structural `word_fingerprint`. gxhash (AES-based) is fastest on -// native but needs hardware AES and does not build on wasm32, so fall back to -// fxhash there. 
The fingerprint is a transient in-memory dedup key — collisions -// are resolved by `structurally_equal`, and it is never persisted or compared -// across builds — so the hasher may differ per target without affecting results. +// native and exposes a `gxhash64` bulk free function, but it needs hardware AES +// and does not build on wasm32, so fall back to fxhash there. The fingerprint is +// a transient in-memory dedup key — collisions are resolved by +// `structurally_equal`, and it is never persisted or compared across builds — so +// the hasher may differ per target without affecting results. #[cfg(target_arch = "wasm32")] use fxhash::FxHasher as FingerprintHasher; -#[cfg(not(target_arch = "wasm32"))] -use gxhash::GxHasher as FingerprintHasher; use num::{ Complex, One, Zero, complex::{Complex64, ComplexFloat}, @@ -26,10 +25,30 @@ use ppvm_tableau::{ data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex, }; use ppvm_traits::config::Config; -use std::{ - hash::{Hash, Hasher}, - ops::AddAssign, -}; +#[cfg(target_arch = "wasm32")] +use std::hash::Hasher; +use std::ops::AddAssign; + +// Reusable per-thread scratch buffer for `word_fingerprint`. Gathering every +// row's word bytes into one contiguous slice lets us hash in a single bulk call +// instead of two tiny `Hash::hash` writes per row (high per-call overhead). +// Cleared (capacity retained) per call, so it adapts to any row count / qubit +// width without re-allocating. Bytes (not the storage word type) because the +// storage element width (`[u8; N]` vs `[u64; N]`) is generic at this call site. +thread_local! { + static WORD_FP_BUF: std::cell::RefCell> = const { std::cell::RefCell::new(Vec::new()) }; +} + +/// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage` +/// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes +/// are fully initialized and `u8`-aligned. 
+#[inline] +fn pod_bytes(value: &A) -> &[u8] { + // SAFETY: `A` is POD (PauliStorage: bytemuck::Pod); reading its + // `size_of::()` initialized bytes as `[u8]` is sound, and the borrow is + // tied to `value`. + unsafe { std::slice::from_raw_parts(value as *const A as *const u8, std::mem::size_of::()) } +} /// Hash of the `word` (Pauli content) of every row, in order. This is the /// expensive component (each word is several machine words wide) and is @@ -43,15 +62,29 @@ where I:, C: SparseVector, I>, { - let mut hasher = FingerprintHasher::default(); - for row in tab.tableau.data.iter() { - // Hash the Pauli bits directly: the `PauliWord` hash cache is disabled - // for tableau rows (`REHASH = false`), so `row.word.hash()` would feed - // a stale zero and make every tableau collide. - row.word.xbits.data.hash(&mut hasher); - row.word.zbits.data.hash(&mut hasher); - } - hasher.finish() + WORD_FP_BUF.with(|cell| { + let mut buf = cell.borrow_mut(); + // Clear retains capacity; refill with every row's bits as raw bytes. + buf.clear(); + for row in tab.tableau.data.iter() { + // Gather the Pauli bits directly: the `PauliWord` hash cache is + // disabled for tableau rows (`REHASH = false`), so hashing + // `row.word` would feed a stale zero and make every tableau collide. 
+ buf.extend_from_slice(pod_bytes(&row.word.xbits.data)); + buf.extend_from_slice(pod_bytes(&row.word.zbits.data)); + } + + #[cfg(not(target_arch = "wasm32"))] + { + gxhash::gxhash64(&buf, 0) + } + #[cfg(target_arch = "wasm32")] + { + let mut hasher = FingerprintHasher::default(); + hasher.write(&buf); + hasher.finish() + } + }) } /// Per-row mask (splitmix64 of `(index, salt)`); a stable pure function used From ad0d0b24260caaf50fe34fc7b72f4cd740c6a3db Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:59:47 +0200 Subject: [PATCH 09/24] chore(autotune): plan bulk-word-hash approach --- .../bulk-word-hash/prompts.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md diff --git a/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md new file mode 100644 index 000000000..f928d0d17 --- /dev/null +++ b/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md @@ -0,0 +1,15 @@ +# Approach: bulk word_fingerprint hashing + +## Hypothesis +After lazy materialization, `rebuild_fingerprints_if_dirty` dominates (47% self, +61% inclusive); it re-hashes every entry's words after each clifford gate marks +them dirty. `word_fingerprint` currently does 2 small `Hash::hash` calls per row +(`xbits.data` then `zbits.data`) = ~340 hasher writes for 170 rows, with +per-call overhead. Gather the row words into one contiguous buffer and hash once +with `gxhash::gxhash64` (native) — far less per-call overhead, single SIMD pass. + +## Target +`./target/release/examples/msd-noisy-bench`; baseline now build_median ~958ms. +Must keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632. +Fingerprints are transient dedup keys (resolved by structurally_equal), so the +hash VALUE may change freely as long as it's consistent within a build. 
From 6132edf7dc7acefb2cdaff23d1a27c7af15894bc Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 20:59:48 +0200 Subject: [PATCH 10/24] chore(autotune): record bulk-word-hash keep (1.67x, cumulative 4.57x) --- docs/autotune/2026-06-23-tableau-sum-build/log.md | 1 + docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md index 039506bc5..564d379e0 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/log.md +++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md @@ -10,3 +10,4 @@ - Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench - Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan. - KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy). +- KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). 
word_fingerprint now gathers all row words into a reused thread_local Vec and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass. diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml index 9db9e15a7..f17d237c7 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml +++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml @@ -12,3 +12,10 @@ "build_median_ms" = 958.0 "per_shot_ns" = 22100.0 "branches" = 2025.0 +[[metric]] +"commit" = "ad0d0b24" +"status" = "keep" +"description" = "bulk-hash word_fingerprint (gather rows, single gxhash64)" +"build_median_ms" = 573.0 +"per_shot_ns" = 22100.0 +"branches" = 2025.0 From 26e222391aa9fc7d5a1b94eb5ffa8ca49d059c2b Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:05:32 +0200 Subject: [PATCH 11/24] perf(tableau-sum): precompute per-row masks for phase_loss + pauli deltas Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/ppvm-tableau-sum/src/noise.rs | 7 ++- crates/ppvm-tableau-sum/src/storage/map.rs | 24 +++++++--- crates/ppvm-tableau-sum/src/storage/mod.rs | 56 +++++++++++++++++----- crates/ppvm-tableau-sum/src/storage/vec.rs | 21 ++++---- 4 files changed, 79 insertions(+), 29 deletions(-) diff --git a/crates/ppvm-tableau-sum/src/noise.rs b/crates/ppvm-tableau-sum/src/noise.rs index b12a08057..d6799d035 100644 --- a/crates/ppvm-tableau-sum/src/noise.rs +++ b/crates/ppvm-tableau-sum/src/noise.rs @@ -20,7 +20,7 @@ use rand::{RngExt, rngs::SmallRng}; use crate::{ data::GeneralizedTableauSum, - storage::{Branch, BranchMutation, EntryStore, loss_mask, pauli_branch_phase_loss, sign_mask}, + storage::{Branch, BranchMutation, EntryStore, RowMasks, loss_mask, pauli_branch_phase_loss}, }; fn single_qubit_loss_branch( @@ -212,6 +212,9 @@ where let 
mut branches = Vec::<(usize, BranchMutation, T::Coeff, u64, u64)>::with_capacity( 3 * self.entries.len(), ); + // Precompute the per-row sign masks once instead of recomputing the + // splitmix `sign_mask` per row per entry in the hot loop below. + let masks = RowMasks::new(self.n_qubits); let mut idx = 0usize; self.entries .for_each_mut_with_keys(|tab, p_sum, word_fp, phase_loss| { @@ -225,7 +228,7 @@ where for (row, pw) in tab.tableau.data.iter().enumerate() { let x: bool = pw.word.xbits[addr0]; let z: bool = pw.word.zbits[addr0]; - let m = sign_mask(row); + let m = masks.sign[row]; if z { dx ^= m; } diff --git a/crates/ppvm-tableau-sum/src/storage/map.rs b/crates/ppvm-tableau-sum/src/storage/map.rs index c788ed996..7242f3cdf 100644 --- a/crates/ppvm-tableau-sum/src/storage/map.rs +++ b/crates/ppvm-tableau-sum/src/storage/map.rs @@ -15,8 +15,8 @@ use ppvm_traits::config::Config; use smallvec::SmallVec; use crate::storage::{ - Branch, BranchMutation, EntryStore, apply_branch_mutation, fingerprint, phase_loss_hash, - structurally_equal, word_fingerprint, + Branch, BranchMutation, EntryStore, RowMasks, apply_branch_mutation, fingerprint, + phase_loss_hash_with, structurally_equal, word_fingerprint, }; use bitvec::view::BitView; use num::PrimInt; @@ -128,11 +128,21 @@ where F: FnMut(&mut GeneralizedTableau, &mut ::Coeff, u64, u64), { self.rebuild_if_dirty(); - for v in self.buckets.values_mut() { - for (tab, c) in v.iter_mut() { - let word_fp = word_fingerprint(tab); - let phase_loss = phase_loss_hash(tab); - f(tab, c, word_fp, phase_loss); + // Build the per-row mask table once; every tableau in the sum shares the + // same qubit count. Skip when there are no entries. 
+ let masks = self + .buckets + .values() + .flat_map(|v| v.iter()) + .next() + .map(|(t, _)| RowMasks::new(t.is_lost.len())); + if let Some(masks) = masks { + for v in self.buckets.values_mut() { + for (tab, c) in v.iter_mut() { + let word_fp = word_fingerprint(tab); + let phase_loss = phase_loss_hash_with(tab, &masks); + f(tab, c, word_fp, phase_loss); + } } } } diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs index f44418bb9..268a3a091 100644 --- a/crates/ppvm-tableau-sum/src/storage/mod.rs +++ b/crates/ppvm-tableau-sum/src/storage/mod.rs @@ -121,17 +121,25 @@ pub(crate) fn loss_mask(q: usize) -> u64 { row_mask(q, 0xC3C3_C3C3_C3C3_C3C3) } -/// XOR contribution of a single row's phase. -#[inline] -fn phase_contrib(row: usize, phase: u8) -> u64 { - let mut h = 0; - if phase & 1 != 0 { - h ^= imag_mask(row); - } - if phase & 2 != 0 { - h ^= sign_mask(row); +/// Precomputed per-row/per-qubit masks (sign, imag, loss). Built once per op and +/// indexed instead of recomputing the splitmix `row_mask` per row per entry. +pub(crate) struct RowMasks { + pub sign: Vec, // sign_mask(i) for i in 0..2*n_qubits + pub imag: Vec, // imag_mask(i) for i in 0..2*n_qubits + pub loss: Vec, // loss_mask(q) for q in 0..n_qubits +} + +impl RowMasks { + /// Build the mask tables. The tableau has `2 * n_qubits` rows (`sign`/`imag` + /// indexed by row); `loss` is indexed by qubit `0..n_qubits`. + pub(crate) fn new(n_qubits: usize) -> Self { + let n_rows = 2 * n_qubits; + Self { + sign: (0..n_rows).map(sign_mask).collect(), + imag: (0..n_rows).map(imag_mask).collect(), + loss: (0..n_qubits).map(loss_mask).collect(), + } } - h } /// XOR-combinable hash of `is_lost` plus every row's `phase`, formed as the @@ -139,17 +147,41 @@ fn phase_contrib(row: usize, phase: u8) -> u64 { /// XOR-combinable lets a branch inherit its parent's value and update only the /// rows it changed — a sign flip XORs [`sign_mask`], a loss XORs [`loss_mask`]. 
pub fn phase_loss_hash(tab: &GeneralizedTableau) -> u64 +where + T: Config, + C: SparseVector, I>, +{ + // Single implementation: build a one-shot mask table and delegate so the + // table-indexed and from-scratch values are guaranteed identical. + // `is_lost.len() == n_qubits` and is available under these minimal bounds. + let masks = RowMasks::new(tab.is_lost.len()); + phase_loss_hash_with(tab, &masks) +} + +/// Like [`phase_loss_hash`], but indexes a precomputed [`RowMasks`] instead of +/// recomputing the splitmix masks per row/qubit. Reproduces the same value: +/// phase bit 0 (imag) XORs `masks.imag[row]`, phase bit 1 (sign) XORs +/// `masks.sign[row]`, and a lost qubit `q` XORs `masks.loss[q]`. +pub(crate) fn phase_loss_hash_with( + tab: &GeneralizedTableau, + masks: &RowMasks, +) -> u64 where T: Config, C: SparseVector, I>, { let mut h = 0u64; for (row, ppw) in tab.tableau.data.iter().enumerate() { - h ^= phase_contrib(row, ppw.phase); + if ppw.phase & 1 != 0 { + h ^= masks.imag[row]; + } + if ppw.phase & 2 != 0 { + h ^= masks.sign[row]; + } } for (q, lost) in tab.is_lost.iter().enumerate() { if *lost { - h ^= loss_mask(q); + h ^= masks.loss[q]; } } h diff --git a/crates/ppvm-tableau-sum/src/storage/vec.rs b/crates/ppvm-tableau-sum/src/storage/vec.rs index 7a6bdbb3f..7a13e23e8 100644 --- a/crates/ppvm-tableau-sum/src/storage/vec.rs +++ b/crates/ppvm-tableau-sum/src/storage/vec.rs @@ -14,8 +14,8 @@ use ppvm_tableau::{ use ppvm_traits::config::Config; use crate::storage::{ - BranchMutation, EntryStore, apply_branch_mutation, phase_loss_hash, structurally_equal, - structurally_equal_mutated, word_fingerprint, + BranchMutation, EntryStore, RowMasks, apply_branch_mutation, phase_loss_hash_with, + structurally_equal, structurally_equal_mutated, word_fingerprint, }; use bitvec::view::BitView; use num::PrimInt; @@ -109,12 +109,17 @@ where self.fingerprints.clear(); self.word_fingerprints.clear(); self.phase_loss_hashes.clear(); - for (t, _) in self.entries.iter() 
{ - let wfp = word_fingerprint(t); - let plh = phase_loss_hash(t); - self.word_fingerprints.push(wfp); - self.phase_loss_hashes.push(plh); - self.fingerprints.push(wfp ^ plh); + // Build the per-row mask table once for all entries (every tableau in a + // sum shares the same qubit count). Skip when there are no entries. + if let Some((first, _)) = self.entries.first() { + let masks = RowMasks::new(first.is_lost.len()); + for (t, _) in self.entries.iter() { + let wfp = word_fingerprint(t); + let plh = phase_loss_hash_with(t, &masks); + self.word_fingerprints.push(wfp); + self.phase_loss_hashes.push(plh); + self.fingerprints.push(wfp ^ plh); + } } self.dirty = false; } From 4fea5841cf4985de6e14f88bf7eacf0673a8db06 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:06:45 +0200 Subject: [PATCH 12/24] chore(autotune): plan precompute-masks approach --- .../precompute-masks/prompts.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md diff --git a/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md new file mode 100644 index 000000000..5dc8c0fff --- /dev/null +++ b/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md @@ -0,0 +1,16 @@ +# Approach: precompute per-row masks (kill redundant splitmix) + +## Hypothesis +`sign_mask(row)`/`imag_mask(row)`/`loss_mask(q)` are splitmix64 hashes of a pure +index. They are recomputed per-row, per-entry, on every op: +- `pauli_error`'s dx/dy/dz loop computes `sign_mask(row)` for all 2n rows of + every entry on every depolarize. +- `phase_loss_hash` (called per entry in `rebuild_fingerprints_if_dirty`) + recomputes sign/imag masks per set phase and loss masks per lost qubit. + +Precompute the per-index mask tables ONCE per op and index them. 
Values are +identical, so all fingerprints (and the accuracy fingerprint) are unchanged. + +## Target +`./target/release/examples/msd-noisy-bench`; baseline now build_median ~573ms. +Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us. From 96064d0ab1922238ee1a6dd591bef161e564574c Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:06:57 +0200 Subject: [PATCH 13/24] chore(autotune): record precompute-masks keep (cumulative 4.75x) --- docs/autotune/2026-06-23-tableau-sum-build/log.md | 1 + docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md index 564d379e0..3dc311a7b 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/log.md +++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md @@ -11,3 +11,4 @@ - Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan. - KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy). 
- KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). word_fingerprint now gathers all row words into a reused thread_local Vec and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass. +- KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged. diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml index f17d237c7..3a64c189d 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml +++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml @@ -19,3 +19,10 @@ "build_median_ms" = 573.0 "per_shot_ns" = 22100.0 "branches" = 2025.0 +[[metric]] +"commit" = "4fea5841" +"status" = "keep" +"description" = "precompute per-row masks (RowMasks) for phase_loss + pauli dx/dy/dz" +"build_median_ms" = 552.0 +"per_shot_ns" = 22300.0 +"branches" = 2025.0 From 34b41ceba941677b3fef69a509d2d3333c05cb2f Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:14:32 +0200 Subject: [PATCH 14/24] perf(tableau-sum): direct single-pass word_fingerprint (drop gather+gxhash) Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/ppvm-tableau-sum/src/storage/mod.rs | 80 ++++++++++------------ 1 file changed, 35 insertions(+), 45 deletions(-) diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs index 268a3a091..fc6d5d212 100644 --- a/crates/ppvm-tableau-sum/src/storage/mod.rs +++ b/crates/ppvm-tableau-sum/src/storage/mod.rs @@ -7,16 +7,6 @@ pub 
mod vec; pub use entry_store::{Branch, EntryStore}; use fxhash::FxHashMap; -use ppvm_traits::traits::Clifford; - -// Hasher for the structural `word_fingerprint`. gxhash (AES-based) is fastest on -// native and exposes a `gxhash64` bulk free function, but it needs hardware AES -// and does not build on wasm32, so fall back to fxhash there. The fingerprint is -// a transient in-memory dedup key — collisions are resolved by -// `structurally_equal`, and it is never persisted or compared across builds — so -// the hasher may differ per target without affecting results. -#[cfg(target_arch = "wasm32")] -use fxhash::FxHasher as FingerprintHasher; use num::{ Complex, One, Zero, complex::{Complex64, ComplexFloat}, @@ -25,20 +15,9 @@ use ppvm_tableau::{ data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex, }; use ppvm_traits::config::Config; -#[cfg(target_arch = "wasm32")] -use std::hash::Hasher; +use ppvm_traits::traits::Clifford; use std::ops::AddAssign; -// Reusable per-thread scratch buffer for `word_fingerprint`. Gathering every -// row's word bytes into one contiguous slice lets us hash in a single bulk call -// instead of two tiny `Hash::hash` writes per row (high per-call overhead). -// Cleared (capacity retained) per call, so it adapts to any row count / qubit -// width without re-allocating. Bytes (not the storage word type) because the -// storage element width (`[u8; N]` vs `[u64; N]`) is generic at this call site. -thread_local! { - static WORD_FP_BUF: std::cell::RefCell> = const { std::cell::RefCell::new(Vec::new()) }; -} - /// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage` /// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes /// are fully initialized and `u8`-aligned. 
@@ -59,32 +38,43 @@ fn pod_bytes(value: &A) -> &[u8] { pub fn word_fingerprint(tab: &GeneralizedTableau) -> u64 where T: Config, - I:, C: SparseVector, I>, { - WORD_FP_BUF.with(|cell| { - let mut buf = cell.borrow_mut(); - // Clear retains capacity; refill with every row's bits as raw bytes. - buf.clear(); - for row in tab.tableau.data.iter() { - // Gather the Pauli bits directly: the `PauliWord` hash cache is - // disabled for tableau rows (`REHASH = false`), so hashing - // `row.word` would feed a stale zero and make every tableau collide. - buf.extend_from_slice(pod_bytes(&row.word.xbits.data)); - buf.extend_from_slice(pod_bytes(&row.word.zbits.data)); - } + // fxhash-style multiplicative-rotate mix, fed the raw storage words of + // every row's x then z bits. No allocation, no thread_local, single pass. + const K: u64 = 0x51_7c_c1_b7_27_22_0a_95; // fxhash constant + let mut h: u64 = 0; + let mut mix = |w: u64| { + h = (h.rotate_left(5) ^ w).wrapping_mul(K); + }; + for row in tab.tableau.data.iter() { + // The `PauliWord` hash cache is disabled for tableau rows + // (`REHASH = false`), so hashing `row.word` would feed a stale zero and + // make every tableau collide; read the bit storage directly. The + // storage word element type is generic (`[u8; N]` vs `[u64; N]`) with no + // numeric bound available here, so view it as POD bytes and fold 8 at a + // time into a `u64` lane — identical on native and wasm. + mix_pod_words(pod_bytes(&row.word.xbits.data), &mut mix); + mix_pod_words(pod_bytes(&row.word.zbits.data), &mut mix); + } + h +} - #[cfg(not(target_arch = "wasm32"))] - { - gxhash::gxhash64(&buf, 0) - } - #[cfg(target_arch = "wasm32")] - { - let mut hasher = FingerprintHasher::default(); - hasher.write(&buf); - hasher.finish() - } - }) +/// Fold a POD byte slice into the `mix` closure 8 bytes (one `u64` lane) at a +/// time, little-endian, zero-padding a short trailing chunk. Keeps the hash +/// independent of the generic storage word width. 
+#[inline] +fn mix_pod_words(bytes: &[u8], mix: &mut impl FnMut(u64)) { + let mut chunks = bytes.chunks_exact(8); + for c in &mut chunks { + mix(u64::from_le_bytes(c.try_into().unwrap())); + } + let rem = chunks.remainder(); + if !rem.is_empty() { + let mut buf = [0u8; 8]; + buf[..rem.len()].copy_from_slice(rem); + mix(u64::from_le_bytes(buf)); + } } /// Per-row mask (splitmix64 of `(index, salt)`); a stable pure function used From 5ea2c00f0a00841197854eb6b2cb7fa4cbd31905 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:15:50 +0200 Subject: [PATCH 15/24] Revert "perf(tableau-sum): direct single-pass word_fingerprint (drop gather+gxhash)" This reverts commit 34b41ceba941677b3fef69a509d2d3333c05cb2f. --- crates/ppvm-tableau-sum/src/storage/mod.rs | 80 ++++++++++++---------- 1 file changed, 45 insertions(+), 35 deletions(-) diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs index fc6d5d212..268a3a091 100644 --- a/crates/ppvm-tableau-sum/src/storage/mod.rs +++ b/crates/ppvm-tableau-sum/src/storage/mod.rs @@ -7,6 +7,16 @@ pub mod vec; pub use entry_store::{Branch, EntryStore}; use fxhash::FxHashMap; +use ppvm_traits::traits::Clifford; + +// Hasher for the structural `word_fingerprint`. gxhash (AES-based) is fastest on +// native and exposes a `gxhash64` bulk free function, but it needs hardware AES +// and does not build on wasm32, so fall back to fxhash there. The fingerprint is +// a transient in-memory dedup key — collisions are resolved by +// `structurally_equal`, and it is never persisted or compared across builds — so +// the hasher may differ per target without affecting results. 
+#[cfg(target_arch = "wasm32")] +use fxhash::FxHasher as FingerprintHasher; use num::{ Complex, One, Zero, complex::{Complex64, ComplexFloat}, @@ -15,9 +25,20 @@ use ppvm_tableau::{ data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex, }; use ppvm_traits::config::Config; -use ppvm_traits::traits::Clifford; +#[cfg(target_arch = "wasm32")] +use std::hash::Hasher; use std::ops::AddAssign; +// Reusable per-thread scratch buffer for `word_fingerprint`. Gathering every +// row's word bytes into one contiguous slice lets us hash in a single bulk call +// instead of two tiny `Hash::hash` writes per row (high per-call overhead). +// Cleared (capacity retained) per call, so it adapts to any row count / qubit +// width without re-allocating. Bytes (not the storage word type) because the +// storage element width (`[u8; N]` vs `[u64; N]`) is generic at this call site. +thread_local! { + static WORD_FP_BUF: std::cell::RefCell> = const { std::cell::RefCell::new(Vec::new()) }; +} + /// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage` /// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes /// are fully initialized and `u8`-aligned. @@ -38,43 +59,32 @@ fn pod_bytes(value: &A) -> &[u8] { pub fn word_fingerprint(tab: &GeneralizedTableau) -> u64 where T: Config, + I:, C: SparseVector, I>, { - // fxhash-style multiplicative-rotate mix, fed the raw storage words of - // every row's x then z bits. No allocation, no thread_local, single pass. - const K: u64 = 0x51_7c_c1_b7_27_22_0a_95; // fxhash constant - let mut h: u64 = 0; - let mut mix = |w: u64| { - h = (h.rotate_left(5) ^ w).wrapping_mul(K); - }; - for row in tab.tableau.data.iter() { - // The `PauliWord` hash cache is disabled for tableau rows - // (`REHASH = false`), so hashing `row.word` would feed a stale zero and - // make every tableau collide; read the bit storage directly. 
The - // storage word element type is generic (`[u8; N]` vs `[u64; N]`) with no - // numeric bound available here, so view it as POD bytes and fold 8 at a - // time into a `u64` lane — identical on native and wasm. - mix_pod_words(pod_bytes(&row.word.xbits.data), &mut mix); - mix_pod_words(pod_bytes(&row.word.zbits.data), &mut mix); - } - h -} + WORD_FP_BUF.with(|cell| { + let mut buf = cell.borrow_mut(); + // Clear retains capacity; refill with every row's bits as raw bytes. + buf.clear(); + for row in tab.tableau.data.iter() { + // Gather the Pauli bits directly: the `PauliWord` hash cache is + // disabled for tableau rows (`REHASH = false`), so hashing + // `row.word` would feed a stale zero and make every tableau collide. + buf.extend_from_slice(pod_bytes(&row.word.xbits.data)); + buf.extend_from_slice(pod_bytes(&row.word.zbits.data)); + } -/// Fold a POD byte slice into the `mix` closure 8 bytes (one `u64` lane) at a -/// time, little-endian, zero-padding a short trailing chunk. Keeps the hash -/// independent of the generic storage word width. 
-#[inline] -fn mix_pod_words(bytes: &[u8], mix: &mut impl FnMut(u64)) { - let mut chunks = bytes.chunks_exact(8); - for c in &mut chunks { - mix(u64::from_le_bytes(c.try_into().unwrap())); - } - let rem = chunks.remainder(); - if !rem.is_empty() { - let mut buf = [0u8; 8]; - buf[..rem.len()].copy_from_slice(rem); - mix(u64::from_le_bytes(buf)); - } + #[cfg(not(target_arch = "wasm32"))] + { + gxhash::gxhash64(&buf, 0) + } + #[cfg(target_arch = "wasm32")] + { + let mut hasher = FingerprintHasher::default(); + hasher.write(&buf); + hasher.finish() + } + }) } /// Per-row mask (splitmix64 of `(index, salt)`); a stable pure function used From d491621054464a35cde88f0e759dd1fac1d7cbce Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:16:18 +0200 Subject: [PATCH 16/24] chore(autotune): plan direct-word-hash approach --- .../direct-word-hash/prompts.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md diff --git a/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md new file mode 100644 index 000000000..7cbbf1987 --- /dev/null +++ b/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md @@ -0,0 +1,15 @@ +# Approach: direct single-pass word_fingerprint (no gather, no thread_local) + +## Hypothesis +`word_fingerprint` currently gathers all row words into a thread_local buffer and +calls `gxhash64` (the `with` frame is 22% self). The gather (memcpy ~2.7KB/entry), +the thread_local access, and the separate hash pass have overhead. Replace with a +direct single-pass scalar hash that reads the row words straight from the tableau +(no buffer, no thread_local, one pass). fxhash-style mixing is proven adequate +here (it's the existing wasm fallback) and `structurally_equal` resolves any +extra collisions (it's <1% of runtime, lots of headroom). 
Bonus: the hash becomes +portable (same on native + wasm), simplifying the cfg. + +## Target +`./target/release/examples/msd-noisy-bench`; baseline now build_median ~552ms. +Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us. From b1c0ed532517bd53aa1a47782e4e0d1ceb7f7e86 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:16:24 +0200 Subject: [PATCH 17/24] chore(autotune): record direct-word-hash discard --- docs/autotune/2026-06-23-tableau-sum-build/log.md | 1 + docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md index 3dc311a7b..22fb3f312 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/log.md +++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md @@ -12,3 +12,4 @@ - KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy). - KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). word_fingerprint now gathers all row words into a reused thread_local Vec and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass. - KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). 
RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged. +- DISCARD direct-word-hash: scalar fxhash-style fold REGRESSED 552ms -> 921ms. Reverted. Lesson: word_fingerprint is SIMD-throughput-bound; gxhash (even with the gather+thread_local) beats a scalar byte/word fold. The gather+gxhash is near-optimal for full hashing. => the ONLY way to cut the 22% word-hash is to avoid re-hashing entirely (incremental/Zobrist fingerprint maintained through gates). Next: incremental fingerprinting. diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml index 3a64c189d..0cc1e5130 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml +++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml @@ -26,3 +26,10 @@ "build_median_ms" = 552.0 "per_shot_ns" = 22300.0 "branches" = 2025.0 +[[metric]] +"commit" = "d4916210" +"status" = "discard" +"description" = "direct single-pass scalar word_fingerprint (no gather/gxhash) \u2014 REGRESSED" +"build_median_ms" = 921.0 +"per_shot_ns" = 22300.0 +"branches" = 2025.0 From 4be19ee3afae3273082c217ccbd56c38369df8ee Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:22:30 +0200 Subject: [PATCH 18/24] perf(tableau-sum): direct-word column reads in pauli noise + virtual compare Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/ppvm-tableau-sum/src/noise.rs | 17 +++++++++++++---- crates/ppvm-tableau-sum/src/storage/mod.rs | 21 ++++++++++++++++++--- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/crates/ppvm-tableau-sum/src/noise.rs b/crates/ppvm-tableau-sum/src/noise.rs index d6799d035..ad1bad2b1 100644 --- a/crates/ppvm-tableau-sum/src/noise.rs +++ b/crates/ppvm-tableau-sum/src/noise.rs @@ -3,7 
+3,7 @@ use std::fmt::Debug; -use bitvec::view::BitView; +use bitvec::view::{BitView, BitViewSized}; use num::{ Complex, One, PrimInt, ToPrimitive, Zero, complex::{Complex64, ComplexFloat}, @@ -20,7 +20,9 @@ use rand::{RngExt, rngs::SmallRng}; use crate::{ data::GeneralizedTableauSum, - storage::{Branch, BranchMutation, EntryStore, RowMasks, loss_mask, pauli_branch_phase_loss}, + storage::{ + Branch, BranchMutation, EntryStore, RowMasks, bit_at, loss_mask, pauli_branch_phase_loss, + }, }; fn single_qubit_loss_branch( @@ -215,6 +217,11 @@ where // Precompute the per-row sign masks once instead of recomputing the // splitmix `sign_mask` per row per entry in the hot loop below. let masks = RowMasks::new(self.n_qubits); + // The store-word index / bit position of column `addr0` are the same for + // every entry and row, so resolve them once (Lsb0 convention). + let bits_per_word = std::mem::size_of::<::Store>() * 8; + let word_idx = addr0 / bits_per_word; + let bit = addr0 % bits_per_word; let mut idx = 0usize; self.entries .for_each_mut_with_keys(|tab, p_sum, word_fp, phase_loss| { @@ -226,8 +233,10 @@ where let (mut dx, mut dy, mut dz) = (0u64, 0u64, 0u64); for (row, pw) in tab.tableau.data.iter().enumerate() { - let x: bool = pw.word.xbits[addr0]; - let z: bool = pw.word.zbits[addr0]; + let xw = pw.word.xbits.data.as_raw_slice(); + let zw = pw.word.zbits.data.as_raw_slice(); + let x: bool = bit_at(xw, word_idx, bit); + let z: bool = bit_at(zw, word_idx, bit); let m = masks.sign[row]; if z { dx ^= m; diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs index 268a3a091..6d9393fcb 100644 --- a/crates/ppvm-tableau-sum/src/storage/mod.rs +++ b/crates/ppvm-tableau-sum/src/storage/mod.rs @@ -15,10 +15,11 @@ use ppvm_traits::traits::Clifford; // a transient in-memory dedup key — collisions are resolved by // `structurally_equal`, and it is never persisted or compared across builds — so // the hasher may differ per target 
without affecting results. +use bitvec::view::{BitView, BitViewSized}; #[cfg(target_arch = "wasm32")] use fxhash::FxHasher as FingerprintHasher; use num::{ - Complex, One, Zero, + Complex, One, PrimInt, Zero, complex::{Complex64, ComplexFloat}, }; use ppvm_tableau::{ @@ -39,6 +40,14 @@ thread_local! { static WORD_FP_BUF: std::cell::RefCell> = const { std::cell::RefCell::new(Vec::new()) }; } +/// Read a single bit from a raw store-word slice (Lsb0 convention). Skips the +/// per-call word/bit recomputation and bounds-check that `BitArray`'s `Index` +/// does, for the hot per-row column reads in noise propagation. +#[inline] +pub(crate) fn bit_at(words: &[S], word_idx: usize, bit: usize) -> bool { + (words[word_idx] >> bit) & S::one() != S::zero() +} + /// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage` /// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes /// are fully initialized and `u8`-aligned. @@ -334,6 +343,7 @@ pub(crate) fn structurally_equal_mutated( ) -> bool where T: Config, + <::Storage as BitView>::Store: PrimInt, T::Coeff: One + Zero + Clone + num::Num + PartialOrd, Complex: std::ops::Mul> + AddAssign @@ -385,12 +395,17 @@ where } } BranchMutation::Pauli { op, addr0 } => { + let bits_per_word = std::mem::size_of::<::Store>() * 8; + let word_idx = addr0 / bits_per_word; + let bit = addr0 % bits_per_word; for (re, rp) in existing.tableau.data.iter().zip(parent.tableau.data.iter()) { if re.word != rp.word { return false; } - let x: bool = rp.word.xbits[addr0]; - let z: bool = rp.word.zbits[addr0]; + let xw = rp.word.xbits.data.as_raw_slice(); + let zw = rp.word.zbits.data.as_raw_slice(); + let x: bool = bit_at(xw, word_idx, bit); + let z: bool = bit_at(zw, word_idx, bit); let flip = match op { 1 => z, 2 => x ^ z, From 4b8fa6f4a4d7f8a3bcaa36c435ff5beaaabdd0ff Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:23:27 +0200 Subject: [PATCH 19/24] chore(autotune): plan 
direct-column-read approach --- .../direct-column-read/prompts.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md diff --git a/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md new file mode 100644 index 000000000..5b565edde --- /dev/null +++ b/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md @@ -0,0 +1,14 @@ +# Approach: direct-word column reads in hot loops + +## Hypothesis +`pauli_error`'s per-entry dx/dy/dz loop and `structurally_equal_mutated`'s Pauli +branch read the tableau column with `bitvec`'s generic `Index` (`pw.word.xbits[addr0]`), +which recomputes word/bit and bounds-checks per access — done for all 2n rows of +every entry on every depolarize (part of the 23% `for_each_mut_with_keys` self). +Replace with direct storage-word access: compute `word_idx = addr0 / bits_per_word` +and `bit = addr0 % bits_per_word` ONCE, then test `(data.as_raw_slice()[word_idx] >> bit) & 1`. +Same bit values (Lsb0, matches `Tableau::build_masks`), so branches stay 2025. + +## Target +`./target/release/examples/msd-noisy-bench`; baseline now build_median ~542ms. +Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us. 
From 1c2e6e30b203e2389a03a1e8100deb944c3bb964 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:23:27 +0200 Subject: [PATCH 20/24] chore(autotune): record direct-column-read keep (cumulative 5.04x) --- docs/autotune/2026-06-23-tableau-sum-build/log.md | 1 + docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md index 22fb3f312..f00469525 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/log.md +++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md @@ -13,3 +13,4 @@ - KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). word_fingerprint now gathers all row words into a reused thread_local Vec and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass. - KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged. - DISCARD direct-word-hash: scalar fxhash-style fold REGRESSED 552ms -> 921ms. Reverted. Lesson: word_fingerprint is SIMD-throughput-bound; gxhash (even with the gather+thread_local) beats a scalar byte/word fold. The gather+gxhash is near-optimal for full hashing. => the ONLY way to cut the 22% word-hash is to avoid re-hashing entirely (incremental/Zobrist fingerprint maintained through gates). Next: incremental fingerprinting. +- KEEP direct-column-read: build_median 542ms -> 520ms (~4%; cumulative 5.04x). 
bit_at() helper reads the storage word directly (word_idx/bit precomputed once) instead of bitvec Index per row, in pauli_error dx/dy/dz and structurally_equal_mutated. accuracy identical, per_shot unchanged. diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml index 0cc1e5130..3d943cd80 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml +++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml @@ -33,3 +33,10 @@ "build_median_ms" = 921.0 "per_shot_ns" = 22300.0 "branches" = 2025.0 +[[metric]] +"commit" = "4b8fa6f4" +"status" = "keep" +"description" = "direct-word column reads in pauli_error dx/dy/dz + structurally_equal_mutated" +"build_median_ms" = 520.0 +"per_shot_ns" = 22100.0 +"branches" = 2025.0 From 683d3ef662b4060b94fa9e399f8398f51a7177f1 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Tue, 23 Jun 2026 21:30:48 +0200 Subject: [PATCH 21/24] chore(autotune): record plateau analysis + incremental-fingerprint opportunity --- docs/autotune/2026-06-23-tableau-sum-build/log.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md index f00469525..02c7f8683 100644 --- a/docs/autotune/2026-06-23-tableau-sum-build/log.md +++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md @@ -14,3 +14,6 @@ - KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged. - DISCARD direct-word-hash: scalar fxhash-style fold REGRESSED 552ms -> 921ms. Reverted. Lesson: word_fingerprint is SIMD-throughput-bound; gxhash (even with the gather+thread_local) beats a scalar byte/word fold. 
The gather+gxhash is near-optimal for full hashing. => the ONLY way to cut the 22% word-hash is to avoid re-hashing entirely (incremental/Zobrist fingerprint maintained through gates). Next: incremental fingerprinting. - KEEP direct-column-read: build_median 542ms -> 520ms (~4%; cumulative 5.04x). bit_at() helper reads the storage word directly (word_idx/bit precomputed once) instead of bitvec Index per row, in pauli_error dx/dy/dz and structurally_equal_mutated. accuracy identical, per_shot unchanged. +- PLATEAU ANALYSIS @ 520ms (5.04x): profile now: word-hash gxhash gather ('with') 22%, phase_loss_hash_with 14.5% (=> rebuild ~36.7%), noise branch-building (for_each self) 21%, clifford gates 23% (cz 10.5% + sqrt_* 12.7%), sampling ~10% (not build), T compute_decomposition 4.5%, merge 3.8%. The rebuild is SIMD-compute-bound (proven: scalar-fold DISCARD regressed), so full re-hashing is near-optimal. Remaining gate cost is inherent (gate applied to every one of 2025 entries x 170 rows). + +NEXT OPPORTUNITY (not done — high complexity, conflicts with user 'simplicity over cleverness' pref; ~10-20% est, revert-safe due to loud failure via test_word_fingerprint_cache_stays_consistent + branches!=2025 guard): cell-level incremental ('Zobrist') word_fingerprint maintained through clifford gates so rebuild is skipped for the common gates. Replace gxhash word_fp with XOR over (row r, qubit q) of (xbit?XM(r,q):0)^(zbit?ZM(r,q):0). Keep dirty+full-recompute fallback for non-handled gates (T, s, cnot, x/y/z standalone). Incremental update per gate reads OLD column bits (direct, via bit_at) and XORs precomputed column masks; gate-delta rules (single gate at col q, from old x,z): h: x_chg=z_chg=(x^z), sign=x&z; sqrt_x: x_chg=z,z_chg=0,sign=z&!x; sqrt_x_dag: x_chg=z,sign=x&z; sqrt_y: x_chg=z_chg=(x^z),sign=x&!z; sqrt_y_dag: x_chg=z_chg=(x^z),sign=z&!x. cz(c,t) from old xa,za,xb,zb: z@c flips iff xb (XOR ZM[c][r]), z@t flips iff xa (XOR ZM[t][r]), x unchanged, sign=xa&xb&(za^zb). 
phase_loss uses existing sign_mask. Estimated ~11% (non-fused, gates left intact) to ~20% (fused gate+delta in one direct pass). From 863eadc823276ed59bc95b5efce5319a61604c3c Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Wed, 24 Jun 2026 08:51:10 +0200 Subject: [PATCH 22/24] chore(autotune): drop session scratch docs (kept out of PR) The per-iteration log, metric ledger, and prompt records were working notes for the tuning session; they are summarized in the PR description rather than checked in. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../bulk-word-hash/prompts.md | 15 ------- .../direct-column-read/prompts.md | 14 ------- .../direct-word-hash/prompts.md | 15 ------- .../lazy-materialization/prompts.md | 33 --------------- .../2026-06-23-tableau-sum-build/log.md | 19 --------- .../2026-06-23-tableau-sum-build/metric.toml | 42 ------------------- .../precompute-masks/prompts.md | 16 ------- 7 files changed, 154 deletions(-) delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/log.md delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/metric.toml delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md diff --git a/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md deleted file mode 100644 index f928d0d17..000000000 --- a/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md +++ /dev/null @@ -1,15 +0,0 @@ -# Approach: bulk word_fingerprint hashing - -## Hypothesis -After lazy materialization, `rebuild_fingerprints_if_dirty` dominates 
(47% self, -61% inclusive); it re-hashes every entry's words after each clifford gate marks -them dirty. `word_fingerprint` currently does 2 small `Hash::hash` calls per row -(`xbits.data` then `zbits.data`) = ~340 hasher writes for 170 rows, with -per-call overhead. Gather the row words into one contiguous buffer and hash once -with `gxhash::gxhash64` (native) — far less per-call overhead, single SIMD pass. - -## Target -`./target/release/examples/msd-noisy-bench`; baseline now build_median ~958ms. -Must keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632. -Fingerprints are transient dedup keys (resolved by structurally_equal), so the -hash VALUE may change freely as long as it's consistent within a build. diff --git a/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md deleted file mode 100644 index 5b565edde..000000000 --- a/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md +++ /dev/null @@ -1,14 +0,0 @@ -# Approach: direct-word column reads in hot loops - -## Hypothesis -`pauli_error`'s per-entry dx/dy/dz loop and `structurally_equal_mutated`'s Pauli -branch read the tableau column with `bitvec`'s generic `Index` (`pw.word.xbits[addr0]`), -which recomputes word/bit and bounds-checks per access — done for all 2n rows of -every entry on every depolarize (part of the 23% `for_each_mut_with_keys` self). -Replace with direct storage-word access: compute `word_idx = addr0 / bits_per_word` -and `bit = addr0 % bits_per_word` ONCE, then test `(data.as_raw_slice()[word_idx] >> bit) & 1`. -Same bit values (Lsb0, matches `Tableau::build_masks`), so branches stay 2025. - -## Target -`./target/release/examples/msd-noisy-bench`; baseline now build_median ~542ms. -Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us. 
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md deleted file mode 100644 index 7cbbf1987..000000000 --- a/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md +++ /dev/null @@ -1,15 +0,0 @@ -# Approach: direct single-pass word_fingerprint (no gather, no thread_local) - -## Hypothesis -`word_fingerprint` currently gathers all row words into a thread_local buffer and -calls `gxhash64` (the `with` frame is 22% self). The gather (memcpy ~2.7KB/entry), -the thread_local access, and the separate hash pass have overhead. Replace with a -direct single-pass scalar hash that reads the row words straight from the tableau -(no buffer, no thread_local, one pass). fxhash-style mixing is proven adequate -here (it's the existing wasm fallback) and `structurally_equal` resolves any -extra collisions (it's <1% of runtime, lots of headroom). Bonus: the hash becomes -portable (same on native + wasm), simplifying the cfg. - -## Target -`./target/release/examples/msd-noisy-bench`; baseline now build_median ~552ms. -Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us. diff --git a/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md deleted file mode 100644 index cb1883d20..000000000 --- a/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md +++ /dev/null @@ -1,33 +0,0 @@ -# Approach: lazy branch materialization (loss_channel + pauli_error) - -## Hypothesis -Build time is dominated by `fork` (deep-clone of a ~7KB `GeneralizedTableau`, -47% inclusive, 32% `_platform_memmove` self). Each `depolarize1` forks 3 full -tableaux per entry and `loss_channel` forks 1 per entry, but ~85% of those -branches are immediately merged into an existing entry or dropped below -`sum_cutoff`. Those clones are pure waste. 
- -Key fact (verified in `ppvm-tableau/src/gates/clifford.rs`): applying X/Y/Z to a -`GeneralizedTableau` only flips per-row **sign bits** and leaves the Pauli -words, the `coefficients` vector, and `is_lost` identical to the parent. Loss -only sets one `is_lost` bit. So a branch's fingerprint and structural identity -are derivable from the parent **without cloning**. Materialize (clone+mutate) -only when a branch survives as a *new* entry. - -Per-row sign-flip rule at column `addr0` (matches the gate code exactly): -- X flips sign of row iff `z[addr0] == 1` -- Y flips sign of row iff `x[addr0] ^ z[addr0] == 1` -- Z flips sign of row iff `x[addr0] == 1` -(Only phase bit 1 = sign; the imaginary bit 0 is untouched. So the phase/loss -hash delta is `XOR sign_mask(row)` over flipped rows — same as the existing -`pauli_branch_phase_loss`.) - -## Target metric -`cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench` -Baseline: build_median ~2620ms, per_shot ~22.5us, branches=2025, -sum_p2=0.725135705447, top5[0]=0.8515413524292632. The math must be unchanged: -branches stays 2025 and the accuracy fingerprint must match to ~1e-9. - -## Expected win -Replace ~3N depolarize clones + ~N loss clones with clones only for survivors -(~1.2N total entries). Target 25-40% build-time reduction. diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md deleted file mode 100644 index 02c7f8683..000000000 --- a/docs/autotune/2026-06-23-tableau-sum-build/log.md +++ /dev/null @@ -1,19 +0,0 @@ -# Log for 2026-06-23-tableau-sum-build - -## 2026-06-23 -- Architecture Notes / baseline profile (samply, 2733 samples): -- Target: examples/msd-noisy build time. Baseline build_median ~2620ms, per_shot ~22.5us, final branches=2025. Config Byte8F64<2> (storage [u64;2]=128bit), index u128, 85 qubits => tableau has 170 rows, each row ~32B word data. 
-- INCLUSIVE: for_each_mut_with_keys 85%; depolarize1 53%; fork(clone) 47%; loss_channel 38%; rebuild_fingerprints_if_dirty 23%; mimalloc alloc ~15-20%. -- SELF: _platform_memmove 32% (the tableau deep-clone in fork); rebuild_fingerprints_if_dirty 18% (re-hashes all words after every clifford gate marks dirty); for_each_mut_with_keys 11%; phase_loss_hash 5%; gates y/cz/sqrt_* ~10% total. -- Root causes: (1) noise branching deep-clones a full ~7KB tableau per branch; depolarize forks 3x/entry, ~85% of branches are then merged or truncated -> wasted clones. (2) every clifford gate marks all entries dirty -> next noise op re-hashes all words of all entries. -- Accuracy guard: branch count must stay 2025 (optimizations must not change the math). Cutoff fixed at 1e-7. -- Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench -- Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan. -- KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy). 
-- KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). word_fingerprint now gathers all row words into a reused thread_local Vec and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass. -- KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged. -- DISCARD direct-word-hash: scalar fxhash-style fold REGRESSED 552ms -> 921ms. Reverted. Lesson: word_fingerprint is SIMD-throughput-bound; gxhash (even with the gather+thread_local) beats a scalar byte/word fold. The gather+gxhash is near-optimal for full hashing. => the ONLY way to cut the 22% word-hash is to avoid re-hashing entirely (incremental/Zobrist fingerprint maintained through gates). Next: incremental fingerprinting. -- KEEP direct-column-read: build_median 542ms -> 520ms (~4%; cumulative 5.04x). bit_at() helper reads the storage word directly (word_idx/bit precomputed once) instead of bitvec Index per row, in pauli_error dx/dy/dz and structurally_equal_mutated. accuracy identical, per_shot unchanged. -- PLATEAU ANALYSIS @ 520ms (5.04x): profile now: word-hash gxhash gather ('with') 22%, phase_loss_hash_with 14.5% (=> rebuild ~36.7%), noise branch-building (for_each self) 21%, clifford gates 23% (cz 10.5% + sqrt_* 12.7%), sampling ~10% (not build), T compute_decomposition 4.5%, merge 3.8%. The rebuild is SIMD-compute-bound (proven: scalar-fold DISCARD regressed), so full re-hashing is near-optimal. Remaining gate cost is inherent (gate applied to every one of 2025 entries x 170 rows). 
- -NEXT OPPORTUNITY (not done — high complexity, conflicts with user 'simplicity over cleverness' pref; ~10-20% est, revert-safe due to loud failure via test_word_fingerprint_cache_stays_consistent + branches!=2025 guard): cell-level incremental ('Zobrist') word_fingerprint maintained through clifford gates so rebuild is skipped for the common gates. Replace gxhash word_fp with XOR over (row r, qubit q) of (xbit?XM(r,q):0)^(zbit?ZM(r,q):0). Keep dirty+full-recompute fallback for non-handled gates (T, s, cnot, x/y/z standalone). Incremental update per gate reads OLD column bits (direct, via bit_at) and XORs precomputed column masks; gate-delta rules (single gate at col q, from old x,z): h: x_chg=z_chg=(x^z), sign=x&z; sqrt_x: x_chg=z,z_chg=0,sign=z&!x; sqrt_x_dag: x_chg=z,sign=x&z; sqrt_y: x_chg=z_chg=(x^z),sign=x&!z; sqrt_y_dag: x_chg=z_chg=(x^z),sign=z&!x. cz(c,t) from old xa,za,xb,zb: z@c flips iff xb (XOR ZM[c][r]), z@t flips iff xa (XOR ZM[t][r]), x unchanged, sign=xa&xb&(za^zb). phase_loss uses existing sign_mask. Estimated ~11% (non-fused, gates left intact) to ~20% (fused gate+delta in one direct pass). 
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml deleted file mode 100644 index 3d943cd80..000000000 --- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml +++ /dev/null @@ -1,42 +0,0 @@ -[[metric]] -"commit" = "f152652e" -"status" = "keep" -"description" = "baseline (msd-noisy build, seeded harness, median of 5)" -"build_median_ms" = 2620.0 -"per_shot_ns" = 22500.0 -"branches" = 2025.0 -[[metric]] -"commit" = "59165235" -"status" = "keep" -"description" = "lazy branch materialization for loss_channel + pauli_error (single-qubit depolarize)" -"build_median_ms" = 958.0 -"per_shot_ns" = 22100.0 -"branches" = 2025.0 -[[metric]] -"commit" = "ad0d0b24" -"status" = "keep" -"description" = "bulk-hash word_fingerprint (gather rows, single gxhash64)" -"build_median_ms" = 573.0 -"per_shot_ns" = 22100.0 -"branches" = 2025.0 -[[metric]] -"commit" = "4fea5841" -"status" = "keep" -"description" = "precompute per-row masks (RowMasks) for phase_loss + pauli dx/dy/dz" -"build_median_ms" = 552.0 -"per_shot_ns" = 22300.0 -"branches" = 2025.0 -[[metric]] -"commit" = "d4916210" -"status" = "discard" -"description" = "direct single-pass scalar word_fingerprint (no gather/gxhash) \u2014 REGRESSED" -"build_median_ms" = 921.0 -"per_shot_ns" = 22300.0 -"branches" = 2025.0 -[[metric]] -"commit" = "4b8fa6f4" -"status" = "keep" -"description" = "direct-word column reads in pauli_error dx/dy/dz + structurally_equal_mutated" -"build_median_ms" = 520.0 -"per_shot_ns" = 22100.0 -"branches" = 2025.0 diff --git a/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md deleted file mode 100644 index 5dc8c0fff..000000000 --- a/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md +++ /dev/null @@ -1,16 +0,0 @@ -# Approach: precompute per-row masks (kill redundant splitmix) - -## Hypothesis 
-`sign_mask(row)`/`imag_mask(row)`/`loss_mask(q)` are splitmix64 hashes of a pure -index. They are recomputed per-row, per-entry, on every op: -- `pauli_error`'s dx/dy/dz loop computes `sign_mask(row)` for all 2n rows of - every entry on every depolarize. -- `phase_loss_hash` (called per entry in `rebuild_fingerprints_if_dirty`) - recomputes sign/imag masks per set phase and loss masks per lost qubit. - -Precompute the per-index mask tables ONCE per op and index them. Values are -identical, so all fingerprints (and the accuracy fingerprint) are unchanged. - -## Target -`./target/release/examples/msd-noisy-bench`; baseline now build_median ~573ms. -Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us. From e35ddd94e721acf417f323563c5488f385ab7b68 Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Wed, 24 Jun 2026 09:19:59 +0200 Subject: [PATCH 23/24] refactor(tableau-sum): use bytemuck::bytes_of in word_fingerprint Replace the hand-rolled `unsafe pod_bytes` byte view with `bytemuck::bytes_of`. `PauliStorage` already requires `bytemuck::Pod`, so the byte view is sound without `unsafe`, matching the existing idiom in `PauliWord::rehash`. Identical codegen (same pointer cast), so build time and accuracy are unchanged (branches=2025, sum_p2 bit-identical). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 1 + crates/ppvm-tableau-sum/Cargo.toml | 1 + crates/ppvm-tableau-sum/src/storage/mod.rs | 17 ++++------------- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f0aaf37d1..b0a5bb9ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1034,6 +1034,7 @@ name = "ppvm-tableau-sum" version = "0.1.0" dependencies = [ "bitvec", + "bytemuck", "criterion 0.8.2", "fxhash", "gxhash", diff --git a/crates/ppvm-tableau-sum/Cargo.toml b/crates/ppvm-tableau-sum/Cargo.toml index 65355efec..aca490053 100644 --- a/crates/ppvm-tableau-sum/Cargo.toml +++ b/crates/ppvm-tableau-sum/Cargo.toml @@ -5,6 +5,7 @@ edition = "2024" [dependencies] bitvec = "1.0.1" +bytemuck = { version = "1", features = ["min_const_generics"] } fxhash = "0.2.1" num = "0.4.3" ppvm-traits = { version = "0.1.0", path = "../ppvm-traits" } diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs index 6d9393fcb..c62f7daa6 100644 --- a/crates/ppvm-tableau-sum/src/storage/mod.rs +++ b/crates/ppvm-tableau-sum/src/storage/mod.rs @@ -48,17 +48,6 @@ pub(crate) fn bit_at(words: &[S], word_idx: usize, bit: usize) -> bo (words[word_idx] >> bit) & S::one() != S::zero() } -/// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage` -/// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes -/// are fully initialized and `u8`-aligned. -#[inline] -fn pod_bytes(value: &A) -> &[u8] { - // SAFETY: `A` is POD (PauliStorage: bytemuck::Pod); reading its - // `size_of::<A>()` initialized bytes as `[u8]` is sound, and the borrow is - // tied to `value`. - unsafe { std::slice::from_raw_parts(value as *const A as *const u8, std::mem::size_of::<A>()) } -} - /// Hash of the `word` (Pauli content) of every row, in order.
This is the /// expensive component (each word is several machine words wide) and is /// *invariant* under X/Y/Z and `is_lost` flips, so a branch inherits it from @@ -79,8 +68,10 @@ where // Gather the Pauli bits directly: the `PauliWord` hash cache is // disabled for tableau rows (`REHASH = false`), so hashing // `row.word` would feed a stale zero and make every tableau collide. - buf.extend_from_slice(pod_bytes(&row.word.xbits.data)); - buf.extend_from_slice(pod_bytes(&row.word.zbits.data)); + // `xbits.data`/`zbits.data` are the `PauliStorage` backing array, + // which is `bytemuck::Pod`, so this byte view is safe and zero-copy. + buf.extend_from_slice(bytemuck::bytes_of(&row.word.xbits.data)); + buf.extend_from_slice(bytemuck::bytes_of(&row.word.zbits.data)); } #[cfg(not(target_arch = "wasm32"))] From a7c8057eea29658cb0e857c609ad446f1a889edf Mon Sep 17 00:00:00 2001 From: David Plankensteiner Date: Thu, 25 Jun 2026 09:57:21 +0200 Subject: [PATCH 24/24] refactor(tableau-sum): type BranchMutation Pauli op as NotIdentity enum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The depolarizing-branch op was a `u8` (1=X, 2=Y, 3=Z), so both matches on it carried a dead `_` catch-all that silently ignored invalid ops. Reuse the existing `ppvm_pauli_word::pattern::NotIdentity` enum (X/Y/Z) instead, which makes `apply_branch_mutation` and the `structurally_equal_mutated` flip rule exhaustive with no catch-all — the invalid state is now unrepresentable. Promotes `NotIdentity` from `pub(crate)` to `pub` and re-exports it from the `pattern` module. Matching is by variant name, so the enum's `X=1, Z=2, Y=3` discriminants don't affect behavior. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/ppvm-pauli-word/src/pattern/data.rs | 5 ++++- crates/ppvm-pauli-word/src/pattern/mod.rs | 2 +- crates/ppvm-tableau-sum/src/noise.rs | 16 +++++++++++++--- crates/ppvm-tableau-sum/src/storage/mod.rs | 19 +++++++++---------- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/crates/ppvm-pauli-word/src/pattern/data.rs b/crates/ppvm-pauli-word/src/pattern/data.rs index f34e48802..7e0d17a68 100644 --- a/crates/ppvm-pauli-word/src/pattern/data.rs +++ b/crates/ppvm-pauli-word/src/pattern/data.rs @@ -8,11 +8,14 @@ use bincode::{Decode, Encode}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +/// A single-qubit Pauli that is not the identity: `X`, `Y`, or `Z`. Encoded so +/// the low two bits match [`Pauli`](crate::Pauli) (`X = 1`, `Z = 2`, `Y = 3`), +/// which lets `From<NotIdentity> for Pauli` be a no-op transmute. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] #[cfg_attr(feature = "bincode", derive(Encode, Decode))] #[repr(u8)] -pub(crate) enum NotIdentity { +pub enum NotIdentity { X = 1, Z = 2, Y = 3, diff --git a/crates/ppvm-pauli-word/src/pattern/mod.rs b/crates/ppvm-pauli-word/src/pattern/mod.rs index 2147de525..93465d567 100644 --- a/crates/ppvm-pauli-word/src/pattern/mod.rs +++ b/crates/ppvm-pauli-word/src/pattern/mod.rs @@ -10,4 +10,4 @@ mod parse; mod trace; pub use contains::Contains; -pub use data::PauliPattern; +pub use data::{NotIdentity, PauliPattern}; diff --git a/crates/ppvm-tableau-sum/src/noise.rs b/crates/ppvm-tableau-sum/src/noise.rs index ad1bad2b1..eedbde003 100644 --- a/crates/ppvm-tableau-sum/src/noise.rs +++ b/crates/ppvm-tableau-sum/src/noise.rs @@ -8,6 +8,7 @@ use num::{ Complex, One, PrimInt, ToPrimitive, Zero, complex::{Complex64, ComplexFloat}, }; +use ppvm_pauli_word::pattern::NotIdentity; use ppvm_tableau::{ data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex, 
}; @@ -251,21 +252,30 @@ where branches.push(( parent_idx, - BranchMutation::Pauli { op: 1, addr0 }, + BranchMutation::Pauli { + op: NotIdentity::X, + addr0, + }, p_sum.clone() * p[0].clone(), word_fp, phase_loss ^ dx, )); branches.push(( parent_idx, - BranchMutation::Pauli { op: 2, addr0 }, + BranchMutation::Pauli { + op: NotIdentity::Y, + addr0, + }, p_sum.clone() * p[1].clone(), word_fp, phase_loss ^ dy, )); branches.push(( parent_idx, - BranchMutation::Pauli { op: 3, addr0 }, + BranchMutation::Pauli { + op: NotIdentity::Z, + addr0, + }, p_sum.clone() * p[2].clone(), word_fp, phase_loss ^ dz, diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs index c62f7daa6..c4320146c 100644 --- a/crates/ppvm-tableau-sum/src/storage/mod.rs +++ b/crates/ppvm-tableau-sum/src/storage/mod.rs @@ -22,6 +22,7 @@ use num::{ Complex, One, PrimInt, Zero, complex::{Complex64, ComplexFloat}, }; +use ppvm_pauli_word::pattern::NotIdentity; use ppvm_tableau::{ data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex, }; @@ -289,8 +290,8 @@ where /// cloning, materializing the tableau only for surviving new entries. #[derive(Clone, Copy, Debug)] pub enum BranchMutation { - /// Apply Pauli `op` (1=X, 2=Y, 3=Z) at `addr0`: flips per-row sign bits only. - Pauli { op: u8, addr0: usize }, + /// Apply a non-identity Pauli at `addr0`: flips per-row sign bits only. + Pauli { op: NotIdentity, addr0: usize }, /// Mark qubit `q` lost (set is_lost[q] = true). 
Loss { q: usize }, } @@ -307,10 +308,9 @@ pub(crate) fn apply_branch_mutation( { match m { BranchMutation::Pauli { op, addr0 } => match op { - 1 => tab.x(addr0), - 2 => tab.y(addr0), - 3 => tab.z(addr0), - _ => {} + NotIdentity::X => tab.x(addr0), + NotIdentity::Y => tab.y(addr0), + NotIdentity::Z => tab.z(addr0), }, BranchMutation::Loss { q } => { tab.is_lost[q] = true; @@ -398,10 +398,9 @@ where let x: bool = bit_at(xw, word_idx, bit); let z: bool = bit_at(zw, word_idx, bit); let flip = match op { - 1 => z, - 2 => x ^ z, - 3 => x, - _ => false, + NotIdentity::X => z, + NotIdentity::Y => x ^ z, + NotIdentity::Z => x, }; let virt_phase = rp.phase ^ ((flip as u8) << 1); if re.phase != virt_phase {