From f152652e4c04ca936e513c35963867773fe1b8c4 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:31:10 +0200
Subject: [PATCH 01/24] chore(autotune): add tableau-sum-build experiment +
 deterministic bench harness

---
 .../examples/msd-noisy-bench.rs               | 227 ++++++++++++++++++
 .../2026-06-23-tableau-sum-build/log.md       |   3 +
 .../2026-06-23-tableau-sum-build/metric.toml  |   0
 3 files changed, 230 insertions(+)
 create mode 100644 crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs
 create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/log.md
 create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/metric.toml

diff --git a/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs b/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs
new file mode 100644
index 000000000..6377325c8
--- /dev/null
+++ b/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs
@@ -0,0 +1,227 @@
+// SPDX-FileCopyrightText: 2026 The PPVM Authors
+// SPDX-License-Identifier: Apache-2.0
+
+// Deterministic timing harness for the msd-noisy build + sample workload.
+// Mirrors examples/msd-noisy.rs but uses a fixed seed, runs the build several
+// times (median), and asserts the final branch count so an optimization that
+// silently changes the math is caught. Used by the autotune experiment
+// `docs/autotune/2026-06-23-tableau-sum-build`.
+
+use std::time::Instant;
+
+use ppvm_pauli_sum::config::fx64hash::Byte8F64;
+use ppvm_tableau::prelude::*;
+use ppvm_tableau_sum::data::GeneralizedTableauSum;
+
+#[global_allocator]
+static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
+type GTabSum = GeneralizedTableauSum<Byte8F64<2>, u128>;
+
+fn encode(tab: &mut GTabSum, qubits: &[usize], p_loss: f64, p_depolarize: f64) {
+    if qubits.len() == 17 {
+        for i in [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16] {
+            tab.sqrt_y(qubits[i]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+        }
+        for [i, j] in [[1, 3], [7, 10], [12, 14], [13, 16]] {
+            tab.cz(qubits[i], qubits[j]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.loss_channel(qubits[j], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+            tab.depolarize1(qubits[j], p_depolarize);
+        }
+        for i in [7, 16] {
+            tab.sqrt_y_dag(qubits[i]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+        }
+        for [i, j] in [[4, 7], [8, 10], [11, 14], [15, 16]] {
+            tab.cz(qubits[i], qubits[j]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.loss_channel(qubits[j], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+            tab.depolarize1(qubits[j], p_depolarize);
+        }
+        for i in [4, 10, 14, 16] {
+            tab.sqrt_y_dag(qubits[i]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+        }
+        for [i, j] in [[2, 4], [6, 8], [7, 9], [10, 13], [14, 16]] {
+            tab.cz(qubits[i], qubits[j]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.loss_channel(qubits[j], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+            tab.depolarize1(qubits[j], p_depolarize);
+        }
+        for i in [3, 6, 9, 10, 12, 13] {
+            tab.sqrt_y(qubits[i]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+        }
+        for [i, j] in [[0, 2], [3, 6], [5, 8], [10, 12], [11, 13]] {
+            tab.cz(qubits[i], qubits[j]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.loss_channel(qubits[j], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+            tab.depolarize1(qubits[j], p_depolarize);
+        }
+        for i in [1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 14] {
+            tab.sqrt_y(qubits[i]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+        }
+        for [i, j] in [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [12, 15]] {
+            tab.cz(qubits[i], qubits[j]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.loss_channel(qubits[j], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+            tab.depolarize1(qubits[j], p_depolarize);
+        }
+        for i in [0, 2, 5, 6, 8, 10, 12] {
+            tab.sqrt_y_dag(qubits[i]);
+            tab.loss_channel(qubits[i], p_loss);
+            tab.depolarize1(qubits[i], p_depolarize);
+        }
+    }
+}
+
+fn build(seed: u64) -> GTabSum {
+    let n_qubits = 85;
+    let p_loss = 1e-4;
+    let p_depolarize = 1e-4;
+    let sum_cutoff = 1e-7;
+
+    let mut tab: GTabSum = GeneralizedTableauSum::new_with_seed(n_qubits, 1e-10, sum_cutoff, seed);
+    let qubit_addrs: Vec<usize> = (0..n_qubits).collect();
+    let ql: Vec<&[usize]> = qubit_addrs.chunks_exact(17).collect();
+
+    for q in ql.iter() {
+        let encoding_qubit = q[7];
+        tab.h(encoding_qubit);
+        tab.loss_channel(encoding_qubit, p_loss);
+        tab.depolarize1(encoding_qubit, p_depolarize);
+        tab.t(encoding_qubit);
+        tab.loss_channel(encoding_qubit, p_loss);
+        tab.depolarize1(encoding_qubit, p_depolarize);
+        encode(&mut tab, q, p_loss, p_depolarize);
+    }
+
+    for i in [0, 1, 4] {
+        for q in ql[i] {
+            tab.sqrt_x(*q);
+            tab.loss_channel(*q, p_loss);
+            tab.depolarize1(*q, p_depolarize);
+        }
+    }
+    for (control, target) in ql[0].iter().zip(ql[1]) {
+        tab.cz(*control, *target);
+        tab.loss_channel(*control, p_loss);
+        tab.loss_channel(*target, p_loss);
+        tab.depolarize1(*control, p_depolarize);
+        tab.depolarize1(*target, p_depolarize);
+    }
+    for (control, target) in ql[2].iter().zip(ql[3]) {
+        tab.cz(*control, *target);
+        tab.loss_channel(*control, p_loss);
+        tab.loss_channel(*target, p_loss);
+        tab.depolarize1(*control, p_depolarize);
+        tab.depolarize1(*target, p_depolarize);
+    }
+    for q in ql[0] {
+        tab.sqrt_y(*q);
+        tab.loss_channel(*q, p_loss);
+        tab.depolarize1(*q, p_depolarize);
+    }
+    for q in ql[3] {
+        tab.sqrt_y(*q);
+        tab.loss_channel(*q, p_loss);
+        tab.depolarize1(*q, p_depolarize);
+    }
+    for (control, target) in ql[0].iter().zip(ql[2]) {
+        tab.cz(*control, *target);
+        tab.loss_channel(*control, p_loss);
+        tab.loss_channel(*target, p_loss);
+        tab.depolarize1(*control, p_depolarize);
+        tab.depolarize1(*target, p_depolarize);
+    }
+    for (control, target) in ql[3].iter().zip(ql[4]) {
+        tab.cz(*control, *target);
+        tab.loss_channel(*control, p_loss);
+        tab.loss_channel(*target, p_loss);
+        tab.depolarize1(*control, p_depolarize);
+        tab.depolarize1(*target, p_depolarize);
+    }
+    for q in ql[0] {
+        tab.sqrt_x_dag(*q);
+        tab.loss_channel(*q, p_loss);
+        tab.depolarize1(*q, p_depolarize);
+    }
+    for (control, target) in ql[0].iter().zip(ql[4]) {
+        tab.cz(*control, *target);
+        tab.loss_channel(*control, p_loss);
+        tab.loss_channel(*target, p_loss);
+        tab.depolarize1(*control, p_depolarize);
+        tab.depolarize1(*target, p_depolarize);
+    }
+    for (control, target) in ql[1].iter().zip(ql[3]) {
+        tab.cz(*control, *target);
+        tab.loss_channel(*control, p_loss);
+        tab.loss_channel(*target, p_loss);
+        tab.depolarize1(*control, p_depolarize);
+        tab.depolarize1(*target, p_depolarize);
+    }
+    for block in ql.iter().take(5) {
+        for q in *block {
+            tab.sqrt_x_dag(*q);
+            tab.loss_channel(*q, p_loss);
+            tab.depolarize1(*q, p_depolarize);
+        }
+    }
+
+    tab
+}
+
+fn main() {
+    const BUILD_RUNS: usize = 5;
+    const N_SHOTS: usize = 20000;
+    const SEED: u64 = 12345;
+
+    let mut build_times_ms: Vec<f64> = Vec::new();
+    let mut branches = 0usize;
+    for r in 0..BUILD_RUNS {
+        let now = Instant::now();
+        let tab = build(SEED + r as u64);
+        let ms = now.elapsed().as_secs_f64() * 1e3;
+        build_times_ms.push(ms);
+        branches = tab.len();
+    }
+    build_times_ms.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    let median_ms = build_times_ms[build_times_ms.len() / 2];
+    let min_ms = build_times_ms[0];
+
+    // Sample timing on a fresh build.
+    let mut tab = build(SEED);
+    let mut sampler = tab.sampler();
+    let now = Instant::now();
+    sampler.sample_shots(N_SHOTS);
+    let per_shot_ns = now.elapsed().as_nanos() as f64 / N_SHOTS as f64;
+
+    println!("branches       = {}", branches);
+    println!("build_min_ms   = {:.1}", min_ms);
+    println!("build_median_ms= {:.1}", median_ms);
+    println!("per_shot_ns    = {:.1}", per_shot_ns);
+    println!("all_build_ms   = {:?}", build_times_ms);
+
+    // Accuracy guard: the optimizations under test must not change the math,
+    // so the final branch count must stay at the baseline value.
+    const EXPECTED_BRANCHES: usize = 2025;
+    if branches != EXPECTED_BRANCHES {
+        eprintln!(
+            "WARNING: branch count {} != baseline {} — accuracy/structure changed!",
+            branches, EXPECTED_BRANCHES
+        );
+    }
+}
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
new file mode 100644
index 000000000..867032304
--- /dev/null
+++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md
@@ -0,0 +1,3 @@
+# Log for 2026-06-23-tableau-sum-build
+
+## 2026-06-23
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
new file mode 100644
index 000000000..e69de29bb

From 34dcdb21ee90921e7832a3ccdb1bc3e73b78bbdd Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:31:36 +0200
Subject: [PATCH 02/24] chore(autotune): record baseline + architecture notes

---
 docs/autotune/2026-06-23-tableau-sum-build/log.md      | 7 +++++++
 docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
index 867032304..64a344917 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/log.md
+++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md
@@ -1,3 +1,10 @@
 # Log for 2026-06-23-tableau-sum-build
 
 ## 2026-06-23
+- Architecture Notes / baseline profile (samply, 2733 samples):
+- Target: examples/msd-noisy build time. Baseline build_median ~2620ms, per_shot ~22.5us, final branches=2025. Config Byte8F64<2> (storage [u64;2]=128bit), index u128, 85 qubits => tableau has 170 rows, each row ~32B word data.
+- INCLUSIVE: for_each_mut_with_keys 85%; depolarize1 53%; fork(clone) 47%; loss_channel 38%; rebuild_fingerprints_if_dirty 23%; mimalloc alloc ~15-20%.
+- SELF: _platform_memmove 32% (the tableau deep-clone in fork); rebuild_fingerprints_if_dirty 18% (re-hashes all words after every clifford gate marks dirty); for_each_mut_with_keys 11%; phase_loss_hash 5%; gates y/cz/sqrt_* ~10% total.
+- Root causes: (1) noise branching deep-clones a full ~7KB tableau per branch; depolarize forks 3x/entry, ~85% of branches are then merged or truncated -> wasted clones. (2) every clifford gate marks all entries dirty -> next noise op re-hashes all words of all entries.
+- Accuracy guard: branch count must stay 2025 (optimizations must not change the math). Cutoff fixed at 1e-7.
+- Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
index e69de29bb..24d85b30b 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
+++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
@@ -0,0 +1,7 @@
+[[metric]]
+"commit" = "f152652e"
+"status" = "keep"
+"description" = "baseline (msd-noisy build, seeded harness, median of 5)"
+"build_median_ms" = 2620.0
+"per_shot_ns" = 22500.0
+"branches" = 2025.0

From a31b6c907fc6f458aa0c160f4203ea7ccb89a3f2 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:40:55 +0200
Subject: [PATCH 03/24] chore(autotune): add accuracy fingerprint (sum_p2,
 top5) to bench harness

---
 .../ppvm-tableau-sum/examples/msd-noisy-bench.rs  | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs b/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs
index 6377325c8..11e7f018a 100644
--- a/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs
+++ b/crates/ppvm-tableau-sum/examples/msd-noisy-bench.rs
@@ -12,6 +12,7 @@ use std::time::Instant;
 use ppvm_pauli_sum::config::fx64hash::Byte8F64;
 use ppvm_tableau::prelude::*;
 use ppvm_tableau_sum::data::GeneralizedTableauSum;
+use ppvm_tableau_sum::storage::EntryStore;
 
 #[global_allocator]
 static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
@@ -202,6 +203,17 @@ fn main() {
     let median_ms = build_times_ms[build_times_ms.len() / 2];
     let min_ms = build_times_ms[0];
 
+    // Accuracy fingerprint: the optimizations under test must not change the
+    // math, so the multiset of branch probabilities must be invariant (up to
+    // float summation-order noise). Capture sum(p), sum(p^2) (participation
+    // ratio), and the top-5 probabilities from a fresh deterministic build.
+    let tab_acc = build(SEED);
+    let mut probs: Vec<f64> = tab_acc.entries.iter().map(|(_, p)| *p).collect();
+    probs.sort_by(|a, b| b.partial_cmp(a).unwrap());
+    let sum_p: f64 = probs.iter().sum();
+    let sum_p2: f64 = probs.iter().map(|p| p * p).sum();
+    let top5: Vec<f64> = probs.iter().take(5).copied().collect();
+
     // Sample timing on a fresh build.
     let mut tab = build(SEED);
     let mut sampler = tab.sampler();
@@ -213,6 +225,9 @@ fn main() {
     println!("build_min_ms   = {:.1}", min_ms);
     println!("build_median_ms= {:.1}", median_ms);
     println!("per_shot_ns    = {:.1}", per_shot_ns);
+    println!("sum_p          = {:.12}", sum_p);
+    println!("sum_p2         = {:.12}", sum_p2);
+    println!("top5_p         = {:?}", top5);
     println!("all_build_ms   = {:?}", build_times_ms);
 
     // Accuracy guard: the optimizations under test must not change the math,

From 03150ddbb35f48ed1b1eacd2101bd7d60ebe7ac7 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:40:56 +0200
Subject: [PATCH 04/24] chore(autotune): record accuracy reference values

---
 docs/autotune/2026-06-23-tableau-sum-build/log.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
index 64a344917..06e93a1c0 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/log.md
+++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md
@@ -8,3 +8,4 @@
 - Root causes: (1) noise branching deep-clones a full ~7KB tableau per branch; depolarize forks 3x/entry, ~85% of branches are then merged or truncated -> wasted clones. (2) every clifford gate marks all entries dirty -> next noise op re-hashes all words of all entries.
 - Accuracy guard: branch count must stay 2025 (optimizations must not change the math). Cutoff fixed at 1e-7.
 - Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench
+- Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan.

From 3e62b7b757dd463029bbe6595c6a98867bf95900 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:43:05 +0200
Subject: [PATCH 05/24] chore(autotune): plan lazy-materialization approach

---
 .../lazy-materialization/prompts.md           | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md
new file mode 100644
index 000000000..cb1883d20
--- /dev/null
+++ b/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md
@@ -0,0 +1,33 @@
+# Approach: lazy branch materialization (loss_channel + pauli_error)
+
+## Hypothesis
+Build time is dominated by `fork` (deep-clone of a ~7KB `GeneralizedTableau`,
+47% inclusive, 32% `_platform_memmove` self). Each `depolarize1` forks 3 full
+tableaux per entry and `loss_channel` forks 1 per entry, but ~85% of those
+branches are immediately merged into an existing entry or dropped below
+`sum_cutoff`. Those clones are pure waste.
+
+Key fact (verified in `ppvm-tableau/src/gates/clifford.rs`): applying X/Y/Z to a
+`GeneralizedTableau` only flips per-row **sign bits** and leaves the Pauli
+words, the `coefficients` vector, and `is_lost` identical to the parent. Loss
+only sets one `is_lost` bit. So a branch's fingerprint and structural identity
+are derivable from the parent **without cloning**. Materialize (clone+mutate)
+only when a branch survives as a *new* entry.
+
+Per-row sign-flip rule at column `addr0` (matches the gate code exactly):
+- X flips sign of row iff `z[addr0] == 1`
+- Y flips sign of row iff `x[addr0] ^ z[addr0] == 1`
+- Z flips sign of row iff `x[addr0] == 1`
+(Only phase bit 1 = sign; the imaginary bit 0 is untouched. So the phase/loss
+hash delta is `XOR sign_mask(row)` over flipped rows — same as the existing
+`pauli_branch_phase_loss`.)
+
+## Target metric
+`cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench`
+Baseline: build_median ~2620ms, per_shot ~22.5us, branches=2025,
+sum_p2=0.725135705447, top5[0]=0.8515413524292632. The math must be unchanged:
+branches stays 2025 and the accuracy fingerprint must match to ~1e-9.
+
+## Expected win
+Replace ~3N depolarize clones + ~N loss clones with clones only for survivors
+(~1.2N total entries). Target 25-40% build-time reduction.

From 5916523571c95b4aedbc500df1bccae5a410a6d4 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:49:34 +0200
Subject: [PATCH 06/24] perf(tableau-sum): lazy branch materialization for
 loss/pauli noise

loss_channel and single-qubit pauli_error now describe each branch as a
BranchMutation of a parent entry instead of deep-cloning a tableau up
front. The merge resolves structural identity against the virtual
(parent + mutation) tableau via structurally_equal_mutated and only
clones the parent when the branch survives as a new entry; merges and
below-cutoff drops never clone. The VecStorage path is the optimized
one; MapStorage materializes parents eagerly (correctness only).

Math is unchanged: msd-noisy benchmark still ends at exactly 2025
branches with sum_p = 1.0, and all 76 ppvm-tableau-sum tests pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/ppvm-tableau-sum/src/noise.rs          | 105 ++++++++-----
 .../src/storage/entry_store.rs                |  17 ++
 crates/ppvm-tableau-sum/src/storage/map.rs    |  36 ++++-
 crates/ppvm-tableau-sum/src/storage/mod.rs    | 147 ++++++++++++++++++
 crates/ppvm-tableau-sum/src/storage/vec.rs    |  76 ++++++++-
 5 files changed, 342 insertions(+), 39 deletions(-)

diff --git a/crates/ppvm-tableau-sum/src/noise.rs b/crates/ppvm-tableau-sum/src/noise.rs
index 287f6e8f2..b12a08057 100644
--- a/crates/ppvm-tableau-sum/src/noise.rs
+++ b/crates/ppvm-tableau-sum/src/noise.rs
@@ -20,7 +20,7 @@ use rand::{RngExt, rngs::SmallRng};
 
 use crate::{
     data::GeneralizedTableauSum,
-    storage::{Branch, EntryStore, loss_mask, pauli_branch_phase_loss},
+    storage::{Branch, BranchMutation, EntryStore, loss_mask, pauli_branch_phase_loss, sign_mask},
 };
 
 fn single_qubit_loss_branch<T, I, C>(
@@ -106,25 +106,34 @@ where
     I: Debug,
 {
     fn loss_channel(&mut self, addr0: usize, p: <T as Config>::Coeff) {
-        let mut branches = Vec::<(GeneralizedTableau<T, I, C>, T::Coeff, u64, u64)>::with_capacity(
-            self.entries.len(),
-        );
+        // Lazy branch materialization: describe each loss branch as a mutation
+        // of its parent entry. The merge clones the parent only when the branch
+        // survives as a NEW entry; merges/below-cutoff drops never clone.
+        let mut branches =
+            Vec::<(usize, BranchMutation, T::Coeff, u64, u64)>::with_capacity(self.entries.len());
+        let mut idx = 0usize;
         self.entries
             .for_each_mut_with_keys(|tab, p_sum, word_fp, phase_loss| {
-                single_qubit_loss_branch(
-                    addr0,
-                    &p,
-                    &mut self.rng,
-                    &mut branches,
-                    tab,
-                    p_sum,
-                    (word_fp, phase_loss),
-                );
+                // Increment for EVERY entry, before the lost check, so
+                // parent_idx aligns with for_each_mut_with_keys' order.
+                let parent_idx = idx;
+                idx += 1;
+                if tab.is_lost[addr0] {
+                    return;
+                }
+                branches.push((
+                    parent_idx,
+                    BranchMutation::Loss { q: addr0 },
+                    p_sum.clone() * p.clone(),
+                    word_fp,
+                    phase_loss ^ loss_mask(addr0),
+                ));
+                *p_sum *= T::Coeff::one() - p.clone();
             });
 
         let needs_renormalize = self
             .entries
-            .insert_or_merge_batch(branches, &self.sum_cutoff);
+            .insert_or_merge_mutated_branches(branches, &self.sum_cutoff);
         if needs_renormalize {
             self.normalize_probabilities();
         }
@@ -196,44 +205,66 @@ where
 {
     fn pauli_error(&mut self, addr0: usize, p: [<T as Config>::Coeff; 3]) {
         let p_total: T::Coeff = p[0].clone() + p[1].clone() + p[2].clone();
-        let mut branches = Vec::<(GeneralizedTableau<T, I, C>, T::Coeff, u64, u64)>::with_capacity(
+        // Lazy branch materialization: describe each X/Y/Z branch as a Pauli
+        // mutation of its parent. The phase/loss delta is computed by walking the
+        // parent's column once (no clone) — X flips rows with z, Y with x^z, Z
+        // with x — matching what `pauli_branch_phase_loss` would produce.
+        let mut branches = Vec::<(usize, BranchMutation, T::Coeff, u64, u64)>::with_capacity(
             3 * self.entries.len(),
         );
-
+        let mut idx = 0usize;
         self.entries
             .for_each_mut_with_keys(|tab, p_sum, word_fp, phase_loss| {
+                let parent_idx = idx;
+                idx += 1;
                 if tab.is_lost[addr0] {
                     return;
                 }
 
-                let tab_seed_x = self.rng.random::<u64>();
-                let tab_seed_y = self.rng.random::<u64>();
-                let tab_seed_z = self.rng.random::<u64>();
-
-                let mut tab_branch_x = tab.fork(Some(tab_seed_x));
-                let mut tab_branch_y = tab.fork(Some(tab_seed_y));
-                let mut tab_branch_z = tab.fork(Some(tab_seed_z));
-
-                tab_branch_x.x(addr0);
-                tab_branch_y.y(addr0);
-                tab_branch_z.z(addr0);
+                let (mut dx, mut dy, mut dz) = (0u64, 0u64, 0u64);
+                for (row, pw) in tab.tableau.data.iter().enumerate() {
+                    let x: bool = pw.word.xbits[addr0];
+                    let z: bool = pw.word.zbits[addr0];
+                    let m = sign_mask(row);
+                    if z {
+                        dx ^= m;
+                    }
+                    if x ^ z {
+                        dy ^= m;
+                    }
+                    if x {
+                        dz ^= m;
+                    }
+                }
 
-                // X/Y/Z flip only phase bits, never the Pauli words, so all three
-                // branches reuse the parent's word-fingerprint and derive their
-                // phase/loss hash from the parent's by XORing the flipped rows.
-                let hx = pauli_branch_phase_loss(tab, &tab_branch_x, phase_loss);
-                let hy = pauli_branch_phase_loss(tab, &tab_branch_y, phase_loss);
-                let hz = pauli_branch_phase_loss(tab, &tab_branch_z, phase_loss);
-                branches.push((tab_branch_x, p_sum.clone() * p[0].clone(), word_fp, hx));
-                branches.push((tab_branch_y, p_sum.clone() * p[1].clone(), word_fp, hy));
-                branches.push((tab_branch_z, p_sum.clone() * p[2].clone(), word_fp, hz));
+                branches.push((
+                    parent_idx,
+                    BranchMutation::Pauli { op: 1, addr0 },
+                    p_sum.clone() * p[0].clone(),
+                    word_fp,
+                    phase_loss ^ dx,
+                ));
+                branches.push((
+                    parent_idx,
+                    BranchMutation::Pauli { op: 2, addr0 },
+                    p_sum.clone() * p[1].clone(),
+                    word_fp,
+                    phase_loss ^ dy,
+                ));
+                branches.push((
+                    parent_idx,
+                    BranchMutation::Pauli { op: 3, addr0 },
+                    p_sum.clone() * p[2].clone(),
+                    word_fp,
+                    phase_loss ^ dz,
+                ));
 
                 *p_sum *= T::Coeff::one() - p_total.clone();
             });
 
         let needs_normalize = self
             .entries
-            .insert_or_merge_batch(branches, &self.sum_cutoff);
+            .insert_or_merge_mutated_branches(branches, &self.sum_cutoff);
         if needs_normalize {
             self.normalize_probabilities();
         }
diff --git a/crates/ppvm-tableau-sum/src/storage/entry_store.rs b/crates/ppvm-tableau-sum/src/storage/entry_store.rs
index 2dfaf9eb4..2cd822263 100644
--- a/crates/ppvm-tableau-sum/src/storage/entry_store.rs
+++ b/crates/ppvm-tableau-sum/src/storage/entry_store.rs
@@ -5,6 +5,8 @@ use num::Complex;
 use ppvm_tableau::{data::GeneralizedTableau, sparsevec::SparseVector};
 use ppvm_traits::config::Config;
 
+use crate::storage::BranchMutation;
+
 /// One branch produced by a noise channel: its tableau, coefficient, and the
 /// cached `(word_fingerprint, phase_loss_hash)` pair, so a merge can recompute
 /// the full fingerprint (`word_fp ^ phase_loss`) without re-hashing the tableau.
@@ -54,6 +56,21 @@ pub trait EntryStore<T: Config, I, C: SparseVector<Complex<T::Coeff>, I>>: Clone
     /// fingerprint is `word_fp ^ phase_loss` — no re-hashing of the tableau.
     fn insert_or_merge_batch(&mut self, branches: Vec<Branch<T, I, C>>, cutoff: &T::Coeff) -> bool;
 
+    /// Like [`insert_or_merge_batch`](Self::insert_or_merge_batch), but each
+    /// branch is described lazily as a [`BranchMutation`] of a parent entry
+    /// referenced by `parent_idx` — its ordinal in the order yielded by the
+    /// immediately-preceding [`for_each_mut_with_keys`](Self::for_each_mut_with_keys)
+    /// call (entry index for `VecStorage`; flat bucket order for `MapStorage`).
+    /// The branch tableau is materialized ONLY when it survives as a new entry;
+    /// merges and below-cutoff drops never clone. `word_fp`/`phase_loss` are the
+    /// branch's already-computed fingerprint halves; full fp = `word_fp ^ phase_loss`.
+    /// Returns true if any branch was dropped (caller renormalizes).
+    fn insert_or_merge_mutated_branches(
+        &mut self,
+        branches: Vec<(usize, BranchMutation, T::Coeff, u64, u64)>,
+        cutoff: &T::Coeff,
+    ) -> bool;
+
     fn retain<F>(&mut self, f: F)
     where
         F: FnMut(&GeneralizedTableau<T, I, C>, &T::Coeff) -> bool;
diff --git a/crates/ppvm-tableau-sum/src/storage/map.rs b/crates/ppvm-tableau-sum/src/storage/map.rs
index 15125f76c..c788ed996 100644
--- a/crates/ppvm-tableau-sum/src/storage/map.rs
+++ b/crates/ppvm-tableau-sum/src/storage/map.rs
@@ -15,8 +15,12 @@ use ppvm_traits::config::Config;
 use smallvec::SmallVec;
 
 use crate::storage::{
-    EntryStore, fingerprint, phase_loss_hash, structurally_equal, word_fingerprint,
+    Branch, BranchMutation, EntryStore, apply_branch_mutation, fingerprint, phase_loss_hash,
+    structurally_equal, word_fingerprint,
 };
+use bitvec::view::BitView;
+use num::PrimInt;
+use ppvm_traits::traits::Clifford;
 
 type Bucket<T, I, C> = SmallVec<[(GeneralizedTableau<T, I, C>, <T as Config>::Coeff); 1]>;
 
@@ -75,6 +79,8 @@ where
         + Copy,
     I: TableauIndex + Send + Sync,
     C: SparseVector<Complex<T::Coeff>, I>,
+    GeneralizedTableau<T, I, C>: Clifford,
+    <<T as Config>::Storage as BitView>::Store: PrimInt,
 {
     fn with_capacity(cap: usize) -> Self {
         Self {
@@ -177,6 +183,34 @@ where
         needs_renormalize
     }
 
+    fn insert_or_merge_mutated_branches(
+        &mut self,
+        branches: Vec<(usize, BranchMutation, <T as Config>::Coeff, u64, u64)>,
+        cutoff: &<T as Config>::Coeff,
+    ) -> bool {
+        self.rebuild_if_dirty();
+
+        // Materialize parents in the SAME order as for_each_mut_with_keys so
+        // parent_idx aligns. Correctness-only path: no clone savings here.
+        let parents: Vec<_> = self
+            .buckets
+            .values()
+            .flat_map(|v| v.iter())
+            .map(|(t, _)| t.clone())
+            .collect();
+
+        let real: Vec<Branch<T, I, C>> = branches
+            .into_iter()
+            .map(|(parent_idx, mutation, p, word_fp, phase_loss)| {
+                let mut tab = parents[parent_idx].clone();
+                apply_branch_mutation(&mut tab, mutation);
+                (tab, p, word_fp, phase_loss)
+            })
+            .collect();
+
+        self.insert_or_merge_batch(real, cutoff)
+    }
+
     fn retain<F>(&mut self, mut f: F)
     where
         F: FnMut(&GeneralizedTableau<T, I, C>, &<T as Config>::Coeff) -> bool,
diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs
index 03d4c4a74..8522e9e8c 100644
--- a/crates/ppvm-tableau-sum/src/storage/mod.rs
+++ b/crates/ppvm-tableau-sum/src/storage/mod.rs
@@ -7,6 +7,7 @@ pub mod vec;
 
 pub use entry_store::{Branch, EntryStore};
 use fxhash::FxHashMap;
+use ppvm_traits::traits::Clifford;
 
 // Hasher for the structural `word_fingerprint`. gxhash (AES-based) is fastest on
 // native but needs hardware AES and does not build on wasm32, so fall back to
@@ -218,6 +219,152 @@ where
     true
 }
 
+/// A lazily-described branch: a mutation applied to a parent entry. Used so the
+/// merge can compute the branch fingerprint / structural identity without
+/// cloning, materializing the tableau only for surviving new entries.
+#[derive(Clone, Copy, Debug)]
+pub enum BranchMutation {
+    /// Apply Pauli `op` (1=X, 2=Y, 3=Z) at `addr0`: flips per-row sign bits only.
+    Pauli { op: u8, addr0: usize },
+    /// Mark qubit `q` lost (set is_lost[q] = true).
+    Loss { q: usize },
+}
+
+/// Materialize a lazily-described branch into a (cloned) tableau in place.
+pub(crate) fn apply_branch_mutation<T, I, C>(
+    tab: &mut GeneralizedTableau<T, I, C>,
+    m: BranchMutation,
+) where
+    T: Config,
+    I: TableauIndex,
+    C: SparseVector<Complex<T::Coeff>, I>,
+    GeneralizedTableau<T, I, C>: Clifford,
+{
+    match m {
+        BranchMutation::Pauli { op, addr0 } => match op {
+            1 => tab.x(addr0),
+            2 => tab.y(addr0),
+            3 => tab.z(addr0),
+            _ => {}
+        },
+        BranchMutation::Loss { q } => {
+            tab.is_lost[q] = true;
+        }
+    }
+}
+
+/// Like [`structurally_equal`], but compares `existing` against the *virtual*
+/// tableau `parent + m` without materializing it. Mirrors `structurally_equal`
+/// field-by-field, deriving each field of the virtual tableau from `parent`:
+/// - `is_lost`: for `Loss { q }`, equals `parent`'s with index `q` forced true;
+///   for `Pauli`, equals `parent`'s unchanged.
+/// - `coefficients`: unchanged by both mutations.
+/// - rows: for `Loss`, unchanged; for `Pauli`, each row's sign bit (phase bit 1)
+///   is flipped per the per-column rule (X: z; Y: x^z; Z: x).
+pub(crate) fn structurally_equal_mutated<T, I, C>(
+    existing: &GeneralizedTableau<T, I, C>,
+    parent: &GeneralizedTableau<T, I, C>,
+    m: BranchMutation,
+    scratch: &mut FxHashMap<I, Complex<T::Coeff>>,
+) -> bool
+where
+    T: Config,
+    T::Coeff: One + Zero + Clone + num::Num + PartialOrd,
+    Complex<T::Coeff>: std::ops::Mul<Output = Complex<T::Coeff>>
+        + AddAssign
+        + From<Complex64>
+        + ComplexFloat
+        + Copy,
+    I: TableauIndex,
+    C: SparseVector<Complex<T::Coeff>, I>,
+{
+    // NOTE: comparing is_lost and rows is only necessary to avoid hash collisions
+
+    match m {
+        BranchMutation::Loss { q } => {
+            // Virtual is_lost == parent's with index q forced true.
+            if existing.is_lost.len() != parent.is_lost.len() {
+                return false;
+            }
+            for (i, (&e, &p)) in existing
+                .is_lost
+                .iter()
+                .zip(parent.is_lost.iter())
+                .enumerate()
+            {
+                let virt = if i == q { true } else { p };
+                if e != virt {
+                    return false;
+                }
+            }
+        }
+        BranchMutation::Pauli { .. } => {
+            // Virtual is_lost == parent's, unchanged.
+            if existing.is_lost != parent.is_lost {
+                return false;
+            }
+        }
+    }
+
+    if existing.coefficients.len() != parent.coefficients.len() {
+        return false;
+    }
+
+    // Cheaper row comparison first; coefficient compare is O(K) below.
+    match m {
+        BranchMutation::Loss { .. } => {
+            for (re, rp) in existing.tableau.data.iter().zip(parent.tableau.data.iter()) {
+                if re.phase != rp.phase || re.word != rp.word {
+                    return false;
+                }
+            }
+        }
+        BranchMutation::Pauli { op, addr0 } => {
+            for (re, rp) in existing.tableau.data.iter().zip(parent.tableau.data.iter()) {
+                if re.word != rp.word {
+                    return false;
+                }
+                let x: bool = rp.word.xbits[addr0];
+                let z: bool = rp.word.zbits[addr0];
+                let flip = match op {
+                    1 => z,
+                    2 => x ^ z,
+                    3 => x,
+                    _ => false,
+                };
+                let virt_phase = rp.phase ^ ((flip as u8) << 1);
+                if re.phase != virt_phase {
+                    return false;
+                }
+            }
+        }
+    }
+
+    // Reuse the caller-owned scratch map instead of allocating per call.
+    // Clear retains capacity across invocations. Coefficients are unchanged
+    // by both mutations, so compare existing vs parent directly.
+    scratch.clear();
+    scratch.reserve(parent.coefficients.len());
+    for (val, idx) in parent.coefficients.iter() {
+        scratch.insert(*idx, *val);
+    }
+
+    let threshold_sq =
+        existing.coefficient_threshold.clone() * existing.coefficient_threshold.clone();
+    let zero = Complex {
+        re: T::Coeff::zero(),
+        im: T::Coeff::zero(),
+    };
+    for (val0, idx0) in existing.coefficients.iter() {
+        let val1 = scratch.get(idx0).copied().unwrap_or(zero);
+        if (*val0 - val1).norm_sqr() >= threshold_sq {
+            return false;
+        }
+    }
+
+    true
+}
+
 #[cfg(test)]
 mod fingerprint_tests {
     use super::{
diff --git a/crates/ppvm-tableau-sum/src/storage/vec.rs b/crates/ppvm-tableau-sum/src/storage/vec.rs
index a235b32d6..7a6bdbb3f 100644
--- a/crates/ppvm-tableau-sum/src/storage/vec.rs
+++ b/crates/ppvm-tableau-sum/src/storage/vec.rs
@@ -13,7 +13,13 @@ use ppvm_tableau::{
 };
 use ppvm_traits::config::Config;
 
-use crate::storage::{EntryStore, phase_loss_hash, structurally_equal, word_fingerprint};
+use crate::storage::{
+    BranchMutation, EntryStore, apply_branch_mutation, phase_loss_hash, structurally_equal,
+    structurally_equal_mutated, word_fingerprint,
+};
+use bitvec::view::BitView;
+use num::PrimInt;
+use ppvm_traits::traits::Clifford;
 
 #[derive(Clone)]
 pub struct VecStorage<T: Config, I: TableauIndex, C: SparseVector<Complex<T::Coeff>, I>> {
@@ -125,6 +131,8 @@ where
         + Copy,
     I: TableauIndex + Send + Sync,
     C: SparseVector<Complex<T::Coeff>, I>,
+    GeneralizedTableau<T, I, C>: Clifford,
+    <<T as Config>::Storage as BitView>::Store: PrimInt,
 {
     fn with_capacity(cap: usize) -> Self {
         Self {
@@ -201,6 +209,72 @@ where
         needs_renormalize
     }
 
+    fn insert_or_merge_mutated_branches(
+        &mut self,
+        branches: Vec<(usize, BranchMutation, <T as Config>::Coeff, u64, u64)>,
+        cutoff: &<T as Config>::Coeff,
+    ) -> bool {
+        // Defensive: should be a no-op since the caller's for_each_mut_with_keys
+        // already ran and the branch words were never mutated.
+        self.rebuild_fingerprints_if_dirty();
+
+        let mut fp_index: FxHashMap<u64, Vec<usize>> =
+            FxHashMap::with_capacity_and_hasher(self.entries.len(), Default::default());
+        for i in 0..self.entries.len() {
+            let fp = self.fingerprints[i];
+            fp_index.entry(fp).or_default().push(i);
+        }
+
+        let mut needs_renormalize = false;
+        for (parent_idx, mutation, p, word_fp, phase_loss) in branches {
+            let fp = word_fp ^ phase_loss;
+
+            // Find a structurally-equal existing entry among the fp candidates
+            // WITHOUT materializing the branch tableau. The disjoint-field borrow
+            // (`self.entries` immutable + `self.scratch` mutable) is allowed.
+            let mut found: Option<usize> = None;
+            if let Some(candidates) = fp_index.get(&fp) {
+                for &i in candidates {
+                    if structurally_equal_mutated(
+                        &self.entries[i].0,
+                        &self.entries[parent_idx].0,
+                        mutation,
+                        &mut self.scratch,
+                    ) {
+                        found = Some(i);
+                        break;
+                    }
+                }
+            }
+
+            match found {
+                Some(i) => {
+                    let p0 = &self.entries[i].1;
+                    self.entries[i].1 = p0.clone() + p;
+                }
+                None => {
+                    if &p > cutoff {
+                        // Surviving new entry: materialize now (clone parent +
+                        // apply mutation). Later branches' parent_idx still refer
+                        // to the original entries — push never moves them.
+                        let mut tab = self.entries[parent_idx].0.clone();
+                        apply_branch_mutation(&mut tab, mutation);
+                        let new_idx = self.entries.len();
+                        self.entries.push((tab, p));
+                        self.fingerprints.push(fp);
+                        self.word_fingerprints.push(word_fp);
+                        self.phase_loss_hashes.push(phase_loss);
+                        fp_index.entry(fp).or_default().push(new_idx);
+                    } else {
+                        needs_renormalize = true;
+                    }
+                }
+            }
+        }
+
+        needs_renormalize
+    }
+
     fn retain<F>(&mut self, mut f: F)
     where
         F: FnMut(&GeneralizedTableau<T, I, C>, &<T as Config>::Coeff) -> bool,

From b682eb4144b3b52a7967caae3cfe68156ccbf76b Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:51:22 +0200
Subject: [PATCH 07/24] chore(autotune): record lazy-materialization keep
 (2.73x)

---
 docs/autotune/2026-06-23-tableau-sum-build/log.md      | 1 +
 docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
index 06e93a1c0..039506bc5 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/log.md
+++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md
@@ -9,3 +9,4 @@
 - Accuracy guard: branch count must stay 2025 (optimizations must not change the math). Cutoff fixed at 1e-7.
 - Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench
 - Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan.
+- KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy).
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
index 24d85b30b..9db9e15a7 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
+++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
@@ -5,3 +5,10 @@
 "build_median_ms" = 2620.0
 "per_shot_ns" = 22500.0
 "branches" = 2025.0
+[[metric]]
+"commit" = "59165235"
+"status" = "keep"
+"description" = "lazy branch materialization for loss_channel + pauli_error (single-qubit depolarize)"
+"build_median_ms" = 958.0
+"per_shot_ns" = 22100.0
+"branches" = 2025.0

From d0cc2fe88dadb6ab258c02bf090cd67a196073e5 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:58:44 +0200
Subject: [PATCH 08/24] perf(tableau-sum): bulk-hash word_fingerprint in one
 pass

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/ppvm-tableau-sum/src/storage/mod.rs | 71 ++++++++++++++++------
 1 file changed, 52 insertions(+), 19 deletions(-)

diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs
index 8522e9e8c..f44418bb9 100644
--- a/crates/ppvm-tableau-sum/src/storage/mod.rs
+++ b/crates/ppvm-tableau-sum/src/storage/mod.rs
@@ -10,14 +10,13 @@ use fxhash::FxHashMap;
 use ppvm_traits::traits::Clifford;
 
 // Hasher for the structural `word_fingerprint`. gxhash (AES-based) is fastest on
-// native but needs hardware AES and does not build on wasm32, so fall back to
-// fxhash there. The fingerprint is a transient in-memory dedup key — collisions
-// are resolved by `structurally_equal`, and it is never persisted or compared
-// across builds — so the hasher may differ per target without affecting results.
+// native and exposes a `gxhash64` bulk free function, but it needs hardware AES
+// and does not build on wasm32, so fall back to fxhash there. The fingerprint is
+// a transient in-memory dedup key — collisions are resolved by
+// `structurally_equal`, and it is never persisted or compared across builds — so
+// the hasher may differ per target without affecting results.
 #[cfg(target_arch = "wasm32")]
 use fxhash::FxHasher as FingerprintHasher;
-#[cfg(not(target_arch = "wasm32"))]
-use gxhash::GxHasher as FingerprintHasher;
 use num::{
     Complex, One, Zero,
     complex::{Complex64, ComplexFloat},
@@ -26,10 +25,30 @@ use ppvm_tableau::{
     data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex,
 };
 use ppvm_traits::config::Config;
-use std::{
-    hash::{Hash, Hasher},
-    ops::AddAssign,
-};
+#[cfg(target_arch = "wasm32")]
+use std::hash::Hasher;
+use std::ops::AddAssign;
+
+// Reusable per-thread scratch buffer for `word_fingerprint`. Gathering every
+// row's word bytes into one contiguous slice lets us hash in a single bulk call
+// instead of two tiny `Hash::hash` writes per row (high per-call overhead).
+// Cleared (capacity retained) per call, so it adapts to any row count / qubit
+// width without re-allocating. Bytes (not the storage word type) because the
+// storage element width (`[u8; N]` vs `[u64; N]`) is generic at this call site.
+thread_local! {
+    static WORD_FP_BUF: std::cell::RefCell<Vec<u8>> = const { std::cell::RefCell::new(Vec::new()) };
+}
+
+/// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage`
+/// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes
+/// are fully initialized and `u8`-aligned.
+#[inline]
+fn pod_bytes<A: Copy>(value: &A) -> &[u8] {
+    // SAFETY: `A` is POD (PauliStorage: bytemuck::Pod); reading its
+    // `size_of::<A>()` initialized bytes as `[u8]` is sound, and the borrow is
+    // tied to `value`.
+    unsafe { std::slice::from_raw_parts(value as *const A as *const u8, std::mem::size_of::<A>()) }
+}
 
 /// Hash of the `word` (Pauli content) of every row, in order. This is the
 /// expensive component (each word is several machine words wide) and is
@@ -43,15 +62,29 @@ where
     I:,
     C: SparseVector<Complex<T::Coeff>, I>,
 {
-    let mut hasher = FingerprintHasher::default();
-    for row in tab.tableau.data.iter() {
-        // Hash the Pauli bits directly: the `PauliWord` hash cache is disabled
-        // for tableau rows (`REHASH = false`), so `row.word.hash()` would feed
-        // a stale zero and make every tableau collide.
-        row.word.xbits.data.hash(&mut hasher);
-        row.word.zbits.data.hash(&mut hasher);
-    }
-    hasher.finish()
+    WORD_FP_BUF.with(|cell| {
+        let mut buf = cell.borrow_mut();
+        // Clear retains capacity; refill with every row's bits as raw bytes.
+        buf.clear();
+        for row in tab.tableau.data.iter() {
+            // Gather the Pauli bits directly: the `PauliWord` hash cache is
+            // disabled for tableau rows (`REHASH = false`), so hashing
+            // `row.word` would feed a stale zero and make every tableau collide.
+            buf.extend_from_slice(pod_bytes(&row.word.xbits.data));
+            buf.extend_from_slice(pod_bytes(&row.word.zbits.data));
+        }
+
+        #[cfg(not(target_arch = "wasm32"))]
+        {
+            gxhash::gxhash64(&buf, 0)
+        }
+        #[cfg(target_arch = "wasm32")]
+        {
+            let mut hasher = FingerprintHasher::default();
+            hasher.write(&buf);
+            hasher.finish()
+        }
+    })
 }
 
 /// Per-row mask (splitmix64 of `(index, salt)`); a stable pure function used

From ad0d0b24260caaf50fe34fc7b72f4cd740c6a3db Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:59:47 +0200
Subject: [PATCH 09/24] chore(autotune): plan bulk-word-hash approach

---
 .../bulk-word-hash/prompts.md                     | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md
new file mode 100644
index 000000000..f928d0d17
--- /dev/null
+++ b/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md
@@ -0,0 +1,15 @@
+# Approach: bulk word_fingerprint hashing
+
+## Hypothesis
+After lazy materialization, `rebuild_fingerprints_if_dirty` dominates (47% self,
+61% inclusive); it re-hashes every entry's words after each clifford gate marks
+them dirty. `word_fingerprint` currently does 2 small `Hash::hash` calls per row
+(`xbits.data` then `zbits.data`) = ~340 hasher writes for 170 rows, with
+per-call overhead. Gather the row words into one contiguous buffer and hash once
+with `gxhash::gxhash64` (native) — far less per-call overhead, single SIMD pass.
+
+## Target
+`./target/release/examples/msd-noisy-bench`; baseline now build_median ~958ms.
+Must keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632.
+Fingerprints are transient dedup keys (resolved by structurally_equal), so the
+hash VALUE may change freely as long as it's consistent within a build.

From 6132edf7dc7acefb2cdaff23d1a27c7af15894bc Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 20:59:48 +0200
Subject: [PATCH 10/24] chore(autotune): record bulk-word-hash keep (1.67x,
 cumulative 4.57x)

---
 docs/autotune/2026-06-23-tableau-sum-build/log.md      | 1 +
 docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
index 039506bc5..564d379e0 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/log.md
+++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md
@@ -10,3 +10,4 @@
 - Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench
 - Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan.
 - KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy).
+- KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). word_fingerprint now gathers all row words into a reused thread_local Vec<u8> and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec<u8>+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass.
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
index 9db9e15a7..f17d237c7 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
+++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
@@ -12,3 +12,10 @@
 "build_median_ms" = 958.0
 "per_shot_ns" = 22100.0
 "branches" = 2025.0
+[[metric]]
+"commit" = "ad0d0b24"
+"status" = "keep"
+"description" = "bulk-hash word_fingerprint (gather rows, single gxhash64)"
+"build_median_ms" = 573.0
+"per_shot_ns" = 22100.0
+"branches" = 2025.0

From 26e222391aa9fc7d5a1b94eb5ffa8ca49d059c2b Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:05:32 +0200
Subject: [PATCH 11/24] perf(tableau-sum): precompute per-row masks for
 phase_loss + pauli deltas

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/ppvm-tableau-sum/src/noise.rs       |  7 ++-
 crates/ppvm-tableau-sum/src/storage/map.rs | 24 +++++++---
 crates/ppvm-tableau-sum/src/storage/mod.rs | 56 +++++++++++++++++-----
 crates/ppvm-tableau-sum/src/storage/vec.rs | 21 ++++----
 4 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/crates/ppvm-tableau-sum/src/noise.rs b/crates/ppvm-tableau-sum/src/noise.rs
index b12a08057..d6799d035 100644
--- a/crates/ppvm-tableau-sum/src/noise.rs
+++ b/crates/ppvm-tableau-sum/src/noise.rs
@@ -20,7 +20,7 @@ use rand::{RngExt, rngs::SmallRng};
 
 use crate::{
     data::GeneralizedTableauSum,
-    storage::{Branch, BranchMutation, EntryStore, loss_mask, pauli_branch_phase_loss, sign_mask},
+    storage::{Branch, BranchMutation, EntryStore, RowMasks, loss_mask, pauli_branch_phase_loss},
 };
 
 fn single_qubit_loss_branch<T, I, C>(
@@ -212,6 +212,9 @@ where
         let mut branches = Vec::<(usize, BranchMutation, T::Coeff, u64, u64)>::with_capacity(
             3 * self.entries.len(),
         );
+        // Precompute the per-row sign masks once instead of recomputing the
+        // splitmix `sign_mask` per row per entry in the hot loop below.
+        let masks = RowMasks::new(self.n_qubits);
         let mut idx = 0usize;
         self.entries
             .for_each_mut_with_keys(|tab, p_sum, word_fp, phase_loss| {
@@ -225,7 +228,7 @@ where
                 for (row, pw) in tab.tableau.data.iter().enumerate() {
                     let x: bool = pw.word.xbits[addr0];
                     let z: bool = pw.word.zbits[addr0];
-                    let m = sign_mask(row);
+                    let m = masks.sign[row];
                     if z {
                         dx ^= m;
                     }
diff --git a/crates/ppvm-tableau-sum/src/storage/map.rs b/crates/ppvm-tableau-sum/src/storage/map.rs
index c788ed996..7242f3cdf 100644
--- a/crates/ppvm-tableau-sum/src/storage/map.rs
+++ b/crates/ppvm-tableau-sum/src/storage/map.rs
@@ -15,8 +15,8 @@ use ppvm_traits::config::Config;
 use smallvec::SmallVec;
 
 use crate::storage::{
-    Branch, BranchMutation, EntryStore, apply_branch_mutation, fingerprint, phase_loss_hash,
-    structurally_equal, word_fingerprint,
+    Branch, BranchMutation, EntryStore, RowMasks, apply_branch_mutation, fingerprint,
+    phase_loss_hash_with, structurally_equal, word_fingerprint,
 };
 use bitvec::view::BitView;
 use num::PrimInt;
@@ -128,11 +128,21 @@ where
         F: FnMut(&mut GeneralizedTableau<T, I, C>, &mut <T as Config>::Coeff, u64, u64),
     {
         self.rebuild_if_dirty();
-        for v in self.buckets.values_mut() {
-            for (tab, c) in v.iter_mut() {
-                let word_fp = word_fingerprint(tab);
-                let phase_loss = phase_loss_hash(tab);
-                f(tab, c, word_fp, phase_loss);
+        // Build the per-row mask table once; every tableau in the sum shares the
+        // same qubit count. Skip when there are no entries.
+        let masks = self
+            .buckets
+            .values()
+            .flat_map(|v| v.iter())
+            .next()
+            .map(|(t, _)| RowMasks::new(t.is_lost.len()));
+        if let Some(masks) = masks {
+            for v in self.buckets.values_mut() {
+                for (tab, c) in v.iter_mut() {
+                    let word_fp = word_fingerprint(tab);
+                    let phase_loss = phase_loss_hash_with(tab, &masks);
+                    f(tab, c, word_fp, phase_loss);
+                }
             }
         }
     }
diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs
index f44418bb9..268a3a091 100644
--- a/crates/ppvm-tableau-sum/src/storage/mod.rs
+++ b/crates/ppvm-tableau-sum/src/storage/mod.rs
@@ -121,17 +121,25 @@ pub(crate) fn loss_mask(q: usize) -> u64 {
     row_mask(q, 0xC3C3_C3C3_C3C3_C3C3)
 }
 
-/// XOR contribution of a single row's phase.
-#[inline]
-fn phase_contrib(row: usize, phase: u8) -> u64 {
-    let mut h = 0;
-    if phase & 1 != 0 {
-        h ^= imag_mask(row);
-    }
-    if phase & 2 != 0 {
-        h ^= sign_mask(row);
+/// Precomputed per-row/per-qubit masks (sign, imag, loss). Built once per op and
+/// indexed instead of recomputing the splitmix `row_mask` per row per entry.
+pub(crate) struct RowMasks {
+    pub sign: Vec<u64>, // sign_mask(i) for i in 0..2*n_qubits
+    pub imag: Vec<u64>, // imag_mask(i) for i in 0..2*n_qubits
+    pub loss: Vec<u64>, // loss_mask(q) for q in 0..n_qubits
+}
+
+impl RowMasks {
+    /// Build the mask tables. The tableau has `2 * n_qubits` rows (`sign`/`imag`
+    /// indexed by row); `loss` is indexed by qubit `0..n_qubits`.
+    pub(crate) fn new(n_qubits: usize) -> Self {
+        let n_rows = 2 * n_qubits;
+        Self {
+            sign: (0..n_rows).map(sign_mask).collect(),
+            imag: (0..n_rows).map(imag_mask).collect(),
+            loss: (0..n_qubits).map(loss_mask).collect(),
+        }
     }
-    h
 }
 
 /// XOR-combinable hash of `is_lost` plus every row's `phase`, formed as the
@@ -139,17 +147,41 @@ fn phase_contrib(row: usize, phase: u8) -> u64 {
 /// XOR-combinable lets a branch inherit its parent's value and update only the
 /// rows it changed — a sign flip XORs [`sign_mask`], a loss XORs [`loss_mask`].
 pub fn phase_loss_hash<T, I, C>(tab: &GeneralizedTableau<T, I, C>) -> u64
+where
+    T: Config,
+    C: SparseVector<Complex<T::Coeff>, I>,
+{
+    // Single implementation: build a one-shot mask table and delegate so the
+    // table-indexed and from-scratch values are guaranteed identical.
+    // `is_lost.len() == n_qubits` and is available under these minimal bounds.
+    let masks = RowMasks::new(tab.is_lost.len());
+    phase_loss_hash_with(tab, &masks)
+}
+
+/// Like [`phase_loss_hash`], but indexes a precomputed [`RowMasks`] instead of
+/// recomputing the splitmix masks per row/qubit. Reproduces the same value:
+/// phase bit 0 (imag) XORs `masks.imag[row]`, phase bit 1 (sign) XORs
+/// `masks.sign[row]`, and a lost qubit `q` XORs `masks.loss[q]`.
+pub(crate) fn phase_loss_hash_with<T, I, C>(
+    tab: &GeneralizedTableau<T, I, C>,
+    masks: &RowMasks,
+) -> u64
 where
     T: Config,
     C: SparseVector<Complex<T::Coeff>, I>,
 {
     let mut h = 0u64;
     for (row, ppw) in tab.tableau.data.iter().enumerate() {
-        h ^= phase_contrib(row, ppw.phase);
+        if ppw.phase & 1 != 0 {
+            h ^= masks.imag[row];
+        }
+        if ppw.phase & 2 != 0 {
+            h ^= masks.sign[row];
+        }
     }
     for (q, lost) in tab.is_lost.iter().enumerate() {
         if *lost {
-            h ^= loss_mask(q);
+            h ^= masks.loss[q];
         }
     }
     h
diff --git a/crates/ppvm-tableau-sum/src/storage/vec.rs b/crates/ppvm-tableau-sum/src/storage/vec.rs
index 7a6bdbb3f..7a13e23e8 100644
--- a/crates/ppvm-tableau-sum/src/storage/vec.rs
+++ b/crates/ppvm-tableau-sum/src/storage/vec.rs
@@ -14,8 +14,8 @@ use ppvm_tableau::{
 use ppvm_traits::config::Config;
 
 use crate::storage::{
-    BranchMutation, EntryStore, apply_branch_mutation, phase_loss_hash, structurally_equal,
-    structurally_equal_mutated, word_fingerprint,
+    BranchMutation, EntryStore, RowMasks, apply_branch_mutation, phase_loss_hash_with,
+    structurally_equal, structurally_equal_mutated, word_fingerprint,
 };
 use bitvec::view::BitView;
 use num::PrimInt;
@@ -109,12 +109,17 @@ where
         self.fingerprints.clear();
         self.word_fingerprints.clear();
         self.phase_loss_hashes.clear();
-        for (t, _) in self.entries.iter() {
-            let wfp = word_fingerprint(t);
-            let plh = phase_loss_hash(t);
-            self.word_fingerprints.push(wfp);
-            self.phase_loss_hashes.push(plh);
-            self.fingerprints.push(wfp ^ plh);
+        // Build the per-row mask table once for all entries (every tableau in a
+        // sum shares the same qubit count). Skip when there are no entries.
+        if let Some((first, _)) = self.entries.first() {
+            let masks = RowMasks::new(first.is_lost.len());
+            for (t, _) in self.entries.iter() {
+                let wfp = word_fingerprint(t);
+                let plh = phase_loss_hash_with(t, &masks);
+                self.word_fingerprints.push(wfp);
+                self.phase_loss_hashes.push(plh);
+                self.fingerprints.push(wfp ^ plh);
+            }
         }
         self.dirty = false;
     }

From 4fea5841cf4985de6e14f88bf7eacf0673a8db06 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:06:45 +0200
Subject: [PATCH 12/24] chore(autotune): plan precompute-masks approach

---
 .../precompute-masks/prompts.md                  | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md
new file mode 100644
index 000000000..5dc8c0fff
--- /dev/null
+++ b/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md
@@ -0,0 +1,16 @@
+# Approach: precompute per-row masks (kill redundant splitmix)
+
+## Hypothesis
+`sign_mask(row)`/`imag_mask(row)`/`loss_mask(q)` are splitmix64 hashes of a pure
+index. They are recomputed per-row, per-entry, on every op:
+- `pauli_error`'s dx/dy/dz loop computes `sign_mask(row)` for all 2n rows of
+  every entry on every depolarize.
+- `phase_loss_hash` (called per entry in `rebuild_fingerprints_if_dirty`)
+  recomputes sign/imag masks per set phase and loss masks per lost qubit.
+
+Precompute the per-index mask tables ONCE per op and index them. Values are
+identical, so all fingerprints (and the accuracy fingerprint) are unchanged.
+
+## Target
+`./target/release/examples/msd-noisy-bench`; baseline now build_median ~573ms.
+Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us.

From 96064d0ab1922238ee1a6dd591bef161e564574c Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:06:57 +0200
Subject: [PATCH 13/24] chore(autotune): record precompute-masks keep
 (cumulative 4.75x)

---
 docs/autotune/2026-06-23-tableau-sum-build/log.md      | 1 +
 docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
index 564d379e0..3dc311a7b 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/log.md
+++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md
@@ -11,3 +11,4 @@
 - Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan.
 - KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy).
 - KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). word_fingerprint now gathers all row words into a reused thread_local Vec<u8> and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec<u8>+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass.
+- KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged.
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
index f17d237c7..3a64c189d 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
+++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
@@ -19,3 +19,10 @@
 "build_median_ms" = 573.0
 "per_shot_ns" = 22100.0
 "branches" = 2025.0
+[[metric]]
+"commit" = "4fea5841"
+"status" = "keep"
+"description" = "precompute per-row masks (RowMasks) for phase_loss + pauli dx/dy/dz"
+"build_median_ms" = 552.0
+"per_shot_ns" = 22300.0
+"branches" = 2025.0

From 34b41ceba941677b3fef69a509d2d3333c05cb2f Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:14:32 +0200
Subject: [PATCH 14/24] perf(tableau-sum): direct single-pass word_fingerprint
 (drop gather+gxhash)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/ppvm-tableau-sum/src/storage/mod.rs | 80 ++++++++++------------
 1 file changed, 35 insertions(+), 45 deletions(-)

diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs
index 268a3a091..fc6d5d212 100644
--- a/crates/ppvm-tableau-sum/src/storage/mod.rs
+++ b/crates/ppvm-tableau-sum/src/storage/mod.rs
@@ -7,16 +7,6 @@ pub mod vec;
 
 pub use entry_store::{Branch, EntryStore};
 use fxhash::FxHashMap;
-use ppvm_traits::traits::Clifford;
-
-// Hasher for the structural `word_fingerprint`. gxhash (AES-based) is fastest on
-// native and exposes a `gxhash64` bulk free function, but it needs hardware AES
-// and does not build on wasm32, so fall back to fxhash there. The fingerprint is
-// a transient in-memory dedup key — collisions are resolved by
-// `structurally_equal`, and it is never persisted or compared across builds — so
-// the hasher may differ per target without affecting results.
-#[cfg(target_arch = "wasm32")]
-use fxhash::FxHasher as FingerprintHasher;
 use num::{
     Complex, One, Zero,
     complex::{Complex64, ComplexFloat},
@@ -25,20 +15,9 @@ use ppvm_tableau::{
     data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex,
 };
 use ppvm_traits::config::Config;
-#[cfg(target_arch = "wasm32")]
-use std::hash::Hasher;
+use ppvm_traits::traits::Clifford;
 use std::ops::AddAssign;
 
-// Reusable per-thread scratch buffer for `word_fingerprint`. Gathering every
-// row's word bytes into one contiguous slice lets us hash in a single bulk call
-// instead of two tiny `Hash::hash` writes per row (high per-call overhead).
-// Cleared (capacity retained) per call, so it adapts to any row count / qubit
-// width without re-allocating. Bytes (not the storage word type) because the
-// storage element width (`[u8; N]` vs `[u64; N]`) is generic at this call site.
-thread_local! {
-    static WORD_FP_BUF: std::cell::RefCell<Vec<u8>> = const { std::cell::RefCell::new(Vec::new()) };
-}
-
 /// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage`
 /// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes
 /// are fully initialized and `u8`-aligned.
@@ -59,32 +38,43 @@ fn pod_bytes<A: Copy>(value: &A) -> &[u8] {
 pub fn word_fingerprint<T, I, C>(tab: &GeneralizedTableau<T, I, C>) -> u64
 where
     T: Config,
-    I:,
     C: SparseVector<Complex<T::Coeff>, I>,
 {
-    WORD_FP_BUF.with(|cell| {
-        let mut buf = cell.borrow_mut();
-        // Clear retains capacity; refill with every row's bits as raw bytes.
-        buf.clear();
-        for row in tab.tableau.data.iter() {
-            // Gather the Pauli bits directly: the `PauliWord` hash cache is
-            // disabled for tableau rows (`REHASH = false`), so hashing
-            // `row.word` would feed a stale zero and make every tableau collide.
-            buf.extend_from_slice(pod_bytes(&row.word.xbits.data));
-            buf.extend_from_slice(pod_bytes(&row.word.zbits.data));
-        }
+    // fxhash-style multiplicative-rotate mix, fed the raw storage words of
+    // every row's x then z bits. No allocation, no thread_local, single pass.
+    const K: u64 = 0x51_7c_c1_b7_27_22_0a_95; // fxhash constant
+    let mut h: u64 = 0;
+    let mut mix = |w: u64| {
+        h = (h.rotate_left(5) ^ w).wrapping_mul(K);
+    };
+    for row in tab.tableau.data.iter() {
+        // The `PauliWord` hash cache is disabled for tableau rows
+        // (`REHASH = false`), so hashing `row.word` would feed a stale zero and
+        // make every tableau collide; read the bit storage directly. The
+        // storage word element type is generic (`[u8; N]` vs `[u64; N]`) with no
+        // numeric bound available here, so view it as POD bytes and fold 8 at a
+        // time into a `u64` lane — identical on native and wasm.
+        mix_pod_words(pod_bytes(&row.word.xbits.data), &mut mix);
+        mix_pod_words(pod_bytes(&row.word.zbits.data), &mut mix);
+    }
+    h
+}
 
-        #[cfg(not(target_arch = "wasm32"))]
-        {
-            gxhash::gxhash64(&buf, 0)
-        }
-        #[cfg(target_arch = "wasm32")]
-        {
-            let mut hasher = FingerprintHasher::default();
-            hasher.write(&buf);
-            hasher.finish()
-        }
-    })
+/// Fold a POD byte slice into the `mix` closure 8 bytes (one `u64` lane) at a
+/// time, little-endian, zero-padding a short trailing chunk. Keeps the hash
+/// independent of the generic storage word width.
+#[inline]
+fn mix_pod_words(bytes: &[u8], mix: &mut impl FnMut(u64)) {
+    let mut chunks = bytes.chunks_exact(8);
+    for c in &mut chunks {
+        mix(u64::from_le_bytes(c.try_into().unwrap()));
+    }
+    let rem = chunks.remainder();
+    if !rem.is_empty() {
+        let mut buf = [0u8; 8];
+        buf[..rem.len()].copy_from_slice(rem);
+        mix(u64::from_le_bytes(buf));
+    }
 }
 
 /// Per-row mask (splitmix64 of `(index, salt)`); a stable pure function used

From 5ea2c00f0a00841197854eb6b2cb7fa4cbd31905 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:15:50 +0200
Subject: [PATCH 15/24] Revert "perf(tableau-sum): direct single-pass
 word_fingerprint (drop gather+gxhash)"

This reverts commit 34b41ceba941677b3fef69a509d2d3333c05cb2f.
---
 crates/ppvm-tableau-sum/src/storage/mod.rs | 80 ++++++++++++----------
 1 file changed, 45 insertions(+), 35 deletions(-)

diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs
index fc6d5d212..268a3a091 100644
--- a/crates/ppvm-tableau-sum/src/storage/mod.rs
+++ b/crates/ppvm-tableau-sum/src/storage/mod.rs
@@ -7,6 +7,16 @@ pub mod vec;
 
 pub use entry_store::{Branch, EntryStore};
 use fxhash::FxHashMap;
+use ppvm_traits::traits::Clifford;
+
+// Hasher for the structural `word_fingerprint`. gxhash (AES-based) is fastest on
+// native and exposes a `gxhash64` bulk free function, but it needs hardware AES
+// and does not build on wasm32, so fall back to fxhash there. The fingerprint is
+// a transient in-memory dedup key — collisions are resolved by
+// `structurally_equal`, and it is never persisted or compared across builds — so
+// the hasher may differ per target without affecting results.
+#[cfg(target_arch = "wasm32")]
+use fxhash::FxHasher as FingerprintHasher;
 use num::{
     Complex, One, Zero,
     complex::{Complex64, ComplexFloat},
@@ -15,9 +25,20 @@ use ppvm_tableau::{
     data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex,
 };
 use ppvm_traits::config::Config;
-use ppvm_traits::traits::Clifford;
+#[cfg(target_arch = "wasm32")]
+use std::hash::Hasher;
 use std::ops::AddAssign;
 
+// Reusable per-thread scratch buffer for `word_fingerprint`. Gathering every
+// row's word bytes into one contiguous slice lets us hash in a single bulk call
+// instead of two tiny `Hash::hash` writes per row (high per-call overhead).
+// Cleared (capacity retained) per call, so it adapts to any row count / qubit
+// width without re-allocating. Bytes (not the storage word type) because the
+// storage element width (`[u8; N]` vs `[u64; N]`) is generic at this call site.
+thread_local! {
+    static WORD_FP_BUF: std::cell::RefCell<Vec<u8>> = const { std::cell::RefCell::new(Vec::new()) };
+}
+
 /// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage`
 /// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes
 /// are fully initialized and `u8`-aligned.
@@ -38,43 +59,32 @@ fn pod_bytes<A: Copy>(value: &A) -> &[u8] {
 pub fn word_fingerprint<T, I, C>(tab: &GeneralizedTableau<T, I, C>) -> u64
 where
     T: Config,
+    I:,
     C: SparseVector<Complex<T::Coeff>, I>,
 {
-    // fxhash-style multiplicative-rotate mix, fed the raw storage words of
-    // every row's x then z bits. No allocation, no thread_local, single pass.
-    const K: u64 = 0x51_7c_c1_b7_27_22_0a_95; // fxhash constant
-    let mut h: u64 = 0;
-    let mut mix = |w: u64| {
-        h = (h.rotate_left(5) ^ w).wrapping_mul(K);
-    };
-    for row in tab.tableau.data.iter() {
-        // The `PauliWord` hash cache is disabled for tableau rows
-        // (`REHASH = false`), so hashing `row.word` would feed a stale zero and
-        // make every tableau collide; read the bit storage directly. The
-        // storage word element type is generic (`[u8; N]` vs `[u64; N]`) with no
-        // numeric bound available here, so view it as POD bytes and fold 8 at a
-        // time into a `u64` lane — identical on native and wasm.
-        mix_pod_words(pod_bytes(&row.word.xbits.data), &mut mix);
-        mix_pod_words(pod_bytes(&row.word.zbits.data), &mut mix);
-    }
-    h
-}
+    WORD_FP_BUF.with(|cell| {
+        let mut buf = cell.borrow_mut();
+        // Clear retains capacity; refill with every row's bits as raw bytes.
+        buf.clear();
+        for row in tab.tableau.data.iter() {
+            // Gather the Pauli bits directly: the `PauliWord` hash cache is
+            // disabled for tableau rows (`REHASH = false`), so hashing
+            // `row.word` would feed a stale zero and make every tableau collide.
+            buf.extend_from_slice(pod_bytes(&row.word.xbits.data));
+            buf.extend_from_slice(pod_bytes(&row.word.zbits.data));
+        }
 
-/// Fold a POD byte slice into the `mix` closure 8 bytes (one `u64` lane) at a
-/// time, little-endian, zero-padding a short trailing chunk. Keeps the hash
-/// independent of the generic storage word width.
-#[inline]
-fn mix_pod_words(bytes: &[u8], mix: &mut impl FnMut(u64)) {
-    let mut chunks = bytes.chunks_exact(8);
-    for c in &mut chunks {
-        mix(u64::from_le_bytes(c.try_into().unwrap()));
-    }
-    let rem = chunks.remainder();
-    if !rem.is_empty() {
-        let mut buf = [0u8; 8];
-        buf[..rem.len()].copy_from_slice(rem);
-        mix(u64::from_le_bytes(buf));
-    }
+        #[cfg(not(target_arch = "wasm32"))]
+        {
+            gxhash::gxhash64(&buf, 0)
+        }
+        #[cfg(target_arch = "wasm32")]
+        {
+            let mut hasher = FingerprintHasher::default();
+            hasher.write(&buf);
+            hasher.finish()
+        }
+    })
 }
 
 /// Per-row mask (splitmix64 of `(index, salt)`); a stable pure function used

From d491621054464a35cde88f0e759dd1fac1d7cbce Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:16:18 +0200
Subject: [PATCH 16/24] chore(autotune): plan direct-word-hash approach

---
 .../direct-word-hash/prompts.md                   | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md
new file mode 100644
index 000000000..7cbbf1987
--- /dev/null
+++ b/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md
@@ -0,0 +1,15 @@
+# Approach: direct single-pass word_fingerprint (no gather, no thread_local)
+
+## Hypothesis
+`word_fingerprint` currently gathers all row words into a thread_local buffer and
+calls `gxhash64` (the `with` frame is 22% self). The gather (memcpy ~2.7KB/entry),
+the thread_local access, and the separate hash pass have overhead. Replace with a
+direct single-pass scalar hash that reads the row words straight from the tableau
+(no buffer, no thread_local, one pass). fxhash-style mixing is proven adequate
+here (it's the existing wasm fallback) and `structurally_equal` resolves any
+extra collisions (it's <1% of runtime, lots of headroom). Bonus: the hash becomes
+portable (same on native + wasm), simplifying the cfg.
+
+## Target
+`./target/release/examples/msd-noisy-bench`; baseline now build_median ~552ms.
+Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us.

From b1c0ed532517bd53aa1a47782e4e0d1ceb7f7e86 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:16:24 +0200
Subject: [PATCH 17/24] chore(autotune): record direct-word-hash discard

---
 docs/autotune/2026-06-23-tableau-sum-build/log.md      | 1 +
 docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
index 3dc311a7b..22fb3f312 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/log.md
+++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md
@@ -12,3 +12,4 @@
 - KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy).
 - KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). word_fingerprint now gathers all row words into a reused thread_local Vec<u8> and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec<u8>+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass.
 - KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged.
+- DISCARD direct-word-hash: scalar fxhash-style fold REGRESSED 552ms -> 921ms. Reverted. Lesson: word_fingerprint is SIMD-throughput-bound; gxhash (even with the gather+thread_local) beats a scalar byte/word fold. The gather+gxhash is near-optimal for full hashing. => the ONLY way to cut the 22% word-hash is to avoid re-hashing entirely (incremental/Zobrist fingerprint maintained through gates). Next: incremental fingerprinting.
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
index 3a64c189d..0cc1e5130 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
+++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
@@ -26,3 +26,10 @@
 "build_median_ms" = 552.0
 "per_shot_ns" = 22300.0
 "branches" = 2025.0
+[[metric]]
+"commit" = "d4916210"
+"status" = "discard"
+"description" = "direct single-pass scalar word_fingerprint (no gather/gxhash) \u2014 REGRESSED"
+"build_median_ms" = 921.0
+"per_shot_ns" = 22300.0
+"branches" = 2025.0

From 4be19ee3afae3273082c217ccbd56c38369df8ee Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:22:30 +0200
Subject: [PATCH 18/24] perf(tableau-sum): direct-word column reads in pauli
 noise + virtual compare

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/ppvm-tableau-sum/src/noise.rs       | 17 +++++++++++++----
 crates/ppvm-tableau-sum/src/storage/mod.rs | 21 ++++++++++++++++++---
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/crates/ppvm-tableau-sum/src/noise.rs b/crates/ppvm-tableau-sum/src/noise.rs
index d6799d035..ad1bad2b1 100644
--- a/crates/ppvm-tableau-sum/src/noise.rs
+++ b/crates/ppvm-tableau-sum/src/noise.rs
@@ -3,7 +3,7 @@
 
 use std::fmt::Debug;
 
-use bitvec::view::BitView;
+use bitvec::view::{BitView, BitViewSized};
 use num::{
     Complex, One, PrimInt, ToPrimitive, Zero,
     complex::{Complex64, ComplexFloat},
@@ -20,7 +20,9 @@ use rand::{RngExt, rngs::SmallRng};
 
 use crate::{
     data::GeneralizedTableauSum,
-    storage::{Branch, BranchMutation, EntryStore, RowMasks, loss_mask, pauli_branch_phase_loss},
+    storage::{
+        Branch, BranchMutation, EntryStore, RowMasks, bit_at, loss_mask, pauli_branch_phase_loss,
+    },
 };
 
 fn single_qubit_loss_branch<T, I, C>(
@@ -215,6 +217,11 @@ where
         // Precompute the per-row sign masks once instead of recomputing the
         // splitmix `sign_mask` per row per entry in the hot loop below.
         let masks = RowMasks::new(self.n_qubits);
+        // The store-word index / bit position of column `addr0` are the same for
+        // every entry and row, so resolve them once (Lsb0 convention).
+        let bits_per_word = std::mem::size_of::<<T::Storage as BitView>::Store>() * 8;
+        let word_idx = addr0 / bits_per_word;
+        let bit = addr0 % bits_per_word;
         let mut idx = 0usize;
         self.entries
             .for_each_mut_with_keys(|tab, p_sum, word_fp, phase_loss| {
@@ -226,8 +233,10 @@ where
 
                 let (mut dx, mut dy, mut dz) = (0u64, 0u64, 0u64);
                 for (row, pw) in tab.tableau.data.iter().enumerate() {
-                    let x: bool = pw.word.xbits[addr0];
-                    let z: bool = pw.word.zbits[addr0];
+                    let xw = pw.word.xbits.data.as_raw_slice();
+                    let zw = pw.word.zbits.data.as_raw_slice();
+                    let x: bool = bit_at(xw, word_idx, bit);
+                    let z: bool = bit_at(zw, word_idx, bit);
                     let m = masks.sign[row];
                     if z {
                         dx ^= m;
diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs
index 268a3a091..6d9393fcb 100644
--- a/crates/ppvm-tableau-sum/src/storage/mod.rs
+++ b/crates/ppvm-tableau-sum/src/storage/mod.rs
@@ -15,10 +15,11 @@ use ppvm_traits::traits::Clifford;
 // a transient in-memory dedup key — collisions are resolved by
 // `structurally_equal`, and it is never persisted or compared across builds — so
 // the hasher may differ per target without affecting results.
+use bitvec::view::{BitView, BitViewSized};
 #[cfg(target_arch = "wasm32")]
 use fxhash::FxHasher as FingerprintHasher;
 use num::{
-    Complex, One, Zero,
+    Complex, One, PrimInt, Zero,
     complex::{Complex64, ComplexFloat},
 };
 use ppvm_tableau::{
@@ -39,6 +40,14 @@ thread_local! {
     static WORD_FP_BUF: std::cell::RefCell<Vec<u8>> = const { std::cell::RefCell::new(Vec::new()) };
 }
 
+/// Read a single bit from a raw store-word slice (Lsb0 convention). Skips the
+/// per-call word/bit recomputation and bounds-check that `BitArray`'s `Index`
+/// does, for the hot per-row column reads in noise propagation.
+#[inline]
+pub(crate) fn bit_at<S: PrimInt>(words: &[S], word_idx: usize, bit: usize) -> bool {
+    (words[word_idx] >> bit) & S::one() != S::zero()
+}
+
 /// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage`
 /// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes
 /// are fully initialized and `u8`-aligned.
@@ -334,6 +343,7 @@ pub(crate) fn structurally_equal_mutated<T, I, C>(
 ) -> bool
 where
     T: Config,
+    <<T as Config>::Storage as BitView>::Store: PrimInt,
     T::Coeff: One + Zero + Clone + num::Num + PartialOrd,
     Complex<T::Coeff>: std::ops::Mul<Output = Complex<T::Coeff>>
         + AddAssign
@@ -385,12 +395,17 @@ where
             }
         }
         BranchMutation::Pauli { op, addr0 } => {
+            let bits_per_word = std::mem::size_of::<<T::Storage as BitView>::Store>() * 8;
+            let word_idx = addr0 / bits_per_word;
+            let bit = addr0 % bits_per_word;
             for (re, rp) in existing.tableau.data.iter().zip(parent.tableau.data.iter()) {
                 if re.word != rp.word {
                     return false;
                 }
-                let x: bool = rp.word.xbits[addr0];
-                let z: bool = rp.word.zbits[addr0];
+                let xw = rp.word.xbits.data.as_raw_slice();
+                let zw = rp.word.zbits.data.as_raw_slice();
+                let x: bool = bit_at(xw, word_idx, bit);
+                let z: bool = bit_at(zw, word_idx, bit);
                 let flip = match op {
                     1 => z,
                     2 => x ^ z,

From 4b8fa6f4a4d7f8a3bcaa36c435ff5beaaabdd0ff Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:23:27 +0200
Subject: [PATCH 19/24] chore(autotune): plan direct-column-read approach

---
 .../direct-column-read/prompts.md                  | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md
new file mode 100644
index 000000000..5b565edde
--- /dev/null
+++ b/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md
@@ -0,0 +1,14 @@
+# Approach: direct-word column reads in hot loops
+
+## Hypothesis
+`pauli_error`'s per-entry dx/dy/dz loop and `structurally_equal_mutated`'s Pauli
+branch read the tableau column with `bitvec`'s generic `Index` (`pw.word.xbits[addr0]`),
+which recomputes word/bit and bounds-checks per access — done for all 2n rows of
+every entry on every depolarize (part of the 23% `for_each_mut_with_keys` self).
+Replace with direct storage-word access: compute `word_idx = addr0 / bits_per_word`
+and `bit = addr0 % bits_per_word` ONCE, then test `(data.as_raw_slice()[word_idx] >> bit) & 1`.
+Same bit values (Lsb0, matches `Tableau::build_masks`), so branches stay 2025.
+
+## Target
+`./target/release/examples/msd-noisy-bench`; baseline now build_median ~542ms.
+Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us.

From 1c2e6e30b203e2389a03a1e8100deb944c3bb964 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:23:27 +0200
Subject: [PATCH 20/24] chore(autotune): record direct-column-read keep
 (cumulative 5.04x)

---
 docs/autotune/2026-06-23-tableau-sum-build/log.md      | 1 +
 docs/autotune/2026-06-23-tableau-sum-build/metric.toml | 7 +++++++
 2 files changed, 8 insertions(+)

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
index 22fb3f312..f00469525 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/log.md
+++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md
@@ -13,3 +13,4 @@
 - KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). word_fingerprint now gathers all row words into a reused thread_local Vec<u8> and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec<u8>+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass.
 - KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged.
 - DISCARD direct-word-hash: scalar fxhash-style fold REGRESSED 552ms -> 921ms. Reverted. Lesson: word_fingerprint is SIMD-throughput-bound; gxhash (even with the gather+thread_local) beats a scalar byte/word fold. The gather+gxhash is near-optimal for full hashing. => the ONLY way to cut the 22% word-hash is to avoid re-hashing entirely (incremental/Zobrist fingerprint maintained through gates). Next: incremental fingerprinting.
+- KEEP direct-column-read: build_median 542ms -> 520ms (~4%; cumulative 5.04x). bit_at() helper reads the storage word directly (word_idx/bit precomputed once) instead of bitvec Index per row, in pauli_error dx/dy/dz and structurally_equal_mutated. accuracy identical, per_shot unchanged.
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
index 0cc1e5130..3d943cd80 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
+++ b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
@@ -33,3 +33,10 @@
 "build_median_ms" = 921.0
 "per_shot_ns" = 22300.0
 "branches" = 2025.0
+[[metric]]
+"commit" = "4b8fa6f4"
+"status" = "keep"
+"description" = "direct-word column reads in pauli_error dx/dy/dz + structurally_equal_mutated"
+"build_median_ms" = 520.0
+"per_shot_ns" = 22100.0
+"branches" = 2025.0

From 683d3ef662b4060b94fa9e399f8398f51a7177f1 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Tue, 23 Jun 2026 21:30:48 +0200
Subject: [PATCH 21/24] chore(autotune): record plateau analysis +
 incremental-fingerprint opportunity

---
 docs/autotune/2026-06-23-tableau-sum-build/log.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
index f00469525..02c7f8683 100644
--- a/docs/autotune/2026-06-23-tableau-sum-build/log.md
+++ b/docs/autotune/2026-06-23-tableau-sum-build/log.md
@@ -14,3 +14,6 @@
 - KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged.
 - DISCARD direct-word-hash: scalar fxhash-style fold REGRESSED 552ms -> 921ms. Reverted. Lesson: word_fingerprint is SIMD-throughput-bound; gxhash (even with the gather+thread_local) beats a scalar byte/word fold. The gather+gxhash is near-optimal for full hashing. => the ONLY way to cut the 22% word-hash is to avoid re-hashing entirely (incremental/Zobrist fingerprint maintained through gates). Next: incremental fingerprinting.
 - KEEP direct-column-read: build_median 542ms -> 520ms (~4%; cumulative 5.04x). bit_at() helper reads the storage word directly (word_idx/bit precomputed once) instead of bitvec Index per row, in pauli_error dx/dy/dz and structurally_equal_mutated. accuracy identical, per_shot unchanged.
+- PLATEAU ANALYSIS @ 520ms (5.04x): profile now: word-hash gxhash gather ('with') 22%, phase_loss_hash_with 14.5% (=> rebuild ~36.7%), noise branch-building (for_each self) 21%, clifford gates 23% (cz 10.5% + sqrt_* 12.7%), sampling ~10% (not build), T compute_decomposition 4.5%, merge 3.8%. The rebuild is SIMD-compute-bound (proven: scalar-fold DISCARD regressed), so full re-hashing is near-optimal. Remaining gate cost is inherent (gate applied to every one of 2025 entries x 170 rows).
+
+NEXT OPPORTUNITY (not done — high complexity, conflicts with user 'simplicity over cleverness' pref; ~10-20% est, revert-safe due to loud failure via test_word_fingerprint_cache_stays_consistent + branches!=2025 guard): cell-level incremental ('Zobrist') word_fingerprint maintained through clifford gates so rebuild is skipped for the common gates. Replace gxhash word_fp with XOR over (row r, qubit q) of (xbit?XM(r,q):0)^(zbit?ZM(r,q):0). Keep dirty+full-recompute fallback for non-handled gates (T, s, cnot, x/y/z standalone). Incremental update per gate reads OLD column bits (direct, via bit_at) and XORs precomputed column masks; gate-delta rules (single gate at col q, from old x,z): h: x_chg=z_chg=(x^z), sign=x&z; sqrt_x: x_chg=z,z_chg=0,sign=z&!x; sqrt_x_dag: x_chg=z,sign=x&z; sqrt_y: x_chg=z_chg=(x^z),sign=x&!z; sqrt_y_dag: x_chg=z_chg=(x^z),sign=z&!x. cz(c,t) from old xa,za,xb,zb: z@c flips iff xb (XOR ZM[c][r]), z@t flips iff xa (XOR ZM[t][r]), x unchanged, sign=xa&xb&(za^zb). phase_loss uses existing sign_mask. Estimated ~11% (non-fused, gates left intact) to ~20% (fused gate+delta in one direct pass).

From 863eadc823276ed59bc95b5efce5319a61604c3c Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Wed, 24 Jun 2026 08:51:10 +0200
Subject: [PATCH 22/24] chore(autotune): drop session scratch docs (kept out of
 PR)

The per-iteration log, metric ledger, and prompt records were working
notes for the tuning session; they are summarized in the PR description
rather than checked in.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../bulk-word-hash/prompts.md                 | 15 -------
 .../direct-column-read/prompts.md             | 14 -------
 .../direct-word-hash/prompts.md               | 15 -------
 .../lazy-materialization/prompts.md           | 33 ---------------
 .../2026-06-23-tableau-sum-build/log.md       | 19 ---------
 .../2026-06-23-tableau-sum-build/metric.toml  | 42 -------------------
 .../precompute-masks/prompts.md               | 16 -------
 7 files changed, 154 deletions(-)
 delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md
 delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md
 delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md
 delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md
 delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/log.md
 delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/metric.toml
 delete mode 100644 docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md

diff --git a/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md
deleted file mode 100644
index f928d0d17..000000000
--- a/docs/autotune/2026-06-23-tableau-sum-build/bulk-word-hash/prompts.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Approach: bulk word_fingerprint hashing
-
-## Hypothesis
-After lazy materialization, `rebuild_fingerprints_if_dirty` dominates (47% self,
-61% inclusive); it re-hashes every entry's words after each clifford gate marks
-them dirty. `word_fingerprint` currently does 2 small `Hash::hash` calls per row
-(`xbits.data` then `zbits.data`) = ~340 hasher writes for 170 rows, with
-per-call overhead. Gather the row words into one contiguous buffer and hash once
-with `gxhash::gxhash64` (native) — far less per-call overhead, single SIMD pass.
-
-## Target
-`./target/release/examples/msd-noisy-bench`; baseline now build_median ~958ms.
-Must keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632.
-Fingerprints are transient dedup keys (resolved by structurally_equal), so the
-hash VALUE may change freely as long as it's consistent within a build.
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md
deleted file mode 100644
index 5b565edde..000000000
--- a/docs/autotune/2026-06-23-tableau-sum-build/direct-column-read/prompts.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# Approach: direct-word column reads in hot loops
-
-## Hypothesis
-`pauli_error`'s per-entry dx/dy/dz loop and `structurally_equal_mutated`'s Pauli
-branch read the tableau column with `bitvec`'s generic `Index` (`pw.word.xbits[addr0]`),
-which recomputes word/bit and bounds-checks per access — done for all 2n rows of
-every entry on every depolarize (part of the 23% `for_each_mut_with_keys` self).
-Replace with direct storage-word access: compute `word_idx = addr0 / bits_per_word`
-and `bit = addr0 % bits_per_word` ONCE, then test `(data.as_raw_slice()[word_idx] >> bit) & 1`.
-Same bit values (Lsb0, matches `Tableau::build_masks`), so branches stay 2025.
-
-## Target
-`./target/release/examples/msd-noisy-bench`; baseline now build_median ~542ms.
-Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us.
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md
deleted file mode 100644
index 7cbbf1987..000000000
--- a/docs/autotune/2026-06-23-tableau-sum-build/direct-word-hash/prompts.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Approach: direct single-pass word_fingerprint (no gather, no thread_local)
-
-## Hypothesis
-`word_fingerprint` currently gathers all row words into a thread_local buffer and
-calls `gxhash64` (the `with` frame is 22% self). The gather (memcpy ~2.7KB/entry),
-the thread_local access, and the separate hash pass have overhead. Replace with a
-direct single-pass scalar hash that reads the row words straight from the tableau
-(no buffer, no thread_local, one pass). fxhash-style mixing is proven adequate
-here (it's the existing wasm fallback) and `structurally_equal` resolves any
-extra collisions (it's <1% of runtime, lots of headroom). Bonus: the hash becomes
-portable (same on native + wasm), simplifying the cfg.
-
-## Target
-`./target/release/examples/msd-noisy-bench`; baseline now build_median ~552ms.
-Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us.
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md
deleted file mode 100644
index cb1883d20..000000000
--- a/docs/autotune/2026-06-23-tableau-sum-build/lazy-materialization/prompts.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Approach: lazy branch materialization (loss_channel + pauli_error)
-
-## Hypothesis
-Build time is dominated by `fork` (deep-clone of a ~7KB `GeneralizedTableau`,
-47% inclusive, 32% `_platform_memmove` self). Each `depolarize1` forks 3 full
-tableaux per entry and `loss_channel` forks 1 per entry, but ~85% of those
-branches are immediately merged into an existing entry or dropped below
-`sum_cutoff`. Those clones are pure waste.
-
-Key fact (verified in `ppvm-tableau/src/gates/clifford.rs`): applying X/Y/Z to a
-`GeneralizedTableau` only flips per-row **sign bits** and leaves the Pauli
-words, the `coefficients` vector, and `is_lost` identical to the parent. Loss
-only sets one `is_lost` bit. So a branch's fingerprint and structural identity
-are derivable from the parent **without cloning**. Materialize (clone+mutate)
-only when a branch survives as a *new* entry.
-
-Per-row sign-flip rule at column `addr0` (matches the gate code exactly):
-- X flips sign of row iff `z[addr0] == 1`
-- Y flips sign of row iff `x[addr0] ^ z[addr0] == 1`
-- Z flips sign of row iff `x[addr0] == 1`
-(Only phase bit 1 = sign; the imaginary bit 0 is untouched. So the phase/loss
-hash delta is `XOR sign_mask(row)` over flipped rows — same as the existing
-`pauli_branch_phase_loss`.)
-
-## Target metric
-`cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench`
-Baseline: build_median ~2620ms, per_shot ~22.5us, branches=2025,
-sum_p2=0.725135705447, top5[0]=0.8515413524292632. The math must be unchanged:
-branches stays 2025 and the accuracy fingerprint must match to ~1e-9.
-
-## Expected win
-Replace ~3N depolarize clones + ~N loss clones with clones only for survivors
-(~1.2N total entries). Target 25-40% build-time reduction.
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/log.md b/docs/autotune/2026-06-23-tableau-sum-build/log.md
deleted file mode 100644
index 02c7f8683..000000000
--- a/docs/autotune/2026-06-23-tableau-sum-build/log.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# Log for 2026-06-23-tableau-sum-build
-
-## 2026-06-23
-- Architecture Notes / baseline profile (samply, 2733 samples):
-- Target: examples/msd-noisy build time. Baseline build_median ~2620ms, per_shot ~22.5us, final branches=2025. Config Byte8F64<2> (storage [u64;2]=128bit), index u128, 85 qubits => tableau has 170 rows, each row ~32B word data.
-- INCLUSIVE: for_each_mut_with_keys 85%; depolarize1 53%; fork(clone) 47%; loss_channel 38%; rebuild_fingerprints_if_dirty 23%; mimalloc alloc ~15-20%.
-- SELF: _platform_memmove 32% (the tableau deep-clone in fork); rebuild_fingerprints_if_dirty 18% (re-hashes all words after every clifford gate marks dirty); for_each_mut_with_keys 11%; phase_loss_hash 5%; gates y/cz/sqrt_* ~10% total.
-- Root causes: (1) noise branching deep-clones a full ~7KB tableau per branch; depolarize forks 3x/entry, ~85% of branches are then merged or truncated -> wasted clones. (2) every clifford gate marks all entries dirty -> next noise op re-hashes all words of all entries.
-- Accuracy guard: branch count must stay 2025 (optimizations must not change the math). Cutoff fixed at 1e-7.
-- Bench cmd: cargo build --release -p ppvm-tableau-sum --example msd-noisy-bench && ./target/release/examples/msd-noisy-bench
-- Accuracy reference (must be preserved to ~1e-9 by all optimizations): branches=2025, sum_p=1.0, sum_p2=0.725135705447, top5_p[0]=0.8515413524292632. Key code fact: GeneralizedTableau X/Y/Z (gates/clifford.rs impl_generalized_tableau_clifford) only flip per-row sign bits (X where z=1, Y where x^z=1, Z where x=1) and leave words/coefficients/is_lost identical to parent; loss only sets one is_lost bit. So branch fingerprints are derivable without cloning -> lazy materialization is the plan.
-- KEEP lazy-materialization: build_median 2620ms -> 958ms (2.73x, -63%). per_shot unchanged (~22us). branches=2025, sum_p2=0.725135705447 and top5 bit-identical => math unchanged. 76 tests pass. Implemented BranchMutation + structurally_equal_mutated + apply_branch_mutation in storage/mod.rs; insert_or_merge_mutated_branches trait method (VecStorage lazy/index-based, MapStorage eager-clone fallback); loss_channel + pauli_error emit virtual branches and only clone survivors. two_qubit/correlated_loss/reset_loss left eager (not in msd-noisy).
-- KEEP bulk-word-hash: build_median 958ms -> 573ms (1.67x; cumulative 4.57x from 2620). word_fingerprint now gathers all row words into a reused thread_local Vec<u8> and calls gxhash64 once instead of ~340 tiny Hash writes. Subagent used Vec<u8>+pod_bytes (storage type is generic A:PauliStorage, not usize) and avoided adding bytemuck dep. per_shot unchanged, accuracy identical, 76+33 tests pass.
-- KEEP precompute-masks: build_median 573ms -> 552ms (~3.7%; cumulative 4.75x). RowMasks table built once per op; phase_loss_hash_with + pauli_error dx/dy/dz index it instead of recomputing splitmix sign_mask per row per entry. Smaller than hoped => phase_loss_hash wasn't splitmix-dominated (mostly iteration over rows). accuracy identical, per_shot unchanged.
-- DISCARD direct-word-hash: scalar fxhash-style fold REGRESSED 552ms -> 921ms. Reverted. Lesson: word_fingerprint is SIMD-throughput-bound; gxhash (even with the gather+thread_local) beats a scalar byte/word fold. The gather+gxhash is near-optimal for full hashing. => the ONLY way to cut the 22% word-hash is to avoid re-hashing entirely (incremental/Zobrist fingerprint maintained through gates). Next: incremental fingerprinting.
-- KEEP direct-column-read: build_median 542ms -> 520ms (~4%; cumulative 5.04x). bit_at() helper reads the storage word directly (word_idx/bit precomputed once) instead of bitvec Index per row, in pauli_error dx/dy/dz and structurally_equal_mutated. accuracy identical, per_shot unchanged.
-- PLATEAU ANALYSIS @ 520ms (5.04x): profile now: word-hash gxhash gather ('with') 22%, phase_loss_hash_with 14.5% (=> rebuild ~36.7%), noise branch-building (for_each self) 21%, clifford gates 23% (cz 10.5% + sqrt_* 12.7%), sampling ~10% (not build), T compute_decomposition 4.5%, merge 3.8%. The rebuild is SIMD-compute-bound (proven: scalar-fold DISCARD regressed), so full re-hashing is near-optimal. Remaining gate cost is inherent (gate applied to every one of 2025 entries x 170 rows).
-
-NEXT OPPORTUNITY (not done — high complexity, conflicts with user 'simplicity over cleverness' pref; ~10-20% est, revert-safe due to loud failure via test_word_fingerprint_cache_stays_consistent + branches!=2025 guard): cell-level incremental ('Zobrist') word_fingerprint maintained through clifford gates so rebuild is skipped for the common gates. Replace gxhash word_fp with XOR over (row r, qubit q) of (xbit?XM(r,q):0)^(zbit?ZM(r,q):0). Keep dirty+full-recompute fallback for non-handled gates (T, s, cnot, x/y/z standalone). Incremental update per gate reads OLD column bits (direct, via bit_at) and XORs precomputed column masks; gate-delta rules (single gate at col q, from old x,z): h: x_chg=z_chg=(x^z), sign=x&z; sqrt_x: x_chg=z,z_chg=0,sign=z&!x; sqrt_x_dag: x_chg=z,sign=x&z; sqrt_y: x_chg=z_chg=(x^z),sign=x&!z; sqrt_y_dag: x_chg=z_chg=(x^z),sign=z&!x. cz(c,t) from old xa,za,xb,zb: z@c flips iff xb (XOR ZM[c][r]), z@t flips iff xa (XOR ZM[t][r]), x unchanged, sign=xa&xb&(za^zb). phase_loss uses existing sign_mask. Estimated ~11% (non-fused, gates left intact) to ~20% (fused gate+delta in one direct pass).
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml b/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
deleted file mode 100644
index 3d943cd80..000000000
--- a/docs/autotune/2026-06-23-tableau-sum-build/metric.toml
+++ /dev/null
@@ -1,42 +0,0 @@
-[[metric]]
-"commit" = "f152652e"
-"status" = "keep"
-"description" = "baseline (msd-noisy build, seeded harness, median of 5)"
-"build_median_ms" = 2620.0
-"per_shot_ns" = 22500.0
-"branches" = 2025.0
-[[metric]]
-"commit" = "59165235"
-"status" = "keep"
-"description" = "lazy branch materialization for loss_channel + pauli_error (single-qubit depolarize)"
-"build_median_ms" = 958.0
-"per_shot_ns" = 22100.0
-"branches" = 2025.0
-[[metric]]
-"commit" = "ad0d0b24"
-"status" = "keep"
-"description" = "bulk-hash word_fingerprint (gather rows, single gxhash64)"
-"build_median_ms" = 573.0
-"per_shot_ns" = 22100.0
-"branches" = 2025.0
-[[metric]]
-"commit" = "4fea5841"
-"status" = "keep"
-"description" = "precompute per-row masks (RowMasks) for phase_loss + pauli dx/dy/dz"
-"build_median_ms" = 552.0
-"per_shot_ns" = 22300.0
-"branches" = 2025.0
-[[metric]]
-"commit" = "d4916210"
-"status" = "discard"
-"description" = "direct single-pass scalar word_fingerprint (no gather/gxhash) \u2014 REGRESSED"
-"build_median_ms" = 921.0
-"per_shot_ns" = 22300.0
-"branches" = 2025.0
-[[metric]]
-"commit" = "4b8fa6f4"
-"status" = "keep"
-"description" = "direct-word column reads in pauli_error dx/dy/dz + structurally_equal_mutated"
-"build_median_ms" = 520.0
-"per_shot_ns" = 22100.0
-"branches" = 2025.0
diff --git a/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md b/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md
deleted file mode 100644
index 5dc8c0fff..000000000
--- a/docs/autotune/2026-06-23-tableau-sum-build/precompute-masks/prompts.md
+++ /dev/null
@@ -1,16 +0,0 @@
-# Approach: precompute per-row masks (kill redundant splitmix)
-
-## Hypothesis
-`sign_mask(row)`/`imag_mask(row)`/`loss_mask(q)` are splitmix64 hashes of a pure
-index. They are recomputed per-row, per-entry, on every op:
-- `pauli_error`'s dx/dy/dz loop computes `sign_mask(row)` for all 2n rows of
-  every entry on every depolarize.
-- `phase_loss_hash` (called per entry in `rebuild_fingerprints_if_dirty`)
-  recomputes sign/imag masks per set phase and loss masks per lost qubit.
-
-Precompute the per-index mask tables ONCE per op and index them. Values are
-identical, so all fingerprints (and the accuracy fingerprint) are unchanged.
-
-## Target
-`./target/release/examples/msd-noisy-bench`; baseline now build_median ~573ms.
-Keep branches=2025, sum_p2=0.725135705447, top5[0]=0.8515413524292632, per_shot ~22us.

From e35ddd94e721acf417f323563c5488f385ab7b68 Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Wed, 24 Jun 2026 09:19:59 +0200
Subject: [PATCH 23/24] refactor(tableau-sum): use bytemuck::bytes_of in
 word_fingerprint

Replace the hand-rolled `unsafe pod_bytes` byte view with
`bytemuck::bytes_of`. `PauliStorage` already requires `bytemuck::Pod`,
so the byte view is sound without `unsafe`, matching the existing idiom
in `PauliWord::rehash`. Identical codegen (same pointer cast), so build
time and accuracy are unchanged (branches=2025, sum_p2 bit-identical).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                                 |  1 +
 crates/ppvm-tableau-sum/Cargo.toml         |  1 +
 crates/ppvm-tableau-sum/src/storage/mod.rs | 17 ++++-------------
 3 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index f0aaf37d1..b0a5bb9ef 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1034,6 +1034,7 @@ name = "ppvm-tableau-sum"
 version = "0.1.0"
 dependencies = [
  "bitvec",
+ "bytemuck",
  "criterion 0.8.2",
  "fxhash",
  "gxhash",
diff --git a/crates/ppvm-tableau-sum/Cargo.toml b/crates/ppvm-tableau-sum/Cargo.toml
index 65355efec..aca490053 100644
--- a/crates/ppvm-tableau-sum/Cargo.toml
+++ b/crates/ppvm-tableau-sum/Cargo.toml
@@ -5,6 +5,7 @@ edition = "2024"
 
 [dependencies]
 bitvec = "1.0.1"
+bytemuck = { version = "1", features = ["min_const_generics"] }
 fxhash = "0.2.1"
 num = "0.4.3"
 ppvm-traits = { version = "0.1.0", path = "../ppvm-traits" }
diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs
index 6d9393fcb..c62f7daa6 100644
--- a/crates/ppvm-tableau-sum/src/storage/mod.rs
+++ b/crates/ppvm-tableau-sum/src/storage/mod.rs
@@ -48,17 +48,6 @@ pub(crate) fn bit_at<S: PrimInt>(words: &[S], word_idx: usize, bit: usize) -> bo
     (words[word_idx] >> bit) & S::one() != S::zero()
 }
 
-/// View a `Copy` plain-old-data value's bytes. Sound because `A: PauliStorage`
-/// implies `bytemuck::Pod`: no padding, every bit pattern valid, so the bytes
-/// are fully initialized and `u8`-aligned.
-#[inline]
-fn pod_bytes<A: Copy>(value: &A) -> &[u8] {
-    // SAFETY: `A` is POD (PauliStorage: bytemuck::Pod); reading its
-    // `size_of::<A>()` initialized bytes as `[u8]` is sound, and the borrow is
-    // tied to `value`.
-    unsafe { std::slice::from_raw_parts(value as *const A as *const u8, std::mem::size_of::<A>()) }
-}
-
 /// Hash of the `word` (Pauli content) of every row, in order. This is the
 /// expensive component (each word is several machine words wide) and is
 /// *invariant* under X/Y/Z and `is_lost` flips, so a branch inherits it from
@@ -79,8 +68,10 @@ where
             // Gather the Pauli bits directly: the `PauliWord` hash cache is
             // disabled for tableau rows (`REHASH = false`), so hashing
             // `row.word` would feed a stale zero and make every tableau collide.
-            buf.extend_from_slice(pod_bytes(&row.word.xbits.data));
-            buf.extend_from_slice(pod_bytes(&row.word.zbits.data));
+            // `xbits.data`/`zbits.data` are the `PauliStorage` backing array,
+            // which is `bytemuck::Pod`, so this byte view is safe and zero-copy.
+            buf.extend_from_slice(bytemuck::bytes_of(&row.word.xbits.data));
+            buf.extend_from_slice(bytemuck::bytes_of(&row.word.zbits.data));
         }
 
         #[cfg(not(target_arch = "wasm32"))]

From a7c8057eea29658cb0e857c609ad446f1a889edf Mon Sep 17 00:00:00 2001
From: David Plankensteiner <dplankensteiner@quera.com>
Date: Thu, 25 Jun 2026 09:57:21 +0200
Subject: [PATCH 24/24] refactor(tableau-sum): type BranchMutation Pauli op as
 NotIdentity enum
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The depolarizing-branch op was a `u8` (1=X, 2=Y, 3=Z), so both matches
on it carried a dead `_` catch-all that silently ignored invalid ops.
Reuse the existing `ppvm_pauli_word::pattern::NotIdentity` enum (X/Y/Z)
instead, which makes `apply_branch_mutation` and the
`structurally_equal_mutated` flip rule exhaustive with no catch-all —
the invalid state is now unrepresentable.

Promotes `NotIdentity` from `pub(crate)` to `pub` and re-exports it from
the `pattern` module. Matching is by variant name, so the enum's
`X=1, Z=2, Y=3` discriminants don't affect behavior.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 crates/ppvm-pauli-word/src/pattern/data.rs |  5 ++++-
 crates/ppvm-pauli-word/src/pattern/mod.rs  |  2 +-
 crates/ppvm-tableau-sum/src/noise.rs       | 16 +++++++++++++---
 crates/ppvm-tableau-sum/src/storage/mod.rs | 19 +++++++++----------
 4 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/crates/ppvm-pauli-word/src/pattern/data.rs b/crates/ppvm-pauli-word/src/pattern/data.rs
index f34e48802..7e0d17a68 100644
--- a/crates/ppvm-pauli-word/src/pattern/data.rs
+++ b/crates/ppvm-pauli-word/src/pattern/data.rs
@@ -8,11 +8,14 @@ use bincode::{Decode, Encode};
 #[cfg(feature = "serde")]
 use serde::{Deserialize, Serialize};
 
+/// A single-qubit Pauli that is not the identity: `X`, `Y`, or `Z`. Encoded so
+/// the low two bits match [`Pauli`](crate::Pauli) (`X = 1`, `Z = 2`, `Y = 3`),
+/// which lets `From<NotIdentity> for Pauli` be a no-op transmute.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 #[cfg_attr(feature = "bincode", derive(Encode, Decode))]
 #[repr(u8)]
-pub(crate) enum NotIdentity {
+pub enum NotIdentity {
     X = 1,
     Z = 2,
     Y = 3,
diff --git a/crates/ppvm-pauli-word/src/pattern/mod.rs b/crates/ppvm-pauli-word/src/pattern/mod.rs
index 2147de525..93465d567 100644
--- a/crates/ppvm-pauli-word/src/pattern/mod.rs
+++ b/crates/ppvm-pauli-word/src/pattern/mod.rs
@@ -10,4 +10,4 @@ mod parse;
 mod trace;
 
 pub use contains::Contains;
-pub use data::PauliPattern;
+pub use data::{NotIdentity, PauliPattern};
diff --git a/crates/ppvm-tableau-sum/src/noise.rs b/crates/ppvm-tableau-sum/src/noise.rs
index ad1bad2b1..eedbde003 100644
--- a/crates/ppvm-tableau-sum/src/noise.rs
+++ b/crates/ppvm-tableau-sum/src/noise.rs
@@ -8,6 +8,7 @@ use num::{
     Complex, One, PrimInt, ToPrimitive, Zero,
     complex::{Complex64, ComplexFloat},
 };
+use ppvm_pauli_word::pattern::NotIdentity;
 use ppvm_tableau::{
     data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex,
 };
@@ -251,21 +252,30 @@ where
 
                 branches.push((
                     parent_idx,
-                    BranchMutation::Pauli { op: 1, addr0 },
+                    BranchMutation::Pauli {
+                        op: NotIdentity::X,
+                        addr0,
+                    },
                     p_sum.clone() * p[0].clone(),
                     word_fp,
                     phase_loss ^ dx,
                 ));
                 branches.push((
                     parent_idx,
-                    BranchMutation::Pauli { op: 2, addr0 },
+                    BranchMutation::Pauli {
+                        op: NotIdentity::Y,
+                        addr0,
+                    },
                     p_sum.clone() * p[1].clone(),
                     word_fp,
                     phase_loss ^ dy,
                 ));
                 branches.push((
                     parent_idx,
-                    BranchMutation::Pauli { op: 3, addr0 },
+                    BranchMutation::Pauli {
+                        op: NotIdentity::Z,
+                        addr0,
+                    },
                     p_sum.clone() * p[2].clone(),
                     word_fp,
                     phase_loss ^ dz,
diff --git a/crates/ppvm-tableau-sum/src/storage/mod.rs b/crates/ppvm-tableau-sum/src/storage/mod.rs
index c62f7daa6..c4320146c 100644
--- a/crates/ppvm-tableau-sum/src/storage/mod.rs
+++ b/crates/ppvm-tableau-sum/src/storage/mod.rs
@@ -22,6 +22,7 @@ use num::{
     Complex, One, PrimInt, Zero,
     complex::{Complex64, ComplexFloat},
 };
+use ppvm_pauli_word::pattern::NotIdentity;
 use ppvm_tableau::{
     data::GeneralizedTableau, sparsevec::SparseVector, tableau_index::TableauIndex,
 };
@@ -289,8 +290,8 @@ where
 /// cloning, materializing the tableau only for surviving new entries.
 #[derive(Clone, Copy, Debug)]
 pub enum BranchMutation {
-    /// Apply Pauli `op` (1=X, 2=Y, 3=Z) at `addr0`: flips per-row sign bits only.
-    Pauli { op: u8, addr0: usize },
+    /// Apply a non-identity Pauli at `addr0`: flips per-row sign bits only.
+    Pauli { op: NotIdentity, addr0: usize },
     /// Mark qubit `q` lost (set is_lost[q] = true).
     Loss { q: usize },
 }
@@ -307,10 +308,9 @@ pub(crate) fn apply_branch_mutation<T, I, C>(
 {
     match m {
         BranchMutation::Pauli { op, addr0 } => match op {
-            1 => tab.x(addr0),
-            2 => tab.y(addr0),
-            3 => tab.z(addr0),
-            _ => {}
+            NotIdentity::X => tab.x(addr0),
+            NotIdentity::Y => tab.y(addr0),
+            NotIdentity::Z => tab.z(addr0),
         },
         BranchMutation::Loss { q } => {
             tab.is_lost[q] = true;
@@ -398,10 +398,9 @@ where
                 let x: bool = bit_at(xw, word_idx, bit);
                 let z: bool = bit_at(zw, word_idx, bit);
                 let flip = match op {
-                    1 => z,
-                    2 => x ^ z,
-                    3 => x,
-                    _ => false,
+                    NotIdentity::X => z,
+                    NotIdentity::Y => x ^ z,
+                    NotIdentity::Z => x,
                 };
                 let virt_phase = rp.phase ^ ((flip as u8) << 1);
                 if re.phase != virt_phase {