diff --git a/benchmarks/README.md b/benchmarks/README.md
index 4fb388156..39dc22a30 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -48,3 +48,42 @@ uv run --with matplotlib python benchmarks/plot_tfim_sweep.py \
 - `plot_tfim_sweep.py` — renders the log-y comparison from the two CSVs.
 
 [pp]: https://github.com/MSRudolph/PauliPropagation.jl
+
+# Branch-coalesce scaling: sort-merge vs FxHashMap
+
+Follow-up study for PR #154, which replaced the `FxHashMap` coalesce in the
+T-gate hot path (`GeneralizedTableau::branch_with_coefficients`) with a
+sort-merge and measured ~10× on `cultivation_d5`. This harness asks whether
+that win **persists as the branch count `m` grows**, and **where the hash
+coalesce wins again**. Because #154 deleted the hash path from the default
+build, the bench reimplements *both* coalesce routines (faithful ports, asserted
+equivalent at start-up) and drives them with identical real inputs at
+`m = 2^k` (`k` branching T gates on an 80-qubit, `u128`-indexed tableau).
+
+Two collision regimes:
+
+- **doubling** — the next T flips a fresh index bit (output `2m`, zero merges);
+  the canonical per-T-gate cost. Sort-merge wins throughout and the gap
+  *widens* with scale (≈3.8× at `m = 2^20`).
+- **merge** — the next T flips a bit the set is already closed under (output
+  `m`, all collisions); the flavour of the measurement case-a path. The hash
+  coalesce overtakes sort-merge for `m ≳ 2048` (the dense-collision regime is
+  where probing's free coalesce-on-insert beats paying for a full sort).
+
+## Reproduce
+
+```bash
+# 1. Run the bench (writes target/criterion/branch-coalesce-*/...).
+#    Default sweep tops out at m = 2^20; bump with PPVM_BRANCH_MAX_EXP.
+cargo bench -p ppvm-tableau --bench branch-coalesce-scaling
+# PPVM_BRANCH_MAX_EXP=22 cargo bench -p ppvm-tableau --bench branch-coalesce-scaling
+
+# 2. Plot (reads criterion's estimates.json directly — no CSV step).
+uv run --with matplotlib python benchmarks/plot_branch_coalesce.py \
+  --out /tmp/branch_coalesce_scaling.png
+```
+
+- `../crates/ppvm-tableau/benches/branch-coalesce-scaling.rs` — the A/B bench.
+- `plot_branch_coalesce.py` — left panel: time vs `m` (log-log); right panel:
+  sort-merge speedup `t_hash / t_sortmerge` vs `m`, with the crossover line and
+  the "hash wins" band.
diff --git a/benchmarks/plot_branch_coalesce.py b/benchmarks/plot_branch_coalesce.py
new file mode 100644
index 000000000..ebec6ab4b
--- /dev/null
+++ b/benchmarks/plot_branch_coalesce.py
@@ -0,0 +1,136 @@
+# SPDX-FileCopyrightText: 2026 The PPVM Authors
+# SPDX-License-Identifier: Apache-2.0
+"""Plot the branch-coalesce scaling study: sort-merge (PR #154) vs the
+pre-#154 FxHashMap coalesce, as a function of the branch count ``m``.
+
+Reads the JSON criterion writes for the ``branch-coalesce-scaling`` bench
+(``crates/ppvm-tableau/benches/branch-coalesce-scaling.rs``) — no CSV step.
+Run the bench first, then this script:
+
+    cargo bench -p ppvm-tableau --bench branch-coalesce-scaling
+    uv run --with matplotlib python benchmarks/plot_branch_coalesce.py \
+        --out /tmp/branch_coalesce_scaling.png
+
+The left panel is the raw time-vs-``m`` scaling (log-log); the right panel is
+the sort-merge speedup ``t_hashmap / t_sortmerge`` (>1 → sort-merge wins,
+<1 → hash wins), with the crossover line and the "hash wins" band shaded. The
+two regimes (doubling = fresh branching, merge = collision-heavy) tell opposite
+stories, which is the whole point.
+"""
+
+import argparse
+import glob
+import json
+import os
+
+import matplotlib
+
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+
+CRITERION_DIR = "target/criterion"  # relative, so resolved against the current directory
+
+# regime (suffix of the criterion group directory) -> (label, color, marker)
+REGIMES = {
+    "doubling": ("doubling (fresh bit · output 2m)", "#5b3fb8", "o"),
+    "merge": ("merge (closed set · output m)", "#d08700", "s"),
+}
+# algo (sub-directory of the group directory) -> (label, color, linestyle)
+ALGOS = {
+    "hashmap": ("FxHashMap coalesce (pre-#154)", "#c0392b", "--"),
+    "sortmerge": ("sort-merge (this PR)", "#5b3fb8", "-"),
+}
+# Packed-path cutoff in the sort-merge: m > 65535 drops to the generic fallback.
+PACKED_CUTOFF = 65535  # drawn as a dotted vertical line on both panels
+
+
+def load_series(regime, algo):
+    """Return ([m...], [seconds...]) sorted by m, from criterion JSON."""
+    base = os.path.join(CRITERION_DIR, f"branch-coalesce-{regime}", algo)
+    pts = []
+    for est in glob.glob(os.path.join(base, "*", "new", "estimates.json")):
+        m = int(os.path.basename(os.path.dirname(os.path.dirname(est))))
+        with open(est) as f:
+            ns = json.load(f)["median"]["point_estimate"]
+        pts.append((m, ns * 1e-9))
+    if not pts:
+        raise SystemExit(
+            f"no criterion data under {base} — run the bench first:\n"
+            "  cargo bench -p ppvm-tableau --bench branch-coalesce-scaling"
+        )
+    pts.sort()
+    return [m for m, _ in pts], [s for _, s in pts]
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--out", required=True)
+    args = ap.parse_args()
+
+    fig, (ax_t, ax_s) = plt.subplots(1, 2, figsize=(13, 5.2))
+
+    # ---- left: absolute time vs m (log-log) -----------------------------
+    for regime, (rlabel, rcolor, marker) in REGIMES.items():
+        for algo, (alabel, acolor, ls) in ALGOS.items():
+            m, t = load_series(regime, algo)
+            ax_t.plot(
+                m,
+                t,
+                ls,
+                color=acolor,
+                marker=marker,
+                ms=5,
+                lw=1.7,
+                alpha=0.95 if regime == "doubling" else 0.6,
+                label=f"{algo} · {regime}",
+            )
+    ax_t.set_xscale("log", base=2)
+    ax_t.set_yscale("log")
+    ax_t.set_xlabel("branch count  m  (= 2^k for k T gates)")
+    ax_t.set_ylabel("coalesce time per T gate (s)")
+    ax_t.set_title("Coalesce cost vs branch count", fontsize=11)
+    ax_t.axvline(PACKED_CUTOFF, color="0.5", ls=":", lw=1)
+    ax_t.text(
+        PACKED_CUTOFF, ax_t.get_ylim()[0], "  packed→generic",
+        rotation=90, va="bottom", ha="left", fontsize=7.5, color="0.45",
+    )
+    ax_t.grid(True, which="both", ls=":", lw=0.5, alpha=0.5)
+    ax_t.legend(frameon=False, fontsize=8.5, loc="upper left")
+
+    # ---- right: sort-merge speedup vs m ---------------------------------
+    ax_s.axhline(1.0, color="0.3", lw=1)
+    ymax_band = 4.5
+    ax_s.axhspan(0, 1.0, color="#c0392b", alpha=0.06, lw=0)
+    ax_s.text(
+        0.98, 0.04, "hash wins", transform=ax_s.transAxes,
+        ha="right", va="bottom", fontsize=9, color="#c0392b",
+    )
+    ax_s.text(
+        0.02, 0.96, "sort-merge wins", transform=ax_s.transAxes,
+        ha="left", va="top", fontsize=9, color="#5b3fb8",
+    )
+    for regime, (rlabel, rcolor, marker) in REGIMES.items():
+        m, t_h = load_series(regime, "hashmap")
+        _, t_s = load_series(regime, "sortmerge")
+        speedup = [h / s for h, s in zip(t_h, t_s)]
+        ax_s.plot(m, speedup, "-", color=rcolor, marker=marker, ms=5, lw=1.8, label=rlabel)
+    ax_s.set_xscale("log", base=2)
+    ax_s.set_ylim(0, ymax_band)
+    ax_s.axvline(PACKED_CUTOFF, color="0.5", ls=":", lw=1)
+    ax_s.set_xlabel("branch count  m")
+    ax_s.set_ylabel("sort-merge speedup  (t_hash / t_sortmerge)")
+    ax_s.set_title("Where each coalesce wins", fontsize=11)
+    ax_s.grid(True, which="both", ls=":", lw=0.5, alpha=0.5)
+    ax_s.legend(frameon=False, fontsize=9, loc="upper center")
+
+    fig.suptitle(
+        "ppvm-tableau branch coalesce: sort-merge vs FxHashMap  (80 qubits, u128 index)",
+        fontsize=12.5,
+    )
+    fig.tight_layout(rect=(0, 0, 1, 0.96))
+    fig.savefig(args.out, dpi=150)
+    print(f"wrote {args.out}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/crates/ppvm-tableau/Cargo.toml b/crates/ppvm-tableau/Cargo.toml
index 3a1e38860..72826d5a9 100644
--- a/crates/ppvm-tableau/Cargo.toml
+++ b/crates/ppvm-tableau/Cargo.toml
@@ -66,6 +66,10 @@ harness = false
 name = "measure-all"
 harness = false
 
+[[bench]]
+name = "branch-coalesce-scaling"
+harness = false
+
 [[bench]]
 name = "rot2-apply"
 harness = false
diff --git a/crates/ppvm-tableau/benches/branch-coalesce-scaling.rs b/crates/ppvm-tableau/benches/branch-coalesce-scaling.rs
new file mode 100644
index 000000000..e2703edd1
--- /dev/null
+++ b/crates/ppvm-tableau/benches/branch-coalesce-scaling.rs
@@ -0,0 +1,403 @@
+// SPDX-FileCopyrightText: 2026 The PPVM Authors
+// SPDX-License-Identifier: Apache-2.0
+
+//! Branch-coalesce scaling: **sort-merge vs. hash-map**, head to head.
+//!
+//! PR #154 replaced the `FxHashMap` coalesce in the T-gate hot path
+//! (`GeneralizedTableau::branch_with_coefficients`, `data.rs`) with a
+//! sort-merge, measuring ~10× on `cultivation_d5`. That win was found on
+//! one circuit; this benchmark asks the follow-up question directly:
+//!
+//! > Does the sort-merge advantage **persist as the branch count `m`
+//! > grows**, and is there a regime where the hash coalesce wins again?
+//!
+//! Because PR #154 *deleted* the hash path from the default build (it now
+//! survives only behind the `rayon` feature), there is no way to A/B the
+//! two strategies through the public gate API. So both coalesce routines
+//! are reimplemented here as free functions, faithful to their sources:
+//!
+//! * [`coalesce_sortmerge`] — a verbatim port of the sequential sort-merge
+//!   in `branch_with_coefficients` (both the `u64`-packed fast path and the
+//!   generic `(I, u32)` fallback), specialised to `IndexType = u128`.
+//! * [`coalesce_hashmap`] — the pre-#154 `FxHashMap` coalesce, matching
+//!   `branch_coefficients_seq` in `data.rs`.
+//!
+//! Both consume the **same real input**: a coefficient vector grown to an
+//! exact size by applying H+T gates to a fresh 80-qubit tableau, plus the
+//! genuine decomposition (`compute_decomposition` /
+//! `odd_phase_destabilizer_mask`) of the next T gate. `verify_equivalence`
+//! asserts the two produce identical coefficient sets before any timing, so
+//! a drifted port fails loudly rather than benchmarking a lie.
+//!
+//! ## Mapping T gates ↔ branches
+//!
+//! T gates touch only the coefficient vector, never the tableau, so `k`
+//! branching T gates on distinct qubits produce exactly `m = 2^k` branches
+//! (no truncation here — the threshold is 0). The benchmark therefore
+//! sweeps `m = 2^j` directly; that *is* the "number of T gates" axis. Real
+//! circuits with truncation reach high T-gate counts at a bounded `m`, and
+//! that bounded `m` is exactly what the sweep covers. 40 *untruncated*
+//! branching T gates would be 2^40 ≈ 10^12 branches — out of reach for any
+//! coalesce — so the honest variable is `m`, not the raw T count.
+//!
+//! ## Two collision regimes
+//!
+//! * **doubling** — the benched T gate flips a *fresh* index bit, so every
+//!   branch lands on a new index: output `= 2·m`, zero merges. This is the
+//!   canonical per-T-gate cost in a growing circuit.
+//! * **merge** — the benched T gate flips an index bit the set is already
+//!   closed under, so every branch coalesces onto an existing entry: output
+//!   `= m`, all merges. This is the collision-heavy regime (the flavour of
+//!   the measurement case-a path) and the most likely place for the hash
+//!   coalesce to claw back ground.
+//!
+//! ## Reading the results
+//!
+//! Within each group (`branch-coalesce-doubling`, `branch-coalesce-merge`)
+//! the `hashmap` and `sortmerge` lines are parameterised by `m`. Compare
+//! them at each `m` and watch for a crossover. The `m = 32768 → 65536` step
+//! also straddles the packed-path cutoff (`m ≤ 65535`): above it the
+//! sort-merge drops to its generic `(I, u32)` fallback, so any change in the
+//! gap there is the packing's contribution.
+//!
+//! Run:
+//! ```bash
+//! cargo bench -p ppvm-tableau --bench branch-coalesce-scaling
+//! ```
+//! Push the ceiling (default `m ≤ 2^20`) with e.g. `PPVM_BRANCH_MAX_EXP=22`.
+
+use std::cmp::Ordering;
+use std::hint::black_box;
+use std::time::Duration;
+
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
+use fxhash::FxHashMap;
+use num::complex::{Complex, Complex64};
+use ppvm_pauli_sum::config::fx64hash::Byte8F64;
+use ppvm_tableau::prelude::*;
+
+/// 128-bit index / 2×u64 storage — the 80-qubit regime used elsewhere
+/// (`measure-scaling.rs`, `profile_scaling.rs`).
+type Tab = GeneralizedTableau<Byte8F64<2>, u128>;
+
+const N_QUBITS: usize = 80; // qubit count passed to every `build` call in this file
+
+/// `exp(iπ/8)·cos(π/8)` — the non-branch (`coefficient_factor`) weight a
+/// `T` gate applies. Copied from `gates/tgate.rs`.
+const COS: Complex64 = Complex {
+    re: 0.8535533905932737,
+    im: 0.3535533905932738,
+};
+/// `-i·exp(iπ/8)·sin(π/8)` — the branch (`branch_factor`) weight. From
+/// `gates/tgate.rs`.
+const SIN: Complex64 = Complex {
+    re: 0.14644660940672624,
+    im: -0.3535533905932738,
+};
+
+/// Phase index → unit complex, matching `COMPLEX_PHASE_CONVERSION` in `data.rs`.
+const PHASE: [Complex64; 4] = [
+    Complex { re: 1.0, im: 0.0 }, // 0 → 1
+    Complex { re: 0.0, im: 1.0 }, // 1 → i
+    Complex { re: -1.0, im: 0.0 }, // 2 → -1
+    Complex { re: 0.0, im: -1.0 }, // 3 → -i
+];
+
+/// The decomposition + scaling factors a single branching T gate applies to
+/// every coefficient. Mirrors the arguments threaded through
+/// `branch_with_coefficients`.
+#[derive(Clone, Copy)]
+struct Params {
+    stab_bits: u128, // XORed into an input index to obtain its branch index
+    destab_bits: u128, // ANDed with the index; the popcount feeds the branch phase
+    odd_mask: u128, // picks the bits of `index & stab_bits` whose parity shifts the phase
+    phase_decomp: u8, // added (mod 4) to the per-index phase
+    coefficient_factor: Complex64, // scales the contribution kept at the input index
+    branch_factor: Complex64, // scales the contribution sent to the branch index
+    cutoff_sq: f64, // outputs are kept only if `norm_sqr() > cutoff_sq`
+}
+
+/// Verbatim port of `compute_phase_with_mask_static` (`data.rs`), specialised
+/// to `u128`. Returns a phase index mod 4; both terms are doubled, so it is 0 or 2.
+#[inline]
+fn compute_phase_with_mask(destab_bits: u128, basis: u128, stab_bits: u128, odd_mask: u128) -> u8 {
+    let mut phase = (2 * ((destab_bits & basis).count_ones() as u8)) % 4; // u8 math: fine while popcount ≤ 127
+    let active = basis & stab_bits; // index bits that overlap the stabilizer mask
+    let parity = (active & odd_mask).count_ones() % 2; // 1 if an odd number of masked bits are set
+    phase = (phase + 2 * parity as u8) % 4; // odd parity adds another half turn
+    phase
+}
+
+/// Pre-#154 hash coalesce: one `FxHashMap` probe per branch + non-branch
+/// contribution, then a magnitude-cutoff sweep into the output vector.
+/// Matches `branch_coefficients_seq` (`data.rs`).
+fn coalesce_hashmap(input: &[(Complex64, u128)], p: &Params) -> Vec<(Complex64, u128)> {
+    let mut map: FxHashMap<u128, Complex64> =
+        FxHashMap::with_capacity_and_hasher(2 * input.len(), Default::default()); // room for one new key per entry
+    for &(coeff, idx) in input {
+        let branch_index = idx ^ p.stab_bits; // where the branch contribution lands
+        let bpc = compute_phase_with_mask(p.destab_bits, idx, p.stab_bits, p.odd_mask);
+        let branch_phase = (bpc + p.phase_decomp) % 4; // per-index phase plus the gate's own
+        let pf = PHASE[branch_phase as usize];
+        let branch_coefficient = pf * coeff * p.branch_factor;
+        let nonbranch_coefficient = coeff * p.coefficient_factor;
+        *map.entry(branch_index).or_insert(Complex64::new(0.0, 0.0)) += branch_coefficient; // coalesce on insert
+        *map.entry(idx).or_insert(Complex64::new(0.0, 0.0)) += nonbranch_coefficient;
+    }
+    let mut out = Vec::with_capacity(map.len());
+    for (idx, coeff) in map { // output order follows the map's iteration order
+        if coeff.norm_sqr() > p.cutoff_sq { // drop entries at or below the cutoff
+            out.push((coeff, idx));
+        }
+    }
+    out
+}
+
+/// Verbatim port of the sequential sort-merge in `branch_with_coefficients`
+/// (`data.rs`), specialised to `u128`. Keeps both the `u64`-packed fast path
+/// (`m ≤ 65535` and every branch key below 2^47) and the generic `(u128, u32)`
+/// fallback, so the bench takes whichever path the real code would at a given
+/// `m`. NOTE(review): merges pairwise, so it presumes distinct input indices.
+fn coalesce_sortmerge(input: &[(Complex64, u128)], p: &Params) -> Vec<(Complex64, u128)> {
+    let n = input.len();
+    let cutoff_sq = p.cutoff_sq;
+
+    let mut nb: Vec<(u128, Complex64)> = Vec::with_capacity(n); // (index, non-branch contribution)
+    let mut brv: Vec<Complex64> = Vec::with_capacity(n); // branch contribution of input entry `pos`
+    let mut packed: Vec<u64> = Vec::with_capacity(n); // (branch_index << 16) | pos, fast path only
+    let mut packable = n <= 0xFFFF; // `pos` has to fit in the low 16 bits
+    let mut nb_sorted = true; // cleared on the first index smaller than its predecessor
+    let mut prev: Option<u128> = None;
+
+    for (pos, &(coeff, idx)) in (0_u32..).zip(input) {
+        let branch_index = idx ^ p.stab_bits;
+        let bpc = compute_phase_with_mask(p.destab_bits, idx, p.stab_bits, p.odd_mask);
+        let branch_phase = (bpc + p.phase_decomp) % 4;
+        let pf = PHASE[branch_phase as usize];
+        brv.push(pf * coeff * p.branch_factor);
+        if branch_index < (1u128 << 47) { // key plus a 16-bit pos fits in a u64
+            packed.push(((branch_index as u64) << 16) | (pos as u64));
+        } else {
+            packable = false;
+            packed.push(pos as u64); // placeholder: `packed` is not read once packable is false
+        }
+        nb.push((idx, coeff * p.coefficient_factor));
+        if let Some(pp) = prev
+            && idx < pp
+        {
+            nb_sorted = false;
+        }
+        prev = Some(idx);
+    }
+
+    let mut out: Vec<(Complex64, u128)> = Vec::with_capacity(nb.len() + brv.len());
+    let mut i = 0; // cursor into `nb`, shared by both paths and the tail loop
+
+    if packable {
+        if !nb_sorted {
+            nb.sort_unstable_by_key(|a| a.0);
+        }
+        packed.sort_unstable(); // orders by branch key (high bits), then by pos
+        let mut j = 0;
+        while i < nb.len() && j < packed.len() {
+            let bp = (packed[j] & 0xFFFF) as usize; // input position of this branch entry
+            let bk = (packed[j] >> 16) as u128; // its branch key
+            match nb[i].0.cmp(&bk) {
+                Ordering::Less => {
+                    if nb[i].1.norm_sqr() > cutoff_sq {
+                        out.push((nb[i].1, nb[i].0));
+                    }
+                    i += 1;
+                }
+                Ordering::Greater => {
+                    let v = brv[bp];
+                    if v.norm_sqr() > cutoff_sq {
+                        out.push((v, bk));
+                    }
+                    j += 1;
+                }
+                Ordering::Equal => { // same index on both sides: add the two contributions
+                    let mut sv = nb[i].1;
+                    sv += brv[bp];
+                    if sv.norm_sqr() > cutoff_sq {
+                        out.push((sv, nb[i].0));
+                    }
+                    i += 1;
+                    j += 1;
+                }
+            }
+        }
+        while j < packed.len() { // branch entries left after `nb` is exhausted
+            let bp = (packed[j] & 0xFFFF) as usize;
+            let bk = (packed[j] >> 16) as u128;
+            let v = brv[bp];
+            if v.norm_sqr() > cutoff_sq {
+                out.push((v, bk));
+            }
+            j += 1;
+        }
+    } else {
+        let mut brk: Vec<(u128, u32)> = (0_u32..) // built before `nb` is sorted, so `pp` is the input position
+            .zip(nb.iter())
+            .map(|(pp, &(idx, _))| (idx ^ p.stab_bits, pp))
+            .collect();
+        if !nb_sorted {
+            nb.sort_unstable_by_key(|a| a.0);
+        }
+        brk.sort_unstable_by_key(|a| a.0);
+        let mut j = 0;
+        while i < nb.len() && j < brk.len() {
+            let (bk, bp) = brk[j];
+            match nb[i].0.cmp(&bk) {
+                Ordering::Less => {
+                    if nb[i].1.norm_sqr() > cutoff_sq {
+                        out.push((nb[i].1, nb[i].0));
+                    }
+                    i += 1;
+                }
+                Ordering::Greater => {
+                    let v = brv[bp as usize];
+                    if v.norm_sqr() > cutoff_sq {
+                        out.push((v, bk));
+                    }
+                    j += 1;
+                }
+                Ordering::Equal => { // same index on both sides: add the two contributions
+                    let mut sv = nb[i].1;
+                    sv += brv[bp as usize];
+                    if sv.norm_sqr() > cutoff_sq {
+                        out.push((sv, nb[i].0));
+                    }
+                    i += 1;
+                    j += 1;
+                }
+            }
+        }
+        while j < brk.len() { // branch entries left after `nb` is exhausted
+            let (bk, bp) = brk[j];
+            let v = brv[bp as usize];
+            if v.norm_sqr() > cutoff_sq {
+                out.push((v, bk));
+            }
+            j += 1;
+        }
+    }
+    while i < nb.len() { // non-branch entries left over after either merge
+        if nb[i].1.norm_sqr() > cutoff_sq {
+            out.push((nb[i].1, nb[i].0));
+        }
+        i += 1;
+    }
+    out
+}
+
+/// Grow a real coefficient vector of `2^j` entries and capture the decomposition
+/// of the T gate that would be applied next. `fresh_target` picks the regime:
+/// * `true`  — that T acts on qubit `j`, not branched on yet ⇒ doubling (`2·m` out).
+/// * `false` — that T acts on qubit 0, branched on already   ⇒ all-merge (`m` out).
+fn build(n_qubits: usize, j: usize, fresh_target: bool) -> (Vec<(Complex64, u128)>, Params) {
+    let mut tableau: Tab = GeneralizedTableau::new(n_qubits, 0.0);
+    // H followed by T on each of qubits 0..j; every such T branches ⇒ 2^j entries.
+    (0..j).for_each(|qubit| {
+        tableau.h(qubit);
+        tableau.t(qubit);
+    });
+    // A fresh target only receives its H here: the T itself is what gets timed.
+    let target = if fresh_target {
+        tableau.h(j);
+        j
+    } else {
+        0
+    };
+    let (phase_decomp, stab_bits, destab_bits) = tableau.compute_decomposition(target, Pauli::Z);
+    let odd_mask = tableau.odd_phase_destabilizer_mask();
+    // cutoff_sq = 0.0 keeps every output whose coefficient is not exactly zero.
+    let params = Params {
+        stab_bits,
+        destab_bits,
+        odd_mask,
+        phase_decomp,
+        coefficient_factor: COS,
+        branch_factor: SIN,
+        cutoff_sq: 0.0,
+    };
+    // The tableau is dropped on return; only a copy of its coefficients leaves.
+    (tableau.coefficients.clone(), params)
+}
+
+/// Panic unless `a` and `b` hold the same indices with coefficients equal to within
+/// 1e-9, ignoring order. `label` prefixes every message. Catches a drifted port.
+fn assert_equivalent(label: &str, a: &[(Complex64, u128)], b: &[(Complex64, u128)]) {
+    assert_eq!(a.len(), b.len(), "{label}: output sizes differ");
+    let by_index = |entries: &[(Complex64, u128)]| {
+        let mut sorted = entries.to_vec();
+        sorted.sort_unstable_by_key(|entry| entry.1);
+        sorted
+    };
+    for ((ca, ia), (cb, ib)) in by_index(a).into_iter().zip(by_index(b)) {
+        assert_eq!(ia, ib, "{label}: index mismatch");
+        assert!(
+            (ca - cb).norm() < 1e-9,
+            "{label}: coeff mismatch at index {}: {:?} vs {:?}",
+            ia,
+            ca,
+            cb
+        );
+    }
+}
+
+fn verify_equivalence() {
+    // m = 2^6 is within the packed path's size limit; m = 2^16 forces the fallback.
+    for (j, fresh) in [(6, true), (6, false), (16, true), (16, false)] {
+        let (input, params) = build(N_QUBITS, j, fresh);
+        let (hm, sm) = (coalesce_hashmap(&input, &params), coalesce_sortmerge(&input, &params));
+        assert_equivalent(if fresh { "doubling" } else { "merge" }, &hm, &sm);
+    }
+}
+
+/// Exponents `j` (with `m = 2^j`): a fixed ladder bracketing the packed-path
+/// cutoff (`m ≤ 65535`, j ≤ 15), continued as 22, 24, … when the ceiling is raised.
+fn exponents() -> Vec<usize> {
+    let env = std::env::var("PPVM_BRANCH_MAX_EXP").ok();
+    let max = env.and_then(|s| s.parse::<usize>().ok()).unwrap_or(20);
+    // The ladder stops at 20; extra rungs stay below N_QUBITS (`build` uses qubit `j`).
+    [2usize, 5, 8, 11, 14, 15, 16, 18, 20]
+        .into_iter()
+        .chain((22..=max.min(N_QUBITS - 1)).step_by(2))
+        .filter(|&j| j <= max)
+        .collect()
+}
+
+fn bench_scenario(c: &mut Criterion, group_name: &str, fresh_target: bool) {
+    let mut group = c.benchmark_group(group_name); // one criterion group per collision regime
+    for j in exponents() {
+        let (input, params) = build(N_QUBITS, j, fresh_target); // built once, outside the timed closures
+        let m = input.len() as u64; // actual entry count; also the benchmark parameter
+        // Throughput is per *input* entry (m): ns/element reads cleaner than raw wall time.
+        group.throughput(Throughput::Elements(m));
+
+        group.bench_with_input(BenchmarkId::new("hashmap", m), &m, |b, _| {
+            b.iter(|| black_box(coalesce_hashmap(black_box(&input), &params)));
+        });
+        group.bench_with_input(BenchmarkId::new("sortmerge", m), &m, |b, _| {
+            b.iter(|| black_box(coalesce_sortmerge(black_box(&input), &params))); // same input as above
+        });
+    }
+    group.finish();
+}
+
+fn bench_branch_coalesce(c: &mut Criterion) {
+    verify_equivalence(); // panics before any timing if the two routines disagree
+    bench_scenario(c, "branch-coalesce-doubling", true); // fresh target qubit
+    bench_scenario(c, "branch-coalesce-merge", false); // target qubit already branched on
+}
+
+criterion_group! {
+    name = benches;
+    config = Criterion::default()
+        .warm_up_time(Duration::from_secs(1))
+        .measurement_time(Duration::from_secs(3))
+        .sample_size(30);
+    targets = bench_branch_coalesce
+}
+criterion_main!(benches);