diff --git a/benchmarks/README.md b/benchmarks/README.md index 4fb388156..39dc22a30 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -48,3 +48,42 @@ uv run --with matplotlib python benchmarks/plot_tfim_sweep.py \ - `plot_tfim_sweep.py` — renders the log-y comparison from the two CSVs. [pp]: https://github.com/MSRudolph/PauliPropagation.jl + +# Branch-coalesce scaling: sort-merge vs FxHashMap + +Follow-up study for PR #154, which replaced the `FxHashMap` coalesce in the +T-gate hot path (`GeneralizedTableau::branch_with_coefficients`) with a +sort-merge and measured ~10× on `cultivation_d5`. This harness asks whether +that win **persists as the branch count `m` grows**, and **where the hash +coalesce wins again**. Because #154 deleted the hash path from the default +build, the bench reimplements *both* coalesce routines (faithful ports, asserted +equivalent at start-up) and drives them with identical real inputs at +`m = 2^k` (`k` branching T gates on an 80-qubit, `u128`-indexed tableau). + +Two collision regimes: + +- **doubling** — the next T flips a fresh index bit (output `2m`, zero merges); + the canonical per-T-gate cost. Sort-merge wins throughout and the gap + *widens* with scale (≈3.8× at `m = 2^20`). +- **merge** — the next T flips a bit the set is already closed under (output + `m`, all collisions); the flavour of the measurement case-a path. The hash + coalesce overtakes sort-merge for `m ≳ 2048` (the dense-collision regime is + where probing's free coalesce-on-insert beats paying for a full sort). + +## Reproduce + +```bash +# 1. Run the bench (writes target/criterion/branch-coalesce-*/...). +# Default sweep tops out at m = 2^20; bump with PPVM_BRANCH_MAX_EXP. +cargo bench -p ppvm-tableau --bench branch-coalesce-scaling +# PPVM_BRANCH_MAX_EXP=22 cargo bench -p ppvm-tableau --bench branch-coalesce-scaling + +# 2. Plot (reads criterion's estimates.json directly — no CSV step). +uv run --with matplotlib python benchmarks/plot_branch_coalesce.py \ + --out /tmp/branch_coalesce_scaling.png +``` + +- `../crates/ppvm-tableau/benches/branch-coalesce-scaling.rs` — the A/B bench. +- `plot_branch_coalesce.py` — left panel: time vs `m` (log-log); right panel: + sort-merge speedup `t_hash / t_sortmerge` vs `m`, with the crossover line and + the "hash wins" band. diff --git a/benchmarks/plot_branch_coalesce.py b/benchmarks/plot_branch_coalesce.py new file mode 100644 index 000000000..ebec6ab4b --- /dev/null +++ b/benchmarks/plot_branch_coalesce.py @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: 2026 The PPVM Authors +# SPDX-License-Identifier: Apache-2.0 +"""Plot the branch-coalesce scaling study: sort-merge (PR #154) vs the +pre-#154 FxHashMap coalesce, as a function of the branch count ``m``. + +Reads the JSON criterion writes for the ``branch-coalesce-scaling`` bench +(``crates/ppvm-tableau/benches/branch-coalesce-scaling.rs``) — no CSV step. +Run the bench first, then this script: + + cargo bench -p ppvm-tableau --bench branch-coalesce-scaling + uv run --with matplotlib python benchmarks/plot_branch_coalesce.py \ + --out /tmp/branch_coalesce_scaling.png + +The left panel is the raw time-vs-``m`` scaling (log-log); the right panel is +the sort-merge speedup ``t_hashmap / t_sortmerge`` (>1 → sort-merge wins, +<1 → hash wins), with the crossover line and the "hash wins" band shaded. The +two regimes (doubling = fresh branching, merge = collision-heavy) tell opposite +stories, which is the whole point. +""" + +import argparse +import glob +import json +import os + +import matplotlib + +matplotlib.use("Agg") +import matplotlib.pyplot as plt + +CRITERION_DIR = "target/criterion" + +# regime -> (label, color, marker) +REGIMES = { + "doubling": ("doubling (fresh bit · output 2m)", "#5b3fb8", "o"), + "merge": ("merge (closed set · output m)", "#d08700", "s"), +} +# algo -> (label, color, linestyle) +ALGOS = { + "hashmap": ("FxHashMap coalesce (pre-#154)", "#c0392b", "--"), + "sortmerge": ("sort-merge (this PR)", "#5b3fb8", "-"), +} +# packed-path cutoff in the sort-merge: m > 65535 drops to the generic fallback. +PACKED_CUTOFF = 65535 + + +def load_series(regime, algo): + """Return ([m...], [seconds...]) sorted by m, from criterion JSON.""" + base = os.path.join(CRITERION_DIR, f"branch-coalesce-{regime}", algo) + pts = [] + for est in glob.glob(os.path.join(base, "*", "new", "estimates.json")): + m = int(os.path.basename(os.path.dirname(os.path.dirname(est)))) + with open(est) as f: + ns = json.load(f)["median"]["point_estimate"] + pts.append((m, ns * 1e-9)) + if not pts: + raise SystemExit( + f"no criterion data under {base} — run the bench first:\n" + " cargo bench -p ppvm-tableau --bench branch-coalesce-scaling" + ) + pts.sort() + return [m for m, _ in pts], [s for _, s in pts] + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--out", required=True) + args = ap.parse_args() + + fig, (ax_t, ax_s) = plt.subplots(1, 2, figsize=(13, 5.2)) + + # ---- left: absolute time vs m (log-log) ----------------------------- + for regime, (rlabel, rcolor, marker) in REGIMES.items(): + for algo, (alabel, acolor, ls) in ALGOS.items(): + m, t = load_series(regime, algo) + ax_t.plot( + m, + t, + ls, + color=acolor, + marker=marker, + ms=5, + lw=1.7, + alpha=0.95 if regime == "doubling" else 0.6, + label=f"{algo} · {regime}", + ) + ax_t.set_xscale("log", base=2) + ax_t.set_yscale("log") + ax_t.set_xlabel("branch count m (= 2^k for k T gates)") + ax_t.set_ylabel("coalesce time per T gate (s)") + ax_t.set_title("Coalesce cost vs branch count", fontsize=11) + ax_t.axvline(PACKED_CUTOFF, color="0.5", ls=":", lw=1) + ax_t.text( + PACKED_CUTOFF, ax_t.get_ylim()[0], " packed→generic", + rotation=90, va="bottom", ha="left", fontsize=7.5, color="0.45", + ) + ax_t.grid(True, which="both", ls=":", lw=0.5, alpha=0.5) + ax_t.legend(frameon=False, fontsize=8.5, loc="upper left") + + # ---- right: sort-merge speedup vs m --------------------------------- + ax_s.axhline(1.0, color="0.3", lw=1) + ymax_band = 4.5 + ax_s.axhspan(0, 1.0, color="#c0392b", alpha=0.06, lw=0) + ax_s.text( + 0.98, 0.04, "hash wins", transform=ax_s.transAxes, + ha="right", va="bottom", fontsize=9, color="#c0392b", + ) + ax_s.text( + 0.02, 0.96, "sort-merge wins", transform=ax_s.transAxes, + ha="left", va="top", fontsize=9, color="#5b3fb8", + ) + for regime, (rlabel, rcolor, marker) in REGIMES.items(): + m, t_h = load_series(regime, "hashmap") + _, t_s = load_series(regime, "sortmerge") + speedup = [h / s for h, s in zip(t_h, t_s)] + ax_s.plot(m, speedup, "-", color=rcolor, marker=marker, ms=5, lw=1.8, label=rlabel) + ax_s.set_xscale("log", base=2) + ax_s.set_ylim(0, ymax_band) + ax_s.axvline(PACKED_CUTOFF, color="0.5", ls=":", lw=1) + ax_s.set_xlabel("branch count m") + ax_s.set_ylabel("sort-merge speedup (t_hash / t_sortmerge)") + ax_s.set_title("Where each coalesce wins", fontsize=11) + ax_s.grid(True, which="both", ls=":", lw=0.5, alpha=0.5) + ax_s.legend(frameon=False, fontsize=9, loc="upper center") + + fig.suptitle( + "ppvm-tableau branch coalesce: sort-merge vs FxHashMap (80 qubits, u128 index)", + fontsize=12.5, + ) + fig.tight_layout(rect=(0, 0, 1, 0.96)) + fig.savefig(args.out, dpi=150) + print(f"wrote {args.out}") + + +if __name__ == "__main__": + main() diff --git a/crates/ppvm-tableau/Cargo.toml b/crates/ppvm-tableau/Cargo.toml index 3a1e38860..72826d5a9 100644 --- a/crates/ppvm-tableau/Cargo.toml +++ b/crates/ppvm-tableau/Cargo.toml @@ -66,6 +66,10 @@ harness = false name = "measure-all" harness = false +[[bench]] +name = "branch-coalesce-scaling" +harness = false + [[bench]] name = "rot2-apply" harness = false diff --git a/crates/ppvm-tableau/benches/branch-coalesce-scaling.rs b/crates/ppvm-tableau/benches/branch-coalesce-scaling.rs new file mode 100644 index 000000000..e2703edd1 --- /dev/null +++ b/crates/ppvm-tableau/benches/branch-coalesce-scaling.rs @@ -0,0 +1,403 @@ +// SPDX-FileCopyrightText: 2026 The PPVM Authors +// SPDX-License-Identifier: Apache-2.0 + +//! Branch-coalesce scaling: **sort-merge vs. hash-map**, head to head. +//! +//! PR #154 replaced the `FxHashMap` coalesce in the T-gate hot path +//! (`GeneralizedTableau::branch_with_coefficients`, `data.rs`) with a +//! sort-merge, measuring ~10× on `cultivation_d5`. That win was found on +//! one circuit; this benchmark asks the follow-up question directly: +//! +//! > Does the sort-merge advantage **persist as the branch count `m` +//! > grows**, and is there a regime where the hash coalesce wins again? +//! +//! Because PR #154 *deleted* the hash path from the default build (it now +//! survives only behind the `rayon` feature), there is no way to A/B the +//! two strategies through the public gate API. So both coalesce routines +//! are reimplemented here as free functions, faithful to their sources: +//! +//! * [`coalesce_sortmerge`] — a verbatim port of the sequential sort-merge +//! in `branch_with_coefficients` (both the `u64`-packed fast path and the +//! generic `(I, u32)` fallback), specialised to `IndexType = u128`. +//! * [`coalesce_hashmap`] — the pre-#154 `FxHashMap` coalesce, matching +//! `branch_coefficients_seq` in `data.rs`. +//! +//! Both consume the **same real input**: a coefficient vector grown to an +//! exact size by applying H+T gates to a fresh 80-qubit tableau, plus the +//! genuine decomposition (`compute_decomposition` / +//! `odd_phase_destabilizer_mask`) of the next T gate. `verify_equivalence` +//! asserts the two produce identical coefficient sets before any timing, so +//! a drifted port fails loudly rather than benchmarking a lie. +//! +//! ## Mapping T gates ↔ branches +//! +//! T gates touch only the coefficient vector, never the tableau, so `k` +//! branching T gates on distinct qubits produce exactly `m = 2^k` branches +//! (no truncation here — the threshold is 0). The benchmark therefore +//! sweeps `m = 2^j` directly; that *is* the "number of T gates" axis. Real +//! circuits with truncation reach high T-gate counts at a bounded `m`, and +//! that bounded `m` is exactly what the sweep covers. 40 *untruncated* +//! branching T gates would be 2^40 ≈ 10^12 branches — out of reach for any +//! coalesce — so the honest variable is `m`, not the raw T count. +//! +//! ## Two collision regimes +//! +//! * **doubling** — the benched T gate flips a *fresh* index bit, so every +//! branch lands on a new index: output `= 2·m`, zero merges. This is the +//! canonical per-T-gate cost in a growing circuit. +//! * **merge** — the benched T gate flips an index bit the set is already +//! closed under, so every branch coalesces onto an existing entry: output +//! `= m`, all merges. This is the collision-heavy regime (the flavour of +//! the measurement case-a path) and the most likely place for the hash +//! coalesce to claw back ground. +//! +//! ## Reading the results +//! +//! Within each group (`branch-coalesce-doubling`, `branch-coalesce-merge`) +//! the `hashmap` and `sortmerge` lines are parameterised by `m`. Compare +//! them at each `m` and watch for a crossover. The `m = 32768 → 65536` step +//! also straddles the packed-path cutoff (`m ≤ 65535`): above it the +//! sort-merge drops to its generic `(I, u32)` fallback, so any change in the +//! gap there is the packing's contribution. +//! +//! Run: +//! ```bash +//! cargo bench -p ppvm-tableau --bench branch-coalesce-scaling +//! ``` +//! Push the ceiling (default `m ≤ 2^20`) with e.g. `PPVM_BRANCH_MAX_EXP=22`. + +use std::cmp::Ordering; +use std::hint::black_box; +use std::time::Duration; + +use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main}; +use fxhash::FxHashMap; +use num::complex::{Complex, Complex64}; +use ppvm_pauli_sum::config::fx64hash::Byte8F64; +use ppvm_tableau::prelude::*; + +/// 128-bit index / 2×u64 storage — the 80-qubit regime used elsewhere +/// (`measure-scaling.rs`, `profile_scaling.rs`). +type Tab = GeneralizedTableau, u128>; + +const N_QUBITS: usize = 80; + +/// `exp(iπ/8)·cos(π/8)` — the non-branch (`coefficient_factor`) weight a +/// `T` gate applies. Copied from `gates/tgate.rs`. +const COS: Complex64 = Complex { + re: 0.8535533905932737, + im: 0.3535533905932738, +}; +/// `-i·exp(iπ/8)·sin(π/8)` — the branch (`branch_factor`) weight. From +/// `gates/tgate.rs`. +const SIN: Complex64 = Complex { + re: 0.14644660940672624, + im: -0.3535533905932738, +}; + +/// Phase index → unit complex, matching `COMPLEX_PHASE_CONVERSION` in `data.rs`. +const PHASE: [Complex64; 4] = [ + Complex { re: 1.0, im: 0.0 }, + Complex { re: 0.0, im: 1.0 }, + Complex { re: -1.0, im: 0.0 }, + Complex { re: 0.0, im: -1.0 }, +]; + +/// The decomposition + scaling factors a single branching T gate applies to +/// every coefficient. Mirrors the arguments threaded through +/// `branch_with_coefficients`. +#[derive(Clone, Copy)] +struct Params { + stab_bits: u128, + destab_bits: u128, + odd_mask: u128, + phase_decomp: u8, + coefficient_factor: Complex64, + branch_factor: Complex64, + cutoff_sq: f64, +} + +/// Verbatim port of `compute_phase_with_mask_static` (`data.rs`), specialised +/// to `u128`. +#[inline] +fn compute_phase_with_mask(destab_bits: u128, basis: u128, stab_bits: u128, odd_mask: u128) -> u8 { + let mut phase = (2 * ((destab_bits & basis).count_ones() as u8)) % 4; + let active = basis & stab_bits; + let parity = (active & odd_mask).count_ones() % 2; + phase = (phase + 2 * parity as u8) % 4; + phase +} + +/// Pre-#154 hash coalesce: one `FxHashMap` probe per branch + non-branch +/// contribution, then a magnitude-cutoff sweep into the output vector. +/// Matches `branch_coefficients_seq` (`data.rs`). +fn coalesce_hashmap(input: &[(Complex64, u128)], p: &Params) -> Vec<(Complex64, u128)> { + let mut map: FxHashMap = + FxHashMap::with_capacity_and_hasher(2 * input.len(), Default::default()); + for &(coeff, idx) in input { + let branch_index = idx ^ p.stab_bits; + let bpc = compute_phase_with_mask(p.destab_bits, idx, p.stab_bits, p.odd_mask); + let branch_phase = (bpc + p.phase_decomp) % 4; + let pf = PHASE[branch_phase as usize]; + let branch_coefficient = pf * coeff * p.branch_factor; + let nonbranch_coefficient = coeff * p.coefficient_factor; + *map.entry(branch_index).or_insert(Complex64::new(0.0, 0.0)) += branch_coefficient; + *map.entry(idx).or_insert(Complex64::new(0.0, 0.0)) += nonbranch_coefficient; + } + let mut out = Vec::with_capacity(map.len()); + for (idx, coeff) in map { + if coeff.norm_sqr() > p.cutoff_sq { + out.push((coeff, idx)); + } + } + out +} + +/// Verbatim port of the sequential sort-merge in `branch_with_coefficients` +/// (`data.rs`), specialised to `u128`. Keeps both the `u64`-packed fast path +/// (engaged when `m ≤ 65535` and every branch key fits in 47 bits) and the +/// generic `(u128, u32)` fallback, so the benchmark exercises whichever path +/// the real code would take at a given `m`. +fn coalesce_sortmerge(input: &[(Complex64, u128)], p: &Params) -> Vec<(Complex64, u128)> { + let n = input.len(); + let cutoff_sq = p.cutoff_sq; + + let mut nb: Vec<(u128, Complex64)> = Vec::with_capacity(n); + let mut brv: Vec = Vec::with_capacity(n); + let mut packed: Vec = Vec::with_capacity(n); + let mut packable = n <= 0xFFFF; + let mut nb_sorted = true; + let mut prev: Option = None; + + for (pos, &(coeff, idx)) in (0_u32..).zip(input) { + let branch_index = idx ^ p.stab_bits; + let bpc = compute_phase_with_mask(p.destab_bits, idx, p.stab_bits, p.odd_mask); + let branch_phase = (bpc + p.phase_decomp) % 4; + let pf = PHASE[branch_phase as usize]; + brv.push(pf * coeff * p.branch_factor); + if branch_index < (1u128 << 47) { + packed.push(((branch_index as u64) << 16) | (pos as u64)); + } else { + packable = false; + packed.push(pos as u64); + } + nb.push((idx, coeff * p.coefficient_factor)); + if let Some(pp) = prev + && idx < pp + { + nb_sorted = false; + } + prev = Some(idx); + } + + let mut out: Vec<(Complex64, u128)> = Vec::with_capacity(nb.len() + brv.len()); + let mut i = 0; + + if packable { + if !nb_sorted { + nb.sort_unstable_by_key(|a| a.0); + } + packed.sort_unstable(); + let mut j = 0; + while i < nb.len() && j < packed.len() { + let bp = (packed[j] & 0xFFFF) as usize; + let bk = (packed[j] >> 16) as u128; + match nb[i].0.cmp(&bk) { + Ordering::Less => { + if nb[i].1.norm_sqr() > cutoff_sq { + out.push((nb[i].1, nb[i].0)); + } + i += 1; + } + Ordering::Greater => { + let v = brv[bp]; + if v.norm_sqr() > cutoff_sq { + out.push((v, bk)); + } + j += 1; + } + Ordering::Equal => { + let mut sv = nb[i].1; + sv += brv[bp]; + if sv.norm_sqr() > cutoff_sq { + out.push((sv, nb[i].0)); + } + i += 1; + j += 1; + } + } + } + while j < packed.len() { + let bp = (packed[j] & 0xFFFF) as usize; + let bk = (packed[j] >> 16) as u128; + let v = brv[bp]; + if v.norm_sqr() > cutoff_sq { + out.push((v, bk)); + } + j += 1; + } + } else { + let mut brk: Vec<(u128, u32)> = (0_u32..) + .zip(nb.iter()) + .map(|(pp, &(idx, _))| (idx ^ p.stab_bits, pp)) + .collect(); + if !nb_sorted { + nb.sort_unstable_by_key(|a| a.0); + } + brk.sort_unstable_by_key(|a| a.0); + let mut j = 0; + while i < nb.len() && j < brk.len() { + let (bk, bp) = brk[j]; + match nb[i].0.cmp(&bk) { + Ordering::Less => { + if nb[i].1.norm_sqr() > cutoff_sq { + out.push((nb[i].1, nb[i].0)); + } + i += 1; + } + Ordering::Greater => { + let v = brv[bp as usize]; + if v.norm_sqr() > cutoff_sq { + out.push((v, bk)); + } + j += 1; + } + Ordering::Equal => { + let mut sv = nb[i].1; + sv += brv[bp as usize]; + if sv.norm_sqr() > cutoff_sq { + out.push((sv, nb[i].0)); + } + i += 1; + j += 1; + } + } + } + while j < brk.len() { + let (bk, bp) = brk[j]; + let v = brv[bp as usize]; + if v.norm_sqr() > cutoff_sq { + out.push((v, bk)); + } + j += 1; + } + } + while i < nb.len() { + if nb[i].1.norm_sqr() > cutoff_sq { + out.push((nb[i].1, nb[i].0)); + } + i += 1; + } + out +} + +/// Build a real coefficient vector of size `m = 2^j` plus the decomposition +/// of one more T gate. `fresh_target` selects the collision regime: +/// * `true` — the next T flips a *fresh* qubit bit ⇒ doubling (output `2·m`). +/// * `false` — the next T flips an *existing* bit ⇒ all-merge (output `m`). +fn build(n_qubits: usize, j: usize, fresh_target: bool) -> (Vec<(Complex64, u128)>, Params) { + let mut tab: Tab = GeneralizedTableau::new(n_qubits, 0.0); + // `j` branching T gates on distinct fresh qubits ⇒ exactly 2^j branches. + for i in 0..j { + tab.h(i); + tab.t(i); + } + let target = if fresh_target { + tab.h(j); // a qubit not yet branched on + j + } else { + 0 // qubit 0 is already branched — the set is closed under its bit + }; + let (phase_decomp, stab_bits, destab_bits) = tab.compute_decomposition(target, Pauli::Z); + let odd_mask = tab.odd_phase_destabilizer_mask(); + let input = tab.coefficients.clone(); + ( + input, + Params { + stab_bits, + destab_bits, + odd_mask, + phase_decomp, + coefficient_factor: COS, + branch_factor: SIN, + cutoff_sq: 0.0, + }, + ) +} + +/// Assert the two coalesce routines agree on a real input — same set of +/// indices, same coefficients to 1e-9. Guards against the ports drifting from +/// their sources. +fn assert_equivalent(label: &str, a: &[(Complex64, u128)], b: &[(Complex64, u128)]) { + assert_eq!(a.len(), b.len(), "{label}: output sizes differ"); + let mut a = a.to_vec(); + let mut b = b.to_vec(); + a.sort_unstable_by_key(|x| x.1); + b.sort_unstable_by_key(|x| x.1); + for (x, y) in a.iter().zip(b.iter()) { + assert_eq!(x.1, y.1, "{label}: index mismatch"); + assert!( + (x.0 - y.0).norm() < 1e-9, + "{label}: coeff mismatch at index {}: {:?} vs {:?}", + x.1, + x.0, + y.0 + ); + } +} + +fn verify_equivalence() { + for (regime, fresh) in [("doubling", true), ("merge", false)] { + let (input, params) = build(N_QUBITS, 6, fresh); + let hm = coalesce_hashmap(&input, ¶ms); + let sm = coalesce_sortmerge(&input, ¶ms); + assert_equivalent(regime, &hm, &sm); + } +} + +/// Exponents `j` (with `m = 2^j`) chosen to bracket the packed-path cutoff +/// (`m ≤ 65535`, i.e. j ≤ 15) and span small → large branch counts. +fn exponents() -> Vec { + let max = std::env::var("PPVM_BRANCH_MAX_EXP") + .ok() + .and_then(|s| s.parse::().ok()) + .unwrap_or(20); + [2usize, 5, 8, 11, 14, 15, 16, 18, 20] + .into_iter() + .filter(|&j| j <= max) + .collect() +} + +fn bench_scenario(c: &mut Criterion, group_name: &str, fresh_target: bool) { + let mut group = c.benchmark_group(group_name); + for j in exponents() { + let (input, params) = build(N_QUBITS, j, fresh_target); + let m = input.len() as u64; + // ns/element is the cleaner scaling readout than raw wall time. + group.throughput(Throughput::Elements(m)); + + group.bench_with_input(BenchmarkId::new("hashmap", m), &m, |b, _| { + b.iter(|| black_box(coalesce_hashmap(black_box(&input), ¶ms))); + }); + group.bench_with_input(BenchmarkId::new("sortmerge", m), &m, |b, _| { + b.iter(|| black_box(coalesce_sortmerge(black_box(&input), ¶ms))); + }); + } + group.finish(); +} + +fn bench_branch_coalesce(c: &mut Criterion) { + verify_equivalence(); + bench_scenario(c, "branch-coalesce-doubling", true); + bench_scenario(c, "branch-coalesce-merge", false); +} + +criterion_group! { + name = benches; + config = Criterion::default() + .warm_up_time(Duration::from_secs(1)) + .measurement_time(Duration::from_secs(3)) + .sample_size(30); + targets = bench_branch_coalesce +} +criterion_main!(benches);