From ebdb817b514118bf1d14652cdd5457663806eae4 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Fri, 22 May 2026 22:15:32 +0200
Subject: [PATCH 01/18] Disable per-page ResizeObserver: it caught no real
 reflows (~130ms saved).

---
 docs/lib/paged.browser.js | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/docs/lib/paged.browser.js b/docs/lib/paged.browser.js
index 8d212d9..f681876 100644
--- a/docs/lib/paged.browser.js
+++ b/docs/lib/paged.browser.js
@@ -2486,29 +2486,22 @@
 		}
 
 		addResizeObserver(contents) {
-			let wrapper = this.wrapper;
-			let prevHeight = wrapper.getBoundingClientRect().height;
-			this.ro = new ResizeObserver(entries => {
-
-				if (!this.listening) {
-					return;
-				}
-				requestAnimationFrame(() => {
-					for (let entry of entries) {
-						const cr = entry.contentRect;
-
-						if (cr.height > prevHeight) {
-							this.checkOverflowAfterResize(contents);
-							prevHeight = wrapper.getBoundingClientRect().height;
-						} else if (cr.height < prevHeight) { // TODO: calc line height && (prevHeight - cr.height) >= 22
-							this.checkUnderflowAfterResize(contents);
-							prevHeight = cr.height;
-						}
-					}
-				});
-			});
-
-			this.ro.observe(wrapper);
+			// [PATCH: disable-resize-observer] Intentionally a no-op: the
+			// `contents` parameter is now unused but the signature is kept.
+			// The RO existed to catch post-layout content reflow -- late-
+			// loading fonts, image dimensions resolving after layout, etc.
+			// -- by re-running findBreakToken whenever the wrapper grew or
+			// shrank after renderTo returned. Our pipeline navigates with
+			// `waitUntil: "load"` and uses embedded fonts; nothing resizes
+			// after layout. The `_onOverflow` rescue path (in
+			// Chunker.addPage) only fires while `!chunker.rendered`, and
+			// would emit a console.warn before re-rendering, so a
+			// regression would be loud. Disabling the RO removes a
+			// per-page allocation plus the stream of async findBreakToken
+			// / gBCR calls its callback would otherwise drive after every
+			// page's renderTo. checkUnderflowAfterResize is already gated
+			// by an absent _onUnderflow (see README "Attempt C");
+			// checkOverflowAfterResize was the only live consumer.
 		}
 
 		checkOverflowAfterResize(contents) {

From 78fccf043805b0d41ab9e5f49fabbc4baf290e2e Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Fri, 22 May 2026 22:46:00 +0200
Subject: [PATCH 02/18] Add Chrome trace analysis.

---
 perf/README.md         |   2 +
 perf/analyze-trace.mjs | 163 +++++++++++++++++++++++++++++++++++++++++
 perf/measure.mjs       |  44 +++++++++++
 3 files changed, 209 insertions(+)
 create mode 100644 perf/analyze-trace.mjs

diff --git a/perf/README.md b/perf/README.md
index 135d2de..1385d1f 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -107,6 +107,7 @@ DevTools-compatible trace is a few lines.
 | `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. |
 | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
 | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. |
+| `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
 | `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" |
@@ -164,6 +165,7 @@ run.bat --instrument                      # count + time DOM-accessor calls
 run.bat --time-hooks                      # per-task timing of every chunker/polisher hook
 run.bat --incremental                     # process via incremental update instead of pdf-lib roundtrip
 run.bat --chrome-outline                  # let Chrome emit /Outlines (skip parseOutline + setOutline)
+run.bat --tracing                         # capture a Chrome trace of the render phase
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
diff --git a/perf/analyze-trace.mjs b/perf/analyze-trace.mjs
new file mode 100644
index 0000000..cefc6cf
--- /dev/null
+++ b/perf/analyze-trace.mjs
@@ -0,0 +1,163 @@
+// Bottom-up Chrome trace analyzer.
+//
+// Reads a trace.json produced by `node measure.mjs --tracing` (or any
+// other source -- Chrome's Trace Event Format, in either its "JSON Array"
+// or "JSON Object" flavour, https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU)
+// and prints the top events on the renderer's main thread by self-time,
+// aggregated by event name. Use it to break the cpu profile's (program)
+// frame down into named Blink/V8 work (Layout, UpdateLayoutTree, ParseHTML,
+// V8.CompileCode, V8.RunMicrotasks, etc.).
+//
+// Usage:
+//   node analyze-trace.mjs <path/to/trace.json> [--top N] [--min-pct P]
+//                          [--thread <name>] [--all-threads]
+//
+// Defaults: --top 30, --min-pct 0.1 (hide rows under 0.1% self-time),
+//           thread = CrRendererMain (the V8 / DOM / Blink layout thread).
+//
+// Self-time is computed by walking 'X' (complete) events in ts order on
+// each thread independently, subtracting nested children from each
+// parent's duration. Matches the "Bottom-Up" view in chrome://tracing
+// and DevTools' Performance panel when grouped by event name.
+
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+
+const args = process.argv.slice(2);
+let tracePath = null;
+let topN = 30;
+let minPct = 0.1;
+let threadName = 'CrRendererMain';
+let allThreads = false;
+for (let i = 0; i < args.length; i++) {
+  const a = args[i];
+  if (a === '--top') topN = parseInt(args[++i], 10);
+  else if (a === '--min-pct') minPct = parseFloat(args[++i]);
+  else if (a === '--thread') threadName = args[++i];
+  else if (a === '--all-threads') allThreads = true;
+  else if (!tracePath) tracePath = a;
+}
+if (!tracePath) {
+  console.error('usage: node analyze-trace.mjs <path> [--top N] [--min-pct P] [--thread NAME] [--all-threads]');
+  process.exit(2);
+}
+tracePath = resolve(process.cwd(), tracePath);
+
+const trace = JSON.parse(readFileSync(tracePath, 'utf8'));
+const events = Array.isArray(trace) ? trace : (trace.traceEvents || []);
+
+// Thread/process metadata events declare human-readable names. We use
+// these to identify the main renderer thread (default CrRendererMain).
+const threadNames = new Map(); // key=`${pid}.${tid}` -> name
+const processNames = new Map(); // key=pid -> name
+for (const e of events) {
+  if (e.ph !== 'M') continue;
+  if (e.name === 'thread_name' && e.args && e.args.name) {
+    threadNames.set(`${e.pid}.${e.tid}`, e.args.name);
+  } else if (e.name === 'process_name' && e.args && e.args.name) {
+    processNames.set(e.pid, e.args.name);
+  }
+}
+
+// Bucket non-metadata 'X' events (complete events with dur) by thread.
+// 'B'/'E' pairs are rare in devtools.timeline + v8 categories but we
+// fold them in for robustness: an 'E' closes the most recent open 'B'
+// of the same name on the same thread. pid 0 is a valid process id.
+const byThread = new Map();
+const openBE = new Map(); // key=`${pid}.${tid}.${name}` -> stack of B events
+for (const e of events) {
+  if (!e.ph || e.pid == null) continue;
+  const tk = `${e.pid}.${e.tid}`;
+  if (e.ph === 'X') {
+    if (typeof e.dur !== 'number' || e.dur < 0) continue;
+    if (!byThread.has(tk)) byThread.set(tk, []);
+    byThread.get(tk).push({ ts: e.ts, dur: e.dur, name: e.name, cat: e.cat });
+  } else if (e.ph === 'B') {
+    const k = `${tk}.${e.name}`;
+    if (!openBE.has(k)) openBE.set(k, []);
+    openBE.get(k).push(e);
+  } else if (e.ph === 'E') {
+    const k = `${tk}.${e.name}`;
+    const stack = openBE.get(k);
+    if (stack && stack.length) {
+      const b = stack.pop();
+      const dur = e.ts - b.ts;
+      if (dur >= 0) {
+        if (!byThread.has(tk)) byThread.set(tk, []);
+        byThread.get(tk).push({ ts: b.ts, dur, name: e.name, cat: e.cat || b.cat });
+      }
+    }
+  }
+}
+
+// Pick the thread(s) to report.
+const targetThreads = [];
+for (const [tk, name] of threadNames) {
+  if (allThreads || name === threadName) {
+    targetThreads.push({ tk, name, pid: parseInt(tk.split('.')[0], 10) });
+  }
+}
+if (!targetThreads.length) {
+  console.error(`no thread matched --thread "${threadName}". Threads present:`);
+  for (const [tk, name] of threadNames) console.error(`  ${name}  (${tk})`);
+  console.error('Pass --all-threads to aggregate across every thread, or --thread NAME to pick one.');
+  process.exit(3);
+}
+
+// Per-thread self-time computation via depth-walk over X events.
+// Sort by ts ascending; on tie, longer dur first so a containing
+// event lands on the stack before its child.
+const selfByName = new Map(); // name -> { self_us, cat }
+let totalEvents = 0;
+let traceMinTs = Infinity, traceMaxTs = -Infinity;
+for (const { tk, name: tname } of targetThreads) {
+  const list = byThread.get(tk);
+  if (!list || !list.length) continue;
+  list.sort((a, b) => a.ts - b.ts || b.dur - a.dur);
+  totalEvents += list.length;
+  const stack = [];
+  const flush = (top) => {
+    const self = top.dur - top.childTime;
+    if (self <= 0) return;
+    const cur = selfByName.get(top.name) || { self_us: 0, cat: top.cat || '' };
+    cur.self_us += self;
+    if (!cur.cat && top.cat) cur.cat = top.cat;
+    selfByName.set(top.name, cur);
+  };
+  for (const e of list) {
+    if (e.ts < traceMinTs) traceMinTs = e.ts;
+    if (e.ts + e.dur > traceMaxTs) traceMaxTs = e.ts + e.dur;
+    while (stack.length && stack[stack.length - 1].endTs <= e.ts) {
+      flush(stack.pop());
+    }
+    if (stack.length) stack[stack.length - 1].childTime += e.dur;
+    stack.push({ name: e.name, cat: e.cat, dur: e.dur, endTs: e.ts + e.dur, childTime: 0 });
+  }
+  while (stack.length) flush(stack.pop());
+}
+
+const traceDurUs = (traceMaxTs > traceMinTs) ? (traceMaxTs - traceMinTs) : 0;
+
+const totalSelfUs = [...selfByName.values()].reduce((s, x) => s + x.self_us, 0);
+const rows = [...selfByName.entries()]
+  .map(([name, v]) => ({
+    name,
+    cat: v.cat,
+    self_ms: v.self_us / 1000,
+    pct: 100 * v.self_us / (totalSelfUs || 1),
+  }))
+  .sort((a, b) => b.self_ms - a.self_ms)
+  .filter(r => r.pct >= minPct)
+  .slice(0, topN);
+
+const fmt = (n, w) => n.toFixed(2).padStart(w);
+console.log(`trace:   ${tracePath}`);
+console.log(`events:  ${totalEvents}  thread${allThreads ? 's' : ''}: ${allThreads ? 'all' : threadName}  span: ${(traceDurUs / 1e6).toFixed(2)}s`);
+console.log(`total self: ${(totalSelfUs / 1000).toFixed(2)}ms across ${selfByName.size} distinct event names`);
+console.log(`top ${topN} by self-time (min ${minPct}%):`);
+console.log('');
+console.log('   self_ms   self_%   event  @  category');
+console.log('   -------   ------   ----------------------------------------------');
+for (const r of rows) {
+  console.log(`  ${fmt(r.self_ms, 8)}   ${fmt(r.pct, 5)}%   ${r.name}  @  ${r.cat || '(no cat)'}`);
+}
diff --git a/perf/measure.mjs b/perf/measure.mjs
index c9df9cf..6161ce7 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -23,6 +23,7 @@
 //   node measure.mjs [path/to/book.html] [--out <dir>] [--keep-open]
 //                    [--cpu-profile] [--cpu-sampling <microseconds>]
 //                    [--heap-profile] [--heap-sampling <bytes>]
+//                    [--tracing]
 //                    [--detach-pages] [--instrument] [--time-hooks]
 //                    [--incremental] [--chrome-outline] [--no-timing]
 //                    [--clone-count] [--render-only]
@@ -67,6 +68,15 @@
 // via Performance -> "Load profile..." (or just drag onto the panel).
 // --cpu-sampling sets the sampling interval in microseconds; default
 // 1000 (1 ms). Raise it to keep the profile file smaller on long runs.
+//
+// --tracing wraps the render phase in a Chrome trace via CDP's Tracing
+// domain (page.tracing.start) and writes trace.json to the results
+// folder. The trace categorises Blink work as Layout / UpdateLayoutTree
+// / ParseHTML / Composite / FunctionCall / V8.* etc -- the named buckets
+// hiding inside the cpu profile's (program) frame. Load the file in
+// chrome://tracing or perfetto.dev, or run analyze-trace.mjs against it
+// for a top-N self-time table grouped by event name. Composable with
+// --cpu-profile; uses an independent CDP domain.
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
@@ -99,6 +109,7 @@ let chromeOutline = false;
 let noTiming = false;
 let cloneCount = false;
 let renderOnly = false;
+let tracing = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--out') outArg = args[++i];
@@ -115,6 +126,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--no-timing') noTiming = true;
   else if (a === '--clone-count') cloneCount = true;
   else if (a === '--render-only') renderOnly = true;
+  else if (a === '--tracing') tracing = true;
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -247,6 +259,29 @@ try {
     await cdp.send('HeapProfiler.startSampling', { samplingInterval: heapSampling });
     console.log(`[harness] heap profile: sampling every ${heapSampling} bytes`);
   }
+  let tracePath = null;
+  if (tracing) {
+    // Independent of Profiler / HeapProfiler -- different CDP domain.
+    // Categories chosen to crack open the cpu profile's (program) bucket:
+    // devtools.timeline gives Layout / RecalcStyles / ParseHTML /
+    // FunctionCall / EvaluateScript; disabled-by-default-devtools.timeline
+    // adds UpdateLayoutTree / InvalidateLayout / ScheduleStyleRecalc /
+    // HitTest; blink covers internal Blink events; v8 + v8.execute cover
+    // V8.GC* / V8.CompileCode / V8.RunMicrotasks / V8.Execute.
+    tracePath = join(outDir, 'trace.json');
+    await page.tracing.start({
+      path: tracePath,
+      screenshots: false,
+      categories: [
+        'devtools.timeline',
+        'disabled-by-default-devtools.timeline',
+        'blink',
+        'v8',
+        'v8.execute',
+      ],
+    });
+    console.log(`[harness] tracing: ${tracePath}`);
+  }
 
   const tRenderStart = Date.now();
   await page.evaluate(async () => {
@@ -266,6 +301,15 @@ try {
   const tRenderEnd = Date.now();
   const renderMs = tRenderEnd - tRenderStart;
 
+  if (tracing) {
+    await page.tracing.stop();
+    try {
+      const { statSync } = await import('node:fs');
+      const sz = statSync(tracePath).size;
+      console.log(`[harness] tracing: ${tracePath} (${(sz / 1024 / 1024).toFixed(1)} MB)`);
+    } catch { /* size reporting is best-effort */ }
+  }
+
   let profilePath = null;
   let heapProfilePath = null;
   if (cdp) {

From 1c665b2458672b80a729a1650dd8ccf68a2d2442 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Fri, 22 May 2026 23:50:57 +0200
Subject: [PATCH 03/18] Compress book.html; layer :post_render hooks by role.

Extend html-compress to mark book-combined as compress-eligible
so book.html collapses inter-element whitespace at Jekyll time
instead of paged.js's WhiteSpaceFilter doing ~37k DOM mutations
at render time.

Reorder :pages, :post_render and :documents, :post_render hooks
into a three-tier convention so adding compress to book.html
composes correctly with the other plugins:

  :high   mutators  (book-href-rewrite)
  :normal compress  (html-compress)
  :low    readers   (pdfify capture, offlinify per-page rewrite)

Without the layering, book-href-rewrite's landing-heading strip
ran after compress, leaving adjacent single-space runs that no
downstream pass collapsed. The 3-tier ordering makes "compress
is the last cleanup pass among mutators" and "readers see final
compressed bytes" hold by construction.

Verified: 0 outside-pre multi-whitespace runs in the regenerated
book.html (was 37,087 without compress). Branch-counting the
WhiteSpaceFilter post-fix shows DOM mutations drop from ~37k to 0.
Ruby-prof A/B confirms the priority shuffle is CPU-invariant; the
only attributable cost is one extra compress! call (~480 ms once
per Jekyll build, ~300-500 ms saved per paged.js render).

Adds analyze-trace.mjs --children mode used to localise this
during the investigation. Full writeup in perf/README.md and
docs/_plugins/html-compress.md.
---
 WIP.md                             |   2 +-
 docs/_plugins/book-href-rewrite.rb |   6 +-
 docs/_plugins/html-compress.md     |  34 +++-
 docs/_plugins/html-compress.rb     |  43 ++++-
 docs/_plugins/offlinify.rb         |   6 +-
 docs/_plugins/pdfify.rb            |   4 +-
 perf/README.md                     | 301 +++++++++++++++++++++++++++++
 perf/analyze-trace.mjs             |  95 +++++++--
 8 files changed, 463 insertions(+), 28 deletions(-)

diff --git a/WIP.md b/WIP.md
index 3bd2998..ef72dfa 100644
--- a/WIP.md
+++ b/WIP.md
@@ -433,7 +433,7 @@ From `docs/`:
 - `check.bat` — link check (offline Lychee against `_site/`).
 - `book.bat` — renders the PDF from `_site-pdf/book.html` via `pagedjs-cli` into `_pdf/book.pdf`. Run `build.bat` first to populate `_site-pdf/`.
 
-The HTML whitespace compression that wraps every page's render chain is handled by `_plugins/html-compress.rb` rather than the just-the-docs theme's `vendor/compress.html` Liquid layout — see [_plugins/html-compress.md](docs/_plugins/html-compress.md) for the full writeup. The Liquid layout's per-page cost in the profile was ~2.4s of Liquid filter dispatch (a `split: " " | join: " "` over the outside-of-`<pre>` content, lowering to a per-page Array allocation of every whitespace-delimited token across 837 pages — millions of small `String` objects). The layout is short-circuited via `compress_html.ignore.envs: all` in `_config.yml`; it then outputs a bare `{{ content }}` and the plugin takes over at `:pages, :post_render` / `:documents, :post_render` with `priority :high`, doing the same pre-block-protected whitespace collapse via `content.split(PRE_BLOCK_RE).each { |s| s.split(" ").join(" ") }` in C-implemented Ruby. The `priority :high` annotation places this hook before offlinify and pdfify (both `:normal`) so they see the compressed bytes. Pages whose layout chain doesn't reach `vendor/compress` are gated out via a `:site, :pre_render` precompute that walks `site.layouts[name].data["layout"]` for every layout key and marks the entire compress-reaching chain (default → table_wrappers → vendor/compress) -- jekyll-redirect-from stubs, the SCSS-derived CSS pages, `assets/js/zzzz-search-data.json`, and `book.html` (which uses the minimal `book-combined` layout that has no parent) all stay un-gated and pass through verbatim, matching exactly what the Liquid layout would have processed. Output is byte-identical to the layout-based version: a recursive `diff -rq` of `_site/` against a vendor/compress.html baseline reports zero differences across all ~840 HTML pages, 290 redirect stubs, every CSS / JSON / SVG / image asset. 
The plugin's correctness depended on two non-obvious details that broke an earlier cut -- the layout-chain walk has to compare against the layout *key* (`"vendor/compress"`) rather than `layout.name` (which carries the `.html` extension), and the per-segment `split(" ").join(" ")` strips trailing whitespace that the Liquid layout's *template* re-adds via its trailing-newline source character, so the plugin captures `content.end_with?("\n")` before the split and re-appends a `\n` after the join. Both regressions surfaced as nonzero `diff -rq` counts during development and are flagged in the plugin's header comment and [_plugins/html-compress.md](docs/_plugins/html-compress.md).
+The HTML whitespace compression that wraps every page's render chain is handled by `_plugins/html-compress.rb` rather than the just-the-docs theme's `vendor/compress.html` Liquid layout — see [_plugins/html-compress.md](docs/_plugins/html-compress.md) for the full writeup. The Liquid layout's per-page cost in the profile was ~2.4s of Liquid filter dispatch (a `split: " " | join: " "` over the outside-of-`<pre>` content, lowering to a per-page Array allocation of every whitespace-delimited token across 837 pages — millions of small `String` objects). The layout is short-circuited via `compress_html.ignore.envs: all` in `_config.yml`; it then outputs a bare `{{ content }}` and the plugin takes over at `:pages, :post_render` / `:documents, :post_render` with `priority :normal`, doing the same pre-block-protected whitespace collapse via `content.split(PRE_BLOCK_RE).each { |s| s.split(" ").join(" ") }` in C-implemented Ruby. The `:normal` priority is the *middle* tier of a three-level convention across the site's `:post_render` hooks: mutators (`book-href-rewrite`) run at `:high`, this cleanup pass at `:normal`, readers (`pdfify`, `offlinify`) at `:low`. The invariant "compress runs after every mutator and before every reader" therefore holds by construction; no downstream plugin has to be whitespace-aware. Pages whose layout chain doesn't reach `vendor/compress` are gated out via a `:site, :pre_render` precompute that walks `site.layouts[name].data["layout"]` for every layout key and marks the entire compress-reaching chain (default → table_wrappers → vendor/compress) -- jekyll-redirect-from stubs, the SCSS-derived CSS pages, and `assets/js/zzzz-search-data.json` all stay un-gated and pass through verbatim. 
`book.html` (which uses the minimal `book-combined` layout that has no parent) is *also* outside that chain but is explicitly added to the compress-eligible set at the end of the precompute, so the same whitespace collapse runs on it -- saves paged.js's render-time `WhiteSpaceFilter` ~37k DOM mutations (~28k `textContent` overwrites + ~9k `removeChild` calls) at the cost of ~480 ms once per Jekyll build. Output is byte-identical to the layout-based version: a recursive `diff -rq` of `_site/` against a vendor/compress.html baseline reports zero differences across all ~840 HTML pages, 290 redirect stubs, every CSS / JSON / SVG / image asset. The plugin's correctness depended on two non-obvious details that broke an earlier cut -- the layout-chain walk has to compare against the layout *key* (`"vendor/compress"`) rather than `layout.name` (which carries the `.html` extension), and the per-segment `split(" ").join(" ")` strips trailing whitespace that the Liquid layout's *template* re-adds via its trailing-newline source character, so the plugin captures `content.end_with?("\n")` before the split and re-appends a `\n` after the join. Both regressions surfaced as nonzero `diff -rq` counts during development and are flagged in the plugin's header comment and [_plugins/html-compress.md](docs/_plugins/html-compress.md).
 
 ### Profiling the build
 
diff --git a/docs/_plugins/book-href-rewrite.rb b/docs/_plugins/book-href-rewrite.rb
index add4b3e..28acdc2 100644
--- a/docs/_plugins/book-href-rewrite.rb
+++ b/docs/_plugins/book-href-rewrite.rb
@@ -372,7 +372,11 @@ def self.process(page)
   end
 end
 
-Jekyll::Hooks.register :pages, :post_render do |page|
+# :high so this MUTATOR runs before html-compress (priority :normal).
+# Otherwise the landing-heading strip leaves a double-space run that
+# no downstream pass cleans up. See html-compress.rb's priority
+# convention comment for the full layering.
+Jekyll::Hooks.register :pages, :post_render, priority: :high do |page|
   next unless page.path == "book.html"
   BookHrefRewrite.process(page)
 end
diff --git a/docs/_plugins/html-compress.md b/docs/_plugins/html-compress.md
index 114e388..7430862 100644
--- a/docs/_plugins/html-compress.md
+++ b/docs/_plugins/html-compress.md
@@ -1,6 +1,6 @@
 # HtmlCompress
 
-`_plugins/html-compress.rb` runs the HTML whitespace compression that wraps every page's render chain — the same job just-the-docs's vendor/compress.html Liquid layout was doing, but in Ruby instead of Liquid filters. Output is byte-identical to the layout-based version (verified by recursive diff of every file in `_site/` against a vendor/compress.html baseline). The Liquid layout is short-circuited to a `{{ content }}` passthrough via `compress_html.ignore.envs: all` in `_config.yml`; the plugin then runs at `:pages, :post_render` / `:documents, :post_render` with `priority :high`, so the compressed bytes are what offlinify and Jekyll's writer see.
+`_plugins/html-compress.rb` runs the HTML whitespace compression that wraps every page's render chain — the same job just-the-docs's vendor/compress.html Liquid layout was doing, but in Ruby instead of Liquid filters. Output is byte-identical to the layout-based version for the 837 vendor/compress-reaching pages (verified by recursive diff of every file in `_site/` against a vendor/compress.html baseline). The Liquid layout is short-circuited to a `{{ content }}` passthrough via `compress_html.ignore.envs: all` in `_config.yml`; the plugin then runs at `:pages, :post_render` / `:documents, :post_render` with `priority :normal` as the *cleanup* step in a three-tier `:high` → `:normal` → `:low` ordering (mutators → compress → readers — see [Hook priority convention](#hook-priority-convention) below). It also picks up one page the original layout didn't process, `book.html`, via an explicit `book-combined` addition to the compress-eligible set — see [book.html inclusion](#bookhtml-inclusion).
 
 This file sits in `_plugins/` for the same reasons as `offlinify.md` and `pdfify.md`: it lives next to the code it documents, and Jekyll's `_plugins/` folder is plugin-only territory, so this Markdown never gets rendered into the public site.
 
@@ -74,7 +74,7 @@ page.md   (layout: default)
         └── vendor/compress.html (no layout)
 ```
 
-Pages that don't use any of these layouts — jekyll-redirect-from stubs, the SCSS-derived CSS pages, `assets/js/zzzz-search-data.json`, `book.html` (which uses the minimal `book-combined` layout that has no parent) — were left untouched by the layout. The plugin has to match that gating, otherwise it would compress files that compress.html doesn't, breaking byte-identity.
+Pages that don't use any of these layouts — jekyll-redirect-from stubs, the SCSS-derived CSS pages, `assets/js/zzzz-search-data.json` — were left untouched by the layout. The plugin has to match that gating, otherwise it would compress files that compress.html doesn't, breaking byte-identity. `book.html` (which uses the minimal `book-combined` layout that has no parent) was originally in this list, but is now explicitly added to the compress-eligible set — see [book.html inclusion](#bookhtml-inclusion).
 
 The gate is precomputed once at `:site, :pre_render`:
 
@@ -114,20 +114,42 @@ Jekyll::Hooks.register :site, :pre_render do |site|
   HtmlCompress.precompute_compress_layouts!(site)
 end
 
-Jekyll::Hooks.register :pages, :post_render, priority: :high do |page|
+Jekyll::Hooks.register :pages, :post_render, priority: :normal do |page|
   next unless page.output.is_a?(String)
   next unless HtmlCompress.compress?(page)
   HtmlCompress.compress!(page.output)
 end
 
-Jekyll::Hooks.register :documents, :post_render, priority: :high do |doc|
+Jekyll::Hooks.register :documents, :post_render, priority: :normal do |doc|
   next unless doc.output.is_a?(String)
   next unless HtmlCompress.compress?(doc)
   HtmlCompress.compress!(doc.output)
 end
 ```
 
-The `priority: :high` is what places the plugin *before* `offlinify.rb` and `pdfify.rb` in the per-page render-hook order — both of those use the default `:normal` priority and rely on reading the final compressed `page.output`. Jekyll runs `:post_render` hooks in descending priority, so `:high` (30) fires before `:normal` (20). Without the priority annotation the order would be insertion-order across all `.rb` files in `_plugins/`, which is not a stable contract.
+## Hook priority convention
+
+The `priority: :normal` is the middle tier of a three-level ordering for `:pages, :post_render` and `:documents, :post_render` hooks across the plugin set. Jekyll runs hooks in descending priority (`:high` (30) → `:normal` (20) → `:low` (10)), and the three tiers carry distinct roles:
+
+| Tier | Role | Plugins |
+| --- | --- | --- |
+| `:high` (30) | **Mutators.** Modify `page.output` so the final bytes reflect this pass. | `book-href-rewrite` (chapter href rewrites + landing-heading strip on `book.html`). |
+| `:normal` (20) | **Compress.** The cleanup pass. Sandwiched between mutators and readers so any whitespace runs left behind by a mutator's `gsub` get collapsed before any reader captures the bytes. | `html-compress` (this plugin). |
+| `:low` (10) | **Readers.** Snapshot or consume `page.output` after the cleanup pass. | `pdfify` (captures `book.html` for the PDF pipeline), `offlinify` (per-page href / src rewrites + write to `_site-offline/`). |
+
+The layering was originally implicit: the plugin sat at `:high` next to no other priority-annotated `:post_render` hooks. That worked until `book-href-rewrite` joined the set at default `:normal`. Its landing-heading strip ran *after* compress, removing `<h2>` blocks but leaving the (already-collapsed) single-space runs on either side adjacent — producing literal `>  <` blobs in three chapter openings that paged.js's WhiteSpaceFilter then had to handle at render time. Promoting `book-href-rewrite` to `:high` and demoting compress to `:normal` makes the invariant "compress is the last cleanup step among mutators" hold by construction; demoting the readers to `:low` makes "readers see the final compressed output" hold by construction. Future plugins choose their tier by their role and the ordering composes automatically.
+
+The full priority story is documented as a comment block above the `Jekyll::Hooks.register` calls in [`html-compress.rb`](html-compress.rb); each of the other three affected plugins (`book-href-rewrite`, `pdfify`, `offlinify`) carries a short note pointing back to that block.
+
+## book.html inclusion
+
+The layout-chain walk above only marks layouts that reach `vendor/compress`. `book.html` uses the minimal `book-combined` layout, which has no parent, so the walk never reaches it and the page was originally skipped (matching the layout's behaviour). After investigation of paged.js's per-render `WhiteSpaceFilter` work (see [`perf/README.md`](../../perf/README.md)) showed it doing ~37k DOM mutations at render time to handle whitespace text nodes that *would* have been collapsed if the page had been compressed at Jekyll build time, the precompute was extended to mark `book-combined` explicitly:
+
+```ruby
+@compress_layouts << "book-combined" if site.layouts.key?("book-combined")
+```
+
+at the end of `precompute_compress_layouts!`. Output: `book.html` now passes through `compress!` once per build (~480 ms of additional `String#split` work on the ~5.5 MB document), saving roughly the same wall-clock at paged.js render time (~28k `textContent` overwrites + ~9k `removeChild` calls eliminated). Net is approximately wall-clock-neutral for full builds, and a small net win for workflows that decouple the two steps — a Jekyll build with `also_build_pdf: false` followed by one or more separate PDF renders — because the compress cost is paid once per Jekyll build while the render saving is realised on every PDF render; decoupling the two is the structural improvement.
 
 ## Verification
 
@@ -157,6 +179,6 @@ In source order in [`html-compress.rb`](html-compress.rb):
 
 - `precompute_compress_layouts!(site)` — `:site, :pre_render` entry. Walks every layout chain via `data["layout"]`, marks each layout on the path as compress-ending the moment the walk hits `vendor/compress`. Idempotent; the resulting `@compress_layouts` set persists across builds in `jekyll serve` and gets rebuilt fresh each `:pre_render`.
 
-- `compress?(page)` — gate check. Returns `true` when the page's `data["layout"]` is in `@compress_layouts`. Pages without a layout (jekyll-redirect-from stubs, SCSS-derived CSS, JSON-via-page-rendering, `book.html` via `book-combined`) return `false` and skip the compression entirely.
+- `compress?(page)` — gate check. Returns `true` when the page's `data["layout"]` is in `@compress_layouts`. Pages without a layout (jekyll-redirect-from stubs, SCSS-derived CSS, JSON-via-page-rendering) return `false` and skip the compression entirely. `book.html` (which uses `book-combined`, a minimal layout with no parent) used to land here too; it is now explicitly added to the set by `precompute_compress_layouts!` — see [book.html inclusion](#bookhtml-inclusion).
 
 - `compress!(content)` — the actual compression, in place. Captures the trailing-newline state, splits by `PRE_BLOCK_RE` with the capture group so pre bodies are preserved in the result array, runs `split(" ").join(" ")` on every outside-of-pre segment, joins, restores the trailing newline if needed, then mutates the input string via `String#replace`. The `replace` is what lets us hand back the same string object the caller passed in — Jekyll's writer reads `page.output` after `:post_render`, so in-place mutation is the cheapest way to update what gets written.
diff --git a/docs/_plugins/html-compress.rb b/docs/_plugins/html-compress.rb
index 7603a58..ddbb194 100644
--- a/docs/_plugins/html-compress.rb
+++ b/docs/_plugins/html-compress.rb
@@ -72,6 +72,12 @@ def self.precompute_compress_layouts!(site)
         cur_name = cur ? cur.data["layout"] : nil
       end
     end
+    # book-combined is a minimal layout with no parent, so the walk
+    # above doesn't reach it. Compressing its only consumer (book.html)
+    # at Jekyll time saves paged.js's WhiteSpaceFilter ~37k DOM
+    # mutations and ~300-400 ms once per render -- see perf/README.md,
+    # section: The "actionable finding" that wasn't: WhiteSpaceFilter.
+    @compress_layouts << "book-combined" if site.layouts.key?("book-combined")
   end
 
   # True when `page` (or document) uses a layout chain ending in
@@ -117,16 +123,43 @@ def self.compress!(content)
   HtmlCompress.precompute_compress_layouts!(site)
 end
 
-# Run before offlinify (default :normal priority) so the offline-tree
-# rewrites see the compressed page.output, and before Jekyll's
-# `:site, :post_write` writes _site/ for the same reason.
-Jekyll::Hooks.register :pages, :post_render, priority: :high do |page|
+# Priority convention for :pages, :post_render hooks in this site:
+#
+#   :high   = MUTATORS. Plugins that modify page.output. Run first so
+#             their mutations are visible to compress and downstream
+#             readers. Examples: book-href-rewrite (landing heading
+#             strip + in-book href rewrites).
+#
+#   :normal = COMPRESS. This plugin. The cleanup pass, sandwiched
+#             between mutators and readers so any whitespace runs left
+#             behind by a mutator's gsub get collapsed before anyone
+#             reads the final bytes.
+#
+#   :low    = READERS. Plugins that snapshot or consume page.output
+#             after all mutations and the compress pass. Run last so
+#             they see final output. Examples: pdfify (captures
+#             book.html for the PDF pipeline), offlinify (rewrites
+#             root-absolute hrefs and writes to _site-offline/).
+#
+# Without this layering, a mutator running after compress leaves
+# adjacent whitespace runs that no downstream pass collapses; a
+# reader running before compress captures uncompressed bytes. Both
+# regressions surfaced when book-href-rewrite (default :normal) ran
+# after html-compress (originally :high) -- its 3 landing-heading
+# strips left double-space artifacts that paged.js's WhiteSpaceFilter
+# had to handle at render time.
+#
+# Offlinify also runs at :site, :post_write (a later phase entirely),
+# where it always sees the final compressed bytes regardless of
+# per-page priority. The :low designation here governs its per-page
+# capture hook specifically.
+Jekyll::Hooks.register :pages, :post_render, priority: :normal do |page|
   next unless page.output.is_a?(String)
   next unless HtmlCompress.compress?(page)
   HtmlCompress.compress!(page.output)
 end
 
-Jekyll::Hooks.register :documents, :post_render, priority: :high do |doc|
+Jekyll::Hooks.register :documents, :post_render, priority: :normal do |doc|
   next unless doc.output.is_a?(String)
   next unless HtmlCompress.compress?(doc)
   HtmlCompress.compress!(doc.output)
diff --git a/docs/_plugins/offlinify.rb b/docs/_plugins/offlinify.rb
index ab5032e..782038f 100644
--- a/docs/_plugins/offlinify.rb
+++ b/docs/_plugins/offlinify.rb
@@ -1443,11 +1443,13 @@ def self.decode(path)
   Offlinify.setup(site)
 end
 
-Jekyll::Hooks.register :pages, :post_render do |page|
+# :low so these READERS see page.output after html-compress (:normal)
+# has run. See html-compress.rb's priority convention.
+Jekyll::Hooks.register :pages, :post_render, priority: :low do |page|
   Offlinify.process_page(page)
 end
 
-Jekyll::Hooks.register :documents, :post_render do |doc|
+Jekyll::Hooks.register :documents, :post_render, priority: :low do |doc|
   Offlinify.process_page(doc)
 end
 
diff --git a/docs/_plugins/pdfify.rb b/docs/_plugins/pdfify.rb
index 49eebea..f3feabb 100644
--- a/docs/_plugins/pdfify.rb
+++ b/docs/_plugins/pdfify.rb
@@ -287,7 +287,9 @@ def self.copy_file(src, dst)
   Pdfify.setup(site)
 end
 
-Jekyll::Hooks.register :pages, :post_render do |page|
+# :low so this READER captures page.output after html-compress
+# (:normal) has run. See html-compress.rb's priority convention.
+Jekyll::Hooks.register :pages, :post_render, priority: :low do |page|
   Pdfify.maybe_capture(page)
 end
 
diff --git a/perf/README.md b/perf/README.md
index 1385d1f..518605e 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -4121,3 +4121,304 @@ those operations didn't have a Blink layout-tree
 mutation step downstream. Mutations are where the cost
 that *looks* like JS allocation actually lives in this
 codebase.
+
+## Cracking `(program)` open with a Blink-category trace
+
+The cpu profile's `(program)` row sat at ~2.2 s (23 %) of
+render and resisted attribution -- `find-callers.mjs` puts
+it directly under `(root)`, the V8 sampler's structural
+floor for "isolate is on-CPU but no JS frame on top." To
+see *what* native code was running there, the harness gained
+a `--tracing` flag and a companion `analyze-trace.mjs`.
+
+The flag wraps the render phase in `page.tracing.start()`
+with Blink-relevant categories (`devtools.timeline`,
+`disabled-by-default-devtools.timeline`, `blink`, `v8`,
+`v8.execute`) and writes `trace.json` to the results
+folder. `analyze-trace.mjs` walks the trace's complete-phase
+events on `CrRendererMain`, computes self-time per event
+name via a nested-event stack walk (same shape as
+`analyze-profile.mjs` for cpuprofiles), and prints a
+top-N table. A `--children <name>` mode breaks any
+parent event into its direct callees, mirroring
+`find-callees.mjs`.
+
+### What's on the main thread
+
+Top events by self-time on a fresh `--detach-pages
+--no-timing --render-only --tracing` run, 1651-page book,
+9.07 s render:
+
+| event                                    | self_ms | self_% |
+| ---------------------------------------- | ------- | ------ |
+| `RunMicrotasks`                          | 3039.42 | 33.5 % |
+| `LocalFrameView::performLayout`          | 1800.31 | 19.9 % |
+| `Document::recalcStyle`                  | 1785.55 | 19.7 % |
+| `InlineNode::ShapeTextIncludingFirstLine`|  526.64 |  5.8 % |
+| `Document::rebuildLayoutTree`            |  484.88 |  5.4 % |
+| `FunctionCall`                           |  285.89 |  3.2 % |
+| `v8.callFunction`                        |  251.48 |  2.8 % |
+| `Blink.CompositingInputs.UpdateTime`     |  130.77 |  1.4 % |
+| `Blink.PrePaint.UpdateTime`              |  118.90 |  1.3 % |
+| `Document::updateStyle`                  |  101.65 |  1.1 % |
+| ... 189 smaller events ...               |         |        |
+
+Mapping these onto the cpu profile's labels:
+
+| cpu profile row | trace decomposition |
+| --- | --- |
+| `getBoundingClientRect` self 3.7 s | `performLayout` 1.8 s + `recalcStyle` 1.8 s -- the layout flush gBCR triggers, which the cpu profile lumps under the native frame. |
+| `removeChild` self 1.6 s | `rebuildLayoutTree` 0.5 s + portions of `recalcStyle` / `performLayout` -- each removeChild dirties style and layout. |
+| `(program)` self 2.2 s | `RunMicrotasks` 3.0 s mostly. The cpu profile attributes a chunk of this to neighbour rows; what's left under `(program)` is the V8 runtime plumbing that has no JS frame on top. |
+| `(garbage collector)` 100 ms | Sum of `V8.GC_*` events ≈ 135 ms. |
+
+So `(program)` is essentially **the V8 runtime inside a
+microtask continuation**. The natural follow-up is "which
+microtask, and what's it doing?"
+
+### Inside `RunMicrotasks`
+
+`--children RunMicrotasks` shows the parent fired only
+**15 times** across the whole render, totalling 7.14 s:
+
+```
+parent: RunMicrotasks  hits: 15  total: 7142.49ms  self: 3039.42ms (42.6%)
+
+   total_ms  total_%     hits   child
+   --------  -------   ------   --------------------------------
+   3442.01   48.19%    39437   Document::UpdateStyleAndLayout
+   3039.42   42.55%       15   (self / unattributed)
+    547.98    7.67%   181106   v8.callFunction
+     50.99    0.71%      892   Blink.Style.UpdateTime
+     34.88    0.49%      205   V8.StackGuard
+     17.05    0.24%        6   MinorGC
+```
+
+Listing the 15 events by duration:
+
+```
+rm[0]   70.89 ms   -- one early-render burst (the parser)
+rm[1..3]  < 1 ms  -- empty-trigger settle ticks
+rm[4]  7071.14 ms  -- THE render loop
+rm[5..14]  < 1 ms each  -- post-render cleanup
+```
+
+**One event accounts for 99.0 % of the parent total.**
+rm[4] envelops essentially the whole render. V8 batches
+the ~6 `await` boundaries inside `Chunker.flow()`
+(beforeParsed / filter / afterParsed / loadFonts /
+render / afterRendered) -- all of which Phase 1 of the
+async cleanup turned into `await undefined` fast-paths --
+into a single drained microtask continuation. There is
+**no per-page microtask cost**. The async stripping did
+its job.
+
+### The 181,106 `v8.callFunction` callbacks
+
+The first thing that looked like a smoking gun --
+"181k dispatches sounds per-page-shaped" -- turned out
+to be **one DOM walk**. Aggregating FunctionCall events
+by `args.data.functionName + lineNumber`:
+
+```
+hits      dur_ms   functionName:line
+181041    296.54   (anon):32455  (paged.browser.js)
+     2      0.25   request.onload:27495
+```
+
+paged.browser.js:32455 is `WhiteSpaceFilter.filter`'s
+TreeWalker callback:
+
+```js
+filterTree(content, (node) => {
+    return this.filterEmpty(node);
+}, NodeFilter.SHOW_TEXT);
+```
+
+The walker visits every text node in the parsed
+document and calls the lambda. For our 5.5 MB book
+that's 181,041 invocations, all clustered in the first
+685 ms of rm[4]. The same `(node) => this.filterEmpty(...)`
+arrow is allocated once but called from C++→JS 181k times,
+so V8 emits a `v8.callFunction` event for each invocation.
+
+These aren't 181k microtasks. They're 181k synchronous
+TreeWalker callbacks nested inside the one big
+continuation. The "callbacks per page" framing was a
+mirage produced by dividing 181k by page count.
+
+### What's actually in `(program)`'s 2.2 s
+
+Triangulating the trace and cpu profile:
+
+- **~1.7 s** is V8 dispatch glue for the 181k filter
+  walk callbacks + remaining native→JS transitions
+  inside the continuation. V8 charges this to
+  `RunMicrotasks` self in the trace; the cpu profile
+  splits it between `(program)` and rows like `v8.callFunction`.
+- **~0.3 s** is V8 IC / inline-cache miss handling on
+  the per-page hot path. Each polymorphic call site
+  pays a stub-call indirection that lands in `(program)`.
+- **~0.1 s** is Blink microtask checkpoint code -- the
+  auto-style-and-layout pass that fires whenever a
+  microtask drains. The `Document::UpdateStyleAndLayout`
+  events under `RunMicrotasks` (3.44 s) attribute the
+  work *itself* to named Blink rows; the C++ glue
+  bracketing each call lands in `(program)`.
+- The remainder is V8 scheduler bookkeeping, microtask
+  queue drain machinery, and small unnamed natives.
+
+None of this is a *per-page* cost. Reducing further
+would require either (a) eliminating the filter walk,
+or (b) reducing the per-page hot path's native→JS
+transition count -- which is dominated by gBCR-driven
+layout flushes that we've already pushed against
+unsuccessfully in earlier sections (Attempts B, D from
+the "createBreakToken dedup" investigation).
+
+### The "actionable finding" that wasn't: WhiteSpaceFilter
+
+The whitespace filter walk costs **~685 ms once per
+render** -- 296 ms inside the JS callback bodies plus
+~390 ms in TreeWalker dispatch overhead. The initial
+read was "this is doing nothing useful for compressed
+HTML, short-circuit it." Wrong on both counts.
+
+Branch-counting the filter via a one-shot probe (count
+every branch in `filterEmpty`, dump to the harness
+console):
+
+```
+total:        181,106  every text node visited
+  length === 0:       0
+  length === 1:  38,685  (21.4%)  collapsed inter-element spaces
+  length > 1, !ignorable: 101,930  (56.3%)  real content -- hot path
+  length > 1, ignorable:  40,491  (22.4%)  whitespace-only, body runs
+    inside <pre>:        3,408   no-op (REJECT)
+    middle position:    27,901   textContent = " " (mutated)
+    left edge:           5,405   removeChild (accepted)
+    right edge:          3,777   removeChild (accepted)
+    orphan:                  0
+```
+
+**22.4 % of calls entered the body** and 37,083 actual
+DOM mutations happened: 9,182 nodes removed +
+27,901 nodes overwritten to single spaces. Far from
+zero.
+
+The premise was based on a misreading of html-compress:
+the plugin does collapse inter-element whitespace, but
+the `:site, :pre_render` gate that picks which pages it
+processes explicitly excludes `book.html` (which uses
+the minimal `book-combined` layout that doesn't reach
+`vendor/compress`; same README's html-compress section
+calls this out). Source indentation is preserved in
+the PDF input, so paged.js sees the raw multi-char
+whitespace text nodes. The filter is load-bearing --
+its mutations are what subsequent chunker walkers
+rely on to skip whitespace cheaply.
+
+The 0.83 % of calls that exceeded 4 µs in the trace's
+dur histogram came from this body running; the
+histogram undercounted body entries because the
+short branch (`closest("pre")` → REJECT) takes only
+~2-3 µs, indistinguishable from the hot path in the
+0-4 µs buckets. Branch counters were needed to reveal
+the true split.
+
+There's still optimisation headroom (the per-call
+TreeWalker dispatch is ~3 µs, of which only ~1.5 µs is
+the body), but it requires changing the algorithm
+rather than skipping it: e.g. a hand-rolled JS recursion
+that avoids the C++→JS transition per node, or
+folding WhiteSpaceFilter + CommentsFilter + ScriptsFilter
+into a single TreeWalker pass with `SHOW_TEXT | SHOW_COMMENT`
+and a dispatcher. Net saving probably ~300-400 ms once
+per render; not investigated.
+
+The methodology lesson: a histogram of per-call dur
+**cannot** distinguish a fast body branch from the hot
+path -- both take 2-3 µs per call on V8. Branch
+instrumentation is the only way to count what each
+call actually did. The histogram suggested "0.8 %
+body entries"; reality was 22.4 %.
+
+### And we did fix it, on the Jekyll side
+
+The premise that motivated the original "actionable
+finding" -- that book.html should already be
+whitespace-collapsed when paged.js sees it -- was true
+in spirit, just wrong about whether it was being done.
+The fix landed in two parts:
+
+1. **Extend `html-compress.rb` to book.html.** The
+   layout-chain precompute now explicitly adds
+   `book-combined` to `@compress_layouts` at the end of
+   `precompute_compress_layouts!`. book.html therefore
+   passes through `compress!` once per build (~480 ms
+   of `String#split` work on the ~5.5 MB document), and
+   paged.js sees a document with inter-element
+   whitespace already collapsed to single spaces.
+
+2. **Reorder hook priorities** so that adding compress
+   to book.html composes cleanly with the other
+   `:pages, :post_render` plugins. The original
+   `:high`-priority compress ran *before*
+   `book-href-rewrite` -- whose landing-heading strip
+   removed `<h2>` blocks from three chapter openings,
+   leaving the (already-collapsed) single spaces on
+   either side adjacent and producing literal `>  <`
+   blobs. The fix is a three-tier convention: mutators
+   at `:high` (run first), compress at `:normal` (the
+   cleanup), readers at `:low` (snapshot final bytes).
+   See `_plugins/html-compress.md` for the full table.
+
+Verified: 0 outside-pre multi-whitespace runs in the
+regenerated book.html (was 3 with the
+landing-heading-strip artifacts; was 37,087 without
+compress at all). Branch-counting the WhiteSpaceFilter
+after the fix shows body entries drop from ~40 k to
+the 3,408 in-pre cases that the filter is structurally
+required to visit (and immediately REJECTs via
+`closest("pre")`). DOM mutations drop from ~37 k to 0.
+PDF output is byte-equivalent within timestamp drift.
+
+Net wall-clock is approximately neutral on full builds
+(~480 ms added to Jekyll, ~300-500 ms saved at paged.js
+render time), and a small win for incremental Jekyll
+workflows that skip the PDF (`also_build_pdf: false`):
+the compress cost is paid once per Jekyll build, the
+render saving is paid every PDF build, and decoupling
+the two is the structural improvement.
+
+A ruby-prof A/B (post-change vs pre-change with a
+single stashed-changes revert) confirmed that the only
+attributable Jekyll-side cost is exactly one extra
+`compress!` invocation (837 → 838) and its downstream
+`String#split` calls (+819 from book.html's non-pre
+segments). No plugin's call count or self-time changed
+beyond the noise floor; the priority shuffle is
+CPU-invariant for everything except the new compress
+pass on book.html.
+
+### What the trace doesn't change
+
+Nothing about the cpu profile's bottom-up table is
+wrong; the trace just resolves what `(program)` masked.
+After this exercise, the menu of remaining levers is
+unchanged:
+
+- `pageRanges` sharding for the generate phase (biggest
+  untried knob, generate is now the larger phase).
+- WhiteSpaceFilter algorithmic restructuring (~0.3 s,
+  render) -- not a short-circuit, since the filter does
+  real work; would need a hand-rolled traversal that
+  avoids the per-node C++→JS dispatch.
+- Everything else lives below the noise floor.
+
+Render-stage optimization headroom is exhausted. The
+cpu profile's `(program)` row isn't a structural smell
+or a missed microtask -- it's the fixed cost of V8
+running the JavaScript we already have, accounted for
+honestly by the trace and accounted for opaquely by
+the JS sampler.
diff --git a/perf/analyze-trace.mjs b/perf/analyze-trace.mjs
index cefc6cf..9e1496b 100644
--- a/perf/analyze-trace.mjs
+++ b/perf/analyze-trace.mjs
@@ -11,6 +11,7 @@
 // Usage:
 //   node analyze-trace.mjs <path/to/trace.json> [--top N] [--min-pct P]
 //                          [--thread <name>] [--all-threads]
+//                          [--children <event-name>]
 //
 // Defaults: --top 30, --min-pct 0.1 (hide rows under 0.1% self-time),
 //           thread = CrRendererMain (the V8 / DOM / Blink layout thread).
@@ -19,6 +20,14 @@
 // each thread independently, subtracting nested children from each
 // parent's duration. Matches the "Bottom-Up" view in chrome://tracing
 // and DevTools' Performance panel when grouped by event name.
+//
+// --children <name> switches the report from "top events by self-time"
+// to "what runs directly inside <name>?". For every X-event whose name
+// matches, aggregate the total time of each direct child by child name
+// (this is total-time / inclusive cost from the parent's POV, the same
+// shape as find-callees.mjs for cpuprofiles). Plus a synthetic
+// "(self / unattributed)" row capturing parent dur minus the sum of
+// direct children -- i.e. work attributed to the parent frame itself.
 
 import { readFileSync } from 'node:fs';
 import { resolve } from 'node:path';
@@ -29,16 +38,18 @@ let topN = 30;
 let minPct = 0.1;
 let threadName = 'CrRendererMain';
 let allThreads = false;
+let childrenOf = null;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--top') topN = parseInt(args[++i], 10);
   else if (a === '--min-pct') minPct = parseFloat(args[++i]);
   else if (a === '--thread') threadName = args[++i];
   else if (a === '--all-threads') allThreads = true;
+  else if (a === '--children') childrenOf = args[++i];
   else if (!tracePath) tracePath = a;
 }
 if (!tracePath) {
-  console.error('usage: node analyze-trace.mjs <path> [--top N] [--min-pct P] [--thread NAME] [--all-threads]');
+  console.error('usage: node analyze-trace.mjs <path> [--top N] [--min-pct P] [--thread NAME] [--all-threads] [--children NAME]');
   process.exit(2);
 }
 tracePath = resolve(process.cwd(), tracePath);
@@ -104,13 +115,23 @@ if (!targetThreads.length) {
   process.exit(3);
 }
 
-// Per-thread self-time computation via depth-walk over X events.
-// Sort by ts ascending; on tie, longer dur first so a containing
-// event lands on the stack before its child.
+// Per-thread depth-walk over X events. Sort by ts ascending; on tie,
+// longer dur first so a containing event lands on the stack before its
+// child.
+//
+// Two output modes:
+//   default        --> top-N events by self-time (bottom-up view).
+//   --children X   --> direct callees of every X-event named X,
+//                      aggregated by child name + a (self) row.
 const selfByName = new Map(); // name -> { self_us, cat }
+const childrenAcc = childrenOf ? new Map() : null; // child name -> { total_us, cat, hits }
+let childrenParentTotal_us = 0;
+let childrenParentSelf_us = 0;
+let childrenParentHits = 0;
+let childrenParentCat = '';
 let totalEvents = 0;
 let traceMinTs = Infinity, traceMaxTs = -Infinity;
-for (const { tk, name: tname } of targetThreads) {
+for (const { tk } of targetThreads) {
   const list = byThread.get(tk);
   if (!list || !list.length) continue;
   list.sort((a, b) => a.ts - b.ts || b.dur - a.dur);
@@ -118,11 +139,18 @@ for (const { tk, name: tname } of targetThreads) {
   const stack = [];
   const flush = (top) => {
     const self = top.dur - top.childTime;
-    if (self <= 0) return;
-    const cur = selfByName.get(top.name) || { self_us: 0, cat: top.cat || '' };
-    cur.self_us += self;
-    if (!cur.cat && top.cat) cur.cat = top.cat;
-    selfByName.set(top.name, cur);
+    if (self > 0) {
+      const cur = selfByName.get(top.name) || { self_us: 0, cat: top.cat || '' };
+      cur.self_us += self;
+      if (!cur.cat && top.cat) cur.cat = top.cat;
+      selfByName.set(top.name, cur);
+    }
+    if (childrenOf && top.name === childrenOf) {
+      childrenParentTotal_us += top.dur;
+      childrenParentSelf_us += Math.max(0, top.dur - top.childTime);
+      childrenParentHits += 1;
+      if (!childrenParentCat && top.cat) childrenParentCat = top.cat;
+    }
   };
   for (const e of list) {
     if (e.ts < traceMinTs) traceMinTs = e.ts;
@@ -130,7 +158,15 @@ for (const { tk, name: tname } of targetThreads) {
     while (stack.length && stack[stack.length - 1].endTs <= e.ts) {
       flush(stack.pop());
     }
-    if (stack.length) stack[stack.length - 1].childTime += e.dur;
+    const parent = stack.length ? stack[stack.length - 1] : null;
+    if (parent) parent.childTime += e.dur;
+    if (childrenOf && parent && parent.name === childrenOf) {
+      const cur = childrenAcc.get(e.name) || { total_us: 0, cat: e.cat || '', hits: 0 };
+      cur.total_us += e.dur;
+      cur.hits += 1;
+      if (!cur.cat && e.cat) cur.cat = e.cat;
+      childrenAcc.set(e.name, cur);
+    }
     stack.push({ name: e.name, cat: e.cat, dur: e.dur, endTs: e.ts + e.dur, childTime: 0 });
   }
   while (stack.length) flush(stack.pop());
@@ -138,6 +174,42 @@ for (const { tk, name: tname } of targetThreads) {
 
 const traceDurUs = (traceMaxTs > traceMinTs) ? (traceMaxTs - traceMinTs) : 0;
 
+const fmt = (n, w) => n.toFixed(2).padStart(w);
+
+if (childrenOf) {
+  if (!childrenParentHits) {
+    console.error(`no X events named "${childrenOf}" found on ${allThreads ? 'any thread' : threadName}`);
+    process.exit(3);
+  }
+  const rows = [...childrenAcc.entries()]
+    .map(([name, v]) => ({
+      name,
+      cat: v.cat,
+      hits: v.hits,
+      total_ms: v.total_us / 1000,
+      pct: 100 * v.total_us / childrenParentTotal_us,
+    }));
+  rows.push({
+    name: '(self / unattributed)',
+    cat: childrenParentCat,
+    hits: childrenParentHits,
+    total_ms: childrenParentSelf_us / 1000,
+    pct: 100 * childrenParentSelf_us / childrenParentTotal_us,
+  });
+  rows.sort((a, b) => b.total_ms - a.total_ms);
+  console.log(`trace:   ${tracePath}`);
+  console.log(`thread${allThreads ? 's' : ''}: ${allThreads ? 'all' : threadName}  span: ${(traceDurUs / 1e6).toFixed(2)}s`);
+  console.log(`parent: ${childrenOf}  hits: ${childrenParentHits}  total: ${(childrenParentTotal_us / 1000).toFixed(2)}ms  self: ${(childrenParentSelf_us / 1000).toFixed(2)}ms (${(100*childrenParentSelf_us/childrenParentTotal_us).toFixed(1)}%)`);
+  console.log(`direct children, top ${topN} by total time (min ${minPct}% of parent total):`);
+  console.log('');
+  console.log('   total_ms  total_%     hits   child  @  category');
+  console.log('   --------  -------   ------   --------------------------------------');
+  for (const r of rows.filter(r => r.pct >= minPct).slice(0, topN)) {
+    console.log(`  ${fmt(r.total_ms, 8)}   ${fmt(r.pct, 5)}%   ${String(r.hits).padStart(6)}   ${r.name}  @  ${r.cat || '(no cat)'}`);
+  }
+  process.exit(0);
+}
+
 const totalSelfUs = [...selfByName.values()].reduce((s, x) => s + x.self_us, 0);
 const rows = [...selfByName.entries()]
   .map(([name, v]) => ({
@@ -150,7 +222,6 @@ const rows = [...selfByName.entries()]
   .filter(r => r.pct >= minPct)
   .slice(0, topN);
 
-const fmt = (n, w) => n.toFixed(2).padStart(w);
 console.log(`trace:   ${tracePath}`);
 console.log(`events:  ${totalEvents}  thread${allThreads ? 's' : ''}: ${allThreads ? 'all' : threadName}  span: ${(traceDurUs / 1e6).toFixed(2)}s`);
 console.log(`total self: ${(totalSelfUs / 1000).toFixed(2)}ms across ${selfByName.size} distinct event names`);

From 272d47b7acb5bfcabb97564fe2ae89c325338e97 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 00:21:21 +0200
Subject: [PATCH 04/18] Disable WhiteSpaceFilter by default; opt-in via
 PagedConfig.

A 3+3 paired cpu-profile A/B (perf/ab-aggregate.mjs) showed the
filter's 181k TreeWalker callbacks cost ~600 ms of CPU on every
render even when html-compress has already collapsed inter-element
whitespace at Jekyll time. ~125 ms is direct (filterTree/filterEmpty
self); the rest is indirect -- gBCR, recalcStyle, performLayout
and UpdateStyleAndLayout all run ~14% cheaper per call when V8's
IC and Blink scheduler aren't being churned by 181k C++->JS
dispatches. The cost is small per call but compounds because the
walk lives inside the same microtask continuation as the per-page
render loop.

Earlier wall-clock A/B (3+3, 8.78s vs 8.53s) had attributed the
delta to noise; that was wrong. Per-row aggregation across paired
cpu profiles shows the filterTree row at 88 ms (sd 14) vs 2 ms (sd
1) -- a 6 sigma shift -- and the downstream gBCR row at -338 ms
mean, consistent with the trace's -574 ms drop on
Document::UpdateStyleAndLayout total.

The fix: gate the TreeWalker invocation behind
window.PagedConfig.runWhitespaceFilter (default undefined = off).
Our pipeline never sets the flag because html-compress already
does the work; documents that need the cleanup can opt back in.

Also adds perf/ab-aggregate.mjs (per-row mean+SD aggregator across
6 paired cpu profiles) and a long writeup in perf/README.md with
the methodology, the corrected understanding of why the filter has
cost (not flush migration -- it does no layout-flushing work; it's
V8 IC pressure + Blink scheduler overhead), and lessons about when
to trust wall-clock vs aggregated cpu-profile rows.
---
 docs/lib/paged.browser.js |  19 ++++
 perf/README.md            | 194 ++++++++++++++++++++++++++++++++++++--
 perf/ab-aggregate.mjs     |  96 +++++++++++++++++++
 3 files changed, 302 insertions(+), 7 deletions(-)
 create mode 100644 perf/ab-aggregate.mjs

diff --git a/docs/lib/paged.browser.js b/docs/lib/paged.browser.js
index f681876..e058715 100644
--- a/docs/lib/paged.browser.js
+++ b/docs/lib/paged.browser.js
@@ -32445,12 +32445,31 @@
 		TargetText
 	];
 
+	// [PATCH: whitespace-filter-opt-in] Default off because our Jekyll
+	// pipeline runs html-compress on `book.html` (see
+	// _plugins/html-compress.rb's three-tier hook ordering:
+	// book-combined is in the compress-eligible set), so
+	// inter-element whitespace is already collapsed by the time
+	// paged.js sees the document. The filter
+	// on the 1651-page book) and -- post-compression -- find essentially
+	// nothing to mutate. A paired cpu-profile A/B (3+3 runs, see
+	// perf/README.md) showed the no-op walk still costs ~600 ms of CPU
+	// per render: ~125 ms direct (filterTree / filterEmpty self) plus
+	// ~480 ms indirect (gBCR + downstream Blink layout / style work that
+	// runs cheaper when V8's IC + Blink scheduler aren't being churned
+	// by 181 k C++->JS callback dispatches). The cost is small per call
+	// but compounds because the walk lives inside the same microtask
+	// continuation as the per-page render loop. Set
+	// `window.PagedConfig.runWhitespaceFilter = true` before
+	// PagedPolyfill.preview() if processing a document whose source
+	// HTML wasn't compressed at build time.
 	class WhiteSpaceFilter extends Handler {
 		constructor(chunker, polisher, caller) {
 			super(chunker, polisher, caller);
 		}
 
 		filter(content) {
+			if (!(typeof window !== "undefined" && window.PagedConfig && window.PagedConfig.runWhitespaceFilter)) return;
 
 			filterTree(content, (node) => {
 				return this.filterEmpty(node);
diff --git a/perf/README.md b/perf/README.md
index 518605e..2f8e17b 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -108,6 +108,7 @@ DevTools-compatible trace is a few lines.
 | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
 | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. |
 | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). |
+| `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See the "Disabling the filter outright" section in this README for the methodology. |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
 | `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" |
@@ -4410,15 +4411,194 @@ unchanged:
 
 - `pageRanges` sharding for the generate phase (biggest
   untried knob, generate is now the larger phase).
-- WhiteSpaceFilter algorithmic restructuring (~0.3 s,
-  render) -- not a short-circuit, since the filter does
-  real work; would need a hand-rolled traversal that
-  avoids the per-node C++→JS dispatch.
+- WhiteSpaceFilter -- the trace and a follow-up cpu-
+  profile A/B (see next section) eventually showed this
+  *is* skippable for our pipeline once html-compress has
+  done the work at Jekyll time. Worth ~600 ms / 6 %.
 - Everything else lives below the noise floor.
 
-Render-stage optimization headroom is exhausted. The
-cpu profile's `(program)` row isn't a structural smell
-or a missed microtask -- it's the fixed cost of V8
+The cpu profile's `(program)` row isn't a structural
+smell or a missed microtask -- it's the fixed cost of V8
 running the JavaScript we already have, accounted for
 honestly by the trace and accounted for opaquely by
 the JS sampler.
+
+## Disabling the filter outright: paired cpu-profile A/B
+
+The "actionable finding that wasn't" + "and we did fix
+it, on the Jekyll side" pair above closed with two
+conclusions:
+
+1. WhiteSpaceFilter does real work on book.html
+   (37k DOM mutations pre-compression, 0 post-).
+2. Post-compression the filter is essentially a no-op
+   visit over 181k text nodes, and skipping it doesn't
+   save measurable wall-clock -- a 3+3 wall-clock A/B
+   showed 8.78 s avg with filter vs 8.53 s without, well
+   inside the 1.17 s within-variant noise band.
+
+Conclusion (1) is correct. Conclusion (2) was wrong --
+specifically the "no measurable saving" claim and the
+flush-migration explanation I attached to the ~+180 ms
+gBCR move that appeared in a single-run profile pair.
+
+A reader pointed out the flush-migration reasoning was
+incoherent: `WhiteSpaceFilter.filter` runs *once* in
+`Chunker.flow()` *before* any page is created. The body
+of `filterEmpty` reads `textContent`, walks parents via
+`closest("pre")`, and walks siblings -- none of which
+read layout-flushing properties (`gBCR`, `offsetTop`,
+computed style, etc.). There is no flush for migration
+to migrate from. Whatever the +180 ms gBCR move in the
+single-run pair was, it wasn't "the filter's flush load
+deferring to the next gBCR." It was single-run noise on
+a 38 % row -- which has a much wider noise band than
+the README's "50-150 ms for sub-1 % rows" methodology
+note covers.
+
+### The proper A/B
+
+Three filter-on (A) and three filter-off (B) cpu-profile
+runs, interleaved A1 B1 A2 B2 A3 B3 so system-load
+variance hits both sides equally. The probe is a one-line
+`return;` at the top of `WhiteSpaceFilter.filter` --
+skip the TreeWalker entirely. Toggle is a single edit
+between runs. Both states are otherwise identical
+(post-compression book.html, current bundle).
+
+Per-run totals from
+[`perf/ab-aggregate.mjs`](ab-aggregate.mjs):
+
+| run | total CPU |
+| --- | --- |
+| A1 (filter ON)  | 11,120 ms |
+| A2 (filter ON)  | 10,270 ms |
+| A3 (filter ON)  |  9,727 ms |
+| **A mean**      | **10,372 ms** |
+| B1 (filter OFF) |  9,744 ms |
+| B2 (filter OFF) | 10,189 ms |
+| B3 (filter OFF) |  9,180 ms |
+| **B mean**      |  **9,705 ms** |
+| **Δ (B - A)**   |   **-668 ms (-6.4 %)** |
+
+The within-group ranges are ~1.4 s (A) and ~1.0 s (B),
+so the -668 ms total-CPU delta sits at roughly 1 σ of
+within-variant spread. By itself, that's a soft signal.
+
+But per-row breakdown is tighter:
+
+| row | A mean ± sd | B mean ± sd | Δ |
+| --- | --- | --- | --- |
+| `getBoundingClientRect`         | 4128 ± 309 | 3791 ± 163 | **-338 ms** |
+| `(program)`                     | 2243 ± 56  | 2328 ± 173 | +85 ms (noisy) |
+| `removeChild`                   | 1619 ± 63  | 1564 ± 43  | -55 ms |
+| `afterPageLayout` @ paged.js    |  150 ± 26  |  119 ± 17  | -32 ms |
+| **`filterTree` self**           | **88 ± 14** |  **2 ± 1** | **-86 ms** |
+| `(garbage collector)`           |  103 ± 6   |   92 ± 4   | -11 ms |
+| `handleAlignment`               |   70 ± 5   |   56 ± 7   | -14 ms |
+| `create` (`Page.create`)        |   66 ± 7   |   50 ± 4   | -15 ms |
+| `sortDisplayedSelectors`        |   60 ± 10  |   46 ± 1   | -14 ms |
+| **`filterEmpty` self**          | **37 ± 2** |    **0**   | **-37 ms** |
+
+Direct attribution (the filter rows that vanish in B):
+
+- `filterTree` self: -86 ms
+- `filterEmpty` self: -37 ms
+- **Total: ~123 ms**
+
+Indirect attribution (rows that shrink in B despite
+unchanged call counts -- see the trace data above
+where Document::UpdateStyleAndLayout, recalcStyle and
+performLayout all run ~14-15 % cheaper per call with
+filter off):
+
+- `getBoundingClientRect`: -338 ms
+- `removeChild`: -55 ms
+- `afterPageLayout @ paged.js:30458` (paged.js core): -32 ms
+- `create`: -15 ms
+- `handleAlignment`: -14 ms
+- `sortDisplayedSelectors`: -14 ms
+- `(garbage collector)`: -11 ms
+- smaller rows: ~50 ms
+- **Total: ~529 ms**
+
+Direct + indirect ≈ 652 ms, in the neighbourhood of
+the -668 ms total-CPU delta. They corroborate.
+
+### Why the filter has indirect cost
+
+The single-trace measurement above (filter-off trace
+captured for the same render) made the indirect path
+visible: with filter off, `Document::UpdateStyleAndLayout`
+total dropped by 574 ms across an *unchanged* 39,437
+call count -- ~14 µs less per call. `recalcStyle` and
+`performLayout` similarly dropped ~14 % per call.
+Plausibly:
+
+- V8's polymorphic inline caches stay warmer on the
+  per-page hot path when 181 k extra C++→JS
+  dispatches haven't been churning them.
+- Blink's main-thread scheduler has fewer task
+  boundaries to bookkeep across.
+- Allocator/GC pressure is lower (the filter walk
+  allocates per-callback closures and intermediate
+  strings, even when each callback just returns
+  FILTER_REJECT).
+
+None of those are "the filter triggers a layout
+flush." Layout work *itself* gets cheaper because the
+ambient V8/Blink state is less polluted. Same per-call
+mechanics, slightly faster main-thread context.
+
+### The fix: config flag, default off
+
+`window.PagedConfig.runWhitespaceFilter` gates the
+walk. Default is undefined (falsy) -- our pipeline runs
+`html-compress` on book.html, so the filter has
+nothing to do and skipping it saves the ~600 ms.
+
+Anyone running paged.js against an uncompressed
+document can set the flag before `PagedPolyfill.preview()`
+to opt back in. The class itself is unchanged so the
+opt-in path is byte-equivalent to the original.
+
+The opt-in semantic is the conservative choice: paged.js
+upstream and many downstream users feed it untouched
+HTML (with inter-element indentation surviving), where
+the filter does meaningful cleanup. Disabling it for
+*every* caller of this bundle would be a regression for
+those use cases. Disabling it by default for *our*
+pipeline is fine because we control the input
+end-to-end.
+
+Cost: zero per-page work (the gate is one `&&`-chain
+check at startup), structural correctness for clean
+documents, opt-in safety valve for everyone else.
+
+### Methodology note
+
+The wall-clock A/B was correct in claiming "the saving
+is below the wall-clock noise floor for short N." It
+was wrong in concluding "therefore no saving exists."
+Two corrections:
+
+1. Aggregate CPU work across paired profiles. Wall-clock
+   noise is ~1 s per run on this machine; CPU sample
+   totals are also ~1 s per run but the row-by-row
+   self-time deltas can be much tighter. The
+   `filterTree` row goes from 88 ms (sd 14) to 2 ms (sd
+   1) -- a 6 σ shift. Per-row analysis can see signals
+   that per-run totals lose.
+
+2. Use *enough* paired runs that within-group SD lets
+   you compute mean ± SD honestly. 3+3 is the bare
+   minimum: with n = 3 per group the SD is itself a
+   rough estimate, so trust only row-level deltas of
+   5+ σ. 5+5 or 10+10 would tighten the gBCR delta
+   confidence further -- worth doing for finer signals.
+
+The probe + aggregator are reusable
+([`perf/ab-aggregate.mjs`](ab-aggregate.mjs)): point at
+6 `ab-*.cpuprofile` files and it prints the mean ± SD
+table. Pattern fits any future "does this change save
+CPU?" question where wall-clock noise is the obstacle.
diff --git a/perf/ab-aggregate.mjs b/perf/ab-aggregate.mjs
new file mode 100644
index 0000000..f8adfe2
--- /dev/null
+++ b/perf/ab-aggregate.mjs
@@ -0,0 +1,96 @@
+// One-shot aggregator for the 3+3 paired cpu-profile A/B (ab-A1..A3 / ab-B1..B3).
+// Computes per-row self_ms mean across the 3 A runs and 3 B runs, plus the difference.
+// Also prints total samples / duration per run so we can sanity-check variance.
+
+import { readFileSync, existsSync } from 'node:fs';
+import { resolve, dirname } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const runs = ['A1','A2','A3','B1','B2','B3'].map(tag => ({
+  tag,
+  path: resolve(__dirname, `ab-${tag}.cpuprofile`),
+}));
+
+for (const r of runs) {
+  if (!existsSync(r.path)) { console.error('missing:', r.path); process.exit(1); }
+}
+
+// Reduce one .cpuprofile to per-function self-time rows. Same hit-counting + self_ms computation as analyze-profile.mjs.
+function summarise(path) {
+  const p = JSON.parse(readFileSync(path, 'utf8'));
+  const totalUs = p.endTime - p.startTime;  // profile span; treated as microseconds below
+  const totalSamples = p.samples.length;
+  const us = totalUs / totalSamples;  // average span attributed to one sample
+  const byKey = new Map();
+  let totalHits = 0;  // accumulated below but never read or returned
+  for (const n of p.nodes) {
+    const cf = n.callFrame || {};
+    const fn = cf.functionName || '(anonymous)';
+    const url = cf.url || '';
+    const line = cf.lineNumber != null ? cf.lineNumber + 1 : '?';  // +1 for display (presumably 0-based in the profile)
+    const key = `${fn}  @  ${url || '(no url)'}:${line}`;  // nodes sharing name + url + line merge into one row
+    const cur = byKey.get(key) || { hits: 0 };
+    cur.hits += n.hitCount || 0;
+    byKey.set(key, cur);
+    totalHits += n.hitCount || 0;
+  }
+  const rows = new Map();
+  for (const [key, v] of byKey) {
+    rows.set(key, v.hits * us / 1000);  // self_ms
+  }
+  return { totalSamples, durationS: totalUs / 1e6, usPerSample: us, rows };
+}
+
+const summaries = runs.map(r => ({ tag: r.tag, ...summarise(r.path) }));
+
+console.log('per-run totals');
+console.log('  tag      samples  dur(s)  us/sample');
+for (const s of summaries) {
+  console.log(`  ${s.tag}      ${String(s.totalSamples).padStart(6)}  ${s.durationS.toFixed(2).padStart(6)}    ${s.usPerSample.toFixed(1)}`);
+}
+console.log('');
+
+// Union of row keys across all 6 runs.
+const keys = new Set();
+for (const s of summaries) for (const k of s.rows.keys()) keys.add(k);
+
+// Mean and population SD (variance divides by N, not N-1) of one row's self_ms across the runs in `group`.
+function statsFor(group, key) {
+  const vals = group.map(s => s.rows.get(key) || 0);  // a row absent from a run counts as 0 ms
+  const mean = vals.reduce((a,b)=>a+b,0) / vals.length;
+  const variance = vals.reduce((a,b)=>a+(b-mean)*(b-mean),0) / vals.length;
+  return { mean, sd: Math.sqrt(variance), vals };
+}
+
+const A = summaries.filter(s => s.tag.startsWith('A'));
+const B = summaries.filter(s => s.tag.startsWith('B'));
+
+const rows = [...keys].map(k => {
+  const sa = statsFor(A, k);
+  const sb = statsFor(B, k);
+  return { key: k, aMean: sa.mean, aSd: sa.sd, bMean: sb.mean, bSd: sb.sd, delta: sb.mean - sa.mean };
+});
+
+// Sort by max(|A|, |B|) so the biggest rows surface regardless of which side they're on.
+rows.sort((x,y) => Math.max(y.aMean, y.bMean) - Math.max(x.aMean, x.bMean));
+
+const fmt = (n, w) => n.toFixed(1).padStart(w);
+console.log('top 25 rows by max(A mean, B mean), self_ms:');
+console.log('');
+console.log('   A_mean   A_sd    B_mean   B_sd     delta    function');
+console.log('   ------   ----    ------   ----     -----    --------');
+for (const r of rows.slice(0, 25)) {
+  // Strip whatever directory / URL prefix precedes the bundle name, so the
+  // table reads the same no matter where the checkout lives.
+  const short = r.key.replace(/\S*[\\/]paged\.browser\.js/, 'paged.browser.js');
+  console.log(`  ${fmt(r.aMean,7)}  ${fmt(r.aSd,5)}   ${fmt(r.bMean,7)}  ${fmt(r.bSd,5)}   ${fmt(r.delta,7)}    ${short}`);
+}
+
+// Mean per-run total. NOTE(review): totalSamples * usPerSample is endTime - startTime, i.e. the profile span, so any idle samples are included -- confirm.
+const aTotal = A.reduce((s,r)=>s+r.totalSamples*r.usPerSample,0)/A.length / 1000;
+const bTotal = B.reduce((s,r)=>s+r.totalSamples*r.usPerSample,0)/B.length / 1000;
+console.log('');
+console.log(`A mean total CPU: ${aTotal.toFixed(0)} ms  (${A.map(r => (r.totalSamples*r.usPerSample/1000).toFixed(0)).join(' / ')})`);
+console.log(`B mean total CPU: ${bTotal.toFixed(0)} ms  (${B.map(r => (r.totalSamples*r.usPerSample/1000).toFixed(0)).join(' / ')})`);
+console.log(`delta (B-A):     ${(bTotal-aTotal).toFixed(0)} ms`);

From 688ad1fbea435f670b46778dca3ed91a3a0d4032 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 15:00:55 +0200
Subject: [PATCH 05/18] Strip all async from paged.js render chain;
 RunMicrotasks 6333->0.56ms.

---
 docs/lib/paged.browser.js | 221 +++++++++++++-----------
 docs/render-book.mjs      |  10 +-
 perf/README.md            | 350 ++++++++++++++++++++++++++++++++++++++
 perf/measure.mjs          |   7 +-
 4 files changed, 487 insertions(+), 101 deletions(-)

diff --git a/docs/lib/paged.browser.js b/docs/lib/paged.browser.js
index e058715..83c9e53 100644
--- a/docs/lib/paged.browser.js
+++ b/docs/lib/paged.browser.js
@@ -3001,10 +3001,21 @@
 
 		}
 
-		async flow(content, renderTo) {
+		// [PATCH: sync-chain] flow() is now synchronous. All five await
+		// sites turn into sync calls:
+		//   - beforeParsed / afterParsed / afterRendered hooks: handlers
+		//     on our pipeline are all sync, so _assertSync guards them
+		//     the same way the per-page hot path does.
+		//   - loadFonts: now a sync assert (throws if any face isn't
+		//     loaded; page.goto waitUntil:'load' ensures they are).
+		//   - render: now a plain sync function.
+		// This was the last load-bearing await in the bundle. With
+		// flow() sync, the entire per-render call chain executes
+		// without yielding to a microtask boundary.
+		flow(content, renderTo) {
 			let parsed;
 
-			await this.hooks.beforeParsed.trigger(content, this);
+			_assertSync(this.hooks.beforeParsed.trigger(content, this), "beforeParsed");
 
 			parsed = new ContentParser(content);
 
@@ -3022,20 +3033,20 @@
 
 			this.emit("rendering", parsed);
 
-			await this.hooks.afterParsed.trigger(parsed, this);
+			_assertSync(this.hooks.afterParsed.trigger(parsed, this), "afterParsed");
 
-			await this.loadFonts();
+			this.loadFonts();
 
-			let rendered = await this.render(parsed, this.breakToken);
+			let rendered = this.render(parsed, this.breakToken);
 			while (rendered.canceled) {
 				this.start();
-				rendered = await this.render(parsed, this.breakToken);
+				rendered = this.render(parsed, this.breakToken);
 			}
 
 			this.rendered = true;
 			this.pagesArea.style.setProperty("--pagedjs-page-count", this.total);
 
-			await this.hooks.afterRendered.trigger(this.pages, this);
+			_assertSync(this.hooks.afterRendered.trigger(this.pages, this), "afterRendered");
 
 			this.emit("rendered", this.pages);
 
@@ -3072,12 +3083,11 @@
 		// 	}
 		// }
 
-		// [PATCH: sync-chain] *layout is a sync generator now, so
-		// renderer.next() returns synchronously -- no per-page await.
-		// render() itself stays `async` because callers (flow()) await
-		// it and other once-per-render awaits in flow() (loadFonts,
-		// beforeParsed / afterParsed / afterRendered) still need it.
-		async render(parsed, startAt) {
+		// [PATCH: sync-chain] render() is now plain sync. *layout is a
+		// sync generator (renderer.next() returns synchronously), and
+		// flow() no longer awaits this call -- the entire per-render
+		// chain (preview -> flow -> render) is sync end to end.
+		render(parsed, startAt) {
 			let renderer = this.layout(parsed, startAt);
 
 			let result;
@@ -3367,7 +3377,12 @@
 		}
 		*/
 
-		async clonePage(originalPage) {
+		// [PATCH: sync-chain] clonePage is now synchronous. Only caller
+		// is the Footnotes handler (line ~31625) which is itself gated
+		// out for documents without `[data-note='footnote']` -- dead
+		// path on our content but kept sync-clean for consistency with
+		// the rest of the per-page hook surface.
+		clonePage(originalPage) {
 			let lastPage = this.pages[this.pages.length - 1];
 
 			let page = new Page(this.pagesArea, this.pageTemplate, false, this.hooks);
@@ -3379,7 +3394,7 @@
 
 			page.index(this.total);
 
-			await this.hooks.beforePageLayout.trigger(page, undefined, undefined, this);
+			_assertSync(this.hooks.beforePageLayout.trigger(page, undefined, undefined, this), "beforePageLayout");
 			this.emit("page", page);
 
 			for (const className of originalPage.element.classList) {
@@ -3388,27 +3403,32 @@
 				}
 			}
 
-			await this.hooks.afterPageLayout.trigger(page.element, page, undefined, this);
-			await this.hooks.finalizePage.trigger(page.element, page, undefined, this);
+			_assertSync(this.hooks.afterPageLayout.trigger(page.element, page, undefined, this), "afterPageLayout");
+			_assertSync(this.hooks.finalizePage.trigger(page.element, page, undefined, this), "finalizePage");
 			this.emit("renderedPage", page);
 		}
 
+		// [PATCH: sync-chain] loadFonts is now a synchronous assertion.
+		// Upstream walked document.fonts and kicked off fontFace.load()
+		// for any not-yet-loaded face, returning a Promise.all. Our
+		// headless pipeline drives `page.goto(url, { waitUntil: "load" })`
+		// before paged.js runs, and every face is expected to be in
+		// state "loaded" by the time we get here. The walk is a safety
+		// check: a face in any other state ("unloaded", "loading" or
+		// "error") means pipeline assumptions are broken and we
+		// should fail loudly rather than silently re-asyncify.
 		loadFonts() {
-			let fontPromises = [];
 			(document.fonts || []).forEach((fontFace) => {
 				if (fontFace.status !== "loaded") {
-					let fontLoaded = fontFace.load().then((r) => {
-						return fontFace.family;
-					}, (r) => {
-						console.warn("Failed to preload font-family:", fontFace.family);
-						return fontFace.family;
-					});
-					fontPromises.push(fontLoaded);
+					throw new Error(
+						"paged.js (forked): font-face '" + fontFace.family +
+						"' is not yet loaded (status=" + fontFace.status +
+						"). The headless pipeline expects every font to be " +
+						"loaded before PagedPolyfill.preview() runs; ensure " +
+						"page.goto uses { waitUntil: 'load' } or 'networkidle0'."
+					);
 				}
 			});
-			return Promise.all(fontPromises).catch((err) => {
-				console.warn(err);
-			});
 		}
 
 		destroy() {
@@ -26501,16 +26521,22 @@
 
 
 
-		// parse
-		async parse(text) {
+		// [PATCH: sync-chain] parse() is now synchronous. Upstream awaited
+		// the three Polisher.hooks.{beforeTreeParse, beforeTreeWalk,
+		// afterTreeWalk} triggers; with our pipeline registering no async
+		// handlers for any of them, the awaits were pure microtask
+		// boundaries. _assertSync throws if anyone ever does register a
+		// thenable-returning handler -- same safety pattern the chunker's
+		// per-page hot path uses.
+		parse(text) {
 			this.text = text;
 
-			await this.hooks.beforeTreeParse.trigger(this.text, this);
+			_assertSync(this.hooks.beforeTreeParse.trigger(this.text, this), "beforeTreeParse");
 
 			// send to csstree
 			this.ast = csstree.parse(this._text);
 
-			await this.hooks.beforeTreeWalk.trigger(this.ast);
+			_assertSync(this.hooks.beforeTreeWalk.trigger(this.ast), "beforeTreeWalk");
 
 			// Replace urls
 			this.replaceUrls(this.ast);
@@ -26525,7 +26551,7 @@
 			this.rules(this.ast);
 			this.atrules(this.ast);
 
-			await this.hooks.afterTreeWalk.trigger(this.ast, this);
+			_assertSync(this.hooks.afterTreeWalk.trigger(this.ast, this), "afterTreeWalk");
 
 			// return ast
 			return this.ast;
@@ -27480,28 +27506,30 @@
 }
 `;
 
-	async function request(url, options={}) {
-		return new Promise(function(resolve, reject) {
-			let request = new XMLHttpRequest();
-
-			request.open(options.method || "get", url, true);
-
-			for (let i in options.headers) {
-				request.setRequestHeader(i, options.headers[i]);
-			}
-
-			request.withCredentials = options.credentials === "include";
-
-			request.onload = () => {
-				// Chrome returns a status code of 0 for local files
-				const status = request.status === 0 && url.startsWith("file://") ? 200 : request.status;
-				resolve(new Response(request.responseText, {status}));
-			};
-
-			request.onerror = reject;
-
-			request.send(options.body || null);
-		});
+	// [PATCH: sync-chain] Synchronous XHR returning body text directly.
+	// Upstream paged.js used async XHR + Promise + Response wrapper to
+	// keep the interactive-browser main thread responsive while
+	// stylesheets loaded. Our headless pipeline doesn't share that
+	// constraint: every stylesheet is a local file:// URL, fetches are
+	// sub-ms, and we want the polisher's stylesheet ingestion off the
+	// microtask queue so the whole render chain stays sync. Both
+	// callers (Polisher.add / convertViaSheet) only ever consumed
+	// response.text(), which is itself async per spec -- returning the
+	// text directly skips that boundary too. Throws on HTTP error.
+	function request(url, options={}) {
+		let req = new XMLHttpRequest();
+		req.open(options.method || "get", url, false);
+		for (let i in options.headers) {
+			req.setRequestHeader(i, options.headers[i]);
+		}
+		req.withCredentials = options.credentials === "include";
+		req.send(options.body || null);
+		// Chrome returns status 0 for successful local-file loads.
+		const status = req.status === 0 && url.startsWith("file://") ? 200 : req.status;
+		if (status < 200 || status >= 300) {
+			throw new Error("paged.js (forked): request " + url + " failed with status " + status);
+		}
+		return req.responseText;
 	}
 
 	class Polisher {
@@ -27538,53 +27566,43 @@
 			return this.styleSheet;
 		}
 
-		async add() {
-			let fetched = [];
-			let urls = [];
-
-			for (var i = 0; i < arguments.length; i++) {
-				let f;
-
-				if (typeof arguments[i] === "object") {
-					for (let url in arguments[i]) {
-						let obj = arguments[i];
-						f = new Promise(function(resolve, reject) {
-							urls.push(url);
-							resolve(obj[url]);
-						});
+		// [PATCH: sync-chain] add() is now synchronous. Upstream collected
+		// every input as a Promise (Promise.all + then-chain), even when
+		// inputs were inline {url:text} objects with no fetch needed.
+		// With request() returning text directly and convertViaSheet now
+		// sync, we just walk the arguments once and feed each to the
+		// pipeline. Same return semantics: the converted-and-inserted
+		// text of the last stylesheet.
+		add() {
+			let text = "";
+			for (let i = 0; i < arguments.length; i++) {
+				let arg = arguments[i];
+				if (typeof arg === "object") {
+					for (let url in arg) {
+						text = this.convertViaSheet(arg[url], url);
+						this.insert(text);
 					}
 				} else {
-					urls.push(arguments[i]);
-					f = request(arguments[i]).then((response) => {
-						return response.text();
-					});
+					let url = arg;
+					let cssStr = request(url);
+					text = this.convertViaSheet(cssStr, url);
+					this.insert(text);
 				}
-
-
-				fetched.push(f);
 			}
-
-			return await Promise.all(fetched)
-				.then(async (originals) => {
-					let text = "";
-					for (let index = 0; index < originals.length; index++) {
-						text = await this.convertViaSheet(originals[index], urls[index]);
-						this.insert(text);
-					}
-					return text;
-				});
+			return text;
 		}
 
-		async convertViaSheet(cssStr, href) {
+		// [PATCH: sync-chain] convertViaSheet is now synchronous.
+		// sheet.parse is sync; request() now returns body text directly
+		// (sync XHR + responseText, no Response wrapper).
+		convertViaSheet(cssStr, href) {
 			let sheet = new Sheet(href, this.hooks);
-			await sheet.parse(cssStr);
+			sheet.parse(cssStr);
 
 			// Insert the imported sheets first
 			for (let url of sheet.imported) {
-				let str = await request(url).then((response) => {
-					return response.text();
-				});
-				let text = await this.convertViaSheet(str, url);
+				let str = request(url);
+				let text = this.convertViaSheet(str, url);
 				this.insert(text);
 			}
 
@@ -33074,9 +33092,18 @@
 				});
 		}
 
-		async preview(content, stylesheets, renderTo) {
+		// [PATCH: sync-chain] preview() is now synchronous end-to-end.
+		// beforePreview / afterPreview hooks are once-per-render so the
+		// _assertSync guard is the same shape as the chunker's per-page
+		// hot path uses. polisher.add and chunker.flow are sync above.
+		// External callers (perf/measure.mjs, docs/render-book.mjs) now
+		// call this without `await` -- the page.evaluate IIFE wrapping
+		// the call is also sync, so the entire script execution runs
+		// inside one EvaluateScript frame instead of being scheduled
+		// across multiple microtask continuations.
+		preview(content, stylesheets, renderTo) {
 
-			await this.hooks.beforePreview.trigger(content, renderTo);
+			_assertSync(this.hooks.beforePreview.trigger(content, renderTo), "beforePreview");
 
 			if (!content) {
 				content = this.wrapContent();
@@ -33090,12 +33117,12 @@
 
 			this.handlers = this.initializeHandlers();
 
-			await this.polisher.add(...stylesheets);
+			this.polisher.add(...stylesheets);
 
 			let startTime = performance.now();
 
 			// Render flow
-			let flow = await this.chunker.flow(content, renderTo);
+			let flow = this.chunker.flow(content, renderTo);
 
 			let endTime = performance.now();
 
@@ -33104,7 +33131,7 @@
 
 			this.emit("rendered", flow);
 
-			await this.hooks.afterPreview.trigger(flow.pages);
+			_assertSync(this.hooks.afterPreview.trigger(flow.pages), "afterPreview");
 
 			return flow;
 		}
diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 7ad9509..71de7e1 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -158,13 +158,19 @@ try {
   }
 
   // Render -- paged.js per-page layout.
+  // PagedPolyfill.preview() is fully synchronous in our forked bundle
+  // (the entire chain preview -> chunker.flow -> render -> *layout is
+  // now sync; loadFonts is a sync assertion that page.goto's
+  // waitUntil:'load' already satisfied; stylesheets are loaded via
+  // synchronous XHR). Inner IIFE is a plain sync arrow; outer await
+  // is just the CDP round-trip puppeteer needs to ferry the result.
   const tRender = Date.now();
-  await page.evaluate(async () => {
+  await page.evaluate(() => {
     if (!window.PagedPolyfill) {
       throw new Error('paged.js bundle did not expose window.PagedPolyfill');
     }
     try {
-      await window.PagedPolyfill.preview();
+      window.PagedPolyfill.preview();
     } catch (err) {
       // Unwrap the undecorated ProgressEvent paged.js throws on fetch
       // failures so the message includes the offending URL.
diff --git a/perf/README.md b/perf/README.md
index 2f8e17b..9ce36dd 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -3108,6 +3108,18 @@ plain function call that produces a plain return value.
 
 ### What's still async, and why
 
+> **Update.** All four survivors listed below were
+> subsequently stripped -- see "Following `RunMicrotasks`
+> down to zero" at the end of this README. The reasoning
+> here ("once-per-render, overhead irrelevant") was
+> correct as a per-call cost argument but missed that
+> the unbroken await chain forced V8 to attribute the
+> entire post-`loadFonts` render to a microtask
+> continuation (`RunMicrotasks` in the trace,
+> `(program)` in the cpu profile). Re-attribution alone
+> was worth the conversion; wall-clock is unchanged.
+> The list below is preserved for chronological accuracy.
+
 The async machinery that survives this audit is now at the
 once-per-render layer, where it's load-bearing:
 
@@ -4602,3 +4614,341 @@ The probe + aggregator are reusable
 6 `ab-*.cpuprofile` files and it prints the mean ± SD
 table. Pattern fits any future "does this change save
 CPU?" question where wall-clock noise is the obstacle.
+
+## Following `RunMicrotasks` down to zero
+
+The trace section above pinned the cpu profile's
+`(program)` row to V8 running JS inside a microtask
+continuation. With the WhiteSpaceFilter gone the
+`--children RunMicrotasks` breakdown still showed one
+`rm[4] = 6262 ms` event enveloping essentially the
+whole render -- 15 hits total, 99 % concentrated in one
+batched drain. That raised a sharper question: if the
+per-page hot path is sync (Phase 1 + 2 above), why is
+*any* of the render running inside a microtask scope?
+
+### What was still async, and what it cost us
+
+The README's earlier "What's still async, and why"
+inventory was honest about the surviving await sites at
+that point:
+
+- `Chunker.flow()` -- async wrapper, awaited
+  `beforeParsed` / `afterParsed` / `afterRendered` hook
+  triggers, `loadFonts()`, and `chunker.render()`.
+- `Chunker.render()` -- thin async wrapper around the
+  sync `renderer.next()` loop, kept so `flow()` could
+  `await` it.
+- `Chunker.clonePage()` -- async, awaited three
+  per-page hooks. Footnotes-only caller, dead path for
+  our content but live in the bundle.
+- `PagedPolyfill.preview()` -- async, awaited
+  `beforePreview` / `afterPreview` hooks plus
+  `polisher.add` and `chunker.flow`.
+- `Polisher.add()` / `Polisher.convertViaSheet()` /
+  `Sheet.parse()` -- async chain to fetch and parse
+  external stylesheets. `Polisher.add` did
+  `Promise.all` over the inputs.
+- `Chunker.loadFonts()` -- returned `Promise.all` of
+  `fontFace.load()` for any face not yet in state
+  "loaded".
+- `request()` -- async XHR + `Promise` wrapper, used by
+  the polisher chain to fetch each `<link rel="stylesheet">`
+  URL.
+
+Cost of each: small. Cost of all of them together: V8
+sees an unbroken await chain from `page.evaluate(async
+() => { await PagedPolyfill.preview(); })` down to
+`document.fonts.ready` (the one genuinely-async
+dependency in the chain). When that promise resolves V8
+schedules a microtask to resume `flow()`. Phase 1 + 2
+of the async cleanup made the *body* of the resumed
+function execute synchronously, so once it resumes it
+runs ~6.2 s straight to the end of the render. V8
+correctly attributes the whole continuation to the
+`RunMicrotasks` host frame, since that's the C++ frame
+on the stack while the resumed JS runs.
+
+So `RunMicrotasks` self-time being 2.89 s wasn't a
+sign of microtask overhead -- it was the bookkeeping
+label V8 puts on continuation-style work. Every named
+Blink event nested inside (`Document::UpdateStyleAndLayout`,
+`recalcStyle`, `performLayout`, etc.) appeared in the
+trace as a child of `RunMicrotasks`. Same shape applied
+in the cpu profile: `(program)` is the catch-all bucket
+V8 picks when no JS frame sits on top of the stack at
+sample time, and a microtask continuation is exactly
+that condition.
+
+The bucket name was misleading, but the cost itself was
+real -- the JS *running* inside the continuation
+*was* paged.js doing its per-page work. No "microtask
+plumbing overhead" to slim down. The only way to remove
+the `RunMicrotasks` attribution was to stop wrapping the
+render in a microtask continuation entirely -- i.e.,
+make the whole chain synchronous so V8 has no async
+scope to attribute to.
+
+### Why this is OK for our pipeline (and not for upstream)
+
+Upstream paged.js needs the async machinery. Its target
+deployment is an interactive browser page: real
+stylesheet fetches over HTTP (genuinely async), font
+loads against the OS (genuinely async), user-registered
+handlers that may load external resources or do
+expensive work between page renders (async-friendly to
+keep the page responsive). The await chain is the
+canonical pattern for "yield to the browser between
+expensive steps so the UI thread can paint."
+
+Our pipeline has none of those constraints:
+
+- `page.goto(url, { waitUntil: 'load' })` settles
+  *before* paged.js is invoked. Every font, image, and
+  stylesheet referenced by `<link>` / `@font-face` /
+  `<img>` is already loaded by the time the render
+  starts. The async checks are no-ops.
+- The headless renderer has no compositor coordinating
+  with us, no paint budget to respect, no user looking
+  at the page. Blocking the main thread for 8 s is
+  fine -- nobody's watching.
+- All registered handlers in our build are synchronous.
+  The `_assertSync` guard from the Phase 1/2 cleanup
+  has been in place for the per-page hot path for a
+  while; we just hadn't extended the pattern to the
+  once-per-render hooks.
+- The stylesheet fetches the polisher does are local
+  `file://` URLs. Sync XHR resolves them in microseconds.
+
+So the entire async surface in paged.js -- which
+upstream needs -- is, for our specific use case, the
+opposite of helpful: it pushes work into microtask
+continuations that show up as `RunMicrotasks` in the
+trace and `(program)` in the cpu profile, instead of
+landing under honest names like `RunTask` and
+`EvaluateScript`.
+
+### The conversion
+
+Nine functions in `docs/lib/paged.browser.js` switched
+from `async` to plain sync, marked
+`[PATCH: sync-chain]` at each site:
+
+| function | what changed |
+| --- | --- |
+| `request()` | Async XHR + `new Promise` + `Response` wrapper → sync XHR (`open(...,false)`) returning body text directly. Both callers (`Polisher.add` / `convertViaSheet`) only ever consumed `response.text()` (itself async per spec), so returning text skips that boundary too. |
+| `Sheet.parse()` | Three `await hook.trigger(...)` → `_assertSync(triggerSync(...))`. CSS-parser hooks all sync in our build. |
+| `Polisher.convertViaSheet()` | Drop awaits on `sheet.parse` / `request` / recursive `convertViaSheet`. |
+| `Polisher.add()` | Drop the `Promise.all` + then-chain entirely. Walks arguments once, feeds each through the sync pipeline. |
+| `Chunker.loadFonts()` | `Promise.all(fontFace.load())` → sync walk of `document.fonts` that throws if any face's `status !== "loaded"`. The throw is a safety net; `page.goto({waitUntil:'load'})` settles fonts in practice. |
+| `Chunker.clonePage()` | Three per-page hook awaits → `_assertSync`. Cold path (Footnotes-only). |
+| `Chunker.render()` | Strip `async`. Body was already sync after the Phase 1/2 cleanup. |
+| `Chunker.flow()` | Strip `async`; five await sites → sync calls / `_assertSync`. |
+| `PagedPolyfill.preview()` | Strip `async`; two hook awaits → `_assertSync`; drop awaits on `polisher.add` / `chunker.flow`. |
+
+Plus the two external callers in
+[`perf/measure.mjs`](measure.mjs) and
+[`docs/render-book.mjs`](../docs/render-book.mjs):
+both did `page.evaluate(async () => { await
+window.PagedPolyfill.preview(); })`. The callback is
+now a plain sync arrow; the outer `await` is just the
+CDP round-trip puppeteer needs to ferry control back.
+
+The `_assertSync` helper (from the earlier
+"sync chain end-to-end through the per-page hot path"
+work) is the load-bearing safety net throughout: if any
+future hook handler returns a thenable, the chain
+throws with a useful error message instead of silently
+swallowing async work. The contract is now:
+
+> Every hook handler in this bundle is sync. Every
+> external resource referenced by the document is
+> loaded before `PagedPolyfill.preview()` runs.
+
+If either invariant breaks, `_assertSync` or
+`loadFonts`'s throw catches it loudly.
+
+### Results
+
+Paired `--detach-pages --no-timing --render-only
+--tracing` run on the 1651-page book, comparing the
+pre-conversion trace (the results from "Inside
+RunMicrotasks" above) against the post-conversion one:
+
+| metric | pre-sync | post-sync | Δ |
+| --- | --- | --- | --- |
+| render wall | 8.13 s | 8.36 s | flat (within single-run noise) |
+| trace event count | 250,376 | 255,949 | flat |
+| `RunMicrotasks` self | 2890.66 ms (35.6 %) | **0.56 ms** (off top-30) | **-2890 ms (-99.98 %)** |
+| `RunMicrotasks` total | 6333.18 ms | **0.56 ms** | **-6333 ms** |
+| `RunMicrotasks` hits | 15 | 12 | -3 |
+| `RunMicrotasks` rm[4] dur | 6262.34 ms | gone | -6262 ms |
+| `RunTask` self (top-30) | (below threshold, ~16 ms) | **2984.11 ms (34.6 %)** | **+2968 ms** |
+| `RunTask` hits | (~few hundred) | **1005** | re-attributed |
+| `RunTask` total | (small) | **8630.80 ms** | the whole render |
+| `Document::UpdateStyleAndLayout` total/hits | 3320 / 39675 | 3515 / 39675 | flat |
+| `Document::recalcStyle` self | 1737 ms | 1877 ms | flat |
+| `LocalFrameView::performLayout` self | 1737 ms | 1881 ms | flat |
+| per-page ratio (last/first quarter) | 1.36x | 1.27x | slight improvement (noise band) |
+| pages | 1651 | 1651 | identical |
+| PDF size (full render, separate run) | 16.1 MB | **16.1 MB** | same size (content identical; only timestamps differ) |
+
+The headline number is the **6333 → 0.56 ms collapse**
+in `RunMicrotasks` total. The 12 surviving sub-ms hits
+are pure puppeteer/CDP plumbing (one `AsyncTask Run`
+child = 0.01 ms; the rest are V8 internal MT-checkpoint
+runs). There is no remaining JS executing inside a
+microtask continuation -- the render runs as a plain
+synchronous task from start to end.
+
+The work didn't disappear, it re-attributed. `RunTask`
+self-time (2984 ms) almost exactly equals the old
+`RunMicrotasks` self-time (2891 ms) plus single-run
+noise. Per-call children counts are unchanged
+(`Document::UpdateStyleAndLayout`: 39675 calls then,
+39675 calls now). Same JS, same DOM mutations, same
+layout flushes -- just no longer wrapped in a
+continuation.
+
+### What this buys
+
+**Profile readability.** A reader opening
+`render.cpuprofile` or `trace.json` after this change
+sees:
+
+- `(program)` in the cpu profile drops by the
+  proportion that was V8 runtime overhead inside the
+  continuation (the MT plumbing + dispatch glue
+  between named natives). The remaining `(program)`
+  is genuinely-unattributable V8 work (IC stubs,
+  runtime helpers).
+- `RunMicrotasks` no longer appears at the top of the
+  trace's bottom-up table. The render lands under
+  `RunTask` / `EvaluateScript` / `FunctionCall`, with
+  Blink work (`performLayout`, `recalcStyle`,
+  `rebuildLayoutTree`) as named children where it
+  belongs.
+- The cpu profile's `(idle)` row already collapsed in
+  the earlier rAF→queueMicrotask fix; this change
+  closes the symmetric gap on the JS side.
+
+**Structural simplicity.** Nine functions in the bundle
+lost the `async` keyword and the `await` site
+discipline that went with it. The render call chain is
+now top-to-bottom synchronous: `preview()` calls into
+`flow()` calls into `render()` calls into `*layout()`,
+plain returns all the way down. Anyone tracing through
+the bundle for a perf investigation can read the
+control flow without modeling promise resolution
+ordering.
+
+**Single contract.** The hook surface is now uniformly
+sync via `_assertSync`. Before the conversion, the
+per-page hooks (`beforePageLayout`, `afterPageLayout`,
+`finalizePage`, etc.) were sync-asserted while the
+once-per-render hooks (`beforeParsed`, `afterParsed`,
+`afterRendered`, `beforePreview`, `afterPreview`) used
+`await trigger(...)`. The split was historical, not
+principled. Now every hook is sync-asserted, same
+shape, same error message.
+
+### What this doesn't buy
+
+**Wall-clock.** Render goes 8.13 s → 8.36 s, which is
+within the ±1 s single-run noise band for this machine
+documented elsewhere in this README. CPU work
+re-attributes but doesn't shrink: the chunker's JS
+still runs the same way, DOM mutations still trigger
+the same layout flushes, gBCR self-time still owns
+~21 % of the trace. Phase 1's microtask-boundary
+elimination saved real time (~850 ms) because there *were*
+8 k boundaries to remove; this conversion eliminates a
+handful of additional boundaries (the once-per-render
+sites) whose per-boundary cost is small.
+
+**A path to fewer flushes.** The remaining gBCR-driven
+layout work is intrinsic to paged.js's per-page
+break-and-resume algorithm. The README's earlier
+attempts (B, D from the "createBreakToken dedup"
+investigation; the move-not-clone experiment) confirmed
+that gBCR re-attributes if you elide one site, and
+that mutations are the structural source. Synchronising
+the chain doesn't change any of that.
+
+### Verification
+
+The 1651-page book renders identically pre- and
+post-conversion -- same page count, same 16.1 MB PDF.
+The PDF differs from the previous build only by the
+expected timestamp drift (the `/CreationDate` /
+`/ModDate` entries Chrome writes per run). No content
+changes; the bundle does the same work in the same
+order.
+
+The trace's `RunTask` -> `Document::UpdateStyleAndLayout`
+hit count (39 675) matches the previous run exactly,
+confirming the per-page chunker iteration count is
+preserved through the conversion. `RunTask` ->
+`WebFrameWidgetImpl::UpdateLifecycle` at 1950 ms / 1
+hit is Chromium's final-frame lifecycle work after the
+last page is laid out, same as before -- it just shows
+up under `RunTask` instead of being attributed to a
+post-render microtask, which is also why `RunTask` self
+includes it.
+
+### What's still async, post-conversion
+
+Two surfaces remain async-shaped, both intentionally:
+
+1. **The auto-run block at [paged.browser.js:33153](../docs/lib/paged.browser.js:33153).**
+   `ready.then(async function () { ... })` fires once at
+   `DOMContentLoaded` and is gated by `config.auto !==
+   false` -- our pipeline always sets `config.auto =
+   false` before invoking `preview()`, so this branch
+   never runs. Leaving it async-shaped costs one
+   microtask scheduling at startup, sub-microsecond,
+   and preserves byte-for-byte compatibility with
+   upstream paged.js's auto-init semantics for anyone
+   running this bundle in a configuration we don't.
+2. **External `page.evaluate(...)` callers.** The
+   wrapper around `window.PagedPolyfill.preview()` in
+   `perf/measure.mjs` and `docs/render-book.mjs` is a
+   sync arrow, but `page.evaluate` itself returns a
+   Promise (CDP roundtrip). Node-side code awaits that
+   Promise. Cost is the CDP round-trip, not the JS we
+   execute.
+
+Neither contributes to the renderer's main-thread
+profile.
+
+### Cumulative trace shape
+
+For reference, the post-conversion top-of-table on
+`CrRendererMain` reads:
+
+```
+   self_ms   self_%   event                                       category
+   -------   ------   ----------------------------------------------
+   2984.11   34.58%   RunTask                                     devtools.timeline
+   1880.79   21.79%   LocalFrameView::performLayout               blink
+   1876.53   21.74%   Document::recalcStyle                       blink
+    540.06    6.26%   InlineNode::ShapeTextIncludingFirstLine     blink
+    503.09    5.83%   Document::rebuildLayoutTree                 blink
+    128.90    1.49%   Blink.CompositingInputs.UpdateTime          blink
+    123.41    1.43%   Blink.PrePaint.UpdateTime                   blink
+     99.60    1.15%   Document::updateStyle                       blink
+     76.83    0.89%   V8.GC_MC_INCREMENTAL_EMBEDDER_TRACING       v8.gc
+     43.20    0.50%   Layout                                      devtools.timeline
+     ...
+```
+
+`RunMicrotasks` no longer appears. `(self /
+unattributed)` time inside `RunTask` is 2984 ms across
+1005 hits -- average ~3 ms per task, consistent with
+"each render task does ~one page's worth of work" plus
+some longer tasks for setup / teardown. The dominant
+named children are unchanged: `UpdateStyleAndLayout`,
+`recalcStyle`, `performLayout`, `ShapeText`,
+`rebuildLayoutTree`. Same work, honest labels.
+
+Shipped.
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 6161ce7..ed8b811 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -284,12 +284,15 @@ try {
   }
 
   const tRenderStart = Date.now();
-  await page.evaluate(async () => {
+  // [PATCH: sync-chain] PagedPolyfill.preview() is fully synchronous in
+  // our forked bundle, so the callback here is a plain sync arrow. Outer
+  // `await` is for puppeteer's CDP round-trip back from page.evaluate.
+  await page.evaluate(() => {
     if (!window.PagedPolyfill) {
       throw new Error('paged.js bundle did not expose window.PagedPolyfill');
     }
     try {
-      await window.PagedPolyfill.preview();
+      window.PagedPolyfill.preview();
     } catch (err) {
       const e = err && err.target
         ? new Error(`${err.type || 'event'} on ${err.target.tagName || '?'}: ${err.target.src || err.target.href || ''}`)

From 7886aed558342b1b19e05ad70b28a27ec833d8e9 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 15:23:34 +0200
Subject: [PATCH 06/18] Hybrid trace: embed V8 cpu_profiler samples in
 --tracing output.

---
 perf/README.md   | 23 ++++++++++++++++++-----
 perf/measure.mjs |  5 +++++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/perf/README.md b/perf/README.md
index 9ce36dd..287614b 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -107,7 +107,7 @@ DevTools-compatible trace is a few lines.
 | `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. |
 | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
 | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. |
-| `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). |
+| `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). |
 | `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See the "Disabling the filter outright" section in this README for the methodology. |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
@@ -166,7 +166,7 @@ run.bat --instrument                      # count + time DOM-accessor calls
 run.bat --time-hooks                      # per-task timing of every chunker/polisher hook
 run.bat --incremental                     # process via incremental update instead of pdf-lib roundtrip
 run.bat --chrome-outline                  # let Chrome emit /Outlines (skip parseOutline + setOutline)
-run.bat --tracing                         # capture a Chrome trace of the render phase
+run.bat --tracing                         # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples)
 ```
 
 Flags compose. The CPU profile lands as `render.cpuprofile`
@@ -4147,14 +4147,27 @@ a `--tracing` flag and a companion `analyze-trace.mjs`.
 The flag wraps the render phase in `page.tracing.start()`
 with Blink-relevant categories (`devtools.timeline`,
 `disabled-by-default-devtools.timeline`, `blink`, `v8`,
-`v8.execute`) and writes `trace.json` to the results
-folder. `analyze-trace.mjs` walks the trace's complete-phase
+`v8.execute`, `disabled-by-default-v8.cpu_profiler`) and
+writes `trace.json` to the results folder. The
+`v8.cpu_profiler` category embeds V8 sampling-profile data
+as `Profile` / `ProfileChunk` events inline with the Blink
+trace events, so the single trace file is *hybrid*: loaded
+in Chrome DevTools Performance or [ui.perfetto.dev](https://ui.perfetto.dev)
+it renders JS call stacks aligned with Blink events on the
+same timeline (the de facto answer to "what was `(program)`
+doing?"). Cost: ~2x file size (e.g. 22 MB -> 52 MB on the
+1651-page book) and ~0.4 s wall-clock for the extra sampler
+work -- both noise on the analysis side.
+
+`analyze-trace.mjs` walks the trace's complete-phase
 events on `CrRendererMain`, computes self-time per event
 name via a nested-event stack walk (same shape as
 `analyze-profile.mjs` for cpuprofiles), and prints a
 top-N table. A `--children <name>` mode breaks any
 parent event into its direct callees, mirroring
-`find-callees.mjs`.
+`find-callees.mjs`. It ignores the embedded V8 cpu samples
+-- those are consumed separately by the viewers above (or
+by a forthcoming hybrid analyzer in this folder).
 
 ### What's on the main thread
 
diff --git a/perf/measure.mjs b/perf/measure.mjs
index ed8b811..46af266 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -268,6 +268,10 @@ try {
     // adds UpdateLayoutTree / InvalidateLayout / ScheduleStyleRecalc /
     // HitTest; blink covers internal Blink events; v8 + v8.execute cover
     // V8.GC* / V8.CompileCode / V8.RunMicrotasks / V8.Execute.
+    // disabled-by-default-v8.cpu_profiler embeds V8 sampling-profile data
+    // as Profile / ProfileChunk events inline with the trace, giving JS
+    // call stacks aligned with Blink events when loaded in Chrome
+    // DevTools Performance or perfetto.dev (the hybrid view).
     tracePath = join(outDir, 'trace.json');
     await page.tracing.start({
       path: tracePath,
@@ -278,6 +282,7 @@ try {
         'blink',
         'v8',
         'v8.execute',
+        'disabled-by-default-v8.cpu_profiler',
       ],
     });
     console.log(`[harness] tracing: ${tracePath}`);

From 0a968d912e4df706a1a2ed07ff13de82c61d147a Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 15:37:11 +0200
Subject: [PATCH 07/18] Add analyze-hybrid.mjs: bottom-up + callees view across
 JS and Blink.

---
 perf/README.md          |   7 +-
 perf/analyze-hybrid.mjs | 341 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 346 insertions(+), 2 deletions(-)
 create mode 100644 perf/analyze-hybrid.mjs

diff --git a/perf/README.md b/perf/README.md
index 287614b..5fb34fc 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -108,6 +108,7 @@ DevTools-compatible trace is a few lines.
 | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
 | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. |
 | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). |
+| `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees <label>` direct-callees for any name on either axis. Lets you walk a single causation chain from a JS function down through the Blink layout / style work it triggered via gBCR (`hasOverflow -> getBoundingClientRect -> Document::UpdateStyleAndLayout -> Blink.ForcedStyleAndLayout.UpdateTime -> ...`). |
 | `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See the "Disabling the filter outright" section in this README for the methodology. |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
@@ -4166,8 +4167,10 @@ name via a nested-event stack walk (same shape as
 top-N table. A `--children <name>` mode breaks any
 parent event into its direct callees, mirroring
 `find-callees.mjs`. It ignores the embedded V8 cpu samples
--- those are consumed separately by the viewers above (or
-by a forthcoming hybrid analyzer in this folder).
+-- those are consumed by the viewers above (DevTools /
+Perfetto) or, for terminal use, by `analyze-hybrid.mjs`,
+which combines V8 sample stacks with Blink event nests
+into a single bottom-up / callees view.
 
 ### What's on the main thread
 
diff --git a/perf/analyze-hybrid.mjs b/perf/analyze-hybrid.mjs
new file mode 100644
index 0000000..cdf1335
--- /dev/null
+++ b/perf/analyze-hybrid.mjs
@@ -0,0 +1,341 @@
+// Bottom-up analyzer for a HYBRID Chrome trace.
+//
+// Reads a trace.json that contains BOTH Blink/V8 trace events AND an
+// embedded V8 cpu sampling profile (delivered as Profile / ProfileChunk
+// events when the trace is captured with the
+// disabled-by-default-v8.cpu_profiler category, as `node measure.mjs
+// --tracing` does). Produces a single bottom-up table that mixes
+// JS function names with named Blink/V8 events -- the missing piece
+// that neither analyze-profile.mjs (cpu profile alone, can't name
+// `(program)`) nor analyze-trace.mjs (Blink events alone, can't see
+// JS frames) can give on their own.
+//
+// Usage:
+//   node analyze-hybrid.mjs <path/to/trace.json> [--top N] [--min-pct P]
+//                           [--thread <name>] [--callees <label>]
+//
+// Defaults: --top 30, --min-pct 0.1, thread = CrRendererMain.
+//
+// The model: build a combined "hybrid stack" at each cpu sample as
+//   [ JS frames root->leaf ]  ++  [ Blink events outer->inner, filtered ]
+// where the JS frames come from the V8 cpu profile node lineage
+// (filtering virtual frames -- (root), (program), (idle), (garbage
+// collector)) and the Blink events come from the trace's X-event nest
+// active at the sample's timestamp on the renderer main thread
+// (filtering "JS-entry" wrappers -- RunTask, Task, RunMicrotasks,
+// FunctionCall, EvaluateScript, V8.Execute, V8.RunMicrotasks,
+// ThreadControllerImpl::RunTask -- which aren't per-page work).
+//
+// JS is outer, real Blink work is inner. This matches the actual stack
+// shape during a synchronous layout flush: a JS frame (e.g. findOverflow)
+// calls into a V8 binding (e.g. getBoundingClientRect), the binding
+// enters Blink, and Blink runs nested layout/style work (performLayout,
+// recalcStyle, ...) before returning. The leaf of the combined stack is
+// what's "actually running" at sample time: a JS function when V8 is
+// executing JS, a Blink event when V8 is idle inside the binding.
+//
+// Default mode: aggregate self-time by combined-stack leaf and print
+// top-N. Equivalent to bottom-up view in DevTools' Performance panel
+// when grouped by event/function name.
+//
+// --callees <label> mode: for every sample whose combined stack contains
+// <label>, attribute the next-deeper stack entry to <label> as a callee.
+// If <label> matches both a JS function name and a Blink event name in
+// the trace, both attributions are pooled (you're asking "what runs
+// inside this label" regardless of which axis the label lives on). The
+// synthetic "(self / unattributed)" row covers samples where <label> is
+// the leaf of the combined stack.
+
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+
+const args = process.argv.slice(2);
+let tracePath = null;
+let topN = 30;
+let minPct = 0.1;
+let threadName = 'CrRendererMain';
+let calleesOf = null;
+for (let i = 0; i < args.length; i++) {
+  const a = args[i];
+  if (a === '--top') topN = parseInt(args[++i], 10);
+  else if (a === '--min-pct') minPct = parseFloat(args[++i]);
+  else if (a === '--thread') threadName = args[++i];
+  else if (a === '--callees') calleesOf = args[++i];
+  else if (!tracePath) tracePath = a;
+}
+if (!tracePath) {
+  console.error('usage: node analyze-hybrid.mjs <path> [--top N] [--min-pct P] [--thread NAME] [--callees LABEL]');
+  process.exit(2);
+}
+tracePath = resolve(process.cwd(), tracePath);
+
+const trace = JSON.parse(readFileSync(tracePath, 'utf8'));
+const events = Array.isArray(trace) ? trace : (trace.traceEvents || []);
+
+// --- Thread / process metadata ----------------------------------------
+const threadKeyByName = new Map(); // name -> Set of `${pid}.${tid}`
+const threadNames = new Map();     // `${pid}.${tid}` -> name
+for (const e of events) {
+  if (e.ph !== 'M' || !e.args) continue;
+  if (e.name === 'thread_name' && e.args.name) {
+    const tk = `${e.pid}.${e.tid}`;
+    threadNames.set(tk, e.args.name);
+    if (!threadKeyByName.has(e.args.name)) threadKeyByName.set(e.args.name, new Set());
+    threadKeyByName.get(e.args.name).add(tk);
+  }
+}
+const mainThreadKeys = threadKeyByName.get(threadName);
+if (!mainThreadKeys || !mainThreadKeys.size) {
+  console.error(`no thread named "${threadName}". Threads present:`);
+  for (const [tk, name] of threadNames) console.error(`  ${name}  (${tk})`);
+  process.exit(3);
+}
+
+// --- Trace X-events on the target thread ------------------------------
+// We collect them as flat (ts, dur, name) records; the timeline walk
+// below reconstructs nesting via start/end ordering. "JS-entry" wrapper
+// events are dropped so they don't appear in the combined stack -- they
+// surround JS execution but aren't part of the work we want to attribute.
+const JS_WRAPPER_NAMES = new Set([
+  'RunTask',
+  'RunMicrotasks',
+  'FunctionCall',
+  'EvaluateScript',
+  'V8.Execute',
+  'V8.RunMicrotasks',
+  'Task',
+  'ThreadControllerImpl::RunTask',
+]);
+const mainEvents = [];
+for (const e of events) {
+  if (e.ph !== 'X' || typeof e.dur !== 'number' || e.dur <= 0) continue;
+  if (!mainThreadKeys.has(`${e.pid}.${e.tid}`)) continue;
+  if (JS_WRAPPER_NAMES.has(e.name)) continue;
+  mainEvents.push({ ts: e.ts, dur: e.dur, end: e.ts + e.dur, name: e.name });
+}
+
+// --- V8 cpu profile reconstruction ------------------------------------
+// Collect every Profile + ProfileChunk; pair by `id`. nodes accumulate
+// across chunks; samples and timeDeltas concatenate. Sample absolute
+// timestamps are Profile.startTime + cumulative sum of timeDeltas.
+const profilesById = new Map(); // id -> { startTime, nodes:Map(id->node), samples:[nodeId], deltas:[us] }
+for (const e of events) {
+  if (e.name !== 'Profile' && e.name !== 'ProfileChunk') continue;
+  const id = e.id || (e.args && e.args.id) || '0x1';
+  if (!profilesById.has(id)) {
+    profilesById.set(id, { startTime: null, nodes: new Map(), samples: [], deltas: [] });
+  }
+  const p = profilesById.get(id);
+  if (e.name === 'Profile') {
+    const d = e.args && e.args.data;
+    if (d && typeof d.startTime === 'number') p.startTime = d.startTime;
+    continue;
+  }
+  // ProfileChunk
+  const d = e.args && e.args.data;
+  if (!d) continue;
+  if (d.cpuProfile) {
+    if (Array.isArray(d.cpuProfile.nodes)) {
+      for (const n of d.cpuProfile.nodes) p.nodes.set(n.id, n);
+    }
+    if (Array.isArray(d.cpuProfile.samples)) {
+      for (const sid of d.cpuProfile.samples) p.samples.push(sid);
+    }
+  }
+  if (Array.isArray(d.timeDeltas)) {
+    for (const dt of d.timeDeltas) p.deltas.push(dt);
+  }
+}
+
+// Merge every profile found (in practice there is exactly one): samples
+// are pooled and node ids from later profiles overwrite earlier ones.
+const allSamples = []; // { ts, nodeId, deltaUs }
+let nodes = new Map();
+for (const p of profilesById.values()) {
+  for (const [k, v] of p.nodes) nodes.set(k, v);
+  if (p.startTime == null) continue;
+  let t = p.startTime;
+  for (let i = 0; i < p.samples.length; i++) {
+    const dt = p.deltas[i] || 0;
+    t += dt;
+    allSamples.push({ ts: t, nodeId: p.samples[i], deltaUs: dt });
+  }
+}
+allSamples.sort((a, b) => a.ts - b.ts);
+if (!allSamples.length) {
+  console.error('no cpu samples found in trace (was the disabled-by-default-v8.cpu_profiler category enabled at capture time?)');
+  process.exit(3);
+}
+
+// --- V8 node lineage cache --------------------------------------------
+// For each node id, the chain leaf->root of callFrames, filtered to drop
+// virtual frames so they don't appear as JS callees / leaves.
+const VIRTUAL_NAMES = new Set(['(root)', '(program)', '(idle)', '(garbage collector)', '']);
+const lineageCache = new Map(); // nodeId -> { jsFrames:[name leaf->root], rawLeaf:string }
+function lineageOf(id) { // memoized: node id -> { jsFrames: names leaf->root minus virtual frames, rawLeaf }
+  if (lineageCache.has(id)) return lineageCache.get(id);
+  const frames = [];
+  let leafName = null; // unfiltered name of the starting node, recorded even when it is virtual
+  let cur = id;
+  let guard = 0; // caps the parent walk at 4096 steps
+  while (cur != null && guard++ < 4096) {
+    const n = nodes.get(cur);
+    if (!n) break; // id not present in the node table: stop with what was collected
+    const cf = n.callFrame || {};
+    const fn = cf.functionName || '';
+    if (leafName == null) leafName = fn || '(anonymous)';
+    if (!VIRTUAL_NAMES.has(fn)) frames.push(fn || '(anonymous)'); // NOTE(review): '' is in VIRTUAL_NAMES, so unnamed frames are skipped and the fallback is never taken
+    cur = n.parent;
+  }
+  const out = { jsFrames: frames, rawLeaf: leafName || '(unknown)' }; // '(unknown)' only when the starting id had no node
+  lineageCache.set(id, out);
+  return out;
+}
+
+// --- Build per-sample enclosing event-stack ---------------------------
+// Walk a timeline of (start, end, sample) markers sorted by (ts, type)
+// where type order is end < start < sample within ties. The active stack
+// at each sample marker is the chain of enclosing events outer->inner;
+// store as a snapshot. Each snapshot is a fresh array; consecutive
+// snapshots could share storage but that isn't done here -- memory is
+// fine (~20k samples * ~8 events = ~160k refs, all small).
+const TIMELINE_END = 0, TIMELINE_START = 1, TIMELINE_SAMPLE = 2;
+const timeline = new Array(mainEvents.length * 2 + allSamples.length);
+let wi = 0;
+for (const ev of mainEvents) {
+  timeline[wi++] = { ts: ev.ts, type: TIMELINE_START, ev };
+  timeline[wi++] = { ts: ev.end, type: TIMELINE_END, ev };
+}
+for (const s of allSamples) {
+  timeline[wi++] = { ts: s.ts, type: TIMELINE_SAMPLE, s };
+}
+timeline.sort((a, b) => a.ts - b.ts || a.type - b.type);
+
+const activeStack = [];
+for (const item of timeline) {
+  if (item.type === TIMELINE_START) {
+    activeStack.push(item.ev);
+  } else if (item.type === TIMELINE_END) {
+    // Usually the top; if not (e.g. degenerate trace), pop by ref.
+    const top = activeStack[activeStack.length - 1];
+    if (top === item.ev) activeStack.pop();
+    else {
+      const idx = activeStack.lastIndexOf(item.ev);
+      if (idx >= 0) activeStack.splice(idx, 1);
+    }
+  } else {
+    // sample: snapshot stack (event name chain, outer->inner)
+    item.s.eventStack = activeStack.length
+      ? activeStack.map(e => e.name)
+      : null;
+  }
+}
+
+// --- Build combined hybrid stack per sample ---------------------------
+// hybridStack[i] = jsFrames (root->leaf) ++ eventStack (outer->inner)
+// All entries are plain strings. The leaf of this combined stack is
+// what self-time gets attributed to.
+function hybridStackFor(s) { // s: a cpu sample; reads s.nodeId and s.eventStack (null when no event was active)
+  const evStack = s.eventStack || [];
+  const lin = lineageOf(s.nodeId);
+  const jsRootToLeaf = lin.jsFrames.slice().reverse(); // copy before reversing: jsFrames is the cached leaf->root array
+  if (!evStack.length && !jsRootToLeaf.length) {
+    // Pure virtual sample with no enclosing non-wrapper event -- attribute
+    // to the raw leaf so (idle) / (program) still show up honestly.
+    return [lin.rawLeaf];
+  }
+  return jsRootToLeaf.concat(evStack); // JS frames root->leaf first, then events outer->inner
+}
+
+// --- Mode dispatch ----------------------------------------------------
+const fmt = (n, w) => n.toFixed(2).padStart(w);
+
+if (calleesOf) {
+  // Callees of `calleesOf`: for each sample whose hybrid stack contains
+  // calleesOf, find the next entry deeper in the stack and attribute
+  // sample's deltaUs to that name. If calleesOf is the leaf, attribute
+  // to (self / unattributed).
+  const byCallee = new Map(); // name -> { us, hits }
+  let parentUs = 0, parentHits = 0, parentSelfUs = 0;
+  for (const s of allSamples) {
+    const stack = hybridStackFor(s);
+    const idx = stack.lastIndexOf(calleesOf);
+    if (idx < 0) continue;
+    parentUs += s.deltaUs;
+    parentHits++;
+    if (idx === stack.length - 1) {
+      parentSelfUs += s.deltaUs;
+      continue;
+    }
+    const callee = stack[idx + 1];
+    const cur = byCallee.get(callee) || { us: 0, hits: 0 };
+    cur.us += s.deltaUs;
+    cur.hits++;
+    byCallee.set(callee, cur);
+  }
+  if (!parentHits) {
+    console.error(`no hybrid-stack frames matched "${calleesOf}". Try the default mode first to find label names.`);
+    process.exit(3);
+  }
+  const rows = [...byCallee.entries()].map(([name, v]) => ({
+    name, hits: v.hits, ms: v.us / 1000, pct: 100 * v.us / parentUs,
+  }));
+  rows.push({
+    name: '(self / unattributed)',
+    hits: parentHits,
+    ms: parentSelfUs / 1000,
+    pct: 100 * parentSelfUs / parentUs,
+  });
+  rows.sort((a, b) => b.ms - a.ms);
+  console.log(`trace:   ${tracePath}`);
+  console.log(`samples: ${allSamples.length}  events(${threadName}): ${mainEvents.length}`);
+  console.log(`parent:  ${calleesOf}  hits: ${parentHits}  total: ${(parentUs/1000).toFixed(2)}ms  self: ${(parentSelfUs/1000).toFixed(2)}ms (${(100*parentSelfUs/parentUs).toFixed(1)}%)`);
+  console.log(`direct callees, top ${topN} by total time (min ${minPct}% of parent total):`);
+  console.log('');
+  console.log('   total_ms  total_%     hits   callee');
+  console.log('   --------  -------   ------   ----------------------------------------------');
+  for (const r of rows.filter(r => r.pct >= minPct).slice(0, topN)) {
+    console.log(`  ${fmt(r.ms, 8)}   ${fmt(r.pct, 5)}%   ${String(r.hits).padStart(6)}   ${r.name}`);
+  }
+  process.exit(0);
+}
+
+// Default mode: bottom-up self-time by combined-stack leaf.
+const selfByLabel = new Map(); // name -> { us, kind: 'js' | 'event' | 'virtual' }
+let totalUs = 0;
+for (const s of allSamples) {
+  const stack = hybridStackFor(s);
+  const leaf = stack[stack.length - 1];
+  totalUs += s.deltaUs;
+  // Under [JS-root..leaf] ++ [Blink-outer..inner] ordering, the leaf came
+  // from event stack iff event stack is non-empty (events nest inside JS).
+  // Otherwise it's a pure JS leaf, or (if both are empty) the raw virtual.
+  let kind;
+  if (s.eventStack && s.eventStack.length) kind = 'event';
+  else if (lineageOf(s.nodeId).jsFrames.length) kind = 'js';
+  else kind = 'virtual';
+  const cur = selfByLabel.get(leaf) || { us: 0, kind };
+  cur.us += s.deltaUs;
+  // If we ever see js attribution for this label, prefer js (event names
+  // can coincidentally collide with JS function names, though it's rare).
+  if (kind === 'js' && cur.kind !== 'js') cur.kind = 'js';
+  selfByLabel.set(leaf, cur);
+}
+
+const rows = [...selfByLabel.entries()]
+  .map(([name, v]) => ({ name, ms: v.us / 1000, pct: 100 * v.us / totalUs, kind: v.kind }))
+  .sort((a, b) => b.ms - a.ms)
+  .filter(r => r.pct >= minPct)
+  .slice(0, topN);
+
+console.log(`trace:   ${tracePath}`);
+console.log(`samples: ${allSamples.length}  events(${threadName}): ${mainEvents.length}  span: ${((allSamples[allSamples.length-1].ts - allSamples[0].ts)/1e6).toFixed(2)}s`);
+console.log(`total self: ${(totalUs/1000).toFixed(2)}ms across ${selfByLabel.size} distinct labels`);
+console.log(`top ${topN} by self-time (min ${minPct}%):  [js]=JS function, [ev]=Blink/V8 event, [..]=virtual leaf`);
+console.log('');
+console.log('   self_ms   self_%   kind   label');
+console.log('   -------   ------   ----   ----------------------------------------------');
+for (const r of rows) {
+  const k = r.kind === 'js' ? '[js]' : r.kind === 'event' ? '[ev]' : '[..]';
+  console.log(`  ${fmt(r.ms, 8)}   ${fmt(r.pct, 5)}%   ${k}   ${r.name}`);
+}

From 4ce52897975a421ffd8afde20f3cdc3cf4b26c1d Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 16:13:53 +0200
Subject: [PATCH 08/18] Per-section CSS cost attribution via ab-css.mjs; defer
 pageRanges sharding.

---
 perf/.gitignore |   2 +
 perf/README.md  |  67 +++++++++
 perf/ab-css.mjs | 386 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 455 insertions(+)
 create mode 100644 perf/ab-css.mjs

diff --git a/perf/.gitignore b/perf/.gitignore
index fbca225..df01c96 100644
--- a/perf/.gitignore
+++ b/perf/.gitignore
@@ -1 +1,3 @@
 results/
+ab-css/
+ab-css-*/
diff --git a/perf/README.md b/perf/README.md
index 5fb34fc..5eb26e6 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -109,6 +109,7 @@ DevTools-compatible trace is a few lines.
 | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. |
 | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). |
 | `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees <label>` direct-callees for any name on either axis. Lets you walk a single causation chain from a JS function down through the Blink layout / style work it triggered via gBCR (`hasOverflow -> getBoundingClientRect -> Document::UpdateStyleAndLayout -> Blink.ForcedStyleAndLayout.UpdateTime -> ...`). |
+| `ab-css.mjs` | Per-section CSS cost attribution. Parses `docs/_site-pdf/assets/css/print.css` into themed sections by its `/* ---- ---- */` dividers, renders the book once per variant (full / minimal / each-section-dropped), and reports per-variant **CPU sample-time** totals (sum of V8 sample deltas) plus per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` totals. CPU sample-time is preemption-free and machine-load-independent, so single runs per variant are clean enough. Defaults to subtractive sweep; `--mode add` for additive. |
 | `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See the "Disabling the filter outright" section in this README for the methodology. |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
@@ -4968,3 +4969,69 @@ named children are unchanged: `UpdateStyleAndLayout`,
 `rebuildLayoutTree`. Same work, honest labels.
 
 Shipped.
+
+## `pageRanges` sharding: off the table for now
+
+Several sections above flag `pageRanges` sharding as
+"the biggest untried lever" for the `generate` phase --
+run `page.pdf()` N times over disjoint page ranges in
+parallel headless browsers, concatenate the resulting
+PDFs with pdf-lib, divide generate's ~43 s wall-clock by
+N. The arithmetic is appealing; the engineering isn't.
+
+A separate investigation (not in this repo) found enough
+pitfalls to make the work not worth pursuing at current
+scale. Sketch of what bit:
+
+- Each shard re-loads `book.html` and re-runs `paged.js`
+  rendering for *its* range, which means the per-shard
+  render is **not** 1/N of the original render -- paged.js
+  has to lay out all preceding pages to position the slice
+  correctly (named strings, counters, footnote numbering,
+  cross-references). Several "fixes" (skip-to-page hooks,
+  pre-rendered state injection) each broke in subtle ways
+  on the book's actual content.
+- PDF concatenation via pdf-lib reintroduces the full
+  `PDFDocument.load` cost the incremental writer avoided
+  -- need a streaming concatenator or qpdf binary
+  dependency to keep the process phase cheap.
+- Page numbers, named strings (`string(chapter-title)`),
+  and the running header rely on per-page state that the
+  Counters handler and `addEnvFunctions` rebuild from
+  document order. Sharding loses that order and breaks
+  the header on every shard boundary unless the per-shard
+  paged.js render is given the right starting state, which
+  is itself a research project.
+- Outline injection has to know cross-shard page numbers,
+  so either Chrome's native outline (which we don't ship)
+  or a post-concat outline rebuild is required.
+
+Net: even with aggressive engineering, the realistic win
+on a 1651-page book at N=4 shards is ~15-25 s of
+`generate` saved -- not the 32 s / 75 % the naive math
+suggests -- against a maintenance cost of a sharding
+harness that wraps puppeteer launch + IPC + pdf concat
++ per-shard state setup. Below the cost/benefit bar.
+
+The lever is documented in this README because it *is*
+the largest remaining target if priorities change (e.g.
+the book grows past 3000 pages, or a CI runtime cap
+forces it). It's just not the next thing to build.
+
+## What the next thing is, instead
+
+Render is at ~11 s on a 1651-page book, down from ~104 s
+in the original baseline. The bottom-up profile after
+all of the above changes shows no individual JS body
+above ~250 ms self-time; the dominant rows are native
+Blink work (`recalcStyle` 2.4 s, `performLayout` 2.2 s,
+`removeChild` 1.7 s) that's intrinsic to laying out and
+detaching 1651 pages of content.
+
+The A/B against a stripped print.css (next section)
+confirms `recalcStyle` is doing real work, not duplicate
+work -- and bounds what CSS pruning could save. Further
+optimization, if pursued, would mean selectively
+pruning rules from `print.css` based on per-section
+cost attribution. The `ab-css.mjs` tool measures that
+attribution.
diff --git a/perf/ab-css.mjs b/perf/ab-css.mjs
new file mode 100644
index 0000000..95bd8c8
--- /dev/null
+++ b/perf/ab-css.mjs
@@ -0,0 +1,386 @@
+// Per-section CSS cost attribution.
+//
+// Parses docs/_site-pdf/assets/css/print.css into themed sections by its
+// existing `/* ---- Section name ---- */` dividers, then renders the book
+// once per variant (full CSS, minimal-required CSS, and full minus each
+// individual section in turn -- or minimal plus each section in turn).
+// For each render we capture a hybrid trace and pull on-CPU time from
+// the embedded V8 cpu profile -- NOT wall-clock, which is too noisy at
+// single-run granularity. CPU sample-time is machine-load-independent
+// (preempted intervals don't sample), so one run per variant is enough.
+//
+// Output is a table of per-section deltas:
+//
+//   cpu_total_ms = sum of all V8 cpu sample deltas (whole render)
+//   recalc_ms    = sum of deltas where Document::recalcStyle is in the
+//                  hybrid stack (V8 lineage + Blink event nest)
+//   layout_ms    = same for LocalFrameView::performLayout
+//   Δ*           = baseline-full minus variant (subtract) or variant
+//                  minus baseline-minimal (add)
+//
+// Usage:
+//   node ab-css.mjs                # subtractive sweep, 1 run/variant
+//   node ab-css.mjs --mode add     # additive (minimal + 1 section)
+//   node ab-css.mjs --only typography,headings  # filter variants
+//   node ab-css.mjs --out my-run   # results folder name (default: ab-css)
+//
+// Always-kept sections (not dropped/added; required for paged.js to
+// paginate at roughly the right page count): preamble, "Page geometry,
+// running header, page numbers", "Chapter boundaries".
+
+import { readFileSync, writeFileSync, mkdirSync, rmSync } from 'node:fs';
+import { spawnSync } from 'node:child_process';
+import { resolve, join } from 'node:path';
+
+// ---- CLI -------------------------------------------------------------
+let mode = 'subtract';
+let outRoot = 'ab-css';
+let onlyFilter = null;
+const args = process.argv.slice(2);
+for (let i = 0; i < args.length; i++) {
+  if (args[i] === '--mode') mode = args[++i];
+  else if (args[i] === '--out') outRoot = args[++i];
+  else if (args[i] === '--only') onlyFilter = args[++i].split(',').map(s => s.trim().toLowerCase());
+  else if (args[i] === '-h' || args[i] === '--help') {
+    console.error('usage: node ab-css.mjs [--mode subtract|add] [--out DIR] [--only NAME[,NAME...]]');
+    process.exit(0);
+  } else {
+    console.error('unknown arg: ' + args[i]);
+    process.exit(2);
+  }
+}
+if (mode !== 'subtract' && mode !== 'add') {
+  console.error('--mode must be "subtract" or "add"');
+  process.exit(2);
+}
+
+// ---- File paths ------------------------------------------------------
+const SITE_PDF = resolve('../docs/_site-pdf');
+const PRINT_CSS_PATH = join(SITE_PDF, 'assets/css/print.css');
+const BOOK_HTML_PATH = join(SITE_PDF, 'book.html');
+const SWAP_CSS_PATH = join(SITE_PDF, 'assets/css/print-ab.css');
+const SWAP_HTML_PATH = join(SITE_PDF, 'book-ab.html');
+
+const PRINT_CSS = readFileSync(PRINT_CSS_PATH, 'utf8');
+const BOOK_HTML = readFileSync(BOOK_HTML_PATH, 'utf8');
+
+// ---- Parse print.css into sections -----------------------------------
+// Section divider: a comment whose body is `---- Name ----` (any number
+// of dashes on each side, name in between). Handles both single-line and
+// multi-line divider comments by anchoring on the `/* ----` prefix.
+const dividerRe = /\/\*\s*-{3,}\s*([^\-\n*][^\n*]*?)\s*-{3,}/g;
+const dividers = [];
+let m;
+while ((m = dividerRe.exec(PRINT_CSS))) dividers.push({ idx: m.index, name: m[1].trim() });
+if (dividers.length < 2) {
+  console.error('expected multiple `/* ---- Section ---- */` dividers in print.css; found ' + dividers.length);
+  process.exit(3);
+}
+const sections = [];
+if (dividers[0].idx > 0) sections.push({ name: '(preamble)', text: PRINT_CSS.slice(0, dividers[0].idx) });
+for (let i = 0; i < dividers.length; i++) {
+  const end = dividers[i + 1]?.idx ?? PRINT_CSS.length;
+  sections.push({ name: dividers[i].name, text: PRINT_CSS.slice(dividers[i].idx, end) });
+}
+console.error(`parsed ${sections.length} sections (${PRINT_CSS.length} bytes total):`);
+for (const s of sections) console.error(`  ${String(s.text.length).padStart(6)} bytes  ${s.name}`);
+console.error('');
+
+// Sections that are always kept (paged.js depends on them for pagination).
+const ALWAYS_KEEP = new Set([
+  '(preamble)',
+  'Page geometry, running header, page numbers',
+  'Chapter boundaries',
+]);
+
+// Slug for output dir naming.
+const slug = (s) => s.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '').slice(0, 40) || 'unnamed';
+
+// ---- Variant list ----------------------------------------------------
+// "baseline-full" + "baseline-minimal" run unconditionally for comparison.
+// Then, for each non-kept section, either drop it (subtract) or add it on
+// top of minimal (add).
+const variants = [];
+variants.push({ label: 'baseline-full',    keep: () => sections.map(s => s.name) });
+variants.push({ label: 'baseline-minimal', keep: () => sections.filter(s => ALWAYS_KEEP.has(s.name)).map(s => s.name) });
+for (const s of sections) {
+  if (ALWAYS_KEEP.has(s.name)) continue;
+  if (onlyFilter && !onlyFilter.some(f => s.name.toLowerCase().includes(f))) continue;
+  if (mode === 'subtract') {
+    variants.push({
+      label: 'drop-' + slug(s.name),
+      keep: () => sections.filter(x => x.name !== s.name).map(x => x.name),
+    });
+  } else {
+    variants.push({
+      label: 'add-' + slug(s.name),
+      keep: () => [...sections.filter(x => ALWAYS_KEEP.has(x.name)).map(x => x.name), s.name],
+    });
+  }
+}
+
+const cssFor = (keepNames) => sections.filter(s => keepNames.includes(s.name)).map(s => s.text).join('\n');
+const swappedHtml = BOOK_HTML.replace(
+  '<link rel="stylesheet" href="assets/css/print.css">',
+  '<link rel="stylesheet" href="assets/css/print-ab.css">',
+);
+if (swappedHtml === BOOK_HTML) {
+  console.error('failed to swap <link href=print.css> in book.html; aborting');
+  process.exit(3);
+}
+
+// ---- Render + measure ------------------------------------------------
+function runOnce(outDir) {
+  const r = spawnSync('node', [
+    'measure.mjs', SWAP_HTML_PATH,
+    '--detach-pages', '--no-timing', '--render-only', '--tracing',
+    '--out', outDir,
+  ], { stdio: ['ignore', 'pipe', 'pipe'] });
+  const err = r.stderr?.toString() ?? '';
+  if (r.status !== 0) {
+    console.error(r.stdout?.toString() || '');
+    console.error(err);
+    throw new Error('measure.mjs failed with status ' + r.status);
+  }
+}
+
+// Wrapper events that surround V8 execution; filtered from event-nest
+// reconstruction so they don't pollute "inner work" attribution.
+const JS_WRAPPER_NAMES = new Set([
+  'RunTask', 'RunMicrotasks', 'FunctionCall', 'EvaluateScript',
+  'V8.Execute', 'V8.RunMicrotasks', 'Task', 'ThreadControllerImpl::RunTask',
+]);
+// V8 virtual frames; filtered from JS lineage so they don't shadow named
+// Blink work in the hybrid stack.
+const V8_VIRTUAL = new Set(['(root)', '(program)', '(idle)', '(garbage collector)', '']);
+// Labels we want CPU-attribution for (any appearance in the hybrid stack
+// counts the sample for total-time semantics).
+const WANT_LABELS = new Set([
+  'Document::recalcStyle',
+  'LocalFrameView::performLayout',
+  'Document::UpdateStyleAndLayout',
+  'Document::rebuildLayoutTree',
+  'InlineNode::ShapeTextIncludingFirstLine',
+  'Blink.Style.UpdateTime',
+  'Blink.Layout.UpdateTime',
+]);
+
+function cpuStatsFromTrace(tracePath) {
+  const t = JSON.parse(readFileSync(tracePath, 'utf8'));
+  const events = Array.isArray(t) ? t : t.traceEvents;
+
+  // CrRendererMain thread key(s).
+  const mainKeys = new Set();
+  for (const e of events) {
+    if (e.ph === 'M' && e.name === 'thread_name' && e.args?.name === 'CrRendererMain') {
+      mainKeys.add(e.pid + '.' + e.tid);
+    }
+  }
+
+  // Main-thread X-events, minus JS-entry wrappers.
+  const mainEvents = [];
+  for (const e of events) {
+    if (e.ph !== 'X' || typeof e.dur !== 'number' || e.dur <= 0) continue;
+    if (!mainKeys.has(e.pid + '.' + e.tid)) continue;
+    if (JS_WRAPPER_NAMES.has(e.name)) continue;
+    mainEvents.push({ ts: e.ts, end: e.ts + e.dur, name: e.name });
+  }
+
+  // V8 cpu profile reconstruction.
+  const profiles = new Map();
+  for (const e of events) {
+    if (e.name !== 'Profile' && e.name !== 'ProfileChunk') continue;
+    const id = e.id || (e.args?.id) || '0x1';
+    if (!profiles.has(id)) profiles.set(id, { startTime: null, nodes: new Map(), samples: [], deltas: [] });
+    const p = profiles.get(id);
+    if (e.name === 'Profile') {
+      const d = e.args?.data;
+      if (d && typeof d.startTime === 'number') p.startTime = d.startTime;
+      continue;
+    }
+    const d = e.args?.data;
+    if (!d) continue;
+    if (d.cpuProfile?.nodes) for (const n of d.cpuProfile.nodes) p.nodes.set(n.id, n);
+    if (d.cpuProfile?.samples) for (const sid of d.cpuProfile.samples) p.samples.push(sid);
+    if (d.timeDeltas) for (const dt of d.timeDeltas) p.deltas.push(dt);
+  }
+  const allSamples = [];
+  const nodes = new Map();
+  for (const p of profiles.values()) {
+    for (const [k, v] of p.nodes) nodes.set(k, v);
+    if (p.startTime == null) continue;
+    let tcur = p.startTime;
+    for (let i = 0; i < p.samples.length; i++) {
+      const dt = p.deltas[i] || 0;
+      tcur += dt;
+      allSamples.push({ ts: tcur, nodeId: p.samples[i], deltaUs: dt });
+    }
+  }
+  allSamples.sort((a, b) => a.ts - b.ts);
+  if (!allSamples.length) throw new Error('no V8 cpu samples in trace (cpu_profiler category missing?)');
+
+  // Event-nest snapshot per sample via timeline merge: end < start < sample.
+  const TYPE_END = 0, TYPE_START = 1, TYPE_SAMPLE = 2;
+  const timeline = new Array(mainEvents.length * 2 + allSamples.length);
+  let wi = 0;
+  for (const ev of mainEvents) {
+    timeline[wi++] = { ts: ev.ts, type: TYPE_START, ev };
+    timeline[wi++] = { ts: ev.end, type: TYPE_END, ev };
+  }
+  for (const s of allSamples) timeline[wi++] = { ts: s.ts, type: TYPE_SAMPLE, s };
+  timeline.sort((a, b) => a.ts - b.ts || a.type - b.type);
+  const active = [];
+  for (const item of timeline) {
+    if (item.type === TYPE_START) active.push(item.ev);
+    else if (item.type === TYPE_END) {
+      const top = active[active.length - 1];
+      if (top === item.ev) active.pop();
+      else { const i = active.lastIndexOf(item.ev); if (i >= 0) active.splice(i, 1); }
+    } else {
+      item.s.eventStackNames = active.length ? active.map(e => e.name) : null;
+    }
+  }
+
+  // V8 lineage per node (filter virtual frames). Cached.
+  const lineageCache = new Map();
+  function lineageNamesOf(id) {
+    if (lineageCache.has(id)) return lineageCache.get(id);
+    const out = [];
+    let cur = id, g = 0;
+    while (cur != null && g++ < 4096) {
+      const n = nodes.get(cur);
+      if (!n) break;
+      const fn = n.callFrame?.functionName || '';
+      if (!V8_VIRTUAL.has(fn)) out.push(fn || '(anonymous)');
+      cur = n.parent;
+    }
+    lineageCache.set(id, out);
+    return out;
+  }
+
+  // Aggregate per-sample: total cpu + per-WANT_LABEL totals (total-time
+  // semantics: count once per sample if the label appears anywhere in
+  // the hybrid stack).
+  let totalUs = 0;
+  const labelUs = new Map();
+  for (const s of allSamples) {
+    totalUs += s.deltaUs;
+    const jsLineage = lineageNamesOf(s.nodeId);
+    const evStack = s.eventStackNames || [];
+    // hybrid stack = jsRootToLeaf ++ eventOuterToInner (we don't care
+    // about order for total-time semantics; just need set membership).
+    const seen = new Set();
+    for (const name of jsLineage) {
+      if (WANT_LABELS.has(name) && !seen.has(name)) {
+        seen.add(name);
+        labelUs.set(name, (labelUs.get(name) || 0) + s.deltaUs);
+      }
+    }
+    for (const name of evStack) {
+      if (WANT_LABELS.has(name) && !seen.has(name)) {
+        seen.add(name);
+        labelUs.set(name, (labelUs.get(name) || 0) + s.deltaUs);
+      }
+    }
+  }
+
+  return {
+    totalCpuUs: totalUs,
+    labelUs,
+    nSamples: allSamples.length,
+  };
+}
+
+// ---- Main loop -------------------------------------------------------
+const results = [];
+try {
+  for (const v of variants) {
+    const keep = v.keep();
+    const cssText = cssFor(keep);
+    const outDir = resolve(outRoot, v.label);
+    mkdirSync(outDir, { recursive: true });
+    writeFileSync(SWAP_CSS_PATH, cssText);
+    writeFileSync(SWAP_HTML_PATH, swappedHtml);
+    runOnce(outDir);
+    const stats = cpuStatsFromTrace(join(outDir, 'trace.json'));
+    results.push({ label: v.label, kept: keep, stats });
+    const totalMs = stats.totalCpuUs / 1000;
+    const recMs = (stats.labelUs.get('Document::recalcStyle') || 0) / 1000;
+    const layMs = (stats.labelUs.get('LocalFrameView::performLayout') || 0) / 1000;
+    console.error(`  ${v.label}: cpu_total=${totalMs.toFixed(0)}ms  recalc=${recMs.toFixed(0)}ms  layout=${layMs.toFixed(0)}ms  (${stats.nSamples} samples)`);
+  }
+} finally {
+  for (const p of [SWAP_CSS_PATH, SWAP_HTML_PATH]) {
+    try { rmSync(p, { force: true }); } catch {}
+  }
+}
+
+// ---- Report ----------------------------------------------------------
+const baseFull = results.find(r => r.label === 'baseline-full');
+const baseMin  = results.find(r => r.label === 'baseline-minimal');
+const baseFor  = mode === 'subtract' ? baseFull : baseMin;
+
+const ms = (us) => (us || 0) / 1000;
+const labelMs = (r, name) => ms(r.stats.labelUs.get(name));
+
+const H_LABEL = 38, H_NUM = 10;
+const hdr = (s, w) => String(s).padStart(w);
+
+console.log('');
+console.log(`mode=${mode}  delta-baseline=${baseFor?.label || 'none'}  (CPU sample-time from embedded V8 profile)`);
+console.log('');
+console.log(
+  'variant'.padEnd(H_LABEL) +
+  hdr('cpu_total', H_NUM) +
+  hdr('recalc', H_NUM) +
+  hdr('layout', H_NUM) +
+  hdr('rebuild', H_NUM) +
+  hdr('shape', H_NUM) +
+  hdr('Δtotal', H_NUM) +
+  hdr('Δrecalc', H_NUM)
+);
+console.log('-'.repeat(H_LABEL + H_NUM * 7));
+
+const sign = mode === 'subtract' ? -1 : 1;
+const baseTotal  = baseFor ? ms(baseFor.stats.totalCpuUs) : 0;
+const baseRecalc = baseFor ? labelMs(baseFor, 'Document::recalcStyle') : 0;
+const variantRows = results.filter(r => r !== baseFull && r !== baseMin);
+variantRows.sort((a, b) => {
+  const da = sign * (ms(a.stats.totalCpuUs) - baseTotal);
+  const db = sign * (ms(b.stats.totalCpuUs) - baseTotal);
+  return db - da;
+});
+const display = [baseFull, baseMin, ...variantRows].filter(Boolean);
+
+for (const r of display) {
+  const total  = ms(r.stats.totalCpuUs);
+  const recalc = labelMs(r, 'Document::recalcStyle');
+  const layout = labelMs(r, 'LocalFrameView::performLayout');
+  const rebuild = labelMs(r, 'Document::rebuildLayoutTree');
+  const shape   = labelMs(r, 'InlineNode::ShapeTextIncludingFirstLine');
+  let dTotal = '-', dRecalc = '-';
+  if (baseFor && r !== baseFor) {
+    dTotal  = (sign * (total - baseTotal)).toFixed(0);
+    dRecalc = (sign * (recalc - baseRecalc)).toFixed(0);
+  }
+  console.log(
+    r.label.padEnd(H_LABEL) +
+    hdr(total.toFixed(0), H_NUM) +
+    hdr(recalc.toFixed(0), H_NUM) +
+    hdr(layout.toFixed(0), H_NUM) +
+    hdr(rebuild.toFixed(0), H_NUM) +
+    hdr(shape.toFixed(0), H_NUM) +
+    hdr(dTotal, H_NUM) +
+    hdr(dRecalc, H_NUM)
+  );
+}
+
+console.log('');
+console.log(`All columns are CPU sample-time in ms (sum of V8 sample timeDeltas).`);
+console.log(`recalc / layout / rebuild / shape = total-time of each Blink event`);
+console.log(`  (any sample whose hybrid stack contains the label).`);
+console.log(`Δ columns = ${mode === 'subtract'
+  ? 'baseline-full MINUS variant  (positive = "this section costs about this much")'
+  : 'variant MINUS baseline-minimal  (positive = "adding this section adds about this much")'}`);
+console.log(`Always-kept sections: preamble + Page geometry + Chapter boundaries.`);
+console.log(`Per-variant traces saved under: ${resolve(outRoot)}/`);

From c08fc8688a819c6cc31375fc26ace7be9d3b4914 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 17:01:59 +0200
Subject: [PATCH 09/18] ab-css: paired diffs + Windows /affinity
 auto-relaunch for stable signal.

---
 perf/ab-css.mjs | 268 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 186 insertions(+), 82 deletions(-)

diff --git a/perf/ab-css.mjs b/perf/ab-css.mjs
index 95bd8c8..e84680e 100644
--- a/perf/ab-css.mjs
+++ b/perf/ab-css.mjs
@@ -2,53 +2,99 @@
 //
 // Parses docs/_site-pdf/assets/css/print.css into themed sections by its
 // existing `/* ---- Section name ---- */` dividers, then renders the book
-// once per variant (full CSS, minimal-required CSS, and full minus each
+// per variant (full CSS, minimal-required CSS, and full minus each
 // individual section in turn -- or minimal plus each section in turn).
 // For each render we capture a hybrid trace and pull on-CPU time from
 // the embedded V8 cpu profile -- NOT wall-clock, which is too noisy at
-// single-run granularity. CPU sample-time is machine-load-independent
-// (preempted intervals don't sample), so one run per variant is enough.
+// single-run granularity.
 //
-// Output is a table of per-section deltas:
+// **CPU sample-time on this machine has ~15-25 % single-run variance**,
+// large enough to drown per-section deltas below ~10 % of recalcStyle.
+// The tool defaults to 3 paired runs per variant interleaved with the
+// baseline (A B A B A B ...), then reports the mean paired difference
+// + SD across the N pairs. Paired differencing cancels machine-state
+// drift far better than independent runs, so 3 pairs gives reliable
+// rankings even when individual run-to-run variance is ~20 %.
 //
-//   cpu_total_ms = sum of all V8 cpu sample deltas (whole render)
-//   recalc_ms    = sum of deltas where Document::recalcStyle is in the
-//                  hybrid stack (V8 lineage + Blink event nest)
-//   layout_ms    = same for LocalFrameView::performLayout
-//   Δ*           = baseline-full minus variant (subtract) or variant
-//                  minus baseline-minimal (add)
+// Output columns:
+//
+//   total_cpu_ms = mean CPU sample-time for the variant (informational)
+//   Δrecalc_ms   = mean of (A.recalc - B.recalc) across paired runs;
+//                  positive = "this section costs about this much"
+//   Δrecalc_sd   = SD of the paired differences across the N pairs
+//   Δtotal_ms    = same for total CPU
 //
 // Usage:
-//   node ab-css.mjs                # subtractive sweep, 1 run/variant
+//   node ab-css.mjs                # subtractive sweep, 3 pairs/variant
+//   node ab-css.mjs --runs 5       # tighter SD if you can spare the time
 //   node ab-css.mjs --mode add     # additive (minimal + 1 section)
 //   node ab-css.mjs --only typography,headings  # filter variants
 //   node ab-css.mjs --out my-run   # results folder name (default: ab-css)
 //
-// Always-kept sections (not dropped/added; required for paged.js to
-// paginate at roughly the right page count): preamble, "Page geometry,
-// running header, page numbers", "Chapter boundaries".
+// Always-kept sections (required for paged.js to paginate at roughly
+// the right page count): preamble, "Page geometry, running header,
+// page numbers", "Chapter boundaries".
 
 import { readFileSync, writeFileSync, mkdirSync, rmSync } from 'node:fs';
 import { spawnSync } from 'node:child_process';
 import { resolve, join } from 'node:path';
 
+// ---- Windows: auto-relaunch with CPU affinity + High priority --------
+// CPU sample-time has ~15-25% single-run variance on a stock Windows dev
+// box where background processes share cores with our benchmark. Pinning
+// to a fixed subset of logical processors (and raising priority class)
+// cuts that to ~2-4%. `start /affinity HEX /high` is the simplest tool;
+// child processes (puppeteer's Chromium and its renderer / utility
+// children) inherit the mask from us.
+//
+// Default mask 0x5500 = LPs 8, 10, 12, 14 on Windows enumeration: on an
+// 8-core / 16-thread AMD Ryzen 7 (Zen 1..4) that's physical cores 4..7,
+// thread 0 of each pair only -- no SMT contention. We set the mask
+// explicitly rather than relying on the OS to balance. Override with
+// the AB_CSS_AFFINITY env var (any hex mask); pass --no-affinity to skip.
+if (process.platform === 'win32'
+    && !process.env.AB_CSS_PINNED
+    && !process.argv.includes('--no-affinity')) {
+  const mask = process.env.AB_CSS_AFFINITY || '5500';
+  const argv0 = process.argv[1];
+  const userArgs = process.argv.slice(2)
+    .map(a => /[\s"]/.test(a) ? `"${a.replace(/"/g, '\\"')}"` : a)
+    .join(' ');
+  console.error(`[ab-css] Re-launching with /affinity 0x${mask} /high to stabilise measurements.`);
+  console.error(`[ab-css] Override mask: AB_CSS_AFFINITY=<hex>. Skip pinning: --no-affinity.`);
+  // Note: empty "" after start is a window-title placeholder. Without
+  // it, start consumes the first quoted token as the title and corrupts
+  // the script path. shell:true so cmd.exe handles quoting (Node's CRT
+  // would otherwise escape the inner quotes and break start's parsing).
+  const cmdLine = `set AB_CSS_PINNED=1 && start "" /affinity ${mask} /high /wait /b node "${argv0}" ${userArgs}`;
+  const r = spawnSync(cmdLine, { shell: true, stdio: 'inherit' });
+  process.exit(r.status ?? 0);
+}
+if (process.env.AB_CSS_PINNED) {
+  console.error(`[ab-css] Running pinned (AB_CSS_PINNED=1).`);
+}
+
 // ---- CLI -------------------------------------------------------------
 let mode = 'subtract';
 let outRoot = 'ab-css';
 let onlyFilter = null;
+let pairs = 3;
 const args = process.argv.slice(2);
 for (let i = 0; i < args.length; i++) {
   if (args[i] === '--mode') mode = args[++i];
   else if (args[i] === '--out') outRoot = args[++i];
   else if (args[i] === '--only') onlyFilter = args[++i].split(',').map(s => s.trim().toLowerCase());
+  else if (args[i] === '--runs') pairs = parseInt(args[++i], 10);
+  else if (args[i] === '--no-affinity') { /* handled in the relaunch shim above */ }
   else if (args[i] === '-h' || args[i] === '--help') {
-    console.error('usage: node ab-css.mjs [--mode subtract|add] [--out DIR] [--only NAME[,NAME...]]');
+    console.error('usage: node ab-css.mjs [--runs N] [--mode subtract|add] [--out DIR] [--only NAME[,NAME...]]');
     process.exit(0);
   } else {
     console.error('unknown arg: ' + args[i]);
     process.exit(2);
   }
 }
+if (pairs < 1) { console.error('--runs must be >= 1'); process.exit(2); }
 if (mode !== 'subtract' && mode !== 'add') {
   console.error('--mode must be "subtract" or "add"');
   process.exit(2);
@@ -292,22 +338,52 @@ function cpuStatsFromTrace(tracePath) {
 }
 
 // ---- Main loop -------------------------------------------------------
-const results = [];
+// Paired interleaving: for each variant, capture N (A, variant) pairs
+// back-to-back (baseline render, then variant render). Paired
+// differences cancel machine-state drift far better than averaging
+// independent runs.
+const baselineLabel = mode === 'subtract' ? 'baseline-full' : 'baseline-minimal';
+const baseline = variants.find(v => v.label === baselineLabel);
+const others = variants.filter(v => v.label !== baselineLabel);
+// Hold the baseline's stats per pair-index; baseline is sampled once
+// per variant per pair (so it's re-measured under similar machine
+// state to each variant). This costs more runs but yields paired data.
+function runVariant(v, pairIdx) {
+  const dirName = pairs > 1 ? `${v.label}-r${pairIdx + 1}` : v.label;
+  const outDir = resolve(outRoot, dirName);
+  mkdirSync(outDir, { recursive: true });
+  writeFileSync(SWAP_CSS_PATH, cssFor(v.keep()));
+  writeFileSync(SWAP_HTML_PATH, swappedHtml);
+  runOnce(outDir);
+  return cpuStatsFromTrace(join(outDir, 'trace.json'));
+}
+
+const results = []; // entries: { label, baselineRuns, isBaseline: true } | { label, pairsData: [{ statsBase, statsVariant }] }
 try {
-  for (const v of variants) {
-    const keep = v.keep();
-    const cssText = cssFor(keep);
-    const outDir = resolve(outRoot, v.label);
-    mkdirSync(outDir, { recursive: true });
-    writeFileSync(SWAP_CSS_PATH, cssText);
-    writeFileSync(SWAP_HTML_PATH, swappedHtml);
-    runOnce(outDir);
-    const stats = cpuStatsFromTrace(join(outDir, 'trace.json'));
-    results.push({ label: v.label, kept: keep, stats });
-    const totalMs = stats.totalCpuUs / 1000;
-    const recMs = (stats.labelUs.get('Document::recalcStyle') || 0) / 1000;
-    const layMs = (stats.labelUs.get('LocalFrameView::performLayout') || 0) / 1000;
-    console.error(`  ${v.label}: cpu_total=${totalMs.toFixed(0)}ms  recalc=${recMs.toFixed(0)}ms  layout=${layMs.toFixed(0)}ms  (${stats.nSamples} samples)`);
+  // Run the baselines separately first (no pairing for baselines vs themselves).
+  const baselineRuns = [];
+  for (let p = 0; p < pairs; p++) {
+    const stats = runVariant(baseline, p);
+    baselineRuns.push(stats);
+    console.error(`  ${baseline.label} pair${p + 1}: cpu=${(stats.totalCpuUs/1000).toFixed(0)}ms recalc=${((stats.labelUs.get('Document::recalcStyle')||0)/1000).toFixed(0)}ms`);
+  }
+  results.push({ label: baseline.label, baselineRuns, isBaseline: true });
+
+  for (const v of others) {
+    const pairsData = [];
+    for (let p = 0; p < pairs; p++) {
+      // Re-measure baseline immediately before each variant pair to
+      // pair the two against the same machine state.
+      const statsBase = runVariant(baseline, p + pairs);  // separate dir
+      const statsVar  = runVariant(v, p);
+      pairsData.push({ statsBase, statsVariant: statsVar });
+      const dB = statsBase.totalCpuUs / 1000;
+      const dV = statsVar.totalCpuUs / 1000;
+      const rB = (statsBase.labelUs.get('Document::recalcStyle') || 0) / 1000;
+      const rV = (statsVar.labelUs.get('Document::recalcStyle') || 0) / 1000;
+      console.error(`  ${v.label} pair${p + 1}: baseline cpu=${dB.toFixed(0)}/recalc=${rB.toFixed(0)}  variant cpu=${dV.toFixed(0)}/recalc=${rV.toFixed(0)}  Δcpu=${(dB-dV).toFixed(0)} Δrecalc=${(rB-rV).toFixed(0)}`);
+    }
+    results.push({ label: v.label, pairsData });
   }
 } finally {
   for (const p of [SWAP_CSS_PATH, SWAP_HTML_PATH]) {
@@ -316,71 +392,99 @@ try {
 }
 
 // ---- Report ----------------------------------------------------------
-const baseFull = results.find(r => r.label === 'baseline-full');
-const baseMin  = results.find(r => r.label === 'baseline-minimal');
-const baseFor  = mode === 'subtract' ? baseFull : baseMin;
-
 const ms = (us) => (us || 0) / 1000;
-const labelMs = (r, name) => ms(r.stats.labelUs.get(name));
+const metricMs = (stats, name) => name === 'cpu_total' ? ms(stats.totalCpuUs) : ms(stats.labelUs.get(name));
+const METRICS = [
+  ['cpu_total',                              'cpu'],
+  ['Document::recalcStyle',                  'recalc'],
+  ['LocalFrameView::performLayout',          'layout'],
+  ['Document::rebuildLayoutTree',            'rebuild'],
+  ['InlineNode::ShapeTextIncludingFirstLine','shape'],
+];
+function meanSD(xs) {
+  const n = xs.length;
+  if (!n) return { mean: 0, sd: 0 };
+  const mean = xs.reduce((s, x) => s + x, 0) / n;
+  if (n < 2) return { mean, sd: 0 };
+  const sd = Math.sqrt(xs.reduce((s, x) => s + (x - mean) ** 2, 0) / (n - 1));
+  return { mean, sd };
+}
+
+const sign = mode === 'subtract' ? +1 : +1;  // NOTE(review): both branches are +1, and diffStats() does not apply it
+// diffStats() always computes the paired diff as (baseline - variant).
+// In subtract mode positive Δ means dropping the section saved time,
+// i.e. the section was costly. In add mode the baseline is
+// baseline-minimal, so a costly added section shows as a NEGATIVE Δ.
 
-const H_LABEL = 38, H_NUM = 10;
+const H_LABEL = 36, H_NUM = 11;
 const hdr = (s, w) => String(s).padStart(w);
 
 console.log('');
-console.log(`mode=${mode}  delta-baseline=${baseFor?.label || 'none'}  (CPU sample-time from embedded V8 profile)`);
+console.log(`mode=${mode}  pairs=${pairs}  (CPU sample-time from embedded V8 profile)`);
+console.log('Δ = mean of paired (baseline − variant) across N pairs, ± SD');
 console.log('');
-console.log(
-  'variant'.padEnd(H_LABEL) +
-  hdr('cpu_total', H_NUM) +
-  hdr('recalc', H_NUM) +
-  hdr('layout', H_NUM) +
-  hdr('rebuild', H_NUM) +
-  hdr('shape', H_NUM) +
-  hdr('Δtotal', H_NUM) +
-  hdr('Δrecalc', H_NUM)
-);
-console.log('-'.repeat(H_LABEL + H_NUM * 7));
 
-const sign = mode === 'subtract' ? -1 : 1;
-const baseTotal  = baseFor ? ms(baseFor.stats.totalCpuUs) : 0;
-const baseRecalc = baseFor ? labelMs(baseFor, 'Document::recalcStyle') : 0;
-const variantRows = results.filter(r => r !== baseFull && r !== baseMin);
+const variantRows = results.filter(r => !r.isBaseline);
+// Compute paired diffs per variant per metric.
+function diffStats(v) {
+  const out = new Map();
+  for (const [metric] of METRICS) {
+    const diffs = v.pairsData.map(p => metricMs(p.statsBase, metric) - metricMs(p.statsVariant, metric));
+    out.set(metric, meanSD(diffs));
+  }
+  // Also store mean of variant's own metric across pairs (informational).
+  const variantOwn = new Map();
+  for (const [metric] of METRICS) {
+    variantOwn.set(metric, meanSD(v.pairsData.map(p => metricMs(p.statsVariant, metric))));
+  }
+  return { diffs: out, own: variantOwn };
+}
+const variantStats = new Map(variantRows.map(v => [v.label, diffStats(v)]));
+
+// Sort by mean Δrecalc descending (largest claimed cost first).
 variantRows.sort((a, b) => {
-  const da = sign * (ms(a.stats.totalCpuUs) - baseTotal);
-  const db = sign * (ms(b.stats.totalCpuUs) - baseTotal);
+  const da = variantStats.get(a.label).diffs.get('Document::recalcStyle').mean;
+  const db = variantStats.get(b.label).diffs.get('Document::recalcStyle').mean;
   return db - da;
 });
-const display = [baseFull, baseMin, ...variantRows].filter(Boolean);
-
-for (const r of display) {
-  const total  = ms(r.stats.totalCpuUs);
-  const recalc = labelMs(r, 'Document::recalcStyle');
-  const layout = labelMs(r, 'LocalFrameView::performLayout');
-  const rebuild = labelMs(r, 'Document::rebuildLayoutTree');
-  const shape   = labelMs(r, 'InlineNode::ShapeTextIncludingFirstLine');
-  let dTotal = '-', dRecalc = '-';
-  if (baseFor && r !== baseFor) {
-    dTotal  = (sign * (total - baseTotal)).toFixed(0);
-    dRecalc = (sign * (recalc - baseRecalc)).toFixed(0);
-  }
+
+// Baseline row first (for context).
+const baselineResult = results.find(r => r.isBaseline);
+const baselineOwn = new Map();
+for (const [metric] of METRICS) {
+  baselineOwn.set(metric, meanSD(baselineResult.baselineRuns.map(s => metricMs(s, metric))));
+}
+
+console.log(
+  'variant'.padEnd(H_LABEL) +
+  METRICS.map(([_, short]) => hdr('Δ' + short, H_NUM)).join('') +
+  hdr('  ± Δrecalc SD', 16)
+);
+console.log('-'.repeat(H_LABEL + H_NUM * METRICS.length + 16));
+
+// Baseline row: own values, no Δ.
+console.log(
+  (baselineResult.label + ' (mean)').padEnd(H_LABEL) +
+  METRICS.map(([metric]) => hdr(baselineOwn.get(metric).mean.toFixed(0), H_NUM)).join('') +
+  ''
+);
+console.log(
+  (baselineResult.label + ' (SD)').padEnd(H_LABEL) +
+  METRICS.map(([metric]) => hdr(baselineOwn.get(metric).sd.toFixed(0), H_NUM)).join('') +
+  ''
+);
+
+for (const v of variantRows) {
+  const s = variantStats.get(v.label);
+  const recalcSD = s.diffs.get('Document::recalcStyle').sd;
   console.log(
-    r.label.padEnd(H_LABEL) +
-    hdr(total.toFixed(0), H_NUM) +
-    hdr(recalc.toFixed(0), H_NUM) +
-    hdr(layout.toFixed(0), H_NUM) +
-    hdr(rebuild.toFixed(0), H_NUM) +
-    hdr(shape.toFixed(0), H_NUM) +
-    hdr(dTotal, H_NUM) +
-    hdr(dRecalc, H_NUM)
+    v.label.padEnd(H_LABEL) +
+    METRICS.map(([metric]) => hdr(s.diffs.get(metric).mean.toFixed(0), H_NUM)).join('') +
+    hdr('± ' + recalcSD.toFixed(0), 16)
   );
 }
 
 console.log('');
-console.log(`All columns are CPU sample-time in ms (sum of V8 sample timeDeltas).`);
-console.log(`recalc / layout / rebuild / shape = total-time of each Blink event`);
-console.log(`  (any sample whose hybrid stack contains the label).`);
-console.log(`Δ columns = ${mode === 'subtract'
-  ? 'baseline-full MINUS variant  (positive = "this section costs about this much")'
-  : 'variant MINUS baseline-minimal  (positive = "adding this section adds about this much")'}`);
-console.log(`Always-kept sections: preamble + Page geometry + Chapter boundaries.`);
+console.log(`All numbers are CPU sample-time in ms (sum of V8 sample timeDeltas).`);
+console.log(`Variant Δrecalc < 2*SD is consistent with zero -- below the noise floor.`);
 console.log(`Per-variant traces saved under: ${resolve(outRoot)}/`);

From 4e4be3c1f665f16bde6b2432f9d95cc5d5a2c98f Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 17:17:11 +0200
Subject: [PATCH 10/18] ab-css: sweep rouge.css and print.css extras; document
 findings.

---
 perf/README.md  | 127 +++++++++++++++++++++++++++++++++++++----
 perf/ab-css.mjs | 149 ++++++++++++++++++++++++++----------------------
 2 files changed, 195 insertions(+), 81 deletions(-)

diff --git a/perf/README.md b/perf/README.md
index 5eb26e6..9ebbeeb 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -109,7 +109,7 @@ DevTools-compatible trace is a few lines.
 | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. |
 | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). |
 | `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees <label>` direct-callees for any name on either axis. Lets you walk a single causation chain from a JS function down through the Blink layout / style work it triggered via gBCR (`hasOverflow -> getBoundingClientRect -> Document::UpdateStyleAndLayout -> Blink.ForcedStyleAndLayout.UpdateTime -> ...`). |
-| `ab-css.mjs` | Per-section CSS cost attribution. Parses `docs/_site-pdf/assets/css/print.css` into themed sections by its `/* ---- ---- */` dividers, renders the book once per variant (full / minimal / each-section-dropped), and reports per-variant **CPU sample-time** totals (sum of V8 sample deltas) plus per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` totals. CPU sample-time is preemption-free and machine-load-independent, so single runs per variant are clean enough. Defaults to subtractive sweep; `--mode add` for additive. |
+| `ab-css.mjs` | CSS cost attribution for `docs/_site-pdf/assets/css/print.css` + `rouge.css`. Renders the book per variant (full / drop-rouge / drop-print-extras / baseline-minimal) and reports **paired-difference** CPU sample-time across N pairs (default 3), with the baseline re-measured immediately before each variant pair to cancel machine-state drift. Pulls per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` total time from the embedded V8 cpu profile in the hybrid trace; prints mean ± SD per variant so noise-floor rows are visible. On Windows it auto-relaunches under `start /affinity 0x5500 /high` (cores 4-7 physical, thread 0 only on an 8C16T AMD Ryzen 7) which cuts single-run variance from ~15-25 % to ~3 %. Optional `--per-print-section` adds one drop-print-`<section>` variant per `/* ---- ---- */` divider in print.css; individual sections of print.css turned out to be below the noise floor on this book, so off by default. |
 | `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See the "Disabling the filter outright" section in this README for the methodology. |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
@@ -5018,20 +5018,123 @@ the largest remaining target if priorities change (e.g.
 the book grows past 3000 pages, or a CI runtime cap
 forces it). It's just not the next thing to build.
 
-## What the next thing is, instead
+## CSS cost attribution
 
-Render is at ~11 s on a 1651-page book, down from ~104 s
+Render is at ~10 s on a 1651-page book, down from ~104 s
 in the original baseline. The bottom-up profile after
 all of the above changes shows no individual JS body
 above ~250 ms self-time; the dominant rows are native
 Blink work (`recalcStyle` 2.4 s, `performLayout` 2.2 s,
 `removeChild` 1.7 s) that's intrinsic to laying out and
-detaching 1651 pages of content.
-
-The A/B against a stripped print.css (next section)
-confirms `recalcStyle` is doing real work, not duplicate
-work -- and bounds what CSS pruning could save. Further
-optimization, if pursued, would mean selectively
-pruning rules from `print.css` based on per-section
-cost attribution. The `ab-css.mjs` tool measures that
-attribution.
+detaching 1651 pages of content. The remaining question:
+is any of that recalcStyle work *avoidable* via CSS
+pruning?
+
+`ab-css.mjs` automates the answer. It renders the book
+under four variants -- baseline-full (print.css +
+rouge.css), drop-rouge, drop-print-extras (only the
+always-kept Page-geometry + Chapter-boundaries sections
+of print.css), and baseline-minimal (both stripped) --
+then reports the **paired difference** of CPU sample-time
+(`Document::recalcStyle` total in particular) between
+baseline-full and each variant. Each pair runs the
+baseline immediately before the variant so machine-state
+drift cancels across the diff. On Windows the harness
+auto-relaunches itself under `start /affinity 0x5500
+/high` to pin to a fixed subset of cores, which on a
+Ryzen 7 cuts run-to-run variance from ~15-25 % to ~3 %.
+
+### Methodology calibration
+
+We learned the variance story the hard way. The first
+sweep used single runs per variant and CPU sample-time,
+on the theory that profile time would be machine-load-
+independent. It wasn't on this Windows dev box: four
+identical-content runs of baseline-full spanned
+9.47-16.89 s (the 16.89 was an outlier; even excluding
+it, the remaining three varied by ~12 %). At that noise
+floor, the per-section "drop-X saves N ms" rankings the
+tool was emitting were ~75 % noise. The fix had two
+parts:
+
+1. **CPU pinning via `start /affinity`** -- shipped as
+   the auto-relaunch shim in `ab-css.mjs`. Reduced
+   baseline SD on recalcStyle total from ~12-25 % to
+   ~3 %.
+2. **Paired interleaved measurement** -- run baseline
+   immediately before each variant, pair the two, take
+   the difference. Mean paired difference and SD across
+   N pairs let noise-floor rows show themselves honestly
+   (mean within ~2 σ of zero). Default N=3 pairs; bump
+   to `--runs 5` for tighter SD at the cost of wall
+   time.
+
+The original "stripping CSS saves ~740 ms" finding from
+a single manual A/B turned out to be partly real, partly
+noise, and partly confounded by what "minimal" meant.
+The manual A/B's "minimal" was just `@page` +
+`article{break-before:page}`; the tool's "baseline-
+minimal" keeps the preamble + Page-geometry +
+Chapter-boundaries sections (paged.js needs the
+string-set / @top-right / @bottom-right machinery for
+running headers and page numbers). The earlier signal
+was real, but spread across pieces the tool can and
+can't isolate.
+
+### Findings
+
+With pinning + paired diffs (3 pairs per variant):
+
+| variant | Δrecalc ms | ± SD | mean/SD | verdict |
+| --- | --- | --- | --- | --- |
+| **drop-print-extras** | **237** | **60** | **3.95** | **real signal** |
+| baseline-minimal | 193 | 246 | 0.78 | noise |
+| drop-rouge | 66 | 124 | 0.53 | noise |
+| (baseline-full mean) | 2038 | 108 SD | -- | reference |
+
+Read this as:
+
+- **print.css extras (everything beyond the always-kept
+  Page-geometry + Chapter-boundaries sections) contribute
+  ~237 ms of recalcStyle**: ~11 % of recalcStyle, ~2.4 %
+  of render. All three pairs gave Δrecalc 202, 307, 202 --
+  consistent direction and magnitude, ~4 σ from zero.
+- **rouge.css contribution is at the noise floor**
+  (66 ± 124 ms). The earlier hypothesis ("rouge.css is
+  the big spender via per-span cascade work in code
+  blocks") was wrong; the per-pair Δrecalc values were
+  38, 202, -42 -- variance too high to claim signal at
+  N=3.
+- **baseline-minimal** stripping both still lands inside
+  the noise band on this tool's run. The original manual
+  A/B's larger delta came from removing more than this
+  tool removes -- specifically the Page-geometry section
+  that the tool keeps.
+
+The per-section sweep behind `--per-print-section`
+confirmed the methodology lesson the hard way: when each
+print.css section is dropped individually, every Δrecalc
+lands within ~2 σ of zero. The 237 ms of print.css cost
+is structurally non-additive -- selectors interact in
+the cascade, the style sharing cache hits differently
+when rule count drops, and Blink's invalidation walks
+change shape based on what rules exist. Any single
+section's marginal contribution is too small to surface
+above ~60 ms of paired-diff noise; the sum-of-extras
+effect is the only real signal.
+
+### Where this leaves render
+
+Render is structurally near its floor. The biggest
+plausible CSS prune (drop-print-extras) saves ~240 ms of
+recalcStyle ≈ ~2.4 % of render, but would mean losing
+the typography that makes the PDF look like a book. The
+remaining levers all live outside render:
+
+- `pageRanges` sharding (~5-20 s in generate): off the
+  table for now (see previous section).
+- Chrome's `outline: true` (~5 s in process): one
+  `role="presentation"` preprocessor pass away from
+  shipping, but not pursued.
+
+No structurally promising next target inside render.
diff --git a/perf/ab-css.mjs b/perf/ab-css.mjs
index e84680e..1851719 100644
--- a/perf/ab-css.mjs
+++ b/perf/ab-css.mjs
@@ -1,39 +1,39 @@
-// Per-section CSS cost attribution.
+// CSS cost attribution: print.css extras + rouge.css.
 //
-// Parses docs/_site-pdf/assets/css/print.css into themed sections by its
-// existing `/* ---- Section name ---- */` dividers, then renders the book
-// per variant (full CSS, minimal-required CSS, and full minus each
-// individual section in turn -- or minimal plus each section in turn).
-// For each render we capture a hybrid trace and pull on-CPU time from
-// the embedded V8 cpu profile -- NOT wall-clock, which is too noisy at
-// single-run granularity.
+// Renders the book per variant, capturing a hybrid trace and pulling
+// on-CPU time from the embedded V8 cpu profile (NOT wall-clock, which
+// is too noisy at single-run granularity). For each metric of interest
+// (cpu_total, recalcStyle, performLayout, ...) the tool computes the
+// mean paired difference (baseline - variant) across N pairs and its
+// SD. Paired differencing + Windows /affinity pinning (auto-relaunch
+// below) brings per-pair variance down to ~3 % of baseline.
 //
-// **CPU sample-time on this machine has ~15-25 % single-run variance**,
-// large enough to drown per-section deltas below ~10 % of recalcStyle.
-// The tool defaults to 3 paired runs per variant interleaved with the
-// baseline (A B A B A B ...), then reports the mean paired difference
-// + SD across the N pairs. Paired differencing cancels machine-state
-// drift far better than independent runs, so 3 pairs gives reliable
-// rankings even when individual run-to-run variance is ~20 %.
+// **Default variants** (always run):
+//   baseline-full       = print.css (all sections) + rouge.css
+//   drop-rouge          = print.css (all sections); no rouge.css
+//   drop-print-extras   = print.css (always-kept sections only) + rouge.css
+//   baseline-minimal    = print.css (always-kept sections only); no rouge.css
 //
-// Output columns:
+// "Always-kept" print.css sections (paged.js needs them to paginate at
+// the right page count): preamble + "Page geometry, running header,
+// page numbers" + "Chapter boundaries".
 //
-//   total_cpu_ms = mean CPU sample-time for the variant (informational)
-//   Δrecalc_ms   = mean of (A.recalc - B.recalc) across paired runs;
-//                  positive = "this section costs about this much"
-//   Δrecalc_sd   = SD of the paired differences across the N pairs
-//   Δtotal_ms    = same for total CPU
+// With these four variants the pairwise differences reveal:
+//   baseline-full - drop-rouge        = rouge.css contribution
+//   baseline-full - drop-print-extras = print.css extras contribution
+//   baseline-full - baseline-minimal  = total CSS contribution
 //
-// Usage:
-//   node ab-css.mjs                # subtractive sweep, 3 pairs/variant
-//   node ab-css.mjs --runs 5       # tighter SD if you can spare the time
-//   node ab-css.mjs --mode add     # additive (minimal + 1 section)
-//   node ab-css.mjs --only typography,headings  # filter variants
-//   node ab-css.mjs --out my-run   # results folder name (default: ab-css)
+// **Optional per-section print.css sweep** (`--per-print-section`):
+// adds one drop-print-<section> variant per `/* ---- Section ---- */`
+// divider in print.css. Slower; previous runs showed all per-section
+// deltas below the noise floor for this book, so off by default.
 //
-// Always-kept sections (required for paged.js to paginate at roughly
-// the right page count): preamble, "Page geometry, running header,
-// page numbers", "Chapter boundaries".
+// Usage:
+//   node ab-css.mjs                       # 4 variants, 3 pairs each
+//   node ab-css.mjs --runs 5              # tighter SD, longer wall time
+//   node ab-css.mjs --per-print-section   # also sweep each print.css section
+//   node ab-css.mjs --out my-run          # results folder (default: ab-css)
+//   node ab-css.mjs --no-affinity         # skip Windows CPU pinning
 
 import { readFileSync, writeFileSync, mkdirSync, rmSync } from 'node:fs';
 import { spawnSync } from 'node:child_process';
@@ -75,19 +75,23 @@ if (process.env.AB_CSS_PINNED) {
 }
 
 // ---- CLI -------------------------------------------------------------
-let mode = 'subtract';
 let outRoot = 'ab-css';
-let onlyFilter = null;
 let pairs = 3;
+let perPrintSection = false;
 const args = process.argv.slice(2);
 for (let i = 0; i < args.length; i++) {
-  if (args[i] === '--mode') mode = args[++i];
-  else if (args[i] === '--out') outRoot = args[++i];
-  else if (args[i] === '--only') onlyFilter = args[++i].split(',').map(s => s.trim().toLowerCase());
+  if (args[i] === '--out') outRoot = args[++i];
   else if (args[i] === '--runs') pairs = parseInt(args[++i], 10);
+  else if (args[i] === '--per-print-section') perPrintSection = true;
   else if (args[i] === '--no-affinity') { /* handled in the relaunch shim above */ }
   else if (args[i] === '-h' || args[i] === '--help') {
-    console.error('usage: node ab-css.mjs [--runs N] [--mode subtract|add] [--out DIR] [--only NAME[,NAME...]]');
+    console.error('usage: node ab-css.mjs [--runs N] [--out DIR] [--per-print-section]');
+    console.error('');
+    console.error('  Default: 3 top-level variants per stylesheet (baseline-full,');
+    console.error('  drop-rouge, drop-print-extras, baseline-minimal). Run with');
+    console.error('  --per-print-section to additionally sweep each /* ---- ---- */');
+    console.error('  section of print.css (slower; per-section deltas tend to be');
+    console.error('  below the noise floor on this book).');
     process.exit(0);
   } else {
     console.error('unknown arg: ' + args[i]);
@@ -95,19 +99,21 @@ for (let i = 0; i < args.length; i++) {
   }
 }
 if (pairs < 1) { console.error('--runs must be >= 1'); process.exit(2); }
-if (mode !== 'subtract' && mode !== 'add') {
-  console.error('--mode must be "subtract" or "add"');
-  process.exit(2);
-}
 
 // ---- File paths ------------------------------------------------------
 const SITE_PDF = resolve('../docs/_site-pdf');
 const PRINT_CSS_PATH = join(SITE_PDF, 'assets/css/print.css');
+const ROUGE_CSS_PATH = join(SITE_PDF, 'assets/css/rouge.css');
 const BOOK_HTML_PATH = join(SITE_PDF, 'book.html');
+// Single generated CSS that book-ab.html links to. Per-variant we write
+// it with whatever combination of print.css sections + rouge.css we want
+// to test; book-ab.html drops the rouge.css link, so the only stylesheet
+// the document loads is print-ab.css.
 const SWAP_CSS_PATH = join(SITE_PDF, 'assets/css/print-ab.css');
 const SWAP_HTML_PATH = join(SITE_PDF, 'book-ab.html');
 
 const PRINT_CSS = readFileSync(PRINT_CSS_PATH, 'utf8');
+const ROUGE_CSS = readFileSync(ROUGE_CSS_PATH, 'utf8');
 const BOOK_HTML = readFileSync(BOOK_HTML_PATH, 'utf8');
 
 // ---- Parse print.css into sections -----------------------------------
@@ -143,33 +149,41 @@ const ALWAYS_KEEP = new Set([
 const slug = (s) => s.toLowerCase().replace(/[^a-z0-9]+/g, '-').replace(/^-|-$/g, '').slice(0, 40) || 'unnamed';
 
 // ---- Variant list ----------------------------------------------------
-// "baseline-full" + "baseline-minimal" run unconditionally for comparison.
-// Then, for each non-kept section, either drop it (subtract) or add it on
-// top of minimal (add).
+// Each variant carries a `build()` that returns the full CSS string to
+// be written into print-ab.css for that run. Always-kept print.css
+// sections come from `sections.filter(s => ALWAYS_KEEP.has(s.name))`;
+// "extras" means print.css minus always-kept.
+const printAll       = sections.map(s => s.text).join('\n');
+const printMinimal   = sections.filter(s => ALWAYS_KEEP.has(s.name)).map(s => s.text).join('\n');
+const ROUGE_HEADER   = '\n/* ---- rouge.css inlined (concatenated by ab-css.mjs) ---- */\n';
+
 const variants = [];
-variants.push({ label: 'baseline-full',    keep: () => sections.map(s => s.name) });
-variants.push({ label: 'baseline-minimal', keep: () => sections.filter(s => ALWAYS_KEEP.has(s.name)).map(s => s.name) });
-for (const s of sections) {
-  if (ALWAYS_KEEP.has(s.name)) continue;
-  if (onlyFilter && !onlyFilter.some(f => s.name.toLowerCase().includes(f))) continue;
-  if (mode === 'subtract') {
+// Top-level variants -- always run.
+variants.push({ label: 'baseline-full',       build: () => printAll     + ROUGE_HEADER + ROUGE_CSS });
+variants.push({ label: 'drop-rouge',          build: () => printAll });
+variants.push({ label: 'drop-print-extras',   build: () => printMinimal + ROUGE_HEADER + ROUGE_CSS });
+variants.push({ label: 'baseline-minimal',    build: () => printMinimal });
+
+// Optional per-section print.css sweep (opt-in via --per-print-section).
+// Each drop-<section> keeps full rouge.css and full print.css minus the
+// named section.
+if (perPrintSection) {
+  for (const s of sections) {
+    if (ALWAYS_KEEP.has(s.name)) continue;
     variants.push({
-      label: 'drop-' + slug(s.name),
-      keep: () => sections.filter(x => x.name !== s.name).map(x => x.name),
-    });
-  } else {
-    variants.push({
-      label: 'add-' + slug(s.name),
-      keep: () => [...sections.filter(x => ALWAYS_KEEP.has(x.name)).map(x => x.name), s.name],
+      label: 'drop-print-' + slug(s.name),
+      build: () => sections.filter(x => x.name !== s.name).map(x => x.text).join('\n') + ROUGE_HEADER + ROUGE_CSS,
     });
   }
 }
 
-const cssFor = (keepNames) => sections.filter(s => keepNames.includes(s.name)).map(s => s.text).join('\n');
-const swappedHtml = BOOK_HTML.replace(
-  '<link rel="stylesheet" href="assets/css/print.css">',
-  '<link rel="stylesheet" href="assets/css/print-ab.css">',
-);
+// Swap book.html: replace the print.css link with print-ab.css, and
+// drop the rouge.css link (its content is inlined into print-ab.css
+// when the variant calls for it).
+let swappedHtml = BOOK_HTML
+  .replace('<link rel="stylesheet" href="assets/css/print.css">',
+           '<link rel="stylesheet" href="assets/css/print-ab.css">')
+  .replace(/\s*<link rel="stylesheet" href="assets\/css\/rouge\.css">/, '');
 if (swappedHtml === BOOK_HTML) {
   console.error('failed to swap <link href=print.css> in book.html; aborting');
   process.exit(3);
@@ -342,7 +356,7 @@ function cpuStatsFromTrace(tracePath) {
 // back-to-back (baseline render, then variant render). Paired
 // differences cancel machine-state drift far better than averaging
 // independent runs.
-const baselineLabel = mode === 'subtract' ? 'baseline-full' : 'baseline-minimal';
+const baselineLabel = 'baseline-full';
 const baseline = variants.find(v => v.label === baselineLabel);
 const others = variants.filter(v => v.label !== baselineLabel);
 // Hold the baseline's stats per pair-index; baseline is sampled once
@@ -352,7 +366,7 @@ function runVariant(v, pairIdx) {
   const dirName = pairs > 1 ? `${v.label}-r${pairIdx + 1}` : v.label;
   const outDir = resolve(outRoot, dirName);
   mkdirSync(outDir, { recursive: true });
-  writeFileSync(SWAP_CSS_PATH, cssFor(v.keep()));
+  writeFileSync(SWAP_CSS_PATH, v.build());
   writeFileSync(SWAP_HTML_PATH, swappedHtml);
   runOnce(outDir);
   return cpuStatsFromTrace(join(outDir, 'trace.json'));
@@ -410,17 +424,14 @@ function meanSD(xs) {
   return { mean, sd };
 }
 
-const sign = mode === 'subtract' ? +1 : +1;  // both modes: positive Δ = "this section costs this much"
-// In subtract mode the paired diff is (baseline - variant); positive means
-// dropping the section saved time, i.e. the section was costly.
-// In add mode the paired diff is (variant - minimal); positive means
-// adding the section costs this much. Same sign convention either way.
+// Paired diff is (baseline - variant). Positive Δ means dropping the
+// thing saved time, i.e. that thing was costly.
 
 const H_LABEL = 36, H_NUM = 11;
 const hdr = (s, w) => String(s).padStart(w);
 
 console.log('');
-console.log(`mode=${mode}  pairs=${pairs}  (CPU sample-time from embedded V8 profile)`);
+console.log(`pairs=${pairs}  per-print-section=${perPrintSection}  (CPU sample-time from embedded V8 profile)`);
 console.log('Δ = mean of paired (baseline − variant) across N pairs, ± SD');
 console.log('');
 

From df395efddfc37029d91e8d18745c4d4d311d1ec6 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 17:24:25 +0200
Subject: [PATCH 11/18] Extract pin-cpu.mjs; auto-pin measure, profile-load,
 profile-roundtrip on Windows.

---
 perf/README.md             |  9 +++---
 perf/ab-css.mjs            | 40 ++++---------------------
 perf/measure.mjs           |  7 +++++
 perf/pin-cpu.mjs           | 60 ++++++++++++++++++++++++++++++++++++++
 perf/profile-load.mjs      |  5 ++++
 perf/profile-roundtrip.mjs |  5 ++++
 6 files changed, 88 insertions(+), 38 deletions(-)
 create mode 100644 perf/pin-cpu.mjs

diff --git a/perf/README.md b/perf/README.md
index 9ebbeeb..47af3e4 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -93,7 +93,8 @@ DevTools-compatible trace is a few lines.
 | File | Role |
 | --- | --- |
 | `package.json` | Pins `puppeteer` + `pdf-lib` + `html-entities` (the same direct deps `docs/` uses). |
-| `measure.mjs` | Puppeteer harness. Drives the same flow as `docs/render-book.mjs` (loads the vendored paged.js bundle, runs `PagedPolyfill.preview()`, calls `page.pdf()`, then either the pdf-lib roundtrip or the incremental writer), with optional CPU profiling, in-page handler injection, and DOM-accessor instrumentation. |
+| `measure.mjs` | Puppeteer harness. Drives the same flow as `docs/render-book.mjs` (loads the vendored paged.js bundle, runs `PagedPolyfill.preview()`, calls `page.pdf()`, then either the pdf-lib roundtrip or the incremental writer), with optional CPU profiling, in-page handler injection, and DOM-accessor instrumentation. Auto-pins to a fixed core mask on Windows via `pin-cpu.mjs` (see below) for stable measurements; pass `--no-affinity` to opt out. |
+| `pin-cpu.mjs` | Shared shim used by `measure.mjs`, `profile-load.mjs`, `profile-roundtrip.mjs`, and `ab-css.mjs`. On Windows, auto-relaunches the parent Node process under `start /affinity 0x5500 /high` (cores 4-7 physical, thread 0 each, on an 8C16T AMD Ryzen 7) so puppeteer's Chromium children inherit the mask + priority at spawn time. Reduces single-run CPU sample-time variance from ~15-25 % on a stock dev box to ~3 %. No-op on non-Windows; opt out per-invocation with `--no-affinity` or `PERF_PINNED=1`; override mask with `PERF_AFFINITY=<hex>`. |
 | `timing-handler.js` | `Paged.Handler` that records per-page wall time + heap into `window.__pagedTiming` and streams a line per page to the console. Always injected. |
 | `detach-pages.js` | `Paged.Handler` that hides each completed page from the layout tree (registered against `finalizePage`). The fix. Injected by `--detach-pages` and by `docs/book.bat`. |
 | `instrument-flush-ops.js` | Wraps `getComputedStyle`, `getBoundingClientRect`, and the `offsetWidth` / `clientWidth` / `scrollWidth` family with counters + per-call timing. Injected by `--instrument`. |
@@ -101,15 +102,15 @@ DevTools-compatible trace is a few lines.
 | `instrument-clones.js` | Wraps `Layout.prototype.append` to tag every source-walker clone, then walks each finalized page at `finalizePage` counting tagged survivors. Reports total appendCalls vs. survivors and the per-page overshoot distribution -- the share of clones rolled back by `removeOverflow`. Requires a one-line `window.PagedLayout = Layout` patch near the bottom of `docs/lib/paged.browser.js` (it's a private class otherwise). Injected by `--clone-count`. |
 | `incremental-pdf.mjs` | Replaces the pdf-lib load+save roundtrip with a PDF 1.7 §7.5.6 incremental update appended to Chrome's bytes. Used by `--incremental`. |
 | `test-incremental.mjs` | Smoke test for `incremental-pdf.mjs`: renders a tiny probe page, runs the writer, verifies the result parses (via pdf-lib re-load) and that outline + metadata land correctly. |
-| `profile-load.mjs` | Standalone profiler for `PDFDocument.load`. Runs the load on a chosen PDF with a chosen `parseSpeed`; intended to be run under `node --cpu-prof`. |
-| `profile-roundtrip.mjs` | Times the full pdf-lib `load + save` roundtrip across the three `parseSpeed` / `objectsPerTick` settings on a chosen PDF. |
+| `profile-load.mjs` | Standalone profiler for `PDFDocument.load`. Runs the load on a chosen PDF with a chosen `parseSpeed`; intended to be run under `node --cpu-prof`. Auto-pins on Windows via `pin-cpu.mjs`. |
+| `profile-roundtrip.mjs` | Times the full pdf-lib `load + save` roundtrip across the three `parseSpeed` / `objectsPerTick` settings on a chosen PDF. Auto-pins on Windows via `pin-cpu.mjs`. |
 | `probe-chrome-outline.mjs` | Renders a synthetic multi-level h1..h6 document via Chrome's `outline: true` and dumps the resulting `/Outlines` tree. Quick check that the CDP flag is wired correctly in the local Chromium / puppeteer combo. |
 | `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. |
 | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
 | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. |
 | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). |
 | `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees <label>` direct-callees for any name on either axis. Lets you walk a single causation chain from a JS function down through the Blink layout / style work it triggered via gBCR (`hasOverflow -> getBoundingClientRect -> Document::UpdateStyleAndLayout -> Blink.ForcedStyleAndLayout.UpdateTime -> ...`). |
-| `ab-css.mjs` | CSS cost attribution for `docs/_site-pdf/assets/css/print.css` + `rouge.css`. Renders the book per variant (full / drop-rouge / drop-print-extras / baseline-minimal) and reports **paired-difference** CPU sample-time across N pairs (default 3), with the baseline re-measured immediately before each variant pair to cancel machine-state drift. Pulls per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` total time from the embedded V8 cpu profile in the hybrid trace; prints mean ± SD per variant so noise-floor rows are visible. On Windows it auto-relaunches under `start /affinity 0x5500 /high` (cores 4-7 physical, thread 0 only on an 8C16T AMD Ryzen 7) which cuts single-run variance from ~15-25 % to ~3 %. Optional `--per-print-section` adds one drop-print-`<section>` variant per `/* ---- ---- */` divider in print.css; individual sections of print.css turned out to be below the noise floor on this book, so off by default. |
+| `ab-css.mjs` | CSS cost attribution for `docs/_site-pdf/assets/css/print.css` + `rouge.css`. Renders the book per variant (full / drop-rouge / drop-print-extras / baseline-minimal) and reports **paired-difference** CPU sample-time across N pairs (default 3), with the baseline re-measured immediately before each variant pair to cancel machine-state drift. Pulls per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` total time from the embedded V8 cpu profile in the hybrid trace; prints mean ± SD per variant so noise-floor rows are visible. Auto-pins on Windows via `pin-cpu.mjs`. Optional `--per-print-section` adds one drop-print-`<section>` variant per `/* ---- ---- */` divider in print.css; individual sections of print.css turned out to be below the noise floor on this book, so off by default. |
 | `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See the "Disabling the filter outright" section in this README for the methodology. |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
diff --git a/perf/ab-css.mjs b/perf/ab-css.mjs
index 1851719..eb310fb 100644
--- a/perf/ab-css.mjs
+++ b/perf/ab-css.mjs
@@ -38,41 +38,13 @@
 import { readFileSync, writeFileSync, mkdirSync, rmSync } from 'node:fs';
 import { spawnSync } from 'node:child_process';
 import { resolve, join } from 'node:path';
+import { pinCpuIfWindows } from './pin-cpu.mjs';
 
-// ---- Windows: auto-relaunch with CPU affinity + High priority --------
-// CPU sample-time has ~15-25% single-run variance on a stock Windows dev
-// box where background processes share cores with our benchmark. Pinning
-// to a fixed subset of logical processors (and raising priority class)
-// cuts that to ~2-4%. `start /affinity HEX /high` is the simplest tool;
-// child processes (puppeteer's Chromium and its renderer / utility
-// children) inherit the mask from us.
-//
-// Default mask 0x5500 = LPs 8, 10, 12, 14 on Windows enumeration: on an
-// 8-core / 16-thread AMD Ryzen 7 (Zen 1..4) that's physical cores 4..7,
-// thread 0 of each pair only -- no SMT contention. Sets it explicitly
-// rather than relying on the OS to balance. Override with the
-// AB_CSS_AFFINITY env var (any hex mask); set --no-affinity to skip.
-if (process.platform === 'win32'
-    && !process.env.AB_CSS_PINNED
-    && !process.argv.includes('--no-affinity')) {
-  const mask = process.env.AB_CSS_AFFINITY || '5500';
-  const argv0 = process.argv[1];
-  const userArgs = process.argv.slice(2)
-    .map(a => /[\s"]/.test(a) ? `"${a.replace(/"/g, '\\"')}"` : a)
-    .join(' ');
-  console.error(`[ab-css] Re-launching with /affinity 0x${mask} /high to stabilise measurements.`);
-  console.error(`[ab-css] Override mask: AB_CSS_AFFINITY=<hex>. Skip pinning: --no-affinity.`);
-  // Note: empty "" after start is a window-title placeholder. Without
-  // it, start consumes the first quoted token as the title and corrupts
-  // the script path. shell:true so cmd.exe handles quoting (Node's CRT
-  // would otherwise escape the inner quotes and break start's parsing).
-  const cmdLine = `set AB_CSS_PINNED=1 && start "" /affinity ${mask} /high /wait /b node "${argv0}" ${userArgs}`;
-  const r = spawnSync(cmdLine, { shell: true, stdio: 'inherit' });
-  process.exit(r.status ?? 0);
-}
-if (process.env.AB_CSS_PINNED) {
-  console.error(`[ab-css] Running pinned (AB_CSS_PINNED=1).`);
-}
+// On Windows, re-launch under `start /affinity 0x5500 /high` to stabilise
+// CPU sample-time. See pin-cpu.mjs for the rationale; the default mask
+// targets cores 4..7 on an 8C16T Ryzen 7.
+pinCpuIfWindows({ toolName: 'ab-css' });
+if (process.env.PERF_PINNED) console.error(`[ab-css] Running pinned (PERF_PINNED=1).`);
 
 // ---- CLI -------------------------------------------------------------
 let outRoot = 'ab-css';
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 46af266..b0ea333 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -90,6 +90,12 @@ import { PDFDocument, ParseSpeeds } from 'pdf-lib';
 import { parseOutline, setOutline } from '../docs/lib/outline.mjs';
 import { setMetadata }              from '../docs/lib/postprocesser.mjs';
 import { applyOutlineAndMetadataIncremental } from './incremental-pdf.mjs';
+import { pinCpuIfWindows } from './pin-cpu.mjs';
+
+// On Windows, re-launch under `start /affinity 0x5500 /high` to stabilise
+// CPU sample-time. See pin-cpu.mjs. Cuts run-to-run variance from
+// ~15-25 % to ~3 % on this Ryzen 7 dev box. Pass --no-affinity to skip.
+pinCpuIfWindows({ toolName: 'measure.mjs' });
 
 const __dirname = dirname(fileURLToPath(import.meta.url));
 
@@ -127,6 +133,7 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--clone-count') cloneCount = true;
   else if (a === '--render-only') renderOnly = true;
   else if (a === '--tracing') tracing = true;
+  else if (a === '--no-affinity') { /* handled in pin-cpu.mjs */ }
   else if (!inputArg) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
diff --git a/perf/pin-cpu.mjs b/perf/pin-cpu.mjs
new file mode 100644
index 0000000..68ce070
--- /dev/null
+++ b/perf/pin-cpu.mjs
@@ -0,0 +1,60 @@
+// Re-launch the current Node process under `start /affinity HEX /high`
+// on Windows for stable benchmark measurements. No-op on non-Windows.
+//
+// Stock Windows dev boxes have 15-25 % single-run variance on CPU
+// sample-time because background processes share cores with the
+// renderer being profiled. Pinning the benchmark to a fixed subset of
+// logical processors (and raising priority class to High) brings that
+// down to ~3 %. Child processes (puppeteer's Chromium + its renderer /
+// utility children) inherit the mask + priority from us at spawn time.
+//
+// Default mask 0x5500 = LPs 8, 10, 12, 14 = physical cores 4..7, thread
+// 0 of each pair only on an 8-core / 16-thread AMD Ryzen 7 (Zen 1..4).
+// Avoids SMT contention; steers clear of cores 0-3 where Windows
+// system threads cluster.
+//
+// Usage:
+//   import { pinCpuIfWindows } from './pin-cpu.mjs';
+//   pinCpuIfWindows({ toolName: 'measure.mjs' });   // call BEFORE any work
+//
+// Knobs (env / argv):
+//   PERF_PINNED=1       sentinel set by the relaunched child so the
+//                       shim doesn't recurse. Also: set this manually
+//                       to suppress pinning entirely.
+//   PERF_AFFINITY=HEX   override the default mask. Hex string, no 0x.
+//   --no-affinity       CLI flag with the same effect as PERF_PINNED=1.
+
+import { spawnSync } from 'node:child_process';
+
+const DEFAULT_MASK = '5500';
+
+/**
+ * @param {Object} opts
+ * @param {string} [opts.toolName] - logged in the relaunch banner.
+ * @param {string} [opts.defaultMask] - hex string (no 0x), default '5500'.
+ */
+export function pinCpuIfWindows(opts = {}) {
+  if (process.platform !== 'win32') return;
+  if (process.env.PERF_PINNED) return;
+  if (process.argv.includes('--no-affinity')) return;
+
+  const toolName = opts.toolName || 'perf';
+  const mask = process.env.PERF_AFFINITY || opts.defaultMask || DEFAULT_MASK;
+
+  const argv0 = process.argv[1];
+  const userArgs = process.argv.slice(2)
+    .map(a => /[\s"]/.test(a) ? `"${a.replace(/"/g, '\\"')}"` : a)
+    .join(' ');
+
+  console.error(`[${toolName}] Re-launching with /affinity 0x${mask} /high to stabilise measurements.`);
+  console.error(`[${toolName}] Override mask: PERF_AFFINITY=<hex>. Skip pinning: --no-affinity.`);
+
+  // Empty "" after start is a window-title placeholder. Without it,
+  // start consumes the first quoted token as the title and corrupts
+  // the script path. shell:true so cmd.exe handles the inner quoting
+  // (Node's CRT would otherwise escape the inner quotes and break
+  // start's argument parsing).
+  const cmdLine = `set PERF_PINNED=1 && start "" /affinity ${mask} /high /wait /b node "${argv0}" ${userArgs}`;
+  const r = spawnSync(cmdLine, { shell: true, stdio: 'inherit' });
+  process.exit(r.status ?? 0);
+}
diff --git a/perf/profile-load.mjs b/perf/profile-load.mjs
index 6edc010..ce5fddc 100644
--- a/perf/profile-load.mjs
+++ b/perf/profile-load.mjs
@@ -17,6 +17,11 @@
 import { readFileSync } from 'node:fs';
 import { resolve } from 'node:path';
 import { PDFDocument, ParseSpeeds } from 'pdf-lib';
+import { pinCpuIfWindows } from './pin-cpu.mjs';
+
+// On Windows, re-launch under `start /affinity 0x5500 /high` to stabilise
+// timing. See pin-cpu.mjs. Pass --no-affinity to skip.
+pinCpuIfWindows({ toolName: 'profile-load' });
 
 const arg = process.argv[2];
 if (!arg) {
diff --git a/perf/profile-roundtrip.mjs b/perf/profile-roundtrip.mjs
index dfb5a44..c953340 100644
--- a/perf/profile-roundtrip.mjs
+++ b/perf/profile-roundtrip.mjs
@@ -16,6 +16,11 @@
 import { readFileSync } from 'node:fs';
 import { resolve } from 'node:path';
 import { PDFDocument, ParseSpeeds } from 'pdf-lib';
+import { pinCpuIfWindows } from './pin-cpu.mjs';
+
+// On Windows, re-launch under `start /affinity 0x5500 /high` to stabilise
+// timing. See pin-cpu.mjs. Pass --no-affinity to skip.
+pinCpuIfWindows({ toolName: 'profile-roundtrip' });
 
 const pdfPath = resolve(process.cwd(), process.argv[2] || '');
 if (!process.argv[2]) {

From 7809dfc343ac1494003e9a719648022988a9a026 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 19:17:03 +0200
Subject: [PATCH 12/18] Ship --disable-gpu pair; add memory + parallel-generate
 probes.

- docs/render-book.mjs, perf/measure.mjs: add --disable-gpu and
  --disable-software-rasterizer. Renderer ~120 MB lighter, gpu-process
  ~84 MB lighter (shrinks to a 16 MB stub -- only --in-process-gpu
  kills it entirely, at +15 s wall clock; rejected), generate ~5 s
  faster, PDF byte-identical.
- perf/probe-parallel.mjs: two-shard pageRanges parallel-generate
  probe. N=2 saves ~17 s wall clock (render+generate ~36 s vs ~53 s
  single-process), confirms two browsers parallelise at the OS level.
  Not shipped -- N=2 ~5 GB peak, N=4 ~10 GB peak, over CI budget.
- perf/probe-memory.mjs + sample-mem.ps1: per-process tree memory
  sampler. PowerShell + WMI walks the chrome.exe parent->child tree
  at 500 ms intervals, reports per-process private bytes + working
  set. Used to A/B the --disable-gpu / --in-process-gpu /
  --single-process variants (the last crashes in modern headless).
- perf/probe-renderer-mem.mjs + analyze-mem-trace.mjs: per-allocator
  renderer breakdown via Chromium's memory-infra trace + on-demand
  PMD dumps. Shows the 1.9 GB renderer is ~80 % Blink (Oilpan heap),
  not V8 (V8 is 34 MB). Top object classes are paged.js's per-page
  CSS grid (132 MB), 1 M ComputedStyle (74 MB), LayoutNG fragments
  (~200 MB combined), 411 k AXNodeObject for tagged-PDF (41 MB).
- --gc-passes N flag on probe-renderer-mem.mjs: triggers V8 GC +
  Memory.simulatePressureNotification between render and generate.
  One pass + pressure (~1 s) frees ~180 MB of dangling Blink objects
  reachable from no user-visible state. Not shipped -- it would mask a
  retention defect (paged.js hooks? detach-pages closures?) rather
  than fixing it. Hypotheses + next-step heap-snapshot direction
  documented in perf/README.md.
---
 docs/render-book.mjs        |  10 +
 perf/README.md              | 399 ++++++++++++++++++++++++++++++++++++
 perf/analyze-mem-trace.mjs  | 142 +++++++++++++
 perf/measure.mjs            |   7 +
 perf/probe-memory.mjs       | 256 +++++++++++++++++++++++
 perf/probe-parallel.mjs     | 198 ++++++++++++++++++
 perf/probe-renderer-mem.mjs | 326 +++++++++++++++++++++++++++++
 perf/sample-mem.ps1         | 114 +++++++++++
 8 files changed, 1452 insertions(+)
 create mode 100644 perf/analyze-mem-trace.mjs
 create mode 100644 perf/probe-memory.mjs
 create mode 100644 perf/probe-parallel.mjs
 create mode 100644 perf/probe-renderer-mem.mjs
 create mode 100644 perf/sample-mem.ps1

diff --git a/docs/render-book.mjs b/docs/render-book.mjs
index 71de7e1..e7ad9bf 100644
--- a/docs/render-book.mjs
+++ b/docs/render-book.mjs
@@ -99,10 +99,20 @@ const browser = await puppeteer.launch({
   // tagged-pdf and outline launch flags are added by puppeteer 22+
   // automatically in ChromeLauncher.defaultArgs(), so we don't repeat
   // them here.
+  //
+  // --disable-gpu + --disable-software-rasterizer: shrinks the GPU
+  // process from ~100 MB to ~16 MB (Chromium keeps a stub even with
+  // these flags -- only --in-process-gpu kills it entirely, but that
+  // serialises GPU work onto the main thread and costs ~15 s on the
+  // render+generate wall clock). With just the disable pair the
+  // renderer is also ~120 MB lighter and generate runs ~5 s faster
+  // (Skia skips a GPU init path). PDF output is byte-identical.
   args: [
     '--no-sandbox',
     '--disable-dev-shm-usage',
     '--allow-file-access-from-files',
+    '--disable-gpu',
+    '--disable-software-rasterizer',
   ],
 });
 
diff --git a/perf/README.md b/perf/README.md
index 47af3e4..68af042 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -5019,6 +5019,52 @@ the largest remaining target if priorities change (e.g.
 the book grows past 3000 pages, or a CI runtime cap
 forces it). It's just not the next thing to build.
 
+### Probe results (later session)
+
+A two-shard probe in [perf/probe-parallel.mjs](probe-parallel.mjs)
+was run after the render-side speedups to see what the
+actual wall-clock floor looks like with current numbers.
+N=2, equal page-count split, no concatenation -- just
+two browsers in parallel each printing their `pageRanges`
+slice:
+
+| shard | launch | load | render | generate | total |
+| --- | --- | --- | --- | --- | --- |
+| 0 (pp 1-826)   | 1.00 s | 1.61 s | 10.37 s | 24.02 s | 35.54 s |
+| 1 (pp 827-1651)| 0.97 s | 1.61 s | 10.12 s | 24.46 s | 35.74 s |
+
+Wall clock for `Promise.all` of both: **35.94 s**. Both
+slices open via pdf-lib and the page counts add up
+exactly (826 + 825 = 1,651). Vs the ~53 s single-process
+render+generate, parallel N=2 saves ~17 s wall clock.
+
+The probe also confirms two browsers really do run in
+parallel at the OS level: generate dropped from ~43 s to
+~24 s per shard (roughly linear with a ~2-3 s per-call
+fixed overhead), which would only happen if the Skia +
+PrintCompositor workloads in the two browser trees
+weren't serialised by a shared kernel resource. So the
+"single-threaded Skia per page" finding from the
+`Page.printToPDF` survey above is per-process -- not a
+machine-wide lock.
+
+**Still not shipped.** Reasons unchanged:
+
+- Each shard re-renders the whole book to maintain
+  per-shard layout state (named strings, counters,
+  footnotes). With render at ~10 s that's now cheap CPU-
+  wise, but the memory cost is the blocker -- see the
+  "Memory: where the renderer's 1.9 GB goes" section
+  below. N=2 ≈ 5 GB peak, N=4 ≈ 10 GB peak; the CI
+  runner doesn't have that headroom.
+- Concat + outline page-number remap still needs to be
+  built. The incremental-pdf.mjs pattern extends to it
+  but it's nontrivial.
+
+Probe stays available as `node perf/probe-parallel.mjs
+[--shards N]` for re-evaluation if either constraint
+changes (CI machine grows, or book size forces it).
+
 ## CSS cost attribution
 
 Render is at ~10 s on a 1651-page book, down from ~104 s
@@ -5139,3 +5185,356 @@ remaining levers all live outside render:
   shipping, but not pursued.
 
 No structurally promising next target inside render.
+
+## Memory: where the renderer's 1.9 GB goes
+
+CI runs the book build with limited RAM headroom -- the
+1651-page book is the largest job on the machine and the
+budget matters. This section measures one render's peak
+memory and breaks it down by allocator, so we know what
+levers exist if the book grows.
+
+`perf/probe-memory.mjs` is the harness. It runs the full
+pipeline (load + render + generate) in a single browser
+and watches the chrome.exe process tree at 500 ms
+intervals via `sample-mem.ps1`, reporting per-process
+private bytes + working set. `perf/probe-renderer-mem.mjs`
+goes deeper -- it drives Chromium's memory-infra tracing
+to capture detailed per-allocator dumps from inside the
+renderer at three points (post-render, mid-generate,
+post-generate). `perf/analyze-mem-trace.mjs` reads the
+resulting trace.json and prints the breakdown.
+
+### Process-tree footprint
+
+Peak across the whole tree on the 1651-page book:
+
+```
+renderer (main)                 ~1,880 MB private
+utility:PrintCompositor           ~290-450 MB  (high variance)
+browser                         ~70-1,100 MB   (PDF IPC buffer; very high variance)
+gpu-process                       ~100 MB
+renderer (about:blank etc.)        ~25 MB total
+utility:network/storage            ~30 MB total
+crashpad-handler                    ~2 MB
+                                ------------
+total peak                      ~2.5-3.5 GB private
+                                ~2.7-2.9 GB working set
+```
+
+The browser-process number is the wildest -- across
+runs it ranged from 72 MB to 1.1 GB. That's the IPC
+buffer the PDF travels through on its way from the
+renderer back to puppeteer; how much accumulates depends
+on timing between Mojo write and Node read. The
+PrintCompositor utility process appears only during
+generate; it's the Chromium service that turns the
+renderer's Skia commands into PDF bytes for `page.pdf()`.
+
+### Inside the renderer
+
+memory-infra dump at post-generate, renderer process,
+top-level allocators (`blink_gc` and `blink_objects`
+overlap by design -- they're two views of the same
+Oilpan heap, raw pages vs typed object counts):
+
+| allocator         | size      | notes |
+| ----------------- | --------- | ----- |
+| `blink_gc`        | 1,350 MB  | C++ DOM, layout, render objects (Oilpan) |
+| `malloc`          |   332 MB  | Skia raster buffers + small native allocations |
+| `partition_alloc` |   114 MB  | String buffers, ArrayBuffers |
+| `v8`              |    34 MB  | JS heap (paged.js + page JS); tiny |
+| other             |   ~22 MB  | web_cache, shared_memory, cc, gpu stub |
+
+V8 is only ~2 % of the renderer. Blink is ~80 %. That
+matches the structural picture: the renderer holds the
+laid-out state of 1651 pages of typeset content, and
+that state is C++ objects, not JS.
+
+Top Blink object classes (the `blink_objects` view of
+the Oilpan heap, post-generate):
+
+| class                                  | size    | count       |
+| -------------------------------------- | ------- | ----------- |
+| `GridSizingTrackCollection`            | 132 MB  | 79,246      |
+| `ComputedStyle`                        |  74 MB  | 1,074,537   |
+| `ConstraintSpace::RareData`            |  71 MB  | 617,415     |
+| `PhysicalBoxFragment`                  |  42 MB  | 516,289     |
+| `LogicalLineItems`                     |  42 MB  | 24,118      |
+| `Text` (DOM nodes)                     |  42 MB  | 498,077     |
+| `LayoutResult`                         |  41 MB  | 540,447     |
+| `AXNodeObject`                         |  41 MB  | 411,760     |
+| `GridItemData`                         |  30 MB  | 162,443     |
+| `ComputedStyleBase::StyleBoxData`      |  30 MB  | 176,479     |
+| `InlineItem`                           |  28 MB  | 737,744     |
+| `LayoutResult::RareData`               |  28 MB  | 229,056     |
+| `ElementRareDataVector`                |  24 MB  | 613,629     |
+| `CachedMatchedProperties`              |  23 MB  | 226,679     |
+| `ShapeResultView`                      |  21 MB  | 306,762     |
+| `HeapVectorBacking<FragmentItem>`      |  21 MB  | 72,175      |
+| `HeapVectorBacking<HarfBuzzRunGlyphData>` | 20 MB | 165,957  |
+| `LayoutText`                           |  14 MB  | 129,056     |
+| `HTMLDivElement`                       |  12 MB  | 118,877     |
+| `HTMLSpanElement`                      |  10 MB  | 104,266     |
+
+Three patterns visible:
+
+1. **Page-template grid is expensive.** paged.js renders
+   each `@page` as a CSS grid (so `@top-right`,
+   `@bottom-right`, etc. resolve correctly). 79,246
+   `GridSizingTrackCollection` ≈ 48 per page × 1651
+   pages, plus 162k `GridItemData`. Combined ~162 MB just
+   for the running header/footer geometry.
+2. **Style explosion.** 1,074,537 `ComputedStyle`
+   objects across 1651 pages is ~650 per page, which
+   matches roughly one per leaf element after style
+   sharing. `CachedMatchedProperties` (23 MB, 227k)
+   shows the sharing cache is active; without it the
+   number would be much worse.
+3. **LayoutNG fragment tree.** `PhysicalBoxFragment`
+   (42 MB), `LogicalLineItems` (42 MB), `LayoutResult`
+   (41 MB), various `RareData` (98 MB combined),
+   `InlineItem` (28 MB) -- the modern Blink layout tree
+   is fragment-based and the fragments add up across
+   half a million layout objects.
+
+The render→generate transition adds about 500 MB:
+~272 MB to `blink_gc` (print-preview snapshot retention)
+and ~219 MB to `malloc` (Skia content-stream allocations
+during PDF emit, visible as a million-ish small
+allocations in the bucket-size profile).
+
+### Disabling the GPU process
+
+The GPU process at ~100 MB looked like an easy win. It
+isn't, quite -- in headless, Chromium still spawns a
+GPU process to host SwiftShader (software raster) for
+canvas / WebGL emulation, even when no canvas / WebGL
+is in use. Three variants tested:
+
+| variant                                       | render | generate | total | gpu-process | renderer | PDF bytes |
+| --------------------------------------------- | ------ | -------- | ----- | ----------- | -------- | --------- |
+| baseline                                      | 10-11s | 44-50s   | 51-56s |  100 MB    | 1,880 MB | 41,076,362 |
+| `--disable-gpu --disable-software-rasterizer` | 10s    | 45s      | 45s   |  16 MB      | 1,761 MB | 41,076,362 |
+| above + `--in-process-gpu`                    | 15s    | 61s      | 62s   |  (gone)     | 1,748 MB | 41,076,362 |
+| `--single-process`                            | crash  | -        | -     | -           | -        | -         |
+
+`--single-process` is documented as debug-only in
+Chromium; the renderer crashes shortly after page load
+in modern headless. Also doesn't actually collapse to
+one process -- crashpad-handler always runs separately
+and a Mojo broker stays alive too.
+
+`--in-process-gpu` does kill the GPU process entirely
+but runs the GPU service as a thread inside the browser
+process. Render slows by ~5 s and generate by ~15 s --
+a 25 % total slowdown bought for ~100 MB of saved
+process overhead. Bad trade.
+
+The disable pair alone (`--disable-gpu
+--disable-software-rasterizer`) is the sweet spot:
+
+- GPU process shrinks from ~100 MB to ~16 MB (Chromium
+  keeps a stub for command handling)
+- Renderer ~120 MB lighter (consistent across runs;
+  exact cause is some GPU-context init path Skia skips)
+- Generate runs ~5 s faster (Skia presumably skips the
+  same GPU init path)
+- PDF output is equivalent: same 41,076,362-byte size,
+  same content streams. The SHA differs only in per-run
+  metadata -- /CreationDate, /ModDate, /ID, and the
+  tagged-PDF tree's hash-derived element IDs -- which
+  together account for 0.018 % of bytes.
+
+Shipped in both [docs/render-book.mjs](../docs/render-book.mjs)
+and [perf/measure.mjs](measure.mjs).
+
+### What's not addressable
+
+Accessibility tagging accounts for ~41 MB of
+`AXNodeObject` instances (411k of them, one per DOM
+element for the PDF/UA structure tree). Disabling
+`--export-tagged-pdf` would free this, but the PDF
+loses its structure tree -- screen readers see a flat
+glyph stream, search highlighting and copy-paste break
+reading order in the multi-column layout, and the PDF
+falls out of Section 508 / PDF-UA / EN 301 549
+compliance. Off the table; the cost buys real
+accessibility for a docs site that aims to be readable.
+
+### Where this leaves memory
+
+End-state on the 1651-page book with the shipped flag
+pair:
+
+```
+renderer (main)                 ~1,760 MB private
+PrintCompositor (utility)         ~350 MB
+browser                           ~70-1,100 MB  (IPC buffer; high variance)
+gpu-process (stub)                 ~16 MB
+other (renderers, network, etc.)   ~80 MB
+                                ------------
+peak                            ~2.3-3.3 GB private
+                                ~2.5-2.9 GB working set
+```
+
+Inside the renderer, the dominant buckets are
+intrinsic to laying out 1651 pages of typeset content:
+
+- `GridSizingTrackCollection` (132 MB) is paged.js's
+  per-page template grid. The grid drives `@top-right`
+  / `@bottom-right` / margin-box positioning; replacing
+  it with absolute positioning would save the 132 MB
+  but is a paged.js architectural change.
+- `ComputedStyle` (74 MB across 1M objects) and the
+  LayoutNG fragment tree (~200 MB combined) scale with
+  DOM size. The biggest knob here is the DOM the book
+  feeds in: fewer wrapper elements would directly
+  shrink everything downstream.
+- The render→generate +500 MB is Chromium-internal
+  (print-preview retention + Skia raster prep) and not
+  reachable without recompiling.
+
+Next memory targets, in rough order of effort vs payoff:
+
+1. **DOM shape audit.** 1.07 M `ComputedStyle`, 498 k
+   `Text` nodes, 118 k `HTMLDivElement`, 104 k
+   `HTMLSpanElement` -- the input shape drives all of
+   this. Just-the-docs and the markdown converters add
+   wrapper elements that may not be needed in the PDF
+   layout. A pre-render DOM-simplification pass (strip
+   inert wrappers, collapse nested spans) is the most
+   accessible lever; we own the Jekyll pipeline end to
+   end.
+2. **Chase the dangling references** that the GC probe
+   surfaced (see next subsection). 180+ MB of Blink
+   objects exist post-render but are unreachable from
+   anything user-visible; if we can find what's holding
+   them in the JS-level state (paged.js hooks?
+   detach-pages closures? jQuery-style retained
+   references?), the renderer peak shrinks without any
+   runtime GC cost.
+3. **Page-template grid replacement** in vendored
+   paged.js -- ~132 MB potential. Largest single target
+   but an invasive rewrite of paged.js's `@page` area
+   handler.
+
+### GC-pass probe: 180 MB of dangling Blink objects
+
+Forcing a `window.gc()` pass between render and generate
+frees ~180 MB of `blink_objects` (the typed view of the
+Oilpan heap) without touching anything user-visible.
+That means there's a clear shape of dangling references
+in our post-render state -- objects unreachable from any
+DOM, layout, or print-preview anchor, but still held by
+some JS-level retention somewhere in the paged.js /
+detach-pages chain.
+
+Probe: `perf/probe-renderer-mem.mjs --gc-passes N`.
+Launches with `--js-flags=--expose-gc`, runs N V8
+`gc()` calls between the post-render and pre-generate
+memory dumps, then fires
+`Memory.simulatePressureNotification` to coax Chromium
+into dropping caches. Sweep across N=0,1,2,3,5 on the
+1651-page book (single run each; absolute numbers carry
+run-to-run noise but the deltas vs same-run baseline
+are stable):
+
+| N | gc time | +pressure | post-render | post-gc | mid-gen renderer | Δ vs no-gc baseline |
+| --- | --- | --- | --- | --- | --- | --- |
+| (off, baseline)| --     | --     |  1,229 MB | --     | **1,941 MB** | -- |
+| 0 (pressure only) | 0.00s | 0.52s |  1,358 MB | 1,358 MB | 1,869 MB | ~noise |
+| **1** | **0.44s** | **0.96s** | 1,329 MB | **1,275 MB** | **1,754 MB** | **-187 MB** |
+| 2 | 0.82s | 1.33s |  1,337 MB | 1,293 MB | 1,758 MB | -183 MB |
+| 3 | 1.46s | 1.97s |  1,316 MB | 1,277 MB | 1,757 MB | -184 MB |
+| 5 | 2.11s | 2.61s |  1,553 MB* | 1,498 MB* | 1,841 MB* | (high-side outlier run) |
+
+Three takeaways:
+
+1. **`Memory.simulatePressureNotification` alone does
+   nothing in headless.** N=0 mid-gen is within
+   run-to-run noise of the no-gc baseline.
+2. **One `gc()` call does essentially all of the work.**
+   1 pass + pressure: ~1 s cost, ~187 MB peak savings.
+   Passes 2 and 3 match it (~185 MB) without further
+   improvement.
+3. **Each `gc()` pass costs ~0.4-0.5 s** of wall clock
+   on the 1651-page book (the V8 + Oilpan major-GC
+   pause walking ~1 GB of heap).
+
+Inside the renderer at post-gc (1 pass), the breakdown
+shows where the freed space went:
+
+| allocator      | baseline | post-gc | Δ |
+| -------------- | -------- | ------- | --- |
+| `blink_objects` (typed Oilpan view) |  698 MB |  472 MB | **-226 MB** |
+| `blink_gc` (raw pages)              |  973 MB |  940 MB |  -33 MB |
+| `malloc`                            |  120 MB |   93 MB |  -27 MB |
+| `v8`                                |   28 MB |   19 MB |   -9 MB |
+
+GC freed ~226 MB of typed Blink objects, but Oilpan
+only returned 33 MB of underlying pages to the OS
+immediately -- empty pages are recycled lazily. The
+visible peak win shows up at mid-generate (-187 MB)
+because Chromium reuses the freed object slots for the
+print-preview snapshot instead of growing fresh.
+
+PDF output is byte-identical across all variants
+(41,076,362 bytes; SHA differs only in metadata).
+
+**Not shipped.** 1 second per render is meaningful when
+multiplied across CI builds, and the GC pass is masking
+a real defect (the dangling references) rather than
+fixing it. The cleaner direction is to find what's
+retaining those objects and release them at the JS
+level, which would shrink the peak without any GC cost.
+
+The probe and the `--gc-passes` flag stay in
+[probe-renderer-mem.mjs](probe-renderer-mem.mjs) for
+future use -- either as a measurement baseline when
+investigating the retention, or as a one-off escape
+hatch if a future bigger book ever hits a CI memory
+ceiling.
+
+#### What might be holding the references
+
+Hypotheses for where to look first when chasing this:
+
+- **paged.js hook chains** keep handler references on
+  the global `window.Paged.Hooks` object, and each hook
+  is called with per-page context (the page wrapper,
+  the chunker, the layout result). If a hook closure
+  captures a page-local value it can keep that page's
+  state alive past `finalizePage`. The render chain is
+  fully synchronous now (see [Stripping
+  headless-irrelevant async machinery](#stripping-headless-irrelevant-async-machinery))
+  so there are no awaited promises holding stale
+  closures; the suspect is closure capture in synchronous
+  hooks.
+- **detach-pages.js** removes the page DOM from the
+  visible tree but the handler itself retains a
+  per-page reference in its own arrays/maps; check the
+  handler's instance state for accumulating per-page
+  entries that should be cleared after the detach.
+- **Chunker / Layout instance fields** on
+  `window.PagedPolyfill.chunker` -- the chunker keeps
+  `pages[]`, `layout`, `breakToken`, etc. After
+  `preview()` returns the last page's state might be
+  retained on these fields and root the whole tree
+  through them.
+- **EventTarget retained listeners** on the document or
+  on `window` -- paged.js installs a few
+  `resize`/`afterPageLayout`/etc. event listeners that
+  capture closures; their references survive even after
+  the layout work is done.
+
+A heap profile snapshot taken at post-render, opened in
+Chrome DevTools' "Retainers" view, would surface the
+exact chain for the largest object categories
+(`PhysicalBoxFragment`, `LogicalLineItems`,
+`ConstraintSpace::RareData`) and point at which JS
+object is keeping them alive. CDP's `HeapProfiler.
+takeHeapSnapshot` is the entry point;
+`probe-renderer-mem.mjs` already has a CDP session and
+could be extended with a `--heap-snapshot` flag if this
+becomes the next investigation.
diff --git a/perf/analyze-mem-trace.mjs b/perf/analyze-mem-trace.mjs
new file mode 100644
index 0000000..5d850d9
--- /dev/null
+++ b/perf/analyze-mem-trace.mjs
@@ -0,0 +1,142 @@
+// Pull the per-allocator memory breakdown out of a memory-infra trace.
+//
+// Reads a Chrome trace.json (memory-infra category) and prints, for each
+// detailed memory dump, the largest process's top-level allocator
+// buckets plus sub-breakdowns of the dominant ones.
+//
+// Usage:
+//   node analyze-mem-trace.mjs <path/to/trace.json>
+
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+
+const argv = process.argv.slice(2);
+if (argv.length < 1) {
+  console.error('usage: node analyze-mem-trace.mjs <trace.json>');
+  process.exit(2);
+}
+const tracePath = resolve(process.cwd(), argv[0]);
+
+const fmtMB = (b) => {
+  if (b == null || Number.isNaN(b)) return '   ? MB';
+  return (b / 1024 / 1024).toFixed(0).padStart(5) + ' MB';
+};
+
+const parseHexBytes = (s) => {
+  if (s == null) return null;
+  const n = parseInt(String(s), 16);
+  return Number.isFinite(n) ? n : null;
+};
+
+console.log(`reading ${tracePath} ...`);
+const trace = JSON.parse(readFileSync(tracePath, 'utf8'));
+const events = trace.traceEvents;
+console.log(`total events: ${events.length}`);
+
+// memory-infra puts both light and detailed dumps under ph='v'.
+// Detailed dumps carry args.dumps.allocators; light dumps only have
+// args.dumps.process_totals. We want detailed.
+const detailed = events.filter((e) => e.ph === 'v' && e.args?.dumps?.allocators);
+
+// Group by id and sort dump groups by min timestamp (insertion order).
+const byId = new Map();
+for (const e of detailed) {
+  const k = String(e.id);
+  if (!byId.has(k)) byId.set(k, []);
+  byId.get(k).push(e);
+}
+const groups = Array.from(byId.entries())
+  .map(([id, procs]) => ({
+    id,
+    ts: Math.min(...procs.map((e) => e.ts)),
+    procs,
+  }))
+  .sort((a, b) => a.ts - b.ts);
+
+// Resolve pid -> process_name from the metadata events (ph='M', name='process_name').
+const procName = new Map();
+for (const e of events) {
+  if (e.ph === 'M' && e.name === 'process_name' && e.args?.name) {
+    procName.set(e.pid, e.args.name);
+  }
+}
+
+console.log(`detailed dumps: ${groups.length}`);
+for (const g of groups) {
+  console.log(`  dump ${g.id} @ t=${g.ts}us  (${g.procs.length} processes)`);
+}
+
+for (let gi = 0; gi < groups.length; gi++) {
+  const g = groups[gi];
+  console.log();
+  console.log(`=== dump ${gi} (id=${g.id}) ===`);
+
+  const rows = g.procs.map((e) => {
+    // Light counterpart with same id+pid carries process_totals; the
+    // detailed event sometimes does too -- pull from whichever has it.
+    const totals = e.args?.dumps?.process_totals
+      ?? events.find((x) =>
+        x.ph === 'v' && String(x.id) === g.id && x.pid === e.pid &&
+        x.args?.dumps?.process_totals
+      )?.args.dumps.process_totals
+      ?? {};
+    const priv     = parseHexBytes(totals.private_footprint_bytes);
+    const resident = parseHexBytes(totals.peak_resident_set_size);
+    return {
+      pid:        e.pid,
+      name:       procName.get(e.pid) ?? '(unknown)',
+      priv,
+      resident,
+      allocators: e.args.dumps.allocators,
+    };
+  }).sort((a, b) => (b.priv ?? 0) - (a.priv ?? 0));
+
+  // Process table
+  for (let i = 0; i < rows.length; i++) {
+    const r = rows[i];
+    const star = i === 0 ? '*' : ' ';
+    console.log(`  ${star} pid=${String(r.pid).padEnd(6)} ${r.name.padEnd(20)} private ${fmtMB(r.priv)}`);
+  }
+
+  // Deep-dive on the biggest process (the renderer).
+  const top = rows[0];
+  if (!top) continue;
+
+  console.log();
+  console.log(`  top process pid=${top.pid} (${top.name}) top-level allocators >= 1 MB:`);
+  const topLevel = [];
+  for (const [name, info] of Object.entries(top.allocators)) {
+    if (name.includes('/')) continue;
+    const size = parseHexBytes(info?.attrs?.size?.value);
+    if (size == null || size < 1024 * 1024) continue;
+    topLevel.push({ name, size });
+  }
+  topLevel.sort((a, b) => b.size - a.size);
+  let sum = 0;
+  for (const r of topLevel) {
+    sum += r.size;
+    console.log(`    ${r.name.padEnd(36)} ${fmtMB(r.size)}`);
+  }
+  console.log(`    ${'(sum of top-level >= 1 MB)'.padEnd(36)} ${fmtMB(sum)}`);
+
+  // Sub-breakdown of the top 4 dominant top-level allocators.
+  for (const big of topLevel.slice(0, 4)) {
+    const prefix = big.name + '/';
+    const subs = [];
+    for (const [name, info] of Object.entries(top.allocators)) {
+      if (!name.startsWith(prefix)) continue;
+      const rest = name.slice(prefix.length);
+      if (rest.includes('/')) continue; // only one level deeper
+      const size = parseHexBytes(info?.attrs?.size?.value);
+      if (size == null || size < 1024 * 256) continue; // 0.25 MB cut-off
+      subs.push({ name: rest, size });
+    }
+    if (subs.length === 0) continue;
+    subs.sort((a, b) => b.size - a.size);
+    console.log();
+    console.log(`    ${big.name}/ sub-breakdown:`);
+    for (const s of subs.slice(0, 15)) {
+      console.log(`      ${s.name.padEnd(34)} ${fmtMB(s.size)}`);
+    }
+  }
+}
diff --git a/perf/measure.mjs b/perf/measure.mjs
index b0ea333..3177052 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -191,10 +191,17 @@ const browser = await puppeteer.launch({
   // puppeteer 22+ unconditionally in ChromeLauncher.defaultArgs(), so
   // we don't need to repeat them here. --chrome-outline below relies on
   // the latter being present at launch.
+  //
+  // --disable-gpu + --disable-software-rasterizer mirror production
+  // (docs/render-book.mjs). Shrinks the GPU process from ~100 MB to
+  // ~16 MB and the renderer ~120 MB; generate ~5 s faster; PDF byte-
+  // identical. See perf/README.md "Disabling the GPU process".
   args: [
     '--disable-dev-shm-usage',
     '--allow-file-access-from-files',
     '--enable-precise-memory-info',
+    '--disable-gpu',
+    '--disable-software-rasterizer',
   ],
 });
 
diff --git a/perf/probe-memory.mjs b/perf/probe-memory.mjs
new file mode 100644
index 0000000..15b7086
--- /dev/null
+++ b/perf/probe-memory.mjs
@@ -0,0 +1,256 @@
+// Memory footprint of one Chromium browser process tree during a single
+// book-PDF render. Runs the full pipeline (load + render + generate)
+// against the same paged.js + detach-pages setup probe-parallel.mjs uses,
+// then walks the chrome.exe process tree (browser + renderer + GPU +
+// utility) at 500 ms intervals via sample-mem.ps1 and reports the peak
+// and phase-aligned snapshots.
+//
+// "Private bytes" is the per-process counter for everything writable that
+// isn't file-backed: heap (V8, Blink, native, Skia buffers), stacks, BSS,
+// copy-on-write dirty pages from DLLs. It excludes DLL .text segments and
+// read-only const data, which is the figure this probe is after. It also
+// misses inter-process shared memory regions (GPU buffers, IPC ring
+// buffers), so the real OS commitment is moderately larger; the working-
+// set column is included as a cross-check.
+//
+// Usage:
+//   node probe-memory.mjs [path/to/book.html]
+//                         [--no-gpu] [--in-process-gpu] [--single-process]
+//
+// --no-gpu adds `--disable-gpu --disable-software-rasterizer` to the
+// Chromium launch args, to test whether the GPU process can be killed
+// in headless and whether the PDF output stays the same.
+//
+// --in-process-gpu folds the GPU work into the browser process instead
+// of a separate one. Less aggressive than --single-process; compatible
+// with normal rendering.
+//
+// --single-process collapses all Chromium subprocesses (renderer, GPU,
+// utility, PrintCompositor) into the browser process. Drops the
+// sandbox; only safe with trusted input (which is our case: we render
+// local, generated HTML). Known to be unstable in modern headless.
+
+import { spawn } from 'node:child_process';
+import { pathToFileURL, fileURLToPath } from 'node:url';
+import { dirname, resolve, join } from 'node:path';
+import { mkdirSync, writeFileSync, existsSync } from 'node:fs';
+import { createHash } from 'node:crypto';
+import puppeteer from 'puppeteer';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
+const args = process.argv.slice(2);
+let inputArg = null;
+let disableGpu = false;
+let inProcessGpu = false;
+let singleProcess = false;
+for (let i = 0; i < args.length; i++) {
+  const a = args[i];
+  if (a === '--no-gpu') disableGpu = true;
+  else if (a === '--in-process-gpu') inProcessGpu = true;
+  else if (a === '--single-process') singleProcess = true;
+  else if (!inputArg && !a.startsWith('-')) inputArg = a;
+  else { console.error(`unknown arg: ${a}`); process.exit(2); }
+}
+
+const inputPath = inputArg
+  ? resolve(process.cwd(), inputArg)
+  : resolve(__dirname, '..', 'docs', '_site-pdf', 'book.html');
+if (!existsSync(inputPath)) {
+  console.error(`book HTML not found: ${inputPath}`);
+  console.error('Build it first with docs/build.bat.');
+  process.exit(1);
+}
+
+const pagedScriptPath = resolve(__dirname, '..', 'docs', 'lib', 'paged.browser.js');
+const detachPagesPath = resolve(__dirname, 'detach-pages.js');
+const samplerPath     = resolve(__dirname, 'sample-mem.ps1');
+for (const p of [pagedScriptPath, detachPagesPath, samplerPath]) {
+  if (!existsSync(p)) {
+    console.error(`missing required file: ${p}`);
+    process.exit(1);
+  }
+}
+
+const tagParts = [];
+if (singleProcess) tagParts.push('single');
+if (inProcessGpu)  tagParts.push('ipgpu');
+if (disableGpu)    tagParts.push('nogpu');
+const tag = tagParts.length ? tagParts.join('-') : 'baseline';
+const stamp = new Date().toISOString().replace(/[:.]/g, '-');
+const outDir = resolve(__dirname, 'results', `probe-memory-${tag}-${stamp}`);
+mkdirSync(outDir, { recursive: true });
+
+const fmtMs = (ms) => (ms / 1000).toFixed(2) + 's';
+const fmtMB = (b) => (b / 1024 / 1024).toFixed(0).padStart(5) + ' MB';
+
+const chromeArgs = [
+  '--no-sandbox',
+  '--disable-dev-shm-usage',
+  '--allow-file-access-from-files',
+];
+if (disableGpu)    chromeArgs.push('--disable-gpu', '--disable-software-rasterizer');
+if (inProcessGpu)  chromeArgs.push('--in-process-gpu');
+if (singleProcess) chromeArgs.push('--single-process');
+
+console.log(`[probe] input         : ${inputPath}`);
+console.log(`[probe] output        : ${outDir}`);
+console.log(`[probe] disable-gpu   : ${disableGpu}`);
+console.log(`[probe] in-process-gpu: ${inProcessGpu}`);
+console.log(`[probe] single-process: ${singleProcess}`);
+
+const t0 = Date.now();
+
+const browser = await puppeteer.launch({
+  headless: true,
+  args: chromeArgs,
+});
+const browserPid = browser.process().pid;
+console.log(`[probe] browser pid: ${browserPid}`);
+
+const samples = [];
+const sampler = spawn('powershell', [
+  '-NoProfile', '-NonInteractive',
+  '-ExecutionPolicy', 'Bypass',
+  '-File', samplerPath,
+  '-RootPid', String(browserPid),
+  '-IntervalMs', '500',
+], { stdio: ['ignore', 'pipe', 'pipe'] });
+
+let samplerBuf = '';
+sampler.stdout.on('data', (chunk) => {
+  samplerBuf += chunk.toString('utf8');
+  let nl;
+  while ((nl = samplerBuf.indexOf('\n')) !== -1) {
+    const line = samplerBuf.slice(0, nl).trim();
+    samplerBuf = samplerBuf.slice(nl + 1);
+    if (!line) continue;
+    try {
+      const s = JSON.parse(line);
+      if (!s.done) samples.push({ ...s, t_elapsed: Date.now() - t0 });
+    } catch {
+      console.error('[probe] bad sampler line:', line.slice(0, 100));
+    }
+  }
+});
+sampler.stderr.on('data', (d) => process.stderr.write(`[sampler-err] ${d}`));
+
+let exitCode = 0;
+const phase = {};
+const pdfInfo = {};
+try {
+  const page = await browser.newPage();
+  page.setDefaultTimeout(0);
+  page.on('pageerror',     (e) => console.error('[page error]', e.message));
+  page.on('requestfailed', (r) => {
+    const f = r.failure();
+    console.error('[request failed]', r.url(), f && f.errorText);
+  });
+
+  await page.emulateMediaType('print');
+
+  await page.goto(pathToFileURL(inputPath).href, { waitUntil: 'load' });
+  phase.afterLoad = Date.now() - t0;
+
+  await page.evaluate(() => {
+    window.PagedConfig = window.PagedConfig || {};
+    window.PagedConfig.auto = false;
+  });
+  await page.addScriptTag({ path: pagedScriptPath });
+  await page.addScriptTag({ path: detachPagesPath });
+
+  await page.evaluate(() => {
+    if (!window.PagedPolyfill) throw new Error('paged.js bundle missing');
+    window.PagedPolyfill.preview();
+  });
+  await page.waitForSelector('.pagedjs_pages');
+  phase.afterRender = Date.now() - t0;
+
+  const pdfBytes = await page.pdf({
+    printBackground:     true,
+    displayHeaderFooter: false,
+    preferCSSPageSize:   true,
+    margin: { top: 0, right: 0, bottom: 0, left: 0 },
+  });
+  phase.afterGenerate = Date.now() - t0;
+  pdfInfo.bytes  = pdfBytes.length;
+  pdfInfo.sha256 = createHash('sha256').update(pdfBytes).digest('hex');
+  pdfInfo.path   = join(outDir, 'output.pdf');
+  writeFileSync(pdfInfo.path, pdfBytes);
+} catch (err) {
+  console.error('[probe error]', err);
+  exitCode = 1;
+} finally {
+  // Give the sampler one more tick to capture the post-generate peak
+  // before we tear the browser down (process exit collapses memory).
+  await new Promise((r) => setTimeout(r, 700));
+  sampler.kill();
+  await browser.close();
+}
+
+const total = Date.now() - t0;
+
+console.log();
+console.log('phase stamps:');
+console.log(`  after load    : +${fmtMs(phase.afterLoad     ?? 0)}`);
+console.log(`  after render  : +${fmtMs(phase.afterRender   ?? 0)}`);
+console.log(`  after generate: +${fmtMs(phase.afterGenerate ?? 0)}`);
+console.log(`  total         : +${fmtMs(total)}`);
+
+if (samples.length === 0) {
+  console.error('[probe] no samples captured -- sampler may have failed.');
+  process.exit(exitCode || 1);
+}
+
+writeFileSync(join(outDir, 'samples.json'), JSON.stringify(samples, null, 2));
+
+const snapshotAt = (elapsedMs) => {
+  let last = null;
+  for (const s of samples) {
+    if (s.t_elapsed > elapsedMs) break;
+    last = s;
+  }
+  return last;
+};
+
+const snapLoad     = snapshotAt(phase.afterLoad);
+const snapRender   = snapshotAt(phase.afterRender);
+const snapGenerate = snapshotAt(phase.afterGenerate);
+
+let peak = samples[0];
+for (const s of samples) if (s.total_private > peak.total_private) peak = s;
+
+const reportSnap = (label, s) => {
+  if (!s) { console.log(`  ${label}: (no sample)`); return; }
+  console.log(`  ${label}: ${fmtMB(s.total_private)} private  ${fmtMB(s.total_ws)} ws  (${s.n} procs, +${fmtMs(s.t_elapsed)})`);
+};
+
+console.log();
+console.log('phase-aligned memory snapshots (whole process tree):');
+reportSnap('after load    ', snapLoad);
+reportSnap('after render  ', snapRender);
+reportSnap('after generate', snapGenerate);
+console.log();
+console.log('peak:');
+reportSnap('              ', peak);
+
+const top = peak.rows.slice().sort((a, b) => b.private - a.private);
+console.log();
+console.log('top processes at peak:');
+for (const r of top) {
+  const role = r.role.padEnd(20);
+  console.log(`  ${role} pid=${String(r.pid).padEnd(6)} ${fmtMB(r.private)} private  ${fmtMB(r.ws)} ws`);
+}
+
+// Did a gpu-process ever appear across all samples?
+const gpuSeen = samples.some((s) => s.rows.some((r) => r.role === 'gpu-process'));
+console.log();
+console.log(`gpu-process seen in any sample: ${gpuSeen ? 'YES' : 'NO'}`);
+
+console.log();
+console.log('pdf output:');
+console.log(`  path  : ${pdfInfo.path ?? '(not written)'}`);
+console.log(`  bytes : ${pdfInfo.bytes ?? 0}`);
+console.log(`  sha256: ${pdfInfo.sha256 ?? '(n/a)'}`);
+
+process.exit(exitCode);
diff --git a/perf/probe-parallel.mjs b/perf/probe-parallel.mjs
new file mode 100644
index 0000000..cc67b5b
--- /dev/null
+++ b/perf/probe-parallel.mjs
@@ -0,0 +1,198 @@
+// N-shard `pageRanges` parallel-generate probe (default: two shards).
+//
+// Launches N puppeteer browsers in parallel. Each loads book.html, runs
+// paged.js + detach-pages, then calls page.pdf({ pageRanges: ... }) over
+// its slice of the document. Reports per-shard launch/load/render/
+// generate timings and the Promise.all wall clock so we can compare
+// against the single-process ~58 s (render ~10 s + generate ~43 s) the
+// README cites for the current pipeline.
+//
+// The probe does not concatenate slices or remap outlines -- the point
+// is just to see (a) what the wall-clock floor of parallel generate
+// looks like on this machine, and (b) whether the per-shard slices look
+// structurally sane (each opens via pdf-lib; page counts add up).
+//
+// Usage:
+//   node probe-parallel.mjs [path/to/book.html] [--shards N]
+//
+// Defaults to ../docs/_site-pdf/book.html and N=2 shards.
+
+import { pathToFileURL, fileURLToPath } from 'node:url';
+import { dirname, resolve, join } from 'node:path';
+import { mkdirSync, writeFileSync, existsSync, readFileSync } from 'node:fs';
+import puppeteer from 'puppeteer';
+import { PDFDocument, ParseSpeeds } from 'pdf-lib';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
+const args = process.argv.slice(2);
+let inputArg = null;
+let shardCount = 2;
+for (let i = 0; i < args.length; i++) {
+  const a = args[i];
+  if (a === '--shards') shardCount = parseInt(args[++i], 10);
+  else if (!inputArg && !a.startsWith('-')) inputArg = a;
+  else { console.error(`unknown arg: ${a}`); process.exit(2); }
+}
+if (!Number.isFinite(shardCount) || shardCount < 1) {
+  console.error(`--shards must be >= 1, got ${shardCount}`);
+  process.exit(2);
+}
+
+const inputPath = inputArg
+  ? resolve(process.cwd(), inputArg)
+  : resolve(__dirname, '..', 'docs', '_site-pdf', 'book.html');
+if (!existsSync(inputPath)) {
+  console.error(`book HTML not found: ${inputPath}`);
+  console.error('Build it first with docs/build.bat.');
+  process.exit(1);
+}
+
+const pagedScriptPath = resolve(__dirname, '..', 'docs', 'lib', 'paged.browser.js');
+const detachPagesPath = resolve(__dirname, 'detach-pages.js');
+for (const p of [pagedScriptPath, detachPagesPath]) {
+  if (!existsSync(p)) {
+    console.error(`missing required file: ${p}`);
+    process.exit(1);
+  }
+}
+
+const stamp = new Date().toISOString().replace(/[:.]/g, '-');
+const outDir = resolve(__dirname, 'results', `probe-parallel-${stamp}`);
+mkdirSync(outDir, { recursive: true });
+
+const fmtMs = (ms) => (ms / 1000).toFixed(2) + 's';
+
+console.log(`[probe] input  : ${inputPath}`);
+console.log(`[probe] output : ${outDir}`);
+console.log(`[probe] shards : ${shardCount}`);
+
+// Runs one shard: launches its own browser, renders the whole book with paged.js, prints only this shard's
+// page range to PDF, and resolves to { shard, pageCount, pageRange, bytes, outputPath, ...phase timings in ms }.
+async function runShard(shardIndex) {
+  const tStart = Date.now();
+  const browser = await puppeteer.launch({
+    // Matches docs/render-book.mjs (production path).
+    headless: true,
+    args: [
+      '--no-sandbox',
+      '--disable-dev-shm-usage',
+      '--allow-file-access-from-files',
+    ],
+  });
+  const tLaunched = Date.now();
+
+  try {
+    const page = await browser.newPage();
+    page.setDefaultTimeout(0);
+    page.on('pageerror',     (err) => console.error(`[shard ${shardIndex} pageerror]`, err.message));
+    page.on('requestfailed', (req) => {
+      const f = req.failure();
+      console.error(`[shard ${shardIndex} requestfailed]`, req.url(), f && f.errorText);
+    });
+
+    await page.emulateMediaType('print');
+
+    const tLoad = Date.now();
+    await page.goto(pathToFileURL(inputPath).href, { waitUntil: 'load' });
+    const tLoaded = Date.now();
+
+    await page.evaluate(() => {
+      window.PagedConfig = window.PagedConfig || {};
+      window.PagedConfig.auto = false;
+    });
+    await page.addScriptTag({ path: pagedScriptPath });
+    await page.addScriptTag({ path: detachPagesPath });
+
+    const tRender = Date.now();
+    await page.evaluate(() => {
+      if (!window.PagedPolyfill) {
+        throw new Error('paged.js bundle did not expose window.PagedPolyfill');
+      }
+      window.PagedPolyfill.preview();
+    });
+    await page.waitForSelector('.pagedjs_pages');
+    const tRendered = Date.now();
+    const pageCount = await page.evaluate(() => document.querySelectorAll('.pagedjs_pages > .pagedjs_page').length);
+
+    // Compute this shard's slice. Equal-ish split by page count: shard i
+    // covers pages [i*ceil(N/S)+1, min((i+1)*ceil(N/S), N)].
+    const slice = Math.ceil(pageCount / shardCount);
+    const first = shardIndex * slice + 1;
+    const last  = Math.min((shardIndex + 1) * slice, pageCount);
+    const pageRange = `${first}-${last}`;
+
+    const tGenerate = Date.now();
+    const pdfBuffer = await page.pdf({
+      printBackground:     true,
+      displayHeaderFooter: false,
+      preferCSSPageSize:   true,
+      margin: { top: 0, right: 0, bottom: 0, left: 0 },
+      pageRanges:          pageRange,
+    });
+    const tGenerated = Date.now();
+
+    const shardPath = join(outDir, `shard-${shardIndex}.pdf`);
+    writeFileSync(shardPath, pdfBuffer);
+
+    return {
+      shard:     shardIndex,
+      pageCount,
+      pageRange,
+      bytes:     pdfBuffer.length,
+      outputPath: shardPath,
+      launch:    tLaunched   - tStart,
+      load:      tLoaded     - tLoad,      // page.goto only
+      render:    tRendered   - tRender,    // preview() call until .pagedjs_pages is found
+      generate:  tGenerated  - tGenerate,
+      total:     tGenerated  - tStart,
+    };
+  } finally {
+    await browser.close();
+  }
+}
+
+const tWall = Date.now();
+const shards = await Promise.all(
+  Array.from({ length: shardCount }, (_, i) => runShard(i))
+);
+const wallClock = Date.now() - tWall;
+
+console.log();
+console.log('per-shard timings:');
+for (const s of shards) {
+  console.log(
+    `  shard ${s.shard} (range ${s.pageRange} of ${s.pageCount}): ` +
+    `launch ${fmtMs(s.launch)}, load ${fmtMs(s.load)}, ` +
+    `render ${fmtMs(s.render)}, generate ${fmtMs(s.generate)}, ` +
+    `${(s.bytes / 1024 / 1024).toFixed(1)} MB, total ${fmtMs(s.total)}`
+  );
+}
+console.log();
+console.log(`wall clock (Promise.all): ${fmtMs(wallClock)}`);
+
+console.log();
+console.log('verification (pdf-lib load + page count):');
+let okAll = true;
+let totalPagesOut = 0;
+for (const s of shards) {
+  try {
+    const t0 = Date.now();
+    const doc = await PDFDocument.load(readFileSync(s.outputPath), {
+      parseSpeed: ParseSpeeds.Fastest,
+    });
+    const n = doc.getPageCount();
+    totalPagesOut += n;
+    console.log(`  shard ${s.shard}: ${n} pages, load ${fmtMs(Date.now() - t0)} -- ok`);
+  } catch (err) {
+    okAll = false;
+    console.error(`  shard ${s.shard}: load failed -- ${err.message}`);
+  }
+}
+const expected = shards[0]?.pageCount ?? 0;
+console.log();
+console.log(`output total pages: ${totalPagesOut}`);
+console.log(`expected:           ${expected}`);
+console.log(`coverage:           ${totalPagesOut === expected ? 'OK' : 'MISMATCH'}`);
+
+process.exit(okAll && totalPagesOut === expected ? 0 : 1);
diff --git a/perf/probe-renderer-mem.mjs b/perf/probe-renderer-mem.mjs
new file mode 100644
index 0000000..449a49c
--- /dev/null
+++ b/perf/probe-renderer-mem.mjs
@@ -0,0 +1,326 @@
+// Per-allocator memory breakdown for the renderer process during a
+// book-PDF render. Drives Chromium's memory-infra tracing system to
+// capture detailed process memory dumps (PMDs) at three points:
+// post-render, mid-generate, post-generate. Reports the dominant
+// allocator buckets per dump so we can see where the renderer's
+// ~1.9 GB goes beyond the V8 heap.
+//
+// Dumps come from MemoryDumpManager inside Chromium. Categories cover
+// V8 heap, Blink GC (Oilpan), partition_alloc pools, Skia caches,
+// discardable memory, malloc, IPC channel buffers, etc. -- the same
+// data chrome://memory-internals would show if we weren't headless.
+//
+// --gc-passes N inserts an extra dump point between post-render and the
+// generate phase: triggers N V8 gc() calls (requires
+// --js-flags=--expose-gc, added automatically) plus CDP
+// Memory.simulatePressureNotification to coax Chromium into freeing
+// caches, then dumps. Tests whether the ~272 MB blink_gc growth
+// during generate can be pre-released. N=0 skips explicit gc() and
+// only fires the pressure notification. --gc is shorthand for
+// --gc-passes 5.
+//
+// Usage:
+//   node probe-renderer-mem.mjs [path/to/book.html]
+//                               [--gc | --gc-passes N]
+
+import { pathToFileURL, fileURLToPath } from 'node:url';
+import { dirname, resolve, join } from 'node:path';
+import { mkdirSync, writeFileSync, existsSync, readFileSync } from 'node:fs';
+import { createHash } from 'node:crypto';
+import puppeteer from 'puppeteer';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+
+const args = process.argv.slice(2);
+let inputArg = null;
+let gcPasses = -1;
+for (let i = 0; i < args.length; i++) {
+  const a = args[i];
+  if (a === '--gc') gcPasses = 5;
+  else if (a === '--gc-passes') gcPasses = parseInt(args[++i], 10);
+  else if (!inputArg && !a.startsWith('-')) inputArg = a;
+  else { console.error(`unknown arg: ${a}`); process.exit(2); }
+}
+const forceGc = gcPasses !== -1 || args.includes('--gc-passes'); // -1 = no GC flag given; a bad value must error, not disable GC
+if (forceGc && !(Number.isInteger(gcPasses) && gcPasses >= 0)) {
+  console.error(`--gc-passes requires a non-negative integer, got ${args[args.indexOf('--gc-passes')+1]}`);
+  process.exit(2);
+}
+
+const inputPath = inputArg
+  ? resolve(process.cwd(), inputArg)
+  : resolve(__dirname, '..', 'docs', '_site-pdf', 'book.html');
+if (!existsSync(inputPath)) {
+  console.error(`book HTML not found: ${inputPath}`);
+  process.exit(1);
+}
+
+const pagedScriptPath = resolve(__dirname, '..', 'docs', 'lib', 'paged.browser.js');
+const detachPagesPath = resolve(__dirname, 'detach-pages.js');
+for (const p of [pagedScriptPath, detachPagesPath]) {
+  if (!existsSync(p)) {
+    console.error(`missing required file: ${p}`);
+    process.exit(1);
+  }
+}
+
+const stamp = new Date().toISOString().replace(/[:.]/g, '-');
+const outDir = resolve(__dirname, 'results', `probe-renderer-mem-${stamp}`);
+mkdirSync(outDir, { recursive: true });
+
+const fmtMB = (b) => {
+  if (b === null || b === undefined || Number.isNaN(b)) return '   ? MB';
+  return (b / 1024 / 1024).toFixed(0).padStart(5) + ' MB';
+};
+
+console.log(`[probe] input    : ${inputPath}`);
+console.log(`[probe] output   : ${outDir}`);
+console.log(`[probe] gc-passes: ${forceGc ? gcPasses : '(off)'}`);
+
+// Match production launch args (docs/render-book.mjs). --expose-gc
+// is added when --gc is set so window.gc() inside the page works;
+// pinning V8 to that flag has no measurable cost on render or generate.
+const chromeArgs = [
+  '--no-sandbox',
+  '--disable-dev-shm-usage',
+  '--allow-file-access-from-files',
+  '--disable-gpu',
+  '--disable-software-rasterizer',
+];
+if (forceGc) chromeArgs.push('--js-flags=--expose-gc');
+
+const browser = await puppeteer.launch({
+  headless: true,
+  args: chromeArgs,
+});
+
+let exitCode = 0;
+const dumpRequests = [];
+
+try {
+  const page = await browser.newPage();
+  page.setDefaultTimeout(0);
+  page.on('pageerror',     (e) => console.error('[page error]', e.message));
+  page.on('requestfailed', (r) => {
+    const f = r.failure();
+    console.error('[request failed]', r.url(), f && f.errorText);
+  });
+
+  await page.emulateMediaType('print');
+  await page.goto(pathToFileURL(inputPath).href, { waitUntil: 'load' });
+
+  await page.evaluate(() => {
+    window.PagedConfig = window.PagedConfig || {};
+    window.PagedConfig.auto = false;
+  });
+  await page.addScriptTag({ path: pagedScriptPath });
+  await page.addScriptTag({ path: detachPagesPath });
+
+  const tracePath = join(outDir, 'trace.json');
+  await page.tracing.start({
+    path: tracePath,
+    screenshots: false,
+    categories: ['disabled-by-default-memory-infra'],
+  });
+
+  const cdp = await page.createCDPSession();
+
+  const tRender = Date.now();
+  await page.evaluate(() => {
+    if (!window.PagedPolyfill) throw new Error('paged.js bundle missing');
+    window.PagedPolyfill.preview();
+  });
+  await page.waitForSelector('.pagedjs_pages');
+  console.log(`render: ${((Date.now() - tRender) / 1000).toFixed(2)}s`);
+
+  const dumpAt = async (label) => {
+    const r = await cdp.send('Tracing.requestMemoryDump', { levelOfDetail: 'detailed' });
+    console.log(`  ${label}: guid=${r.dumpGuid} success=${r.success}`);
+    dumpRequests.push({ label, guid: r.dumpGuid });
+  };
+
+  await dumpAt('post-render');
+
+  if (forceGc) {
+    const tGc = Date.now();
+    // V8 GC (Oilpan finalizers run in stages; repeated calls progress
+    // further through the heap). N=0 skips explicit gc() and tests
+    // whether the pressure notification alone is enough.
+    const passes = gcPasses;
+    await page.evaluate((n) => {
+      if (n === 0) return;
+      if (typeof gc !== 'function') {
+        console.warn('gc() not exposed; --expose-gc missing?');
+        return;
+      }
+      for (let i = 0; i < n; i++) gc();
+    }, passes);
+    const tAfterGc = Date.now();
+    // Coax Chromium into dropping caches across all heaps.
+    await cdp.send('Memory.simulatePressureNotification', { level: 'critical' });
+    // Chromium GC finalizers are async; give them a beat.
+    await new Promise((r) => setTimeout(r, 500));
+    console.log(`gc-pass(${passes}): ${((tAfterGc - tGc) / 1000).toFixed(2)}s  total(+pressure): ${((Date.now() - tGc) / 1000).toFixed(2)}s`);
+    await dumpAt('post-gc');
+  }
+
+  const midTimer = setTimeout(() => { dumpAt('mid-generate').catch(() => {}); }, 25000);
+
+  const tGen = Date.now();
+  const pdfBytes = await page.pdf({
+    printBackground:     true,
+    displayHeaderFooter: false,
+    preferCSSPageSize:   true,
+    margin: { top: 0, right: 0, bottom: 0, left: 0 },
+  });
+  clearTimeout(midTimer);
+  console.log(`generate: ${((Date.now() - tGen) / 1000).toFixed(2)}s`);
+
+  await dumpAt('post-generate');
+
+  const pdfPath = join(outDir, 'output.pdf');
+  writeFileSync(pdfPath, pdfBytes);
+  const sha = createHash('sha256').update(pdfBytes).digest('hex');
+  console.log(`pdf: ${pdfBytes.length} bytes  sha256=${sha}`);
+
+  await page.tracing.stop();
+  console.log(`trace written: ${tracePath}`);
+
+  reportDumps(tracePath, dumpRequests);
+} catch (err) {
+  console.error('[probe error]', err);
+  exitCode = 1;
+} finally {
+  await browser.close();
+}
+
+process.exit(exitCode);
+
+function reportDumps(tracePath, requests) {
+  const trace = JSON.parse(readFileSync(tracePath, 'utf8'));
+  const events = Array.isArray(trace) ? trace : (trace.traceEvents ?? []);
+
+  // In modern Chromium both light and detailed PMDs are ph='v' (lowercase);
+  // detailed dumps carry args.dumps.allocators, light ones only
+  // process_totals. We want detailed.
+  const detailed = events.filter((e) => e.ph === 'v' && e.args?.dumps?.allocators);
+
+  // Group by event id (dump GUID), then order by min timestamp so the
+  // dumps in the trace map onto the requested labels by insertion order
+  // (Chromium renumbers GUIDs per-trace-session, so the CDP-returned
+  // GUID won't match the trace event id; we line them up by order).
+  const byId = new Map();
+  for (const e of detailed) {
+    const k = String(e.id);
+    if (!byId.has(k)) byId.set(k, []);
+    byId.get(k).push(e);
+  }
+  const groups = Array.from(byId.entries())
+    .map(([id, procs]) => ({ id, ts: Math.min(...procs.map((e) => e.ts)), procs }))
+    .sort((a, b) => a.ts - b.ts);
+
+  // Resolve pid -> process_name from metadata events.
+  const procName = new Map();
+  for (const e of events) {
+    if (e.ph === 'M' && e.name === 'process_name' && e.args?.name) {
+      procName.set(e.pid, e.args.name);
+    }
+  }
+
+  console.log(`\nfound ${groups.length} detailed dumps in trace`);
+  for (const g of groups) console.log(`  dump ${g.id} @ t=${g.ts}us  (${g.procs.length} processes)`);
+
+  // Match by insertion order: first requested label -> first dump, etc.
+  const n = Math.min(requests.length, groups.length);
+  for (let i = 0; i < n; i++) {
+    reportDump(requests[i].label, groups[i].id, groups[i].procs, procName, events);
+  }
+  if (groups.length < requests.length) {
+    console.log(`\n(only ${groups.length} dumps in trace, expected ${requests.length}; some may have been dropped)`);
+  }
+}
+
+function reportDump(label, id, procs, procName, allEvents) {
+  console.log(`\n=== ${label}  (dump ${id}, ${procs.length} processes) ===`);
+
+  // Detailed dumps in modern Chromium typically don't carry
+  // process_totals -- those live in the matching light dump event
+  // (same id + pid). Fall back when missing.
+  const findTotals = (id, pid, eDetailed) => {
+    const t = eDetailed.args?.dumps?.process_totals;
+    if (t && (t.private_footprint_bytes || t.peak_resident_set_size)) return t;
+    const light = allEvents.find((x) =>
+      x.ph === 'v' && String(x.id) === id && x.pid === pid &&
+      x.args?.dumps?.process_totals
+    );
+    return light?.args.dumps.process_totals ?? {};
+  };
+
+  const procRows = procs.map((e) => {
+    const totals     = findTotals(id, e.pid, e);
+    const allocators = e.args?.dumps?.allocators ?? {};
+    const resident   = parseHexBytes(totals.peak_resident_set_size);
+    const priv       = parseHexBytes(totals.private_footprint_bytes);
+    return {
+      pid:        e.pid,
+      procName:   procName.get(e.pid) ?? '(unknown)',
+      resident,
+      priv,
+      allocators,
+    };
+  });
+  procRows.sort((a, b) => (b.priv ?? 0) - (a.priv ?? 0));
+
+  // Show top processes, deep-dive on the largest.
+  for (let i = 0; i < procRows.length; i++) {
+    const r = procRows[i];
+    const tag = i === 0 ? '*' : ' ';
+    console.log(`  ${tag} pid=${String(r.pid).padEnd(6)} ${r.procName.padEnd(20)} private ${fmtMB(r.priv)}  resident ${fmtMB(r.resident)}`);
+  }
+
+  // Deep-dive on the renderer with the largest footprint.
+  const top = procRows[0];
+  if (!top) return;
+  console.log(`\n  top process (pid=${top.pid}) allocator breakdown (>= 1 MB):`);
+  const rows = [];
+  for (const [name, info] of Object.entries(top.allocators)) {
+    if (name.includes('/')) continue; // top-level only
+    const size = parseHexBytes(info?.attrs?.size?.value);
+    if (size == null || size < 1024 * 1024) continue;
+    rows.push({ name, size });
+  }
+  rows.sort((a, b) => b.size - a.size);
+  let sum = 0;
+  for (const r of rows) {
+    sum += r.size;
+    console.log(`    ${r.name.padEnd(36)} ${fmtMB(r.size)}`);
+  }
+  console.log(`    ${'(sum of >=1 MB top-level)'.padEnd(36)} ${fmtMB(sum)}`);
+
+  // Sub-breakdown of the biggest top-level entries (typical: blink_gc, malloc).
+  for (const big of rows.slice(0, 3)) {
+    const subs = [];
+    for (const [name, info] of Object.entries(top.allocators)) {
+      if (!name.startsWith(big.name + '/')) continue;
+      // Only one level below the parent.
+      const sub = name.slice(big.name.length + 1);
+      if (sub.includes('/')) continue;
+      const size = parseHexBytes(info?.attrs?.size?.value);
+      if (size == null || size < 1024 * 512) continue; // 0.5 MB cut-off for subs
+      subs.push({ name: sub, size });
+    }
+    if (subs.length === 0) continue;
+    subs.sort((a, b) => b.size - a.size);
+    console.log(`\n    ${big.name} sub-breakdown:`);
+    for (const s of subs.slice(0, 12)) {
+      console.log(`      ${s.name.padEnd(34)} ${fmtMB(s.size)}`);
+    }
+  }
+}
+
+function parseHexBytes(s) {
+  if (s == null) return null;
+  // memory-infra sizes are hex strings, sometimes with a leading 0x.
+  const n = parseInt(String(s), 16);
+  return Number.isFinite(n) ? n : null;
+}
diff --git a/perf/sample-mem.ps1 b/perf/sample-mem.ps1
new file mode 100644
index 0000000..9c970a3
--- /dev/null
+++ b/perf/sample-mem.ps1
@@ -0,0 +1,114 @@
+# Poll memory for a process tree rooted at $RootPid, emitting one JSON
+# sample per tick on stdout. Used by probe-memory.mjs to watch the
+# Chromium tree puppeteer spawns during a book-PDF render.
+#
+# Each sample:
+#   { "t": <iso ts>, "n": <proc count>, "total_private": <bytes>,
+#     "total_ws": <bytes>, "rows": [{pid,name,role,private,ws}, ...] }
+#
+# When the root process is gone, emits one final {"done":true} and exits.
+#
+# Usage:
+#   powershell -NoProfile -File sample-mem.ps1 -RootPid <pid> [-IntervalMs 500]
+param(
+    [Parameter(Mandatory)][int]$RootPid,
+    [int]$IntervalMs = 500
+)
+$ErrorActionPreference = 'Continue'
+[Console]::OutputEncoding = [System.Text.Encoding]::UTF8
+
+function Get-TreeProcesses {
+    param([int]$rootPid)
+    # One CIM query for the whole process table, then walk parent->children.
+    $all = Get-CimInstance -Class Win32_Process `
+        -Property ProcessId,ParentProcessId,Name,CommandLine
+    $byId = @{}
+    foreach ($p in $all) { $byId[[int]$p.ProcessId] = $p }
+    $byParent = @{}
+    foreach ($p in $all) {
+        $parentId = [int]$p.ParentProcessId
+        if (-not $byParent.ContainsKey($parentId)) {
+            $byParent[$parentId] = New-Object Collections.Generic.List[object]
+        }
+        [void]$byParent[$parentId].Add($p)
+    }
+    $stack = New-Object Collections.Generic.Stack[int]
+    $stack.Push($rootPid)
+    $tree = New-Object Collections.Generic.List[object]
+    $seen = @{}
+    while ($stack.Count -gt 0) {
+        $id = $stack.Pop()
+        if ($seen.ContainsKey($id)) { continue }
+        $seen[$id] = $true
+        if ($byId.ContainsKey($id)) { [void]$tree.Add($byId[$id]) }
+        if ($byParent.ContainsKey($id)) {
+            foreach ($c in $byParent[$id]) { $stack.Push([int]$c.ProcessId) }
+        }
+    }
+    return $tree
+}
+
+function Get-ChromeRole {
+    param([string]$cmdline)
+    # The browser parent process has no --type arg. Children pass --type=X
+    # (renderer, gpu-process, utility, crashpad-handler, ...).
+    if ([string]::IsNullOrEmpty($cmdline)) { return 'browser' }
+    if ($cmdline -match '--type=([^\s"]+)') {
+        $role = $Matches[1]
+        # Utility subprocesses pass --utility-sub-type too; surface that.
+        if ($role -eq 'utility' -and $cmdline -match '--utility-sub-type=([^\s"]+)') {
+            return 'utility:' + $Matches[1]
+        }
+        return $role
+    }
+    return 'browser'
+}
+
+while ($true) {
+    try {
+        $descendants = Get-TreeProcesses -rootPid $RootPid
+    } catch {
+        Start-Sleep -Milliseconds $IntervalMs
+        continue
+    }
+
+    if ($descendants.Count -eq 0) {
+        Write-Output '{"done":true}'
+        break
+    }
+
+    $totalPrivate = 0L
+    $totalWS = 0L
+    $rows = New-Object Collections.Generic.List[object]
+    foreach ($d in $descendants) {
+        $procId = [int]$d.ProcessId
+        $proc = Get-Process -Id $procId -ErrorAction SilentlyContinue
+        if ($null -eq $proc) { continue }
+        $priv = [int64]$proc.PrivateMemorySize64
+        $ws   = [int64]$proc.WorkingSet64
+        $totalPrivate += $priv
+        $totalWS += $ws
+        [void]$rows.Add([ordered]@{
+            pid     = $procId
+            name    = $proc.ProcessName
+            role    = Get-ChromeRole -cmdline $d.CommandLine
+            private = $priv
+            ws      = $ws
+        })
+    }
+
+    if ($rows.Count -eq 0) {
+        Write-Output '{"done":true}'
+        break
+    }
+
+    $sample = [ordered]@{
+        t             = (Get-Date).ToString("o")
+        n             = $rows.Count
+        total_private = $totalPrivate
+        total_ws      = $totalWS
+        rows          = $rows
+    }
+    Write-Output ($sample | ConvertTo-Json -Compress -Depth 5)
+    Start-Sleep -Milliseconds $IntervalMs
+}

From 921f2a4af5b25ca2b2cae3cf336a1fa24d5d0060 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 19:22:48 +0200
Subject: [PATCH 13/18] probe-renderer-mem: add --heap-snapshot for
 retainer-chain investigation.

CDP HeapProfiler.takeHeapSnapshot at post-render (and post-gc when
combined with --gc-passes) -- ~200 MB .heapsnapshot file per dump,
loadable in Chrome DevTools Memory tab. The Comparison view between
pre- and post-gc snapshots shows which V8-visible categories the GC
freed; Summary + filter "Detached" surfaces DOM nodes still held by
JS after their owning page was removed, and Retainers gives the
exact chain. Workflow documented in perf/README.md under "--heap-
snapshot: extract V8 retainer chains".

Oilpan-only objects (PhysicalBoxFragment, LogicalLineItems,
ConstraintSpace::RareData -- no V8 wrapper) don't appear in the V8
snapshot but are typically owned by a DOM node that does, so the
investigation route is detached-DOM-from-snapshot + ownership graph
from the memory-infra dump.
---
 perf/README.md              | 55 ++++++++++++++++++++++++++++++-------
 perf/probe-renderer-mem.mjs | 49 +++++++++++++++++++++++++++++++--
 2 files changed, 91 insertions(+), 13 deletions(-)

diff --git a/perf/README.md b/perf/README.md
index 68af042..d18dde5 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -5528,13 +5528,48 @@ Hypotheses for where to look first when chasing this:
   capture closures; their references survive even after
   the layout work is done.
 
-A heap profile snapshot taken at post-render, opened in
-Chrome DevTools' "Retainers" view, would surface the
-exact chain for the largest object categories
-(`PhysicalBoxFragment`, `LogicalLineItems`,
-`ConstraintSpace::RareData`) and point at which JS
-object is keeping them alive. CDP's `HeapProfiler.
-takeHeapSnapshot` is the entry point;
-`probe-renderer-mem.mjs` already has a CDP session and
-could be extended with a `--heap-snapshot` flag if this
-becomes the next investigation.
+#### `--heap-snapshot`: extract V8 retainer chains
+
+`probe-renderer-mem.mjs --heap-snapshot` captures a V8
+heap snapshot at post-render via CDP
+`HeapProfiler.takeHeapSnapshot` and writes it as
+`outDir/post-render.heapsnapshot` (~200 MB on the
+1651-page book). Combined with `--gc-passes N`, a
+second snapshot `post-gc.heapsnapshot` is taken right
+after the GC pass.
+
+DevTools workflow to chase the retention:
+
+1. Run `node perf/probe-renderer-mem.mjs --gc-passes 1
+   --heap-snapshot`. ~80 s wall clock; output dir
+   echoed to stdout.
+2. Open Chrome DevTools (any tab) -> Memory tab.
+3. Load both snapshots (the "Load profile" icon). Select
+   `post-gc.heapsnapshot`, switch to **Comparison** view,
+   base = the post-render snapshot. The `# Deleted` and
+   `Freed Size` columns show which V8-visible object
+   categories the GC was able to release.
+4. Switch the dropdown to **Summary** on the
+   post-render snapshot and filter by "Detached" --
+   detached `HTMLDivElement` / `Text` / etc. are DOM
+   nodes still held by JS after their owning page was
+   removed from the visible tree. Each row's
+   **Retainers** pane shows the exact JS-object chain
+   keeping the node alive (paged.js hook closure,
+   chunker `pages[]` entry, retained event listener,
+   etc.).
+5. For Oilpan-only objects (`PhysicalBoxFragment`,
+   `LogicalLineItems`, `ConstraintSpace::RareData` --
+   no V8 wrapper) the snapshot won't show them
+   directly. They're typically owned by a DOM node
+   that *is* in the snapshot; trace the detached DOM
+   from step 4 to its layout state via the C++
+   ownership graph in the memory-infra dump
+   (`blink_gc/main/blink::...` paths in the per-
+   allocator breakdown -- the analyzer above prints
+   the top-N classes).
+
+The snapshot itself is JS-side only. The complete
+picture is heap-snapshot (V8 reachability) + memory-
+infra dump (per-allocator + per-type sizes) =
+"what's there" + "what's keeping it there".
diff --git a/perf/probe-renderer-mem.mjs b/perf/probe-renderer-mem.mjs
index 449a49c..96478a7 100644
--- a/perf/probe-renderer-mem.mjs
+++ b/perf/probe-renderer-mem.mjs
@@ -19,9 +19,19 @@
 // only fires the pressure notification. --gc is shorthand for
 // --gc-passes 5.
 //
+// --heap-snapshot takes a V8/Blink heap snapshot at post-render via
+// CDP HeapProfiler.takeHeapSnapshot, written to outDir as
+// post-render.heapsnapshot. Combined with --gc-passes, a second
+// snapshot post-gc.heapsnapshot is taken right after the GC pass.
+// Load both in Chrome DevTools (Memory -> Load profile) -- the
+// diff between them shows exactly which objects the GC freed, and
+// the Retainers view on the largest object classes shows what's
+// keeping the dangling references alive in JS-land.
+//
 // Usage:
 //   node probe-renderer-mem.mjs [path/to/book.html]
 //                               [--gc | --gc-passes N]
+//                               [--heap-snapshot]
 
 import { pathToFileURL, fileURLToPath } from 'node:url';
 import { dirname, resolve, join } from 'node:path';
@@ -34,10 +44,12 @@ const __dirname = dirname(fileURLToPath(import.meta.url));
 const args = process.argv.slice(2);
 let inputArg = null;
 let gcPasses = -1;
+let heapSnap = false;
 for (let i = 0; i < args.length; i++) {
   const a = args[i];
   if (a === '--gc') gcPasses = 5;
   else if (a === '--gc-passes') gcPasses = parseInt(args[++i], 10);
+  else if (a === '--heap-snapshot') heapSnap = true;
   else if (!inputArg && !a.startsWith('-')) inputArg = a;
   else { console.error(`unknown arg: ${a}`); process.exit(2); }
 }
@@ -73,9 +85,10 @@ const fmtMB = (b) => {
   return (b / 1024 / 1024).toFixed(0).padStart(5) + ' MB';
 };
 
-console.log(`[probe] input    : ${inputPath}`);
-console.log(`[probe] output   : ${outDir}`);
-console.log(`[probe] gc-passes: ${forceGc ? gcPasses : '(off)'}`);
+console.log(`[probe] input        : ${inputPath}`);
+console.log(`[probe] output       : ${outDir}`);
+console.log(`[probe] gc-passes    : ${forceGc ? gcPasses : '(off)'}`);
+console.log(`[probe] heap-snapshot: ${heapSnap}`);
 
 // Match production launch args (docs/render-book.mjs). --expose-gc
 // is added when --gc is set so window.gc() inside the page works;
@@ -139,7 +152,36 @@ try {
     dumpRequests.push({ label, guid: r.dumpGuid });
   };
 
+  // V8/Blink heap snapshot via CDP. Streams chunks; concatenated text
+  // is a JSON document DevTools recognises as a .heapsnapshot. The
+  // snapshot covers only objects reachable from JS-land roots -- it
+  // does not see Oilpan-only objects that have no JS reference. That's
+  // why dangling Blink objects show up as "Detached HTMLElement" or
+  // similar in DevTools' Retainers view: the V8 wrapper exists, and
+  // the path back to it is what we need to find.
+  const takeHeapSnapshot = async (label) => {
+    const t0 = Date.now();
+    await cdp.send('HeapProfiler.enable');
+    const chunks = [];
+    const onChunk = ({ chunk }) => chunks.push(chunk);
+    cdp.on('HeapProfiler.addHeapSnapshotChunk', onChunk);
+    try {
+      await cdp.send('HeapProfiler.takeHeapSnapshot', {
+        reportProgress:            false,
+        treatGlobalObjectsAsRoots: true,
+        captureNumericValue:       false,
+      });
+    } finally {
+      cdp.off('HeapProfiler.addHeapSnapshotChunk', onChunk);
+    }
+    const snap = chunks.join('');
+    const path = join(outDir, `${label}.heapsnapshot`);
+    writeFileSync(path, snap);
+    console.log(`heap snapshot ${label}: ${path} (${(snap.length / 1024 / 1024).toFixed(1)} MB, ${((Date.now() - t0) / 1000).toFixed(2)}s)`);
+  };
+
   await dumpAt('post-render');
+  if (heapSnap) await takeHeapSnapshot('post-render');
 
   if (forceGc) {
     const tGc = Date.now();
@@ -162,6 +204,7 @@ try {
     await new Promise((r) => setTimeout(r, 500));
     console.log(`gc-pass(${passes}): ${((tAfterGc - tGc) / 1000).toFixed(2)}s  total(+pressure): ${((Date.now() - tGc) / 1000).toFixed(2)}s`);
     await dumpAt('post-gc');
+    if (heapSnap) await takeHeapSnapshot('post-gc');
   }
 
   const midTimer = setTimeout(() => { dumpAt('mid-generate').catch(() => {}); }, 25000);

From 98b292b9ead24d3957499f597422d52940765705 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 19:44:15 +0200
Subject: [PATCH 14/18] Retention investigation: it's unswept Oilpan garbage,
 not a JS leak.

V8 heap snapshot diff pre-gc vs post-gc is byte-identical -- same
2,938,992 nodes, same 108.9 MB self_size, same per-category counts.
Rules out the "dangling JS references" hypothesis the gc-pass probe
initially suggested.

Per-Blink-class diff of the memory-infra dumps (new
perf/diff-blink-classes.mjs) shows what actually gets freed: style-
system caches and layout intermediates that are unreachable from
the moment their page finalises but stay in Oilpan because nothing
forces a major GC during the synchronous render loop. Two ~100% freed
categories are the cleanest signal: CachedMatchedProperties (Blink's
style-sharing cache, dead after layout) and GridItemData (paged.js's
per-page-template CSS grid items, dead after layout). The remainder
is sub-ComputedStyle (StyleBoxData, StyleSurroundData, StyleMisc*),
ShapeResultView / HarfBuzzRunGlyphData / ShapeResultRun, layout-
fragment RareData.

Conclusion: not a leak, not actionable as a retention fix. The only
direct mitigation is forcing a GC (already rejected, costs ~1 s).
Indirect lever is upstream DOM size (DOM-shape audit).

Tooling produced:
- perf/analyze-heap-snapshot.mjs: top type x name aggregation +
  pairwise diff for V8 heap snapshots. Also surfaces the
  detachedness=2 subset (corrected from earlier mis-read of the V8
  DetachednessV8 enum, where {1=Attached, 2=Detached}).
- perf/diff-blink-classes.mjs: per-Blink-class diff between two
  memory-infra dumps in the same trace. Strips the per-dump GUID
  suffix from class names so the same class lines up across dumps.

README updated: GC-pass section title and intro corrected; "What
might be holding the references" replaced with "What the GC actually
freed"; --heap-snapshot workflow re-framed as a visibility check
rather than a retainer-chain hunt (because the diff is zero).
---
 perf/README.md                 | 256 +++++++++++++++++++++------------
 perf/analyze-heap-snapshot.mjs | 172 ++++++++++++++++++++++
 perf/diff-blink-classes.mjs    | 123 ++++++++++++++++
 3 files changed, 459 insertions(+), 92 deletions(-)
 create mode 100644 perf/analyze-heap-snapshot.mjs
 create mode 100644 perf/diff-blink-classes.mjs

diff --git a/perf/README.md b/perf/README.md
index d18dde5..9ace1ee 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -5406,29 +5406,34 @@ Next memory targets, in rough order of effort vs payoff:
    inert wrappers, collapse nested spans) is the most
    accessible lever; we own the Jekyll pipeline end to
    end.
-2. **Chase the dangling references** that the GC probe
-   surfaced (see next subsection). 180+ MB of Blink
-   objects exist post-render but are unreachable from
-   anything user-visible; if we can find what's holding
-   them in the JS-level state (paged.js hooks?
-   detach-pages closures? jQuery-style retained
-   references?), the renderer peak shrinks without any
-   runtime GC cost.
+2. **Layout-intermediate garbage** that Oilpan doesn't
+   sweep during the synchronous render loop. ~75-225
+   MB of `CachedMatchedProperties`, sub-`ComputedStyle`
+   data, `GridItemData`, text-shape intermediates --
+   not retained by anything, just unswept. See the
+   "GC-pass probe" subsection for the per-class
+   breakdown; the only direct mitigation is forcing
+   GC (rejected, costs ~1 s), and the indirect lever
+   is upstream DOM size (item 1 above).
 3. **Page-template grid replacement** in vendored
    paged.js -- ~132 MB potential. Largest single target
    but an invasive rewrite of paged.js's `@page` area
    handler.
 
-### GC-pass probe: 180 MB of dangling Blink objects
+### GC-pass probe: 180 MB of unswept Oilpan garbage
 
 Forcing a `window.gc()` pass between render and generate
 frees ~180 MB of `blink_objects` (the typed view of the
 Oilpan heap) without touching anything user-visible.
-That means there's a clear shape of dangling references
-in our post-render state -- objects unreachable from any
-DOM, layout, or print-preview anchor, but still held by
-some JS-level retention somewhere in the paged.js /
-detach-pages chain.
+Initial framing: "dangling references somewhere in the
+paged.js / detach-pages chain". Investigation (see "What
+the GC actually freed" subsection below) shows the
+framing was wrong -- there is no JS-side retention.
+What the GC frees is per-page layout intermediate state
+(style sharing caches, `ComputedStyle` sub-data, grid
+item data, text-shape views) that's already unreachable
+from anything but stays in Oilpan because nothing forces
+a major GC during the synchronous render loop.
 
 Probe: `perf/probe-renderer-mem.mjs --gc-passes N`.
 Launches with `--js-flags=--expose-gc`, runs N V8
@@ -5483,52 +5488,114 @@ PDF output is byte-identical across all variants
 (41,076,362 bytes; SHA differs only in metadata).
 
 **Not shipped.** 1 second per render is meaningful when
-multiplied across CI builds, and the GC pass is masking
-a real defect (the dangling references) rather than
-fixing it. The cleaner direction is to find what's
-retaining those objects and release them at the JS
-level, which would shrink the peak without any GC cost.
+multiplied across CI builds, and after investigating
+what the GC actually freed (below) it's clear there's
+no underlying defect to fix -- this is Blink's normal
+allocation behaviour, with Oilpan's normal sweep
+behaviour, just observed in a workload that doesn't
+give Oilpan an idle moment to sweep.
 
 The probe and the `--gc-passes` flag stay in
 [probe-renderer-mem.mjs](probe-renderer-mem.mjs) for
-future use -- either as a measurement baseline when
-investigating the retention, or as a one-off escape
-hatch if a future bigger book ever hits a CI memory
-ceiling.
-
-#### What might be holding the references
-
-Hypotheses for where to look first when chasing this:
-
-- **paged.js hook chains** keep handler references on
-  the global `window.Paged.Hooks` object, and each hook
-  is called with per-page context (the page wrapper,
-  the chunker, the layout result). If a hook closure
-  captures a page-local value it can keep that page's
-  state alive past `finalizePage`. The render chain is
-  fully synchronous now (see [Stripping
-  headless-irrelevant async machinery](#stripping-headless-irrelevant-async-machinery))
-  so there are no awaited promises holding stale
-  closures; the suspect is closure capture in synchronous
-  hooks.
-- **detach-pages.js** removes the page DOM from the
-  visible tree but the handler itself retains a
-  per-page reference in its own arrays/maps; check the
-  handler's instance state for accumulating per-page
-  entries that should be cleared after the detach.
-- **Chunker / Layout instance fields** on
-  `window.PagedPolyfill.chunker` -- the chunker keeps
-  `pages[]`, `layout`, `breakToken`, etc. After
-  `preview()` returns the last page's state might be
-  retained on these fields and root the whole tree
-  through them.
-- **EventTarget retained listeners** on the document or
-  on `window` -- paged.js installs a few
-  `resize`/`afterPageLayout`/etc. event listeners that
-  capture closures; their references survive even after
-  the layout work is done.
-
-#### `--heap-snapshot`: extract V8 retainer chains
+future use -- either as a measurement baseline if a
+future bigger book ever hits a CI memory ceiling, or as
+an A/B reference if Blink's allocation pattern changes
+with a Chromium upgrade.
+
+#### What the GC actually freed
+
+Two analyses, both negative for the "dangling references"
+hypothesis, both positive for "Oilpan didn't sweep":
+
+**V8 heap snapshot diff (pre-gc vs post-gc):** byte-
+identical. Same 2,938,992 nodes, same 108.9 MB self_size,
+same per-category counts. The diff is zero across every
+node category in V8. Whatever the GC freed was invisible
+to V8's snapshot, which means it had no V8 wrapper --
+which means no JS reference can be holding it. Probe:
+[analyze-heap-snapshot.mjs](analyze-heap-snapshot.mjs)
+in single-snapshot or diff mode.
+
+**Per-Blink-class diff (memory-infra dumps):** the
+freed memory is concentrated in style-system caches and
+layout intermediates. Top freed classes between dump 0
+(post-render) and dump 1 (post-gc), 1-pass GC run:
+
+| class                                            | a_count | a_MB | b_count | b_MB | freed |
+| ------------------------------------------------ | ------- | ---- | ------- | ---- | ----- |
+| `CachedMatchedProperties`                        | 122,110 | 12.1 |     355 |  0.0 | **-12.1 MB** (~100%) |
+| `ComputedStyle`                                  | 380,974 | 26.2 | 244,772 | 16.8 |  -9.4 MB (~36%)      |
+| `ComputedStyleBase::StyleMisc2Data`              |  24,649 |  8.3 |   6,911 |  2.3 |  -6.0 MB             |
+| `ComputedStyleBase::StyleBoxData`                |  94,867 | 15.9 |  63,937 | 10.7 |  -5.2 MB             |
+| `ComputedStyleBase::StyleSurroundData`           |  32,350 |  9.6 |  15,101 |  4.5 |  -5.1 MB             |
+| `GridItemData`                                   |  27,508 |  5.0 |       0 |  0.0 | **-5.0 MB** (~100%)  |
+| `ShapeResultView`                                | 225,299 | 15.5 | 170,366 | 11.7 |  -3.8 MB             |
+| `HeapVectorBacking<HarfBuzzRunGlyphData>`        | 163,864 | 19.2 | 149,993 | 16.4 |  -2.9 MB             |
+| `LayoutResult::RareData`                         |  71,960 |  8.8 |  48,955 |  6.0 |  -2.8 MB             |
+| `ConstraintSpace::RareData`                      |  79,445 |  9.1 |  55,209 |  6.3 |  -2.8 MB             |
+| `ComputedStyleBase::StyleMisc1Data`              |  19,034 |  3.0 |   1,958 |  0.3 |  -2.7 MB             |
+| `ComputedStyleBase::StyleMiscData`               |  64,838 |  5.4 |  39,653 |  3.3 |  -2.1 MB             |
+| `LayoutResult`                                   | 179,728 | 13.7 | 155,052 | 11.8 |  -1.9 MB             |
+| ... (smaller)                                    |         |      |         |      |  -16  MB             |
+| **total**                                        |         |      |         |      | **-76 MB** (this run; -226 MB on a different run -- noisy) |
+
+The two ~100% freed categories tell the cleanest story:
+
+- **`CachedMatchedProperties`** is Blink's style-sharing
+  cache -- "which CSS rules matched element X, so that
+  similar element Y can reuse the resolved style". After
+  layout completes, it's dead state. Only useful if the
+  document gets relaid out, which our pipeline never
+  does.
+- **`GridItemData`** is per-item layout state for CSS
+  Grid. Paged.js puts each `@page` area inside a grid
+  to position the running headers / footers / margin
+  boxes; once the page is laid out, the `GridItemData`
+  for that page's items is dead.
+
+Everything else is style sub-structures
+(`ComputedStyleBase::Style*Data`) and text-shape
+intermediates (`ShapeResultView`, `HarfBuzzRunGlyphData`,
+`ShapeResultRun`) that get freed when their owning
+`ComputedStyle` or layout fragment becomes unreachable.
+All Blink-internal allocations driven by layout.
+
+What this means for the leak question:
+
+- **Not a leak.** Nothing holds these objects after
+  layout. They're unreachable from the moment their
+  page is finalised; they sit in Oilpan because
+  Chromium doesn't run a major GC during the
+  synchronous render loop.
+- **Not a JS-side retention.** detach-pages.js,
+  paged.js's chunker, hook chains, and event listeners
+  were the suspect list. The V8 snapshot diff rules
+  them all out -- if any of them held the layout state,
+  the snapshot would change between pre-gc and post-gc.
+- **It's a real over-allocation in the sense that we
+  hold ~75-225 MB longer than necessary**, but the cost
+  of fixing it (forcing a GC: ~1 s wall clock) outweighs
+  the CI memory headroom it would buy at our current
+  book size.
+
+The indirect lever still works: reducing the input DOM
+size reduces both peak working set AND this garbage
+fraction proportionally. That's the DOM-shape audit
+item in "Next memory targets".
+
+Tooling produced by this investigation, kept in
+[perf/](.) for re-use:
+
+- [analyze-heap-snapshot.mjs](analyze-heap-snapshot.mjs)
+  -- single-snapshot summary (top type x name by
+  aggregate bytes, detached subset) and pairwise diff
+  between two snapshots.
+- [diff-blink-classes.mjs](diff-blink-classes.mjs) --
+  per-Blink-class diff between two memory-infra dumps
+  in the same trace. Strips the per-dump GUID suffix
+  from class names so the diff lines up across dumps.
+
+#### `--heap-snapshot`: V8 visibility check
 
 `probe-renderer-mem.mjs --heap-snapshot` captures a V8
 heap snapshot at post-render via CDP
@@ -5538,38 +5605,43 @@ heap snapshot at post-render via CDP
 second snapshot `post-gc.heapsnapshot` is taken right
 after the GC pass.
 
-DevTools workflow to chase the retention:
-
-1. Run `node perf/probe-renderer-mem.mjs --gc-passes 1
-   --heap-snapshot`. ~80 s wall clock; output dir
-   echoed to stdout.
-2. Open Chrome DevTools (any tab) -> Memory tab.
-3. Load `post-render.heapsnapshot` (the "Load profile"
-   icon). Switch to **Comparison** view, base = the
-   post-gc snapshot. The `# Deleted` and `Freed size`
-   columns show which V8-visible object categories the
-   GC was able to release.
-4. Switch the dropdown to **Summary** on the
-   post-render snapshot and filter by "Detached" --
-   detached `HTMLDivElement` / `Text` / etc. are DOM
-   nodes still held by JS after their owning page was
-   removed from the visible tree. Each row's
-   **Retainers** pane shows the exact JS-object chain
-   keeping the node alive (paged.js hook closure,
-   chunker `pages[]` entry, retained event listener,
-   etc.).
-5. For Oilpan-only objects (`PhysicalBoxFragment`,
-   `LogicalLineItems`, `ConstraintSpace::RareData` --
-   no V8 wrapper) the snapshot won't show them
-   directly. They're typically owned by a DOM node
-   that *is* in the snapshot; trace the detached DOM
-   from step 4 to its layout state via the C++
-   ownership graph in the memory-infra dump
-   (`blink_gc/main/blink::...` paths in the per-
-   allocator breakdown -- the analyzer above prints
-   the top-N classes).
-
-The snapshot itself is JS-side only. The complete
-picture is heap-snapshot (V8 reachability) + memory-
-infra dump (per-allocator + per-type sizes) =
-"what's there" + "what's keeping it there".
+The original intent was a retainer-chain investigation
+to find what JS-side state was holding the Blink
+objects the GC frees. The result of that investigation
+(see "What the GC actually freed" above) is that
+**nothing on the V8 side holds them** -- the snapshot
+diff is byte-identical pre-gc vs post-gc, ruling out
+JS retention entirely. The freed memory is Oilpan-only,
+invisible to V8's snapshot.
+
+The snapshot tooling is still useful as a visibility
+check -- "is the renderer holding what I expect?" --
+and for finding any actual JS-side retention if one
+ever surfaces. CLI analysis:
+
+- `node perf/analyze-heap-snapshot.mjs <snap>` --
+  single-snapshot summary (top type x name by aggregate
+  bytes, plus actually-detached subset).
+- `node perf/analyze-heap-snapshot.mjs <a> <b>` --
+  pairwise diff: what categories grew or shrank.
+
+DevTools workflow (more interactive, for following
+specific retention chains):
+
+1. Open Chrome DevTools (any tab) -> Memory tab.
+2. Load `<...>.heapsnapshot` (the "Load profile" icon).
+   Browse the **Summary** view for the largest object
+   categories.
+3. For any object of interest, the **Retainers** pane
+   shows the chain of JS references holding it. Filter
+   by name (e.g. `Detached HTMLDivElement`) or by class.
+
+Oilpan-only objects (`CachedMatchedProperties`,
+`ComputedStyleBase::*Data`, `GridItemData`,
+`ShapeResultView`, layout fragments, etc.) do not appear
+in the V8 snapshot -- they have no V8 wrapper. The
+memory-infra dump + `diff-blink-classes.mjs` is the
+right tool for those. The complete picture is
+heap-snapshot (V8 reachability) + memory-infra dump
+(per-allocator + per-Blink-class sizes) = "what JS sees"
++ "what's actually in the renderer".
diff --git a/perf/analyze-heap-snapshot.mjs b/perf/analyze-heap-snapshot.mjs
new file mode 100644
index 0000000..ab6a6aa
--- /dev/null
+++ b/perf/analyze-heap-snapshot.mjs
@@ -0,0 +1,172 @@
+// Analyse a Chrome DevTools heap snapshot (.heapsnapshot JSON).
+//
+// In "single" mode reports per-type×name aggregate counts and sizes:
+// who holds the V8-visible memory in this snapshot, in descending
+// order of total bytes. In "diff" mode reports the per-key delta
+// between two snapshots: positive = retained more in B, negative =
+// freed between A and B. Used to identify which V8-visible object
+// categories a GC pass freed, i.e. to check whether any JS-side
+// retention is keeping objects alive in the no-GC baseline.
+//
+// Self_size is the shallow size of each node (not retained size --
+// computing retained size requires a dominator-tree pass over the
+// graph, which DevTools does interactively but we don't here). For
+// figuring out where the renderer's memory goes, the type×name
+// distribution is the actionable view.
+//
+// Usage:
+//   node analyze-heap-snapshot.mjs <snap.heapsnapshot>
+//   node analyze-heap-snapshot.mjs <before.heapsnapshot> <after.heapsnapshot>
+
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+
+const argv = process.argv.slice(2);
+if (argv.length < 1 || argv.length > 2) {
+  console.error('usage: node analyze-heap-snapshot.mjs <snap> [<after-snap-for-diff>]');
+  process.exit(2);
+}
+const pathA = resolve(process.cwd(), argv[0]);
+const pathB = argv[1] ? resolve(process.cwd(), argv[1]) : null;
+
+const fmtMB = (b) => (b / 1024 / 1024).toFixed(1).padStart(8) + ' MB';
+const fmtN  = (n) => n.toLocaleString().padStart(10);
+
+function loadSnapshot(path) {
+  const t0 = Date.now();
+  const bytes = readFileSync(path, 'utf8');
+  const snap  = JSON.parse(bytes);
+  console.log(`loaded ${path}  (${(bytes.length / 1024 / 1024).toFixed(1)} MB, ${((Date.now() - t0) / 1000).toFixed(1)}s)`);
+  return snap;
+}
+
+function decodeAggregate(snap) {
+  const meta = snap.snapshot.meta;
+  const nodes   = snap.nodes;
+  const strings = snap.strings;
+  const fields  = meta.node_fields;          // e.g. ["type","name","id","self_size","edge_count","detachedness"]
+  const types   = meta.node_types[0];        // the enum for the 'type' field
+  const F = fields.length;
+  const typeIdx     = fields.indexOf('type');
+  const nameIdx     = fields.indexOf('name');
+  const selfSizeIdx = fields.indexOf('self_size');
+  const detachIdx   = fields.indexOf('detachedness');
+
+  const byKey = new Map();
+  const detachedByKey = new Map();
+  const nodeCount = nodes.length / F;
+  for (let i = 0; i < nodeCount; i++) {
+    const base   = i * F;
+    const t      = types[nodes[base + typeIdx]];
+    const name   = strings[nodes[base + nameIdx]];
+    const size   = nodes[base + selfSizeIdx];
+    const det    = detachIdx >= 0 ? nodes[base + detachIdx] : 0;
+    const key    = `${t}:${name}`;
+    let cur = byKey.get(key);
+    if (!cur) { cur = { count: 0, bytes: 0 }; byKey.set(key, cur); }
+    cur.count += 1;
+    cur.bytes += size;
+    // V8 DetachednessV8 enum: 0=Unknown, 1=Attached, 2=Detached.
+    // Only 2 is "this DOM node is no longer reachable through the
+    // document tree".
+    if (det === 2) {
+      let dCur = detachedByKey.get(key);
+      if (!dCur) { dCur = { count: 0, bytes: 0 }; detachedByKey.set(key, dCur); }
+      dCur.count += 1;
+      dCur.bytes += size;
+    }
+  }
+  return { byKey, detachedByKey, nodeCount };
+}
+
+function topN(map, n, by = 'bytes') {
+  return Array.from(map, ([k, v]) => ({ key: k, ...v }))
+    .sort((a, b) => b[by] - a[by])
+    .slice(0, n);
+}
+
+const snapA = loadSnapshot(pathA);
+const aggA  = decodeAggregate(snapA);
+
+if (!pathB) { // single-snapshot mode: summarise snapshot A, then exit
+  const total = Array.from(aggA.byKey.values()).reduce((s, v) => s + v.bytes, 0);
+  console.log(`\nnodes: ${aggA.nodeCount.toLocaleString()}`);
+  console.log(`self_size total: ${fmtMB(total)}`);
+
+  console.log(`\ntop 30 type:name by aggregate bytes:`);
+  for (const r of topN(aggA.byKey, 30)) {
+    console.log(`  ${r.key.padEnd(48)} ${fmtN(r.count)} x  ${fmtMB(r.bytes)}`);
+  }
+
+  if (aggA.detachedByKey.size > 0) { // detachedByKey only holds nodes whose detachedness field is 2
+    const dTotal = Array.from(aggA.detachedByKey.values()).reduce((s, v) => s + v.bytes, 0);
+    console.log(`\ndetached nodes (detachedness === 2): ${dTotal === 0 ? '0' : fmtMB(dTotal)}`);
+    console.log(`top 20 detached type:name by bytes:`);
+    for (const r of topN(aggA.detachedByKey, 20)) {
+      console.log(`  ${r.key.padEnd(48)} ${fmtN(r.count)} x  ${fmtMB(r.bytes)}`);
+    }
+  }
+  process.exit(0); // skip the diff-mode code that follows
+}
+
+// Diff mode
+const snapB = loadSnapshot(pathB);
+const aggB  = decodeAggregate(snapB);
+
+const allKeys = new Set([...aggA.byKey.keys(), ...aggB.byKey.keys()]);
+const diffRows = [];
+for (const k of allKeys) {
+  const a = aggA.byKey.get(k) ?? { count: 0, bytes: 0 };
+  const b = aggB.byKey.get(k) ?? { count: 0, bytes: 0 };
+  diffRows.push({
+    key:        k,
+    countA:     a.count,
+    bytesA:     a.bytes,
+    countB:     b.count,
+    bytesB:     b.bytes,
+    countDelta: b.count - a.count,
+    bytesDelta: b.bytes - a.bytes,
+  });
+}
+
+const totalA = Array.from(aggA.byKey.values()).reduce((s, v) => s + v.bytes, 0);
+const totalB = Array.from(aggB.byKey.values()).reduce((s, v) => s + v.bytes, 0);
+console.log(`\nA total self_size: ${fmtMB(totalA)}  (${aggA.nodeCount.toLocaleString()} nodes)`);
+console.log(`B total self_size: ${fmtMB(totalB)}  (${aggB.nodeCount.toLocaleString()} nodes)`);
+console.log(`Δ self_size:       ${fmtMB(totalB - totalA)}  (${(aggB.nodeCount - aggA.nodeCount).toLocaleString()} nodes)`);
+
+const freed = diffRows.filter((r) => r.bytesDelta < 0).sort((a, b) => a.bytesDelta - b.bytesDelta);
+console.log(`\ntop 30 categories FREED in B (bytesDelta < 0):`);
+console.log(`  ${'type:name'.padEnd(48)} ${'A count'.padStart(10)} ${'A bytes'.padStart(11)}  ${'B count'.padStart(10)} ${'B bytes'.padStart(11)}  ${'Δ count'.padStart(10)} ${'Δ bytes'.padStart(11)}`);
+for (const r of freed.slice(0, 30)) {
+  console.log(`  ${r.key.padEnd(48)} ${fmtN(r.countA)} ${fmtMB(r.bytesA)} ${fmtN(r.countB)} ${fmtMB(r.bytesB)} ${fmtN(r.countDelta)} ${fmtMB(r.bytesDelta)}`);
+}
+
+const grown = diffRows.filter((r) => r.bytesDelta > 0).sort((a, b) => b.bytesDelta - a.bytesDelta);
+console.log(`\ntop 15 categories GROWN in B (bytesDelta > 0):`);
+console.log(`  ${'type:name'.padEnd(48)} ${'Δ count'.padStart(10)} ${'Δ bytes'.padStart(11)}`);
+for (const r of grown.slice(0, 15)) {
+  console.log(`  ${r.key.padEnd(48)} ${fmtN(r.countDelta)} ${fmtMB(r.bytesDelta)}`);
+}
+
+// Detached diff: same per-key delta as above, restricted to nodes with detachedness === 2.
+const allDetachedKeys = new Set([...aggA.detachedByKey.keys(), ...aggB.detachedByKey.keys()]);
+if (allDetachedKeys.size > 0) {
+  const dRows = [];
+  for (const k of allDetachedKeys) {
+    const a = aggA.detachedByKey.get(k) ?? { count: 0, bytes: 0 };
+    const b = aggB.detachedByKey.get(k) ?? { count: 0, bytes: 0 };
+    dRows.push({ key: k, countA: a.count, bytesA: a.bytes, countB: b.count, bytesB: b.bytes,
+                 countDelta: b.count - a.count, bytesDelta: b.bytes - a.bytes });
+  }
+  const dTotalA = Array.from(aggA.detachedByKey.values()).reduce((s, v) => s + v.bytes, 0);
+  const dTotalB = Array.from(aggB.detachedByKey.values()).reduce((s, v) => s + v.bytes, 0);
+  console.log(`\nDetached nodes:`);
+  console.log(`  A: ${fmtMB(dTotalA)}   B: ${fmtMB(dTotalB)}   Δ: ${fmtMB(dTotalB - dTotalA)}`);
+  console.log(`top 20 detached type:name by |Δ bytes|:`);
+  dRows.sort((a, b) => Math.abs(b.bytesDelta) - Math.abs(a.bytesDelta));
+  for (const r of dRows.slice(0, 20)) {
+    const tag = r.bytesDelta < 0 ? 'freed' : r.bytesDelta > 0 ? 'grew ' : 'same '; // zero delta is neither freed nor grown
+    console.log(`  ${tag} ${r.key.padEnd(44)} ${fmtN(r.countA)} -> ${fmtN(r.countB)}  Δ ${fmtN(r.countDelta)}  Δ ${fmtMB(r.bytesDelta)}`);
+  }
+}
diff --git a/perf/diff-blink-classes.mjs b/perf/diff-blink-classes.mjs
new file mode 100644
index 0000000..4e34182
--- /dev/null
+++ b/perf/diff-blink-classes.mjs
@@ -0,0 +1,123 @@
+// Diff per-Blink-class object counts and sizes between two memory-infra
+// dumps in the same trace, for the renderer process. Useful for
+// understanding what a forced GC freed at the typed-object level
+// (which the V8 heap snapshot can't see because the freed objects
+// have no V8 wrappers).
+//
+// Usage:
+//   node diff-blink-classes.mjs <trace.json> [<dump-index-a>] [<dump-index-b>]
+//
+// Default: index 0 (post-render) and index 1 (post-gc when --gc-passes
+// was used; mid-generate otherwise).
+
+import { readFileSync } from 'node:fs';
+import { resolve } from 'node:path';
+
+const argv = process.argv.slice(2);
+if (argv.length < 1) {
+  console.error('usage: node diff-blink-classes.mjs <trace.json> [a] [b]');
+  process.exit(2);
+}
+const tracePath = resolve(process.cwd(), argv[0]);
+const idxA = parseInt(argv[1] ?? '0', 10);
+const idxB = parseInt(argv[2] ?? '1', 10);
+
+const trace  = JSON.parse(readFileSync(tracePath, 'utf8'));
+const events = trace.traceEvents;
+const detailed = events.filter((e) => e.ph === 'v' && e.args?.dumps?.allocators);
+
+const byId = new Map();
+for (const e of detailed) {
+  const k = String(e.id);
+  if (!byId.has(k)) byId.set(k, []);
+  byId.get(k).push(e);
+}
+const groups = [...byId.entries()]
+  .map(([id, p]) => ({ id, ts: Math.min(...p.map((x) => x.ts)), procs: p }))
+  .sort((a, b) => a.ts - b.ts);
+
+console.log(`dumps in trace, in order:`);
+groups.forEach((g, i) => console.log(`  ${i}: id=${g.id}, ${g.procs.length} processes`));
+
+if (![idxA, idxB].every((i) => Number.isInteger(i) && i >= 0 && i < groups.length)) { // also rejects NaN / negative from parseInt
+  console.error(`requested indices ${idxA},${idxB} but only ${groups.length} dumps in trace`);
+  process.exit(1);
+}
+
+function findRenderer(procs, id) { // returns { e, priv } for the process event in `procs` with the largest private footprint
+  const findTotals = (pid, eDetail) => { // process_totals for `pid` within dump `id`
+    const t = eDetail.args?.dumps?.process_totals;
+    if (t?.private_footprint_bytes) return t; // the detailed event carries the totals itself
+    const light = events.find((x) =>
+      x.ph === 'v' && String(x.id) === id && x.pid === pid && x.args?.dumps?.process_totals
+    );
+    return light?.args.dumps.process_totals ?? {}; // otherwise use another 'v' event of the same dump id and pid; {} if none
+  };
+  let best = null;
+  for (const e of procs) {
+    const t = findTotals(e.pid, e);
+    const priv = t.private_footprint_bytes ? parseInt(t.private_footprint_bytes, 16) : 0; // parsed as hex; missing totals count as 0
+    if (!best || priv > best.priv) best = { e, priv };
+  }
+  return best; // null only when `procs` is empty
+}
+
+const rA = findRenderer(groups[idxA].procs, groups[idxA].id);
+const rB = findRenderer(groups[idxB].procs, groups[idxB].id);
+const fmtMB = (b) => (b / 1024 / 1024).toFixed(1).padStart(7);
+console.log(`\nA (dump ${idxA}) renderer: pid=${rA.e.pid}, priv=${fmtMB(rA.priv)} MB`);
+console.log(`B (dump ${idxB}) renderer: pid=${rB.e.pid}, priv=${fmtMB(rB.priv)} MB`);
+
+function blinkClasses(allocators) {
+  // The typed-class breakdown lives under blink_objects/blink_gc/main/
+  // (not blink_gc/main/, which is a hashed-page namespace). Each entry
+  // is "blink::ClassName (0xNNN)" or "DOMTypeName (0xNNN)". The (0xNNN)
+  // suffix is a per-dump GUID -- the SAME class gets a different GUID
+  // in each dump -- so it must be stripped before comparing across
+  // dumps.
+  const out = new Map();
+  for (const [name, info] of Object.entries(allocators)) {
+    const m = name.match(/^blink_objects\/blink_gc\/main\/(.+)$/);
+    if (!m) continue;
+    const cls = m[1].replace(/\s*\(0x[0-9a-fA-F]+\)\s*$/, '');
+    const size  = parseInt(info?.attrs?.size?.value ?? '0', 16);
+    const count = parseInt(info?.attrs?.object_count?.value ?? '0', 16);
+    const prev = out.get(cls);
+    if (prev) {
+      out.set(cls, { size: prev.size + size, count: prev.count + count });
+    } else {
+      out.set(cls, { size, count });
+    }
+  }
+  return out;
+}
+
+const cA = blinkClasses(rA.e.args.dumps.allocators);
+const cB = blinkClasses(rB.e.args.dumps.allocators);
+const allClasses = new Set([...cA.keys(), ...cB.keys()]);
+const rows = [];
+for (const cls of allClasses) {
+  const a = cA.get(cls) ?? { size: 0, count: 0 };
+  const b = cB.get(cls) ?? { size: 0, count: 0 };
+  rows.push({ cls, aCount: a.count, aSize: a.size, bCount: b.count, bSize: b.size,
+              dCount: b.count - a.count, dSize: b.size - a.size });
+}
+
+const fmtN = (n) => n.toLocaleString().padStart(9);
+
+const freed = rows.filter((r) => r.dSize < 0).sort((a, b) => a.dSize - b.dSize); // most-negative delta first
+console.log(`\ntop 30 Blink classes FREED in B (sorted by |Δ bytes|):`);
+console.log(`  ${'class'.padEnd(58)}  a_count    a_MB   b_count    b_MB   d_count    d_MB`);
+for (const r of freed.slice(0, 30)) {
+  console.log(`  ${r.cls.padEnd(58)} ${fmtN(r.aCount)} ${fmtMB(r.aSize)} ${fmtN(r.bCount)} ${fmtMB(r.bSize)} ${fmtN(r.dCount)} ${fmtMB(r.dSize)}`);
+}
+const totalFreed = freed.reduce((s, r) => s + r.dSize, 0); // sums only classes that shrank; growth is reported separately
+console.log(`\ntotal freed bytes across all blink_objects/blink_gc/main/* classes: ${fmtMB(totalFreed)} MB`);
+
+const grown = rows.filter((r) => r.dSize > 0).sort((a, b) => b.dSize - a.dSize);
+if (grown.length > 0) {
+  console.log(`\ntop 10 classes that GREW in B (B alloc'd more than A):`);
+  for (const r of grown.slice(0, 10)) {
+    console.log(`  ${r.cls.padEnd(58)} d_count=${fmtN(r.dCount)} d_MB=${fmtMB(r.dSize)}`);
+  }
+}

From 929f57961818ab3496f4fc3efb052db7198b1e13 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 22:01:32 +0200
Subject: [PATCH 15/18] Document Chromium-internal PDF-generation approaches in
 CHROMIUM.md.

Research notes from the conversation that explored what it would take
to extract Blink's draw stream (SkPicture / cc::PaintRecord), spawn
standalone PrintCompositor utility processes, or build a Chromium-
linked helper binary -- all to enable parallel PDF generation without
N-way memory blowup.

Five approaches catalogued with honest cost estimates:
  A. Patch + upstream a Chromium flag (skip PrintCompositor for
     single-renderer, or streaming printToPDF).
  B. Port SkPDF to JS (doesn't help alone -- the input data
     extraction is the real bottleneck).
  C. Frida + reimplement Mojo client in Node (~15-22 weeks).
  D. Frida + CanvasKit-WASM workers (~6-10 weeks, tagged-PDF rebuild
     required).
  E. Helper binary linking Chromium components (~4-6 weeks total,
     corrected from earlier overestimates -- shallow gclient sync
     ~20-30 GB and ~30-90 min, targeted ninja build of ~1500-2500
     TUs ~30-90 min first time).

All rejected for the current 70 s build, but documented so the
analysis isn't lost if the book size or CI budget makes it relevant
again. Also captures the hard facts:

- chrome.dll is a single 283 MB monolithic binary with exactly six
  exported functions (ChromeMain + 5 others); PrintCompositor / Mojo
  / Skia / Blink / V8 are not externally callable.
- The idle Chromium tree is ~125-180 MB (corrected from earlier
  claim of "70-1100 MB"; the high end was PDF-in-transit, not
  steady-state).
- HarfBuzz shaping results and SkTextBlob glyph positions never
  leave the renderer via any public API; the natural extraction
  point is the Mojo serialization between renderer and
  PrintCompositor.

New probe perf/probe-idle-browser.mjs measures the idle baseline
(post-launch, post-newPage, post-goto(about:blank)) -- the data
behind the corrected memory math.

Pointer from perf/README.md "Memory" section to CHROMIUM.md so the
separate research file is discoverable.
---
 perf/CHROMIUM.md            | 431 ++++++++++++++++++++++++++++++++++++
 perf/README.md              |   9 +
 perf/probe-idle-browser.mjs | 105 +++++++++
 3 files changed, 545 insertions(+)
 create mode 100644 perf/CHROMIUM.md
 create mode 100644 perf/probe-idle-browser.mjs

diff --git a/perf/CHROMIUM.md b/perf/CHROMIUM.md
new file mode 100644
index 0000000..a7e4a4f
--- /dev/null
+++ b/perf/CHROMIUM.md
@@ -0,0 +1,431 @@
+# Chromium-internal approaches to parallel PDF generation
+
+A separate document because none of this is shipped or even partially
+implemented. It records the research we did into Chromium-internal
+approaches to faster / parallel PDF emission, with honest cost
+estimates and the reasons each was rejected. Kept as a reference for
+two scenarios:
+
+1. The book grows large enough that the 70 s build becomes a CI
+   bottleneck again (3000+ pages, or CI runtime tightens).
+2. Someone independently rediscovers the same ideas and wants to know
+   why we didn't pursue them.
+
+For the perf work that *did* land (the `--disable-gpu` flag pair, the
+memory probes, the GC-pass investigation), see [README.md](README.md).
+
+## What the public APIs don't expose
+
+The shortest version: **Skia's drawing stream and HarfBuzz's shape
+results never leave the renderer process via any documented API**.
+That's the wall behind every approach below.
+
+What's documented and works from JS / CDP:
+
+- `Range.getClientRects()`, `Element.getBoundingClientRect()` -- per
+  line-fragment bounding boxes. Box-level, not glyph-level.
+- CDP `DOMSnapshot.captureSnapshot` -- the full layout tree as JSON
+  with each text node's `textBoxes[]` (bounds + text-fragment offsets).
+  Run-level granularity.
+- `CanvasRenderingContext2D.measureText()` -- `TextMetrics` for text
+  *about to be drawn*, not text already laid out.
+- `document.fonts` (`FontFaceSet`) -- load state, not glyph positions.
+
+What is *not* exposed anywhere:
+
+- The HarfBuzz shaping result -- the character-to-glyph mapping with
+  ligatures, contextual substitutions, kerning all applied. Lives in
+  Blink's `blink::ShapeResult` / `ShapeResultView` (~50 MB in the
+  renderer for our book, visible in the memory-infra dump).
+- Per-glyph x-positions (`SkTextBlob`).
+- Font binaries / subsets (security/copyright concerns).
+- The accessibility structure tree that becomes the tagged-PDF
+  structure tree.
+
+What is internally serialized but invisible from outside:
+
+- `cc::PaintRecord` / `SkPicture` -- the renderer's full draw stream,
+  containing every `SkTextBlob` with its glyph IDs and positions.
+  Serialized for Mojo transfer renderer → PrintCompositor (see below);
+  could be intercepted with dynamic instrumentation.
+- The tagged-PDF structure tree -- travels separately through Mojo
+  to PrintCompositor; same intercept-by-hook story.
+
+## How the print path actually works
+
+Inside one PDF render (`Page.printToPDF`):
+
+1. **Renderer (where paged.js lives)** -- Blink lays out the document
+   via LayoutNG; the paint pass produces a `cc::PaintRecord`
+   containing every draw op as `SkPaint` + `SkTextBlob` + `SkPath` +
+   `SkImage` plus the accessibility structure tree.
+2. **Mojo IPC** -- the `PaintRecord` is serialized (Skia's documented
+   `SkPicture` byte format, ~50 MB on our book) and sent over a Mojo
+   channel to the PrintCompositor utility process. The structure tree
+   travels via a separate Mojo message.
+3. **PrintCompositor utility process** (`chrome.exe --type=utility
+   --utility-sub-type=printing.mojom.PrintCompositor`) -- deserializes
+   the picture into Skia, calls `SkPDFDocument` to emit PDF bytes,
+   merges the structure tree on top, returns the PDF bytes via Mojo.
+4. **Browser process** -- receives the PDF, forwards over the
+   DevTools/CDP channel to puppeteer over a WebSocket.
+5. **Node (us)** -- receives the bytes from puppeteer.
+
+Cost shape on the 1651-page book, with the shipped `--disable-gpu`
+flag pair:
+
+| stage | typical wall clock | peak memory |
+| ----- | ------------------ | ----------- |
+| render (Blink layout + paged.js) | ~10 s | renderer ~1.3 GB |
+| Mojo transfer renderer → PrintCompositor | <100 ms | (briefly +50 MB browser IPC buffer) |
+| PrintCompositor → PDF | ~35 s | utility process ~300-500 MB |
+| PDF transfer back | <500 ms | browser process spikes (PDF is in flight) |
+| pdf-lib outline + metadata | ~5 s | Node ~100 MB |
+
+The 35 s `SkPDF` step is single-threaded Skia walking the layout tree
+and emitting PDF objects per the SkPDF design (see "Memory: where the
+renderer's 1.9 GB goes" in README.md for the per-allocator breakdown
+of that growth).
+
+## Chromium's binary boundary
+
+`chrome.dll` is a single ~283 MB blob containing essentially all of
+Chromium: Blink, V8, Skia, Mojo, services, PrintCompositor,
+everything. The launcher `chrome.exe` is a 4 MB shim that loads
+`chrome.dll` and calls `ChromeMain`.
+
+A PE export-table dump (see `perf/probe-idle-browser.mjs` for the
+measurement that surfaced this) shows **chrome.dll exports exactly
+six functions**:
+
+```
+ChromeMain                                       # main entry point
+CrashForExceptionInNonABICompliantCodeRange      # crash helper
+GetHandleVerifier                                # sandbox handle check
+IsSandboxedProcess                               # sandbox query
+RelaunchChromeBrowserWithNewCommandLineIfNeeded  # relauncher
+sqlite3_dbdata_init                              # accidental third-party leak
+```
+
+Out of probably millions of internal C++ functions, six are reachable
+from outside via `LoadLibrary` + `GetProcAddress`. PrintCompositor,
+Mojo, Skia, Blink, V8 -- none are exported. The binary is opaque by
+design; Chromium isn't built as a library for third-party embedders.
+
+**CEF** (Chromium Embedded Framework, which the docs ship a reference
+for in `docs/Reference/CEF/`) exists exactly because of this gap.
+CEF is a deliberately-stable C/C++ API wrapper on top of Chromium
+internals, with a single stable ABI per major version. The CEF
+maintainers do the work of (a) building Chromium with the right
+configs, (b) exposing necessary internals through a stable wrapper,
+and (c) keeping the wrapper compatible across Chromium upgrades.
+
+## Idle process tree baseline
+
+Measured by [probe-idle-browser.mjs](probe-idle-browser.mjs) -- a
+fresh puppeteer.launch + about:blank only, no work:
+
+| process | private |
+| ------- | ------- |
+| browser (the parent) | 40-46 MB |
+| renderer (initial about:blank target) | 20-23 MB |
+| gpu-process (stub, post `--disable-gpu`) | 15-16 MB |
+| utility:network.mojom.NetworkService | 17 MB |
+| utility:storage.mojom.StorageService | 11 MB |
+| crashpad-handler x 2 | 2 MB each |
+| **total tree** | **~125-180 MB** |
+
+The "browser process at 1,113 MB" figure in earlier memory probes was
+specific to the PDF-transit phase -- the browser process buffers the
+41 MB PDF + the tagged structure tree as they flow from PrintCompositor
+to the browser to puppeteer's CDP channel. It is not the steady-state
+cost.
+
+## Approach A: patch and upstream a Chromium flag
+
+The highest-leverage candidate: a CDP/flag-level change that either
+skips PrintCompositor for single-renderer documents or adds streaming
+output. Concrete entry points for research:
+
+- Skia source: <https://skia.googlesource.com/skia/+/refs/heads/main/src/pdf/>
+  -- commit log against the Skia revision pinned in our Chromium
+  build.
+- Skia Gerrit reviews-in-flight: <https://skia-review.googlesource.com/>
+  filtered by `src/pdf/`.
+- Chromium printing tree: `chromium/src/printing/`,
+  `components/printing/`, `chrome/browser/printing/`.
+- crbug.com: searches like `component:Internals>Printing performance`
+  or `component:Internals>Skia>PDF`.
+- Dev mailing lists: `chromium-dev@chromium.org`,
+  `skia-discuss@googlegroups.com` (Google Groups archives).
+
+Plausibly upstreamable patches:
+
+1. `Page.printToPDF({ singleRenderer: true })` -- skip PrintCompositor
+   when the document doesn't span multiple frames. Saves ~450 MB
+   peak + ~5-10 s in our pipeline.
+2. CDP method that emits the renderer's `SkPicture` directly. Unlocks
+   external pipelines.
+3. Streaming `Page.printToPDF` output. Lets us overlap `process`
+   (pdf-lib outline / metadata) with `generate`.
+
+**Rejected because** the gains are too small to justify the cost. The
+generate phase is ~35 s with the shipped flag pair, peak memory is
+~2.4 GB. Saving ~450 MB of PrintCompositor or shaving 5-10 s of
+generate isn't worth the upstreaming overhead (RFC, review cycles,
+Chromium release cadence, plus carrying a patch until the upstream
+lands).
+
+## Approach B: port SkPDF to JS
+
+Skia's PDF backend (`src/pdf/` in Skia, ~30 k LOC of C++) consumes an
+`SkCanvas` draw stream and emits PDF bytes. Porting it to JS is a
+real project but the work it does isn't where the time goes -- Skia
+is well-optimized. **The hard problem is not Skia. It's getting
+Blink's draw stream out to feed into the port.**
+
+CanvasKit (`canvaskit-wasm` on npm) is Skia compiled to WASM and
+includes `SkDocument::MakePDFDocument`. In principle: load an
+`SkPicture` into CanvasKit, replay it into the PDF document's canvas,
+serialize. The same input problem still applies -- the `SkPicture`
+isn't accessible from JS land without a Chromium-side intervention.
+CanvasKit's PDF surface is also materially less battle-tested than
+native SkPDF and lacks the tagged-PDF API.
+
+**Rejected because** the port alone doesn't unblock anything and the
+real bottleneck (data extraction) is identical to approaches C-E.
+
+## Approach C: Frida + Mojo emulation in Node
+
+Architecture:
+
+1. Frida-hook the renderer process, intercept `SkPicture::serialize`
+   to capture the serialized picture bytes during `Page.printToPDF`.
+2. Slice the picture by page bounds using `SkBBoxHierarchy` /
+   `SkPicture::playback` with a clipping canvas.
+3. Spawn N PrintCompositor utility processes from Node, talking to
+   each over Mojo to send a sub-picture and receive a PDF slice.
+4. Concatenate slices with raw-byte xref rewriting.
+
+The blocker is step 3. Mojo has three sub-layers:
+
+- **Transport** -- Win32 named pipes. One end inherited by the child
+  via the `PROC_THREAD_ATTRIBUTE_HANDLE_LIST` Win32 attribute,
+  command-line arg `--mojo-platform-channel-handle=<N>`.
+- **Wire protocol** -- framed messages with version headers,
+  attachment references, multiplexed message pipes.
+- **Bindings** -- `.mojom` interface files (e.g.,
+  `components/services/print_compositor/public/mojom/print_compositor.mojom`)
+  compiled to marshaling stubs.
+
+The handshake the browser-process normally does to bring up a
+PrintCompositor utility:
+
+1. Spawn the child with the `--type` / `--utility-sub-type` args plus
+   the inheritable pipe handle.
+2. Send the Mojo "invitation" message containing the primordial
+   message pipe handle.
+3. Once the child has resolved the invitation, send a binding request
+   for the named `printing.mojom.PrintCompositor` attachment.
+4. Call methods on the resulting remote (e.g.,
+   `PrepareForDocumentToPdf`, `CompositePage`, `FinishDocumentToPdf`),
+   each method being a structured Mojo message with mojom-encoded
+   payload and shared-memory regions for the large blobs.
+
+Implementing all of this in Node, against unstable Chromium internal
+interfaces, is the cost:
+
+| component | effort |
+| --------- | ------ |
+| Win32 process spawn with inherited handles (Win32 FFI) | 1 week |
+| Named pipe + cross-process handle transfer | 1 week |
+| Mojo channel framing (read/write headers, multiplex) | 2-3 weeks |
+| Mojo invitation protocol | 1-2 weeks |
+| `.mojom` parser + JS codegen, or hand-written stubs | 2-3 weeks |
+| Shared-memory region encoding | 1 week |
+| PrintCompositor-specific marshaling | 1-2 weeks |
+| Tagged-PDF tree capture + slicing | 2-3 weeks |
+| SkPicture slicing by page bounds | 1-2 weeks |
+| Integration + Chromium-version drift debugging | 3-4 weeks |
+| **total** | **15-22 weeks** |
+
+Plus ongoing maintenance every Chromium upgrade -- internal
+interfaces have no stability guarantees because they're build-time
+contracts between Chromium components.
+
+**Rejected because** the engineering cost dwarfs the wall-clock
+savings, and the maintenance is permanent.
+
+## Approach D: Frida + CanvasKit-WASM in workers
+
+Avoids Mojo by using Skia directly. Architecture:
+
+1. Frida-hook to capture the SkPicture bytes (same as C).
+2. Slice the picture by page bounds (same as C).
+3. Spawn N Node `worker_threads`, each loads CanvasKit-WASM,
+   deserializes its sub-picture, calls `SkDocument::MakePDFDocument`,
+   emits a sub-PDF.
+4. Concatenate.
+
+Cost is smaller than approach C because no Mojo plumbing, but two
+issues:
+
+- **CanvasKit's PDF surface diverges from native SkPDF.** Font
+  subsetting, image encoding, color-space handling have known gaps
+  and quirks. Plan on 1-2 weeks of debugging diverging output before
+  matching native SkPDF closely enough for production.
+- **Tagged PDF is missing.** CanvasKit's `SkDocument` doesn't expose
+  Skia's tagging API; the structure tree would have to be applied
+  separately, derived from the DOM in our own code. Probably 2-4
+  weeks to rebuild.
+
+Total: **6-10 weeks**, with output-fidelity risk.
+
+**Rejected because** of the tagged-PDF gap (accessibility is
+non-negotiable) and the divergence risk against the production
+Chromium SkPDF baseline.
+
+## Approach E: helper binary linking Chromium components
+
+Architecture: build a small DLL/EXE that statically links against
+`//mojo/core/embedder`, `//components/services/print_compositor`,
+and `//cc/paint`. The helper exports C-style functions Node calls
+via FFI:
+
+- `helper_init()` -- start a Mojo node, set up the embedder.
+- `helper_emit_pdf(skp_bytes, ax_tree_bytes, page_range, out_pdf*)` --
+  spawn or reuse a PrintCompositor, send the inputs, return the PDF.
+
+GN file is short:
+
+```gn
+shared_library("printcomp_helper") {
+  sources = [ "helper.cc" ]
+  deps = [
+    "//mojo/core/embedder",
+    "//components/services/print_compositor",
+    "//cc/paint",
+    "//base",
+  ]
+}
+```
+
+The helper does all the Mojo plumbing using Chromium's own Mojo
+library, so we avoid reimplementing Mojo in Node. Node handles
+SkPicture slicing (a pure data problem) and PDF concatenation.
+
+### Checkout and build cost (corrected)
+
+The "Chromium build is 50 GB and 6 hours" rule of thumb refers to
+the full-history `fetch chromium`. For a single-purpose helper,
+with `gclient sync --no-history --shallow` and targeted GN builds:
+
+| step | estimate |
+| ---- | -------- |
+| depot_tools install + Visual Studio Build Tools + Win SDK 10 (if not already set up) | half day, one-time |
+| Shallow `gclient sync` for selected DEPS | 30-90 min |
+| Disk footprint after shallow sync | ~20-30 GB (not 50) |
+| First `ninja printcomp_helper` with `is_debug=false symbol_level=1` | 30-90 min (~1500-2500 TUs vs ~50,000 for full Chromium) |
+| Incremental rebuild (touched `helper.cc`) | 5-15 min |
+| Output DLL size | ~80-150 MB (statically-linked Skia, base, mojo, abseil, icu) |
+| Per-Chromium-upgrade re-sync + rebuild | 1 hour if interfaces stable, up to a day if a signature changed |
+
+So the **initial commitment is more like a Saturday afternoon than a
+quarter** -- the 15-22 weeks figure from approach C drops to **4-6
+weeks for the full pipeline** (helper + Frida extraction + SkPicture
+slicing + AX tree slicing + Node orchestration + PDF concat).
+
+### A potentially smaller variant: Skia-only helper
+
+If tagged PDF were acceptable to drop, the helper could skip
+`//components/services/print_compositor` and link only against
+`//third_party/skia`. The build shrinks to ~800-1200 TUs, ~20-40 min
+first build, helper DLL ~30-50 MB. The PDF emit path becomes a direct
+`SkDocument::MakePDF` call.
+
+**Rejected because** tagged PDF is non-negotiable. Documented here
+because it's the simplest viable Chromium-internal architecture if
+the accessibility requirement ever changes.
+
+### Why approach E was still rejected
+
+The 4-6 week full-project estimate is a fair cost for the gains:
+
+- Render once, extract SkPicture (~10 s).
+- Kill the original Chromium (frees ~1.4 GB renderer).
+- Run N PrintCompositor helpers in parallel (~11 s wall clock for N=4
+  at ~45/4 s each).
+- Concat (~3 s).
+- **End-to-end: ~26 s vs current ~70 s, peak ~2 GB.**
+
+That is a ~44 s wall-clock saving with comparable peak memory. Worth
+doing if the engineering budget exists.
+
+What pushes it off the table for now:
+
+1. **Maintenance against Chromium version churn.** Mojo's
+   `printing.mojom.PrintCompositor` interface signature changes
+   between Chromium milestones. We'd be re-syncing + rebuilding +
+   retesting on every Puppeteer Chromium bump (every few months).
+2. **CI build pipeline complexity.** Helper.dll has to be pre-built
+   and shipped as a release artifact -- can't be built fresh in
+   GitHub Actions every PR because the sync + build is ~45-90 min on
+   a CI-class machine.
+3. **The savings aren't urgent.** A 70 s build is fine on CI. A
+   ~26 s build would be nicer, but the 44 s difference doesn't change
+   any developer workflow we have.
+
+If item 3 changes (book grows past ~3000 pages, or CI gains a hard
+runtime cap), approach E becomes the right answer.
+
+## Cost summary
+
+| approach | engineering | tagged PDF | output fidelity | binary | maintenance |
+| -------- | ----------- | ---------- | --------------- | ------ | ----------- |
+| A (upstream patch) | weeks-months of RFC + review | works | identical | none (official) | none after merge |
+| B (port SkPDF alone) | doesn't unblock | n/a | n/a | n/a | n/a |
+| C (Frida + Mojo in Node) | 15-22 weeks | works | identical | small | high (Mojo internals) |
+| D (Frida + CanvasKit workers) | 6-10 weeks | requires rebuild | divergence risk | medium | medium |
+| E (helper binary) | 4-6 weeks | works | identical | 80-150 MB | per Chromium upgrade |
+| E-slim (Skia-only helper) | 3-4 weeks | broken | divergence on tags | 30-50 MB | per Chromium upgrade |
+
+## What would change the calculus
+
+- **Book grows past ~3000 pages.** Generate time scales roughly
+  linearly in Skia; at 3000 pages the single-process pipeline is
+  ~70-90 s generate alone, ~100-120 s total. Approach E pays off.
+- **CI runner downsized.** If peak memory has to stay under ~1.5 GB,
+  any current single-Chromium path is in trouble; approach E with
+  the renderer killed mid-pipeline is the only fit.
+- **Chromium ships streaming `Page.printToPDF`.** A long-standing
+  feature request that would let us overlap `generate` and
+  `process`. If it lands upstream, our pipeline benefits without any
+  patch work and approach E loses its remaining edge.
+- **CEF adds tagged-PDF support.** Currently a gap; if filled, the
+  helper-binary architecture could route through CEF's stable API
+  instead of raw Chromium internals, collapsing the maintenance cost.
+
+## Tooling notes for future investigators
+
+If you do come back to this:
+
+- [perf/probe-idle-browser.mjs](probe-idle-browser.mjs) gives the
+  idle baseline (~125-180 MB tree) and was the data behind the
+  corrected memory math here.
+- [perf/probe-memory.mjs](probe-memory.mjs) + sample-mem.ps1 gives
+  the working pipeline's per-process tree at peak.
+- [perf/probe-renderer-mem.mjs](probe-renderer-mem.mjs) +
+  analyze-mem-trace.mjs gives the per-allocator breakdown inside the
+  renderer via memory-infra dumps.
+- [perf/diff-blink-classes.mjs](diff-blink-classes.mjs) compares
+  Blink object class counts between two memory-infra dumps -- useful
+  for verifying that a code change is or isn't affecting layout-state
+  count.
+- [perf/analyze-heap-snapshot.mjs](analyze-heap-snapshot.mjs) parses
+  V8 heap snapshots from the `--heap-snapshot` extension to
+  probe-renderer-mem.mjs.
+
+For exploring Chromium internals: <https://source.chromium.org>
+(searches and cross-refs the source). The `printing/` and
+`components/services/print_compositor/` directories are the entry
+points to the print pipeline.
diff --git a/perf/README.md b/perf/README.md
index 9ace1ee..f8ffb89 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -5194,6 +5194,15 @@ budget matters. This section measures one render's peak
 memory and breaks it down by allocator, so we know what
 levers exist if the book grows.
 
+> **Note.** Approaches that involve Chromium internals --
+> patching the binary, intercepting the SkPicture stream
+> via Frida, spawning standalone PrintCompositors via
+> Mojo, building a Chromium-linked helper binary -- were
+> researched but not shipped. They're documented
+> separately in [CHROMIUM.md](CHROMIUM.md). This section
+> covers only what's measurable from the outside through
+> public APIs.
+
 `perf/probe-memory.mjs` is the harness. It runs the full
 pipeline (load + render + generate) in a single browser
 and watches the chrome.exe process tree at 500 ms
diff --git a/perf/probe-idle-browser.mjs b/perf/probe-idle-browser.mjs
new file mode 100644
index 0000000..50111be
--- /dev/null
+++ b/perf/probe-idle-browser.mjs
@@ -0,0 +1,105 @@
+// Measure the private bytes of a freshly-launched Chromium with
+// nothing loaded (just about:blank on the default page). Probes four
+// states: post-launch, post-newPage, post-goto(about:blank), 2 s settle.
+//
+// Usage:
+//   node probe-idle-browser.mjs
+
+import { spawnSync } from 'node:child_process';
+import puppeteer from 'puppeteer';
+
+const fmtMB = (b) => (b / 1024 / 1024).toFixed(0).padStart(5) + ' MB';
+
+function sampleTree(rootPid) {
+  // One CIM query for the whole process table, then walk the tree
+  // rooted at rootPid in PowerShell. Returns parsed JSON.
+  const ps = `
+$root = ${rootPid}
+$all = Get-CimInstance Win32_Process -Property ProcessId,ParentProcessId,Name,CommandLine
+$byParent = @{}
+foreach ($p in $all) {
+  $pp = [int]$p.ParentProcessId
+  if (-not $byParent.ContainsKey($pp)) { $byParent[$pp] = @() }
+  $byParent[$pp] += $p
+}
+$queue = New-Object System.Collections.Queue
+$queue.Enqueue($root)
+$rows = @()
+while ($queue.Count -gt 0) {
+  $id = $queue.Dequeue()
+  $entry = $all | Where-Object { [int]$_.ProcessId -eq $id }
+  if ($null -ne $entry) {
+    $proc = Get-Process -Id $id -ErrorAction SilentlyContinue
+    if ($null -ne $proc) {
+      $role = if ([string]::IsNullOrEmpty($entry.CommandLine)) {
+        'browser'
+      } elseif ($entry.CommandLine -match '--type=([^\\s"]+)') {
+        if ($Matches[1] -eq 'utility' -and $entry.CommandLine -match '--utility-sub-type=([^\\s"]+)') {
+          'utility:' + $Matches[1]
+        } else {
+          $Matches[1]
+        }
+      } else {
+        'browser'
+      }
+      $rows += [ordered]@{ pid = $id; role = $role; private = [int64]$proc.PrivateMemorySize64; ws = [int64]$proc.WorkingSet64 }
+    }
+  }
+  if ($byParent.ContainsKey($id)) {
+    foreach ($c in $byParent[$id]) { $queue.Enqueue([int]$c.ProcessId) }
+  }
+}
+$rows | ConvertTo-Json -Compress -Depth 5
+`;
+  const r = spawnSync('powershell', ['-NoProfile', '-NonInteractive', '-Command', ps], { encoding: 'utf8' });
+  if (r.status !== 0) {
+    console.error('powershell stderr:', r.stderr);
+    throw new Error(`powershell failed (status ${r.status})`);
+  }
+  const txt = r.stdout.trim();
+  if (!txt) return [];
+  const parsed = JSON.parse(txt);
+  return Array.isArray(parsed) ? parsed : [parsed];
+}
+
+function report(label, rows) {
+  const total = rows.reduce((s, r) => s + r.private, 0);
+  const wsTotal = rows.reduce((s, r) => s + r.ws, 0);
+  console.log(`\n=== ${label} ===  total private ${fmtMB(total)}  ws ${fmtMB(wsTotal)}  (${rows.length} procs)`);
+  const sorted = [...rows].sort((a, b) => b.private - a.private);
+  for (const r of sorted) {
+    console.log(`  ${String(r.role).padEnd(22)} pid=${String(r.pid).padEnd(6)} ${fmtMB(r.private)} private  ${fmtMB(r.ws)} ws`);
+  }
+}
+
+const browser = await puppeteer.launch({
+  headless: true,
+  args: [
+    '--no-sandbox',
+    '--disable-dev-shm-usage',
+    '--allow-file-access-from-files',
+    '--disable-gpu',
+    '--disable-software-rasterizer',
+  ],
+});
+const rootPid = browser.process().pid;
+console.log(`[probe] browser pid: ${rootPid}`);
+
+// 1. Post-launch (browser created, no page yet).
+// Puppeteer creates an initial about:blank target automatically, so
+// even this state has one renderer.
+report('post-launch', sampleTree(rootPid));
+
+// 2. Post-newPage (force a fresh blank page).
+const page = await browser.newPage();
+report('post-newPage', sampleTree(rootPid));
+
+// 3. Post-goto(about:blank) explicitly.
+await page.goto('about:blank', { waitUntil: 'load' });
+report('post-goto(about:blank)', sampleTree(rootPid));
+
+// 4. Settle a moment, sample again to see if anything's still warming up.
+await new Promise((r) => setTimeout(r, 2000));
+report('after 2s settle', sampleTree(rootPid));
+
+await browser.close();

From 6a0235cc2eeb285cdfa64e8b6f411e0e23cae7e4 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 23:04:13 +0200
Subject: [PATCH 16/18] Factor the performance README.

---
 perf/README.md                             | 5607 +-------------------
 perf/notes/01-baseline-and-detach.md       |  962 ++++
 perf/notes/02-finalizepage.md              | 1010 ++++
 perf/notes/03-puppeteer-bump-findref.md    |  805 +++
 perf/notes/04-sync-and-inner-loop.md       |  941 ++++
 perf/notes/05-blink-trace.md               |  765 +++
 perf/notes/06-microtasks-pageranges-css.md |  564 ++
 perf/notes/07-memory.md                    |  470 ++
 8 files changed, 5645 insertions(+), 5479 deletions(-)
 create mode 100644 perf/notes/01-baseline-and-detach.md
 create mode 100644 perf/notes/02-finalizepage.md
 create mode 100644 perf/notes/03-puppeteer-bump-findref.md
 create mode 100644 perf/notes/04-sync-and-inner-loop.md
 create mode 100644 perf/notes/05-blink-trace.md
 create mode 100644 perf/notes/06-microtasks-pageranges-css.md
 create mode 100644 perf/notes/07-memory.md

diff --git a/perf/README.md b/perf/README.md
index f8ffb89..0a3254b 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -5,12 +5,18 @@ paged.js + headless Chromium + pdf-lib (see `docs/book.bat`, which
 invokes `docs/render-book.mjs`). The pipeline was historically driven
 by `pagedjs-cli`; we replaced that with our own thin driver after the
 investigations in this folder, so we control pdf-lib's parseSpeed
-without patching upstream (see "Profiling pdf-lib's load" below).
-As the book has grown we noticed **quadratic** wall-clock behaviour:
-time-per-page goes up as later pages are laid out, so doubling the
-page count roughly quadruples the total render time.
-
-This folder holds the tools used to investigate that.
+without patching upstream (see *Profiling pdf-lib's load* in
+[notes/01-baseline-and-detach.md](notes/01-baseline-and-detach.md)).
+As the book grew we found **quadratic** wall-clock behaviour --
+time-per-page grew with page count -- and chased it through ~22
+sub-investigations, recorded in [`notes/`](notes/).
+
+This folder holds the tools used to investigate that. The README is
+the operational reference: what each tool does, how to run it, and
+what shape the output takes. The narrative -- baselines, each landed
+optimisation, what was tried and failed -- lives split across the
+seven phase files in [`notes/`](notes/). The current state is summarised
+at the bottom of this file.
 
 ## Profiling `paged.browser.js`: canonical command
 
@@ -40,83 +46,77 @@ node measure.mjs --detach-pages --no-timing --render-only --cpu-profile --cpu-sa
   / `grep-profile.mjs`.
 - `--cpu-sampling 100` -- 100 us sampling, 10x denser than the 1 ms
   default. Resolves frames in paged.js's sub-millisecond inner loops
-  where most remaining cost lives (see "Looking past `finalizePage`"
-  and later sections). Larger profile file in return.
+  where most remaining cost lives (see *Looking past `finalizePage`*
+  in [notes/02-finalizepage.md](notes/02-finalizepage.md) and later
+  phase files). Larger profile file in return.
 
 Drop `--render-only` whenever you need to also measure generate /
 process (e.g. confirming a fix doesn't shift cost into `page.pdf()`
 or pdf-lib), or to write `book.pdf` for behavioural verification.
 
-The rest of this README is the long-form narrative -- baseline
-findings, each landed optimisation, and the residual hotspots.
-
-## The plan
-
-The render pipeline has three phases, matching what `pagedjs-cli`
-historically showed as its three spinners:
-
-1. **Rendering** -- `PagedPolyfill.preview()` does all the per-page
-   layout work inside headless Chromium.
-2. **Generating** -- `page.pdf()` asks Chromium to serialize the
-   laid-out DOM into PDF bytes, after a small `parseOutline` DOM
-   walk.
-3. **Processing** -- `pdf-lib` loads Chromium's PDF, attaches the
-   outline and metadata, and re-serialises.
-
-All three can grow super-linearly. So the harness times all three
-separately and produces a phase breakdown.
-
-Two-step investigation, cheapest first:
-
-1. **Per-page timing + phase breakdown** -- the cheap pass. Hook
-   paged.js's `beforePageLayout` / `afterPageLayout` for the
-   per-page render curve, and wall-clock the generate and process
-   phases from Node. If render's per-page cost grows with page index
-   that's an `O(n^2)` render; if generate or process dominate, the
-   bottleneck is downstream of paged.js.
-
-2. **CPU profile of headless Chromium** -- the deep pass, only if
-   step 1 doesn't already point at a culprit. Attach the Chrome
-   DevTools Performance panel (or save a CPU profile via the CDP
-   `Profiler` domain) and look for the hot function. Typical paged.js
-   suspects in render: `Chunker`, `Layout`, cross-reference
-   resolution, or a handler that walks the entire document on every
-   page. Generate / process bottlenecks usually point at Chromium's
-   PDF writer or `pdf-lib`'s outline / save path.
-
-Step 1 is what's wired up here. Step 2 will reuse the same harness --
-adding `page.tracing.start()` / `page.tracing.stop()` for a
-DevTools-compatible trace is a few lines.
-
 ## What's in this folder
 
+The harness and core probes:
+
 | File | Role |
 | --- | --- |
-| `package.json` | Pins `puppeteer` + `pdf-lib` + `html-entities` (the same direct deps `docs/` uses). |
 | `measure.mjs` | Puppeteer harness. Drives the same flow as `docs/render-book.mjs` (loads the vendored paged.js bundle, runs `PagedPolyfill.preview()`, calls `page.pdf()`, then either the pdf-lib roundtrip or the incremental writer), with optional CPU profiling, in-page handler injection, and DOM-accessor instrumentation. Auto-pins to a fixed core mask on Windows via `pin-cpu.mjs` (see below) for stable measurements; pass `--no-affinity` to opt out. |
 | `pin-cpu.mjs` | Shared shim used by `measure.mjs`, `profile-load.mjs`, `profile-roundtrip.mjs`, and `ab-css.mjs`. On Windows, auto-relaunches the parent Node process under `start /affinity 0x5500 /high` (cores 4-7 physical, thread 0 each, on an 8C16T AMD Ryzen 7) so puppeteer's Chromium children inherit the mask + priority at spawn time. Reduces single-run CPU sample-time variance from ~15-25 % on a stock dev box to ~3 %. No-op on non-Windows; opt out per-invocation with `--no-affinity` or `PERF_PINNED=1`; override mask with `PERF_AFFINITY=<hex>`. |
 | `timing-handler.js` | `Paged.Handler` that records per-page wall time + heap into `window.__pagedTiming` and streams a line per page to the console. Always injected. |
 | `detach-pages.js` | `Paged.Handler` that hides each completed page from the layout tree (registered against `finalizePage`). The fix. Injected by `--detach-pages` and by `docs/book.bat`. |
 | `instrument-flush-ops.js` | Wraps `getComputedStyle`, `getBoundingClientRect`, and the `offsetWidth` / `clientWidth` / `scrollWidth` family with counters + per-call timing. Injected by `--instrument`. |
+| `instrument-detach.js` | Counters around `detach-pages.js`'s removeChild / restore cycle. |
 | `time-hooks.js` | Wraps every task registered to `chunker.hooks.*` and `polisher.hooks.*` with a wall-clock timer. Tells you which handler's hook method is eating render time, per page. Injected by `--time-hooks`. |
 | `instrument-clones.js` | Wraps `Layout.prototype.append` to tag every source-walker clone, then walks each finalized page at `finalizePage` counting tagged survivors. Reports total appendCalls vs. survivors and the per-page overshoot distribution -- the share of clones rolled back by `removeOverflow`. Requires a one-line `window.PagedLayout = Layout` patch near the bottom of `docs/lib/paged.browser.js` (it's a private class otherwise). Injected by `--clone-count`. |
 | `incremental-pdf.mjs` | Replaces the pdf-lib load+save roundtrip with a PDF 1.7 §7.5.6 incremental update appended to Chrome's bytes. Used by `--incremental`. |
 | `test-incremental.mjs` | Smoke test for `incremental-pdf.mjs`: renders a tiny probe page, runs the writer, verifies the result parses (via pdf-lib re-load) and that outline + metadata land correctly. |
-| `profile-load.mjs` | Standalone profiler for `PDFDocument.load`. Runs the load on a chosen PDF with a chosen `parseSpeed`; intended to be run under `node --cpu-prof`. Auto-pins on Windows via `pin-cpu.mjs`. |
-| `profile-roundtrip.mjs` | Times the full pdf-lib `load + save` roundtrip across the three `parseSpeed` / `objectsPerTick` settings on a chosen PDF. Auto-pins on Windows via `pin-cpu.mjs`. |
-| `probe-chrome-outline.mjs` | Renders a synthetic multi-level h1..h6 document via Chrome's `outline: true` and dumps the resulting `/Outlines` tree. Quick check that the CDP flag is wired correctly in the local Chromium / puppeteer combo. |
-| `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. |
-| `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
+| `run.bat` | Windows wrapper. On first run, runs `npm install` against the repo-root `package.json` (which pins `puppeteer` / `pdf-lib` / `html-entities` -- the same direct deps `docs/` uses; consolidated to repo root in commit `3da85e8`, May 2026, so `node_modules` is shared). Then invokes `node measure.mjs`. |
+| `results/` | Output, one timestamped subfolder per run. Git-ignored. |
+
+Profile / trace analysis (point these at the files produced by
+`--cpu-profile` or `--tracing`):
+
+| File | Role |
+| --- | --- |
 | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. |
 | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). |
 | `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees <label>` direct-callees for any name on either axis. Lets you walk a single causation chain from a JS function down through the Blink layout / style work it triggered via gBCR (`hasOverflow -> getBoundingClientRect -> Document::UpdateStyleAndLayout -> Blink.ForcedStyleAndLayout.UpdateTime -> ...`). |
-| `ab-css.mjs` | CSS cost attribution for `docs/_site-pdf/assets/css/print.css` + `rouge.css`. Renders the book per variant (full / drop-rouge / drop-print-extras / baseline-minimal) and reports **paired-difference** CPU sample-time across N pairs (default 3), with the baseline re-measured immediately before each variant pair to cancel machine-state drift. Pulls per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` total time from the embedded V8 cpu profile in the hybrid trace; prints mean ± SD per variant so noise-floor rows are visible. Auto-pins on Windows via `pin-cpu.mjs`. Optional `--per-print-section` adds one drop-print-`<section>` variant per `/* ---- ---- */` divider in print.css; individual sections of print.css turned out to be below the noise floor on this book, so off by default. |
-| `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See the "Disabling the filter outright" section in this README for the methodology. |
 | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. |
 | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). |
 | `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" |
-| `run.bat` | Windows wrapper. Installs deps on first run, then invokes `node measure.mjs`. |
-| `results/` | Output, one timestamped subfolder per run. Git-ignored. |
+| `ab-css.mjs` | CSS cost attribution for `docs/_site-pdf/assets/css/print.css` + `rouge.css`. Renders the book per variant (full / drop-rouge / drop-print-extras / baseline-minimal) and reports **paired-difference** CPU sample-time across N pairs (default 3), with the baseline re-measured immediately before each variant pair to cancel machine-state drift. Pulls per-`Document::recalcStyle` / `LocalFrameView::performLayout` / `rebuildLayoutTree` / `ShapeText` total time from the embedded V8 cpu profile in the hybrid trace; prints mean ± SD per variant so noise-floor rows are visible. Auto-pins on Windows via `pin-cpu.mjs`. Optional `--per-print-section` adds one drop-print-`<section>` variant per `/* ---- ---- */` divider in print.css; individual sections of print.css turned out to be below the noise floor on this book, so off by default. |
+| `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See *Disabling the filter outright* in [notes/05-blink-trace.md](notes/05-blink-trace.md) for the methodology. |
+
+Memory probes (added during the phase-7 investigation):
+
+| File | Role |
+| --- | --- |
+| `probe-renderer-mem.mjs` | Per-allocator + per-Blink-class memory breakdown of the renderer via Chromium's memory-infra tracing. Captures process memory dumps at three points by default (post-render, mid-generate, post-generate). `--gc-passes N` inserts an extra post-gc dump between post-render and the generate phase (triggers V8 `gc()` + `Memory.simulatePressureNotification`; auto-adds `--js-flags=--expose-gc`); `--heap-snapshot` additionally captures V8 snapshots via CDP `HeapProfiler.takeHeapSnapshot`. |
+| `probe-memory.mjs` | Generic memory probe: dumps a memory-infra trace at the end of one render. |
+| `analyze-heap-snapshot.mjs` | Single-snapshot summary (top type × name by aggregate bytes, detached subset) and pairwise diff between two snapshots. |
+| `analyze-heap-profile.mjs` | Bottom-up size analyzer for V8 `.heapprofile` files. |
+| `analyze-mem-trace.mjs` | Per-process / per-allocator extractor for memory-infra traces. |
+| `diff-blink-classes.mjs` | Per-Blink-class diff between two memory-infra dumps in the same trace. Strips the per-dump GUID suffix from class names so the diff lines up. |
+| `diff-heap-profile.mjs` | Pairwise diff between two `.heapprofile` files. |
+
+Side experiments / one-shot probes:
+
+| File | Role |
+| --- | --- |
+| `profile-load.mjs` | Standalone profiler for `PDFDocument.load`. Runs the load on a chosen PDF with a chosen `parseSpeed`; intended to be run under `node --cpu-prof`. Auto-pins on Windows via `pin-cpu.mjs`. |
+| `profile-roundtrip.mjs` | Times the full pdf-lib `load + save` roundtrip across the three `parseSpeed` / `objectsPerTick` settings on a chosen PDF. Auto-pins on Windows via `pin-cpu.mjs`. |
+| `probe-chrome-outline.mjs` | Renders a synthetic multi-level h1..h6 document via Chrome's `outline: true` and dumps the resulting `/Outlines` tree. Quick check that the CDP flag is wired correctly in the local Chromium / puppeteer combo. |
+| `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. |
+| `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. |
+| `probe-parallel.mjs` | Two-shard `Promise.all` `page.pdf()` probe -- the cost-of-`pageRanges`-sharding measurement (see *`pageRanges` sharding: off the table for now* in [notes/06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md)). |
+| `probe-idle-browser.mjs` | Standalone probe: launches a headless browser and measures steady-state idle memory + CPU sample-time, for separating render cost from browser-fixed overhead. |
+
+Documentation:
+
+| File | Role |
+| --- | --- |
+| `CHROMIUM.md` | Chromium-internal PDF-generation paths investigated separately (out-of-process print compositor, SkPDF, alternative drivers). |
+| `notes/` | The seven phase-by-phase investigation files. See *Investigation log* at the bottom of this README. |
 
 The harness is structurally a copy of `pagedjs-cli/src/printer.js`'s
 `render()` flow, now living in our own code:
@@ -144,10 +144,12 @@ Why vendor rather than depend on `pagedjs-cli`? Two reasons:
 pagedjs-cli's `Printer.pdf()` calls `PDFDocument.load(pdf)` and
 `pdfDoc.save()` with no options and therefore inherits pdf-lib's
 default `parseSpeed: Slow`, which adds ~32 s of pure idle yielding
-to every build (the "Profiling pdf-lib's load" section explains why);
-also, it doesn't forward in-page `console.log` to its own stdout, and
-we have no way to call `page.evaluate()` from outside to pull out the
-timing data at the end. Driving Puppeteer ourselves gets both.
+to every build (see *Profiling pdf-lib's load* in
+[notes/01-baseline-and-detach.md](notes/01-baseline-and-detach.md) for
+the full investigation); also, it doesn't forward in-page `console.log`
+to its own stdout, and we have no way to call `page.evaluate()` from
+outside to pull out the timing data at the end. Driving Puppeteer
+ourselves gets both.
 
 The net effect: what we measure tracks what production renders --
 `docs/render-book.mjs` and `perf/measure.mjs` share the same helpers
@@ -229,5428 +231,75 @@ heap grows roughly linearly with page index, the layout phase is
 retaining per-page state -- a common cause of quadratic cost (every
 new page walks all previously-retained nodes).
 
-## Findings (initial run)
-
-A single run on `docs/_site-pdf/book.html` (1638 pages, May 2026,
-clean checkout, headless Chromium 122):
-
-| Phase    | Time      | % of total | Notes |
-| -------- | --------- | ---------- | ---   |
-| render   | 103.8 s   | 50 %       | paged.js layout. Per-page cost grows ~5x start-to-end. |
-| generate |  63.6 s   | 31 %       | 99.9% of it is `page.pdf()`. Raw Chrome output: 52 MB. |
-| process  |  39.6 s   | 19 %       | 90% of it is `PDFDocument.load`. Final PDF: 17 MB. |
-| **total**|**207.0 s**|            |       |
-
-### Render: super-linear, ~5x growth (confirms the suspicion)
-
-Per-page render cost, bucketed by 100 pages:
-
-```
-pages    0-  99   avg=  3.4 ms
-pages  100- 499   avg=  7-9 ms
-pages  500- 799   avg= 12-15 ms
-pages  800-1099   avg= 23-25 ms
-pages 1100-1599   avg= 27-39 ms
-pages 1600-1637   avg= 35 ms
-```
-
-The first-quarter / last-quarter ratio is **5.09x** with a
-position ratio of 4.0x. That's a clean linear-in-`n` per-page
-growth pattern, i.e. **total render time is roughly O(n^2)** with
-content variance overlaid. The single biggest outlier is
-pages 1100-1199 (37 ms) -- one chapter that's heavier than its
-neighbours.
-
-JS heap stays bounded around 10-25 MB throughout. So whatever's
-making later pages expensive is **CPU work that scales with `n`,
-not retained DOM**. Likely candidates: a `querySelectorAll` over
-the whole rendered tree on each page, cross-reference / named-flow
-resolution, or a handler walking already-laid-out content. The CPU
-profile in step 2 should pin which.
-
-### Generate: opaque Chrome PDF writer, large raw output
-
-`parseOutline` is 30 ms -- irrelevant. The whole 63-second phase
-is `page.pdf()`, i.e. Chromium serialising the laid-out DOM into
-52 MB of raw PDF bytes. This is the part we have least control
-over -- it's Chromium internals.
-
-What stands out is the **52 MB raw size**. After pdf-lib's
-`save()` re-emits it, the final file is **17 MB**. A 3x shrink
-from a re-serialise alone suggests Chrome isn't compressing
-streams aggressively (probably writing `/FlateDecode`-able streams
-uncompressed). Worth a follow-up sanity check, but not the
-priority.
-
-### Process: pdf-lib roundtrip overhead
-
-```
-load        : 35.62 s   parse the 52 MB raw PDF
-setOutline  :  0.01 s   write outline tree into the doc
-save        :  3.97 s   re-serialise (the 52 -> 17 MB shrink)
-```
-
-The actual outline / metadata mutations are basically free. **The
-whole 40-second phase is the cost of a load + save roundtrip on
-the big raw PDF that Chrome produced**, just so we can attach an
-outline that Chrome can't generate itself.
-
-This is a clear optimisation target: drop the pdf-lib roundtrip in
-favour of a streaming outline-injection tool (`qpdf`, `pdftk`,
-something hand-rolled with `pdf-lib`'s lower-level API) and the
-process phase could collapse to seconds. Tractable on its own
-without touching paged.js.
-
-### Where to focus
-
-- **Render** is the largest phase **and** the only super-linear
-  one. Step 2's CPU profile goes here first.
-- **Process** is purely linear-in-PDF-size overhead with a clean
-  fix path (skip pdf-lib's full parse). Independent of the
-  quadratic story.
-- **Generate** is Chrome's PDF writer. Not actionable from our
-  side without a Chromium patch; the 52 MB raw size deserves a
-  glance, but later.
-
-The user-perceived quadratic behaviour is real and lives in the
-render phase. Fixing it would knock 50-80 s off a 200 s build.
-Fixing process is independent and could knock off another 30 s.
-
-## Step 2: CPU profile of the render phase
-
-`measure.mjs --cpu-profile` wraps the render phase only (preview()
-through the `.pagedjs_pages` selector) in a V8 CPU profile via the
-CDP `Profiler` domain, and writes it to `render.cpuprofile` in the
-results folder:
-
-```
-run.bat --cpu-profile                          # default 1ms sampling
-run.bat --cpu-profile --cpu-sampling 5000      # 5ms sampling, smaller file
-```
-
-The profile covers only the render phase deliberately -- generate is
-opaque Chrome internals and process has a clean non-profiling fix, so
-both would dilute the signal.
-
-To view: open Chrome (or Edge) -> DevTools -> **Performance** tab ->
-click **Load profile...** (folder icon) and pick the `.cpuprofile`
-file. Or drag it onto the panel. The bottom-up view sorted by
-self-time pins the hot function fastest.
-
-What to look for, given the heap stayed bounded and per-page cost
-scales linearly with `n`:
-
-- A function whose self-time grows roughly with page index. The
-  bottom-up view aggregates across the whole phase, so a per-page
-  `O(n)` scan shows up as a fat self-time bar.
-- DOM-query hot spots: paged.js calling `querySelectorAll`,
-  `getElementsByTagName`, or `closest` against the whole rendered
-  tree on each new page.
-- Cross-reference / named-flow / footnote resolution that re-walks
-  prior pages.
-
-A 1 ms sampling interval over a 100 s render produces a profile around
-20-50 MB. The render phase itself runs ~5-15% slower while sampling.
-
-If the bottleneck turns out to be in paged.js itself, the next step
-is to patch our vendored copy. There is no widely-known maintained
-fork with the detach-pages optimisation at time of writing -- the
-named "performance forks" of paged.js that turn up in casual
-searches mostly don't exist or haven't shipped a fix. Worth checking
-the upstream issue tracker at
-[pagedjs/pagedjs on GitHub](https://github.com/pagedjs/pagedjs/issues)
-(currently the active home; older threads may still live on
-[Coko's GitLab](https://gitlab.coko.foundation/pagedjs/pagedjs/-/issues))
-before reinventing the fix.
-
-## Findings (CPU profile of render phase)
-
-A profiled run (`--cpu-profile`, 1 ms sampling) over the same
-1638-page book:
-
-```
-samples: 52314   duration: 95.18 s   us/sample: 1819
-
-   self_ms   self_%   function  @  source
-   -------   ------   ----------------------------------------------
-  63525.42   66.82%   getBoundingClientRect   (browser native)
-  19075.46   20.07%   (program)               (V8/Blink native)
-   1941.39    2.04%   findElement             browser.js:638
-   1497.43    1.58%   removeOverflow          browser.js:2196
-   1106.25    1.16%   (anonymous)             browser.js:29501
-   1002.54    1.05%   createBreakToken        browser.js:1796
-    580.42    0.61%   findEndToken            browser.js:2094
-    527.65    0.56%   create                  browser.js:2257
-    442.13    0.47%   afterPageLayout         browser.js:30184
-    ... rest sub-0.5% ...
-```
-
-**67% of render is `getBoundingClientRect`. Another 20% is V8/Blink
-native code -- almost certainly the synchronous layout passes those
-`getBoundingClientRect` calls force.** Together 87% of render is the
-browser doing layout work driven by paged.js measurement calls.
-
-> **Terminology**: this doc abbreviates `getBoundingClientRect` as
-> **gBCR** below. It's the DOM method that returns an element's
-> viewport-relative position and size; calling it forces Chromium
-> to synchronously flush any pending layout work before answering,
-> so "gBCR self-time" in a CPU profile is layout-flush attribution
-> charged to the JS frame that asked, not JS computation. The
-> same applies to other layout-reading APIs (`offsetTop`,
-> `clientHeight`, `getComputedStyle`, etc.) -- they're collectively
-> the *layout-flush surface* in the profile.
-
-### Why this is `O(n^2)`
-
-The hot caller is `Chunker.findOverflow` at `browser.js:1934`. Its
-loop:
-
-```js
-findOverflow(rendered, bounds, gap) {
-  if (!this.hasOverflow(rendered, bounds)) return;
-  ...
-  let walker = walk(rendered.firstChild, rendered);
-  while (!done) {
-    next = walker.next();
-    node = next.value;
-    if (node) {
-      let pos = getBoundingClientRect(node);   // <-- line 1957
-      ...
-    }
-  }
-}
-```
-
-Per page, paged.js walks the just-rendered fragment node-by-node
-calling `getBoundingClientRect` to find where the content overflows
-the page box. `findOverflow` itself only touches the new fragment, so
-in isolation it should be `O(page_content)`.
-
-The catch: `getBoundingClientRect` is **synchronous**. If the DOM has
-been mutated since the last layout (and paged.js mutates constantly
--- appending pages, splitting nodes, retrying overflow), each call
-forces Chromium to flush layout. **The cost of that flush scales
-with the live DOM tree**, which is every previously-laid-out page,
-all still attached to the document. Page `n`'s overflow walk pays
-`O(n)` layout cost. Total cost is `O(n^2)`.
-
-This matches everything else we saw:
-
-- Heap stays bounded (10-25 MB): no JS-level retention, just Blink's
-  layout tree growing with page count.
-- Per-page render cost grows ~10x from page 0 to page 1638: the
-  layout-flush cost grows linearly with `n`.
-- Content-driven spikes (the 1100-1199 chapter at 37 ms avg): pages
-  with heavier content do more walker iterations, multiplying the
-  per-iteration sync-layout cost.
-
-### Fix paths, in order of effort
-
-1. **Detach (or `display: none`) finalised pages.** Once a page's
-   layout is committed, take it out of the live document (or hide it
-   via `display: none` / `content-visibility: hidden`) so subsequent
-   sync layouts don't traverse it. Re-attach all pages at
-   `afterRendered` before `page.pdf()` runs. The idea is
-   well-understood and the patch is small (it lives in the chunker /
-   layout glue); collapses the render to roughly `O(n)`.
-
-2. **Batch the walker.** `findOverflow` reads
-   `getBoundingClientRect` on every node and Chromium can't batch
-   reads if they're interleaved with DOM writes. Splitting overflow
-   detection into a write-then-read-then-write phased pass would
-   reduce the number of forced layouts per page, even without
-   detaching previous pages. Smaller win than (1) but compatible
-   with it.
-
-For our pipeline, fix (1) would knock 60-80 seconds off the
-100-second render. Combined with skipping the pdf-lib roundtrip in
-Process (the easy win from the previous findings section), the
-total drops from ~207 s to roughly 90 s.
-
-## Fix applied: `perf/detach-pages.js`
-
-We went with fix (1) above, **as a paged.js handler rather than a
-bundle patch** -- a 20-line `Paged.Handler` subclass that sets
-`pageElement.style.display = 'none'` in `afterPageLayout` and
-restores them at `afterRendered` before `page.pdf()` runs. The
-existing `--additional-script` mechanism is exactly the extension
-point this needs, so no fork required.
-
-Wired into production in `docs/book.bat`. Originally:
-
-```bat
-npx pagedjs-cli _site-pdf\book.html -o _pdf\book.pdf ^
-    --outline-tags h1,h2,h3,h4 -t 600000 ^
-    --additional-script ..\perf\detach-pages.js
-```
-
-After the later `pagedjs-cli` removal (see "Dropping pagedjs-cli"
-below) the same `--additional-script` flag carries over to
-`render-book.mjs`:
-
-```bat
-node render-book.mjs _site-pdf\book.html -o _pdf\book.pdf ^
-    --outline-tags h1,h2,h3,h4 ^
-    --additional-script ..\perf\detach-pages.js
-```
-
-And into the perf harness via the `--detach-pages` flag.
-
-### Results
-
-Three-phase numbers, same 1638-page book, measured via the harness:
-
-| Phase    | Baseline | + handler | Δ |
-| -------- | -------- | --------- | --- |
-| render   | 103.8 s  |  50.9 s   | **-52.9 s (-51%)** |
-| generate |  63.6 s  |  60.2 s   | -3.4 s |
-| process  |  39.6 s  |  39.7 s   | unchanged |
-| **total**| **207.0 s** | **150.7 s** | **-56.3 s (-27%)** |
-
-Render last-quarter / first-quarter ratio: **4.56x -> 1.65x**.
-The remaining 1.65x is content variance (chapter 1100-1199 has
-dense tables / code blocks). No `n`-driven component remains.
-
-Per-page render curve, bucketed:
-
-```
-                  baseline    +handler
-pages 0-99      :   3.4 ms      6.1 ms
-pages 500-799   : 12-17 ms      5-6 ms       <- now flat
-pages 1100-1199 :  36.7 ms     13.4 ms       <- heaviest chapter, ~3x faster
-pages 1600-1637 :  37.7 ms     10.7 ms       <- ~3.5x faster
-```
-
-CPU profile shift (self-ms):
-
-```
-                                            baseline   +handler
-getBoundingClientRect      (native)            63525      19459
-(program)                  (V8/Blink)          19075       3676
-```
-
-`getBoundingClientRect` self-time dropped 3.3x and `(program)`
-(V8/Blink-internal layout) dropped 5.2x. Both are still in the top
-slots because layout work doesn't go to zero -- but they're now
-in line with the *current* page's content, not the entire growing
-document.
-
-### Production confirmation
-
-`docs/book.bat` (the real production path) reports:
-
-```
-✔ Rendering 1638 pages took 49,547 ms.
-✔ Generated
-✔ Processed
-✔ Saved to docs\_pdf\book.pdf            (10.5 MB)
-total elapsed: 185 s
-```
-
-The render number is within 3% of the harness measurement, no
-errors, PDF written. (The harness's PDF lands at 16.9 MB rather
-than 10.5 MB -- that's an artefact of the harness's slightly
-different post-processing flow, not the handler.)
-
-### What this didn't fix (independent follow-ups)
-
-The handler closes the quadratic-render hole. Remaining costs are
-linear-in-`n` and don't shrink with this change:
-
-1. **Process: 40 s of pdf-lib roundtrip on a 52 MB raw PDF.** Out
-   of that, `setOutline` is 11 ms; the other 39+ seconds is
-   `PDFDocument.load` + `pdfDoc.save` on the big Chrome output.
-   Replacing the load+save with a streaming outline-injection
-   tool (`qpdf`, hand-rolled with pdf-lib's lower-level API)
-   could cut another ~30 s.
-2. **Generate: 60 s in `page.pdf()`.** Chromium internals; mostly
-   opaque. The 52 MB raw size hints at uncompressed streams in
-   Chrome's writer -- worth a glance but not a quick fix.
-
-## Confirming the mechanism (instrumentation A/B)
-
-The CPU profile said `getBoundingClientRect` self-time dropped
-3.3x; the wall-clock measurement said render dropped 2x. To
-double-check that's actually due to the smaller layout tree (and
-not a profile-attribution coincidence, or paged.js silently
-skipping work, or new costs appearing elsewhere) the harness has
-an `--instrument` flag that wraps every in-page DOM accessor
-that *can* force a synchronous layout -- `getComputedStyle`,
-`getBoundingClientRect`, the `offsetWidth` / `offsetHeight` /
-`offsetTop` / `offsetLeft` family, and the `clientWidth` /
-`clientHeight` / `scrollWidth` / `scrollHeight` getters -- with
-counters and per-call timing.
-
-Same wrapper overhead in both runs, so absolute totals are
-inflated but the comparison is apples-to-apples.
-
-Two runs, same content, only difference is `--detach-pages`:
+## Current state
 
-| op                      | baseline                  | + detach                  |
-| ---                     | ---                       | ---                       |
-| `getBoundingClientRect` | 260,668 calls, **208 us** avg | 258,940 calls, **70 us** avg |
-| `scrollWidth`           |  37,911 calls,   1.4 us   |  37,047 calls,   1.1 us   |
-| `scrollHeight`          |  37,911 calls,   0.7 us   |  37,047 calls,   0.6 us   |
-| `getComputedStyle`      |   9,179 calls,   1.7 us   |   9,179 calls,   1.8 us   |
-| `offset*` / `client*`   |       **0 calls**         |       **0 calls**         |
-
-Instrumented render wall-clock: 82.1 s baseline -> 47.7 s with
-detach. Same shape as the un-instrumented runs.
-
-What the numbers say:
-
-1. **Call counts are essentially identical.** The detach handler
-   isn't getting paged.js to skip any work -- 260,668 vs 258,940
-   `getBoundingClientRect` calls is a rounding error. The fix
-   makes each call cheaper, not the number of calls smaller.
-
-2. **`getBoundingClientRect` per-call cost dropped 66 %**,
-   208 us -> 70 us. Smaller live layout tree, less to recompute
-   on each forced flush. Total cost on this op alone: 54.3 s ->
-   18.2 s, which is most of the wall-clock render savings.
-
-3. **`offsetWidth` / `offsetHeight` / `offsetTop` / `offsetLeft`
-   / `clientWidth` / `clientHeight` are called zero times** on
-   our content. The auto-width branches inside `finalizePage`'s
-   margin-box `forEach` (where those accesses live) never fire
-   on the kind of margin content we have (bottom-right page
-   number, nothing else).
-
-## Why detach-pages.js hooks `finalizePage`, not `afterPageLayout`
-
-The chunker's per-page hook order is:
-
-```
-beforePageLayout  ->  afterPageLayout  ->  finalizePage
-```
-
-`AtPage.finalizePage` (built into paged.js) reads `getComputedStyle`
-on margin-box children and writes `el.style["grid-template-columns"]`
-on them. `time-hooks.js` measurements show this method is **11x
-slower per call when run on a `display:none` page**:
-
-| Variant | `chunker.finalizePage::finalizePage` per call |
-| --- | --- |
-| Baseline (no detach) | 0.82 ms |
-| Detach hooked on `afterPageLayout` (hide *before* AtPage) | **9.24 ms** |
-| Detach hooked on `finalizePage` (hide *after* AtPage) | 0.67 ms |
-
-Chromium has fast paths for style reads/writes on visible elements;
-on hidden subtrees the same operations re-cascade each call. So
-hiding the page before AtPage runs makes AtPage pay a slow path
-worth ~8 ms/page over the whole render.
-
-`detach-pages.js` therefore hooks `finalizePage`, registering after
-AtPage so its method runs second. AtPage works on a visible page;
-we hide immediately after. The next chunker iteration sees pages
-0..N-1 hidden, so the original `getBoundingClientRect` saving in
-the chunker is preserved.
-
-**Wall-clock impact: none measurable.** A 4+4 interleaved A/B
-between the two variants showed render medians within ~1 s of
-each other (48.70 s vs 49.83 s un-instrumented; 50.78 s vs 50.90 s
-with `--time-hooks`), well inside the 3-7 s within-variant noise.
-The `finalizePage` hook is the variant we ship because it makes
-the CPU profile read honestly (no mystery cost inside AtPage) and
-gives AtPage the visible page it expects, not because of a
-measurable speedup.
-
-## Fix applied: `perf/incremental-pdf.mjs`
-
-The direct follow-up from the previous section's "What this didn't
-fix" list: kill the pdf-lib roundtrip that owned the 40 s process
-phase. 99 % of that was `PDFDocument.load` + `pdfDoc.save` on the
-52 MB raw PDF -- just so we can attach an outline tree and override a
-handful of `/Info` fields.
-
-Approach: a **PDF incremental update** (PDF 1.7 §7.5.6). We never
-call `PDFDocument.load`. Instead:
-
-1. Parse only the trailer, xref, Catalog, and Info objects -- using
-   `PDFParser` positioned at known byte offsets. Three small dicts,
-   ~50 ms.
-2. Build outline objects in a fresh `PDFContext`, allocating refs
-   starting from the original `/Size`.
-3. Mutate the parsed Catalog (add `/Outlines`, `/Lang`) and Info
-   (override `/Title`, `/Creator`, dates, ...) **in place**, keeping
-   their original refs.
-4. Append to the original bytes:
-   - The new and updated indirect objects.
-   - A new xref section whose subsections cover only those refs.
-   - A new trailer dict with `/Prev` pointing at the original xref.
-   - `startxref <new-offset>` + `%%EOF`.
-
-Readers chain backward through `/Prev` to resolve any ref we didn't
-touch (`/Pages`, `/Dests`, every font / image / content stream). The
-original 52 MB stays byte-identical; we just append a few hundred KB.
-
-The writer is built on pdf-lib's low-level primitives -- `PDFParser`
-for the few objects we read, `PDFContext` + `PDFDict` for object
-construction, `PDFCrossRefSection` + `PDFTrailerDict` for emitting
-the new xref / trailer. The expensive `PDFDocument.load` (which
-parses every indirect object in the file) is bypassed entirely.
-
-### Results
-
-Same 1638-page book, `--detach-pages` already in effect for both runs:
-
-| Phase    | pdf-lib roundtrip | + incremental | Δ |
-| -------- | ----------------- | ------------- | --- |
-| render   |  50.9 s   |  49.2 s   | unchanged (noise) |
-| generate |  60.2 s   |  60.9 s   | unchanged (noise) |
-| process  |  39.7 s   |   0.25 s  | **-39.4 s (-99%)** |
-| **total**| **150.7 s** | **110.3 s** | **-40.4 s (-27%)** |
-
-Combined with the detach-pages fix, the build is now **110 s vs
-207 s baseline (-47 %)**.
-
-Process-phase breakdown for the incremental path:
-
-```
-incremental    : 250 ms total
-appended       : ~410 KB (vs 52 MB raw Chrome PDF, untouched)
-new objects    : 1776 (outline root + 1773 outline items + Catalog + Info)
-```
-
-The output reparses cleanly under both pdf-lib's full
-`PDFDocument.load` and poppler's `pdfinfo` (PDF 1.4, 1638 pages,
-A4, all metadata intact). Outline navigation works in the viewer.
-
-### The size tradeoff
-
-`pdf-lib`'s `save()` quietly deflate-compresses content streams as a
-side effect of full re-emission. That's why the old output was 17 MB
-even though Chrome's raw PDF is 52 MB. The incremental writer keeps
-Chrome's bytes verbatim, so the final file is essentially "52 MB +
-outline":
-
-| Output mode       | Final PDF size |
-| ---               | --- |
-| pdf-lib roundtrip | 16.9 MB |
-| incremental       | 52.7 MB |
-
-This is the same uncompressed-streams problem the initial findings
-section flagged ("Chrome isn't compressing streams aggressively").
-Two ways to claw the size back without going back to a full parse,
-both independent follow-ups:
-
-1. **qpdf post-pass** -- `qpdf --object-streams=generate
-   --compress-streams=y in.pdf out.pdf` re-emits the file with deflate
-   on every stream, without reifying document semantics. C++,
-   skips object-by-object reconstruction; should be much faster than
-   pdf-lib's load. Adds a binary dependency.
-2. **Deflate inside the writer** -- detect raw streams without
-   `/Filter` in the parsed objects and rewrite them with
-   `/Filter /FlateDecode` + a pako-deflated body. Same engineering
-   shape as qpdf but in JS, and lets the incremental update stay
-   self-contained. Requires walking the full body of the original
-   PDF, which puts back some of the cost we just removed.
-
-The incremental writer ships as-is; pick a size strategy when /
-if file size becomes a concern.
-
-### Production integration
-
-`measure.mjs --incremental` exercises the writer for measurement.
-`docs/book.bat` doesn't ship it: production goes through the pdf-lib
-roundtrip path (with `parseSpeed: Fastest`, now ~5 s and gives the
-17 MB compressed output). Switching production to the incremental
-writer is a one-line change in `docs/render-book.mjs` (call
-`applyOutlineAndMetadataIncremental` from `../perf/incremental-pdf.mjs`
-instead of `PDFDocument.load + ... + save`), gated behind whether the
-larger output is acceptable for that pipeline.
-
-## Profiling pdf-lib's load: 79 % was idle yielding
-
-The "Fix applied: detach-pages" section above showed the pdf-lib
-roundtrip at 39.7 s for the process phase. After profiling, **most
-of that wasn't pdf-lib doing work -- it was pdf-lib yielding to the
-event loop**.
-
-`PDFDocument.load` defaults to `parseSpeed: ParseSpeeds.Slow = 100`
-objects per tick, with an `await waitForTick()` between batches.
-`pdfDoc.save` does the same with `objectsPerTick: 50`. For our
-~50k-object PDF that's ~500 yields during load, ~1000 during save,
-each costing ~5-10 ms of pure idle on a quiet system.
-
-A CPU profile of `PDFDocument.load` running standalone on the 52 MB
-Chrome output (`node --cpu-prof`, fresh process, no concurrent work):
+End-to-end on the 1651-page book, `book.bat` path, after all shipped
+optimisations:
 
 ```
-samples: 3441   duration: 6.09s   us/sample: 1770
-
-   self_ms   self_%   function  @  source
-   -------   ------   ----------------------------------------------
-   4766.25   78.92%   (idle)                  (V8 idle wait)
-    251.41    4.16%   PDFRef.of               PDFRef.js:34
-    196.53    3.25%   (garbage collector)
-    116.85    1.93%   (program)
-     63.74    1.06%   PDFObjectParser.parseString
-     46.03    0.76%   BaseParser.parseRawInt
-     38.95    0.64%   BaseParser.parseRawNumber
-     35.41    0.59%   PDFObjectParser.parseNumberOrRef
+render   :   ~8 s    (was ~104 s in the original baseline)
+generate :  ~32-43 s (was ~64 s; Chromium-version-bump sensitive)
+process  :   ~5 s    (was ~40 s; pdf-lib parseSpeed:Fastest)
+total    :  ~45-60 s (was 207 s -- 3.5-4.6x speedup)
 ```
 
-On a 6 s load, **4.77 s is V8 sitting on its hands** between
-scheduled batches. Actual parsing self-time is well under a second;
-the rest is GC and V8 internals.
-
-Why such a cautious default? pdf-lib targets the browser too, where
-locking the main thread for 30+ s to parse a big PDF would freeze the
-page. In Node, with the harness having no other work to do, yielding
-is pure overhead.
-
-### Wins from `parseSpeed: Fastest` (objects/tick = Infinity)
-
-Three-variant roundtrip on the same 52 MB PDF, fresh process each
-time (`profile-roundtrip.mjs`):
-
-| parseSpeed / objectsPerTick | load   | save  | total   |
-| ---                         | ---    | ---   | ---     |
-| **Slow / 50 (default)**     | 36.7 s | 3.8 s | 40.5 s  |
-| Fast / 1500                 | 3.0 s  | 2.6 s | 5.6 s   |
-| **Fastest / Infinity**      | **2.0 s** | **2.7 s** | **4.7 s** |
-
-`save` is barely affected by `objectsPerTick` -- its CPU work
-dominates the yield overhead -- but `load` collapses by **18x**.
-
-### Wired into the harness
-
-`measure.mjs`'s default pdf-lib roundtrip path now passes
-`parseSpeed: ParseSpeeds.Fastest` and `objectsPerTick: Infinity`.
-End-to-end on the book (`--detach-pages`, default = pdf-lib path,
-no `--incremental`):
-
-| Phase    | Old pdf-lib defaults | Fast knobs | Δ |
-| -------- | -------------------- | ---------- | --- |
-| render   |  50.9 s   |  45.7 s   | noise |
-| generate |  60.2 s   |  52.4 s   | noise (Chrome variance) |
-| process  |  39.7 s   |   7.8 s   | **-31.9 s (-80 %)** |
-| **total**| **150.7 s** | **105.9 s** | **-44.8 s (-30 %)** |
-
-Result: the pdf-lib roundtrip is now **competitive with the
-incremental writer** (105.9 s vs 110.3 s total) **while still
-producing a 17 MB output** (vs 53 MB for incremental, because
-`save()` flate-compresses content streams as it re-emits them).
-
-### What this reinterprets
-
-The "Fix applied: detach-pages" table is still accurate, but its
-39.7 s process column reflects pdf-lib's default tick-yielding, not
-its actual work. A reader benchmarking pdf-lib on its merits should
-compare against the **7.8 s** number, not 40 s.
-
-The incremental writer (above) still produces the fastest process
-phase by far (0.25 s) and remains useful when sub-second matters
-more than file size. But for the common case the single-line
-`parseSpeed: Fastest` tweak is the immediate win.
-
-## Chromium `Page.printToPDF` knob survey
-
-While we were here, we audited which Chromium / CDP options affect
-PDF output. Partly to confirm "is there something Chrome could
-compress for us?" (no), partly because one option turned out to be
-a real win: `outline: true`.
-
-Verified against `devtools-protocol@0.0.1312386` and
-`puppeteer-core@22.15.0` (both shipped under `perf/node_modules`).
-
-### `outline: true` -- Chrome can emit /Outlines itself
-
-CDP's `Page.printToPDF` accepts `generateDocumentOutline: true` since
-Chrome M122 (Feb 2024). Puppeteer exposes it as `outline: true` since
-v22.x. Behaviour:
-
-- Chrome walks the rendered DOM's `<h1>..<h6>` once and emits a
-  /Outlines tree with **page+coords destinations** (`[N 0 R /XYZ x y z]`)
-  instead of named destinations.
-- Implies `tagged: true` (the outline is built from the accessibility
-  tree). Puppeteer enforces this in `util.ts:395`.
-- Requires the launch flag `--generate-pdf-document-outline`.
-  Puppeteer 22+ adds it automatically in `ChromeLauncher.defaultArgs()`,
-  so both `measure.mjs` and `docs/render-book.mjs` get it for free.
-- **No tag-level filter**: walks `h1..h6` unconditionally. There is
-  no equivalent of our `--outline-tags h1,h2,h3,h4` knob.
-
-Measured cost on the 1638-page book with `--chrome-outline --detach-pages`:
-
-| Phase    | injected outline | Chrome outline | Δ |
-| -------- | ---------------- | -------------- | --- |
-| generate |  52.4 s   |  53.8 s   | +1.4 s (Chrome walking the headings) |
-| process  |   7.8 s   |   5.3 s   | -2.5 s (no outline objects to save) |
-| **total**| **105.9 s** | **107.8 s** | +1.9 s |
+Renderer memory peaks at ~1.76 GB; full process tree peaks at ~2.3-3.3
+GB private working set (see [notes/07-memory.md](notes/07-memory.md)).
+The render hot path is structurally near its floor; the largest
+remaining untried lever is `pageRanges` sharding for `generate`, but
+the per-shard memory cost makes it impractical at current scale.
+Chrome's native `outline: true` could trim another ~5 s off `process`
+but requires a `role="presentation"` preprocessor pass on h5/h6
+headings; not pursued yet.
 
-Total is roughly a wash -- one cost shifts to another. The real
-benefit is **fewer moving parts**: no `parseOutline`, no
-`setOutline`, no incremental-writer outline objects, just metadata.
+What shipped, in chronological order, with attribution to the phase
+file documenting each:
 
-### Does Chrome's outline match the injected one?
-
-We diffed the two outputs on the 1638-page book (`compare-outlines.mjs`):
-`results/pdf-lib-fastest/book.pdf` (injected, 1773 entries from
-`--outline-tags h1..h4`) versus `results/chrome-outline-on/book.pdf`
-(Chrome's, 6023 entries total).
-
-Naïvely filtering Chrome's tree to "depth ≤ 3" to approximate our
-h1..h4 view gives 1820 entries -- close to 1773 in count, but **not
-equivalent** structurally. Two reasons:
-
-1. **Chrome walks all h1..h6 unconditionally.** First concrete
-   divergence is at the "Alias Types" section: the source
-   ([book.html:302](docs/_site-pdf/book.html:302)) has
-   `<h5 id="ch-Features-Language-Alias-Types-example">Example</h5>`
-   immediately after the h3 "Alias Types". Our `--outline-tags`
-   filter correctly drops it; Chrome includes it. Every such
-   insertion shifts the rest of the pre-order walk.
-2. **Chrome's tree depth ≠ HTML heading level.** Chrome collapses
-   skipped levels: an `<h5>` directly under an `<h3>` becomes
-   depth+1 (not depth+2). So "filter to depth ≤ 3" does *not*
-   extract "h1..h4 only" -- it extracts the first four levels of
-   *nesting*, which can be any mix of h1..h6 depending on context.
-
-Numerical summary:
-
-| metric                                  | value |
-| ---                                     | --- |
-| injected entries                        | 1773 |
-| Chrome entries (h1..h6, all depths)     | 6023 |
-| Chrome entries filtered to depth ≤ 3    | 1820 |
-| pre-order matches (vs injected)         | 27 / 1820 |
-| same title+depth, different page        | 10 |
-
-The 10 "page-only mismatches" are the smoking gun for structural
-drift: same heading title in both outlines but pointing at different
-sections of the book. The deltas grow as the walk progresses --
-e.g. "Properties" at A=p956 vs B=p883 (Δ = -73 pages), and similar
-near the end of the book. By that point Chrome and our outline are
-literally talking about different headings that happen to share a
-name (every class in the reference docs has its own "Properties"
-sub-heading).
-
-### Selectively excluding headings from Chrome's outline
-
-Chrome's outline is built from the accessibility tree (puppeteer
-enforces `tagged: true` alongside `outline: true` for this reason).
-Anything that hides a heading from a11y excludes it from the outline.
-Tested matrix (`probe-outline-exclusions.mjs`):
-
-**Excluded** -- the heading is dropped from `/Outlines`:
-
-| attribute on the heading or an ancestor | clean? | notes |
+| Fix | Phase | Saved |
 | --- | --- | --- |
-| `role="presentation"`     | yes | Removes heading semantic only. Visual rendering, DOM, anchor `#id` targets all unchanged. **The cleanest knob.** |
-| `role="none"`             | yes | Alias of `presentation`. |
-| `role="generic"`          | yes | Any non-heading role works. |
-| `aria-hidden="true"`      | -   | Excludes the whole subtree from a11y. Heavier -- also affects screen readers. |
-| `hidden` attribute        | no  | Also visually hides. |
-| `display: none`           | no  | Same. |
-| `visibility: hidden`      | no  | Same. |
-
-**No effect** -- Chrome ignores these:
-
-| attribute            | why |
-| ---                  | --- |
-| `bookmark-level: none` (CSS GCPM) | Chrome doesn't implement GCPM. |
-
-**Reverse direction.** `<div role="heading" aria-level="3">Foo</div>`
-*adds* an h3-level entry to Chrome's outline despite not being an
-HTML heading. Useful if you ever want an outline entry that doesn't
-look like a heading on screen.
-
-**Implication for our pipeline.** The "Chrome's outline is too
-noisy" objection above isn't actually structural -- it's one CSS
-selector away from being fixed. A preprocessor step that adds
-`role="presentation"` to every `<h5>` and `<h6>` in the Jekyll
-build would let Chrome's `outline: true` produce the same h1..h4
-view we want today. We haven't done that step yet, so we still
-ship the injected outline -- but the path from "Chrome's outline
-works for measurement only" to "Chrome's outline ships in
-production" is now ~5 lines of Jekyll plugin code, not a
-fundamental redesign.
-
-### Did pagedjs-cli ever try Chrome's outline?
-
-No. Searched (`gh api search/issues`, `gh api search/code`, and
-web search):
-
-- `repo:pagedjs/pagedjs-cli outline` -- 2 hits, both unrelated
-  (TOC page-number bug, rowspan/colspan).
-- `org:pagedjs chromium outline` -- 1 hit (the same TOC bug).
-- `"pagedjs printToPDF outline"` -- 0 hits.
-- `generateDocumentOutline org:pagedjs` (code search) -- 0 hits.
-- `"--generate-pdf-document-outline" org:pagedjs` -- 0 hits.
-
-Timing: Chrome's `generateDocumentOutline` shipped M122 (Feb 2024);
-[pagedjs-cli](https://github.com/pagedjs/pagedjs-cli)'s last
-meaningful change is May 2024 (Docker hyphenation). The project
-is in near-maintenance mode (21 stars). The feature post-dates
-active development, and the unfilterable-outline regression
-(without the `role="presentation"` workaround above) would have
-been a real concern for existing `--outline-tags` users -- so
-even a casual look would probably have ended in "we'll keep
-injecting for now". Nobody appears to have looked.
-
-### What's not exposed in CDP (we checked)
-
-- **No stream-compression flag.** Chromium uses Skia's `SkPDF`,
-  which writes content streams uncompressed. There's a C++-only
-  `SkPDF::Metadata::fPDFA` setting; no CDP plumbing for it. This is
-  *why* `save()` re-emission shrinks 52 MB → 17 MB.
-- **No object-streams flag, no font subsetting / image downsampling
-  knobs, no PDF/A mode.** Skia subsets fonts automatically per face.
-- **No parallelism knob.** Generate's 60 s in `page.pdf()` is
-  single-threaded Skia walking the layout tree.
-
-### What might still be worth trying
-
-- **`tagged: false`** -- drops the StructTreeRoot, saving ~10-20 %
-  of generate time and file size. Loses accessibility *and* the
-  Chrome outline (tagging is a prerequisite). Probably a no for
-  our use; documenting for completeness.
-- **`pageRanges` sharding** -- run `page.pdf()` N times with
-  disjoint ranges on parallel browser pages. Each shard serialises
-  only its slice and they run concurrently. Biggest unused lever
-  for the 60 s generate phase, but requires a PDF concatenation
-  post-pass (pdf-lib can do it).
-- **`transferMode: 'ReturnAsStream'`** -- puppeteer already
-  hard-codes it. Without it Chrome buffers + base64-encodes the
-  whole PDF into one JSON message; very slow and memory-heavy.
-
-## Where this leaves us
-
-The full menu of fixes, all measured against the original 207 s
-baseline:
-
-| Configuration                          | render | generate | process | total | size |
-| ---                                    | ---    | ---      | ---     | ---   | ---  |
-| original                               | 103.8s | 63.6s    | 39.6s   | 207.0s | 17 MB |
-| + detach-pages                         |  50.9s | 60.2s    | 39.7s   | 150.7s | 17 MB |
-| + detach + **parseSpeed:Fastest**      |  45.7s | 52.4s    |  7.8s   | **105.9s** | **17 MB** |
-| + detach + incremental writer          |  49.2s | 60.9s    |  0.25s  | 110.3s | 53 MB |
-| + detach + Chrome outline              |  48.7s | 53.8s    |  5.3s   | 107.8s | 17 MB |
-
-**Practical winner: `+ detach + parseSpeed:Fastest`.** Half the
-original wall time, same output size, one-line change. Ship this
-first regardless of what else gets layered on top.
-
-The incremental writer is still the fastest process phase (0.25 s)
-and remains the right answer if file size doesn't matter and
-sub-second process does.
-
-Chrome's outline is the simplest *architecture* (no parseOutline,
-no setOutline, no incremental outline objects -- just metadata),
-and the "unfilterable h1..h6" objection turns out to be a
-preprocessor change away from being solved: tag every `<h5>` /
-`<h6>` in the Jekyll build with `role="presentation"` and Chrome's
-outline collapses to the same h1..h4 view we want today. With that
-change, the totals look like:
-
-| Configuration                                     | render | generate | process | total | size |
-| ---                                               | ---    | ---      | ---     | ---   | ---  |
-| + detach + parseSpeed:Fastest *(today)*           |  45.7s | 52.4s    |  7.8s   | 105.9s | 17 MB |
-| + detach + parseSpeed:Fastest + Chrome outline    |  48.7s | 53.8s    |  5.3s   | 107.8s | 17 MB |
-| *(latter, with role="presentation" on h5/h6 -- pending)* | | | | | |
-
-The compound win isn't in wall time -- it's in deleting code:
-`parseOutline`, `setOutline`, and the entire outline branch of the
-incremental writer all go away. Worth it if/when someone wants to
-trim the surface area.
-
-## Dropping `pagedjs-cli`
-
-`pagedjs-cli` did three useful things for us and one harmful one. On
-the useful side: it shipped the paged.js browser bundle in
-`dist/browser.js`, the outline + metadata helpers in
-`src/outline.js` and `src/postprocesser.js` (~250 LOC total), and a
-CLI wrapper for the pdf pipeline. On the harmful side, the wrapper
-calls `PDFDocument.load(pdf)` and `pdfDoc.save()` with no options
-and therefore inherits the slow defaults that wasted ~32 s per build
-(see "Profiling pdf-lib's load" above). Patching upstream to fix
-that is plumbing for plumbing's sake; the rest of pagedjs-cli is
-already mostly duplicated by our harness.
-
-So we vendored what we needed and dropped the dep:
-
-- `docs/lib/paged.browser.js` -- `pagedjs-cli@0.4.3/dist/browser.js`,
-  byte-for-byte. MIT-licensed; license header preserved at top of file.
-- `docs/lib/outline.mjs`  -- `src/outline.js`, ESM-ified, attribution
-  in the file header.
-- `docs/lib/postprocesser.mjs` -- `src/postprocesser.js`, same.
-- `docs/render-book.mjs` -- the production driver. Argv-compatible
-  with the subset of `pagedjs-cli` flags `book.bat` actually used
-  (`-o`, `--outline-tags`, `-t`, `--additional-script`). Calls
-  pdf-lib with `parseSpeed: Fastest` + `objectsPerTick: Infinity`
-  inline, no patching required.
-- `docs/book.bat` -- swapped `npx pagedjs-cli ...` for
-  `node render-book.mjs ...`. Same CLI, ~32 s faster (pdf-lib idle
-  yielding gone), one fewer transitive dependency tree.
-
-Both `docs/package.json` and `perf/package.json` now depend directly
-on `puppeteer` + `pdf-lib` + `html-entities` instead of inheriting
-them via `pagedjs-cli`. `perf/measure.mjs` imports from `docs/lib/`
-so the harness and production share the exact same code path through
-the helpers and bundle -- whatever production renders, the harness
-measures.
-
-End-to-end on the 1638-page book through the new driver:
-
-```
-render:   53.5s  (1638 pages)
-generate: 68.8s  (raw 52.3 MB)
-process:  5.1s
-saved:    docs\_pdf\book.pdf  (16.9 MB)
-total:    130.4s
-```
-
-(The total includes puppeteer launch + page nav overhead the
-harness elides, so it reads a few seconds higher than the harness's
-105 s headline.)
-
-## Restoring live progress
-
-Dropping `pagedjs-cli` (above) quietly dropped its ora spinners
-along with the rest of the CLI. The terminal goes silent for the
-~50 s render and ~60 s generate phases -- on a 130 s build, most
-of the wall time looks like the process is hung.
-
-Render phase: restored via `docs/lib/progress-handler.js`, a small
-`Paged.Handler` subclass that emits a `[render-progress] page=N
-elapsed=Ns` line from `afterPageLayout`. `render-book.mjs` listens
-on `page.on('console')` and re-renders the line as a
-`\r`-overwritten TTY status (`rendering: 234 pages (12.4s)`), or
-every 100 pages on its own line when stdout is piped (CI / log
-files). The live line is cleared just before the final
-`render: 53.5s (1638 pages)` summary is printed.
-
-The handler is a separate in-page script rather than inlined into
-`render-book.mjs` because `addScriptTag({ path })` loads it via
-file:// into the headless page -- it has to be a real file. It's
-structurally parallel to `perf/timing-handler.js`, which uses the
-same hook but additionally retains per-page detail on
-`window.__pagedTiming` for offline analysis. The production version
-stays minimal -- just the log line.
-
-Generate phase: a 500 ms wall-clock heartbeat in `render-book.mjs`
-writes `generating: 23.4s` to a `\r`-overwritten TTY line during
-the `page.pdf()` wait. Elapsed time only; no byte- or page-count
-signal. The line is cleared before the final
-`generate: 68.8s (raw 52.3 MB)` summary, same shape as the render
-phase.
-
-We initially tried byte-level progress -- drive `page.pdf()` at the
-CDP level with `transferMode: 'ReturnAsStream'` + chunked `IO.read`.
-On the Chromium we ship with, the bytes don't actually stream:
-Chrome's SkPDF writer buffers the whole document internally and
-emits all 52 MB in one tick at the end. The wrapper showed `0.0 MB`
-for ~50 s then flickered `52 MB` for one frame before the summary
--- the heartbeat was doing all the visible work. Dropped the CDP
-code; the buffer-then-dump finding is preserved in a comment above
-the heartbeat so the next person doesn't re-investigate.
-
-The process phase stays silent. At ~5 s with the fast pdf-lib knobs
-(`parseSpeed: Fastest`) it's not worth a progress signal of its own.
-
-## Revisiting `AtPage.finalizePage`
-
-The post-detach CPU profile in the "Fix applied: `perf/detach-pages.js`"
-section above showed an `(anonymous) @ browser.js:29501` row at **13.7 s
-self-time** -- the `["top","bottom"].forEach(...)` lambda inside
-`AtPage.finalizePage`. That looked like a fat target.
-
-It wasn't, for two reasons:
-
-1. **The 13.7 s number was stale.** It came from the *first*
-   detach-pages.js variant, which hooked `afterPageLayout` and hid the
-   page *before* AtPage ran -- so AtPage paid Chromium's slow style-
-   cascade path on a `display:none` subtree (~9 ms/page). The shipping
-   variant hooks `finalizePage` and hides *after* AtPage, so AtPage
-   sees a visible page and the same lambda is **~0.7 ms/page = ~1.1 s
-   total render**. Re-measured on a fresh profile, the lambda is
-   ~1.0 s self-time, not 14 s. The original number is correct for the
-   variant it was measured on, but doesn't reflect current ship.
-2. **Most of that ~1 s isn't query CPU.** Per-page the method does
-   ~17 `querySelector` calls plus a few `getComputedStyle` reads.
-   Native query self-time across the whole render is ~340 ms
-   (`querySelector` ~155 ms + `querySelectorAll` ~185 ms in the
-   unpatched baseline). The rest of the lambda's ~1 s is the
-   downstream layout flush triggered by `getComputedStyle` and the
-   style writes -- unaffected by query consolidation.
-
-We patched it anyway, as a cleanup. `docs/lib/paged.browser.js`'s
-`finalizePage` now builds a `__mLookup` table once per page via a
-single `querySelectorAll` over all 16 known margin-cell + margin-
-group class selectors, then the two forEach loops index that table
-instead of calling `page.element.querySelector(...)` 4× per
-iteration. The patch is marked `// PATCH: consolidate` at each of
-the three touch points so a future re-vendoring of the bundle can
-grep for it.
-
-### A/B results
-
-Interleaved 3+3 (A1 B1 A2 B2 A3 B3), `--detach-pages --cpu-profile`,
-same 1638-page book each run:
-
-| metric                        | A (patched) | B (unpatched) | Δ |
-| ---                           | ---         | ---           | --- |
-| render wall-clock, mean       | 49.45 s     | 49.91 s       | -0.46 s (noise; within-variant range 4-13 s) |
-| `querySelector` self-time     | <50 ms      | 155 ms        | -155 ms |
-| `querySelectorAll` self-time  | 247 ms      | 183 ms        | +64 ms |
-| **query CPU total**           | **~247 ms** | **~338 ms**   | **-91 ms (-27 %)** |
-| finalizePage lambda self-time | 1033 ms     | 1025 ms       | unchanged |
-
-The patch does what it says on the tin: ~91 ms shifts out of native
-`querySelector` and into a single `querySelectorAll`. Wall-clock
-delta is in the noise; the within-variant spread (3-13 s across runs
-of the same variant) drowns it out.
-
-The lambda's self-time being unchanged is the load-bearing
-observation: query consolidation doesn't reduce the layout-flush
-component, which is most of the 1 s. The next lever in this method
-would be **read/write batching** -- hoist all `getComputedStyle`
-reads to the top of `finalizePage` before any style writes, so the
-write-then-read pattern stops forcing a flush mid-method.
-
-### Read/write batching
-
-We applied the hoist anyway, as a follow-up cleanup. After the
-`__mLookup` block above, `finalizePage` now reads every relevant
-`max-width` / `max-height` value into two `Map`s (`__maxW`, `__maxH`)
-in a single batch -- gated by the same `.hasContent` check the
-original conditionals used. The two forEach loops then consume
-those cached values instead of calling `getComputedStyle` inline.
-Marked `// PATCH: max-width reads hoisted` / `max-height reads
-hoisted` at each touch point.
-
-**For this book**, the hoist is a no-op behaviourally. Our @page CSS
-sets content on exactly one corner (bottom-right page number), so
-only one `.hasContent` cell exists per page; the original code did
-exactly one `getComputedStyle` per page and therefore one forced
-flush. The hoisted version does the same.
-
-Smoke test, single render with `--detach-pages` (no profiling): 1638
-pages, 16.9 MB output, render 47.98 s, ratio 1.69x. All in the noise
-band from the consolidate-querySelector A/B.
-
-**For docs with multi-cell marginalia** (running headers + footers +
-page numbers across several corners) the hoist collapses N forced
-flushes -- one per cell that hits the `if (xContent)` branch in the
-original -- down to 1. The win scales with marginalia density.
-
-### Cross-page memoization
-
-The next layer of duplicate work: `finalizePage`'s computation is a
-pure function of `(page.element.className, this.marginalia, CSS
-@page rules)`. The marginalia map and CSS are static; only the
-className varies. **Two pages with the same className get the same
-four `grid-template-columns` / `grid-template-rows` values.** So we
-cache the result.
-
-Implementation: a `this.__finalizeCache: Map<string, {top, bottom,
-left, right}>` on the AtPage instance, keyed by
-`page.element.className`. The cache check sits between the
-`__mLookup` build and the GCS hoist. On a hit we apply the cached
-values via `__mLookup` and `return` -- Phases B and C never run. On
-a miss the existing code runs and the result is recorded at the end
-of the method by reading back the just-written `.style.grid-
-template-*` values.
-
-Phase A's marginalia `.hasContent` classifier still runs on every
-page (the class has to be added to *this* page's elements so the
-@page-margin CSS rules apply). Only the grid-template
-computation is skipped.
-
-**Assumption.** Cache key is `page.element.className`. Sound as long
-as @page rules don't use position-dependent selectors (e.g.
-`:nth-of-type`) that pick different rules on pages that share a
-className. Common case, true for this book; comment in the bundle
-flags the caveat.
-
-Smoke render (`--detach-pages`, no profile): 1638 pages, **16.9 MB
-output (byte-equivalent to the pre-patch run)**, render 48.27 s.
-Wall-clock impact still in the noise -- same reason as the hoist:
-the flush we skip in `finalizePage` is just deferred to the next
-chunker iteration's `findOverflow`. Total layout work the document
-demands doesn't shrink. What does shrink is the JS-side work --
-~1633 of 1638 pages now skip ~17 `querySelector` lookups, 6
-`classList.contains` reads, and the GCS pass entirely -- but that's
-sub-millisecond per page and disappears into the noise band.
-
-We're not going to keep iterating on `finalizePage`: budget is ~1 s
-total render even when every flush triggers, so further work here is
-cleanup-only.
-
-### Hoisting grid-template emission to parse time
-
-The cleanup payoff. Three patches in a row -- `__mLookup`, GCS hoist,
-cross-page memoization -- had whittled `finalizePage`'s per-page
-work to ~30 sub-ms ops, then to one Map lookup. The architectural
-move was to **delete the hot spot rather than keep optimizing
-around it**: hoist the grid-template computation out of the
-per-page JS path and into the polisher's @page CSS emission, so the
-rules are emitted once at parse time and the browser applies them
-via cascade for every matching page.
-
-The decision tree's inputs are static at parse time:
-
-- **hasContent** per `(page-class, margin-cell)` -- already recorded
-  in `this.marginalia[sel]` by `addMarginaliaStyles` for Phase A's
-  classifier, and invariant per page-class regardless of page index.
-- **max-width / max-height** per cell -- created by the same walker
-  that copies `width`/`height` declarations to `max-width` /
-  `max-height` on corner cells. The runtime
-  `getComputedStyle(el)["max-width"]` reads return the CSS-cascade
-  result of those rules, which is the value the parser saw. We
-  capture the string at parse time on the marginalia entry,
-  defaulting to `"none"` when no declaration exists.
-
-`AtPage.afterTreeWalk` already runs `addPageClasses`, which
-populates `this.marginalia` and emits the per-cell margin-styling
-rules. We extended it with `emitMarginGridTemplates`: for each page
-entry in `this.pages`, build the effective per-cell `hasContent +
-maxWidth + maxHeight` by unioning across every marginalia entry
-whose page-selector is a subset of the page's class signature
-(matching the runtime Phase A OR-cascade; `maxWidth` follows CSS
-cascade and takes the most-specific declared value). Run the same
-decision tree the runtime did on that snapshot. Emit one rule per
-margin group with `selectorsForPage(page)` as the selector and the
-computed `grid-template-columns` / `-rows` as a Raw value
-declaration. Skip emission for the four offset-fallback branches
-that need `offsetWidth` measurement (they can't be pre-computed --
-they read live layout).
-
-For this book that produces 24 rules total -- 6 page-class
-signatures (`*`, `:first`, `divider`, `front-matter`,
-`part-foreword`, `chapter-divider`) × 4 margin groups -- all with
-the same `0 0 1fr` value (the static branch the decision tree
-produces when only one corner has content and no widths are
-declared):
-
-```css
-.pagedjs_page .pagedjs_margin-top    { grid-template-columns: 0 0 1fr; }
-.pagedjs_page .pagedjs_margin-bottom { grid-template-columns: 0 0 1fr; }
-.pagedjs_page .pagedjs_margin-left   { grid-template-rows:    0 0 1fr; }
-.pagedjs_page .pagedjs_margin-right  { grid-template-rows:    0 0 1fr; }
-... (5 more page-class signatures) ...
-```
-
-`finalizePage` collapses to **Phase A + an offset-only Phase B**:
-
-- **Phase A** unchanged. Per-page DOM, can't be hoisted -- it has to
-  add `.hasContent` to the freshly created margin cells so the
-  base-style `.pagedjs_margin:not(.hasContent) { visibility: hidden
-  }` rule unhides the right ones.
-- **Phase B offset fallbacks.** The four branches in the upstream
-  Phase B that compute `minmax(%, ...)` templates from `offsetWidth`
-  measurements stay -- they read live layout and can't be
-  pre-computed. The forEach loop early-exits via a `couldFire` check
-  (two-or-more cells have content) before any `getComputedStyle` or
-  `querySelector` on the margin group; for this book that gate fails
-  on every page so the forEach is dominated by three `querySelector`
-  calls + three `classList.contains` reads per group.
-- **Phase C** disappears entirely. Every branch in the upstream
-  Phase C (left/right vertical groups) is static at parse time --
-  the upstream code has no offset measurement in those paths.
-- All three prior PATCH blocks come out: `__mLookup` and
-  cross-page memoization had no callers left, and the GCS hoist
-  survives only as an inline batched read of `max-width` inside
-  the `couldFire` gate (for documents whose marginalia would
-  reach the offset fallbacks).
-
-### Verifying it
-
-Instrumented A/B on the same 1638-page book:
-
-| op                      | pre-emit (3 patches) | post-emit | Δ |
-| ---                     | ---                  | ---       | --- |
-| `getComputedStyle`      | 9,179 calls          | **5,903 calls** | **-3,276 (-36%)** |
-| `getBoundingClientRect` | 258,940              | 258,940   | unchanged (different code path) |
-| `offsetWidth`           | 0                    | 0         | unchanged (gate never fires) |
-| render wall-clock       | 47.6 s               | 46.0-47.0 s | noise |
-| pdf size                | 16.9 MB              | 16.9 MB   | unchanged (±27-bytes timestamp variance) |
-
-The -3,276 GCS drop is exactly two reads per page eliminated -- the
-prior GCS hoist batched the per-cell `max-width` reads on
-`.hasContent` cells (one per `top-right`, one per `bottom-right`
-per page). The new `couldFire` early-exit skips them entirely.
-
-Wall-clock is in the noise, as predicted in the patch brief: this
-moves work from runtime JS to parse-time CSS but the browser still
-does the same cascade + layout work. The value here is **deleting
-the hot spot from the bundle**, not shaving milliseconds.
-
-Smoke render of `book.bat`: 1638 pages, 16.9 MB output (within 54
-bytes of the pre-patch run -- ±27 bytes is the normal run-to-run
-variance from Chrome's `/CreationDate` / `/ModDate` encoding),
-render 45.8 s.
-
-### What's left in `finalizePage`
-
-Two phases, both with clear single-purpose justifications:
-
-```
-Phase A   classify .hasContent per margin cell (per-page DOM)
-Phase B'  offset-fallback for auto-width minmax(%) templates
-          (dead code in this book; live for paged.js compatibility)
-```
-
-For our content Phase B' is dominated by an early `couldFire`
-short-circuit. The method now reads top-to-bottom as "what does the
-runtime *have* to do per page", with all the layered optimizations
-unwound. There's nothing left to hoist.
-
-## Looking past `finalizePage`: where render time goes now
-
-With the `finalizePage` work landed, a fresh `--detach-pages
---time-hooks --cpu-profile` run on 1638 pages (2026-05-19) shows the
-named handlers we hook -- the surface we own -- now account for
-**under 1 ms/page combined**. Per-page handler costs, top of table:
-
-```
-hook::handler                                count  total_ms  per_page_ms
-chunker.afterPageLayout (detach-pages)        1638     788.5        0.481
-chunker.afterPageLayout (#10)                 1638     249.0        0.152
-chunker.renderNode                           44365     185.6        0.113
-chunker.afterPageLayout (#6)                  1638     100.9        0.062
-chunker.finalizePage                          1638      71.8        0.044
-chunker.beforePageLayout                      1638      68.6        0.042
-```
-
-Render is ~49 s on this hardware (~30 ms/page average). Subtracting
-the ~1 ms/page of handler work leaves ~29 ms/page of **paged.js
-core**: chunking, layout probing, overflow detection, and the
-text-break split. The CPU profile attributes that time as follows:
-
-```
-self_ms   self_%   function                     source
-22855     33.0 %   getBoundingClientRect        (native, called from JS)
-19332     27.9 %   (program)                    V8 overhead / idle
- 9931     14.4 %   removeOverflow               paged.browser.js:2196
- 4280      6.2 %   findEndToken                 paged.browser.js:2094
- 2364      3.4 %   findElement                  paged.browser.js:638 (cache hit; cheap)
- 1456      2.1 %   insertBefore                 native
- 1228      1.8 %   createBreakToken             paged.browser.js:1796
-  580      0.8 %   afterPageLayout (paged.js)   paged.browser.js:30381
-```
-
-(Counter-check on the ratio: this run reads **5.59 x** rather than
-the usual ~1.6 x. That's instrumentation skew -- both `--time-hooks`
-and `--cpu-profile` wrap hot paths, and the sampling overhead is
-proportionally larger on later pages. The handler totals and
-self-time table are still accurate; the per-page growth curve isn't
-trustworthy on instrumented runs.)
-
-So 33 % of render is `getBoundingClientRect` and another ~20 % is
-inside `removeOverflow` + `findEndToken` -- paged.js's per-page
-overflow-find + text-split path. That work isn't redundant: each
-page genuinely has to decide where its content ends. The remaining
-opportunities aren't *eliminating* work, they're *replacing the
-algorithm* with something the browser can answer in one call.
-
-### Three places non-redundant work could be made simpler
-
-**1. `Layout.textBreak` -- replace per-word `gBCR` loop with a
-single native call.** [paged.browser.js:2136](docs/lib/paged.browser.js:2136)
-walks an overflowing Range word-by-word, calling
-`getBoundingClientRect` on each `Range` to find which word crosses
-the page boundary; if a word straddles it, it descends letter-by-
-letter doing the same. On a long text node that's dozens to
-hundreds of gBCR calls -- and `textBreak` is the inner loop of
-`findOverflow`, so it fires on every page that overflows.
-
-A single `document.caretPositionFromPoint(x, vEnd)` (or
-`caretRangeFromPoint` on Chromium) returns the exact text node +
-offset at the boundary in **one** browser call. Equivalently,
-`range.getClientRects()` returns every line box of the range in one
-call, after which the crossing line is a simple `.find()`. Either
-replaces `O(words-in-overflow)` layout queries with a single one.
-
-This is the highest-leverage candidate: even if it cuts only half
-of the `gBCR` time, that's ~10 s off render. The risk is fidelity
--- we'd need to verify the substitute gives the *same* split point
-as the word-walk on edge cases (RTL, hyphenated words,
-`white-space: pre`, soft hyphens). Worth a prototype + diff against
-the current bundle's output PDF.
-
-**2. `findOverflow` -- collapse three ancestor walks into one.**
-Inside the per-node loop in
-[paged.browser.js:1934](docs/lib/paged.browser.js:1934):
-
-```js
-const insideTableCell = parentOf(node, "TD", rendered);
-// ...
-tableRow = parentOf(node, "TR", rendered);
-// ...
-const table = parentOf(tableRow, "TABLE", rendered);
-```
-
-Three separate ancestor traversals per node visited, each climbing
-from `node` to `rendered`. One walk that emits the nearest TD/TR/
-TABLE together is ~10 lines and visits each ancestor once. Won't
-match #1 for raw savings (this is in the same loop that's already
-calling `getComputedStyle`, so a single-digit % gain at best) but
-it's the easy follow-up.
-
-**3. Cache `getComputedStyle` per page.** Same loop,
-[paged.browser.js:1969, 1974, 1992](docs/lib/paged.browser.js:1969):
-up to four `getComputedStyle` calls per node visited (on the node,
-its TD ancestor, and the parent TBODY/THEAD). The walker revisits
-the same ancestors across many child nodes; a `WeakMap<Element,
-CSSStyleDeclaration>` populated lazily per page would dedupe.
-
-This one *is* deduplication-shaped, but it's the cheapest of the
-three to land (no algorithmic change, no fidelity risk) and a clean
-follow-up if #1 lands.
-
-### Probable bug worth surfacing separately
-
-[paged.browser.js:1998](docs/lib/paged.browser.js:1998):
-
-```js
-const table = parentOf(tableRow, "TABLE", rendered);
-const rowspan = table.querySelector("[colspan]");
-```
-
-The local is named `rowspan` and the surrounding comment is about
-rowspan-aware break handling, but the selector matches `colspan`.
-Looks like a typo that's silently broken the rowspan path since the
-bundle was vendored. Not a perf issue per se, but worth a separate
-fix.
-
-### Strategic note
-
-Render and generate are now within ~20 s of each other (49 s vs
-70 s on this run). Each second shaved off render moves total by
-less than it used to, because `page.pdf()` is now the larger phase.
-Item 1 above is the only remaining render change that plausibly
-returns 10+ s; items 2 and 3 are <5 s each.
-
-After item 1 the remaining levers all live outside render. The
-Chrome-outline experiment above shows generate isn't moved by
-shifting outline work around (Chrome walking `h1..h6` itself costs
-about what `parseOutline` + `setOutline` save -- net was +1.9 s).
-The one generate-side lever we haven't tried is **`pageRanges`
-sharding** -- run `page.pdf()` N times with disjoint page ranges on
-parallel browser pages and concatenate with pdf-lib. Each shard
-serialises only its slice and they run concurrently, so generate
-collapses to roughly `60 s / N` plus a small concat pass. Listed
-under "What might still be worth trying" above; it's the biggest
-untried knob in the pipeline.
-
-## What happened when we tried item 1
-
-The strategic note above was wrong about item 1 -- the binary-search
-replacement for `textBreak` saves nothing, and the reason it saves
-nothing reveals the actual structure of the remaining render cost.
-
-### Attempt A: binary-search `textBreak`
-
-Replaced the per-word-then-per-letter gBCR cascade in
-[`Layout.textBreak`](docs/lib/paged.browser.js:2136) with a binary
-search over offsets using a single-character probe `Range`.
-It is semantically equivalent (both return the smallest offset whose
-character satisfies `left >= end || top >= vEnd`) and should reduce
-the gBCR call count from O(words) to O(log nodeLength).
-
-Paired runs with `--detach-pages`:
-
-| run        | baseline | binsearch |
-| ---------- | -------- | --------- |
-| render (1) |  47.73 s |  51.43 s  |
-| render (2) |  47.10 s |  47.12 s  |
-| **avg**    | **47.4** | **49.3**  |
-
-Wash, possibly small regression. PDF byte size and page count
-identical. Reverted.
-
-### Attempt B: memoize `Page.create`'s `area.getBoundingClientRect`
-
-The CPU profile of attempt A's baseline pointed at a much bigger
-target. Tracing gBCR's native frames up to their JS callers in the
-profile graph:
-
-```
-caller                           gBCR time
-create:2257                      12,947 ms   (69 %)
-hasOverflow:1925                  4,419 ms   (24 %)
-Layout:1443                         586 ms
-...
-total native gBCR                18,424 ms
-```
-
-[`Page.create`](docs/lib/paged.browser.js:2257) does one
-`area.getBoundingClientRect()` per page, right after the fresh
-`insertBefore` / `appendChild` of the page DOM -- so each call
-forces a synchronous layout pass. The `area`'s size is CSS-driven
-and constant per template, so the gBCR should be cacheable.
-
-Memoized the result on the `pageTemplate` node (first page pays,
-all subsequent same-template pages reuse).
-
-Profile diff (same `--detach-pages --cpu-profile` flags, paired):
-
-| caller            | PRE       | POST      | Δ          |
-| ----------------- | --------- | --------- | ---------- |
-| `create:2257`     | 12,947 ms |      2 ms | **-12,945** |
-| `Layout:1443`     |    586 ms | 13,567 ms | **+12,981** |
-| `hasOverflow:1925`|  4,419 ms |  4,533 ms |    +114    |
-| **total**         | 18,424 ms | 18,554 ms |    +130    |
-
-The cost moved, it didn't disappear. The memoization successfully
-eliminated the gBCR at `create:2257` (from 12,947 ms to 2 ms), but
-the layout flush that gBCR was driving still had to happen
-somewhere -- it migrated to the next call in the per-page sequence,
-[`Layout`'s constructor](docs/lib/paged.browser.js:1443):
-
-```js
-this.bounds = this.element.getBoundingClientRect();
-this.parentBounds = this.element.offsetParent.getBoundingClientRect();
-```
-
-Total gBCR self-time barely changed (+130 ms). Per-page ratio got
-worse (1.77x -> 3.07x), probably because the deferred flush
-accumulated more pending mutations before firing. Reverted.
-
-### The lesson
-
-**gBCR self-time in the profile is layout-flush attribution, not
-JS call overhead.** Reducing the *number* of gBCR calls in a hot
-path saves ~nothing if the layout flush they trigger has to fire
-anyway. The cost lives in the flush itself, which is paged.js
-measuring the live layout tree to decide where to break.
-
-Where the residual per-page layout cost actually comes from, after
-`--detach-pages` has already trimmed completed pages out of the
-layout tree, is probably one of:
-
-- **CSS counters** at
-  [`.pagedjs_pages`](docs/lib/paged.browser.js:27213)
-  (`counter-reset: pages ... footnote ...`). Counter resolution
-  walks the document, and counter-affecting elements per page
-  accumulate even when `display: none`.
-- **`offsetParent` lookup** in `Layout`'s constructor. That's a
-  layout-tree walk to find the nearest positioned ancestor; cost
-  can grow with sibling count even when most siblings are
-  display:none.
-
-Neither is fixable by dedup-shaped optimizations in our bundle.
-
-The remaining `findOverflow` opportunities (items 2 and 3 in the
-strategic note above -- collapsing ancestor walks, caching
-`getComputedStyle`) might still be worth doing on their own
-merits, but they're not where the gBCR time lives.
-
-### Methodology: compare profiles, not wall-clock
-
-Both attempts above showed wall-clock results that looked like
-noise (47.7 vs 47.1 vs 51.4 s -- inside the run-to-run jitter band
-on a busy dev machine). The actual structural change was only
-visible by **diffing the bottom-up gBCR-caller breakdown across
-two CPU profiles**. The `+12,981 ms` move from `create:2257` to
-`Layout:1443` would have been invisible in a wall-clock A/B.
-
-For any future render-stage optimization work, the rule is:
-
-1. Run with `--cpu-profile` (paired pre/post, same flags).
-2. Compare bottom-up self-time tables ([`analyze-profile.mjs`](perf/analyze-profile.mjs))
-   and caller breakdowns ([`find-callers.mjs`](perf/find-callers.mjs);
-   point it at a profile + a callee name to see which frames are
-   paying for that callee's time -- essential for spotting gBCR
-   migration between callers).
-3. Treat the wall-clock totals as a sanity check only -- they
-   confirm "did anything change" but not "where".
-
-This matters because:
-
-- **Render's per-page CPU work is dominated by native (layout,
-  DOM) frames.** V8 self-time deltas from JS-level dedup are
-  small compared to the layout flushes those calls trigger.
-- **CPU sample percentages are stable across machine load.** A
-  busy machine slows the absolute wall-clock but the proportional
-  breakdown (gBCR = ~38 % of render samples) stays the same.
-- **Migrations between attribution sites are common.** Moving a
-  gBCR off one call site usually re-attributes its layout cost to
-  the next caller in the sequence, not to nothing.
-
-For `generate` and `process` the picture is different (Chromium
-internals and pdf-lib parse cost respectively); CPU profiles of
-those phases are less informative because the work happens
-outside the JS we can see, and wall-clock can be a fine
-single-signal A/B. But anything inside paged.js's
-render loop wants a profile diff, not a stopwatch.
-
-## Finding the residual O(n): it's not counters, it's siblings
-
-After the methodology shift to profile-diffing, two more A/Bs
-finally pinned down where the residual per-page layout cost comes
-from. Spoiler: it's not what we expected, and the fix is large.
-
-### Hypothesis 1: CSS counters
-
-The book uses `@bottom-right { content: counter(page); }` for page
-numbers and `article.part-divider { counter-reset: page 0; }` for
-per-part renumbering. paged.js's bundle puts
-`counter-increment: page var(--pagedjs-page-counter-increment);`
-on every `.pagedjs_page`. So on each new page's `@bottom-right`,
-Chromium has to resolve `counter(page)` by walking preceding
-`counter-increment: page` elements.
-
-Per CSS spec (`display: none` elements don't increment counters),
-`--detach-pages`'s `display: none` strategy should already make
-this O(1). But Chromium implementations have historically been
-liberal about which display states still contribute. So: A/B by
-commenting out the `counter-increment: page` rule entirely
-([paged.browser.js:27198](docs/lib/paged.browser.js:27198)) and
-diffing the profile.
-
-Result:
-
-| variant                 | render   | total gBCR | gBCR %/render | ratio |
-| ----------------------- | -------- | ---------- | ------------- | ----- |
-| baseline (counters on)  | 48.51 s  | 18,424 ms  | 38 %          | 1.77x |
-| counters disabled       | 44.72 s  | 21,514 ms  | 48 %          | 2.44x |
-
-Disabling counters did **not** reduce gBCR; it grew. The
-wall-clock drop is run-to-run noise (counter resolution is genuinely
-cheap on `display: none` siblings); the proportional growth means
-removing counter-increment didn't save anything and may have shifted
-work elsewhere. **Counter resolution is not the residual O(n).**
-
-### Hypothesis 2: sibling sweeps over `display: none` pages
-
-Re-reading the README on `--detach-pages`: the claim has always
-been that `display: none` "removes a subtree from the layout tree
-entirely". That's true for *layout* -- but Chromium's per-page
-work also includes **style/selector resolution and rule matching**,
-which walks the sibling list regardless of display state. With
-1638 `.pagedjs_page` siblings under `.pagedjs_pages`, any per-page
-selector evaluation is O(n).
-
-A/B: physically `removeChild` finalized pages instead of just
-`display: none`, then re-append all at `afterRendered` so
-`page.pdf()` sees them. The chunker passes `lastPage.element` to
-`Page.create()` for ordered insertion, so the most recent finalized
-page has to stay in the DOM -- detach one page behind. DOM holds
-at most 2 pages at any moment: the in-flight one being laid out
-plus the most recent finalized one.
-
-Probe modification (in [perf/detach-pages.js](perf/detach-pages.js)),
-not shipped; page numbers come out wrong because `counter(page)`
-doesn't accumulate, but the profile signal is clean.
-
-Result:
-
-| metric              | display:none | removeChild | Δ            |
-| ------------------- | ------------ | ----------- | ------------ |
-| **render**          | **48.5 s**   | **28.0 s**  | **-20.5 s (-42 %)** |
-| total native gBCR   | 18,424 ms    | 7,320 ms    | -11,104 ms   |
-| `create:2257` gBCR  | 12,947 ms    | 1,073 ms    | **-11,874 ms (12x)** |
-| `hasOverflow:1925`  | 4,419 ms     | 5,119 ms    | +700 ms      |
-| `Layout:1443`       | 586 ms       | 562 ms      | flat         |
-| per-page ratio      | 1.77x        | 1.43x       | flatter      |
-
-`Page.create`'s layout flush -- the dominant per-page cost in
-every profile we've seen -- went from 12.9 s to 1.1 s. That's the
-work Chromium does to maintain style/selector state across the
-sibling list, and it's now nearly constant per page. `hasOverflow`
-still has a small residual growth but it's an order of magnitude
-smaller and bounds the next plausible optimization target.
-
-**This is the largest single render-stage win we've found in this
-investigation.** 20+ seconds off render, which now sits at well
-under half of generate's ~60-70 s.
-
-### Shipping it
-
-The probe rendered the right number of pages but the output PDF
-was incorrect in two ways: `counter(page)` doesn't accumulate
-across detached siblings, and the re-attach loop appended pages
-at the end instead of in original order. Both fixable; the
-question was whether named strings (`string(chapter-title)`)
-would survive detach. Verified empirically: they do.
-
-Final shipped change set:
-
-1. **[perf/detach-pages.js](perf/detach-pages.js)** -- rewrite
-   from `display:none` to physical `removeChild`. Keep the most
-   recent finalized page in the DOM (the chunker passes
-   `lastPage.element` to `Page.create` for ordered insertion);
-   detach one page behind. At `afterRendered`, detach the keeper
-   and re-append all in finalize order (which is document order).
-
-2. **[docs/lib/paged.browser.js](docs/lib/paged.browser.js) -- Counters handler.**
-   Track a running display-page counter on the handler instance,
-   increment per page during `afterPageLayout`, and write the
-   value as `--page-num: "N"` on the page wrapper's inline style.
-   On pages with `[data-counter-page-reset]` (the part dividers),
-   skip the increment -- mirrors the shipping behaviour of the
-   pre-existing CSS, where the injected per-page rule's
-   `counter-increment: none` takes effect but the
-   `counter-reset: page N` part doesn't (cascade/specificity
-   issue, not yet diagnosed; behaviour-preserving fix here, the
-   "intended" part-restart numbering would be a separate change).
-
-3. **[docs/assets/css/print.css](docs/assets/css/print.css) +
-   [_site-pdf copy](docs/_site-pdf/assets/css/print.css)** --
-   replace `content: counter(page)` in `@bottom-right` with
-   `content: var(--page-num)`. The CSS custom property approach
-   keeps the existing cascade (suppression on `@page :first` and
-   `@page divider` still works, since those rules override the
-   `content` declaration entirely).
-
-Verification (1638-page book, all sample pages spot-checked
-against the pre-detach output):
-
-- Page count matches (1638).
-- `@bottom-right` page numbers byte-equivalent on every sampled
-  page (1, 2, 5, 6, 10, 100, 500, 1000, 1500, 1638).
-- `@top-right` chapter titles byte-equivalent on every sampled
-  page -- named strings persist through detach.
-
-### Shipped numbers
-
-Profile diff (paired `--detach-pages --cpu-profile` runs):
-
-| metric              | pre (display:none) | post (removeChild) | Δ                    |
-| ------------------- | ------------------ | ------------------ | -------------------- |
-| **render**          | **48.5 s**         | **26.3 s**         | **-22.2 s (-46 %)**  |
-| total native gBCR   | 18,424 ms          | 7,455 ms           | -10,969 ms (-60 %)   |
-| gBCR % / render     | 38 %               | 28 %               | -10 points           |
-| `create:2257` gBCR  | 12,947 ms          | **877 ms**         | **-12,070 ms (15x)** |
-| `hasOverflow:1925`  | 4,419 ms           | 4,590 ms           | flat                 |
-| `Layout:1443`       | 586 ms             | 463 ms             | flat                 |
-| per-page ratio      | 1.77x              | 1.18x              | nearly flat          |
-
-`Page.create`'s layout flush -- the largest single per-page cost
-in every profile we'd seen -- went from 12.9 s to 0.9 s. The
-remaining gBCR work in `hasOverflow` is now the largest layout
-flush, but it's an order of magnitude smaller and only marginally
-super-linear.
-
-### Where this leaves the picture
-
-The full menu of fixes against the original 207 s baseline:
-
-| fix                                 | render saved | total saved | shipped |
-| ----------------------------------- | ------------ | ----------- | ------- |
-| `--detach-pages` (display:none)     |   ~55 s      |   ~55 s     | yes     |
-| `--incremental` PDF update          |    -         |   ~32 s     | yes     |
-| pdf-lib `parseSpeed: Fastest`       |    -         |    ~3 s     | yes     |
-| `finalizePage` micro-optimizations  |    ~3 s      |    ~3 s     | yes     |
-| **aggressive detach (removeChild)** | **~22 s**    | **~22 s**   | **yes** |
-| **skip dead `findEndToken` path**   | **~3.5 s**   | **~3.5 s**  | **yes** |
-| **renderTo additive backoff**       | **~4.25 s**  | **~4.25 s** | **yes** |
-| pageRanges sharding (generate)      |    -         |  10-40 s    | no      |
-
-Render is now ~19 s on a 1638-page book, down from ~104 s in the
-original baseline. The next bottleneck is unambiguously
-`page.pdf()` -- ~60-70 s of Chromium-internal PDF serialisation
-that's only addressable via the `pageRanges` sharding approach
-(run multiple `page.pdf()` calls on disjoint page ranges in
-parallel browsers, concatenate with pdf-lib).
-
-## What happened when we tried `createBreakToken` dedup
-
-With render down to ~26 s, the bottom-up profile points at three
-JS bodies still worth looking at:
-
-```
-findEndToken    self 3270 ms (12.4 %)
-findElement     self 1924 ms ( 7.3 %)
-createBreakToken self  996 ms ( 3.8 %)
-```
-
-### Attempt A: cache `lastChild.lastChild` in `findEndToken`
-
-The descend-to-deepest-valid-descendant loop in
-[`findEndToken`](docs/lib/paged.browser.js:2100) reads
-`lastChild.lastChild` up to three times per iteration (while
-condition, `validNode` check, assignment). Cache once.
-
-Profile diff (paired `--detach-pages --cpu-profile`):
-
-| function         | PRE       | POST      | Δ        |
-| ---------------- | --------- | --------- | -------- |
-| `findEndToken`   | 3269.9 ms | 3108.0 ms | **-162** |
-| `createBreakToken` | 995.8 ms |  964.9 ms | -31      |
-| `findElement`    | 1924.0 ms | 1767.2 ms | -157     |
-
-Real, modest win on `findEndToken` self-time. The `-157` on
-`findElement` is plausibly jitter (the change touches no
-`findElement` call), so the `findEndToken` drop is the only one
-we'd hang our hat on. PDF byte-equivalent on all sampled pages. Shipped.
-
-### Attempt B: dedup `findElement(renderedNode, source)` in `createBreakToken`
-
-In the `!renderedNode` branch of
-[`createBreakToken`](docs/lib/paged.browser.js:1796),
-`findElement(renderedNode, source)` is called once at line 1817
-(inside `if (!temp.nextSibling)`) and again unconditionally at
-line 1830. Hoist + reuse: at most one call per invocation that
-takes this branch.
-
-Profile diff vs the post-Attempt-A baseline:
-
-| edge                                | PRE       | POST      | Δ      |
-| ----------------------------------- | --------- | --------- | ------ |
-| `findElement` self                  | 1767 ms   | 1892 ms   | +125   |
-| `findElement` <- `createBreakToken` | 1232 ms   | 1308 ms   | +76    |
-| `findElement` <- `findEndToken`     |  537 ms   |  580 ms   | +43    |
-
-The change cannot regress (it only ever removes one call), so the
-deltas are jitter, not real cost. The give-away is the
-`findElement <- findEndToken` edge: `findEndToken` wasn't touched
-between the two runs, yet its attributed `findElement` total still
-moved by +43 ms. That fixes the per-edge noise floor at ~40-80 ms
-on this machine, which swallows whatever savings the dedup
-produces.
-
-Read the other way: the `!renderedNode + !temp.nextSibling` branch
-must fire rarely enough that removing one of its two `findElement`
-calls doesn't register above this noise. We don't have call-count
-instrumentation in the cpuprofile to confirm directly (`hitCount`
-is samples-on-stack, not invocations), but a savings below
-noise is functionally indistinguishable from no savings.
-
-Reverted. The lesson echoes Attempt A above (textBreak): if the
-target branch fires rarely, the dedup's correctness is undeniable
-but its effect is unmeasurable.
-
-### Attempt C: skip `findEndToken` when nobody reads its result
-
-`findEndToken` (3.1 s self) was the top remaining JS-body in the
-post-A profile. Both Attempt A (cache the `.lastChild` access) and
-the speculative validNode-caching extension above tried to make
-it *faster*. Wrong question. The bottom-up profile shows where
-cost lives, but a caller breakdown shows *why* it lives there:
-
-```
-findEndToken: self=3108 ms, total=3652 ms
-callers (attributed total ms):
-   3652.19 ms   checkUnderflowAfterResize@paged.browser.js:2502
-```
-
-`findEndToken` is called from exactly one place:
-[`Page.checkUnderflowAfterResize`](docs/lib/paged.browser.js:2503),
-which fires from a `ResizeObserver` whenever the page wrapper
-*shrinks*. That happens on every overflow extraction during
-normal render. The handler computes an `endToken` and hands it to
-`this._onUnderflow(endToken)`. The only live registration of
-`onUnderflow` in the bundle was an empty callback in
-[`Chunker.addPage`](docs/lib/paged.browser.js:3251) with
-commented-out intent (`// page.append(this.source, overflowToken);`).
-The computed endToken was discarded every time.
-
-The fix is subtraction, not optimization: delete the no-op
-registration so `_onUnderflow` stays `undefined` by default, and
-add an early bail in `checkUnderflowAfterResize` so `findEndToken`
-doesn't run when nobody can consume its result. A future caller
-that wants the path back just calls `page.onUnderflow(realFn)` --
-the presence of a non-default handler is itself the activation
-signal, no flag plumbing required.
-
-Profile diff (paired `--detach-pages --cpu-profile`):
-
-| function       | PRE       | POST      | Δ          |
-| -------------- | --------- | --------- | ---------- |
-| `findEndToken` | 3108.0 ms |     0.0 ms | **-3108** |
-| `findElement`  | 1767.2 ms |  1313.8 ms | **-453**  |
-| **render**     | **25.75 s** | **22.26 s** | **-3.49 s (-14%)** |
-
-The `findElement` drop matches the previously-attributed
-`findEndToken → findElement` total-time edge (~537 ms) within
-noise; rest is jitter. PDF byte-equivalent on all sampled pages.
-Shipped.
-
-### Attempt D: skip `Footnotes.afterPageLayout` when no `float: footnote`
-
-After Attempt C the next gBCR caller worth looking at was
-[`Footnotes.afterPageLayout`](docs/lib/paged.browser.js:31477) at
-~1114 ms attributed gBCR. The handler implements the CSS
-`float: footnote` / `@footnote`-margin-box feature; the per-page
-work begins with `noteContent.getBoundingClientRect()`, then
-sets the inner content's `columnWidth`, then constructs a `Layout`
-and runs `findOverflow` on the (for our document, empty)
-`pagedjs_footnote_inner_content`.
-
-Our stylesheet declares `float: footnote` nowhere
-(`grep -r "float: footnote" docs/_site-pdf/`), so the handler's
-`this.footnotes` dict stays `{}` for the whole render and the
-per-page work is in service of nothing. Same shape as Attempt C:
-gate at the top with `if (Object.keys(this.footnotes).length === 0) return;`.
-
-Profile diff (paired `--detach-pages --cpu-profile`):
-
-| metric                          | PRE       | POST      | Δ          |
-| ------------------------------- | --------- | --------- | ---------- |
-| total gBCR (attribution)        | 7925 ms   | 7756 ms   | **-169**   |
-| ↳ Footnotes `afterPageLayout`   | 1114 ms   |    0 ms   | -1114      |
-| ↳ `hasOverflow`                 | 4687 ms   | 4961 ms   | **+274**   |
-| ↳ `create`                      |  913 ms   | 1019 ms   | **+106**   |
-| ↳ `Layout`                      |  446 ms   |  543 ms   | **+97**    |
-| ↳ next-page `afterPageLayout`   |    0 ms   |  431 ms   | **+431**   |
-| **render wall-clock**           | **22.26 s** | **23.14 s** | **+880 ms** |
-| **per-page ratio (last/first)** | **1.50x** | **1.75x** | **worse**  |
-
-Net gBCR reduction is only ~170 ms even though we eliminated 1114 ms
-of attributed gBCR at the Footnotes call site. The missing ~944 ms
-re-attributed to the next gBCR callers in the per-page sequence
-(`hasOverflow`, `create`, `Layout`, and a previously-invisible
-`afterPageLayout` at line 31986). And the per-page ratio went from
-1.50x to 1.75x -- the late pages got *more* expensive, not less.
-
-That ratio regression is the give-away. The Footnotes' small
-gBCR was apparently absorbing pending DOM mutations that, when
-not flushed there, accumulated until the next gBCR (typically a
-larger one) had to flush more state at once. This is the same
-shape as the Page.create memoize trap documented above: removing
-a layout flush at point A makes the flush at point B more
-expensive, and the cost is super-linear in the deferred mutation
-count.
-
-Reverted.
-
-### Attempt E: additive backoff on `renderTo`'s overflow check
-
-After Attempt D the lesson seemed to be "gBCR self-time is
-layout-flush attribution; you can't skip a gBCR without the flush
-migrating." Then re-reading the per-page render loop turned up a
-case the migration framing doesn't actually cover.
-
-[`Layout.renderTo`](docs/lib/paged.browser.js:1478) calls
-`findBreakToken` (→ `findOverflow` → `hasOverflow` → gBCR) when
-the cumulative text length of appended nodes crosses `maxChars`
-(default 1500). The gate looks like batching, but the reset is
-asymmetric:
-
-```js
-if (length >= this.maxChars) {
-  // ... layout hook, await images ...
-  newBreakToken = this.findBreakToken(wrapper, source, bounds, prevBreakToken);
-  if (newBreakToken) {
-    length = 0;                                    // only reset on overflow found
-    this.rebuildTableFromBreakToken(newBreakToken, wrapper);
-  }
-}
-```
-
-When no overflow is found, `length` doesn't reset -- it stays
-above `maxChars` and the very next iteration's appended node
-triggers another `findBreakToken`. The check fires *every
-iteration past `maxChars`* until overflow trips. On a typical
-~3000-char page that's ~30+ findBreakToken calls (each one a
-hasOverflow gBCR = layout flush) before the actual break point.
-
-Replace with **additive backoff**: track a moving baseline
-`lengthAtLastCheck` and only fire the check when `length -
-lengthAtLastCheck >= maxChars`. Advance the baseline when no
-overflow yet; reset both on overflow. Per-page check count drops
-from O(nodes-past-maxChars) to O(page-chars / maxChars), typically
-2-3 instead of 30+.
-
-Correctness rests on findBreakToken handling arbitrary overshoot:
-`findOverflow` walks the wrapper to identify the overflowing
-Range regardless of how much excess was appended past it,
-`removeOverflow` extracts the excess via `extractContents`, and
-`createBreakToken` returns a BreakToken at the right source
-position. The chunker builds a fresh walker from `breakToken.node`
-on the next page, so the trimmed content gets re-laid-out from
-its correct source position. (The `break-inside: avoid` worry --
-that containers with extra trailing content might make different
-break decisions -- turned out to be empirically unfounded.)
-
-Profile diff (paired `--detach-pages --cpu-profile`):
-
-| metric                      | PRE       | POST      | Δ                |
-| --------------------------- | --------- | --------- | ---------------- |
-| **render wall-clock**       | **23.73 s** | **19.48 s** | **-4.25 s (-18 %)** |
-| total gBCR (attribution)    | 8024 ms   | 5705 ms   | -2319 (-29 %)    |
-| ↳ `hasOverflow` gBCR        | 4837 ms   | 2725 ms   | **-2112 (-44 %)** |
-| ↳ `findOverflow` per-node   |  438 ms   |  166 ms   | -272             |
-| ↳ `create` / `Layout` / Footn. | unchanged within jitter                  |
-| `removeOverflow` self       |  457 ms   |  370 ms   | **-87 (improved)** |
-| per-page ratio (last/first) | 1.64x     | 1.60x     | improved         |
-
-No migration: Footnotes (1127 ms), create (955), Layout (534)
-all flat. `removeOverflow` *dropped* despite the over-append
-overshoot concern, because fewer findBreakToken invocations means
-fewer extractContents passes, not larger ones -- the per-call
-overshoot is bounded by maxChars (~1500 chars), small relative to
-page capacity.
-
-Full pdftotext-MD5 match on pages 6, 100, 500, 1000, 1500, 1638.
-Page count 1638. PDF byte size 126 bytes apart (metadata).
-
-Shipped.
-
-### The deeper lesson (a third pattern)
-
-Attempts B and D taught that you can't elide a *single* gBCR
-because the layout flush migrates to the next caller. Attempt E
-shows the framing was too narrow: you can't elide one flush, but
-you can do *fewer total flushes* if you batch observations across
-mutations.
-
-The three working patterns for render perf, distinguished:
-
-- **Reduce per-flush cost**: aggressive-detach (-22 s). Shrink the
-  layout tree by physically removing finalized pages so each
-  remaining flush has less style/selector state to maintain.
-
-- **Reduce flush count**: renderTo additive backoff (-4.25 s).
-  When mutations between observations don't independently need
-  observing, query once per batch instead of per-mutation. The
-  per-flush cost grows slightly with deferred mutations but
-  amortizes well below the linear scan.
-
-- **Delete dead JS**: skip-findEndToken (-3.5 s), Page.create
-  hoisted CSS, etc. Walk up the call chain; if the consumer
-  doesn't read the value, delete the production. Works whenever
-  the JS self-time is genuinely JS, not flush attribution.
-
-What *doesn't* work: try to elide one specific gBCR while
-preserving the mutation pattern around it (Attempts B and D). The
-flush re-attributes to the next gBCR in the per-page sequence,
-which then has to flush a larger backlog -- net wash or
-regression.
-
-The diagnostic question to tell these apart: *what does the
-mutation rhythm look like between consecutive gBCR calls?* If it's
-"mutation, gBCR, mutation, gBCR, ..." (renderTo's per-iteration
-check), batching wins. If it's "one mutation, multiple gBCRs"
-(Page.create memoize, Footnotes skip), each gBCR is on the same
-mutation state and the flush has to happen for the *next*
-mutation regardless of which JS asks.
-
-### Where this leaves the picture
-
-Render is now ~19 s on a 1638-page book, down from ~104 s in the
-original baseline. The JS-body profile after Attempt E:
-
-```
-findElement     self 1373 ms ( 7.1 %)
-createBreakToken self 1027 ms ( 5.3 %)
-removeOverflow  self  370 ms ( 1.9 %)
-afterPageLayout self  239 ms ( 1.2 %)
-```
-
-None of these are individually addressable -- they're load-bearing
-work in the per-page break loop. `findElement` already takes the
-dictionary fast path. `pageRanges` sharding of `generate` (~60-70 s
-of `page.pdf()`) is the only remaining knob with a profile target
-large enough to move the wall-clock total meaningfully, and it
-can't be addressed from a single thread (it requires multiple
-Chromium processes + pdf-lib concatenation).
-
-> [!NOTE]
-> The "`findElement` already takes the dictionary fast path" claim
-> above turned out to be wrong. A re-investigation under puppeteer 25
-> (see "findRef wasn't taking the fast path" below) found 39 % of
-> findRef calls falling through to `doc.querySelector("[data-ref='X']")`
-> because the per-page index wasn't populated for rebuilt ancestors
-> and the source tree never had one at all. Fixing both saves ~2.4 s
-> of render.
-
-## Rebaselining after the puppeteer 22 -> 25 bump
-
-`docs/package.json` was bumped from `puppeteer ^22.x` to `^25.0.4`,
-which pulled in a newer bundled Chromium. Same harness, same book
-(now 1651 pages after a small content addition vs the 1638 the
-prior baseline measured), `--detach-pages --cpu-profile`:
-
-| Phase    | Prior (puppeteer 22, post-Attempt-E) | New (puppeteer 25) | Δ |
-| -------- | ------------------------------------ | ------------------ | --- |
-| render   | ~19 s   | 22.0 s | flat (run-to-run noise) |
-| generate | ~60-70 s | **42.7 s** | **-20 to -28 s** |
-| process  | ~5 s    | 4.9 s | flat |
-| **total**| ~95-100 s | **69.6 s** | **-25 to -35 s** |
-| raw Chrome PDF size | 52 MB | **39.3 MB** | -12 MB |
-| render ratio (last/first quarter) | 1.60x | 1.36x | flatter |
-
-The whole wall-clock win is in `generate`. Chrome's PDF writer got
-meaningfully faster, and is now emitting something more compact --
-a 25 % drop in the raw byte stream that previously needed pdf-lib's
-re-emit pass to shrink. The "Chromium `Page.printToPDF` knob survey"
-above noted Skia wrote streams uncompressed; whatever changed at
-the SkPDF level closes part of that gap automatically. The
-final PDF after pdf-lib's `save()` is still ~17 MB either way --
-the re-emit's deflate step was already doing most of the work.
-
-Render itself is unchanged in shape. The same hot paths
-(`hasOverflow`, `Footnotes.afterPageLayout`, `Page.create`,
-`findRef`) sit at roughly the same self-times. Nothing that was
-cheap got expensive; nothing that was expensive got cheap.
-
-Notable side-effect: with `generate` no longer dominating, the
-strategic note at the end of "Where this leaves the picture" above
-("`pageRanges` sharding of `generate` is the only remaining knob
-with a profile target large enough to move the wall-clock total
-meaningfully") is now less true. The shard target shrank from
-~60 s to ~43 s, so the upper bound on what sharding can save
-shrank with it. Still the biggest untried knob, but the urgency
-is lower.
-
-The re-baselined bottom-up render profile also surfaced something
-that *was* always there but had been mis-attributed: see the next
-section.
-
-## findRef wasn't taking the fast path
-
-The new-baseline cpu profile's top entries:
-
-```
-   self_ms   self_%   function  @  source
-   5872.93   26.84%   (program)             (V8/Blink internal)
-   4831.83   22.08%   getBoundingClientRect (native)
-   2530.25   11.56%   findRef               paged.browser.js:643
-   2426.14   11.09%   removeChild           (native, called by detach-pages)
-   1007.64    4.60%   (idle)
-    565.17    2.58%   removeOverflow
-```
-
-`findRef` at **11.6 % of render self-time** is the largest JS
-bucket, behind only native gBCR. The prior README state's "JS-body
-profile after Attempt E" reported `findElement self 1373 ms (7.1 %)`
-and concluded `findElement` was already fast. Both numbers refer
-to the same call chain -- V8 just attributes time differently
-between the two-line forwarder and its called helper:
-
-```js
-function findElement(node, doc, forceQuery) {
-    const ref = node.getAttribute("data-ref");
-    return findRef(ref, doc, forceQuery);
-}
-
-function findRef(ref, doc, forceQuery) {
-    if (!forceQuery && doc.indexOfRefs && doc.indexOfRefs[ref]) {
-        return doc.indexOfRefs[ref];                              // fast
-    } else {
-        return doc.querySelector(`[data-ref='${ref}']`);          // slow
-    }
-}
-```
-
-The "post-Attempt-E" profile's `findElement` charge was its
-forwarder cost; the actual body work has always been inside
-`findRef`. The new V8 profile splits the attribution honestly,
-with `findElement` reading `self=0.00 ms` and `findRef` carrying
-the 2.5 s.
-
-### Instrumenting per-branch call counts
-
-Wrapped `findRef` with counters keyed by which branch it took:
-fast-path (dict hit), `forceQuery` (caller explicitly asked for
-querySelector), `noDict` (the doc didn't have `indexOfRefs` at all),
-and `dictMiss` (the doc had a dict but no entry for the ref). The
-caller of each branch was captured from `new Error().stack`.
-
-A single instrumented run on the 1651-page book:
-
-```
-findRef.calls         = 47,867
-findRef.fastPath      = 29,300   (8.4 ms total, 0.29 us/call)
-findRef.fallback total = 18,567  (2585.5 ms total)
-  forceQuery          =      2
-  noDict              =  2,739
-  dictMiss            = 15,826
-  fallbackReturnedNull =    892
-
-byCallerLine (top, all attributed to docs/lib/paged.browser.js):
-   15,767  dictMiss   <- Layout.append, `findElement(node.parentNode, dest)`
-      955  noDict     <- Layout.append, same call
-      892  noDict     <- Layout.append, `findElement(node.parentNode, fragment)`
-      848  noDict     <- Layout.createBreakToken, `findElement(*, source)`
-       58  dictMiss   <- Layout.createBreakToken (an `*, rendered` site)
-       42  noDict     <- Layout.createBreakToken, another `*, source` site
-        2  forceQuery <- Layout.rebuildTableFromBreakToken
-```
-
-The fast path is essentially free (0.29 us/call -- a hashed object
-lookup). **The entire 2.5 s lives in the 18,567 fallback calls**.
-Two structural reasons:
-
-### Root cause 1: rebuilt ancestors aren't indexed in `dest`
-
-`Layout.append(node, dest, ...)` writes each leaf clone into
-`dest.indexOfRefs` near the end of the function. But when the
-leaf's parent isn't already in `dest`, `append` calls
-`rebuildAncestors(node)` to clone the source ancestor chain into
-a fresh `DocumentFragment` and appends the fragment to `dest`:
-
-```js
-let fragment = rebuildAncestors(node);
-parent = findElement(node.parentNode, fragment);
-// ... attach clone ...
-dest.appendChild(fragment);   // <-- ancestors now live in dest's DOM
-                              //     but dest.indexOfRefs wasn't updated
-```
-
-The rebuilt ancestors are now in `dest`'s DOM tree, findable by
-`dest.querySelector("[data-ref='X']")`. They are **not** in
-`dest.indexOfRefs`. Every subsequent `append` whose `node`
-descends from one of those rebuilt ancestors hits dictMiss on
-that ancestor and falls through to `dest.querySelector`. With
-~15.7 k such calls per book at ~140 us each -- a small per-page
-wrapper, so querySelector is fast even when it walks -- that's
-about 2.2 s.
-
-The 892 `noDict <- Layout.append, findElement(*, fragment)` calls
-in the byCallerLine table are a related symptom: the second
-`findElement` call inside the rebuild branch -- which looks the
-parent up in the *fragment* before it gets appended to `dest` --
-hits a fragment whose `indexOfRefs` was never created.
-
-### Root cause 2: the source tree never has an index
-
-Six call sites in `Layout.createBreakToken` use
-`findElement(*, source)` to map a rendered node back to its
-position in the original document. `source` is the
-`ContentParser`-wrapped result of the initial document walk in
-`ContentParser.addRefs` -- which walks every element, assigns a
-`data-ref`, and **stops**. No `indexOfRefs` is ever populated.
-Every `findElement(*, source)` therefore falls through to
-`source.querySelector("[data-ref='X']")` against the whole
-~10 k-element source tree.
-
-There are only ~890 such calls per render (they only fire on
-pages where the break landed mid-element), but at ~1.3 ms each
-that's ~1.2 s.
-
-### The fix
-
-Three small patches in `docs/lib/paged.browser.js`, all marked
-`// [PATCH: findRef fast-path]`:
-
-1. **`rebuildAncestors`** -- initialise `fragment.indexOfRefs = {}`
-   at the top, and write each rebuilt clone into it as the loop
-   builds the chain. The second `findElement(*, fragment)` call in
-   `Layout.append`'s rebuild branch then hits the fast path.
-
-2. **`Layout.append`'s rebuild branch** -- after
-   `dest.appendChild(fragment)`, merge `fragment.indexOfRefs` into
-   `dest.indexOfRefs`. Subsequent `findElement(*, dest)` calls on
-   any rebuilt ancestor now hit the fast path too.
-
-3. **`ContentParser.addRefs`** -- initialise `content.indexOfRefs = {}`
-   on entry and write `content.indexOfRefs[ref] = node` inside the
-   tree-walk loop. Every `findElement(*, source)` call site now hits
-   the fast path.
-
-### Results
-
-Instrumented A/B (call counts pre/post on the same 1651-page book):
-
-| metric | pre-fix | post-fix | Δ |
-| ------ | ------- | -------- | --- |
-| findRef calls (total) | 47,867 | 47,867 | (same; this is a per-call cost change, not a count change) |
-| fast path | 29,300 | **46,914** | **+17,614** |
-| fallback total calls | 18,567 | **953** | **-17,614 (-95 %)** |
-| dictMiss | 15,826 | 59 | -15,767 |
-| noDict (`findElement(*, fragment)` in rebuild branch) | 892 | 0 | -892 |
-| noDict (createBreakToken vs source) | 848 + 42 | 0 + 0 | -890 |
-| fallback total time | 2,585 ms | **6.9 ms** | **-2,578 ms** |
-| fallbackReturnedNull | 892 | 892 | unchanged (these are the genuine "no such ref" misses) |
-
-The 892 residual fallbacks are all `findElement(node.parentNode, dest)`
-on a *fresh* per-page `dest` whose dict was just created and only
-contains its own leaf clones, so the parent lookup correctly returns
-null (the parent's first appearance on this page will be in the
-next call's rebuilt fragment). 7 ms total; not worth a fourth patch.
-
-Wall-clock A/B, paired runs, no instrumentation, no cpu-profile
-(stash the fix, run twice; pop, run twice):
-
-| run | BEFORE render | AFTER render |
-| --- | --- | --- |
-| 1 | 20.73 s | 18.17 s |
-| 2 | 20.54 s | 18.22 s |
-| **avg** | **20.64 s** | **18.20 s** |
-
-**Δ = -2.44 s render (-12 %).**
-
-Profile diff (`--detach-pages --cpu-profile`, single run each --
-between-run noise on cpu-profile self-time is in the 50-150 ms band
-for sub-1 % rows):
-
-| function | PRE | POST | Δ |
-| --- | --- | --- | --- |
-| `findRef`   | 2530 ms (11.56 %) | undetectable (<130 ms) | **-2400 ms** |
-| `findElement` self | 0 ms (forwarder) | 0 ms | unchanged |
-| `addRefs`  | not in top 20 | **157 ms (0.80 %)** | +157 ms (new dict-population cost) |
-| `removeChild` (detach handler) | 2426 ms | 2320 ms | -106 ms (noise) |
-| `getBoundingClientRect` | 4832 ms | 4632 ms | -200 ms (noise) |
-| total render | 22.0 s | 19.8 s | -2.2 s |
-
-PDF byte size is 16-47 bytes apart between any two runs (well inside
-the standard `/CreationDate` / `/ModDate` timestamp drift); content
-is functionally byte-identical.
-
-Shipped.
-
-### Was it the headers/footers change?
-
-A reasonable initial hypothesis was that the recent
-"Get the details of page headers/footers out of paged.js"
-(`c70b83d`) or its precursor "Add the part name as a prefix to
-the page number" (`71aea3d`) had introduced the cost. Neither
-did:
-
-- `71aea3d` added a per-page
-  `pageElement.querySelector("article.part-divider")` in the
-  Counters handler, which would have shown up as extra querySelector
-  work, but it's unrelated to `findRef`'s call path.
-- `c70b83d` removed that querySelector again, moving the part-title
-  capture from per-page JS to a CSS `string-set` / `string()` rule.
-  Net per-page work went *down*, not up.
-
-`findRef`'s slow path was always there -- the README's prior
-post-Attempt-E profile reported the same call chain as
-`findElement self 1373 ms (7.1 %)`. Two things happened to make it
-worth a fresh look:
-
-- **V8's attribution split.** The new V8 charges `findElement` 0 ms
-  and `findRef` 2530 ms instead of attributing the helper's body
-  to its forwarder. Same call chain, different bucket label, much
-  more visible in the bottom-up view.
-- **The cost itself may have grown.** 1.4 s → 2.5 s is more than a
-  V8 attribution shift can explain on a +0.8 % content change. The
-  branch counters above don't tell us the pre-puppeteer-25 split;
-  the most we can claim is "the fallback was clearly the dominant
-  branch by the time we measured." Either way, the fix removes it.
-
-### Methodology
-
-This one had two of the recurring lessons baked in:
-
-1. **Instrument to understand the workload, not just the time.**
-   The CPU profile showed `findRef` at 2.5 s self-time; that's
-   *what*. It needed branch-counting (fast-path vs dictMiss vs
-   noDict, with caller attribution) to find out *why*. Wall-clock
-   A/B alone would have detected the regression; only the per-branch
-   counters explained it.
-
-2. **`new Error().stack` is the cheap way to attribute hot-function
-   calls back to their callers in-browser**, when you can't
-   instrument the call sites individually. The harness already had
-   `find-callers.mjs` for post-hoc cpu-profile attribution, but
-   that aggregates by sample, not by call. Per-call attribution
-   needed the in-page stack walk. Cost ~5 us per call, OK for
-   1-shot diagnostic runs, not OK to ship.
-
-## Where this leaves the picture
-
-Updated cumulative table, all measured against the original 207 s
-puppeteer-22 baseline:
-
-| fix                                 | render saved | total saved | shipped |
-| ----------------------------------- | ------------ | ----------- | ------- |
-| `--detach-pages` (display:none)     |   ~55 s      |   ~55 s     | yes     |
-| `--incremental` PDF update          |    -         |   ~32 s     | yes     |
-| pdf-lib `parseSpeed: Fastest`       |    -         |    ~3 s     | yes     |
-| `finalizePage` micro-optimizations  |    ~3 s      |    ~3 s     | yes     |
-| aggressive detach (`removeChild`)   |   ~22 s      |   ~22 s     | yes     |
-| skip dead `findEndToken` path       |   ~3.5 s     |   ~3.5 s    | yes     |
-| `renderTo` additive backoff         |   ~4.25 s    |   ~4.25 s   | yes     |
-| **puppeteer 22 -> 25 (Chromium bump)** | **-**     | **~20-30 s** *(generate)* | **yes** |
-| **findRef fast-path** (this section) | **~2.4 s** | **~2.4 s**  | **yes** |
-| `pageRanges` sharding (generate)    |    -         |  ~5-20 s    | no      |
-
-Current end-to-end on the 1651-page book, `book.bat` path:
-
-```
-render   :  ~18 s    (was ~104 s in the original baseline)
-generate :  ~43-48 s (was ~64 s; mostly the puppeteer 25 bump)
-process  :  ~5 s
-total    : ~70 s     (was ~207 s, a 3x speedup)
-```
-
-The remaining JS-body profile after the findRef fix:
-
-```
-self_ms   self_%   function                    source
-  ~500    ~2.5 %   removeOverflow              paged.browser.js
-  ~320    ~1.6 %   wrapContent
-  ~200    ~1.0 %   afterPageLayout (paged.js)
-  ~187    ~1.0 %   afterPageLayout (Footnotes)
-  ~157    ~0.8 %   addRefs                     (new -- the fix above)
-  ~130    ~0.7 %   renderTo
-```
-
-None of those individually clear the noise band; the largest
-remaining JS-body bucket is the same scale as the `addRefs` cost
-we just added. Native frames (`getBoundingClientRect` ~23 %,
-`(program)` ~30 %, `removeChild` ~12 %) are now the dominant
-contributors to render, and gBCR's caller breakdown is the same
-flat-per-page shape it's had since aggressive detach landed.
-
-The single biggest untried lever remains `pageRanges` sharding for
-generate. After the puppeteer 25 bump it would save less than the
-earlier estimate (the 64 s -> 43 s gain made the target smaller),
-but it's still the only knob with a profile target large enough to
-move the wall-clock total by 5+ s.
-
-## Can we make `removeChild` cheaper?
-
-After the findRef fix, `removeChild` sits at ~12 % of render
-self-time. The detach-pages handler attribution is clean -- 1651
-detaches for 1651 pages, exactly one per page, with the only
-other removeChild callers being `filterTree` at startup (9,192
-ignorable-text-node strips totalling 2.3 ms; not a hot path).
-
-Per-call cost on the 1651-page book, with `Element.prototype.removeChild`
-wrapped to measure each call:
-
-```
-[instrument] page-detach avg:      1.009 ms/call
-[instrument] page-detach median:   0.900 ms/call
-[instrument] page-detach p90:      2.000 ms/call
-[instrument] page-detach p99:      3.000 ms/call
-[instrument] avg descendants/page: 147.7
-```
-
-That's ~5-7 us per descendant LayoutObject torn down, multiplied
-by ~150 descendants per page, multiplied by ~1651 pages = ~1.7 s
-total. The distribution is tight and scales linearly with
-descendant count -- this looks like ordinary Blink teardown work
-rather than a pathological slow path.
-
-To verify, two structural variants were both tested on the same
-instrumentation harness:
-
-### Variant B: graveyard DocumentFragment
-
-Replace `parent.removeChild(page)` with
-`graveyard.appendChild(page)`, where `graveyard` is a fresh
-`DocumentFragment` held by the handler. Hypothesis: the
-move-to-out-of-document-fragment path might skip some
-LayoutObject teardown work because the destination is itself
-disconnected.
-
-| metric | A (removeChild) | B (graveyard) |
-| ------ | --------------- | ------------- |
-| avg per call | **1.009 ms** | 1.082 ms (+7 %) |
-| median | 0.900 ms | 0.900 ms |
-| p90 | 2.000 ms | 2.200 ms |
-| p99 | 3.000 ms | 3.100 ms |
-| total page wall | 1666 ms | 1785 ms |
-| render wall-clock | ~16.1 s | ~15.2 s (run-to-run noise) |
-
-The graveyard move is **slightly slower** per call. Blink tears
-down the LayoutObjects regardless of where the node lands; there's
-no fast-path for "moved to a detached parent". No win.
-
-### Variant C: `contain: layout style` on `.pagedjs_page`
-
-Inject `<style>.pagedjs_page { contain: layout style; }</style>`
-into the document before render. Hypothesis: removing a contained
-subtree might skip style/layout invalidation propagation because
-Blink already knows the subtree didn't influence its siblings or
-parent.
-
-Also tested `contain: strict` (which adds `paint` and `size`
-containment -- pages already have explicit dimensions via @page
-CSS so this is safe).
-
-| metric | A (no contain) | C (layout style) | C-strict |
-| ------ | -------------- | ---------------- | -------- |
-| avg per call | **1.009 ms** | 1.017 ms | 0.991 ms |
-| median | 0.900 ms | 0.900 ms | 0.900 ms |
-| p90 | 2.000 ms | 1.900 ms | 1.900 ms |
-| total page wall | 1666 ms | 1678 ms | 1634 ms |
-| render wall-clock | ~16.1 s | ~15.0 s | ~14.8 s |
-
-All four runs are within ~5 % of each other on per-call cost --
-well inside the run-to-run noise band. Containment doesn't unlock
-a faster removeChild path either.
-
-### Conclusion (variants B + C)
-
-The 1.7 s of `removeChild` is intrinsic Blink LayoutObject
-teardown work. The math checks out at ~5-7 us per descendant ×
-~150 descendants × 1651 pages, and three different framings
-(plain removeChild, move-to-fragment, contain + removeChild) all
-land within ~10 % of each other. The destination of the move and
-the containment metadata don't change Blink's teardown rate.
-
-The one thing we *don't* do is "remove less per page" -- removing
-a page's content as N individual leaf removals would be strictly
-worse (N × overhead instead of 1 × overhead, same teardown total).
-Each removeChild call carries DOM-mutation, style-invalidation,
-and notify overhead beyond the per-descendant cost, so consolidating
-to one removal per page is already the optimal framing.
-
-### Variant D: don't detach at all, just `contain: strict`
-
-A natural follow-up: if the per-page cost of having siblings
-around really comes from style/selector traversal, maybe Blink
-will skip a *contained* sibling subtree even when it can't skip
-a `display: none` one. Containment is a stronger signal -- it
-explicitly tells the engine "no observable interaction crosses
-this boundary" -- so the renderer ought to be able to short-circuit
-sibling-walks more aggressively.
-
-Implementation: replace the detach handler with one that sets
-`pageElement.style.contain = 'strict'` at finalizePage and clears
-the property for every page at afterRendered (so `page.pdf()`
-serializes the right paint state).
-
-Result:
-
-| metric | current detach | variant D (contain:strict, no detach) |
-| ------ | -------------- | --------------------------------------- |
-| **render wall-clock** | **~16 s** | **89.3 s** |
-| `Page.create` gBCR | ~764 ms | **31,142 ms** |
-| `hasOverflow` gBCR | ~2,478 ms | 10,922 ms |
-| total gBCR | ~4,832 ms | 45,413 ms |
-| per-page ratio (last/first) | 1.36x | 4.11x |
-
-Worse than the README's display:none baseline (`Page.create`
-gBCR 12,947 ms / render 48.5 s). Containment metadata adds work
-to per-sibling evaluation rather than removing it. **Definitive
-no.** Containment is a hint about what's inside the box; it
-doesn't make the box invisible to neighbours.
-
-### Variant E: empty the wrapper, leave it in place
-
-A second framing of the same idea: keep the page wrapper as a
-sibling, but move its children to a stash so the wrapper itself
-is a leaf (no descendants for Blink to walk through). Restore
-the children at afterRendered. This isolates the "what costs
-what" question: does sibling-walk cost depend on descendant
-count, or just on sibling count?
-
-Implementation: at finalizePage, for the previous-finalized page
-(one behind, mirroring the keep-one-back pattern), move each
-child into an array via `wrapper.removeChild(wrapper.firstChild)`,
-set `min-height: 297mm` so the wrapper still occupies its slot,
-and stash the children. At afterRendered, restore.
-
-Result:
-
-| metric | current detach | variant E (empty wrapper) |
-| ------ | -------------- | --------------------------- |
-| **render wall-clock** | **~16 s** | **21.9 s** |
-| `Page.create` gBCR | ~764 ms | 2,628 ms (+1,864) |
-| `hasOverflow` gBCR | ~2,478 ms | 5,024 ms (+2,546) |
-| `Layout` gBCR | ~294 ms | 937 ms |
-| total gBCR | ~4,832 ms | **10,127 ms (+5,295)** |
-| `removeChild` self | 2,426 ms | **854 ms (-1,572)** |
-| per-page ratio (last/first) | 1.36x | 2.93x |
-
-The removeChild *savings* are real -- with no wrapper to tear
-down, just ~150 child removals per page at sub-microsecond each.
-But the gBCR *cost* roughly doubles because the wrappers are
-still siblings, and gBCR firings have to walk them. Net is +5 s
-render, *worse* than the current detach.
-
-This experiment yields a clean cost-model decomposition. Pulling
-the gBCR deltas apart against the wrapper-vs-content split:
-
-```
-display:none baseline (full content):       gBCR(Page.create) ≈ 12,947 ms
-variant E (empty wrappers, n=1651):         gBCR(Page.create) ≈  2,628 ms
-current detach (no siblings):               gBCR(Page.create) ≈    764 ms
-```
-
-Subtracting:
-
-- (variant E - current detach) = 1,864 ms for 1,651 sibling wrappers
-  → ~1.1 us per wrapper-sibling per `Page.create` gBCR call
-- (display:none - variant E) = 10,319 ms for 1,651 × 150 ≈
-  247,650 sibling descendants
-  → ~42 us per sibling-descendant per `Page.create` gBCR call
-
-Both wrappers and their descendants contribute to the per-call
-cost. Removing the descendants helps -- variant E really is
-substantially cheaper than display:none -- but the wrapper cost
-alone is enough to lose. To zero out both contributions you have
-to take both the wrapper and its descendants out of the sibling
-list, which is exactly what the current detach does.
-
-### Variant F: `content-visibility: hidden`, no detach
-
-The CSS spec's `content-visibility: hidden` is the closest
-property to "freeze in place without disposing" -- per spec,
-rendering work is "skipped" but cached state is preserved for
-cheap restoration. Conceptually nearer to a freeze than
-`display: none` or `contain: strict` were.
-
-Implementation: at finalizePage, set
-`pageElement.style.contentVisibility = 'hidden'` and
-`containIntrinsicSize = '210mm 297mm'` (the size hint Blink uses
-when content-visibility skips a subtree). At afterRendered,
-clear both.
-
-Result:
-
-| metric | current detach | variant F (cv:hidden) |
-| ------ | -------------- | ----------------------- |
-| **render wall-clock** | **~16 s** | **95.2 s** |
-| `Page.create` gBCR | ~764 ms | **29,656 ms** |
-| `hasOverflow` gBCR | ~2,478 ms | 17,558 ms |
-| total gBCR | ~4,832 ms | 52,899 ms |
-| per-page ratio (last/first) | 1.36x | 5.12x |
-
-Worse than every other variant. The spec's "skip rendering work"
-clause covers painting and composition; it does **not** make the
-subtree invisible to sibling-walks during style and selector
-matching that gBCR forces. Three "leave in place" properties
-(`display: none`, `contain: strict`, `content-visibility: hidden`)
-have now been tested and none of them short-circuit the
-sibling-walk.
-
-### Conclusion across all six variants
-
-| variant | render | net vs current |
-| ------- | ------ | -------------- |
-| A current (removeChild, no contain) | ~16.1 s | (baseline) |
-| B graveyard fragment | ~15.2 s | flat (noise) |
-| C `contain: layout style` + removeChild | ~15.0 s | flat (noise) |
-| C-strict `contain: strict` + removeChild | ~14.8 s | flat (noise) |
-| **D `contain: strict`, no detach** | **89.3 s** | **+73 s** |
-| **E empty wrappers, no detach** | **21.9 s** | **+5.9 s** |
-| **F `content-visibility: hidden`, no detach** | **95.2 s** | **+79 s** |
-
-The flat band (A/B/C/C-strict) is the cost-of-doing-business --
-~1 ms × 1651 pages = ~1.7 s of intrinsic Blink LayoutObject
-teardown. Variations on the framing don't move it. The
-catastrophic band (D, E, F) confirms that any path where the page
-wrapper stays in the live sibling list pays meaningfully more
-than the teardown cost would have been -- ~1.1 us per
-wrapper-sibling × 1651 wrappers × several gBCR call sites per
-page comes out to several seconds of extra render even when the
-wrapper is otherwise empty and contained.
-
-The 1.7 s is the bill we pay for shrinking the live DOM from
-~150 × 1651 ≈ 250k nodes back down to 2 pages' worth (in-flight page +
-keeper), which is what kept `Page.create`'s gBCR flat per page
-(see "Hypothesis 2: sibling sweeps over `display: none` pages"
-above). Net savings vs the display:none variant was ~22 s render;
-the 1.7 s removeChild cost is roughly 8 % of that win paid back
-to Blink for cleanup. Worth keeping.
-
-### Aside: it's not GC, and JS references don't help
-
-A reasonable follow-up question to all of this is "can we just
-hold a reference to the detached children to avoid disposal,
-or turn off GC to skip the cleanup?" Neither applies to what
-we're measuring.
-
-Chromium maintains two trees:
-
-- **DOM tree** -- `Node` objects, JS-visible, referenceable.
-- **Render tree** -- `LayoutObject` / `LayoutBox` / `LayoutText`
-  etc., Blink-internal, NOT JS-visible.
-
-`removeChild` keeps the DOM Node alive (JS reference holders --
-including the handler's `this._detached` array -- prevent
-collection). But the corresponding LayoutObject in the render
-tree is **destroyed immediately**, synchronously, at the
-removeChild call. Re-attaching via appendChild later builds a
-new LayoutObject from scratch.
-
-There is no JS-level API to keep a LayoutObject alive across
-detach + reattach. Holding DOM references doesn't change the
-render-tree lifecycle. The 1.7 s lives entirely in
-LayoutObject teardown -- which is Blink-internal C++ work
-attributed to the `removeChild` native frame in the profile,
-not to GC.
-
-V8's GC is a separate concern and isn't the bottleneck. The
-profile reads:
-
-```
-   self_ms   self_%   function
-    195.21    0.89%   (garbage collector)
-```
-
-~200 ms over a ~22 s render. Even if it could be disabled
-(it can't -- Node would OOM), it would barely register.
-
-The asymmetry between variants B and E makes this concrete.
-Variant B (graveyard fragment) moves the page from
-`.pagedjs_pages` to a detached DocumentFragment; variant E
-(empty wrapper) keeps the page in `.pagedjs_pages` but moves
-its children out. The fragment-move path *does* trigger
-LayoutObject teardown (you can see the 1.08 ms / call in
-variant B's instrumentation) even though the DOM Node lives on
-in a JS-visible fragment -- because the destination is itself
-not attached to the document, so there's no live render-tree
-parent. Conversely, variant E's wrapper stays in
-`.pagedjs_pages` with a live LayoutObject the whole time, so
-the wrapper's render-tree slot doesn't get torn down; only
-its child LayoutObjects do (as the children move out). The
-"keep render objects alive" idea would have to mean keeping
-the wrapper in `.pagedjs_pages` with all its children, which
-is the display:none baseline -- ~48 s render.
-
-The trade-off is therefore not "keep things alive vs. let GC
-collect them"; it's "be a live render-tree sibling vs. not".
-Anything that keeps the wrapper as a live sibling pays the
-~1.1 us per wrapper-sibling per gBCR call shown above, and the
-gBCR firings compound that into seconds across 1651 pages.
-
-## Chasing the residual `(idle)` to requestAnimationFrame
-
-A second axis of the same investigation. The post-findRef-fix
-profile showed `(idle) 735 ms (4.6 %)` -- not huge, but non-zero
-and worth understanding. `(idle)` in a V8 CPU profile means
-samples taken while the main thread had nothing scheduled --
-waiting on async/await, microtask queue settling, requestAnimationFrame
-ticks, or other browser-internal yields.
-
-### Hypothesis 1: microtask boundaries from `await Hook.trigger(...)`
-
-The chunker's per-page loop has 5-6 `await this.hooks.X.trigger(...)`
-calls per page. `Hook.trigger()` wraps every sync handler in a fresh
-Promise and returns `Promise.all(promises)`, so the caller always
-awaits a thenable -- a microtask boundary per await even when every
-handler resolved synchronously. 5 boundaries × 1651 pages ≈ 8,255
-yields; if each yield is ~85 us in V8 it lines up with the 735 ms.
-
-Patched it: `Hook.trigger()` returns `undefined` when no handler
-returned a thenable, callers do
-`let p = hook.trigger(...); if (p) await p;` to skip the await on
-the sync fast path. Patched at six hot per-page sites (3 in
-`chunker.layout`, 3 in `chunker.handleBreaks`).
-
-Result: render went **up** by ~0.35 s on a 2-run paired A/B
-(14.57 s -> 14.92 s avg). `(idle)` in the profile went **up too**
-(735 ms -> 1223 ms in absolute terms). Microtask boundaries are
-~30 us each at the JIT level; the V8 sampler at 1 ms intervals
-hardly catches them, so they show up as `(program)` rather than
-`(idle)`. The patch shaved microtask scheduling cost in the
-single-digit percent range but added a branch on every Hook.trigger
-call -- net wash, slight regression. **Reverted.**
-
-### Hypothesis 2: ResizeObserver firing per page
-
-Per page, `Page.addResizeObserver` creates a fresh `ResizeObserver`
-that fires its callback asynchronously from the compositor thread
-back to main. The callback wraps work in `requestAnimationFrame`,
-so each RO firing schedules a frame-tick wait. 1651 pages × ~0.5 ms
-per RO-rAF round-trip ≈ ~800 ms. Plausible.
-
-Two-step probe:
-1. **Skip the rAF wrap inside the RO callback**, run synchronously.
-   Result: `(idle) 902 ms`. No improvement, possibly slightly worse.
-2. **Disable the ResizeObserver entirely** (early-return in
-   `addResizeObserver`). Result: `(idle) 1,074 ms`. Still no
-   improvement.
-
-Neither helped. The RO isn't the source -- the per-page
-`addResizeObserver` overhead is real, but it doesn't show up in
-the `(idle)` bucket. Restored upstream behaviour.
-
-### Hypothesis 3: the chunker's `Queue.tick` is `requestAnimationFrame`
-
-The chunker drives its per-page work through a `Queue` class
-(`paged.browser.js:2666`). The queue's constructor sets:
-
-```js
-this.tick = requestAnimationFrame;
-```
-
-and `Queue.run()` schedules each iteration via
-`this.tick.call(window, () => { ... });`. Chunker's `render()`
-loops over `this.q.enqueue(() => this.renderAsync(renderer))`
-once per page. Every per-page iteration therefore waits one rAF
-tick before processing.
-
-`requestAnimationFrame` waits for the next animation frame. In
-headless puppeteer with no display, rAF still delivers callbacks
-on a regular cadence (Chromium's headless mode default is around
-60 Hz off-screen / ~16 ms per frame, with the scheduler often
-batching tighter than that). Either way, per-page rAF waits
-across 1651 pages add up to several hundred milliseconds of pure
-main-thread idle.
-
-The fix is one line:
-
-```js
-this.tick = (cb) => queueMicrotask(cb);
-```
-
-`queueMicrotask` schedules the callback on the microtask queue --
-runs before returning to the event loop, microsecond-scale latency
-instead of millisecond-scale. The `Queue` doesn't depend on rAF
-semantics (no paint coordination, no frame-budget yielding --
-it's just a serializer that wants to run tasks back-to-back).
-
-Verification (paired 2-run A/B, `--detach-pages`, no
-instrumentation, no cpu-profile):
-
-| run | BEFORE render | AFTER render |
-| --- | --- | --- |
-| 1 | 14.62 s | 11.86 s |
-| 2 | 14.51 s | 12.12 s |
-| **avg** | **14.57 s** | **11.99 s** |
-
-**Δ = -2.58 s render (-18 %).** Larger than the 735 ms `(idle)`
-that prompted the look -- because rAF was costing real (program)
-work too (V8 scheduler, microtask queue draining around the rAF
-boundary), not just idle wait. CPU profile of the fixed render:
-
-```
-   self_ms   self_%   function
-   -------   ------   ----------------------------------------------
-   4355.74   34.75%   getBoundingClientRect
-   1935.89   15.45%   removeChild
-   1934.11   15.43%   (program)             (was 5872 -- down ~4 s)
-    636.43    5.08%   removeOverflow
-    -- (idle) absent from the top 10, < 130 ms (1 %)
-```
-
-`(idle)` dropped out of the top 10 (< 130 ms / 1 %), `(program)`
-dropped from 5872 ms to 1934 ms (-4 s), `removeChild` dropped
-slightly (2426 ms -> 1935 ms; smaller render = same per-call cost
-× same call count, so this is sampling artefact, not a real
-change). PDF byte size unchanged (within standard timestamp
-drift). Shipped.
-
-### What the three hypotheses together teach
-
-`(idle)` in a V8 CPU profile attribution table is **not** primarily
-microtask scheduling -- those are too fast to sample. It's
-genuinely-waiting time, where the main thread had no V8 work to do.
-The dominant source of waiting in our render was not async/await,
-not ResizeObserver coalescing, but a `requestAnimationFrame`
-buried in the chunker's task queue. Replacing it with
-`queueMicrotask` collapses the per-page wait, and additionally
-shrinks the surrounding V8 scheduler work because each rAF
-callback came with its own setup / teardown overhead.
-
-The pattern to remember: if a profile shows non-trivial `(idle)`
-in a render-style workload, hunt for explicit `requestAnimationFrame`
-/ `setTimeout` / `requestIdleCallback` calls in the hot path before
-investigating microtask machinery. The frame-paced scheduler is a
-much bigger lever than the microtask scheduler.
-
-### Follow-up: the `Queue` itself was unnecessary indirection
-
-The chunker's `render()` routes each per-page iteration through
-`this.q.enqueue(() => this.renderAsync(renderer))`. The queue's
-job is to serialize tasks -- but an async generator is already
-inherently serial (you can't call `.next()` twice in parallel).
-With the rAF-tick fix above, the queue was reduced to a
-`queueMicrotask` hop plus a Promise/deferred allocation per page,
-for no purpose.
-
-Dropped the indirection: `render()` now iterates `renderer.next()`
-directly. The `Queue` class still exists in the bundle for the
-`onOverflow` re-render path (which is rare in practice), but the
-hot per-page loop bypasses it.
-
-This is a structural simplification more than a measurable speedup
--- the queueMicrotask hop was already cheap and the deferred
-allocation amortizes. But it removes a layer that was doing
-nothing useful for our use case, which is the point of
-maintaining a fork.
-
-## Stripping headless-irrelevant async machinery
-
-paged.js was designed to be fully usable in interactive browser
-work. The async coordination patterns it carries -- always
-returning Promises from hook triggers, awaiting microtask
-boundaries between every phase, deferring tasks via animation
-frames -- pay off when the same engine is rendering inside a
-visible page that needs to stay responsive, coordinate with the
-compositor, and tolerate handlers that load external resources.
-
-In our headless puppeteer pipeline, none of that is true:
-
-- The page is offscreen; no compositor to coordinate with.
-- We don't care if any individual page-render blocks for tens of
-  milliseconds, because the browser isn't trying to repaint.
-- Every handler we register is synchronous. No hook needs to
-  await anything.
-- The book HTML is loaded before render starts (`page.goto(url,
-  { waitUntil: "load" })`), so every image's `.complete` flag is
-  already true. No image-loading awaits ever actually wait.
-
-Each remaining async wrapper is overhead we pay for a flexibility
-we never use. We're maintaining a task-specific fork; we can keep
-peeling layers as long as the simplifications don't change observed
-output.
-
-### Phase 1: hook fast-path
-
-`Hook.trigger()` upstream always wraps sync handler results in
-`new Promise(resolve => resolve(executing))` and returns
-`Promise.all(promises)`. The chunker's per-page loop awaits each
-of `beforePageLayout`, `afterPageLayout`, and `finalizePage`. With
-all six of our registered handlers running synchronously,
-`await trigger(...)` was a no-work microtask boundary per call.
-
-Patch: `Hook.trigger()` returns `undefined` when no handler
-returned a thenable. Callers in the per-page hot path become:
-
-```js
-let _p = this.hooks.X.trigger(...);
-if (_p) await _p;
-```
-
-The microtask boundary is skipped entirely on the sync fast
-path. Patched at six per-page sites (three in `chunker.layout`,
-three in `chunker.handleBreaks`).
-
-CPU profile comparison (post-queue-tick + drop-queue baseline vs
-post-Phase-1):
-
-| metric | baseline | Phase 1 | Δ |
-| ------ | -------- | ------- | --- |
-| samples | 7,353 | 6,902 | -451 |
-| profile duration | 13.07 s | 12.22 s | **-0.85 s (-6.5 %)** |
-| `getBoundingClientRect` self | 4,622 ms | 4,273 ms | -349 ms |
-| `(program)` self | 1,873 ms | 1,874 ms | flat |
-| `removeChild` self | 1,885 ms | 1,913 ms | flat |
-| `removeOverflow` self | 592 ms | 579 ms | flat |
-| `(idle)` self | n/a (< 130 ms) | n/a (< 130 ms) | flat |
-
-The 451 fewer samples account for ~800 ms of saved CPU work.
-`getBoundingClientRect`'s self-time dropped by ~350 ms; the rest
-is distributed across many small hot spots that all shrank
-slightly because they were each preceded by fewer microtask
-yields. No new hot spot appeared.
-
-> [!NOTE]
-> We compare CPU-profile sample counts and self-times here, not
-> wall-clock. Wall-clock includes I/O variance and system load on
-> the dev machine; CPU profile sample times are independent of
-> those and more reliable for "did this actually change CPU work."
-> Wall-clock numbers from these runs are noted where useful for
-> sanity-checking but aren't the primary signal.
-
-Shipped. The fix is small (one helper change + six call-site
-edits) and removes about 8k microtask boundaries from the
-per-page hot loop on a 1651-page render.
-
-### Phase 2: sync chain end-to-end through the per-page hot path
-
-With Phase 1 in place, every per-page `await` in the chunker is
-unconditional on a function that returned a Promise even when
-nothing was actually awaitable. The structural answer is to make
-those functions plain sync functions.
-
-The chain, top to bottom of the per-page call tree:
-
-```
-chunker.*layout()              (async generator → sync generator)
-  chunker.handleBreaks()       (async → sync)
-  page.layout()                (async → sync)
-    Layout.renderTo()          (async → sync)
-      Layout.waitForImages()   (async → sync, throws if not preloaded)
-chunker.render() loop          (still async at the outer edge;
-                                renderer.next() now sync)
-```
-
-Phase 2 converts each step. The only function that *could* have
-been genuinely async -- `waitForImages` -- is now a synchronous
-check: it walks the supplied `<img>` nodes and throws if any
-isn't `.complete`. In our pipeline,
-`page.goto(url, { waitUntil: "load" })` settles before paged.js
-is invoked, so every image is already loaded; the throw is a
-safety net for pipeline bugs, not a runtime path we expect to
-take.
-
-The hook triggers in the per-page hot path keep the Phase 1
-fast-path semantics but switch from
-`let _p = hook.trigger(...); if (_p) await _p;` to
-`_assertSync(hook.trigger(...), "hook-name")`. The helper throws
-if a handler ever returns a thenable -- the same safety pattern
-as `waitForImages`. None of our shipping handlers do.
-
-Dead code removed in the same pass: `Chunker.renderAsync` and
-`Chunker.renderOnIdle`, both unreachable since the drop-queue
-change above stripped their only caller. Together ~30 lines of
-async machinery that existed only to wrap the (now sync)
-`renderer.next()` call.
-
-CPU profile (Phase 1 baseline vs Phase 2):
-
-| metric | Phase 1 | Phase 2 | Δ |
-| ------ | -------- | ------- | --- |
-| samples | 6,902 | 6,948 | +46 |
-| profile duration | 12.22 s | 12.35 s | +0.13 s (noise) |
-| `getBoundingClientRect` self | 4,273 ms | 4,524 ms | +251 ms (noise) |
-| `(program)` self | 1,874 ms | 1,909 ms | +35 ms |
-| `removeChild` self | 1,913 ms | 1,883 ms | -30 ms |
-| `removeOverflow` self | 579 ms | 523 ms | -56 ms |
-
-Phase 2 sits inside the run-to-run noise band on CPU time --
-the per-call CPU cost of an `await` on an already-settled Promise
-is small (a handful of microseconds), and Phase 1 already
-eliminated most of the boundary count. **What Phase 2 buys is
-not measurable CPU time -- it's structural simplicity.**
-
-Code shape, before and after:
-
-- 6 fewer `async` keywords on hot-path methods.
-- 13 fewer `await` keywords in the bodies of those methods
-  (the per-page chain no longer threads `await` through any of
-  its layers).
-- One async generator (`async *layout`) → sync generator
-  (`*layout`).
-- Two dead methods removed (`renderAsync`, `renderOnIdle`).
-- Two `_assertSync` guards added at the chunker's hook call
-  sites + one at `waitForImages` -- the contract we now rely on
-  (per-page handlers all synchronous, every `<img>` preloaded)
-  is enforced at runtime with a useful error message.
-
-PDF output is **byte-identical** to the Phase 1 build on this
-content (`async-phase1/book.pdf` and `async-phase2/book.pdf`
-both 16,893,546 bytes -- a rare 0-byte timestamp drift, but
-the structural content is identical regardless).
-
-This is the kind of cleanup that's only worth doing because
-we maintain a task-specific fork of the bundle. Upstream
-paged.js has to support handlers that await fetches or image
-loads or font measurements -- our pipeline never registers one.
-Removing the async machinery in our copy shrinks the surface to
-reason about and makes the data-flow direct: a render is a
-plain function call that produces a plain return value.
-
-### What's still async, and why
-
-> **Update.** All four survivors listed below were
-> subsequently stripped -- see "Following `RunMicrotasks`
-> down to zero" at the end of this README. The reasoning
-> here ("once-per-render, overhead irrelevant") was
-> correct as a per-call cost argument but missed that
-> the unbroken await chain forced V8 to attribute the
-> entire post-`loadFonts` render to a microtask
-> continuation (`RunMicrotasks` in the trace,
-> `(program)` in the cpu profile). Re-attribution alone
-> was worth the conversion; wall-clock is unchanged.
-> The list below is preserved for chronological accuracy.
-
-The async machinery that survives this audit is now at the
-once-per-render layer, where it's load-bearing:
-
-- `Chunker.flow()` is async because `loadFonts()` waits on the
-  CSS font-face descriptor's load promise, which is actually
-  async and OS-level.
-- `Chunker.render()` stays `async` as a thin wrapper so callers
-  in `flow()` can `await` it (the alternative would be to
-  remove `async` and have `flow()` not await it, but the call
-  site reads more clearly with the `await` retained).
-- `beforeParsed`, `afterParsed`, `afterRendered` hooks are still
-  awaited with the `await hook.trigger(...)` form because they
-  fire once per render and the overhead is irrelevant.
-- The `onOverflow` recovery path (`Chunker.q.enqueue(async ...)`)
-  re-renders the document if any page overflows after paint. In
-  practice this never fires for our content, but keeping the
-  recovery code intact costs nothing and preserves behaviour for
-  edge cases.
-
-The hot per-page path is now `function`, `function*`, plain
-return values, and a `while` loop. Future work that touches
-this code can reason about it as straight-line synchronous
-flow.
-
-## Doing less work in `Layout.append()`
-
-Picking the next hotspot after the async cleanup, BreakToken
-JSON, gBCR wrapper inline, and UUID-counter changes had all
-landed. Fresh profile from a clean baseline at 100us sampling
-(V8 effectively clamped this to ~543us/sample on this Node/
-Chromium build), `--no-timing --detach-pages`, render-only:
-
-```
-   self_ms   self_%   function  @  source
-   -------   ------   --------------------------------------------------
-   4825.28   38.22%   getBoundingClientRect       (native)
-   2021.89   16.02%   (program)                   (native)
-   1954.01   15.48%   removeChild                 (native)
-    635.95    5.04%   removeOverflow              paged.browser.js
-    288.38    2.28%   wrapContent                 paged.browser.js
-    255.25    2.02%   insertBefore                (native)
-    227.01    1.80%   appendChild                 (native)
-    164.01    1.30%   findOverflow                paged.browser.js
-    140.66    1.11%   (garbage collector)         (native)
-    138.49    1.10%   afterPageLayout             paged.browser.js (Splits)
-    129.25    1.02%   cloneNode                   (native)
-    125.99    1.00%   addRefs                     paged.browser.js
-     90.15    0.71%   renderTo                    paged.browser.js
-     81.46    0.65%   filterTree                  paged.browser.js
-     80.92    0.64%   importNode                  (native)
-     80.38    0.64%   setAttribute                (native)
-     72.77    0.58%   append                      paged.browser.js
-     ...
-```
-
-The four heavy hitters are unchanged from earlier reports.
-`Layout.append` itself shows only 73 ms of self-time, but
-inclusively it owns a large fraction of the per-source-node
-work: `cloneNode`, `appendChild`/`insertBefore`, the
-`findElement` chain (`querySelector` + `getAttribute`), the
-`renderNode` hook dispatch, and `rebuildAncestors` at page
-boundaries all flow through it. With ~100k+ source-node
-clones per render, anything per-call adds up.
-
-Reading the body of `append()`, three things stood out as
-potentially-reducible:
-
-1. The `renderNode` hook dispatch fires for every cloned
-   node. Even if no handler is registered, `triggerSync`
-   still allocates a results array, runs `this.hooks.forEach`
-   over zero entries, and returns the empty array; the
-   caller then runs its own `.forEach` over that empty array.
-2. The `findElement(node.parentNode, dest)` lookup goes
-   through `getAttribute("data-ref")` on the parent. The
-   ref is also set on every source element at decoration
-   time, so the value could be stashed on a plain JS expando.
-3. `clone.dataset.ref` is read a second time at the end of
-   `append()` to register the clone in `dest.indexOfRefs`.
-   Same expando trick applies.
-
-Following the (1) thread first uncovered two separable wins:
-a bug inside the only registered `renderNode` handler, and
-the broader empty-handlers dispatch overhead.
-
-### `Footnotes.renderNode`: always-truthy NodeList condition
-
-The grep for `renderNode` method definitions in the bundle
-returns exactly one match: `Footnotes.renderNode` (in the
-package's footnotes-handling class). Every `append()` call
-goes through it. Its body:
-
-```js
-renderNode(node) {
-    if (node.nodeType == 1) {
-        let notes;
-        if (!node.dataset) return;
-
-        if (node.dataset.note === "footnote") {
-            notes = [node];
-        } else if (node.dataset.hasNotes ||
-                   node.querySelectorAll("[data-note='footnote']")) {
-            notes = node.querySelectorAll("[data-note='footnote']");
-        }
-
-        if (notes && notes.length) {
-            this.findVisibleFootnotes(notes, node);
-        }
-    }
-}
-```
-
-The `else if` condition has an upstream bug: a `NodeList` is
-always truthy (even an empty one -- it's an object), so when
-`dataset.hasNotes` is undefined the right arm of the `||`
-runs `querySelectorAll`, the condition evaluates true, and
-the next line then runs `querySelectorAll` **a second time**.
-Two subtree scans per element-node clone, for any document
-that doesn't author `data-note='footnote'` directly.
-
-`grep -c 'data-note' docs/_site-pdf/book.html` returns 0 --
-every one of those scans on every clone of every page of
-the book was dead work.
-
-The fix narrows the `else if` to the original intent:
-
-```js
-} else if (node.dataset.hasNotes) {
-    notes = node.querySelectorAll("[data-note='footnote']");
-}
-```
-
-Profile delta (post-tojson baseline vs surgical fix):
-
-| metric | baseline | post-fix | Δ |
-| ------ | -------- | -------- | --- |
-| render wall | 12.63 s | 12.63 s | flat (within noise) |
-| `querySelectorAll` self | 67.9 ms | 52.8 ms | -15 ms |
-| samples | 23,313 | 23,250 | -63 |
-
-A small saving in absolute terms: most of the eliminated
-`querySelectorAll` calls were against tiny leaf subtrees
-that terminate in microseconds when no matches are present.
-The bug fix is upstream-clean and correct; the perf-relevant
-takeaway was that *most* of the work `append()` pays for the
-`renderNode` hook is in the dispatch wrapping the handler,
-not in the handler's body. That motivated the second win.
-
-### `Hook.triggerSync` empty-handlers fast-path
-
-Mirrors the README's earlier "Phase 1: hook fast-path" for
-the async `trigger()` path. `Hook.triggerSync` previously:
-
-```js
-triggerSync() {
-    var args = arguments;
-    var context = this.context;
-    var results = [];
-    this.hooks.forEach(function (task) {
-        var executing = task.apply(context, args);
-        results.push(executing);
-    });
-    return results;
-}
-```
-
-…and the four reducer call sites in `Layout` always did:
-
-```js
-let r = this.hooks.X.triggerSync(...);
-r.forEach((newVal) => { if (newVal !== undefined) target = newVal; });
-```
-
-Walking the bundle to see which of those four hook arrays
-are actually populated in our build:
-
-| call site | hook | handlers registered |
-| --------- | ---- | ------------------- |
-| `breakAt` (line 1551) | `onBreakToken` | 0 |
-| `append` (line 1640) | `renderNode` | 1 (`Footnotes`) |
-| `findBreakToken` (line 1805) | `onOverflow` | 0 |
-| `findBreakToken` (line 1815) | `onBreakToken` | 0 |
-| `Chunker.flow` (line 2910) | `filter` | 4 |
-
-Three of the four hot sites are dispatching against an empty
-handler array every call. `onOverflow` and the two
-`onBreakToken` sites all fire from the per-page break-
-detection path, which can run more than once per page when
-overflow-and-retry happens.
-
-Patch: `triggerSync` returns `undefined` on the empty path,
-callers guard their reducer `forEach` with a truthy check.
-
-```js
-triggerSync() {
-    if (this.hooks.length === 0) return undefined;
-    // ...existing body
-}
-```
-
-```js
-let r = this.hooks.X.triggerSync(...);
-if (r) r.forEach((newVal) => { ... });
-```
-
-Profile delta (post-surgical vs post-fast-path):
-
-| metric | post-surgical | post-fast-path | Δ |
-| ------ | ------------- | -------------- | --- |
-| render wall | 12.63 s | **12.14 s** | **-0.49 s** |
-| samples | 23,250 | 22,433 | -817 |
-| `getBoundingClientRect` self | 4,819 ms | 4,714 ms | -105 ms |
-| `removeChild` self | 1,962 ms | 1,902 ms | -60 ms |
-| `removeOverflow` self | 634 ms | 552 ms | -82 ms |
-| `querySelectorAll` self | 52.8 ms | 43.4 ms | -10 ms |
-
-The wall-clock drop (~490 ms) and sample drop (817 × 542 us
-≈ 443 ms) line up cleanly, so the saving is real, not run-
-to-run noise. The reductions spread across rows because the
-per-call cost of an empty `triggerSync` -- an array alloc, a
-forEach over zero entries, a return, and the caller's own
-forEach over the returned `[]` -- creates pressure on the
-allocator and the V8 inliner that compounds on the per-page
-hot path even though no single line attributes the cost.
-
-The `renderNode` site at line 1640 does **not** hit the fast
-path in this build -- `Footnotes` still occupies it with one
-handler, so `hooks.length === 1` and the body runs as
-before. The savings come entirely from the three zero-
-handler sites.
-
-### `Footnotes` self-disables when no footnotes are in source
-
-That left the per-element `Footnotes.renderNode` dispatch
-still firing on every cloned node, plus four other hook
-methods `Footnotes` registers via the `Handler` base auto-
-wiring. Inventory of what `Footnotes` is doing on a render
-with zero footnote-marked nodes:
-
-| method | fires | what it does on a footnote-free doc |
-| ------ | ----- | ----------------------------------- |
-| `onDeclaration` | per CSS declaration | quick property-name checks. Cheap. |
-| `renderNode` | per element-node clone | short-circuits after surgical fix. |
-| `beforePageLayout` | once per page | checks `this.needsLayout.length` (always 0). Cheap. |
-| `afterPageLayout` | once per page | **3 `querySelector`s + `getBoundingClientRect` + `new Layout(...)` (which does 2 more `getBoundingClientRect`s + `getComputedStyle` in its constructor) + `findOverflow()` on the footnote-inner-content area.** Real work. |
-| `afterOverflowRemoved` | per overflow detection | `querySelectorAll` returning empty. Cheap-ish. |
-
-The big hidden cost was `afterPageLayout` -- ~1,650 calls per
-render, each measuring an empty footnote area through several
-DOM ops and constructing a transient `Layout` instance whose
-constructor itself does multiple gBCRs.
-
-The detect-and-disable plan:
-
-1. Footnotes is the *only* registrant for each of its hook
-   methods (`onDeclaration` aside -- it's a polisher-time
-   hook with other registrants, but it's also cheap).
-2. By the time `afterParsed` fires, both the CSS-driven
-   selectors (populated by `onDeclaration` calls into
-   `this.footnotes`) and any source-HTML `data-note` markers
-   are accounted for. `Footnotes.afterParsed` already runs
-   `processFootnotes(parsed, this.footnotes)` which writes
-   `data-note='footnote'` on any element matching a CSS
-   selector. So a single `parsed.querySelector(
-   "[data-note='footnote']")` at the end of that pass is
-   conclusive.
-3. If null, splice `Footnotes`'s bound functions back out
-   of each hook array. With the empty-handlers fast-path
-   already landed, the per-page and per-node
-   dispatches then return `undefined` immediately and
-   callers skip their reducer `forEach`.
-
-To enable (3), the `Handler` base class gets a small
-addition: each `(hook, bound)` pair from auto-registration
-is stashed under its hook name on `this._registered`, and a
-new `_unregisterAll(except)` method splices each entry back
-out. The `except` argument lets the caller skip the hook
-it's currently inside (`afterParsed` in this case) --
-splicing the array we're iterating would cause the
-surrounding `trigger()` loop to skip a sibling handler.
-The skipped entry stays in `this._registered` forever, but
-it's a one-shot anyway: harmless.
-
-`Footnotes.afterParsed` then becomes:
-
-```js
-afterParsed(parsed) {
-    this.processFootnotes(parsed, this.footnotes);
-    if (!parsed.querySelector("[data-note='footnote']")) {
-        this._unregisterAll("afterParsed");
-    }
-}
-```
-
-Profile delta (post-fast-path vs post-self-disable):
-
-| metric | post-fast-path | post-self-disable | Δ |
-| ------ | -------------- | ----------------- | --- |
-| render wall | 12.14 s | **11.77 s** | **-0.37 s** |
-| samples | 22,433 | 21,809 | -624 |
-| **`getBoundingClientRect` self** | **4,714 ms** | **4,198 ms** | **-516 ms** |
-| `removeChild` self | 1,902 ms | 1,898 ms | flat |
-| `(program)` self | 2,022 ms | 2,198 ms | +176 ms |
-| `append` self | 76 ms | 69 ms | -7 ms |
-
-The 516 ms `getBoundingClientRect` drop is exactly the
-`Footnotes.afterPageLayout` cost that the inventory
-predicted -- one gBCR on `noteContent` plus two more in
-the `new Layout(noteArea, ...)` constructor plus internal
-gBCRs from `findOverflow()`, multiplied by ~1,650 pages.
-The `(program)` row growing by 176 ms is V8 reattributing
-work between native and self-time as the dispatch pattern
-changes; not new work, just a different breakdown.
-
-PDF output remained byte-identical to the previous build
-on this content (16.1 MB, same checksum on the raw
-Chromium output).
-
-### `Layout.append` parent-lookup cache
-
-When the source walker emits consecutive children of the
-same parent, `findElement(node.parentNode, dest)` in
-`append()` gets called repeatedly with the same input.
-For a parent with N children that's N - 1 redundant
-lookups -- each one cheap (`getAttribute("data-ref")` +
-`dest.indexOfRefs[ref]` is an O(1) dict hit on the fast
-path), but the call count is north of 100k per render.
-
-Patch: a three-property memo on `Layout` -- last
-`srcParent`, last `dest`, last `destParent`. Hit check at
-the top of `append`, writeback at the bottom after the
-parent is resolved (whether via direct lookup or via the
-rebuild-ancestors branch, since the rebuild attaches the
-cloned ancestor into `dest`).
-
-Invalidation: reset all three at the top of every
-`renderTo`. The cache is safe within a single `renderTo`
-loop because `append()` never detaches DOM from `dest`,
-and `removeOverflow` (the one thing that does) only fires
-at loop exit. Across `renderTo` calls on the same `Layout`
-instance the previous run's `removeOverflow` may have
-detached the cached parent, so the explicit reset is the
-correctness guard.
-
-Profile delta (post-self-disable vs post-parent-cache):
-
-| metric | post-self-disable | post-parent-cache | Δ |
-| ------ | ----------------- | ----------------- | --- |
-| render wall | 11.77 s | 11.72 s | flat (within noise) |
-| samples | 21,809 | 21,688 | -121 (~65 ms) |
-| `(program)` self | 2,198 ms | 2,169 ms | -29 ms |
-| `getAttribute` (native) | 43 ms | off-list (<40 ms) | -3 ms+ |
-| `querySelector` (native) | 63 ms | 59 ms | -4 ms |
-| `Layout.append` self | 69 ms | 70 ms | flat |
-
-On the order of 50-100 ms saved depending on the row chosen,
-fully below the run-to-run wall-clock noise band but visible
-in the cpuprofile rows. The math checks out: ~100k append
-calls × ~80 % sibling-cache-hit rate × ~1 us per skipped
-findElement ≈ 80 ms.
-
-PDF output byte-identical.
-
-### What didn't land: the `_ref` expando
-
-One sibling candidate to the parent-lookup cache was
-tried and reverted. The idea: mirror `data-ref` onto a
-plain JS property `_ref` at decoration time (in
-`ContentParser.addRefs`), propagate via the `cloneNode`
-helper, and read it in `findElement` and `append`'s
-postlude instead of `getAttribute("data-ref")` /
-`clone.dataset.ref`. Both reads in the hot path become
-plain JS property loads instead of going through C++ DOM
-attribute fetches or the `DOMStringMap` proxy.
-
-Measured win on the per-row breakdown:
-
-- `Layout.append` self 69 -> 47 ms (-22 ms).
-- `getAttribute` native 43 ms -> off-list (-3+ ms).
-
-About 25 ms of real per-call work removed. Reverted: the
-saving is genuinely smaller than the diff's surface --
-`cloneNode` helper has to propagate an extra property,
-the `data-ref` attribute has to stay for CSS selectors
-and the `querySelector` fallback in `findRef`, `findElement`
-needs a `||` fallback to keep direct `.cloneNode()`
-callers in `rebuildAncestors` working unchanged, and any
-future code that wants the ref has two places it could
-read from. Not worth maintaining for a saving that
-doesn't move single-run wall-clock.
-
-Lesson worth carrying forward: at this point in the
-codebase, per-call findElement / `dataset.ref` work has
-been ground down close enough to its floor that any
-further shave produces savings in the 20-50 ms band, well
-below the run-to-run wall-clock noise on this machine.
-Reading the cpuprofile per-row deltas is the only way to
-tell whether such a change is genuine; reading wall-clock
-isn't. And the bar for landing scales with the size of
-the diff -- the parent-cache landed because it's three
-property writes and one branch; the expando didn't
-because it's a propagation pattern that ripples through
-the bundle.
-
-### Cumulative effect
-
-Across all four landings:
-
-| metric | pre-investigation | post-parent-cache | Δ |
-| ------ | ----------------- | ----------------- | --- |
-| render wall | 12.63 s | 11.72 s | **-0.91 s (-7.2 %)** |
-| samples | 23,313 | 21,688 | -1,625 |
-| `getBoundingClientRect` self | 4,825 ms | 4,194 ms | -631 ms |
-| `removeChild` self | 1,954 ms | 1,897 ms | -57 ms |
-| `removeOverflow` self | 636 ms | 583 ms | -53 ms |
-| `getAttribute` (native) | ~125 ms* | off-list (<40 ms) | -85 ms+ |
-
-\* Inferred from the post-tojson baseline rank; not
-explicitly tabulated in the top-25 cut at that time.
-
-The `Handler._registered` + `_unregisterAll(except)` plumbing
-is reusable: any future handler that determines at
-parse/decoration time that it has nothing to do for a given
-render can self-disable the same way, and the
-empty-handlers fast-path will swallow the per-call dispatch
-cost for free. That's the pattern this work leaves behind --
-combine "detect once at a known-quiet point" with "remove
-yourself from the dispatch chain" and you pay zero
-ongoing cost for inactive handlers.
-
-## Skipping the `wrapContent` innerHTML round-trip
-
-The post-append-cache profile's 5th-largest JS row was
-`wrapContent` at 260 ms. It's called once per render, right
-at the top of `Chunker.flow`, so unlike the previous fixes it
-has no per-page hot path -- the absolute size is the whole
-story.
-
-`Layout.wrapContent` lifts the entire `<body>` into a
-`<template data-ref='pagedjs-content'>` so the chunker can
-iterate the source without disturbing the live DOM. Original:
-
-```js
-template.innerHTML = body.innerHTML;
-body.innerHTML = "";
-body.appendChild(template);
-```
-
-Two heavy halves, both linear in document size:
-
-1. **`body.innerHTML` getter**: walks every node in the body
-   and serialises the entire subtree to one HTML string.
-2. **`template.innerHTML = ...` setter**: hands the string to
-   the HTML parser, which reparses it into a fresh tree
-   inside the template's contents-owner document.
-
-On our 5.5 MB book, the round-trip is exactly 260 ms.
-`find-callees.mjs` confirms 99 % of that lives in the JS frame
-itself (the C++ serialiser/parser get attributed back to the
-calling frame, same trick `removeOverflow`'s `Range`
-deletion uses):
-
-```
-wrapContent: self=259.97ms, total=262.15ms (callees=2.18ms)
-per direct callee (subtree total ms):
-      2.18 ms   querySelector  @  (native):0
-```
-
-The fix moves children directly into a plain
-`DocumentFragment`, no string round-trip:
-
-```js
-let fragment = document.createDocumentFragment();
-while (body.firstChild) fragment.appendChild(body.firstChild);
-template = document.createElement("template");
-template.dataset.ref = "pagedjs-content";
-template._pagedjsContent = fragment;  // re-entrancy stash
-body.appendChild(template);
-return fragment;
-```
-
-### Why a plain fragment, not `template.content`
-
-The first cut moved children into the template's content,
-which is the obvious shape since `wrapContent` was already
-returning `template.content`. It crashed on the first page:
-
-```
-paged.js (forked): image not loaded at render time.
-Image: file:///.../Features/Images/b0724fe2-....png
-   at Layout.waitForImages
-   at Layout.renderTo
-```
-
-The reason is in the spec. A `<template>`'s `content` fragment
-is owned by a separate "template contents owner document"
-that has no browsing context -- resources inside it never
-load. Moving a live `<img>` into `template.content` triggers
-`adoptNode` to that inert document, which then runs the
-"update the image data" algorithm, creates a fresh request
-in state "unavailable", and flips `.complete` to false. The
-source image is now stuck in that state; clones into the live
-page wrappers inherit it without the synchronous cache-hit
-path firing in time for the sync `[PATCH: assert-sync]`
-`waitForImages` check.
-
-The `innerHTML` round-trip avoids this incidentally: the
-freshly-parsed `<img>` elements in `template.content` are
-brand new (never live), they have no prior load state to
-disturb, and when their clones land in the live page wrappers
-Chromium's file:// cache lookup resolves them synchronously.
-
-A plain `DocumentFragment` is owned by the live document.
-Moving children into it is a same-document append -- no
-adoption, no "update the image data", no `.complete` reset.
-Clones from the fragment into the live page wrappers then
-take the same fast cache path the round-trip's parsed images
-did.
-
-### Re-entrancy
-
-The original returned `template.content`, so a second call
-finding the existing template just returned that same
-fragment. Under the move strategy `template.content` is
-empty (the children live in the plain fragment we returned),
-so the re-entrant branch reads the fragment back off a
-`template._pagedjsContent` expando on the marker template.
-Functionally equivalent for the one-call-per-render case
-that's actually exercised; preserves the multi-call contract
-in case anyone leans on it later.
-
-### Results
-
-Paired A/B, 2 runs each, `--detach-pages --no-timing
---cpu-profile --cpu-sampling 100`:
-
-| run | pre | post |
-| --- | --- | --- |
-| 1 | 11.92 s | 10.72 s |
-| 2 | 11.60 s | 11.06 s |
-| **avg** | **11.76 s** | **10.89 s** |
-
-**Δ = -0.87 s render (-7.4 %).** Larger than the 260 ms the
-profile attributed to `wrapContent` itself -- the round-trip
-also allocated a transient 5.5 MB string that pushed GC and
-distributed sample noise into the surrounding rows; removing
-the allocation relieves pressure across the whole per-page
-hot path. The cpuprofile rows breakdown:
-
-| function | pre | post | Δ |
-| -------- | --- | ---- | --- |
-| `wrapContent` self | 260 ms | off-list (<25 ms) | **-260 ms+** |
-| `getBoundingClientRect` self | 4,281 ms | 4,036 ms | -245 ms |
-| `removeOverflow` self | 560 ms | 353 ms | -207 ms |
-| `removeChild` self | 1,871 ms | 1,730 ms | -141 ms |
-| `(program)` self | 2,298 ms | 2,152 ms | -146 ms |
-
-The `wrapContent` row is the only one outside the single-run
-noise band (the README's earlier methodology section pins
-that at 50-150 ms for sub-1 % rows on this machine). The
-others are plausibly real but inseparable from noise without
-more runs; the sample-count delta (-2,100 samples × 542 us
-= ~1,135 ms) matches the wall-clock delta closely enough that
-the distributed component is probably real GC-pressure
-relief, not just sampler jitter.
-
-PDF byte-equivalent to the pre-fix build (16.1 MB).
-
-### What the pattern leaves behind
-
-`removeOverflow` and `wrapContent` are both cases where V8
-rolled native DOM work (`Range.deleteContents`,
-HTML serialiser+parser) into the calling JS frame's
-self-time. The diagnostic move is the same one we used for
-gBCR attribution: `find-callees.mjs` on the suspect frame.
-If self-time is ~100 % of total, the work is happening
-inside a native callee the sampler didn't name -- read the
-JS body to find which DOM API is doing the work and whether
-it can be replaced with a cheaper equivalent.
-
-`find-callees.mjs` was added for this investigation and
-sits alongside `find-callers.mjs`; the two together cover
-both directions of the V8 attribution edge.
-
-## The per-page overflow-check rhythm: two bugs in the adaptive `maxChars`
-
-The "Attempt E: additive backoff" section above describes
-the per-page rhythm of `renderTo`'s overflow checks: append
-nodes, fire `findBreakToken` every `maxChars` chars of
-appended content, break out when it returns a non-null
-breakToken. `maxChars` defaults to 1500 and is meant to
-adapt up or down based on observed page capacity.
-
-The post-wrapContent profile showed `findOverflow` total
-2.24 s, almost all of it (1.96 s) in `hasOverflow`'s single
-gate gBCR -- one call per `findBreakToken`. Was the call
-count high because the page actually needs that many
-probes, or was the rhythm wrong?
-
-Instrumenting with `window.__breakCheckStats` and
-`window.__layoutMaxChars` answered it:
-
-```
-findBreakToken checks: 7,764  hits: 862  nulls: 6,902
-renderTo calls: 1651  checks/call avg: 4.70
-Layout.maxChars: first=1500  median=177  last=177  min=177  max=1500
-```
-
-Four findings:
-
-1. **89 % of checks (6,902 / 7,764) return null.** They're
-   "no overflow yet, keep appending" probes. Each is still
-   a full layout-flush gBCR. The actual overflow detections
-   are 862, slightly more than half of the 1651 pages
-   (the rest end naturally, or via CSS-driven breaks).
-
-2. **`Layout.maxChars` was locked at 177 for the entire
-   render** after page 1. That's more than an order of
-   magnitude below a typical page's capacity (which the
-   @page CSS, font size, and content density determine --
-   closer to 4000-4500 chars of body text on this book).
-   Page 1 ran with the default 1500; pages 2-1651 ran with 177.
-
-3. The reason was a propagation gate in `Page.layout`:
-   ```js
-   if (!settings.maxChars && maxChars) {
-       settings.maxChars = maxChars;
-   }
-   ```
-   `settings` is shared across all pages (one object, set
-   by reference in the Chunker constructor). The chunker
-   maintains a running estimate in `this.maxChars` via
-   `recordCharLength` and passes it into each page's
-   `layout(..., maxChars)`. But `!settings.maxChars` is
-   only truthy on the first page that gets a defined value
-   -- the rest see settings.maxChars already populated and
-   skip the update. Whatever value page 2 picked up (177,
-   from a freak short page 1 that had been recorded as
-   capacity), every subsequent page kept.
-
-4. The recording itself is biased. `recordCharLength` pushes
-   `page.wrapper.textContent.length` after every layout and
-   averages the last 4 values. Short pages -- chapter
-   endings, part dividers -- get recorded alongside full
-   pages, dragging the average well below true capacity.
-   Even with propagation fixed, the average would land
-   around 1200, not 4500.
-
-### The fix
-
-Two patches in `docs/lib/paged.browser.js`, marked
-`// [PATCH: maxChars-propagate]` and `// [PATCH: maxChars-
-running-max]`:
-
-1. **`Page.layout`'s gate drops the staleness check**:
-   `if (maxChars) settings.maxChars = maxChars;`. Each page
-   now picks up the chunker's current estimate.
-
-2. **`Chunker.recordCharLength` tracks the running max over
-   the last 16 pages** instead of the running average over
-   4. Max biases toward "the largest page recently seen,"
-   which approximates true capacity for our content. Short
-   pages still get pushed into the window but don't pull
-   the estimate down. The window of 16 is wide enough that
-   a transient stretch of short pages doesn't collapse the
-   estimate before a full page restores it.
-
-### Results
-
-Paired A/B, 2 runs each, `--detach-pages --no-timing`, no
-profiling:
-
-| run | pre | post |
-| --- | --- | --- |
-| 1 | 10.08 s | 8.15 s |
-| 2 | 11.86 s | 7.98 s |
-| **avg** | **10.97 s** | **8.07 s** |
-
-**Δ = -2.90 s render (-26 %).** CPU profile (single run,
-within noise band on the smaller rows):
-
-| metric                   | pre        | post       | Δ |
-| ------------------------ | ---------- | ---------- | --- |
-| `findOverflow` total     | 2,236 ms   | 1,690 ms   | **-546 ms** |
-| ↳ `hasOverflow` total    | 1,957 ms   | 1,597 ms   | -360 ms |
-| ↳ ↳ `gBCR` native        | 1,945 ms   | 1,587 ms   | -358 ms |
-| ↳ `findOverflow` self    | 142 ms     | 47 ms      | -95 ms |
-| ↳ walker-loop callees    | ~135 ms    | ~46 ms     | -89 ms |
-| `removeOverflow` self    | 353 ms     | 122 ms     | **-231 ms** |
-| `removeChild` self       | 1,731 ms   | 1,637 ms   | flat (noise) |
-| `(program)` self         | 2,152 ms   | 2,215 ms   | flat (noise) |
-
-The `removeOverflow` drop was the surprise. Going in, the
-concern was that bigger `maxChars` (now ~4500 instead of
-177) would mean larger overshoot when overflow fired -- so
-`extractContents` / `deleteContents` would have more nodes
-to detach. The opposite happened: `removeOverflow` self
-dropped two-thirds. The reason is the call count, not the
-per-call size. With `maxChars=177` the renderTo loop
-checked at every 177-char interval, but many of those
-checks were *near* the page boundary, where the walker in
-`findOverflow` did real work even when returning null
-(walking nodes to test text-break candidates that don't
-quite fit). With `maxChars=4500`, the very first check on
-most pages fires right at the overflow point; the walker
-runs once per page instead of several times, and the per-
-call work it does is roughly the same as before.
-
-PDF output is byte-identical to the pre-fix build
-(16.1 MB, same checksum on the raw Chromium output).
-
-### Why the average was the wrong statistic
-
-The textbook reason to track a running average is to
-estimate a stationary quantity in the presence of noise.
-The thing being estimated here -- "how many chars fit on a
-full page" -- is a tight ceiling, not a noisy reading: each
-page's textContent.length either equals page capacity
-(because the page broke for overflow) or is well below it
-(because content ran out / a CSS break fired). The
-distribution is bimodal, and the average sits between the
-modes -- exactly where it's worst as an estimator of
-either.
-
-The running max, by contrast, finds the upper mode and
-sticks to it. It only moves down if the entire window is
-sub-capacity pages, which means the document genuinely
-doesn't have full pages anymore (end of book, perhaps), at
-which point the estimate doesn't matter much.
-
-### Where this leaves the picture
-
-Render is now ~8 s on the 1651-page book, down from ~11 s
-post-wrapContent, down from ~104 s in the original
-baseline. Updated cumulative table:
-
-| fix                                 | render saved | shipped |
-| ----------------------------------- | ------------ | ------- |
-| `--detach-pages` (display:none)     |   ~55 s      | yes     |
-| aggressive detach (`removeChild`)   |   ~22 s      | yes     |
-| `renderTo` additive backoff         |   ~4.25 s    | yes     |
-| skip dead `findEndToken` path       |   ~3.5 s     | yes     |
-| `findRef` fast-path                 |   ~2.4 s     | yes     |
-| queue-tick: rAF -> queueMicrotask   |   ~2.6 s     | yes     |
-| `finalizePage` micro-optimisations  |   ~3 s       | yes     |
-| `wrapContent` move (skip innerHTML) |   ~0.9 s     | yes     |
-| **`maxChars` propagation + max**    | **~2.9 s**   | **yes** |
-| (others, smaller)                   |   ~3 s       | yes     |
-
-The strategic conclusion at the bottom of the earlier "Where
-this leaves the picture" section updates accordingly: render
-is now roughly a quarter the size of generate (~8 s vs ~32 s
-wall on the production build), and `pageRanges` sharding
-remains the only knob with a profile target large enough to
-move the wall-clock total meaningfully -- and that target is
-generate, not render.
-
-## What happened when we tried move-not-clone
-
-A fresh `--detach-pages --no-timing --cpu-profile
---cpu-sampling 100` baseline run showed `cloneNode` at
-~146 ms self-time, all of it inside `Layout.append`'s per-
-source-node clone path. `Layout.append`'s body for the
-`!shallow` (deep-cloned leaf) yields was:
-
-```js
-let clone = cloneNode(node, !shallow);  // deep clone
-// ... attach clone to dest ...
-return clone;
-```
-
-The user's question: source's read-only-template contract
-is just an artifact of paged.js's break-and-resume model.
-We're doing offline layout -- nothing reads source after
-the render finishes. Could we MOVE the source node into
-dest instead of cloning it, and avoid the allocation cost
-entirely? Best-case ceiling estimated at ~300-450 ms /
-~3-5 % of render (the cloneNode self plus distributed GC-
-pressure relief from not allocating ~250 k duplicate DOM
-nodes).
-
-### What the refactor required
-
-Three load-bearing assumptions in the chunker break the
-moment source is mutated:
-
-1. The walker traverses via live links
-   (`node.firstChild` / `nextSibling` / `parentNode`).
-   After a leaf yield, `walker = walk$2(nodeAfter(node,
-   source), source)` reads `nodeAfter` AFTER `append` has
-   moved `node` into dest -- the reads now go into dest's
-   tree, not source's. Fix: capture `nodeAfter(node,
-   source)` BEFORE the append call and pass it to the
-   walker reset.
-
-2. `BreakToken.node` stores a source-tree reference for
-   the next page's `getStart(source, breakToken)` to
-   resume from. `createBreakToken`'s four
-   `findElement(*, source)` call sites map rendered
-   (clone) nodes back to source via shared `data-ref`.
-   With moves, source has lost the leaves and findElement
-   returns the moved node now living in dest. Fix:
-   bypass `createBreakToken` entirely. Compute the
-   resume point from the extract-and-restore step
-   instead (see `restoreOverflow` below).
-
-3. `removeOverflow`'s `deleteContents` would drop the
-   moved content forever. In the clone model that was
-   fine -- source still held a pristine copy. In the
-   move model, source needs the overflow content back so
-   the next page can render it. Fix: replace with
-   `restoreOverflow` -- `extractContents` the overflow
-   range, walk the fragment depth-first collecting leaf
-   elements, and reinsert each leaf at its stashed
-   `_srcParent` / `_srcNextSibling` position. For the
-   boundary leaf that's partially overflowing,
-   `extractContents` produces a shallow clone of the
-   leaf in the fragment; we inherit its source position
-   via `source.indexOfRefs[ref]` (which still points at
-   the original-now-in-dest, which carries the stash).
-   Reverse-order iteration so each leaf's `_srcNextSibling`
-   target is back in source by the time we insert.
-
-### The bug that taught the real story
-
-First pass rendered the book to 1740 pages -- 89 more
-than the 1651-page baseline. Content was byte-identical
-modulo timestamps. Per-page char counts in the FAQ
-section showed pages 127+ with only ~50-500 chars each:
-
-```
-[BL p127] 3045 chars      [EX p127] 438 chars
-[BL p128] 3732 chars      [EX p128] 185 chars
-```
-
-Some FAQ pages had a single short paragraph. Instrumenting
-`shouldBreak` revealed it was returning true on every
-non-first yield inside the FAQ article:
-
-```
-[instrument] shouldBreak true: tag=P  ref=6bv pba=- prevNode=ARTICLE
-[instrument] shouldBreak true: tag=B  ref=6bx pba=- prevNode=ARTICLE
-[instrument] shouldBreak true: tag=P  ref=6by pba=- prevNode=ARTICLE
-... (one per FAQ paragraph)
-```
-
-The `<p>` elements have no `data-break-before` and no
-`data-previous-break-after`, so the fire is via
-`needsPageBreak(node, previousNode)` -- which checks
-whether `node`'s effective `data-page` differs from
-`previousNode`'s.
-
-`previousNode` is computed via
-`nodeBefore(node, limiter)`, which walks
-`node.previousSibling` then climbs via `parentNode` if
-no significant sibling exists. In the move model, after
-the previous yield was moved out of source, the current
-yield's `previousSibling` is `null` (the previous one no
-longer lives in source). The climb continues up:
-FAQ article (no `data-page`) -> looks at its previous
-sibling -> finds the **part-divider article** sitting
-right before the FAQ article in source, which DOES carry
-`data-page="divider"` (set by processBreaks for the CSS
-`page: divider;` rule on `article.part-divider`).
-
-So `needsPageBreak` saw a transition from
-`page="divider"` to (effectively) no page, fired true,
-and the chunker started a fresh page for every paragraph
-in the FAQ section. The chapter article's normal
-"siblings share the same effective page-name" property
-broke because the sibling-walk now escapes the chapter
-into the prior part-divider.
-
-### Fix: track previousLeaf in renderTo
-
-The chunker already knows the right answer: the last
-leaf it actually appended this page. Threaded through
-`shouldBreak` as a third argument, used by the
-`needsPageBreak` branch only (`needsBreakBefore` and the
-`parentBreakBefore` logic still use `nodeBefore`):
-
-```js
-let _moveLastLeaf = null;
-// ... in the loop ...
-if (hasRenderedContent &&
-    this.shouldBreak(node, start, _moveLastLeaf)) { ... }
-// ... after append ...
-if (!shallow) _moveLastLeaf = node;
-```
-
-In `shouldBreak`:
-
-```js
-let pageBreakRef = previousLeaf || nodeBefore(node, limiter);
-return ... || needsPageBreak(node, pageBreakRef);
-```
-
-With that, page count went 1740 -> 1653 (within 2 of
-baseline) and per-page content matched. PDF
-byte-equivalent to baseline within timestamp drift.
-
-### Profile diff
-
-Both runs `--detach-pages --cpu-profile --cpu-sampling
-100`, sample-time absolute, single run each (wall-clock
-on this machine is too noisy to be a useful signal --
-see "Methodology: compare profiles, not wall-clock"
-above):
-
-| function | baseline | move | Δ |
-| --- | --- | --- | --- |
-| `getBoundingClientRect` | 3539 ms | 4036 ms | **+497** |
-| `appendChild` | 137 ms | 390 ms | **+253** |
-| `restoreOverflow` (new) | -- | 168 ms | +168 |
-| `removeChild` | 1536 ms | 1635 ms | +99 |
-| `insertBefore` | <50 ms | 87 ms | ~+87 |
-| `getNodeWithNamedPage` | <50 ms | 108 ms | ~+85 |
-| `afterPageLayout` (AtPage) | 105 ms | 182 ms | +77 |
-| `(program)` | 2196 ms | 2266 ms | +70 |
-| `Layout` ctor | 23 ms | 31 ms | +8 |
-| `cloneNode` | 146 ms | <130 ms | **-146** |
-| `removeOverflow` | 124 ms | -- (replaced) | -124 |
-| **samples** | **17,481** | **19,590** | **+2,109** |
-| **CPU work** | **9.48 s** | **10.74 s** | **+1.26 s** |
-
-Net **+1.26 s of CPU work** -- the change is a clear
-regression in the opposite direction from the prediction.
-
-### Why the prediction was wrong
-
-The cloneNode self-time saving (-146 ms) shows up as
-expected, but three structural costs dwarf it:
-
-1. **`appendChild` on an attached node is roughly 2x
-   the cost of `appendChild` on a fresh clone (+253 ms).**
-   A move is internally detach-from-source-parent +
-   attach-to-dest-parent; both touch Blink's child-list
-   bookkeeping. cloneNode produces an unparented node,
-   so the subsequent attach is one-sided. Intrinsic to
-   any move-based design -- no implementation choice
-   avoids it.
-
-2. **Each move dirties Blink's layout state more than
-   each clone does, distributing cost into gBCR
-   (+497 ms).** The increase is spread across every
-   gBCR call site -- `Page.create` (+225 ms),
-   `hasOverflow` (+152 ms), `Layout` ctor (+58 ms),
-   `afterPageLayout` (+31 ms), `addResizeObserver`
-   (+31 ms) -- not localized to any new code. Each
-   gBCR call flushes pending mutations; with every move
-   counting as two mutations vs one for clone+append,
-   each flush has more to do. Same migration pattern
-   the README's "Attempt B: memoize `Page.create`'s gBCR"
-   documented above -- DOM mutation cost doesn't go
-   away by elimination, it migrates to whichever frame
-   next forces a layout flush.
-
-3. **The extract-and-restore cycle adds ~340 ms of new
-   JS work.** `restoreOverflow` (168 ms) builds an
-   `extractContents` fragment + walks it for leaves +
-   inserts each back into source. `previousLeaf` makes
-   `shouldBreak` call `getNodeWithNamedPage` (108 ms)
-   on every leaf yield (it climbs parent chains looking
-   for `data-page`). `insertBefore` (87 ms) is the
-   per-restore reinsertion.
-
-The deeper structural reason: paged.js's break-and-
-resume model touches each source leaf O(pages-spanning-
-that-leaf) times in the move model -- moved into page N,
-extracted to the fragment, reinserted into source,
-moved into page N+1. Each touch is a DOM mutation. The
-clone model touches each node O(1) times -- allocated
-once, attached, thrown away with the page. Cumulative
-mutation count is structurally higher under moves.
-
-The cloneNode time the profile attributes to its native
-frame is just the *allocator* portion of cloning work --
-not the total cost of "duplicating a subtree". The rest
-hides in V8 / Blink native frames not labeled
-`cloneNode`, and that rest doesn't disappear when you
-switch to moves; it shows up as appendChild +
-invalidation cost instead.
-
-### Where this leaves the picture
-
-Reverted. The cumulative table from the previous
-section is unchanged. No row added.
-
-The pattern this attempt taught is the inverse of the
-"distributed savings often exceed direct estimates"
-heuristic the README documents elsewhere: sometimes a
-change with a direct cost saving has bigger distributed
-*regressions* that aren't visible until you measure.
-The cloneNode saving was real; the appendChild + gBCR +
-restoreOverflow overhead was bigger.
-
-The only design that would avoid all three costs is one
-that never re-moves the same node -- a single-pass
-paginator with no break-and-resume. That's not paged.js;
-it's a different algorithm. Not a small refactor.
-
-The buffer variant (pre-clone source once at startup,
-move from buffer to dest) was considered and not
-prototyped: it'd shift the cloneNode allocation cost to
-one big startup call but every per-page move would
-still hit the same appendChild + gBCR dynamic that ate
-the savings here. No structural win.
-
-This experiment also clarifies why the "Profiling
-pdf-lib's load" and "Findings: removeChild" sections
-saw allocation savings show up as wall-clock gains:
-those operations didn't have a Blink layout-tree
-mutation step downstream. Mutations are where the cost
-that *looks* like JS allocation actually lives in this
-codebase.
-
-## Cracking `(program)` open with a Blink-category trace
-
-The cpu profile's `(program)` row sat at ~2.2 s (23 %) of
-render and resisted attribution -- `find-callers.mjs` puts
-it directly under `(root)`, the V8 sampler's structural
-floor for "isolate is on-CPU but no JS frame on top." To
-see *what* native code was running there, the harness gained
-a `--tracing` flag and a companion `analyze-trace.mjs`.
-
-The flag wraps the render phase in `page.tracing.start()`
-with Blink-relevant categories (`devtools.timeline`,
-`disabled-by-default-devtools.timeline`, `blink`, `v8`,
-`v8.execute`, `disabled-by-default-v8.cpu_profiler`) and
-writes `trace.json` to the results folder. The
-`v8.cpu_profiler` category embeds V8 sampling-profile data
-as `Profile` / `ProfileChunk` events inline with the Blink
-trace events, so the single trace file is *hybrid*: loaded
-in Chrome DevTools Performance or [ui.perfetto.dev](https://ui.perfetto.dev)
-it renders JS call stacks aligned with Blink events on the
-same timeline (the de facto answer to "what was `(program)`
-doing?"). Cost: ~2x file size (e.g. 22 MB -> 52 MB on the
-1651-page book) and ~0.4 s wall-clock for the extra sampler
-work -- both noise on the analysis side.
-
-`analyze-trace.mjs` walks the trace's complete-phase
-events on `CrRendererMain`, computes self-time per event
-name via a nested-event stack walk (same shape as
-`analyze-profile.mjs` for cpuprofiles), and prints a
-top-N table. A `--children <name>` mode breaks any
-parent event into its direct callees, mirroring
-`find-callees.mjs`. It ignores the embedded V8 cpu samples
--- those are consumed by the viewers above (DevTools /
-Perfetto) or, for terminal use, by `analyze-hybrid.mjs`,
-which combines V8 sample stacks with Blink event nests
-into a single bottom-up / callees view.
-
-### What's on the main thread
-
-Top events by self-time on a fresh `--detach-pages
---no-timing --render-only --tracing` run, 1651-page book,
-9.07 s render:
-
-| event                                    | self_ms | self_% |
-| ---------------------------------------- | ------- | ------ |
-| `RunMicrotasks`                          | 3039.42 | 33.5 % |
-| `LocalFrameView::performLayout`          | 1800.31 | 19.9 % |
-| `Document::recalcStyle`                  | 1785.55 | 19.7 % |
-| `InlineNode::ShapeTextIncludingFirstLine`|  526.64 |  5.8 % |
-| `Document::rebuildLayoutTree`            |  484.88 |  5.4 % |
-| `FunctionCall`                           |  285.89 |  3.2 % |
-| `v8.callFunction`                        |  251.48 |  2.8 % |
-| `Blink.CompositingInputs.UpdateTime`     |  130.77 |  1.4 % |
-| `Blink.PrePaint.UpdateTime`              |  118.90 |  1.3 % |
-| `Document::updateStyle`                  |  101.65 |  1.1 % |
-| ... 189 smaller events ...               |         |        |
-
-Mapping these onto the cpu profile's labels:
-
-| cpu profile row | trace decomposition |
-| --- | --- |
-| `getBoundingClientRect` self 3.7 s | `performLayout` 1.8 s + `recalcStyle` 1.8 s -- the layout flush gBCR triggers, which the cpu profile lumps under the native frame. |
-| `removeChild` self 1.6 s | `rebuildLayoutTree` 0.5 s + portions of `recalcStyle` / `performLayout` -- each removeChild dirties style and layout. |
-| `(program)` self 2.2 s | `RunMicrotasks` 3.0 s mostly. The cpu profile attributes a chunk of this to neighbour rows; what's left under `(program)` is the V8 runtime plumbing that has no JS frame on top. |
-| `(garbage collector)` 100 ms | Sum of `V8.GC_*` events ≈ 135 ms. |
-
-So `(program)` is essentially **the V8 runtime inside a
-microtask continuation**. The natural follow-up is "which
-microtask, and what's it doing?"
-
-### Inside `RunMicrotasks`
-
-`--children RunMicrotasks` shows the parent fired only
-**15 times** across the whole render, totalling 7.14 s:
-
-```
-parent: RunMicrotasks  hits: 15  total: 7142.49ms  self: 3039.42ms (42.6%)
-
-   total_ms  total_%     hits   child
-   --------  -------   ------   --------------------------------
-   3442.01   48.19%    39437   Document::UpdateStyleAndLayout
-   3039.42   42.55%       15   (self / unattributed)
-    547.98    7.67%   181106   v8.callFunction
-     50.99    0.71%      892   Blink.Style.UpdateTime
-     34.88    0.49%      205   V8.StackGuard
-     17.05    0.24%        6   MinorGC
-```
-
-Listing the 15 events by duration:
-
-```
-rm[0]   70.89 ms   -- one early-render burst (the parser)
-rm[1..3]  < 1 ms  -- empty-trigger settle ticks
-rm[4]  7071.14 ms  -- THE render loop
-rm[5..14]  < 1 ms each  -- post-render cleanup
-```
-
-**One event accounts for 99.0 % of the parent total.**
-rm[4] envelopes essentially the whole render. V8 batches
-the ~6 `await` boundaries inside `Chunker.flow()`
-(beforeParsed / filter / afterParsed / loadFonts /
-render / afterRendered) -- all of which Phase 1 of the
-async cleanup turned into `await undefined` fast-paths --
-into a single drained microtask continuation. There is
-**no per-page microtask cost**. The async stripping did
-its job.
-
-### The 181,106 `v8.callFunction` callbacks
-
-The first thing that looked like a smoking gun --
-"181k dispatches sounds per-page-shaped" -- turned out
-to be **one DOM walk**. Aggregating FunctionCall events
-by `args.data.functionName + lineNumber`:
-
-```
-hits      dur_ms   functionName:line
-181041    296.54   (anon):32455  (paged.browser.js)
-     2      0.25   request.onload:27495
-```
-
-paged.browser.js:32455 is `WhiteSpaceFilter.filter`'s
-TreeWalker callback:
-
-```js
-filterTree(content, (node) => {
-    return this.filterEmpty(node);
-}, NodeFilter.SHOW_TEXT);
-```
-
-The walker visits every text node in the parsed
-document and calls the lambda. For our 5.5 MB book
-that's 181,041 invocations, all clustered in the first
-685 ms of rm[4]. Same `(node) => this.filterEmpty(...)`
-arrow allocated once but called from C++→JS 181k times,
-so V8 emits a `v8.callFunction` event each invocation.
-
-These aren't 181k microtasks. They're 181k synchronous
-TreeWalker callbacks nested inside the one big
-continuation. The "callbacks per page" framing was a
-mirage produced by dividing 181k by page count.
-
-### What's actually in `(program)`'s 2.2 s
-
-Triangulating the trace and cpu profile:
-
-- **~1.7 s** is V8 dispatch glue for the 181k filter
-  walk callbacks + remaining native→JS transitions
-  inside the continuation. V8 charges this to
-  `RunMicrotasks` self in the trace; the cpu profile
-  splits it between `(program)` and rows like `v8.callFunction`.
-- **~0.3 s** is V8 IC / inline-cache miss handling on
-  the per-page hot path. Each polymorphic call site
-  pays a stub-call indirection that lands in `(program)`.
-- **~0.1 s** is Blink microtask checkpoint code -- the
-  auto-style-and-layout pass that fires whenever a
-  microtask drains. The `Document::UpdateStyleAndLayout`
-  events under `RunMicrotasks` (3.44 s) attribute the
-  work *itself* to named Blink rows; the C++ glue
-  bracketing each call lands in `(program)`.
-- The remainder is V8 scheduler bookkeeping, microtask
-  queue drain machinery, and small unnamed natives.
-
-None of this is a *per-page* cost. Reducing further
-would require either (a) eliminating the filter walk,
-or (b) reducing the per-page hot path's native→JS
-transition count -- which is dominated by gBCR-driven
-layout flushes that we've already pushed against
-unsuccessfully in earlier sections (Attempts B, D from
-the "createBreakToken dedup" investigation).
-
-### The "actionable finding" that wasn't: WhiteSpaceFilter
-
-The whitespace filter walk costs **~685 ms once per
-render** -- 296 ms inside the JS callback bodies plus
-~390 ms in TreeWalker dispatch overhead. The initial
-read was "this is doing nothing useful for compressed
-HTML, short-circuit it." Wrong on both counts.
-
-Branch-counting the filter via a one-shot probe (count
-every branch in `filterEmpty`, dump to the harness
-console):
-
-```
-total:        181,106  every text node visited
-  length === 0:       0
-  length === 1:  38,685  (21.4%)  collapsed inter-element spaces
-  length > 1, !ignorable: 101,930  (56.3%)  real content -- hot path
-  length > 1, ignorable:  40,491  (22.4%)  whitespace-only, body runs
-    inside <pre>:        3,408   no-op (REJECT)
-    middle position:    27,901   textContent = " " (mutated)
-    left edge:           5,405   removeChild (accepted)
-    right edge:          3,777   removeChild (accepted)
-    orphan:                  0
-```
-
-**22.4 % of calls entered the body** and 37,083 actual
-DOM mutations happened: 9,182 nodes removed +
-27,901 nodes overwritten to single spaces. Far from
-zero.
-
-The premise was based on a misreading of html-compress:
-the plugin does collapse inter-element whitespace, but
-the `:site, :pre_render` gate that picks which pages it
-processes explicitly excludes `book.html` (which uses
-the minimal `book-combined` layout that doesn't reach
-`vendor/compress`; same README's html-compress section
-calls this out). Source indentation is preserved in
-the PDF input, so paged.js sees the raw multi-char
-whitespace text nodes. The filter is load-bearing --
-its mutations are what subsequent chunker walkers
-rely on to skip whitespace cheaply.
-
-The 0.83 % of calls that exceeded 4 us in the trace's
-dur histogram came from this body running; the
-histogram undercounted body entries because the
-short-branch (`closest("pre")` → REJECT) takes only
-~2-3 us, indistinguishable from the hot path in the
-0-4 us buckets. Branch counters were needed to reveal
-the true split.
-
-There's still optimisation headroom (the per-call
-TreeWalker dispatch is ~3 us of which only ~1.5 us is
-the body), but it requires changing the algorithm
-rather than skipping it: e.g. a hand-rolled JS recursion
-that avoids the C++→JS transition per node, or
-folding WhiteSpaceFilter + CommentsFilter + ScriptsFilter
-into a single TreeWalker pass with `SHOW_TEXT | SHOW_COMMENT`
-and a dispatcher. Net saving probably ~300-400 ms once
-per render; not investigated.
-
-The methodology lesson: a histogram of per-call dur
-**cannot** distinguish a fast body branch from a hot
-path -- both compile to 2-3 µs on V8. Branch
-instrumentation is the only way to count what each
-call actually did. The histogram suggested "0.8 %
-body entries"; reality was 22.4 %.
-
-### And we did fix it, on the Jekyll side
-
-The premise that motivated the original "actionable
-finding" -- that book.html should already be
-whitespace-collapsed when paged.js sees it -- was true
-in spirit, just wrong about whether it was being done.
-The fix landed in two parts:
-
-1. **Extend `html-compress.rb` to book.html.** The
-   layout-chain precompute now explicitly adds
-   `book-combined` to `@compress_layouts` at the end of
-   `precompute_compress_layouts!`. book.html therefore
-   passes through `compress!` once per build (~480 ms
-   of `String#split` work on the ~5.5 MB document), and
-   paged.js sees a document with inter-element
-   whitespace already collapsed to single spaces.
-
-2. **Reorder hook priorities** so that adding compress
-   to book.html composes cleanly with the other
-   `:pages, :post_render` plugins. The original
-   `:high`-priority compress ran *before*
-   `book-href-rewrite` -- whose landing-heading strip
-   removed `<h2>` blocks from three chapter openings,
-   leaving the (already-collapsed) single spaces on
-   either side adjacent and producing literal `>  <`
-   blobs. The fix is a three-tier convention: mutators
-   at `:high` (run first), compress at `:normal` (the
-   cleanup), readers at `:low` (snapshot final bytes).
-   See `_plugins/html-compress.md` for the full table.
-
-Verified: 0 outside-pre multi-whitespace runs in the
-regenerated book.html (was 3 with the
-landing-heading-strip artifacts; was 37,087 without
-compress at all). Branch-counting the WhiteSpaceFilter
-after the fix shows body entries drop from ~40 k to
-the 3,408 in-pre cases that the filter is structurally
-required to visit (and immediately REJECTs via
-`closest("pre")`). DOM mutations drop from ~37 k to 0.
-PDF output is byte-equivalent within timestamp drift.
-
-Net wall-clock is approximately neutral on full builds
-(~480 ms added to Jekyll, ~300-500 ms saved at paged.js
-render time), and a small win for incremental Jekyll
-workflows that skip the PDF (`also_build_pdf: false`):
-the compress cost is paid once per Jekyll build, the
-render saving is paid every PDF build, and decoupling
-the two is the structural improvement.
-
-A ruby-prof A/B (post-change vs pre-change with a
-single stashed-changes revert) confirmed that the only
-attributable Jekyll-side cost is exactly one extra
-`compress!` invocation (837 → 838) and its downstream
-`String#split` calls (+819 from book.html's non-pre
-segments). No plugin's call count or self-time changed
-beyond the noise floor; the priority shuffle is
-CPU-invariant for everything except the new compress
-pass on book.html.
-
-### What the trace doesn't change
-
-Nothing about the cpu profile's bottom-up table is
-wrong; the trace just resolves what `(program)` masked.
-After this exercise, the menu of remaining levers is
-unchanged:
-
-- `pageRanges` sharding for the generate phase (biggest
-  untried knob, generate is now the larger phase).
-- WhiteSpaceFilter -- the trace and a follow-up cpu-
-  profile A/B (see next section) eventually showed this
-  *is* skippable for our pipeline once html-compress has
-  done the work at Jekyll time. Worth ~600 ms / 6 %.
-- Everything else lives below the noise floor.
-
-The cpu profile's `(program)` row isn't a structural
-smell or a missed microtask -- it's the fixed cost of V8
-running the JavaScript we already have, accounted for
-honestly by the trace and accounted for opaquely by
-the JS sampler.
-
-## Disabling the filter outright: paired cpu-profile A/B
-
-The "actionable finding that wasn't" + "and we did fix
-it, on the Jekyll side" pair above closed with two
-conclusions:
-
-1. WhiteSpaceFilter does real work on book.html
-   (37k DOM mutations pre-compression, 0 post-).
-2. Post-compression the filter is essentially a no-op
-   visit over 181k text nodes, and skipping it doesn't
-   save measurable wall-clock -- a 3+3 wall-clock A/B
-   showed 8.78 s avg with filter vs 8.53 s without, well
-   inside the 1.17 s within-variant noise band.
-
-Conclusion (1) is correct. Conclusion (2) was wrong --
-specifically the "no measurable saving" claim and the
-flush-migration explanation I attached to the ~+180 ms
-gBCR move that appeared in a single-run profile pair.
-
-A reader pointed out the flush-migration reasoning was
-incoherent: `WhiteSpaceFilter.filter` runs *once* in
-`Chunker.flow()` *before* any page is created. The body
-of `filterEmpty` reads `textContent`, walks parents via
-`closest("pre")`, and walks siblings -- none of which
-read layout-flushing properties (`gBCR`, `offsetTop`,
-computed style, etc.). There is no flush for migration
-to migrate from. Whatever the +180 ms gBCR move in the
-single-run pair was, it wasn't "the filter's flush load
-deferring to the next gBCR." It was single-run noise on
-a 38 % row -- which has a much wider noise band than
-the README's "50-150 ms for sub-1 % rows" methodology
-note covers.
-
-### The proper A/B
-
-Three filter-on (A) and three filter-off (B) cpu-profile
-runs, interleaved A1 B1 A2 B2 A3 B3 so system-load
-variance hits both sides equally. The probe is a one-line
-`return;` at the top of `WhiteSpaceFilter.filter` --
-skip the TreeWalker entirely. Toggle is a single edit
-between runs. Both states are otherwise identical
-(post-compression book.html, current bundle).
-
-Per-run totals from
-[`perf/ab-aggregate.mjs`](ab-aggregate.mjs):
-
-| run | total CPU |
-| --- | --- |
-| A1 (filter ON)  | 11,120 ms |
-| A2 (filter ON)  | 10,270 ms |
-| A3 (filter ON)  |  9,727 ms |
-| **A mean**      | **10,372 ms** |
-| B1 (filter OFF) |  9,744 ms |
-| B2 (filter OFF) | 10,189 ms |
-| B3 (filter OFF) |  9,180 ms |
-| **B mean**      |  **9,705 ms** |
-| **Δ (B - A)**   |   **-668 ms (-6.4 %)** |
-
-The within-group ranges are ~1.3 s (A) and ~1.0 s (B),
-so the -668 ms total-CPU delta sits at roughly 1 σ of
-within-variant spread. By itself, that's a soft signal.
-
-But per-row breakdown is tighter:
-
-| row | A mean ± sd | B mean ± sd | Δ |
-| --- | --- | --- | --- |
-| `getBoundingClientRect`         | 4128 ± 309 | 3791 ± 163 | **-338 ms** |
-| `(program)`                     | 2243 ± 56  | 2328 ± 173 | +85 ms (noisy) |
-| `removeChild`                   | 1619 ± 63  | 1564 ± 43  | -55 ms |
-| `afterPageLayout` @ paged.js    |  150 ± 26  |  119 ± 17  | -32 ms |
-| **`filterTree` self**           | **88 ± 14** |  **2 ± 1** | **-86 ms** |
-| `(garbage collector)`           |  103 ± 6   |   92 ± 4   | -11 ms |
-| `handleAlignment`               |   70 ± 5   |   56 ± 7   | -14 ms |
-| `create` (`Page.create`)        |   66 ± 7   |   50 ± 4   | -15 ms |
-| `sortDisplayedSelectors`        |   60 ± 10  |   46 ± 1   | -14 ms |
-| **`filterEmpty` self**          | **37 ± 2** |    **0**   | **-37 ms** |
-
-Direct attribution (the filter rows that vanish in B):
-
-- `filterTree` self: -86 ms
-- `filterEmpty` self: -37 ms
-- ~123 ms
-
-Indirect attribution (rows that shrink in B despite
-unchanged call counts -- see the trace data above
-where Document::UpdateStyleAndLayout, recalcStyle and
-performLayout all run ~14-15 % cheaper per call with
-filter off):
-
-- `getBoundingClientRect`: -338 ms
-- `removeChild`: -55 ms
-- `afterPageLayout @ paged.js:30458` (paged.js core): -32 ms
-- `create`: -15 ms
-- `handleAlignment`: -14 ms
-- `sortDisplayedSelectors`: -14 ms
-- `(garbage collector)`: -11 ms
-- smaller rows: ~50 ms
-- ~529 ms
-
-Direct + indirect ≈ 652 ms, in the neighbourhood of
-the -668 ms total-CPU delta. They corroborate.
-
-### Why the filter has indirect cost
-
-The single-trace measurement above (filter-off trace
-captured for the same render) made the indirect path
-visible: with filter off, `Document::UpdateStyleAndLayout`
-total dropped by 574 ms across an *unchanged* 39,437
-call count -- ~14 µs less per call. `recalcStyle` and
-`performLayout` similarly dropped ~14 % per call.
-Plausibly:
-
-- V8's polymorphic inline caches stay warmer on the
-  per-page hot path when 181 k extra C++→JS
-  dispatches haven't been churning them.
-- Blink's main-thread scheduler has fewer task
-  boundaries to bookkeep across.
-- Allocator/GC pressure is lower (the filter walk
-  allocates per-callback closures and intermediate
-  strings, even when each callback just returns
-  FILTER_REJECT).
-
-None of those are "the filter triggers a layout
-flush." Layout work *itself* gets cheaper because the
-ambient V8/Blink state is less polluted. Same per-call
-mechanics, slightly faster main-thread context.
-
-### The fix: config flag, default off
-
-`window.PagedConfig.runWhitespaceFilter` gates the
-walk. Default is undefined (falsy) -- our pipeline runs
-`html-compress` on book.html, so the filter has
-nothing to do and skipping it saves the ~600 ms.
-
-Anyone running paged.js against an uncompressed
-document can set the flag before `PagedPolyfill.preview()`
-to opt back in. The class itself is unchanged so the
-opt-in path is byte-equivalent to the original.
-
-The opt-in semantic is the conservative choice: paged.js
-upstream and many downstream users feed it untouched
-HTML (with inter-element indentation surviving), where
-the filter does meaningful cleanup. Disabling it for
-*every* caller of this bundle would be a regression for
-those use cases. Disabling it by default for *our*
-pipeline is fine because we control the input
-end-to-end.
-
-Cost: zero per-page work (the gate is one `&&`-chain
-check at startup), structural correctness for clean
-documents, opt-in safety valve for everyone else.
-
-### Methodology note
-
-The wall-clock A/B was correct in claiming "the saving
-is below the wall-clock noise floor for short N." It
-was wrong in concluding "therefore no saving exists."
-Two corrections:
-
-1. Aggregate CPU work across paired profiles. Wall-clock
-   noise is ~1 s per run on this machine; CPU sample
-   totals are also ~1 s per run but the row-by-row
-   self-time deltas can be much tighter. The
-   `filterTree` row goes from 88 ms (sd 14) to 2 ms (sd
-   1) -- a 6 σ shift. Per-row analysis can see signals
-   that per-run totals lose.
-
-2. Use *enough* paired runs that within-group SD lets
-   you compute mean ± SD honestly. 3+3 is the bare
-   minimum (gives 1 σ confidence on row-level deltas
-   for things that change by 5+ σ). 5+5 or 10+10 would
-   tighten the gBCR delta confidence further -- worth
-   doing for finer signals.
-
-The probe + aggregator are reusable
-([`perf/ab-aggregate.mjs`](ab-aggregate.mjs)): point at
-6 `ab-*.cpuprofile` files and it prints the mean ± SD
-table. Pattern fits any future "does this change save
-CPU?" question where wall-clock noise is the obstacle.
-
-## Following `RunMicrotasks` down to zero
-
-The trace section above pinned the cpu profile's
-`(program)` row to V8 running JS inside a microtask
-continuation. With the WhiteSpaceFilter gone the
-`--children RunMicrotasks` breakdown still showed one
-`rm[4] = 6262 ms` event enveloping essentially the
-whole render -- 15 hits total, 99 % concentrated in one
-batched drain. That raised a sharper question: if the
-per-page hot path is sync (Phase 1 + 2 above), why is
-*any* of the render running inside a microtask scope?
-
-### What was still async, and what it cost us
-
-The README's earlier "What's still async, and why"
-inventory was honest about the surviving await sites at
-that point:
-
-- `Chunker.flow()` -- async wrapper, awaited
-  `beforeParsed` / `afterParsed` / `afterRendered` hook
-  triggers, `loadFonts()`, and `chunker.render()`.
-- `Chunker.render()` -- thin async wrapper around the
-  sync `renderer.next()` loop, kept so `flow()` could
-  `await` it.
-- `Chunker.clonePage()` -- async, awaited three
-  per-page hooks. Footnotes-only caller, dead path for
-  our content but live in the bundle.
-- `PagedPolyfill.preview()` -- async, awaited
-  `beforePreview` / `afterPreview` hooks plus
-  `polisher.add` and `chunker.flow`.
-- `Polisher.add()` / `Polisher.convertViaSheet()` /
-  `Sheet.parse()` -- async chain to fetch and parse
-  external stylesheets. `Polisher.add` did
-  `Promise.all` over the inputs.
-- `Chunker.loadFonts()` -- returned `Promise.all` of
-  `fontFace.load()` for any face not yet in state
-  "loaded".
-- `request()` -- async XHR + `Promise` wrapper, used by
-  the polisher chain to fetch each `<link rel="stylesheet">`
-  URL.
-
-Cost of each: small. Cost of all of them together: V8
-sees an unbroken await chain from `page.evaluate(async
-() => { await PagedPolyfill.preview(); })` down to
-`document.fonts.ready` (the one genuinely-async
-dependency in the chain). When that promise resolves V8
-schedules a microtask to resume `flow()`. Phase 1 + 2
-of the async cleanup made the *body* of the resumed
-function execute synchronously, so once it resumes it
-runs ~6.2 s straight to the end of the render. V8
-correctly attributes the whole continuation to the
-`RunMicrotasks` host frame, since that's the C++ frame
-on the stack while the resumed JS runs.
-
-So `RunMicrotasks` self-time being 2.89 s wasn't a
-sign of microtask overhead -- it was the bookkeeping
-label V8 puts on continuation-style work. Every named
-Blink event nested inside (`Document::UpdateStyleAndLayout`,
-`recalcStyle`, `performLayout`, etc.) appeared in the
-trace as a child of `RunMicrotasks`. Same shape applied
-in the cpu profile: `(program)` is the catch-all bucket
-V8 picks when no JS frame sits on top of the stack at
-sample time, and a microtask continuation is exactly
-that condition.
-
-The bucket name was misleading, but the cost itself was
-real -- the JS *running* inside the continuation
-*was* paged.js doing its per-page work. No "microtask
-plumbing overhead" to slim down. The only way to remove
-the `RunMicrotasks` attribution was to stop wrapping the
-render in a microtask continuation entirely -- i.e.,
-make the whole chain synchronous so V8 has no async
-scope to attribute to.
-
-### Why this is OK for our pipeline (and not for upstream)
-
-Upstream paged.js needs the async machinery. Its target
-deployment is an interactive browser page: real
-stylesheet fetches over HTTP (genuinely async), font
-loads against the OS (genuinely async), user-registered
-handlers that may load external resources or do
-expensive work between page renders (async-friendly to
-keep the page responsive). The await chain is the
-canonical pattern for "yield to the browser between
-expensive steps so the UI thread can paint."
-
-Our pipeline has none of those constraints:
-
-- `page.goto(url, { waitUntil: 'load' })` settles
-  *before* paged.js is invoked. Every font, image, and
-  stylesheet referenced by `<link>` / `@font-face` /
-  `<img>` is already loaded by the time the render
-  starts. The async checks are no-ops.
-- The headless renderer has no compositor coordinating
-  with us, no paint budget to respect, no user looking
-  at the page. Blocking the main thread for 8 s is
-  fine -- nobody's watching.
-- All registered handlers in our build are synchronous.
-  The `_assertSync` guard from the Phase 1/2 cleanup
-  has been in place for the per-page hot path for a
-  while; we just hadn't extended the pattern to the
-  once-per-render hooks.
-- The stylesheet fetches the polisher does are local
-  `file://` URLs. Sync XHR resolves them in microseconds.
-
-So the entire async surface in paged.js -- which
-upstream needs -- is, for our specific use case, the
-opposite of helpful: it pushes work into microtask
-continuations that show up as `RunMicrotasks` in the
-trace and `(program)` in the cpu profile, instead of
-landing under honest names like `RunTask` and
-`EvaluateScript`.
-
-### The conversion
-
-Nine functions in `docs/lib/paged.browser.js` switched
-from `async` to plain sync, marked
-`[PATCH: sync-chain]` at each site:
-
-| function | what changed |
+| `detach-pages` handler | [01](notes/01-baseline-and-detach.md) | ~55 s render |
+| Incremental PDF writer | [01](notes/01-baseline-and-detach.md) | ~32 s process |
+| pdf-lib `parseSpeed: Fastest` | [01](notes/01-baseline-and-detach.md) | ~3 s process |
+| Drop `pagedjs-cli` dependency | [01](notes/01-baseline-and-detach.md) | (cleanup) |
+| `finalizePage` micro-opts | [02](notes/02-finalizepage.md) | ~3 s render |
+| Aggressive detach (`removeChild`) | [02](notes/02-finalizepage.md) | ~22 s render |
+| Skip dead `findEndToken` path | [02](notes/02-finalizepage.md) | ~3.5 s render |
+| `renderTo` additive backoff | [02](notes/02-finalizepage.md) | ~4.25 s render |
+| Puppeteer 22→25 bump | [03](notes/03-puppeteer-bump-findref.md) | ~20-30 s generate |
+| `findRef` fast-path fix | [03](notes/03-puppeteer-bump-findref.md) | ~2.4 s render |
+| `requestAnimationFrame` → microtask | [03](notes/03-puppeteer-bump-findref.md) | small, but unblocks later optimisations |
+| Strip async machinery (Phase 1+2) | [04](notes/04-sync-and-inner-loop.md) | re-attribution + small |
+| `Layout.append` parent-lookup cache | [04](notes/04-sync-and-inner-loop.md) | ~0.3 s render |
+| `Hook.triggerSync` empty fast-path | [04](notes/04-sync-and-inner-loop.md) | small |
+| Footnotes self-disable when none | [04](notes/04-sync-and-inner-loop.md) | small |
+| Skip `wrapContent` innerHTML roundtrip | [04](notes/04-sync-and-inner-loop.md) | ~0.9 s render |
+| Adaptive `maxChars` bug fixes | [04](notes/04-sync-and-inner-loop.md) | ~1 s render |
+| Disable WhiteSpaceFilter | [05](notes/05-blink-trace.md) | ~0.7 s render |
+| Full sync chain (RunMicrotasks → 0) | [06](notes/06-microtasks-pageranges-css.md) | re-attribution |
+| `--disable-gpu` + `--in-process-gpu` | [07](notes/07-memory.md) | ~200 MB memory |
+
+What was tried and didn't ship:
+
+- Binary-search `Layout.textBreak` ([02](notes/02-finalizepage.md))
+- Memoize `Page.create`'s `getBoundingClientRect` ([02](notes/02-finalizepage.md))
+- Four of five `createBreakToken` dedup attempts ([02](notes/02-finalizepage.md)) -- Attempt E shipped as the `renderTo` additive backoff above
+- Six cheaper-`removeChild` variants ([03](notes/03-puppeteer-bump-findref.md))
+- Move-not-clone instead of clone+detach ([05](notes/05-blink-trace.md))
+- `pageRanges` sharding for `generate` ([06](notes/06-microtasks-pageranges-css.md))
+- Forced GC between render and generate ([07](notes/07-memory.md))
+
+## Investigation log
+
+The seven phase files in [`notes/`](notes/) cover the full investigation
+narrative. Each can be read on its own, but they are written in
+chronological order, and later ones refer back to earlier ones for context.
+
+| File | Covers |
 | --- | --- |
-| `request()` | Async XHR + `new Promise` + `Response` wrapper → sync XHR (`open(...,false)`) returning body text directly. Both callers (`Polisher.add` / `convertViaSheet`) only ever consumed `response.text()` (itself async per spec), so returning text skips that boundary too. |
-| `Sheet.parse()` | Three `await hook.trigger(...)` → `_assertSync(triggerSync(...))`. CSS-parser hooks all sync in our build. |
-| `Polisher.convertViaSheet()` | Drop awaits on `sheet.parse` / `request` / recursive `convertViaSheet`. |
-| `Polisher.add()` | Drop the `Promise.all` + then-chain entirely. Walks arguments once, feeds each through the sync pipeline. |
-| `Chunker.loadFonts()` | `Promise.all(fontFace.load())` → sync walk of `document.fonts` that throws if any face's `status !== "loaded"`. The throw is a safety net; `page.goto({waitUntil:'load'})` settles fonts in practice. |
-| `Chunker.clonePage()` | Three per-page hook awaits → `_assertSync`. Cold path (Footnotes-only). |
-| `Chunker.render()` | Strip `async`. Body was already sync after the Phase 1/2 cleanup. |
-| `Chunker.flow()` | Strip `async`; five await sites → sync calls / `_assertSync`. |
-| `PagedPolyfill.preview()` | Strip `async`; two hook awaits → `_assertSync`; drop awaits on `polisher.add` / `chunker.flow`. |
-
-Plus the two external callers in
-[`perf/measure.mjs`](measure.mjs) and
-[`docs/render-book.mjs`](../docs/render-book.mjs):
-both did `page.evaluate(async () => { await
-window.PagedPolyfill.preview(); })`. The inner IIFE is
-now a plain sync arrow; the outer `await` is just the
-CDP round-trip puppeteer needs to ferry control back.
-
-The `_assertSync` helper (from the earlier
-"sync chain end-to-end through the per-page hot path"
-work) is the load-bearing safety net throughout: if any
-future hook handler returns a thenable, the chain
-throws with a useful error message instead of silently
-swallowing async work. The contract is now:
-
-> Every hook handler in this bundle is sync. Every
-> external resource referenced by the document is
-> loaded before `PagedPolyfill.preview()` runs.
-
-If either invariant breaks, `_assertSync` or
-`loadFonts`'s throw catches it loudly.
-
-### Results
-
-Paired `--detach-pages --no-timing --render-only
---tracing` run on the 1651-page book, comparing the
-pre-conversion trace ([results from
-"Inside RunMicrotasks" above]) against the post-:
-
-| metric | pre-sync | post-sync | Δ |
-| --- | --- | --- | --- |
-| render wall | 8.13 s | 8.36 s | flat (within single-run noise) |
-| trace event count | 250,376 | 255,949 | flat |
-| `RunMicrotasks` self | 2890.66 ms (35.6 %) | **0.56 ms** (off top-30) | **-2890 ms (-99.98 %)** |
-| `RunMicrotasks` total | 6333.18 ms | **0.56 ms** | **-6333 ms** |
-| `RunMicrotasks` hits | 15 | 12 | -3 |
-| `RunMicrotasks` rm[4] dur | 6262.34 ms | gone | -6262 ms |
-| `RunTask` self (top-30) | (below threshold, ~16 ms) | **2984.11 ms (34.6 %)** | **+2968 ms** |
-| `RunTask` hits | (~few hundred) | **1005** | re-attributed |
-| `RunTask` total | (small) | **8630.80 ms** | the whole render |
-| `Document::UpdateStyleAndLayout` total/hits | 3320 / 39675 | 3515 / 39675 | flat |
-| `Document::recalcStyle` self | 1737 ms | 1877 ms | flat |
-| `LocalFrameView::performLayout` self | 1737 ms | 1881 ms | flat |
-| per-page ratio (last/first quarter) | 1.36x | 1.27x | slight improvement (noise band) |
-| pages | 1651 | 1651 | identical |
-| PDF size (full render, separate run) | 16.1 MB | **16.1 MB** | byte-equivalent |
-
-The headline number is the **6333 → 0.56 ms collapse**
-in `RunMicrotasks` total. The 12 surviving sub-ms hits
-are pure puppeteer/CDP plumbing (one `AsyncTask Run`
-child = 0.01 ms; the rest are V8 internal MT-checkpoint
-runs). There is no remaining JS executing inside a
-microtask continuation -- the render runs as a plain
-synchronous task from start to end.
-
-The work didn't disappear, it re-attributed. `RunTask`
-self-time (2984 ms) almost exactly equals the old
-`RunMicrotasks` self-time (2891 ms) plus single-run
-noise. Per-call children counts are unchanged
-(`Document::UpdateStyleAndLayout`: 39675 calls then,
-39675 calls now). Same JS, same DOM mutations, same
-layout flushes -- just no longer wrapped in a
-continuation.
-
-### What this buys
-
-**Profile readability.** A reader opening
-`render.cpuprofile` or `trace.json` after this change
-sees:
-
-- `(program)` in the cpu profile drops by the
-  proportion that was V8 runtime overhead inside the
-  continuation (the MT plumbing + dispatch glue
-  between named natives). The remaining `(program)`
-  is genuinely-unattributable V8 work (IC stubs,
-  runtime helpers).
-- `RunMicrotasks` no longer appears at the top of the
-  trace's bottom-up table. The render lands under
-  `RunTask` / `EvaluateScript` / `FunctionCall`, with
-  Blink work (`performLayout`, `recalcStyle`,
-  `rebuildLayoutTree`) as named children where it
-  belongs.
-- The cpu profile's `(idle)` row already collapsed in
-  the earlier rAF→queueMicrotask fix; this change
-  closes the symmetric gap on the JS side.
-
-**Structural simplicity.** Nine functions in the bundle
-lost the `async` keyword and the `await` site
-discipline that went with it. The render call chain is
-now top-to-bottom synchronous: `preview()` calls into
-`flow()` calls into `render()` calls into `*layout()`,
-plain returns all the way down. Anyone tracing through
-the bundle for a perf investigation can read the
-control flow without modeling promise resolution
-ordering.
-
-**Single contract.** The hook surface is now uniformly
-sync via `_assertSync`. Before the conversion, the
-per-page hooks (`beforePageLayout`, `afterPageLayout`,
-`finalizePage`, etc.) were sync-asserted while the
-once-per-render hooks (`beforeParsed`, `afterParsed`,
-`afterRendered`, `beforePreview`, `afterPreview`) used
-`await trigger(...)`. The split was historical, not
-principled. Now every hook is sync-asserted, same
-shape, same error message.
-
-### What this doesn't buy
-
-**Wall-clock.** Render goes 8.13 s → 8.36 s, which is
-within the ±1 s single-run noise band for this machine
-documented elsewhere in this README. CPU work
-re-attributes but doesn't shrink: the chunker's JS
-still runs the same way, DOM mutations still trigger
-the same layout flushes, gBCR self-time still owns
-~21 % of the trace. Phase 1's microtask-boundary
-elimination cost (~850 ms) was real because there *were*
-8 k boundaries to remove; this conversion eliminates a
-handful of additional boundaries (the once-per-render
-sites) whose per-boundary cost is small.
-
-**A path to fewer flushes.** The remaining gBCR-driven
-layout work is intrinsic to paged.js's per-page
-break-and-resume algorithm. The README's earlier
-attempts (B, D from the "createBreakToken dedup"
-investigation; the move-not-clone experiment) confirmed
-that gBCR re-attributes if you elide one site, and
-that mutations are the structural source. Synchronising
-the chain doesn't change any of that.
-
-### Verification
-
-The 1651-page book renders identically pre- and
-post-conversion -- same page count, same 16.1 MB PDF.
-The PDF differs from the previous build only by the
-expected timestamp drift (the `/CreationDate` /
-`/ModDate` entries Chrome writes per run). No content
-changes; the bundle does the same work in the same
-order.
-
-The trace's `RunTask` -> `Document::UpdateStyleAndLayout`
-hit count (39 675) matches the previous run exactly,
-confirming the per-page chunker iteration count is
-preserved through the conversion. `RunTask` ->
-`WebFrameWidgetImpl::UpdateLifecycle` at 1950 ms / 1
-hit is Chromium's final-frame lifecycle work after the
-last page is laid out, same as before -- it just shows
-up under `RunTask` instead of being attributed to a
-post-render microtask, which is also why `RunTask` self
-includes it.
-
-### What's still async, post-conversion
-
-Two surfaces remain async-shaped, both intentionally:
-
-1. **The auto-run block at [paged.browser.js:33153](../docs/lib/paged.browser.js:33153).**
-   `ready.then(async function () { ... })` fires once at
-   `DOMContentLoaded` and is gated by `config.auto !==
-   false` -- our pipeline always sets `config.auto =
-   false` before invoking `preview()`, so this branch
-   never runs. Leaving it async-shaped costs one
-   microtask scheduling at startup, sub-microsecond,
-   and preserves byte-for-byte compatibility with
-   upstream paged.js's auto-init semantics for anyone
-   running this bundle in a configuration we don't.
-2. **External `page.evaluate(...)` callers.** The
-   wrapper around `window.PagedPolyfill.preview()` in
-   `perf/measure.mjs` and `docs/render-book.mjs` is a
-   sync arrow, but `page.evaluate` itself returns a
-   Promise (CDP roundtrip). Node-side code awaits that
-   Promise. Cost is the CDP round-trip, not the JS we
-   execute.
-
-Neither contributes to the renderer's main-thread
-profile.
-
-### Cumulative trace shape
-
-For reference, the post-conversion top-of-table on
-`CrRendererMain` reads:
-
-```
-   self_ms   self_%   event                                       category
-   -------   ------   ----------------------------------------------
-   2984.11   34.58%   RunTask                                     devtools.timeline
-   1880.79   21.79%   LocalFrameView::performLayout               blink
-   1876.53   21.74%   Document::recalcStyle                       blink
-    540.06    6.26%   InlineNode::ShapeTextIncludingFirstLine     blink
-    503.09    5.83%   Document::rebuildLayoutTree                 blink
-    128.90    1.49%   Blink.CompositingInputs.UpdateTime          blink
-    123.41    1.43%   Blink.PrePaint.UpdateTime                   blink
-     99.60    1.15%   Document::updateStyle                       blink
-     76.83    0.89%   V8.GC_MC_INCREMENTAL_EMBEDDER_TRACING       v8.gc
-     43.20    0.50%   Layout                                      devtools.timeline
-     ...
-```
-
-`RunMicrotasks` no longer appears. `(self /
-unattributed)` time inside `RunTask` is 2984 ms across
-1005 hits -- average ~3 ms per task, consistent with
-"each render task does ~one page's worth of work" plus
-some longer tasks for setup / teardown. The dominant
-named children are unchanged: `UpdateStyleAndLayout`,
-`recalcStyle`, `performLayout`, `ShapeText`,
-`rebuildLayoutTree`. Same work, honest labels.
-
-Shipped.
-
-## `pageRanges` sharding: off the table for now
-
-Several sections above flag `pageRanges` sharding as
-"the biggest untried lever" for the `generate` phase --
-run `page.pdf()` N times over disjoint page ranges in
-parallel headless browsers, concatenate the resulting
-PDFs with pdf-lib, divide generate's ~43 s wall-clock by
-N. The arithmetic is appealing; the engineering isn't.
-
-A separate investigation (not in this repo) found enough
-pitfalls to make the work not worth pursuing at current
-scale. Sketch of what bit:
-
-- Each shard re-loads `book.html` and re-runs `paged.js`
-  rendering for *its* range, which means the per-shard
-  render is **not** 1/N of the original render -- paged.js
-  has to lay out all preceding pages to position the slice
-  correctly (named strings, counters, footnote numbering,
-  cross-references). Several "fixes" (skip-to-page hooks,
-  pre-rendered state injection) each broke in subtle ways
-  on the book's actual content.
-- PDF concatenation via pdf-lib reintroduces the full
-  `PDFDocument.load` cost the incremental writer avoided
-  -- need a streaming concatenator or qpdf binary
-  dependency to keep the process phase cheap.
-- Page numbers, named strings (`string(chapter-title)`),
-  and the running header rely on per-page state that the
-  Counters handler and `addEnvFunctions` rebuild from
-  document order. Sharding loses that order and breaks
-  the header on every shard boundary unless the per-shard
-  paged.js render is given the right starting state, which
-  is itself a research project.
-- Outline injection has to know cross-shard page numbers,
-  so either Chrome's native outline (which we don't ship)
-  or a post-concat outline rebuild is required.
-
-Net: even with aggressive engineering, the realistic win
-on a 1651-page book at N=4 shards is ~15-25 s of
-`generate` saved -- not the 32 s / 75 % the naive math
-suggests -- against a maintenance cost of a sharding
-harness that wraps puppeteer launch + IPC + pdf concat
-+ per-shard state setup. Below the cost/benefit bar.
-
-The lever is documented in this README because it *is*
-the largest remaining target if priorities change (e.g.
-the book grows past 3000 pages, or a CI runtime cap
-forces it). It's just not the next thing to build.
-
-### Probe results (later session)
-
-A two-shard probe in [perf/probe-parallel.mjs](probe-parallel.mjs)
-was run after the render-side speedups to see what the
-actual wall-clock floor looks like with current numbers.
-N=2, equal page-count split, no concatenation -- just
-two browsers in parallel each printing their `pageRanges`
-slice:
-
-| shard | launch | load | render | generate | total |
-| --- | --- | --- | --- | --- | --- |
-| 0 (pp 1-826)   | 1.00 s | 1.61 s | 10.37 s | 24.02 s | 35.54 s |
-| 1 (pp 827-1651)| 0.97 s | 1.61 s | 10.12 s | 24.46 s | 35.74 s |
-
-Wall clock for `Promise.all` of both: **35.94 s**. Both
-slices open via pdf-lib and the page counts add up
-exactly (826 + 825 = 1,651). Vs the ~53 s single-process
-render+generate, parallel N=2 saves ~17 s wall clock.
-
-The probe also confirms two browsers really do run in
-parallel at the OS level: generate dropped from ~43 s to
-~24 s per shard (roughly linear with a ~2-3 s per-call
-fixed overhead), which would only happen if the Skia +
-PrintCompositor workloads in the two browser trees
-weren't serialised by a shared kernel resource. So the
-"single-threaded Skia per page" finding from the
-`Page.printToPDF` survey above is per-process -- not a
-machine-wide lock.
-
-**Still not shipped.** Reasons unchanged:
-
-- Each shard re-renders the whole book to maintain
-  per-shard layout state (named strings, counters,
-  footnotes). With render at ~10 s that's now cheap
-  CPU-wise, but the memory cost is the blocker -- see the
-  "Memory: where the renderer's 1.9 GB goes" section
-  below. N=2 ≈ 5 GB peak, N=4 ≈ 10 GB peak; the CI
-  runner doesn't have that headroom.
-- Concat + outline page-number remap still needs to be
-  built. The incremental-pdf.mjs pattern extends to it
-  but it's nontrivial.
-
-Probe stays available as `node perf/probe-parallel.mjs
-[--shards N]` for re-evaluation if either constraint
-changes (CI machine grows, or book size forces it).
-
-## CSS cost attribution
-
-Render is at ~10 s on a 1651-page book, down from ~104 s
-in the original baseline. The bottom-up profile after
-all of the above changes shows no individual JS body
-above ~250 ms self-time; the dominant rows are native
-Blink work (`recalcStyle` 2.4 s, `performLayout` 2.2 s,
-`removeChild` 1.7 s) that's intrinsic to laying out and
-detaching 1651 pages of content. The remaining question:
-is any of that recalcStyle work *avoidable* via CSS
-pruning?
-
-`ab-css.mjs` automates the answer. It renders the book
-under four variants -- baseline-full (print.css +
-rouge.css), drop-rouge, drop-print-extras (only the
-always-kept Page-geometry + Chapter-boundaries sections
-of print.css), and baseline-minimal (both stripped) --
-then reports the **paired difference** of CPU sample-time
-(`Document::recalcStyle` total in particular) between
-baseline-full and each variant. Pairing immediately
-interleaves baseline + variant runs so machine-state
-drift cancels across the diff. On Windows the harness
-auto-relaunches itself under `start /affinity 0x5500
-/high` to pin to a fixed subset of cores, which on a
-Ryzen 7 cuts run-to-run variance from ~15-25 % to ~3 %.
-
-### Methodology calibration
-
-We learned the variance story the hard way. The first
-sweep used single runs per variant and CPU sample-time,
-on the theory that profile time would be
-machine-load-independent. It wasn't on this Windows dev
-box: four identical-content runs of baseline-full spanned
-9.47-16.89 s (the 16.89 was an outlier; even excluding
-it, the remaining three varied by ~12 %). At that noise
-floor, the per-section "drop-X saves N ms" rankings the
-tool was emitting were ~75 % noise. The fix had two
-parts:
-
-1. **CPU pinning via `start /affinity`** -- shipped as
-   the auto-relaunch shim in `ab-css.mjs`. Reduced
-   baseline SD on recalcStyle total from ~12-25 % to
-   ~3 %.
-2. **Paired interleaved measurement** -- run baseline
-   immediately before each variant, pair the two, take
-   the difference. Mean paired difference and SD across
-   N pairs let noise-floor rows show themselves honestly
-   (mean within ~2 σ of zero). Default N=3 pairs; bump
-   to `--runs 5` for tighter SD at the cost of wall
-   time.
-
-The original "stripping CSS saves ~740 ms" finding from
-a single manual A/B turned out to be partly real, partly
-noise, and partly confounded by what "minimal" meant.
-The manual A/B's "minimal" was just `@page` +
-`article{break-before:page}`; the tool's
-"baseline-minimal" keeps the preamble + Page-geometry +
-Chapter-boundaries sections (paged.js needs the
-string-set / @top-right / @bottom-right machinery for
-running headers and page numbers). The earlier signal
-was real, but spread across pieces the tool can and
-can't isolate.
-
-### Findings
-
-With pinning + paired diffs (3 pairs per variant):
-
-| variant | Δrecalc ms | ± SD | mean/SD | verdict |
-| --- | --- | --- | --- | --- |
-| **drop-print-extras** | **237** | **60** | **3.95** | **real signal** |
-| baseline-minimal | 193 | 246 | 0.78 | noise |
-| drop-rouge | 66 | 124 | 0.53 | noise |
-| (baseline-full mean) | 2038 | 108 SD | -- | reference |
-
-Read this as:
-
-- **print.css extras (everything beyond the always-kept
-  Page-geometry + Chapter-boundaries sections) contribute
-  ~237 ms of recalcStyle**: ~11 % of recalcStyle, ~2.4 %
-  of render. All three pairs gave Δrecalc 202, 307, 202 --
-  consistent direction and magnitude, ~4 σ from zero.
-- **rouge.css contribution is at the noise floor**
-  (66 ± 124 ms). The earlier hypothesis ("rouge.css is
-  the big spender via per-span cascade work in code
-  blocks") was wrong; the per-pair Δrecalc values were
-  38, 202, -42 -- variance too high to claim signal at
-  N=3.
-- **baseline-minimal** stripping both still lands inside
-  the noise band on this tool's run. The original manual
-  A/B's larger delta came from removing more than this
-  tool removes -- specifically the Page-geometry section
-  that the tool keeps.
-
-The per-section sweep behind `--per-print-section`
-confirmed the methodology lesson the hard way: when each
-print.css section is dropped individually, every Δrecalc
-lands within ~2 σ of zero. The 237 ms of print.css cost
-is structurally non-additive -- selectors interact in
-the cascade, the style sharing cache hits differently
-when rule count drops, and Blink's invalidation walks
-change shape based on what rules exist. Any single
-section's marginal contribution is too small to surface
-above ~60 ms of paired-diff noise; the sum-of-extras
-effect is the only real signal.
-
-### Where this leaves render
-
-Render is structurally near its floor. The biggest
-plausible CSS prune (drop-print-extras) saves ~240 ms of
-recalcStyle ≈ ~2.4 % of render, but would mean losing
-the typography that makes the PDF look like a book. The
-remaining levers all live outside render:
-
-- `pageRanges` sharding (~5-20 s in generate): off the
-  table for now (see previous section).
-- Chrome's `outline: true` (~5 s in process): one
-  `role="presentation"` preprocessor pass away from
-  shipping, but not pursued.
-
-No structurally promising next target inside render.
-
-## Memory: where the renderer's 1.9 GB goes
-
-CI runs the book build with limited RAM headroom -- the
-1651-page book is the largest job on the machine and the
-budget matters. This section measures one render's peak
-memory and breaks it down by allocator, so we know what
-levers exist if the book grows.
-
-> **Note.** Approaches that involve Chromium internals --
-> patching the binary, intercepting the SkPicture stream
-> via Frida, spawning standalone PrintCompositors via
-> Mojo, building a Chromium-linked helper binary -- were
-> researched but not shipped. They're documented
-> separately in [CHROMIUM.md](CHROMIUM.md). This section
-> covers only what's measurable from the outside through
-> public APIs.
-
-`perf/probe-memory.mjs` is the harness. It runs the full
-pipeline (load + render + generate) in a single browser
-and watches the chrome.exe process tree at 500 ms
-intervals via `sample-mem.ps1`, reporting per-process
-private bytes + working set. `perf/probe-renderer-mem.mjs`
-goes deeper -- it drives Chromium's memory-infra tracing
-to capture detailed per-allocator dumps from inside the
-renderer at three points (post-render, mid-generate,
-post-generate). `perf/analyze-mem-trace.mjs` reads the
-resulting trace.json and prints the breakdown.
-
-### Process-tree footprint
-
-Peak across the whole tree on the 1651-page book:
-
-```
-renderer (main)                 ~1,880 MB private
-utility:PrintCompositor           ~290-450 MB  (high variance)
-browser                         ~70-1,100 MB   (PDF IPC buffer; very high variance)
-gpu-process                       ~100 MB
-renderer (about:blank etc.)        ~25 MB total
-utility:network/storage            ~30 MB total
-crashpad-handler                    ~2 MB
-                                ------------
-total peak                      ~2.5-3.5 GB private
-                                ~2.7-2.9 GB working set
-```
-
-The browser-process number is the wildest -- across
-runs it ranged from 72 MB to 1.1 GB. That's the IPC
-buffer the PDF travels through on its way from the
-renderer back to puppeteer; how much accumulates depends
-on timing between Mojo write and Node read. The
-PrintCompositor utility process appears only during
-generate; it's the Chromium service that turns the
-renderer's Skia commands into PDF bytes for `page.pdf()`.
-
-### Inside the renderer
-
-memory-infra dump at post-generate, renderer process,
-top-level allocators (`blink_gc` and `blink_objects`
-overlap by design -- they're two views of the same
-Oilpan heap, raw pages vs typed object counts):
-
-| allocator         | size      | notes |
-| ----------------- | --------- | ----- |
-| `blink_gc`        | 1,350 MB  | C++ DOM, layout, render objects (Oilpan) |
-| `malloc`          |   332 MB  | Skia raster buffers + small native allocations |
-| `partition_alloc` |   114 MB  | String buffers, ArrayBuffers |
-| `v8`              |    34 MB  | JS heap (paged.js + page JS); tiny |
-| other             |   ~22 MB  | web_cache, shared_memory, cc, gpu stub |
-
-V8 is only ~2 % of the renderer. Blink is ~80 %. That
-matches the structural picture: the renderer holds the
-laid-out state of 1651 pages of typeset content, and
-that state is C++ objects, not JS.
-
-Top Blink object classes (the `blink_objects` view of
-the Oilpan heap, post-generate):
-
-| class                                  | size    | count       |
-| -------------------------------------- | ------- | ----------- |
-| `GridSizingTrackCollection`            | 132 MB  | 79,246      |
-| `ComputedStyle`                        |  74 MB  | 1,074,537   |
-| `ConstraintSpace::RareData`            |  71 MB  | 617,415     |
-| `PhysicalBoxFragment`                  |  42 MB  | 516,289     |
-| `LogicalLineItems`                     |  42 MB  | 24,118      |
-| `Text` (DOM nodes)                     |  42 MB  | 498,077     |
-| `LayoutResult`                         |  41 MB  | 540,447     |
-| `AXNodeObject`                         |  41 MB  | 411,760     |
-| `GridItemData`                         |  30 MB  | 162,443     |
-| `ComputedStyleBase::StyleBoxData`      |  30 MB  | 176,479     |
-| `InlineItem`                           |  28 MB  | 737,744     |
-| `LayoutResult::RareData`               |  28 MB  | 229,056     |
-| `ElementRareDataVector`                |  24 MB  | 613,629     |
-| `CachedMatchedProperties`              |  23 MB  | 226,679     |
-| `ShapeResultView`                      |  21 MB  | 306,762     |
-| `HeapVectorBacking<FragmentItem>`      |  21 MB  | 72,175      |
-| `HeapVectorBacking<HarfBuzzRunGlyphData>` | 20 MB | 165,957  |
-| `LayoutText`                           |  14 MB  | 129,056     |
-| `HTMLDivElement`                       |  12 MB  | 118,877     |
-| `HTMLSpanElement`                      |  10 MB  | 104,266     |
-
-Three patterns visible:
-
-1. **Page-template grid is expensive.** paged.js renders
-   each `@page` as a CSS grid (so `@top-right`,
-   `@bottom-right`, etc. resolve correctly). 79,246
-   `GridSizingTrackCollection` ≈ 48 per page × 1651
-   pages, plus 162k `GridItemData`. Combined ~162 MB just
-   for the running header/footer geometry.
-2. **Style explosion.** 1,074,537 `ComputedStyle`
-   objects across 1651 pages is ~650 per page, which
-   matches roughly one per leaf element after style
-   sharing. `CachedMatchedProperties` (23 MB, 227k)
-   shows the sharing cache is active; without it the
-   number would be much worse.
-3. **LayoutNG fragment tree.** `PhysicalBoxFragment`
-   (42 MB), `LogicalLineItems` (42 MB), `LayoutResult`
-   (41 MB), various `RareData` (98 MB combined),
-   `InlineItem` (28 MB) -- the modern Blink layout tree
-   is fragment-based and the fragments add up across
-   half a million layout objects.
-
-The render→generate transition adds about 500 MB:
-~272 MB to `blink_gc` (print-preview snapshot retention)
-and ~219 MB to `malloc` (Skia content-stream allocations
-during PDF emit, visible as a million-ish small
-allocations in the bucket-size profile).
-
-### Disabling the GPU process
-
-The GPU process at ~100 MB looked like an easy win. It
-isn't, quite -- in headless mode Chromium still spawns a
-GPU process to host SwiftShader (software raster) for
-canvas / WebGL emulation, even when no canvas / WebGL
-is in use. Three variants tested against the baseline:
-
-| variant                                       | render | generate | total | gpu-process | renderer | PDF bytes |
-| --------------------------------------------- | ------ | -------- | ----- | ----------- | -------- | --------- |
-| baseline                                      | 10-11s | 44-50s   | 51-56s |  100 MB    | 1,880 MB | 41,076,362 |
-| `--disable-gpu --disable-software-rasterizer` | 10s    | 45s      | 45s   |  16 MB      | 1,761 MB | 41,076,362 |
-| above + `--in-process-gpu`                    | 15s    | 61s      | 62s   |  (gone)     | 1,748 MB | 41,076,362 |
-| `--single-process`                            | crash  | -        | -     | -           | -        | -         |
-
-`--single-process` is documented as debug-only in
-Chromium; the renderer crashes shortly after page load
-in modern headless. It also doesn't actually collapse to
-one process -- crashpad-handler always runs separately
-and a Mojo broker stays alive too.
-
-`--in-process-gpu` does kill the GPU process entirely
-but folds the GPU work onto the same thread as JS +
-layout. Render slows by ~5 s and generate by ~15 s --
-a 25 % total slowdown bought for ~100 MB of saved
-process overhead. Bad trade.
-
-The disable pair alone (`--disable-gpu
---disable-software-rasterizer`) is the sweet spot:
-
-- GPU process shrinks from ~100 MB to ~16 MB (Chromium
-  keeps a stub for command handling)
-- Renderer ~120 MB lighter (consistent across runs;
-  exact cause is some GPU-context init path Skia skips)
-- Generate runs ~5 s faster (Skia presumably skips the
-  same GPU init path)
-- PDF output is content-identical: same 41,076,362-byte
-  size, same content streams. SHA differs only because of
-  per-run /CreationDate, /ModDate, and /ID -- 0.018 %
-  of bytes differ, all inside the tagged-PDF tree's
-  hash-derived element IDs.
-
-Shipped in both [docs/render-book.mjs](../docs/render-book.mjs)
-and [perf/measure.mjs](measure.mjs).
-
-### What's not addressable
-
-Accessibility tagging accounts for ~41 MB of
-`AXNodeObject` instances (411k of them, one per DOM
-element for the PDF/UA structure tree). Disabling
-`--export-tagged-pdf` would free this, but the PDF
-loses its structure tree -- screen readers see a flat
-glyph stream, search highlighting and copy-paste break
-reading order in the multi-column layout, and the PDF
-falls out of Section 508 / PDF-UA / EN 301 549
-compliance. Off the table; the cost buys real
-accessibility for a docs site that aims to be readable.
-
-### Where this leaves memory
-
-End-state on the 1651-page book with the shipped flag
-pair:
-
-```
-renderer (main)                 ~1,760 MB private
-PrintCompositor (utility)         ~350 MB
-browser                           ~70-1,100 MB  (IPC buffer; high variance)
-gpu-process (stub)                 ~16 MB
-other (renderers, network, etc.)   ~80 MB
-                                ------------
-peak                            ~2.3-3.3 GB private
-                                ~2.5-2.9 GB working set
-```
-
-Inside the renderer, the dominant buckets are
-intrinsic to laying out 1651 pages of typeset content:
-
-- `GridSizingTrackCollection` (132 MB) is paged.js's
-  per-page template grid. The grid drives `@top-right`
-  / `@bottom-right` / margin-box positioning; replacing
-  it with absolute positioning would save the 132 MB
-  but is a paged.js architectural change.
-- `ComputedStyle` (74 MB across 1M objects) and the
-  LayoutNG fragment tree (~200 MB combined) scale with
-  DOM size. The biggest knob here is the DOM the book
-  feeds in: fewer wrapper elements would directly
-  shrink everything downstream.
-- The render→generate +500 MB is Chromium-internal
-  (print-preview retention + Skia raster prep) and not
-  reachable without recompiling.
-
-Next memory targets, in rough order of effort vs payoff:
-
-1. **DOM shape audit.** 1.07 M `ComputedStyle`, 498 k
-   `Text` nodes, 118 k `HTMLDivElement`, 104 k
-   `HTMLSpanElement` -- the input shape drives all of
-   this. Just-the-docs and the markdown converters add
-   wrapper elements that may not be needed in the PDF
-   layout. A pre-render DOM-simplification pass (strip
-   inert wrappers, collapse nested spans) is the most
-   accessible lever; we own the Jekyll pipeline end to
-   end.
-2. **Layout-intermediate garbage** that Oilpan doesn't
-   sweep during the synchronous render loop. ~75-225
-   MB of `CachedMatchedProperties`, sub-`ComputedStyle`
-   data, `GridItemData`, text-shape intermediates --
-   not retained by anything, just unswept. See the
-   "GC-pass probe" subsection for the per-class
-   breakdown; the only direct mitigation is forcing
-   GC (rejected, costs ~1 s), and the indirect lever
-   is upstream DOM size (item 1 above).
-3. **Page-template grid replacement** in vendored
-   paged.js -- ~132 MB potential. Largest single target
-   but an invasive rewrite of paged.js's `@page` area
-   handler.
-
-### GC-pass probe: 180 MB of unswept Oilpan garbage
-
-Forcing a `window.gc()` pass between render and generate
-frees ~180 MB of `blink_objects` (the typed view of the
-Oilpan heap) without touching anything user-visible.
-Initial framing: "dangling references somewhere in the
-paged.js / detach-pages chain". Investigation (see "What
-the GC actually freed" subsection below) shows the
-framing was wrong -- there is no JS-side retention.
-What the GC frees is per-page layout intermediate state
-(style sharing caches, `ComputedStyle` sub-data, grid
-item data, text-shape views) that's already unreachable
-from anything but stays in Oilpan because nothing forces
-a major GC during the synchronous render loop.
-
-Probe: `perf/probe-renderer-mem.mjs --gc-passes N`.
-Launches with `--js-flags=--expose-gc`, runs N V8
-`gc()` calls between the post-render and pre-generate
-memory dumps, then fires
-`Memory.simulatePressureNotification` to coax Chromium
-into dropping caches. Sweep across N=0,1,2,3,5 on the
-1651-page book (single run each; absolute numbers carry
-run-to-run noise but the deltas vs same-run baseline
-are stable):
-
-| N | gc time | +pressure | post-render | post-gc | mid-gen renderer | Δ vs no-gc baseline |
-| --- | --- | --- | --- | --- | --- | --- |
-| (off, baseline)| --     | --     |  1,229 MB | --     | **1,941 MB** | -- |
-| 0 (pressure only) | 0.00s | 0.52s |  1,358 MB | 1,358 MB | 1,869 MB | ~noise |
-| **1** | **0.44s** | **0.96s** | 1,329 MB | **1,275 MB** | **1,754 MB** | **-187 MB** |
-| 2 | 0.82s | 1.33s |  1,337 MB | 1,293 MB | 1,758 MB | -183 MB |
-| 3 | 1.46s | 1.97s |  1,316 MB | 1,277 MB | 1,757 MB | -184 MB |
-| 5 | 2.11s | 2.61s |  1,553 MB* | 1,498 MB* | 1,841 MB* | (high-side outlier run) |
-
-Three takeaways:
-
-1. **`Memory.simulatePressureNotification` alone does
-   nothing in headless.** N=0 mid-gen is within
-   run-to-run noise of the no-gc baseline.
-2. **One `gc()` call does essentially all of the work.**
-   1 pass + pressure: ~1 s cost, ~187 MB peak savings.
-   Passes 2 and 3 match it (~185 MB) without further
-   improvement.
-3. **Each `gc()` pass costs ~0.4-0.5 s** of wall clock
-   on the 1651-page book (the V8 + Oilpan major-GC
-   pause walking ~1 GB of heap).
-
-Inside the renderer at post-gc (1 pass), the breakdown
-shows where the freed space went:
-
-| allocator      | baseline | post-gc | Δ |
-| -------------- | -------- | ------- | --- |
-| `blink_objects` (typed Oilpan view) |  698 MB |  472 MB | **-226 MB** |
-| `blink_gc` (raw pages)              |  973 MB |  940 MB |  -33 MB |
-| `malloc`                            |  120 MB |   93 MB |  -27 MB |
-| `v8`                                |   28 MB |   19 MB |   -9 MB |
-
-GC freed ~226 MB of typed Blink objects, but Oilpan
-only returned 33 MB of underlying pages to the OS
-immediately -- empty pages are recycled lazily. The
-visible peak win shows up at mid-generate (-187 MB)
-because Chromium reuses the freed object slots for the
-print-preview snapshot instead of growing fresh.
-
-PDF output is content-identical across all variants (same
-41,076,362-byte size; SHA differs only in metadata).
-
-**Not shipped.** 1 second per render is meaningful when
-multiplied across CI builds, and after investigating
-what the GC actually freed (below) it's clear there's
-no underlying defect to fix -- this is Blink's normal
-allocation behaviour, with Oilpan's normal sweep
-behaviour, just observed in a workload that doesn't
-give Oilpan an idle moment to sweep.
-
-The probe and the `--gc-passes` flag stay in
-[probe-renderer-mem.mjs](probe-renderer-mem.mjs) for
-future use -- either as a measurement baseline if a
-future bigger book ever hits a CI memory ceiling, or as
-an A/B reference if Blink's allocation pattern changes
-with a Chromium upgrade.
-
-#### What the GC actually freed
-
-Two analyses, both negative for the "dangling references"
-hypothesis, both positive for "Oilpan didn't sweep":
-
-**V8 heap snapshot diff (pre-gc vs post-gc):**
-byte-identical. Same 2,938,992 nodes, same 108.9 MB self_size,
-same per-category counts. The diff is zero across every
-node category in V8. Whatever the GC freed was invisible
-to V8's snapshot, which means it had no V8 wrapper --
-which means no JS reference can be holding it. Probe:
-[analyze-heap-snapshot.mjs](analyze-heap-snapshot.mjs)
-in single-snapshot or diff mode.
-
-**Per-Blink-class diff (memory-infra dumps):** the
-freed memory is concentrated in style-system caches and
-layout intermediates. Top freed classes between dump 0
-(post-render) and dump 1 (post-gc), 1-pass GC run:
-
-| class                                            | a_count | a_MB | b_count | b_MB | freed |
-| ------------------------------------------------ | ------- | ---- | ------- | ---- | ----- |
-| `CachedMatchedProperties`                        | 122,110 | 12.1 |     355 |  0.0 | **-12.1 MB** (~100%) |
-| `ComputedStyle`                                  | 380,974 | 26.2 | 244,772 | 16.8 |  -9.4 MB (~36%)      |
-| `ComputedStyleBase::StyleMisc2Data`              |  24,649 |  8.3 |   6,911 |  2.3 |  -6.0 MB             |
-| `ComputedStyleBase::StyleBoxData`                |  94,867 | 15.9 |  63,937 | 10.7 |  -5.2 MB             |
-| `ComputedStyleBase::StyleSurroundData`           |  32,350 |  9.6 |  15,101 |  4.5 |  -5.1 MB             |
-| `GridItemData`                                   |  27,508 |  5.0 |       0 |  0.0 | **-5.0 MB** (~100%)  |
-| `ShapeResultView`                                | 225,299 | 15.5 | 170,366 | 11.7 |  -3.8 MB             |
-| `HeapVectorBacking<HarfBuzzRunGlyphData>`        | 163,864 | 19.2 | 149,993 | 16.4 |  -2.9 MB             |
-| `LayoutResult::RareData`                         |  71,960 |  8.8 |  48,955 |  6.0 |  -2.8 MB             |
-| `ConstraintSpace::RareData`                      |  79,445 |  9.1 |  55,209 |  6.3 |  -2.8 MB             |
-| `ComputedStyleBase::StyleMisc1Data`              |  19,034 |  3.0 |   1,958 |  0.3 |  -2.7 MB             |
-| `ComputedStyleBase::StyleMiscData`               |  64,838 |  5.4 |  39,653 |  3.3 |  -2.1 MB             |
-| `LayoutResult`                                   | 179,728 | 13.7 | 155,052 | 11.8 |  -1.9 MB             |
-| ... (smaller)                                    |         |      |         |      |  -16  MB             |
-| **total**                                        |         |      |         |      | **-76 MB** (this run; -226 MB on a different run -- noisy) |
-
-The two ~100% freed categories tell the cleanest story:
-
-- **`CachedMatchedProperties`** is Blink's style-sharing
-  cache -- "which CSS rules matched element X, so that
-  similar element Y can reuse the resolved style". After
-  layout completes, it's dead state. Only useful if the
-  document gets relaid out, which our pipeline never
-  does.
-- **`GridItemData`** is per-item layout state for CSS
-  Grid. Paged.js puts each `@page` area inside a grid
-  to position the running headers / footers / margin
-  boxes; once the page is laid out, the `GridItemData`
-  for that page's items is dead.
-
-Everything else is style sub-structures
-(`ComputedStyleBase::Style*Data`) and text-shape
-intermediates (`ShapeResultView`, `HarfBuzzRunGlyphData`,
-`ShapeResultRun`) that get freed when their owning
-`ComputedStyle` or layout fragment becomes unreachable.
-All Blink-internal allocations driven by layout.
-
-What this means for the leak question:
-
-- **Not a leak.** Nothing holds these objects after
-  layout. They're unreachable from the moment their
-  page is finalised; they sit in Oilpan because
-  Chromium doesn't run a major GC during the
-  synchronous render loop.
-- **Not a JS-side retention.** detach-pages.js,
-  paged.js's chunker, hook chains, and event listeners
-  were the suspect list. The V8 snapshot diff rules
-  them all out -- if any of them held the layout state,
-  the snapshot would change between pre-gc and post-gc.
-- **It's a real over-allocation in the sense that we
-  hold ~75-225 MB longer than necessary**, but the cost
-  to fix it (force a GC: 1 s wall clock) exceeds the
-  CI memory headroom it would buy at our current book
-  size.
-
-The indirect lever still works: reducing the input DOM
-size reduces both peak working set AND this garbage
-fraction proportionally. That's the DOM-shape audit
-item in "Next memory targets".
-
-Tooling produced by this investigation, kept in
-[perf/](.) for re-use:
-
-- [analyze-heap-snapshot.mjs](analyze-heap-snapshot.mjs)
-  -- single-snapshot summary (top type x name by
-  aggregate bytes, detached subset) and pairwise diff
-  between two snapshots.
-- [diff-blink-classes.mjs](diff-blink-classes.mjs) --
-  per-Blink-class diff between two memory-infra dumps
-  in the same trace. Strips the per-dump GUID suffix
-  from class names so the diff lines up across dumps.
-
-#### `--heap-snapshot`: V8 visibility check
-
-`probe-renderer-mem.mjs --heap-snapshot` captures a V8
-heap snapshot at post-render via CDP
-`HeapProfiler.takeHeapSnapshot` and writes it as
-`outDir/post-render.heapsnapshot` (~200 MB on the
-1651-page book). Combined with `--gc-passes N`, a
-second snapshot `post-gc.heapsnapshot` is taken right
-after the GC pass.
-
-The original intent was a retainer-chain investigation
-to find what JS-side state was holding the Blink
-objects the GC frees. The result of that investigation
-(see "What the GC actually freed" above) is that
-**nothing on the V8 side holds them** -- the snapshot
-diff is byte-identical pre-gc vs post-gc, ruling out
-JS retention entirely. The freed memory is Oilpan-only,
-invisible to V8's snapshot.
-
-The snapshot tooling is still useful as a visibility
-check -- "is the renderer holding what I expect?" --
-and for finding any actual JS-side retention if one
-ever surfaces. CLI analysis:
-
-- `node perf/analyze-heap-snapshot.mjs <snap>` --
-  single-snapshot summary (top type x name by aggregate
-  bytes, plus actually-detached subset).
-- `node perf/analyze-heap-snapshot.mjs <a> <b>` --
-  pairwise diff: what categories grew or shrank.
-
-DevTools workflow (more interactive, for following
-specific retention chains):
-
-1. Open Chrome DevTools (any tab) -> Memory tab.
-2. Load `<...>.heapsnapshot` (the "Load profile" icon).
-   Browse the **Summary** view for the largest object
-   categories.
-3. For any object of interest, the **Retainers** pane
-   shows the chain of JS references holding it. Filter
-   by name (e.g. `Detached HTMLDivElement`) or by class.
-
-Oilpan-only objects (`CachedMatchedProperties`,
-`ComputedStyleBase::*Data`, `GridItemData`,
-`ShapeResultView`, layout fragments, etc.) do not appear
-in the V8 snapshot -- they have no V8 wrapper. The
-memory-infra dump + `diff-blink-classes.mjs` is the
-right tool for those. The complete picture is
-heap-snapshot (V8 reachability) + memory-infra dump
-(per-allocator + per-Blink-class sizes) = "what JS sees"
-+ "what's actually in the renderer".
+| [01-baseline-and-detach.md](notes/01-baseline-and-detach.md) | Confirming the quadratic; detach-pages; incremental writer; pdf-lib parseSpeed; Chromium `Page.printToPDF` knob survey; dropping `pagedjs-cli`; restoring live progress. |
+| [02-finalizepage.md](notes/02-finalizepage.md) | Revisiting `AtPage.finalizePage`; looking past it; the failed binary-search and gBCR-memo attempts; finding the residual O(n) was CSS-Grid sibling sweeps over `display:none` pages (aggressive-detach fix); five `createBreakToken` dedup attempts (Attempt E shipped the `renderTo` additive backoff). |
+| [03-puppeteer-bump-findref.md](notes/03-puppeteer-bump-findref.md) | Rebaselining after puppeteer 22→25; the `findRef` fast-path miss (39 % of calls were falling through, ~2.4 s win); six cheaper-`removeChild` variants (none shipped); chasing the residual `(idle)` time down to `requestAnimationFrame`. |
+| [04-sync-and-inner-loop.md](notes/04-sync-and-inner-loop.md) | Stripping headless-irrelevant async machinery (hook fast-path; sync chain end-to-end); shrinking `Layout.append` (footnote fast-path, parent-lookup cache, `triggerSync` empty-handlers); skipping `wrapContent`'s innerHTML round-trip; fixing two bugs in the adaptive `maxChars` overflow-check rhythm. |
+| [05-blink-trace.md](notes/05-blink-trace.md) | What happened when we tried move-not-clone (a `previousLeaf` cache shipped instead of move); cracking the cpu profile's `(program)` row open with a Blink-category trace; the WhiteSpaceFilter paired-A/B that found it wasn't worth its layout cost in our pipeline. |
+| [06-microtasks-pageranges-css.md](notes/06-microtasks-pageranges-css.md) | Following `RunMicrotasks` down to zero (chunker fully sync); why `pageRanges` sharding is off the table; CSS cost attribution showing print.css's individual sections are all below the noise floor. |
+| [07-memory.md](notes/07-memory.md) | Where the renderer's 1.9 GB goes -- process-tree footprint, per-allocator + per-Blink-class breakdown, `--disable-gpu` + `--in-process-gpu` saving ~200 MB, a GC-pass probe finding 180 MB of unswept Oilpan garbage. |
diff --git a/perf/notes/01-baseline-and-detach.md b/perf/notes/01-baseline-and-detach.md
new file mode 100644
index 0000000..54cb8a6
--- /dev/null
+++ b/perf/notes/01-baseline-and-detach.md
@@ -0,0 +1,962 @@
+# Baseline, the first big wins, and dropping pagedjs-cli
+
+Opening investigation: confirming the suspected quadratic, finding it lives in paged.js's per-page DOM growth, shipping the detach-pages handler, the incremental-PDF writer, and pdf-lib's `parseSpeed: Fastest`; then surveying Chromium's `Page.printToPDF` knobs, dropping the `pagedjs-cli` dependency, and restoring live progress to the terminal.
+
+## The plan
+
+The render pipeline has three phases, matching what `pagedjs-cli`
+historically showed as its three spinners:
+
+1. **Rendering** -- `PagedPolyfill.preview()` does all the per-page
+   layout work inside headless Chromium.
+2. **Generating** -- `page.pdf()` asks Chromium to serialize the
+   laid-out DOM into PDF bytes, after a small `parseOutline` DOM
+   walk.
+3. **Processing** -- `pdf-lib` loads Chromium's PDF, attaches the
+   outline and metadata, and re-serialises.
+
+All three can grow super-linearly. So the harness times all three
+separately and produces a phase breakdown.
+
+Two-step investigation, cheapest first:
+
+1. **Per-page timing + phase breakdown** -- the cheap pass. Hook
+   paged.js's `beforePageLayout` / `afterPageLayout` for the
+   per-page render curve, and wall-clock the generate and process
+   phases from Node. If render's per-page cost grows with page index
+   that's an `O(n^2)` render; if generate or process dominate, the
+   bottleneck is downstream of paged.js.
+
+2. **CPU profile of headless Chromium** -- the deep pass, only if
+   step 1 doesn't already point at a culprit. Attach the Chrome
+   DevTools Performance panel (or save a CPU profile via the CDP
+   `Profiler` domain) and look for the hot function. Typical paged.js
+   suspects in render: `Chunker`, `Layout`, cross-reference
+   resolution, or a handler that walks the entire document on every
+   page. Generate / process bottlenecks usually point at Chromium's
+   PDF writer or `pdf-lib`'s outline / save path.
+
+Step 1 is what's wired up here. Step 2 will reuse the same harness --
+adding `page.tracing.start()` / `page.tracing.stop()` for a
+DevTools-compatible trace is a few lines.
+
+## Findings (initial run)
+
+A single run on `docs/_site-pdf/book.html` (1638 pages, May 2026,
+clean checkout, headless Chromium 122):
+
+| Phase    | Time      | % of total | Notes |
+| -------- | --------- | ---------- | ---   |
+| render   | 103.8 s   | 50 %       | paged.js layout. Per-page cost grows ~5x start-to-end. |
+| generate |  63.6 s   | 31 %       | 99.9% of it is `page.pdf()`. Raw Chrome output: 52 MB. |
+| process  |  39.6 s   | 19 %       | 90% of it is `PDFDocument.load`. Final PDF: 17 MB. |
+| **total**|**207.0 s**|            |       |
+
+### Render: super-linear, ~5x growth (confirms the suspicion)
+
+Per-page render cost, bucketed by 100 pages:
+
+```
+pages    0-  99   avg=  3.4 ms
+pages  100- 499   avg=  7-9 ms
+pages  500- 799   avg= 12-15 ms
+pages  800-1099   avg= 23-25 ms
+pages 1100-1599   avg= 27-39 ms
+pages 1600-1637   avg= 35 ms
+```
+
+The last-quarter / first-quarter ratio is **5.09x** with a
+position ratio of 4.0x. That's a clean linear-in-`n` per-page
+growth pattern, i.e. **total render time is roughly O(n^2)** with
+content variance overlaid. The single biggest outlier is
+pages 1100-1199 (37 ms) -- one chapter that's heavier than its
+neighbours.
+
+JS heap stays bounded around 10-25 MB throughout. So whatever's
+making later pages expensive is **CPU work that scales with `n`,
+not retained DOM**. Likely candidates: a `querySelectorAll` over
+the whole rendered tree on each page, cross-reference / named-flow
+resolution, or a handler walking already-laid-out content. The CPU
+profile in step 2 should pin which.
+
+### Generate: opaque Chrome PDF writer, large raw output
+
+`parseOutline` is 30 ms -- irrelevant. The whole 63-second phase
+is `page.pdf()`, i.e. Chromium serialising the laid-out DOM into
+52 MB of raw PDF bytes. This is the part we have least control
+over -- it's Chromium internals.
+
+What stands out is the **52 MB raw size**. After pdf-lib's
+`save()` re-emits it, the final file is **17 MB**. A 3x shrink
+from a re-serialise alone suggests Chrome isn't compressing
+streams aggressively (probably writing `/FlateDecode`-able streams
+uncompressed). Worth a follow-up sanity check, but not the
+priority.
+
+### Process: pdf-lib roundtrip overhead
+
+```
+load        : 35.62 s   parse the 52 MB raw PDF
+setOutline  :  0.01 s   write outline tree into the doc
+save        :  3.97 s   re-serialise (the 52 -> 17 MB shrink)
+```
+
+The actual outline / metadata mutations are basically free. **The
+whole 40-second phase is the cost of a load + save roundtrip on
+the big raw PDF that Chrome produced**, just so we can attach an
+outline that Chrome can't generate itself.
+
+This is a clear optimisation target: drop the pdf-lib roundtrip in
+favour of a streaming outline-injection tool (`qpdf`, `pdftk`,
+something hand-rolled with `pdf-lib`'s lower-level API) and the
+process phase could collapse to seconds. Tractable on its own
+without touching paged.js.
+
+### Where to focus
+
+- **Render** is the largest phase **and** the only super-linear
+  one. Step 2's CPU profile goes here first.
+- **Process** is purely linear-in-PDF-size overhead with a clean
+  fix path (skip pdf-lib's full parse). Independent of the
+  quadratic story.
+- **Generate** is Chrome's PDF writer. Not actionable from our
+  side without a Chromium patch; the 52 MB raw size deserves a
+  glance, but later.
+
+The user-perceived quadratic behaviour is real and lives in the
+render phase. Fixing it would knock 50-80 s off a 200 s build.
+Fixing process is independent and could knock off another 30 s.
+
+## Step 2: CPU profile of the render phase
+
+`measure.mjs --cpu-profile` wraps the render phase only (preview()
+through the `.pagedjs_pages` selector) in a V8 CPU profile via the
+CDP `Profiler` domain, and writes it to `render.cpuprofile` in the
+results folder:
+
+```
+run.bat --cpu-profile                          # default 1ms sampling
+run.bat --cpu-profile --cpu-sampling 5000      # 5ms sampling, smaller file
+```
+
+The profile covers only the render phase deliberately -- generate is
+opaque Chrome internals and process has a clean non-profiling fix, so
+both would dilute the signal.
+
+To view: open Chrome (or Edge) -> DevTools -> **Performance** tab ->
+click **Load profile...** (folder icon) and pick the `.cpuprofile`
+file. Or drag it onto the panel. The bottom-up view sorted by
+self-time pins the hot function fastest.
+
+What to look for, given the heap stayed bounded and per-page cost
+scales linearly with `n`:
+
+- A function whose self-time grows roughly with page index. The
+  bottom-up view aggregates across the whole phase, so a per-page
+  `O(n)` scan shows up as a fat self-time bar.
+- DOM-query hot spots: paged.js calling `querySelectorAll`,
+  `getElementsByTagName`, or `closest` against the whole rendered
+  tree on each new page.
+- Cross-reference / named-flow / footnote resolution that re-walks
+  prior pages.
+
+A 1 ms sampling interval over a 100 s render produces a profile around
+20-50 MB. The render phase itself runs ~5-15% slower while sampling.
+
+If the bottleneck turns out to be in paged.js itself, the next step
+is to patch our vendored copy. There is no widely-known maintained
+fork with the detach-pages optimisation at time of writing -- the
+named "performance forks" of paged.js that turn up in casual
+searches mostly don't exist or haven't shipped a fix. Worth checking
+the upstream issue tracker at
+[pagedjs/pagedjs on GitHub](https://github.com/pagedjs/pagedjs/issues)
+(currently the active home; older threads may still live on
+[Coko's GitLab](https://gitlab.coko.foundation/pagedjs/pagedjs/-/issues))
+before reinventing the fix.
+
+## Findings (CPU profile of render phase)
+
+A profiled run (`--cpu-profile`, 1 ms sampling) over the same
+1638-page book:
+
+```
+samples: 52314   duration: 95.18 s   us/sample: 1819
+
+   self_ms   self_%   function  @  source
+   -------   ------   ----------------------------------------------
+  63525.42   66.82%   getBoundingClientRect   (browser native)
+  19075.46   20.07%   (program)               (V8/Blink native)
+   1941.39    2.04%   findElement             browser.js:638
+   1497.43    1.58%   removeOverflow          browser.js:2196
+   1106.25    1.16%   (anonymous)             browser.js:29501
+   1002.54    1.05%   createBreakToken        browser.js:1796
+    580.42    0.61%   findEndToken            browser.js:2094
+    527.65    0.56%   create                  browser.js:2257
+    442.13    0.47%   afterPageLayout         browser.js:30184
+    ... rest sub-0.5% ...
+```
+
+**67% of render is `getBoundingClientRect`. Another 20% is V8/Blink
+native code -- almost certainly the synchronous layout passes those
+`getBoundingClientRect` calls force.** Together 87% of render is the
+browser doing layout work driven by paged.js measurement calls.
+
+> **Terminology**: this doc abbreviates `getBoundingClientRect` as
+> **gBCR** below. It's the DOM method that returns an element's
+> viewport-relative position and size; calling it forces Chromium
+> to synchronously flush any pending layout work before answering,
+> so "gBCR self-time" in a CPU profile is layout-flush attribution
+> charged to the JS frame that asked, not JS computation. The
+> same applies to other layout-reading APIs (`offsetTop`,
+> `clientHeight`, `getComputedStyle`, etc.) -- they're collectively
+> the *layout-flush surface* in the profile.
+
+### Why this is `O(n^2)`
+
+The hot caller is `Chunker.findOverflow` at `browser.js:1934`. Its
+loop:
+
+```js
+findOverflow(rendered, bounds, gap) {
+  if (!this.hasOverflow(rendered, bounds)) return;
+  ...
+  let walker = walk(rendered.firstChild, rendered);
+  while (!done) {
+    next = walker.next();
+    node = next.value;
+    if (node) {
+      let pos = getBoundingClientRect(node);   // <-- line 1957
+      ...
+    }
+  }
+}
+```
+
+Per page, paged.js walks the just-rendered fragment node-by-node
+calling `getBoundingClientRect` to find where the content overflows
+the page box. `findOverflow` itself only touches the new fragment, so
+in isolation it should be `O(page_content)`.
+
+The catch: `getBoundingClientRect` is **synchronous**. If the DOM has
+been mutated since the last layout (and paged.js mutates constantly
+-- appending pages, splitting nodes, retrying overflow), each call
+forces Chromium to flush layout. **The cost of that flush scales
+with the live DOM tree**, which is every previously-laid-out page,
+all still attached to the document. Page `n`'s overflow walk pays
+`O(n)` layout cost. Total cost is `O(n^2)`.
+
+This matches everything else we saw:
+
+- Heap stays bounded (10-25 MB): no JS-level retention, just Blink's
+  layout tree growing with page count.
+- Per-page render cost grows ~10x from the first bucket (3.4 ms) to the last (35 ms): the
+  layout-flush cost grows linearly with `n`.
+- Content-driven spikes (the 1100-1199 chapter at 37 ms avg): pages
+  with heavier content do more walker iterations, multiplying the
+  per-iteration sync-layout cost.
+
+### Fix paths, in order of effort
+
+1. **Detach (or `display: none`) finalised pages.** Once a page's
+   layout is committed, take it out of the live document (or hide it
+   via `display: none` / `content-visibility: hidden`) so subsequent
+   sync layouts don't traverse it. Re-attach all pages at
+   `afterRendered` before `page.pdf()` runs. The idea is
+   well-understood and the patch is small (it lives in the chunker /
+   layout glue); collapses the render to roughly `O(n)`.
+
+2. **Batch the walker.** `findOverflow` reads
+   `getBoundingClientRect` on every node and Chromium can't batch
+   reads if they're interleaved with DOM writes. Splitting overflow
+   detection into a write-then-read-then-write phased pass would
+   reduce the number of forced layouts per page, even without
+   detaching previous pages. Smaller win than (1) but compatible
+   with it.
+
+For our pipeline, fix (1) would knock 60-80 seconds off the
+100-second render. Combined with skipping the pdf-lib roundtrip in
+Process (the easy win from the previous findings section), the
+total drops from ~207 s to roughly 90 s.
+
+## Fix applied: `perf/detach-pages.js`
+
+We went with fix (1) above, **as a paged.js handler rather than a
+bundle patch** -- a 20-line `Paged.Handler` subclass that sets
+`pageElement.style.display = 'none'` in `afterPageLayout` (later moved to `finalizePage`; see below) and
+restores every page at `afterRendered` before `page.pdf()` runs. The
+existing `--additional-script` mechanism is exactly the extension
+point this needs, so no fork required.
+
+Wired into production in `docs/book.bat`. Originally:
+
+```bat
+npx pagedjs-cli _site-pdf\book.html -o _pdf\book.pdf ^
+    --outline-tags h1,h2,h3,h4 -t 600000 ^
+    --additional-script ..\perf\detach-pages.js
+```
+
+After the later `pagedjs-cli` removal (see "Dropping pagedjs-cli"
+below) the same `--additional-script` flag carries over to
+`render-book.mjs`:
+
+```bat
+node render-book.mjs _site-pdf\book.html -o _pdf\book.pdf ^
+    --outline-tags h1,h2,h3,h4 ^
+    --additional-script ..\perf\detach-pages.js
+```
+
+And into the perf harness via the `--detach-pages` flag.
+
+### Results
+
+Three-phase numbers, same 1638-page book, measured via the harness:
+
+| Phase    | Baseline | + handler | Δ |
+| -------- | -------- | --------- | --- |
+| render   | 103.8 s  |  50.9 s   | **-52.9 s (-51%)** |
+| generate |  63.6 s  |  60.2 s   | -3.4 s |
+| process  |  39.6 s  |  39.7 s   | unchanged |
+| **total**| **207.0 s** | **150.7 s** | **-56.3 s (-27%)** |
+
+Render last-quarter / first-quarter ratio: **4.56x -> 1.65x**.
+The remaining 1.65x is content variance (chapter 1100-1199 has
+dense tables / code blocks). No `n`-driven component remains.
+
+Per-page render curve, bucketed:
+
+```
+                  baseline    +handler
+pages 0-99      :   3.4 ms      6.1 ms
+pages 500-799   : 12-17 ms      5-6 ms       <- now flat
+pages 1100-1199 :  36.7 ms     13.4 ms       <- heaviest chapter, ~3x faster
+pages 1600-1637 :  37.7 ms     10.7 ms       <- ~3.5x faster
+```
+
+CPU profile shift (self-ms):
+
+```
+                                            baseline   +handler
+getBoundingClientRect      (native)            63525      19459
+(program)                  (V8/Blink)          19075       3676
+```
+
+`getBoundingClientRect` self-time dropped 3.3x and `(program)`
+(V8/Blink-internal layout) dropped 5.2x. Both are still in the top
+slots because layout work doesn't go to zero -- but they're now
+in line with the *current* page's content, not the entire growing
+document.
+
+### Production confirmation
+
+`docs/book.bat` (the real production path) reports:
+
+```
+✔ Rendering 1638 pages took 49,547 ms.
+✔ Generated
+✔ Processed
+✔ Saved to docs\_pdf\book.pdf            (10.5 MB)
+total elapsed: 185 s
+```
+
+The render number is within 3% of the harness measurement, no
+errors, PDF written. (The harness's PDF lands at 16.9 MB rather
+than 10.5 MB -- that's an artefact of the harness's slightly
+different post-processing flow, not the handler.)
+
+### What this didn't fix (independent follow-ups)
+
+The handler closes the quadratic-render hole. Remaining costs are
+linear-in-`n` and don't shrink with this change:
+
+1. **Process: 40 s of pdf-lib roundtrip on a 52 MB raw PDF.** Out
+   of that, `setOutline` is 11 ms; the other 39+ seconds is
+   `PDFDocument.load` + `pdfDoc.save` on the big Chrome output.
+   Replacing the load+save with a streaming outline-injection
+   tool (`qpdf`, hand-rolled with pdf-lib's lower-level API)
+   could cut another ~30 s.
+2. **Generate: 60 s in `page.pdf()`.** Chromium internals; mostly
+   opaque. The 52 MB raw size hints at uncompressed streams in
+   Chrome's writer -- worth a glance but not a quick fix.
+
+## Confirming the mechanism (instrumentation A/B)
+
+The CPU profile said `getBoundingClientRect` self-time dropped
+3.3x; the wall-clock measurement said render dropped 2x. To
+double-check that's actually due to the smaller layout tree (and
+not a profile-attribution coincidence, or paged.js silently
+skipping work, or new costs appearing elsewhere) the harness has
+an `--instrument` flag that wraps every in-page DOM accessor
+that *can* force a synchronous layout -- `getComputedStyle`,
+`getBoundingClientRect`, the `offsetWidth` / `offsetHeight` /
+`offsetTop` / `offsetLeft` family, and the `clientWidth` /
+`clientHeight` / `scrollWidth` / `scrollHeight` getters -- with
+counters and per-call timing.
+
+Same wrapper overhead in both runs, so absolute totals are
+inflated but the comparison is apples-to-apples.
+
+Two runs, same content, only difference is `--detach-pages`:
+
+| op                      | baseline                  | + detach                  |
+| ---                     | ---                       | ---                       |
+| `getBoundingClientRect` | 260,668 calls, **208 us** avg | 258,940 calls, **70 us** avg |
+| `scrollWidth`           |  37,911 calls,   1.4 us   |  37,047 calls,   1.1 us   |
+| `scrollHeight`          |  37,911 calls,   0.7 us   |  37,047 calls,   0.6 us   |
+| `getComputedStyle`      |   9,179 calls,   1.7 us   |   9,179 calls,   1.8 us   |
+| `offset*` / `client*`   |       **0 calls**         |       **0 calls**         |
+
+Instrumented render wall-clock: 82.1 s baseline -> 47.7 s with
+detach. Same shape as the un-instrumented runs.
+
+What the numbers say:
+
+1. **Call counts are essentially identical.** The detach handler
+   isn't getting paged.js to skip any work -- 260,668 vs 258,940
+   `getBoundingClientRect` calls is a rounding error. The fix
+   makes each call cheaper, not the number of calls smaller.
+
+2. **`getBoundingClientRect` per-call cost dropped 66 %**,
+   208 us -> 70 us. Smaller live layout tree, less to recompute
+   on each forced flush. Total cost on this op alone: 54.3 s ->
+   18.2 s, which is most of the wall-clock render savings.
+
+3. **`offsetWidth` / `offsetHeight` / `offsetTop` / `offsetLeft`
+   / `clientWidth` / `clientHeight` are called zero times** on
+   our content. The auto-width branches inside `finalizePage`'s
+   margin-box `forEach` (where those accesses live) never fire
+   on the kind of margin content we have (bottom-right page
+   number, nothing else).
+
+## Why detach-pages.js hooks `finalizePage`, not `afterPageLayout`
+
+The chunker's per-page hook order is:
+
+```
+beforePageLayout  ->  afterPageLayout  ->  finalizePage
+```
+
+`AtPage.finalizePage` (built into paged.js) reads `getComputedStyle`
+on margin-box children and writes `el.style["grid-template-columns"]`
+on them. `time-hooks.js` measurements show this method is **11x
+slower per call when run on a `display:none` page**:
+
+| Variant | `chunker.finalizePage::finalizePage` per call |
+| --- | --- |
+| Baseline (no detach) | 0.82 ms |
+| Detach hooked on `afterPageLayout` (hide *before* AtPage) | **9.24 ms** |
+| Detach hooked on `finalizePage` (hide *after* AtPage) | 0.67 ms |
+
+Chromium has fast paths for style reads/writes on visible elements;
+on hidden subtrees the same operations re-cascade each call. So
+hiding the page before AtPage runs makes AtPage pay a slow path
+worth ~8 ms/page over the whole render.
+
+`detach-pages.js` therefore hooks `finalizePage`, registering after
+AtPage so its method runs second. AtPage works on a visible page;
+we hide immediately after. The next chunker iteration sees pages
+0..N-1 hidden, so the original `getBoundingClientRect` saving in
+the chunker is preserved.
+
+**Wall-clock impact: none measurable.** A 4+4 interleaved A/B
+between the two variants showed render medians within ~1 s of
+each other (48.70 s vs 49.83 s un-instrumented; 50.78 s vs 50.90 s
+with `--time-hooks`), well inside the 3-7 s within-variant noise.
+The `finalizePage` hook is the variant we ship because it makes
+the CPU profile read honestly (no mystery cost inside AtPage) and
+gives AtPage the visible page it expects, not because of a
+measurable speedup.
+
+## Fix applied: `perf/incremental-pdf.mjs`
+
+The direct follow-up from the detach-pages section's "What this didn't
+fix" list: kill the pdf-lib roundtrip that owned the 40 s process
+phase. 99 % of that was `PDFDocument.load` + `pdfDoc.save` on the
+52 MB raw PDF -- just so we can attach an outline tree and override a
+handful of `/Info` fields.
+
+Approach: a **PDF incremental update** (PDF 1.7 §7.5.6). We never
+call `PDFDocument.load`. Instead:
+
+1. Parse only the trailer, xref, Catalog, and Info objects -- using
+   `PDFParser` positioned at known byte offsets. Three small dicts,
+   ~50 ms.
+2. Build outline objects in a fresh `PDFContext`, allocating refs
+   starting from the original `/Size`.
+3. Mutate the parsed Catalog (add `/Outlines`, `/Lang`) and Info
+   (override `/Title`, `/Creator`, dates, ...) **in place**, keeping
+   their original refs.
+4. Append to the original bytes:
+   - The new and updated indirect objects.
+   - A new xref section whose subsections cover only those refs.
+   - A new trailer dict with `/Prev` pointing at the original xref.
+   - `startxref <new-offset>` + `%%EOF`.
+
+Readers chain backward through `/Prev` to resolve any ref we didn't
+touch (`/Pages`, `/Dests`, every font / image / content stream). The
+original 52 MB stays byte-identical; we just append a few hundred KB.
+
+The writer is built on pdf-lib's low-level primitives -- `PDFParser`
+for the few objects we read, `PDFContext` + `PDFDict` for object
+construction, `PDFCrossRefSection` + `PDFTrailerDict` for emitting
+the new xref / trailer. The expensive `PDFDocument.load` (which
+parses every indirect object in the file) is bypassed entirely.
+
+### Results
+
+Same 1638-page book, `--detach-pages` already in effect for both runs:
+
+| Phase    | pdf-lib roundtrip | + incremental | Δ |
+| -------- | ----------------- | ------------- | --- |
+| render   |  50.9 s   |  49.2 s   | unchanged (noise) |
+| generate |  60.2 s   |  60.9 s   | unchanged (noise) |
+| process  |  39.7 s   |   0.25 s  | **-39.4 s (-99%)** |
+| **total**| **150.7 s** | **110.3 s** | **-40.4 s (-27%)** |
+
+Combined with the detach-pages fix, the build is now **110 s vs
+207 s baseline (-47 %)**.
+
+Process-phase breakdown for the incremental path:
+
+```
+incremental    : 250 ms total
+appended       : ~410 KB (vs 52 MB raw Chrome PDF, untouched)
+new objects    : 1776 (outline root + 1773 outline items + Catalog + Info)
+```
+
+The output reparses cleanly under both pdf-lib's full
+`PDFDocument.load` and poppler's `pdfinfo` (PDF 1.4, 1638 pages,
+A4, all metadata intact). Outline navigation works in the viewer.
+
+### The size tradeoff
+
+`pdf-lib`'s `save()` quietly deflate-compresses content streams as a
+side effect of full re-emission. That's why the old output was 17 MB
+even though Chrome's raw PDF is 52 MB. The incremental writer keeps
+Chrome's bytes verbatim, so the final file is essentially "52 MB +
+outline":
+
+| Output mode       | Final PDF size |
+| ---               | --- |
+| pdf-lib roundtrip | 16.9 MB |
+| incremental       | 52.7 MB |
+
+This is the same uncompressed-streams problem the initial findings
+section flagged ("Chrome isn't compressing streams aggressively").
+Two ways to claw the size back without going back to a full parse,
+both independent follow-ups:
+
+1. **qpdf post-pass** -- `qpdf --object-streams=generate
+   --compress-streams=y in.pdf out.pdf` re-emits the file with deflate
+   on every stream, without reifying document semantics. C++,
+   skips object-by-object reconstruction; should be much faster than
+   pdf-lib's load. Adds a binary dependency.
+2. **Deflate inside the writer** -- detect raw streams without
+   `/Filter` in the parsed objects and rewrite them with
+   `/Filter /FlateDecode` + a pako-deflated body. Same engineering
+   shape as qpdf but in JS, and lets the incremental update stay
+   self-contained. Requires walking the full body of the original
+   PDF, which puts back some of the cost we just removed.
+
+The incremental writer ships as-is; pick a size strategy when /
+if file size becomes a concern.
+
+### Production integration
+
+`measure.mjs --incremental` exercises the writer for measurement.
+`docs/book.bat` doesn't ship it: production goes through the pdf-lib
+roundtrip path (with `parseSpeed: Fastest`, which now takes ~5 s and
+yields the 17 MB compressed output). Switching production to the incremental
+writer is a one-line change in `docs/render-book.mjs` (call
+`applyOutlineAndMetadataIncremental` from `../perf/incremental-pdf.mjs`
+instead of `PDFDocument.load + ... + save`), gated behind whether the
+larger output is acceptable for that pipeline.
+
+## Profiling pdf-lib's load: 79 % was idle yielding
+
+The "Fix applied: detach-pages" section above showed the pdf-lib
+roundtrip at 39.7 s for the process phase. After profiling, **most
+of that wasn't pdf-lib doing work -- it was pdf-lib yielding to the
+event loop**.
+
+`PDFDocument.load` defaults to `parseSpeed: ParseSpeeds.Slow = 100`
+objects per tick, with an `await waitForTick()` between batches.
+`pdfDoc.save` does the same with `objectsPerTick: 50`. For our
+~50k-object PDF that's ~500 yields during load, ~1000 during save,
+each costing ~5-10 ms of pure idle on a quiet system.
+
+A CPU profile of `PDFDocument.load` running standalone on the 52 MB
+Chrome output (`node --cpu-prof`, fresh process, no concurrent work):
+
+```
+samples: 3441   duration: 6.09s   us/sample: 1770
+
+   self_ms   self_%   function  @  source
+   -------   ------   ----------------------------------------------
+   4766.25   78.92%   (idle)                  (V8 idle wait)
+    251.41    4.16%   PDFRef.of               PDFRef.js:34
+    196.53    3.25%   (garbage collector)
+    116.85    1.93%   (program)
+     63.74    1.06%   PDFObjectParser.parseString
+     46.03    0.76%   BaseParser.parseRawInt
+     38.95    0.64%   BaseParser.parseRawNumber
+     35.41    0.59%   PDFObjectParser.parseNumberOrRef
+```
+
+On a 6 s load, **4.77 s is V8 sitting on its hands** between
+scheduled batches. Actual parsing self-time is well under a second;
+the rest is GC and V8 internals.
+
+Why such a cautious default? pdf-lib targets the browser too, where
+locking the main thread for 30+ s to parse a big PDF would freeze the
+page. In Node, with the harness having no other work to do, yielding
+is pure overhead.
+
+### Wins from `parseSpeed: Fastest` (objects/tick = Infinity)
+
+Three-variant roundtrip on the same 52 MB PDF, fresh process each
+time (`profile-roundtrip.mjs`):
+
+| parseSpeed / objectsPerTick | load   | save  | total   |
+| ---                         | ---    | ---   | ---     |
+| **Slow / 50 (default)**     | 36.7 s | 3.8 s | 40.5 s  |
+| Fast / 1500                 | 3.0 s  | 2.6 s | 5.6 s   |
+| **Fastest / Infinity**      | **2.0 s** | **2.7 s** | **4.7 s** |
+
+`save` is barely affected by `objectsPerTick` -- its CPU work
+dominates the yield overhead -- but `load` collapses by **18x**.
+
+### Wired into the harness
+
+`measure.mjs`'s default pdf-lib roundtrip path now passes
+`parseSpeed: ParseSpeeds.Fastest` and `objectsPerTick: Infinity`.
+End-to-end on the book (`--detach-pages`, default = pdf-lib path,
+no `--incremental`):
+
+| Phase    | Old pdf-lib defaults | Fast knobs | Δ |
+| -------- | -------------------- | ---------- | --- |
+| render   |  50.9 s   |  45.7 s   | noise |
+| generate |  60.2 s   |  52.4 s   | noise (Chrome variance) |
+| process  |  39.7 s   |   7.8 s   | **-31.9 s (-80 %)** |
+| **total**| **150.7 s** | **105.9 s** | **-44.8 s (-30 %)** |
+
+Result: the pdf-lib roundtrip is now **competitive with the
+incremental writer** (105.9 s vs 110.3 s total) **while still
+producing a 17 MB output** (vs 53 MB for incremental, because
+`save()` flate-compresses content streams as it re-emits them).
+
+### What this reinterprets
+
+The "Fix applied: detach-pages" table is still accurate, but its
+39.7 s process column reflects pdf-lib's default tick-yielding, not
+its actual work. A reader benchmarking pdf-lib on its merits should
+compare against the **7.8 s** number, not 40 s.
+
+The incremental writer (above) still produces the fastest process
+phase by far (0.25 s) and remains useful when sub-second matters
+more than file size. But for the common case the single-line
+`parseSpeed: Fastest` tweak is the immediate win.
+
+## Chromium `Page.printToPDF` knob survey
+
+While we were here, we audited which Chromium / CDP options affect
+PDF output. Partly to confirm "is there something Chrome could
+compress for us?" (no), partly because one option turned out to be
+a real win: `outline: true`.
+
+Verified against `devtools-protocol@0.0.1312386` and
+`puppeteer-core@22.15.0` (both shipped under `perf/node_modules`).
+
+### `outline: true` -- Chrome can emit /Outlines itself
+
+CDP's `Page.printToPDF` accepts `generateDocumentOutline: true` since
+Chrome M122 (Feb 2024). Puppeteer exposes it as `outline: true` since
+v22.x. Behaviour:
+
+- Chrome walks the rendered DOM's `<h1>..<h6>` once and emits a
+  /Outlines tree with **page+coords destinations** (`[N 0 R /XYZ x y z]`)
+  instead of named destinations.
+- Implies `tagged: true` (the outline is built from the accessibility
+  tree). Puppeteer enforces this in `util.ts:395`.
+- Requires the launch flag `--generate-pdf-document-outline`.
+  Puppeteer 22+ adds it automatically in `ChromeLauncher.defaultArgs()`,
+  so both `measure.mjs` and `docs/render-book.mjs` get it for free.
+- **No tag-level filter**: walks `h1..h6` unconditionally. There is
+  no equivalent of our `--outline-tags h1,h2,h3,h4` knob.
+
+Measured cost on the 1638-page book with `--chrome-outline --detach-pages`:
+
+| Phase    | injected outline | Chrome outline | Δ |
+| -------- | ---------------- | -------------- | --- |
+| generate |  52.4 s   |  53.8 s   | +1.4 s (Chrome walking the headings) |
+| process  |   7.8 s   |   5.3 s   | -2.5 s (no outline objects to save) |
+| **total**| **105.9 s** | **107.8 s** | +1.9 s |
+
+Total is roughly a wash -- one cost shifts to another. The real
+benefit is **fewer moving parts**: no `parseOutline`, no
+`setOutline`, no incremental-writer outline objects, just metadata.
+
+### Does Chrome's outline match the injected one?
+
+We diffed the two outputs on the 1638-page book (`compare-outlines.mjs`):
+`results/pdf-lib-fastest/book.pdf` (injected, 1773 entries from
+`--outline-tags h1..h4`) versus `results/chrome-outline-on/book.pdf`
+(Chrome's, 6023 entries total).
+
+Naïvely filtering Chrome's tree to "depth ≤ 3" to approximate our
+h1..h4 view gives 1820 entries -- close to 1773 in count, but **not
+equivalent** structurally. Two reasons:
+
+1. **Chrome walks all h1..h6 unconditionally.** First concrete
+   divergence is at the "Alias Types" section: the source
+   ([book.html:302](../../docs/_site-pdf/book.html:302)) has
+   `<h5 id="ch-Features-Language-Alias-Types-example">Example</h5>`
+   immediately after the h3 "Alias Types". Our `--outline-tags`
+   filter correctly drops it; Chrome includes it. Every such
+   insertion shifts the rest of the pre-order walk.
+2. **Chrome's tree depth ≠ HTML heading level.** Chrome collapses
+   skipped levels: an `<h5>` directly under an `<h3>` becomes
+   depth+1 (not depth+2). So "filter to depth ≤ 3" does *not*
+   extract "h1..h4 only" -- it extracts the first four levels of
+   *nesting*, which can be any mix of h1..h6 depending on context.
+
+Numerical summary:
+
+| metric                                  | value |
+| ---                                     | --- |
+| injected entries                        | 1773 |
+| Chrome entries (h1..h6, all depths)     | 6023 |
+| Chrome entries filtered to depth ≤ 3    | 1820 |
+| pre-order matches (vs injected)         | 27 / 1820 |
+| same title+depth, different page        | 10 |
+
+The 10 "page-only mismatches" are the smoking gun for structural
+drift: same heading title in both outlines but pointing at different
+sections of the book. The deltas grow as the walk progresses --
+e.g. "Properties" at A=p956 vs B=p883 (Δ = -73 pages), and similar
+near the end of the book. By that point Chrome and our outline are
+literally talking about different headings that happen to share a
+name (every class in the reference docs has its own "Properties"
+sub-heading).
+
+### Selectively excluding headings from Chrome's outline
+
+Chrome's outline is built from the accessibility tree (puppeteer
+enforces `tagged: true` alongside `outline: true` for this reason).
+Anything that hides a heading from a11y excludes it from the outline.
+Tested matrix (`probe-outline-exclusions.mjs`):
+
+**Excluded** -- the heading is dropped from `/Outlines`:
+
+| attribute on the heading or an ancestor | clean? | notes |
+| --- | --- | --- |
+| `role="presentation"`     | yes | Removes heading semantic only. Visual rendering, DOM, anchor `#id` targets all unchanged. **The cleanest knob.** |
+| `role="none"`             | yes | Alias of `presentation`. |
+| `role="generic"`          | yes | Any non-heading role works. |
+| `aria-hidden="true"`      | -   | Excludes the whole subtree from a11y. Heavier -- also affects screen readers. |
+| `hidden` attribute        | no  | Also visually hides. |
+| `display: none`           | no  | Same. |
+| `visibility: hidden`      | no  | Same. |
+
+**No effect** -- Chrome ignores these:
+
+| attribute            | why |
+| ---                  | --- |
+| `bookmark-level: none` (CSS GCPM) | Chrome doesn't implement GCPM. |
+
+**Reverse direction.** `<div role="heading" aria-level="3">Foo</div>`
+*adds* an h3-level entry to Chrome's outline despite not being an
+HTML heading. Useful if you ever want an outline entry that doesn't
+look like a heading on screen.
+
+**Implication for our pipeline.** The "Chrome's outline is too
+noisy" objection above isn't actually structural -- it's one CSS
+selector away from being fixed. A preprocessor step that adds
+`role="presentation"` to every `<h5>` and `<h6>` in the Jekyll
+build would let Chrome's `outline: true` produce the same h1..h4
+view we want today. We haven't done that step yet, so we still
+ship the injected outline -- but the path from "Chrome's outline
+works for measurement only" to "Chrome's outline ships in
+production" is now ~5 lines of Jekyll plugin code, not a
+fundamental redesign.
+
+### Did pagedjs-cli ever try Chrome's outline?
+
+No. Searched (`gh api search/issues`, `gh api search/code`, and
+web search):
+
+- `repo:pagedjs/pagedjs-cli outline` -- 2 hits, both unrelated
+  (TOC page-number bug, rowspan/colspan).
+- `org:pagedjs chromium outline` -- 1 hit (the same TOC bug).
+- `"pagedjs printToPDF outline"` -- 0 hits.
+- `generateDocumentOutline org:pagedjs` (code search) -- 0 hits.
+- `"--generate-pdf-document-outline" org:pagedjs` -- 0 hits.
+
+Timing: Chrome's `generateDocumentOutline` shipped M122 (Feb 2024);
+[pagedjs-cli](https://github.com/pagedjs/pagedjs-cli)'s last
+meaningful change is May 2024 (Docker hyphenation). The project
+is in near-maintenance mode (21 stars). The feature post-dates
+active development, and the unfilterable-outline regression
+(without the `role="presentation"` workaround above) would have
+been a real concern for existing `--outline-tags` users -- so
+even a casual look would probably have ended in "we'll keep
+injecting for now". Nobody appears to have looked.
+
+### What's not exposed in CDP (we checked)
+
+- **No stream-compression flag.** Chromium uses Skia's `SkPDF`,
+  which writes content streams uncompressed. There's a C++-only
+  `SkPDF::Metadata::fCompressionLevel` setting; no CDP plumbing for
+  it. This is *why* `save()` re-emission shrinks 52 MB → 17 MB.
+- **No object-streams flag, no font subsetting / image downsampling
+  knobs, no PDF/A mode.** Skia subsets fonts automatically per face.
+- **No parallelism knob.** Generate's 60 s in `page.pdf()` is
+  single-threaded Skia walking the layout tree.
+
+### What might still be worth trying
+
+- **`tagged: false`** -- drops the StructTreeRoot, saving ~10-20 %
+  of generate time and file size. Loses accessibility *and* the
+  Chrome outline (tagging is a prerequisite). Probably a no for
+  our use; documenting for completeness.
+- **`pageRanges` sharding** -- run `page.pdf()` N times with
+  disjoint ranges on parallel browser pages. Each shard serialises
+  only its slice and they run concurrently. Biggest unused lever
+  for the 60 s generate phase, but requires a PDF concatenation
+  post-pass (pdf-lib can do it).
+- **`transferMode: 'ReturnAsStream'`** -- nothing to do: puppeteer
+  already hard-codes it. Without it Chrome buffers + base64-encodes
+  the whole PDF into one JSON message; very slow and memory-heavy.
+
+## Where this leaves us
+
+The full menu of fixes, all measured against the original 207 s
+baseline:
+
+| Configuration                          | render | generate | process | total | size |
+| ---                                    | ---    | ---      | ---     | ---   | ---  |
+| original                               | 103.8s | 63.6s    | 39.6s   | 207.0s | 17 MB |
+| + detach-pages                         |  50.9s | 60.2s    | 39.7s   | 150.7s | 17 MB |
+| + detach + **parseSpeed:Fastest**      |  45.7s | 52.4s    |  7.8s   | **105.9s** | **17 MB** |
+| + detach + incremental writer          |  49.2s | 60.9s    |  0.25s  | 110.3s | 53 MB |
+| + detach + Chrome outline              |  48.7s | 53.8s    |  5.3s   | 107.8s | 17 MB |
+
+**Practical winner: `+ detach + parseSpeed:Fastest`.** Half the
+original wall time, same output size, one-line change. Ship this
+first regardless of what else gets layered on top.
+
+The incremental writer is still the fastest process phase (0.25 s)
+and remains the right answer if file size doesn't matter and
+sub-second process does.
+
+Chrome's outline is the simplest *architecture* (no parseOutline,
+no setOutline, no incremental outline objects -- just metadata),
+and the "unfilterable h1..h6" objection turns out to be a
+preprocessor change away from being solved: tag every `<h5>` /
+`<h6>` in the Jekyll build with `role="presentation"` and Chrome's
+outline collapses to the same h1..h4 view we want today. With that
+change, the totals look like:
+
+| Configuration                                     | render | generate | process | total | size |
+| ---                                               | ---    | ---      | ---     | ---   | ---  |
+| + detach + parseSpeed:Fastest *(today)*           |  45.7s | 52.4s    |  7.8s   | 105.9s | 17 MB |
+| + detach + parseSpeed:Fastest + Chrome outline    |  48.7s | 53.8s    |  5.3s   | 107.8s | 17 MB |
+| *(latter, with role="presentation" on h5/h6 -- pending)* | | | | | |
+
+The compound win isn't in wall time -- it's in deleting code:
+`parseOutline`, `setOutline`, and the entire outline branch of the
+incremental writer all go away. Worth it if/when someone wants to
+trim the surface area.
+
+## Dropping `pagedjs-cli`
+
+`pagedjs-cli` did three useful things for us and one harmful one. On
+the useful side: it shipped the paged.js browser bundle in
+`dist/browser.js`, the outline + metadata helpers in
+`src/outline.js` and `src/postprocesser.js` (~250 LOC total), and a
+CLI wrapper for the pdf pipeline. On the harmful side, the wrapper
+calls `PDFDocument.load(pdf)` and `pdfDoc.save()` with no options
+and therefore inherits the slow defaults that wasted ~32 s per build
+(see "Profiling pdf-lib's load" above). Patching upstream to fix
+that is plumbing for plumbing's sake; the rest of pagedjs-cli is
+already mostly duplicated by our harness.
+
+So we vendored what we needed and dropped the dep:
+
+- `docs/lib/paged.browser.js` -- `pagedjs-cli@0.4.3/dist/browser.js`,
+  byte-for-byte. MIT-licensed; license header preserved at top of file.
+- `docs/lib/outline.mjs`  -- `src/outline.js`, ESM-ified, attribution
+  in the file header.
+- `docs/lib/postprocesser.mjs` -- `src/postprocesser.js`, same.
+- `docs/render-book.mjs` -- the production driver. Argv-compatible
+  with the subset of `pagedjs-cli` flags `book.bat` actually used
+  (`-o`, `--outline-tags`, `-t`, `--additional-script`). Calls
+  pdf-lib with `parseSpeed: Fastest` + `objectsPerTick: Infinity`
+  inline, no patching required.
+- `docs/book.bat` -- swapped `npx pagedjs-cli ...` for
+  `node render-book.mjs ...`. Same CLI, ~32 s faster (pdf-lib idle
+  yielding gone), one fewer transitive dependency tree.
+
+Both `docs/package.json` and `perf/package.json` now depend directly
+on `puppeteer` + `pdf-lib` + `html-entities` instead of inheriting
+them via `pagedjs-cli`. `perf/measure.mjs` imports from `docs/lib/`
+so the harness and production share the exact same code path through
+the helpers and bundle -- whatever production renders, the harness
+measures.
+
+End-to-end on the 1638-page book through the new driver:
+
+```
+render:   53.5s  (1638 pages)
+generate: 68.8s  (raw 52.3 MB)
+process:  5.1s
+saved:    docs\_pdf\book.pdf  (16.9 MB)
+total:    130.4s
+```
+
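+(The total includes ~3 s of puppeteer launch + page nav overhead the
+harness elides: the three phases sum to 127.4 s. The rest of the gap
+to the harness's 105.9 s headline comes from the render and generate
+phases, which ran slower here -- 53.5 s vs 45.7 s and 68.8 s vs
+52.4 s.)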
+
+## Restoring live progress
+
+Dropping `pagedjs-cli` (above) quietly dropped its ora spinners
+along with the rest of the CLI. The terminal goes silent for the
+~50 s render and ~60 s generate phases -- on a 130 s build, most
+of the wall time looks like the process is hung.
+
+Render phase: restored via `docs/lib/progress-handler.js`, a small
+`Paged.Handler` subclass that emits a `[render-progress] page=N
+elapsed=Ns` line from `afterPageLayout`. `render-book.mjs` listens
+on `page.on('console')` and re-renders the line as a
+`\r`-overwritten TTY status (`rendering: 234 pages (12.4s)`), or
+every 100 pages on its own line when stdout is piped (CI / log
+files). The live line is cleared just before the final
+`render: 53.5s (1638 pages)` summary is printed.
+
+The handler is a separate in-page script rather than inlined into
+`render-book.mjs` because `addScriptTag({ path })` loads it via
+file:// into the headless page -- it has to be a real file. It's
+structurally parallel to `perf/timing-handler.js`, which uses the
+same hook but additionally retains per-page detail on
+`window.__pagedTiming` for offline analysis. The production version
+stays minimal -- just the log line.
+
+Generate phase: a 500 ms wall-clock heartbeat in `render-book.mjs`
+writes `generating: 23.4s` to a `\r`-overwritten TTY line during
+the `page.pdf()` wait. Elapsed time only; no byte- or page-count
+signal. The line is cleared before the final
+`generate: 68.8s (raw 52.3 MB)` summary, same shape as the render
+phase.
+
+We initially tried byte-level progress -- drive `page.pdf()` at the
+CDP level with `transferMode: 'ReturnAsStream'` + chunked `IO.read`.
+On the Chromium we ship with, the bytes don't actually stream:
+Chrome's SkPDF writer buffers the whole document internally and
+emits all 52 MB in one tick at the end. The wrapper showed `0.0 MB`
+for ~50 s then flickered `52 MB` for one frame before the summary
+-- the heartbeat was doing all the visible work. Dropped the CDP
+code; the buffer-then-dump finding is preserved in a comment above
+the heartbeat so the next person doesn't re-investigate.
+
+The process phase stays silent. At ~5 s with the fast pdf-lib knobs
+(`parseSpeed: Fastest`) it's not worth a progress signal of its own.
diff --git a/perf/notes/02-finalizepage.md b/perf/notes/02-finalizepage.md
new file mode 100644
index 0000000..367f867
--- /dev/null
+++ b/perf/notes/02-finalizepage.md
@@ -0,0 +1,1010 @@
+# finalizePage deep dive
+
+Revisiting `AtPage.finalizePage` after the detach-pages fix exposed it as the next-largest self-time row, then chasing the residual O(n) further: looking past finalizePage into `Layout.textBreak` / `Page.create`, attempts that failed (binary-search textBreak; memoize `getBoundingClientRect`), the actual residual (CSS-Grid sibling sweeps over `display:none` pages, fixed with `aggressive-detach`), and five attempts at `createBreakToken` dedup that surfaced an additive-backoff fix for `renderTo`'s overflow check.
+
+## Revisiting `AtPage.finalizePage`
+
+The post-detach CPU profile in *Fix applied: `perf/detach-pages.js`*
+(see [01-baseline-and-detach.md](01-baseline-and-detach.md)) showed an
+`(anonymous) @ browser.js:29501` row at **13.7 s
+self-time** -- the `["top","bottom"].forEach(...)` lambda inside
+`AtPage.finalizePage`. That looked like a fat target.
+
+It wasn't, for two reasons:
+
+1. **The 13.7 s number was stale.** It came from the *first*
+   detach-pages.js variant, which hooked `afterPageLayout` and hid the
+   page *before* AtPage ran -- so AtPage paid Chromium's slow style-
+   cascade path on a `display:none` subtree (~9 ms/page). The shipping
+   variant hooks `finalizePage` and hides *after* AtPage, so AtPage
+   sees a visible page and the same lambda is **~0.7 ms/page = ~1.1 s
+   total render**. Re-measured on a fresh profile, the lambda is
+   ~1.0 s self-time, not 14 s. The original number is correct for the
+   variant it was measured on, but doesn't reflect current ship.
+2. **Most of that ~1 s isn't query CPU.** Per-page the method does
+   ~17 `querySelector` calls plus a few `getComputedStyle` reads.
+   Native query self-time across the whole render is ~340 ms
+   (`querySelector` ~155 ms + `querySelectorAll` ~185 ms in the
+   unpatched baseline). The rest of the lambda's ~1 s is the
+   downstream layout flush triggered by `getComputedStyle` and the
+   style writes -- unaffected by query consolidation.
+
+We patched it anyway, as a cleanup. `docs/lib/paged.browser.js`'s
+`finalizePage` now builds a `__mLookup` table once per page via a
+single `querySelectorAll` over all 16 known margin-cell + margin-
+group class selectors, then the two forEach loops index that table
+instead of calling `page.element.querySelector(...)` 4× per
+iteration. The patch is marked `// PATCH: consolidate` at each of
+the three touch points so a future re-vendoring of the bundle can
+grep for it.
+
+### A/B results
+
+Interleaved 3+3 (A1 B1 A2 B2 A3 B3), `--detach-pages --cpu-profile`,
+same 1638-page book each run:
+
+| metric                        | A (patched) | B (unpatched) | Δ |
+| ---                           | ---         | ---           | --- |
+| render wall-clock, mean       | 49.45 s     | 49.91 s       | -0.46 s (noise; within-variant range 4-13 s) |
+| `querySelector` self-time     | <50 ms      | 155 ms        | -155 ms |
+| `querySelectorAll` self-time  | 247 ms      | 183 ms        | +64 ms |
+| **query CPU total**           | **~247 ms** | **~338 ms**   | **-91 ms (-27 %)** |
+| finalizePage lambda self-time | 1033 ms     | 1025 ms       | unchanged |
+
+The patch does what it says on the tin: ~155 ms leaves native
+`querySelector`, ~64 ms of it reappears in the single
+`querySelectorAll`, and the net saving is ~91 ms. Wall-clock
+delta is in the noise; the within-variant spread (4-13 s across runs
+of the same variant) drowns it out.
+
+The lambda's self-time being unchanged is the load-bearing
+observation: query consolidation doesn't reduce the layout-flush
+component, which is most of the 1 s. The next lever in this method
+would be **read/write batching** -- hoist all `getComputedStyle`
+reads to the top of `finalizePage` before any style writes, so the
+write-then-read pattern stops forcing a flush mid-method.
+
+### Read/write batching
+
+We applied the hoist anyway, as a follow-up cleanup. After the
+`__mLookup` block above, `finalizePage` now reads every relevant
+`max-width` / `max-height` value into two `Map`s (`__maxW`, `__maxH`)
+in a single batch -- gated by the same `.hasContent` check the
+original conditionals used. The two forEach loops then consume
+those cached values instead of calling `getComputedStyle` inline.
+Marked `// PATCH: max-width reads hoisted` / `max-height reads
+hoisted` at each touch point.
+
+**For this book**, the hoist is a no-op behaviourally. Our @page CSS
+sets content on exactly one corner (bottom-right page number), so
+only one `.hasContent` cell exists per page; the original code did
+exactly one `getComputedStyle` per page and therefore one forced
+flush. The hoisted version does the same.
+
+Smoke test, single render with `--detach-pages` (no profiling): 1638
+pages, 16.9 MB output, render 47.98 s, ratio 1.69x. All in the noise
+band from the consolidate-querySelector A/B.
+
+**For docs with multi-cell marginalia** (running headers + footers +
+page numbers across several corners) the hoist collapses N forced
+flushes -- one per cell that hits the `if (xContent)` branch in the
+original -- down to 1. The win scales with marginalia density.
+
+### Cross-page memoization
+
+The next layer of duplicate work: `finalizePage`'s computation is a
+pure function of `(page.element.className, this.marginalia, CSS
+@page rules)`. The marginalia map and CSS are static; only the
+className varies. **Two pages with the same className get the same
+four `grid-template-columns` / `grid-template-rows` values.** So we
+cache the result.
+
+Implementation: a `this.__finalizeCache: Map<string, {top, bottom,
+left, right}>` on the AtPage instance, keyed by
+`page.element.className`. The cache check sits between the
+`__mLookup` build and the GCS hoist. On a hit we apply the cached
+values via `__mLookup` and `return` -- Phases B and C never run. On
+a miss the existing code runs and the result is recorded at the end
+of the method by reading back the just-written `.style.grid-
+template-*` values.
+
+Phase A's marginalia `.hasContent` classifier still runs on every
+page (the class has to be added to *this* page's elements so the
+@page-margin CSS rules apply). Only the grid-template
+computation is skipped.
+
+**Assumption.** Cache key is `page.element.className`. Sound as long
+as @page rules don't use position-dependent selectors (e.g.
+`:nth-of-type`) that pick different rules on pages that share a
+className. Common case, true for this book; comment in the bundle
+flags the caveat.
+
+Smoke render (`--detach-pages`, no profile): 1638 pages, **16.9 MB
+output (byte-equivalent to the pre-patch run)**, render 48.27 s.
+Wall-clock impact still in the noise -- same reason as the hoist:
+the flush we skip in `finalizePage` is just deferred to the next
+chunker iteration's `findOverflow`. Total layout work the document
+demands doesn't shrink. What does shrink is the JS-side work --
+~1633 of 1638 pages now skip ~17 `querySelector` lookups, 6
+`classList.contains` reads, and the GCS pass entirely -- but that's
+sub-millisecond per page and disappears into the noise band.
+
+We're not going to keep iterating on `finalizePage`: budget is ~1 s
+total render even when every flush triggers, so further work here is
+cleanup-only.
+
+### Hoisting grid-template emission to parse time
+
+The cleanup payoff. Three patches in a row -- `__mLookup`, GCS hoist,
+cross-page memoization -- had whittled `finalizePage`'s per-page
+work to ~30 sub-ms ops, then to one Map lookup. The architectural
+move was to **delete the hot spot rather than keep optimizing
+around it**: hoist the grid-template computation out of the
+per-page JS path and into the polisher's @page CSS emission, so the
+rules are emitted once at parse time and the browser applies them
+via cascade for every matching page.
+
+The decision tree's inputs are static at parse time:
+
+- **hasContent** per `(page-class, margin-cell)` -- already recorded
+  in `this.marginalia[sel]` by `addMarginaliaStyles` for Phase A's
+  classifier, and invariant per page-class regardless of page index.
+- **max-width / max-height** per cell -- created by the same walker
+  that copies `width`/`height` declarations to `max-width` /
+  `max-height` on corner cells. The runtime
+  `getComputedStyle(el)["max-width"]` reads return the CSS-cascade
+  result of those rules, which is the value the parser saw. We
+  capture the string at parse time on the marginalia entry,
+  defaulting to `"none"` when no declaration exists.
+
+`AtPage.afterTreeWalk` already runs `addPageClasses`, which
+populates `this.marginalia` and emits the per-cell margin-styling
+rules. We extended it with `emitMarginGridTemplates`: for each page
+entry in `this.pages`, build the effective per-cell `hasContent +
+maxWidth + maxHeight` by unioning across every marginalia entry
+whose page-selector is a subset of the page's class signature
+(matching the runtime Phase A OR-cascade; `maxWidth` follows CSS
+cascade and takes the most-specific declared value). Run the same
+decision tree the runtime did on that snapshot. Emit one rule per
+margin group with `selectorsForPage(page)` as the selector and the
+computed `grid-template-columns` / `-rows` as a Raw value
+declaration. Skip emission for the four offset-fallback branches
+that need `offsetWidth` measurement (they can't be pre-computed --
+they read live layout).
+
+For this book that produces 24 rules total -- 6 page-class
+signatures (`*`, `:first`, `divider`, `front-matter`,
+`part-foreword`, `chapter-divider`) × 4 margin groups -- all with
+the same `0 0 1fr` value (the static branch the decision tree
+produces when only one corner has content and no widths are
+declared):
+
+```css
+.pagedjs_page .pagedjs_margin-top    { grid-template-columns: 0 0 1fr; }
+.pagedjs_page .pagedjs_margin-bottom { grid-template-columns: 0 0 1fr; }
+.pagedjs_page .pagedjs_margin-left   { grid-template-rows:    0 0 1fr; }
+.pagedjs_page .pagedjs_margin-right  { grid-template-rows:    0 0 1fr; }
+... (5 more page-class signatures) ...
+```
+
+`finalizePage` collapses to **Phase A + an offset-only Phase B**:
+
+- **Phase A** unchanged. Per-page DOM, can't be hoisted -- it has to
+  add `.hasContent` to the freshly created margin cells so the
+  base-style `.pagedjs_margin:not(.hasContent) { visibility: hidden
+  }` rule stops hiding the right ones.
+- **Phase B offset fallbacks.** The four branches in the upstream
+  Phase B that compute `minmax(%, ...)` templates from `offsetWidth`
+  measurements stay -- they read live layout and can't be
+  pre-computed. The forEach loop early-exits via a `couldFire` check
+  (two-or-more cells have content) before any `getComputedStyle` or
+  `querySelector` on the margin group; for this book that gate fails
+  on every page so the forEach is dominated by three `querySelector`
+  calls + three `classList.contains` reads per group.
+- **Phase C** disappears entirely. Every branch in the upstream
+  Phase C (left/right vertical groups) is static at parse time --
+  the upstream code has no offset measurement in those paths.
+- Two of the three prior PATCH blocks come out: `__mLookup` and
+  cross-page memoization had no callers left. Only the GCS hoist
+  stays (preserved as an inline batched read of `max-width`
+  inside the `couldFire` gate, for documents whose marginalia would
+  reach the offset fallbacks).
+
+### Verifying it
+
+Instrumented A/B on the same 1638-page book:
+
+| op                      | pre-emit (3 patches) | post-emit | Δ |
+| ---                     | ---                  | ---       | --- |
+| `getComputedStyle`      | 9,179 calls          | **5,903 calls** | **-3,276 (-36%)** |
+| `getBoundingClientRect` | 258,940              | 258,940   | unchanged (different code path) |
+| `offsetWidth`           | 0                    | 0         | unchanged (gate never fires) |
+| render wall-clock       | 47.6 s               | 46.0-47.0 s | noise |
+| pdf size                | 16.9 MB              | 16.9 MB   | unchanged (±27-bytes timestamp variance) |
+
+The -3,276 GCS drop is exactly two reads per page eliminated -- the
+prior GCS hoist batched the per-cell `max-width` reads on
+`.hasContent` cells (one per `top-right`, one per `bottom-right`
+per page). The new `couldFire` early-exit skips them entirely.
+
+Wall-clock is in the noise, as predicted in the patch brief: this
+moves work from runtime JS to parse-time CSS but the browser still
+does the same cascade + layout work. The value here is **deleting
+the hot spot from the bundle**, not shaving milliseconds.
+
+Smoke render of `book.bat`: 1638 pages, 16.9 MB output (within 54
+bytes of the pre-patch run -- ±27 bytes is the normal run-to-run
+variance from Chrome's `/CreationDate` / `/ModDate` encoding),
+render 45.8 s.
+
+### What's left in `finalizePage`
+
+Two phases, both with clear single-purpose justifications:
+
+```
+Phase A   classify .hasContent per margin cell (per-page DOM)
+Phase B'  offset-fallback for auto-width minmax(%) templates
+          (dead code in this book; live for paged.js compatibility)
+```
+
+For our content Phase B' is dominated by an early `couldFire`
+short-circuit. The method now reads top-to-bottom as "what does the
+runtime *have* to do per page", with all the layered optimizations
+unwound. There's nothing left to hoist.
+
+## Looking past `finalizePage`: where render time goes now
+
+With the `finalizePage` work landed, a fresh `--detach-pages
+--time-hooks --cpu-profile` run on 1638 pages (2026-05-19) shows the
+named handlers we hook -- the surface we own -- now account for
+**under 1 ms/page combined**. Per-page handler costs, top of table:
+
+```
+hook::handler                                count  total_ms  per_page_ms
+chunker.afterPageLayout (detach-pages)        1638     788.5        0.481
+chunker.afterPageLayout (#10)                 1638     249.0        0.152
+chunker.renderNode                           44365     185.6        0.113
+chunker.afterPageLayout (#6)                  1638     100.9        0.062
+chunker.finalizePage                          1638      71.8        0.044
+chunker.beforePageLayout                      1638      68.6        0.042
+```
+
+Render is ~49 s on this hardware (~30 ms/page average). Subtracting
+the ~1 ms/page of handler work leaves ~29 ms/page of **paged.js
+core**: chunking, layout probing, overflow detection, and the
+text-break split. That's what the CPU profile attributes to:
+
+```
+self_ms   self_%   function                     source
+22855     33.0 %   getBoundingClientRect        (native, called from JS)
+19332     27.9 %   (program)                    V8 overhead / idle
+ 9931     14.4 %   removeOverflow               paged.browser.js:2196
+ 4280      6.2 %   findEndToken                 paged.browser.js:2094
+ 2364      3.4 %   findElement                  paged.browser.js:638 (cache hit; cheap)
+ 1456      2.1 %   insertBefore                 native
+ 1228      1.8 %   createBreakToken             paged.browser.js:1796
+  580      0.8 %   afterPageLayout (paged.js)   paged.browser.js:30381
+```
+
+(Counter-check on the ratio: this run reads **5.59 x** rather than
+the usual ~1.6 x. That's instrumentation skew -- both `--time-hooks`
+and `--cpu-profile` wrap hot paths, and the sampling overhead is
+proportionally larger on later pages. The handler totals and
+self-time table are still accurate; the per-page growth curve isn't
+trustworthy on instrumented runs.)
+
+So 33 % of render is `getBoundingClientRect` and another ~20 % is
+inside `removeOverflow` + `findEndToken` -- paged.js's per-page
+overflow-find + text-split path. That work isn't redundant: each
+page genuinely has to decide where its content ends. The remaining
+opportunities aren't *eliminating* work, they're *replacing the
+algorithm* with something the browser can answer in one call.
+
+### Three places non-redundant work could be made simpler
+
+**1. `Layout.textBreak` -- replace per-word `gBCR` loop with a
+single native call.** [paged.browser.js:2136](../../docs/lib/paged.browser.js:2136)
+walks an overflowing Range word-by-word, calling
+`getBoundingClientRect` on each `Range` to find which word crosses
+the page boundary; if a word straddles it, it descends letter-by-
+letter doing the same. On a long text node that's dozens to
+hundreds of gBCR calls -- and `textBreak` is the inner loop of
+`findOverflow`, so it fires on every page that overflows.
+
+A single `document.caretPositionFromPoint(x, vEnd)` (or
+`caretRangeFromPoint` on Chromium) returns the exact text node +
+offset at the boundary in **one** browser call. Equivalently,
+`range.getClientRects()` returns every line box of the range in one
+call, after which the crossing line is a simple `.find()`. Either
+replaces an `O(words-in-overflow)` scan with `O(1)`.
+
+This is the highest-leverage candidate: even if it cuts only half
+of the `gBCR` time, that's ~10 s off render. The risk is fidelity
+-- we'd need to verify the substitute gives the *same* split point
+as the word-walk on edge cases (RTL, hyphenated words,
+`white-space: pre`, soft hyphens). Worth a prototype + diff against
+the current bundle's output PDF.
+
+**2. `findOverflow` -- collapse three ancestor walks into one.**
+Inside the per-node loop in
+[paged.browser.js:1934](../../docs/lib/paged.browser.js:1934):
+
+```js
+const insideTableCell = parentOf(node, "TD", rendered);
+// ...
+tableRow = parentOf(node, "TR", rendered);
+// ...
+const table = parentOf(tableRow, "TABLE", rendered);
+```
+
+Three separate ancestor traversals per node visited, each climbing
+from `node` to `rendered`. One walk that emits the nearest TD/TR/
+TABLE together is ~10 lines and visits each ancestor once. Won't
+match #1 for raw savings (this is in the same loop that's already
+calling `getComputedStyle`, so a single-digit % gain at best) but
+it's the easy follow-up.
+
+**3. Cache `getComputedStyle` per page.** Same loop,
+[paged.browser.js:1969, 1974, 1992](../../docs/lib/paged.browser.js:1969):
+up to four `getComputedStyle` calls per node visited (on the node,
+its TD ancestor, and the parent TBODY/THEAD). The walker revisits
+the same ancestors across many child nodes; a `WeakMap<Element,
+CSSStyleDeclaration>` populated lazily per page would dedupe.
+
+This one *is* deduplication-shaped, but it's the cheapest of the
+three to land (no algorithmic change, no fidelity risk) and a clean
+follow-up if #1 lands.
+
+### Probable bug worth surfacing separately
+
+[paged.browser.js:1998](../../docs/lib/paged.browser.js:1998):
+
+```js
+const table = parentOf(tableRow, "TABLE", rendered);
+const rowspan = table.querySelector("[colspan]");
+```
+
+The local is named `rowspan` and the surrounding comment is about
+rowspan-aware break handling, but the selector matches `colspan`.
+Looks like a typo that's silently broken the rowspan path since the
+bundle was vendored. Not a perf issue per se, but worth a separate
+fix.
+
+### Strategic note
+
+Render and generate are now within ~20 s of each other (49 s vs
+70 s on this run). Each second shaved off render moves total by
+less than it used to, because `page.pdf()` is now the larger phase.
+Item 1 above is the only remaining render change that plausibly
+returns 10+ s; items 2 and 3 are <5 s each.
+
+After item 1 the remaining levers all live outside render. The
+Chrome-outline experiment in
+[01-baseline-and-detach.md](01-baseline-and-detach.md) shows
+generate isn't moved by shifting outline work around (Chrome
+walking `h1..h6` itself costs about what `parseOutline` +
+`setOutline` save -- net was +1.9 s).
+The one generate-side lever we haven't tried is **`pageRanges`
+sharding** -- run `page.pdf()` N times with disjoint page ranges on
+parallel browser pages and concatenate with pdf-lib. Each shard
+serialises only its slice and they run concurrently, so generate
+collapses to roughly `60 s / N` plus a small concat pass. Listed
+under *What might still be worth trying* in
+[01-baseline-and-detach.md](01-baseline-and-detach.md); it's the
+biggest untried knob in the pipeline.
+
+## What happened when we tried item 1
+
+The strategic note above was wrong about item 1 -- the binary-search
+replacement for `textBreak` saves nothing, and the reason it saves
+nothing reveals the actual structure of the remaining render cost.
+
+### Attempt A: binary-search `textBreak`
+
+Replaced the per-word-then-per-letter gBCR cascade in
+[`Layout.textBreak`](../../docs/lib/paged.browser.js:2136) with a binary
+search over offsets using a single-character probe `Range`.
+It is semantically equivalent (both return the smallest offset whose
+character satisfies `left >= end || top >= vEnd`) and should reduce
+the gBCR call count from O(words) to O(log nodeLength).
+
+Paired runs with `--detach-pages`:
+
+| run        | baseline | binsearch |
+| ---------- | -------- | --------- |
+| render (1) |  47.73 s |  51.43 s  |
+| render (2) |  47.10 s |  47.12 s  |
+| **avg**    | **47.4** | **49.3**  |
+
+Wash, possibly small regression. PDF byte size and page count
+identical. Reverted.
+
+### Attempt B: memoize `Page.create`'s `area.getBoundingClientRect`
+
+The CPU profile of attempt A's baseline pointed at a much bigger
+target. Tracing gBCR's native frames up to their JS callers in the
+profile graph:
+
+```
+caller                           gBCR time
+create:2257                      12,947 ms   (69 %)
+hasOverflow:1925                  4,419 ms   (24 %)
+Layout:1443                         586 ms
+...
+total native gBCR                18,424 ms
+```
+
+[`Page.create`](../../docs/lib/paged.browser.js:2257) does one
+`area.getBoundingClientRect()` per page, right after the fresh
+`insertBefore` / `appendChild` of the page DOM -- so each call
+forces a synchronous layout pass. The `area`'s size is CSS-driven
+and constant per template, so the gBCR should be cacheable.
+
+Memoized the result on the `pageTemplate` node (first page pays,
+all subsequent same-template pages reuse).
+
+Profile diff (same `--detach-pages --cpu-profile` flags, paired):
+
+| caller            | PRE       | POST      | Δ          |
+| ----------------- | --------- | --------- | ---------- |
+| `create:2257`     | 12,947 ms |      2 ms | **-12,945** |
+| `Layout:1443`     |    586 ms | 13,567 ms | **+12,981** |
+| `hasOverflow:1925`|  4,419 ms |  4,533 ms |    +114    |
+| **total**         | 18,424 ms | 18,554 ms |    +130    |
+
+The cost moved, it didn't disappear. The memoization successfully
+eliminated the gBCR at `create:2257` (from 12,947 ms to 2 ms), but
+the layout flush that gBCR was driving still had to happen
+somewhere -- it migrated to the next call in the per-page sequence,
+[`Layout`'s constructor](../../docs/lib/paged.browser.js:1443):
+
+```js
+this.bounds = this.element.getBoundingClientRect();
+this.parentBounds = this.element.offsetParent.getBoundingClientRect();
+```
+
+Total gBCR self-time barely changed (+130 ms). Per-page ratio got
+worse (1.77x -> 3.07x), probably because the deferred flush
+accumulated more pending mutations before firing. Reverted.
+
+### The lesson
+
+**gBCR self-time in the profile is layout-flush attribution, not
+JS call overhead.** Reducing the *number* of gBCR calls in a hot
+path saves ~nothing if the layout flush they trigger has to fire
+anyway. The cost lives in the flush itself, which is paged.js
+measuring the live layout tree to decide where to break.
+
+Where the residual per-page layout cost actually comes from, after
+`--detach-pages` has already trimmed completed pages out of the
+layout tree, is probably one of:
+
+- **CSS counters** at
+  [`.pagedjs_pages`](../../docs/lib/paged.browser.js:27213)
+  (`counter-reset: pages ... footnote ...`). Counter resolution
+  walks the document, and counter-affecting elements per page
+  accumulate even when `display: none`.
+- **`offsetParent` lookup** in `Layout`'s constructor. That's a
+  layout-tree walk to find the nearest positioned ancestor; cost
+  can grow with sibling count even when most siblings are
+  display:none.
+
+Neither is fixable by dedup-shaped optimizations in our bundle.
+
+The remaining `findOverflow` opportunities (items 2 and 3 in the
+strategic note above -- collapsing ancestor walks, caching
+`getComputedStyle`) might still be worth doing on their own
+merits, but they're not where the gBCR time lives.
+
+### Methodology: compare profiles, not wall-clock
+
+Both attempts above showed wall-clock results that looked like
+noise (47.7 vs 47.1 vs 51.4 s -- inside the run-to-run jitter band
+on a busy dev machine). The actual structural change was only
+visible by **diffing the bottom-up gBCR-caller breakdown across
+two CPU profiles**. The `+12,981 ms` move from `create:2257` to
+`Layout:1443` would have been invisible in a wall-clock A/B.
+
+For any future render-stage optimization work, the rule is:
+
+1. Run with `--cpu-profile` (paired pre/post, same flags).
+2. Compare bottom-up self-time tables ([`analyze-profile.mjs`](../analyze-profile.mjs))
+   and caller breakdowns ([`find-callers.mjs`](../find-callers.mjs);
+   point it at a profile + a callee name to see which frames are
+   paying for that callee's time -- essential for spotting gBCR
+   migration between callers).
+3. Treat the wall-clock totals as a sanity check only -- they
+   confirm "did anything change" but not "where".
+
+This matters because:
+
+- **Render's per-page CPU work is dominated by native (layout,
+  DOM) frames.** V8 self-time deltas from JS-level dedup are
+  small compared to the layout flushes those calls trigger.
+- **CPU sample percentages are stable across machine load.** A
+  busy machine slows the absolute wall-clock but the proportional
+  breakdown (gBCR = ~38 % of render samples) stays the same.
+- **Migrations between attribution sites are common.** Moving a
+  gBCR off one call site usually re-attributes its layout cost to
+  the next caller in the sequence, not to nothing.
+
+For `generate` and `process` the picture is different (Chromium
+internals and pdf-lib parse cost respectively); CPU profiles of
+those phases are less informative because the work happens
+outside the JS we can see, and wall-clock can be a fine
+single-signal A/B. But anything inside paged.js's
+render loop wants a profile diff, not a stopwatch.
+
+## Finding the residual O(n): it's not counters, it's siblings
+
+After the methodology shift to profile-diffing, two more A/Bs
+finally pinned down where the residual per-page layout cost comes
+from. Spoiler: it's not what we expected, and the fix is large.
+
+### Hypothesis 1: CSS counters
+
+The book uses `@bottom-right { content: counter(page); }` for page
+numbers and `article.part-divider { counter-reset: page 0; }` for
+per-part renumbering. paged.js's bundle puts
+`counter-increment: page var(--pagedjs-page-counter-increment);`
+on every `.pagedjs_page`. So on each new page's `@bottom-right`,
+Chromium has to resolve `counter(page)` by walking preceding
+`counter-increment: page` elements.
+
+Per CSS spec (`display: none` elements don't increment counters),
+`--detach-pages`'s `display: none` strategy should already make
+this O(1). But Chromium implementations have historically been
+liberal about which display states still contribute. So: A/B by
+commenting out the `counter-increment: page` rule entirely
+([paged.browser.js:27198](../../docs/lib/paged.browser.js:27198)) and
+diffing the profile.
+
+Result:
+
+| variant                 | render   | total gBCR | gBCR %/render | ratio |
+| ----------------------- | -------- | ---------- | ------------- | ----- |
+| baseline (counters on)  | 48.51 s  | 18,424 ms  | 38 %          | 1.77x |
+| counters disabled       | 44.72 s  | 21,514 ms  | 48 %          | 2.44x |
+
+Disabling counters did **not** reduce gBCR; it grew. The
+wall-clock drop is run-to-run noise (counter resolution is genuinely
+cheap on `display: none` siblings); the proportional growth means
+removing counter-increment didn't save anything and may have shifted
+work elsewhere. **Counter resolution is not the residual O(n).**
+
+### Hypothesis 2: sibling sweeps over `display: none` pages
+
+Re-reading the `--detach-pages` writeup in
+[01-baseline-and-detach.md](01-baseline-and-detach.md): the claim
+has always been that `display: none` "removes a subtree from the
+layout tree entirely". That's true for *layout* -- but Chromium's
+per-page work also includes **style/selector resolution and rule
+matching**,
+which walks the sibling list regardless of display state. With
+1638 `.pagedjs_page` siblings under `.pagedjs_pages`, any per-page
+selector evaluation is O(n).
+
+A/B: physically `removeChild` finalized pages instead of just
+`display: none`, then re-append all at `afterRendered` so
+`page.pdf()` sees them. The chunker passes `lastPage.element` to
+`Page.create()` for ordered insertion, so the most recent finalized
+page has to stay in the DOM -- detach one page behind. DOM holds
+at most 2 pages at any moment: the in-flight one being laid out
+plus the most recent finalized one.
+
+Probe modification (in [perf/detach-pages.js](../detach-pages.js)),
+not shipped; page numbers come out wrong because `counter(page)`
+doesn't accumulate, but the profile signal is clean.
+
+Result:
+
+| metric              | display:none | removeChild | Δ            |
+| ------------------- | ------------ | ----------- | ------------ |
+| **render**          | **48.5 s**   | **28.0 s**  | **-20.5 s (-42 %)** |
+| total native gBCR   | 18,424 ms    | 7,320 ms    | -11,104 ms   |
+| `create:2257` gBCR  | 12,947 ms    | 1,073 ms    | **-11,874 ms (12x)** |
+| `hasOverflow:1925`  | 4,419 ms     | 5,119 ms    | +700 ms      |
+| `Layout:1443`       | 586 ms       | 562 ms      | flat         |
+| per-page ratio      | 1.77x        | 1.43x       | flatter      |
+
+`Page.create`'s layout flush -- the dominant per-page cost in
+every profile we've seen -- went from 12.9 s to 1.1 s. That's the
+work Chromium does to maintain style/selector state across the
+sibling list, and it's now nearly constant per page. `hasOverflow`
+still has a small residual growth but it's an order of magnitude
+smaller and bounds the next plausible optimization target.
+
+**This is the largest single render-stage win we've found in this
+investigation.** 20+ seconds off render, taking render from ~48 s
+to ~28 s -- under half of generate's ~60-70 s.
+
+### Shipping it
+
+The probe rendered the right number of pages but the output PDF
+was incorrect in two ways: `counter(page)` doesn't accumulate
+across detached siblings, and the re-attach loop appended pages
+at the end instead of in original order. Both fixable; the
+question was whether named strings (`string(chapter-title)`)
+would survive detach. Verified empirically: they do.
+
+Final shipped change set:
+
+1. **[perf/detach-pages.js](../detach-pages.js)** -- rewrite
+   from `display:none` to physical `removeChild`. Keep the most
+   recent finalized page in the DOM (the chunker passes
+   `lastPage.element` to `Page.create` for ordered insertion);
+   detach one page behind. At `afterRendered`, detach the keeper
+   and re-append all in finalize order (which is document order).
+
+2. **[docs/lib/paged.browser.js](../../docs/lib/paged.browser.js) -- Counters handler.**
+   Track a running display-page counter on the handler instance,
+   increment per page during `afterPageLayout`, and write the
+   value as `--page-num: "N"` on the page wrapper's inline style.
+   On pages with `[data-counter-page-reset]` (the part dividers),
+   skip the increment -- mirrors the shipping behaviour of the
+   pre-existing CSS, where the injected per-page rule's
+   `counter-increment: none` takes effect but the
+   `counter-reset: page N` part doesn't (cascade/specificity
+   issue, not yet diagnosed; behaviour-preserving fix here, the
+   "intended" part-restart numbering would be a separate change).
+
+3. **[docs/assets/css/print.css](../../docs/assets/css/print.css) +
+   [_site-pdf copy](../../docs/_site-pdf/assets/css/print.css)** --
+   replace `content: counter(page)` in `@bottom-right` with
+   `content: var(--page-num)`. The CSS custom property approach
+   keeps the existing cascade (suppression on `@page :first` and
+   `@page divider` still works, since those rules override the
+   `content` declaration entirely).
+
+Verification (1638-page book, all sample pages spot-checked
+against the pre-detach output):
+
+- Page count matches (1638).
+- `@bottom-right` page numbers byte-equivalent on every sampled
+  page (1, 2, 5, 6, 10, 100, 500, 1000, 1500, 1638).
+- `@top-right` chapter titles byte-equivalent on every sampled
+  page -- named strings persist through detach.
+
+### Shipped numbers
+
+Profile diff (paired `--detach-pages --cpu-profile` runs):
+
+| metric              | pre (display:none) | post (removeChild) | Δ                    |
+| ------------------- | ------------------ | ------------------ | -------------------- |
+| **render**          | **48.5 s**         | **26.3 s**         | **-22.2 s (-46 %)**  |
+| total native gBCR   | 18,424 ms          | 7,455 ms           | -10,969 ms (-60 %)   |
+| gBCR % / render     | 38 %               | 28 %               | -10 points           |
+| `create:2257` gBCR  | 12,947 ms          | **877 ms**         | **-12,070 ms (15x)** |
+| `hasOverflow:1925`  | 4,419 ms           | 4,590 ms           | flat                 |
+| `Layout:1443`       | 586 ms             | 463 ms             | flat                 |
+| per-page ratio      | 1.77x              | 1.18x              | nearly flat          |
+
+`Page.create`'s layout flush -- the largest single per-page cost
+in every profile we'd seen -- went from 12.9 s to 0.9 s. The
+remaining gBCR work in `hasOverflow` is now the largest layout
+flush, but it's an order of magnitude smaller and only marginally
+super-linear.
+
+### Where this leaves the picture
+
+The full menu of fixes against the original 207 s baseline:
+
+| fix                                 | render saved | total saved | shipped |
+| ----------------------------------- | ------------ | ----------- | ------- |
+| `--detach-pages` (display:none)     |   ~55 s      |   ~55 s     | yes     |
+| `--incremental` PDF update          |    -         |   ~32 s     | yes     |
+| pdf-lib `parseSpeed: Fastest`       |    -         |    ~3 s     | yes     |
+| `finalizePage` micro-optimizations  |    ~3 s      |    ~3 s     | yes     |
+| **aggressive detach (removeChild)** | **~22 s**    | **~22 s**   | **yes** |
+| **skip dead `findEndToken` path**   | **~3.5 s**   | **~3.5 s**  | **yes** |
+| **renderTo additive backoff**       | **~4.25 s**  | **~4.25 s** | **yes** |
+| pageRanges sharding (generate)      |    -         |  10-40 s    | no      |
+
+Render is now ~19 s on a 1638-page book, down from ~104 s in the
+original baseline. The next bottleneck is unambiguously
+`page.pdf()` -- ~60-70 s of Chromium-internal PDF serialisation
+that's only addressable via the `pageRanges` sharding approach
+(run multiple `page.pdf()` calls on disjoint page ranges in
+parallel browsers, concatenate with pdf-lib).
+
+## What happened when we tried `createBreakToken` dedup
+
+With render down to ~26 s, the bottom-up profile points at three
+JS bodies still worth looking at:
+
+```
+findEndToken    self 3270 ms (12.4 %)
+findElement     self 1924 ms ( 7.3 %)
+createBreakToken self  996 ms ( 3.8 %)
+```
+
+### Attempt A: cache `lastChild.lastChild` in `findEndToken`
+
+The descend-to-deepest-valid-descendant loop in
+[`findEndToken`](../../docs/lib/paged.browser.js:2100) reads
+`lastChild.lastChild` up to three times per iteration (while
+condition, `validNode` check, assignment). Cache once.
+
+Profile diff (paired `--detach-pages --cpu-profile`):
+
+| function         | PRE       | POST      | Δ        |
+| ---------------- | --------- | --------- | -------- |
+| `findEndToken`   | 3269.9 ms | 3108.0 ms | **-162** |
+| `createBreakToken` | 995.8 ms |  964.9 ms | -31      |
+| `findElement`    | 1924.0 ms | 1767.2 ms | -157     |
+
+Real, modest win on `findEndToken` self-time. The `-157` on
+`findElement` is plausibly jitter (no `findElement` call changed),
+so the `findEndToken` self drop is the only one we'd hang our hat
+on. PDF byte-equivalent on all sampled pages. Shipped.
+
+### Attempt B: dedup `findElement(renderedNode, source)` in `createBreakToken`
+
+In the `!renderedNode` branch of
+[`createBreakToken`](../../docs/lib/paged.browser.js:1796),
+`findElement(renderedNode, source)` is called once at line 1817
+(inside `if (!temp.nextSibling)`) and again unconditionally at
+line 1830. Hoist + reuse: at most one call per invocation that
+takes this branch.
+
+Profile diff vs the post-Attempt-A baseline:
+
+| edge                                | PRE       | POST      | Δ      |
+| ----------------------------------- | --------- | --------- | ------ |
+| `findElement` self                  | 1767 ms   | 1892 ms   | +125   |
+| `findElement` <- `createBreakToken` | 1232 ms   | 1308 ms   | +76    |
+| `findElement` <- `findEndToken`     |  537 ms   |  580 ms   | +43    |
+
+The change cannot regress (it only ever removes one call), so the
+deltas are jitter, not real cost. The give-away is the
+`findElement <- findEndToken` edge: `findEndToken` wasn't touched
+between the two runs, yet its attributed `findElement` total still
+moved by +43 ms. That puts the per-edge noise floor at ~40-80 ms
+on this machine, which swallows whatever savings the dedup
+produces.
+
+Read the other way: the `!renderedNode + !temp.nextSibling` branch
+must fire rarely enough that removing one of its two `findElement`
+calls doesn't register above this noise. We don't have call-count
+instrumentation in the cpuprofile to confirm directly (`hitCount`
+is samples-on-stack, not invocations), but a savings below
+noise is functionally indistinguishable from no savings.
+
+Reverted. The lesson echoes the earlier binary-search `textBreak`
+attempt: if the target branch fires rarely, the dedup's
+correctness is undeniable but its effect is unmeasurable.
+
+### Attempt C: skip `findEndToken` when nobody reads its result
+
+`findEndToken` (3.1 s self) was the top remaining JS-body in the
+post-A profile. Both Attempt A (cache the `.lastChild` access) and
+the speculative validNode-caching extension above tried to make
+it *faster*. Wrong question. The bottom-up profile shows where
+cost lives, but a caller breakdown shows *why* it lives there:
+
+```
+findEndToken: self=3108 ms, total=3652 ms
+callers (attributed total ms):
+   3652.19 ms   checkUnderflowAfterResize@paged.browser.js:2502
+```
+
+`findEndToken` is called from exactly one place:
+[`Page.checkUnderflowAfterResize`](../../docs/lib/paged.browser.js:2503),
+which fires from a `ResizeObserver` whenever the page wrapper
+*shrinks*. That happens on every overflow extraction during
+normal render. The handler computes an `endToken` and hands it to
+`this._onUnderflow(endToken)`. The only live registration of
+`onUnderflow` in the bundle was an empty callback in
+[`Chunker.addPage`](../../docs/lib/paged.browser.js:3251) with
+commented-out intent (`// page.append(this.source, overflowToken);`).
+The computed endToken was discarded every time.
+
+The fix is subtraction, not optimization: delete the no-op
+registration so `_onUnderflow` stays `undefined` by default, and
+add an early bail in `checkUnderflowAfterResize` so `findEndToken`
+doesn't run when nobody can consume its result. A future caller
+that wants the path back just calls `page.onUnderflow(realFn)` --
+the presence of a non-default handler is itself the activation
+signal, no flag plumbing required.
+
+Profile diff (paired `--detach-pages --cpu-profile`):
+
+| function       | PRE       | POST      | Δ          |
+| -------------- | --------- | --------- | ---------- |
+| `findEndToken` | 3108.0 ms |     0.0 ms | **-3108** |
+| `findElement`  | 1767.2 ms |  1313.8 ms | **-453**  |
+| **render**     | **25.75 s** | **22.26 s** | **-3.49 s (-14%)** |
+
+The `findElement` drop matches the previously-attributed
+`findEndToken → findElement` total-time edge (~537 ms) within
+noise; rest is jitter. PDF byte-equivalent on all sampled pages.
+Shipped.
+
+### Attempt D: skip `Footnotes.afterPageLayout` when no `float: footnote`
+
+After Attempt C the next gBCR caller worth looking at was
+[`Footnotes.afterPageLayout`](../../docs/lib/paged.browser.js:31477) at
+~1114 ms attributed gBCR. The handler implements the CSS
+`float: footnote` / `@footnote`-margin-box feature; the per-page
+work begins with `noteContent.getBoundingClientRect()`, then
+sets the inner content's `columnWidth`, then constructs a `Layout`
+and runs `findOverflow` on the (for our document, empty)
+`pagedjs_footnote_inner_content`.
+
+Our stylesheet declares `float: footnote` nowhere
+(`grep -r "float: footnote" docs/_site-pdf/`), so the handler's
+`this.footnotes` dict stays `{}` for the whole render and the
+per-page work is in service of nothing. Same shape as Attempt C:
+gate at the top with `if (Object.keys(this.footnotes).length === 0) return;`.
+
+Profile diff (paired `--detach-pages --cpu-profile`):
+
+| metric                          | PRE       | POST      | Δ          |
+| ------------------------------- | --------- | --------- | ---------- |
+| total gBCR (attribution)        | 7925 ms   | 7756 ms   | **-169**   |
+| ↳ Footnotes `afterPageLayout`   | 1114 ms   |    0 ms   | -1114      |
+| ↳ `hasOverflow`                 | 4687 ms   | 4961 ms   | **+274**   |
+| ↳ `create`                      |  913 ms   | 1019 ms   | **+106**   |
+| ↳ `Layout`                      |  446 ms   |  543 ms   | **+97**    |
+| ↳ next-page `afterPageLayout`   |    0 ms   |  431 ms   | **+431**   |
+| **render wall-clock**           | **22.26 s** | **23.14 s** | **+880 ms** |
+| **per-page ratio (last/first)** | **1.50x** | **1.75x** | **worse**  |
+
+Net gBCR reduction is only ~170 ms even though we eliminated 1114 ms
+of attributed gBCR at the Footnotes call site. The missing ~944 ms
+re-attributed to the next gBCR callers in the per-page sequence
+(`hasOverflow`, `create`, `Layout`, and a previously-invisible
+`afterPageLayout` at line 31986). And the per-page ratio went from
+1.50x to 1.75x -- the late pages got *more* expensive, not less.
+
+That ratio regression is the give-away. The Footnotes handler's
+small gBCR was apparently absorbing pending DOM mutations that, when
+not flushed there, accumulated until the next gBCR (typically a
+larger one) had to flush more state at once. This is the same
+shape as the Page.create memoize trap documented above: removing
+a layout flush at point A makes the flush at point B more
+expensive, and the cost is super-linear in the deferred mutation
+count.
+
+Reverted.
+
+### Attempt E: additive backoff on `renderTo`'s overflow check
+
+After Attempt D the lesson seemed to be "gBCR self-time is
+layout-flush attribution; you can't skip a gBCR without the flush
+migrating." Then re-reading the per-page render loop turned up a
+case the migration framing doesn't actually cover.
+
+[`Layout.renderTo`](../../docs/lib/paged.browser.js:1478) calls
+`findBreakToken` (→ `findOverflow` → `hasOverflow` → gBCR) when
+the cumulative text length of appended nodes crosses `maxChars`
+(default 1500). The gate looks like batching, but the reset is
+asymmetric:
+
+```js
+if (length >= this.maxChars) {
+  // ... layout hook, await images ...
+  newBreakToken = this.findBreakToken(wrapper, source, bounds, prevBreakToken);
+  if (newBreakToken) {
+    length = 0;                                    // only reset on overflow found
+    this.rebuildTableFromBreakToken(newBreakToken, wrapper);
+  }
+}
+```
+
+When no overflow is found, `length` doesn't reset -- it stays
+above `maxChars` and the very next iteration's appended node
+triggers another `findBreakToken`. The check fires *every
+iteration past `maxChars`* until overflow trips. On a typical
+~3000-char page that's ~30+ findBreakToken calls (each one a
+hasOverflow gBCR = layout flush) before the actual break point.
+
+Replace with **additive backoff**: track a moving baseline
+`lengthAtLastCheck` and only fire the check when `length -
+lengthAtLastCheck >= maxChars`. Advance the baseline when no
+overflow yet; reset both on overflow. Per-page check count drops
+from O(nodes-past-maxChars) to O(page-chars / maxChars), typically
+2-3 instead of 30+.
+
+Correctness rests on findBreakToken handling arbitrary overshoot:
+`findOverflow` walks the wrapper to identify the overflowing
+Range regardless of how much excess was appended past it,
+`removeOverflow` extracts the excess via `extractContents`, and
+`createBreakToken` returns a BreakToken at the right source
+position. The chunker builds a fresh walker from `breakToken.node`
+on the next page, so the trimmed content gets re-laid-out from
+its correct source position. (The `break-inside: avoid` worry --
+that containers with extra trailing content might make different
+break decisions -- turned out to be empirically unfounded.)
+
+Profile diff (paired `--detach-pages --cpu-profile`):
+
+| metric                      | PRE       | POST      | Δ                |
+| --------------------------- | --------- | --------- | ---------------- |
+| **render wall-clock**       | **23.73 s** | **19.48 s** | **-4.25 s (-18 %)** |
+| total gBCR (attribution)    | 8024 ms   | 5705 ms   | -2319 (-29 %)    |
+| ↳ `hasOverflow` gBCR        | 4837 ms   | 2725 ms   | **-2112 (-44 %)** |
+| ↳ `findOverflow` per-node   |  438 ms   |  166 ms   | -272             |
+| ↳ `create` / `Layout` / Footn. | unchanged within jitter                  |
+| `removeOverflow` self       |  457 ms   |  370 ms   | **-87 (improved)** |
+| per-page ratio (last/first) | 1.64x     | 1.60x     | improved         |
+
+No migration: Footnotes (1127 ms), create (955), Layout (534)
+all flat. `removeOverflow` *dropped* despite the over-append
+overshoot concern, because fewer findBreakToken invocations means
+fewer extractContents passes, not larger ones -- the per-call
+overshoot is bounded by maxChars (~1500 chars), small relative to
+page capacity.
+
+Full pdftotext-MD5 match on pages 6, 100, 500, 1000, 1500, 1638.
+Page count 1638. PDF byte size 126 bytes apart (metadata).
+
+Shipped.
+
+### The deeper lesson (a third pattern)
+
+The `Page.create` memoize (the earlier Attempt B) and Attempt D
+taught that you can't elide a *single* gBCR because the layout
+flush migrates to the next caller. Attempt E shows the framing was
+too narrow: you can't elide one flush, but you can do *fewer total
+flushes* if you batch observations across mutations.
+
+The three working patterns for render perf, distinguished:
+
+- **Reduce per-flush cost**: aggressive-detach (-22 s). Shrink the
+  layout tree by physically removing finalized pages so each
+  remaining flush has less style/selector state to maintain.
+
+- **Reduce flush count**: renderTo additive backoff (-4.25 s).
+  When mutations between observations don't independently need
+  observing, query once per batch instead of per-mutation. The
+  per-flush cost grows slightly with deferred mutations but
+  amortizes well below the linear scan.
+
+- **Delete dead JS**: skip-findEndToken (-3.5 s), Page.create
+  hoisted CSS, etc. Walk up the call chain; if the consumer
+  doesn't read the value, delete the production. Works whenever
+  the JS self-time is genuinely JS, not flush attribution.
+
+What *doesn't* work: trying to elide one specific gBCR while
+preserving the mutation pattern around it (the `Page.create`
+memoize and Attempt D). The flush re-attributes to the next gBCR
+in the per-page sequence, which then has to flush a larger
+backlog -- net wash or regression.
+
+The diagnostic question to tell these apart: *what does the
+mutation rhythm look like between consecutive gBCR calls?* If it's
+"mutation, gBCR, mutation, gBCR, ..." (renderTo's per-iteration
+check), batching wins. If it's "one mutation, multiple gBCRs"
+(Page.create memoize, Footnotes skip), each gBCR is on the same
+mutation state and the flush has to happen for the *next*
+mutation regardless of which JS asks.
+
+### Where this leaves the picture
+
+Render is now ~19 s on a 1638-page book, down from ~104 s in the
+original baseline. The JS-body profile after Attempt E:
+
+```
+findElement     self 1373 ms ( 7.1 %)
+createBreakToken self 1027 ms ( 5.3 %)
+removeOverflow  self  370 ms ( 1.9 %)
+afterPageLayout self  239 ms ( 1.2 %)
+```
+
+None of these are individually addressable -- they're load-bearing
+work in the per-page break loop. `findElement` already takes the
+dictionary fast path. `pageRanges` sharding of `generate` (~60-70 s
+of `page.pdf()`) is the only remaining knob with a profile target
+large enough to move the wall-clock total meaningfully, and it
+can't be addressed single-threaded (it requires multiple Chromium
+processes + pdf-lib concatenation).
+
+> [!NOTE]
+> The "`findElement` already takes the dictionary fast path" claim
+> above turned out to be wrong. A re-investigation under puppeteer 25
+> (see *findRef wasn't taking the fast path* in
+> [03-puppeteer-bump-findref.md](03-puppeteer-bump-findref.md)) found 39 % of
+> findRef calls falling through to `doc.querySelector("[data-ref='X']")`
+> because the per-page index wasn't populated for rebuilt ancestors
+> and the source tree never had one at all. Fixing both saves ~2.4 s
+> of render.
diff --git a/perf/notes/03-puppeteer-bump-findref.md b/perf/notes/03-puppeteer-bump-findref.md
new file mode 100644
index 0000000..5ac9766
--- /dev/null
+++ b/perf/notes/03-puppeteer-bump-findref.md
@@ -0,0 +1,805 @@
+# Puppeteer 22→25 rebaseline, findRef fast path, removeChild, and the idle/RAF chase
+
+Covers rebaselining after a Chromium version bump that shifted the generate hot path; the discovery that `findRef` had been silently falling out of its fast path for 39% of calls (~2.4s render win); a checkpoint of the cumulative picture; six variants of a cheaper `removeChild` (none shipped); and chasing the residual `(idle)` time down to `requestAnimationFrame`.
+
+## Rebaselining after the puppeteer 22 -> 25 bump
+
+`docs/package.json` was bumped from `puppeteer ^22.x` to `^25.0.4`,
+which pulled in a newer bundled Chromium. Same harness, same book
+(now 1651 pages after a small content addition vs the 1638 the
+prior baseline measured), `--detach-pages --cpu-profile`:
+
+| Phase    | Prior (puppeteer 22, post-Attempt-E) | New (puppeteer 25) | Δ |
+| -------- | ------------------------------------ | ------------------ | --- |
+| render   | ~19 s   | 22.0 s | flat (run-to-run noise) |
+| generate | ~60-70 s | **42.7 s** | **-20 to -28 s** |
+| process  | ~5 s    | 4.9 s | flat |
+| **total**| ~95-100 s | **69.6 s** | **-25 to -35 s** |
+| raw Chrome PDF size | 52 MB | **39.3 MB** | -12 MB |
+| render ratio (last/first quarter) | 1.60x | 1.36x | flatter |
+
+The whole wall-clock win is in `generate`. Chrome's PDF writer got
+meaningfully faster, and is now emitting something more compact --
+a 25 % drop in the raw byte stream that previously needed pdf-lib's
+re-emit pass to shrink. *Chromium `Page.printToPDF` knob survey* in
+[01-baseline-and-detach.md](01-baseline-and-detach.md) noted Skia
+wrote streams uncompressed; whatever changed at
+the SkPDF level closes part of that gap automatically. The
+final PDF after pdf-lib's `save()` is still ~17 MB either way --
+the re-emit's deflate step was already doing most of the work.
+
+Render itself is unchanged in shape. The same hot paths
+(`hasOverflow`, `Footnotes.afterPageLayout`, `Page.create`,
+`findRef`) sit at roughly the same self-times. Nothing that was
+cheap got expensive; nothing that was expensive got cheap.
+
+Notable side-effect: with `generate` no longer dominating, the
+strategic note at the end of *Where this leaves the picture* in
+[02-finalizepage.md](02-finalizepage.md)
+("`pageRanges` sharding of `generate` is the only remaining knob
+with a profile target large enough to move the wall-clock total
+meaningfully") is now less true. The shard target shrank from
+~60 s to ~43 s, so the upper bound on what sharding can save
+shrank with it. Still the biggest untried knob, but the urgency
+is lower.
+
+The re-baselined bottom-up render profile also surfaced something
+that *was* always there but had been mis-attributed: see the next
+section.
+
+## findRef wasn't taking the fast path
+
+The new-baseline cpu profile's top entries:
+
+```
+   self_ms   self_%   function  @  source
+   5872.93   26.84%   (program)             (V8/Blink internal)
+   4831.83   22.08%   getBoundingClientRect (native)
+   2530.25   11.56%   findRef               paged.browser.js:643
+   2426.14   11.09%   removeChild           (native, called by detach-pages)
+   1007.64    4.60%   (idle)
+    565.17    2.58%   removeOverflow
+```
+
+`findRef` at **11.6 % of render self-time** is the largest JS
+bucket, behind only the native `(program)` and gBCR rows. The prior
+*JS-body profile after Attempt E* (in [02-finalizepage.md](02-finalizepage.md)) reported
+`findElement self 1373 ms (7.1 %)`
+and concluded `findElement` was already fast. Both numbers refer
+to the same call chain -- V8 just attributes time differently
+between the two-line forwarder and its called helper:
+
+```js
+function findElement(node, doc, forceQuery) {
+    const ref = node.getAttribute("data-ref");
+    return findRef(ref, doc, forceQuery);
+}
+
+function findRef(ref, doc, forceQuery) {
+    if (!forceQuery && doc.indexOfRefs && doc.indexOfRefs[ref]) {
+        return doc.indexOfRefs[ref];                              // fast
+    } else {
+        return doc.querySelector(`[data-ref='${ref}']`);          // slow
+    }
+}
+```
+
+The "post-Attempt-E" profile's `findElement` charge was its
+forwarder cost; the actual body work has always been inside
+`findRef`. The new V8 profile splits the attribution honestly,
+with `findElement` reading `self=0.00 ms` and `findRef` carrying
+the 2.5 s.
+
+### Instrumenting per-branch call counts
+
+Wrapped `findRef` with counters keyed by which branch it took:
+fast-path (dict hit), `forceQuery` (caller explicitly asked for
+querySelector), `noDict` (the doc didn't have `indexOfRefs` at all),
+and `dictMiss` (the doc had a dict but no entry for the ref). The
+caller of each branch was captured from `new Error().stack`.
+
+A single instrumented run on the 1651-page book:
+
+```
+findRef.calls         = 47,867
+findRef.fastPath      = 29,300   (8.4 ms total, 0.29 us/call)
+findRef.fallback total = 18,567  (2585.5 ms total)
+  forceQuery          =      2
+  noDict              =  2,739
+  dictMiss            = 15,826
+  fallbackReturnedNull =    892
+
+byCallerLine (top, all attributed to docs/lib/paged.browser.js):
+   15,767  dictMiss   <- Layout.append, `findElement(node.parentNode, dest)`
+      955  noDict     <- Layout.append, same call
+      892  noDict     <- Layout.append, `findElement(node.parentNode, fragment)`
+      848  noDict     <- Layout.createBreakToken, `findElement(*, source)`
+       58  dictMiss   <- Layout.createBreakToken (an `*, rendered` site)
+       42  noDict     <- Layout.createBreakToken, another `*, source` site
+        2  forceQuery <- Layout.rebuildTableFromBreakToken
+```
+
+The fast path is essentially free (0.29 us/call -- a hashed object
+lookup). **The entire 2.5 s lives in the 18,567 fallback calls**.
+Two structural reasons:
+
+### Root cause 1: rebuilt ancestors aren't indexed in `dest`
+
+`Layout.append(node, dest, ...)` writes each leaf clone into
+`dest.indexOfRefs` near the end of the function. But when the
+leaf's parent isn't already in `dest`, `append` calls
+`rebuildAncestors(node)` to clone the source ancestor chain into
+a fresh `DocumentFragment` and appends the fragment to `dest`:
+
+```js
+let fragment = rebuildAncestors(node);
+parent = findElement(node.parentNode, fragment);
+// ... attach clone ...
+dest.appendChild(fragment);   // <-- ancestors now live in dest's DOM
+                              //     but dest.indexOfRefs wasn't updated
+```
+
+The rebuilt ancestors are now in `dest`'s DOM tree, findable by
+`dest.querySelector("[data-ref='X']")`. They are **not** in
+`dest.indexOfRefs`. Every subsequent `append` whose `node`
+descends from one of those rebuilt ancestors hits dictMiss on
+that ancestor and falls through to `dest.querySelector`. With
+~15.7 k such calls per book at ~140 us each -- a small per-page
+wrapper, so querySelector is fast even when it walks -- that's
+about 2.2 s.
+
+The 892 `noDict <- Layout.append, findElement(*, fragment)` calls
+in the byCallerLine table are a related symptom: the second
+`findElement` call inside the rebuild branch -- which looks the
+parent up in the *fragment* before it gets appended to `dest` --
+hits a fragment whose `indexOfRefs` was never created.
+
+### Root cause 2: the source tree never has an index
+
+Six call sites in `Layout.createBreakToken` use
+`findElement(*, source)` to map a rendered node back to its
+position in the original document. `source` is the
+`ContentParser`-wrapped result of the initial document walk in
+`ContentParser.addRefs` -- which walks every element, assigns a
+`data-ref`, and **stops**. No `indexOfRefs` is ever populated.
+Every `findElement(*, source)` therefore falls through to
+`source.querySelector("[data-ref='X']")` against the whole
+~10 k-element source tree.
+
+There are only ~890 such calls per render (they only fire on
+pages where the break landed mid-element), but at ~1.3 ms each
+that's ~1.2 s.
+
+### The fix
+
+Three small patches in `docs/lib/paged.browser.js`, all marked
+`// [PATCH: findRef fast-path]`:
+
+1. **`rebuildAncestors`** -- initialise `fragment.indexOfRefs = {}`
+   at the top, and write each rebuilt clone into it as the loop
+   builds the chain. The second `findElement(*, fragment)` call in
+   `Layout.append`'s rebuild branch then hits the fast path.
+
+2. **`Layout.append`'s rebuild branch** -- after
+   `dest.appendChild(fragment)`, merge `fragment.indexOfRefs` into
+   `dest.indexOfRefs`. Subsequent `findElement(*, dest)` calls on
+   any rebuilt ancestor now hit the fast path too.
+
+3. **`ContentParser.addRefs`** -- initialise `content.indexOfRefs = {}`
+   on entry and write `content.indexOfRefs[ref] = node` inside the
+   tree-walk loop. Every `findElement(*, source)` call site now hits
+   the fast path.
+
+### Results
+
+Instrumented A/B (call counts pre/post on the same 1651-page book):
+
+| metric | pre-fix | post-fix | Δ |
+| ------ | ------- | -------- | --- |
+| findRef calls (total) | 47,867 | 47,867 | (same; this is a per-call cost change, not a count change) |
+| fast path | 29,300 | **46,914** | **+17,614** |
+| fallback total calls | 18,567 | **953** | **-17,614 (-95 %)** |
+| dictMiss | 15,826 | 59 | -15,767 |
+| noDict (`findElement(*, fragment)` in rebuild branch) | 892 | 0 | -892 |
+| noDict (createBreakToken vs source) | 848 + 42 | 0 + 0 | -890 |
+| fallback total time | 2,585 ms | **6.9 ms** | **-2,578 ms** |
+| fallbackReturnedNull | 892 | 892 | unchanged (these are the genuine "no such ref" misses) |
+
+The 892 residual fallbacks are all `findElement(node.parentNode, dest)`
+on a *fresh* per-page `dest` whose dict was just created and only
+contains its own leaf clones, so the parent lookup correctly returns
+null (the parent's first appearance on this page will be in the
+next call's rebuilt fragment). 7 ms total; not worth a fourth patch.
+
+Wall-clock A/B, paired runs, no instrumentation, no cpu-profile
+(stash the fix, run twice; pop, run twice):
+
+| run | BEFORE render | AFTER render |
+| --- | --- | --- |
+| 1 | 20.73 s | 18.17 s |
+| 2 | 20.54 s | 18.22 s |
+| **avg** | **20.64 s** | **18.20 s** |
+
+**Δ = -2.44 s render (-12 %).**
+
+Profile diff (`--detach-pages --cpu-profile`, single run each --
+between-run noise on cpu-profile self-time is in the 50-150 ms band
+for sub-1 % rows):
+
+| function | PRE | POST | Δ |
+| --- | --- | --- | --- |
+| `findRef`   | 2530 ms (11.56 %) | undetectable (<130 ms) | **-2400 ms** |
+| `findElement` self | 0 ms (forwarder) | 0 ms | unchanged |
+| `addRefs`  | not in top 20 | **157 ms (0.80 %)** | +157 ms (new dict-population cost) |
+| `removeChild` (detach handler) | 2426 ms | 2320 ms | -106 ms (noise) |
+| `getBoundingClientRect` | 4832 ms | 4632 ms | -200 ms (noise) |
+| total render | 22.0 s | 19.8 s | -2.2 s |
+
+PDF byte size is 16-47 bytes apart between any two runs (well inside
+the standard `/CreationDate` / `/ModDate` timestamp drift); content
+is functionally byte-identical.
+
+Shipped.
+
+### Was it the headers/footers change?
+
+A reasonable initial hypothesis was that the recent
+"Get the details of page headers/footers out of paged.js"
+(`c70b83d`) or its precursor "Add the part name as a prefix to
+the page number" (`71aea3d`) had introduced the cost. Neither
+did:
+
+- `71aea3d` added a per-page
+  `pageElement.querySelector("article.part-divider")` in the
+  Counters handler, which would have shown up as extra querySelector
+  work, but it's unrelated to `findRef`'s call path.
+- `c70b83d` removed that querySelector again, moving the part-title
+  capture from per-page JS to a CSS `string-set` / `string()` rule.
+  Net per-page work went *down*, not up.
+
+`findRef`'s slow path was always there -- the prior post-Attempt-E
+profile in [02-finalizepage.md](02-finalizepage.md) reported the same
+call chain as `findElement self 1373 ms (7.1 %)`. Two things
+happened to make it worth a fresh look:
+
+- **V8's attribution split.** The new V8 charges `findElement` 0 ms
+  and `findRef` 2530 ms instead of attributing the helper's body
+  to its forwarder. Same call chain, different bucket label, much
+  more visible in the bottom-up view.
+- **The cost itself may have grown.** 1.4 s → 2.5 s is more than a
+  V8 attribution shift can explain on a +0.8 % content change. The
+  branch counters above don't tell us the pre-puppeteer-25 split;
+  the most we can claim is "the fallback was clearly the dominant
+  branch by the time we measured." Either way, the fix removes it.
+
+### Methodology
+
+This one had two of the recurring lessons baked in:
+
+1. **Instrument to understand the workload, not just the time.**
+   The CPU profile showed `findRef` at 2.5 s self-time; that's
+   *what*. It needed branch-counting (fast-path vs dictMiss vs
+   noDict, with caller attribution) to find out *why*. Wall-clock
+   A/B alone would have measured the difference; only the per-branch
+   counters explained it.
+
+2. **`new Error().stack` is the cheap way to attribute hot-function
+   calls back to their callers in-browser**, when you can't
+   instrument the call sites individually. The harness already had
+   `find-callers.mjs` for post-hoc cpu-profile attribution, but
+   that aggregates by sample, not by call. Per-call attribution
+   needed the in-page stack walk. Cost ~5 us per call, OK for
+   1-shot diagnostic runs, not OK to ship.
+
+## Where this leaves the picture
+
+Updated cumulative table, all measured against the original 207 s
+puppeteer-22 baseline:
+
+| fix                                 | render saved | total saved | shipped |
+| ----------------------------------- | ------------ | ----------- | ------- |
+| `--detach-pages` (display:none)     |   ~55 s      |   ~55 s     | yes     |
+| `--incremental` PDF update          |    -         |   ~32 s     | yes     |
+| pdf-lib `parseSpeed: Fastest`       |    -         |    ~3 s     | yes     |
+| `finalizePage` micro-optimizations  |    ~3 s      |    ~3 s     | yes     |
+| aggressive detach (`removeChild`)   |   ~22 s      |   ~22 s     | yes     |
+| skip dead `findEndToken` path       |   ~3.5 s     |   ~3.5 s    | yes     |
+| `renderTo` additive backoff         |   ~4.25 s    |   ~4.25 s   | yes     |
+| **puppeteer 22 -> 25 (Chromium bump)** | **-**     | **~20-30 s** *(generate)* | **yes** |
+| **findRef fast-path** (this section) | **~2.4 s** | **~2.4 s**  | **yes** |
+| `pageRanges` sharding (generate)    |    -         |  ~5-20 s    | no      |
+
+Current end-to-end on the 1651-page book, `book.bat` path:
+
+```
+render   :  ~18 s    (was ~104 s in the original baseline)
+generate :  ~43-48 s (was ~64 s; mostly the puppeteer 25 bump)
+process  :  ~5 s
+total    : ~70 s     (was ~207 s, a 3x speedup)
+```
+
+The remaining JS-body profile after the findRef fix:
+
+```
+self_ms   self_%   function                    source
+  ~500    ~2.5 %   removeOverflow              paged.browser.js
+  ~320    ~1.6 %   wrapContent
+  ~200    ~1.0 %   afterPageLayout (paged.js)
+  ~187    ~1.0 %   afterPageLayout (Footnotes)
+  ~157    ~0.8 %   addRefs                     (new -- the fix above)
+  ~130    ~0.7 %   renderTo
+```
+
+None of those individually clear the noise band; the largest
+remaining JS-body bucket is the same scale as the `addRefs` cost
+we just added. Native frames (`getBoundingClientRect` ~23 %,
+`(program)` ~30 %, `removeChild` ~12 %) are now the dominant
+contributors to render, and gBCR's caller breakdown is the same
+flat-per-page shape it's had since aggressive detach landed.
+
+The single biggest untried lever remains `pageRanges` sharding for
+generate. After the puppeteer 25 bump it would save less than the
+earlier estimate (the 64 s -> 43 s gain made the target smaller),
+but it's still the only knob with a profile target large enough to
+move the wall-clock total by 5+ s.
+
+## Can we make `removeChild` cheaper?
+
+After the findRef fix, `removeChild` sits at ~12 % of render
+self-time. The detach-pages handler attribution is clean -- 1651
+detaches for 1651 pages, exactly one per page, with the only
+other removeChild callers being `filterTree` at startup (9,192
+ignorable-text-node strips totalling 2.3 ms; not a hot path).
+
+Per-call cost on the 1651-page book, with `Element.prototype.removeChild`
+wrapped to measure each call:
+
+```
+[instrument] page-detach avg:      1.009 ms/call
+[instrument] page-detach median:   0.900 ms/call
+[instrument] page-detach p90:      2.000 ms/call
+[instrument] page-detach p99:      3.000 ms/call
+[instrument] avg descendants/page: 147.7
+```
+
+That's ~5-7 us per descendant LayoutObject torn down, multiplied
+by ~150 descendants per page, multiplied by ~1651 pages = ~1.7 s
+total. The distribution is tight and scales linearly with
+descendant count -- this looks like ordinary Blink teardown work
+rather than a pathological slow path.
+
+To verify, two structural variants were tested with the same
+instrumentation harness:
+
+### Variant B: graveyard DocumentFragment
+
+Replace `parent.removeChild(page)` with
+`graveyard.appendChild(page)`, where `graveyard` is a fresh
+`DocumentFragment` held by the handler. Hypothesis: the
+move-to-out-of-document-fragment path might skip some
+LayoutObject teardown work because the destination is itself
+disconnected.
+
+| metric | A (removeChild) | B (graveyard) |
+| ------ | --------------- | ------------- |
+| avg per call | **1.009 ms** | 1.082 ms (+7 %) |
+| median | 0.900 ms | 0.900 ms |
+| p90 | 2.000 ms | 2.200 ms |
+| p99 | 3.000 ms | 3.100 ms |
+| total page wall | 1666 ms | 1785 ms |
+| render wall-clock | ~16.1 s | ~15.2 s (run-to-run noise) |
+
+The graveyard move is **slightly slower** per call. Blink tears
+down the LayoutObjects regardless of where the node lands; there's
+no fast-path for "moved to a detached parent". No win.
+
+### Variant C: `contain: layout style` on `.pagedjs_page`
+
+Inject `<style>.pagedjs_page { contain: layout style; }</style>`
+into the document before render. Hypothesis: removing a contained
+subtree might skip style/layout invalidation propagation because
+Blink already knows the subtree didn't influence its siblings or
+parent.
+
+Also tested `contain: strict` (which adds `paint` and `size`
+containment -- pages already have explicit dimensions via @page
+CSS so this is safe).
+
+| metric | A (no contain) | C (layout style) | C-strict |
+| ------ | -------------- | ---------------- | -------- |
+| avg per call | **1.009 ms** | 1.017 ms | 0.991 ms |
+| median | 0.900 ms | 0.900 ms | 0.900 ms |
+| p90 | 2.000 ms | 1.900 ms | 1.900 ms |
+| total page wall | 1666 ms | 1678 ms | 1634 ms |
+| render wall-clock | ~16.1 s | ~15.0 s | ~14.8 s |
+
+All four runs are within ~5 % of each other on per-call cost --
+well inside the run-to-run noise band. Containment doesn't unlock
+a faster removeChild path either.
+
+### Conclusion (variants B + C)
+
+The 1.7 s of `removeChild` is intrinsic Blink LayoutObject
+teardown work. The math checks out at ~5-7 us per descendant ×
+~150 descendants × 1651 pages, and three different framings
+(plain removeChild, move-to-fragment, contain + removeChild) all
+land within ~10 % of each other. The destination of the move and
+the containment metadata don't change Blink's teardown rate.
+
+The one thing we *don't* do is "remove less per page" -- removing
+a page's content as N individual leaf removals would be strictly
+worse (N × overhead instead of 1 × overhead, same teardown total).
+Each removeChild call carries DOM-mutation, style-invalidation,
+and notify overhead beyond the per-descendant cost, so consolidating
+to one removal per page is already the optimal framing.
+
+### Variant D: don't detach at all, just `contain: strict`
+
+A natural follow-up: if the per-page cost of having siblings
+around really comes from style/selector traversal, maybe Blink
+will skip a *contained* sibling subtree even when it can't skip
+a `display: none` one. Containment is a stronger signal -- it
+explicitly tells the engine "no observable interaction crosses
+this boundary" -- so the renderer ought to be able to short-circuit
+sibling-walks more aggressively.
+
+Implementation: replace the detach handler with one that sets
+`pageElement.style.contain = 'strict'` at finalizePage and clears
+the property for every page at afterRendered (so `page.pdf()`
+serializes the right paint state).
+
+Result:
+
+| metric | current detach | variant D (contain:strict, no detach) |
+| ------ | -------------- | --------------------------------------- |
+| **render wall-clock** | **~16 s** | **89.3 s** |
+| `Page.create` gBCR | ~764 ms | **31,142 ms** |
+| `hasOverflow` gBCR | ~2,478 ms | 10,922 ms |
+| total gBCR | ~4,832 ms | 45,413 ms |
+| per-page ratio (last/first) | 1.36x | 4.11x |
+
+Worse than the display:none baseline (`Page.create`
+gBCR 12,947 ms / render 48.5 s, reported in
+[02-finalizepage.md](02-finalizepage.md)).
+Containment metadata adds work to per-sibling evaluation rather
+than removing it. **Definitive no.** Containment is a hint about
+what's inside the box; it doesn't make the box invisible to
+neighbours.
+
+### Variant E: empty the wrapper, leave it in place
+
+A second framing of the same idea: keep the page wrapper as a
+sibling, but move its children to a stash so the wrapper itself
+is a leaf (no descendants for Blink to walk through). Restore
+the children at afterRendered. This isolates the "what costs
+what" question: does sibling-walk cost depend on descendant
+count, or just on sibling count?
+
+Implementation: at finalizePage, for the previous-finalized page
+(one behind, mirroring the keep-one-back pattern), move each
+child into an array via `wrapper.removeChild(wrapper.firstChild)`,
+set `min-height: 297mm` so the wrapper still occupies its slot,
+and stash the children. At afterRendered, restore.
+
+Result:
+
+| metric | current detach | variant E (empty wrapper) |
+| ------ | -------------- | --------------------------- |
+| **render wall-clock** | **~16 s** | **21.9 s** |
+| `Page.create` gBCR | ~764 ms | 2,628 ms (+1,864) |
+| `hasOverflow` gBCR | ~2,478 ms | 5,024 ms (+2,546) |
+| `Layout` gBCR | ~294 ms | 937 ms |
+| total gBCR | ~4,832 ms | **10,127 ms (+5,295)** |
+| `removeChild` self | 2,426 ms | **854 ms (-1,572)** |
+| per-page ratio (last/first) | 1.36x | 2.93x |
+
+The removeChild *savings* are real -- with no wrapper to tear down,
+it's just ~150 child removals per page at a few microseconds each.
+But the gBCR *cost* roughly doubles because the wrappers are
+still siblings, and gBCR firings have to walk them. Net is +5 s
+render, *worse* than the current detach.
+
+This experiment yields a clean cost-model decomposition. Pulling
+the gBCR deltas apart against the wrapper-vs-content split:
+
+```
+display:none baseline (full content):       gBCR(Page.create) ≈ 12,947 ms
+variant E (empty wrappers, n=1651):         gBCR(Page.create) ≈  2,628 ms
+current detach (no siblings):               gBCR(Page.create) ≈    764 ms
+```
+
+Subtracting:
+
+- (variant E - current detach) = 1,864 ms for 1,651 sibling wrappers
+  → ~1.1 us per wrapper-sibling per `Page.create` gBCR call
+- (display:none - variant E) = 10,319 ms for 1,651 × 150 ≈
+  247,650 sibling descendants
+  → ~42 us per sibling-descendant per `Page.create` gBCR call
+
+Both wrappers and their descendants contribute to the per-call
+cost. Removing the descendants helps -- variant E really is
+substantially cheaper than display:none -- but the wrapper cost
+alone is enough to lose. To zero out both contributions you have
+to take both the wrapper and its descendants out of the sibling
+list, which is exactly what the current detach does.
+
+### Variant F: `content-visibility: hidden`, no detach
+
+The CSS spec's `content-visibility: hidden` is the closest
+property to "freeze in place without disposing" -- per spec,
+rendering work is "skipped" but cached state is preserved for
+cheap restoration. Conceptually nearer to a freeze than
+`display: none` or `contain: strict` were.
+
+Implementation: at finalizePage, set
+`pageElement.style.contentVisibility = 'hidden'` and
+`containIntrinsicSize = '210mm 297mm'` (the size hint Blink uses
+when content-visibility skips a subtree). At afterRendered,
+clear both.
+
+Result:
+
+| metric | current detach | variant F (cv:hidden) |
+| ------ | -------------- | ----------------------- |
+| **render wall-clock** | **~16 s** | **95.2 s** |
+| `Page.create` gBCR | ~764 ms | **29,656 ms** |
+| `hasOverflow` gBCR | ~2,478 ms | 17,558 ms |
+| total gBCR | ~4,832 ms | 52,899 ms |
+| per-page ratio (last/first) | 1.36x | 5.12x |
+
+Worse than every other variant. The spec's "skip rendering work"
+clause covers painting and composition; it does **not** make the
+subtree invisible to sibling-walks during style and selector
+matching that gBCR forces. Three "leave in place" properties
+(`display: none`, `contain: strict`, `content-visibility: hidden`)
+have now been tested and none of them short-circuit the
+sibling-walk.
+
+### Conclusion across all six variants
+
+| variant | render | net vs current |
+| ------- | ------ | -------------- |
+| A current (removeChild, no contain) | ~16.1 s | (baseline) |
+| B graveyard fragment | ~15.2 s | flat (noise) |
+| C `contain: layout style` + removeChild | ~15.0 s | flat (noise) |
+| C-strict `contain: strict` + removeChild | ~14.8 s | flat (noise) |
+| **D `contain: strict`, no detach** | **89.3 s** | **+73 s** |
+| **E empty wrappers, no detach** | **21.9 s** | **+5.9 s** |
+| **F `content-visibility: hidden`, no detach** | **95.2 s** | **+79 s** |
+
+The flat band (A/B/C/C-strict) is the cost-of-doing-business --
+~1 ms × 1651 pages = ~1.7 s of intrinsic Blink LayoutObject
+teardown. Variations on the framing don't move it. The
+catastrophic band (D, E, F) confirms that any path where the page
+wrapper stays in the live sibling list pays meaningfully more
+than the teardown cost would have been -- ~1.1 us per
+wrapper-sibling × 1651 wrappers × several gBCR call sites per
+page comes out to several seconds of extra render even when the
+wrapper is otherwise empty and contained.
+
+The 1.7 s is the bill we pay for shrinking the live DOM from
+~150 × 1651 ≈ 250k nodes back down to 2 nodes (in-flight page +
+keeper), which is what kept `Page.create`'s gBCR flat per page
+(see *Hypothesis 2: sibling sweeps over `display: none` pages* in
+[02-finalizepage.md](02-finalizepage.md)). Net savings vs the
+display:none variant was ~22 s render;
+the 1.7 s removeChild cost is roughly 8 % of that win paid back
+to Blink for cleanup. Worth keeping.
+
+### Aside: it's not GC, and JS references don't help
+
+A reasonable follow-up question to all of this is "can we just
+hold a reference to the detached children to avoid disposal,
+or turn off GC to skip the cleanup?" Neither applies to what
+we're measuring.
+
+Chromium maintains two trees:
+
+- **DOM tree** -- `Node` objects, JS-visible, referenceable.
+- **Render tree** -- `LayoutObject` / `LayoutBox` / `LayoutText`
+  etc., Blink-internal, NOT JS-visible.
+
+`removeChild` keeps the DOM Node alive (JS reference holders --
+including the handler's `this._detached` array -- prevent
+collection). But the corresponding LayoutObject in the render
+tree is **destroyed immediately**, synchronously, at the
+removeChild call. Re-attaching via appendChild later builds a
+new LayoutObject from scratch.
+
+There is no JS-level API to keep a LayoutObject alive across
+detach + reattach. Holding DOM references doesn't change the
+render-tree lifecycle. The 1.7 s lives entirely in
+LayoutObject teardown -- which is Blink-internal C++ work
+attributed to the `removeChild` native frame in the profile,
+not to GC.
+
+V8's GC is a separate concern and isn't the bottleneck. The
+profile reads:
+
+```
+   self_ms   self_%   function
+    195.21    0.89%   (garbage collector)
+```
+
+~200 ms over a ~22 s render. Even if it could be disabled
+(it can't -- the renderer would OOM), it would barely register.
+
+The asymmetry between variants B and E makes this concrete.
+Variant B (graveyard fragment) moves the page from
+`.pagedjs_pages` to a detached DocumentFragment; variant E
+(empty wrapper) keeps the page in `.pagedjs_pages` but moves
+its children out. The fragment-move path *does* trigger
+LayoutObject teardown (you can see the 1.08 ms / call in
+variant B's instrumentation) even though the DOM Node lives on
+in a JS-visible fragment -- because the destination is itself
+not attached to the document, so there's no live render-tree
+parent. Conversely, variant E's wrapper stays in
+`.pagedjs_pages` with a live LayoutObject the whole time, so
+the wrapper's render-tree slot doesn't get torn down; only
+its child LayoutObjects do (as the children move out). The
+"keep render objects alive" idea would have to mean keeping
+the wrapper in `.pagedjs_pages` with all its children, which
+is the display:none baseline -- ~48 s render.
+
+The trade-off is therefore not "keep things alive vs. let GC
+collect them"; it's "be a live render-tree sibling vs. not".
+Anything that keeps the wrapper as a live sibling pays the
+~1.1 us per wrapper-sibling per gBCR call shown above, and the
+gBCR firings compound that into seconds across 1651 pages.
+
+## Chasing the residual `(idle)` to requestAnimationFrame
+
+A second axis of the same investigation. The post-findRef-fix
+profile showed `(idle) 735 ms (4.6 %)` -- not huge, but non-zero
+and worth understanding. `(idle)` in a V8 CPU profile means
+samples taken while the main thread had nothing scheduled --
+waiting on async/await, microtask queue settling, requestAnimationFrame
+ticks, or other browser-internal yields.
+
+### Hypothesis 1: microtask boundaries from `await Hook.trigger(...)`
+
+The chunker's per-page loop has 5-6 `await this.hooks.X.trigger(...)`
+calls per page. `Hook.trigger()` wraps every sync handler in a fresh
+Promise and returns `Promise.all(promises)`, so the caller always
+awaits a thenable -- a microtask boundary per await even when every
+handler resolved synchronously. 5 boundaries × 1651 pages ≈ 8,255
+yields; if each yield is ~85 us in V8 it lines up with the 735 ms.
+
+Patched it: `Hook.trigger()` returns `undefined` when no handler
+returned a thenable, callers do
+`let p = hook.trigger(...); if (p) await p;` to skip the await on
+the sync fast path. Patched at six hot per-page sites (3 in
+`chunker.layout`, 3 in `chunker.handleBreaks`).
+
+Result: render went **up** by ~0.35 s on a 2-run paired A/B
+(14.57 s -> 14.92 s avg). `(idle)` in the profile went **up too**
+(735 ms -> 1223 ms in absolute terms). Microtask boundaries are
+~30 us each at the JIT level; the V8 sampler at 1 ms intervals
+hardly catches them, so they show up as `(program)` rather than
+`(idle)`. The patch shaved microtask scheduling cost in the
+single-digit percent range but added a branch on every Hook.trigger
+call -- net wash, slight regression. **Reverted.**
+
+### Hypothesis 2: ResizeObserver firing per page
+
+Per page, `Page.addResizeObserver` creates a fresh `ResizeObserver`
+that fires its callback asynchronously from the compositor thread
+back to main. The callback wraps work in `requestAnimationFrame`,
+so each RO firing schedules a frame-tick wait. 1651 pages × ~0.5 ms
+per RO-rAF round-trip ≈ ~800 ms. Plausible.
+
+Two-step probe:
+1. **Skip the rAF wrap inside the RO callback**, run synchronously.
+   Result: `(idle) 902 ms`. No improvement, possibly slightly worse.
+2. **Disable the ResizeObserver entirely** (early-return in
+   `addResizeObserver`). Result: `(idle) 1,074 ms`. Still no
+   improvement.
+
+Neither helped. The RO isn't the source -- the per-page
+`addResizeObserver` overhead is real, but it doesn't show up in
+the `(idle)` bucket. Restored upstream behaviour.
+
+### Hypothesis 3: the chunker's `Queue.tick` is `requestAnimationFrame`
+
+The chunker drives its per-page work through a `Queue` class
+(`paged.browser.js:2666`). The queue's constructor sets:
+
+```js
+this.tick = requestAnimationFrame;
+```
+
+and `Queue.run()` schedules each iteration via
+`this.tick.call(window, () => { ... });`. Chunker's `render()`
+loops over `this.q.enqueue(() => this.renderAsync(renderer))`
+once per page. Every per-page iteration therefore waits one rAF
+tick before processing.
+
+`requestAnimationFrame` waits for the next animation frame. In
+headless puppeteer with no display, rAF still delivers callbacks
+on a regular cadence (Chromium's headless mode default is around
+60 Hz off-screen / ~16 ms per frame, with the scheduler often
+batching tighter than that). Either way, per-page rAF waits
+across 1651 pages add up to several hundred milliseconds of pure
+main-thread idle.
+
+The fix is one line:
+
+```js
+this.tick = (cb) => queueMicrotask(cb);
+```
+
+`queueMicrotask` schedules the callback on the microtask queue --
+runs before returning to the event loop, microsecond-scale latency
+instead of millisecond-scale. The `Queue` doesn't depend on rAF
+semantics (no paint coordination, no frame-budget yielding --
+it's just a serializer that wants to run tasks back-to-back).
+
+Verification (paired 2-run A/B, `--detach-pages`, no
+instrumentation, no cpu-profile):
+
+| run | BEFORE render | AFTER render |
+| --- | --- | --- |
+| 1 | 14.62 s | 11.86 s |
+| 2 | 14.51 s | 12.12 s |
+| **avg** | **14.57 s** | **11.99 s** |
+
+**Δ = -2.58 s render (-18 %).** Larger than the 735 ms `(idle)`
+that prompted the look -- because rAF was costing real (program)
+work too (V8 scheduler, microtask queue draining around the rAF
+boundary), not just idle wait. CPU profile of the fixed render:
+
+```
+   self_ms   self_%   function
+   -------   ------   ----------------------------------------------
+   4355.74   34.75%   getBoundingClientRect
+   1935.89   15.45%   removeChild
+   1934.11   15.43%   (program)             (was 5872 -- down ~4 s)
+    636.43    5.08%   removeOverflow
+    -- (idle) absent from the top 10, < 130 ms (1 %)
+```
+
+`(idle)` dropped out of the top 10 (< 130 ms / 1 %), `(program)`
+dropped from 5872 ms to 1934 ms (-4 s), `removeChild` dropped
+slightly (2426 ms -> 1935 ms; smaller render = same per-call cost
+× same call count, so this is sampling artefact, not a real
+change). PDF byte size unchanged (within standard timestamp
+drift). Shipped.
+
+### What the three hypotheses together teach
+
+`(idle)` in a V8 CPU profile attribution table is **not** primarily
+microtask scheduling -- those are too fast to sample. It's
+genuinely-waiting time, where the main thread had no V8 work to do.
+The dominant source of waiting in our render was not async/await,
+not ResizeObserver coalescing, but a `requestAnimationFrame`
+buried in the chunker's task queue. Replacing it with
+`queueMicrotask` collapses the per-page wait, and additionally
+shrinks the surrounding V8 scheduler work because each rAF
+callback came with its own setup / teardown overhead.
+
+The pattern to remember: if a profile shows non-trivial `(idle)`
+in a render-style workload, hunt for explicit `requestAnimationFrame`
+/ `setTimeout` / `requestIdleCallback` calls in the hot path before
+investigating microtask machinery. The frame-paced scheduler is a
+much bigger lever than the microtask scheduler.
+
+### Follow-up: the `Queue` itself was unnecessary indirection
+
+The chunker's `render()` routes each per-page iteration through
+`this.q.enqueue(() => this.renderAsync(renderer))`. The queue's
+job is to serialize tasks -- but an async generator is already
+inherently serial (you can't call `.next()` twice in parallel).
+With the rAF-tick fix above, the queue was reduced to a
+`queueMicrotask` hop plus a Promise/deferred allocation per page,
+for no purpose.
+
+Dropped the indirection: `render()` now iterates `renderer.next()`
+directly. The `Queue` class still exists in the bundle for the
+`onOverflow` re-render path (which is rare in practice), but the
+hot per-page loop bypasses it.
+
+This is a structural simplification more than a measurable speedup
+-- the queueMicrotask hop was already cheap and the deferred
+allocation amortizes. But it removes a layer that was doing
+nothing useful for our use case, which is the point of
+maintaining a fork.
diff --git a/perf/notes/04-sync-and-inner-loop.md b/perf/notes/04-sync-and-inner-loop.md
new file mode 100644
index 0000000..85a8c88
--- /dev/null
+++ b/perf/notes/04-sync-and-inner-loop.md
@@ -0,0 +1,941 @@
+# Stripping async machinery and shrinking `Layout.append`
+
+Removing headless-irrelevant async coordination from paged.js (hook fast-path, sync chain end-to-end through the per-page hot path), then a sequence of small wins inside `Layout.append`: footnote NodeList fast-path, parent-lookup cache, `Hook.triggerSync` empty-handlers fast-path, footnotes self-disable; finally skipping `wrapContent`'s innerHTML round-trip and fixing two bugs in the adaptive `maxChars` overflow-check rhythm.
+
+## Stripping headless-irrelevant async machinery
+
+paged.js was designed to be fully usable in interactive browser
+work. The async coordination patterns it carries -- always
+returning Promises from hook triggers, awaiting microtask
+boundaries between every phase, deferring tasks via animation
+frames -- pay off when the same engine is rendering inside a
+visible page that needs to stay responsive, coordinate with the
+compositor, and tolerate handlers that load external resources.
+
+In our headless puppeteer pipeline, none of that is true:
+
+- The page is offscreen; no compositor to coordinate with.
+- We don't care if any individual page-render blocks for tens of
+  milliseconds, because the browser isn't trying to repaint.
+- Every handler we register is synchronous. No hook needs to
+  await anything.
+- The book HTML is loaded before render starts (`page.goto(url,
+  { waitUntil: "load" })`), so every image's `.complete` flag is
+  already true. No image-loading awaits ever actually wait.
+
+Each remaining async wrapper is overhead we pay for a flexibility
+we never use. We're maintaining a task-specific fork; we can keep
+peeling layers as long as the simplifications don't change observed
+output.
+
+### Phase 1: hook fast-path
+
+`Hook.trigger()` upstream always wraps sync handler results in
+`new Promise(resolve => resolve(executing))` and returns
+`Promise.all(promises)`. The chunker's per-page loop awaits each
+of `beforePageLayout`, `afterPageLayout`, and `finalizePage`. With
+all six of our registered handlers running synchronously,
+`await trigger(...)` was a no-work microtask boundary per call.
+
+Patch: `Hook.trigger()` returns `undefined` when no handler
+returned a thenable. Callers in the per-page hot path become:
+
+```js
+let _p = this.hooks.X.trigger(...);
+if (_p) await _p;
+```
+
+The microtask boundary is skipped entirely on the sync fast
+path. Patched at six per-page sites (three in `chunker.layout`,
+three in `chunker.handleBreaks`).
+
+CPU profile comparison (post-queue-tick + drop-queue baseline vs
+post-Phase-1):
+
+| metric | baseline | Phase 1 | Δ |
+| ------ | -------- | ------- | --- |
+| samples | 7,353 | 6,902 | -451 |
+| profile duration | 13.07 s | 12.22 s | **-0.85 s (-6.5 %)** |
+| `getBoundingClientRect` self | 4,622 ms | 4,273 ms | -349 ms |
+| `(program)` self | 1,873 ms | 1,874 ms | flat |
+| `removeChild` self | 1,885 ms | 1,913 ms | flat |
+| `removeOverflow` self | 592 ms | 579 ms | flat |
+| `(idle)` self | n/a (< 130 ms) | n/a (< 130 ms) | flat |
+
+The 451 fewer samples account for ~800 ms of saved CPU work.
+`getBoundingClientRect`'s self-time dropped by ~350 ms; the rest
+is distributed across many small hot spots that all shrank
+slightly because they were each preceded by fewer microtask
+yields. No new hot spot appeared.
+
+> [!NOTE]
+> We compare CPU-profile sample counts and self-times here, not
+> wall-clock. Wall-clock includes I/O variance and system load on
+> the dev machine; CPU profile sample times are independent of
+> those and more reliable for "did this actually change CPU work."
+> Wall-clock numbers from these runs are noted where useful for
+> sanity-checking but aren't the primary signal.
+
+Shipped. The fix is small (one helper change + six call-site
+edits) and removes about 8k microtask boundaries from the
+per-page hot loop on a 1651-page render.
+
+### Phase 2: sync chain end-to-end through the per-page hot path
+
+With Phase 1 in place, every remaining per-page `await` in the
+chunker is an unconditional await on a function that returns a
+Promise even when nothing is actually awaitable. The structural
+answer is to make those functions plain sync functions.
+
+The chain, top to bottom of the per-page call tree:
+
+```
+chunker.*layout()              (async generator → sync generator)
+  chunker.handleBreaks()       (async → sync)
+  page.layout()                (async → sync)
+    Layout.renderTo()          (async → sync)
+      Layout.waitForImages()   (async → sync, throws if not preloaded)
+chunker.render() loop          (still async at the outer edge;
+                                renderer.next() now sync)
+```
+
+Phase 2 converts each step. The only function that *could* have
+been genuinely async -- `waitForImages` -- is now a synchronous
+check: it walks the supplied `<img>` nodes and throws if any
+isn't `.complete`. In our pipeline,
+`page.goto(url, { waitUntil: "load" })` settles before paged.js
+is invoked, so every image is already loaded; the throw is a
+safety net for pipeline bugs, not a runtime path we expect to
+take.
+
+The hook triggers in the per-page hot path keep the Phase 1
+fast-path semantics but switch from
+`let _p = hook.trigger(...); if (_p) await _p;` to
+`_assertSync(hook.trigger(...), "hook-name")`. The helper throws
+if a handler ever returns a thenable -- the same safety pattern
+as `waitForImages`. None of our shipping handlers do.
+
+Dead code removed in the same pass: `Chunker.renderAsync` and
+`Chunker.renderOnIdle`, both unreachable since the earlier
+drop-queue change stripped their only caller. Together ~30 lines
+of async machinery that existed only to wrap the (now sync)
+`renderer.next()` call.
+
+CPU profile (Phase 1 baseline vs Phase 2):
+
+| metric | Phase 1 | Phase 2 | Δ |
+| ------ | -------- | ------- | --- |
+| samples | 6,902 | 6,948 | +46 |
+| profile duration | 12.22 s | 12.35 s | +0.13 s (noise) |
+| `getBoundingClientRect` self | 4,273 ms | 4,524 ms | +251 ms (noise) |
+| `(program)` self | 1,874 ms | 1,909 ms | +35 ms |
+| `removeChild` self | 1,913 ms | 1,883 ms | -30 ms |
+| `removeOverflow` self | 579 ms | 523 ms | -56 ms |
+
+Phase 2 sits inside the run-to-run noise band on CPU time --
+the per-call CPU cost of an `await` on an already-settled Promise
+is small (a handful of microseconds), and Phase 1 already
+eliminated most of the boundary count. **What Phase 2 buys is
+not measurable CPU time -- it's structural simplicity.**
+
+Code shape, before and after:
+
+- 6 fewer `async` keywords on hot-path methods.
+- 13 fewer `await` keywords in the bodies of those
+  methods (the per-page chain no longer threads `await` through
+  any of its layers).
+- One async generator (`async *layout`) → sync generator
+  (`*layout`).
+- Two dead methods removed (`renderAsync`, `renderOnIdle`).
+- Two `_assertSync` guards added at the chunker's hook call
+  sites + one at `waitForImages` -- the contract we now rely on
+  (per-page handlers all synchronous, every `<img>` preloaded)
+  is enforced at runtime with a useful error message.
+
+PDF output is **byte-identical** to the Phase 1 build on this
+content (`async-phase1/book.pdf` and `async-phase2/book.pdf`
+both 16,893,546 bytes -- a rare 0-byte timestamp drift, but
+the structural content is identical regardless).
+
+This is the kind of cleanup that's only worth doing because
+we maintain a task-specific fork of the bundle. Upstream
+paged.js has to support handlers that await fetches or image
+loads or font measurements -- our pipeline never registers one.
+Removing the async machinery in our copy shrinks the surface to
+reason about and makes the data-flow direct: a render is a
+plain function call that produces a plain return value.
+
+### What's still async, and why
+
+> **Update.** All four survivors listed below were
+> subsequently stripped -- see *Following `RunMicrotasks`
+> down to zero* in
+> [06-microtasks-pageranges-css.md](06-microtasks-pageranges-css.md).
+> The reasoning
+> here ("once-per-render, overhead irrelevant") was
+> correct as a per-call cost argument but missed that
+> the unbroken await chain forced V8 to attribute the
+> entire post-`loadFonts` render to a microtask
+> continuation (`RunMicrotasks` in the trace,
+> `(program)` in the cpu profile). Re-attribution alone
+> was worth the conversion; wall-clock is unchanged.
+> The list below is preserved for chronological accuracy.
+
+The async machinery that survives this audit is now at the
+once-per-render layer, where it's load-bearing:
+
+- `Chunker.flow()` is async because `loadFonts()` waits on the
+  CSS font-face descriptor's load promise, which is actually
+  async and OS-level.
+- `Chunker.render()` stays `async` as a thin wrapper so callers
+  in `flow()` can `await` it (the alternative would be to
+  remove `async` and have `flow()` not await it, but the call
+  site reads more clearly with the `await` retained).
+- `beforeParsed`, `afterParsed`, `afterRendered` hooks are still
+  awaited with the `await hook.trigger(...)` form because they
+  fire once per render and the overhead is irrelevant.
+- The `onOverflow` recovery path (`Chunker.q.enqueue(async ...)`)
+  re-renders the document if any page overflows after paint. In
+  practice this never fires for our content, but keeping the
+  recovery code intact costs nothing and preserves behaviour for
+  edge cases.
+
+The hot per-page path is now `function`, `function*`, plain
+return values, and a `while` loop. Future work that touches
+this code can reason about it as straight-line synchronous
+flow.
+
+## Doing less work in `Layout.append()`
+
+Picking the next hotspot after the async cleanup, BreakToken
+JSON, gBCR wrapper inline, and UUID-counter changes had all
+landed. Fresh profile from a clean baseline at 100us sampling
+(V8 effectively clamped this to ~543us/sample on this Node/
+Chromium build), `--no-timing --detach-pages`, render-only:
+
+```
+   self_ms   self_%   function  @  source
+   -------   ------   --------------------------------------------------
+   4825.28   38.22%   getBoundingClientRect       (native)
+   2021.89   16.02%   (program)                   (native)
+   1954.01   15.48%   removeChild                 (native)
+    635.95    5.04%   removeOverflow              paged.browser.js
+    288.38    2.28%   wrapContent                 paged.browser.js
+    255.25    2.02%   insertBefore                (native)
+    227.01    1.80%   appendChild                 (native)
+    164.01    1.30%   findOverflow                paged.browser.js
+    140.66    1.11%   (garbage collector)         (native)
+    138.49    1.10%   afterPageLayout             paged.browser.js (Splits)
+    129.25    1.02%   cloneNode                   (native)
+    125.99    1.00%   addRefs                     paged.browser.js
+     90.15    0.71%   renderTo                    paged.browser.js
+     81.46    0.65%   filterTree                  paged.browser.js
+     80.92    0.64%   importNode                  (native)
+     80.38    0.64%   setAttribute                (native)
+     72.77    0.58%   append                      paged.browser.js
+     ...
+```
+
+The four heavy hitters are unchanged from earlier reports.
+`Layout.append` itself shows only 73 ms of self-time, but
+inclusively it owns a large fraction of the per-source-node
+work: `cloneNode`, `appendChild`/`insertBefore`, the
+`findElement` chain (`querySelector` + `getAttribute`), the
+`renderNode` hook dispatch, and `rebuildAncestors` at page
+boundaries all flow through it. With ~100k+ source-node
+clones per render, anything per-call adds up.
+
+Reading the body of `append()`, three things stood out as
+potentially-reducible:
+
+1. The `renderNode` hook dispatch fires for every cloned
+   node. Even if no handler is registered, `triggerSync`
+   still allocates a results array, runs `this.hooks.forEach`
+   over zero entries, and returns the empty array; the
+   caller then runs its own `.forEach` over that empty array.
+2. The `findElement(node.parentNode, dest)` lookup goes
+   through `getAttribute("data-ref")` on the parent. The
+   ref is also set on every source element at decoration
+   time, so the value could be stashed on a plain JS expando.
+3. `clone.dataset.ref` is read a second time at the end of
+   `append()` to register the clone in `dest.indexOfRefs`.
+   Same expando trick applies.
+
+Following the (1) thread first uncovered two separable wins:
+a bug inside the only registered `renderNode` handler, and
+the broader empty-handlers dispatch overhead.
+
+### `Footnotes.renderNode`: always-truthy NodeList condition
+
+The grep for `renderNode` method definitions in the bundle
+returns exactly one match: `Footnotes.renderNode` (in the
+package's footnotes-handling class). Every `append()` call
+goes through it. Its body:
+
+```js
+renderNode(node) {
+    if (node.nodeType == 1) {
+        let notes;
+        if (!node.dataset) return;
+
+        if (node.dataset.note === "footnote") {
+            notes = [node];
+        } else if (node.dataset.hasNotes ||
+                   node.querySelectorAll("[data-note='footnote']")) {
+            notes = node.querySelectorAll("[data-note='footnote']");
+        }
+
+        if (notes && notes.length) {
+            this.findVisibleFootnotes(notes, node);
+        }
+    }
+}
+```
+
+The `else if` condition has an upstream bug: a `NodeList` is
+always truthy (even an empty one -- it's an object), so when
+`dataset.hasNotes` is undefined the right arm of the `||`
+runs `querySelectorAll`, the condition evaluates true, and
+the next line then runs `querySelectorAll` **a second time**.
+Two subtree scans per element-node clone, for any document
+that doesn't author `data-note='footnote'` directly.
+
+`grep -c 'data-note' docs/_site-pdf/book.html` returns 0 --
+every one of those scans on every clone of every page of
+the book was dead work.
+
+The fix narrows the `else if` to the original intent:
+
+```js
+} else if (node.dataset.hasNotes) {
+    notes = node.querySelectorAll("[data-note='footnote']");
+}
+```
+
+Profile delta (post-tojson baseline vs surgical fix):
+
+| metric | baseline | post-fix | Δ |
+| ------ | -------- | -------- | --- |
+| render wall | 12.63 s | 12.63 s | flat (within noise) |
+| `querySelectorAll` self | 67.9 ms | 52.8 ms | -15 ms |
+| samples | 23,313 | 23,250 | -63 |
+
+A small saving in absolute terms: most of the eliminated
+`querySelectorAll` calls were against tiny leaf subtrees
+that terminate in microseconds when no matches are present.
+The bug fix is upstream-clean and correct; the perf-relevant
+takeaway was that *most* of the work `append()` pays for the
+`renderNode` hook is in the dispatch wrapping the handler, not
+in the handler's body. That motivated the second win, below.
+
+### `Hook.triggerSync` empty-handlers fast-path
+
+Mirrors the earlier *Phase 1: hook fast-path* (in *Stripping
+headless-irrelevant async machinery* above), which did the same for
+the async `trigger()` path. `Hook.triggerSync` previously:
+
+```js
+triggerSync() {
+    var args = arguments;
+    var context = this.context;
+    var results = [];
+    this.hooks.forEach(function (task) {
+        var executing = task.apply(context, args);
+        results.push(executing);
+    });
+    return results;
+}
+```
+
+…and the four reducer call sites in `Layout` always did:
+
+```js
+let r = this.hooks.X.triggerSync(...);
+r.forEach((newVal) => { if (newVal !== undefined) target = newVal; });
+```
+
+Walking the bundle to see which of those four hook arrays
+are actually populated in our build:
+
+| call site | hook | handlers registered |
+| --------- | ---- | ------------------- |
+| `breakAt` (line 1551) | `onBreakToken` | 0 |
+| `append` (line 1640) | `renderNode` | 1 (`Footnotes`) |
+| `findBreakToken` (line 1805) | `onOverflow` | 0 |
+| `findBreakToken` (line 1815) | `onBreakToken` | 0 |
+| `Chunker.flow` (line 2910) | `filter` | 4 |
+
+Three of the four hot `Layout` sites (the table's first four
+rows) are dispatching against an empty handler array every
+call. `onOverflow` and the two `onBreakToken` sites all fire
+from the per-page break-detection path, which can run more
+than once per page when overflow-and-retry happens.
+
+Patch: `triggerSync` returns `undefined` on the empty path,
+callers guard their reducer `forEach` with a truthy check.
+
+```js
+triggerSync() {
+    if (this.hooks.length === 0) return undefined;
+    // ...existing body
+}
+```
+
+```js
+let r = this.hooks.X.triggerSync(...);
+if (r) r.forEach((newVal) => { ... });
+```
+
+Profile delta (post-surgical vs post-fast-path):
+
+| metric | post-surgical | post-fast-path | Δ |
+| ------ | ------------- | -------------- | --- |
+| render wall | 12.63 s | **12.14 s** | **-0.49 s** |
+| samples | 23,250 | 22,433 | -817 |
+| `getBoundingClientRect` self | 4,819 ms | 4,714 ms | -105 ms |
+| `removeChild` self | 1,962 ms | 1,902 ms | -60 ms |
+| `removeOverflow` self | 634 ms | 552 ms | -82 ms |
+| `querySelectorAll` self | 52.8 ms | 43.4 ms | -10 ms |
+
+The wall-clock drop (~490 ms) and sample drop (817 × 542 us
+≈ 443 ms) line up cleanly, so the saving is real, not run-
+to-run noise. The reductions spread across rows because the
+per-call cost of an empty `triggerSync` -- an array alloc, a
+forEach over zero entries, a return, and the caller's own
+forEach over the returned `[]` -- creates pressure on the
+allocator and the V8 inliner that compounds on the per-page
+hot path even though no single line attributes the cost.
+
+The `renderNode` site at line 1640 does **not** hit the fast
+path in this build -- `Footnotes` still occupies it with one
+handler, so `hooks.length === 1` and the body runs as
+before. The savings come entirely from the three zero-
+handler sites.
+
+### `Footnotes` self-disables when no footnotes are in source
+
+That left the per-element `Footnotes.renderNode` dispatch
+still firing on every cloned node, plus four other hook
+methods `Footnotes` registers via the `Handler` base auto-
+wiring. Inventory of what `Footnotes` is doing on a render
+with zero footnote-marked nodes:
+
+| method | fires | what it does on a footnote-free doc |
+| ------ | ----- | ----------------------------------- |
+| `onDeclaration` | per CSS declaration | quick property-name checks. Cheap. |
+| `renderNode` | per element-node clone | short-circuits after surgical fix. |
+| `beforePageLayout` | once per page | checks `this.needsLayout.length` (always 0). Cheap. |
+| `afterPageLayout` | once per page | **3 `querySelector`s + `getBoundingClientRect` + `new Layout(...)` (which does 2 more `getBoundingClientRect`s + `getComputedStyle` in its constructor) + `findOverflow()` on the footnote-inner-content area.** Real work. |
+| `afterOverflowRemoved` | per overflow detection | `querySelectorAll` returning empty. Cheap-ish. |
+
+The big hidden cost was `afterPageLayout` -- ~1,650 calls per
+render, each measuring an empty footnote area through several
+DOM ops and constructing a transient `Layout` instance whose
+constructor itself does multiple gBCRs.
+
+The detect-and-disable plan:
+
+1. Footnotes is the *only* registrant for each of its hook
+   methods (`onDeclaration` aside -- it's a polisher-time
+   hook with other registrants, but it's also cheap).
+2. By the time `afterParsed` fires, both the CSS-driven
+   selectors (populated by `onDeclaration` calls into
+   `this.footnotes`) and any source-HTML `data-note` markers
+   are accounted for. `Footnotes.afterParsed` already runs
+   `processFootnotes(parsed, this.footnotes)` which writes
+   `data-note='footnote'` on any element matching a CSS
+   selector. So a single `parsed.querySelector(
+   "[data-note='footnote']")` at the end of that pass is
+   conclusive.
+3. If null, splice `Footnotes`'s bound functions back out
+   of each hook array. With the empty-handlers fast-path
+   (previous section) already landed, the per-page and
+   per-node dispatches then return `undefined` immediately
+   and callers skip their reducer `forEach`.
+
+To enable (3), the `Handler` base class gets a small
+addition: each `(hook, bound)` pair from auto-registration
+is stashed under its hook name on `this._registered`, and a
+new `_unregisterAll(except)` method splices each entry back
+out. The `except` argument lets the caller skip the hook
+it's currently inside (`afterParsed` in this case) --
+splicing the array we're iterating would cause the
+surrounding `trigger()` loop to skip a sibling handler.
+The skipped entry stays in `this._registered` forever, but
+it's a one-shot anyway: harmless.
+
+`Footnotes.afterParsed` then becomes:
+
+```js
+afterParsed(parsed) {
+    this.processFootnotes(parsed, this.footnotes);
+    if (!parsed.querySelector("[data-note='footnote']")) {
+        this._unregisterAll("afterParsed");
+    }
+}
+```
+
+Profile delta (post-fast-path vs post-self-disable):
+
+| metric | post-fast-path | post-self-disable | Δ |
+| ------ | -------------- | ----------------- | --- |
+| render wall | 12.14 s | **11.77 s** | **-0.37 s** |
+| samples | 22,433 | 21,809 | -624 |
+| **`getBoundingClientRect` self** | **4,714 ms** | **4,198 ms** | **-516 ms** |
+| `removeChild` self | 1,902 ms | 1,898 ms | flat |
+| `(program)` self | 2,022 ms | 2,198 ms | +176 ms |
+| `append` self | 76 ms | 69 ms | -7 ms |
+
+The 516 ms `getBoundingClientRect` drop is exactly the
+`Footnotes.afterPageLayout` cost that the inventory
+predicted -- one gBCR on `noteContent` plus two more in
+the `new Layout(noteArea, ...)` constructor plus internal
+gBCRs from `findOverflow()`, multiplied by ~1,650 pages.
+The `(program)` row growing by 176 ms is V8 reattributing
+work between native and self-time as the dispatch pattern
+changes; not new work, just a different breakdown.
+
+PDF output remained byte-identical to the previous build
+on this content (16.1 MB, same checksum on the raw
+Chromium output).
+
+### `Layout.append` parent-lookup cache
+
+When the source walker emits consecutive children of the
+same parent, `findElement(node.parentNode, dest)` in
+`append()` gets called repeatedly with the same input.
+For a parent with N children that's N - 1 redundant
+lookups -- each one cheap (`getAttribute("data-ref")` +
+`dest.indexOfRefs[ref]` is an O(1) dict hit on the fast
+path), but the call count is north of 100k per render.
+
+Patch: a three-property memo on `Layout` -- last
+`srcParent`, last `dest`, last `destParent`. Hit check at
+the top of `append`, writeback at the bottom after the
+parent is resolved (whether via direct lookup or via the
+rebuild-ancestors branch, since the rebuild attaches the
+cloned ancestor into `dest`).
+
+Invalidation: reset all three at the top of every
+`renderTo`. The cache is safe within a single `renderTo`
+loop because `append()` never detaches DOM from `dest`,
+and `removeOverflow` (the one thing that does) only fires
+at loop exit. Across `renderTo` calls on the same `Layout`
+instance the previous run's `removeOverflow` may have
+detached the cached parent, so the explicit reset is the
+correctness guard.
+
+Profile delta (post-self-disable vs post-parent-cache):
+
+| metric | post-self-disable | post-parent-cache | Δ |
+| ------ | ----------------- | ----------------- | --- |
+| render wall | 11.77 s | 11.72 s | flat (within noise) |
+| samples | 21,809 | 21,688 | -121 (~65 ms) |
+| `(program)` self | 2,198 ms | 2,169 ms | -29 ms |
+| `getAttribute` (native) | 43 ms | off-list (<40 ms) | -3 ms+ |
+| `querySelector` (native) | 63 ms | 59 ms | -4 ms |
+| `Layout.append` self | 69 ms | 70 ms | flat |
+
+Order ~50-100 ms saved depending on the row chosen, fully
+below the run-to-run wall-clock noise band but visible in
+the cpuprofile rows. The math checks: ~100k append calls
+× ~80 % sibling-cache-hit rate × ~1 us per skipped
+findElement ≈ 80 ms.
+
+PDF output byte-identical.
+
+### What didn't land: the `_ref` expando
+
+One sibling candidate to the parent-lookup cache was
+tried and reverted. The idea: mirror `data-ref` onto a
+plain JS property `_ref` at decoration time (in
+`ContentParser.addRefs`), propagate via the `cloneNode`
+helper, and read it in `findElement` and `append`'s
+postlude instead of `getAttribute("data-ref")` /
+`clone.dataset.ref`. Both reads in the hot path become
+plain JS property loads instead of going through C++ DOM
+attribute fetches or the `DOMStringMap` proxy.
+
+Measured win on the per-row breakdown:
+
+- `Layout.append` self 69 -> 47 ms (-22 ms).
+- `getAttribute` native 43 ms -> off-list (-3+ ms).
+
+About 25 ms of real per-call work removed. Reverted: the
+saving is genuinely smaller than the diff's surface --
+`cloneNode` helper has to propagate an extra property,
+the `data-ref` attribute has to stay for CSS selectors
+and the `querySelector` fallback in `findRef`, `findElement`
+needs a `||` fallback to keep direct `.cloneNode()`
+callers in `rebuildAncestors` working unchanged, and any
+future code that wants the ref has two places it could
+read from. Not worth maintaining for a saving that
+doesn't move single-run wall-clock.
+
+Lesson worth carrying forward: at this point in the
+codebase, per-call findElement / `dataset.ref` work has
+been ground down close enough to its floor that any
+further shave produces savings in the 20-50 ms band, well
+below the run-to-run wall-clock noise on this machine.
+Reading the cpuprofile per-row deltas is the only way to
+tell whether such a change is genuine; reading wall-clock
+isn't. And the bar for landing scales with the size of
+the diff -- the parent-cache landed because it's three
+property writes and one branch; the expando didn't
+because it's a propagation pattern that ripples through
+the bundle.
+
+### Cumulative effect
+
+Across all four landings:
+
+| metric | pre-investigation | post-parent-cache | Δ |
+| ------ | ----------------- | ----------------- | --- |
+| render wall | 12.63 s | 11.72 s | **-0.91 s (-7.2 %)** |
+| samples | 23,313 | 21,688 | -1,625 |
+| `getBoundingClientRect` self | 4,825 ms | 4,194 ms | -631 ms |
+| `removeChild` self | 1,954 ms | 1,897 ms | -57 ms |
+| `removeOverflow` self | 636 ms | 583 ms | -53 ms |
+| `getAttribute` (native) | ~125 ms* | off-list (<40 ms) | -85 ms+ |
+
+\* Inferred from the post-tojson baseline rank; not
+explicitly tabulated in the top-25 cut at that time.
+
+The `Handler._registered` + `_unregisterAll(except)` plumbing
+is reusable: any future handler that determines at
+parse/decoration time that it has nothing to do for a given
+render can self-disable the same way, and the
+empty-handlers fast-path will swallow the per-call dispatch
+cost for free. That's the pattern this work leaves behind --
+combine "detect once at a known-quiet point" with "remove
+yourself from the dispatch chain" and you pay zero
+ongoing cost for inactive handlers.
+
+## Skipping the `wrapContent` innerHTML round-trip
+
+The post-parent-cache profile's 5th-largest JS row was
+`wrapContent` at 260 ms. It's called once per render, right
+at the top of `Chunker.flow`, so unlike the previous fixes it
+has no per-page hot path -- the absolute size is the whole
+story.
+
+`Layout.wrapContent` lifts the entire `<body>` into a
+`<template data-ref='pagedjs-content'>` so the chunker can
+iterate the source without disturbing the live DOM. Original:
+
+```js
+template.innerHTML = body.innerHTML;
+body.innerHTML = "";
+body.appendChild(template);
+```
+
+Two heavy halves, both linear in document size:
+
+1. **`body.innerHTML` getter**: walks every node in the body
+   and serialises the entire subtree to one HTML string.
+2. **`template.innerHTML = ...` setter**: hands the string to
+   the HTML parser, which reparses it into a fresh tree
+   inside the template's contents-owner document.
+
+On our 5.5 MB book, the round-trip is exactly 260 ms.
+`find-callees.mjs` confirms 99 % of that lives in the JS frame
+itself (the C++ serialiser/parser get attributed back to the
+calling frame, same trick `removeOverflow`'s `Range`
+deletion uses):
+
+```
+wrapContent: self=259.97ms, total=262.15ms (callees=2.18ms)
+per direct callee (subtree total ms):
+      2.18 ms   querySelector  @  (native):0
+```
+
+The fix moves children directly into a plain
+`DocumentFragment`, no string round-trip:
+
+```js
+let fragment = document.createDocumentFragment();
+while (body.firstChild) fragment.appendChild(body.firstChild);
+template = document.createElement("template");
+template.dataset.ref = "pagedjs-content";
+template._pagedjsContent = fragment;  // re-entrancy stash
+body.appendChild(template);
+return fragment;
+```
+
+### Why a plain fragment, not `template.content`
+
+The first cut moved children into the template's content,
+which is the obvious shape since `wrapContent` was already
+returning `template.content`. It crashed on the first page:
+
+```
+paged.js (forked): image not loaded at render time.
+Image: file:///.../Features/Images/b0724fe2-....png
+   at Layout.waitForImages
+   at Layout.renderTo
+```
+
+The reason is in the spec. A `<template>`'s `content` fragment
+is owned by a separate "template contents owner document"
+that has no browsing context -- resources inside it never
+load. Moving a live `<img>` into `template.content` triggers
+`adoptNode` to that inert document, which then runs the
+"update the image data" algorithm, creates a fresh request
+in state "unavailable", and flips `.complete` to false. The
+source image is now stuck in that state; clones into the live
+page wrappers inherit it without the synchronous cache-hit
+path firing in time for the sync `[PATCH: assert-sync]`
+`waitForImages` check.
+
+The `innerHTML` round-trip avoids this incidentally: the
+freshly-parsed `<img>` elements in `template.content` are
+brand new (never live), they have no prior load state to
+disturb, and when their clones land in the live page wrappers
+Chromium's file:// cache lookup resolves them synchronously.
+
+A plain `DocumentFragment` is owned by the live document.
+Moving children into it is a same-document append -- no
+adoption, no "update the image data", no `.complete` reset.
+Clones from the fragment into the live page wrappers then
+take the same fast cache path the round-trip's parsed images
+did.
+
+### Re-entrancy
+
+The original returned `template.content`, so a second call
+finding the existing template just returned that same
+fragment. Under the move strategy `template.content` is
+empty (the children live in the plain fragment we returned),
+so the re-entrant branch reads the fragment back off a
+`template._pagedjsContent` expando on the marker template.
+Functionally equivalent for the one-call-per-render case
+that's actually exercised; preserves the multi-call contract
+in case anyone leans on it later.
+
+### Results
+
+Paired A/B, 2 runs each, `--detach-pages --no-timing
+--cpu-profile --cpu-sampling 100`:
+
+| run | pre | post |
+| --- | --- | --- |
+| 1 | 11.92 s | 10.72 s |
+| 2 | 11.60 s | 11.06 s |
+| **avg** | **11.76 s** | **10.89 s** |
+
+**Δ = -0.87 s render (-7.4 %).** Larger than the 260 ms the
+profile attributed to `wrapContent` itself -- the round-trip
+also allocated a transient 5.5 MB string that pushed GC and
+distributed sample noise into the surrounding rows; removing
+the allocation relieves pressure across the whole per-page
+hot path. The per-row cpuprofile breakdown:
+
+| function | pre | post | Δ |
+| -------- | --- | ---- | --- |
+| `wrapContent` self | 260 ms | off-list (<25 ms) | **-260 ms+** |
+| `getBoundingClientRect` self | 4,281 ms | 4,036 ms | -245 ms |
+| `removeOverflow` self | 560 ms | 353 ms | -207 ms |
+| `removeChild` self | 1,871 ms | 1,730 ms | -141 ms |
+| `(program)` self | 2,298 ms | 2,152 ms | -146 ms |
+
+The `wrapContent` row is the only one outside the single-run
+noise band (*Methodology: compare profiles, not wall-clock*
+in [02-finalizepage.md](02-finalizepage.md) pins
+that at 50-150 ms for sub-1 % rows on this machine). The
+others are plausibly real but inseparable from noise without
+more runs; the sample-count delta (-2,100 samples × 542 us
+= ~1,135 ms) matches the wall-clock delta closely enough that
+the distributed component is probably real GC-pressure
+relief, not just sampler jitter.
+
+PDF byte-equivalent to the pre-fix build (16.1 MB).
+
+### What the pattern leaves behind
+
+`removeOverflow` and `wrapContent` are both cases where V8
+rolled native DOM work (`Range.deleteContents`,
+HTML serialiser+parser) into the calling JS frame's
+self-time. The diagnostic move is the same one we used for
+gBCR attribution: `find-callees.mjs` on the suspect frame.
+If self-time is ~100 % of total, the work is happening
+inside a native callee the sampler didn't name -- read the
+JS body to find which DOM API is doing the work and whether
+it can be replaced with a cheaper equivalent.
+
+`find-callees.mjs` was added for this investigation and
+sits alongside `find-callers.mjs`; the two together cover
+both directions of the V8 attribution edge.
+
+## The per-page overflow-check rhythm: two bugs in the adaptive `maxChars`
+
+*Attempt E: additive backoff* in
+[02-finalizepage.md](02-finalizepage.md) describes
+the per-page rhythm of `renderTo`'s overflow checks: append
+nodes, fire `findBreakToken` every `maxChars` chars of
+appended content, break out when it returns a non-null
+breakToken. `maxChars` defaults to 1500 and is meant to
+adapt up or down based on observed page capacity.
+
+The post-wrapContent profile showed `findOverflow` total
+2.24 s, almost all of it (1.96 s) in `hasOverflow`'s single
+gate gBCR -- one call per `findBreakToken`. Was the call
+count high because the page actually needs that many
+probes, or was the rhythm wrong?
+
+Instrumenting with `window.__breakCheckStats` and
+`window.__layoutMaxChars` answered it:
+
+```
+findBreakToken checks: 7,764  hits: 862  nulls: 6,902
+renderTo calls: 1651  checks/call avg: 4.70
+Layout.maxChars: first=1500  median=177  last=177  min=177  max=1500
+```
+
+Four findings:
+
+1. **89 % of checks (6,902 / 7,764) return null.** They're
+   "no overflow yet, keep appending" probes. Each is still
+   a full layout-flush gBCR. The actual overflow detections
+   are 862, slightly more than half of the 1651 pages
+   (the rest end naturally, or via CSS-driven breaks).
+
+2. **`Layout.maxChars` was locked at 177 for the entire
+   render** after page 1. That's an order of magnitude
+   below a typical page's capacity (which the @page CSS,
+   font size, and content density determine -- closer to
+   4000-4500 chars of body text on this book). Page 1 ran
+   with the default 1500; pages 2-1651 ran with 177.
+
+3. The reason was a propagation gate in `Page.layout`:
+   ```js
+   if (!settings.maxChars && maxChars) {
+       settings.maxChars = maxChars;
+   }
+   ```
+   `settings` is shared across all pages (one object, set
+   by reference in the Chunker constructor). The chunker
+   maintains a running estimate in `this.maxChars` via
+   `recordCharLength` and passes it into each page's
+   `layout(..., maxChars)`. But `!settings.maxChars` is
+   only truthy on the first page that gets a defined value
+   -- the rest see settings.maxChars already populated and
+   skip the update. Whatever value page 2 picked up (177,
+   from a freak short page 1 that had been recorded as
+   capacity), every subsequent page kept.
+
+4. The recording itself is biased. `recordCharLength` pushes
+   `page.wrapper.textContent.length` after every layout and
+   averages the last 4 values. Short pages -- chapter
+   endings, part dividers -- get recorded alongside full
+   pages, dragging the average well below true capacity.
+   Even with propagation fixed, the average would land
+   around 1200, not 4500.
+
+### The fix
+
+Two patches in `docs/lib/paged.browser.js`, marked
+`// [PATCH: maxChars-propagate]` and
+`// [PATCH: maxChars-running-max]`:
+
+1. **`Page.layout`'s gate drops the staleness check**:
+   `if (maxChars) settings.maxChars = maxChars;`. Each page
+   now picks up the chunker's current estimate.
+
+2. **`Chunker.recordCharLength` tracks the running max over
+   the last 16 pages** instead of the running average over
+   4. Max biases toward "the largest page recently seen,"
+   which approximates true capacity for our content. Short
+   pages still get pushed into the window but don't pull
+   the estimate down. The window of 16 is wide enough that
+   a transient stretch of short pages doesn't collapse the
+   estimate before a full page restores it.
+
+### Results
+
+Paired A/B, 2 runs each, `--detach-pages --no-timing`, no
+profiling:
+
+| run | pre | post |
+| --- | --- | --- |
+| 1 | 10.08 s | 8.15 s |
+| 2 | 11.86 s | 7.98 s |
+| **avg** | **10.97 s** | **8.07 s** |
+
+**Δ = -2.90 s render (-26 %).** CPU profile (single run,
+within noise band on the smaller rows):
+
+| metric                   | pre        | post       | Δ |
+| ------------------------ | ---------- | ---------- | --- |
+| `findOverflow` total     | 2,236 ms   | 1,690 ms   | **-546 ms** |
+| ↳ `hasOverflow` total    | 1,957 ms   | 1,597 ms   | -360 ms |
+| ↳ ↳ `gBCR` native        | 1,945 ms   | 1,587 ms   | -358 ms |
+| ↳ `findOverflow` self    | 142 ms     | 47 ms      | -95 ms |
+| ↳ walker-loop callees    | ~135 ms    | ~46 ms     | -89 ms |
+| `removeOverflow` self    | 353 ms     | 122 ms     | **-231 ms** |
+| `removeChild` self       | 1,731 ms   | 1,637 ms   | flat (noise) |
+| `(program)` self         | 2,152 ms   | 2,215 ms   | flat (noise) |
+
+The `removeOverflow` drop was the surprise. Going in, the
+concern was that bigger `maxChars` (now ~4500 instead of
+177) would mean larger overshoot when overflow fired -- so
+`extractContents` / `deleteContents` would have more nodes
+to detach. The opposite happened: `removeOverflow` self
+dropped two-thirds. The reason is the call count, not the
+per-call size. With `maxChars=177` the renderTo loop
+checked at every 177-char interval, but many of those
+checks were *near* the page boundary, where the walker in
+`findOverflow` did real work even when returning null
+(walking nodes to test text-break candidates that don't
+quite fit). With `maxChars=4500`, the very first check on
+most pages fires right at the overflow point; the walker
+runs once per page instead of several times, and the per-
+call work it does is roughly the same as before.
+
+PDF output is byte-identical to the pre-fix build
+(16.1 MB, same checksum on the raw Chromium output).
+
+### Why the average was the wrong statistic
+
+The textbook reason to track a running average is to
+estimate a stationary quantity in the presence of noise.
+The thing being estimated here -- "how many chars fit on a
+full page" -- is a tight ceiling, not a noisy reading: each
+page's textContent.length either equals page capacity
+(because the page broke for overflow) or is well below it
+(because content ran out / a CSS break fired). The
+distribution is bimodal, and the average sits between the
+modes -- exactly where it's worst as an estimator of
+either.
+
+The running max, by contrast, finds the upper mode and
+sticks to it. It only moves down if the entire window is
+sub-capacity pages, which means the document genuinely
+doesn't have full pages anymore (end of book, perhaps), at
+which point the estimate doesn't matter much.
+
+### Where this leaves the picture
+
+Render is now ~8 s on the 1651-page book, down from ~11 s
+post-wrapContent, down from ~104 s in the original
+baseline. Updated cumulative table:
+
+| fix                                 | render saved | shipped |
+| ----------------------------------- | ------------ | ------- |
+| `--detach-pages` (display:none)     |   ~55 s      | yes     |
+| aggressive detach (`removeChild`)   |   ~22 s      | yes     |
+| `renderTo` additive backoff         |   ~4.25 s    | yes     |
+| skip dead `findEndToken` path       |   ~3.5 s     | yes     |
+| `findRef` fast-path                 |   ~2.4 s     | yes     |
+| queue-tick: rAF -> queueMicrotask   |   ~2.6 s     | yes     |
+| `finalizePage` micro-optimisations  |   ~3 s       | yes     |
+| `wrapContent` move (skip innerHTML) |   ~0.9 s     | yes     |
+| **`maxChars` propagation + max**    | **~2.9 s**   | **yes** |
+| (others, smaller)                   |   ~3 s       | yes     |
+
+The strategic conclusion at the bottom of "Where this
+leaves the picture" updates accordingly: render is now
+roughly a quarter the size of generate (~8 s vs ~32 s wall on
+the production build), and `pageRanges` sharding remains
+the only knob with a profile target large enough to move
+the wall-clock total meaningfully -- and that target is
+generate, not render.
diff --git a/perf/notes/05-blink-trace.md b/perf/notes/05-blink-trace.md
new file mode 100644
index 0000000..90498d0
--- /dev/null
+++ b/perf/notes/05-blink-trace.md
@@ -0,0 +1,765 @@
+# Move-not-clone, `(program)` Blink trace, and the WhiteSpaceFilter
+
+What happened when we tried move-not-clone instead of clone-then-detach (the prediction was wrong; a `previousLeaf` cache shipped instead); cracking the cpu profile's `(program)` row open with a Blink-category trace; and a paired cpu-profile A/B that found the WhiteSpaceFilter wasn't worth its layout cost in our headless pipeline.
+
+## What happened when we tried move-not-clone
+
+A fresh `--detach-pages --no-timing --cpu-profile
+--cpu-sampling 100` baseline run showed `cloneNode` at
+~146 ms self-time, all of it inside `Layout.append`'s per-
+source-node clone path. `Layout.append`'s body for the
+`!shallow` (deep-cloned leaf) yields was:
+
+```js
+let clone = cloneNode(node, !shallow);  // deep clone
+// ... attach clone to dest ...
+return clone;
+```
+
+The user's question: source's read-only-template contract
+is just an artifact of paged.js's break-and-resume model.
+We're doing offline layout -- nothing reads source after
+the render finishes. Could we MOVE the source node into
+dest instead of cloning it, and avoid the allocation cost
+entirely? Best-case ceiling estimated at ~300-450 ms /
+~3-5 % of render (the cloneNode self plus distributed GC-
+pressure relief from not allocating ~250 k duplicate DOM
+nodes).
+
+### What the refactor required
+
+Three load-bearing assumptions in the chunker break the
+moment source is mutated:
+
+1. The walker traverses via live links
+   (`node.firstChild` / `nextSibling` / `parentNode`).
+   After a leaf yield, `walker = walk$2(nodeAfter(node,
+   source), source)` reads `nodeAfter` AFTER `append` has
+   moved `node` into dest -- the reads now go into dest's
+   tree, not source's. Fix: capture `nodeAfter(node,
+   source)` BEFORE the append call and pass it to the
+   walker reset.
+
+2. `BreakToken.node` stores a source-tree reference for
+   the next page's `getStart(source, breakToken)` to
+   resume from. `createBreakToken`'s four
+   `findElement(*, source)` call sites map rendered
+   (clone) nodes back to source via shared `data-ref`.
+   With moves, source has lost the leaves and findElement
+   returns the moved node now living in dest. Fix:
+   bypass `createBreakToken` entirely. Compute the
+   resume point from the extract-and-restore step
+   instead (see `restoreOverflow` below).
+
+3. `removeOverflow`'s `deleteContents` would drop the
+   moved content forever. In the clone model that was
+   fine -- source still held a pristine copy. In the
+   move model, source needs the overflow content back so
+   the next page can render it. Fix: replace with
+   `restoreOverflow` -- `extractContents` the overflow
+   range, walk the fragment depth-first collecting leaf
+   elements, and reinsert each leaf at its stashed
+   `_srcParent` / `_srcNextSibling` position. For the
+   boundary leaf that's partially overflowing,
+   `extractContents` produces a shallow clone of the
+   leaf in the fragment; we inherit its source position
+   via `source.indexOfRefs[ref]` (which still points at
+   the original-now-in-dest, which carries the stash).
+   Reverse-order iteration so each leaf's `_srcNextSibling`
+   target is back in source by the time we insert.
+
+### The bug that taught the real story
+
+First pass rendered the book to 1740 pages -- 89 more
+than the 1651-page baseline. Content was byte-identical
+modulo timestamps. Per-page char counts in the FAQ
+section showed pages 127+ with only ~50-500 chars each:
+
+```
+[BL p127] 3045 chars      [EX p127] 438 chars
+[BL p128] 3732 chars      [EX p128] 185 chars
+```
+
+Some FAQ pages had a single short paragraph. Instrumenting
+`shouldBreak` revealed it was returning true on every
+non-first yield inside the FAQ article:
+
+```
+[instrument] shouldBreak true: tag=P  ref=6bv pba=- prevNode=ARTICLE
+[instrument] shouldBreak true: tag=B  ref=6bx pba=- prevNode=ARTICLE
+[instrument] shouldBreak true: tag=P  ref=6by pba=- prevNode=ARTICLE
+... (one per FAQ paragraph)
+```
+
+The `<p>` elements have no `data-break-before` and no
+`data-previous-break-after`, so the fire is via
+`needsPageBreak(node, previousNode)` -- which checks
+whether `node`'s effective `data-page` differs from
+`previousNode`'s.
+
+`previousNode` is computed via
+`nodeBefore(node, limiter)`, which walks
+`node.previousSibling` then climbs via `parentNode` if
+no significant sibling exists. In the move model, after
+the previous yield was moved out of source, the current
+yield's `previousSibling` is `null` (the previous one no
+longer lives in source). The climb continues up:
+FAQ article (no `data-page`) -> looks at its previous
+sibling -> finds the **part-divider article** sitting
+right before the FAQ article in source, which DOES carry
+`data-page="divider"` (set by processBreaks for the CSS
+`page: divider;` rule on `article.part-divider`).
+
+So `needsPageBreak` saw a transition from
+`page="divider"` to (effectively) no page, fired true,
+and the chunker started a fresh page for every paragraph
+in the FAQ section. The chapter article's normal
+"siblings share the same effective page-name" property
+broke because the sibling-walk now escapes the chapter
+into the prior part-divider.
+
+### Fix: track previousLeaf in renderTo
+
+The chunker already knows the right answer: the last
+leaf it actually appended this page. Threaded through
+`shouldBreak` as a third argument, used by the
+`needsPageBreak` branch only (`needsBreakBefore` and the
+`parentBreakBefore` logic still use `nodeBefore`):
+
+```js
+let _moveLastLeaf = null;
+// ... in the loop ...
+if (hasRenderedContent &&
+    this.shouldBreak(node, start, _moveLastLeaf)) { ... }
+// ... after append ...
+if (!shallow) _moveLastLeaf = node;
+```
+
+In `shouldBreak`:
+
+```js
+let pageBreakRef = previousLeaf || nodeBefore(node, limiter);
+return ... || needsPageBreak(node, pageBreakRef);
+```
+
+With that, page count went 1740 -> 1653 (within 2 of
+baseline) and per-page content matched. PDF
+byte-equivalent to baseline within timestamp drift.
+
+### Profile diff
+
+Both runs `--detach-pages --cpu-profile --cpu-sampling
+100`, sample-time absolute, single run each (wall-clock
+on this machine is too noisy to be a useful signal --
+see *Methodology: compare profiles, not wall-clock* in
+[02-finalizepage.md](02-finalizepage.md)):
+
+| function | baseline | move | Δ |
+| --- | --- | --- | --- |
+| `getBoundingClientRect` | 3539 ms | 4036 ms | **+497** |
+| `appendChild` | 137 ms | 390 ms | **+253** |
+| `restoreOverflow` (new) | -- | 168 ms | +168 |
+| `removeChild` | 1536 ms | 1635 ms | +99 |
+| `insertBefore` | <50 ms | 87 ms | ~+87 |
+| `getNodeWithNamedPage` | <50 ms | 108 ms | ~+85 |
+| `afterPageLayout` (AtPage) | 105 ms | 182 ms | +77 |
+| `(program)` | 2196 ms | 2266 ms | +70 |
+| `Layout` ctor | 23 ms | 31 ms | +8 |
+| `cloneNode` | 146 ms | <130 ms | **-146** |
+| `removeOverflow` | 124 ms | -- (replaced) | -124 |
+| **samples** | **17,481** | **19,590** | **+2,109** |
+| **CPU work** | **9.48 s** | **10.74 s** | **+1.26 s** |
+
+Net **+1.26 s of CPU work** -- the change is a clear
+regression in the opposite direction from the prediction.
+
+### Why the prediction was wrong
+
+The cloneNode self-time saving (-146 ms) shows up as
+expected, but three structural costs dwarf it:
+
+1. **`appendChild` on an attached node is roughly 2x
+   the cost of `appendChild` on a fresh clone (+253 ms).**
+   A move is internally detach-from-source-parent +
+   attach-to-dest-parent; both touch Blink's child-list
+   bookkeeping. cloneNode produces an unparented node,
+   so the subsequent attach is one-sided. Intrinsic to
+   any move-based design -- no implementation choice
+   avoids it.
+
+2. **Each move dirties Blink's layout state more than
+   each clone does, distributing cost into gBCR
+   (+497 ms).** The increase is spread across every
+   gBCR call site -- `Page.create` (+225 ms),
+   `hasOverflow` (+152 ms), `Layout` ctor (+58 ms),
+   `afterPageLayout` (+31 ms), `addResizeObserver`
+   (+31 ms) -- not localized to any new code. Each
+   gBCR call flushes pending mutations; with every move
+   counting as two mutations vs one for clone+append,
+   each flush has more to do. This is the same migration
+   pattern that *Attempt B: memoize `Page.create`'s gBCR*
+   in [02-finalizepage.md](02-finalizepage.md) documented:
+   DOM mutation cost doesn't go away by elimination, it
+   migrates to whichever frame next forces a layout flush.
+
+3. **The extract-and-restore cycle adds ~340 ms of new
+   JS work.** `restoreOverflow` (168 ms) builds an
+   `extractContents` fragment + walks it for leaves +
+   inserts each back into source. `previousLeaf` makes
+   `shouldBreak` call `getNodeWithNamedPage` (108 ms)
+   on every leaf yield (it climbs parent chains looking
+   for `data-page`). `insertBefore` (87 ms) is the
+   per-restore reinsertion.
+
+The deeper structural reason: paged.js's break-and-
+resume model touches each source leaf O(pages-spanning-
+that-leaf) times in the move model -- moved into page N,
+extracted to the fragment, reinserted into source,
+moved into page N+1. Each touch is a DOM mutation. The
+clone model touches each node O(1) times -- allocated
+once, attached, thrown away with the page. Cumulative
+mutation count is structurally higher under moves.
+
+The cloneNode time the profile attributes to its native
+frame is just the *allocator* portion of cloning work --
+not the total cost of "duplicating a subtree". The rest
+hides in V8 / Blink native frames not labeled
+`cloneNode`, and that rest doesn't disappear when you
+switch to moves; it shows up as appendChild +
+invalidation cost instead.
+
+### Where this leaves the picture
+
+Reverted. The cumulative table from earlier phases
+([03-puppeteer-bump-findref.md](03-puppeteer-bump-findref.md))
+is unchanged. No row added.
+
+The pattern this attempt taught is the inverse of the
+"distributed savings often exceed direct estimates"
+heuristic the earlier phases documented: sometimes a
+change with a direct cost saving has bigger distributed
+*regressions* that aren't visible until you measure.
+The cloneNode saving was real; the appendChild + gBCR +
+restoreOverflow overhead was bigger.
+
+The only design that would avoid all three costs is one
+that never re-moves the same node -- a single-pass
+paginator with no break-and-resume. That's not paged.js;
+it's a different algorithm. Not a small refactor.
+
+The buffer variant (pre-clone source once at startup,
+move from buffer to dest) was considered and not
+prototyped: it'd shift the cloneNode allocation cost to
+one big startup call but every per-page move would
+still hit the same appendChild + gBCR dynamic that ate
+the savings here. No structural win.
+
+This experiment also clarifies why *Profiling pdf-lib's
+load* (in
+[01-baseline-and-detach.md](01-baseline-and-detach.md))
+and *Can we make `removeChild` cheaper?* (in
+[03-puppeteer-bump-findref.md](03-puppeteer-bump-findref.md))
+saw allocation savings show up as wall-clock gains:
+those operations didn't have a Blink layout-tree
+mutation step downstream. Mutations are where the cost
+that *looks* like JS allocation actually lives in this
+codebase.
+
+## Cracking `(program)` open with a Blink-category trace
+
+The cpu profile's `(program)` row sat at ~2.2 s (23 %) of
+render and resisted attribution -- `find-callers.mjs` puts
+it directly under `(root)`, the V8 sampler's structural
+floor for "isolate is on-CPU but no JS frame on top." To
+see *what* native code was running there, the harness gained
+a `--tracing` flag and a companion `analyze-trace.mjs`.
+
+The flag wraps the render phase in `page.tracing.start()`
+with Blink-relevant categories (`devtools.timeline`,
+`disabled-by-default-devtools.timeline`, `blink`, `v8`,
+`v8.execute`, `disabled-by-default-v8.cpu_profiler`) and
+writes `trace.json` to the results folder. The
+`v8.cpu_profiler` category embeds V8 sampling-profile data
+as `Profile` / `ProfileChunk` events inline with the Blink
+trace events, so the single trace file is *hybrid*: loaded
+in Chrome DevTools Performance or [ui.perfetto.dev](https://ui.perfetto.dev)
+it renders JS call stacks aligned with Blink events on the
+same timeline (the de facto answer to "what was `(program)`
+doing?"). Cost: ~2x file size (e.g. 22 MB -> 52 MB on the
+1651-page book) and ~0.4 s wall-clock for the extra sampler
+work -- both noise on the analysis side.
+
+`analyze-trace.mjs` walks the trace's complete-phase
+events on `CrRendererMain`, computes self-time per event
+name via a nested-event stack walk (same shape as
+`analyze-profile.mjs` for cpuprofiles), and prints a
+top-N table. A `--children <name>` mode breaks any
+parent event into its direct callees, mirroring
+`find-callees.mjs`. It ignores the embedded V8 cpu samples
+-- those are consumed by the viewers above (DevTools /
+Perfetto) or, for terminal use, by `analyze-hybrid.mjs`,
+which combines V8 sample stacks with Blink event nests
+into a single bottom-up / callees view.
+
+### What's on the main thread
+
+Top events by self-time on a fresh `--detach-pages
+--no-timing --render-only --tracing` run, 1651-page book,
+9.07 s render:
+
+| event                                    | self_ms | self_% |
+| ---------------------------------------- | ------- | ------ |
+| `RunMicrotasks`                          | 3039.42 | 33.5 % |
+| `LocalFrameView::performLayout`          | 1800.31 | 19.9 % |
+| `Document::recalcStyle`                  | 1785.55 | 19.7 % |
+| `InlineNode::ShapeTextIncludingFirstLine`|  526.64 |  5.8 % |
+| `Document::rebuildLayoutTree`            |  484.88 |  5.4 % |
+| `FunctionCall`                           |  285.89 |  3.2 % |
+| `v8.callFunction`                        |  251.48 |  2.8 % |
+| `Blink.CompositingInputs.UpdateTime`     |  130.77 |  1.4 % |
+| `Blink.PrePaint.UpdateTime`              |  118.90 |  1.3 % |
+| `Document::updateStyle`                  |  101.65 |  1.1 % |
+| ... 189 smaller events ...               |         |        |
+
+Mapping these onto the cpu profile's labels:
+
+| cpu profile row | trace decomposition |
+| --- | --- |
+| `getBoundingClientRect` self 3.7 s | `performLayout` 1.8 s + `recalcStyle` 1.8 s -- the layout flush gBCR triggers, which the cpu profile lumps under the native frame. |
+| `removeChild` self 1.6 s | `rebuildLayoutTree` 0.5 s + portions of `recalcStyle` / `performLayout` -- each removeChild dirties style and layout. |
+| `(program)` self 2.2 s | `RunMicrotasks` 3.0 s mostly. The cpu profile attributes a chunk of this to neighbour rows; what's left under `(program)` is the V8 runtime plumbing that has no JS frame on top. |
+| `(garbage collector)` 100 ms | Sum of `V8.GC_*` events ≈ 135 ms. |
+
+So `(program)` is essentially **the V8 runtime inside a
+microtask continuation**. The natural follow-up is "which
+microtask, and what's it doing?"
+
+### Inside `RunMicrotasks`
+
+`--children RunMicrotasks` shows the parent fired only
+**15 times** across the whole render, totalling 7.14 s:
+
+```
+parent: RunMicrotasks  hits: 15  total: 7142.49ms  self: 3039.42ms (42.6%)
+
+   total_ms  total_%     hits   child
+   --------  -------   ------   --------------------------------
+   3442.01   48.19%    39437   Document::UpdateStyleAndLayout
+   3039.42   42.55%       15   (self / unattributed)
+    547.98    7.67%   181106   v8.callFunction
+     50.99    0.71%      892   Blink.Style.UpdateTime
+     34.88    0.49%      205   V8.StackGuard
+     17.05    0.24%        6   MinorGC
+```
+
+Listing the 15 events by duration:
+
+```
+rm[0]   70.89 ms   -- one early-render burst (the parser)
+rm[1..3]  < 1 ms  -- empty-trigger settle ticks
+rm[4]  7071.14 ms  -- THE render loop
+rm[5..14]  < 1 ms each  -- post-render cleanup
+```
+
+**One event accounts for 99.0 % of the parent total.**
+rm[4] envelops essentially the whole render. V8 batches
+the ~6 `await` boundaries inside `Chunker.flow()`
+(beforeParsed / filter / afterParsed / loadFonts /
+render / afterRendered) -- all of which Phase 1 of
+*Stripping headless-irrelevant async machinery* in
+[04-sync-and-inner-loop.md](04-sync-and-inner-loop.md) turned
+into `await undefined` fast-paths -- into a single drained
+microtask continuation. There is
+**no per-page microtask cost**. The async stripping did
+its job.
+
+### The 181,106 `v8.callFunction` callbacks
+
+The first thing that looked like a smoking gun --
+"181k dispatches sounds per-page-shaped" -- turned out
+to be **one DOM walk**. Aggregating FunctionCall events
+by `args.data.functionName + lineNumber`:
+
+```
+hits      dur_ms   functionName:line
+181041    296.54   (anon):32455  (paged.browser.js)
+     2      0.25   request.onload:27495
+```
+
+paged.browser.js:32455 is `WhiteSpaceFilter.filter`'s
+TreeWalker callback:
+
+```js
+filterTree(content, (node) => {
+    return this.filterEmpty(node);
+}, NodeFilter.SHOW_TEXT);
+```
+
+The walker visits every text node in the parsed
+document and calls the lambda. For our 5.5 MB book
+that's 181,041 invocations, all clustered in the first
+685 ms of rm[4]. The `(node) => this.filterEmpty(...)`
+arrow is allocated once but called from C++→JS 181k times,
+so V8 emits a `v8.callFunction` event for each invocation.
+
+These aren't 181k microtasks. They're 181k synchronous
+TreeWalker callbacks nested inside the one big
+continuation. The "callbacks per page" framing was a
+mirage produced by dividing 181k by page count.
+
+### What's actually in `(program)`'s 2.2 s
+
+Triangulating the trace and cpu profile:
+
+- **~1.7 s** is V8 dispatch glue for the 181k filter
+  walk callbacks + remaining native→JS transitions
+  inside the continuation. V8 charges this to
+  `RunMicrotasks` self in the trace; the cpu profile
+  splits it between `(program)` and rows like `v8.callFunction`.
+- **~0.3 s** is V8 IC / inline-cache miss handling on
+  the per-page hot path. Each polymorphic call site
+  pays a stub-call indirection that lands in `(program)`.
+- **~0.1 s** is Blink microtask checkpoint code -- the
+  auto-style-and-layout pass that fires whenever a
+  microtask drains. The `Document::UpdateStyleAndLayout`
+  events under `RunMicrotasks` (3.44 s) attribute the
+  work *itself* to named Blink rows; the C++ glue
+  bracketing each call lands in `(program)`.
+- The remainder is V8 scheduler bookkeeping, microtask
+  queue drain machinery, and small unnamed natives.
+
+None of this is a *per-page* cost. Reducing further
+would require either (a) eliminating the filter walk,
+or (b) reducing the per-page hot path's native→JS
+transition count -- which is dominated by gBCR-driven
+layout flushes that we've already pushed against
+unsuccessfully (Attempts B, D from *What happened when
+we tried `createBreakToken` dedup* in
+[02-finalizepage.md](02-finalizepage.md)).
+
+### The "actionable finding" that wasn't: WhiteSpaceFilter
+
+The whitespace filter walk costs **~685 ms once per
+render** -- 296 ms inside the JS callback bodies plus
+~390 ms in TreeWalker dispatch overhead. The initial
+read was "this is doing nothing useful for compressed
+HTML, short-circuit it." Wrong on both counts.
+
+Branch-counting the filter via a one-shot probe (count
+every branch in `filterEmpty`, dump to the harness
+console):
+
+```
+total:        181,106  every text node visited
+  length === 0:       0
+  length === 1:  38,685  (21.4%)  collapsed inter-element spaces
+  length > 1, !ignorable: 101,930  (56.3%)  real content -- hot path
+  length > 1, ignorable:  40,491  (22.4%)  whitespace-only, body runs
+    inside <pre>:        3,408   no-op (REJECT)
+    middle position:    27,901   textContent = " " (mutated)
+    left edge:           5,405   removeChild (accepted)
+    right edge:          3,777   removeChild (accepted)
+    orphan:                  0
+```
+
+**22.4 % of calls entered the body** and 37,083 actual
+DOM mutations happened: 9,182 nodes removed +
+27,901 nodes overwritten to single spaces. Far from
+zero.
+
+The premise was based on a misreading of html-compress:
+the plugin does collapse inter-element whitespace, but
+the `:site, :pre_render` gate that picks which pages it
+processes explicitly excludes `book.html` (which uses
+the minimal `book-combined` layout that doesn't reach
+`vendor/compress`;
+[docs/_plugins/html-compress.md](../../docs/_plugins/html-compress.md)
+calls this out). Source indentation is preserved in
+the PDF input, so paged.js sees the raw multi-char
+whitespace text nodes. The filter is load-bearing --
+its mutations are what subsequent chunker walkers
+rely on to skip whitespace cheaply.
+
+The 0.83 % of calls that exceeded 4 µs in the trace's
+dur histogram came from this body running; the
+histogram undercounted body entries because the
+short-branch (`closest("pre")` → REJECT) takes only
+~2-3 µs, indistinguishable from the hot path in the
+0-4 µs buckets. Branch counters were needed to reveal
+the true split.
+
+There's still optimisation headroom (the per-call
+TreeWalker dispatch is ~3 µs of which only ~1.5 µs is
+the body), but it requires changing the algorithm
+rather than skipping it: e.g. a hand-rolled JS recursion
+that avoids the C++→JS transition per node, or
+folding WhiteSpaceFilter + CommentsFilter + ScriptsFilter
+into a single TreeWalker pass with `SHOW_TEXT | SHOW_COMMENT`
+and a dispatcher. Net saving probably ~300-400 ms once
+per render; not investigated.
+
+The methodology lesson: a histogram of per-call dur
+**cannot** distinguish a fast body branch from a hot
+path -- both run in 2-3 µs on V8. Branch
+instrumentation is the only way to count what each
+call actually did. The histogram suggested "0.8 %
+body entries"; reality was 22.4 %.
+
+### And we did fix it, on the Jekyll side
+
+The premise that motivated the original "actionable
+finding" -- that book.html should already be
+whitespace-collapsed when paged.js sees it -- was true
+in spirit, just wrong about whether it was being done.
+The fix landed in two parts:
+
+1. **Extend `html-compress.rb` to book.html.** The
+   layout-chain precompute now explicitly adds
+   `book-combined` to `@compress_layouts` at the end of
+   `precompute_compress_layouts!`. book.html therefore
+   passes through `compress!` once per build (~480 ms
+   of `String#split` work on the ~5.5 MB document), and
+   paged.js sees a document with inter-element
+   whitespace already collapsed to single spaces.
+
+2. **Reorder hook priorities** so that adding compress
+   to book.html composes cleanly with the other
+   `:pages, :post_render` plugins. The original
+   `:high`-priority compress ran *before*
+   `book-href-rewrite` -- whose landing-heading strip
+   removed `<h2>` blocks from three chapter openings,
+   leaving the (already-collapsed) single spaces on
+   either side adjacent and producing literal `>  <`
+   blobs. The fix is a three-tier convention: mutators
+   at `:high` (run first), compress at `:normal` (the
+   cleanup), readers at `:low` (snapshot final bytes).
+   See `_plugins/html-compress.md` for the full table.
+
+Verified: 0 outside-pre multi-whitespace runs in the
+regenerated book.html (was 3 with the
+landing-heading-strip artifacts; was 37,087 without
+compress at all). Branch-counting the WhiteSpaceFilter
+after the fix shows body entries drop from ~40 k to
+the 3,408 in-pre cases that the filter is structurally
+required to visit (and immediately REJECTs via
+`closest("pre")`). DOM mutations drop from ~37 k to 0.
+PDF output is byte-equivalent within timestamp drift.
+
+Net wall-clock is approximately neutral on full builds
+(~480 ms added to Jekyll, ~300-500 ms saved at paged.js
+render time), and a small win for incremental Jekyll
+workflows that skip the PDF (`also_build_pdf: false`):
+the compress cost is paid once per Jekyll build, the
+render saving is realised on every PDF build, and decoupling
+the two is the structural improvement.
+
+A ruby-prof A/B (post-change vs pre-change with a
+single stashed-changes revert) confirmed that the only
+attributable Jekyll-side cost is exactly one extra
+`compress!` invocation (837 → 838) and its downstream
+`String#split` calls (+819 from book.html's non-pre
+segments). No plugin's call count or self-time changed
+beyond the noise floor; the priority shuffle is
+CPU-invariant for everything except the new compress
+pass on book.html.
+
+### What the trace doesn't change
+
+Nothing about the cpu profile's bottom-up table is
+wrong; the trace just resolves what `(program)` masked.
+After this exercise, the menu of remaining levers is
+unchanged:
+
+- `pageRanges` sharding for the generate phase (biggest
+  untried knob, generate is now the larger phase).
+- WhiteSpaceFilter -- the trace and a follow-up cpu-
+  profile A/B (see next section) eventually showed this
+  *is* skippable for our pipeline once html-compress has
+  done the work at Jekyll time. Worth ~600 ms / 6 %.
+- Everything else lives below the noise floor.
+
+The cpu profile's `(program)` row isn't a structural
+smell or a missed microtask -- it's the fixed cost of V8
+running the JavaScript we already have, accounted for
+honestly by the trace and accounted for opaquely by
+the JS sampler.
+
+## Disabling the filter outright: paired cpu-profile A/B
+
+The "actionable finding that wasn't" + "and we did fix
+it, on the Jekyll side" pair above closed with two
+conclusions:
+
+1. WhiteSpaceFilter does real work on book.html
+   (37k DOM mutations pre-compression, 0 post-).
+2. Post-compression the filter is essentially a no-op
+   visit over 181k text nodes, and skipping it doesn't
+   save measurable wall-clock -- a 3+3 wall-clock A/B
+   showed 8.78 s avg with filter vs 8.53 s without, well
+   inside the 1.17 s within-variant noise band.
+
+Conclusion (1) is correct. Conclusion (2) was wrong --
+specifically the "no measurable saving" claim and the
+flush-migration explanation I attached to the ~+180 ms
+gBCR move that appeared in a single-run profile pair.
+
+A reader pointed out the flush-migration reasoning was
+incoherent: `WhiteSpaceFilter.filter` runs *once* in
+`Chunker.flow()` *before* any page is created. The body
+of `filterEmpty` reads `textContent`, walks parents via
+`closest("pre")`, and walks siblings -- none of which
+read layout-flushing properties (`gBCR`, `offsetTop`,
+computed style, etc.). The filter performs no flush that
+could migrate. Whatever the +180 ms gBCR move in the
+single-run pair was, it wasn't "the filter's flush load
+deferring to the next gBCR." It was single-run noise on
+a 38 % row -- which has a much wider noise band than the
+"50-150 ms for sub-1 % rows" methodology note in
+[02-finalizepage.md](02-finalizepage.md) covers.
+
+### The proper A/B
+
+Three filter-on (A) and three filter-off (B) cpu-profile
+runs, interleaved A1 B1 A2 B2 A3 B3 so system-load
+variance hits both sides equally. The probe is a one-line
+`return;` at the top of `WhiteSpaceFilter.filter` --
+skip the TreeWalker entirely. Toggle is a single edit
+between runs. Both states are otherwise identical
+(post-compression book.html, current bundle).
+
+Per-run totals from
+[`perf/ab-aggregate.mjs`](../ab-aggregate.mjs):
+
+| run | total CPU |
+| --- | --- |
+| A1 (filter ON)  | 11,120 ms |
+| A2 (filter ON)  | 10,270 ms |
+| A3 (filter ON)  |  9,727 ms |
+| **A mean**      | **10,372 ms** |
+| B1 (filter OFF) |  9,744 ms |
+| B2 (filter OFF) | 10,189 ms |
+| B3 (filter OFF) |  9,180 ms |
+| **B mean**      |  **9,705 ms** |
+| **Δ (B - A)**   |   **-668 ms (-6.4 %)** |
+
+The within-group ranges are ~1.4 s (A) and ~1.0 s (B),
+so the -668 ms total-CPU delta sits at roughly 1 σ of
+within-variant spread. By itself, that's a soft signal.
+
+But per-row breakdown is tighter:
+
+| row | A mean ± sd | B mean ± sd | Δ |
+| --- | --- | --- | --- |
+| `getBoundingClientRect`         | 4128 ± 309 | 3791 ± 163 | **-338 ms** |
+| `(program)`                     | 2243 ± 56  | 2328 ± 173 | +85 ms (noisy) |
+| `removeChild`                   | 1619 ± 63  | 1564 ± 43  | -55 ms |
+| `afterPageLayout` @ paged.js    |  150 ± 26  |  119 ± 17  | -32 ms |
+| **`filterTree` self**           | **88 ± 14** |  **2 ± 1** | **-86 ms** |
+| `(garbage collector)`           |  103 ± 6   |   92 ± 4   | -11 ms |
+| `handleAlignment`               |   70 ± 5   |   56 ± 7   | -14 ms |
+| `create` (`Page.create`)        |   66 ± 7   |   50 ± 4   | -15 ms |
+| `sortDisplayedSelectors`        |   60 ± 10  |   46 ± 1   | -14 ms |
+| **`filterEmpty` self**          | **37 ± 2** |    **0**   | **-37 ms** |
+
+Direct attribution (the filter rows that vanish in B):
+
+- `filterTree` self: -86 ms
+- `filterEmpty` self: -37 ms
+- ~123 ms
+
+Indirect attribution (rows that shrink in B despite
+unchanged call counts -- see the trace data above
+where Document::UpdateStyleAndLayout, recalcStyle and
+performLayout all run ~14-15 % cheaper per call with
+filter off):
+
+- `getBoundingClientRect`: -338 ms
+- `removeChild`: -55 ms
+- `afterPageLayout @ paged.js:30458` (paged.js core): -32 ms
+- `create`: -15 ms
+- `handleAlignment`: -14 ms
+- `sortDisplayedSelectors`: -14 ms
+- `(garbage collector)`: -11 ms
+- smaller rows: ~50 ms
+- ~529 ms
+
+Direct + indirect ≈ 652 ms, in the neighbourhood of
+the -668 ms total-CPU delta. They corroborate.
+
+### Why the filter has indirect cost
+
+The single-trace measurement above (filter-off trace
+captured for the same render) made the indirect path
+visible: with filter off, `Document::UpdateStyleAndLayout`
+total dropped by 574 ms across an *unchanged* 39,437
+call count -- ~14 µs less per call. `recalcStyle` and
+`performLayout` similarly dropped ~14 % per call.
+Plausibly:
+
+- V8's polymorphic inline caches stay warmer on the
+  per-page hot path when 181 k extra C++→JS
+  dispatches haven't been churning them.
+- Blink's main-thread scheduler has fewer task
+  boundaries to bookkeep across.
+- Allocator/GC pressure is lower (the filter walk
+  allocates per-callback closures and intermediate
+  strings, even when each callback just returns
+  FILTER_REJECT).
+
+None of those are "the filter triggers a layout
+flush." Layout work *itself* gets cheaper because the
+ambient V8/Blink state is less polluted. Same per-call
+mechanics, slightly faster main-thread context.
+
+### The fix: config flag, default off
+
+`window.PagedConfig.runWhitespaceFilter` gates the
+walk. Default is undefined (falsy) -- our pipeline runs
+`html-compress` on book.html, so the filter has
+nothing to do and skipping it saves the ~600 ms.
+
+Anyone running paged.js against an uncompressed
+document can set the flag before `PagedPolyfill.preview()`
+to opt back in. The class itself is unchanged so the
+opt-in path is byte-equivalent to the original.
+
+The opt-in semantic is the conservative choice: paged.js
+upstream and many downstream users feed it untouched
+HTML (with inter-element indentation surviving), where
+the filter does meaningful cleanup. Disabling it for
+*every* caller of this bundle would be a regression for
+those use cases. Disabling it by default for *our*
+pipeline is fine because we control the input
+end-to-end.
+
+Cost: zero per-page work (the gate is one `&&`-chain
+check at startup), structural correctness for clean
+documents, opt-in safety valve for everyone else.
+
+### Methodology note
+
+The wall-clock A/B was correct in claiming "the saving
+is below the wall-clock noise floor for short N." It
+was wrong in concluding "therefore no saving exists."
+Two corrections:
+
+1. Aggregate CPU work across paired profiles. Wall-clock
+   noise is ~1 s per run on this machine; CPU sample
+   totals vary by ~1 s per run too, but the row-by-row
+   self-time deltas can be much tighter. The
+   `filterTree` row goes from 88 ms (sd 14) to 2 ms (sd
+   1) -- a 6 σ shift. Per-row analysis can see signals
+   that per-run totals lose.
+
+2. Use *enough* paired runs that within-group SD lets
+   you compute mean ± SD honestly. 3+3 is the bare
+   minimum (gives 1 σ confidence on row-level deltas
+   for things that change by 5+ σ). 5+5 or 10+10 would
+   tighten the gBCR delta confidence further -- worth
+   doing for finer signals.
+
+The probe + aggregator are reusable
+([`perf/ab-aggregate.mjs`](../ab-aggregate.mjs)): point at
+6 `ab-*.cpuprofile` files and it prints the mean ± SD
+table. Pattern fits any future "does this change save
+CPU?" question where wall-clock noise is the obstacle.
diff --git a/perf/notes/06-microtasks-pageranges-css.md b/perf/notes/06-microtasks-pageranges-css.md
new file mode 100644
index 0000000..cbfe3c4
--- /dev/null
+++ b/perf/notes/06-microtasks-pageranges-css.md
@@ -0,0 +1,564 @@
+# RunMicrotasks down to zero, pageRanges, and CSS cost attribution
+
+Following the `RunMicrotasks` row down to zero by converting the chunker's per-page loop fully sync; ruling out `pageRanges` sharding for `generate` (memory profile makes it impractical); and a CSS cost-attribution sweep showing print.css's individual sections are all below the noise floor.
+
+## Following `RunMicrotasks` down to zero
+
+The Blink-trace investigation in
+[05-blink-trace.md](05-blink-trace.md) pinned the cpu
+profile's `(program)` row to V8 running JS inside a
+microtask continuation. With the WhiteSpaceFilter gone the
+`--children RunMicrotasks` breakdown still showed one
+`rm[4] = 6262 ms` event enveloping essentially the
+whole render -- 15 hits total, 99 % concentrated in one
+batched drain. That raised a sharper question: if the
+per-page hot path is sync (Phase 1 + 2 of *Stripping
+headless-irrelevant async machinery* in
+[04-sync-and-inner-loop.md](04-sync-and-inner-loop.md)),
+why is *any* of the render running inside a microtask scope?
+
+### What was still async, and what it cost us
+
+The *What's still async, and why* inventory in
+[04-sync-and-inner-loop.md](04-sync-and-inner-loop.md) was
+honest about the surviving await sites at
+that point:
+
+- `Chunker.flow()` -- async wrapper, awaited
+  `beforeParsed` / `afterParsed` / `afterRendered` hook
+  triggers, `loadFonts()`, and `chunker.render()`.
+- `Chunker.render()` -- thin async wrapper around the
+  sync `renderer.next()` loop, kept so `flow()` could
+  `await` it.
+- `Chunker.clonePage()` -- async, awaited three
+  per-page hooks. Footnotes-only caller, dead path for
+  our content but live in the bundle.
+- `PagedPolyfill.preview()` -- async, awaited
+  `beforePreview` / `afterPreview` hooks plus
+  `polisher.add` and `chunker.flow`.
+- `Polisher.add()` / `Polisher.convertViaSheet()` /
+  `Sheet.parse()` -- async chain to fetch and parse
+  external stylesheets. `Polisher.add` did
+  `Promise.all` over the inputs.
+- `Chunker.loadFonts()` -- returned `Promise.all` of
+  `fontFace.load()` for any face not yet in state
+  "loaded".
+- `request()` -- async XHR + `Promise` wrapper, used by
+  the polisher chain to fetch each `<link rel="stylesheet">`
+  URL.
+
+Cost of each: small. Cost of all of them together: V8
+sees an unbroken await chain from `page.evaluate(async
+() => { await PagedPolyfill.preview(); })` down to
+`document.fonts.ready` (the one genuinely-async
+dependency in the chain). When that promise resolves V8
+schedules a microtask to resume `flow()`. Phase 1 + 2
+of the async cleanup made the *body* of the resumed
+function execute synchronously, so once it resumes it
+runs ~6.2 s straight to the end of the render. V8
+correctly attributes the whole continuation to the
+`RunMicrotasks` host frame, since that's the C++ frame
+on the stack while the resumed JS runs.
+
+So `RunMicrotasks` self-time being 2.89 s wasn't a
+sign of microtask overhead -- it was the bookkeeping
+label V8 puts on continuation-style work. Every named
+Blink event nested inside (`Document::UpdateStyleAndLayout`,
+`recalcStyle`, `performLayout`, etc.) appeared in the
+trace as a child of `RunMicrotasks`. Same shape applied
+in the cpu profile: `(program)` is the catch-all bucket
+V8 picks when no JS frame sits on top of the stack at
+sample time, and a microtask continuation is exactly
+that condition.
+
+The bucket name was misleading, but the cost itself was
+real -- the JS *running* inside the continuation
+*was* paged.js doing its per-page work. No "microtask
+plumbing overhead" to slim down. The only way to remove
+the `RunMicrotasks` attribution was to stop wrapping the
+render in a microtask continuation entirely -- i.e.,
+make the whole chain synchronous so V8 has no async
+scope to attribute to.
+
+### Why this is OK for our pipeline (and not for upstream)
+
+Upstream paged.js needs the async machinery. Its target
+deployment is an interactive browser page: real
+stylesheet fetches over HTTP (genuinely async), font
+loads against the OS (genuinely async), user-registered
+handlers that may load external resources or do
+expensive work between page renders (async-friendly to
+keep the page responsive). The await chain is the
+canonical pattern for "yield to the browser between
+expensive steps so the UI thread can paint."
+
+Our pipeline has none of those constraints:
+
+- `page.goto(url, { waitUntil: 'load' })` settles
+  *before* paged.js is invoked. Every font, image, and
+  stylesheet referenced by `<link>` / `@font-face` /
+  `<img>` is already loaded by the time the render
+  starts. The async checks are no-ops.
+- The headless renderer has no compositor coordinating
+  with us, no paint budget to respect, no user looking
+  at the page. Blocking the main thread for 8 s is
+  fine -- nobody's watching.
+- All registered handlers in our build are synchronous.
+  The `_assertSync` guard from the Phase 1/2 cleanup
+  has been in place for the per-page hot path for a
+  while; we just hadn't extended the pattern to the
+  once-per-render hooks.
+- The stylesheet fetches the polisher does are local
+  `file://` URLs. Sync XHR resolves them in microseconds.
+
+So the entire async surface in paged.js -- which
+upstream needs -- is, for our specific use case, the
+opposite of helpful: it pushes work into microtask
+continuations that show up as `RunMicrotasks` in the
+trace and `(program)` in the cpu profile, instead of
+landing under honest names like `RunTask` and
+`EvaluateScript`.
+
+### The conversion
+
+Nine functions in `docs/lib/paged.browser.js` switched
+from `async` to plain sync, marked
+`[PATCH: sync-chain]` at each site:
+
+| function | what changed |
+| --- | --- |
+| `request()` | Async XHR + `new Promise` + `Response` wrapper → sync XHR (`open(...,false)`) returning body text directly. Both callers (`Polisher.add` / `convertViaSheet`) only ever consumed `response.text()` (itself async per spec), so returning text skips that boundary too. |
+| `Sheet.parse()` | Three `await hook.trigger(...)` → `_assertSync(triggerSync(...))`. CSS-parser hooks all sync in our build. |
+| `Polisher.convertViaSheet()` | Drop awaits on `sheet.parse` / `request` / recursive `convertViaSheet`. |
+| `Polisher.add()` | Drop the `Promise.all` + then-chain entirely. Walks arguments once, feeds each through the sync pipeline. |
+| `Chunker.loadFonts()` | `Promise.all(fontFace.load())` → sync walk of `document.fonts` that throws if any face's `status !== "loaded"`. The throw is a safety net; `page.goto({waitUntil:'load'})` settles fonts in practice. |
+| `Chunker.clonePage()` | Three per-page hook awaits → `_assertSync`. Cold path (Footnotes-only). |
+| `Chunker.render()` | Strip `async`. Body was already sync after the Phase 1/2 cleanup. |
+| `Chunker.flow()` | Strip `async`; five await sites → sync calls / `_assertSync`. |
+| `PagedPolyfill.preview()` | Strip `async`; two hook awaits → `_assertSync`; drop awaits on `polisher.add` / `chunker.flow`. |
+
+Plus the two external callers in
+[`perf/measure.mjs`](../measure.mjs) and
+[`docs/render-book.mjs`](../../docs/render-book.mjs):
+both did `page.evaluate(async () => { await
+window.PagedPolyfill.preview(); })`. The inner callback is
+now a plain sync arrow; the outer `await` is just the
+CDP round-trip puppeteer needs to ferry control back.
+
+The `_assertSync` helper (from the earlier
+"sync chain end-to-end through the per-page hot path"
+work) is the load-bearing safety net throughout: if any
+future hook handler returns a thenable, the chain
+throws with a useful error message instead of silently
+swallowing async work. The contract is now:
+
+> Every hook handler in this bundle is sync. Every
+> external resource referenced by the document is
+> loaded before `PagedPolyfill.preview()` runs.
+
+If either invariant breaks, `_assertSync` or
+`loadFonts`'s throw catches it loudly.
+
+### Results
+
+Paired `--detach-pages --no-timing --render-only
+--tracing` run on the 1651-page book, comparing the
+pre-conversion trace (results from *Inside RunMicrotasks*
+in [05-blink-trace.md](05-blink-trace.md)) against
+the post-conversion trace:
+
+| metric | pre-sync | post-sync | Δ |
+| --- | --- | --- | --- |
+| render wall | 8.13 s | 8.36 s | flat (within single-run noise) |
+| trace event count | 250,376 | 255,949 | flat |
+| `RunMicrotasks` self | 2890.66 ms (35.6 %) | **0.56 ms** (off top-30) | **-2890 ms (-99.98 %)** |
+| `RunMicrotasks` total | 6333.18 ms | **0.56 ms** | **-6333 ms** |
+| `RunMicrotasks` hits | 15 | 12 | -3 |
+| `RunMicrotasks` rm[4] dur | 6262.34 ms | gone | -6262 ms |
+| `RunTask` self (top-30) | (below threshold, ~16 ms) | **2984.11 ms (34.6 %)** | **+2968 ms** |
+| `RunTask` hits | (~few hundred) | **1005** | re-attributed |
+| `RunTask` total | (small) | **8630.80 ms** | the whole render |
+| `Document::UpdateStyleAndLayout` total/hits | 3320 / 39675 | 3515 / 39675 | flat |
+| `Document::recalcStyle` self | 1737 ms | 1877 ms | flat |
+| `LocalFrameView::performLayout` self | 1737 ms | 1881 ms | flat |
+| per-page ratio (last/first quarter) | 1.36x | 1.27x | slight improvement (noise band) |
+| pages | 1651 | 1651 | identical |
+| PDF size (full render, separate run) | 16.1 MB | **16.1 MB** | byte-equivalent |
+
+The headline number is the **6333 → 0.56 ms collapse**
+in `RunMicrotasks` total. The 12 surviving sub-ms hits
+are pure puppeteer/CDP plumbing (one `AsyncTask Run`
+child = 0.01 ms; the rest are V8 internal MT-checkpoint
+runs). There is no remaining JS executing inside a
+microtask continuation -- the render runs as a plain
+synchronous task from start to end.
+
+The work didn't disappear, it re-attributed. `RunTask`
+self-time (2984 ms) almost exactly equals the old
+`RunMicrotasks` self-time (2891 ms) plus single-run
+noise. Per-call children counts are unchanged
+(`Document::UpdateStyleAndLayout`: 39675 calls then,
+39675 calls now). Same JS, same DOM mutations, same
+layout flushes -- just no longer wrapped in a
+continuation.
+
+### What this buys
+
+**Profile readability.** A reader opening
+`render.cpuprofile` or `trace.json` after this change
+sees:
+
+- `(program)` in the cpu profile drops by the
+  proportion that was V8 runtime overhead inside the
+  continuation (the MT plumbing + dispatch glue
+  between named natives). The remaining `(program)`
+  is genuinely-unattributable V8 work (IC stubs,
+  runtime helpers).
+- `RunMicrotasks` no longer appears at the top of the
+  trace's bottom-up table. The render lands under
+  `RunTask` / `EvaluateScript` / `FunctionCall`, with
+  Blink work (`performLayout`, `recalcStyle`,
+  `rebuildLayoutTree`) as named children where it
+  belongs.
+- The cpu profile's `(idle)` row already collapsed in
+  the earlier rAF→queueMicrotask fix; this change
+  closes the symmetric gap on the JS side.
+
+**Structural simplicity.** Nine functions in the bundle
+lost the `async` keyword and the `await` site
+discipline that went with it. The render call chain is
+now top-to-bottom synchronous: `preview()` calls into
+`flow()` calls into `render()` calls into `*layout()`,
+plain returns all the way down. Anyone tracing through
+the bundle for a perf investigation can read the
+control flow without modeling promise resolution
+ordering.
+
+**Single contract.** The hook surface is now uniformly
+sync via `_assertSync`. Before the conversion, the
+per-page hooks (`beforePageLayout`, `afterPageLayout`,
+`finalizePage`, etc.) were sync-asserted while the
+once-per-render hooks (`beforeParsed`, `afterParsed`,
+`afterRendered`, `beforePreview`, `afterPreview`) used
+`await trigger(...)`. The split was historical, not
+principled. Now every hook is sync-asserted, same
+shape, same error message.
+
+### What this doesn't buy
+
+**Wall-clock.** Render goes 8.13 s → 8.36 s, which is
+within the ±1 s single-run noise band for this machine
+documented in earlier phases. CPU work
+re-attributes but doesn't shrink: the chunker's JS
+still runs the same way, DOM mutations still trigger
+the same layout flushes, gBCR self-time still owns
+~21 % of the trace. Phase 1's microtask-boundary
+elimination saving (~850 ms) was real because there *were*
+8 k boundaries to remove; this conversion eliminates a
+handful of additional boundaries (the once-per-render
+sites) whose per-boundary cost is small.
+
+**A path to fewer flushes.** The remaining gBCR-driven
+layout work is intrinsic to paged.js's per-page
+break-and-resume algorithm. The earlier attempts (B, D from
+*What happened when we tried `createBreakToken` dedup* in
+[02-finalizepage.md](02-finalizepage.md); the
+move-not-clone experiment in
+[05-blink-trace.md](05-blink-trace.md))
+confirmed that gBCR re-attributes if you elide one site, and
+that mutations are the structural source. Synchronising
+the chain doesn't change any of that.
+
+### Verification
+
+The 1651-page book renders identically pre- and
+post-conversion -- same page count, same 16.1 MB PDF.
+The PDF differs from the previous build only by the
+expected timestamp drift (the `/CreationDate` /
+`/ModDate` entries Chrome writes per run). No content
+changes; the bundle does the same work in the same
+order.
+
+The trace's `RunTask` -> `Document::UpdateStyleAndLayout`
+hit count (39 675) matches the previous run exactly,
+confirming the per-page chunker iteration count is
+preserved through the conversion. `RunTask` ->
+`WebFrameWidgetImpl::UpdateLifecycle` at 1950 ms / 1
+hit is Chromium's final-frame lifecycle work after the
+last page is laid out, same as before -- it just shows
+up under `RunTask` instead of being attributed to a
+post-render microtask, which is also why `RunTask` self
+includes it.
+
+### What's still async, post-conversion
+
+Two surfaces remain async-shaped, both intentionally:
+
+1. **The auto-run block at [paged.browser.js:33153](../../docs/lib/paged.browser.js:33153).**
+   `ready.then(async function () { ... })` fires once at
+   `DOMContentLoaded` and is gated by `config.auto !==
+   false` -- our pipeline always sets `config.auto =
+   false` before invoking `preview()`, so this branch
+   never runs. Leaving it async-shaped costs one
+   microtask scheduling at startup, sub-microsecond,
+   and preserves byte-for-byte compatibility with
+   upstream paged.js's auto-init semantic for anyone
+   running this bundle in a configuration we don't.
+2. **External `page.evaluate(...)` callers.** The
+   wrapper around `window.PagedPolyfill.preview()` in
+   `perf/measure.mjs` and `docs/render-book.mjs` is a
+   sync arrow, but `page.evaluate` itself returns a
+   Promise (CDP roundtrip). Node-side code awaits that
+   Promise. Cost is the CDP round-trip, not the JS we
+   execute.
+
+Neither contributes to the renderer's main-thread
+profile.
+
+### Cumulative trace shape
+
+For reference, the post-conversion top-of-table on
+`CrRendererMain` reads:
+
+```
+   self_ms   self_%   event                                       category
+   -------   ------   ----------------------------------------------
+   2984.11   34.58%   RunTask                                     devtools.timeline
+   1880.79   21.79%   LocalFrameView::performLayout               blink
+   1876.53   21.74%   Document::recalcStyle                       blink
+    540.06    6.26%   InlineNode::ShapeTextIncludingFirstLine     blink
+    503.09    5.83%   Document::rebuildLayoutTree                 blink
+    128.90    1.49%   Blink.CompositingInputs.UpdateTime          blink
+    123.41    1.43%   Blink.PrePaint.UpdateTime                   blink
+     99.60    1.15%   Document::updateStyle                       blink
+     76.83    0.89%   V8.GC_MC_INCREMENTAL_EMBEDDER_TRACING       v8.gc
+     43.20    0.50%   Layout                                      devtools.timeline
+     ...
+```
+
+`RunMicrotasks` no longer appears. `(self /
+unattributed)` time inside `RunTask` is 2984 ms across
+1005 hits -- average ~3 ms per task, consistent with
+"each render task does ~one page's worth of work" plus
+some longer tasks for setup / teardown. The dominant
+named children are unchanged: `UpdateStyleAndLayout`,
+`recalcStyle`, `performLayout`, `ShapeText`,
+`rebuildLayoutTree`. Same work, honest labels.
+
+Shipped.
+
+## `pageRanges` sharding: off the table for now
+
+Several earlier phase notes flag `pageRanges` sharding as
+"the biggest untried lever" for the `generate` phase --
+run `page.pdf()` N times over disjoint page ranges in
+parallel headless browsers, concatenate the resulting
+PDFs with pdf-lib, divide generate's ~43 s wall-clock by
+N. The arithmetic is appealing; the engineering isn't.
+
+A separate investigation (not in this repo) found enough
+pitfalls to make the work not worth pursuing at current
+scale. Sketch of what bit:
+
+- Each shard re-loads `book.html` and re-runs `paged.js`
+  rendering for *its* range, which means the per-shard
+  render is **not** 1/N of the original render -- paged.js
+  has to lay out all preceding pages to position the slice
+  correctly (named strings, counters, footnote numbering,
+  cross-references). Several "fixes" (skip-to-page hooks,
+  pre-rendered state injection) each broke in subtle ways
+  on the book's actual content.
+- PDF concatenation via pdf-lib reintroduces the full
+  `PDFDocument.load` cost the incremental writer avoided
+  -- we'd need a streaming concatenator or a qpdf binary
+  dependency to keep the process phase cheap.
+- Page numbers, named strings (`string(chapter-title)`),
+  and the running header rely on per-page state that the
+  Counters handler and `addEnvFunctions` rebuild from
+  document order. Sharding loses that order and breaks
+  the header on every shard boundary unless the per-shard
+  paged.js render is given the right starting state, which
+  is itself a research project.
+- Outline injection has to know cross-shard page numbers,
+  so either Chrome's native outline (which we don't ship)
+  or a post-concat outline rebuild is required.
+
+Net: even with aggressive engineering, the realistic win
+on a 1651-page book at N=4 shards is ~15-25 s of
+`generate` saved -- not the 32 s / 75 % the naive math
+suggests -- against a maintenance cost of a sharding
+harness that wraps puppeteer launch + IPC + pdf concat
++ per-shard state setup. Below the cost/benefit bar.
+
+The lever is documented here because it *is*
+the largest remaining target if priorities change (e.g.
+the book grows past 3000 pages, or a CI runtime cap
+forces it). It's just not the next thing to build.
+
+### Probe results (later session)
+
+A two-shard probe in [perf/probe-parallel.mjs](../probe-parallel.mjs)
+was run after the render-side speedups to see what the
+actual wall-clock floor looks like with current numbers.
+N=2, equal page-count split, no concatenation -- just
+two browsers in parallel each printing their `pageRanges`
+slice:
+
+| shard | launch | load | render | generate | total |
+| --- | --- | --- | --- | --- | --- |
+| 0 (pp 1-826)   | 1.00 s | 1.61 s | 10.37 s | 24.02 s | 35.54 s |
+| 1 (pp 827-1651)| 0.97 s | 1.61 s | 10.12 s | 24.46 s | 35.74 s |
+
+Wall clock for `Promise.all` of both: **35.94 s**. Both
+slices open via pdf-lib and the page counts add up
+exactly (826 + 825 = 1,651). Vs the ~53 s single-process
+render+generate, parallel N=2 saves ~17 s wall clock.
+
+The probe also confirms two browsers really do run in
+parallel at the OS level: generate dropped from ~43 s to
+~24 s per shard (roughly linear with a ~2-3 s per-call
+fixed overhead), which would only happen if the Skia +
+PrintCompositor workloads in the two browser trees
+weren't serialised by a shared kernel resource. So the
+"single-threaded Skia per page" finding from
+*Chromium `Page.printToPDF` knob survey* in
+[01-baseline-and-detach.md](01-baseline-and-detach.md) is
+per-process -- not a machine-wide lock.
+
+**Still not shipped.** Reasons unchanged:
+
+- Each shard re-renders the whole book to maintain
+  per-shard layout state (named strings, counters,
+  footnotes). With render at ~10 s that's now cheap
+  CPU-wise, but the memory cost is the blocker -- see
+  [07-memory.md](07-memory.md). N=2 ≈ 5 GB peak,
+  N=4 ≈ 10 GB peak; the CI runner doesn't have that
+  headroom.
+- Concat + outline page-number remap still needs to be
+  built. The incremental-pdf.mjs pattern extends to it
+  but it's nontrivial.
+
+Probe stays available as `node perf/probe-parallel.mjs
+[--shards N]` for re-evaluation if either constraint
+changes (CI machine grows, or book size forces it).
+
+## CSS cost attribution
+
+Render is at ~10 s on a 1651-page book, down from ~104 s
+in the original baseline. The bottom-up profile after
+all of the above changes shows no individual JS body
+above ~250 ms self-time; the dominant rows are native
+Blink work (`recalcStyle` 2.4 s, `performLayout` 2.2 s,
+`removeChild` 1.7 s) that's intrinsic to laying out and
+detaching 1651 pages of content. The remaining question:
+is any of that recalcStyle work *avoidable* via CSS
+pruning?
+
+`ab-css.mjs` automates the answer. It renders the book
+under four variants -- baseline-full (print.css +
+rouge.css), drop-rouge, drop-print-extras (only the
+always-kept Page-geometry + Chapter-boundaries sections
+of print.css), and baseline-minimal (both stripped) --
+then reports the **paired difference** of CPU sample-time
+(`Document::recalcStyle` total in particular) between
+baseline-full and each variant. Pairing runs each
+variant immediately after a baseline run, so machine-state
+drift cancels in the diff. On Windows the harness
+auto-relaunches itself under `start /affinity 0x5500
+/high` to pin to a fixed subset of cores, which on a
+Ryzen 7 cuts run-to-run variance from ~15-25 % to ~3 %.
+
+### Methodology calibration
+
+We learned the variance story the hard way. The first
+sweep used single runs per variant and CPU sample-time,
+on the theory that profile time would be
+machine-load-independent. It wasn't on this Windows dev
+box: four identical-content runs of baseline-full spanned
+9.47-16.89 s (the 16.89 was an outlier; even excluding
+it, the remaining three varied by ~12 %). At that noise
+floor, the per-section "drop-X saves N ms" rankings the
+tool was emitting were ~75 % noise. The fix had two
+parts:
+
+1. **CPU pinning via `start /affinity`** -- shipped as
+   the auto-relaunch shim in `ab-css.mjs`. Reduced
+   baseline SD on recalcStyle total from ~12-25 % to
+   ~3 %.
+2. **Paired interleaved measurement** -- run baseline
+   immediately before each variant, pair the two, take
+   the difference. Mean paired difference and SD across
+   N pairs let noise-floor rows show themselves honestly
+   (mean within ~2 σ of zero). Default N=3 pairs; bump
+   to `--runs 5` for tighter SD at the cost of wall
+   time.
+
+The original "stripping CSS saves ~740 ms" finding from
+a single manual A/B turned out to be partly real, partly
+noise, and partly confounded by what "minimal" meant.
+The manual A/B's "minimal" was just `@page` +
+`article{break-before:page}`; the tool's
+"baseline-minimal" keeps the preamble + Page-geometry +
+Chapter-boundaries sections (paged.js needs the
+string-set / @top-right / @bottom-right machinery for
+running headers and page numbers). The earlier signal
+was real, but spread across pieces the tool can and
+can't isolate.
+
+### Findings
+
+With pinning + paired diffs (3 pairs per variant):
+
+| variant | Δrecalc ms | ± SD | mean/SD | verdict |
+| --- | --- | --- | --- | --- |
+| **drop-print-extras** | **237** | **60** | **3.95** | **real signal** |
+| baseline-minimal | 193 | 246 | 0.78 | noise |
+| drop-rouge | 66 | 124 | 0.53 | noise |
+| (baseline-full mean) | 2038 | 108 SD | -- | reference |
+
+Read this as:
+
+- **print.css extras (everything beyond the always-kept
+  Page-geometry + Chapter-boundaries sections) contribute
+  ~237 ms of recalcStyle**: ~11 % of recalcStyle, ~2.4 %
+  of render. All three pairs gave Δrecalc 202, 307, 202 --
+  consistent direction and magnitude, ~4 σ from zero.
+- **rouge.css contribution is at the noise floor**
+  (66 ± 124 ms). The earlier hypothesis ("rouge.css is
+  the big spender via per-span cascade work in code
+  blocks") was wrong; the per-pair Δrecalc values were
+  38, 202, -42 -- variance too high to claim signal at
+  N=3.
+- **baseline-minimal** stripping both still lands inside
+  the noise band on this tool's run. The original manual
+  A/B's larger delta came from removing more than this
+  tool removes -- specifically the Page-geometry section
+  that the tool keeps.
+
+The per-section sweep behind `--per-print-section`
+confirmed the methodology lesson the hard way: when each
+print.css section is dropped individually, every Δrecalc
+lands within ~2 σ of zero. The 237 ms of print.css cost
+is structurally non-additive -- selectors interact in
+the cascade, the style sharing cache hits differently
+when rule count drops, and Blink's invalidation walks
+change shape based on what rules exist. Any single
+section's marginal contribution is too small to surface
+above ~60 ms of paired-diff noise; the sum-of-extras
+effect is the only real signal.
+
+### Where this leaves render
+
+Render is structurally near its floor. The biggest
+plausible CSS prune (drop-print-extras) saves ~240 ms of
+recalcStyle ≈ ~2.4 % of render, but would mean losing
+the typography that makes the PDF look like a book. The
+remaining levers all live outside render:
+
+- `pageRanges` sharding (~5-20 s in generate): off the
+  table for now (see previous section).
+- Chrome's `outline: true` (~5 s in process): one
+  `role="presentation"` preprocessor pass away from
+  shipping, but not pursued.
+
+No structurally promising next target inside render.
diff --git a/perf/notes/07-memory.md b/perf/notes/07-memory.md
new file mode 100644
index 0000000..b1b0079
--- /dev/null
+++ b/perf/notes/07-memory.md
@@ -0,0 +1,470 @@
+# Memory: where the renderer's 1.9 GB goes
+
+Process-tree footprint, per-allocator breakdown inside the renderer, the `--disable-gpu` + `--disable-software-rasterizer` pair that saves ~200 MB, and a GC-pass probe that showed 180 MB of unswept Oilpan garbage but determined the cost-to-fix exceeds the headroom it would buy.
+
+CI runs the book build with limited RAM headroom -- the
+1651-page book is the largest job on the machine and the
+budget matters. These notes measure one render's peak
+memory and break it down by allocator, so we know what
+levers exist if the book grows.
+
+> **Note.** Approaches that involve Chromium internals --
+> patching the binary, intercepting the SkPicture stream
+> via Frida, spawning standalone PrintCompositors via
+> Mojo, building a Chromium-linked helper binary -- were
+> researched but not shipped. They're documented
+> separately in [CHROMIUM.md](../CHROMIUM.md). What
+> follows covers only what's measurable from the outside
+> through public APIs.
+
+`perf/probe-memory.mjs` is the harness. It runs the full
+pipeline (load + render + generate) in a single browser
+and watches the chrome.exe process tree at 500 ms
+intervals via `sample-mem.ps1`, reporting per-process
+private bytes + working set. `perf/probe-renderer-mem.mjs`
+goes deeper -- it drives Chromium's memory-infra tracing
+to capture detailed per-allocator dumps from inside the
+renderer at three points (post-render, mid-generate,
+post-generate). `perf/analyze-mem-trace.mjs` reads the
+resulting trace.json and prints the breakdown.
+
+## Process-tree footprint
+
+Peak across the whole tree on the 1651-page book:
+
+```
+renderer (main)                 ~1,880 MB private
+utility:PrintCompositor           ~290-450 MB  (high variance)
+browser                         ~70-1,100 MB   (PDF IPC buffer; very high variance)
+gpu-process                       ~100 MB
+renderer (about:blank etc.)        ~25 MB total
+utility:network/storage            ~30 MB total
+crashpad-handler                    ~2 MB
+                                ------------
+total peak                      ~2.5-3.5 GB private
+                                ~2.7-2.9 GB working set
+```
+
+The browser-process number is the wildest -- across
+runs it ranged from 72 MB to 1.1 GB. That's the IPC
+buffer the PDF travels through on its way from the
+renderer back to puppeteer; how much accumulates depends
+on timing between Mojo write and Node read. The
+PrintCompositor utility process appears only during
+generate; it's the Chromium service that turns the
+renderer's Skia commands into PDF bytes for `page.pdf()`.
+
+## Inside the renderer
+
+memory-infra dump at post-generate, renderer process,
+top-level allocators (`blink_gc` and `blink_objects`
+overlap by design -- they're two views of the same
+Oilpan heap, raw pages vs typed object counts):
+
+| allocator         | size      | notes |
+| ----------------- | --------- | ----- |
+| `blink_gc`        | 1,350 MB  | C++ DOM, layout, render objects (Oilpan) |
+| `malloc`          |   332 MB  | Skia raster buffers + small native allocations |
+| `partition_alloc` |   114 MB  | String buffers, ArrayBuffers |
+| `v8`              |    34 MB  | JS heap (paged.js + page JS); tiny |
+| other             |   ~22 MB  | web_cache, shared_memory, cc, gpu stub |
+
+V8 is only ~2 % of the renderer. Blink is ~80 %. That
+matches the structural picture: the renderer holds the
+laid-out state of 1651 pages of typeset content, and
+that state is C++ objects, not JS.
+
+Top Blink object classes (the `blink_objects` view of
+the Oilpan heap, post-generate):
+
+| class                                  | size    | count       |
+| -------------------------------------- | ------- | ----------- |
+| `GridSizingTrackCollection`            | 132 MB  | 79,246      |
+| `ComputedStyle`                        |  74 MB  | 1,074,537   |
+| `ConstraintSpace::RareData`            |  71 MB  | 617,415     |
+| `PhysicalBoxFragment`                  |  42 MB  | 516,289     |
+| `LogicalLineItems`                     |  42 MB  | 24,118      |
+| `Text` (DOM nodes)                     |  42 MB  | 498,077     |
+| `LayoutResult`                         |  41 MB  | 540,447     |
+| `AXNodeObject`                         |  41 MB  | 411,760     |
+| `GridItemData`                         |  30 MB  | 162,443     |
+| `ComputedStyleBase::StyleBoxData`      |  30 MB  | 176,479     |
+| `InlineItem`                           |  28 MB  | 737,744     |
+| `LayoutResult::RareData`               |  28 MB  | 229,056     |
+| `ElementRareDataVector`                |  24 MB  | 613,629     |
+| `CachedMatchedProperties`              |  23 MB  | 226,679     |
+| `ShapeResultView`                      |  21 MB  | 306,762     |
+| `HeapVectorBacking<FragmentItem>`      |  21 MB  | 72,175      |
+| `HeapVectorBacking<HarfBuzzRunGlyphData>` | 20 MB | 165,957  |
+| `LayoutText`                           |  14 MB  | 129,056     |
+| `HTMLDivElement`                       |  12 MB  | 118,877     |
+| `HTMLSpanElement`                      |  10 MB  | 104,266     |
+
+Three patterns visible:
+
+1. **Page-template grid is expensive.** paged.js renders
+   each `@page` as a CSS grid (so `@top-right`,
+   `@bottom-right`, etc. resolve correctly). 79,246
+   `GridSizingTrackCollection` ≈ 48 per page × 1651
+   pages, plus 162k `GridItemData`. Combined ~162 MB just
+   for the running header/footer geometry.
+2. **Style explosion.** 1,074,537 `ComputedStyle`
+   objects across 1651 pages is ~650 per page, which
+   matches roughly one per leaf element after style
+   sharing. `CachedMatchedProperties` (23 MB, 227k)
+   shows the sharing cache is active; without it the
+   number would be much worse.
+3. **LayoutNG fragment tree.** `PhysicalBoxFragment`
+   (42 MB), `LogicalLineItems` (42 MB), `LayoutResult`
+   (41 MB), various `RareData` (98 MB combined),
+   `InlineItem` (28 MB) -- the modern Blink layout tree
+   is fragment-based and the fragments add up across
+   half a million layout objects.
+
+The render→generate transition adds about 500 MB:
+~272 MB to `blink_gc` (print-preview snapshot retention)
+and ~219 MB to `malloc` (Skia content-stream allocations
+during PDF emit, visible as a million-ish small
+allocations in the bucket-size profile).
+
+## Disabling the GPU process
+
+The GPU process at ~100 MB looked like an easy win. It
+isn't, quite -- in headless mode Chromium still spawns a
+GPU process to host SwiftShader (software raster) for
+canvas / WebGL emulation, even when no canvas / WebGL
+is in use. Three variants tested:
+
+| variant                                       | render | generate | total | gpu-process | renderer | PDF bytes |
+| --------------------------------------------- | ------ | -------- | ----- | ----------- | -------- | --------- |
+| baseline                                      | 10-11s | 44-50s   | 51-56s |  100 MB    | 1,880 MB | 41,076,362 |
+| `--disable-gpu --disable-software-rasterizer` | 10s    | 45s      | 45s   |  16 MB      | 1,761 MB | 41,076,362 |
+| above + `--in-process-gpu`                    | 15s    | 61s      | 62s   |  (gone)     | 1,748 MB | 41,076,362 |
+| `--single-process`                            | crash  | -        | -     | -           | -        | -         |
+
+`--single-process` is documented as debug-only in
+Chromium; the renderer crashes shortly after page load
+in modern headless. It also doesn't actually collapse to
+one process -- crashpad-handler always runs separately
+and a Mojo broker stays alive too.
+
+`--in-process-gpu` does kill the GPU process entirely
+but folds the GPU work onto the same thread as JS +
+layout. Render slows by ~5 s and generate by ~15 s --
+a 25 % total slowdown bought for ~100 MB of saved
+process overhead. Bad trade.
+
+The disable pair alone (`--disable-gpu
+--disable-software-rasterizer`) is the sweet spot:
+
+- GPU process shrinks from ~100 MB to ~16 MB (Chromium
+  keeps a stub for command handling)
+- Renderer ~120 MB lighter (consistent across runs;
+  exact cause is some GPU-context init path Skia skips)
+- Generate runs ~5 s faster (Skia presumably skips the
+  same GPU init path)
+- PDF output is the same size (41,076,362 bytes) with
+  identical content streams. SHA differs only because of
+  per-run /CreationDate, /ModDate, and /ID -- 0.018 %
+  of bytes differ, all inside the tagged-PDF tree's
+  hash-derived element IDs.
+
+Shipped in both [docs/render-book.mjs](../../docs/render-book.mjs)
+and [perf/measure.mjs](../measure.mjs).
+
+## What's not addressable
+
+Accessibility tagging accounts for ~41 MB of
+`AXNodeObject` instances (411k of them, one per DOM
+element for the PDF/UA structure tree). Disabling
+`--export-tagged-pdf` would free this, but the PDF
+loses its structure tree -- screen readers see a flat
+glyph stream, search highlighting and copy-paste break
+reading order in the multi-column layout, and the PDF
+falls out of Section 508 / PDF-UA / EN 301 549
+compliance. Off the table; the cost buys real
+accessibility for a docs site that aims to be readable.
+
+## Where this leaves memory
+
+End-state on the 1651-page book with the shipped flag
+pair:
+
+```
+renderer (main)                 ~1,760 MB private
+PrintCompositor (utility)         ~350 MB
+browser                           ~70-1,100 MB  (IPC buffer; high variance)
+gpu-process (stub)                 ~16 MB
+other (renderers, network, etc.)   ~80 MB
+                                ------------
+peak                            ~2.3-3.3 GB private
+                                ~2.5-2.9 GB working set
+```
+
+Inside the renderer, the dominant buckets are
+intrinsic to laying out 1651 pages of typeset content:
+
+- `GridSizingTrackCollection` (132 MB) is paged.js's
+  per-page template grid. The grid drives `@top-right`
+  / `@bottom-right` / margin-box positioning; replacing
+  it with absolute positioning would save the 132 MB
+  but is a paged.js architectural change.
+- `ComputedStyle` (74 MB across 1M objects) and the
+  LayoutNG fragment tree (~200 MB combined) scale with
+  DOM size. The biggest knob here is the DOM the book
+  feeds in: fewer wrapper elements would directly
+  shrink everything downstream.
+- The render→generate +500 MB is Chromium-internal
+  (print-preview retention + Skia raster prep) and not
+  reachable without recompiling.
+
+Next memory targets, in rough order of effort vs payoff:
+
+1. **DOM shape audit.** 1.07 M `ComputedStyle`, 498 k
+   `Text` nodes, 118 k `HTMLDivElement`, 104 k
+   `HTMLSpanElement` -- the input shape drives all of
+   this. Just-the-docs and the markdown converters add
+   wrapper elements that may not be needed in the PDF
+   layout. A pre-render DOM-simplification pass (strip
+   inert wrappers, collapse nested spans) is the most
+   accessible lever; we own the Jekyll pipeline end to
+   end.
+2. **Layout-intermediate garbage** that Oilpan doesn't
+   sweep during the synchronous render loop. ~75-225
+   MB of `CachedMatchedProperties`, sub-`ComputedStyle`
+   data, `GridItemData`, text-shape intermediates --
+   not retained by anything, just unswept. See the
+   "GC-pass probe" subsection for the per-class
+   breakdown; the only direct mitigation is forcing
+   GC (rejected, costs ~1 s), and the indirect lever
+   is upstream DOM size (item 1 above).
+3. **Page-template grid replacement** in vendored
+   paged.js -- ~132 MB potential. Largest single target
+   but an invasive rewrite of paged.js's `@page` area
+   handler.
+
+## GC-pass probe: 180 MB of unswept Oilpan garbage
+
+Forcing a `window.gc()` pass between render and generate
+frees ~180 MB of `blink_objects` (the typed view of the
+Oilpan heap) without touching anything user-visible.
+Initial framing: "dangling references somewhere in the
+paged.js / detach-pages chain". Investigation (see "What
+the GC actually freed" subsection below) shows the
+framing was wrong -- there is no JS-side retention.
+What the GC frees is per-page layout intermediate state
+(style sharing caches, `ComputedStyle` sub-data, grid
+item data, text-shape views) that's already unreachable
+from anything but stays in Oilpan because nothing forces
+a major GC during the synchronous render loop.
+
+Probe: `perf/probe-renderer-mem.mjs --gc-passes N`.
+Launches with `--js-flags=--expose-gc`, runs N V8
+`gc()` calls between the post-render and pre-generate
+memory dumps, then fires
+`Memory.simulatePressureNotification` to coax Chromium
+into dropping caches. Sweep across N=0,1,2,3,5 on the
+1651-page book (single run each; absolute numbers carry
+run-to-run noise but the deltas vs same-run baseline
+are stable):
+
+| N | gc time | +pressure | post-render | post-gc | mid-gen renderer | Δ vs no-gc baseline |
+| --- | --- | --- | --- | --- | --- | --- |
+| (off, baseline)| --     | --     |  1,229 MB | --     | **1,941 MB** | -- |
+| 0 (pressure only) | 0.00s | 0.52s |  1,358 MB | 1,358 MB | 1,869 MB | ~noise |
+| **1** | **0.44s** | **0.96s** | 1,329 MB | **1,275 MB** | **1,754 MB** | **-187 MB** |
+| 2 | 0.82s | 1.33s |  1,337 MB | 1,293 MB | 1,758 MB | -183 MB |
+| 3 | 1.46s | 1.97s |  1,316 MB | 1,277 MB | 1,757 MB | -184 MB |
+| 5 | 2.11s | 2.61s |  1,553 MB* | 1,498 MB* | 1,841 MB* | (high-side outlier run) |
+
+Three takeaways:
+
+1. **`Memory.simulatePressureNotification` alone does
+   nothing in headless.** N=0 mid-gen is within
+   run-to-run noise of the no-gc baseline.
+2. **One `gc()` call does ~90 % of the work.** 1 pass +
+   pressure: ~1 s cost, ~187 MB peak savings. Passes
+   2 and 3 match it (~185 MB) without further
+   improvement.
+3. **Each `gc()` pass costs ~0.4-0.5 s** of wall clock
+   on the 1651-page book (the V8 + Oilpan major-GC
+   pause walking ~1 GB of heap).
+
+Inside the renderer at post-gc (1 pass), the breakdown
+shows where the freed space went:
+
+| allocator      | baseline | post-gc | Δ |
+| -------------- | -------- | ------- | --- |
+| `blink_objects` (typed Oilpan view) |  698 MB |  472 MB | **-226 MB** |
+| `blink_gc` (raw pages)              |  973 MB |  940 MB |  -33 MB |
+| `malloc`                            |  120 MB |   93 MB |  -27 MB |
+| `v8`                                |   28 MB |   19 MB |   -9 MB |
+
+GC freed ~226 MB of typed Blink objects, but Oilpan
+only returned 33 MB of underlying pages to the OS
+immediately -- empty pages are recycled lazily. The
+visible peak win shows up at mid-generate (-187 MB)
+because Chromium reuses the freed object slots for the
+print-preview snapshot instead of growing fresh.
+
+PDF output is byte-identical across all variants
+(41,076,362 bytes; SHA differs only in metadata).
+
+**Not shipped.** 1 second per render is meaningful when
+multiplied across CI builds, and after investigating
+what the GC actually freed (below) it's clear there's
+no underlying defect to fix -- this is Blink's normal
+allocation behaviour, with Oilpan's normal sweep
+behaviour, just observed in a workload that doesn't
+give Oilpan an idle moment to sweep.
+
+The probe and the `--gc-passes` flag stay in
+[probe-renderer-mem.mjs](../probe-renderer-mem.mjs) for
+future use -- either as a measurement baseline if a
+future bigger book ever hits a CI memory ceiling, or as
+an A/B reference if Blink's allocation pattern changes
+with a Chromium upgrade.
+
+### What the GC actually freed
+
+Two analyses, both negative for the "dangling references"
+hypothesis, both positive for "Oilpan didn't sweep":
+
+**V8 heap snapshot diff (pre-gc vs post-gc):**
+byte-identical. Same 2,938,992 nodes, same 108.9 MB self_size,
+same per-category counts. The diff is zero across every
+node category in V8. Whatever the GC freed was invisible
+to V8's snapshot, which means it had no V8 wrapper --
+which means no JS reference can be holding it. Probe:
+[analyze-heap-snapshot.mjs](../analyze-heap-snapshot.mjs)
+in single-snapshot or diff mode.
+
+**Per-Blink-class diff (memory-infra dumps):** the
+freed memory is concentrated in style-system caches and
+layout intermediates. Top freed classes between dump 0
+(post-render) and dump 1 (post-gc), 1-pass GC run:
+
+| class                                            | a_count | a_MB | b_count | b_MB | freed |
+| ------------------------------------------------ | ------- | ---- | ------- | ---- | ----- |
+| `CachedMatchedProperties`                        | 122,110 | 12.1 |     355 |  0.0 | **-12.1 MB** (~100%) |
+| `ComputedStyle`                                  | 380,974 | 26.2 | 244,772 | 16.8 |  -9.4 MB (~36%)      |
+| `ComputedStyleBase::StyleMisc2Data`              |  24,649 |  8.3 |   6,911 |  2.3 |  -6.0 MB             |
+| `ComputedStyleBase::StyleBoxData`                |  94,867 | 15.9 |  63,937 | 10.7 |  -5.2 MB             |
+| `ComputedStyleBase::StyleSurroundData`           |  32,350 |  9.6 |  15,101 |  4.5 |  -5.1 MB             |
+| `GridItemData`                                   |  27,508 |  5.0 |       0 |  0.0 | **-5.0 MB** (~100%)  |
+| `ShapeResultView`                                | 225,299 | 15.5 | 170,366 | 11.7 |  -3.8 MB             |
+| `HeapVectorBacking<HarfBuzzRunGlyphData>`        | 163,864 | 19.2 | 149,993 | 16.4 |  -2.9 MB             |
+| `LayoutResult::RareData`                         |  71,960 |  8.8 |  48,955 |  6.0 |  -2.8 MB             |
+| `ConstraintSpace::RareData`                      |  79,445 |  9.1 |  55,209 |  6.3 |  -2.8 MB             |
+| `ComputedStyleBase::StyleMisc1Data`              |  19,034 |  3.0 |   1,958 |  0.3 |  -2.7 MB             |
+| `ComputedStyleBase::StyleMiscData`               |  64,838 |  5.4 |  39,653 |  3.3 |  -2.1 MB             |
+| `LayoutResult`                                   | 179,728 | 13.7 | 155,052 | 11.8 |  -1.9 MB             |
+| ... (smaller)                                    |         |      |         |      |  -16  MB             |
+| **total**                                        |         |      |         |      | **-76 MB** (this run; -226 MB on a different run -- noisy) |
+
+The two ~100% freed categories tell the cleanest story:
+
+- **`CachedMatchedProperties`** is Blink's style-sharing
+  cache -- "which CSS rules matched element X, so that
+  similar element Y can reuse the resolved style". After
+  layout completes, it's dead state. Only useful if the
+  document gets relaid out, which our pipeline never
+  does.
+- **`GridItemData`** is per-item layout state for CSS
+  Grid. Paged.js puts each `@page` area inside a grid
+  to position the running headers / footers / margin
+  boxes; once the page is laid out, the `GridItemData`
+  for that page's items is dead.
+
+Everything else is style sub-structures
+(`ComputedStyleBase::Style*Data`) and text-shape
+intermediates (`ShapeResultView`, `HarfBuzzRunGlyphData`,
+`ShapeResultRun`) that get freed when their owning
+`ComputedStyle` or layout fragment becomes unreachable.
+All Blink-internal allocations driven by layout.
+
+What this means for the leak question:
+
+- **Not a leak.** Nothing holds these objects after
+  layout. They're unreachable from the moment their
+  page is finalised; they sit in Oilpan because
+  Chromium doesn't run a major GC during the
+  synchronous render loop.
+- **Not a JS-side retention.** detach-pages.js,
+  paged.js's chunker, hook chains, and event listeners
+  were the suspect list. The V8 snapshot diff rules
+  them all out -- if any of them held the layout state,
+  the snapshot would change between pre-gc and post-gc.
+- **It's a real over-allocation in the sense that we
+  hold ~75-225 MB longer than necessary**, but the cost
+  to fix it (force a GC: 1 s wall clock) exceeds the
+  CI memory headroom it would buy at our current book
+  size.
+
+The indirect lever still works: reducing the input DOM
+size reduces both peak working set AND this garbage
+fraction proportionally. That's the DOM-shape audit
+item in "Next memory targets".
+
+Tooling produced by this investigation, kept in
+[perf/](..) for re-use:
+
+- [analyze-heap-snapshot.mjs](../analyze-heap-snapshot.mjs)
+  -- single-snapshot summary (top type x name by
+  aggregate bytes, detached subset) and pairwise diff
+  between two snapshots.
+- [diff-blink-classes.mjs](../diff-blink-classes.mjs) --
+  per-Blink-class diff between two memory-infra dumps
+  in the same trace. Strips the per-dump GUID suffix
+  from class names so the diff lines up across dumps.
+
+### `--heap-snapshot`: V8 visibility check
+
+`probe-renderer-mem.mjs --heap-snapshot` captures a V8
+heap snapshot at post-render via CDP
+`HeapProfiler.takeHeapSnapshot` and writes it as
+`outDir/post-render.heapsnapshot` (~200 MB on the
+1651-page book). Combined with `--gc-passes N`, a
+second snapshot `post-gc.heapsnapshot` is taken right
+after the GC pass.
+
+The original intent was a retainer-chain investigation
+to find what JS-side state was holding the Blink
+objects the GC frees. The result of that investigation
+(see "What the GC actually freed" above) is that
+**nothing on the V8 side holds them** -- the snapshot
+diff is byte-identical pre-gc vs post-gc, ruling out
+JS retention entirely. The freed memory is Oilpan-only,
+invisible to V8's snapshot.
+
+The snapshot tooling is still useful as a visibility
+check -- "is the renderer holding what I expect?" --
+and for finding any actual JS-side retention if one
+ever surfaces. CLI analysis:
+
+- `node perf/analyze-heap-snapshot.mjs <snap>` --
+  single-snapshot summary (top type x name by aggregate
+  bytes, plus actually-detached subset).
+- `node perf/analyze-heap-snapshot.mjs <a> <b>` --
+  pairwise diff: what categories grew or shrank.
+
+DevTools workflow (more interactive, for following
+specific retention chains):
+
+1. Open Chrome DevTools (any tab) -> Memory tab.
+2. Load `<...>.heapsnapshot` (the "Load profile" icon).
+   Browse the **Summary** view for the largest object
+   categories.
+3. For any object of interest, the **Retainers** pane
+   shows the chain of JS references holding it. Filter
+   by name (e.g. `Detached HTMLDivElement`) or by class.
+
+Oilpan-only objects (`CachedMatchedProperties`,
+`ComputedStyleBase::*Data`, `GridItemData`,
+`ShapeResultView`, layout fragments, etc.) do not appear
+in the V8 snapshot -- they have no V8 wrapper. The
+memory-infra dump + `diff-blink-classes.mjs` is the
+right tool for those. The complete picture is
+heap-snapshot (V8 reachability) + memory-infra dump
+(per-allocator + per-Blink-class sizes) = "what JS sees"
++ "what's actually in the renderer".

From 11425345bf4a16fc7fe94264c110d120829cfc2d Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 23:08:28 +0200
Subject: [PATCH 17/18] Make --detach-pages the default.

---
 perf/README.md               | 16 ++++++++++------
 perf/ab-css.mjs              |  2 +-
 perf/instrument-clones.js    |  8 +++++---
 perf/instrument-flush-ops.js |  9 +++++----
 perf/measure.mjs             | 18 +++++++++++-------
 5 files changed, 32 insertions(+), 21 deletions(-)

diff --git a/perf/README.md b/perf/README.md
index 0a3254b..7cedb6d 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -23,13 +23,17 @@ at the bottom of this file.
 The command we reach for whenever CPU-profiling paged.js:
 
 ```
-node measure.mjs --detach-pages --no-timing --render-only --cpu-profile --cpu-sampling 100
+node measure.mjs --no-timing --render-only --cpu-profile --cpu-sampling 100
 ```
 
-(`run.bat` forwards the same args.) Flag rationale:
+(`run.bat` forwards the same args.) The detach-pages handler is now
+injected by default, since it's the shipping fix and matching
+production is the right baseline for any profiling work. Pass
+`--no-detach-pages` if you specifically need the pre-fix O(n²)
+baseline for an A/B against the original quadratic.
+
+Flag rationale:
 
-- `--detach-pages` -- inject the shipping fix. The profile reflects
-  what production actually pays, not the old O(n^2) baseline.
 - `--no-timing` -- skip the per-page `console.log` relay from
   `timing-handler.js`. The relay costs ~2 % of render self-time on
   the 1638-page book and muddies the bottom-up view.
@@ -63,7 +67,7 @@ The harness and core probes:
 | `measure.mjs` | Puppeteer harness. Drives the same flow as `docs/render-book.mjs` (loads the vendored paged.js bundle, runs `PagedPolyfill.preview()`, calls `page.pdf()`, then either the pdf-lib roundtrip or the incremental writer), with optional CPU profiling, in-page handler injection, and DOM-accessor instrumentation. Auto-pins to a fixed core mask on Windows via `pin-cpu.mjs` (see below) for stable measurements; pass `--no-affinity` to opt out. |
 | `pin-cpu.mjs` | Shared shim used by `measure.mjs`, `profile-load.mjs`, `profile-roundtrip.mjs`, and `ab-css.mjs`. On Windows, auto-relaunches the parent Node process under `start /affinity 0x5500 /high` (cores 4-7 physical, thread 0 each, on an 8C16T AMD Ryzen 7) so puppeteer's Chromium children inherit the mask + priority at spawn time. Reduces single-run CPU sample-time variance from ~15-25 % on a stock dev box to ~3 %. No-op on non-Windows; opt out per-invocation with `--no-affinity` or `PERF_PINNED=1`; override mask with `PERF_AFFINITY=<hex>`. |
 | `timing-handler.js` | `Paged.Handler` that records per-page wall time + heap into `window.__pagedTiming` and streams a line per page to the console. Always injected. |
-| `detach-pages.js` | `Paged.Handler` that hides each completed page from the layout tree (registered against `finalizePage`). The fix. Injected by `--detach-pages` and by `docs/book.bat`. |
+| `detach-pages.js` | `Paged.Handler` that hides each completed page from the layout tree (registered against `finalizePage`). The shipping fix. Injected by default (both by `measure.mjs` and by `docs/book.bat`); pass `--no-detach-pages` to measure the pre-fix baseline. |
 | `instrument-flush-ops.js` | Wraps `getComputedStyle`, `getBoundingClientRect`, and the `offsetWidth` / `clientWidth` / `scrollWidth` family with counters + per-call timing. Injected by `--instrument`. |
 | `instrument-detach.js` | Counters around `detach-pages.js`'s removeChild / restore cycle. |
 | `time-hooks.js` | Wraps every task registered to `chunker.hooks.*` and `polisher.hooks.*` with a wall-clock timer. Tells you which handler's hook method is eating render time, per page. Injected by `--time-hooks`. |
@@ -163,7 +167,7 @@ real `book.bat` number too.
 run.bat                                   # defaults to ..\docs\_site-pdf\book.html
 run.bat path\to\some-other.html           # explicit input
 run.bat --out my-run                      # explicit output directory
-run.bat --detach-pages                    # inject the detach-pages fix
+run.bat --no-detach-pages                 # opt out of the detach-pages fix (measure pre-fix O(n²) baseline)
 run.bat --cpu-profile                     # CPU-profile the render phase
 run.bat --render-only                     # bail out after render (skip generate + process, ~47s saved)
 run.bat --clone-count                     # report Layout.append clones appended vs survivors per page
diff --git a/perf/ab-css.mjs b/perf/ab-css.mjs
index eb310fb..b827110 100644
--- a/perf/ab-css.mjs
+++ b/perf/ab-css.mjs
@@ -165,7 +165,7 @@ if (swappedHtml === BOOK_HTML) {
 function runOnce(outDir) {
   const r = spawnSync('node', [
     'measure.mjs', SWAP_HTML_PATH,
-    '--detach-pages', '--no-timing', '--render-only', '--tracing',
+    '--no-timing', '--render-only', '--tracing',
     '--out', outDir,
   ], { stdio: ['ignore', 'pipe', 'pipe'] });
   const err = r.stderr?.toString() ?? '';
diff --git a/perf/instrument-clones.js b/perf/instrument-clones.js
index 7358d47..d625df2 100644
--- a/perf/instrument-clones.js
+++ b/perf/instrument-clones.js
@@ -12,9 +12,11 @@
 //   - At afterRendered, summarise totals + per-page distribution.
 //
 // Cost: O(1) per append + one tree walk per finalized page. Run with
-//   --detach-pages --no-timing --additional-script ..\perf\instrument-clones.js
-// from a measure.mjs invocation. Numbers are reported via console.log
-// which measure.mjs forwards to stdout.
+//   --no-timing --additional-script ..\perf\instrument-clones.js
+// from a measure.mjs invocation. (detach-pages.js is on by default;
+// add --no-detach-pages to compare against the pre-fix baseline.)
+// Numbers are reported via console.log which measure.mjs forwards
+// to stdout.
 
 (() => {
     const Layout = window.PagedLayout;
diff --git a/perf/instrument-flush-ops.js b/perf/instrument-flush-ops.js
index 282eb43..a5bdcb5 100644
--- a/perf/instrument-flush-ops.js
+++ b/perf/instrument-flush-ops.js
@@ -9,10 +9,11 @@
 // ideally be cleanest, but the harness loads paged.js first; we then
 // register a Paged.Handler so we can dump results at afterRendered.
 //
-// Run with: node measure.mjs --instrument [--detach-pages]
-// Compare runs with and without --detach-pages to see whether the
-// detach handler changed the count of layout-flushing calls, the
-// per-call cost, or both.
+// Run with: node measure.mjs --instrument
+// (detach-pages.js is injected by default; pass --no-detach-pages to
+// compare against the pre-fix baseline -- useful for seeing whether
+// the detach handler changed the count of layout-flushing calls, the
+// per-call cost, or both.)
 
 (() => {
   const stats = {};
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 3177052..6737e0a 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -24,7 +24,7 @@
 //                    [--cpu-profile] [--cpu-sampling <microseconds>]
 //                    [--heap-profile] [--heap-sampling <bytes>]
 //                    [--tracing]
-//                    [--detach-pages] [--instrument] [--time-hooks]
+//                    [--no-detach-pages] [--instrument] [--time-hooks]
 //                    [--incremental] [--chrome-outline] [--no-timing]
 //                    [--clone-count] [--render-only]
 //
@@ -40,9 +40,12 @@
 // possible bottom-up table; loses the per-page CSV and the first/last
 // quartile summary in return.
 //
-// --detach-pages also injects detach-pages.js -- a Paged.Handler that
-// hides each completed page from the layout tree -- to test whether
-// the O(n^2) render hotspot disappears.
+// detach-pages.js is injected by default -- a Paged.Handler that hides
+// each completed page from the layout tree. This is the shipping fix
+// for the O(n^2) render hotspot (see notes/01-baseline-and-detach.md);
+// matching it in the harness keeps measurements aligned with what
+// production renders. Pass --no-detach-pages to measure the pre-fix
+// O(n^2) baseline.
 //
 // --incremental switches the process phase from a pdf-lib roundtrip to
 // an incremental update against Chrome's bytes. Massively faster (sub-
@@ -107,7 +110,7 @@ let cpuProfile = false;
 let cpuSampling = 1000; // microseconds
 let heapProfile = false;
 let heapSampling = 32768; // bytes between samples (CDP default)
-let detachPages = false;
+let detachPages = true;
 let instrument = false;
 let timeHooks = false;
 let incremental = false;
@@ -124,7 +127,8 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--cpu-sampling') cpuSampling = parseInt(args[++i], 10);
   else if (a === '--heap-profile') heapProfile = true;
   else if (a === '--heap-sampling') heapSampling = parseInt(args[++i], 10);
-  else if (a === '--detach-pages') detachPages = true;
+  else if (a === '--detach-pages') detachPages = true;       // accepted for backwards compat; default since the fix landed
+  else if (a === '--no-detach-pages') detachPages = false;
   else if (a === '--instrument') instrument = true;
   else if (a === '--time-hooks') timeHooks = true;
   else if (a === '--incremental') incremental = true;
@@ -163,7 +167,7 @@ if (cloneCount)  required.push(cloneCountPath);
 for (const p of required) {
   if (!existsSync(p)) {
     console.error(`missing required file: ${p}`);
-    console.error('Run "npm install" inside perf/ first.');
+    console.error('Run "npm install" at the repo root first (run.bat does this automatically).');
     process.exit(1);
   }
 }

From 1b4fad681a829e8dec30e40bb974c7a38816dcd7 Mon Sep 17 00:00:00 2001
From: Kuba Sunderland-Ober <kuba@mareimbrium.org>
Date: Sat, 23 May 2026 23:16:17 +0200
Subject: [PATCH 18/18] Make --timing opt-in rather than opt-out (--no-timing).

---
 perf/README.md            | 40 ++++++++++++++++++----------------
 perf/ab-css.mjs           |  2 +-
 perf/instrument-clones.js |  6 ++++--
 perf/measure.mjs          | 45 ++++++++++++++++++++++-----------------
 4 files changed, 53 insertions(+), 40 deletions(-)

diff --git a/perf/README.md b/perf/README.md
index 7cedb6d..81e67b6 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -23,20 +23,23 @@ at the bottom of this file.
 The command we reach for whenever CPU-profiling paged.js:
 
 ```
-node measure.mjs --no-timing --render-only --cpu-profile --cpu-sampling 100
+node measure.mjs --render-only --cpu-profile --cpu-sampling 100
 ```
 
-(`run.bat` forwards the same args.) The detach-pages handler is now
-injected by default, since it's the shipping fix and matching
-production is the right baseline for any profiling work. Pass
-`--no-detach-pages` if you specifically need the pre-fix O(n²)
-baseline for an A/B against the original quadratic.
+(`run.bat` forwards the same args.) Two defaults match what most
+profiling work needs:
+
+- **detach-pages is on.** It's the shipping fix; matching production
+  is the right baseline for any profiling work. Pass
+  `--no-detach-pages` for an A/B against the original O(n²) baseline.
+- **timing is off.** The `timing-handler.js` per-page `console.log`
+  relay costs ~2 % of render self-time on the 1638-page book and
+  muddies bottom-up profile tables. Pass `--timing` when you want the
+  per-page CSV + first/last-quartile summary; otherwise `timing.csv`
+  holds only its header row and `summary.txt` says so.
 
 Flag rationale:
 
-- `--no-timing` -- skip the per-page `console.log` relay from
-  `timing-handler.js`. The relay costs ~2 % of render self-time on
-  the 1638-page book and muddies the bottom-up view.
 - `--render-only` -- bail out after `PagedPolyfill.preview()`
   returns. Skips meta extraction, `parseOutline`, `page.pdf`, and
   the pdf-lib roundtrip / incremental writer. ~47 s saved per run
@@ -66,7 +69,7 @@ The harness and core probes:
 | --- | --- |
 | `measure.mjs` | Puppeteer harness. Drives the same flow as `docs/render-book.mjs` (loads the vendored paged.js bundle, runs `PagedPolyfill.preview()`, calls `page.pdf()`, then either the pdf-lib roundtrip or the incremental writer), with optional CPU profiling, in-page handler injection, and DOM-accessor instrumentation. Auto-pins to a fixed core mask on Windows via `pin-cpu.mjs` (see below) for stable measurements; pass `--no-affinity` to opt out. |
 | `pin-cpu.mjs` | Shared shim used by `measure.mjs`, `profile-load.mjs`, `profile-roundtrip.mjs`, and `ab-css.mjs`. On Windows, auto-relaunches the parent Node process under `start /affinity 0x5500 /high` (cores 4-7 physical, thread 0 each, on an 8C16T AMD Ryzen 7) so puppeteer's Chromium children inherit the mask + priority at spawn time. Reduces single-run CPU sample-time variance from ~15-25 % on a stock dev box to ~3 %. No-op on non-Windows; opt out per-invocation with `--no-affinity` or `PERF_PINNED=1`; override mask with `PERF_AFFINITY=<hex>`. |
-| `timing-handler.js` | `Paged.Handler` that records per-page wall time + heap into `window.__pagedTiming` and streams a line per page to the console. Always injected. |
+| `timing-handler.js` | `Paged.Handler` that records per-page wall time + heap into `window.__pagedTiming` and streams a line per page to the console. Injected when `--timing` is passed; off by default because the per-page console relay costs ~2 % of render self-time. |
 | `detach-pages.js` | `Paged.Handler` that hides each completed page from the layout tree (registered against `finalizePage`). The shipping fix. Injected by default (both by `measure.mjs` and by `docs/book.bat`); pass `--no-detach-pages` to measure the pre-fix baseline. |
 | `instrument-flush-ops.js` | Wraps `getComputedStyle`, `getBoundingClientRect`, and the `offsetWidth` / `clientWidth` / `scrollWidth` family with counters + per-call timing. Injected by `--instrument`. |
 | `instrument-detach.js` | Counters around `detach-pages.js`'s removeChild / restore cycle. |
@@ -168,6 +171,7 @@ run.bat                                   # defaults to ..\docs\_site-pdf\book.h
 run.bat path\to\some-other.html           # explicit input
 run.bat --out my-run                      # explicit output directory
 run.bat --no-detach-pages                 # opt out of the detach-pages fix (measure pre-fix O(n²) baseline)
+run.bat --timing                          # collect per-page wall time + heap (fills timing.csv + adds quartile summary)
 run.bat --cpu-profile                     # CPU-profile the render phase
 run.bat --render-only                     # bail out after render (skip generate + process, ~47s saved)
 run.bat --clone-count                     # report Layout.append clones appended vs survivors per page
@@ -189,17 +193,17 @@ Outputs land in `perf/results/<ISO-timestamp>/`:
 
 - `book.pdf`    -- the rendered PDF, byte-equivalent to what
   `book.bat` produces.
-- `timing.json` -- full record: phase totals, sub-phase breakdowns
-  (`parseOutline`, `page.pdf`, pdf-lib load / setOutline / save),
-  and the per-page render entries.
-- `timing.csv`  -- one row per page,
-  `page,dur_ms,heap_start_mb,heap_end_mb,elapsed_s`.
-- `summary.txt` -- the three phase totals, plus first-quarter vs
-  last-quarter average per-page render cost and ratio.
+- `timing.json` -- phase totals + sub-phase breakdowns
+  (`parseOutline`, `page.pdf`, pdf-lib load / setOutline / save).
+  Per-page render entries are populated only when `--timing` is set.
+- `timing.csv`  -- one row per page,
+  `page,dur_ms,heap_start_mb,heap_end_mb,elapsed_s`. Header only without `--timing`.
+- `summary.txt` -- the three phase totals; with `--timing` also adds
+  first-quarter vs last-quarter average per-page render cost + ratio.
 
 ## Reading the output
 
-The summary prints something like:
+With `--timing`, the summary prints something like:
 
 ```
 pages        : 1638
diff --git a/perf/ab-css.mjs b/perf/ab-css.mjs
index b827110..e6c9c13 100644
--- a/perf/ab-css.mjs
+++ b/perf/ab-css.mjs
@@ -165,7 +165,7 @@ if (swappedHtml === BOOK_HTML) {
 function runOnce(outDir) {
   const r = spawnSync('node', [
     'measure.mjs', SWAP_HTML_PATH,
-    '--no-timing', '--render-only', '--tracing',
+    '--render-only', '--tracing',
     '--out', outDir,
   ], { stdio: ['ignore', 'pipe', 'pipe'] });
   const err = r.stderr?.toString() ?? '';
diff --git a/perf/instrument-clones.js b/perf/instrument-clones.js
index d625df2..c2eb96f 100644
--- a/perf/instrument-clones.js
+++ b/perf/instrument-clones.js
@@ -12,9 +12,11 @@
 //   - At afterRendered, summarise totals + per-page distribution.
 //
 // Cost: O(1) per append + one tree walk per finalized page. Run with
-//   --no-timing --additional-script ..\perf\instrument-clones.js
+//   --additional-script ..\perf\instrument-clones.js
 // from a measure.mjs invocation. (detach-pages.js is on by default;
-// add --no-detach-pages to compare against the pre-fix baseline.)
+// add --no-detach-pages to compare against the pre-fix baseline.
+// The timing handler is off by default; if you also pass --timing,
+// its per-page console relay will mix with this probe's output.)
 // Numbers are reported via console.log which measure.mjs forwards
 // to stdout.
 
diff --git a/perf/measure.mjs b/perf/measure.mjs
index 6737e0a..c3ec049 100644
--- a/perf/measure.mjs
+++ b/perf/measure.mjs
@@ -25,7 +25,7 @@
 //                    [--heap-profile] [--heap-sampling <bytes>]
 //                    [--tracing]
 //                    [--no-detach-pages] [--instrument] [--time-hooks]
-//                    [--incremental] [--chrome-outline] [--no-timing]
+//                    [--incremental] [--chrome-outline] [--timing]
 //                    [--clone-count] [--render-only]
 //
 // --render-only bails out after the render phase. Skips meta extraction,
@@ -34,11 +34,13 @@
 // phase matters; trims ~45s off the full ~55s book run. No book.pdf is
 // written, and the timing.json / summary.txt omit generate/process.
 //
-// --no-timing skips the per-page timing-handler.js injection. The handler
-// adds a per-page console.log relayed via CDP that costs ~2% of render
-// self-time on the 1638-page book. Use when profiling for the cleanest
-// possible bottom-up table; loses the per-page CSV and the first/last
-// quartile summary in return.
+// --timing injects timing-handler.js. The handler records per-page wall
+// time + heap to window.__pagedTiming (so the harness can emit
+// timing.csv and the first/last-quartile summary) and streams a per-page
+// console.log relayed via CDP. The relay costs ~2 % of render self-time
+// on the 1638-page book, which is why the handler isn't on by default --
+// profile-clean runs and most A/B comparisons don't need it. Pass it
+// when you want the per-page CSV.
 //
 // detach-pages.js is injected by default -- a Paged.Handler that hides
 // each completed page from the layout tree. This is the shipping fix
@@ -115,7 +117,7 @@ let instrument = false;
 let timeHooks = false;
 let incremental = false;
 let chromeOutline = false;
-let noTiming = false;
+let timing = false;
 let cloneCount = false;
 let renderOnly = false;
 let tracing = false;
@@ -133,7 +135,8 @@ for (let i = 0; i < args.length; i++) {
   else if (a === '--time-hooks') timeHooks = true;
   else if (a === '--incremental') incremental = true;
   else if (a === '--chrome-outline') chromeOutline = true;
-  else if (a === '--no-timing') noTiming = true;
+  else if (a === '--timing') timing = true;
+  else if (a === '--no-timing') timing = false;             // accepted for backwards compat; default since the relay cost was measured
   else if (a === '--clone-count') cloneCount = true;
   else if (a === '--render-only') renderOnly = true;
   else if (a === '--tracing') tracing = true;
@@ -159,7 +162,7 @@ const instrumentPath   = resolve(__dirname, 'instrument-flush-ops.js');
 const timeHooksPath    = resolve(__dirname, 'time-hooks.js');
 const cloneCountPath   = resolve(__dirname, 'instrument-clones.js');
 const required = [pagedScriptPath];
-if (!noTiming)  required.push(handlerPath);
+if (timing)     required.push(handlerPath);
 if (detachPages) required.push(detachPagesPath);
 if (instrument)  required.push(instrumentPath);
 if (timeHooks)   required.push(timeHooksPath);
@@ -241,7 +244,7 @@ try {
   });
 
   await page.addScriptTag({ path: pagedScriptPath });
-  if (!noTiming) {
+  if (timing) {
     await page.addScriptTag({ path: handlerPath });
   }
   if (detachPages) {
@@ -471,9 +474,9 @@ try {
   console.log(`[harness] total    ${fmtMs(totalMs)}`);
 
   // Persist results -------------------------------------------------
-  const timing = noTiming
-    ? { pages: [], phases: {}, pageCount: null }
-    : await page.evaluate(() => window.__pagedTiming);
+  const timingData = timing
+    ? await page.evaluate(() => window.__pagedTiming)
+    : { pages: [], phases: {}, pageCount: null };
   if (finalPdf) {
     const pdfPath = join(outDir, 'book.pdf');
     writeFileSync(pdfPath, Buffer.from(finalPdf));
@@ -481,14 +484,14 @@ try {
 
   const record = {
     input: inputPath,
-    pageCount: timing.pageCount,
+    pageCount: timingData.pageCount,
     pdfBytes: finalPdf ? finalPdf.length : null,
     cpuProfile: profilePath,
     phases: {
       render: {
         ms: renderMs,
-        perPage: timing.pages,
-        phaseMarks: timing.phases,
+        perPage: timingData.pages,
+        phaseMarks: timingData.phases,
       },
     },
     totalMs,
@@ -509,7 +512,7 @@ try {
   writeFileSync(join(outDir, 'timing.json'), JSON.stringify(record, null, 2));
 
   const csv = ['page,dur_ms,heap_start_mb,heap_end_mb,elapsed_s'];
-  for (const p of timing.pages) {
+  for (const p of timingData.pages) {
     csv.push([
       p.idx,
       p.dur.toFixed(2),
@@ -520,10 +523,14 @@ try {
   }
   writeFileSync(join(outDir, 'timing.csv'), csv.join('\n'));
 
-  const pages = timing.pages;
+  const pages = timingData.pages;
   const summary = [];
   summary.push(`input        : ${inputPath}`);
-  summary.push(`pages        : ${pages.length}`);
+  if (timing) {
+    summary.push(`pages        : ${pages.length}`);
+  } else {
+    summary.push(`pages        : (per-page timing not collected; pass --timing for the CSV)`);
+  }
   if (finalPdf) {
     summary.push(`pdf size     : ${(finalPdf.length / 1024 / 1024).toFixed(1)} MB`);
   }