From ebdb817b514118bf1d14652cdd5457663806eae4 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Fri, 22 May 2026 22:15:32 +0200 Subject: [PATCH 01/18] Disable per-page ResizeObserver: it caught no real reflows (~130ms saved). --- docs/lib/paged.browser.js | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/docs/lib/paged.browser.js b/docs/lib/paged.browser.js index 8d212d9..f681876 100644 --- a/docs/lib/paged.browser.js +++ b/docs/lib/paged.browser.js @@ -2486,29 +2486,22 @@ } addResizeObserver(contents) { - let wrapper = this.wrapper; - let prevHeight = wrapper.getBoundingClientRect().height; - this.ro = new ResizeObserver(entries => { - - if (!this.listening) { - return; - } - requestAnimationFrame(() => { - for (let entry of entries) { - const cr = entry.contentRect; - - if (cr.height > prevHeight) { - this.checkOverflowAfterResize(contents); - prevHeight = wrapper.getBoundingClientRect().height; - } else if (cr.height < prevHeight) { // TODO: calc line height && (prevHeight - cr.height) >= 22 - this.checkUnderflowAfterResize(contents); - prevHeight = cr.height; - } - } - }); - }); - - this.ro.observe(wrapper); + // [PATCH: disable-resize-observer] The RO existed to catch + // post-layout content reflow -- late-loading fonts, image + // dimensions resolving after layout, etc. -- by re-running + // findBreakToken whenever the wrapper grew/shrunk after + // renderTo returned. Our pipeline navigates with + // `waitUntil: "load"` and uses embedded fonts; nothing + // resizes after layout. The `_onOverflow` rescue path + // (Chunker.addPage line 3296) only fires while + // `!chunker.rendered`, and would emit a console.warn + // before re-rendering, so a regression would be loud. + // Disabling the RO removes a per-page allocation plus the + // stream of async findBreakToken / gBCR calls its callback + // would otherwise drive after every page's renderTo. + // checkUnderflowAfterResize is already gated by an absent + // _onUnderflow (see README "Attempt C"); checkOverflowAfterResize + // was the only live consumer. } checkOverflowAfterResize(contents) { From 78fccf043805b0d41ab9e5f49fabbc4baf290e2e Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Fri, 22 May 2026 22:46:00 +0200 Subject: [PATCH 02/18] Add Chrome trace analysis. --- perf/README.md | 2 + perf/analyze-trace.mjs | 163 +++++++++++++++++++++++++++++++++++++++++ perf/measure.mjs | 44 +++++++++++ 3 files changed, 209 insertions(+) create mode 100644 perf/analyze-trace.mjs diff --git a/perf/README.md b/perf/README.md index 135d2de..1385d1f 100644 --- a/perf/README.md +++ b/perf/README.md @@ -107,6 +107,7 @@ DevTools-compatible trace is a few lines. | `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. | | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. | | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. | +| `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). | | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. | | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). | | `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" | @@ -164,6 +165,7 @@ run.bat --instrument # count + time DOM-accessor calls run.bat --time-hooks # per-task timing of every chunker/polisher hook run.bat --incremental # process via incremental update instead of pdf-lib roundtrip run.bat --chrome-outline # let Chrome emit /Outlines (skip parseOutline + setOutline) +run.bat --tracing # capture a Chrome trace of the render phase ``` Flags compose. The CPU profile lands as `render.cpuprofile` diff --git a/perf/analyze-trace.mjs b/perf/analyze-trace.mjs new file mode 100644 index 0000000..cefc6cf --- /dev/null +++ b/perf/analyze-trace.mjs @@ -0,0 +1,163 @@ +// Bottom-up Chrome trace analyzer. +// +// Reads a trace.json produced by `node measure.mjs --tracing` (or any +// other source -- the trace format is Chrome's standard "JSON Object +// Format", https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU) +// and prints the top events on the renderer's main thread by self-time, +// aggregated by event name. Use it to break the cpu profile's (program) +// frame down into named Blink/V8 work (Layout, UpdateLayoutTree, ParseHTML, +// V8.CompileCode, V8.RunMicrotasks, etc.). +// +// Usage: +// node analyze-trace.mjs [--top N] [--min-pct P] +// [--thread ] [--all-threads] +// +// Defaults: --top 30, --min-pct 0.1 (hide rows under 0.1% self-time), +// thread = CrRendererMain (the V8 / DOM / Blink layout thread). +// +// Self-time is computed by walking 'X' (complete) events in ts order on +// each thread independently, subtracting nested children from each +// parent's duration. Matches the "Bottom-Up" view in chrome://tracing +// and DevTools' Performance panel when grouped by event name. + +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; + +const args = process.argv.slice(2); +let tracePath = null; +let topN = 30; +let minPct = 0.1; +let threadName = 'CrRendererMain'; +let allThreads = false; +for (let i = 0; i < args.length; i++) { + const a = args[i]; + if (a === '--top') topN = parseInt(args[++i], 10); + else if (a === '--min-pct') minPct = parseFloat(args[++i]); + else if (a === '--thread') threadName = args[++i]; + else if (a === '--all-threads') allThreads = true; + else if (!tracePath) tracePath = a; +} +if (!tracePath) { + console.error('usage: node analyze-trace.mjs [--top N] [--min-pct P] [--thread NAME] [--all-threads]'); + process.exit(2); +} +tracePath = resolve(process.cwd(), tracePath); + +const trace = JSON.parse(readFileSync(tracePath, 'utf8')); +const events = Array.isArray(trace) ? trace : (trace.traceEvents || []); + +// Thread/process metadata events declare human-readable names. We use +// these to identify the main renderer thread (default CrRendererMain). +const threadNames = new Map(); // key=`${pid}.${tid}` -> name +const processNames = new Map(); // key=pid -> name +for (const e of events) { + if (e.ph !== 'M') continue; + if (e.name === 'thread_name' && e.args && e.args.name) { + threadNames.set(`${e.pid}.${e.tid}`, e.args.name); + } else if (e.name === 'process_name' && e.args && e.args.name) { + processNames.set(e.pid, e.args.name); + } +} + +// Bucket non-metadata 'X' events (complete events with dur) by thread. +// 'B'/'E' pairs are rare in devtools.timeline + v8 categories but we +// fold them in for robustness: a 'B' is matched with the next 'E' of +// the same name on the same thread. +const byThread = new Map(); +const openBE = new Map(); // key=`${pid}.${tid}.${name}` -> stack of B events +for (const e of events) { + if (!e.ph || !e.pid) continue; + const tk = `${e.pid}.${e.tid}`; + if (e.ph === 'X') { + if (typeof e.dur !== 'number' || e.dur < 0) continue; + if (!byThread.has(tk)) byThread.set(tk, []); + byThread.get(tk).push({ ts: e.ts, dur: e.dur, name: e.name, cat: e.cat }); + } else if (e.ph === 'B') { + const k = `${tk}.${e.name}`; + if (!openBE.has(k)) openBE.set(k, []); + openBE.get(k).push(e); + } else if (e.ph === 'E') { + const k = `${tk}.${e.name}`; + const stack = openBE.get(k); + if (stack && stack.length) { + const b = stack.pop(); + const dur = e.ts - b.ts; + if (dur >= 0) { + if (!byThread.has(tk)) byThread.set(tk, []); + byThread.get(tk).push({ ts: b.ts, dur, name: e.name, cat: e.cat || b.cat }); + } + } + } +} + +// Pick the thread(s) to report. +const targetThreads = []; +for (const [tk, name] of threadNames) { + if (allThreads || name === threadName) { + targetThreads.push({ tk, name, pid: parseInt(tk.split('.')[0], 10) }); + } +} +if (!targetThreads.length) { + console.error(`no thread matched --thread "${threadName}". Threads present:`); + for (const [tk, name] of threadNames) console.error(` ${name} (${tk})`); + console.error('Pass --all-threads to aggregate across every thread, or --thread NAME to pick one.'); + process.exit(3); +} + +// Per-thread self-time computation via depth-walk over X events. +// Sort by ts ascending; on tie, longer dur first so a containing +// event lands on the stack before its child. +const selfByName = new Map(); // name -> { self_us, cat } +let totalEvents = 0; +let traceMinTs = Infinity, traceMaxTs = -Infinity; +for (const { tk, name: tname } of targetThreads) { + const list = byThread.get(tk); + if (!list || !list.length) continue; + list.sort((a, b) => a.ts - b.ts || b.dur - a.dur); + totalEvents += list.length; + const stack = []; + const flush = (top) => { + const self = top.dur - top.childTime; + if (self <= 0) return; + const cur = selfByName.get(top.name) || { self_us: 0, cat: top.cat || '' }; + cur.self_us += self; + if (!cur.cat && top.cat) cur.cat = top.cat; + selfByName.set(top.name, cur); + }; + for (const e of list) { + if (e.ts < traceMinTs) traceMinTs = e.ts; + if (e.ts + e.dur > traceMaxTs) traceMaxTs = e.ts + e.dur; + while (stack.length && stack[stack.length - 1].endTs <= e.ts) { + flush(stack.pop()); + } + if (stack.length) stack[stack.length - 1].childTime += e.dur; + stack.push({ name: e.name, cat: e.cat, dur: e.dur, endTs: e.ts + e.dur, childTime: 0 }); + } + while (stack.length) flush(stack.pop()); +} + +const traceDurUs = (traceMaxTs > traceMinTs) ? (traceMaxTs - traceMinTs) : 0; + +const totalSelfUs = [...selfByName.values()].reduce((s, x) => s + x.self_us, 0); +const rows = [...selfByName.entries()] + .map(([name, v]) => ({ + name, + cat: v.cat, + self_ms: v.self_us / 1000, + pct: 100 * v.self_us / (totalSelfUs || 1), + })) + .sort((a, b) => b.self_ms - a.self_ms) + .filter(r => r.pct >= minPct) + .slice(0, topN); + +const fmt = (n, w) => n.toFixed(2).padStart(w); +console.log(`trace: ${tracePath}`); +console.log(`events: ${totalEvents} thread${allThreads ? 's' : ''}: ${allThreads ? 'all' : threadName} span: ${(traceDurUs / 1e6).toFixed(2)}s`); +console.log(`total self: ${(totalSelfUs / 1000).toFixed(2)}ms across ${selfByName.size} distinct event names`); +console.log(`top ${topN} by self-time (min ${minPct}%):`); +console.log(''); +console.log(' self_ms self_% event @ category'); +console.log(' ------- ------ ----------------------------------------------'); +for (const r of rows) { + console.log(` ${fmt(r.self_ms, 8)} ${fmt(r.pct, 5)}% ${r.name} @ ${r.cat || '(no cat)'}`); +} diff --git a/perf/measure.mjs b/perf/measure.mjs index c9df9cf..6161ce7 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -23,6 +23,7 @@ // node measure.mjs [path/to/book.html] [--out ] [--keep-open] // [--cpu-profile] [--cpu-sampling ] // [--heap-profile] [--heap-sampling ] +// [--tracing] // [--detach-pages] [--instrument] [--time-hooks] // [--incremental] [--chrome-outline] [--no-timing] // [--clone-count] [--render-only] @@ -67,6 +68,15 @@ // via Performance -> "Load profile..." (or just drag onto the panel). // --cpu-sampling sets the sampling interval in microseconds; default // 1000 (1 ms). Raise it to keep the profile file smaller on long runs. +// +// --tracing wraps the render phase in a Chrome trace via CDP's Tracing +// domain (page.tracing.start) and writes trace.json to the results +// folder. The trace categorises Blink work as Layout / UpdateLayoutTree +// / ParseHTML / Composite / FunctionCall / V8.* etc -- the named buckets +// hiding inside the cpu profile's (program) frame. Load the file in +// chrome://tracing or perfetto.dev, or run analyze-trace.mjs against it +// for a top-N self-time table grouped by event name. Composable with +// --cpu-profile; uses an independent CDP domain. import { pathToFileURL, fileURLToPath } from 'node:url'; import { dirname, resolve, join } from 'node:path'; @@ -99,6 +109,7 @@ let chromeOutline = false; let noTiming = false; let cloneCount = false; let renderOnly = false; +let tracing = false; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--out') outArg = args[++i]; @@ -115,6 +126,7 @@ for (let i = 0; i < args.length; i++) { else if (a === '--no-timing') noTiming = true; else if (a === '--clone-count') cloneCount = true; else if (a === '--render-only') renderOnly = true; + else if (a === '--tracing') tracing = true; else if (!inputArg) inputArg = a; else { console.error(`unknown arg: ${a}`); process.exit(2); } } @@ -247,6 +259,29 @@ try { await cdp.send('HeapProfiler.startSampling', { samplingInterval: heapSampling }); console.log(`[harness] heap profile: sampling every ${heapSampling} bytes`); } + let tracePath = null; + if (tracing) { + // Independent of Profiler / HeapProfiler -- different CDP domain. + // Categories chosen to crack open the cpu profile's (program) bucket: + // devtools.timeline gives Layout / RecalcStyles / ParseHTML / + // FunctionCall / EvaluateScript; disabled-by-default-devtools.timeline + // adds UpdateLayoutTree / InvalidateLayout / ScheduleStyleRecalc / + // HitTest; blink covers internal Blink events; v8 + v8.execute cover + // V8.GC* / V8.CompileCode / V8.RunMicrotasks / V8.Execute. + tracePath = join(outDir, 'trace.json'); + await page.tracing.start({ + path: tracePath, + screenshots: false, + categories: [ + 'devtools.timeline', + 'disabled-by-default-devtools.timeline', + 'blink', + 'v8', + 'v8.execute', + ], + }); + console.log(`[harness] tracing: ${tracePath}`); + } const tRenderStart = Date.now(); await page.evaluate(async () => { @@ -266,6 +301,15 @@ try { const tRenderEnd = Date.now(); const renderMs = tRenderEnd - tRenderStart; + if (tracing) { + await page.tracing.stop(); + try { + const { statSync } = await import('node:fs'); + const sz = statSync(tracePath).size; + console.log(`[harness] tracing: ${tracePath} (${(sz / 1024 / 1024).toFixed(1)} MB)`); + } catch { /* size reporting is best-effort */ } + } + let profilePath = null; let heapProfilePath = null; if (cdp) { From 1c665b2458672b80a729a1650dd8ccf68a2d2442 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Fri, 22 May 2026 23:50:57 +0200 Subject: [PATCH 03/18] Compress book.html; layer :post_render hooks by role. Extend html-compress to mark book-combined as compress-eligible so book.html collapses inter-element whitespace at Jekyll time instead of paged.js's WhiteSpaceFilter doing ~37k DOM mutations at render time. Reorder :pages, :post_render and :documents, :post_render hooks into a three-tier convention so adding compress to book.html composes correctly with the other plugins: :high mutators (book-href-rewrite) :normal compress (html-compress) :low readers (pdfify capture, offlinify per-page rewrite) Without the layering, book-href-rewrite's landing-heading strip ran after compress, leaving adjacent single-space runs that no downstream pass collapsed. The 3-tier ordering makes "compress is the last cleanup pass among mutators" and "readers see final compressed bytes" hold by construction. Verified: 0 outside-pre multi-whitespace runs in the regenerated book.html (was 37,087 without compress). Branch-counting the WhiteSpaceFilter post-fix shows DOM mutations drop from ~37k to 0. Ruby-prof A/B confirms the priority shuffle is CPU-invariant; the only attributable cost is one extra compress! call (~480 ms once per Jekyll build, ~300-500 ms saved per paged.js render). Adds analyze-trace.mjs --children mode used to localise this during the investigation. Full writeup in perf/README.md and docs/_plugins/html-compress.md. --- WIP.md | 2 +- docs/_plugins/book-href-rewrite.rb | 6 +- docs/_plugins/html-compress.md | 34 +++- docs/_plugins/html-compress.rb | 43 ++++- docs/_plugins/offlinify.rb | 6 +- docs/_plugins/pdfify.rb | 4 +- perf/README.md | 301 +++++++++++++++++++++++++++++ perf/analyze-trace.mjs | 95 +++++++-- 8 files changed, 463 insertions(+), 28 deletions(-) diff --git a/WIP.md b/WIP.md index 3bd2998..ef72dfa 100644 --- a/WIP.md +++ b/WIP.md @@ -433,7 +433,7 @@ From `docs/`: - `check.bat` — link check (offline Lychee against `_site/`). - `book.bat` — renders the PDF from `_site-pdf/book.html` via `pagedjs-cli` into `_pdf/book.pdf`. Run `build.bat` first to populate `_site-pdf/`. -The HTML whitespace compression that wraps every page's render chain is handled by `_plugins/html-compress.rb` rather than the just-the-docs theme's `vendor/compress.html` Liquid layout — see [_plugins/html-compress.md](docs/_plugins/html-compress.md) for the full writeup. The Liquid layout's per-page cost in the profile was ~2.4s of Liquid filter dispatch (a `split: " " | join: " "` over the outside-of-`
` content, lowering to a per-page Array allocation of every whitespace-delimited token across 837 pages — millions of small `String` objects). The layout is short-circuited via `compress_html.ignore.envs: all` in `_config.yml`; it then outputs a bare `{{ content }}` and the plugin takes over at `:pages, :post_render` / `:documents, :post_render` with `priority :high`, doing the same pre-block-protected whitespace collapse via `content.split(PRE_BLOCK_RE).each { |s| s.split(" ").join(" ") }` in C-implemented Ruby. The `priority :high` annotation places this hook before offlinify and pdfify (both `:normal`) so they see the compressed bytes. Pages whose layout chain doesn't reach `vendor/compress` are gated out via a `:site, :pre_render` precompute that walks `site.layouts[name].data["layout"]` for every layout key and marks the entire compress-reaching chain (default → table_wrappers → vendor/compress) -- jekyll-redirect-from stubs, the SCSS-derived CSS pages, `assets/js/zzzz-search-data.json`, and `book.html` (which uses the minimal `book-combined` layout that has no parent) all stay un-gated and pass through verbatim, matching exactly what the Liquid layout would have processed. Output is byte-identical to the layout-based version: a recursive `diff -rq` of `_site/` against a vendor/compress.html baseline reports zero differences across all ~840 HTML pages, 290 redirect stubs, every CSS / JSON / SVG / image asset. The plugin's correctness depended on two non-obvious details that broke an earlier cut -- the layout-chain walk has to compare against the layout *key* (`"vendor/compress"`) rather than `layout.name` (which carries the `.html` extension), and the per-segment `split(" ").join(" ")` strips trailing whitespace that the Liquid layout's *template* re-adds via its trailing-newline source character, so the plugin captures `content.end_with?("\n")` before the split and re-appends a `\n` after the join. Both regressions surfaced as nonzero `diff -rq` counts during development and are flagged in the plugin's header comment and [_plugins/html-compress.md](docs/_plugins/html-compress.md).
+The HTML whitespace compression that wraps every page's render chain is handled by `_plugins/html-compress.rb` rather than the just-the-docs theme's `vendor/compress.html` Liquid layout — see [_plugins/html-compress.md](docs/_plugins/html-compress.md) for the full writeup. The Liquid layout's per-page cost in the profile was ~2.4s of Liquid filter dispatch (a `split: " " | join: " "` over the outside-of-`
` content, lowering to a per-page Array allocation of every whitespace-delimited token across 837 pages — millions of small `String` objects). The layout is short-circuited via `compress_html.ignore.envs: all` in `_config.yml`; it then outputs a bare `{{ content }}` and the plugin takes over at `:pages, :post_render` / `:documents, :post_render` with `priority :normal`, doing the same pre-block-protected whitespace collapse via `content.split(PRE_BLOCK_RE).each { |s| s.split(" ").join(" ") }` in C-implemented Ruby. The `:normal` priority is the *middle* tier of a three-level convention across the site's `:post_render` hooks: mutators (`book-href-rewrite`) run at `:high`, this cleanup pass at `:normal`, readers (`pdfify`, `offlinify`) at `:low`. The invariant "compress runs after every mutator and before every reader" therefore holds by construction; no downstream plugin has to be whitespace-aware. Pages whose layout chain doesn't reach `vendor/compress` are gated out via a `:site, :pre_render` precompute that walks `site.layouts[name].data["layout"]` for every layout key and marks the entire compress-reaching chain (default → table_wrappers → vendor/compress) -- jekyll-redirect-from stubs, the SCSS-derived CSS pages, and `assets/js/zzzz-search-data.json` all stay un-gated and pass through verbatim. `book.html` (which uses the minimal `book-combined` layout that has no parent) is *also* outside that chain but is explicitly added to the compress-eligible set at the end of the precompute, so the same whitespace collapse runs on it -- saves paged.js's render-time `WhiteSpaceFilter` ~37k DOM mutations (~28k `textContent` overwrites + ~9k `removeChild` calls) at the cost of ~480 ms once per Jekyll build. Output is byte-identical to the layout-based version: a recursive `diff -rq` of `_site/` against a vendor/compress.html baseline reports zero differences across all ~840 HTML pages, 290 redirect stubs, every CSS / JSON / SVG / image asset. The plugin's correctness depended on two non-obvious details that broke an earlier cut -- the layout-chain walk has to compare against the layout *key* (`"vendor/compress"`) rather than `layout.name` (which carries the `.html` extension), and the per-segment `split(" ").join(" ")` strips trailing whitespace that the Liquid layout's *template* re-adds via its trailing-newline source character, so the plugin captures `content.end_with?("\n")` before the split and re-appends a `\n` after the join. Both regressions surfaced as nonzero `diff -rq` counts during development and are flagged in the plugin's header comment and [_plugins/html-compress.md](docs/_plugins/html-compress.md).
 
 ### Profiling the build
 
diff --git a/docs/_plugins/book-href-rewrite.rb b/docs/_plugins/book-href-rewrite.rb
index add4b3e..28acdc2 100644
--- a/docs/_plugins/book-href-rewrite.rb
+++ b/docs/_plugins/book-href-rewrite.rb
@@ -372,7 +372,11 @@ def self.process(page)
   end
 end
 
-Jekyll::Hooks.register :pages, :post_render do |page|
+# :high so this MUTATOR runs before html-compress (priority :normal).
+# Otherwise the landing-heading strip leaves a double-space run that
+# no downstream pass cleans up. See html-compress.rb's priority
+# convention comment for the full layering.
+Jekyll::Hooks.register :pages, :post_render, priority: :high do |page|
   next unless page.path == "book.html"
   BookHrefRewrite.process(page)
 end
diff --git a/docs/_plugins/html-compress.md b/docs/_plugins/html-compress.md
index 114e388..7430862 100644
--- a/docs/_plugins/html-compress.md
+++ b/docs/_plugins/html-compress.md
@@ -1,6 +1,6 @@
 # HtmlCompress
 
-`_plugins/html-compress.rb` runs the HTML whitespace compression that wraps every page's render chain — the same job just-the-docs's vendor/compress.html Liquid layout was doing, but in Ruby instead of Liquid filters. Output is byte-identical to the layout-based version (verified by recursive diff of every file in `_site/` against a vendor/compress.html baseline). The Liquid layout is short-circuited to a `{{ content }}` passthrough via `compress_html.ignore.envs: all` in `_config.yml`; the plugin then runs at `:pages, :post_render` / `:documents, :post_render` with `priority :high`, so the compressed bytes are what offlinify and Jekyll's writer see.
+`_plugins/html-compress.rb` runs the HTML whitespace compression that wraps every page's render chain — the same job just-the-docs's vendor/compress.html Liquid layout was doing, but in Ruby instead of Liquid filters. Output is byte-identical to the layout-based version for the 837 vendor/compress-reaching pages (verified by recursive diff of every file in `_site/` against a vendor/compress.html baseline). The Liquid layout is short-circuited to a `{{ content }}` passthrough via `compress_html.ignore.envs: all` in `_config.yml`; the plugin then runs at `:pages, :post_render` / `:documents, :post_render` with `priority :normal` as the *cleanup* step in a three-tier `:high` → `:normal` → `:low` ordering (mutators → compress → readers — see [Hook priority convention](#hook-priority-convention) below). It also picks up one page the original layout didn't process, `book.html`, via an explicit `book-combined` addition to the compress-eligible set — see [book.html inclusion](#bookhtml-inclusion).
 
 This file sits in `_plugins/` for the same reasons as `offlinify.md` and `pdfify.md`: it lives next to the code it documents, and Jekyll's `_plugins/` folder is plugin-only territory, so this Markdown never gets rendered into the public site.
 
@@ -74,7 +74,7 @@ page.md   (layout: default)
         └── vendor/compress.html (no layout)
 ```
 
-Pages that don't use any of these layouts — jekyll-redirect-from stubs, the SCSS-derived CSS pages, `assets/js/zzzz-search-data.json`, `book.html` (which uses the minimal `book-combined` layout that has no parent) — were left untouched by the layout. The plugin has to match that gating, otherwise it would compress files that compress.html doesn't, breaking byte-identity.
+Pages that don't use any of these layouts — jekyll-redirect-from stubs, the SCSS-derived CSS pages, `assets/js/zzzz-search-data.json` — were left untouched by the layout. The plugin has to match that gating, otherwise it would compress files that compress.html doesn't, breaking byte-identity. `book.html` (which uses the minimal `book-combined` layout that has no parent) was originally in this list, but is now explicitly added to the compress-eligible set — see [book.html inclusion](#bookhtml-inclusion).
 
 The gate is precomputed once at `:site, :pre_render`:
 
@@ -114,20 +114,42 @@ Jekyll::Hooks.register :site, :pre_render do |site|
   HtmlCompress.precompute_compress_layouts!(site)
 end
 
-Jekyll::Hooks.register :pages, :post_render, priority: :high do |page|
+Jekyll::Hooks.register :pages, :post_render, priority: :normal do |page|
   next unless page.output.is_a?(String)
   next unless HtmlCompress.compress?(page)
   HtmlCompress.compress!(page.output)
 end
 
-Jekyll::Hooks.register :documents, :post_render, priority: :high do |doc|
+Jekyll::Hooks.register :documents, :post_render, priority: :normal do |doc|
   next unless doc.output.is_a?(String)
   next unless HtmlCompress.compress?(doc)
   HtmlCompress.compress!(doc.output)
 end
 ```
 
-The `priority: :high` is what places the plugin *before* `offlinify.rb` and `pdfify.rb` in the per-page render-hook order — both of those use the default `:normal` priority and rely on reading the final compressed `page.output`. Jekyll runs `:post_render` hooks in descending priority, so `:high` (30) fires before `:normal` (20). Without the priority annotation the order would be insertion-order across all `.rb` files in `_plugins/`, which is not a stable contract.
+## Hook priority convention
+
+The `priority: :normal` is the middle tier of a three-level ordering for `:pages, :post_render` and `:documents, :post_render` hooks across the plugin set. Jekyll runs hooks in descending priority (`:high` (30) → `:normal` (20) → `:low` (10)), and the three tiers carry distinct roles:
+
+| Tier | Role | Plugins |
+| --- | --- | --- |
+| `:high` (30) | **Mutators.** Modify `page.output` so the final bytes reflect this pass. | `book-href-rewrite` (chapter href rewrites + landing-heading strip on `book.html`). |
+| `:normal` (20) | **Compress.** The cleanup pass. Sandwiched between mutators and readers so any whitespace runs left behind by a mutator's `gsub` get collapsed before any reader captures the bytes. | `html-compress` (this plugin). |
+| `:low` (10) | **Readers.** Snapshot or consume `page.output` after the cleanup pass. | `pdfify` (captures `book.html` for the PDF pipeline), `offlinify` (per-page href / src rewrites + write to `_site-offline/`). |
+
+The layering was originally implicit: the plugin sat at `:high` next to no other priority-annotated `:post_render` hooks. That worked until `book-href-rewrite` joined the set at default `:normal`. Its landing-heading strip ran *after* compress, removing `

` blocks but leaving the (already-collapsed) single-space runs on either side adjacent — producing literal `> <` blobs in three chapter openings that paged.js's WhiteSpaceFilter then had to handle at render time. Promoting `book-href-rewrite` to `:high` and demoting compress to `:normal` makes the invariant "compress is the last cleanup step among mutators" hold by construction; demoting the readers to `:low` makes "readers see the final compressed output" hold by construction. Future plugins choose their tier by their role and the ordering composes automatically. + +The full priority story is documented as a comment block above the `Jekyll::Hooks.register` calls in [`html-compress.rb`](html-compress.rb); each of the four affected plugins (this one, `book-href-rewrite`, `pdfify`, `offlinify`) carries a one-line note pointing back to that block. + +## book.html inclusion + +The layout-chain walk above only marks layouts that reach `vendor/compress`. `book.html` uses the minimal `book-combined` layout, which has no parent, so the walk never reaches it and the page was originally skipped (matching the layout's behaviour). After investigation of paged.js's per-render `WhiteSpaceFilter` work (see [`perf/README.md`](../../perf/README.md)) showed it doing ~37k DOM mutations at render time to handle whitespace text nodes that *would* have been collapsed if the page had been compressed at Jekyll build time, the precompute was extended to mark `book-combined` explicitly: + +```ruby +@compress_layouts << "book-combined" if site.layouts.key?("book-combined") +``` + +at the end of `precompute_compress_layouts!`. Output: `book.html` now passes through `compress!` once per build (~480 ms of additional `String#split` work on the ~5.5 MB document), saving roughly the same wall-clock at paged.js render time (~28k `textContent` overwrites + ~9k `removeChild` calls eliminated). Net is approximately wall-clock-neutral for full builds, and a small net win for incremental Jekyll workflows that skip the PDF (`also_build_pdf: false`) — the compress cost is paid once per Jekyll build, the render saving is paid every PDF build, and decoupling the two is the structural improvement. ## Verification @@ -157,6 +179,6 @@ In source order in [`html-compress.rb`](html-compress.rb): - `precompute_compress_layouts!(site)` — `:site, :pre_render` entry. Walks every layout chain via `data["layout"]`, marks each layout on the path as compress-ending the moment the walk hits `vendor/compress`. Idempotent; the resulting `@compress_layouts` set persists across builds in `jekyll serve` and gets rebuilt fresh each `:pre_render`. -- `compress?(page)` — gate check. Returns `true` when the page's `data["layout"]` is in `@compress_layouts`. Pages without a layout (jekyll-redirect-from stubs, SCSS-derived CSS, JSON-via-page-rendering, `book.html` via `book-combined`) return `false` and skip the compression entirely. +- `compress?(page)` — gate check. Returns `true` when the page's `data["layout"]` is in `@compress_layouts`. Pages without a layout (jekyll-redirect-from stubs, SCSS-derived CSS, JSON-via-page-rendering) return `false` and skip the compression entirely. `book.html` (which uses `book-combined`, a minimal layout with no parent) used to land here too; it is now explicitly added to the set by `precompute_compress_layouts!` — see [book.html inclusion](#bookhtml-inclusion). - `compress!(content)` — the actual compression, in place. Captures the trailing-newline state, splits by `PRE_BLOCK_RE` with the capture group so pre bodies are preserved in the result array, runs `split(" ").join(" ")` on every outside-of-pre segment, joins, restores the trailing newline if needed, then mutates the input string via `String#replace`. The `replace` is what lets us hand back the same string object the caller passed in — Jekyll's writer reads `page.output` after `:post_render`, so in-place mutation is the cheapest way to update what gets written. diff --git a/docs/_plugins/html-compress.rb b/docs/_plugins/html-compress.rb index 7603a58..ddbb194 100644 --- a/docs/_plugins/html-compress.rb +++ b/docs/_plugins/html-compress.rb @@ -72,6 +72,12 @@ def self.precompute_compress_layouts!(site) cur_name = cur ? cur.data["layout"] : nil end end + # book-combined is a minimal layout with no parent, so the walk + # above doesn't reach it. Compressing its only consumer (book.html) + # at Jekyll time saves paged.js's WhiteSpaceFilter ~37k DOM + # mutations and ~300-400 ms once per render -- see + # perf/README.md "WhiteSpaceFilter that wasn't" section. + @compress_layouts << "book-combined" if site.layouts.key?("book-combined") end # True when `page` (or document) uses a layout chain ending in @@ -117,16 +123,43 @@ def self.compress!(content) HtmlCompress.precompute_compress_layouts!(site) end -# Run before offlinify (default :normal priority) so the offline-tree -# rewrites see the compressed page.output, and before Jekyll's -# `:site, :post_write` writes _site/ for the same reason. -Jekyll::Hooks.register :pages, :post_render, priority: :high do |page| +# Priority convention for :pages, :post_render hooks in this site: +# +# :high = MUTATORS. Plugins that modify page.output. Run first so +# their mutations are visible to compress and downstream +# readers. Examples: book-href-rewrite (landing heading +# strip + in-book href rewrites). +# +# :normal = COMPRESS. This plugin. The cleanup pass, sandwiched +# between mutators and readers so any whitespace runs left +# behind by a mutator's gsub get collapsed before anyone +# reads the final bytes. +# +# :low = READERS. Plugins that snapshot or consume page.output +# after all mutations and the compress pass. Run last so +# they see final output. Examples: pdfify (captures +# book.html for the PDF pipeline), offlinify (rewrites +# root-absolute hrefs and writes to _site-offline/). +# +# Without this layering, a mutator running after compress leaves +# adjacent whitespace runs that no downstream pass collapses; a +# reader running before compress captures uncompressed bytes. Both +# regressions surfaced when book-href-rewrite (default :normal) ran +# after html-compress (originally :high) -- its 3 landing-heading +# strips left double-space artifacts that paged.js's WhiteSpaceFilter +# had to handle at render time. +# +# Offlinify also runs at :site, :post_write (a later phase entirely), +# where it always sees the final compressed bytes regardless of +# per-page priority. The :low designation here governs its per-page +# capture hook specifically. +Jekyll::Hooks.register :pages, :post_render, priority: :normal do |page| next unless page.output.is_a?(String) next unless HtmlCompress.compress?(page) HtmlCompress.compress!(page.output) end -Jekyll::Hooks.register :documents, :post_render, priority: :high do |doc| +Jekyll::Hooks.register :documents, :post_render, priority: :normal do |doc| next unless doc.output.is_a?(String) next unless HtmlCompress.compress?(doc) HtmlCompress.compress!(doc.output) diff --git a/docs/_plugins/offlinify.rb b/docs/_plugins/offlinify.rb index ab5032e..782038f 100644 --- a/docs/_plugins/offlinify.rb +++ b/docs/_plugins/offlinify.rb @@ -1443,11 +1443,13 @@ def self.decode(path) Offlinify.setup(site) end -Jekyll::Hooks.register :pages, :post_render do |page| +# :low so these READERS see page.output after html-compress (:normal) +# has run. See html-compress.rb's priority convention. +Jekyll::Hooks.register :pages, :post_render, priority: :low do |page| Offlinify.process_page(page) end -Jekyll::Hooks.register :documents, :post_render do |doc| +Jekyll::Hooks.register :documents, :post_render, priority: :low do |doc| Offlinify.process_page(doc) end diff --git a/docs/_plugins/pdfify.rb b/docs/_plugins/pdfify.rb index 49eebea..f3feabb 100644 --- a/docs/_plugins/pdfify.rb +++ b/docs/_plugins/pdfify.rb @@ -287,7 +287,9 @@ def self.copy_file(src, dst) Pdfify.setup(site) end -Jekyll::Hooks.register :pages, :post_render do |page| +# :low so this READER captures page.output after html-compress +# (:normal) has run. See html-compress.rb's priority convention. +Jekyll::Hooks.register :pages, :post_render, priority: :low do |page| Pdfify.maybe_capture(page) end diff --git a/perf/README.md b/perf/README.md index 1385d1f..518605e 100644 --- a/perf/README.md +++ b/perf/README.md @@ -4121,3 +4121,304 @@ those operations didn't have a Blink layout-tree mutation step downstream. Mutations are where the cost that *looks* like JS allocation actually lives in this codebase. + +## Cracking `(program)` open with a Blink-category trace + +The cpu profile's `(program)` row sat at ~2.2 s (23 %) of +render and resisted attribution -- `find-callers.mjs` puts +it directly under `(root)`, the V8 sampler's structural +floor for "isolate is on-CPU but no JS frame on top." To +see *what* native code was running there, the harness gained +a `--tracing` flag and a companion `analyze-trace.mjs`. + +The flag wraps the render phase in `page.tracing.start()` +with Blink-relevant categories (`devtools.timeline`, +`disabled-by-default-devtools.timeline`, `blink`, `v8`, +`v8.execute`) and writes `trace.json` to the results +folder. `analyze-trace.mjs` walks the trace's complete-phase +events on `CrRendererMain`, computes self-time per event +name via a nested-event stack walk (same shape as +`analyze-profile.mjs` for cpuprofiles), and prints a +top-N table. A `--children ` mode breaks any +parent event into its direct callees, mirroring +`find-callees.mjs`. + +### What's on the main thread + +Top events by self-time on a fresh `--detach-pages +--no-timing --render-only --tracing` run, 1651-page book, +9.07 s render: + +| event | self_ms | self_% | +| ---------------------------------------- | ------- | ------ | +| `RunMicrotasks` | 3039.42 | 33.5 % | +| `LocalFrameView::performLayout` | 1800.31 | 19.9 % | +| `Document::recalcStyle` | 1785.55 | 19.7 % | +| `InlineNode::ShapeTextIncludingFirstLine`| 526.64 | 5.8 % | +| `Document::rebuildLayoutTree` | 484.88 | 5.4 % | +| `FunctionCall` | 285.89 | 3.2 % | +| `v8.callFunction` | 251.48 | 2.8 % | +| `Blink.CompositingInputs.UpdateTime` | 130.77 | 1.4 % | +| `Blink.PrePaint.UpdateTime` | 118.90 | 1.3 % | +| `Document::updateStyle` | 101.65 | 1.1 % | +| ... 189 smaller events ... | | | + +Mapping these onto the cpu profile's labels: + +| cpu profile row | trace decomposition | +| --- | --- | +| `getBoundingClientRect` self 3.7 s | `performLayout` 1.8 s + `recalcStyle` 1.8 s -- the layout flush gBCR triggers, which the cpu profile lumps under the native frame. | +| `removeChild` self 1.6 s | `rebuildLayoutTree` 0.5 s + portions of `recalcStyle` / `performLayout` -- each removeChild dirties style and layout. | +| `(program)` self 2.2 s | `RunMicrotasks` 3.0 s mostly. The cpu profile attributes a chunk of this to neighbour rows; what's left under `(program)` is the V8 runtime plumbing that has no JS frame on top. | +| `(garbage collector)` 100 ms | Sum of `V8.GC_*` events ≈ 135 ms. | + +So `(program)` is essentially **the V8 runtime inside a +microtask continuation**. The natural follow-up is "which +microtask, and what's it doing?" + +### Inside `RunMicrotasks` + +`--children RunMicrotasks` shows the parent fired only +**15 times** across the whole render, totalling 7.14 s: + +``` +parent: RunMicrotasks hits: 15 total: 7142.49ms self: 3039.42ms (42.6%) + + total_ms total_% hits child + -------- ------- ------ -------------------------------- + 3442.01 48.19% 39437 Document::UpdateStyleAndLayout + 3039.42 42.55% 15 (self / unattributed) + 547.98 7.67% 181106 v8.callFunction + 50.99 0.71% 892 Blink.Style.UpdateTime + 34.88 0.49% 205 V8.StackGuard + 17.05 0.24% 6 MinorGC +``` + +Listing the 15 events by duration: + +``` +rm[0] 70.89 ms -- one early-render burst (the parser) +rm[1..3] < 1 ms -- empty-trigger settle ticks +rm[4] 7071.14 ms -- THE render loop +rm[5..14] < 1 ms each -- post-render cleanup +``` + +**One event accounts for 99.0 % of the parent total.** +rm[4] envelopes essentially the whole render. V8 batches +the ~6 `await` boundaries inside `Chunker.flow()` +(beforeParsed / filter / afterParsed / loadFonts / +render / afterRendered) -- all of which Phase 1 of the +async cleanup turned into `await undefined` fast-paths -- +into a single drained microtask continuation. There is +**no per-page microtask cost**. The async stripping did +its job. + +### The 181,106 `v8.callFunction` callbacks + +The first thing that looked like a smoking gun -- +"181k dispatches sounds per-page-shaped" -- turned out +to be **one DOM walk**. Aggregating FunctionCall events +by `args.data.functionName + lineNumber`: + +``` +hits dur_ms functionName:line +181041 296.54 (anon):32455 (paged.browser.js) + 2 0.25 request.onload:27495 +``` + +paged.browser.js:32455 is `WhiteSpaceFilter.filter`'s +TreeWalker callback: + +```js +filterTree(content, (node) => { + return this.filterEmpty(node); +}, NodeFilter.SHOW_TEXT); +``` + +The walker visits every text node in the parsed +document and calls the lambda. For our 5.5 MB book +that's 181,041 invocations, all clustered in the first +685 ms of rm[4]. Same `(node) => this.filterEmpty(...)` +arrow allocated once but called from C++→JS 181k times, +so V8 emits a `v8.callFunction` event each invocation. + +These aren't 181k microtasks. They're 181k synchronous +TreeWalker callbacks nested inside the one big +continuation. The "callbacks per page" framing was a +mirage produced by dividing 181k by page count. + +### What's actually in `(program)`'s 2.2 s + +Triangulating the trace and cpu profile: + +- **~1.7 s** is V8 dispatch glue for the 181k filter + walk callbacks + remaining native→JS transitions + inside the continuation. V8 charges this to + `RunMicrotasks` self in the trace; the cpu profile + splits it between `(program)` and rows like `v8.callFunction`. +- **~0.3 s** is V8 IC / inline-cache miss handling on + the per-page hot path. Each polymorphic call site + pays a stub-call indirection that lands in `(program)`. +- **~0.1 s** is Blink microtask checkpoint code -- the + auto-style-and-layout pass that fires whenever a + microtask drains. The `Document::UpdateStyleAndLayout` + events under `RunMicrotasks` (3.44 s) attribute the + work *itself* to named Blink rows; the C++ glue + bracketing each call lands in `(program)`. +- The remainder is V8 scheduler bookkeeping, microtask + queue drain machinery, and small unnamed natives. + +None of this is a *per-page* cost. Reducing further +would require either (a) eliminating the filter walk, +or (b) reducing the per-page hot path's native→JS +transition count -- which is dominated by gBCR-driven +layout flushes that we've already pushed against +unsuccessfully in earlier sections (Attempts B, D from +the "createBreakToken dedup" investigation). + +### The "actionable finding" that wasn't: WhiteSpaceFilter + +The whitespace filter walk costs **~685 ms once per +render** -- 296 ms inside the JS callback bodies plus +~390 ms in TreeWalker dispatch overhead. The initial +read was "this is doing nothing useful for compressed +HTML, short-circuit it." Wrong on both counts. + +Branch-counting the filter via a one-shot probe (count +every branch in `filterEmpty`, dump to the harness +console): + +``` +total: 181,106 every text node visited + length === 0: 0 + length === 1: 38,685 (21.4%) collapsed inter-element spaces + length > 1, !ignorable: 101,930 (56.3%) real content -- hot path + length > 1, ignorable: 40,491 (22.4%) whitespace-only, body runs + inside
:        3,408   no-op (REJECT)
+    middle position:    27,901   textContent = " " (mutated)
+    left edge:           5,405   removeChild (accepted)
+    right edge:          3,777   removeChild (accepted)
+    orphan:                  0
+```
+
+**22.4 % of calls entered the body** and 37,083 actual
+DOM mutations happened: 9,182 nodes removed +
+27,901 nodes overwritten to single spaces. Far from
+zero.
+
+The premise was based on a misreading of html-compress:
+the plugin does collapse inter-element whitespace, but
+the `:site, :pre_render` gate that picks which pages it
+processes explicitly excludes `book.html` (which uses
+the minimal `book-combined` layout that doesn't reach
+`vendor/compress`; same README's html-compress section
+calls this out). Source indentation is preserved in
+the PDF input, so paged.js sees the raw multi-char
+whitespace text nodes. The filter is load-bearing --
+its mutations are what subsequent chunker walkers
+rely on to skip whitespace cheaply.
+
+The 0.83 % of calls that exceeded 4 us in the trace's
+dur histogram came from this body running; the
+histogram undercounted body entries because the
+short-branch (`closest("pre")` → REJECT) takes only
+~2-3 us, indistinguishable from the hot path in the
+0-4 us buckets. Branch counters were needed to reveal
+the true split.
+
+There's still optimisation headroom (the per-call
+TreeWalker dispatch is ~3 us of which only ~1.5 us is
+the body), but it requires changing the algorithm
+rather than skipping it: e.g. a hand-rolled JS recursion
+that avoids the C++→JS transition per node, or
+folding WhiteSpaceFilter + CommentsFilter + ScriptsFilter
+into a single TreeWalker pass with `SHOW_TEXT | SHOW_COMMENT`
+and a dispatcher. Net saving probably ~300-400 ms once
+per render; not investigated.
+
+The methodology lesson: a histogram of per-call dur
+**cannot** distinguish a fast body branch from a hot
+path -- both compile to 2-3 µs on V8. Branch
+instrumentation is the only way to count what each
+call actually did. The histogram suggested "0.8 %
+body entries"; reality was 22.4 %.
+
+### And we did fix it, on the Jekyll side
+
+The premise that motivated the original "actionable
+finding" -- that book.html should already be
+whitespace-collapsed when paged.js sees it -- was true
+in spirit, just wrong about whether it was being done.
+The fix landed in two parts:
+
+1. **Extend `html-compress.rb` to book.html.** The
+   layout-chain precompute now explicitly adds
+   `book-combined` to `@compress_layouts` at the end of
+   `precompute_compress_layouts!`. book.html therefore
+   passes through `compress!` once per build (~480 ms
+   of `String#split` work on the ~5.5 MB document), and
+   paged.js sees a document with inter-element
+   whitespace already collapsed to single spaces.
+
+2. **Reorder hook priorities** so that adding compress
+   to book.html composes cleanly with the other
+   `:pages, :post_render` plugins. The original
+   `:high`-priority compress ran *before*
+   `book-href-rewrite` -- whose landing-heading strip
+   removed `

` blocks from three chapter openings, + leaving the (already-collapsed) single spaces on + either side adjacent and producing literal `> <` + blobs. The fix is a three-tier convention: mutators + at `:high` (run first), compress at `:normal` (the + cleanup), readers at `:low` (snapshot final bytes). + See `_plugins/html-compress.md` for the full table. + +Verified: 0 outside-pre multi-whitespace runs in the +regenerated book.html (was 3 with the +landing-heading-strip artifacts; was 37,087 without +compress at all). Branch-counting the WhiteSpaceFilter +after the fix shows body entries drop from ~40 k to +the 3,408 in-pre cases that the filter is structurally +required to visit (and immediately REJECTs via +`closest("pre")`). DOM mutations drop from ~37 k to 0. +PDF output is byte-equivalent within timestamp drift. + +Net wall-clock is approximately neutral on full builds +(~480 ms added to Jekyll, ~300-500 ms saved at paged.js +render time), and a small win for incremental Jekyll +workflows that skip the PDF (`also_build_pdf: false`): +the compress cost is paid once per Jekyll build, the +render saving is paid every PDF build, and decoupling +the two is the structural improvement. + +A ruby-prof A/B (post-change vs pre-change with a +single stashed-changes revert) confirmed that the only +attributable Jekyll-side cost is exactly one extra +`compress!` invocation (837 → 838) and its downstream +`String#split` calls (+819 from book.html's non-pre +segments). No plugin's call count or self-time changed +beyond the noise floor; the priority shuffle is +CPU-invariant for everything except the new compress +pass on book.html. + +### What the trace doesn't change + +Nothing about the cpu profile's bottom-up table is +wrong; the trace just resolves what `(program)` masked. +After this exercise, the menu of remaining levers is +unchanged: + +- `pageRanges` sharding for the generate phase (biggest + untried knob, generate is now the larger phase). +- WhiteSpaceFilter algorithmic restructuring (~0.3 s, + render) -- not a short-circuit, since the filter does + real work; would need a hand-rolled traversal that + avoids the per-node C++→JS dispatch. +- Everything else lives below the noise floor. + +Render-stage optimization headroom is exhausted. The +cpu profile's `(program)` row isn't a structural smell +or a missed microtask -- it's the fixed cost of V8 +running the JavaScript we already have, accounted for +honestly by the trace and accounted for opaquely by +the JS sampler. diff --git a/perf/analyze-trace.mjs b/perf/analyze-trace.mjs index cefc6cf..9e1496b 100644 --- a/perf/analyze-trace.mjs +++ b/perf/analyze-trace.mjs @@ -11,6 +11,7 @@ // Usage: // node analyze-trace.mjs [--top N] [--min-pct P] // [--thread ] [--all-threads] +// [--children ] // // Defaults: --top 30, --min-pct 0.1 (hide rows under 0.1% self-time), // thread = CrRendererMain (the V8 / DOM / Blink layout thread). @@ -19,6 +20,14 @@ // each thread independently, subtracting nested children from each // parent's duration. Matches the "Bottom-Up" view in chrome://tracing // and DevTools' Performance panel when grouped by event name. +// +// --children switches the report from "top events by self-time" +// to "what runs directly inside ?". For every X-event whose name +// matches, aggregate the total time of each direct child by child name +// (this is total-time / inclusive cost from the parent's POV, the same +// shape as find-callees.mjs for cpuprofiles). Plus a synthetic +// "(self / unattributed)" row capturing parent dur minus the sum of +// direct children -- i.e. work attributed to the parent frame itself. import { readFileSync } from 'node:fs'; import { resolve } from 'node:path'; @@ -29,16 +38,18 @@ let topN = 30; let minPct = 0.1; let threadName = 'CrRendererMain'; let allThreads = false; +let childrenOf = null; for (let i = 0; i < args.length; i++) { const a = args[i]; if (a === '--top') topN = parseInt(args[++i], 10); else if (a === '--min-pct') minPct = parseFloat(args[++i]); else if (a === '--thread') threadName = args[++i]; else if (a === '--all-threads') allThreads = true; + else if (a === '--children') childrenOf = args[++i]; else if (!tracePath) tracePath = a; } if (!tracePath) { - console.error('usage: node analyze-trace.mjs [--top N] [--min-pct P] [--thread NAME] [--all-threads]'); + console.error('usage: node analyze-trace.mjs [--top N] [--min-pct P] [--thread NAME] [--all-threads] [--children NAME]'); process.exit(2); } tracePath = resolve(process.cwd(), tracePath); @@ -104,13 +115,23 @@ if (!targetThreads.length) { process.exit(3); } -// Per-thread self-time computation via depth-walk over X events. -// Sort by ts ascending; on tie, longer dur first so a containing -// event lands on the stack before its child. +// Per-thread depth-walk over X events. Sort by ts ascending; on tie, +// longer dur first so a containing event lands on the stack before its +// child. +// +// Two output modes: +// default --> top-N events by self-time (bottom-up view). +// --children X --> direct callees of every X-event named X, +// aggregated by child name + a (self) row. const selfByName = new Map(); // name -> { self_us, cat } +const childrenAcc = childrenOf ? new Map() : null; // child name -> { total_us, cat, hits } +let childrenParentTotal_us = 0; +let childrenParentSelf_us = 0; +let childrenParentHits = 0; +let childrenParentCat = ''; let totalEvents = 0; let traceMinTs = Infinity, traceMaxTs = -Infinity; -for (const { tk, name: tname } of targetThreads) { +for (const { tk } of targetThreads) { const list = byThread.get(tk); if (!list || !list.length) continue; list.sort((a, b) => a.ts - b.ts || b.dur - a.dur); @@ -118,11 +139,18 @@ for (const { tk, name: tname } of targetThreads) { const stack = []; const flush = (top) => { const self = top.dur - top.childTime; - if (self <= 0) return; - const cur = selfByName.get(top.name) || { self_us: 0, cat: top.cat || '' }; - cur.self_us += self; - if (!cur.cat && top.cat) cur.cat = top.cat; - selfByName.set(top.name, cur); + if (self > 0) { + const cur = selfByName.get(top.name) || { self_us: 0, cat: top.cat || '' }; + cur.self_us += self; + if (!cur.cat && top.cat) cur.cat = top.cat; + selfByName.set(top.name, cur); + } + if (childrenOf && top.name === childrenOf) { + childrenParentTotal_us += top.dur; + childrenParentSelf_us += Math.max(0, top.dur - top.childTime); + childrenParentHits += 1; + if (!childrenParentCat && top.cat) childrenParentCat = top.cat; + } }; for (const e of list) { if (e.ts < traceMinTs) traceMinTs = e.ts; @@ -130,7 +158,15 @@ for (const { tk, name: tname } of targetThreads) { while (stack.length && stack[stack.length - 1].endTs <= e.ts) { flush(stack.pop()); } - if (stack.length) stack[stack.length - 1].childTime += e.dur; + const parent = stack.length ? stack[stack.length - 1] : null; + if (parent) parent.childTime += e.dur; + if (childrenOf && parent && parent.name === childrenOf) { + const cur = childrenAcc.get(e.name) || { total_us: 0, cat: e.cat || '', hits: 0 }; + cur.total_us += e.dur; + cur.hits += 1; + if (!cur.cat && e.cat) cur.cat = e.cat; + childrenAcc.set(e.name, cur); + } stack.push({ name: e.name, cat: e.cat, dur: e.dur, endTs: e.ts + e.dur, childTime: 0 }); } while (stack.length) flush(stack.pop()); @@ -138,6 +174,42 @@ for (const { tk, name: tname } of targetThreads) { const traceDurUs = (traceMaxTs > traceMinTs) ? (traceMaxTs - traceMinTs) : 0; +const fmt = (n, w) => n.toFixed(2).padStart(w); + +if (childrenOf) { + if (!childrenParentHits) { + console.error(`no X events named "${childrenOf}" found on ${allThreads ? 'any thread' : threadName}`); + process.exit(3); + } + const rows = [...childrenAcc.entries()] + .map(([name, v]) => ({ + name, + cat: v.cat, + hits: v.hits, + total_ms: v.total_us / 1000, + pct: 100 * v.total_us / childrenParentTotal_us, + })); + rows.push({ + name: '(self / unattributed)', + cat: childrenParentCat, + hits: childrenParentHits, + total_ms: childrenParentSelf_us / 1000, + pct: 100 * childrenParentSelf_us / childrenParentTotal_us, + }); + rows.sort((a, b) => b.total_ms - a.total_ms); + console.log(`trace: ${tracePath}`); + console.log(`thread${allThreads ? 's' : ''}: ${allThreads ? 'all' : threadName} span: ${(traceDurUs / 1e6).toFixed(2)}s`); + console.log(`parent: ${childrenOf} hits: ${childrenParentHits} total: ${(childrenParentTotal_us / 1000).toFixed(2)}ms self: ${(childrenParentSelf_us / 1000).toFixed(2)}ms (${(100*childrenParentSelf_us/childrenParentTotal_us).toFixed(1)}%)`); + console.log(`direct children, top ${topN} by total time (min ${minPct}% of parent total):`); + console.log(''); + console.log(' total_ms total_% hits child @ category'); + console.log(' -------- ------- ------ --------------------------------------'); + for (const r of rows.filter(r => r.pct >= minPct).slice(0, topN)) { + console.log(` ${fmt(r.total_ms, 8)} ${fmt(r.pct, 5)}% ${String(r.hits).padStart(6)} ${r.name} @ ${r.cat || '(no cat)'}`); + } + process.exit(0); +} + const totalSelfUs = [...selfByName.values()].reduce((s, x) => s + x.self_us, 0); const rows = [...selfByName.entries()] .map(([name, v]) => ({ @@ -150,7 +222,6 @@ const rows = [...selfByName.entries()] .filter(r => r.pct >= minPct) .slice(0, topN); -const fmt = (n, w) => n.toFixed(2).padStart(w); console.log(`trace: ${tracePath}`); console.log(`events: ${totalEvents} thread${allThreads ? 's' : ''}: ${allThreads ? 'all' : threadName} span: ${(traceDurUs / 1e6).toFixed(2)}s`); console.log(`total self: ${(totalSelfUs / 1000).toFixed(2)}ms across ${selfByName.size} distinct event names`); From 272d47b7acb5bfcabb97564fe2ae89c325338e97 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sat, 23 May 2026 00:21:21 +0200 Subject: [PATCH 04/18] Disable WhiteSpaceFilter by default; opt-in via PagedConfig. A 3+3 paired cpu-profile A/B (perf/ab-aggregate.mjs) showed the filter's 181k TreeWalker callbacks cost ~600 ms of CPU on every render even when html-compress has already collapsed inter-element whitespace at Jekyll time. ~125 ms is direct (filterTree/filterEmpty self); the rest is indirect -- gBCR, recalcStyle, performLayout and UpdateStyleAndLayout all run ~14% cheaper per call when V8's IC and Blink scheduler aren't being churned by 181k C++->JS dispatches. The cost is small per call but compounds because the walk lives inside the same microtask continuation as the per-page render loop. Earlier wall-clock A/B (3+3, 8.78s vs 8.53s) had attributed the delta to noise; that was wrong. Per-row aggregation across paired cpu profiles shows the filterTree row at 88 ms (sd 14) vs 2 ms (sd 1) -- a 6 sigma shift -- and the downstream gBCR row at -338 ms mean, consistent with the trace's -574 ms drop on Document::UpdateStyleAndLayout total. The fix: gate the TreeWalker invocation behind window.PagedConfig.runWhitespaceFilter (default undefined = off). Our pipeline never sets the flag because html-compress already does the work; documents that need the cleanup can opt back in. Also adds perf/ab-aggregate.mjs (per-row mean+SD aggregator across 6 paired cpu profiles) and a long writeup in perf/README.md with the methodology, the corrected understanding of why the filter has cost (not flush migration -- it does no layout-flushing work; it's V8 IC pressure + Blink scheduler overhead), and lessons about when to trust wall-clock vs aggregated cpu-profile rows. --- docs/lib/paged.browser.js | 19 ++++ perf/README.md | 194 ++++++++++++++++++++++++++++++++++++-- perf/ab-aggregate.mjs | 96 +++++++++++++++++++ 3 files changed, 302 insertions(+), 7 deletions(-) create mode 100644 perf/ab-aggregate.mjs diff --git a/docs/lib/paged.browser.js b/docs/lib/paged.browser.js index f681876..e058715 100644 --- a/docs/lib/paged.browser.js +++ b/docs/lib/paged.browser.js @@ -32445,12 +32445,31 @@ TargetText ]; + // [PATCH: whitespace-filter-opt-in] Default off because our Jekyll + // pipeline runs html-compress on `book.html` (see _plugins/html- + // compress.rb's three-tier hook ordering: book-combined is in the + // compress-eligible set), so inter-element whitespace is already + // collapsed by the time paged.js sees the document. The filter + // would visit every text node in the parsed DOM (~181 k callbacks + // on the 1651-page book) and -- post-compression -- find essentially + // nothing to mutate. A paired cpu-profile A/B (3+3 runs, see + // perf/README.md) showed the no-op walk still costs ~600 ms of CPU + // per render: ~125 ms direct (filterTree / filterEmpty self) plus + // ~480 ms indirect (gBCR + downstream Blink layout / style work that + // runs cheaper when V8's IC + Blink scheduler aren't being churned + // by 181 k C++->JS callback dispatches). The cost is small per call + // but compounds because the walk lives inside the same microtask + // continuation as the per-page render loop. Set + // `window.PagedConfig.runWhitespaceFilter = true` before + // PagedPolyfill.preview() if processing a document whose source + // HTML wasn't compressed at build time. class WhiteSpaceFilter extends Handler { constructor(chunker, polisher, caller) { super(chunker, polisher, caller); } filter(content) { + if (!(typeof window !== "undefined" && window.PagedConfig && window.PagedConfig.runWhitespaceFilter)) return; filterTree(content, (node) => { return this.filterEmpty(node); diff --git a/perf/README.md b/perf/README.md index 518605e..2f8e17b 100644 --- a/perf/README.md +++ b/perf/README.md @@ -108,6 +108,7 @@ DevTools-compatible trace is a few lines. | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. | | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. | | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). | +| `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See the "Disabling the filter outright" section in this README for the methodology. | | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. | | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). | | `grep-profile.mjs` | Lists every node in a `.cpuprofile` whose `functionName` matches a regex, with self-time and location. Quick check for "is this frame in the profile at all, and what's it called?" | @@ -4410,15 +4411,194 @@ unchanged: - `pageRanges` sharding for the generate phase (biggest untried knob, generate is now the larger phase). -- WhiteSpaceFilter algorithmic restructuring (~0.3 s, - render) -- not a short-circuit, since the filter does - real work; would need a hand-rolled traversal that - avoids the per-node C++→JS dispatch. +- WhiteSpaceFilter -- the trace and a follow-up cpu- + profile A/B (see next section) eventually showed this + *is* skippable for our pipeline once html-compress has + done the work at Jekyll time. Worth ~600 ms / 6 %. - Everything else lives below the noise floor. -Render-stage optimization headroom is exhausted. The -cpu profile's `(program)` row isn't a structural smell -or a missed microtask -- it's the fixed cost of V8 +The cpu profile's `(program)` row isn't a structural +smell or a missed microtask -- it's the fixed cost of V8 running the JavaScript we already have, accounted for honestly by the trace and accounted for opaquely by the JS sampler. + +## Disabling the filter outright: paired cpu-profile A/B + +The "actionable finding that wasn't" + "and we did fix +it, on the Jekyll side" pair above closed with two +conclusions: + +1. WhiteSpaceFilter does real work on book.html + (37k DOM mutations pre-compression, 0 post-). +2. Post-compression the filter is essentially a no-op + visit over 181k text nodes, and skipping it doesn't + save measurable wall-clock -- a 3+3 wall-clock A/B + showed 8.78 s avg with filter vs 8.53 s without, well + inside the 1.17 s within-variant noise band. + +Conclusion (1) is correct. Conclusion (2) was wrong -- +specifically the "no measurable saving" claim and the +flush-migration explanation I attached to the ~+180 ms +gBCR move that appeared in a single-run profile pair. + +A reader pointed out the flush-migration reasoning was +incoherent: `WhiteSpaceFilter.filter` runs *once* in +`Chunker.flow()` *before* any page is created. The body +of `filterEmpty` reads `textContent`, walks parents via +`closest("pre")`, and walks siblings -- none of which +read layout-flushing properties (`gBCR`, `offsetTop`, +computed style, etc.). There is no flush for migration +to migrate from. Whatever the +180 ms gBCR move in the +single-run pair was, it wasn't "the filter's flush load +deferring to the next gBCR." It was single-run noise on +a 38 % row -- which has a much wider noise band than +the README's "50-150 ms for sub-1 % rows" methodology +note covers. + +### The proper A/B + +Three filter-on (A) and three filter-off (B) cpu-profile +runs, interleaved A1 B1 A2 B2 A3 B3 so system-load +variance hits both sides equally. The probe is a one-line +`return;` at the top of `WhiteSpaceFilter.filter` -- +skip the TreeWalker entirely. Toggle is a single edit +between runs. Both states are otherwise identical +(post-compression book.html, current bundle). + +Per-run totals from +[`perf/ab-aggregate.mjs`](ab-aggregate.mjs): + +| run | total CPU | +| --- | --- | +| A1 (filter ON) | 11,120 ms | +| A2 (filter ON) | 10,270 ms | +| A3 (filter ON) | 9,727 ms | +| **A mean** | **10,372 ms** | +| B1 (filter OFF) | 9,744 ms | +| B2 (filter OFF) | 10,189 ms | +| B3 (filter OFF) | 9,180 ms | +| **B mean** | **9,705 ms** | +| **Δ (B - A)** | **-668 ms (-6.4 %)** | + +The within-group ranges are ~1.3 s (A) and ~1.0 s (B), +so the -668 ms total-CPU delta sits at roughly 1 σ of +within-variant spread. By itself, that's a soft signal. + +But per-row breakdown is tighter: + +| row | A mean ± sd | B mean ± sd | Δ | +| --- | --- | --- | --- | +| `getBoundingClientRect` | 4128 ± 309 | 3791 ± 163 | **-338 ms** | +| `(program)` | 2243 ± 56 | 2328 ± 173 | +85 ms (noisy) | +| `removeChild` | 1619 ± 63 | 1564 ± 43 | -55 ms | +| `afterPageLayout` @ paged.js | 150 ± 26 | 119 ± 17 | -32 ms | +| **`filterTree` self** | **88 ± 14** | **2 ± 1** | **-86 ms** | +| `(garbage collector)` | 103 ± 6 | 92 ± 4 | -11 ms | +| `handleAlignment` | 70 ± 5 | 56 ± 7 | -14 ms | +| `create` (`Page.create`) | 66 ± 7 | 50 ± 4 | -15 ms | +| `sortDisplayedSelectors` | 60 ± 10 | 46 ± 1 | -14 ms | +| **`filterEmpty` self** | **37 ± 2** | **0** | **-37 ms** | + +Direct attribution (the filter rows that vanish in B): + +- `filterTree` self: -86 ms +- `filterEmpty` self: -37 ms +- ~123 ms + +Indirect attribution (rows that shrink in B despite +unchanged call counts -- see the trace data above +where Document::UpdateStyleAndLayout, recalcStyle and +performLayout all run ~14-15 % cheaper per call with +filter off): + +- `getBoundingClientRect`: -338 ms +- `removeChild`: -55 ms +- `afterPageLayout @ paged.js:30458` (paged.js core): -32 ms +- `create`: -15 ms +- `handleAlignment`: -14 ms +- `sortDisplayedSelectors`: -14 ms +- `(garbage collector)`: -11 ms +- smaller rows: ~50 ms +- ~529 ms + +Direct + indirect ≈ 652 ms, in the neighbourhood of +the -668 ms total-CPU delta. They corroborate. + +### Why the filter has indirect cost + +The single-trace measurement above (filter-off trace +captured for the same render) made the indirect path +visible: with filter off, `Document::UpdateStyleAndLayout` +total dropped by 574 ms across an *unchanged* 39,437 +call count -- ~14 µs less per call. `recalcStyle` and +`performLayout` similarly dropped ~14 % per call. +Plausibly: + +- V8's polymorphic inline caches stay warmer on the + per-page hot path when 181 k extra C++→JS + dispatches haven't been churning them. +- Blink's main-thread scheduler has fewer task + boundaries to bookkeep across. +- Allocator/GC pressure is lower (the filter walk + allocates per-callback closures and intermediate + strings, even when each callback just returns + FILTER_REJECT). + +None of those are "the filter triggers a layout +flush." Layout work *itself* gets cheaper because the +ambient V8/Blink state is less polluted. Same per-call +mechanics, slightly faster main-thread context. + +### The fix: config flag, default off + +`window.PagedConfig.runWhitespaceFilter` gates the +walk. Default is undefined (falsy) -- our pipeline runs +`html-compress` on book.html, so the filter has +nothing to do and skipping it saves the ~600 ms. + +Anyone running paged.js against an uncompressed +document can set the flag before `PagedPolyfill.preview()` +to opt back in. The class itself is unchanged so the +opt-in path is byte-equivalent to the original. + +The opt-in semantic is the conservative choice: paged.js +upstream and many downstream users feed it untouched +HTML (with inter-element indentation surviving), where +the filter does meaningful cleanup. Disabling it for +*every* caller of this bundle would be a regression for +those use cases. Disabling it by default for *our* +pipeline is fine because we control the input +end-to-end. + +Cost: zero per-page work (the gate is one `&&`-chain +check at startup), structural correctness for clean +documents, opt-in safety valve for everyone else. + +### Methodology note + +The wall-clock A/B was correct in claiming "the saving +is below the wall-clock noise floor for short N." It +was wrong in concluding "therefore no saving exists." +Two corrections: + +1. Aggregate CPU work across paired profiles. Wall-clock + noise is ~1 s per run on this machine; CPU sample + totals are also ~1 s per run but the row-by-row + self-time deltas can be much tighter. The + `filterTree` row goes from 88 ms (sd 14) to 2 ms (sd + 1) -- a 6 σ shift. Per-row analysis can see signals + that per-run totals lose. + +2. Use *enough* paired runs that within-group SD lets + you compute mean ± SD honestly. 3+3 is the bare + minimum (gives 1 σ confidence on row-level deltas + for things that change by 5+ σ). 5+5 or 10+10 would + tighten the gBCR delta confidence further -- worth + doing for finer signals. + +The probe + aggregator are reusable +([`perf/ab-aggregate.mjs`](ab-aggregate.mjs)): point at +6 `ab-*.cpuprofile` files and it prints the mean ± SD +table. Pattern fits any future "does this change save +CPU?" question where wall-clock noise is the obstacle. diff --git a/perf/ab-aggregate.mjs b/perf/ab-aggregate.mjs new file mode 100644 index 0000000..f8adfe2 --- /dev/null +++ b/perf/ab-aggregate.mjs @@ -0,0 +1,96 @@ +// One-shot aggregator for the 3+3 paired cpu-profile A/B (ab-A1..A3 / ab-B1..B3). +// Computes per-row self_ms mean across the 3 A runs and 3 B runs, plus the difference. +// Also prints total samples / duration per run so we can sanity-check variance. + +import { readFileSync, existsSync } from 'node:fs'; +import { resolve, dirname } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const runs = ['A1','A2','A3','B1','B2','B3'].map(tag => ({ + tag, + path: resolve(__dirname, `ab-${tag}.cpuprofile`), +})); + +for (const r of runs) { + if (!existsSync(r.path)) { console.error('missing:', r.path); process.exit(1); } +} + +// Same hit-counting + self_ms computation as analyze-profile.mjs. +function summarise(path) { + const p = JSON.parse(readFileSync(path, 'utf8')); + const totalUs = p.endTime - p.startTime; + const totalSamples = p.samples.length; + const us = totalUs / totalSamples; + const byKey = new Map(); + let totalHits = 0; + for (const n of p.nodes) { + const cf = n.callFrame || {}; + const fn = cf.functionName || '(anonymous)'; + const url = cf.url || ''; + const line = cf.lineNumber != null ? cf.lineNumber + 1 : '?'; + const key = `${fn} @ ${url || '(no url)'}:${line}`; + const cur = byKey.get(key) || { hits: 0 }; + cur.hits += n.hitCount || 0; + byKey.set(key, cur); + totalHits += n.hitCount || 0; + } + const rows = new Map(); + for (const [key, v] of byKey) { + rows.set(key, v.hits * us / 1000); // self_ms + } + return { totalSamples, durationS: totalUs / 1e6, usPerSample: us, rows }; +} + +const summaries = runs.map(r => ({ tag: r.tag, ...summarise(r.path) })); + +console.log('per-run totals'); +console.log(' tag samples dur(s) us/sample'); +for (const s of summaries) { + console.log(` ${s.tag} ${String(s.totalSamples).padStart(6)} ${s.durationS.toFixed(2).padStart(6)} ${s.usPerSample.toFixed(1)}`); +} +console.log(''); + +// Union of row keys across all 6 runs. +const keys = new Set(); +for (const s of summaries) for (const k of s.rows.keys()) keys.add(k); + +// Compute A-mean / A-stddev / B-mean / B-stddev per row. +function statsFor(group, key) { + const vals = group.map(s => s.rows.get(key) || 0); + const mean = vals.reduce((a,b)=>a+b,0) / vals.length; + const variance = vals.reduce((a,b)=>a+(b-mean)*(b-mean),0) / vals.length; + return { mean, sd: Math.sqrt(variance), vals }; +} + +const A = summaries.filter(s => s.tag.startsWith('A')); +const B = summaries.filter(s => s.tag.startsWith('B')); + +const rows = [...keys].map(k => { + const sa = statsFor(A, k); + const sb = statsFor(B, k); + return { key: k, aMean: sa.mean, aSd: sa.sd, bMean: sb.mean, bSd: sb.sd, delta: sb.mean - sa.mean }; +}); + +// Sort by max(|A|, |B|) so the biggest rows surface regardless of which side they're on. +rows.sort((x,y) => Math.max(y.aMean, y.bMean) - Math.max(x.aMean, x.bMean)); + +const fmt = (n, w) => n.toFixed(1).padStart(w); +console.log('top 25 rows by max(A mean, B mean), self_ms:'); +console.log(''); +console.log(' A_mean A_sd B_mean B_sd delta function'); +console.log(' ------ ---- ------ ---- ----- --------'); +for (const r of rows.slice(0, 25)) { + // Strip the long URL prefix for readability. + const short = r.key.replace(/D:\\\\OCP\\\\wc\\\\twinBASIC-documentation\\\\docs\\\\lib\\\\paged\.browser\.js/, 'paged.browser.js') + .replace(/D:\\OCP\\wc\\twinBASIC-documentation\\docs\\lib\\paged.browser.js/, 'paged.browser.js'); + console.log(` ${fmt(r.aMean,7)} ${fmt(r.aSd,5)} ${fmt(r.bMean,7)} ${fmt(r.bSd,5)} ${fmt(r.delta,7)} ${short}`); +} + +// Total CPU work across all rows. +const aTotal = A.reduce((s,r)=>s+r.totalSamples*r.usPerSample,0)/A.length / 1000; +const bTotal = B.reduce((s,r)=>s+r.totalSamples*r.usPerSample,0)/B.length / 1000; +console.log(''); +console.log(`A mean total CPU: ${aTotal.toFixed(0)} ms (${A.map(r => (r.totalSamples*r.usPerSample/1000).toFixed(0)).join(' / ')})`); +console.log(`B mean total CPU: ${bTotal.toFixed(0)} ms (${B.map(r => (r.totalSamples*r.usPerSample/1000).toFixed(0)).join(' / ')})`); +console.log(`delta (B-A): ${(bTotal-aTotal).toFixed(0)} ms`); From 688ad1fbea435f670b46778dca3ed91a3a0d4032 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sat, 23 May 2026 15:00:55 +0200 Subject: [PATCH 05/18] Strip all async from paged.js render chain; RunMicrotasks 6333->0.56ms. --- docs/lib/paged.browser.js | 221 +++++++++++++----------- docs/render-book.mjs | 10 +- perf/README.md | 350 ++++++++++++++++++++++++++++++++++++++ perf/measure.mjs | 7 +- 4 files changed, 487 insertions(+), 101 deletions(-) diff --git a/docs/lib/paged.browser.js b/docs/lib/paged.browser.js index e058715..83c9e53 100644 --- a/docs/lib/paged.browser.js +++ b/docs/lib/paged.browser.js @@ -3001,10 +3001,21 @@ } - async flow(content, renderTo) { + // [PATCH: sync-chain] flow() is now synchronous. All five await + // sites turn into sync calls: + // - beforeParsed / afterParsed / afterRendered hooks: handlers + // on our pipeline are all sync, so _assertSync guards them + // the same way the per-page hot path does. + // - loadFonts: now a sync assert (throws if any face isn't + // loaded; page.goto waitUntil:'load' ensures they are). + // - render: now a plain sync function. + // This was the last load-bearing await in the bundle. With + // flow() sync, the entire per-render call chain executes + // without yielding to a microtask boundary. + flow(content, renderTo) { let parsed; - await this.hooks.beforeParsed.trigger(content, this); + _assertSync(this.hooks.beforeParsed.trigger(content, this), "beforeParsed"); parsed = new ContentParser(content); @@ -3022,20 +3033,20 @@ this.emit("rendering", parsed); - await this.hooks.afterParsed.trigger(parsed, this); + _assertSync(this.hooks.afterParsed.trigger(parsed, this), "afterParsed"); - await this.loadFonts(); + this.loadFonts(); - let rendered = await this.render(parsed, this.breakToken); + let rendered = this.render(parsed, this.breakToken); while (rendered.canceled) { this.start(); - rendered = await this.render(parsed, this.breakToken); + rendered = this.render(parsed, this.breakToken); } this.rendered = true; this.pagesArea.style.setProperty("--pagedjs-page-count", this.total); - await this.hooks.afterRendered.trigger(this.pages, this); + _assertSync(this.hooks.afterRendered.trigger(this.pages, this), "afterRendered"); this.emit("rendered", this.pages); @@ -3072,12 +3083,11 @@ // } // } - // [PATCH: sync-chain] *layout is a sync generator now, so - // renderer.next() returns synchronously -- no per-page await. - // render() itself stays `async` because callers (flow()) await - // it and other once-per-render awaits in flow() (loadFonts, - // beforeParsed / afterParsed / afterRendered) still need it. - async render(parsed, startAt) { + // [PATCH: sync-chain] render() is now plain sync. *layout is a + // sync generator (renderer.next() returns synchronously), and + // flow() no longer awaits this call -- the entire per-render + // chain (preview -> flow -> render) is sync end to end. + render(parsed, startAt) { let renderer = this.layout(parsed, startAt); let result; @@ -3367,7 +3377,12 @@ } */ - async clonePage(originalPage) { + // [PATCH: sync-chain] clonePage is now synchronous. Only caller + // is the Footnotes handler (line ~31625) which is itself gated + // out for documents without `[data-note='footnote']` -- dead + // path on our content but kept sync-clean for consistency with + // the rest of the per-page hook surface. + clonePage(originalPage) { let lastPage = this.pages[this.pages.length - 1]; let page = new Page(this.pagesArea, this.pageTemplate, false, this.hooks); @@ -3379,7 +3394,7 @@ page.index(this.total); - await this.hooks.beforePageLayout.trigger(page, undefined, undefined, this); + _assertSync(this.hooks.beforePageLayout.trigger(page, undefined, undefined, this), "beforePageLayout"); this.emit("page", page); for (const className of originalPage.element.classList) { @@ -3388,27 +3403,32 @@ } } - await this.hooks.afterPageLayout.trigger(page.element, page, undefined, this); - await this.hooks.finalizePage.trigger(page.element, page, undefined, this); + _assertSync(this.hooks.afterPageLayout.trigger(page.element, page, undefined, this), "afterPageLayout"); + _assertSync(this.hooks.finalizePage.trigger(page.element, page, undefined, this), "finalizePage"); this.emit("renderedPage", page); } + // [PATCH: sync-chain] loadFonts is now a synchronous assertion. + // Upstream walked document.fonts and kicked off fontFace.load() + // for any not-yet-loaded face, returning a Promise.all. Our + // headless pipeline drives `page.goto(url, { waitUntil: "load" })` + // before paged.js runs, which settles document.fonts.ready -- + // every face is already in state "loaded" by the time we get + // here. The walk is a safety check: if a face is still loading + // (or hit an error), pipeline assumptions are broken and we + // should fail loudly rather than silently re-asyncify. loadFonts() { - let fontPromises = []; (document.fonts || []).forEach((fontFace) => { if (fontFace.status !== "loaded") { - let fontLoaded = fontFace.load().then((r) => { - return fontFace.family; - }, (r) => { - console.warn("Failed to preload font-family:", fontFace.family); - return fontFace.family; - }); - fontPromises.push(fontLoaded); + throw new Error( + "paged.js (forked): font-face '" + fontFace.family + + "' is not yet loaded (status=" + fontFace.status + + "). The headless pipeline expects every font to be " + + "loaded before PagedPolyfill.preview() runs; ensure " + + "page.goto uses { waitUntil: 'load' } or 'networkidle0'." + ); } }); - return Promise.all(fontPromises).catch((err) => { - console.warn(err); - }); } destroy() { @@ -26501,16 +26521,22 @@ - // parse - async parse(text) { + // [PATCH: sync-chain] parse() is now synchronous. Upstream awaited + // the three Polisher.hooks.{beforeTreeParse, beforeTreeWalk, + // afterTreeWalk} triggers; with our pipeline registering no async + // handlers for any of them, the awaits were pure microtask + // boundaries. _assertSync throws if anyone ever does register a + // thenable-returning handler -- same safety pattern the chunker's + // per-page hot path uses. + parse(text) { this.text = text; - await this.hooks.beforeTreeParse.trigger(this.text, this); + _assertSync(this.hooks.beforeTreeParse.trigger(this.text, this), "beforeTreeParse"); // send to csstree this.ast = csstree.parse(this._text); - await this.hooks.beforeTreeWalk.trigger(this.ast); + _assertSync(this.hooks.beforeTreeWalk.trigger(this.ast), "beforeTreeWalk"); // Replace urls this.replaceUrls(this.ast); @@ -26525,7 +26551,7 @@ this.rules(this.ast); this.atrules(this.ast); - await this.hooks.afterTreeWalk.trigger(this.ast, this); + _assertSync(this.hooks.afterTreeWalk.trigger(this.ast, this), "afterTreeWalk"); // return ast return this.ast; @@ -27480,28 +27506,30 @@ } `; - async function request(url, options={}) { - return new Promise(function(resolve, reject) { - let request = new XMLHttpRequest(); - - request.open(options.method || "get", url, true); - - for (let i in options.headers) { - request.setRequestHeader(i, options.headers[i]); - } - - request.withCredentials = options.credentials === "include"; - - request.onload = () => { - // Chrome returns a status code of 0 for local files - const status = request.status === 0 && url.startsWith("file://") ? 200 : request.status; - resolve(new Response(request.responseText, {status})); - }; - - request.onerror = reject; - - request.send(options.body || null); - }); + // [PATCH: sync-chain] Synchronous XHR returning body text directly. + // Upstream paged.js used async XHR + Promise + Response wrapper to + // keep the interactive-browser main thread responsive while + // stylesheets loaded. Our headless pipeline doesn't share that + // constraint: every stylesheet is a local file:// URL, fetches are + // sub-ms, and we want the polisher's stylesheet ingestion off the + // microtask queue so the whole render chain stays sync. Both + // callers (Polisher.add / convertViaSheet) only ever consumed + // response.text(), which is itself async per spec -- returning the + // text directly skips that boundary too. Throws on HTTP error. + function request(url, options={}) { + let req = new XMLHttpRequest(); + req.open(options.method || "get", url, false); + for (let i in options.headers) { + req.setRequestHeader(i, options.headers[i]); + } + req.withCredentials = options.credentials === "include"; + req.send(options.body || null); + // Chrome returns status 0 for successful local-file loads. + const status = req.status === 0 && url.startsWith("file://") ? 200 : req.status; + if (status < 200 || status >= 300) { + throw new Error("paged.js (forked): request " + url + " failed with status " + status); + } + return req.responseText; } class Polisher { @@ -27538,53 +27566,43 @@ return this.styleSheet; } - async add() { - let fetched = []; - let urls = []; - - for (var i = 0; i < arguments.length; i++) { - let f; - - if (typeof arguments[i] === "object") { - for (let url in arguments[i]) { - let obj = arguments[i]; - f = new Promise(function(resolve, reject) { - urls.push(url); - resolve(obj[url]); - }); + // [PATCH: sync-chain] add() is now synchronous. Upstream collected + // every input as a Promise (Promise.all + then-chain), even when + // inputs were inline {url:text} objects with no fetch needed. + // With request() returning text directly and convertViaSheet now + // sync, we just walk the arguments once and feed each to the + // pipeline. Same return semantics: the converted-and-inserted + // text of the last stylesheet. + add() { + let text = ""; + for (let i = 0; i < arguments.length; i++) { + let arg = arguments[i]; + if (typeof arg === "object") { + for (let url in arg) { + text = this.convertViaSheet(arg[url], url); + this.insert(text); } } else { - urls.push(arguments[i]); - f = request(arguments[i]).then((response) => { - return response.text(); - }); + let url = arg; + let cssStr = request(url); + text = this.convertViaSheet(cssStr, url); + this.insert(text); } - - - fetched.push(f); } - - return await Promise.all(fetched) - .then(async (originals) => { - let text = ""; - for (let index = 0; index < originals.length; index++) { - text = await this.convertViaSheet(originals[index], urls[index]); - this.insert(text); - } - return text; - }); + return text; } - async convertViaSheet(cssStr, href) { + // [PATCH: sync-chain] convertViaSheet is now synchronous. + // sheet.parse is sync; request() now returns body text directly + // (sync XHR + responseText, no Response wrapper). + convertViaSheet(cssStr, href) { let sheet = new Sheet(href, this.hooks); - await sheet.parse(cssStr); + sheet.parse(cssStr); // Insert the imported sheets first for (let url of sheet.imported) { - let str = await request(url).then((response) => { - return response.text(); - }); - let text = await this.convertViaSheet(str, url); + let str = request(url); + let text = this.convertViaSheet(str, url); this.insert(text); } @@ -33074,9 +33092,18 @@ }); } - async preview(content, stylesheets, renderTo) { + // [PATCH: sync-chain] preview() is now synchronous end-to-end. + // beforePreview / afterPreview hooks are once-per-render so the + // _assertSync guard is the same shape as the chunker's per-page + // hot path uses. polisher.add and chunker.flow are sync above. + // External callers (perf/measure.mjs, docs/render-book.mjs) now + // call this without `await` -- the page.evaluate IIFE wrapping + // the call is also sync, so the entire script execution runs + // inside one EvaluateScript frame instead of being scheduled + // across multiple microtask continuations. + preview(content, stylesheets, renderTo) { - await this.hooks.beforePreview.trigger(content, renderTo); + _assertSync(this.hooks.beforePreview.trigger(content, renderTo), "beforePreview"); if (!content) { content = this.wrapContent(); @@ -33090,12 +33117,12 @@ this.handlers = this.initializeHandlers(); - await this.polisher.add(...stylesheets); + this.polisher.add(...stylesheets); let startTime = performance.now(); // Render flow - let flow = await this.chunker.flow(content, renderTo); + let flow = this.chunker.flow(content, renderTo); let endTime = performance.now(); @@ -33104,7 +33131,7 @@ this.emit("rendered", flow); - await this.hooks.afterPreview.trigger(flow.pages); + _assertSync(this.hooks.afterPreview.trigger(flow.pages), "afterPreview"); return flow; } diff --git a/docs/render-book.mjs b/docs/render-book.mjs index 7ad9509..71de7e1 100644 --- a/docs/render-book.mjs +++ b/docs/render-book.mjs @@ -158,13 +158,19 @@ try { } // Render -- paged.js per-page layout. + // PagedPolyfill.preview() is fully synchronous in our forked bundle + // (the entire chain preview -> chunker.flow -> render -> *layout is + // now sync; loadFonts is a sync assertion that page.goto's + // waitUntil:'load' already satisfied; stylesheets are loaded via + // synchronous XHR). Inner IIFE is a plain sync arrow; outer await + // is just the CDP round-trip puppeteer needs to ferry the result. const tRender = Date.now(); - await page.evaluate(async () => { + await page.evaluate(() => { if (!window.PagedPolyfill) { throw new Error('paged.js bundle did not expose window.PagedPolyfill'); } try { - await window.PagedPolyfill.preview(); + window.PagedPolyfill.preview(); } catch (err) { // Unwrap the undecorated ProgressEvent paged.js throws on fetch // failures so the message includes the offending URL. diff --git a/perf/README.md b/perf/README.md index 2f8e17b..9ce36dd 100644 --- a/perf/README.md +++ b/perf/README.md @@ -3108,6 +3108,18 @@ plain function call that produces a plain return value. ### What's still async, and why +> **Update.** All four survivors listed below were +> subsequently stripped -- see "Following `RunMicrotasks` +> down to zero" at the end of this README. The reasoning +> here ("once-per-render, overhead irrelevant") was +> correct as a per-call cost argument but missed that +> the unbroken await chain forced V8 to attribute the +> entire post-`loadFonts` render to a microtask +> continuation (`RunMicrotasks` in the trace, +> `(program)` in the cpu profile). Re-attribution alone +> was worth the conversion; wall-clock is unchanged. +> The list below is preserved for chronological accuracy. + The async machinery that survives this audit is now at the once-per-render layer, where it's load-bearing: @@ -4602,3 +4614,341 @@ The probe + aggregator are reusable 6 `ab-*.cpuprofile` files and it prints the mean ± SD table. Pattern fits any future "does this change save CPU?" question where wall-clock noise is the obstacle. + +## Following `RunMicrotasks` down to zero + +The trace section above pinned the cpu profile's +`(program)` row to V8 running JS inside a microtask +continuation. With the WhiteSpaceFilter gone the +`--children RunMicrotasks` breakdown still showed one +`rm[4] = 6262 ms` event enveloping essentially the +whole render -- 15 hits total, 99 % concentrated in one +batched drain. That raised a sharper question: if the +per-page hot path is sync (Phase 1 + 2 above), why is +*any* of the render running inside a microtask scope? + +### What was still async, and what it cost us + +The README's earlier "What's still async, and why" +inventory was honest about the surviving await sites at +that point: + +- `Chunker.flow()` -- async wrapper, awaited + `beforeParsed` / `afterParsed` / `afterRendered` hook + triggers, `loadFonts()`, and `chunker.render()`. +- `Chunker.render()` -- thin async wrapper around the + sync `renderer.next()` loop, kept so `flow()` could + `await` it. +- `Chunker.clonePage()` -- async, awaited three + per-page hooks. Footnotes-only caller, dead path for + our content but live in the bundle. +- `PagedPolyfill.preview()` -- async, awaited + `beforePreview` / `afterPreview` hooks plus + `polisher.add` and `chunker.flow`. +- `Polisher.add()` / `Polisher.convertViaSheet()` / + `Sheet.parse()` -- async chain to fetch and parse + external stylesheets. `Polisher.add` did + `Promise.all` over the inputs. +- `Chunker.loadFonts()` -- returned `Promise.all` of + `fontFace.load()` for any face not yet in state + "loaded". +- `request()` -- async XHR + `Promise` wrapper, used by + the polisher chain to fetch each `` + URL. + +Cost of each: small. Cost of all of them together: V8 +sees an unbroken await chain from `page.evaluate(async +() => { await PagedPolyfill.preview(); })` down to +`document.fonts.ready` (the one genuinely-async +dependency in the chain). When that promise resolves V8 +schedules a microtask to resume `flow()`. Phase 1 + 2 +of the async cleanup made the *body* of the resumed +function execute synchronously, so once it resumes it +runs ~6.2 s straight to the end of the render. V8 +correctly attributes the whole continuation to the +`RunMicrotasks` host frame, since that's the C++ frame +on the stack while the resumed JS runs. + +So `RunMicrotasks` self-time being 2.89 s wasn't a +sign of microtask overhead -- it was the bookkeeping +label V8 puts on continuation-style work. Every named +Blink event nested inside (`Document::UpdateStyleAndLayout`, +`recalcStyle`, `performLayout`, etc.) appeared in the +trace as a child of `RunMicrotasks`. Same shape applied +in the cpu profile: `(program)` is the catch-all bucket +V8 picks when no JS frame sits on top of the stack at +sample time, and a microtask continuation is exactly +that condition. + +The bucket name was misleading, but the cost itself was +real -- the JS *running* inside the continuation +*was* paged.js doing its per-page work. No "microtask +plumbing overhead" to slim down. The only way to remove +the `RunMicrotasks` attribution was to stop wrapping the +render in a microtask continuation entirely -- i.e., +make the whole chain synchronous so V8 has no async +scope to attribute to. + +### Why this is OK for our pipeline (and not for upstream) + +Upstream paged.js needs the async machinery. Its target +deployment is an interactive browser page: real +stylesheet fetches over HTTP (genuinely async), font +loads against the OS (genuinely async), user-registered +handlers that may load external resources or do +expensive work between page renders (async-friendly to +keep the page responsive). The await chain is the +canonical pattern for "yield to the browser between +expensive steps so the UI thread can paint." + +Our pipeline has none of those constraints: + +- `page.goto(url, { waitUntil: 'load' })` settles + *before* paged.js is invoked. Every font, image, and + stylesheet referenced by `` / `@font-face` / + `` is already loaded by the time the render + starts. The async checks are no-ops. +- The headless renderer has no compositor coordinating + with us, no paint budget to respect, no user looking + at the page. Blocking the main thread for 8 s is + fine -- nobody's watching. +- All registered handlers in our build are synchronous. + The `_assertSync` guard from the Phase 1/2 cleanup + has been in place for the per-page hot path for a + while; we just hadn't extended the pattern to the + once-per-render hooks. +- The stylesheet fetches the polisher does are local + `file://` URLs. Sync XHR resolves them in microseconds. + +So the entire async surface in paged.js -- which +upstream needs -- is, for our specific use case, the +opposite of helpful: it pushes work into microtask +continuations that show up as `RunMicrotasks` in the +trace and `(program)` in the cpu profile, instead of +landing under honest names like `RunTask` and +`EvaluateScript`. + +### The conversion + +Nine functions in `docs/lib/paged.browser.js` switched +from `async` to plain sync, marked +`[PATCH: sync-chain]` at each site: + +| function | what changed | +| --- | --- | +| `request()` | Async XHR + `new Promise` + `Response` wrapper → sync XHR (`open(...,false)`) returning body text directly. Both callers (`Polisher.add` / `convertViaSheet`) only ever consumed `response.text()` (itself async per spec), so returning text skips that boundary too. | +| `Sheet.parse()` | Three `await hook.trigger(...)` → `_assertSync(triggerSync(...))`. CSS-parser hooks all sync in our build. | +| `Polisher.convertViaSheet()` | Drop awaits on `sheet.parse` / `request` / recursive `convertViaSheet`. | +| `Polisher.add()` | Drop the `Promise.all` + then-chain entirely. Walks arguments once, feeds each through the sync pipeline. | +| `Chunker.loadFonts()` | `Promise.all(fontFace.load())` → sync walk of `document.fonts` that throws if any face's `status !== "loaded"`. The throw is a safety net; `page.goto({waitUntil:'load'})` settles fonts in practice. | +| `Chunker.clonePage()` | Three per-page hook awaits → `_assertSync`. Cold path (Footnotes-only). | +| `Chunker.render()` | Strip `async`. Body was already sync after the Phase 1/2 cleanup. | +| `Chunker.flow()` | Strip `async`; five await sites → sync calls / `_assertSync`. | +| `PagedPolyfill.preview()` | Strip `async`; two hook awaits → `_assertSync`; drop awaits on `polisher.add` / `chunker.flow`. | + +Plus the two external callers in +[`perf/measure.mjs`](measure.mjs) and +[`docs/render-book.mjs`](../docs/render-book.mjs): +both did `page.evaluate(async () => { await +window.PagedPolyfill.preview(); })`. The inner IIFE is +now a plain sync arrow; the outer `await` is just the +CDP round-trip puppeteer needs to ferry control back. + +The `_assertSync` helper (from the earlier +"sync chain end-to-end through the per-page hot path" +work) is the load-bearing safety net throughout: if any +future hook handler returns a thenable, the chain +throws with a useful error message instead of silently +swallowing async work. The contract is now: + +> Every hook handler in this bundle is sync. Every +> external resource referenced by the document is +> loaded before `PagedPolyfill.preview()` runs. + +If either invariant breaks, `_assertSync` or +`loadFonts`'s throw catches it loudly. + +### Results + +Paired `--detach-pages --no-timing --render-only +--tracing` run on the 1651-page book, comparing the +pre-conversion trace ([results from +"Inside RunMicrotasks" above]) against the post-: + +| metric | pre-sync | post-sync | Δ | +| --- | --- | --- | --- | +| render wall | 8.13 s | 8.36 s | flat (within single-run noise) | +| trace event count | 250,376 | 255,949 | flat | +| `RunMicrotasks` self | 2890.66 ms (35.6 %) | **0.56 ms** (off top-30) | **-2890 ms (-99.98 %)** | +| `RunMicrotasks` total | 6333.18 ms | **0.56 ms** | **-6333 ms** | +| `RunMicrotasks` hits | 15 | 12 | -3 | +| `RunMicrotasks` rm[4] dur | 6262.34 ms | gone | -6262 ms | +| `RunTask` self (top-30) | (below threshold, ~16 ms) | **2984.11 ms (34.6 %)** | **+2968 ms** | +| `RunTask` hits | (~few hundred) | **1005** | re-attributed | +| `RunTask` total | (small) | **8630.80 ms** | the whole render | +| `Document::UpdateStyleAndLayout` total/hits | 3320 / 39675 | 3515 / 39675 | flat | +| `Document::recalcStyle` self | 1737 ms | 1877 ms | flat | +| `LocalFrameView::performLayout` self | 1737 ms | 1881 ms | flat | +| per-page ratio (last/first quarter) | 1.36x | 1.27x | slight improvement (noise band) | +| pages | 1651 | 1651 | identical | +| PDF size (full render, separate run) | 16.1 MB | **16.1 MB** | byte-equivalent | + +The headline number is the **6333 → 0.56 ms collapse** +in `RunMicrotasks` total. The 12 surviving sub-ms hits +are pure puppeteer/CDP plumbing (one `AsyncTask Run` +child = 0.01 ms; the rest are V8 internal MT-checkpoint +runs). There is no remaining JS executing inside a +microtask continuation -- the render runs as a plain +synchronous task from start to end. + +The work didn't disappear, it re-attributed. `RunTask` +self-time (2984 ms) almost exactly equals the old +`RunMicrotasks` self-time (2891 ms) plus single-run +noise. Per-call children counts are unchanged +(`Document::UpdateStyleAndLayout`: 39675 calls then, +39675 calls now). Same JS, same DOM mutations, same +layout flushes -- just no longer wrapped in a +continuation. + +### What this buys + +**Profile readability.** A reader opening +`render.cpuprofile` or `trace.json` after this change +sees: + +- `(program)` in the cpu profile drops by the + proportion that was V8 runtime overhead inside the + continuation (the MT plumbing + dispatch glue + between named natives). The remaining `(program)` + is genuinely-unattributable V8 work (IC stubs, + runtime helpers). +- `RunMicrotasks` no longer appears at the top of the + trace's bottom-up table. The render lands under + `RunTask` / `EvaluateScript` / `FunctionCall`, with + Blink work (`performLayout`, `recalcStyle`, + `rebuildLayoutTree`) as named children where it + belongs. +- The cpu profile's `(idle)` row already collapsed in + the earlier rAF→queueMicrotask fix; this change + closes the symmetric gap on the JS side. + +**Structural simplicity.** Nine functions in the bundle +lost the `async` keyword and the `await` site +discipline that went with it. The render call chain is +now top-to-bottom synchronous: `preview()` calls into +`flow()` calls into `render()` calls into `*layout()`, +plain returns all the way down. Anyone tracing through +the bundle for a perf investigation can read the +control flow without modeling promise resolution +ordering. + +**Single contract.** The hook surface is now uniformly +sync via `_assertSync`. Before the conversion, the +per-page hooks (`beforePageLayout`, `afterPageLayout`, +`finalizePage`, etc.) were sync-asserted while the +once-per-render hooks (`beforeParsed`, `afterParsed`, +`afterRendered`, `beforePreview`, `afterPreview`) used +`await trigger(...)`. The split was historical, not +principled. Now every hook is sync-asserted, same +shape, same error message. + +### What this doesn't buy + +**Wall-clock.** Render goes 8.13 s → 8.36 s, which is +within the ±1 s single-run noise band for this machine +documented elsewhere in this README. CPU work +re-attributes but doesn't shrink: the chunker's JS +still runs the same way, DOM mutations still trigger +the same layout flushes, gBCR self-time still owns +~21 % of the trace. Phase 1's microtask-boundary +elimination cost (~850 ms) was real because there *were* +8 k boundaries to remove; this conversion eliminates a +handful of additional boundaries (the once-per-render +sites) whose per-boundary cost is small. + +**A path to fewer flushes.** The remaining gBCR-driven +layout work is intrinsic to paged.js's per-page +break-and-resume algorithm. The README's earlier +attempts (B, D from the "createBreakToken dedup" +investigation; the move-not-clone experiment) confirmed +that gBCR re-attributes if you elide one site, and +that mutations are the structural source. Synchronising +the chain doesn't change any of that. + +### Verification + +The 1651-page book renders identically pre- and +post-conversion -- same page count, same 16.1 MB PDF. +The PDF differs from the previous build only by the +expected timestamp drift (the `/CreationDate` / +`/ModDate` entries Chrome writes per run). No content +changes; the bundle does the same work in the same +order. + +The trace's `RunTask` -> `Document::UpdateStyleAndLayout` +hit count (39 675) matches the previous run exactly, +confirming the per-page chunker iteration count is +preserved through the conversion. `RunTask` -> +`WebFrameWidgetImpl::UpdateLifecycle` at 1950 ms / 1 +hit is Chromium's final-frame lifecycle work after the +last page is laid out, same as before -- it just shows +up under `RunTask` instead of being attributed to a +post-render microtask, which is also why `RunTask` self +includes it. + +### What's still async, post-conversion + +Two surfaces remain async-shaped, both intentionally: + +1. **The auto-run block at [paged.browser.js:33153](../docs/lib/paged.browser.js:33153).** + `ready.then(async function () { ... })` fires once at + `DOMContentLoaded` and is gated by `config.auto !== + false` -- our pipeline always sets `config.auto = + false` before invoking `preview()`, so this branch + never runs. Leaving it async-shaped costs one + microtask scheduling at startup, sub-microsecond, + and preserves byte-for-byte compatibility with + upstream paged.js's auto-init semantic for anyone + running this bundle in a configuration we don't. +2. **External `page.evaluate(...)` callers.** The + wrapper around `window.PagedPolyfill.preview()` in + `perf/measure.mjs` and `docs/render-book.mjs` is a + sync arrow, but `page.evaluate` itself returns a + Promise (CDP roundtrip). Node-side code awaits that + Promise. Cost is the CDP round-trip, not the JS we + execute. + +Neither contributes to the renderer's main-thread +profile. + +### Cumulative trace shape + +For reference, the post-conversion top-of-table on +`CrRendererMain` reads: + +``` + self_ms self_% event category + ------- ------ ---------------------------------------------- + 2984.11 34.58% RunTask devtools.timeline + 1880.79 21.79% LocalFrameView::performLayout blink + 1876.53 21.74% Document::recalcStyle blink + 540.06 6.26% InlineNode::ShapeTextIncludingFirstLine blink + 503.09 5.83% Document::rebuildLayoutTree blink + 128.90 1.49% Blink.CompositingInputs.UpdateTime blink + 123.41 1.43% Blink.PrePaint.UpdateTime blink + 99.60 1.15% Document::updateStyle blink + 76.83 0.89% V8.GC_MC_INCREMENTAL_EMBEDDER_TRACING v8.gc + 43.20 0.50% Layout devtools.timeline + ... +``` + +`RunMicrotasks` no longer appears. `(self / +unattributed)` time inside `RunTask` is 2984 ms across +1005 hits -- average ~3 ms per task, consistent with +"each render task does ~one page's worth of work" plus +some longer tasks for setup / teardown. The dominant +named children are unchanged: `UpdateStyleAndLayout`, +`recalcStyle`, `performLayout`, `ShapeText`, +`rebuildLayoutTree`. Same work, honest labels. + +Shipped. diff --git a/perf/measure.mjs b/perf/measure.mjs index 6161ce7..ed8b811 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -284,12 +284,15 @@ try { } const tRenderStart = Date.now(); - await page.evaluate(async () => { + // [PATCH: sync-chain] PagedPolyfill.preview() is fully synchronous in + // our forked bundle, so the IIFE here is a plain sync arrow. Outer + // `await` is for puppeteer's CDP round-trip back from page.evaluate. + await page.evaluate(() => { if (!window.PagedPolyfill) { throw new Error('paged.js bundle did not expose window.PagedPolyfill'); } try { - await window.PagedPolyfill.preview(); + window.PagedPolyfill.preview(); } catch (err) { const e = err && err.target ? new Error(`${err.type || 'event'} on ${err.target.tagName || '?'}: ${err.target.src || err.target.href || ''}`) From 7886aed558342b1b19e05ad70b28a27ec833d8e9 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sat, 23 May 2026 15:23:34 +0200 Subject: [PATCH 06/18] Hybrid trace: embed V8 cpu_profiler samples in --tracing output. --- perf/README.md | 23 ++++++++++++++++++----- perf/measure.mjs | 5 +++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/perf/README.md b/perf/README.md index 9ce36dd..287614b 100644 --- a/perf/README.md +++ b/perf/README.md @@ -107,7 +107,7 @@ DevTools-compatible trace is a few lines. | `compare-outlines.mjs` | Diffs two PDFs' `/Outlines` trees by `(depth, title, target page)`. Used to verify whether Chrome's native outline matches the injected one. | | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. | | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. | -| `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). | +| `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). | | `ab-aggregate.mjs` | Per-row mean + SD aggregator across 6 paired cpu profiles (`ab-A1..A3.cpuprofile` and `ab-B1..B3.cpuprofile`). Use when wall-clock noise drowns a structural change: capture 3+3 interleaved profiles via `measure.mjs --cpu-profile` with the change toggled on/off between runs, then point this at the 6 files for a mean-with-SD table that surfaces deltas wall-clock can't see (e.g. ~6 σ shifts on rows that move from 88 ms to 2 ms). See the "Disabling the filter outright" section in this README for the methodology. | | `find-callers.mjs` | "Who paid for this callee's time?" -- walks a `.cpuprofile` and attributes a target function's total time back to each direct caller. Used throughout the post-mortems to detect gBCR migration between callers. | | `find-callees.mjs` | The other direction of `find-callers.mjs`: splits a function's self+descendant time across its direct callees. Surfaces the cases where V8 has rolled native DOM work back into the calling JS frame (Range deletion in `removeOverflow`, HTML parser in `wrapContent`). | @@ -166,7 +166,7 @@ run.bat --instrument # count + time DOM-accessor calls run.bat --time-hooks # per-task timing of every chunker/polisher hook run.bat --incremental # process via incremental update instead of pdf-lib roundtrip run.bat --chrome-outline # let Chrome emit /Outlines (skip parseOutline + setOutline) -run.bat --tracing # capture a Chrome trace of the render phase +run.bat --tracing # capture a hybrid Chrome trace (Blink events + embedded V8 cpu samples) ``` Flags compose. The CPU profile lands as `render.cpuprofile` @@ -4147,14 +4147,27 @@ a `--tracing` flag and a companion `analyze-trace.mjs`. The flag wraps the render phase in `page.tracing.start()` with Blink-relevant categories (`devtools.timeline`, `disabled-by-default-devtools.timeline`, `blink`, `v8`, -`v8.execute`) and writes `trace.json` to the results -folder. `analyze-trace.mjs` walks the trace's complete-phase +`v8.execute`, `disabled-by-default-v8.cpu_profiler`) and +writes `trace.json` to the results folder. The +`v8.cpu_profiler` category embeds V8 sampling-profile data +as `Profile` / `ProfileChunk` events inline with the Blink +trace events, so the single trace file is *hybrid*: loaded +in Chrome DevTools Performance or [ui.perfetto.dev](https://ui.perfetto.dev) +it renders JS call stacks aligned with Blink events on the +same timeline (the de facto answer to "what was `(program)` +doing?"). Cost: ~2x file size (e.g. 22 MB -> 52 MB on the +1651-page book) and ~0.4 s wall-clock for the extra sampler +work -- both noise on the analysis side. + +`analyze-trace.mjs` walks the trace's complete-phase events on `CrRendererMain`, computes self-time per event name via a nested-event stack walk (same shape as `analyze-profile.mjs` for cpuprofiles), and prints a top-N table. A `--children ` mode breaks any parent event into its direct callees, mirroring -`find-callees.mjs`. +`find-callees.mjs`. It ignores the embedded V8 cpu samples +-- those are consumed separately by the viewers above (or +by a forthcoming hybrid analyzer in this folder). ### What's on the main thread diff --git a/perf/measure.mjs b/perf/measure.mjs index ed8b811..46af266 100644 --- a/perf/measure.mjs +++ b/perf/measure.mjs @@ -268,6 +268,10 @@ try { // adds UpdateLayoutTree / InvalidateLayout / ScheduleStyleRecalc / // HitTest; blink covers internal Blink events; v8 + v8.execute cover // V8.GC* / V8.CompileCode / V8.RunMicrotasks / V8.Execute. + // disabled-by-default-v8.cpu_profiler embeds V8 sampling-profile data + // as Profile / ProfileChunk events inline with the trace, giving JS + // call stacks aligned with Blink events when loaded in Chrome + // DevTools Performance or perfetto.dev (the hybrid view). tracePath = join(outDir, 'trace.json'); await page.tracing.start({ path: tracePath, @@ -278,6 +282,7 @@ try { 'blink', 'v8', 'v8.execute', + 'disabled-by-default-v8.cpu_profiler', ], }); console.log(`[harness] tracing: ${tracePath}`); From 0a968d912e4df706a1a2ed07ff13de82c61d147a Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sat, 23 May 2026 15:37:11 +0200 Subject: [PATCH 07/18] Add analyze-hybrid.mjs: bottom-up + callees view across JS and Blink. --- perf/README.md | 7 +- perf/analyze-hybrid.mjs | 341 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 346 insertions(+), 2 deletions(-) create mode 100644 perf/analyze-hybrid.mjs diff --git a/perf/README.md b/perf/README.md index 287614b..5fb34fc 100644 --- a/perf/README.md +++ b/perf/README.md @@ -108,6 +108,7 @@ DevTools-compatible trace is a few lines. | `probe-outline-exclusions.mjs` | Tests which per-element attributes / styles (aria-hidden, role=presentation, hidden, display:none, CSS bookmark-level, ...) make Chrome drop a heading from its outline. | | `analyze-profile.mjs` | Bottom-up self-time analyzer for `.cpuprofile` files. Same shape as DevTools' Performance bottom-up view, in the terminal. | | `analyze-trace.mjs` | Bottom-up self-time analyzer for Chrome traces (`trace.json` from `--tracing`). Computes per-event self-time on the renderer's main thread (`CrRendererMain` by default) by walking nested `X`-phase events. Cracks the cpu profile's `(program)` bucket open into named Blink / V8 events (`Layout`, `RecalcStyle`, `RunMicrotasks`, `V8.GC_*`, ...). Operates on the Blink trace events only -- ignores any embedded V8 cpu samples (`Profile` / `ProfileChunk`). | +| `analyze-hybrid.mjs` | Bottom-up analyzer that *combines* the V8 cpu samples and the Blink trace events from a hybrid `trace.json`. Builds a `[JS root..leaf] ++ [Blink outer..inner]` stack at each sample (filtering V8's virtual frames and JS-entry wrapper events) and prints either top-N self-time mixing JS function names with Blink/V8 event names, or `--callees