From 1aec83c2f32a02c6f5e3c010c76ed012d75e8e53 Mon Sep 17 00:00:00 2001 From: Kuba Sunderland-Ober Date: Sun, 24 May 2026 14:52:27 +0200 Subject: [PATCH 1/2] Port link checking to node.js. No more Python dependency. --- .github/workflows/checks.yml | 41 +- .github/workflows/jekyll-gh-pages.yml | 61 +- WIP.md | 4 +- .../Documentation Development.md | 2 +- docs/_plugins/offlinify.md | 26 +- docs/_plugins/offlinify.rb | 7 +- docs/check.bat | 52 +- docs/lychee.bat | 45 -- package-lock.json | 108 ++++ package.json | 1 + requirements.txt | 1 - scripts/check_links.mjs | 607 ++++++++++++++++++ scripts/check_links.py | 455 ------------- scripts/check_offline_live_links.py | 97 --- 14 files changed, 800 insertions(+), 707 deletions(-) delete mode 100644 docs/lychee.bat delete mode 100644 requirements.txt create mode 100644 scripts/check_links.mjs delete mode 100644 scripts/check_links.py delete mode 100644 scripts/check_offline_live_links.py diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index a6fca2e8..c246d68d 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -35,41 +35,40 @@ jobs: - name: Build with Jekyll run: bundle exec jekyll build working-directory: ./docs - - name: Set up Python for link checks - uses: actions/setup-python@v5 + - name: Set up Node.js + uses: actions/setup-node@v4 with: - python-version: '3.14' - cache: 'pip' - - name: Install Python deps - run: pip install -r requirements.txt - - name: Check online links (check_links.py) + node-version: '22' + cache: 'npm' + cache-dependency-path: package-lock.json + - name: Install Node.js dependencies + run: npm ci + - name: Check online links (check_links.mjs) # `--fallback-extensions html` mirrors what GitHub Pages does at request time: # an extensionless URL like `/FAQ` is served as `/FAQ.html`. This workflow's # Jekyll build runs without --baseurl (no Pages prefix), so no --base-path is # needed -- contrast with jekyll-gh-pages.yml. 
run: >- - python scripts/check_links.py + node scripts/check_links.mjs --offline --include-fragments --fallback-extensions html --index-files 'index.html,.' --root-dir docs/_site docs/_site - - name: Check offline links (check_links.py) + - name: Check offline links and live-link survivors (check_links.mjs) + # Strict check on `_site-offline/`: every link must resolve to an actual file + # under `file://`, with no extension fallback. `--forbid` also fails the build + # if any extracted link still points at https://docs.twinbasic.com/ -- + # i.e. any live-site reference the offlinify rewrite missed. The bare root + # URL (https://docs.twinbasic.com[/]) is exempt, since intentional "go to the + # live site" links are allowed. run: >- - python scripts/check_links.py + node scripts/check_links.mjs --offline --include-fragments --index-files index.html + --forbid 'https://docs.twinbasic.com' --root-dir docs/_site-offline docs/_site-offline - - name: Check for surviving live-site links in offline tree - # Flags any https://docs.twinbasic.com/ reference left in - # _site-offline/ HTML outside /
 blocks. After offlinify
-        # strips the jekyll-seo-tag block, anything surviving is a source
-        # link that points at the live site instead of using a relative or
-        # /tB/... permalink that resolves locally. The bare root URL
-        # (https://docs.twinbasic.com[/]) is exempt -- intentional "go to
-        # the live site" links are allowed.
-        run: python scripts/check_offline_live_links.py
       - name: Check book links (informational)
         # Failures do not block the build. The book still has absolute
         # intra-site URLs that the chapter transform has not yet rewritten
@@ -77,7 +76,7 @@ jobs:
         # for visibility until those are fixed.
         continue-on-error: true
         run: >-
-          python scripts/check_links.py
+          node scripts/check_links.mjs
           --offline --include-fragments
           --root-dir docs/_site-pdf
-          docs/_site-pdf/book.html
\ No newline at end of file
+          docs/_site-pdf/book.html
diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml
index 3ca84c27..e7b5c684 100644
--- a/.github/workflows/jekyll-gh-pages.yml
+++ b/.github/workflows/jekyll-gh-pages.yml
@@ -57,51 +57,48 @@ jobs:
         env:
           JEKYLL_ENV: production
           PAGES_REPO_NWO: "${{ github.repository }}"
-      - name: Set up Python for link checks
-        uses: actions/setup-python@v5
+      - name: Set up Node.js
+        uses: actions/setup-node@v4
         with:
-          python-version: '3.14'
-          cache: 'pip'
-      - name: Install Python deps
-        run: pip install -r requirements.txt
-      - name: Check online links (check_links.py)
+          node-version: '22'
+          cache: 'npm'
+          cache-dependency-path: package-lock.json
+      - name: Install Node.js dependencies and Chromium
+        # Install npm deps first so the link checks (check_links.mjs)
+        # can use htmlparser2. Chromium download runs in the same step
+        # so the cache hit / miss is one decision.
+        run: |
+          npm ci
+          sudo npx puppeteer browsers install chrome --install-deps
+      - name: Check online links (check_links.mjs)
         # `--fallback-extensions html` mirrors what GitHub Pages does at request time:
         # an extensionless URL like `/FAQ` is served as `/FAQ.html`. Without the flag
         # every pretty permalink on the site would look broken.
         #
         # `--base-path` strips the Pages baseurl (e.g. `/twinBASIC-docs`) from absolute
-        # URLs before resolving against `--root-dir`. Equivalent to the `--remap` regex
-        # that lychee used in earlier iterations of this step.
+        # URLs before resolving against `--root-dir`.
         run: >-
-          python scripts/check_links.py
+          node scripts/check_links.mjs
           --offline --include-fragments
           --fallback-extensions html
           --index-files 'index.html,.'
           --base-path '${{ steps.pages.outputs.base_path }}'
           --root-dir docs/_site
           docs/_site
-      - name: Check offline links (check_links.py)
+      - name: Check offline links and live-link survivors (check_links.mjs)
         # Strict check on `_site-offline/`: every link must resolve to an actual file
-        # under `file://`, with no extension fallback. Catches relative links in
-        # markdown sources that point at a permalink that doesn't match the rendered
-        # filename (e.g. `[Foo](Foo/)` when Jekyll wrote `Foo.html`, not
-        # `Foo/index.html`) -- the kind of breakage the online check above hides
-        # behind `--fallback-extensions html`.
+        # under `file://`, with no extension fallback. `--forbid` also fails the build
+        # if any extracted link still points at https://docs.twinbasic.com/ --
+        # i.e. any live-site reference the offlinify rewrite missed. The bare root
+        # URL (https://docs.twinbasic.com[/]) is exempt, since intentional "go to the
+        # live site" links are allowed.
         run: >-
-          python scripts/check_links.py
+          node scripts/check_links.mjs
           --offline --include-fragments
           --index-files index.html
+          --forbid 'https://docs.twinbasic.com'
           --root-dir docs/_site-offline
           docs/_site-offline
-      - name: Check for surviving live-site links in offline tree
-        # Flags any https://docs.twinbasic.com/ reference left in
-        # _site-offline/ HTML outside / blocks. After offlinify
-        # strips the jekyll-seo-tag block, anything surviving is a source
-        # link that points at the live site instead of using a relative or
-        # /tB/... permalink that resolves locally. The bare root URL
-        # (https://docs.twinbasic.com[/]) is exempt -- intentional "go to
-        # the live site" links are allowed.
-        run: python scripts/check_offline_live_links.py
       - name: Check book links (informational)
         # Failures do not block the build. The book still has absolute
         # intra-site URLs that the chapter transform has not yet rewritten
@@ -109,20 +106,10 @@ jobs:
         # for visibility until those are fixed.
         continue-on-error: true
         run: >-
-          python scripts/check_links.py
+          node scripts/check_links.mjs
           --offline --include-fragments
           --root-dir docs/_site-pdf
           docs/_site-pdf/book.html
-      - name: Set up Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: '22'
-          cache: 'npm'
-          cache-dependency-path: package-lock.json
-      - name: Install Node.js dependencies and Chromium
-        run: |
-          npm ci
-          sudo npx puppeteer browsers install chrome --install-deps
       - name: Render book PDF
         run: |
           mkdir -p _pdf
diff --git a/WIP.md b/WIP.md
index ef72dfa5..35ecd07e 100644
--- a/WIP.md
+++ b/WIP.md
@@ -430,7 +430,7 @@ From `docs/`:
 
 - `bundle exec jekyll build` (or `build.bat`) — builds three trees in a single Jekyll run: the online copy at `_site/`, a `file://`-browsable copy at `_site-offline/`, and the sparse pagedjs source at `_site-pdf/`. The offline pass (`_plugins/offlinify.rb`, activated by `also_build_offline: true` in `_config.yml`) adds ~3-5s and the PDF pass (`_plugins/pdfify.rb`, activated by `also_build_pdf: true`) adds <1s on top of the normal ~13s build. The PDF plugin captures `book.html`'s rendered output (the concatenated chapter document built via `_layouts/book-combined.html`) at `:pages, :post_render`, drops the page from `site.pages` at `:site, :post_render` so `_site/book.html` is never written, and at `:site, :post_write` writes the captured bytes into `_site-pdf/book.html` along with `assets/css/print.css`, `assets/css/rouge.css`, and every relative `` target -- just what pagedjs needs to render the book PDF. The companion `offline_exclude: [..., book.html]` entry in `_config.yml` keeps `offlinify.rb` from copying book.html into `_site-offline/`: offlinify's per-page hook fires before pdfify's `:site, :post_render` (Jekyll fires every per-page hook before any site-level post-render hook), so during offlinify's pass `book.html` is still in `site.pages` and the exclude is what makes it skip writing the offline copy. When `also_build_pdf: false` the exclude does the same job from a different angle -- pdfify never runs, `book.html` renders normally to `_site/`, and the exclude still keeps it out of `_site-offline/`. After Jekyll's WRITE phase, the offline plugin walks `_site/`, copies binary assets verbatim into `_site-offline/`, and for each HTML and CSS file rewrites every root-absolute `href` / `src` / `url()` to a page-relative path with the resolved file extension (`/FAQ` → `../../FAQ.html`, `/Tutorials/CEF/` → `../../Tutorials/CEF/index.html`). 
It also patches the offline copy of `assets/js/just-the-docs.js` in two places — `navLink()` to match the active nav entry by resolved DOM `link.href` rather than `document.location.pathname` (the upstream pathname-vs-attribute compare returns no match under `file://`, leaving the sidebar with no `.active` class so the nav appears collapsed on every navigation), and `initSearch()` to read the lunr index from `window.SEARCH_DATA` rather than fetching `search-data.json` over `XMLHttpRequest` (XHR to `file://` resources is blocked by browsers; classic `