name: Link Checker

# =======================
# Website Link Validation
# =======================
# Purpose: Downloads the latest built site and checks all links (internal + external)
# Triggers: Weekly on Mondays at 01:30 UTC or manual dispatch
# Reports: Creates a GitHub issue with label "link-check" when broken links are found
# Config: See .lychee.toml for exclusion patterns and request settings

on:
  schedule:
    # Runs at 01:30 UTC every Monday
    - cron: '30 1 * * 1'
  workflow_dispatch:

# Least-privilege token: read code, open issues, read workflow artifacts.
permissions:
  contents: read
  issues: write
  actions: read

# Only one link-check run per ref at a time; a newer run cancels an older one.
concurrency:
  group: link-check-${{ github.ref }}
  cancel-in-progress: true
jobs:
  link-check:
    name: Check Links
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      # Reuse the most recent site build produced by deploy.yaml instead of
      # rebuilding here. The third-party action is pinned to a full commit SHA
      # for supply-chain safety; the artifact name is matched by regex.
      - name: Download latest build artifact
        uses: dawidd6/action-download-artifact@07ab29fd4a977ae4d2b275087cf67563dfdf0295
        with:
          workflow: deploy.yaml
          name: forrt-website-.*
          name_is_regexp: true
          path: /tmp/site
          github_token: ${{ secrets.GITHUB_TOKEN }}
          search_artifacts: true
          if_no_artifact_found: fail

      # fail:false keeps the job green so the later steps can post-process the
      # report and open an issue; breakage is detected via the exit_code output.
      - name: Run lychee link checker
        id: lychee
        uses: lycheeverse/lychee-action@v2
        with:
          args: "--config .lychee.toml --base-url https://forrt.org /tmp/site"
          output: /tmp/lychee/out.md
          fail: false
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
| 55 | + - name: Process lychee output |
| 56 | + if: steps.lychee.outputs.exit_code != 0 |
| 57 | + run: | |
| 58 | + python3 << 'PYEOF' |
| 59 | + import re |
| 60 | +
|
| 61 | + with open("/tmp/lychee/out.md") as f: |
| 62 | + content = f.read() |
| 63 | +
|
| 64 | + lines = content.split("\n") |
| 65 | +
|
| 66 | + # Keep the summary table (everything before "## Errors per input") |
| 67 | + summary_lines = [] |
| 68 | + error_lines = [] |
| 69 | + in_errors = False |
| 70 | + for line in lines: |
| 71 | + if line.startswith("## Errors per input"): |
| 72 | + in_errors = True |
| 73 | + continue |
| 74 | + if in_errors: |
| 75 | + error_lines.append(line) |
| 76 | + else: |
| 77 | + summary_lines.append(line) |
| 78 | +
|
| 79 | + # Parse errors, tracking which pages each URL appears on |
| 80 | + # url -> {"status": str, "pages": [str]} |
| 81 | + url_info = {} |
| 82 | + main_errors = {} # non-403 |
| 83 | + forbidden_errors = {} # 403 |
| 84 | + current_page = "" |
| 85 | +
|
| 86 | + for line in error_lines: |
| 87 | + # Track section headers (### Errors in /tmp/site/.../index.html) |
| 88 | + page_match = re.match(r"^### Errors in /tmp/site/[^/]+/(.+)", line) |
| 89 | + if page_match: |
| 90 | + # Convert file path to URL path |
| 91 | + path = page_match.group(1) |
| 92 | + path = re.sub(r"/index\.html$", "/", path) |
| 93 | + current_page = f"/{path}" |
| 94 | + continue |
| 95 | +
|
| 96 | + m = re.match(r"^\* \[(\w+)\] <([^>]+)>", line) |
| 97 | + if not m: |
| 98 | + continue |
| 99 | + status, url = m.group(1), m.group(2) |
| 100 | +
|
| 101 | + target = forbidden_errors if status == "403" else main_errors |
| 102 | + if url not in target: |
| 103 | + target[url] = {"status": status, "pages": []} |
| 104 | + if current_page and current_page not in target[url]["pages"]: |
| 105 | + target[url]["pages"].append(current_page) |
| 106 | +
|
| 107 | + # Build output |
| 108 | + output = "\n".join(summary_lines).rstrip() |
| 109 | + output += "\n\n## Broken links\n\n" |
| 110 | +
|
| 111 | + if main_errors: |
| 112 | + for url, info in main_errors.items(): |
| 113 | + pages = info["pages"] |
| 114 | + page_str = f" (in {pages[0]})" if len(pages) == 1 else f" (in {len(pages)} pages)" |
| 115 | + output += f"* [{info['status']}] <{url}>{page_str}\n" |
| 116 | + else: |
| 117 | + output += "No broken links found (excluding 403s).\n" |
| 118 | +
|
| 119 | + if forbidden_errors: |
| 120 | + output += f"\n<details>\n<summary>403 Forbidden ({len(forbidden_errors)} URLs — likely bot-blocking, not broken)</summary>\n\n" |
| 121 | + output += "These sites block automated requests. The links may still be valid.\n" |
| 122 | + output += "Showing first 100 — see workflow logs for full list.\n\n" |
| 123 | + for i, (url, info) in enumerate(forbidden_errors.items()): |
| 124 | + if i >= 100: |
| 125 | + output += f"\n*... and {len(forbidden_errors) - 100} more*" |
| 126 | + break |
| 127 | + output += f"* <{url}>\n" |
| 128 | + output += "\n</details>\n" |
| 129 | +
|
| 130 | + with open("/tmp/lychee/out.md", "w") as f: |
| 131 | + f.write(output) |
| 132 | + PYEOF |
| 133 | +
|
| 134 | + - name: Find publisher URLs that should use doi.org |
| 135 | + id: doi-check |
| 136 | + run: | |
| 137 | + # Search source markdown files (excluding glossary, which is auto-generated) |
| 138 | + # for direct publisher URLs that should use https://doi.org/ instead. |
| 139 | + PUBLISHERS='(journals\.sagepub|tandfonline|psycnet\.apa|onlinelibrary\.wiley|link\.springer|academic\.oup|sciencedirect|jstor\.org|journals\.lww|royalsocietypublishing)\.(com|org)' |
| 140 | + # Extract just file:line and the URL itself (not full line content) |
| 141 | + MATCHES=$(grep -rno --include='*.md' -E \ |
| 142 | + "https?://[^ )\"']*(${PUBLISHERS})/[^ )\"']*(doi/|article|fulltext)[^ )\"']*" \ |
| 143 | + content/ --exclude-dir=content/glossary | sort -u || true) |
| 144 | + if [ -n "$MATCHES" ]; then |
| 145 | + COUNT=$(echo "$MATCHES" | wc -l) |
| 146 | + { |
| 147 | + echo "" |
| 148 | + echo "## Publisher URLs that should use doi.org ($COUNT found)" |
| 149 | + echo "" |
| 150 | + echo "The following links point directly to publisher websites instead of using" |
| 151 | + echo "\`https://doi.org/{DOI}\` format. Publishers often block automated requests," |
| 152 | + echo "making these URLs uncheckable. Please replace them with doi.org links." |
| 153 | + echo "If the DOI is not visible in the URL, look it up on https://search.crossref.org" |
| 154 | + echo "" |
| 155 | + echo '```' |
| 156 | + echo "$MATCHES" |
| 157 | + echo '```' |
| 158 | + } >> /tmp/lychee/out.md |
| 159 | + echo "found=true" >> "$GITHUB_OUTPUT" |
| 160 | + else |
| 161 | + echo "found=false" >> "$GITHUB_OUTPUT" |
| 162 | + fi |
| 163 | +
|
| 164 | + - name: Create issue from lychee output |
| 165 | + if: steps.lychee.outputs.exit_code != 0 || steps.doi-check.outputs.found == 'true' |
| 166 | + uses: peter-evans/create-issue-from-file@v5 |
| 167 | + with: |
| 168 | + title: "Link Checker Report" |
| 169 | + content-filepath: /tmp/lychee/out.md |
| 170 | + labels: link-check |
| 171 | + token: ${{ secrets.GITHUB_TOKEN }} |
0 commit comments