diff --git a/.github/workflows/nightly-news-generation.yml b/.github/workflows/nightly-news-generation.yml
new file mode 100644
index 00000000..5ec309c4
--- /dev/null
+++ b/.github/workflows/nightly-news-generation.yml
@@ -0,0 +1,247 @@
+# Nightly pipeline: generates daily news articles from riksdag-regering-mcp data,
+# refreshes indexes/sitemap, lints the new HTML and opens a pull request.
+name: Nightly News Generation
+
+on:
+  schedule:
+    # GitHub cron runs in UTC: 01:00 UTC = 02:00 CET (winter) / 03:00 CEST (summer)
+    - cron: '0 1 * * *'
+  workflow_dispatch:
+    inputs:
+      date:
+        description: 'Override document window start date (YYYY-MM-DD, default: yesterday)'
+        required: false
+      threshold:
+        description: 'Minimum document count per type to trigger generation (default: 5)'
+        required: false
+        default: '5'
+      languages:
+        description: 'Languages to generate (en,sv | nordic | eu-core | all)'
+        required: false
+        default: 'all'
+      types:
+        description: 'Article types (committee-reports,propositions,motions,week-ahead)'
+        required: false
+        default: 'committee-reports,propositions,motions,week-ahead'
+      dry_run:
+        description: 'Dry run — log what would happen without writing files'
+        type: boolean
+        required: false
+        default: false
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  generate-news:
+    name: Generate Daily News Articles
+    runs-on: ubuntu-latest
+    timeout-minutes: 60
+
+    steps:
+      - name: Harden Runner
+        uses: step-security/harden-runner@5ef0c079ce82195b2a36a210272d6b661572d83e # v2.14.2
+        with:
+          egress-policy: audit
+
+      - name: Checkout repository
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+
+      - name: Setup Node.js
+        uses: actions/setup-node@6044e13b5dc448c55e2357c09f80417699197238 # v6.2.0
+        with:
+          node-version: '24'
+          cache: 'npm'
+
+      - name: Install dependencies
+        run: |
+          echo "📦 Installing dependencies…"
+          npm ci --prefer-offline --no-audit
+          echo "✅ Dependencies installed"
+
+      - name: Set date variables
+        id: dates
+        run: |
+          DATE=$(date +%Y-%m-%d)
+          YESTERDAY=$(date -d yesterday +%Y-%m-%d)
+          echo "today=$DATE" >> "$GITHUB_OUTPUT"
+          echo "yesterday=$YESTERDAY" >> "$GITHUB_OUTPUT"
+          echo "📅 Today    : $DATE"
+          echo "📅 Yesterday: $YESTERDAY"
+
+      - name: Generate daily news articles
+        id: generate
+        env:
+          MCP_AUTH_TOKEN: ${{ secrets.MCP_AUTH_TOKEN }}
+          MCP_SERVER_URL: ${{ vars.MCP_SERVER_URL || 'https://riksdag-regering-ai.onrender.com/mcp' }}
+          MCP_CLIENT_TIMEOUT_MS: '90000'
+          # workflow_dispatch inputs are free text: pass them to the shell as environment
+          # variables instead of expanding them inside the script (prevents script injection).
+          INPUT_DATE: ${{ github.event.inputs.date }}
+          INPUT_THRESHOLD: ${{ github.event.inputs.threshold || '5' }}
+          INPUT_LANGUAGES: ${{ github.event.inputs.languages || 'all' }}
+          INPUT_TYPES: ${{ github.event.inputs.types || 'committee-reports,propositions,motions,week-ahead' }}
+          INPUT_DRY_RUN: ${{ github.event.inputs.dry_run }}
+        run: |
+          # Build CLI arguments as a bash array so every value stays exactly one word
+          ARGS=()
+
+          # --date (only when an override was supplied; scheduled runs leave it empty)
+          if [ -n "$INPUT_DATE" ]; then
+            ARGS+=("--date=$INPUT_DATE")
+          fi
+
+          # --threshold / --languages / --types (defaults are applied in env above)
+          ARGS+=("--threshold=$INPUT_THRESHOLD")
+          ARGS+=("--languages=$INPUT_LANGUAGES")
+          ARGS+=("--types=$INPUT_TYPES")
+
+          # --dry-run
+          if [ "$INPUT_DRY_RUN" = "true" ]; then
+            ARGS+=("--dry-run")
+            echo "🔍 Dry-run mode enabled"
+          fi
+
+          echo "🚀 Running: node scripts/generate-daily-news.js ${ARGS[*]}"
+          node scripts/generate-daily-news.js "${ARGS[@]}"
+
+      - name: Read generation report
+        if: always()
+        id: report
+        run: |
+          REPORT_FILE="news/metadata/daily-report.json"
+          if [ -f "$REPORT_FILE" ]; then
+            echo "📊 Generation report:"
+            cat "$REPORT_FILE"
+
+            # `// []` keeps jq from failing the step when a field is missing from the report
+            ARTICLES_CREATED=$(jq -r '(.articlesCreated // []) | join(", ")' "$REPORT_FILE")
+            ERRORS=$(jq -r '(.errors // []) | length' "$REPORT_FILE")
+            echo "articles_created=$ARTICLES_CREATED" >> "$GITHUB_OUTPUT"
+            echo "error_count=$ERRORS" >> "$GITHUB_OUTPUT"
+          else
+            echo "⚠️ No report file found"
+            echo "articles_created=" >> "$GITHUB_OUTPUT"
+            echo "error_count=0" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Update news indexes and sitemap
+        if: steps.generate.outcome == 'success' && github.event.inputs.dry_run != 'true'
+        run: |
+          echo "🔄 Updating news indexes…"
+          if [ -f "scripts/update-news-indexes-and-sitemap.py" ]; then
+            python3 scripts/update-news-indexes-and-sitemap.py
+            echo "✅ Indexes and sitemap updated"
+          elif [ -f "package.json" ] && grep -q '"generate-news-indexes"' package.json; then
+            node scripts/generate-news-indexes.js
+            node scripts/generate-sitemap.js
+            echo "✅ Indexes and sitemap updated"
+          else
+            echo "ℹ️ No index/sitemap update script found — skipping"
+          fi
+
+      - name: Validate generated HTML
+        if: steps.generate.outcome == 'success' && github.event.inputs.dry_run != 'true'
+        run: |
+          echo "🔍 Validating generated HTML…"
+          TODAY="${{ steps.dates.outputs.today }}"
+
+          # Build list of today's new files (nullglob: no match yields an empty array)
+          shopt -s nullglob
+          NEW_FILES=(news/${TODAY}-*.html)
+          shopt -u nullglob
+
+          if [ ${#NEW_FILES[@]} -eq 0 ]; then
+            echo "ℹ️ No new HTML files for $TODAY — skipping validation"
+          else
+            echo "Validating ${#NEW_FILES[@]} files…"
+            npx --yes htmlhint "${NEW_FILES[@]}" || echo "⚠️ HTMLHint found issues (non-blocking)"
+            echo "✅ HTML validation complete"
+          fi
+
+      - name: Create Pull Request
+        if: >
+          steps.generate.outcome == 'success' &&
+          github.event.inputs.dry_run != 'true' &&
+          steps.report.outputs.articles_created != ''
+        id: create-pr
+        uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8.1.0
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          commit-message: 'news: automated daily articles for ${{ steps.dates.outputs.today }}'
+          title: '📰 Daily news: ${{ steps.dates.outputs.today }}'
+          body: |
+            ## 📰 Automated Daily News Generation
+
+            This PR was created automatically by the nightly news generation workflow.
+
+            ### Summary
+            - **Date**: ${{ steps.dates.outputs.today }}
+            - **Document window**: ${{ steps.dates.outputs.yesterday }} → ${{ steps.dates.outputs.today }}
+            - **Languages**: ${{ github.event.inputs.languages || 'all' }}
+            - **Articles created**: ${{ steps.report.outputs.articles_created }}
+            - **Errors**: ${{ steps.report.outputs.error_count }}
+
+            ### Article Types Generated
+            ${{ steps.report.outputs.articles_created }}
+
+            ### Quality Checks
+            - [x] MCP data fetched from riksdag-regering-mcp
+            - [x] Document threshold applied (≥${{ github.event.inputs.threshold || '5' }} docs per type)
+            - [x] Multi-language generation (${{ github.event.inputs.languages || 'all' }})
+            - [x] HTML validation with HTMLHint
+            - [x] News indexes and sitemap updated
+
+            ### References
+            - Workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
+            - Script: `scripts/generate-daily-news.js`
+            - Guide: `ARTICLE_ENHANCEMENT_GUIDE.md`
+
+            ---
+            *Automatically generated by the Nightly News Generation workflow*
+          branch: 'auto/daily-news-${{ steps.dates.outputs.today }}'
+          delete-branch: true
+          labels: |
+            automated-pipeline
+            news-article
+            content
+
+      - name: Output PR URL
+        if: steps.create-pr.outputs.pull-request-url != ''
+        run: |
+          echo "✅ Pull request created: ${{ steps.create-pr.outputs.pull-request-url }}"
+          echo "## 📰 PR Created" >> "$GITHUB_STEP_SUMMARY"
+          echo "${{ steps.create-pr.outputs.pull-request-url }}" >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Write step summary
+        if: always()
+        env:
+          # User-supplied and report-derived values go through env, never into the script text
+          SUMMARY_DATE: ${{ steps.dates.outputs.today }}
+          SUMMARY_LANGUAGES: ${{ github.event.inputs.languages || 'all' }}
+          SUMMARY_ARTICLES: ${{ steps.report.outputs.articles_created || 'none' }}
+          SUMMARY_ERRORS: ${{ steps.report.outputs.error_count || '0' }}
+          SUMMARY_STATUS: ${{ steps.generate.outcome }}
+        run: |
+          {
+            echo "## 📰 Nightly News Generation — $SUMMARY_DATE"
+            echo ""
+            echo "| Field | Value |"
+            echo "|-------|-------|"
+            echo "| Date | $SUMMARY_DATE |"
+            echo "| Languages | $SUMMARY_LANGUAGES |"
+            echo "| Articles created | $SUMMARY_ARTICLES |"
+            echo "| Errors | $SUMMARY_ERRORS |"
+            echo "| Status | $SUMMARY_STATUS |"
+          } >> "$GITHUB_STEP_SUMMARY"
+
+      - name: Notify on failure
+        if: failure()
+        run: |
+          echo "❌ Nightly news generation failed!" >&2
+          echo "Please check the workflow run for details:"
+          echo "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          echo ""
+          echo "## ❌ Generation Failed" >> "$GITHUB_STEP_SUMMARY"
+          echo "Check the [workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for details." >> "$GITHUB_STEP_SUMMARY"
diff --git a/ARTICLE_ENHANCEMENT_GUIDE.md b/ARTICLE_ENHANCEMENT_GUIDE.md
new file mode 100644
index 00000000..da60764b
--- /dev/null
+++ b/ARTICLE_ENHANCEMENT_GUIDE.md
@@ -0,0 +1,892 @@
+# Article Enhancement Guide
+
+**Version:** 2.0
+**Last Updated:** 2026-02-19
+**Classification:** Public
+**Owner:** Hack23 AB
+**Repository:** [Hack23/riksdagsmonitor](https://github.com/Hack23/riksdagsmonitor)
+
+---
+
+## 📋 Overview
+
+This guide documents the complete workflow for generating, enhancing, translating, and publishing news articles on Riksdagsmonitor. It consolidates proven patterns from Issues [#306–#334](https://github.com/Hack23/riksdagsmonitor/issues) and successful PRs [#307](https://github.com/Hack23/riksdagsmonitor/pull/307), [#312](https://github.com/Hack23/riksdagsmonitor/pull/312), [#313](https://github.com/Hack23/riksdagsmonitor/pull/313), [#314](https://github.com/Hack23/riksdagsmonitor/pull/314), [#326](https://github.com/Hack23/riksdagsmonitor/pull/326), [#333](https://github.com/Hack23/riksdagsmonitor/pull/333), and [#334](https://github.com/Hack23/riksdagsmonitor/pull/334) that collectively enhanced 176 articles.
+ +--- + +## ๐บ๏ธ Architecture Overview + +``` +riksdag-regering-mcp (32 tools) + โ + โผ +scripts/generate-daily-news.js โ nightly orchestrator + โ (decides which types to generate based on doc count โฅ threshold) + โผ +scripts/generate-news-enhanced.js โ multi-language article engine + โ + โโโ scripts/mcp-client.js โ MCP transport layer + โโโ scripts/data-transformers.js โ semantic transformation + โโโ scripts/article-template.js โ HTML generation + โโโ scripts/editorial-pillars.js โ 5-pillar content strategy + โโโ scripts/news-types/ โ per-type generators + โโโ committee-reports.js + โโโ propositions.js + โโโ motions.js + โโโ week-ahead.js + โโโ breaking-news.js + +Output: news/YYYY-MM-DD-{type}-{lang}.html (ร 14 languages) + news/metadata/daily-report.json + sitemap.xml (updated by update-news-indexes-and-sitemap.py) +``` + +--- + +## ๐ Quick Start + +### Automated Nightly Generation (Recommended) + +The workflow runs automatically at **02:00 CET** via GitHub Actions: + +```bash +# Trigger manually via GitHub CLI +gh workflow run nightly-news-generation.yml + +# With options +gh workflow run nightly-news-generation.yml \ + -f languages=all \ + -f threshold=5 \ + -f types=committee-reports,propositions,motions,week-ahead +``` + +### Manual Generation (Local) + +```bash +# Install dependencies +npm ci + +# Generate today's news (all languages, threshold=5) +node scripts/generate-daily-news.js --languages=all --threshold=5 + +# Generate with custom date window +node scripts/generate-daily-news.js --date=2026-02-18 --languages=nordic + +# Generate specific types only +node scripts/generate-daily-news.js --types=committee-reports,propositions + +# Dry run (no files written) +node scripts/generate-daily-news.js --dry-run --languages=en +``` + +### Legacy Enhanced Generator + +```bash +# Direct invocation (used internally by generate-daily-news.js) +node scripts/generate-news-enhanced.js \ + --types=week-ahead,committee-reports,propositions,motions \ + --languages=all 
\ + --skip-existing +``` + +--- + +## ๐ Step-by-Step Workflow + +### Step 1: Fetch Documents from MCP + +The nightly script queries the **riksdag-regering-mcp** server for documents published since yesterday. + +```javascript +// Internal implementation in scripts/generate-daily-news.js +const result = await client.request('search_dokument', { + doktyp: 'bet', // 'bet' | 'prop' | 'mot' + from_date: '2026-02-18', // yesterday + limit: 100 +}); +``` + +**Document type codes:** + +| Code | Article type | Swedish | Description | +|------|-------------|---------|-------------| +| `bet` | `committee-reports` | Betรคnkanden | Committee reports | +| `prop` | `propositions` | Propositioner | Government bills | +| `mot` | `motions` | Motioner | Parliamentary motions | + +### Step 2: Apply Document Threshold + +Generation only proceeds when **โฅ 5 documents** of a type are found (configurable via `--threshold`). This prevents sparse daily articles that lack analytical value. + +```bash +โ 'committee-reports': 9 documents โฅ threshold (5) โ will generate +โญ๏ธ 'propositions': 3 documents < threshold (5) โ skipping +โ 'motions': 7 documents โฅ threshold (5) โ will generate +``` + +### Step 3: Enrich Documents with MCP Content + +For each document the generator calls `get_dokument_innehall` to fetch the full text, which enables 150โ400 word analysis sections. + +```javascript +// Handled by MCPClient.enrichDocumentsWithContent() +reports = await client.enrichDocumentsWithContent(reports, 3); // max 3 docs enriched +``` + +### Step 4: Generate Articles (14 Languages) + +The enhanced generator produces one HTML file per language: + +``` +news/2026-02-19-committee-reports-en.html +news/2026-02-19-committee-reports-sv.html +news/2026-02-19-committee-reports-da.html +... 
(14 files total) +``` + +Each file contains: +- Semantic HTML5 with WCAG 2.1 AA compliance +- Schema.org `NewsArticle` structured data (JSON-LD) +- Correct `` and `dir="rtl"` for Arabic/Hebrew +- hreflang links to all 14 language versions +- Cyberpunk theme via external `styles.css` + +### Step 5: Update Indexes and Sitemap + +```bash +# Run after generation +python3 scripts/update-news-indexes-and-sitemap.py + +# Or via npm scripts +node scripts/generate-news-indexes.js +node scripts/generate-sitemap.js +``` + +The index updater: +1. Scans all `news/*.html` files +2. Extracts metadata (title, description, date, language) +3. Updates all 14 `index_*.html` files with current article lists +4. Regenerates `sitemap.xml` with ~574 URLs (articles + API docs + coverage) + +### Step 6: Validate HTML + +```bash +# Validate generated files +npx htmlhint news/2026-02-19-*.html + +# Or full validation +npm run htmlhint +``` + +### Step 7: Create Pull Request + +The workflow automatically creates a PR via `peter-evans/create-pull-request`: + +``` +Branch: auto/daily-news-2026-02-19 +Title: ๐ฐ Daily news: 2026-02-19 +Labels: automated-pipeline, news-article, content +``` + +--- + +## ๐ง MCP Tool Reference (All 32 Tools) + +The `MCPClient` in `scripts/mcp-client.js` provides typed wrappers for all 32 tools. + +### Riksdag Tools (15 tools) + +#### 1. `get_ledamoter` โ MP list + +```javascript +const mps = await client.request('get_ledamoter', { limit: 50 }); +``` + +#### 2. `get_ledamot` โ MP details + +```javascript +const mp = await client.request('get_ledamot', { intressent_id: '0980680893021' }); +``` + +#### 3. `search_ledamoter` โ MP search + +```javascript +const results = await client.request('search_ledamoter', { + parti: 'S', // S, M, SD, V, MP, C, L, KD + valkrets: 'Stockholm' +}); +``` + +#### 4. `get_motioner` โ All motions + +```javascript +const motions = await client.request('get_motioner', { + rm: '2025/26', + limit: 20 +}); +``` + +#### 5. 
`search_dokument` โ Document search + +```javascript +const docs = await client.request('search_dokument', { + doktyp: 'bet', // bet | prop | mot | skr | sou + from_date: '2026-02-18', + limit: 50 +}); +``` + +#### 6. `search_dokument_fulltext` โ Full-text search + +```javascript +const results = await client.request('search_dokument_fulltext', { + query: 'klimat energi', + limit: 20 +}); +``` + +#### 7. `get_dokument` โ Specific document + +```javascript +const doc = await client.request('get_dokument', { + dok_id: 'H901FiU1', + include_full_text: false +}); +``` + +#### 8. `get_dokument_innehall` โ Document content + summary + +```javascript +const content = await client.request('get_dokument_innehall', { + dok_id: 'H901FiU1', + include_full_text: false +}); +``` + +#### 9. `get_propositioner` โ Recent propositions + +```javascript +const props = await client.request('get_propositioner', { + rm: '2025/26', + limit: 10 +}); +``` + +#### 10. `get_betankanden` โ Recent committee reports + +```javascript +const reports = await client.request('get_betankanden', { + organ: 'FiU', // Committee code, optional + limit: 10 +}); +``` + +#### 11. `get_fragor` โ Written questions + +```javascript +const questions = await client.request('get_fragor', { + rm: '2025/26', + limit: 10 +}); +``` + +#### 12. `get_interpellationer` โ Interpellations + +```javascript +const interpellations = await client.request('get_interpellationer', { + rm: '2025/26', + limit: 10 +}); +``` + +#### 13. `search_voteringar` โ Vote search + +```javascript +const votes = await client.request('search_voteringar', { + rm: '2025/26', + parti: 'S', + rost: 'Nej', // Ja | Nej | Avstรฅr | Frรฅnvarande + limit: 20 +}); +``` + +#### 14. `search_anforanden` โ Speech search + +```javascript +const speeches = await client.request('search_anforanden', { + talare: 'Ulf Kristersson', + rm: '2025/26', + limit: 10 +}); +``` + +#### 15. 
`get_calendar_events` โ Parliamentary calendar + +```javascript +const events = await client.request('get_calendar_events', { + from: '2026-02-24', + tom: '2026-03-01', + limit: 200 +}); +``` + +### Government Tools (7 tools) + +#### 16. `search_regering` โ Government document search + +```javascript +const govDocs = await client.request('search_regering', { + title: 'klimat', + dateFrom: '2026-01-01', + dateTo: '2026-02-19', + limit: 10 +}); +``` + +#### 17. `get_regering_document` โ Government document + +```javascript +const doc = await client.request('get_regering_document', { + document_id: 'klimat-och-miljo-2026', + type: 'propositioner' +}); +``` + +#### 18. `summarize_regering_document` โ Document summary + +```javascript +const summary = await client.request('summarize_regering_document', { + document_id: 'klimat-och-miljo-2026', + max_length: 500 +}); +``` + +#### 19. `get_g0v_document_content` โ Markdown content + +```javascript +const markdown = await client.request('get_g0v_document_content', { + regeringenUrl: 'https://www.regeringen.se/...' +}); +``` + +#### 20. `get_g0v_document_types` โ Available document types + +```javascript +const types = await client.request('get_g0v_document_types', {}); +``` + +#### 21. `get_g0v_category_codes` โ Category codes + +```javascript +const codes = await client.request('get_g0v_category_codes', {}); +``` + +#### 22. `analyze_g0v_by_department` โ Department analysis + +```javascript +const analysis = await client.request('analyze_g0v_by_department', { + dateFrom: '2026-01-01', + dateTo: '2026-02-19' +}); +``` + +### Statistical & Metadata Tools (5 tools) + +#### 23. `get_utskott` โ Committee list + +```javascript +const committees = await client.request('get_utskott', {}); +``` + +#### 24. `get_sync_status` โ Server health check + +```javascript +const status = await client.request('get_sync_status', {}); +// Response: { last_sync: '2026-02-19T01:00:00Z', status: 'ok' } +``` + +#### 25. 
`get_data_dictionary` โ Field descriptions + +```javascript +const dict = await client.request('get_data_dictionary', { + dataset: 'dokument' // optional +}); +``` + +#### 26. `fetch_paginated_documents` โ Paginated document retrieval + +```javascript +const page = await client.request('fetch_paginated_documents', { + doktyp: 'bet', + rm: '2025/26', + page: 1, + pageSize: 50 +}); +``` + +#### 27. `fetch_paginated_anforanden` โ Paginated speeches + +```javascript +const page = await client.request('fetch_paginated_anforanden', { + parti: 'M', + rm: '2025/26', + page: 1, + pageSize: 100 +}); +``` + +### Aggregation Tools (5 tools) + +#### 28. `enhanced_government_search` โ Combined search + +```javascript +const results = await client.request('enhanced_government_search', { + query: 'bostadspolitik', + includeRegeringen: true, + limit: 20, + regeringenLimit: 5 +}); +``` + +#### 29. `get_voting_group` โ Group vote results + +```javascript +const groups = await client.request('get_voting_group', { + bet: 'FiU10', + punkt: '1', + groupBy: 'parti' // parti | valkrets | namn +}); +``` + +#### 30. `batch_fetch_documents` โ Multi-session fetch + +```javascript +const batch = await client.request('batch_fetch_documents', { + doktyp: 'bet', + riksmoten: ['2024/25', '2025/26'], + maxPerRiksmote: 100 +}); +``` + +#### 31. `list_reports` โ Available reports + +```javascript +const reports = await client.request('list_reports', {}); +``` + +#### 32. `fetch_report` โ Named report + +```javascript +const report = await client.request('fetch_report', { + report: 'ledamotsstatistik', // ledamotsstatistik | kontaktutskott | ... 
+ limit: 200 +}); +``` + +--- + +## ๐ Content Quality Standards + +### Word Count Targets + +| Article type | Target | Minimum | Maximum | +|-------------|--------|---------|---------| +| Week Ahead | 250 | 150 | 400 | +| Committee Reports | 300 | 150 | 400 | +| Government Propositions | 350 | 200 | 400 | +| Opposition Motions | 300 | 150 | 400 | +| Breaking News | 200 | 100 | 300 | + +### The Economist Style Guidelines + +1. **Lede paragraph** โ 2โ3 sentences. State the most newsworthy fact first. +2. **H2 sections** โ Use 3โ5 thematic sections per article. +3. **H3 subsections** โ Use sparingly; maximum 2 per H2. +4. **No bullet lists** in body text โ use prose instead. +5. **Tone** โ Formal, analytical, neutral. Avoid partisan framing. +6. **Numbers** โ Spell out one through ten; use digits for 11 and above. +7. **Dates** โ Use `DD Month YYYY` format (e.g., `19 February 2026`). +8. **Attribution** โ Always attribute: "according to the Finance Committee" not "reportedly". + +### Article Structure Template + +```html + +
…
+ +…background and significance…
+ +…specific documents/events covered…
+ +…analysis of impact…
+ +…next steps, upcoming votes, deadlines…
+``` + +### Schema.org NewsArticle Requirements + +Every article **must** include synchronized metadata in four locations: + +```html + + + + + + + + + + + +``` + +All four description fields **must be identical**. See PR #307 for the fix script (`scripts/fix-pr-review-comments.py`) when they drift. + +--- + +## ๐ Translation Workflow (14 Languages) + +### Language Codes and File Patterns + +| Language | Code | File suffix | Direction | +|----------|------|-------------|-----------| +| English | `en` | `-en.html` | LTR (master) | +| Swedish | `sv` | `-sv.html` | LTR | +| Danish | `da` | `-da.html` | LTR | +| Norwegian | `no` | `-no.html` | LTR | +| Finnish | `fi` | `-fi.html` | LTR | +| German | `de` | `-de.html` | LTR | +| French | `fr` | `-fr.html` | LTR | +| Spanish | `es` | `-es.html` | LTR | +| Dutch | `nl` | `-nl.html` | LTR | +| Arabic | `ar` | `-ar.html` | **RTL** | +| Hebrew | `he` | `-he.html` | **RTL** | +| Japanese | `ja` | `-ja.html` | LTR | +| Korean | `ko` | `-ko.html` | LTR | +| Chinese | `zh` | `-zh.html` | LTR | + +### Automated Translation (Built-in) + +The enhanced generator creates all 14 language files automatically. Run: + +```bash +node scripts/generate-news-enhanced.js --types=committee-reports --languages=all +``` + +### Manual Translation Improvement + +When improving machine-generated translations: + +``` +1. Update title/meta/OG/Twitter metadata +2. Replace full article body with translated text + - Maintain H2/H3 structure + - Match word count targets (150โ400 words) + - Apply The Economist style +3. Update Schema.org (headline, description, wordCount) +4. 
Update navigation: "โ Back to News" with localized text: + - Swedish: "โ Tillbaka till nyheter" + - Danish: "โ Tilbage til nyheder" + - Norwegian:"โ Tilbake til nyheter" + - Finnish: "โ Takaisin uutisiin" + - German: "โ Zurรผck zu den Nachrichten" + - French: "โ Retour aux actualitรฉs" + - Spanish: "โ Volver a las noticias" + - Dutch: "โ Terug naar nieuws" + - Arabic: "โ ุงูุนูุฏุฉ ุฅูู ุงูุฃุฎุจุงุฑ" + - Hebrew: "โ ืืืจื ืืืืฉืืช" + - Japanese: "โ ใใฅใผในใซๆปใ" + - Korean: "โ ๋ด์ค๋ก ๋์๊ฐ๊ธฐ" + - Chinese: "โ ่ฟๅๆฐ้ป" +5. Validate with HTMLHint +6. Commit individually per language +``` + +**โ ๏ธ Critical:** The `generate-content-based-titles.py` script defaults to `--english-only` mode. Use `--overwrite-translations` with interactive `YES` confirmation only when intentionally replacing professional translations. + +### Translation Workflow Order (Efficiency) + +Process languages in this order for maximum efficiency: +1. **English** (master/source) +2. **Swedish** (closest to source material) +3. **Danish** (similar to Swedish, ~10 min) +4. **Norwegian** (similar to Danish, ~10 min) +5. **Finnish** (independent, ~15 min) +6. **German, French, Spanish, Dutch** (~15 min each) +7. **Arabic, Hebrew** (RTL โ require `dir="rtl"` on ``, ~20 min each) +8. **Japanese, Korean, Chinese** (~15 min each) + +### RTL Languages Special Requirements + +For Arabic (`ar`) and Hebrew (`he`) articles: + +```html + + + +``` + +CSS variables from `styles.css` handle the rest automatically โ no inline styles needed. + +--- + +## โ Validation Checklist + +### Pre-Commit (Manual) + +```bash +# 1. HTML validation (zero errors required) +npx htmlhint news/YYYY-MM-DD-*.html + +# 2. Link checking (internal links) +python3 -m http.server 8080 & +linkinator http://localhost:8080/news/ --recurse --skip "http://localhost:8080/docs" + +# 3. Schema.org consistency check +grep -h '"description"' news/YYYY-MM-DD-*.html | sort | uniq -c + +# 4. 
Word count check (aim for 150-400 words) +for f in news/YYYY-MM-DD-*-en.html; do + wc=$(cat "$f" | sed 's/<[^>]*>//g' | wc -w) + echo "$f: $wc words" +done +``` + +### Post-Commit (Automated CI) + +The `quality-checks.yml` workflow validates: +- โ HTMLHint on all `*.html` and `news/*.html` +- โ ESLint on all `*.js` scripts +- โ Translation consistency (`validate-translations.js`) +- โ News translation completeness (`validate-news-translations.js`) + +--- + +## ๐ Index and Sitemap Update + +After generating new articles, always run the index updater: + +```bash +python3 scripts/update-news-indexes-and-sitemap.py +``` + +This script: +1. Scans all `news/*.html` files (currently ~347 articles) +2. Extracts metadata: title, description, date, language +3. Updates all 14 `index_*.html` files with article lists +4. Regenerates `sitemap.xml` with all URLs including: + - News articles (priority 0.4โ0.8, age-based) + - API documentation in `docs/api/` (priority 0.5) + - Test coverage in `docs/coverage/` (priority 0.4) + - Root pages (priority 0.9โ1.0) + +**Sitemap priorities:** + +| URL type | Priority | +|----------|----------| +| `index.html` (English) | 1.0 | +| `index_sv.html` (Swedish) | 0.9 | +| Recent news (< 7 days) | 0.8 | +| Nordic language indexes | 0.7 | +| Other language indexes | 0.6 | +| `docs/api/` pages | 0.5 | +| Old articles + coverage | 0.4 | + +--- + +## ๐ Common Pitfalls + +### 1. MCP Server Cold Start (30โ60 s) + +**Problem:** First request fails with timeout. + +**Solution:** The `generate-daily-news.js` script warm-up step sends `get_sync_status` before any data queries. Set `MCP_CLIENT_TIMEOUT_MS=90000` in CI. + +### 2. Inconsistent Schema.org Descriptions + +**Problem:** `meta description` and `NewsArticle.description` differ. + +**Solution:** Always update all four fields together (meta, og:description, twitter:description, JSON-LD description). Use `scripts/fix-pr-review-comments.py` pattern for bulk fixes. + +### 3. 
English UI on Non-English Pages + +**Problem:** Non-English articles show "โ Back to News" in English. + +**Solution:** Use language-specific navigation strings (see Translation Workflow section above). + +### 4. Professional Translation Overwrite + +**Problem:** Script accidentally overwrites human-translated articles. + +**Solution:** `scripts/generate-content-based-titles.py` requires `--overwrite-translations` flag with interactive `YES` confirmation. Default `--english-only` mode is safe. + +### 5. PR Format-Patch Size Limit + +**Problem:** PRs with 50+ changed files fail with `ENOBUFS` when `sitemap.xml` diff exceeds 1 MB. + +**Solution:** Commit sitemap updates separately from article files, or use the nightly workflow which handles this automatically. + +### 6. Missing Article Threshold + +**Problem:** Articles generated with only 1โ2 documents provide no analytical value. + +**Solution:** Use `--threshold=5` (default in `generate-daily-news.js`). Adjust only for breaking news (`--threshold=1`). + +### 7. Hard-coded Absolute Paths + +**Problem:** Scripts with `/home/runner/work/โฆ` paths fail in local environments. + +**Solution:** Always use `Path('news')` (relative) or `path.join(__dirname, '..', 'news')` patterns. See `scripts/generate-daily-news.js` as reference. + +### 8. Merge Conflicts with Professional Translations + +**Problem:** Auto-generated articles conflict with human translations in PR. + +**Solution:** Always accept the professional translation (`--theirs` for the specific file). Professional translations are canonical; auto-generated content is a starting point only. + +--- + +## ๐ 5 Editorial Pillars Framework + +All generated content aligns with the five pillars defined in `scripts/editorial-pillars.js`: + +| Pillar | Focus | Primary types | +|--------|-------|---------------| +| 1. Parliamentary Pulse | Main legislative developments | committee-reports, propositions | +| 2. 
Government Watch | Executive announcements | propositions | +| 3. Opposition Dynamics | Cross-party positioning | motions | +| 4. Committee Intelligence | Specialist analysis | committee-reports | +| 5. Looking Ahead | Political forecasting | week-ahead | + +--- + +## ๐ Security and Compliance + +### Authentication + +```bash +# Set MCP auth token (optional, but required for production) +export MCP_AUTH_TOKEN="Bearer your-token-here" + +# Or via GitHub Secrets (recommended) +# Repository Settings โ Secrets โ MCP_AUTH_TOKEN +``` + +The `mcp-client.js` reads from `process.env.MCP_AUTH_TOKEN`. Never commit tokens to source code. + +### GDPR Compliance + +All generated content covers: +- **Public officials in official capacity only** โ no personal data processing +- **Right to be forgotten not applicable** โ historical parliamentary records +- **Purpose limitation** โ journalism and democratic transparency only +- **Data minimization** โ process only publicly available parliamentary data + +Legal basis: Article 6(1)(e) GDPR โ processing in the public interest. + +### Data Quality + +The MCP server is the **single authoritative source**. Always: +1. Validate document IDs against official Riksdagen records +2. Cross-reference document titles with `dok_id` field +3. 
Use `get_dokument` for definitive metadata when in doubt + +--- + +## ๐ Related Documentation + +| Document | Purpose | +|----------|---------| +| `NEWS_ARTICLE_STYLING_GUIDE.md` | HTML/CSS styling conventions | +| `TRANSLATION_GUIDE.md` | Translation terminology tables | +| `COMMITTEE_REPORTS_TRANSLATION_WORKFLOW.md` | Committee reports specific workflow | +| `WORKFLOWS.md` | GitHub Actions workflow overview | +| `TESTING.md` | Test suite documentation | +| `scripts/generate-daily-news.js` | Nightly generation orchestrator | +| `scripts/generate-news-enhanced.js` | Multi-language article engine | +| `scripts/mcp-client.js` | MCP transport layer (all 32 tools) | +| `scripts/article-template.js` | HTML template generator | +| `scripts/update-news-indexes-and-sitemap.py` | Index and sitemap updater | +| `.github/workflows/nightly-news-generation.yml` | Automated nightly workflow | + +--- + +## ๐๏ธ Proven Patterns from Issues #306โ334 + +### Pattern: Bulk Enhancement Script + +For systematic enhancement of multiple articles (e.g., 176 articles across Issues #306โ334): + +```python +# scripts/enhance-batch-articles.py pattern +ARTICLES = { + '2026-02-14': { + 'bet': [ + {'id': 'H801AU10', 'title': 'Arbetsmarknadsfrรฅgor', 'dept': 'AU', 'date': '2026-02-14'} + ] + } +} + +for date, types in ARTICLES.items(): + for doctype, docs in types.items(): + enhance_article(date, doctype, docs) +``` + +See `scripts/enhance-2026-02-19-articles.py` for a complete example. + +### Pattern: Content-Based Titles + +Generate titles from actual document content rather than generic templates: + +```python +# scripts/generate-content-based-titles.py --english-only +# Generates: "Finance Committee Approves 2026 Budget Framework" +# Instead of: "Committee Reports: Parliamentary Priorities This Week" +``` + +Run with `--english-only` (safe default) before any PR. Requires `--overwrite-translations` with `YES` confirmation to update translated files. 
+ +### Pattern: Post-Generation Validation + +After every batch generation, run the full validation pipeline: + +```bash +# 1. HTMLHint (zero errors required) +npm run htmlhint + +# 2. Translation consistency +npm run validate-news + +# 3. Index/sitemap update +python3 scripts/update-news-indexes-and-sitemap.py + +# 4. Commit in logical groups (< 1 MB per commit to stay within safe-outputs limits) +git add news/2026-02-19-*.html +git commit -m "news: 2026-02-19 committee reports (14 languages)" + +git add index*.html sitemap.xml +git commit -m "chore: update indexes and sitemap for 2026-02-19" +``` + +--- + +*Last Updated: 2026-02-19 | Issues: #306–339 | PRs: #307, #312, #313, #314, #326, #333, #334* diff --git a/news/metadata/daily-report.json b/news/metadata/daily-report.json new file mode 100644 index 00000000..f8046083 --- /dev/null +++ b/news/metadata/daily-report.json @@ -0,0 +1,18 @@ +{ + "date": "2026-02-19", + "fromDate": "2026-02-18", + "threshold": 5, + "languages": [ + "en" + ], + "dryRun": true, + "documentsFound": {}, + "typesTriggered": [], + "typesSkipped": [], + "articlesCreated": [ + "week-ahead" + ], + "errors": [], + "startTime": "2026-02-19T17:24:52.562Z", + "endTime": "2026-02-19T17:24:52.640Z" +} \ No newline at end of file diff --git a/package.json b/package.json index faab866f..b7f30096 100644 --- a/package.json +++ b/package.json @@ -25,6 +25,7 @@ "validate-translations": "node scripts/validate-translations.js", "validate-news": "node scripts/validate-news-translations.js", "validate-all": "npm run htmlhint && npm run validate-translations && npm run validate-news", + "generate-daily-news": "node scripts/generate-daily-news.js", "generate-news": "node scripts/generate-news-enhanced.js", "generate-news-indexes": "node scripts/generate-news-indexes.js", "generate-sitemap": "node scripts/generate-sitemap.js", diff --git a/scripts/generate-daily-news.js b/scripts/generate-daily-news.js new file mode 100644 index 00000000..e42def4c ---
/dev/null +++ b/scripts/generate-daily-news.js @@ -0,0 +1,452 @@ +#!/usr/bin/env node + +/** + * @module Intelligence Operations/Daily News Generation + * @category Intelligence Operations - Nightly Automated News Generation + * + * @description + * Nightly news generation orchestrator that queries the riksdag-regering-mcp server + * for documents published in the last 24 hours, groups them by type, applies a + * minimum-document threshold, and delegates article generation to the enhanced + * news engine (generate-news-enhanced.js). + * + * Designed to run unattended at 02:00 CET via GitHub Actions but is also safely + * triggerable by hand. All output files are written into news/ and the metadata + * directory, exactly as the enhanced script does, so downstream index / sitemap + * updates work without modification. + * + * Workflow + * ──────── + * 1. Fetch new documents from riksdag-regering-mcp published since yesterday + * 2. Group by document type (bet, prop, mot) + * 3. Generate articles for types that meet the threshold (default ≥5 documents) + * 4. Always generate the Week-Ahead calendar article (unless --no-week-ahead is given) + * 5. Write generation report to news/metadata/daily-report.json + * 6.
Exit 0 on full success, 1 if any article generation failed + * + * CLI flags + * โโโโโโโโโ + * --date=YYYY-MM-DD Override "yesterday" date for document window + * --threshold=N Override minimum document count (default 5) + * --types=t1,t2 Restrict article types (committee-reports,propositions,motions,week-ahead) + * --languages=l1,l2 Language codes or presets (en,sv | nordic | eu-core | all) + * --dry-run Log what would happen without writing files + * --skip-existing Skip languages that already have today's articles + * --no-week-ahead Suppress the always-on Week Ahead article + * --batch-size=N Pass through to enhanced generator for batching + * + * Environment variables + * โโโโโโโโโโโโโโโโโโโโโ + * MCP_AUTH_TOKEN Bearer token for riksdag-regering-mcp (optional) + * MCP_SERVER_URL Override MCP server URL + * MCP_CLIENT_TIMEOUT_MS Request timeout in ms (default 90000 for cold start) + * + * @author Hack23 AB - Intelligence Operations Team + * @license Apache-2.0 + * @version 1.0.0 + * + * @see {@link ./generate-news-enhanced.js} Enhanced multi-language article engine + * @see {@link ./mcp-client.js} MCP transport layer + * @see {@link ../ARTICLE_ENHANCEMENT_GUIDE.md} Workflow documentation + */ + +import fs from 'fs'; +import path from 'path'; +import { fileURLToPath } from 'url'; +import { MCPClient } from './mcp-client.js'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +// โโโ CLI argument parsing โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ + +const args = process.argv.slice(2); + +function getArg(prefix) { + const hit = args.find(a => a.startsWith(prefix + '=')); + return hit ? 
hit.slice(prefix.length + 1) : null; +} + +const dryRun = args.includes('--dry-run'); +const skipExisting = args.includes('--skip-existing'); +const noWeekAhead = args.includes('--no-week-ahead'); + +const dateArg = getArg('--date'); +const thresholdArg = getArg('--threshold'); +const typesArg = getArg('--types'); +const languagesArg = getArg('--languages'); +const batchSizeArg = getArg('--batch-size'); + +const DOCUMENT_THRESHOLD = thresholdArg ? parseInt(thresholdArg, 10) : 5; + +// โโโ Language helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ + +const ALL_LANGUAGES = ['en', 'sv', 'da', 'no', 'fi', 'de', 'fr', 'es', 'nl', 'ar', 'he', 'ja', 'ko', 'zh']; + +const LANGUAGE_PRESETS = { + all: ALL_LANGUAGES, + nordic: ['en', 'sv', 'da', 'no', 'fi'], + 'eu-core':['en', 'sv', 'de', 'fr', 'es', 'nl'] +}; + +let languagesInput = languagesArg ? languagesArg.trim().toLowerCase() : 'all'; +if (LANGUAGE_PRESETS[languagesInput]) { + languagesInput = LANGUAGE_PRESETS[languagesInput].join(','); +} +const LANGUAGES = languagesInput.split(',').map(l => l.trim()).filter(l => ALL_LANGUAGES.includes(l)); + +if (LANGUAGES.length === 0) { + console.error('โ No valid language codes. Valid codes:', ALL_LANGUAGES.join(', ')); + process.exit(1); +} + +// โโโ Date helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ + +/** + * Return yesterday's date as YYYY-MM-DD (or the override from --date flag). 
+ * @returns {string} + */ +function getFromDate() { + if (dateArg) return dateArg; + const d = new Date(); + d.setDate(d.getDate() - 1); + return d.toISOString().split('T')[0]; +} + +const TODAY = new Date().toISOString().split('T')[0]; +const FROM_DATE = getFromDate(); + +// โโโ Paths โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ + +const NEWS_DIR = path.join(__dirname, '..', 'news'); +const METADATA_DIR = path.join(NEWS_DIR, 'metadata'); + +if (!fs.existsSync(METADATA_DIR)) { + fs.mkdirSync(METADATA_DIR, { recursive: true }); +} + +// โโโ Document type โ article type mapping โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ + +/** + * Maps riksdag document type codes to article type identifiers used by the + * enhanced generator. + */ +const DOCTYPE_TO_ARTICLE_TYPE = { + bet: 'committee-reports', + prop: 'propositions', + mot: 'motions' +}; + +const VALID_ARTICLE_TYPES = ['committee-reports', 'propositions', 'motions', 'week-ahead']; + +// โโโ Allowed article types from --types flag โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ + +const requestedTypes = typesArg + ? typesArg.split(',').map(t => t.trim()).filter(t => VALID_ARTICLE_TYPES.includes(t)) + : VALID_ARTICLE_TYPES; + +// โโโ Report accumulator โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ + +const report = { + date: TODAY, + fromDate: FROM_DATE, + threshold: DOCUMENT_THRESHOLD, + languages: LANGUAGES, + dryRun, + documentsFound: {}, + typesTriggered: [], + typesSkipped: [], + articlesCreated: [], + errors: [], + startTime: new Date().toISOString() +}; + +// โโโ MCP helpers โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ + +let _client = null; + +/** + * Get (or lazily create) the shared MCPClient, with cold-start warm-up. + * @returns {Promise