From 28b625400bafdf00c4d78ea5d1b3f7fc792d52e9 Mon Sep 17 00:00:00 2001 From: sandroqdb Date: Fri, 3 Jul 2026 16:07:12 +0200 Subject: [PATCH 1/3] feat: generate llms-full.txt with complete docs content MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The website's llms.txt has long advertised a full documentation corpus at /docs/llms-full.txt, but no such file was ever generated — the URL 404s. This adds scripts/generate-llms-full.js, which walks the sidebar (same order as llms.txt) and concatenates every doc's full markdown content into static/llms-full.txt, served at /docs/llms-full.txt. MDX processing (partials, component conversion, import stripping, heading bumping) reuses plugins/raw-markdown/convert-components, so the output matches the per-page .md endpoints exactly. Each doc entry carries a Source: line pointing at its canonical markdown URL. Output on current content: 354 docs, 2.69 MB. Wired into prebuild and gitignored like the other generated files. Companion to questdb/questdb.io#2923, which repairs the llms.txt link; once this deploys, the Full Documentation Content link can be restored there. 
Co-Authored-By: Claude Fable 5 --- .gitignore | 1 + package.json | 2 +- scripts/generate-llms-full.js | 215 ++++++++++++++++++++++++++++++++++ 3 files changed, 217 insertions(+), 1 deletion(-) create mode 100644 scripts/generate-llms-full.js diff --git a/.gitignore b/.gitignore index b651ebe92b..5a321307d8 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ plugins/*/compiled .netlify .cache-loader static/llms.txt +static/llms-full.txt static/reference-full.md static/web-console/*.json diff --git a/package.json b/package.json index 5975e39a97..cf25b452de 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,7 @@ "license": "Apache-2.0", "scripts": { "start": "cross-env docusaurus start --port 3001", - "prebuild": "docusaurus clear && node ./scripts/generate-llms-files.js && node ./scripts/generate-reference-full.js && node ./scripts/generate-web-console-json.js", + "prebuild": "docusaurus clear && node ./scripts/generate-llms-files.js && node ./scripts/generate-llms-full.js && node ./scripts/generate-reference-full.js && node ./scripts/generate-web-console-json.js", "build": "cross-env NO_UPDATE_NOTIFIER=true USE_SIMPLE_CSS_MINIFIER=true PWA_SW_CUSTOM= docusaurus build", "deploy": "docusaurus deploy", "serve": "docusaurus serve", diff --git a/scripts/generate-llms-full.js b/scripts/generate-llms-full.js new file mode 100644 index 0000000000..5fa9fe8e38 --- /dev/null +++ b/scripts/generate-llms-full.js @@ -0,0 +1,215 @@ +const fs = require('fs') +const path = require('path') +const yaml = require('js-yaml') +const { + convertAllComponents, + bumpHeadings, + normalizeNewLines, + removeImports, + processPartialImports, +} = require('../plugins/raw-markdown/convert-components') + +const sidebarConfig = require('../documentation/sidebars.js') + +const ROOT_DIR = path.resolve(__dirname, '..') +const DOCS_DIR = path.join(ROOT_DIR, 'documentation') +const OUTPUT_DIR = path.join(ROOT_DIR, 'static') +const BASE_URL = 'https://questdb.com/docs/' + +function 
readDocFile(docId) { + const mdPath = path.join(DOCS_DIR, docId + '.md') + if (fs.existsSync(mdPath)) { + return { raw: fs.readFileSync(mdPath, 'utf8'), filePath: mdPath } + } + const mdxPath = path.join(DOCS_DIR, docId + '.mdx') + if (fs.existsSync(mdxPath)) { + return { raw: fs.readFileSync(mdxPath, 'utf8'), filePath: mdxPath } + } + console.warn(`[generate-llms-full] Warning: File not found: ${mdPath} or ${mdxPath}`) + return null +} + +// Partial cache shared across all files +const partialCache = new Map() + +function loadPartial(partialPath, currentFileDir) { + // Unescape markdown escaped characters (like \_ -> _) + const unescapedPath = partialPath.replace(/\\_/g, '_') + const absolutePath = path.resolve(path.join(DOCS_DIR, currentFileDir), unescapedPath) + + if (partialCache.has(absolutePath)) { + return partialCache.get(absolutePath) + } + + if (fs.existsSync(absolutePath)) { + const partialRaw = fs.readFileSync(absolutePath, 'utf8') + const frontmatterRegex = /^---\s*\n[\s\S]*?\n---\s*\n([\s\S]*)$/ + const match = partialRaw.match(frontmatterRegex) + const content = match ? match[1] : partialRaw + partialCache.set(absolutePath, content) + return content + } + + console.warn(`[generate-llms-full] Warning: Partial not found: ${absolutePath}`) + return `` +} + +function normalizeUrl(url) { + const clean = url.endsWith('/') ? 
url.slice(0, -1) : url + return clean + '.md' +} + +function generateUrl(docId, slug) { + if (slug) { + let urlPath = slug + + // Absolute slug (starts with /) + if (urlPath.startsWith('/')) { + urlPath = urlPath.substring(1) + if (urlPath === '') { + return BASE_URL + 'index.md' + } + return normalizeUrl(BASE_URL + urlPath) + } + + // Relative slug - resolve it relative to the document's directory + const docDir = path.dirname(docId) + if (docDir && docDir !== '.') { + urlPath = path.join(docDir, urlPath) + } + + return normalizeUrl(BASE_URL + urlPath) + } + + if (docId === 'introduction') { + return BASE_URL + 'index.md' + } + // Strip /index suffix to match raw-markdown plugin output (e.g. cookbook/index -> cookbook.md) + const urlDocId = docId.endsWith('/index') ? docId.slice(0, -'/index'.length) : docId + return normalizeUrl(BASE_URL + urlDocId) +} + +async function renderDoc(docId) { + const doc = readDocFile(docId) + if (!doc) return '' + + const frontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n([\s\S]*)$/ + const match = doc.raw.match(frontmatterRegex) + + let frontmatter = {} + let mainContent = doc.raw + + if (match) { + try { + frontmatter = yaml.load(match[1]) || {} + } catch (_) {} + mainContent = match[2] + } + + // Process partial component imports + const relativeDir = path.relative(DOCS_DIR, path.dirname(doc.filePath)) + let processedContent = processPartialImports(mainContent, loadPartial, relativeDir) + + // Convert MDX components to markdown + processedContent = await convertAllComponents(processedContent, path.dirname(doc.filePath), DOCS_DIR) + + processedContent = removeImports(processedContent) + processedContent = normalizeNewLines(processedContent) + + // Body headings H2 -> H3 etc. 
so the manually emitted H2 title stays the top of each doc + processedContent = bumpHeadings(processedContent, 1) + + const title = frontmatter.title || docId + const url = generateUrl(docId, frontmatter.slug || null) + + let out = `## ${title}\n\n` + out += `Source: ${url}\n\n` + if (frontmatter.description) { + out += `${frontmatter.description}\n\n` + } + out += processedContent.trim() + '\n\n' + return out +} + +// Walk the sidebar in order, collecting doc ids grouped by top-level category. +// Loose top-level docs fall under "Getting Started", matching llms.txt. +function collectSections(items) { + const sections = [] + let current = { label: 'Getting Started', docIds: [] } + + function collectDocIds(subItems, into) { + for (const item of subItems) { + if (typeof item === 'string') { + into.push(item) + } else if (item.type === 'doc') { + into.push(item.id) + } else if (item.type === 'category' && item.items) { + collectDocIds(item.items, into) + } + // item.type === 'link' is external; skip + } + } + + for (const item of items) { + if (typeof item === 'string') { + current.docIds.push(item) + } else if (item.type === 'doc') { + current.docIds.push(item.id) + } else if (item.type === 'category') { + if (current.docIds.length > 0) { + sections.push(current) + } + current = { label: item.label, docIds: [] } + if (item.items) { + collectDocIds(item.items, current.docIds) + } + } + } + + if (current.docIds.length > 0) { + sections.push(current) + } + + return sections +} + +async function generateLlmsFull() { + console.log('Generating llms-full.txt from QuestDB documentation...') + + const sections = collectSections(sidebarConfig.docs) + + let output = `# QuestDB Documentation — Full Content + +Complete text of the QuestDB documentation as a single document, in the same +order as the index at ${BASE_URL}llms.txt. Each entry links its canonical +markdown source. 
+ +` + + let docCount = 0 + for (const section of sections) { + output += `# ${section.label}\n\n` + for (const docId of section.docIds) { + output += await renderDoc(docId) + docCount++ + } + } + + if (!fs.existsSync(OUTPUT_DIR)) { + fs.mkdirSync(OUTPUT_DIR, { recursive: true }) + } + + const targetPath = path.join(OUTPUT_DIR, 'llms-full.txt') + fs.writeFileSync(targetPath, output) + + const sizeMB = (Buffer.byteLength(output, 'utf8') / 1024 / 1024).toFixed(2) + console.log('✅ llms-full.txt generated successfully!') + console.log(` - Path: ${targetPath}`) + console.log(` - Docs: ${docCount}`) + console.log(` - Size: ${sizeMB} MB`) +} + +generateLlmsFull().catch(error => { + console.error('Error generating llms-full.txt:', error) + process.exitCode = 1 +}) From 7aa820fad8a4949e56eb4943d984a6b1def836c6 Mon Sep 17 00:00:00 2001 From: sandroqdb Date: Fri, 3 Jul 2026 16:26:34 +0200 Subject: [PATCH 2/3] address review: category-link docs, repo examples, dedup, shared URL logic Fixes from high-effort review of the first revision: - Docs attached to a category only via link: {type: 'doc'} were silently dropped; both generators now include them (llms.txt gains the one doc the sidebar only references that way, cookbook/sql/finance/index). - Pass the remote-repo-example plugin's data to convertAllComponents so <RemoteRepoExample> renders real code instead of its 'Example not found' fallback. - Doc ids listed in multiple sidebar positions are rendered once in llms-full.txt (4 duplicate entries skipped, logged); doc count now reflects rendered docs only. - Extract canonical-URL construction into scripts/lib/docs-urls.js, mirroring plugins/raw-markdown/index.js exactly (fixes latent multi-segment relative-slug divergence) and shared by both the llms.txt and llms-full.txt generators. Verified: URL set identical to production llms.txt except the one added doc. - Parse frontmatter with gray-matter (existing dep, same as the raw-markdown plugin) instead of a hand-rolled regex. 
Co-Authored-By: Claude Fable 5 --- scripts/generate-llms-files.js | 58 ++++++++----------- scripts/generate-llms-full.js | 101 ++++++++++++++------------------- scripts/lib/docs-urls.js | 36 ++++++++++++ 3 files changed, 102 insertions(+), 93 deletions(-) create mode 100644 scripts/lib/docs-urls.js diff --git a/scripts/generate-llms-files.js b/scripts/generate-llms-files.js index a88380f0d3..e3c94d8448 100644 --- a/scripts/generate-llms-files.js +++ b/scripts/generate-llms-files.js @@ -3,7 +3,7 @@ const path = require('path') const yaml = require('js-yaml') const sidebarConfig = require('../documentation/sidebars.js') -const BASE_URL = 'https://questdb.com/docs/' +const { generateUrl: buildDocUrl } = require('./lib/docs-urls') const processedFiles = new Map() @@ -53,43 +53,21 @@ function extractFrontmatter(filePath) { } } -function normalizeUrl(url) { - const clean = url.endsWith("/") ? url.slice(0, -1) : url - return clean + ".md" -} - function generateUrl(docId, docPath) { // Extract frontmatter to check for custom slug const { slug } = extractFrontmatter(docPath) + return buildDocUrl(docId, slug) +} - if (slug) { - let urlPath = slug - - // Absolute slug (starts with /) - if (urlPath.startsWith('/')) { - urlPath = urlPath.substring(1) - if (urlPath === '') { - return BASE_URL + "index.md" - } - return normalizeUrl(BASE_URL + urlPath) - } - - // Relative slug - resolve it relative to the document's directory - const docDir = path.dirname(docId) - if (docDir && docDir !== '.') { - urlPath = path.join(docDir, urlPath) - } - - return normalizeUrl(BASE_URL + urlPath) - } - - // Default behavior: use docId - if (docId === 'introduction') { - return BASE_URL + "index.md" - } - // Strip /index suffix to match raw-markdown plugin output (e.g. cookbook/index -> cookbook.md) - let urlDocId = docId.endsWith('/index') ? 
docId.slice(0, -'/index'.length) : docId - return normalizeUrl(BASE_URL + urlDocId) +function subtreeContainsDoc(items, docId) { + if (!items) return false + return items.some(item => + (typeof item === 'string' && item === docId) || + (item.type === 'doc' && item.id === docId) || + (item.type === 'category' && + ((item.link && item.link.type === 'doc' && item.link.id === docId) || + subtreeContainsDoc(item.items, docId))) + ) } function processForLlmsTxt(items, indent = 0, isTopLevel = false) { @@ -123,13 +101,25 @@ function processForLlmsTxt(items, indent = 0, isTopLevel = false) { result += '\n' } else if (item.type === 'category') { + // A category's own link page (link: {type: 'doc'}) is a real doc too, + // unless the same doc is already listed among the category's items + const linkDoc = item.link && item.link.type === 'doc' && item.link.id && + !subtreeContainsDoc(item.items, item.link.id) + ? [{ type: 'doc', id: item.link.id }] + : [] if (isTopLevel) { result += `\n## ${item.label}\n` + if (linkDoc.length > 0) { + result += processForLlmsTxt(linkDoc, 0, false) + } if (item.items && item.items.length > 0) { result += processForLlmsTxt(item.items, 0, false) } } else { result += `${indentStr}${item.label}\n` + if (linkDoc.length > 0) { + result += processForLlmsTxt(linkDoc, indent + 1, false) + } if (item.items && item.items.length > 0) { result += processForLlmsTxt(item.items, indent + 1, false) } diff --git a/scripts/generate-llms-full.js b/scripts/generate-llms-full.js index 5fa9fe8e38..5e51c13fe3 100644 --- a/scripts/generate-llms-full.js +++ b/scripts/generate-llms-full.js @@ -1,6 +1,6 @@ const fs = require('fs') const path = require('path') -const yaml = require('js-yaml') +const matter = require('gray-matter') const { convertAllComponents, bumpHeadings, @@ -8,13 +8,14 @@ const { removeImports, processPartialImports, } = require('../plugins/raw-markdown/convert-components') +const remoteRepoExamplePlugin = require('../plugins/remote-repo-example/index') 
const sidebarConfig = require('../documentation/sidebars.js') +const { BASE_URL, generateUrl } = require('./lib/docs-urls') const ROOT_DIR = path.resolve(__dirname, '..') const DOCS_DIR = path.join(ROOT_DIR, 'documentation') const OUTPUT_DIR = path.join(ROOT_DIR, 'static') -const BASE_URL = 'https://questdb.com/docs/' function readDocFile(docId) { const mdPath = path.join(DOCS_DIR, docId + '.md') @@ -43,9 +44,7 @@ function loadPartial(partialPath, currentFileDir) { if (fs.existsSync(absolutePath)) { const partialRaw = fs.readFileSync(absolutePath, 'utf8') - const frontmatterRegex = /^---\s*\n[\s\S]*?\n---\s*\n([\s\S]*)$/ - const match = partialRaw.match(frontmatterRegex) - const content = match ? match[1] : partialRaw + const { content } = matter(partialRaw) partialCache.set(absolutePath, content) return content } @@ -54,64 +53,23 @@ function loadPartial(partialPath, currentFileDir) { return `` } -function normalizeUrl(url) { - const clean = url.endsWith('/') ? url.slice(0, -1) : url - return clean + '.md' -} - -function generateUrl(docId, slug) { - if (slug) { - let urlPath = slug - - // Absolute slug (starts with /) - if (urlPath.startsWith('/')) { - urlPath = urlPath.substring(1) - if (urlPath === '') { - return BASE_URL + 'index.md' - } - return normalizeUrl(BASE_URL + urlPath) - } - - // Relative slug - resolve it relative to the document's directory - const docDir = path.dirname(docId) - if (docDir && docDir !== '.') { - urlPath = path.join(docDir, urlPath) - } - - return normalizeUrl(BASE_URL + urlPath) - } - - if (docId === 'introduction') { - return BASE_URL + 'index.md' - } - // Strip /index suffix to match raw-markdown plugin output (e.g. cookbook/index -> cookbook.md) - const urlDocId = docId.endsWith('/index') ? 
docId.slice(0, -'/index'.length) : docId - return normalizeUrl(BASE_URL + urlDocId) -} - -async function renderDoc(docId) { +async function renderDoc(docId, repoExamples) { const doc = readDocFile(docId) if (!doc) return '' - const frontmatterRegex = /^---\s*\n([\s\S]*?)\n---\s*\n([\s\S]*)$/ - const match = doc.raw.match(frontmatterRegex) - - let frontmatter = {} - let mainContent = doc.raw - - if (match) { - try { - frontmatter = yaml.load(match[1]) || {} - } catch (_) {} - mainContent = match[2] - } + const { data: frontmatter, content: mainContent } = matter(doc.raw) // Process partial component imports const relativeDir = path.relative(DOCS_DIR, path.dirname(doc.filePath)) let processedContent = processPartialImports(mainContent, loadPartial, relativeDir) // Convert MDX components to markdown - processedContent = await convertAllComponents(processedContent, path.dirname(doc.filePath), DOCS_DIR) + processedContent = await convertAllComponents( + processedContent, + path.dirname(doc.filePath), + DOCS_DIR, + repoExamples, + ) processedContent = removeImports(processedContent) processedContent = normalizeNewLines(processedContent) @@ -133,6 +91,7 @@ async function renderDoc(docId) { // Walk the sidebar in order, collecting doc ids grouped by top-level category. // Loose top-level docs fall under "Getting Started", matching llms.txt. +// A category's own `link: {type: 'doc'}` page is included before its items. 
function collectSections(items) { const sections = [] let current = { label: 'Getting Started', docIds: [] } @@ -143,8 +102,13 @@ function collectSections(items) { into.push(item) } else if (item.type === 'doc') { into.push(item.id) - } else if (item.type === 'category' && item.items) { - collectDocIds(item.items, into) + } else if (item.type === 'category') { + if (item.link && item.link.type === 'doc' && item.link.id) { + into.push(item.link.id) + } + if (item.items) { + collectDocIds(item.items, into) + } } // item.type === 'link' is external; skip } @@ -160,6 +124,9 @@ function collectSections(items) { sections.push(current) } current = { label: item.label, docIds: [] } + if (item.link && item.link.type === 'doc' && item.link.id) { + current.docIds.push(item.link.id) + } if (item.items) { collectDocIds(item.items, current.docIds) } @@ -176,6 +143,10 @@ function collectSections(items) { async function generateLlmsFull() { console.log('Generating llms-full.txt from QuestDB documentation...') + // Same remote example data the raw-markdown plugin receives at build time, + // so renders real code instead of its fallback + const repoExamples = await remoteRepoExamplePlugin().loadContent() + const sections = collectSections(sidebarConfig.docs) let output = `# QuestDB Documentation — Full Content @@ -186,12 +157,24 @@ markdown source. ` + // Docs can appear in several sidebar positions; render each only once + const renderedDocIds = new Set() let docCount = 0 + let duplicateCount = 0 + for (const section of sections) { output += `# ${section.label}\n\n` for (const docId of section.docIds) { - output += await renderDoc(docId) - docCount++ + if (renderedDocIds.has(docId)) { + duplicateCount++ + continue + } + renderedDocIds.add(docId) + const rendered = await renderDoc(docId, repoExamples) + if (rendered) { + output += rendered + docCount++ + } } } @@ -205,7 +188,7 @@ markdown source. 
const sizeMB = (Buffer.byteLength(output, 'utf8') / 1024 / 1024).toFixed(2) console.log('✅ llms-full.txt generated successfully!') console.log(` - Path: ${targetPath}`) - console.log(` - Docs: ${docCount}`) + console.log(` - Docs: ${docCount} (${duplicateCount} duplicate sidebar entries skipped)`) console.log(` - Size: ${sizeMB} MB`) } diff --git a/scripts/lib/docs-urls.js b/scripts/lib/docs-urls.js new file mode 100644 index 0000000000..27c26158c1 --- /dev/null +++ b/scripts/lib/docs-urls.js @@ -0,0 +1,36 @@ +const path = require('path') + +const BASE_URL = 'https://questdb.com/docs/' + +// Canonical raw-markdown URL for a doc, shared by the llms.txt and +// llms-full.txt generators. Mirrors plugins/raw-markdown/index.js exactly — +// that plugin decides where the .md files are actually written, so any +// divergence here produces dead Source links. +function generateUrl(docId, slug) { + let urlPath + + if (slug) { + urlPath = slug + if (urlPath.startsWith('/')) { + urlPath = urlPath.substring(1) + } + // Only prepend the doc's directory if the slug doesn't already include + // path segments (same rule as the raw-markdown plugin) + const fileDir = path.dirname(docId) + if (!urlPath.includes('/') && fileDir !== '.') { + urlPath = path.join(fileDir, urlPath) + } + } else { + urlPath = docId + if (urlPath.endsWith('/index')) { + urlPath = urlPath.replace(/\/index$/, '') + } + } + + if (urlPath === '' || urlPath === '.') { + return BASE_URL + 'index.md' + } + return BASE_URL + urlPath + '.md' +} + +module.exports = { BASE_URL, generateUrl } From 5d99349ca37d4ba282893e190ee64090e03dec2d Mon Sep 17 00:00:00 2001 From: sandroqdb Date: Fri, 3 Jul 2026 17:07:49 +0200 Subject: [PATCH 3/3] address round-2 review: heading integrity, section labels, build resilience MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Bump body headings by 2 (H1->H3) instead of 1: introduction.md and changelog.mdx carry body H1s that landed at H2 — the 
per-doc delimiter level — creating phantom doc boundaries. Verified fence-aware: 352 real H2 doc headers == 352 Source lines. - Fix section labeling: loose top-level docs before the first category form an 'Overview' section (no more duplicate 'Getting Started' headers), and loose docs after a category (changelog) get their own title-labeled section instead of folding into the preceding category. - Never fail the docs build on a GitHub flake: remote example data is only used for llms-full.txt, so loadContent gets one retry and then degrades to placeholder examples for that build instead of aborting the whole deploy. - Gate category link docs with subtreeContainsDoc (moved to shared scripts/lib/sidebar-utils.js, used by both generators) so llms-full orders them identically to llms.txt; buffer section bodies so a section whose docs all rendered elsewhere emits no bare header. - docs-urls: restore the introduction -> index.md fallback as a safety net against slug-extraction failure; document that a trailing slash in a slug is deliberately not stripped (the raw-markdown plugin writes '<slug>.md' verbatim, so stripping would link a path it never writes). llms.txt output verified byte-identical before/after the shared-walker refactor. 
Co-Authored-By: Claude Fable 5 --- scripts/generate-llms-files.js | 12 +---- scripts/generate-llms-full.js | 97 ++++++++++++++++++++++++---------- scripts/lib/docs-urls.js | 9 ++++ scripts/lib/sidebar-utils.js | 16 ++++++ 4 files changed, 96 insertions(+), 38 deletions(-) create mode 100644 scripts/lib/sidebar-utils.js diff --git a/scripts/generate-llms-files.js b/scripts/generate-llms-files.js index e3c94d8448..6cad75d131 100644 --- a/scripts/generate-llms-files.js +++ b/scripts/generate-llms-files.js @@ -4,6 +4,7 @@ const yaml = require('js-yaml') const sidebarConfig = require('../documentation/sidebars.js') const { generateUrl: buildDocUrl } = require('./lib/docs-urls') +const { subtreeContainsDoc } = require('./lib/sidebar-utils') const processedFiles = new Map() @@ -59,17 +60,6 @@ function generateUrl(docId, docPath) { return buildDocUrl(docId, slug) } -function subtreeContainsDoc(items, docId) { - if (!items) return false - return items.some(item => - (typeof item === 'string' && item === docId) || - (item.type === 'doc' && item.id === docId) || - (item.type === 'category' && - ((item.link && item.link.type === 'doc' && item.link.id === docId) || - subtreeContainsDoc(item.items, docId))) - ) -} - function processForLlmsTxt(items, indent = 0, isTopLevel = false) { let result = '' const indentStr = ' '.repeat(indent) diff --git a/scripts/generate-llms-full.js b/scripts/generate-llms-full.js index 5e51c13fe3..ceb02a041f 100644 --- a/scripts/generate-llms-full.js +++ b/scripts/generate-llms-full.js @@ -12,6 +12,7 @@ const remoteRepoExamplePlugin = require('../plugins/remote-repo-example/index') const sidebarConfig = require('../documentation/sidebars.js') const { BASE_URL, generateUrl } = require('./lib/docs-urls') +const { subtreeContainsDoc } = require('./lib/sidebar-utils') const ROOT_DIR = path.resolve(__dirname, '..') const DOCS_DIR = path.join(ROOT_DIR, 'documentation') @@ -74,8 +75,10 @@ async function renderDoc(docId, repoExamples) { processedContent = 
removeImports(processedContent) processedContent = normalizeNewLines(processedContent) - // Body headings H2 -> H3 etc. so the manually emitted H2 title stays the top of each doc - processedContent = bumpHeadings(processedContent, 1) + // Bump body headings by 2 (H1 -> H3, H2 -> H4, …) so nothing in a doc body + // can collide with the H1 section headers or the H2 per-doc title below — + // some docs (introduction, changelog) legitimately contain body H1s + processedContent = bumpHeadings(processedContent, 2) const title = frontmatter.title || docId const url = generateUrl(docId, frontmatter.slug || null) @@ -89,12 +92,32 @@ async function renderDoc(docId, repoExamples) { return out } -// Walk the sidebar in order, collecting doc ids grouped by top-level category. -// Loose top-level docs fall under "Getting Started", matching llms.txt. -// A category's own `link: {type: 'doc'}` page is included before its items. +function docTitle(docId) { + const doc = readDocFile(docId) + if (!doc) return docId + const { data } = matter(doc.raw) + return data.title || docId +} + +// Walk the sidebar in order, collecting doc ids grouped into sections. +// Top-level categories become sections labeled by the category. Loose +// top-level docs before the first category form an "Overview" section; +// loose docs appearing after a category (e.g. changelog) each get their own +// section labeled by the doc's title, so no doc is misattributed to a +// neighboring category. A category's own `link: {type: 'doc'}` page is +// included before its items unless the items already list it — the same +// rule (and therefore the same order) as the llms.txt generator. 
function collectSections(items) { const sections = [] - let current = { label: 'Getting Started', docIds: [] } + const leading = { label: 'Overview', docIds: [] } + let seenCategory = false + + function categoryLinkDocIds(item) { + return item.link && item.link.type === 'doc' && item.link.id && + !subtreeContainsDoc(item.items, item.link.id) + ? [item.link.id] + : [] + } function collectDocIds(subItems, into) { for (const item of subItems) { @@ -103,9 +126,7 @@ function collectSections(items) { } else if (item.type === 'doc') { into.push(item.id) } else if (item.type === 'category') { - if (item.link && item.link.type === 'doc' && item.link.id) { - into.push(item.link.id) - } + into.push(...categoryLinkDocIds(item)) if (item.items) { collectDocIds(item.items, into) } @@ -115,37 +136,55 @@ function collectSections(items) { } for (const item of items) { - if (typeof item === 'string') { - current.docIds.push(item) - } else if (item.type === 'doc') { - current.docIds.push(item.id) - } else if (item.type === 'category') { - if (current.docIds.length > 0) { - sections.push(current) + if (typeof item === 'string' || item.type === 'doc') { + const docId = typeof item === 'string' ? 
item : item.id + if (seenCategory) { + sections.push({ label: docTitle(docId), docIds: [docId] }) + } else { + leading.docIds.push(docId) } - current = { label: item.label, docIds: [] } - if (item.link && item.link.type === 'doc' && item.link.id) { - current.docIds.push(item.link.id) + } else if (item.type === 'category') { + if (!seenCategory && leading.docIds.length > 0) { + sections.push(leading) } + seenCategory = true + const section = { label: item.label, docIds: [] } + section.docIds.push(...categoryLinkDocIds(item)) if (item.items) { - collectDocIds(item.items, current.docIds) + collectDocIds(item.items, section.docIds) } + sections.push(section) } } - if (current.docIds.length > 0) { - sections.push(current) + if (!seenCategory && leading.docIds.length > 0) { + sections.push(leading) } return sections } +// Same remote example data the raw-markdown plugin receives at build time, +// so renders real code instead of its fallback. +// Never fails the build: this data is only used for llms-full.txt, so on +// persistent fetch errors we degrade to placeholder examples for one build +// rather than blocking the whole docs deploy on a GitHub flake. 
+async function loadRepoExamples() { + for (let attempt = 1; attempt <= 2; attempt++) { + try { + return await remoteRepoExamplePlugin().loadContent() + } catch (error) { + console.warn(`[generate-llms-full] Warning: could not load remote repo examples (attempt ${attempt}/2): ${error.message}`) + } + } + console.warn('[generate-llms-full] Proceeding without remote examples; blocks will render placeholders until the next successful build.') + return {} +} + async function generateLlmsFull() { console.log('Generating llms-full.txt from QuestDB documentation...') - // Same remote example data the raw-markdown plugin receives at build time, - // so renders real code instead of its fallback - const repoExamples = await remoteRepoExamplePlugin().loadContent() + const repoExamples = await loadRepoExamples() const sections = collectSections(sidebarConfig.docs) @@ -163,7 +202,7 @@ markdown source. let duplicateCount = 0 for (const section of sections) { - output += `# ${section.label}\n\n` + let body = '' for (const docId of section.docIds) { if (renderedDocIds.has(docId)) { duplicateCount++ @@ -172,10 +211,14 @@ markdown source. renderedDocIds.add(docId) const rendered = await renderDoc(docId, repoExamples) if (rendered) { - output += rendered + body += rendered docCount++ } } + // Skip the header if every doc in this section was a duplicate or missing + if (body) { + output += `# ${section.label}\n\n` + body + } } if (!fs.existsSync(OUTPUT_DIR)) { diff --git a/scripts/lib/docs-urls.js b/scripts/lib/docs-urls.js index 27c26158c1..a4146adacd 100644 --- a/scripts/lib/docs-urls.js +++ b/scripts/lib/docs-urls.js @@ -21,11 +21,20 @@ function generateUrl(docId, slug) { urlPath = path.join(fileDir, urlPath) } } else { + // Safety net: introduction carries `slug: /`; if slug extraction ever + // fails (parse error, unreadable file) fall back to the URL the plugin + // publishes for it rather than emitting a dead introduction.md link. 
+ if (docId === 'introduction') { + return BASE_URL + 'index.md' + } urlPath = docId if (urlPath.endsWith('/index')) { urlPath = urlPath.replace(/\/index$/, '') } } + // Note: a trailing '/' in a slug is deliberately NOT stripped — the + // raw-markdown plugin writes `.md` verbatim, so stripping here + // would link a path the plugin never publishes. if (urlPath === '' || urlPath === '.') { return BASE_URL + 'index.md' diff --git a/scripts/lib/sidebar-utils.js b/scripts/lib/sidebar-utils.js new file mode 100644 index 0000000000..47bd06c38f --- /dev/null +++ b/scripts/lib/sidebar-utils.js @@ -0,0 +1,16 @@ +// Shared sidebar helpers for the llms.txt / llms-full.txt generators. + +// True if docId appears anywhere in the given sidebar items subtree +// (as a string entry, a doc entry, or a category's own link doc). +function subtreeContainsDoc(items, docId) { + if (!items) return false + return items.some(item => + (typeof item === 'string' && item === docId) || + (item.type === 'doc' && item.id === docId) || + (item.type === 'category' && + ((item.link && item.link.type === 'doc' && item.link.id === docId) || + subtreeContainsDoc(item.items, docId))) + ) +} + +module.exports = { subtreeContainsDoc }