diff --git a/.gitignore b/.gitignore index b651ebe92b..5a321307d8 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ plugins/*/compiled .netlify .cache-loader static/llms.txt +static/llms-full.txt static/reference-full.md static/web-console/*.json diff --git a/package.json b/package.json index 5975e39a97..cf25b452de 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,7 @@ "license": "Apache-2.0", "scripts": { "start": "cross-env docusaurus start --port 3001", - "prebuild": "docusaurus clear && node ./scripts/generate-llms-files.js && node ./scripts/generate-reference-full.js && node ./scripts/generate-web-console-json.js", + "prebuild": "docusaurus clear && node ./scripts/generate-llms-files.js && node ./scripts/generate-llms-full.js && node ./scripts/generate-reference-full.js && node ./scripts/generate-web-console-json.js", "build": "cross-env NO_UPDATE_NOTIFIER=true USE_SIMPLE_CSS_MINIFIER=true PWA_SW_CUSTOM= docusaurus build", "deploy": "docusaurus deploy", "serve": "docusaurus serve", diff --git a/scripts/generate-llms-files.js b/scripts/generate-llms-files.js index a88380f0d3..6cad75d131 100644 --- a/scripts/generate-llms-files.js +++ b/scripts/generate-llms-files.js @@ -3,7 +3,8 @@ const path = require('path') const yaml = require('js-yaml') const sidebarConfig = require('../documentation/sidebars.js') -const BASE_URL = 'https://questdb.com/docs/' +const { generateUrl: buildDocUrl } = require('./lib/docs-urls') +const { subtreeContainsDoc } = require('./lib/sidebar-utils') const processedFiles = new Map() @@ -53,43 +54,10 @@ function extractFrontmatter(filePath) { } } -function normalizeUrl(url) { - const clean = url.endsWith("/") ? 
/**
 * Build the published raw-markdown URL for a sidebar doc.
 *
 * Reads the doc's frontmatter to pick up a custom `slug` and hands the actual
 * URL construction to the shared helper imported from ./lib/docs-urls.
 *
 * @param {string} docId - Sidebar doc id.
 * @param {string} docPath - Path of the doc's source file, passed to extractFrontmatter.
 * @returns {string} URL returned by the shared helper.
 */
function generateUrl(docId, docPath) {
  const frontmatter = extractFrontmatter(docPath)
  return buildDocUrl(docId, frontmatter.slug)
}
/**
 * Read a documentation source file by sidebar doc id.
 *
 * Looks for `<DOCS_DIR>/<docId>.md` first and falls back to `.mdx`.
 *
 * @param {string} docId - Sidebar doc id (path under the docs directory, no extension).
 * @returns {{raw: string, filePath: string} | null} File contents and the path
 *   that was read, or null (after logging a warning) when neither file exists.
 */
function readDocFile(docId) {
  const [mdPath, mdxPath] = ['.md', '.mdx'].map(ext => path.join(DOCS_DIR, docId + ext))
  for (const filePath of [mdPath, mdxPath]) {
    if (fs.existsSync(filePath)) {
      return { raw: fs.readFileSync(filePath, 'utf8'), filePath }
    }
  }
  console.warn(`[generate-llms-full] Warning: File not found: ${mdPath} or ${mdxPath}`)
  return null
}

// Partial cache shared across all files, keyed by absolute path. Misses are
// cached as '' too, so a missing partial is looked up and reported only once
// instead of on every reference.
const partialCache = new Map()

/**
 * Load the body of a partial (frontmatter stripped), used as the loader
 * callback for processPartialImports.
 *
 * @param {string} partialPath - Import path of the partial as written in the doc;
 *   markdown-escaped underscores (`\_`) are unescaped before resolving.
 * @param {string} currentFileDir - Directory of the importing doc, relative to DOCS_DIR.
 * @returns {string} Partial content, or '' (after one warning) when the file does not exist.
 */
function loadPartial(partialPath, currentFileDir) {
  // Unescape markdown escaped characters (like \_ -> _)
  const unescapedPath = partialPath.replace(/\\_/g, '_')
  const absolutePath = path.resolve(path.join(DOCS_DIR, currentFileDir), unescapedPath)

  const cached = partialCache.get(absolutePath)
  if (cached !== undefined) {
    return cached
  }

  let content = ''
  if (fs.existsSync(absolutePath)) {
    content = matter(fs.readFileSync(absolutePath, 'utf8')).content
  } else {
    console.warn(`[generate-llms-full] Warning: Partial not found: ${absolutePath}`)
  }
  partialCache.set(absolutePath, content)
  return content
}

/**
 * Render one doc as a markdown entry for llms-full.txt.
 *
 * The entry is an H2 title, a `Source:` line with the doc's raw-markdown URL,
 * the frontmatter description (when present) and the converted body.
 *
 * @param {string} docId - Sidebar doc id.
 * @param {*} repoExamples - Remote example data forwarded to convertAllComponents.
 * @returns {Promise<string>} Rendered entry, or '' when the doc file is missing.
 */
async function renderDoc(docId, repoExamples) {
  const doc = readDocFile(docId)
  if (!doc) return ''

  const { data: frontmatter, content: mainContent } = matter(doc.raw)
  const docDir = path.dirname(doc.filePath)

  // Inline partial component imports, resolved relative to the doc's directory
  let body = processPartialImports(mainContent, loadPartial, path.relative(DOCS_DIR, docDir))

  // Convert MDX components to markdown
  body = await convertAllComponents(body, docDir, DOCS_DIR, repoExamples)
  body = removeImports(body)
  body = normalizeNewLines(body)

  // Bump body headings by 2 (H1 -> H3, H2 -> H4, ...) so nothing in a doc body
  // can collide with the H1 section headers or the H2 per-doc title below;
  // some docs (introduction, changelog) legitimately contain body H1s
  body = bumpHeadings(body, 2)

  const title = frontmatter.title || docId
  const url = generateUrl(docId, frontmatter.slug || null)

  const parts = [`## ${title}\n\n`, `Source: ${url}\n\n`]
  if (frontmatter.description) {
    parts.push(`${frontmatter.description}\n\n`)
  }
  parts.push(body.trim() + '\n\n')
  return parts.join('')
}

/**
 * Title of a doc taken from its frontmatter.
 *
 * @param {string} docId - Sidebar doc id.
 * @returns {string} Frontmatter `title`, or the doc id when the file is missing
 *   or has no title.
 */
function docTitle(docId) {
  const doc = readDocFile(docId)
  if (!doc) return docId
  return matter(doc.raw).data.title || docId
}
/**
 * Walk the sidebar in order, collecting doc ids grouped into sections.
 *
 * Top-level categories become sections labeled by the category. Loose
 * top-level docs before the first category form an "Overview" section;
 * loose docs appearing after a category (e.g. changelog) each get their own
 * section labeled by the doc's title, so no doc is misattributed to a
 * neighboring category. A category's own `link: {type: 'doc'}` page is
 * included before its items unless the items already list it.
 *
 * @param {Array<string|object>} items - Top-level sidebar items.
 * @returns {Array<{label: string, docIds: string[]}>} Sections in sidebar order.
 */
function collectSections(items) {
  const sections = []
  const leading = { label: 'Overview', docIds: [] }
  let seenCategory = false

  // A category's link doc, unless the category's own subtree already lists it
  function categoryLinkDocIds(item) {
    return item.link && item.link.type === 'doc' && item.link.id &&
      !subtreeContainsDoc(item.items, item.link.id)
      ? [item.link.id]
      : []
  }

  // Depth-first collection of every doc id under subItems, in sidebar order
  function collectDocIds(subItems, into) {
    for (const item of subItems) {
      if (typeof item === 'string') {
        into.push(item)
      } else if (item.type === 'doc') {
        into.push(item.id)
      } else if (item.type === 'category') {
        into.push(...categoryLinkDocIds(item))
        if (item.items) {
          collectDocIds(item.items, into)
        }
      }
      // item.type === 'link' is external; skip
    }
  }

  // Emit the "Overview" section once, and only if it has any docs
  function flushLeading() {
    if (!seenCategory && leading.docIds.length > 0) {
      sections.push(leading)
    }
  }

  for (const item of items) {
    if (typeof item === 'string' || item.type === 'doc') {
      const docId = typeof item === 'string' ? item : item.id
      if (seenCategory) {
        sections.push({ label: docTitle(docId), docIds: [docId] })
      } else {
        leading.docIds.push(docId)
      }
    } else if (item.type === 'category') {
      flushLeading()
      seenCategory = true
      const section = { label: item.label, docIds: [] }
      // Same rule as nested categories: link doc first, then the items
      collectDocIds([item], section.docIds)
      sections.push(section)
    }
  }

  flushLeading()
  return sections
}

/**
 * Load the remote example data through the remote-repo-example plugin's
 * loadContent(), so remote example components render real code instead of
 * their fallback.
 *
 * Never fails the build: this data is only used for llms-full.txt, so after
 * two failed attempts we degrade to placeholder examples for one build rather
 * than blocking the whole docs deploy on a GitHub flake.
 *
 * @returns {Promise<object>} Loaded example data, or {} when both attempts fail.
 */
async function loadRepoExamples() {
  for (let attempt = 1; attempt <= 2; attempt++) {
    try {
      return await remoteRepoExamplePlugin().loadContent()
    } catch (error) {
      // A thrown non-Error value has no .message; fall back to its string form
      const reason = error instanceof Error ? error.message : String(error)
      console.warn(`[generate-llms-full] Warning: could not load remote repo examples (attempt ${attempt}/2): ${reason}`)
    }
  }
  console.warn('[generate-llms-full] Proceeding without remote examples; blocks will render placeholders until the next successful build.')
  return {}
}

/**
 * Generate static/llms-full.txt: every sidebar doc rendered once, grouped
 * under an H1 per section, in sidebar order.
 *
 * @returns {Promise<void>} Resolves once the file has been written.
 */
async function generateLlmsFull() {
  console.log('Generating llms-full.txt from QuestDB documentation...')

  const repoExamples = await loadRepoExamples()

  const sections = collectSections(sidebarConfig.docs)

  let output = `# QuestDB Documentation — Full Content

Complete text of the QuestDB documentation as a single document, in the same
order as the index at ${BASE_URL}llms.txt. Each entry links its canonical
markdown source.

`

  // Docs can appear in several sidebar positions; render each only once
  const renderedDocIds = new Set()
  let docCount = 0
  let duplicateCount = 0

  for (const section of sections) {
    let body = ''
    for (const docId of section.docIds) {
      if (renderedDocIds.has(docId)) {
        duplicateCount++
        continue
      }
      renderedDocIds.add(docId)
      const rendered = await renderDoc(docId, repoExamples)
      if (rendered) {
        body += rendered
        docCount++
      }
    }
    // Skip the header if every doc in this section was a duplicate or missing
    if (body) {
      output += `# ${section.label}\n\n` + body
    }
  }

  // recursive: true makes this a no-op when the directory already exists
  fs.mkdirSync(OUTPUT_DIR, { recursive: true })

  const targetPath = path.join(OUTPUT_DIR, 'llms-full.txt')
  fs.writeFileSync(targetPath, output)

  const sizeMB = (Buffer.byteLength(output, 'utf8') / 1024 / 1024).toFixed(2)
  console.log('✅ llms-full.txt generated successfully!')
  console.log(`   - Path: ${targetPath}`)
  console.log(`   - Docs: ${docCount} (${duplicateCount} duplicate sidebar entries skipped)`)
  console.log(`   - Size: ${sizeMB} MB`)
}

// Run only when executed directly (node ./scripts/generate-llms-full.js), so
// the helpers above can be loaded without regenerating the file.
if (typeof require !== 'undefined' && require.main === module) {
  generateLlmsFull().catch(error => {
    console.error('Error generating llms-full.txt:', error)
    process.exitCode = 1
  })
}
/**
 * Canonical raw-markdown URL for a doc, shared by the llms.txt and
 * llms-full.txt generators.
 *
 * URL paths always use '/', so all path handling goes through path.posix;
 * the platform-specific helpers would emit backslashes on Windows.
 *
 * @param {string} docId - Sidebar doc id, e.g. 'guides/foo' or 'cookbook/index'.
 * @param {string|null|undefined} slug - Frontmatter slug, if the doc declares one.
 * @returns {string} Absolute URL under BASE_URL ending in '.md'.
 */
function generateUrl(docId, slug) {
  let urlPath

  if (slug) {
    urlPath = slug.startsWith('/') ? slug.substring(1) : slug
    // Only prepend the doc's directory if the slug doesn't already include
    // path segments (same rule as the raw-markdown plugin)
    const fileDir = path.posix.dirname(docId)
    if (!urlPath.includes('/') && fileDir !== '.') {
      urlPath = path.posix.join(fileDir, urlPath)
    }
  } else {
    // Safety net: introduction carries `slug: /`; if slug extraction ever
    // fails (parse error, unreadable file) fall back to the URL the plugin
    // publishes for it rather than emitting a dead introduction.md link.
    if (docId === 'introduction') {
      return BASE_URL + 'index.md'
    }
    urlPath = docId.endsWith('/index') ? docId.slice(0, -'/index'.length) : docId
  }
  // Note: a trailing '/' in a slug is deliberately NOT stripped; the
  // raw-markdown plugin writes `<slug>.md` verbatim, so stripping here
  // would link a path the plugin never publishes.

  if (urlPath === '' || urlPath === '.') {
    return BASE_URL + 'index.md'
  }
  return BASE_URL + urlPath + '.md'
}

/**
 * True if docId appears anywhere in the given sidebar items subtree
 * (as a string entry, a doc entry, or a category's own link doc).
 *
 * @param {Array<string|object>|undefined} items - Sidebar items; anything that is
 *   not an array is treated as an empty subtree.
 * @param {string} docId - Doc id to look for.
 * @returns {boolean} Whether the subtree references docId.
 */
function subtreeContainsDoc(items, docId) {
  if (!Array.isArray(items)) return false
  return items.some(item => {
    if (typeof item === 'string') return item === docId
    if (!item) return false // tolerate null/undefined holes in the list
    if (item.type === 'doc') return item.id === docId
    if (item.type === 'category') {
      const ownLink = item.link
      if (ownLink && ownLink.type === 'doc' && ownLink.id === docId) return true
      return subtreeContainsDoc(item.items, docId)
    }
    return false
  })
}