From 2e14e68e609b05ce7ebe48e209a5bb8061e51b69 Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Mon, 23 Mar 2026 16:43:36 +0100 Subject: [PATCH] fix(docs): improve agentdocsspec compliance - Move agent signaling blockquote from Banner component to post-build injection right after <body>, placing it at ~12% of the document instead of >50% (past nav/sidebar). Addresses the "buried deep" warning from the agentdocsspec checker. - Split the monolithic CLI reference (52K+ chars) into 63 per-command markdown endpoints under reference/cli/. Top-level commands are listed in llms.txt; subcommand files include linked navigation to siblings. Includes build-time validation for slug collisions, broken links, and format changes. - Copy sitemap-index.xml to sitemap.xml so the agentdocsspec freshness check can discover it at the conventional path. Move agentDocs() integration after starlight() to ensure sitemap exists when the hook runs. --- docs-site/astro.config.mjs | 5 +- docs-site/plugins/astro-agent-docs.mjs | 177 ++++++++++++++++++++++++- docs-site/src/components/Banner.astro | 4 - 3 files changed, 178 insertions(+), 8 deletions(-) diff --git a/docs-site/astro.config.mjs b/docs-site/astro.config.mjs index f1513d7c..3f0c47d9 100644 --- a/docs-site/astro.config.mjs +++ b/docs-site/astro.config.mjs @@ -19,8 +19,6 @@ export default defineConfig({ ], }, integrations: [ - // Generate .md endpoints and llms.txt for agent-friendly docs - agentDocs(), starlight({ title: 'ICP CLI', description: 'Command-line tool for developing and deploying applications on the Internet Computer Protocol (ICP)', @@ -124,5 +122,8 @@ export default defineConfig({ }, ], }), + // Generate .md endpoints, llms.txt, and agent signaling for agent-friendly docs. + // Listed after starlight() so the astro:build:done hook runs after sitemap generation. 
+ agentDocs(), ], }); diff --git a/docs-site/plugins/astro-agent-docs.mjs b/docs-site/plugins/astro-agent-docs.mjs index 5d29c8df..0297a3ed 100644 --- a/docs-site/plugins/astro-agent-docs.mjs +++ b/docs-site/plugins/astro-agent-docs.mjs @@ -4,6 +4,8 @@ * * 1. Markdown endpoints — serves a clean .md file alongside every HTML page * 2. llms.txt — discovery index listing all pages with links to .md endpoints + * 3. Agent signaling — injects a hidden llms.txt directive right after + * in every HTML page so agents discover it early (before nav/sidebar) * * Runs in the astro:build:done hook so it operates on the final build output. */ @@ -62,8 +64,119 @@ function findSection(filePath) { return best; } +// Path to the CLI reference page — split into per-command endpoints for agents. +const CLI_REFERENCE = "reference/cli.md"; + +/** + * Split the CLI reference into per-command markdown files. + * Each `## \`icp ...\`` heading becomes its own file under reference/cli/. + * Returns metadata for each generated sub-page (for llms.txt). + */ +function splitCliReference(outDir) { + const cliMd = path.join(outDir, CLI_REFERENCE); + if (!fs.existsSync(cliMd)) return []; + + const content = fs + .readFileSync(cliMd, "utf-8") + // Strip the clap-markdown generation footer that appears at the end. + .replace(/\n*\s*\n*[\s\S]*?<\/small>\s*$/, "\n"); + // Split on ## `icp ...` headings, keeping the heading with the section. + const sections = content.split(/^(?=## `icp\b)/m).filter((s) => s.trim()); + + const subDir = path.join(outDir, "reference", "cli"); + fs.mkdirSync(subDir, { recursive: true }); + + const subPages = []; + const seenSlugs = new Map(); // slug → command name, for collision detection + for (const section of sections) { + const match = section.match(/^## `(icp[\w\s-]*?)`/); + if (!match) continue; + + const command = match[1].trim(); + // icp build → build, icp canister call → canister-call + const slug = command === "icp" ? 
"index" : command.replace(/^icp /, "").replace(/ /g, "-"); + const fileName = `${slug}.md`; + + // Detect slug collisions (e.g., "icp foo-bar" vs "icp foo bar"). + if (seenSlugs.has(slug)) { + throw new Error( + `CLI reference split: slug collision for "${fileName}" ` + + `between commands "${seenSlugs.get(slug)}" and "${command}"` + ); + } + seenSlugs.set(slug, command); + + // Extract the description: first plain-text line after the heading, + // skipping **Usage:**, ###### headings, list items, and empty lines. + const lines = section.split("\n"); + const descLine = lines.find( + (l, i) => + i > 0 && + l.trim() && + !l.startsWith("**Usage") && + !l.startsWith("#") && + !l.startsWith("*") + ); + const description = descLine ? descLine.trim() : ""; + + // Rewrite subcommand list items to link to their per-command endpoints. + // e.g., `* \`call\` — ...` → `* [\`call\`](canister-call.md) — ...` + // The parent prefix (e.g., "canister") is used to build the slug. + const parentSlug = command.replace(/^icp ?/, "").replace(/ /g, "-"); + const body = section.replace(/^## [^\n]+\n+/, "").replace( + /^\* `(\w[\w-]*)` —/gm, + (_, sub) => { + const subSlug = parentSlug ? `${parentSlug}-${sub}` : sub; + return `* [\`${sub}\`](${subSlug}.md) —`; + } + ); + + fs.writeFileSync( + path.join(subDir, fileName), + BOM + `# ${command}\n\n` + body + "\n" + ); + + subPages.push({ + file: `reference/cli/${fileName}`, + title: `\`${command}\``, + description, + // Top-level commands have exactly one space (e.g., "icp build"). + // The bare "icp" root and deep subcommands are excluded from llms.txt. + isTopLevel: (command.match(/ /g) || []).length === 1, + }); + } + + // Validate: the CLI reference should contain commands. If the format changed + // and nothing was extracted, fail loudly rather than silently producing no output. + if (subPages.length === 0) { + throw new Error( + "CLI reference split: no commands found. 
" + + "Expected ## `icp ...` headings in " + CLI_REFERENCE + ); + } + + // Validate: all subcommand links in generated files point to existing files. + for (const { file } of subPages) { + const filePath = path.join(outDir, file); + const md = fs.readFileSync(filePath, "utf-8"); + const linkPattern = /\]\((\S+\.md)\)/g; + let linkMatch; + while ((linkMatch = linkPattern.exec(md)) !== null) { + const target = path.join(path.dirname(filePath), linkMatch[1]); + if (!fs.existsSync(target)) { + throw new Error( + `CLI reference split: broken link in ${file}: ` + + `${linkMatch[1]} does not exist` + ); + } + } + } + + return subPages; +} + /** Generate llms.txt content from collected page metadata. */ -function generateLlmsTxt(pages, siteUrl, basePath) { +function generateLlmsTxt(pages, siteUrl, basePath, cliSubPages) { const base = (siteUrl + basePath).replace(/\/$/, ""); const skillsBase = @@ -149,6 +262,20 @@ function generateLlmsTxt(pages, siteUrl, basePath) { ? `- [${page.title}](${url}): ${page.description}` : `- [${page.title}](${url})`; lines.push(entry); + + // Nest top-level command endpoints under the CLI Reference entry. + // Subcommands (e.g., "icp canister call") are omitted from the index + // but still available as .md endpoints for agents to fetch on demand. + if (page.file === CLI_REFERENCE && cliSubPages.length > 0) { + for (const sub of cliSubPages) { + if (!sub.isTopLevel) continue; + const subUrl = `${base}/${sub.file}`; + const subEntry = sub.description + ? ` - [${sub.title}](${subUrl}): ${sub.description}` + : ` - [${sub.title}](${subUrl})`; + lines.push(subEntry); + } + } } lines.push(""); } @@ -200,12 +327,58 @@ export default function agentDocs() { logger.info(`Generated ${pages.length} markdown endpoints`); + // 1b. 
Split CLI reference into per-command endpoints for agents + const cliSubPages = splitCliReference(outDir); + if (cliSubPages.length > 0) { + logger.info( + `Split CLI reference into ${cliSubPages.length} per-command endpoints` + ); + } + // 2. Generate llms.txt - const llmsTxt = generateLlmsTxt(pages, siteUrl, basePath); + const llmsTxt = generateLlmsTxt(pages, siteUrl, basePath, cliSubPages); fs.writeFileSync(path.join(outDir, "llms.txt"), llmsTxt); logger.info( `Generated llms.txt (${llmsTxt.length} chars, ${pages.length} pages)` ); + + // 3. Inject agent signaling directive into HTML pages + // Places a visually-hidden blockquote right after <body> so it appears + // early in the document (within the first ~15%), before nav/sidebar. + // Uses CSS clip-rect (not display:none) so it survives HTML-to-markdown + // conversion. See: https://agentdocsspec.com + const llmsTxtUrl = `${basePath}llms.txt`; + const directive = + `
` + + `

For AI agents: Documentation index at ` + + `${llmsTxtUrl}

`; + const htmlFiles = fs.globSync("**/*.html", { cwd: outDir }); + let injected = 0; + for (const file of htmlFiles) { + const filePath = path.join(outDir, file); + const html = fs.readFileSync(filePath, "utf-8"); + const bodyIdx = html.indexOf("<body"); + if (bodyIdx === -1) continue; + const closeIdx = html.indexOf(">", bodyIdx); + if (closeIdx === -1) continue; + const insertAt = closeIdx + 1; + fs.writeFileSync( + filePath, + html.slice(0, insertAt) + directive + html.slice(insertAt) + ); + injected++; + } + logger.info(`Injected agent signaling into ${injected} HTML pages`); + + // 4. Alias sitemap-index.xml → sitemap.xml + // Astro's sitemap integration outputs sitemap-index.xml, but crawlers + // and the agentdocsspec checker expect /sitemap.xml by convention. + const sitemapIndex = path.join(outDir, "sitemap-index.xml"); + const sitemapAlias = path.join(outDir, "sitemap.xml"); + if (fs.existsSync(sitemapIndex) && !fs.existsSync(sitemapAlias)) { + fs.copyFileSync(sitemapIndex, sitemapAlias); + logger.info("Copied sitemap-index.xml → sitemap.xml"); + } }, }, }; diff --git a/docs-site/src/components/Banner.astro b/docs-site/src/components/Banner.astro index ede74a6d..f006e396 100644 --- a/docs-site/src/components/Banner.astro +++ b/docs-site/src/components/Banner.astro @@ -3,12 +3,8 @@ // Overrides Starlight's default Banner component so we don't need // banner frontmatter in each file. const content = 'Feedback welcome! Report issues on GitHub, ask questions on the Forum, or chat with us on Discord.'; -const llmsTxtPath = `${import.meta.env.BASE_URL}llms.txt`; --- -
-

For AI agents: Documentation index at {llmsTxtPath}

-