Skip to content

Commit f73e4e8

Browse files
committed
fix(docs): improve agentdocsspec compliance
Cherry-pick of 02b7c5a from main. - Move agent signaling to post-build <body> injection (~12% position) - Split CLI reference into per-command .md endpoints - Copy sitemap-index.xml → sitemap.xml for freshness check - Build-time validation for slug collisions and broken links
1 parent 7227aea commit f73e4e8

2 files changed

Lines changed: 178 additions & 4 deletions

File tree

docs-site/astro.config.mjs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ export default defineConfig({
1717
],
1818
},
1919
integrations: [
20-
// Generate .md endpoints and llms.txt for agent-friendly docs
21-
agentDocs(),
2220
starlight({
2321
title: 'ICP CLI',
2422
description: 'Command-line tool for developing and deploying applications on the Internet Computer Protocol (ICP)',
@@ -116,5 +114,8 @@ export default defineConfig({
116114
},
117115
],
118116
}),
117+
// Generate .md endpoints, llms.txt, and agent signaling for agent-friendly docs.
118+
// Listed after starlight() so the astro:build:done hook runs after sitemap generation.
119+
agentDocs(),
119120
],
120121
});

docs-site/plugins/astro-agent-docs.mjs

Lines changed: 175 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
*
55
* 1. Markdown endpoints — serves a clean .md file alongside every HTML page
66
* 2. llms.txt — discovery index listing all pages with links to .md endpoints
7+
* 3. Agent signaling — injects a hidden llms.txt directive right after <body>
8+
* in every HTML page so agents discover it early (before nav/sidebar)
79
*
810
* Runs in the astro:build:done hook so it operates on the final build output.
911
*/
@@ -62,8 +64,119 @@ function findSection(filePath) {
6264
return best;
6365
}
6466

67+
// Path to the CLI reference page — split into per-command endpoints for agents.
const CLI_REFERENCE = "reference/cli.md";

/**
 * Split the CLI reference into per-command markdown files.
 *
 * Reads `<outDir>/reference/cli.md`, cuts it at every `## \`icp ...\`` heading
 * and writes each section to its own file under `<outDir>/reference/cli/`
 * (`icp` → `index.md`, `icp build` → `build.md`,
 * `icp canister call` → `canister-call.md`). Subcommand list items inside a
 * section are rewritten into links to the sibling per-command files.
 *
 * @param {string} outDir - Directory containing the built markdown endpoints.
 * @returns {Array<{file: string, title: string, description: string, isTopLevel: boolean}>}
 *   Metadata for each generated sub-page (for llms.txt), in document order.
 *   Empty when the CLI reference page does not exist.
 * @throws {Error} If two commands map to the same file name, if the page
 *   exists but contains no `## \`icp ...\`` heading, or if a generated file
 *   links to a local `.md` file that does not exist.
 */
function splitCliReference(outDir) {
  const cliMd = path.join(outDir, CLI_REFERENCE);
  if (!fs.existsSync(cliMd)) return [];

  const content = fs
    .readFileSync(cliMd, "utf-8")
    // Drop a leading byte-order mark so a heading on the very first line is
    // still recognized by the `^`-anchored patterns below.
    .replace(/^\uFEFF/, "")
    // Strip the clap-markdown generation footer that appears at the end.
    .replace(/\n*<hr\/>\s*\n*<small>[\s\S]*?<\/small>\s*$/, "\n");
  // Split on ## `icp ...` headings, keeping the heading with the section.
  const sections = content.split(/^(?=## `icp\b)/m).filter((s) => s.trim());

  const subDir = path.join(outDir, "reference", "cli");
  fs.mkdirSync(subDir, { recursive: true });

  const subPages = [];
  const seenSlugs = new Map(); // slug → command name, for collision detection
  for (const section of sections) {
    // Anything before the first command heading (page title, intro) has no
    // matching heading and is skipped.
    const match = section.match(/^## `(icp[\w\s-]*?)`/);
    if (!match) continue;

    const command = match[1].trim();
    // icp build → build, icp canister call → canister-call
    const slug =
      command === "icp"
        ? "index"
        : command.replace(/^icp /, "").replace(/ /g, "-");
    const fileName = `${slug}.md`;

    // Detect slug collisions (e.g., "icp foo-bar" vs "icp foo bar").
    if (seenSlugs.has(slug)) {
      throw new Error(
        `CLI reference split: slug collision for "${fileName}" ` +
          `between commands "${seenSlugs.get(slug)}" and "${command}"`
      );
    }
    seenSlugs.set(slug, command);

    // Extract the description: first plain-text line after the heading,
    // skipping **Usage:**, ###### headings, list items, and empty lines.
    const descLine = section
      .split("\n")
      .find(
        (l, i) =>
          i > 0 &&
          l.trim() &&
          !l.startsWith("**Usage") &&
          !l.startsWith("#") &&
          !l.startsWith("*")
      );
    const description = descLine ? descLine.trim() : "";

    // Rewrite subcommand list items to link to their per-command endpoints.
    // e.g., `* \`call\` — ...` → `* [\`call\`](canister-call.md) — ...`
    // The parent prefix (e.g., "canister") is used to build the slug.
    // Only the `* \`name\`` part is replaced; the separator and description
    // that follow are left untouched so the dash is not duplicated.
    const parentSlug = command.replace(/^icp ?/, "").replace(/ /g, "-");
    const body = section
      .replace(/^## [^\n]+\n*/, "")
      .replace(/^\* `(\w[\w-]*)`(?= )/gm, (_, sub) => {
        const subSlug = parentSlug ? `${parentSlug}-${sub}` : sub;
        return `* [\`${sub}\`](${subSlug}.md)`;
      });

    fs.writeFileSync(
      path.join(subDir, fileName),
      BOM + `# ${command}\n\n` + body + "\n"
    );

    subPages.push({
      file: `reference/cli/${fileName}`,
      title: `\`${command}\``,
      description,
      // Top-level commands have exactly one space (e.g., "icp build").
      // The bare "icp" root and deep subcommands are excluded from llms.txt.
      isTopLevel: (command.match(/ /g) || []).length === 1,
    });
  }

  // Validate: the CLI reference should contain commands. If the format changed
  // and nothing was extracted, fail loudly rather than silently producing no output.
  if (subPages.length === 0) {
    throw new Error(
      "CLI reference split: no commands found. " +
        "Expected ## `icp ...` headings in " + CLI_REFERENCE
    );
  }

  // Validate: all local .md links in generated files point to existing files.
  // `[^\s)]+` keeps a match inside a single link target even when two links
  // sit next to each other on one line.
  const linkPattern = /\]\(([^\s)]+\.md)\)/g;
  // Targets with a URL scheme or protocol-relative prefix are external and
  // cannot be checked against the build output.
  const externalTarget = /^(?:[a-z][a-z\d+.-]*:|\/\/)/i;
  for (const { file } of subPages) {
    const filePath = path.join(outDir, file);
    const md = fs.readFileSync(filePath, "utf-8");
    for (const [, href] of md.matchAll(linkPattern)) {
      if (externalTarget.test(href)) continue;
      const target = path.join(path.dirname(filePath), href);
      if (!fs.existsSync(target)) {
        throw new Error(
          `CLI reference split: broken link in ${file}: ` +
            `${href} does not exist`
        );
      }
    }
  }

  return subPages;
}
177+
65178
/** Generate llms.txt content from collected page metadata. */
66-
function generateLlmsTxt(pages, siteUrl, basePath) {
179+
function generateLlmsTxt(pages, siteUrl, basePath, cliSubPages) {
67180
const base = (siteUrl + basePath).replace(/\/$/, "");
68181

69182
const skillsBase =
@@ -149,6 +262,20 @@ function generateLlmsTxt(pages, siteUrl, basePath) {
149262
? `- [${page.title}](${url}): ${page.description}`
150263
: `- [${page.title}](${url})`;
151264
lines.push(entry);
265+
266+
// Nest top-level command endpoints under the CLI Reference entry.
267+
// Subcommands (e.g., "icp canister call") are omitted from the index
268+
// but still available as .md endpoints for agents to fetch on demand.
269+
if (page.file === CLI_REFERENCE && cliSubPages.length > 0) {
270+
for (const sub of cliSubPages) {
271+
if (!sub.isTopLevel) continue;
272+
const subUrl = `${base}/${sub.file}`;
273+
const subEntry = sub.description
274+
? ` - [${sub.title}](${subUrl}): ${sub.description}`
275+
: ` - [${sub.title}](${subUrl})`;
276+
lines.push(subEntry);
277+
}
278+
}
152279
}
153280
lines.push("");
154281
}
@@ -200,12 +327,58 @@ export default function agentDocs() {
200327

201328
logger.info(`Generated ${pages.length} markdown endpoints`);
202329

330+
// 1b. Split CLI reference into per-command endpoints for agents
331+
const cliSubPages = splitCliReference(outDir);
332+
if (cliSubPages.length > 0) {
333+
logger.info(
334+
`Split CLI reference into ${cliSubPages.length} per-command endpoints`
335+
);
336+
}
337+
203338
// 2. Generate llms.txt
204-
const llmsTxt = generateLlmsTxt(pages, siteUrl, basePath);
339+
const llmsTxt = generateLlmsTxt(pages, siteUrl, basePath, cliSubPages);
205340
fs.writeFileSync(path.join(outDir, "llms.txt"), llmsTxt);
206341
logger.info(
207342
`Generated llms.txt (${llmsTxt.length} chars, ${pages.length} pages)`
208343
);
344+
345+
// 3. Inject agent signaling directive into HTML pages
346+
// Places a visually-hidden blockquote right after <body> so it appears
347+
// early in the document (within the first ~15%), before nav/sidebar.
348+
// Uses CSS clip-rect (not display:none) so it survives HTML-to-markdown
349+
// conversion. See: https://agentdocsspec.com
350+
const llmsTxtUrl = `${basePath}llms.txt`;
351+
const directive =
352+
`<blockquote class="agent-signaling" data-pagefind-ignore>` +
353+
`<p>For AI agents: Documentation index at ` +
354+
`<a href="${llmsTxtUrl}">${llmsTxtUrl}</a></p></blockquote>`;
355+
const htmlFiles = fs.globSync("**/*.html", { cwd: outDir });
356+
let injected = 0;
357+
for (const file of htmlFiles) {
358+
const filePath = path.join(outDir, file);
359+
const html = fs.readFileSync(filePath, "utf-8");
360+
const bodyIdx = html.indexOf("<body");
361+
if (bodyIdx === -1) continue;
362+
const closeIdx = html.indexOf(">", bodyIdx);
363+
if (closeIdx === -1) continue;
364+
const insertAt = closeIdx + 1;
365+
fs.writeFileSync(
366+
filePath,
367+
html.slice(0, insertAt) + directive + html.slice(insertAt)
368+
);
369+
injected++;
370+
}
371+
logger.info(`Injected agent signaling into ${injected} HTML pages`);
372+
373+
// 4. Alias sitemap-index.xml → sitemap.xml
374+
// Astro's sitemap integration outputs sitemap-index.xml, but crawlers
375+
// and the agentdocsspec checker expect /sitemap.xml by convention.
376+
const sitemapIndex = path.join(outDir, "sitemap-index.xml");
377+
const sitemapAlias = path.join(outDir, "sitemap.xml");
378+
if (fs.existsSync(sitemapIndex) && !fs.existsSync(sitemapAlias)) {
379+
fs.copyFileSync(sitemapIndex, sitemapAlias);
380+
logger.info("Copied sitemap-index.xml → sitemap.xml");
381+
}
209382
},
210383
},
211384
};

0 commit comments

Comments
 (0)