diff --git a/packages/vinext/src/entries/pages-server-entry.ts b/packages/vinext/src/entries/pages-server-entry.ts index 2477f2641..00eeb8a4f 100644 --- a/packages/vinext/src/entries/pages-server-entry.ts +++ b/packages/vinext/src/entries/pages-server-entry.ts @@ -114,6 +114,7 @@ export async function generateServerEntry( headers: nextConfig?.headers ?? [], expireTime: nextConfig?.expireTime, cacheMaxMemorySize: nextConfig?.cacheMaxMemorySize, + htmlLimitedBots: nextConfig?.htmlLimitedBots, i18n: nextConfig?.i18n ?? null, // Mirrors Next.js `experimental.disableOptimizedLoading` — when false // (the default), page scripts are emitted with `defer` in . See @@ -358,6 +359,7 @@ const _renderPage = __createPagesPageHandler({ assetPrefix: vinextConfig.assetPrefix, trailingSlash: vinextConfig.trailingSlash, expireTime: vinextConfig.expireTime, + htmlLimitedBots: vinextConfig.htmlLimitedBots, clientTraceMetadata: vinextConfig.clientTraceMetadata, disableOptimizedLoading: vinextConfig.disableOptimizedLoading, }, diff --git a/packages/vinext/src/index.ts b/packages/vinext/src/index.ts index 85fd8452d..393d7ab14 100644 --- a/packages/vinext/src/index.ts +++ b/packages/vinext/src/index.ts @@ -3793,6 +3793,7 @@ export default function vinext(options: VinextOptions = {}): PluginOption[] { (nextConfig?.rewrites.afterFiles.length ?? 0) > 0 || (nextConfig?.rewrites.fallback.length ?? 
0) > 0, nextConfig?.clientTraceMetadata, + nextConfig?.htmlLimitedBots, ); flushStagedHeaders(); flushRequestHeaders(); diff --git a/packages/vinext/src/server/dev-server.ts b/packages/vinext/src/server/dev-server.ts index 2b86baeed..1fc2e29d1 100644 --- a/packages/vinext/src/server/dev-server.ts +++ b/packages/vinext/src/server/dev-server.ts @@ -66,6 +66,7 @@ import { } from "./pages-document-initial-props.js"; import { callDocumentGetInitialProps } from "./document-initial-head.js"; import { loadPagesGetInitialProps } from "./pages-get-initial-props.js"; +import { isBotUserAgent } from "../utils/html-limited-bots.js"; /** * Render a React element to a string using renderToReadableStream. @@ -409,6 +410,7 @@ export function createSSRHandler( * `next.config`. When undefined or empty, no meta tags are emitted. */ clientTraceMetadata?: readonly string[], + htmlLimitedBots?: string, ) { const matcher = fileMatcher ?? createValidFileMatcher(); @@ -740,7 +742,10 @@ export function createSSRHandler( // Render the loading shell for `fallback: true` when the path // wasn't pre-rendered. Data requests still resolve real props so // the client can swap in after the shell ships. - if (fallback === true && !isValidPath && !isDataReq) { + const userAgentHeader = req.headers["user-agent"]; + const userAgent = Array.isArray(userAgentHeader) ? 
userAgentHeader[0] : userAgentHeader; + const isBotRequest = !!userAgent && isBotUserAgent(userAgent, htmlLimitedBots); + if (fallback === true && !isValidPath && !isDataReq && !isBotRequest) { isFallbackRender = true; if (typeof routerShim.setSSRContext === "function") { routerShim.setSSRContext({ diff --git a/packages/vinext/src/server/pages-page-data.ts b/packages/vinext/src/server/pages-page-data.ts index 65700724c..689c6349d 100644 --- a/packages/vinext/src/server/pages-page-data.ts +++ b/packages/vinext/src/server/pages-page-data.ts @@ -25,6 +25,7 @@ import { import { buildNextDataJsonResponse } from "./pages-data-route.js"; import { NEXTJS_DEPLOYMENT_ID_HEADER } from "./headers.js"; import { isSerializableProps } from "./pages-serializable-props.js"; +import { isBotUserAgent } from "../utils/html-limited-bots.js"; type PagesRedirectResult = { destination: string; @@ -183,6 +184,7 @@ export type ResolvePagesPageDataOptions = { * Typically sourced from `process.env.__VINEXT_DEPLOYMENT_ID || process.env.NEXT_DEPLOYMENT_ID`. */ deploymentId?: string; + htmlLimitedBots?: string; pageModule: PagesPageModule; params: Record; query: Record; @@ -544,7 +546,9 @@ export async function resolvePagesPageData( // Render the fallback shell for unlisted paths under `fallback: true`. // Data requests resolve props normally so the client can fill in after // the loading shell ships (`fallback: 'blocking'` keeps SSRing as before). 
- if (fallback === true && !isValidPath && !options.isDataReq) { + const isBotRequest = + !!options.userAgent && isBotUserAgent(options.userAgent, options.htmlLimitedBots); + if (fallback === true && !isValidPath && !options.isDataReq && !isBotRequest) { isFallback = true; } } diff --git a/packages/vinext/src/server/pages-page-handler.ts b/packages/vinext/src/server/pages-page-handler.ts index 2d51ef3cb..8103d76a4 100644 --- a/packages/vinext/src/server/pages-page-handler.ts +++ b/packages/vinext/src/server/pages-page-handler.ts @@ -83,6 +83,7 @@ type VinextConfigSubset = { assetPrefix: string; trailingSlash: boolean; expireTime?: number; + htmlLimitedBots?: string; clientTraceMetadata?: readonly string[]; disableOptimizedLoading: boolean; }; @@ -511,6 +512,7 @@ export function createPagesPageHandler( applyRequestContexts: applySSRContext, buildId, deploymentId: process.env.__VINEXT_DEPLOYMENT_ID || process.env.NEXT_DEPLOYMENT_ID, + htmlLimitedBots: vinextConfig.htmlLimitedBots, createGsspReqRes() { return createPagesReqRes({ body: undefined, query, request, url: routeUrl }); }, diff --git a/packages/vinext/src/utils/html-limited-bots.ts b/packages/vinext/src/utils/html-limited-bots.ts index 3d6adcff0..90f57c838 100644 --- a/packages/vinext/src/utils/html-limited-bots.ts +++ b/packages/vinext/src/utils/html-limited-bots.ts @@ -2,6 +2,13 @@ // packages/next/src/shared/lib/router/utils/html-bots.ts const HTML_LIMITED_BOT_UA_RE_STRING = String.raw`[\w-]+-Google|Google-[\w-]+|Chrome-Lighthouse|Slurp|DuckDuckBot|baiduspider|yandex|sogou|bitlybot|tumblr|vkShare|quora link preview|redditbot|ia_archiver|Bingbot|BingPreview|applebot|facebookexternalhit|facebookcatalog|Twitterbot|LinkedInBot|Slackbot|Discordbot|WhatsApp|SkypeUriPreview|Yeti|googleweblight`; +// Headless browser bot (executes JS). Mirrors Next.js +// `HEADLESS_BROWSER_BOT_UA_RE` in +// `.nextjs-ref/packages/next/src/shared/lib/router/utils/is-bot.ts`. 
+// Matches "Googlebot" but NOT "Mediapartners-Google" / "AdsBot-Google" / +// other Google crawlers, which are covered by the HTML-limited list. +const HEADLESS_BROWSER_BOT_UA_RE = /Googlebot(?!-)|Googlebot$/i; + const htmlLimitedBotRegexCache = new Map(); export function getHtmlLimitedBotRegex(htmlLimitedBots: string | undefined): RegExp { @@ -13,3 +20,25 @@ export function getHtmlLimitedBotRegex(htmlLimitedBots: string | undefined): Reg htmlLimitedBotRegexCache.set(source, regex); return regex; } + +/** + * Returns true when the User-Agent matches a known crawler/bot. Combines + * Next.js's "headless browser bot" check (Googlebot proper) with the + * "HTML-limited bot" list (Bingbot, DuckDuckBot, facebookexternalhit, …). + * + * Used by the Pages Router fallback path: a bot hitting an unlisted + * `fallback: true` route should get a synchronous render (real content) and + * not the loading shell, so the crawler indexes the actual page. Mirrors + * Next.js's `isBot()` in `.nextjs-ref/packages/next/src/shared/lib/router/utils/is-bot.ts` + * and the bot-aware fallback flip in + * `.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts`. + * + * `htmlLimitedBots` allows next.config to override the HTML-limited list + * (same flag that drives `getHtmlLimitedBotRegex`), so a custom list applies + * to both streaming metadata gating and bot-aware fallback rendering. 
/**
 * Reports whether a request's `User-Agent` belongs to a crawler/bot.
 *
 * Two patterns are consulted, in this order:
 *   1. `HEADLESS_BROWSER_BOT_UA_RE` — a match short-circuits to `true`.
 *   2. The regex returned by `getHtmlLimitedBotRegex(htmlLimitedBots)`.
 *
 * The Pages Router fallback path uses this so that a bot requesting an
 * unlisted `fallback: true` route gets a synchronous render with real content
 * instead of the loading shell.
 *
 * @param userAgent - Raw `User-Agent` header value. A missing or empty value
 *   is never treated as a bot, so callers may pass the header straight
 *   through without pre-checking it.
 * @param htmlLimitedBots - Optional override from `next.config`, forwarded
 *   unchanged to `getHtmlLimitedBotRegex`.
 * @returns `true` when either pattern matches the User-Agent, else `false`.
 */
export function isBotUserAgent(
  userAgent: string | null | undefined,
  htmlLimitedBots?: string,
): boolean {
  // Absent header (undefined/null) or empty string: nothing to match against.
  if (!userAgent) return false;

  return (
    HEADLESS_BROWSER_BOT_UA_RE.test(userAgent) ||
    getHtmlLimitedBotRegex(htmlLimitedBots).test(userAgent)
  );
}
null } }; + }, + }, + params: { slug: "unknown" }, + query: { slug: "unknown" }, + route: { isDynamic: true }, + routeUrl: "/posts/unknown", + userAgent: "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + }), + ); + + expect(result.kind).toBe("render"); + if (result.kind !== "render") throw new Error("expected render result"); + expect(result.isFallback).toBe(false); + expect(gspCalled).toBe(true); + expect(result.pageProps).toMatchObject({ slug: "unknown" }); + }); + + it("sets isFallback for normal browser User-Agent on unlisted fallback: true paths", async () => { + const result = await resolvePagesPageData( + createOptions({ + pageModule: { + async getStaticPaths() { + return { + fallback: true, + paths: [{ params: { slug: "hello-world" } }], + }; + }, + async getStaticProps() { + throw new Error("getStaticProps should not run on a fallback shell render"); + }, + }, + params: { slug: "unknown" }, + query: { slug: "unknown" }, + route: { isDynamic: true }, + routeUrl: "/posts/unknown", + userAgent: + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36", + }), + ); + + expect(result.kind).toBe("render"); + if (result.kind !== "render") throw new Error("expected render result"); + expect(result.isFallback).toBe(true); + expect(result.pageProps).toEqual({}); + }); + it("short-circuits getServerSideProps responses after res.end()", async () => { const responsePromise = Promise.resolve( new Response('{"ok":true}', { diff --git a/tests/pages-router.test.ts b/tests/pages-router.test.ts index f7fe6450f..190ac706b 100644 --- a/tests/pages-router.test.ts +++ b/tests/pages-router.test.ts @@ -1601,6 +1601,56 @@ describe("Pages Router integration", () => { expect(json.pageProps).toMatchObject({ pid: "unknown" }); }); + // Refs #1543: bot/crawler requests must bypass the `fallback: true` loading + // shell and synchronously render real content so crawlers index the page, + // not 
`Loading...`. Mirrors Next.js's bot check in + // `.nextjs-ref/packages/next/src/server/route-modules/pages/pages-handler.ts` + // and the Next.js e2e regression test + // `.nextjs-ref/test/e2e/prerender-crawler.test.ts`. + it("renders synchronously (not the fallback shell) for crawler UAs on unlisted fallback: true paths", async () => { + const userAgents = [ + "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", + "Mozilla/5.0 (compatible; Bingbot/2.0; +http://www.bing.com/bingbot.htm)", + "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)", + "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)", + "facebookexternalhit/1.0 (+http://www.facebook.com/externalhit_uatext.php)", + ]; + for (const userAgent of userAgents) { + const slug = `bot-slug-${Math.random().toString(36).slice(2)}`; + const res = await fetch(`${baseUrl}/products/${slug}`, { + headers: { "user-agent": userAgent }, + }); + expect(res.status, `UA: ${userAgent}`).toBe(200); + const html = await res.text(); + // Bot should see the real rendered page, not the loading shell. + expect(html, `UA: ${userAgent}`).not.toContain("Loading product..."); + expect(html, `UA: ${userAgent}`).toMatch(new RegExp(`Product ID:.*${slug}`)); + const match = html.match(/__NEXT_DATA__\s*=\s*(\{.*?\})\s*[;<]/); + expect(match, `UA: ${userAgent}`).toBeTruthy(); + const nextData = JSON.parse(match![1]); + expect(nextData.isFallback, `UA: ${userAgent}`).toBe(false); + expect(nextData.props.pageProps).toMatchObject({ pid: slug }); + } + }); + + it("still ships the fallback shell for normal browser UAs on unlisted fallback: true paths", async () => { + // Counterpart of the crawler test — the bot-flip must not catch real + // browsers. Plain Chrome UA should still receive the loading shell. 
+ const res = await fetch(`${baseUrl}/products/non-bot-slug`, { + headers: { + "user-agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36", + }, + }); + expect(res.status).toBe(200); + const html = await res.text(); + expect(html).toContain("Loading product..."); + const match = html.match(/__NEXT_DATA__\s*=\s*(\{.*?\})\s*[;<]/); + expect(match).toBeTruthy(); + const nextData = JSON.parse(match![1]); + expect(nextData.isFallback).toBe(true); + }); + it("includes isFallback: false in __NEXT_DATA__", async () => { const res = await fetch(`${baseUrl}/products/widget`); const html = await res.text();