Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .changeset/green-rice-jump.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
"@browserbasehq/stagehand": patch
---

Fix file input handling in `observe` context so upload inputs are preserved and can be reliably targeted by XPath.

Also adds an eval and example showing the upload flow:
1. use `observe` to find the file input,
2. unpack the returned XPath,
3. call `page.locator(xpath).setInputFiles(...)`.
69 changes: 69 additions & 0 deletions packages/core/examples/observe_file_input_upload.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
/**
* Observe file input example:
* 1. Use observe() to get the xpath selector for the upload input.
* 2. Unpack the first observed result.
* 3. Pass that xpath into page.locator().setInputFiles().
*/
import { promises as fs } from "fs";
import path from "path";
import crypto from "crypto";
import { Stagehand } from "../lib/v3";

// Page this example navigates to before asking observe() for the upload input.
const FILE_UPLOAD_V2_URL =
"https://browserbase.github.io/stagehand-eval-sites/sites/file-uploads-2/";
// CSS selector of the element whose text is checked for the
// "resume uploaded" confirmation after the file has been set.
const RESUME_SUCCESS = "#resumeSuccess";

/**
 * End-to-end example: locate a file input with observe() and upload a
 * throwaway fixture through the returned XPath.
 *
 * Steps:
 *  1. Start a Stagehand session and take the first page of its context.
 *  2. Write a small, randomly named text fixture into the current working
 *     directory.
 *  3. Navigate to the demo page and ask observe() for the resume upload input.
 *  4. Feed the first observation's selector to page.locator().setInputFiles().
 *  5. Check that the success element contains "resume uploaded".
 *
 * @throws Error when observe() yields no selector or when the confirmation
 *   text is not found on the page.
 */
async function observeFileInputUpload() {
  const stagehand = new Stagehand({
    env: "BROWSERBASE",
    verbose: 1,
  });

  // Random suffix keeps concurrent runs from clobbering each other's fixture.
  const fixturePath = path.resolve(
    process.cwd(),
    `observe-file-upload-example-${crypto.randomBytes(4).toString("hex")}.txt`,
  );

  await stagehand.init();
  const page = stagehand.context.pages()[0];

  try {
    await fs.writeFile(
      fixturePath,
      "Stagehand observe() + setInputFiles() example",
      "utf8",
    );
    await page.goto(FILE_UPLOAD_V2_URL);

    const observations = await stagehand.observe(
      "Find the resume file upload input element. Return the actual upload input field.",
    );

    // Unpack the result and use the observed xpath directly.
    const [resumeUploadInput] = observations;
    if (!resumeUploadInput?.selector) {
      throw new Error("observe() did not return a file input selector");
    }
    const xpath = resumeUploadInput.selector;
    await page.locator(xpath).setInputFiles(fixturePath);

    // Runs in the page: look up the success element and inspect its text.
    const uploaded = await page.evaluate((selector) => {
      const success = document.querySelector(selector);
      if (!success) return false;
      const text = (success.textContent ?? "").toLowerCase();
      return text.includes("resume uploaded");
    }, RESUME_SUCCESS);

    if (!uploaded) {
      throw new Error("upload confirmation not found");
    }
    console.log(`Uploaded fixture with selector: ${xpath}`);
  } finally {
    // Nested finally: the fixture must be removed even when close() rejects,
    // otherwise a failed shutdown leaves the temp file behind in cwd.
    try {
      await stagehand.close();
    } finally {
      // Best-effort cleanup; the file may never have been written.
      await fs.unlink(fixturePath).catch(() => {});
    }
  }
}

// Script entry point. The rejection is handled explicitly so a failed run is
// reported once and the process exits with a non-zero status instead of
// relying on the runtime's unhandled-rejection behaviour.
observeFileInputUpload().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
5 changes: 5 additions & 0 deletions packages/core/lib/v3/types/private/snapshot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,13 +51,17 @@ export type SessionDomIndex = {
rootBackend: number;
absByBe: Map<number, string>;
tagByBe: Map<number, string>;
/** backendNodeId -> lowercased input type value for <input> nodes. */
inputTypeByBe?: Map<number, string>;
scrollByBe: Map<number, boolean>;
docRootOf: Map<number, number>;
contentDocRootByIframe: Map<number, number>;
};

export type FrameDomMaps = {
tagNameMap: Record<string, string>;
/** EncodedId -> lowercased input type value for <input> nodes. */
inputTypeMap?: Record<string, string>;
xpathMap: Record<string, string>;
scrollableMap: Record<string, boolean>;
urlMap: Record<string, string>;
Expand Down Expand Up @@ -106,6 +110,7 @@ export type A11yOptions = {
focusSelector?: string;
experimental: boolean;
tagNameMap: Record<string, string>;
inputTypeMap?: Record<string, string>;
scrollableMap: Record<string, boolean>;
encode: (backendNodeId: number) => string;
};
Expand Down
82 changes: 79 additions & 3 deletions packages/core/lib/v3/understudy/a11y/snapshot/a11yTree.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,11 @@ export async function a11yForFrame(

const decorated = decorateRoles(nodesForOutline, opts);
const { tree } = await buildHierarchicalTree(decorated, opts);
const treeWithFileInputs = appendMissingFileInputNodes(tree, decorated, opts);

const simplified = tree.map((n) => formatTreeLine(n)).join("\n");
const simplified = treeWithFileInputs
.map((n) => formatTreeLine(n))
.join("\n");
return { outline: simplified.trimEnd(), urlMap, scopeApplied };
}

Expand Down Expand Up @@ -154,7 +157,8 @@ export async function buildHierarchicalTree(
const keep =
!!(n.name && n.name.trim()) ||
!!(n.childIds && n.childIds.length) ||
!isStructural(n.role);
!isStructural(n.role) ||
isFileInputNode(n, opts);
if (!keep) continue;
nodeMap.set(n.nodeId, { ...n });
}
Expand All @@ -181,6 +185,9 @@ export async function buildHierarchicalTree(

const children = node.children ?? [];
if (!children.length) {
if (isFileInputNode(node, opts)) {
return { ...node, role: "file input" };
}
return isStructural(node.role) ? null : node;
}

Expand All @@ -190,12 +197,15 @@ export async function buildHierarchicalTree(

const prunedStatic = removeRedundantStaticTextChildren(node, cleanedKids);

if (isStructural(node.role)) {
if (isStructural(node.role) && !isFileInputNode(node, opts)) {
if (prunedStatic.length === 1) return prunedStatic[0]!;
if (prunedStatic.length === 0) return null;
}

let newRole = node.role;
if (isFileInputNode(node, opts)) {
newRole = "file input";
}
if ((newRole === "generic" || newRole === "none") && node.encodedId) {
const tagName = opts.tagNameMap[node.encodedId];
if (tagName) newRole = tagName;
Expand All @@ -210,6 +220,72 @@ export async function buildHierarchicalTree(
}
}

/**
 * Tell whether the element identified by `encodedId` is an
 * `<input type="file">`, based on the tag-name and input-type lookup tables
 * carried in `opts`. Both values are compared case-insensitively; a missing
 * entry in either table yields `false`.
 */
function isEncodedFileInput(encodedId: string, opts: A11yOptions): boolean {
  const tagName = String(opts.tagNameMap[encodedId] ?? "").toLowerCase();
  if (tagName !== "input") return false;

  const typeValue = String(opts.inputTypeMap?.[encodedId] ?? "").toLowerCase();
  return typeValue === "file";
}

/**
 * Node-level wrapper around isEncodedFileInput(): a node without an
 * `encodedId` is never treated as a file input.
 */
function isFileInputNode(node: A11yNode, opts: A11yOptions): boolean {
  return node.encodedId ? isEncodedFileInput(node.encodedId, opts) : false;
}

/**
 * Gather every `encodedId` found in `nodes` and all of their descendants.
 *
 * Nodes are visited parent-first, children left to right, so ids are added
 * to the set in that order.
 *
 * @param nodes Roots to walk.
 * @param out Set to add ids to; a fresh one is created when omitted.
 * @returns The same set instance that was passed (or created) as `out`.
 */
function collectEncodedIds(
  nodes: A11yNode[],
  out = new Set<string>(),
): Set<string> {
  // Explicit stack; entries are pushed reversed so pop() yields document order.
  const pending: A11yNode[] = [...nodes].reverse();
  while (pending.length > 0) {
    const current = pending.pop() as A11yNode;
    if (current.encodedId) {
      out.add(current.encodedId);
    }
    if (current.children?.length) {
      for (let i = current.children.length - 1; i >= 0; i -= 1) {
        pending.push(current.children[i] as A11yNode);
      }
    }
  }
  return out;
}

/**
 * Make sure every `<input type="file">` listed in `opts.inputTypeMap` shows
 * up in the outline, even when it is absent from `tree`.
 *
 * For each file input whose encoded id is not already somewhere in `tree`:
 *  - if `decorated` contains a node with that id, a copy of the first such
 *    node is appended with role "file input" and its children cleared;
 *  - otherwise a minimal synthetic node is appended.
 *
 * NOTE(review): entries come from the whole `inputTypeMap`; if that map can
 * cover more of the document than `tree` does (e.g. a scoped snapshot), file
 * inputs outside the scope would be appended too — confirm against callers.
 *
 * @returns `tree` itself when nothing is missing, otherwise a new array with
 *   the extra nodes appended after the existing roots.
 */
function appendMissingFileInputNodes(
  tree: A11yNode[],
  decorated: A11yNode[],
  opts: A11yOptions,
): A11yNode[] {
  const alreadyInTree = collectEncodedIds(tree);

  // First decorated node wins when several share an encoded id.
  const firstDecoratedById = new Map<string, A11yNode>();
  decorated.forEach((candidate) => {
    const id = candidate.encodedId;
    if (!id || firstDecoratedById.has(id)) return;
    firstDecoratedById.set(id, candidate);
  });

  const missing: A11yNode[] = Object.entries(opts.inputTypeMap ?? {})
    .filter(
      ([id, type]) =>
        String(type).toLowerCase() === "file" &&
        isEncodedFileInput(id, opts) &&
        !alreadyInTree.has(id),
    )
    .map(([id]): A11yNode => {
      const known = firstDecoratedById.get(id);
      if (known) {
        return { ...known, role: "file input", children: undefined };
      }
      return {
        role: "file input",
        nodeId: `synthetic-file-${id}`,
        encodedId: id,
      };
    });

  return missing.length === 0 ? tree : [...tree, ...missing];
}

export function isStructural(role: string): boolean {
const r = role?.toLowerCase();
return r === "generic" || r === "none" || r === "inlinetextbox";
Expand Down
28 changes: 20 additions & 8 deletions packages/core/lib/v3/understudy/a11y/snapshot/capture.ts
Original file line number Diff line number Diff line change
Expand Up @@ -173,20 +173,22 @@ export async function tryScopedSnapshot(
const sameSessionAsParent =
!!parentId &&
ownerSession(page, parentId) === ownerSession(page, targetFrameId);
const { tagNameMap, xpathMap, scrollableMap } = await domMapsForSession(
owningSess,
targetFrameId,
pierce,
(fid, be) => `${page.getOrdinal(fid)}-${be}`,
sameSessionAsParent,
);
const { tagNameMap, inputTypeMap, xpathMap, scrollableMap } =
await domMapsForSession(
owningSess,
targetFrameId,
pierce,
(fid, be) => `${page.getOrdinal(fid)}-${be}`,
sameSessionAsParent,
);

const { outline, urlMap, scopeApplied } = await a11yForFrame(
owningSess,
targetFrameId,
{
focusSelector: tailSelector || undefined,
tagNameMap,
inputTypeMap,
experimental: options?.experimental ?? false,
scrollableMap,
encode: (backendNodeId) =>
Expand Down Expand Up @@ -308,6 +310,7 @@ export async function collectPerFrameMaps(
}

const tagNameMap: Record<string, string> = {};
const inputTypeMap: Record<string, string> = {};
const xpathMap: Record<string, string> = {};
const scrollableMap: Record<string, boolean> = {};
const enc = (be: number) => `${page.getOrdinal(frameId)}-${be}`;
Expand All @@ -323,18 +326,27 @@ export async function collectPerFrameMaps(
xpathMap[key] = rel;
const tag = idx.tagByBe.get(be);
if (tag) tagNameMap[key] = tag;
const inputType = idx.inputTypeByBe?.get(be);
if (inputType) inputTypeMap[key] = inputType;
if (idx.scrollByBe.get(be)) scrollableMap[key] = true;
}

const { outline, urlMap } = await a11yForFrame(sess, frameId, {
experimental: options?.experimental ?? false,
tagNameMap,
inputTypeMap,
scrollableMap,
encode: (backendNodeId) => `${page.getOrdinal(frameId)}-${backendNodeId}`,
});

perFrameOutlines.push({ frameId, outline });
perFrameMaps.set(frameId, { tagNameMap, xpathMap, scrollableMap, urlMap });
perFrameMaps.set(frameId, {
tagNameMap,
inputTypeMap,
xpathMap,
scrollableMap,
urlMap,
});
}

return { perFrameMaps, perFrameOutlines };
Expand Down
25 changes: 24 additions & 1 deletion packages/core/lib/v3/understudy/a11y/snapshot/domTree.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,21 @@ function isCborStackError(message: string): boolean {
return message.includes("CBOR: stack limit exceeded");
}

/**
 * Read the `type` of an `<input>` DOM node.
 *
 * `node.attributes` is walked as a flat list of alternating name/value
 * entries; the first attribute named `type` (case-insensitive) decides.
 *
 * @returns `undefined` when the node is not an `<input>`; otherwise the
 *   trimmed, lowercased `type` value, or "text" when the attribute is absent
 *   or blank.
 */
function extractInputType(node: Protocol.DOM.Node): string | undefined {
  if (String(node.nodeName ?? "").toLowerCase() !== "input") {
    return undefined;
  }

  const pairs = Array.isArray(node.attributes) ? node.attributes : [];
  let cursor = 0;
  // A trailing name without a value is ignored.
  while (cursor + 1 < pairs.length) {
    const attrName = String(pairs[cursor] ?? "").toLowerCase();
    if (attrName === "type") {
      const normalized = String(pairs[cursor + 1] ?? "")
        .trim()
        .toLowerCase();
      return normalized === "" ? "text" : normalized;
    }
    cursor += 2;
  }
  return "text";
}

/**
* Determine if CDP truncated a node's children when streaming the DOM tree.
* childNodeCount stays accurate even when `children` are omitted; we use this to
Expand Down Expand Up @@ -182,6 +197,7 @@ export async function domMapsForSession(
attemptOwnerLookup = true,
): Promise<{
tagNameMap: Record<string, string>;
inputTypeMap: Record<string, string>;
xpathMap: Record<string, string>;
scrollableMap: Record<string, boolean>;
}> {
Expand All @@ -208,6 +224,7 @@ export async function domMapsForSession(
}

const tagNameMap: Record<string, string> = {};
const inputTypeMap: Record<string, string> = {};
const xpathMap: Record<string, string> = {};
const scrollableMap: Record<string, boolean> = {};

Expand All @@ -220,6 +237,8 @@ export async function domMapsForSession(
if (node.backendNodeId) {
const encId = encode(frameId, node.backendNodeId);
tagNameMap[encId] = String(node.nodeName).toLowerCase();
const inputType = extractInputType(node);
if (inputType) inputTypeMap[encId] = inputType;
xpathMap[encId] = xpath || "/";
const isScrollable = node?.isScrollable === true;
if (isScrollable) scrollableMap[encId] = true;
Expand All @@ -246,7 +265,7 @@ export async function domMapsForSession(
}
}

return { tagNameMap, xpathMap, scrollableMap };
return { tagNameMap, inputTypeMap, xpathMap, scrollableMap };
}

/**
Expand All @@ -263,6 +282,7 @@ export async function buildSessionDomIndex(

const absByBe = new Map<number, string>();
const tagByBe = new Map<number, string>();
const inputTypeByBe = new Map<number, string>();
const scrollByBe = new Map<number, boolean>();
const docRootOf = new Map<number, number>();
const contentDocRootByIframe = new Map<number, number>();
Expand All @@ -276,6 +296,8 @@ export async function buildSessionDomIndex(
if (node.backendNodeId) {
absByBe.set(node.backendNodeId, xp || "/");
tagByBe.set(node.backendNodeId, String(node.nodeName).toLowerCase());
const inputType = extractInputType(node);
if (inputType) inputTypeByBe.set(node.backendNodeId, inputType);
if (node?.isScrollable === true) scrollByBe.set(node.backendNodeId, true);
docRootOf.set(node.backendNodeId, docRootBe);
}
Expand Down Expand Up @@ -305,6 +327,7 @@ export async function buildSessionDomIndex(
rootBackend: rootBe,
absByBe,
tagByBe,
inputTypeByBe,
scrollByBe,
docRootOf,
contentDocRootByIframe,
Expand Down
Loading
Loading