Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,35 @@ decodo google-search "query" --full --pretty
decodo google-search "query" --format ndjson --full | jq -c '.url'
```

## Batch input

Run the same command over many inputs from a file. Works with `scrape`, `search`, `screenshot`, and any target subcommand.

| Flag | Effect |
| --- | --- |
| `--input-file <path>` | Read inputs from a `.txt` (one URL/query per line) or `.csv` file |
| `--input-column <name>` | Column to read when `--input-file` is a CSV (required for CSV) |
| `--concurrency <n>` | Number of requests to run in parallel (default: 4) |

By default, results stream to stdout as NDJSON — one JSON record per line, whether or not stdout is a terminal. Each record carries the item's `index` and `input`, plus either a `result` or an `error` (`{ class, message }`). A failed item does not stop the batch; a summary is printed to stderr at the end. Pass `-o <dir>` to write one file per item instead (named by URL slug or row index); commands that produce binary output, such as `screenshot`, require `-o <dir>` in batch mode. Inputs are streamed, so large input files are never loaded into memory all at once.

```bash
# Scrape every URL in a file, 8 at a time, as NDJSON
decodo scrape --input-file urls.txt --concurrency 8 > results.ndjson

# CSV input: choose which column holds the URL/query
decodo scrape --input-file products.csv --input-column url

# One result file per input, written into a directory
decodo scrape --input-file urls.txt -o results/

# Batch a target subcommand, then post-process with jq
decodo google-search --input-file queries.txt --format ndjson | jq -c '.input'

# Batch screenshots — PNG per item, output directory required
decodo screenshot --input-file urls.txt -o shots/
```

## Examples

### Pipe-friendly workflows
Expand Down
19 changes: 19 additions & 0 deletions src/batch/commands/attach-batch-options.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import type { Command } from "commander";
import { parseConcurrency } from "../services/resolve-concurrency.js";

/**
 * Registers the batch-mode options (`--input-file`, `--input-column`,
 * `--concurrency`) on a command.
 *
 * @param command - Command to extend.
 * @returns Whatever the final `.option()` call returns, so callers can keep chaining.
 */
export function attachBatchOptions(command: Command): Command {
  const withInputFile = command.option(
    "--input-file <path>",
    "Run each line/row of a .txt or .csv file as a batch item"
  );

  const withInputColumn = withInputFile.option(
    "--input-column <name>",
    "Column to read inputs from when --input-file is a CSV"
  );

  // The raw string is validated and converted by parseConcurrency at parse time.
  return withInputColumn.option(
    "--concurrency <n>",
    "Max requests to run in parallel in batch mode (default: 4)",
    parseConcurrency
  );
}
39 changes: 39 additions & 0 deletions src/batch/services/binary-directory-sink.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import { mkdirSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import type { BatchResult } from "../types/batch-result.js";
import type { BatchSink } from "../types/batch-sink.js";
import { batchItemFilename } from "./batch-item-filename.js";
import { toBatchRecord } from "./batch-record.js";
import { uniqueName } from "./unique-name.js";

/**
 * Type guard: reports whether `value` is a `Uint8Array` (or an instance of a
 * subclass of it).
 */
function isBytes(value: unknown): value is Uint8Array {
  const isByteArray = value instanceof Uint8Array;
  return isByteArray;
}

/**
 * Builds a sink that writes one file per batch result into `dir`.
 *
 * The directory is created (recursively) as soon as the sink is constructed.
 * For every result:
 * - a successful result whose `data` is a `Uint8Array` is written verbatim to
 *   `<name>.<extension>`;
 * - anything else is serialized with `toBatchRecord` as indented JSON and
 *   written to `<name>.error.json`.
 *
 * `<name>` comes from `batchItemFilename(input, index)` passed through
 * `uniqueName` together with a set shared across all writes of this sink
 * (presumably to keep file names from colliding — see `uniqueName`).
 *
 * @param dir - Output directory.
 * @param extension - File extension (without the dot) for binary payloads.
 */
export function createBinaryDirectorySink(
  dir: string,
  extension = "png"
): BatchSink {
  mkdirSync(dir, { recursive: true });
  const takenNames = new Set<string>();

  const write = (result: BatchResult): void => {
    const baseName = batchItemFilename(result.input, result.index);
    const name = uniqueName(baseName, takenNames);

    // Only a successful result carrying raw bytes is stored as a binary file.
    const bytes =
      result.ok && isBytes(result.data) ? result.data : undefined;

    if (bytes !== undefined) {
      writeFileSync(join(dir, `${name}.${extension}`), bytes);
      return;
    }

    const errorPath = join(dir, `${name}.error.json`);
    const payload = JSON.stringify(toBatchRecord(result), null, 2);
    writeFileSync(errorPath, payload, "utf8");
  };

  return { write };
}
14 changes: 14 additions & 0 deletions src/batch/services/resolve-concurrency.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import { CliUsageError } from "../../platform/services/handle-cli-error.js";
import { DEFAULT_CONCURRENCY } from "../constants.js";

/**
 * Parses the raw `--concurrency` option value.
 *
 * Surrounding whitespace and an optional leading `+` are tolerated; everything
 * else must be decimal digits.
 *
 * @param value - Raw string taken from the command line.
 * @returns The parsed concurrency, an integer >= 1.
 * @throws CliUsageError when the value is not a positive integer.
 */
export function parseConcurrency(value: string): number {
  const trimmed = value.trim();

  // Number.parseInt stops at the first non-digit, so "8abc", "2.5" and "1e3"
  // would be silently accepted as 8, 2 and 1. Require a digits-only value.
  const parsed = /^\+?\d+$/.test(trimmed) ? Number(trimmed) : Number.NaN;

  // isSafeInteger also rejects NaN and digit strings too large to represent exactly.
  if (!Number.isSafeInteger(parsed) || parsed < 1) {
    throw new CliUsageError("--concurrency must be a positive integer.");
  }
  return parsed;
}

/**
 * Returns the concurrency to use: the given value when one was provided,
 * otherwise `DEFAULT_CONCURRENCY`.
 *
 * @param value - Parsed `--concurrency` value, if any.
 */
export function resolveConcurrency(value: number | undefined): number {
  if (value == null) {
    return DEFAULT_CONCURRENCY;
  }
  return value;
}
64 changes: 64 additions & 0 deletions src/batch/services/run-batch-command.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import { ValidationError } from "@decodo/sdk-ts";
import { CliUsageError } from "../../platform/services/handle-cli-error.js";
import type { BatchSummary } from "../types/batch-result.js";
import type { BatchSink } from "../types/batch-sink.js";
import { createBinaryDirectorySink } from "./binary-directory-sink.js";
import { createDirectorySink } from "./directory-sink.js";
import { createNdjsonStdoutSink } from "./ndjson-stdout-sink.js";
import { readInputFile } from "./read-input-file.js";
import { runBatch } from "./run-batch.js";

/** Options accepted by `runBatchCommand`. */
export interface RunBatchCommandOptions {
/** When true, results go to the binary directory sink, which requires `output`. */
binary?: boolean;
/** Concurrency value forwarded to `runBatch`. */
concurrency: number;
/** Forwarded to `readInputFile` as `inputColumn`. */
inputColumn?: string;
/** Path handed to `readInputFile` to produce the batch items. */
inputFile: string;
/** Output directory; when omitted (and not `binary`), the NDJSON stdout sink is used. */
output?: string;
/** Forwarded to the directory sink as its `pretty` option. */
pretty?: boolean;
/** Worker invoked with each item's `input`. */
scrapeItem: (input: string) => Promise<unknown>;
}

/**
 * Picks the sink that receives batch results.
 *
 * - `binary`: binary directory sink; `output` is mandatory.
 * - `output` only: directory sink (honouring `pretty`).
 * - neither: NDJSON on stdout.
 *
 * @throws CliUsageError when `binary` is set without `output`.
 */
function selectSink(options: RunBatchCommandOptions): BatchSink {
  const { binary, output, pretty } = options;

  if (!binary) {
    return output
      ? createDirectorySink(output, { pretty })
      : createNdjsonStdoutSink();
  }

  // Binary payloads cannot be streamed as NDJSON, so a directory is required.
  if (!output) {
    throw new CliUsageError(
      "Batch mode for binary output requires -o <dir> to write files."
    );
  }

  return createBinaryDirectorySink(output);
}

/**
 * Runs a batch: reads inputs from `options.inputFile`, processes each one with
 * `options.scrapeItem` through `runBatch`, and forwards every result to the
 * sink chosen by `selectSink`.
 *
 * The sink is closed on both the success and the failure path. On success a
 * one-line summary is printed to stderr.
 *
 * @param options - Batch configuration (see `RunBatchCommandOptions`).
 * @returns The summary produced by `runBatch`.
 * @throws CliUsageError (from `selectSink`) when binary output has no `output` directory.
 * @throws ValidationError when the batch processed zero inputs.
 */
export async function runBatchCommand(
  options: RunBatchCommandOptions
): Promise<BatchSummary> {
  const sink = selectSink(options);
  const items = readInputFile(options.inputFile, {
    inputColumn: options.inputColumn,
  });

  let summary: BatchSummary;
  try {
    summary = await runBatch({
      items,
      concurrency: options.concurrency,
      worker: (item) => options.scrapeItem(item.input),
      onResult: (result) => sink.write(result),
    });
  } catch (error: unknown) {
    // Release the sink even when the batch aborts (e.g. reading the input or
    // writing a result failed); previously it was left open on this path.
    try {
      await sink.close?.();
    } catch {
      // Deliberately ignored: the batch failure is the error worth surfacing.
    }
    throw error;
  }

  await sink.close?.();

  if (summary.total === 0) {
    throw new ValidationError("Input file produced no inputs.");
  }

  console.error(
    `Batch complete: ${summary.succeeded} succeeded, ${summary.failed} failed (${summary.total} total).`
  );

  return summary;
}
5 changes: 5 additions & 0 deletions src/batch/types/batch-flags.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/**
 * Batch-related option values.
 * NOTE(review): presumably the parsed form of `--concurrency`, `--input-column`
 * and `--input-file` — confirm against the code that reads these flags.
 */
export interface BatchFlags {
/** Number of parallel requests, when given. */
concurrency?: number;
/** Name of the input column, when given. */
inputColumn?: string;
/** Path of the input file, when given. */
inputFile?: string;
}
7 changes: 5 additions & 2 deletions src/scrape/commands/scrape.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import { type DecodoSchema, Target, ValidationError } from "@decodo/sdk-ts";
import { Command } from "commander";
import { attachBatchOptions } from "../../batch/commands/attach-batch-options.js";
import { attachScrapeOutputOptions } from "../../output/commands/attach-output-options.js";
import { applyRequestDefaults } from "../../output/services/apply-request-defaults.js";
import type { OutputOptions } from "../../output/types/output-options.js";
import { CliUsageError } from "../../platform/services/handle-cli-error.js";
import { resolveTarget } from "../services/resolve-target.js";
import { createTargetAction } from "../services/run-target-scrape.js";
import type { ScrapeOptions } from "../types/scrape-command.js";
Expand Down Expand Up @@ -32,17 +34,18 @@ export function createScrapeCommand(schema: DecodoSchema): Command {
.description(
"Scrape a URL with the universal target (markdown by default). Use decodo universal for --markdown, --parse, and other API flags."
)
.argument("<url>", "URL to scrape")
.argument("[url]", "URL to scrape (omit when using --input-file)")
.option("--country <code>", "Geo / country code (maps to geo)")
.option("--headers <json>", "Request headers as a JSON object string")
.option("--target <name>", "Scrape target override (default: universal)");

attachScrapeOutputOptions(command);
attachBatchOptions(command);

return command.action(
createTargetAction(Target.Universal, schema, (url, options) => {
if (url === undefined) {
throw new Error("Missing required URL.");
throw new CliUsageError("Missing required URL.");
}

const opts = options as ScrapeOptions & OutputOptions;
Expand Down
7 changes: 5 additions & 2 deletions src/scrape/commands/screenshot.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import { type DecodoSchema, Target } from "@decodo/sdk-ts";
import { Command } from "commander";
import { attachBatchOptions } from "../../batch/commands/attach-batch-options.js";
import { attachScrapeOutputOptions } from "../../output/commands/attach-output-options.js";
import { CliUsageError } from "../../platform/services/handle-cli-error.js";
import { resolveTarget } from "../services/resolve-target.js";
import { createTargetAction } from "../services/run-target-scrape.js";
import type { ScreenshotOptions } from "../types/screenshot-command.js";
Expand All @@ -10,21 +12,22 @@ export function createScreenshotCommand(schema: DecodoSchema): Command {
.description(
"Capture a PNG screenshot (universal, headless). Use decodo universal --headless png for full options."
)
.argument("<url>", "URL to screenshot")
.argument("[url]", "URL to screenshot (omit when using --input-file)")
.option("--country <code>", "Geo / country code (maps to geo)")
.option("--target <name>", "Scrape target override (default: universal)");

attachScrapeOutputOptions(command, {
outputHelp: "Write PNG to file or directory (default name: <host>.png)",
});
attachBatchOptions(command);

return command.action(
createTargetAction(
Target.Universal,
schema,
(url, options) => {
if (url === undefined) {
throw new Error("Missing required URL.");
throw new CliUsageError("Missing required URL.");
}

const opts = options as ScreenshotOptions;
Expand Down
7 changes: 5 additions & 2 deletions src/scrape/commands/search.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import { type DecodoSchema, Target, ValidationError } from "@decodo/sdk-ts";
import { Command, Option } from "commander";
import { attachBatchOptions } from "../../batch/commands/attach-batch-options.js";
import { attachScrapeOutputOptions } from "../../output/commands/attach-output-options.js";
import { applyRequestDefaults } from "../../output/services/apply-request-defaults.js";
import { CliUsageError } from "../../platform/services/handle-cli-error.js";
import { resolveTarget } from "../services/resolve-target.js";
import { createTargetAction } from "../services/run-target-scrape.js";
import type { SearchOptions } from "../types/search-command.js";
Expand Down Expand Up @@ -52,7 +54,7 @@ export function createSearchCommand(schema: DecodoSchema): Command {
.description(
"Search the web (default: Google). Use decodo google-search or decodo bing-search for full options."
)
.argument("<query>", "Search query")
.argument("[query]", "Search query (omit when using --input-file)")
.addOption(
new Option("--engine <engine>", "Search engine")
.choices(["google", "bing"])
Expand All @@ -63,11 +65,12 @@ export function createSearchCommand(schema: DecodoSchema): Command {
.option("--target <name>", "Scrape target override");

attachScrapeOutputOptions(command);
attachBatchOptions(command);

return command.action(
createTargetAction(Target.GoogleSearch, schema, (query, options) => {
if (query === undefined) {
throw new Error("Missing required query.");
throw new CliUsageError("Missing required query.");
}

const opts = options as SearchOptions;
Expand Down
9 changes: 7 additions & 2 deletions src/scrape/services/command-builder.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import type { DecodoSchema } from "@decodo/sdk-ts";
import { type Command, Option } from "commander";
import type { JSONSchema4 } from "json-schema";
import { attachBatchOptions } from "../../batch/commands/attach-batch-options.js";
import { attachScrapeOutputOptions } from "../../output/commands/attach-output-options.js";
import { applyRequestDefaults } from "../../output/services/apply-request-defaults.js";
import { CliUsageError } from "../../platform/services/handle-cli-error.js";
import type { TargetCommandConfig } from "../types/target-command.js";
import { snakeToCamel, snakeToKebab } from "./naming.js";
import { getPrimaryInputField } from "./primary-input.js";
Expand Down Expand Up @@ -81,7 +83,7 @@ export function configureTargetCommand(
| undefined;
const inputHelp =
primarySchema?.description ?? `Primary ${primaryField} input`;
command.argument("<input>", inputHelp);
command.argument("[input]", inputHelp);
}

const optionFields = Object.keys(parameterSchema?.properties ?? {}).filter(
Expand All @@ -94,6 +96,7 @@ export function configureTargetCommand(
}

attachScrapeOutputOptions(command);
attachBatchOptions(command);

return { target, primaryField, optionFields };
}
Expand All @@ -109,7 +112,9 @@ export function buildScrapeBody(

if (config.primaryField) {
if (input === undefined) {
throw new Error(`Missing required input for ${config.primaryField}.`);
throw new CliUsageError(
`Missing required input for ${config.primaryField}.`
);
}
body[config.primaryField] = input;
}
Expand Down
Loading
Loading