diff --git a/packages/job-queue/src/job/Job.ts b/packages/job-queue/src/job/Job.ts index c5a6b00ce..fdab8308d 100644 --- a/packages/job-queue/src/job/Job.ts +++ b/packages/job-queue/src/job/Job.ts @@ -6,7 +6,7 @@ import { JobStatus } from "../queue-storage/IQueueStorage"; import { JobError } from "./JobError"; -import type { JobProgressListener } from "./JobQueueEventListeners"; +import type { JobProgressListener, StreamEventLike } from "./JobQueueEventListeners"; export { JobStatus }; @@ -20,6 +20,12 @@ export interface IJobExecuteContext { message?: string, details?: Record | null ) => Promise; + /** + * OPTIONAL. Present only when the worker's transport can deliver stream + * events. Jobs MUST NOT retain references to chunk buffers after calling + * this (buffers may be transferred across a worker boundary and detached). + */ + emitStreamEvent?: (event: StreamEventLike) => void; } /** diff --git a/packages/job-queue/src/job/JobQueueClient.ts b/packages/job-queue/src/job/JobQueueClient.ts index 2ae48100e..3d07c68b6 100644 --- a/packages/job-queue/src/job/JobQueueClient.ts +++ b/packages/job-queue/src/job/JobQueueClient.ts @@ -26,6 +26,8 @@ import { JobQueueEventListeners, JobQueueEventParameters, JobQueueEvents, + JobStreamListener, + type StreamEventLike, } from "./JobQueueEventListeners"; import type { JobQueueServer } from "./JobQueueServer"; import { storageToClass } from "./JobStorageConverters"; @@ -38,8 +40,36 @@ export interface JobHandle { waitFor(): Promise; abort(): Promise; onProgress(callback: JobProgressListener): () => void; + /** + * OPTIONAL — present only when this handle's transport can deliver stream + * events (a same-process server-attached queue). Absent on storage-only + * backends; callers branch on `typeof handle.onStream === "function"`. 
+ */ + onStream?(callback: JobStreamListener): () => void; + /** + * OPTIONAL — present only when the client was configured with an + * `outputStreamResolver` (an output cache backing reachable from this + * process). Awaits the job's completion, then streams the binary result + * back out of the output cache without materializing it. `port` selects the + * output port; omit it when the output holds exactly one cache ref. + * Resolves `undefined` when there is nothing binary to stream (or the + * cache entry was evicted). + */ + outputStream?(port?: string): Promise | undefined>; } +/** + * Resolves a completed job's output value to a byte stream. Injected via + * {@link JobQueueClientOptions.outputStreamResolver} because the cache layer + * that understands output refs lives above this package in the dependency + * graph (`@workglow/task-graph` exports `makeJobOutputStreamResolver` to + * build one from a cache backing). + */ +export type JobOutputStreamResolver = ( + output: unknown, + port?: string +) => Promise | undefined>; + /** * Options for creating a JobQueueClient */ @@ -47,6 +77,13 @@ export interface JobQueueClientOptions { readonly messageQueue: IMessageQueue>; readonly jobStore: IJobStore; readonly queueName: string; + /** + * OPTIONAL — enables `JobHandle.outputStream` on handles from this client. + * Deployments whose output cache backing is reachable from this process + * inject a resolver (see `makeJobOutputStreamResolver` in + * `@workglow/task-graph`); without it, handles omit the method. 
+ */ + readonly outputStreamResolver?: JobOutputStreamResolver; } /** @@ -61,6 +98,7 @@ export class JobQueueClient { protected readonly events = new EventEmitter>(); protected server: JobQueueServer | null = null; protected storageUnsubscribe: (() => void) | null = null; + protected readonly outputStreamResolver: JobOutputStreamResolver | undefined; /** * Map of job IDs to their pending promise resolvers @@ -78,6 +116,11 @@ export class JobQueueClient { */ protected readonly jobProgressListeners: Map> = new Map(); + /** + * Map of job IDs to their stream listeners + */ + protected readonly jobStreamListeners: Map> = new Map(); + /** * Last known progress state for each job */ @@ -94,6 +137,7 @@ export class JobQueueClient { this.queueName = options.queueName; this.messageQueue = options.messageQueue; this.jobStore = options.jobStore; + this.outputStreamResolver = options.outputStreamResolver; } /** @@ -391,6 +435,27 @@ export class JobQueueClient { }; } + /** + * Subscribe to stream events for a specific job + */ + public onJobStream(jobId: unknown, listener: JobStreamListener): () => void { + if (!this.jobStreamListeners.has(jobId)) { + this.jobStreamListeners.set(jobId, new Set()); + } + const listeners = this.jobStreamListeners.get(jobId)!; + listeners.add(listener); + + return () => { + const listeners = this.jobStreamListeners.get(jobId); + if (listeners) { + listeners.delete(listener); + if (listeners.size === 0) { + this.jobStreamListeners.delete(jobId); + } + } + }; + } + // ======================================================================== // Event handling // ======================================================================== @@ -524,23 +589,60 @@ export class JobQueueClient { } } + /** + * Called by server when a job emits a stream event. Listener throws are + * isolated per-listener — one misbehaving subscriber does not interrupt + * delivery to the rest or abort the dispatch itself. 
+ * @internal + */ + public handleJobStream(jobId: unknown, event: StreamEventLike): void { + this.events.emit("job_stream", this.queueName, jobId, event); + + const listeners = this.jobStreamListeners.get(jobId); + if (!listeners) return; + for (const listener of listeners) { + try { + listener(event); + } catch (err) { + getLogger().error("JobHandle.onStream listener threw", { + jobId, + error: err instanceof Error ? err.message : String(err), + }); + } + } + } + // ======================================================================== // Private helpers // ======================================================================== private createJobHandle(id: unknown): JobHandle { - return { + const handle: JobHandle = { id, waitFor: () => this.waitFor(id), abort: () => this.abort(id), onProgress: (callback: JobProgressListener) => this.onJobProgress(id, callback), }; + // Stream delivery requires a same-process server-attached transport — the + // same signal `connect()` uses. Storage-only backends omit `onStream`, so + // callers branch on `typeof handle.onStream === "function"`. + if (this.server) { + handle.onStream = (callback: JobStreamListener) => this.onJobStream(id, callback); + } + // Streaming result reads require a cache backing reachable from this + // process; the injected resolver is the capability signal. 
+ const resolver = this.outputStreamResolver; + if (resolver) { + handle.outputStream = async (port?: string) => resolver(await this.waitFor(id), port); + } + return handle; } private cleanupJob(jobId: unknown): void { this.activeJobPromises.delete(jobId); this.lastKnownProgress.delete(jobId); this.jobProgressListeners.delete(jobId); + this.jobStreamListeners.delete(jobId); } private handleStorageChange(change: QueueChangePayload): void { diff --git a/packages/job-queue/src/job/JobQueueEventListeners.ts b/packages/job-queue/src/job/JobQueueEventListeners.ts index ad3b7c457..18913f196 100644 --- a/packages/job-queue/src/job/JobQueueEventListeners.ts +++ b/packages/job-queue/src/job/JobQueueEventListeners.ts @@ -25,6 +25,7 @@ export type JobQueueEventListeners = { message: string, details: Record | null ) => void; + job_stream: (queueName: string, jobId: unknown, event: StreamEventLike) => void; }; export type JobQueueEvents = keyof JobQueueEventListeners; @@ -46,3 +47,16 @@ export type JobProgressListener = ( message: string, details: Record | null ) => void; + +/** + * Minimal structural shape of a stream event crossing the job-queue boundary. + * + * `@workglow/job-queue` sits below `@workglow/task-graph` in the dependency + * graph, so it cannot import task-graph's `StreamEvent`. This structural type + * captures just what the queue plumbing needs; task-graph's `StreamEvent` is + * assignable to it, so real stream producers interoperate transparently. + */ +export type StreamEventLike = { type: string; port?: string; [k: string]: unknown }; + +/** Listener for cross-process stream events emitted by an executing job. 
*/ +export type JobStreamListener = (event: StreamEventLike) => void; diff --git a/packages/job-queue/src/job/JobQueueServer.ts b/packages/job-queue/src/job/JobQueueServer.ts index 9f6e9fc59..e72dff9a0 100644 --- a/packages/job-queue/src/job/JobQueueServer.ts +++ b/packages/job-queue/src/job/JobQueueServer.ts @@ -13,6 +13,7 @@ import type { JobStorageFormat, QueueChangePayload } from "../queue-storage/IQue import { JobStatus } from "../queue-storage/IQueueStorage"; import type { DeadLetter } from "./DeadLetter"; import { Job, JobClass } from "./Job"; +import type { StreamEventLike } from "./JobQueueEventListeners"; import { JobQueueClient } from "./JobQueueClient"; import { JobQueueWorker } from "./JobQueueWorker"; import { storageToClass } from "./JobStorageConverters"; @@ -49,6 +50,7 @@ export type JobQueueServerEventListeners = { message: string, details: Record | null ) => void; + job_stream: (queueName: string, jobId: unknown, event: StreamEventLike) => void; }; export type JobQueueServerEvents = keyof JobQueueServerEventListeners; @@ -479,6 +481,11 @@ export class JobQueueServer< this.forwardToClients("handleJobProgress", jobId, progress, message, details); }); + worker.on("job_stream", (jobId, event) => { + this.events.emit("job_stream", this.queueName, jobId, event); + this.forwardToClients("handleJobStream", jobId, event); + }); + return worker; } @@ -502,6 +509,11 @@ export class JobQueueServer< message: string, details: Record | null ): void; + protected forwardToClients( + method: "handleJobStream", + jobId: unknown, + event: StreamEventLike + ): void; protected forwardToClients(method: string, ...args: unknown[]): void { for (const client of this.clients) { const fn = (client as any)[method]; diff --git a/packages/job-queue/src/job/JobQueueWorker.ts b/packages/job-queue/src/job/JobQueueWorker.ts index dbca8015f..93518ce44 100644 --- a/packages/job-queue/src/job/JobQueueWorker.ts +++ b/packages/job-queue/src/job/JobQueueWorker.ts @@ -31,7 +31,8 @@ 
import { RetryableJobError, } from "./JobError"; import { withJobErrorDiagnostics } from "./JobErrorDiagnostics"; -import { storageToClass } from "./JobStorageConverters"; +import type { StreamEventLike } from "./JobQueueEventListeners"; +import { classToStorage, storageToClass } from "./JobStorageConverters"; /** * Upper bound on {@link JobQueueWorker.getLimiterWakeDelay}. Prevents a @@ -56,6 +57,7 @@ export type JobQueueWorkerEventListeners = { message: string, details: Record | null ) => void; + job_stream: (jobId: unknown, event: StreamEventLike) => void; worker_start: () => void; worker_stop: () => void; }; @@ -812,6 +814,7 @@ export class JobQueueWorker< return await job.execute(job.input, { signal, updateProgress: this.updateProgress.bind(this, job.id), + emitStreamEvent: (event) => this.emitStreamEvent(job.id, event), }); } @@ -833,6 +836,17 @@ export class JobQueueWorker< this.events.emit("job_progress", jobId, progress, message, details); } + /** + * Emit a cross-process stream event for a job. + * + * Mirrors {@link updateProgress}: stream events are delivered in-memory via + * the `job_stream` event and forwarded by an attached `JobQueueServer` to + * subscribed clients. Storage is not touched. + */ + protected emitStreamEvent(jobId: unknown, event: StreamEventLike): void { + this.events.emit("job_stream", jobId, event); + } + /** Internal — resolve the active claim for a job id, throw if missing. */ private getClaim(jobId: unknown): IClaim> | undefined { return this.activeClaims.get(jobId); diff --git a/packages/task-graph/src/EXECUTION_MODEL.md b/packages/task-graph/src/EXECUTION_MODEL.md index 5ee415189..a6f0ac5ee 100644 --- a/packages/task-graph/src/EXECUTION_MODEL.md +++ b/packages/task-graph/src/EXECUTION_MODEL.md @@ -282,6 +282,26 @@ key = sha256(taskType + getCacheVersion() + fingerprint(inputs)) Failed tasks are never cached — only `Ok` results reach `saveOutput`. 
`saveOutput` is upsert by primary key (last writer wins) — the underlying `TaskOutputTabularRepository` calls `put()` on its tabular storage, so a same-key write replaces the existing row. +### Binary cache stream-out (refs on the read path) + +Binary output ports whose bytes were piped into a stream-capable cache carry a branded `CacheRef` in the cached row. On a **cache hit**, the runner mirrors the fresh-run event contract, driven by two graph-computed consumer hints (`IRunConfig.hasStreamingConsumers` / `hasMaterializingConsumers`): + +- **Stream-capable consumer** (`x-stream: "binary"` on both ends of an edge): the cached bytes replay as chunked `binary-delta` events, pull-paced from the repository's streaming reader (`getOutputStreamByRef`), so memory stays bounded by the read chunk size. The finish event keeps the ref at the port. +- **Materializing consumer** (target port cannot consume the stream): the ref hydrates into the **enriched finish event** as a `Blob`/`ArrayBuffer` (per the port's `format`), exactly what a fresh run's accumulator would have delivered. The *returned* output still carries the small ref. +- **No consumers**: no reads are performed; the synthetic finish carries the ref unchanged (callers resolve via `resolveOutput` / `resolveJobOutputStream`). + +**Rows store the wire form**: the cached row always carries the `CacheRef`, never inline bytes — JSON-row backings would destroy an inline `Blob`/`ArrayBuffer` (`JSON.stringify(new Blob(...))` is `"{}"`). Below-threshold hydration to inline bytes applies to the value **returned to the caller**, identically on fresh runs and cache hits. + +**Single binary port**: the cache sink keys bytes by `(taskType, inputs)` with no port axis, so cache-streaming supports exactly one binary output port. 
Tasks with multiple binary ports take the accumulation path (enforced in both `StreamPump.canStreamBinaryToCache` and `CacheCoordinator.getBinaryRefSinksByPolicy`); their inline outputs are only safely cacheable by non-JSON-row backings until per-port refs land. + +**Self-healing dangling refs**: when a ref needed for replay or hydration no longer resolves (blob evicted, cache cleared), the hit converts into a **miss** — the task re-executes and rewrites both the row and the bytes. No events are emitted before all refs are validated. + +**Input-side hydration**: any branded ref that reaches a task's resolved inputs is hydrated against the run's `CacheRegistry` (private first, then deterministic) before validation and cache-key computation, so ref-bearing inputs fingerprint identically to materialized ones. Binary-streaming input ports with a live input stream are skipped — those consumers take bytes from the stream. An unresolvable input ref fails the task with an error naming the port. + +**Queue consumers**: `JobHandle.outputStream(port?)` (present only when the `JobQueueClient` was configured with an `outputStreamResolver`, typically `makeJobOutputStreamResolver(repo)`) awaits completion and streams the binary result out of the cache without materializing it. + +`FsFolderTaskOutputRepository` (node/bun) is the production streaming backing: JSON rows via `FsFolderTabularStorage`, bytes as sidecar files under `<folder>/blobs/` written incrementally and published by atomic rename — `<taskType>_<inputsHash>.bin`, so a re-run overwrites rather than leaks. Two instances over one folder interoperate (the cross-process read story). + ### Durable execution model A run is an atomic unit on a single worker. 
When the worker crashes: diff --git a/packages/task-graph/src/bun.ts b/packages/task-graph/src/bun.ts index 4622a4480..075a6411f 100644 --- a/packages/task-graph/src/bun.ts +++ b/packages/task-graph/src/bun.ts @@ -7,3 +7,4 @@ // organize-imports-ignore export * from "./common"; +export * from "./common-server"; diff --git a/packages/task-graph/src/cache/BinaryPortCodec.ts b/packages/task-graph/src/cache/BinaryPortCodec.ts new file mode 100644 index 000000000..de09d8a07 --- /dev/null +++ b/packages/task-graph/src/cache/BinaryPortCodec.ts @@ -0,0 +1,116 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { registerPortCodec } from "@workglow/util"; +import { isCacheRef } from "./CacheRef"; + +/** + * JSON-safe wire form for inline binary port values. Without a codec, + * `JSON.stringify(new Blob(...))` is `"{}"` — a cacheable task with a binary + * output port and a non-streaming (JSON-row) cache backing would silently + * persist an empty object and corrupt every later cache hit. + * + * Base64 inflates the payload ~1.33× (plus a UTF-16 spike inside + * `JSON.stringify`), so this path is intended for small payloads; large + * binary outputs belong on a streaming-capable backing where the row carries + * a `CacheRef` envelope instead of bytes. 
+ */ +export interface BinaryPortWire { + readonly __binaryPortWire: 1; + readonly base64: string; + readonly size: number; + readonly mime: string | undefined; +} + +function isBinaryPortWire(value: unknown): value is BinaryPortWire { + if (value === null || typeof value !== "object") return false; + const o = value as Record; + return o.__binaryPortWire === 1 && typeof o.base64 === "string" && typeof o.size === "number"; +} + +function bytesToBase64(bytes: Uint8Array): string { + if (typeof Buffer !== "undefined") { + return Buffer.from(bytes.buffer, bytes.byteOffset, bytes.byteLength).toString("base64"); + } + let bin = ""; + for (let i = 0; i < bytes.length; i++) bin += String.fromCharCode(bytes[i] ?? 0); + return btoa(bin); +} + +function base64ToBytes(b64: string): Uint8Array { + if (typeof Buffer !== "undefined") { + const buf = Buffer.from(b64, "base64"); + return new Uint8Array(buf.buffer, buf.byteOffset, buf.byteLength); + } + const bin = atob(b64); + const out = new Uint8Array(bin.length); + for (let i = 0; i < bin.length; i++) out[i] = bin.charCodeAt(i); + return out; +} + +/** + * Values that are not raw bytes pass through unchanged in BOTH directions: + * on the streaming-cache path the port slot holds a `CacheRef` envelope (the + * row must keep it verbatim for cache-hit replay/hydration), and legacy rows + * may hold shapes written before this codec existed. Only `Blob`, + * `ArrayBuffer`, and `ArrayBuffer` views are encoded. + */ +async function serializeBinary(value: unknown): Promise { + if (value instanceof Blob) { + const bytes = new Uint8Array(await value.arrayBuffer()); + return { + __binaryPortWire: 1, + base64: bytesToBase64(bytes), + size: bytes.byteLength, + mime: value.type === "" ? 
undefined : value.type, + } satisfies BinaryPortWire; + } + if (value instanceof ArrayBuffer) { + const bytes = new Uint8Array(value); + return { + __binaryPortWire: 1, + base64: bytesToBase64(bytes), + size: bytes.byteLength, + mime: undefined, + } satisfies BinaryPortWire; + } + if (ArrayBuffer.isView(value)) { + const view = value as ArrayBufferView; + const bytes = new Uint8Array(view.buffer, view.byteOffset, view.byteLength); + return { + __binaryPortWire: 1, + base64: bytesToBase64(bytes), + size: bytes.byteLength, + mime: undefined, + } satisfies BinaryPortWire; + } + return value; +} + +registerPortCodec("blob", { + async serialize(value) { + return serializeBinary(value); + }, + async deserialize(wire) { + if (isCacheRef(wire) || !isBinaryPortWire(wire)) return wire; + const bytes = base64ToBytes(wire.base64); + return new Blob( + [bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength) as ArrayBuffer], + wire.mime ? { type: wire.mime } : undefined + ); + }, +}); + +registerPortCodec("binary", { + async serialize(value) { + return serializeBinary(value); + }, + async deserialize(wire) { + if (isCacheRef(wire) || !isBinaryPortWire(wire)) return wire; + const bytes = base64ToBytes(wire.base64); + return bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength); + }, +}); diff --git a/packages/task-graph/src/cache/CacheRef.ts b/packages/task-graph/src/cache/CacheRef.ts new file mode 100644 index 000000000..eeffffd28 --- /dev/null +++ b/packages/task-graph/src/cache/CacheRef.ts @@ -0,0 +1,94 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Brand value for {@link CacheRef}. A literal string (not a Symbol) so the brand + * survives JSON serialization across queue rows / IPC boundaries — a Symbol-based + * brand would be erased by `JSON.stringify` and the resulting object would no + * longer be identifiable as a cache reference on the receiving side. 
+ */ +export const CACHE_REF_KIND = "task-graph/CacheRef" as const; + +/** + * A reference to bytes that live in the configured cache backing rather than + * inline in a task `Output`. Emitted by `TaskRunner` for binary output ports + * whose committed size meets the `IRunConfig.referenceThresholdBytes` and + * whose cache backing implements `saveOutputStream`. + * + * `$ref` is opaque to consumers: only the cache backing knows how to translate + * it back into bytes. `size` and `mime` are best-effort hints populated when + * known at finish time; absent values do not imply unknown failure. + * + * The `kind` brand discriminates a cache ref from other `{$ref: string}` + * shapes (e.g. JSON-Schema references) so the resolver never walks an + * untrusted `$ref` string into the cache. The brand is a literal so it survives + * JSON round-trip across queue boundaries. + * + * Resolution is best-effort: the cache backing's TTL is the lifetime contract, + * and `resolveOutputRef` returns `undefined` when the underlying entry has + * been evicted. + */ +export interface ICacheRef { + readonly kind: typeof CACHE_REF_KIND; + readonly $ref: string; + readonly size?: number; + readonly mime?: string; +} + +export type CacheRef = ICacheRef; + +/** + * Narrow an unknown value to {@link CacheRef}. Discriminates on the literal + * {@link CACHE_REF_KIND} brand AND a string `$ref`; shape-only `{$ref: string}` + * objects (JSON-Schema refs, user metadata) do NOT match. + */ +export function isCacheRef(value: unknown): value is CacheRef { + if (typeof value !== "object" || value === null) return false; + const candidate = value as { readonly kind?: unknown; readonly $ref?: unknown }; + return candidate.kind === CACHE_REF_KIND && typeof candidate.$ref === "string"; +} + +/** + * Construct a branded {@link CacheRef}. Cache backings MUST use this helper (or + * spread `{kind: CACHE_REF_KIND, ...}` themselves) so the resulting ref carries + * the brand. 
Helpers in {@link CacheCoordinator} / {@link RunPrivateCacheRepo} + * defensively re-wrap legacy backings whose `saveOutputStream` predates the + * brand and returns an unbranded `{$ref}` shape. + */ +export function makeCacheRef(raw: { + readonly $ref: string; + readonly size?: number; + readonly mime?: string; +}): CacheRef { + return { + kind: CACHE_REF_KIND, + $ref: raw.$ref, + ...(raw.size !== undefined && { size: raw.size }), + ...(raw.mime !== undefined && { mime: raw.mime }), + }; +} + +/** + * Default threshold (in bytes) at which a binary output port becomes a + * {@link CacheRef} instead of being inlined in `Output`. Below this size, the + * runner inlines the bytes; at or above, it emits a reference. + * + * `0` is a sentinel meaning "always emit a reference" and is honored by the + * runtime path (a callsite that wants to force refs sets `0` explicitly via + * `IRunConfig.referenceThresholdBytes`). + */ +export const REFERENCE_THRESHOLD_BYTES_DEFAULT = 65_536; + +/** + * Resolve the effective reference threshold for a run, falling back to + * {@link REFERENCE_THRESHOLD_BYTES_DEFAULT} when unset. A negative value is + * treated as the default (negative thresholds are nonsensical). 
+ */ +export function resolveReferenceThreshold(threshold: number | undefined): number { + if (threshold === undefined) return REFERENCE_THRESHOLD_BYTES_DEFAULT; + if (threshold < 0) return REFERENCE_THRESHOLD_BYTES_DEFAULT; + return threshold; +} diff --git a/packages/task-graph/src/cache/RunPrivateCacheRepo.ts b/packages/task-graph/src/cache/RunPrivateCacheRepo.ts index 9b0e137b9..e11d387ab 100644 --- a/packages/task-graph/src/cache/RunPrivateCacheRepo.ts +++ b/packages/task-graph/src/cache/RunPrivateCacheRepo.ts @@ -6,6 +6,8 @@ import { TaskOutputRepository } from "../storage/TaskOutputRepository"; import type { TaskInput, TaskOutput } from "../task/TaskTypes"; +import type { CacheRef } from "./CacheRef"; +import { isCacheRef, makeCacheRef } from "./CacheRef"; export interface RunPrivateCacheRepoOptions { backing: TaskOutputRepository; @@ -31,6 +33,11 @@ export interface RunPrivateCacheRepoOptions { * and signals the fallback via {@link noteFallbackKey}; this wrapper warns once * per process to tell the operator that the cache is best-effort intra-process * only until task ids are pinned. + * + * Capability probing on this wrapper MUST use `typeof repo.method === + * "function"` or optional-call (`repo.method?.()`): optional methods the + * backing lacks are shadowed to `undefined` on the instance, so an unguarded + * call is a `TypeError` even though the prototype declares the method. */ export class RunPrivateCacheRepo extends TaskOutputRepository { private static fallbackWarned = false; @@ -43,6 +50,20 @@ export class RunPrivateCacheRepo extends TaskOutputRepository { super({ outputCompression: backing.outputCompression }); this.backing = backing; this.runId = runId; + // Mirror the backing's optional-method shape on this instance so callers + // probing `typeof repo.saveOutputStream === "function"` (or the + // getOutputByRef/getOutputStreamByRef siblings) see the true capability + // instead of the always-present wrapper override. 
Class methods live on + // the prototype; assigning `undefined` on the instance shadows them. + if (typeof backing.saveOutputStream !== "function") { + (this as { saveOutputStream?: unknown }).saveOutputStream = undefined; + } + if (typeof backing.getOutputByRef !== "function") { + (this as { getOutputByRef?: unknown }).getOutputByRef = undefined; + } + if (typeof backing.getOutputStreamByRef !== "function") { + (this as { getOutputStreamByRef?: unknown }).getOutputStreamByRef = undefined; + } } /** @@ -86,6 +107,62 @@ export class RunPrivateCacheRepo extends TaskOutputRepository { return this.backing.getOutput(this.ns(cacheIdentity), inputs); } + /** + * Forwards the streaming sink to the backing repository, applying the same + * `runId` namespacing as `saveOutput`. Only present in effect when the + * backing repo supports streaming; `supportsStreaming()` (below) reflects the + * backing repo so callers branch correctly before calling this. + * + * Returns whatever {@link CacheRef} the backing produced (already namespaced + * via the wrapped `taskType`). Resolvers calling `getOutputByRef` on this + * wrapper forward to the backing, which decodes its own `$ref`. + */ + public override async saveOutputStream( + taskType: string, + inputs: TaskInput, + chunks: AsyncIterable, + metadata: Record + ): Promise { + const fn = this.backing.saveOutputStream; + if (typeof fn !== "function") { + throw new Error( + `RunPrivateCacheRepo: backing repository does not implement saveOutputStream. ` + + `Call supportsStreaming() before saveOutputStream.` + ); + } + // Re-wrap the backing's CacheRef so legacy backings that pre-date the + // `kind` brand still produce a discriminator-bearing ref through this + // wrapper. Branded refs pass through unchanged. + const raw = await fn.call(this.backing, this.ns(taskType), inputs, chunks, metadata); + return isCacheRef(raw) ? raw : makeCacheRef(raw); + } + + /** + * Forwards by-ref retrieval to the backing repository. 
The `$ref` already + * encodes whatever the backing needs to locate the entry; no namespacing is + * re-applied here. + */ + public override getOutputByRef(ref: CacheRef): Promise { + if (typeof this.backing.getOutputByRef !== "function") return Promise.resolve(undefined); + return this.backing.getOutputByRef(ref); + } + + /** Forwards streaming by-ref retrieval to the backing repository. */ + public override getOutputStreamByRef(ref: CacheRef): AsyncIterable | undefined { + if (typeof this.backing.getOutputStreamByRef !== "function") return undefined; + return this.backing.getOutputStreamByRef(ref); + } + + /** Mirrors the backing repository's streaming capability. */ + public override supportsStreaming(): boolean { + return this.backing.supportsStreaming(); + } + + /** Mirrors the backing repository's streaming-read capability. */ + public override supportsStreamingReads(): boolean { + return this.backing.supportsStreamingReads(); + } + /** * Override of `TaskOutputRepository.clear()` that only deletes entries * namespaced under THIS wrapper's `runId`. 
Entries from other runs are not diff --git a/packages/task-graph/src/cache/index.ts b/packages/task-graph/src/cache/index.ts index ac1518d67..b712deb38 100644 --- a/packages/task-graph/src/cache/index.ts +++ b/packages/task-graph/src/cache/index.ts @@ -4,7 +4,11 @@ * SPDX-License-Identifier: Apache-2.0 */ +export * from "./BinaryPortCodec"; export * from "./CacheJanitor"; export * from "./CachePolicy"; +export * from "./CacheRef"; export * from "./CacheRegistry"; +export * from "./resolveJobOutput"; +export * from "./resolveRef"; export * from "./RunPrivateCacheRepo"; diff --git a/packages/task-graph/src/cache/resolveJobOutput.ts b/packages/task-graph/src/cache/resolveJobOutput.ts new file mode 100644 index 000000000..23880e979 --- /dev/null +++ b/packages/task-graph/src/cache/resolveJobOutput.ts @@ -0,0 +1,161 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { CacheRef } from "./CacheRef"; +import { isCacheRef } from "./CacheRef"; +import type { CacheRefResolver, RefStreamBacking, ResolveOutputOptions } from "./resolveRef"; +import { byteIterableFromBlob, resolveOutput, streamRefViaBacking } from "./resolveRef"; + +/** + * Structural type matching `@workglow/job-queue`'s `JobHandle`. Declared + * locally so this module doesn't have to import from job-queue (avoiding a + * runtime dependency edge for a structural shape). + */ +export interface JobHandleLike { + waitFor(): Promise; +} + +/** + * Carrier of the resolver. Two-shape input: either a {@link CacheRefResolver} + * function directly, or anything with a `getOutputByRef` method (the shape + * `TaskOutputRepository` exposes). + */ +export type RefBacking = + | CacheRefResolver + | { readonly getOutputByRef?: (ref: CacheRef) => Promise }; + +/** + * Await a job's completion and hydrate every {@link CacheRef} inside its + * `Output` to inline bytes via the supplied backing. 
/**
 * Await a job's completion and hydrate every {@link CacheRef} inside its
 * output to inline bytes via the supplied backing. The backing can be a raw
 * resolver function or any object exposing `getOutputByRef` (e.g. a
 * `TaskOutputRepository`).
 *
 * On cache miss the placeholder is replaced by `undefined` (best-effort
 * resolution). Backings that don't implement `getOutputByRef` leave every ref
 * in place.
 *
 * @param handle - Job handle; only `waitFor()` is used.
 * @param backing - Resolver function, or an object carrying `getOutputByRef`.
 * @param options - Forwarded unchanged to `resolveOutput`.
 * @returns The job output, with refs resolved when a resolver is available.
 */
export async function resolveJobOutput<Output>(
  handle: JobHandleLike<Output>,
  backing: RefBacking,
  options?: ResolveOutputOptions
): Promise<Output> {
  const output = await handle.waitFor();
  const resolver = asResolver(backing);
  if (resolver === undefined) return output;
  return resolveOutput(output, resolver, options);
}

/**
 * Normalize a {@link RefBacking} to a resolver function. Returns `undefined`
 * when the backing is an object without a callable `getOutputByRef`.
 */
function asResolver(backing: RefBacking): CacheRefResolver | undefined {
  if (typeof backing === "function") return backing;
  const get = backing.getOutputByRef;
  if (typeof get !== "function") return undefined;
  // Invoke as a method so repository implementations keep their `this`.
  return (ref) => get.call(backing, ref);
}

/**
 * Collect every {@link CacheRef} reachable inside `value` into `out`.
 *
 * Only plain objects (prototype `Object.prototype` or `null`), arrays, `Map`
 * values and `Set` members are walked; every other class instance is opaque,
 * matching the boundary used by the `resolveRef.ts` walker.
 *
 * Each distinct object is processed once. This also applies to the refs
 * themselves: a single ref object reachable through several paths is recorded
 * once, so it does not make the output look ambiguous to portless streaming.
 */
function collectCacheRefs(
  value: unknown,
  out: CacheRef[],
  visited: WeakSet<object> = new WeakSet()
): void {
  const isObject = value !== null && typeof value === "object";
  if (isObject) {
    // Cycle / shared-reference guard, applied before the ref check so a
    // shared ref object is not double-counted.
    if (visited.has(value as object)) return;
    visited.add(value as object);
  }
  if (isCacheRef(value)) {
    out.push(value);
    return;
  }
  if (!isObject) return;
  if (Array.isArray(value)) {
    for (const item of value) collectCacheRefs(item, out, visited);
    return;
  }
  if (value instanceof Map) {
    for (const item of value.values()) collectCacheRefs(item, out, visited);
    return;
  }
  if (value instanceof Set) {
    for (const item of value) collectCacheRefs(item, out, visited);
    return;
  }
  // Opaque-by-default: anything that is not a plain object stops the walk.
  const proto = Object.getPrototypeOf(value);
  if (proto !== null && proto !== Object.prototype) return;
  const source = value as Record<string, unknown>;
  for (const key of Object.keys(source)) collectCacheRefs(source[key], out, visited);
}

/** Wrap an already-materialized byte array as a one-chunk async iterable. */
async function* singleChunk(bytes: Uint8Array): AsyncIterable<Uint8Array> {
  yield bytes;
}

/**
 * Turn a completed job output into a byte stream.
 *
 * With `port`, the value at `output[port]` is used. Without it, the output is
 * scanned for cache refs: zero refs resolves `undefined`, exactly one is
 * streamed, and two or more distinct refs throw because the choice would be
 * ambiguous.
 *
 * @returns A byte stream for a ref (via the backing), `Blob`, `ArrayBuffer`
 *   or `Uint8Array`; `undefined` for anything else or when the ref cannot be
 *   resolved by the backing.
 * @throws Error when no port is given and more than one ref is reachable.
 */
async function outputValueToStream(
  output: unknown,
  backing: RefStreamBacking,
  port?: string
): Promise<AsyncIterable<Uint8Array> | undefined> {
  let candidate: unknown;
  if (port !== undefined) {
    candidate = (output as Record<string, unknown> | undefined)?.[port];
  } else {
    const refs: CacheRef[] = [];
    collectCacheRefs(output, refs);
    if (refs.length > 1) {
      throw new Error(
        `resolveJobOutputStream: output contains ${refs.length} cache refs; pass an explicit port.`
      );
    }
    candidate = refs[0];
  }
  if (candidate === undefined) return undefined;
  if (isCacheRef(candidate)) return streamRefViaBacking(candidate, backing);
  if (candidate instanceof Blob) return byteIterableFromBlob(candidate);
  if (candidate instanceof ArrayBuffer) return singleChunk(new Uint8Array(candidate));
  if (candidate instanceof Uint8Array) return singleChunk(candidate);
  return undefined;
}
/**
 * Await a job's completion and stream its binary result back out of the
 * output cache without materializing it. `port` selects the output port;
 * when omitted, the single branded {@link CacheRef} reachable in the output
 * is used (two or more refs without a port is an error; zero resolves
 * `undefined`). Inline `Blob` / `ArrayBuffer` / `Uint8Array` values at a
 * named port are adapted to a stream so callers don't branch on whether the
 * reference threshold kept the value inline.
 *
 * Portless discovery walks the ENTIRE output, including fields whose content
 * the job may have copied from untrusted input — a crafted branded ref shape
 * embedded there would be resolved against the backing. Pass an explicit
 * `port` whenever the producer of the output is not fully trusted.
 */
export async function resolveJobOutputStream<Output>(
  handle: JobHandleLike<Output>,
  backing: RefStreamBacking,
  port?: string
): Promise<AsyncIterable<Uint8Array> | undefined> {
  const completedOutput = await handle.waitFor();
  return outputValueToStream(completedOutput, backing, port);
}

/**
 * Factory closing over a cache backing, producing the resolver shape
 * `@workglow/job-queue` accepts as `JobQueueClientOptions.outputStreamResolver`
 * (job-queue cannot import this package — the dependency edge points the
 * other way — so the resolver is injected as a structural function).
 */
export function makeJobOutputStreamResolver(
  backing: RefStreamBacking
): (output: unknown, port?: string) => Promise<AsyncIterable<Uint8Array> | undefined> {
  return function resolveStream(output, port) {
    return outputValueToStream(output, backing, port);
  };
}

// ---- packages/task-graph/src/cache/resolveRef.ts ----

/**
 * Resolves a single {@link CacheRef} to bytes (or `undefined` on cache miss).
 * Wired up by callers against their configured cache backing; this module is
 * unaware of any specific repository implementation.
 */
export type CacheRefResolver = (ref: CacheRef) => Promise<Blob | undefined>;
/**
 * Streaming counterpart of {@link CacheRefResolver}. Returns an async iterable
 * of chunks for consumers that want to pipe the bytes further (e.g. into an
 * HTTP response) without materializing the full payload. Returns `undefined`
 * if the backing has no streaming retrieval for this ref or the entry is
 * absent.
 */
export type CacheRefStreamResolver = (ref: CacheRef) => AsyncIterable<Uint8Array> | undefined;

/**
 * Object shape carrying the optional by-ref readers a cache backing exposes
 * (the read surface of `TaskOutputRepository`). Both members are optional so
 * any repository — streaming or not — satisfies the shape; helpers degrade
 * per capability.
 */
export interface RefStreamBacking {
  readonly getOutputByRef?: (ref: CacheRef) => Promise<Blob | undefined>;
  readonly getOutputStreamByRef?: (ref: CacheRef) => AsyncIterable<Uint8Array> | undefined;
}

/**
 * Adapt a `Blob` to the `AsyncIterable` chunk shape.
 *
 * When the consumer stops early (`break` / `return`) or a read fails, the
 * underlying stream is cancelled before the lock is released, so the
 * unread remainder is not left pending behind an unlocked stream.
 */
export async function* byteIterableFromBlob(blob: Blob): AsyncIterable<Uint8Array> {
  const reader = blob.stream().getReader();
  let drained = false;
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        drained = true;
        return;
      }
      yield value;
    }
  } finally {
    if (!drained) {
      // Best-effort: cancelling an already-errored stream rejects with the
      // stored error, which must not replace the error being propagated.
      await reader.cancel().catch(() => undefined);
    }
    reader.releaseLock();
  }
}

/**
 * Stream a {@link CacheRef}'s bytes out of a backing. Prefers the backing's
 * streaming reader; falls back to materializing via `getOutputByRef` and
 * re-chunking through `blob.stream()`. Resolves `undefined` when the entry is
 * absent (dangling ref) or the backing exposes no readers.
 */
export async function streamRefViaBacking(
  ref: CacheRef,
  backing: RefStreamBacking
): Promise<AsyncIterable<Uint8Array> | undefined> {
  if (typeof backing.getOutputStreamByRef === "function") {
    const stream = backing.getOutputStreamByRef(ref);
    if (stream !== undefined) return stream;
  }
  if (typeof backing.getOutputByRef === "function") {
    const blob = await backing.getOutputByRef(ref);
    if (blob !== undefined) return byteIterableFromBlob(blob);
  }
  return undefined;
}

/** Options accepted by {@link resolveOutput}. */
export type ResolveOutputOptions = {
  /**
   * Maximum number of concurrent resolver calls. Defaults to unbounded
   * (`Infinity`), suitable for backings that handle their own pacing.
   * Set a finite value when the backing is rate-limited.
   */
  readonly concurrency?: number;
  /**
   * Predicate deciding which refs are resolved. Refs that fail the filter are
   * left in place (the slot keeps the original {@link CacheRef}). When omitted,
   * every ref is resolved.
   */
  readonly filter?: (ref: CacheRef) => boolean;
};
/**
 * Recursively visit a task output and replace every {@link CacheRef}
 * encountered with the value produced by the resolver. Non-ref values are
 * returned as-is.
 *
 * Identity is preserved when the input contains no refs (or none that match
 * the optional filter): the same object reference comes back.
 *
 * Only plain objects, `Array`, `Map` and `Set` are walked structurally; every
 * other class instance is an opaque leaf returned by reference (see `isLeaf`).
 *
 * On cache miss the resolver returns `undefined` and the corresponding slot in
 * the returned output is `undefined` (best-effort behavior).
 */
export async function resolveOutput<T>(
  output: T,
  resolver: CacheRefResolver,
  options?: ResolveOutputOptions
): Promise<T> {
  const filter = options?.filter;
  // Cheap synchronous pre-scan: nothing to resolve means nothing is copied.
  if (!hasMatchingRef(output, filter, new WeakSet())) return output;
  const limit = createLimiter(options?.concurrency);
  const resolved = await walk(output, resolver, limit, filter, new WeakSet());
  return resolved as T;
}

/**
 * Lazily enumerate the values held by a walkable container: array items,
 * `Map` values, `Set` members, or a plain object's own enumerable values.
 */
function* childValues(container: object): Iterable<unknown> {
  if (Array.isArray(container)) {
    yield* container;
    return;
  }
  if (container instanceof Map) {
    yield* container.values();
    return;
  }
  if (container instanceof Set) {
    yield* container;
    return;
  }
  const source = container as Record<string, unknown>;
  for (const key of Object.keys(source)) yield source[key];
}

/**
 * Containment check: `true` if any {@link CacheRef} (matching the optional
 * filter) is reachable inside `value`.
 *
 * `visited` stops cyclic and shared structures from recursing forever; a
 * revisit answers `false`, which is safe because the first visit of that
 * object already covers everything reachable through it.
 */
function hasMatchingRef(
  value: unknown,
  filter: ((ref: CacheRef) => boolean) | undefined,
  visited: WeakSet<object>
): boolean {
  if (isCacheRef(value)) return filter ? filter(value) : true;
  // isLeaf covers null, undefined, primitives and opaque class instances.
  if (isLeaf(value)) return false;
  const node = value as object;
  if (visited.has(node)) return false;
  visited.add(node);
  for (const child of childValues(node)) {
    if (hasMatchingRef(child, filter, visited)) return true;
  }
  return false;
}

/**
 * Produce a copy of `value` in which matching refs are replaced by the
 * resolver's result. Subtrees without matching refs, leaves, and revisited
 * objects are returned by reference.
 */
async function walk(
  value: unknown,
  resolver: CacheRefResolver,
  limit: Limiter,
  filter: ((ref: CacheRef) => boolean) | undefined,
  visited: WeakSet<object>
): Promise<unknown> {
  if (isCacheRef(value)) {
    const wanted = filter ? filter(value) : true;
    if (!wanted) return value;
    return limit.run(() => resolver(value));
  }
  if (isLeaf(value)) return value;

  const node = value as object;
  // Cycle / shared-subtree guard: a revisited object is returned unchanged,
  // including any unresolved refs it contains.
  if (visited.has(node)) return node;
  // Per-subtree scan keeps ref-free subtrees identical by reference.
  if (!hasMatchingRef(node, filter, new WeakSet())) return node;
  visited.add(node);

  const descend = (child: unknown): Promise<unknown> =>
    walk(child, resolver, limit, filter, visited);

  if (Array.isArray(node)) {
    return Promise.all(node.map((item) => descend(item)));
  }
  if (node instanceof Map) {
    const entries = [...node.entries()];
    const pairs = await Promise.all(
      entries.map(async ([key, inner]) => [key, await descend(inner)] as const)
    );
    const rebuilt = new Map<unknown, unknown>();
    for (const [key, inner] of pairs) rebuilt.set(key, inner);
    return rebuilt;
  }
  if (node instanceof Set) {
    const members = await Promise.all([...node].map((member) => descend(member)));
    const rebuilt = new Set<unknown>();
    for (const member of members) rebuilt.add(member);
    return rebuilt;
  }

  // Only plain objects reach this point (class instances are leaves).
  const source = node as Record<string, unknown>;
  const keys = Object.keys(source);
  const values = await Promise.all(keys.map((key) => descend(source[key])));
  // Assign in source key order so enumeration order matches the input even
  // though the resolutions themselves race.
  const copy: Record<string, unknown> = {};
  keys.forEach((key, index) => {
    copy[key] = values[index];
  });
  return copy;
}
/**
 * Opaque-by-default policy. Only plain objects (prototype `Object.prototype`
 * or `null`), `Array`, `Map`, and `Set` are structurally walked. Every other
 * value — primitives, `null`, and any class instance such as `Blob`,
 * `ArrayBuffer`, typed arrays, `Date` or user-defined classes — is a leaf:
 * generic `Object.keys()` cloning would drop prototype-resident data
 * (accessors) and private slots.
 *
 * @returns `true` when `value` must be returned by reference, not walked.
 */
function isLeaf(value: unknown): boolean {
  if (typeof value !== "object" || value === null) return true;
  if (Array.isArray(value)) return false;
  if (value instanceof Map || value instanceof Set) return false;
  const proto = Object.getPrototypeOf(value);
  return proto !== null && proto !== Object.prototype;
}

/** Runs async work while capping how many calls are in flight at once. */
type Limiter = { run<T>(fn: () => Promise<T>): Promise<T> };

/**
 * Build a {@link Limiter}.
 *
 * - `undefined`, `Infinity` and `NaN` mean "no limit": `run` calls `fn`
 *   directly. (`NaN` is listed explicitly; it never compared as "full" and so
 *   already behaved as unbounded.)
 * - Any other number is floored and clamped to at least 1.
 *
 * Slots are handed over in FIFO order: when a call finishes and someone is
 * waiting, the slot passes directly to the oldest waiter instead of being
 * returned to the pool, so a caller arriving in between cannot overtake it.
 */
function createLimiter(concurrency: number | undefined): Limiter {
  if (concurrency === undefined || concurrency === Infinity || Number.isNaN(concurrency)) {
    return { run: (fn) => fn() };
  }
  let free = Math.max(1, Math.floor(concurrency));
  const waiters: Array<() => void> = [];

  const release = (): void => {
    const next = waiters.shift();
    if (next) {
      // Direct hand-off: the slot stays taken and now belongs to `next`.
      next();
    } else {
      free++;
    }
  };

  return {
    async run<T>(fn: () => Promise<T>): Promise<T> {
      if (free > 0) {
        free--;
      } else {
        await new Promise<void>((resolve) => waiters.push(resolve));
      }
      try {
        return await fn();
      } finally {
        // Runs on success and on rejection, so a failing task frees its slot.
        release();
      }
    },
  };
}
+ +export * from "./storage/FsFolderTaskOutputRepository"; diff --git a/packages/task-graph/src/common.ts b/packages/task-graph/src/common.ts index b995546c3..fc5441e83 100644 --- a/packages/task-graph/src/common.ts +++ b/packages/task-graph/src/common.ts @@ -6,6 +6,10 @@ // organize-imports-ignore +// Side-effect: registers the default "blob"/"binary" port codecs so JSON-row +// cache backings round-trip inline binary values instead of storing "{}". +import "./cache/BinaryPortCodec"; + export * from "./task-graph/Dataflow"; export * from "./task-graph/DataflowEvents"; diff --git a/packages/task-graph/src/node.ts b/packages/task-graph/src/node.ts index 4622a4480..075a6411f 100644 --- a/packages/task-graph/src/node.ts +++ b/packages/task-graph/src/node.ts @@ -7,3 +7,4 @@ // organize-imports-ignore export * from "./common"; +export * from "./common-server"; diff --git a/packages/task-graph/src/storage/FsFolderTaskOutputRepository.ts b/packages/task-graph/src/storage/FsFolderTaskOutputRepository.ts new file mode 100644 index 000000000..b27c7e3c5 --- /dev/null +++ b/packages/task-graph/src/storage/FsFolderTaskOutputRepository.ts @@ -0,0 +1,258 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { FsFolderTabularStorage } from "@workglow/storage"; +import { makeFingerprint } from "@workglow/util"; +import { randomUUID } from "node:crypto"; +import { createReadStream, existsSync } from "node:fs"; +import { mkdir, open, readdir, readFile, rename, rm, stat } from "node:fs/promises"; +import { join } from "node:path"; +import type { CacheRef } from "../cache/CacheRef"; +import { makeCacheRef } from "../cache/CacheRef"; +import type { TaskInput } from "../task/TaskTypes"; +import { tabularTaskOutputStorage } from "./TabularTaskOutputStorage"; +import { + TaskOutputPrimaryKeyNames, + TaskOutputSchema, + TaskOutputTabularRepository, +} from "./TaskOutputTabularRepository"; + +/** + * Blob names are `_.bin`; anything 
/**
 * Accepted `$ref` shape: `fsfolder://blobs/<name>.bin`, where `<name>` is a
 * single path segment of `[A-Za-z0-9._-]`. Anything else (including in-flight
 * `.tmp` files and foreign `$ref` schemes) never resolves. The single-segment
 * match also rules out path traversal through a crafted ref.
 */
const REF_PATTERN = /^fsfolder:\/\/blobs\/([A-Za-z0-9._-]+\.bin)$/;

/** Replace every character outside `[A-Za-z0-9._-]` with `-` (lossy). */
function sanitize(s: string): string {
  return s.replace(/[^A-Za-z0-9._-]/g, "-");
}

/**
 * Filesystem-backed task output repository with binary streaming support.
 *
 * JSON output rows are stored through {@link FsFolderTabularStorage}. Binary
 * payloads written via `saveOutputStream` live as sidecar files under
 * `<folder>/blobs/`. They are written incrementally to a `.tmp` file, fsynced,
 * and renamed into place, so a crash mid-write never publishes a readable
 * partial blob; the containing directory is then fsynced (best-effort) so the
 * rename itself is durable.
 *
 * Each `saveOutputStream` call mints a unique blob name of the form
 * `<sanitized-taskType>_<fingerprint>_<uuid>.bin`, so concurrent writers with
 * the same `(taskType, inputs)` land at distinct paths and one writer's
 * cleanup cannot remove the blob another writer's row points at. The
 * sanitized taskType prefix is what prefix-scoped pruning
 * (`deleteByTaskTypePrefix` / `clearOlderThanWithTaskTypePrefix`) matches on;
 * blobs stranded by a crash between rename and row commit are reclaimed by
 * `clearOlderThan` / the {@link CacheJanitor}.
 *
 * Two instances pointed at the same folder interoperate: a `CacheRef` written
 * by one resolves through the other. Node/Bun only.
 *
 * Multi-tenant warning: neither the row key nor the blob name prefix carries
 * a tenant axis, so tenants running the same task with identical inputs share
 * cache state. This repository assumes a SINGLE-TENANT deployment; for
 * multi-tenant use, wrap it with a per-tenant folder/prefix or scope writes
 * through {@link RunPrivateCacheRepo}.
 */
export class FsFolderTaskOutputRepository extends TaskOutputTabularRepository {
  private readonly blobsDir: string;

  /** @param folderPath - Root folder; blobs go to its `blobs` subfolder. */
  constructor(folderPath: string) {
    super({
      storage: tabularTaskOutputStorage(
        new FsFolderTabularStorage(folderPath, TaskOutputSchema, TaskOutputPrimaryKeyNames)
      ),
    });
    this.blobsDir = join(folderPath, "blobs");
  }

  /**
   * Stream `chunks` into a new blob file and return a ref to it.
   *
   * @param taskType - Raw task type; sanitized for the file-name prefix.
   * @param inputs - Task inputs; fingerprinted together with `taskType`.
   * @param chunks - Byte chunks, written in arrival order.
   * @param metadata - Side-band data; a string `mime` is copied onto the ref.
   * @returns A ref whose `size` is the total number of bytes written.
   * @throws Any write/rename error. Neither the `.tmp` file nor a blob that
   *   was already renamed into place is left behind on failure.
   */
  override async saveOutputStream(
    taskType: string,
    inputs: TaskInput,
    chunks: AsyncIterable<Uint8Array>,
    metadata: Record<string, unknown>
  ): Promise<CacheRef> {
    await mkdir(this.blobsDir, { recursive: true });
    // The fingerprint covers the raw taskType too: `sanitize` is lossy, so two
    // distinct task types can share a sanitized prefix — the hash keeps their
    // blobs distinct while the prefix keeps names greppable and
    // prefix-deletable. The UUID makes every write land on its own path.
    const fingerprint = await makeFingerprint({ __taskType: taskType, inputs });
    const name = `${sanitize(taskType)}_${fingerprint}_${randomUUID()}.bin`;
    const finalPath = join(this.blobsDir, name);
    const tmpPath = join(this.blobsDir, `${name}.tmp`);
    const handle = await open(tmpPath, "w");
    let size = 0;
    let published = false;
    try {
      try {
        for await (const chunk of chunks) {
          // A single write may be short; loop until the chunk is on disk so
          // `size` never overstates what the blob actually contains.
          let offset = 0;
          while (offset < chunk.byteLength) {
            const { bytesWritten } = await handle.write(
              chunk,
              offset,
              chunk.byteLength - offset
            );
            if (bytesWritten <= 0) {
              throw new Error(`FsFolderTaskOutputRepository: write to ${tmpPath} made no progress`);
            }
            offset += bytesWritten;
          }
          size += chunk.byteLength;
        }
        // Flush file data before publishing it under the final name, so the
        // published name cannot point at unflushed (possibly empty) content.
        await handle.sync();
      } finally {
        await handle.close();
      }
      await rename(tmpPath, finalPath);
      published = true;
      await this.syncBlobsDir();
    } catch (err) {
      // Best-effort cleanup that must not mask the original error. After a
      // successful rename the blob is removed too: no ref is returned for it,
      // so nothing could ever reference it.
      await rm(tmpPath, { force: true }).catch(() => undefined);
      if (published) await rm(finalPath, { force: true }).catch(() => undefined);
      throw err;
    }
    this.emit("output_saved", taskType);
    const mime = typeof metadata.mime === "string" ? metadata.mime : undefined;
    return makeCacheRef({ $ref: `fsfolder://blobs/${name}`, size, mime });
  }

  /**
   * Fsync the blobs directory so a preceding rename is durable. Platforms
   * that reject directory fsync (`EPERM` / `EINVAL` / `ENOTSUP` / `EISDIR`)
   * are tolerated silently; any other error propagates.
   */
  private async syncBlobsDir(): Promise<void> {
    try {
      const dir = await open(this.blobsDir, "r");
      try {
        await dir.sync();
      } finally {
        await dir.close();
      }
    } catch (err) {
      const code = (err as NodeJS.ErrnoException).code;
      if (code !== "EPERM" && code !== "EINVAL" && code !== "ENOTSUP" && code !== "EISDIR") {
        throw err;
      }
    }
  }

  /** Map a ref to its blob path, or `undefined` when the ref is not ours. */
  private blobPath(ref: CacheRef): string | undefined {
    const name = REF_PATTERN.exec(ref.$ref)?.[1];
    return name === undefined ? undefined : join(this.blobsDir, name);
  }

  /** Read the whole blob; `undefined` for a foreign ref or a missing file. */
  override async getOutputByRef(ref: CacheRef): Promise<Blob | undefined> {
    const path = this.blobPath(ref);
    if (path === undefined) return undefined;
    try {
      return new Blob([await readFile(path)]);
    } catch (err) {
      if ((err as NodeJS.ErrnoException).code === "ENOENT") return undefined;
      throw err;
    }
  }

  /**
   * Stream the blob in read-stream chunks; `undefined` for a foreign ref or
   * when the file does not exist at call time.
   *
   * NOTE(review): the existence check and the read are separate steps, so a
   * blob deleted in between surfaces as a read error during iteration rather
   * than as `undefined`.
   */
  override getOutputStreamByRef(ref: CacheRef): AsyncIterable<Uint8Array> | undefined {
    const path = this.blobPath(ref);
    if (path === undefined || !existsSync(path)) return undefined;
    return (async function* () {
      for await (const chunk of createReadStream(path)) {
        yield chunk as Uint8Array;
      }
    })();
  }

  /** Remove the blob a ref points at; no-op for foreign or missing entries. */
  override async deleteOutputByRef(ref: CacheRef): Promise<void> {
    const path = this.blobPath(ref);
    if (path === undefined) return;
    await rm(path, { force: true });
  }

  /** Clear all rows, then remove the whole blobs directory. */
  override async clear(): Promise<void> {
    await super.clear();
    await rm(this.blobsDir, { recursive: true, force: true });
  }

  /**
   * Prune rows and blobs older than `olderThanInMs`. Operators SHOULD schedule
   * this (e.g. via {@link CacheJanitor}) on a recurring cadence: it is also
   * the sweep that reclaims orphan blobs left by process crashes between a
   * successful stream-write and the row commit. Without it, stranded blobs
   * accumulate without bound.
   */
  override async clearOlderThan(olderThanInMs: number): Promise<void> {
    const cutoff = Date.now() - olderThanInMs;
    // FsFolderTabularStorage does not implement deleteSearch (the base
    // implementation's pruning path), so prune rows by scanning.
    for await (const row of this.storage.records()) {
      const ts = typeof row.createdAt === "string" ? new Date(row.createdAt).getTime() : NaN;
      if (!isNaN(ts) && ts < cutoff) {
        await this.storage.delete({ key: row.key, taskType: row.taskType });
      }
    }
    this.emit("output_pruned");
    // Empty prefix: every file in the blobs dir older than the cutoff.
    await this.deleteBlobsByPrefix("", cutoff);
  }

  /**
   * Row deletions scoped by `taskType` prefix must cascade to the blob
   * sidecar files, or streamed payloads leak on disk after their rows are
   * pruned. Blob names start with the sanitized taskType and `sanitize` maps
   * character-by-character, so a sanitized-prefix match selects the rows'
   * blobs. Two raw prefixes with the same sanitized form would collide.
   */
  override async deleteByTaskTypePrefix(prefix: string): Promise<void> {
    await super.deleteByTaskTypePrefix(prefix);
    await this.deleteBlobsByPrefix(sanitize(prefix));
  }

  /** Prefix-scoped variant of {@link clearOlderThan}, cascading to blobs. */
  override async clearOlderThanWithTaskTypePrefix(
    prefix: string,
    olderThanInMs: number
  ): Promise<void> {
    await super.clearOlderThanWithTaskTypePrefix(prefix, olderThanInMs);
    await this.deleteBlobsByPrefix(sanitize(prefix), Date.now() - olderThanInMs);
  }

  /**
   * Delete files in the blobs dir whose name starts with `namePrefix`. With
   * `olderThanMtimeMs`, only files modified strictly before it are removed.
   * A missing directory and per-file races are ignored.
   */
  private async deleteBlobsByPrefix(namePrefix: string, olderThanMtimeMs?: number): Promise<void> {
    let names: string[];
    try {
      names = await readdir(this.blobsDir);
    } catch {
      return;
    }
    for (const name of names) {
      if (!name.startsWith(namePrefix)) continue;
      const path = join(this.blobsDir, name);
      try {
        if (olderThanMtimeMs !== undefined && (await stat(path)).mtimeMs >= olderThanMtimeMs) {
          continue;
        }
        await rm(path, { force: true });
      } catch {
        // Raced with a concurrent write or delete; the next sweep catches it.
      }
    }
  }
}
/**
 * OPTIONAL streaming sink. Implementations that can ingest a byte stream
 * without materializing the full payload (e.g. a file-backed cache) declare
 * this method; the runner pipes `binary-delta` chunks straight to it. The
 * default base class does NOT implement it — call `supportsStreaming()` to
 * branch. `metadata` carries side-band data (e.g. HTTP response headers).
 *
 * Returns a {@link CacheRef} that the runner places into `Output` at the
 * binary port slot when the reference threshold is met. The `$ref` string is
 * opaque; only this repository (and any wrapping namespacer like
 * {@link RunPrivateCacheRepo}) needs to know how to decode it via
 * {@link getOutputByRef} / {@link getOutputStreamByRef}.
 *
 * Implementations that provide `saveOutputStream` MUST also provide
 * `getOutputByRef` (and ideally `getOutputStreamByRef`); a ref written by
 * one without a paired reader is unresolvable.
 *
 * Implementations SHOULD populate `size` on the returned ref: refs without
 * a known size are conservatively kept as refs, silently bypassing
 * below-threshold inlining for callers that expect small outputs inline.
 */
saveOutputStream?(
  taskType: string,
  inputs: TaskInput,
  chunks: AsyncIterable<Uint8Array>,
  metadata: Record<string, unknown>
): Promise<CacheRef>;

/**
 * OPTIONAL reader counterpart of {@link saveOutputStream}. Resolves a
 * {@link CacheRef} previously produced by `saveOutputStream` to a `Blob`.
 * Returns `undefined` on cache miss (TTL expiry, manual clear). The runner
 * never calls this directly; consumers reach it through the resolver layer
 * (e.g. `resolveOutput`).
 */
getOutputByRef?(ref: CacheRef): Promise<Blob | undefined>;

/**
 * OPTIONAL streaming reader counterpart of {@link saveOutputStream}. Returns
 * an async iterable of bytes for the referenced entry, or `undefined` when
 * the entry is absent or this backing does not support streaming retrieval.
 *
 * Implementations MUST yield bounded-size chunks (e.g. filesystem read
 * chunks): cache-hit replay paces consumers per chunk, so yielding the
 * whole payload as one chunk defeats the memory bound this reader exists
 * to provide.
 */
getOutputStreamByRef?(ref: CacheRef): AsyncIterable<Uint8Array> | undefined;

/**
 * OPTIONAL cleanup hook for orphan blobs. Called by the runner when a
 * stream-write succeeded (producing a {@link CacheRef}) but the row write
 * that points at it failed — without this, the blob would persist with no
 * row referencing it, and the row-driven cleanup paths would never find it.
 * Implementations SHOULD be best-effort and idempotent (no error on missing
 * entry). Returns when the deletion attempt has settled.
 */
deleteOutputByRef?(ref: CacheRef): Promise<void>;

/** Capability probe: `true` when `saveOutputStream` is a callable member. */
supportsStreaming(): boolean {
  const sink = this.saveOutputStream;
  return typeof sink === "function";
}

/** Capability probe: `true` when `getOutputStreamByRef` is a callable member. */
supportsStreamingReads(): boolean {
  const reader = this.getOutputStreamByRef;
  return typeof reader === "function";
}

abstract getOutput(taskType: string, inputs: TaskInput): Promise<TaskOutput | undefined>;
abstract clear(): Promise<void>;
false : options.outputCache, shouldAccumulate, + hasStreamingConsumers: StreamPump.anyConsumerAcceptsBinaryStream(this.graph, task), + hasMaterializingConsumers: StreamPump.anyConsumerNeedsMaterialized(this.graph, task), updateProgress: options.updateProgress, registry: options.registry, resourceScope: options.resourceScope, runId: options.runId, + // Sinks are installed regardless of downstream needs: when both an + // accumulator and a router exist (downstream needs materialized + cache + // can stream), StreamProcessor tees — accumulator drives the enriched + // finish event for edge consumers; the router's CacheRef takes the + // port slot in finalOutput so the queue/cache row stays small. }); await this.edgeMaterializer.pushOutputFromNodeToEdges(task, results); @@ -229,7 +241,15 @@ export class StreamPump { outputCache: TaskOutputRepository | undefined, accumulateLeafOutputs: boolean ): boolean { - if (outputCache) return true; + if (outputCache) { + // Relaxation: when the cache can ingest a byte stream, the task streams + // ONLY binary, and no downstream edge needs the materialized value, the + // bytes are piped straight to the cache sink instead of being buffered + // into an enriched finish event. This is the memory win for large binary + // outputs (e.g. file/image producers). + if (StreamPump.canStreamBinaryToCache(this.graph, task, outputCache)) return false; + return true; + } const outEdges = this.graph.getTargetDataflows(task.id); if (outEdges.length === 0) return accumulateLeafOutputs; @@ -256,6 +276,90 @@ export class StreamPump { return false; } + /** + * Decides whether a streaming task's binary output can be piped straight to a + * stream-capable cache sink (skipping in-memory accumulation). True when: + * + * 1. 
/**
 * Decides whether a streaming task's binary output can be piped straight to a
 * stream-capable cache sink (skipping in-memory accumulation). True when:
 *
 * 1. The cache reports `supportsStreaming()` (NOT a `typeof saveOutputStream`
 *    duck-type — wrappers like `RunPrivateCacheRepo` always expose a concrete
 *    `saveOutputStream` but their `supportsStreaming()` reflects the BACKING
 *    repo, so the duck-type would falsely report `true` over a non-streaming
 *    backing store).
 * 2. The task has exactly one streaming output port and its mode is binary.
 * 3. No downstream dataflow edge needs the materialized value (every consumer
 *    accepts the raw binary stream, or there are no consumers).
 *
 * Exposed as a static (taking the graph explicitly) so the decision is
 * unit-testable in isolation from a live run.
 */
static canStreamBinaryToCache(
  graph: TaskGraph,
  task: ITask,
  outputCache: TaskOutputRepository | undefined
): boolean {
  // Defensive: test doubles / partial mocks may lack `supportsStreaming`.
  // Anything that cannot affirmatively report streaming support counts as
  // non-streaming.
  const reportsStreaming =
    typeof outputCache?.supportsStreaming === "function" && outputCache.supportsStreaming();
  if (!reportsStreaming) return false;

  // Exactly ONE binary port: the cache sink contract keys bytes by
  // (taskType, inputs) with no port axis, so only a single port can pipe to
  // the cache. With accumulation skipped, any additional streaming port would
  // have neither a sink nor an accumulator and its chunks would be dropped —
  // multi-port tasks must take the accumulation path instead.
  const ports = getStreamingPorts(task.outputSchema());
  const [onlyPort] = ports;
  if (ports.length !== 1 || onlyPort.mode !== "binary") return false;

  const someoneNeedsValue = StreamPump.anyConsumerNeedsMaterialized(graph, task);
  return !someoneNeedsValue;
}
Independent of the cache — + * used by the graph runner to decide whether to inhibit binary-stream sinks + * on the source task's runner (refs can't survive across an edge whose + * target expects a materialized value). + * + * Treats fan-out `*` edges as always-needs-materialized (conservative). + */ + static anyConsumerNeedsMaterialized(graph: TaskGraph, task: ITask): boolean { + const outSchema = task.outputSchema(); + const outEdges = graph.getTargetDataflows(task.id); + return outEdges.some((df) => { + if (df.sourceTaskPortId === DATAFLOW_ALL_PORTS) return true; + const targetTask = graph.getTask(df.targetTaskId); + if (!targetTask) return false; + return edgeNeedsAccumulation( + outSchema, + df.sourceTaskPortId, + targetTask.inputSchema(), + df.targetTaskPortId + ); + }); + } + + /** + * Returns `true` when any outgoing dataflow edge targets an input port that + * consumes the source port's binary stream mode directly (`x-stream: + * "binary"` on both ends). Used to decide whether a cache hit should replay + * cached bytes as `binary-delta` events: with no stream-capable consumer + * the replay would be wasted reads. `*` fan-out edges don't count — their + * consumers receive materialized values, not streams. + */ + static anyConsumerAcceptsBinaryStream(graph: TaskGraph, task: ITask): boolean { + const outSchema = task.outputSchema(); + return graph.getTargetDataflows(task.id).some((df) => { + if (df.sourceTaskPortId === DATAFLOW_ALL_PORTS) return false; + if (getPortStreamMode(outSchema, df.sourceTaskPortId) !== "binary") return false; + const targetTask = graph.getTask(df.targetTaskId); + if (!targetTask) return false; + return getPortStreamMode(targetTask.inputSchema(), df.targetTaskPortId) === "binary"; + }); + } + /** * Returns true if an event carries a port-specific delta (text-delta or object-delta). 
*/ @@ -303,17 +407,37 @@ export class StreamPump { // Stream may be closed } }; + const detach = () => { + task.off("stream_chunk", onChunk); + task.off("stream_end", onEnd); + task.off("abort", onTerminate); + task.off("error", onTerminate); + }; const onEnd = () => { try { controller.close(); } catch { // Stream may already be closed } - task.off("stream_chunk", onChunk); - task.off("stream_end", onEnd); + detach(); + }; + // Abort/error never emit `stream_end` (the stream loop throws first), + // so without these the edge stream would stay open forever and the + // listeners would leak. Close gracefully — downstream materialization + // settles on whatever events arrived; the task's own abort/error + // already surfaces through the run. + const onTerminate = () => { + try { + controller.close(); + } catch { + // Stream may already be closed + } + detach(); }; task.on("stream_chunk", onChunk); task.on("stream_end", onEnd); + task.on("abort", onTerminate); + task.on("error", onTerminate); }, }); } diff --git a/packages/task-graph/src/task/CacheCoordinator.ts b/packages/task-graph/src/task/CacheCoordinator.ts index 187c9ce63..1b940bb52 100644 --- a/packages/task-graph/src/task/CacheCoordinator.ts +++ b/packages/task-graph/src/task/CacheCoordinator.ts @@ -5,20 +5,38 @@ */ import { getPortCodec } from "@workglow/util"; +import type { DataPortSchema } from "@workglow/util/schema"; import { type CachePolicy, isPolicyCached, isPolicyPrivate } from "../cache/CachePolicy"; +import type { CacheRef } from "../cache/CacheRef"; +import { isCacheRef, makeCacheRef } from "../cache/CacheRef"; import type { CacheRegistry } from "../cache/CacheRegistry"; import { RunPrivateCacheRepo } from "../cache/RunPrivateCacheRepo"; +import { streamRefViaBacking } from "../cache/resolveRef"; import type { TaskOutputRepository } from "../storage/TaskOutputRepository"; import type { ITask } from "./ITask"; +import type { BinaryRefSink } from "./StreamProcessor"; import type { StreamEvent } from 
"./StreamTypes"; +import { assertBinaryFormat, getStreamingPorts, materializeBinary } from "./StreamTypes"; import { Task } from "./Task"; import type { TaskRunContext } from "./TaskRunContext"; import type { TaskInput, TaskOutput } from "./TaskTypes"; +import { TaskStatus } from "./TaskTypes"; interface SchemaProperties { properties?: Record; } +/** + * Graph-computed consumer hints driving cache-hit behavior for binary + * {@link CacheRef} output ports: hydrate bytes into the enriched finish event + * for materializing consumers, replay chunked `binary-delta` events for + * stream-capable consumers, or (neither flag set) leave refs untouched. + */ +export interface CacheReplayContext { + readonly hasMaterializingConsumers: boolean; + readonly hasStreamingConsumers: boolean; +} + /** * @internal * Cache key normalization, lookup, save, and cache-hit stream-event emission @@ -85,7 +103,8 @@ export class CacheCoordinator { if (!outputCache || !this.task.cacheable) return undefined; @@ -104,10 +123,21 @@ export class CacheCoordinator { + const needBytes = replay?.hasMaterializingConsumers === true; + const replayDeltas = replay?.hasStreamingConsumers === true; + if (!needBytes && !replayDeltas) return "none"; + if (outputs === null || typeof outputs !== "object") return "none"; + + const source = outputs as Record; + const refPorts = getStreamingPorts(outputSchema) + .filter((p) => p.mode === "binary") + .map((p) => p.port) + .filter((port) => isCacheRef(source[port])); + if (refPorts.length === 0) return "none"; + + // Resolve every ref before emitting any event so a dangling ref becomes a + // clean miss with zero observable side effects. 
+ const streams = new Map>(); + for (const port of refPorts) { + const stream = await streamRefViaBacking(source[port] as CacheRef, outputCache); + if (stream === undefined) return "miss"; + streams.set(port, stream); + } + + this.task.runOutputData = outputs; + this.task.emit("stream_start"); + // Flipping to STREAMING before the first data event is what makes the + // graph runner attach edge streams (same contract as StreamProcessor). + this.task.status = TaskStatus.STREAMING; + this.task.emit("status", this.task.status); + + const finishData: Record = { ...source }; + for (const [port, stream] of streams) { + const chunks: Uint8Array[] | undefined = needBytes ? [] : undefined; + for await (const chunk of stream) { + chunks?.push(chunk); + if (replayDeltas) { + this.task.emit("stream_chunk", { + type: "binary-delta", + port, + binaryDelta: chunk, + } as StreamEvent); + } + } + if (chunks !== undefined) { + finishData[port] = materializeBinary(chunks, assertBinaryFormat(outputSchema, port)); + } + } + this.task.emit("stream_chunk", { type: "finish", data: finishData } as StreamEvent); + this.task.emit("stream_end", finishData); + return "handled"; + } + /** * Serializes and saves output. No-op when no cache is configured or task is * not cacheable. 
@@ -138,6 +242,23 @@ export class CacheCoordinator, + metadata: Record, + outputCache: TaskOutputRepository | undefined + ): Promise { + if (!outputCache || !this.task.cacheable) return undefined; + if (!outputCache.supportsStreaming()) return undefined; + return outputCache.saveOutputStream!(this.task.type, keyInputs, chunks, metadata); + } + // ======================================================================== // Policy-aware routing methods // ======================================================================== @@ -169,9 +290,17 @@ export class CacheCoordinator { - return this.lookup(keyInputs, this.repoFor(registry, policy), policy, isStreamable, ctx); + return this.lookup( + keyInputs, + this.repoFor(registry, policy), + policy, + isStreamable, + ctx, + replay + ); } public async saveByPolicy( @@ -183,6 +312,146 @@ export class CacheCoordinator { + if (output === null || typeof output !== "object") return; + const cache = this.repoFor(registry, policy); + if (!cache || typeof cache.deleteOutputByRef !== "function") return; + const source = output as Record; + const binaryPorts = getStreamingPorts(outputSchema) + .filter((p) => p.mode === "binary") + .map((p) => p.port); + await Promise.all( + binaryPorts.map(async (port) => { + const value = source[port]; + if (!isCacheRef(value)) return; + try { + await cache.deleteOutputByRef!(value); + } catch { + // Best-effort: the periodic janitor sweep will reclaim what we miss. + } + }) + ); + } + + /** + * Build the per-port `BinaryRefSink` map the runner passes to + * `StreamProcessor.run()` so binary streams pipe straight to the cache and + * `Output` carries a {@link CacheRef} at the port slot. + * + * Returns `undefined` when any of the conditions for the ref path are not + * met: no cache is configured by policy, the cache does not implement + * `saveOutputStream`, the task is not cacheable, or the output schema has + * no `x-stream: "binary"` port. 
v1 supports single-binary-port tasks only;
+   * tasks with multiple binary ports fall back to the accumulation path.
+   *
+   * The threshold ({@link IRunConfig.referenceThresholdBytes}) controls
+   * whether the ref *survives* in the final Output, not whether the sink
+   * runs: when total bytes streamed end up below the threshold, the runner
+   * rehydrates the ref to an inline `Blob`/`ArrayBuffer` via
+   * {@link hydrateRefsBelowThreshold}. Setting threshold to `0` forces
+   * every binary port to a ref regardless of size.
+   */
+  public getBinaryRefSinksByPolicy(
+    keyInputs: Input,
+    registry: CacheRegistry | undefined,
+    policy: CachePolicy,
+    outputSchema: DataPortSchema
+  ): ReadonlyMap<string, BinaryRefSink> | undefined {
+    if (!this.task.cacheable) return undefined;
+    const cache = this.repoFor(registry, policy);
+    if (!cache) return undefined;
+    // Probe both streaming hooks before touching them. A repository that
+    // cannot affirmatively report streaming support (test doubles / partial
+    // mocks may omit `supportsStreaming`) or that has no `saveOutputStream`
+    // is treated as non-streaming, so the caller falls back to accumulation
+    // instead of hitting a TypeError here or later inside the sink. This
+    // mirrors the probing done in StreamPump.canStreamBinaryToCache, keeping
+    // the sink map and the accumulation decision in agreement.
+    if (
+      typeof cache.supportsStreaming !== "function" ||
+      !cache.supportsStreaming() ||
+      typeof cache.saveOutputStream !== "function"
+    ) {
+      return undefined;
+    }
+    // The sink keys bytes by (taskType, inputs) with no port axis, so two
+    // binary ports would overwrite each other in the backing. Enforce the
+    // single-binary-port restriction here as well as in the accumulation
+    // decision (StreamPump.canStreamBinaryToCache) — multi-port tasks fall
+    // back to pure accumulation.
+    const binaryPorts = getStreamingPorts(outputSchema).filter((p) => p.mode === "binary");
+    if (binaryPorts.length !== 1) return undefined;
+    const port = binaryPorts[0].port;
+    const taskType = this.task.type;
+    // Re-wrap the backing's CacheRef so legacy `saveOutputStream` implementations
+    // that pre-date the `kind` brand still produce a discriminator-bearing ref.
+    // Branded refs pass through unchanged (preserving size/mime hints).
+    const sink: BinaryRefSink = async (chunks) => {
+      // Presence of `saveOutputStream` was established by the probe above.
+      const raw = await cache.saveOutputStream!(taskType, keyInputs, chunks, {});
+      return isCacheRef(raw) ? raw : makeCacheRef(raw);
+    };
+    return new Map([[port, sink]]);
+  }
+
+  /**
+   * Post-process the streaming task's `Output`: for every **binary streaming
+   * port** (per the schema) whose value is a {@link CacheRef} with
+   * `size < referenceThresholdBytes`, rehydrate the bytes via `getOutputByRef`
+   * and inline them as `Blob`/`ArrayBuffer` (per the port's `format`
+   * annotation). Refs at or above the threshold are left in place.
+   * `referenceThresholdBytes === 0` forces every ref to survive regardless of
+   * size.
+   *
+   * Restricted to schema-declared binary streaming ports so that legitimate
+   * non-binary fields that happen to carry a `{$ref: string}` shape (e.g. a
+   * JSON-Schema reference embedded in metadata) are not mistakenly resolved
+   * against the cache.
+   *
+   * Refs without a known `size` are kept as-is (the writer didn't measure;
+   * conservatively assume "large enough to keep as ref"). Backings that want
+   * threshold-based rehydration MUST populate `size` on the CacheRef they
+   * return from `saveOutputStream`.
+   */
+  public async hydrateRefsBelowThreshold(
+    output: Output,
+    registry: CacheRegistry | undefined,
+    policy: CachePolicy,
+    outputSchema: DataPortSchema,
+    referenceThresholdBytes: number
+  ): Promise<Output> {
+    // A zero threshold pins every ref in place; a non-object output has no
+    // port slots to inspect. Either way the output is returned untouched.
+    const keepAllRefs = referenceThresholdBytes === 0;
+    if (keepAllRefs || output === null || typeof output !== "object") return output;
+
+    // Without a policy-selected cache that can resolve refs there is nothing
+    // to rehydrate from.
+    const cache = this.repoFor(registry, policy);
+    if (!cache || typeof cache.getOutputByRef !== "function") return output;
+
+    // Only schema-declared binary streaming ports are candidates.
+    const binaryPorts: string[] = [];
+    for (const p of getStreamingPorts(outputSchema)) {
+      if (p.mode === "binary") binaryPorts.push(p.port);
+    }
+    if (binaryPorts.length === 0) return output;
+
+    const source = output as Record<string, unknown>;
+
+    // Resolves one port to a `[port, inlinedValue]` pair, or `undefined` when
+    // the slot must stay as it is: not a ref, size unknown, size at/above the
+    // threshold, or the backing no longer holds the bytes.
+    const inlineSmallRef = async (port: string): Promise<[string, unknown] | undefined> => {
+      const value = source[port];
+      if (!isCacheRef(value)) return undefined;
+      const size = value.size;
+      if (size === undefined || size >= referenceThresholdBytes) return undefined;
+      const blob = await cache.getOutputByRef!(value);
+      if (blob === undefined) return undefined;
+      const format = assertBinaryFormat(outputSchema, port);
+      return [port, format === "binary" ? await blob.arrayBuffer() : blob];
+    };
+
+    // All candidate ports are resolved concurrently.
+    const resolved = await Promise.all(binaryPorts.map((port) => inlineSmallRef(port)));
+
+    // Copy-on-write: the output object is cloned only if some port changed.
+    let patched: Record<string, unknown> | undefined;
+    for (const entry of resolved) {
+      if (entry === undefined) continue;
+      patched ??= { ...source };
+      patched[entry[0]] = entry[1];
+    }
+    return (patched ?? source) as Output;
+  }
+
   // ========================================================================
   // Private static helpers (lifted from current module-private functions in
   // TaskRunner.ts)
diff --git a/packages/task-graph/src/task/ITask.ts b/packages/task-graph/src/task/ITask.ts
index b05c2386c..ca0571dcc 100644
--- a/packages/task-graph/src/task/ITask.ts
+++ b/packages/task-graph/src/task/ITask.ts
@@ -68,6 +68,16 @@ export interface IExecuteContext {
    * did not provide it.
*/ resourceScope?: ResourceScope; + /** + * Optional cooperative backpressure hook for streaming tasks that emit very + * large binary outputs by direct event emission (rather than through the + * StreamProcessor's `await router.push(...)` path). Tasks may `await` this + * between yields/emits to give downstream sinks a chance to drain. + * + * Defaults to a no-op when the runtime does not install a real backpressure + * source — tasks can call it unconditionally without paying a cost. + */ + binaryBackpressure?: () => Promise; } export type IExecutePreviewContext = Pick; @@ -109,6 +119,53 @@ export interface IRunConfig { */ shouldAccumulate?: boolean; + /** + * Graph-computed hint: `true` when at least one downstream dataflow edge + * consumes this task's binary output port as a stream (`x-stream: "binary"` + * on both ends). On a cache hit the runner replays cached bytes as + * `binary-delta` events only when a stream-capable consumer exists. + * `undefined` (standalone runs) means "no known stream consumers". + */ + hasStreamingConsumers?: boolean; + + /** + * Graph-computed hint: `true` when at least one downstream dataflow edge + * needs this task's output materialized (the target port cannot consume the + * stream mode directly). On a cache hit the runner hydrates binary + * {@link CacheRef} values into the enriched finish event so those consumers + * receive `Blob`/`ArrayBuffer` just like on a fresh run. `undefined` + * (standalone runs) means "no known materializing consumers". + */ + hasMaterializingConsumers?: boolean; + + /** + * Threshold (in bytes) at which a binary output port's value is replaced by + * a {@link CacheRef} in `Output` instead of being inlined. Below this size, + * the runner inlines the bytes; at or above, it emits a reference and the + * bytes live only in the cache backing. + * + * `0` forces a reference for every binary port regardless of size. 
Negative + * values and `undefined` fall back to + * {@link REFERENCE_THRESHOLD_BYTES_DEFAULT} (64 KB). + * + * Only applied when the cache backing implements `saveOutputStream` and the + * port carries binary stream events; otherwise the value is always inlined + * regardless of this setting. + */ + referenceThresholdBytes?: number; + + /** + * High-water mark (bytes) for the streaming runtime's per-port binary + * router buffer. When the buffered (un-consumed) byte total reaches or + * exceeds this threshold, the producer (`executeStream`) is parked between + * `binary-delta` yields until the cache sink drains the buffer back below + * the mark. Bounds peak memory for fast-producer / slow-sink scenarios. + * + * Defaults to {@link DEFAULT_BINARY_HIGH_WATER_BYTES} (8 MiB) when omitted + * or set to a non-positive value. + */ + binaryHighWaterBytes?: number; + /** * Optional callback invoked whenever a task's progress changes during execution. * @param task - The task whose progress changed. 
diff --git a/packages/task-graph/src/task/StreamProcessor.ts b/packages/task-graph/src/task/StreamProcessor.ts index 386fd0553..0ed04ab43 100644 --- a/packages/task-graph/src/task/StreamProcessor.ts +++ b/packages/task-graph/src/task/StreamProcessor.ts @@ -5,15 +5,33 @@ */ import type { ResourceScope, ServiceRegistry } from "@workglow/util"; +import type { CacheRef } from "../cache/CacheRef"; import type { Taskish } from "../task-graph/Conversions"; import type { ITask } from "./ITask"; import type { StreamEvent, StreamMode } from "./StreamTypes"; -import { getOutputStreamMode, getStreamingPorts } from "./StreamTypes"; +import { + assertBinaryFormat, + DEFAULT_BINARY_HIGH_WATER_BYTES, + getOutputStreamMode, + getStreamingPorts, + materializeBinary, +} from "./StreamTypes"; import { TaskAbortedError, TaskError } from "./TaskError"; import type { TaskRunContext } from "./TaskRunContext"; import type { TaskInput, TaskOutput } from "./TaskTypes"; import { TaskStatus } from "./TaskTypes"; +/** + * Consumer for a port's binary-delta stream. The processor exposes chunks as + * an async iterable; the sink returns the {@link CacheRef} the processor + * places into `Output` at the port slot. + * + * Implementations are typically thin wrappers around + * `TaskOutputRepository.saveOutputStream` — the runner supplies the wrapper + * once it knows the cache key. + */ +export type BinaryRefSink = (chunks: AsyncIterable) => Promise; + /** * Per-call run-state inputs shared by StreamProcessor.run. Bundles facade * state pulled at call time (registry, resourceScope, inputStreams) and @@ -31,6 +49,26 @@ export interface StreamProcessorDeps { ...args: any[] ) => Promise; readonly own: >(i: T) => T; + /** + * Per-port binary-stream sinks. When a port has a sink registered, the + * processor routes that port's `binary-delta` chunks to the sink (as an + * async iterable) **instead** of accumulating them into a `Blob` / + * `ArrayBuffer` in memory. 
At finish, the sink's returned {@link CacheRef} + * replaces the port's slot in the output object — unless an explicit + * binary finish payload is present for that port, which always wins + * (artifact precedence: an explicit whole payload wins over a delta-built one). + * + * Ports without a sink follow the normal accumulation path. + */ + readonly binaryRefSinks?: ReadonlyMap; + /** + * High-water mark (bytes) for the per-port binary stream router buffer. When + * the buffered (un-consumed) byte total reaches or exceeds this value, + * `BinaryStreamRouter.push()` returns a Promise that resolves only after the + * consumer drains the buffer back below the mark. Defaults to + * {@link DEFAULT_BINARY_HIGH_WATER_BYTES} when omitted. + */ + readonly binaryHighWaterBytes?: number; } /** @@ -70,11 +108,50 @@ export class StreamProcessor const accumulatedObjects = ctx.shouldAccumulate ? new Map | unknown[]>() : undefined; + const accumulatedBinary = ctx.shouldAccumulate ? new Map() : undefined; + // Per-port routers: lazily created on the first binary-delta whose port has + // a sink in `deps.binaryRefSinks`. Routes chunks to the sink instead of + // accumulating in memory; at finish, awaits the sink's returned CacheRef + // and writes it into the output at the port slot. + const sinks = deps.binaryRefSinks; + const highWaterMark = + deps.binaryHighWaterBytes !== undefined && deps.binaryHighWaterBytes > 0 + ? 
deps.binaryHighWaterBytes + : DEFAULT_BINARY_HIGH_WATER_BYTES; + const routers = new Map(); + const ensureRouter = (port: string): BinaryStreamRouter | undefined => { + if (!sinks) return undefined; + const sink = sinks.get(port); + if (!sink) return undefined; + let r = routers.get(port); + if (!r) { + r = new BinaryStreamRouter(sink, highWaterMark); + routers.set(port, r); + } + return r; + }; + let streamingStarted = false; let finalOutput: Output | undefined; this.task.emit("stream_start"); + // Cooperative backpressure hook for executeStream() implementations that + // emit through a side channel (not StreamProcessor's awaited `push`). When + // any port has a router (we'd be applying byte-bounded backpressure on the + // direct `binary-delta` path anyway), `await ctx.binaryBackpressure()` + // waits until ALL active routers are at-or-below their high-water mark. + // Without a router this is a cheap no-op. + const binaryBackpressure = async (): Promise => { + if (routers.size === 0) return; + const waits: Promise[] = []; + for (const r of routers.values()) { + if (r._bufferedBytes >= r._highWaterMarkBytes) waits.push(r._awaitDrain()); + } + if (waits.length === 0) return; + await Promise.all(waits); + }; + const stream = this.task.executeStream!(input, { signal: ctx.abortController.signal, updateProgress: deps.onProgress, @@ -82,142 +159,230 @@ export class StreamProcessor registry: deps.registry, resourceScope: deps.resourceScope, inputStreams: deps.inputStreams, + binaryBackpressure, }); - for await (const event of stream) { - // For snapshot events, update runOutputData BEFORE emitting stream_chunk - // so listeners see the latest snapshot when they handle the event. 
- if (event.type === "snapshot") { - this.task.runOutputData = event.data as Output; - } - - switch (event.type) { - case "phase": { - // Phase events are metadata: emit for observability, translate to a - // progress event with optional progress + message, do NOT mutate - // accumulators or runOutputData, do NOT flip status to STREAMING. - this.task.emit("stream_chunk", event as StreamEvent); - await deps.onProgress(event.progress, event.message); - break; + try { + for await (const event of stream) { + // For snapshot events, update runOutputData BEFORE emitting stream_chunk + // so listeners see the latest snapshot when they handle the event. + if (event.type === "snapshot") { + this.task.runOutputData = event.data as Output; } - case "text-delta": { - if (!streamingStarted) { - streamingStarted = true; - this.task.status = TaskStatus.STREAMING; - this.task.emit("status", this.task.status); - } - if (accumulated) { - accumulated.set(event.port, (accumulated.get(event.port) ?? "") + event.textDelta); + + switch (event.type) { + case "phase": { + // Phase events are metadata: emit for observability, translate to a + // progress event with optional progress + message, do NOT mutate + // accumulators or runOutputData, do NOT flip status to STREAMING. + this.task.emit("stream_chunk", event as StreamEvent); + await deps.onProgress(event.progress, event.message); + break; } - this.task.emit("stream_chunk", event as StreamEvent); - break; - } - case "object-delta": { - if (!streamingStarted) { - streamingStarted = true; - this.task.status = TaskStatus.STREAMING; - this.task.emit("status", this.task.status); + case "text-delta": { + if (!streamingStarted) { + streamingStarted = true; + this.task.status = TaskStatus.STREAMING; + this.task.emit("status", this.task.status); + } + if (accumulated) { + accumulated.set(event.port, (accumulated.get(event.port) ?? 
"") + event.textDelta); + } + this.task.emit("stream_chunk", event as StreamEvent); + break; } - if (accumulatedObjects) { - const existing = accumulatedObjects.get(event.port); - if (Array.isArray(event.objectDelta)) { - // Array delta: upsert items by `id` into accumulated array - const arr: unknown[] = Array.isArray(existing) ? [...existing] : []; - for (const item of event.objectDelta) { - const itemObj = item as Record; - if (itemObj && typeof itemObj === "object" && "id" in itemObj) { - const idx = arr.findIndex( - (e) => (e as Record).id === itemObj.id - ); - if (idx >= 0) arr[idx] = item; - else arr.push(item); - } else { - arr.push(item); + case "object-delta": { + if (!streamingStarted) { + streamingStarted = true; + this.task.status = TaskStatus.STREAMING; + this.task.emit("status", this.task.status); + } + if (accumulatedObjects) { + const existing = accumulatedObjects.get(event.port); + if (Array.isArray(event.objectDelta)) { + // Array delta: upsert items by `id` into accumulated array + const arr: unknown[] = Array.isArray(existing) ? [...existing] : []; + for (const item of event.objectDelta) { + const itemObj = item as Record; + if (itemObj && typeof itemObj === "object" && "id" in itemObj) { + const idx = arr.findIndex( + (e) => (e as Record).id === itemObj.id + ); + if (idx >= 0) arr[idx] = item; + else arr.push(item); + } else { + arr.push(item); + } } + accumulatedObjects.set(event.port, arr); + } else { + // Non-array (e.g. structured generation): replace semantics + accumulatedObjects.set(event.port, event.objectDelta); } - accumulatedObjects.set(event.port, arr); - } else { - // Non-array (e.g. structured generation): replace semantics - accumulatedObjects.set(event.port, event.objectDelta); } + // Update runOutputData with accumulated state so listeners see growing state + this.task.runOutputData = { + ...this.task.runOutputData, + [event.port]: accumulatedObjects?.get(event.port) ?? 
event.objectDelta, + } as Output; + this.task.emit("stream_chunk", event as StreamEvent); + break; } - // Update runOutputData with accumulated state so listeners see growing state - this.task.runOutputData = { - ...this.task.runOutputData, - [event.port]: accumulatedObjects?.get(event.port) ?? event.objectDelta, - } as Output; - this.task.emit("stream_chunk", event as StreamEvent); - break; - } - case "snapshot": { - if (!streamingStarted) { - streamingStarted = true; - this.task.status = TaskStatus.STREAMING; - this.task.emit("status", this.task.status); + case "binary-delta": { + if (!streamingStarted) { + streamingStarted = true; + this.task.status = TaskStatus.STREAMING; + this.task.emit("status", this.task.status); + } + // Tee: when both a router AND an accumulator exist + // for this port (graph context where the cache can stream but a + // downstream edge needs the materialized value), push to BOTH — + // router writes to the cache for the small ref-bearing Output, + // accumulator drives the enriched finish event so edge consumers + // still receive a Blob/ArrayBuffer. + // `await router.push(...)` here is where byte-bounded backpressure + // takes effect: the producer (executeStream) parks until the sink + // drains the router buffer back under the high-water mark, or + // until the router is closed (abort/error path). + const router = ensureRouter(event.port); + if (router) await router.push(event.binaryDelta); + if (accumulatedBinary) { + const arr = accumulatedBinary.get(event.port) ?? []; + arr.push(event.binaryDelta); + accumulatedBinary.set(event.port, arr); + } + this.task.emit("stream_chunk", event as StreamEvent); + break; } - this.task.emit("stream_chunk", event as StreamEvent); - break; - } - case "finish": { - if (accumulated || accumulatedObjects) { - // Emit an enriched finish event: merge accumulated deltas into - // the finish payload so downstream dataflows get complete port data - // without needing to re-accumulate themselves. 
- const merged: Record = { ...(event.data || {}) }; - if (accumulated) { - for (const [port, text] of accumulated) { - if (text.length > 0) merged[port] = text; - } + case "snapshot": { + if (!streamingStarted) { + streamingStarted = true; + this.task.status = TaskStatus.STREAMING; + this.task.emit("status", this.task.status); } - if (accumulatedObjects) { - for (const [port, obj] of accumulatedObjects) { - merged[port] = obj; + this.task.emit("stream_chunk", event as StreamEvent); + break; + } + case "finish": { + const hasEnrichment = + accumulated !== undefined || + accumulatedObjects !== undefined || + accumulatedBinary !== undefined || + routers.size > 0; + if (hasEnrichment) { + // Emit an enriched finish event: merge accumulated deltas into + // the finish payload so downstream dataflows get complete port data + // without needing to re-accumulate themselves. + const explicitPayload = (event.data || {}) as Record; + const merged: Record = { ...explicitPayload }; + if (accumulated) { + for (const [port, text] of accumulated) { + if (text.length > 0) merged[port] = text; + } } - } - // For replace-mode streams, finish carries data: {} by convention. - // Fall back to the last snapshot (runOutputData) so the final output - // is not silently cleared when the finish payload is empty. - if (streamMode === "replace" && Object.keys(merged).length === 0) { - const lastSnapshot = this.task.runOutputData; - if (lastSnapshot && Object.keys(lastSnapshot).length > 0) { - finalOutput = lastSnapshot as Output; - this.task.emit("stream_chunk", { - type: "finish", - data: lastSnapshot, - } as StreamEvent); - break; + if (accumulatedObjects) { + for (const [port, obj] of accumulatedObjects) { + merged[port] = obj; + } } - } - finalOutput = merged as unknown as Output; - this.task.emit("stream_chunk", { type: "finish", data: merged } as StreamEvent); - } else { - // No accumulation. 
For replace-mode streams the provider's finish - // event carries `data: {}` by convention — the snapshots already - // delivered the value, so the finish payload is intentionally - // empty. Fall back to `runOutputData` (set on every snapshot above) - // so we don't clobber the last snapshot with an empty object. This - // mirrors the same fallback in the accumulation branch. - const finishData = (event.data ?? {}) as Record; - if (streamMode === "replace" && Object.keys(finishData).length === 0) { - const lastSnapshot = this.task.runOutputData; - if (lastSnapshot && Object.keys(lastSnapshot).length > 0) { - finalOutput = lastSnapshot as Output; - this.task.emit("stream_chunk", { - type: "finish", - data: lastSnapshot, - } as StreamEvent); - break; + if (accumulatedBinary) { + const outSchema = this.task.outputSchema(); + for (const [port, chunks] of accumulatedBinary) { + // Explicit binary finish payload wins. (Unlike text/object + // deltas above, which overwrite event.data, binary yields to + // an explicit payload — it's a whole artifact, not a partial.) + if (port in explicitPayload) continue; + const format = assertBinaryFormat(outSchema, port); + merged[port] = materializeBinary(chunks, format); + } + } + // Close routers and collect refs. Explicit binary finish payload + // still wins for the OUTPUT slot (artifact precedence); the + // router's CacheRef is discarded in that case but the cache + // write already happened. + for (const router of routers.values()) router.end(); + const refs = new Map(); + for (const [port, router] of routers) { + if (port in explicitPayload) { + // Drain the promise so the sink doesn't leak; ignore the ref. + router.ref().catch(() => {}); + continue; + } + refs.set(port, await router.ref()); + } + // For replace-mode streams, finish carries data: {} by convention. 
+ // Fall back to the last snapshot (runOutputData) so the final output + // is not silently cleared when the finish payload is empty — + // overlaying router refs on top so cache-written bytes are not + // orphaned (the ref still lands in the OUTPUT slot). + if (streamMode === "replace" && Object.keys(merged).length === 0) { + const lastSnapshot = this.task.runOutputData; + if (lastSnapshot && Object.keys(lastSnapshot).length > 0) { + const snapshotWithRefs: Record = { ...lastSnapshot }; + for (const [port, ref] of refs) snapshotWithRefs[port] = ref; + finalOutput = snapshotWithRefs as Output; + this.task.emit("stream_chunk", { + type: "finish", + data: lastSnapshot, + } as StreamEvent); + break; + } + } + // The emitted finish event always carries the materialized payload + // (from accumulators) so edge consumers see Blob/ArrayBuffer. + // finalOutput diverges only when a router produced a ref for a + // port that wasn't already pinned by an explicit payload — that + // ref takes the slot in the return value so the queue/cache row + // stays small (the tee path). + this.task.emit("stream_chunk", { type: "finish", data: merged } as StreamEvent); + if (refs.size === 0) { + finalOutput = merged as unknown as Output; + } else { + const finalMerged: Record = { ...merged }; + for (const [port, ref] of refs) finalMerged[port] = ref; + finalOutput = finalMerged as unknown as Output; } + } else { + // No accumulation. For replace-mode streams the provider's finish + // event carries `data: {}` by convention — the snapshots already + // delivered the value, so the finish payload is intentionally + // empty. Fall back to `runOutputData` (set on every snapshot above) + // so we don't clobber the last snapshot with an empty object. This + // mirrors the same fallback in the accumulation branch. + const finishData = (event.data ?? 
{}) as Record; + if (streamMode === "replace" && Object.keys(finishData).length === 0) { + const lastSnapshot = this.task.runOutputData; + if (lastSnapshot && Object.keys(lastSnapshot).length > 0) { + finalOutput = lastSnapshot as Output; + this.task.emit("stream_chunk", { + type: "finish", + data: lastSnapshot, + } as StreamEvent); + break; + } + } + finalOutput = event.data as Output; + this.task.emit("stream_chunk", event as StreamEvent); } - finalOutput = event.data as Output; - this.task.emit("stream_chunk", event as StreamEvent); + break; + } + case "error": { + throw event.error; } - break; - } - case "error": { - throw event.error; } } + } catch (err) { + // Surface the error to any in-flight router sinks so they reject + // (rather than waiting forever on the producer). The original error is + // rethrown unchanged. + const failure = err instanceof Error ? err : new Error(String(err)); + for (const router of routers.values()) router.fail(failure); + throw err; + } finally { + // Defensive: if the loop exited without seeing a `finish` event + // (e.g. abort, generator return without yield), close routers so their + // sinks see end-of-stream rather than blocking on the next chunk. + for (const router of routers.values()) router.end(); } // Check if the task was aborted during streaming @@ -234,3 +399,151 @@ export class StreamProcessor return this.task.runOutputData as Output; } } + +/** + * Producer-consumer router used by {@link StreamProcessor} to forward a single + * binary output port's `binary-delta` chunks to a {@link BinaryRefSink}. The + * sink consumes the chunks via the async iterable and returns a + * {@link CacheRef} that the processor places into `Output` at finish. + * + * Lifecycle: chunks pushed via `push()` are yielded to the sink in order. + * `end()` signals end-of-stream (sink completes consumption, refPromise + * resolves). `fail(err)` causes the iterable to throw on the next read + * (refPromise rejects). 
`end()` and `fail()` are idempotent. + * + * Backpressure: byte-bounded. `push()` returns a Promise; the producer is + * resolved immediately while the buffered (un-consumed) byte total stays + * below `highWaterMarkBytes`, and parks until the consumer drains the + * buffer back under the mark once the threshold is reached. `end()` and + * `fail()` BOTH release any parked producer so an abort mid-park does not + * leak the `push()` promise. + */ +export class BinaryStreamRouter { + private readonly buffer: Uint8Array[] = []; + private bufferedBytes = 0; + private finished = false; + private failure: Error | undefined; + /** Resolver for the consumer side (iterable awaiting next chunk). */ + private chunkNotify: (() => void) | undefined; + /** Resolver for the producer side parked waiting for drain. */ + private drainNotify: (() => void) | undefined; + private readonly refPromise: Promise; + private readonly highWaterMarkBytes: number; + + constructor(sink: BinaryRefSink, highWaterMarkBytes: number) { + this.highWaterMarkBytes = Math.max(1, highWaterMarkBytes); + this.refPromise = sink(this.iterable()); + // Observe rejection so an unawaited refPromise (e.g. after fail() in an + // error path) doesn't surface as an unhandled rejection. Subsequent + // `await this.refPromise` still rejects. + this.refPromise.catch(() => {}); + } + + /** + * Buffer one chunk and return a Promise the caller must await. The promise + * resolves immediately when buffered bytes remain under the high-water + * mark, and otherwise parks until the consumer drains the buffer (or until + * `end()` / `fail()` releases all parked callers). + */ + push(chunk: Uint8Array): Promise { + if (this.finished) return Promise.resolve(); + this.buffer.push(chunk); + this.bufferedBytes += chunk.byteLength; + this.wakeChunk(); + if (this.bufferedBytes < this.highWaterMarkBytes) return Promise.resolve(); + return new Promise((res) => { + // Chain resolvers so a long park doesn't lose earlier waiters. 
+ const prev = this.drainNotify; + this.drainNotify = prev + ? () => { + prev(); + res(); + } + : res; + }); + } + + end(): void { + if (this.finished) return; + this.finished = true; + this.wakeChunk(); + // Release any producer parked at the high-water mark — abort mid-stream + // would otherwise orphan the parked Promise. + this.wakeDrain(); + } + + fail(err: Error): void { + if (this.finished) return; + this.failure = err; + this.finished = true; + this.wakeChunk(); + this.wakeDrain(); + } + + ref(): Promise { + return this.refPromise; + } + + /** @internal Test hook: current buffered byte count (consumer-unread). */ + public get _bufferedBytes(): number { + return this.bufferedBytes; + } + + /** @internal Test hook: high-water mark in effect. */ + public get _highWaterMarkBytes(): number { + return this.highWaterMarkBytes; + } + + /** + * @internal Used by {@link IExecuteContext.binaryBackpressure} so a task + * emitting via a side channel can park until the consumer drains. Resolves + * immediately when the buffer is already under the mark or the router has + * been closed. + */ + public _awaitDrain(): Promise { + if (this.finished) return Promise.resolve(); + if (this.bufferedBytes < this.highWaterMarkBytes) return Promise.resolve(); + return new Promise((res) => { + const prev = this.drainNotify; + this.drainNotify = prev + ? () => { + prev(); + res(); + } + : res; + }); + } + + private wakeChunk(): void { + const n = this.chunkNotify; + this.chunkNotify = undefined; + n?.(); + } + + private wakeDrain(): void { + const n = this.drainNotify; + this.drainNotify = undefined; + n?.(); + } + + private async *iterable(): AsyncIterable { + while (true) { + while (this.buffer.length > 0) { + const chunk = this.buffer.shift()!; + this.bufferedBytes -= chunk.byteLength; + // Wake any parked producer once we drop back below the mark. 
We + // resolve as soon as we cross the threshold rather than waiting for + // the buffer to drain fully — that keeps the producer pipelined. + if (this.drainNotify && this.bufferedBytes < this.highWaterMarkBytes) { + this.wakeDrain(); + } + yield chunk; + } + if (this.failure) throw this.failure; + if (this.finished) return; + await new Promise((res) => { + this.chunkNotify = res; + }); + } + } +} diff --git a/packages/task-graph/src/task/StreamTypes.ts b/packages/task-graph/src/task/StreamTypes.ts index 3f79f20d3..8bff267bf 100644 --- a/packages/task-graph/src/task/StreamTypes.ts +++ b/packages/task-graph/src/task/StreamTypes.ts @@ -12,12 +12,13 @@ import type { DataPortSchema, JsonSchema } from "@workglow/util/schema"; * - `append`: Each chunk is a delta (e.g., a new token). * - `replace`: Each chunk is a corrected/revised snapshot of the complete output so far. * - `object`: Each chunk is a progressively more complete partial object snapshot. + * - `binary`: Each chunk is an ordered byte slice; consumer concatenates into a Blob/ArrayBuffer. * - `mixed`: Multiple ports use different stream modes (e.g., append + object). * * Declared per-port via the `x-stream` schema extension property. * Absent `x-stream` = `"none"`. */ -export type StreamMode = "none" | "append" | "replace" | "object" | "mixed"; +export type StreamMode = "none" | "append" | "replace" | "object" | "binary" | "mixed"; /** * Append mode: delta chunk (consumer accumulates). @@ -45,6 +46,18 @@ export type StreamObjectDelta = { objectDelta: Record | unknown[]; }; +/** + * Binary mode: an ordered, append-only chunk of bytes (consumer concatenates). + * `port` identifies which output port this delta belongs to. Chunks are + * materialized on `finish` into a `Blob` or `ArrayBuffer` per the port's + * schema `format` (see `materializeBinary`). 
+ */ +export type StreamBinaryDelta = { + type: "binary-delta"; + port: string; + binaryDelta: Uint8Array; +}; + /** * Replace mode: full snapshot chunk (replaces previous state). */ @@ -104,6 +117,7 @@ export type StreamPhase = { export type StreamEvent> = | StreamTextDelta | StreamObjectDelta + | StreamBinaryDelta | StreamSnapshot | StreamFinish | StreamError @@ -126,7 +140,8 @@ export function getPortStreamMode(schema: DataPortSchema | JsonSchema, portId: s const prop = (schema.properties as Record)?.[portId]; if (!prop || typeof prop === "boolean") return "none"; const xStream = prop["x-stream"]; - if (xStream === "append" || xStream === "replace" || xStream === "object") return xStream; + if (xStream === "append" || xStream === "replace" || xStream === "object" || xStream === "binary") + return xStream; return "none"; } @@ -147,7 +162,12 @@ export function getStreamingPorts( for (const [name, prop] of Object.entries(props)) { if (!prop || typeof prop === "boolean") continue; const xStream = (prop as any)["x-stream"]; - if (xStream === "append" || xStream === "replace" || xStream === "object") { + if ( + xStream === "append" || + xStream === "replace" || + xStream === "object" || + xStream === "binary" + ) { result.push({ port: name, mode: xStream }); } } @@ -251,6 +271,107 @@ export function getObjectPortId(schema: DataPortSchema): string | undefined { return undefined; } +/** + * Returns the port ID (property name) of the first output port that declares + * `x-stream: "binary"`, or `undefined` if no such port exists. 
+ * + * @param schema - The task's output DataPortSchema + * @returns The port name with binary streaming, or undefined + */ +export function getBinaryPortId(schema: DataPortSchema): string | undefined { + if (typeof schema === "boolean") return undefined; + const props = schema.properties; + if (!props) return undefined; + + for (const [name, prop] of Object.entries(props)) { + if (!prop || typeof prop === "boolean") continue; + if ((prop as any)["x-stream"] === "binary") return name; + } + return undefined; +} + +/** + * Canonical vocabulary for the `format` annotation on a binary streaming output + * port. `"blob"` materializes chunks into a `Blob` (the default); `"binary"` + * materializes them into an `ArrayBuffer`. Any other value is rejected at + * registration time (see {@link assertBinaryFormat}) so a typo like `"Blob"` + * cannot silently fall through to the ArrayBuffer branch. + */ +export type BinaryFormat = "blob" | "binary"; + +/** + * Default high-water mark for the binary-stream router's producer buffer, in + * bytes. When the buffered (un-consumed) byte total reaches this threshold the + * producer awaits a drain signal from the consumer before pushing further + * chunks; below the threshold the producer is allowed to run free. 8 MiB lets + * even fast producers race ahead by a few chunks without stalling, while + * bounding worst-case memory growth when the sink (cache, disk, network) + * cannot keep up. Callers can override per-run via + * `IRunConfig.binaryHighWaterBytes`. + */ +export const DEFAULT_BINARY_HIGH_WATER_BYTES = 8 * 1024 * 1024; + +/** + * Reads the `format` annotation of a single output port from the task's output + * schema. Returns the raw string (or `undefined`) — callers needing the + * canonical {@link BinaryFormat} vocabulary should go through + * {@link assertBinaryFormat}, which rejects unknown values. 
+ */ +export function getBinaryPortFormat(schema: DataPortSchema, port: string): string | undefined { + if (typeof schema === "boolean") return undefined; + const prop = (schema.properties as Record)?.[port]; + if (!prop || typeof prop === "boolean") return undefined; + return prop.format as string | undefined; +} + +/** + * Resolves the `format` annotation on a binary streaming port to a canonical + * {@link BinaryFormat}. `undefined` and `"blob"` both resolve to `"blob"`; + * `"binary"` resolves to `"binary"`. Anything else throws — a casing typo such + * as `"Blob"` or a leftover legacy value would otherwise be silently coerced + * to one branch and produce the wrong runtime type, so this is checked at + * task-registration time and again on the streaming hot paths. + */ +export function assertBinaryFormat(schema: DataPortSchema, port: string): BinaryFormat { + const f = getBinaryPortFormat(schema, port); + if (f === undefined || f === "blob") return "blob"; + if (f === "binary") return "binary"; + throw new Error( + `Port "${port}" has x-stream:"binary" but format:"${f}". Allowed: "blob" | "binary".` + ); +} + +/** + * Materializes ordered binary chunks into the value type declared by the + * output port's canonical {@link BinaryFormat}: + * - `"blob"` → `Blob` (the default) + * - `"binary"` → `ArrayBuffer` + * + * Chunks are concatenated in arrival order. Callers MUST pass chunks in the + * order they were emitted, and MUST resolve `format` through + * {@link assertBinaryFormat} so unknown values are rejected at registration + * rather than reinterpreted here. 
+ * + * @param chunks - Ordered binary chunks to concatenate + * @param format - Canonical binary format selector + * @returns The materialized `Blob` or `ArrayBuffer` + */ +export function materializeBinary( + chunks: readonly Uint8Array[], + format: BinaryFormat +): Blob | ArrayBuffer { + if (format === "blob") return new Blob(chunks as unknown as BlobPart[]); + let total = 0; + for (const c of chunks) total += c.byteLength; + const merged = new Uint8Array(total); + let offset = 0; + for (const c of chunks) { + merged.set(c, offset); + offset += c.byteLength; + } + return merged.buffer; +} + /** * Returns a map of port names to their JSON Schemas for every output port * that declares `"x-structured-output": true`. diff --git a/packages/task-graph/src/task/TaskRegistry.ts b/packages/task-graph/src/task/TaskRegistry.ts index 3e8439cc7..3a68f55ea 100644 --- a/packages/task-graph/src/task/TaskRegistry.ts +++ b/packages/task-graph/src/task/TaskRegistry.ts @@ -14,6 +14,7 @@ import { } from "@workglow/util"; import { validateSchema } from "@workglow/util/schema"; import type { ITaskConstructor } from "./ITask"; +import { assertBinaryFormat, getStreamingPorts } from "./StreamTypes"; type AnyTaskConstructor = ITaskConstructor; @@ -40,12 +41,32 @@ function registerTask(baseClass: AnyTaskConstructor): void { `Task type "${baseClass.type}" is already registered. Unregister it first to replace.` ); } + + // Validate every binary streaming output port's `format` against the + // canonical {@link BinaryFormat} vocabulary BEFORE adding to the registry. + // A typo like `format: "Blob"` would otherwise silently coerce to the + // ArrayBuffer branch in `materializeBinary`, producing the wrong runtime + // type for every consumer of that port. Fail at registration so the + // misconfiguration surfaces near the task definition site. 
+ const outputSchema = baseClass.outputSchema(); + for (const { port, mode } of getStreamingPorts(outputSchema)) { + if (mode !== "binary") continue; + try { + assertBinaryFormat(outputSchema, port); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + throw new Error( + `Cannot register task "${baseClass.type}": invalid binary stream port. ${message}` + ); + } + } + taskConstructors.set(baseClass.type, baseClass); // Validate schemas at registration time (soft — warn only, don't throw) const schemas = [ { name: "inputSchema", schema: baseClass.inputSchema() }, - { name: "outputSchema", schema: baseClass.outputSchema() }, + { name: "outputSchema", schema: outputSchema }, ] as const; for (const { name, schema } of schemas) { diff --git a/packages/task-graph/src/task/TaskRunner.ts b/packages/task-graph/src/task/TaskRunner.ts index 9140fff67..dffb61150 100644 --- a/packages/task-graph/src/task/TaskRunner.ts +++ b/packages/task-graph/src/task/TaskRunner.ts @@ -13,7 +13,13 @@ import { SpanStatusCode, } from "@workglow/util"; import type { CacheRegistry } from "../cache"; -import { CACHE_REGISTRY, DefaultCacheRegistry, RunPrivateCacheRepo } from "../cache"; +import { + CACHE_REGISTRY, + DefaultCacheRegistry, + isCacheRef, + resolveReferenceThreshold, + RunPrivateCacheRepo, +} from "../cache"; import { TASK_OUTPUT_REPOSITORY, TaskOutputRepository } from "../storage/TaskOutputRepository"; import type { Taskish } from "../task-graph/Conversions"; import { ensureTask } from "../task-graph/Conversions"; @@ -23,7 +29,12 @@ import type { IRunConfig, ITask } from "./ITask"; import { ITaskRunner } from "./ITaskRunner"; import { StreamProcessor } from "./StreamProcessor"; import type { StreamEvent } from "./StreamTypes"; -import { getOutputStreamMode, isTaskStreamable } from "./StreamTypes"; +import { + getBinaryPortFormat, + getOutputStreamMode, + getPortStreamMode, + isTaskStreamable, +} from "./StreamTypes"; import { Task } from "./Task"; 
import { TaskAbortedError, @@ -179,7 +190,8 @@ export class TaskRunner< await this.resolveSchemas(); - const inputs: Input = this.task.runInputData as Input; + const inputs: Input = await this.hydrateInputRefs(this.task.runInputData as Input); + this.task.runInputData = inputs; const isValid = await this.task.validateInput(inputs); if (!isValid) { throw new TaskInvalidInputError("Invalid input data"); @@ -239,15 +251,38 @@ export class TaskRunner< this.cacheRegistry, policy ); + const referenceThresholdBytes = resolveReferenceThreshold( + config.referenceThresholdBytes ?? this.task.runConfig.referenceThresholdBytes + ); let outputs = await this.cacheCoordinator.lookupByPolicy( keyInputs, this.cacheRegistry, policy, isStreamable, - ctx + ctx, + { + hasMaterializingConsumers: config.hasMaterializingConsumers === true, + hasStreamingConsumers: config.hasStreamingConsumers === true, + } ); if (outputs === undefined) { + // Build per-port binary-stream sinks when the cache supports + // streaming and the schema has a binary port. The sinks always run + // (memory-bounded write to cache); the runtime threshold controls + // whether the resulting CacheRef SURVIVES in Output or gets + // rehydrated to an inline Blob/ArrayBuffer below. + const binaryRefSinks = isStreamable + ? this.cacheCoordinator.getBinaryRefSinksByPolicy( + keyInputs, + this.cacheRegistry, + policy, + this.task.outputSchema() + ) + : undefined; + + const binaryHighWaterBytes = + config.binaryHighWaterBytes ?? this.task.runConfig.binaryHighWaterBytes; outputs = isStreamable ? 
await this.streamProcessor.run(inputs, ctx, { registry: this.registry, @@ -255,17 +290,65 @@ export class TaskRunner< inputStreams: this.inputStreams, onProgress: this.handleProgress.bind(this), own: this.own, + binaryRefSinks, + binaryHighWaterBytes, }) : await this.executeTask(inputs, ctx); - await this.cacheCoordinator.saveByPolicy( - keyInputs, + // Save the wire form FIRST: a CacheRef at a binary port is a small + // JSON-safe envelope, while an inline Blob/ArrayBuffer would be + // destroyed by JSON-row backings (JSON.stringify(Blob) === "{}"). + // The row therefore always carries the ref; hydration below applies + // only to the value returned to the caller. + try { + await this.cacheCoordinator.saveByPolicy( + keyInputs, + outputs as Output, + this.cacheRegistry, + policy + ); + } catch (saveErr) { + // The stream sink already wrote the blob and minted a CacheRef + // before we got here; the row write failure leaves that blob + // unreferenced. Best-effort delete it so the cache directory + // does not accumulate orphans on every save failure. + if (binaryRefSinks !== undefined && outputs !== undefined) { + await this.cacheCoordinator.cleanupOrphanBlobsForBinaryPorts( + outputs as Output, + this.cacheRegistry, + policy, + this.task.outputSchema() + ); + } + throw saveErr; + } + + // Rehydrate refs whose committed size is below the configured + // threshold so callers see inline bytes for small outputs (threshold + // default = 64 KiB). Refs at/above threshold survive. threshold = 0 + // forces every ref to survive regardless of size. 
+ if (outputs !== undefined && binaryRefSinks !== undefined) { + outputs = await this.cacheCoordinator.hydrateRefsBelowThreshold( + outputs as Output, + this.cacheRegistry, + policy, + this.task.outputSchema(), + referenceThresholdBytes + ); + } + } else { + // Cache hit: rows store refs (wire form), so apply the same + // below-threshold hydration a fresh run applies before returning — + // small outputs come back as inline Blob/ArrayBuffer either way. + outputs = await this.cacheCoordinator.hydrateRefsBelowThreshold( outputs as Output, this.cacheRegistry, - policy + policy, + this.task.outputSchema(), + referenceThresholdBytes ); - this.task.runOutputData = outputs ?? ({} as Output); } + this.task.runOutputData = outputs ?? ({} as Output); await this.handleComplete(ctx); @@ -312,6 +395,56 @@ export class TaskRunner< } } + /** + * Hydrate branded {@link CacheRef} values in resolved inputs to inline + * bytes before `execute()` runs, resolving against the run's cache registry + * (private repo first, then deterministic). Materialization type follows the + * input port's `format` annotation (`"binary"` → `ArrayBuffer`, anything + * else → `Blob`). + * + * Binary-streaming input ports with a live input stream are skipped: those + * consumers take bytes from the stream and the ref at the port remains the + * durable pointer — hydrating it would re-materialize what the stream + * already delivers. + * + * Hydration runs before cache-key computation so a ref-bearing input + * fingerprints identically to the materialized input a fresh upstream run + * would have produced. + * + * A ref that no longer resolves throws: by this point the bytes were + * expected to exist, and letting `undefined` flow into `execute()` produces + * far less debuggable failures than a named-port error. 
+ */ + private async hydrateInputRefs(inputs: Input): Promise { + if (inputs === null || typeof inputs !== "object") return inputs; + const repos = [this.cacheRegistry?.private, this.cacheRegistry?.deterministic].filter( + (r): r is TaskOutputRepository => r !== undefined && typeof r.getOutputByRef === "function" + ); + if (repos.length === 0) return inputs; + + const schema = this.task.inputSchema(); + const source = inputs as Record; + let out: Record | undefined; + for (const [port, value] of Object.entries(source)) { + if (!isCacheRef(value)) continue; + if (getPortStreamMode(schema, port) === "binary" && this.inputStreams?.has(port)) continue; + let blob: Blob | undefined; + for (const repo of repos) { + blob = await repo.getOutputByRef!(value); + if (blob !== undefined) break; + } + if (blob === undefined) { + throw new TaskFailedError( + `Task "${this.task.type}" input port "${port}" holds a cache ref that no configured ` + + `cache backing can resolve (entry evicted?).` + ); + } + out ??= { ...source }; + out[port] = getBinaryPortFormat(schema, port) === "binary" ? await blob.arrayBuffer() : blob; + } + return (out ?? source) as Input; + } + public async runPreview(overrides: Partial = {}): Promise { if (this.task.status === TaskStatus.PROCESSING) { return this.task.runOutputData as Output; diff --git a/packages/test/src/binding/FsFolderTaskOutputRepository.ts b/packages/test/src/binding/FsFolderTaskOutputRepository.ts index 22d725469..1d9add3e2 100644 --- a/packages/test/src/binding/FsFolderTaskOutputRepository.ts +++ b/packages/test/src/binding/FsFolderTaskOutputRepository.ts @@ -4,24 +4,4 @@ * SPDX-License-Identifier: Apache-2.0 */ -import { FsFolderTabularStorage } from "@workglow/storage"; -import { - tabularTaskOutputStorage, - TaskOutputPrimaryKeyNames, - TaskOutputSchema, - TaskOutputTabularRepository, -} from "@workglow/task-graph"; - -/** - * File system folder implementation of a task output repository. 
- * Provides storage and retrieval for task outputs using the file system. - */ -export class FsFolderTaskOutputRepository extends TaskOutputTabularRepository { - constructor(folderPath: string) { - super({ - storage: tabularTaskOutputStorage( - new FsFolderTabularStorage(folderPath, TaskOutputSchema, TaskOutputPrimaryKeyNames) - ), - }); - } -} +export { FsFolderTaskOutputRepository } from "@workglow/task-graph"; diff --git a/packages/test/src/binding/StreamingMemoryRepo.ts b/packages/test/src/binding/StreamingMemoryRepo.ts new file mode 100644 index 000000000..a31b77225 --- /dev/null +++ b/packages/test/src/binding/StreamingMemoryRepo.ts @@ -0,0 +1,101 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { CacheRef, TaskInput, TaskOutput } from "@workglow/task-graph"; +import { makeCacheRef, TaskOutputRepository } from "@workglow/task-graph"; + +/** + * In-memory {@link TaskOutputRepository} implementing the full streaming + * surface (`saveOutputStream`, `getOutputByRef`, `getOutputStreamByRef`). + * Test-only: bytes live in process memory, keyed by `$ref`. + */ +export class StreamingMemoryRepo extends TaskOutputRepository { + public readonly streamed = new Map(); + public readonly streamedMetadata = new Map>(); + /** When set, `getOutputStreamByRef` yields slices of at most this many bytes. 
*/ + public streamReadChunkSize: number | undefined; + private store = new Map(); + + override async saveOutput(t: string, i: TaskInput, o: TaskOutput): Promise { + this.store.set(t + JSON.stringify(i), o); + } + override async getOutput(t: string, i: TaskInput): Promise { + return this.store.get(t + JSON.stringify(i)); + } + override async clear(): Promise { + this.store.clear(); + this.streamed.clear(); + } + override async size(): Promise { + return this.store.size; + } + override async clearOlderThan(): Promise {} + override isDurable(): boolean { + return false; + } + override async saveOutputStream( + taskType: string, + inputs: TaskInput, + chunks: AsyncIterable, + metadata: Record + ): Promise { + const parts: Uint8Array[] = []; + for await (const c of chunks) parts.push(c); + let total = 0; + for (const p of parts) total += p.byteLength; + const merged = new Uint8Array(total); + let off = 0; + for (const p of parts) { + merged.set(p, off); + off += p.byteLength; + } + const key = taskType + JSON.stringify(inputs); + this.streamed.set(key, merged); + this.streamedMetadata.set(key, metadata); + return makeCacheRef({ $ref: `inmem://${key}`, size: total }); + } + override async getOutputByRef(ref: CacheRef): Promise { + const key = ref.$ref.replace(/^inmem:\/\//, ""); + const bytes = this.streamed.get(key); + return bytes === undefined ? undefined : new Blob([bytes as unknown as BlobPart]); + } + override getOutputStreamByRef(ref: CacheRef): AsyncIterable | undefined { + const key = ref.$ref.replace(/^inmem:\/\//, ""); + const bytes = this.streamed.get(key); + if (bytes === undefined) return undefined; + const chunkSize = this.streamReadChunkSize ?? 
(bytes.byteLength || 1); + return (async function* () { + for (let i = 0; i < bytes.byteLength; i += chunkSize) { + yield bytes.subarray(i, Math.min(i + chunkSize, bytes.byteLength)); + } + })(); + } +} + +/** + * In-memory {@link TaskOutputRepository} that deliberately omits every + * streaming method, for capability-gating tests. + */ +export class NonStreamingMemoryRepo extends TaskOutputRepository { + private store = new Map(); + + override async saveOutput(t: string, i: TaskInput, o: TaskOutput): Promise { + this.store.set(t + JSON.stringify(i), o); + } + override async getOutput(t: string, i: TaskInput): Promise { + return this.store.get(t + JSON.stringify(i)); + } + override async clear(): Promise { + this.store.clear(); + } + override async size(): Promise { + return this.store.size; + } + override async clearOlderThan(): Promise {} + override isDurable(): boolean { + return false; + } +} diff --git a/packages/test/src/test/job-queue/JobOutputStream.test.ts b/packages/test/src/test/job-queue/JobOutputStream.test.ts new file mode 100644 index 000000000..654ac1f12 --- /dev/null +++ b/packages/test/src/test/job-queue/JobOutputStream.test.ts @@ -0,0 +1,136 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IJobExecuteContext } from "@workglow/job-queue"; +import { + InMemoryQueueStorage, + Job, + JobQueueClient, + JobQueueServer, + wrapQueueStorage, +} from "@workglow/job-queue"; +import type { CacheRef } from "@workglow/task-graph"; +import { makeJobOutputStreamResolver } from "@workglow/task-graph"; +import { uuid4 } from "@workglow/util"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { StreamingMemoryRepo } from "../../binding/StreamingMemoryRepo"; + +interface SInput { + readonly [key: string]: unknown; +} +interface SOutput { + readonly ok: true; + readonly [key: string]: unknown; +} + +const repo = new StreamingMemoryRepo({}); + +async function* 
gen(...chunks: Uint8Array[]): AsyncIterable { + for (const c of chunks) yield c; +} + +async function collect(stream: AsyncIterable): Promise { + const out: number[] = []; + for await (const chunk of stream) for (const b of chunk) out.push(b); + return out; +} + +/** + * Simulates the worker-side ref path: streams its payload into the shared + * cache backing and completes with an Output carrying the CacheRef instead + * of the bytes (small queue row). + */ +class RefProducingJob extends Job { + public override async execute(input: SInput, _context: IJobExecuteContext): Promise { + const file: CacheRef = await repo.saveOutputStream( + "RefJob", + { id: input.id }, + gen(new Uint8Array([10, 20]), new Uint8Array([30])), + {} + ); + const transcript: CacheRef = await repo.saveOutputStream( + "RefJobTranscript", + { id: input.id }, + gen(new Uint8Array([7])), + {} + ); + return { ok: true, file, transcript }; + } +} + +describe("JobHandle.outputStream (capability-gated streaming result reads)", () => { + let server: JobQueueServer; + let storage: InMemoryQueueStorage; + let queueName: string; + let queueParts: ReturnType>; + + beforeEach(async () => { + await repo.clear(); + queueName = `test-outputstream-${uuid4()}`; + storage = new InMemoryQueueStorage(queueName); + await storage.migrate(); + queueParts = wrapQueueStorage(storage); + server = new JobQueueServer(RefProducingJob, { + messageQueue: queueParts.messageQueue, + jobStore: queueParts.jobStore, + queueName, + pollIntervalMs: 1, + stopTimeoutMs: 0, + }); + await server.start(); + }); + + afterEach(async () => { + if (server) await server.stop(); + if (storage) await storage.deleteAll(); + }); + + it("streams a completed job's binary output out of the cache by port", async () => { + const client = new JobQueueClient({ + messageQueue: queueParts.messageQueue, + jobStore: queueParts.jobStore, + queueName, + outputStreamResolver: makeJobOutputStreamResolver(repo), + }); + client.attach(server); + + const handle = 
await client.send({ id: uuid4() }); + expect(typeof handle.outputStream).toBe("function"); + + const stream = await handle.outputStream!("file"); + expect(stream).toBeDefined(); + expect(await collect(stream!)).toEqual([10, 20, 30]); + + const transcript = await handle.outputStream!("transcript"); + expect(await collect(transcript!)).toEqual([7]); + }); + + it("rejects portless discovery when the output holds two refs", async () => { + const client = new JobQueueClient({ + messageQueue: queueParts.messageQueue, + jobStore: queueParts.jobStore, + queueName, + outputStreamResolver: makeJobOutputStreamResolver(repo), + }); + client.attach(server); + + const handle = await client.send({ id: uuid4() }); + await expect(handle.outputStream!()).rejects.toThrow(/explicit port/); + }); + + it("is absent when the client has no outputStreamResolver", async () => { + const client = new JobQueueClient({ + messageQueue: queueParts.messageQueue, + jobStore: queueParts.jobStore, + queueName, + }); + client.attach(server); + + const handle = await client.send({ id: uuid4() }); + expect(handle.outputStream).toBeUndefined(); + await handle.waitFor(); + }); +}); diff --git a/packages/test/src/test/job-queue/JobQueueStream.test.ts b/packages/test/src/test/job-queue/JobQueueStream.test.ts new file mode 100644 index 000000000..9523d94f5 --- /dev/null +++ b/packages/test/src/test/job-queue/JobQueueStream.test.ts @@ -0,0 +1,111 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { IJobExecuteContext, StreamEventLike } from "@workglow/job-queue"; +import { + InMemoryQueueStorage, + Job, + JobQueueClient, + JobQueueServer, + wrapQueueStorage, +} from "@workglow/job-queue"; +import { uuid4 } from "@workglow/util"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +interface SInput { + readonly [key: string]: unknown; +} +interface SOutput { + readonly ok: true; + readonly [key: string]: unknown; +} + +/** + * 
A job that emits a few stream events during execution via the OPTIONAL + * `emitStreamEvent` context hook, then returns a result. Two ordered + * `binary-delta` chunks followed by a `finish` exercise both binary payload + * delivery and ordering across the same-process server-attached channel. + */ +class StreamEmittingJob extends Job { + public override async execute(_input: SInput, context: IJobExecuteContext): Promise { + context.emitStreamEvent?.({ + type: "binary-delta", + port: "bytes", + binaryDelta: new Uint8Array([1, 2]), + }); + context.emitStreamEvent?.({ + type: "binary-delta", + port: "bytes", + binaryDelta: new Uint8Array([3]), + }); + context.emitStreamEvent?.({ type: "finish", data: {} }); + return { ok: true }; + } +} + +// Same-process server-attached harness, mirroring genericJobQueueTests.ts: +// InMemory queue storage + JobQueueServer + JobQueueClient, with the client +// attached to the server (`client.attach(server)`) so the client's `this.server` +// is set and `JobHandle.onStream` is present. These same-process queue tests run +// unconditionally in the repo (see InMemoryJobQueue.test.ts), so this suite is +// not gated behind any RUN_QUEUE_TESTS flag. +describe("job-queue stream delivery (same-process)", () => { + let server: JobQueueServer; + let client: JobQueueClient; + let storage: InMemoryQueueStorage; + let queueName: string; + + beforeEach(async () => { + queueName = `test-stream-${uuid4()}`; + storage = new InMemoryQueueStorage(queueName); + await storage.migrate(); + + const { messageQueue, jobStore } = wrapQueueStorage(storage); + server = new JobQueueServer(StreamEmittingJob, { + messageQueue, + jobStore, + queueName, + pollIntervalMs: 1, + stopTimeoutMs: 0, + }); + client = new JobQueueClient({ messageQueue, jobStore, queueName }); + // Attach for same-process optimization → sets client.server → enables onStream. 
+ client.attach(server); + }); + + afterEach(async () => { + if (server) await server.stop(); + if (storage) await storage.deleteAll(); + }); + + it("delivers stream events in order via handle.onStream", async () => { + await server.start(); + + const handle = await client.send({ taskType: "stream" }); + + // onStream is present only on a server-attached handle (capability gate). + expect(typeof handle.onStream).toBe("function"); + + const received: StreamEventLike[] = []; + const cleanup = handle.onStream!((event) => { + received.push(event); + }); + + const output = await handle.waitFor(); + cleanup(); + + expect(output).toEqual({ ok: true }); + + // Events arrived in emission order. + expect(received.map((e) => e.type)).toEqual(["binary-delta", "binary-delta", "finish"]); + + // Binary payloads preserved byte-for-byte across the channel. + const firstBytes = received[0].binaryDelta as Uint8Array; + const secondBytes = received[1].binaryDelta as Uint8Array; + expect(Array.from(firstBytes)).toEqual([1, 2]); + expect(Array.from(secondBytes)).toEqual([3]); + }); +}); diff --git a/packages/test/src/test/job-queue/JobQueueStreamWorker.integration.test.ts b/packages/test/src/test/job-queue/JobQueueStreamWorker.integration.test.ts new file mode 100644 index 000000000..5e5a26d9c --- /dev/null +++ b/packages/test/src/test/job-queue/JobQueueStreamWorker.integration.test.ts @@ -0,0 +1,93 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { Worker } from "node:worker_threads"; +import { afterEach, describe, expect, it } from "vitest"; + +/** + * Node primitive validation: structured-clone + transferable buffers across a + * `worker_threads` boundary. Navigational marker for a future cross-thread + * queue host — NOT a test of current `@workglow/job-queue` behavior. 
+ * + * SCOPE — read before changing this test: today, `@workglow/job-queue`'s own + * stream channel (`IJobExecuteContext.emitStreamEvent` → worker `job_stream` + * event → `JobQueueServer.forwardToClients("handleJobStream", …)` → + * `JobQueueClient` → `JobHandle.onStream`) is entirely SAME-PROCESS: the + * `JobQueueWorker` runs in-process inside `JobQueueServer`, and + * `forwardToClients` invokes attached-client methods directly (no postMessage, + * no worker thread, no transferables). Cross-PROCESS coordination is handled + * by the message-queue storage layer via `IMessageQueue.subscribeToChanges` + * with serialized rows — also not a transferables path. There is no + * `WorkerManager`-hosted queue transport anywhere in the package. The actual + * same-process delivery path is proven by JobQueueStream.test.ts. + * + * This test therefore exercises the underlying Node primitive that a future + * `WorkerServer`-hosted queue would have to rely on: binary chunks emitted + * from a worker thread can be TRANSFERRED (not copied) to the host via + * `postMessage`, which is what `WorkerServerBase.extractTransferables` + * (packages/util/src/worker/WorkerServerBase.ts ~line 30) walks payloads to + * arrange. Note that `WorkerServerBase` currently applies that walk only in + * `postResult` (terminal complete message), not in `postStreamChunk` — so + * even on that boundary, incremental chunks are structured-cloned today; this + * test validates that the transfer semantics work for the binary-delta payload + * shape if anyone later wires them up. The worker emits two `binary-delta` + * events across the thread boundary (see jobQueueStreamWorker.fixture.mjs); + * the host receives the full byte sequence in order, and the worker's + * transferred views detach (`byteLength` becomes 0).
Run under Node (vitest's + * default pool); bun's worker_threads does not detach transferred buffers, + * which is why this is an `.integration` test executed under the Node ABI per + * libs/.claude/CLAUDE.md. + */ +describe("worker_threads transfer mechanism — payload validation for a future cross-thread queue host (not current job-queue code)", () => { + let worker: Worker | undefined; + + afterEach(async () => { + if (worker) { + await worker.terminate(); + worker = undefined; + } + }); + + it("host receives the full byte sequence in order; worker buffers detach", async () => { + const fixtureUrl = new URL("./jobQueueStreamWorker.fixture.mjs", import.meta.url); + worker = new Worker(fixtureUrl); + + const received: Array<{ type: string; port?: string; binaryDelta?: Uint8Array }> = []; + + const done = await new Promise<{ firstByteLength: number; secondByteLength: number }>( + (resolve, reject) => { + worker!.on("error", reject); + worker!.on("message", (msg: Record) => { + // The host plays the role of `JobHandle.onStream` listener: collect + // every stream event the worker emits across the thread boundary. + if (msg.type === "binary-delta" || msg.type === "finish") { + received.push(msg as { type: string; port?: string; binaryDelta?: Uint8Array }); + } else if (msg.type === "done") { + resolve(msg as { firstByteLength: number; secondByteLength: number }); + } + }); + worker!.postMessage("start"); + } + ); + + // Events arrived in emission order across the thread boundary. + expect(received.map((e) => e.type)).toEqual(["binary-delta", "binary-delta", "finish"]); + + // Host received the full byte sequence in order across the two events. + const hostBytes = received + .filter((e) => e.type === "binary-delta") + .flatMap((e) => Array.from(e.binaryDelta as Uint8Array)); + expect(hostBytes).toEqual([1, 2, 3]); + + // Detachment: the worker transferred (did not copy) each chunk buffer, so + // its own views are now detached (byteLength === 0). 
This is the + // `WorkerServerBase.extractTransferables` behavior. Asserted under Node, + // which detaches transferred buffers per the structured-clone transfer + // semantics the design depends on. + expect(done.firstByteLength).toBe(0); + expect(done.secondByteLength).toBe(0); + }); +}); diff --git a/packages/test/src/test/job-queue/jobQueueStreamWorker.fixture.mjs b/packages/test/src/test/job-queue/jobQueueStreamWorker.fixture.mjs new file mode 100644 index 000000000..2b031a718 --- /dev/null +++ b/packages/test/src/test/job-queue/jobQueueStreamWorker.fixture.mjs @@ -0,0 +1,53 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * Worker-thread fixture for JobQueueStreamWorker.integration.test.ts. + * + * Simulates a job executing inside a real worker thread that emits two ordered + * `binary-delta` stream events. Each chunk is posted to the host with the + * chunk's underlying `ArrayBuffer` in the transfer list — the exact mechanism + * `WorkerServerBase.extractTransferables` (packages/util/src/worker/WorkerServerBase.ts) + * applies automatically when a TypedArray crosses a worker boundary. The + * transfer (rather than copy) detaches the worker's view of the buffer, which + * the host asserts via the reported `byteLength` values in the terminal "done" + * message (a stand-in for the job result the job would otherwise return). + * + * Plain `.mjs` (not `.ts`) so it can be launched directly by `worker_threads` + * under the Node runtime vitest uses, without a TypeScript transform step. + */ + +import { parentPort } from "node:worker_threads"; + +if (!parentPort) { + throw new Error("jobQueueStreamWorker.fixture.mjs must run as a worker thread"); +} + +parentPort.on("message", () => { + // Two ordered binary-delta chunks, then a finish — mirrors a job calling + // ctx.emitStreamEvent?.(...) during execution. 
+ const first = new Uint8Array([1, 2]); + const second = new Uint8Array([3]); + + parentPort.postMessage( + { type: "binary-delta", port: "bytes", binaryDelta: first }, + [first.buffer] + ); + parentPort.postMessage( + { type: "binary-delta", port: "bytes", binaryDelta: second }, + [second.buffer] + ); + parentPort.postMessage({ type: "finish", data: {} }); + + // Report the worker-side byteLength of the retained chunk views AFTER they + // were transferred. A genuine transfer detaches the underlying buffer, so + // these are 0 under Node. (Stands in for the job result.) + parentPort.postMessage({ + type: "done", + firstByteLength: first.byteLength, + secondByteLength: second.byteLength, + }); +}); diff --git a/packages/test/src/test/task-graph-output-cache/FsFolderConcurrentWrite.integration.test.ts b/packages/test/src/test/task-graph-output-cache/FsFolderConcurrentWrite.integration.test.ts new file mode 100644 index 000000000..bf31fa559 --- /dev/null +++ b/packages/test/src/test/task-graph-output-cache/FsFolderConcurrentWrite.integration.test.ts @@ -0,0 +1,115 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { setLogger } from "@workglow/util"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { FsFolderTaskOutputRepository } from "../../binding/FsFolderTaskOutputRepository"; +import { getTestingLogger } from "../../binding/TestingLogger"; +// Re-use the package surface for `makeCacheRef` so the legacy-compat test can +// synthesize a ref pointing at a file the test placed on disk directly. 
+import { makeCacheRef } from "@workglow/task-graph"; + +async function* once(bytes: Uint8Array): AsyncIterable { + yield bytes; +} + +const refToBlobName = (ref: { $ref: string }): string => { + const m = /^fsfolder:\/\/blobs\/([^/]+)$/.exec(ref.$ref); + if (!m) throw new Error(`unexpected ref shape: ${ref.$ref}`); + return m[1]!; +}; + +describe("FsFolderTaskOutputRepository concurrent-write safety", () => { + setLogger(getTestingLogger()); + let folder: string; + + beforeEach(() => { + folder = mkdtempSync(join(tmpdir(), "fsfolder-concurrent-")); + }); + + afterEach(() => { + try { + rmSync(folder, { recursive: true, force: true }); + } catch {} + }); + + it("two concurrent writers with the same (taskType, inputs) produce distinct refs that both resolve", async () => { + const repo = new FsFolderTaskOutputRepository(folder); + const taskType = "SameInputs"; + const inputs = { key: "shared" }; + const payloadA = new Uint8Array([1, 1, 1]); + const payloadB = new Uint8Array([2, 2, 2]); + const [refA, refB] = await Promise.all([ + repo.saveOutputStream(taskType, inputs, once(payloadA), {}), + repo.saveOutputStream(taskType, inputs, once(payloadB), {}), + ]); + // Distinct paths — no race on a shared blob name. + expect(refA.$ref).not.toBe(refB.$ref); + // Sanitized-taskType prefix preserved so prefix-scoped pruning still + // cascades to both blobs. + expect(refToBlobName(refA)).toMatch(/^SameInputs_/); + expect(refToBlobName(refB)).toMatch(/^SameInputs_/); + // Both blobs are independently readable. 
+ const blobA = await repo.getOutputByRef(refA); + const blobB = await repo.getOutputByRef(refB); + expect(blobA).toBeDefined(); + expect(blobB).toBeDefined(); + expect(Array.from(new Uint8Array(await blobA!.arrayBuffer()))).toEqual(Array.from(payloadA)); + expect(Array.from(new Uint8Array(await blobB!.arrayBuffer()))).toEqual(Array.from(payloadB)); + }); + + it("A's failed-row cleanup (deleteOutputByRef) does not delete B's blob", async () => { + const repo = new FsFolderTaskOutputRepository(folder); + const taskType = "RaceCleanup"; + const inputs = { id: 1 }; + // Writer A runs, gets a ref. + const payloadA = new Uint8Array([9, 9, 9]); + const refA = await repo.saveOutputStream(taskType, inputs, once(payloadA), {}); + // Writer B runs (same inputs), gets a different ref because of the + // per-write UUID suffix. + const payloadB = new Uint8Array([4, 5, 6]); + const refB = await repo.saveOutputStream(taskType, inputs, once(payloadB), {}); + expect(refA.$ref).not.toBe(refB.$ref); + // Simulate A's row-commit failure cleanup: delete A's blob. + await repo.deleteOutputByRef(refA); + // A's blob is gone … + const blobA = await repo.getOutputByRef(refA); + expect(blobA).toBeUndefined(); + // … but B's blob is still readable, with the right bytes. This is the + // race the unique suffix exists to fix: without it, A and B would have + // collided on a single path and A's cleanup would have unlinked B's + // payload. + const blobB = await repo.getOutputByRef(refB); + expect(blobB).toBeDefined(); + expect(Array.from(new Uint8Array(await blobB!.arrayBuffer()))).toEqual(Array.from(payloadB)); + }); + + it("backward compat: a legacy un-suffixed blob is still resolvable through getOutputByRef", async () => { + const repo = new FsFolderTaskOutputRepository(folder); + // Synthesize a legacy blob path: `_.bin` (no UUID + // suffix). The exact fingerprint doesn't matter — we just need a name + // that matches the public ref pattern. 
+ const legacyName = "LegacyTask_abcdef0123456789.bin"; + const blobsDir = join(folder, "blobs"); + // First, force the repo to lazily create the blobs dir by issuing one + // write so the directory exists; then drop a legacy file in there. + await repo.saveOutputStream("Bootstrap", {}, once(new Uint8Array([0])), {}); + const legacyBytes = new Uint8Array([7, 7, 7, 7]); + writeFileSync(join(blobsDir, legacyName), legacyBytes); + const legacyRef = makeCacheRef({ + $ref: `fsfolder://blobs/${legacyName}`, + size: legacyBytes.byteLength, + }); + const legacyBlob = await repo.getOutputByRef(legacyRef); + expect(legacyBlob).toBeDefined(); + expect(Array.from(new Uint8Array(await legacyBlob!.arrayBuffer()))).toEqual( + Array.from(legacyBytes) + ); + }); +}); diff --git a/packages/test/src/test/task-graph-output-cache/FsFolderDirSync.test.ts b/packages/test/src/test/task-graph-output-cache/FsFolderDirSync.test.ts new file mode 100644 index 000000000..98de84610 --- /dev/null +++ b/packages/test/src/test/task-graph-output-cache/FsFolderDirSync.test.ts @@ -0,0 +1,76 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import { setLogger } from "@workglow/util"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, it } from "vitest"; +import { FsFolderTaskOutputRepository } from "../../binding/FsFolderTaskOutputRepository"; +import { getTestingLogger } from "../../binding/TestingLogger"; + +async function* once(bytes: Uint8Array): AsyncIterable { + yield bytes; +} + +/** + * Integration test for the dir-fsync addition on the blob write path. + * + * The directory `open()` + `sync()` runs unconditionally after every rename + * to keep the parent directory's metadata durable on `data=ordered` ext4 (so + * a crash between the rename and the directory flush cannot publish a name + * pointing at zero bytes). 
We can't easily simulate the unsupported-FS error + * codes (`EPERM` / `EINVAL` / `ENOTSUP` / `EISDIR`) on a Linux tmp dir, so + * these tests cover the happy path end-to-end: + * + * - the dir-sync code executes on every write without raising, + * - bytes round-trip through `getOutputByRef` (proving the rename landed), + * - many concurrent writes all complete without any dir-sync interleaving + * failure. + */ +describe("FsFolderTaskOutputRepository dir fsync (integration)", () => { + setLogger(getTestingLogger()); + let folder: string; + + beforeEach(() => { + folder = mkdtempSync(join(tmpdir(), "fsfolder-dirsync-")); + }); + + afterEach(() => { + try { + rmSync(folder, { recursive: true, force: true }); + } catch {} + }); + + it("saveOutputStream resolves and bytes round-trip with dir-sync enabled", async () => { + const repo = new FsFolderTaskOutputRepository(folder); + const payload = new Uint8Array([1, 2, 3, 4, 5]); + const ref = await repo.saveOutputStream("RoundTrip", { k: 1 }, once(payload), { + mime: "application/octet-stream", + }); + expect(ref.$ref).toMatch(/^fsfolder:\/\/blobs\/RoundTrip_/); + const blob = await repo.getOutputByRef(ref); + expect(blob).toBeDefined(); + const bytes = new Uint8Array(await blob!.arrayBuffer()); + expect(Array.from(bytes)).toEqual(Array.from(payload)); + }); + + it("16 concurrent writes complete (dir-sync does not serialize incorrectly)", async () => { + const repo = new FsFolderTaskOutputRepository(folder); + const refs = await Promise.all( + Array.from({ length: 16 }, (_, i) => + repo.saveOutputStream("Concurrent", { i }, once(new Uint8Array([i])), {}) + ) + ); + expect(refs).toHaveLength(16); + for (let i = 0; i < refs.length; i++) { + const blob = await repo.getOutputByRef(refs[i]!); + expect(blob).toBeDefined(); + const bytes = new Uint8Array(await blob!.arrayBuffer()); + expect(Array.from(bytes)).toEqual([i]); + } + }); +}); diff --git a/packages/test/src/test/task-graph/BinaryCacheIntegrity.test.ts 
b/packages/test/src/test/task-graph/BinaryCacheIntegrity.test.ts new file mode 100644 index 000000000..ab89f2136 --- /dev/null +++ b/packages/test/src/test/task-graph/BinaryCacheIntegrity.test.ts @@ -0,0 +1,248 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ +import type { StreamEvent } from "@workglow/task-graph"; +import { + CACHE_REGISTRY, + DefaultCacheRegistry, + FsFolderTaskOutputRepository, + IExecuteContext, + isCacheRef, + StreamPump, + Task, + TaskGraph, + TaskGraphRunner, +} from "@workglow/task-graph"; +import { Container, ServiceRegistry, sleep } from "@workglow/util"; +import type { DataPortSchema } from "@workglow/util/schema"; +import { mkdtemp } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { describe, expect, it } from "vitest"; +import { InMemoryTaskOutputRepository } from "../../binding/InMemoryTaskOutputRepository"; +import { StreamingMemoryRepo } from "../../binding/StreamingMemoryRepo"; + +async function blobBytes(value: unknown): Promise { + expect(value).toBeInstanceOf(Blob); + return Array.from(new Uint8Array(await (value as Blob).arrayBuffer())); +} + +type SmallOut = { bytes: Blob | ArrayBuffer }; + +let smallExecutions = 0; + +/** Small (5-byte) binary producer — well below the default 64 KiB threshold. 
*/
+class SmallBlobStreamTask extends Task, SmallOut> {
+  /*
+   * NOTE(review): `extends Task, SmallOut>` above, and `_input: Record` /
+   * `AsyncIterable>` below, look truncated — generic type arguments appear to
+   * have been stripped from this patch text. Confirm against the real file.
+   */
+  public static override type = "BinaryCacheIntegrity_Small";
+  public static override category = "Test";
+  public static override cacheable = true;
+
+  /** Input schema: an object with no properties; additional properties are disallowed. */
+  public static override inputSchema(): DataPortSchema {
+    return { type: "object", properties: {}, additionalProperties: false } as const;
+  }
+
+  /**
+   * Output schema: a single `bytes` property declared as an object with
+   * `format: "blob"` and `"x-stream": "binary"`; no additional properties.
+   */
+  public static override outputSchema(): DataPortSchema {
+    return {
+      type: "object",
+      properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } },
+      additionalProperties: false,
+    } as const satisfies DataPortSchema;
+  }
+
+  /**
+   * Bumps the module-level `smallExecutions` counter, then yields two
+   * `binary-delta` events on port `bytes` ([1, 2, 3], then [4, 5]) with an
+   * `await sleep(1)` between them, and finally a `finish` event whose `data`
+   * is an empty object cast to `SmallOut`. Both parameters are unused.
+   */
+  async *executeStream(
+    _input: Record,
+    _ctx: IExecuteContext
+  ): AsyncIterable> {
+    smallExecutions++;
+    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([1, 2, 3]) };
+    await sleep(1);
+    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([4, 5]) };
+    yield { type: "finish", data: {} as SmallOut };
+  }
+}
+
+/* Output type with two fields, `a` and `b`, each a `Blob` or an `ArrayBuffer`. */
+type TwoPortOut = { a: Blob | ArrayBuffer; b: Blob | ArrayBuffer };
+
+/** Streams two independent binary ports in one run.
*/ +class TwoPortBinarySource extends Task, TwoPortOut> { + public static override type = "BinaryCacheIntegrity_TwoPort"; + public static override category = "Test"; + public static override cacheable = true; + + public static override inputSchema(): DataPortSchema { + return { type: "object", properties: {}, additionalProperties: false } as const; + } + + public static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { + a: { type: "object", format: "blob", "x-stream": "binary" }, + b: { type: "object", format: "blob", "x-stream": "binary" }, + }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + async *executeStream( + _input: Record, + _ctx: IExecuteContext + ): AsyncIterable> { + yield { type: "binary-delta", port: "a", binaryDelta: new Uint8Array([1, 2]) }; + yield { type: "binary-delta", port: "b", binaryDelta: new Uint8Array([9, 8, 7]) }; + yield { type: "finish", data: {} as TwoPortOut }; + } +} + +describe("binary cache integrity", () => { + it("default-threshold small outputs survive a tabular (JSON-row) cache round-trip", async () => { + smallExecutions = 0; + const folder = await mkdtemp(join(tmpdir(), "wg-cache-integrity-")); + const repo = new FsFolderTaskOutputRepository(folder); + const services = new ServiceRegistry(new Container()); + services.registerInstance(CACHE_REGISTRY, new DefaultCacheRegistry({ deterministic: repo })); + + const out1 = await new SmallBlobStreamTask().run({}, { registry: services }); + expect(await blobBytes(out1.bytes)).toEqual([1, 2, 3, 4, 5]); + expect(smallExecutions).toBe(1); + + // Second run must be a cache hit AND return the original bytes — not a + // JSON-mangled `{}` from stringifying an inline Blob into the row. 
+ const out2 = await new SmallBlobStreamTask().run({}, { registry: services }); + expect(smallExecutions).toBe(1); + expect(await blobBytes(out2.bytes)).toEqual([1, 2, 3, 4, 5]); + }); + + it("multi-binary-port tasks fall back to accumulation — no port is dropped", async () => { + const repo = new StreamingMemoryRepo({}); + + const graph = new TaskGraph(); + const source = new TwoPortBinarySource({ id: "source" }); + graph.addTask(source); + // Legacy direct-cache config: this is the path where taskNeedsAccumulation + // consults canStreamBinaryToCache and may skip accumulation entirely. + const results = await new TaskGraphRunner(graph).runGraph({}, { outputCache: repo }); + + const data = results.find((r) => r.id === "source")!.data as TwoPortOut; + expect(await blobBytes(data.a)).toEqual([1, 2]); + expect(await blobBytes(data.b)).toEqual([9, 8, 7]); + }); + + it("canStreamBinaryToCache rejects tasks with more than one binary port", () => { + const graph = new TaskGraph(); + const source = new TwoPortBinarySource({ id: "source" }); + graph.addTask(source); + expect(StreamPump.canStreamBinaryToCache(graph, source, new StreamingMemoryRepo({}))).toBe( + false + ); + }); +}); + +type InlineOut = { bytes: Blob | ArrayBuffer }; + +let inlineBlobExecutions = 0; + +/** Non-streaming binary producer: plain execute() returning an inline Blob. 
*/ +class InlineBlobTask extends Task, InlineOut> { + public static override type = "BinaryCacheIntegrity_InlineBlob"; + public static override category = "Test"; + public static override cacheable = true; + + public static override inputSchema(): DataPortSchema { + return { type: "object", properties: {}, additionalProperties: false } as const; + } + + public static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { type: "object", format: "blob" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + override async execute(): Promise { + inlineBlobExecutions++; + return { + bytes: new Blob([new Uint8Array([1, 2, 3, 4])], { type: "application/octet-stream" }), + }; + } +} + +let inlineBufferExecutions = 0; + +/** Non-streaming binary producer returning an inline ArrayBuffer. */ +class InlineBufferTask extends Task, InlineOut> { + public static override type = "BinaryCacheIntegrity_InlineBuffer"; + public static override category = "Test"; + public static override cacheable = true; + + public static override inputSchema(): DataPortSchema { + return { type: "object", properties: {}, additionalProperties: false } as const; + } + + public static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { type: "object", format: "binary" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + override async execute(): Promise { + inlineBufferExecutions++; + return { bytes: new Uint8Array([5, 6, 7]).buffer }; + } +} + +describe("inline binary cache serialization (BinaryPortCodec)", () => { + it("Blob round-trips through a non-streaming JSON-row backing", async () => { + inlineBlobExecutions = 0; + const repo = new InMemoryTaskOutputRepository(); + const services = new ServiceRegistry(new Container()); + services.registerInstance(CACHE_REGISTRY, new DefaultCacheRegistry({ deterministic: repo })); + + const out1 = await new 
InlineBlobTask().run({}, { registry: services }); + expect(await blobBytes(out1.bytes)).toEqual([1, 2, 3, 4]); + expect(inlineBlobExecutions).toBe(1); + + const out2 = await new InlineBlobTask().run({}, { registry: services }); + expect(inlineBlobExecutions).toBe(1); // cache hit + expect(await blobBytes(out2.bytes)).toEqual([1, 2, 3, 4]); + expect((out2.bytes as Blob).type).toBe("application/octet-stream"); + }); + + it("ArrayBuffer round-trips through a non-streaming JSON-row backing", async () => { + inlineBufferExecutions = 0; + const repo = new InMemoryTaskOutputRepository(); + const services = new ServiceRegistry(new Container()); + services.registerInstance(CACHE_REGISTRY, new DefaultCacheRegistry({ deterministic: repo })); + + const out1 = await new InlineBufferTask().run({}, { registry: services }); + expect(out1.bytes).toBeInstanceOf(ArrayBuffer); + expect(Array.from(new Uint8Array(out1.bytes as ArrayBuffer))).toEqual([5, 6, 7]); + + const out2 = await new InlineBufferTask().run({}, { registry: services }); + expect(inlineBufferExecutions).toBe(1); // cache hit + expect(out2.bytes).toBeInstanceOf(ArrayBuffer); + expect(Array.from(new Uint8Array(out2.bytes as ArrayBuffer))).toEqual([5, 6, 7]); + }); + + it("streaming backings still store a CacheRef row, not a BinaryPortWire", async () => { + const repo = new StreamingMemoryRepo({}); + const services = new ServiceRegistry(new Container()); + services.registerInstance(CACHE_REGISTRY, new DefaultCacheRegistry({ deterministic: repo })); + + await new SmallBlobStreamTask().run({}, { registry: services, referenceThresholdBytes: 0 }); + + const rows = Array.from( + (repo as unknown as { store: Map> }).store.values() + ); + // StreamingMemoryRepo stores rows verbatim; the binary port slot must be + // the branded ref envelope — the codec passes refs through untouched. 
+ expect(rows).toHaveLength(1); + expect(isCacheRef(rows[0].bytes)).toBe(true); + expect((rows[0].bytes as Record).__binaryPortWire).toBeUndefined(); + }); +}); diff --git a/packages/test/src/test/task-graph/CacheHitReplay.test.ts b/packages/test/src/test/task-graph/CacheHitReplay.test.ts new file mode 100644 index 000000000..5e7da1a7e --- /dev/null +++ b/packages/test/src/test/task-graph/CacheHitReplay.test.ts @@ -0,0 +1,262 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ +import type { CachePolicy, StreamEvent } from "@workglow/task-graph"; +import { + CACHE_REGISTRY, + Dataflow, + DefaultCacheRegistry, + FsFolderTaskOutputRepository, + IExecuteContext, + isCacheRef, + Task, + TaskGraph, + TaskGraphRunner, + TaskStatus, +} from "@workglow/task-graph"; +import { Container, ServiceRegistry, sleep } from "@workglow/util"; +import type { DataPortSchema } from "@workglow/util/schema"; +import { mkdtemp } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { beforeEach, describe, expect, it, vi } from "vitest"; +import { StreamingMemoryRepo } from "../../binding/StreamingMemoryRepo"; + +type BinOut = { bytes: Blob | ArrayBuffer }; +type SinkInput = { bytes: Blob | ArrayBuffer }; +type SinkOutput = { text: string }; + +let producerExecutions = 0; + +/** Cacheable binary producer: yields two delta chunks (5 bytes total). 
*/
+class ReplayProducer extends Task, BinOut> {
+  /*
+   * NOTE(review): `extends Task, BinOut>` above, and `_input: Record` /
+   * `AsyncIterable>` below, look truncated — generic type arguments appear to
+   * have been stripped from this patch text. Confirm against the real file.
+   */
+  public static override type = "CacheHitReplay_Producer";
+  public static override category = "Test";
+  public static override cacheable = true;
+
+  /** Input schema: an object with no properties; additional properties are disallowed. */
+  public static override inputSchema(): DataPortSchema {
+    return { type: "object", properties: {}, additionalProperties: false } as const;
+  }
+
+  /**
+   * Output schema: a single `bytes` property declared as an object with
+   * `format: "blob"` and `"x-stream": "binary"`; no additional properties.
+   */
+  public static override outputSchema(): DataPortSchema {
+    return {
+      type: "object",
+      properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } },
+      additionalProperties: false,
+    } as const satisfies DataPortSchema;
+  }
+
+  /**
+   * Bumps the module-level `producerExecutions` counter, then yields two
+   * `binary-delta` events on port `bytes` ([1, 2, 3], then [4, 5]) with an
+   * `await sleep(1)` between them, and finally a `finish` event whose `data`
+   * is an empty object cast to `BinOut`. Both parameters are unused.
+   */
+  async *executeStream(
+    _input: Record,
+    _ctx: IExecuteContext
+  ): AsyncIterable> {
+    producerExecutions++;
+    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([1, 2, 3]) };
+    await sleep(1);
+    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([4, 5]) };
+    yield { type: "finish", data: {} as BinOut };
+  }
+}
+
+/**
+ * Streaming consumer: reads the binary input stream from
+ * `this.runner.inputStreams` and records every delta it sees.
+ */
+class CollectingStreamConsumer extends Task {
+  /*
+   * NOTE(review): `extends Task {` above and `AsyncIterable>` below look
+   * truncated — generic type arguments appear to have been stripped from this
+   * patch text. Confirm against the real file.
+   */
+  public static override type = "CacheHitReplay_StreamConsumer";
+  public static override category = "Test";
+  /* Presumably opts this consumer out of output caching — confirm against `CachePolicy`. */
+  public static override cachePolicy: CachePolicy = { kind: "none" };
+
+  /** Number of `binary-delta` events read from the `bytes` input stream. */
+  public deltaCount = 0;
+  /** Every byte carried by those deltas, appended in the order it was read. */
+  public collected: number[] = [];
+
+  /**
+   * Input schema: a single `bytes` property declared as an object with
+   * `format: "blob"` and `"x-stream": "binary"`; no additional properties.
+   */
+  public static override inputSchema(): DataPortSchema {
+    return {
+      type: "object",
+      properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } },
+      additionalProperties: false,
+    } as const satisfies DataPortSchema;
+  }
+
+  /**
+   * Output schema: a single string property `text` marked `"x-stream": "append"`;
+   * no additional properties.
+   */
+  public static override outputSchema(): DataPortSchema {
+    return {
+      type: "object",
+      properties: { text: { type: "string", "x-stream": "append" } },
+      additionalProperties: false,
+    } as const satisfies DataPortSchema;
+  }
+
+  /**
+   * Looks up the `bytes` entry in `this.runner.inputStreams`. When one exists,
+   * reads it to completion through a reader: each `binary-delta` event
+   * increments `deltaCount` and has its bytes appended to `collected`; events
+   * of any other type are ignored. Afterwards (or immediately, when there is
+   * no stream) yields a single `finish` event whose `text` is
+   * `len:` followed by the number of collected bytes. Both parameters are unused.
+   */
+  async *executeStream(
+    _input: SinkInput,
+    _ctx: IExecuteContext
+  ): AsyncIterable> {
+    const stream = this.runner.inputStreams?.get("bytes");
+    if (stream) {
+      const reader = stream.getReader();
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) break;
+        if (value.type === "binary-delta") {
+          this.deltaCount++;
+          for (const b of value.binaryDelta) this.collected.push(b);
+        }
+      }
+    }
+    yield { type: "finish", data: { text: `len:${this.collected.length}` } };
+  }
+}
+
+/** Materializing consumer: plain blob input port, no `x-stream`.
*/ +class MaterializingConsumer extends Task { + public static override type = "CacheHitReplay_MaterializingConsumer"; + public static override category = "Test"; + public static override cachePolicy: CachePolicy = { kind: "none" }; + + public received: unknown; + + public static override inputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { title: "Bytes" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + public static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { text: { type: "string" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + override async execute(input: SinkInput): Promise { + this.received = input.bytes; + if (input.bytes instanceof Blob) return { text: `blob:${input.bytes.size}` }; + return { text: `other` }; + } +} + +let repo: StreamingMemoryRepo; +let services: ServiceRegistry; + +beforeEach(() => { + producerExecutions = 0; + repo = new StreamingMemoryRepo({}); + services = new ServiceRegistry(new Container()); + services.registerInstance(CACHE_REGISTRY, new DefaultCacheRegistry({ deterministic: repo })); +}); + +function buildStreamingGraph(): { graph: TaskGraph; consumer: CollectingStreamConsumer } { + const graph = new TaskGraph(); + const producer = new ReplayProducer({ id: "producer" }, { referenceThresholdBytes: 0 }); + const consumer = new CollectingStreamConsumer({ id: "consumer" }); + graph.addTasks([producer, consumer]); + graph.addDataflow(new Dataflow("producer", "bytes", "consumer", "bytes")); + return { graph, consumer }; +} + +function buildMaterializingGraph(): { graph: TaskGraph; consumer: MaterializingConsumer } { + const graph = new TaskGraph(); + const producer = new ReplayProducer({ id: "producer" }, { referenceThresholdBytes: 0 }); + const consumer = new MaterializingConsumer({ id: "consumer" }); + graph.addTasks([producer, consumer]); + graph.addDataflow(new Dataflow("producer", "bytes", 
"consumer", "bytes")); + return { graph, consumer }; +} + +describe("cache-hit replay of binary CacheRefs", () => { + it("replays cached bytes as chunked binary-delta events to a streaming consumer", async () => { + const first = buildStreamingGraph(); + await new TaskGraphRunner(first.graph).runGraph({}, { registry: services }); + expect(producerExecutions).toBe(1); + expect(first.consumer.collected).toEqual([1, 2, 3, 4, 5]); + + // Replay reads back in 2-byte chunks → at least 2 deltas on the cache hit. + repo.streamReadChunkSize = 2; + const second = buildStreamingGraph(); + await new TaskGraphRunner(second.graph).runGraph({}, { registry: services }); + + expect(producerExecutions).toBe(1); // cache hit — producer not re-executed + expect(second.consumer.deltaCount).toBeGreaterThanOrEqual(2); + expect(second.consumer.collected).toEqual([1, 2, 3, 4, 5]); + }); + + it("hydrates the ref into the enriched finish for a materializing consumer", async () => { + const first = buildMaterializingGraph(); + await new TaskGraphRunner(first.graph).runGraph({}, { registry: services }); + expect(producerExecutions).toBe(1); + expect(first.consumer.received).toBeInstanceOf(Blob); + + const second = buildMaterializingGraph(); + await new TaskGraphRunner(second.graph).runGraph({}, { registry: services }); + + expect(producerExecutions).toBe(1); // cache hit + expect(second.consumer.received).toBeInstanceOf(Blob); + const bytes = new Uint8Array(await (second.consumer.received as Blob).arrayBuffer()); + expect(Array.from(bytes)).toEqual([1, 2, 3, 4, 5]); + }); + + it("treats a dangling ref as a cache miss and re-executes (self-healing)", async () => { + const first = buildMaterializingGraph(); + await new TaskGraphRunner(first.graph).runGraph({}, { registry: services }); + expect(producerExecutions).toBe(1); + + // Keep the JSON row (with its ref) but delete the bytes behind it. 
+ repo.streamed.clear(); + + const second = buildMaterializingGraph(); + await new TaskGraphRunner(second.graph).runGraph({}, { registry: services }); + + expect(producerExecutions).toBe(2); // re-executed, cache rewritten + expect(second.consumer.received).toBeInstanceOf(Blob); + const bytes = new Uint8Array(await (second.consumer.received as Blob).arrayBuffer()); + expect(Array.from(bytes)).toEqual([1, 2, 3, 4, 5]); + }); + + it("leaves the ref untouched and performs no reads when there are no consumers", async () => { + const buildLeafGraph = () => { + const graph = new TaskGraph(); + graph.addTask(new ReplayProducer({ id: "producer" }, { referenceThresholdBytes: 0 })); + return graph; + }; + await new TaskGraphRunner(buildLeafGraph()).runGraph({}, { registry: services }); + expect(producerExecutions).toBe(1); + + const streamSpy = vi.spyOn(repo, "getOutputStreamByRef"); + const blobSpy = vi.spyOn(repo, "getOutputByRef"); + const graph = buildLeafGraph(); + const results = await new TaskGraphRunner(graph).runGraph({}, { registry: services }); + + expect(producerExecutions).toBe(1); // cache hit + expect(streamSpy).not.toHaveBeenCalled(); + expect(blobSpy).not.toHaveBeenCalled(); + const producerResult = results.find((r) => r.id === "producer"); + expect(isCacheRef((producerResult!.data as Record).bytes)).toBe(true); + expect(graph.getTask("producer")!.status).toBe(TaskStatus.COMPLETED); + }); + + it("replays end-to-end through FsFolderTaskOutputRepository", async () => { + const folder = await mkdtemp(join(tmpdir(), "wg-cache-replay-")); + const fsRepo = new FsFolderTaskOutputRepository(folder); + const fsServices = new ServiceRegistry(new Container()); + fsServices.registerInstance( + CACHE_REGISTRY, + new DefaultCacheRegistry({ deterministic: fsRepo }) + ); + + const first = buildStreamingGraph(); + await new TaskGraphRunner(first.graph).runGraph({}, { registry: fsServices }); + expect(producerExecutions).toBe(1); + expect(first.consumer.collected).toEqual([1, 
2, 3, 4, 5]); + + const second = buildStreamingGraph(); + await new TaskGraphRunner(second.graph).runGraph({}, { registry: fsServices }); + + expect(producerExecutions).toBe(1); // cache hit from disk + expect(second.consumer.deltaCount).toBeGreaterThanOrEqual(1); + expect(second.consumer.collected).toEqual([1, 2, 3, 4, 5]); + }); +}); diff --git a/packages/test/src/test/task-graph/CacheRef.test.ts b/packages/test/src/test/task-graph/CacheRef.test.ts new file mode 100644 index 000000000..430dd8286 --- /dev/null +++ b/packages/test/src/test/task-graph/CacheRef.test.ts @@ -0,0 +1,129 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { CacheRef, IRunConfig } from "@workglow/task-graph"; +import { + CACHE_REF_KIND, + isCacheRef, + makeCacheRef, + REFERENCE_THRESHOLD_BYTES_DEFAULT, + resolveReferenceThreshold, +} from "@workglow/task-graph"; +import { describe, expect, it } from "vitest"; + +describe("isCacheRef", () => { + it("accepts a minimal branded ref carrying only $ref", () => { + const ref: CacheRef = makeCacheRef({ $ref: "cache://k1" }); + expect(isCacheRef(ref)).toBe(true); + }); + + it("accepts a branded ref carrying size and mime hints", () => { + const ref: CacheRef = makeCacheRef({ $ref: "cache://k2", size: 1024, mime: "audio/wav" }); + expect(isCacheRef(ref)).toBe(true); + }); + + it("rejects values without a string $ref", () => { + expect(isCacheRef({ kind: CACHE_REF_KIND })).toBe(false); + expect(isCacheRef({ kind: CACHE_REF_KIND, ref: "cache://k" })).toBe(false); + expect(isCacheRef({ kind: CACHE_REF_KIND, $ref: 42 })).toBe(false); + expect(isCacheRef({ kind: CACHE_REF_KIND, $ref: null })).toBe(false); + }); + + it("rejects values that lack the kind brand even when $ref is a string", () => { + expect(isCacheRef({ $ref: "cache://k" })).toBe(false); + expect(isCacheRef({ $ref: "cache://k", size: 10 })).toBe(false); + }); + + it("rejects values whose kind is not the literal brand", () => { + 
expect(isCacheRef({ kind: "other", $ref: "cache://k" })).toBe(false); + expect(isCacheRef({ kind: 1, $ref: "cache://k" })).toBe(false); + }); + + it("rejects primitives and null", () => { + expect(isCacheRef(null)).toBe(false); + expect(isCacheRef(undefined)).toBe(false); + expect(isCacheRef("cache://k")).toBe(false); + expect(isCacheRef(42)).toBe(false); + expect(isCacheRef(true)).toBe(false); + }); + + it("accepts a branded ref where $ref is the empty string (still string-typed)", () => { + expect(isCacheRef(makeCacheRef({ $ref: "" }))).toBe(true); + }); + + it("does NOT confuse JSON-Schema $ref strings with cache refs", () => { + // Before the kind brand, a shape-only check would have matched any + // {$ref: string} — including JSON-Schema $refs in metadata or attacker + // payloads pointing at other-run cache keys. With the brand, a JSON-Schema + // ref no longer passes isCacheRef and the cache resolver will not walk it. + const jsonSchemaRef = { $ref: "#/$defs/Foo" }; + expect(isCacheRef(jsonSchemaRef)).toBe(false); + }); +}); + +describe("makeCacheRef", () => { + it("brands the constructed object with CACHE_REF_KIND", () => { + const ref = makeCacheRef({ $ref: "cache://x" }); + expect(ref.kind).toBe(CACHE_REF_KIND); + }); + + it("omits size and mime when not supplied", () => { + const ref = makeCacheRef({ $ref: "cache://x" }); + expect("size" in ref).toBe(false); + expect("mime" in ref).toBe(false); + }); + + it("preserves size and mime when supplied", () => { + const ref = makeCacheRef({ $ref: "cache://x", size: 99, mime: "image/png" }); + expect(ref.size).toBe(99); + expect(ref.mime).toBe("image/png"); + }); + + it("survives JSON round-trip and still passes isCacheRef", () => { + const original = makeCacheRef({ $ref: "cache://round-trip", size: 7, mime: "text/plain" }); + const wire = JSON.stringify(original); + const received = JSON.parse(wire); + expect(isCacheRef(received)).toBe(true); + expect(received.kind).toBe(CACHE_REF_KIND); + 
expect(received.$ref).toBe("cache://round-trip"); + expect(received.size).toBe(7); + expect(received.mime).toBe("text/plain"); + }); +}); + +describe("resolveReferenceThreshold", () => { + it("returns the default constant when threshold is undefined", () => { + expect(resolveReferenceThreshold(undefined)).toBe(REFERENCE_THRESHOLD_BYTES_DEFAULT); + }); + + it("returns the configured threshold when set to a positive number", () => { + expect(resolveReferenceThreshold(1024)).toBe(1024); + expect(resolveReferenceThreshold(1_000_000)).toBe(1_000_000); + }); + + it("returns 0 when set to 0 (sentinel: always emit a reference)", () => { + expect(resolveReferenceThreshold(0)).toBe(0); + }); + + it("falls back to the default when given a negative value", () => { + expect(resolveReferenceThreshold(-1)).toBe(REFERENCE_THRESHOLD_BYTES_DEFAULT); + }); + + it("the default is 64 KiB", () => { + expect(REFERENCE_THRESHOLD_BYTES_DEFAULT).toBe(65_536); + }); + + it("IRunConfig accepts referenceThresholdBytes as a number", () => { + const cfg: IRunConfig = { referenceThresholdBytes: 0 }; + expect(resolveReferenceThreshold(cfg.referenceThresholdBytes)).toBe(0); + const cfg2: IRunConfig = { referenceThresholdBytes: 2048 }; + expect(resolveReferenceThreshold(cfg2.referenceThresholdBytes)).toBe(2048); + const cfg3: IRunConfig = {}; + expect(resolveReferenceThreshold(cfg3.referenceThresholdBytes)).toBe( + REFERENCE_THRESHOLD_BYTES_DEFAULT + ); + }); +}); diff --git a/packages/test/src/test/task-graph/CacheStreamOut.test.ts b/packages/test/src/test/task-graph/CacheStreamOut.test.ts new file mode 100644 index 000000000..45c7d7fea --- /dev/null +++ b/packages/test/src/test/task-graph/CacheStreamOut.test.ts @@ -0,0 +1,248 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ +import type { CacheRef, TaskOutputRepository } from "@workglow/task-graph"; +import { + byteIterableFromBlob, + FsFolderTaskOutputRepository, + makeCacheRef, + 
RunPrivateCacheRepo, + streamRefViaBacking, +} from "@workglow/task-graph"; +import { existsSync } from "node:fs"; +import { mkdtemp, readdir } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { describe, expect, it } from "vitest"; +import { NonStreamingMemoryRepo, StreamingMemoryRepo } from "../../binding/StreamingMemoryRepo"; + +async function* gen(...chunks: Uint8Array[]): AsyncIterable { + for (const c of chunks) yield c; +} + +async function collect(stream: AsyncIterable): Promise { + const out: number[] = []; + for await (const chunk of stream) for (const b of chunk) out.push(b); + return out; +} + +describe("supportsStreamingReads", () => { + it("reflects presence of getOutputStreamByRef", () => { + expect(new StreamingMemoryRepo({}).supportsStreamingReads()).toBe(true); + expect(new NonStreamingMemoryRepo({}).supportsStreamingReads()).toBe(false); + }); + + it("RunPrivateCacheRepo mirrors the backing's read capability", () => { + const yes = new RunPrivateCacheRepo({ backing: new StreamingMemoryRepo({}), runId: "r" }); + const no = new RunPrivateCacheRepo({ backing: new NonStreamingMemoryRepo({}), runId: "r" }); + expect(yes.supportsStreamingReads()).toBe(true); + expect(no.supportsStreamingReads()).toBe(false); + }); +}); + +describe("byteIterableFromBlob", () => { + it("yields the blob's bytes", async () => { + const blob = new Blob([new Uint8Array([1, 2, 3, 4])]); + expect(await collect(byteIterableFromBlob(blob))).toEqual([1, 2, 3, 4]); + }); +}); + +describe("streamRefViaBacking", () => { + it("prefers getOutputStreamByRef when present", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = await repo.saveOutputStream("T", { k: 1 }, gen(new Uint8Array([5, 6, 7])), {}); + let streamReaderCalled = false; + const wrapped = { + getOutputByRef: (r: CacheRef) => repo.getOutputByRef(r), + getOutputStreamByRef: (r: CacheRef) => { + streamReaderCalled = true; + return 
repo.getOutputStreamByRef(r); + }, + }; + const stream = await streamRefViaBacking(ref, wrapped); + expect(stream).toBeDefined(); + expect(await collect(stream!)).toEqual([5, 6, 7]); + expect(streamReaderCalled).toBe(true); + }); + + it("falls back to getOutputByRef + blob.stream() when no stream reader", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = await repo.saveOutputStream("T", { k: 2 }, gen(new Uint8Array([8, 9])), {}); + const materializingOnly = { getOutputByRef: (r: CacheRef) => repo.getOutputByRef(r) }; + const stream = await streamRefViaBacking(ref, materializingOnly); + expect(stream).toBeDefined(); + expect(await collect(stream!)).toEqual([8, 9]); + }); + + it("returns undefined for a dangling ref", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = await repo.saveOutputStream("T", { k: 3 }, gen(new Uint8Array([1])), {}); + await repo.clear(); + expect(await streamRefViaBacking(ref, repo)).toBeUndefined(); + }); + + it("returns undefined when the backing has no readers at all", async () => { + const ref = makeCacheRef({ $ref: "inmem://nope" }); + expect(await streamRefViaBacking(ref, {})).toBeUndefined(); + }); +}); + +// ============================================================================ +// Stream-out contract suite — run against every streaming-capable repository +// ============================================================================ + +interface ContractSetup { + readonly repo: TaskOutputRepository; + /** A second instance over the same persistent backing (FS-only). 
*/ + readonly sibling?: () => TaskOutputRepository; +} + +function runCacheStreamOutContractTests(name: string, setup: () => Promise): void { + describe(`stream-out contract: ${name}`, () => { + it("round-trips a multi-chunk write through getOutputStreamByRef", async () => { + const { repo } = await setup(); + const ref = await repo.saveOutputStream!( + "T", + { k: 1 }, + gen(new Uint8Array([1, 2]), new Uint8Array([3]), new Uint8Array([4, 5, 6])), + {} + ); + expect(repo.supportsStreamingReads()).toBe(true); + expect(ref.size).toBe(6); + const stream = repo.getOutputStreamByRef!(ref); + expect(stream).toBeDefined(); + expect(await collect(stream!)).toEqual([1, 2, 3, 4, 5, 6]); + }); + + it("round-trips through the materializing reader too", async () => { + const { repo } = await setup(); + const ref = await repo.saveOutputStream!("T", { k: 2 }, gen(new Uint8Array([7, 8])), {}); + const blob = await repo.getOutputByRef!(ref); + expect(blob).toBeInstanceOf(Blob); + expect(Array.from(new Uint8Array(await blob!.arrayBuffer()))).toEqual([7, 8]); + }); + + it("re-writing the same (taskType, inputs) overwrites the previous bytes", async () => { + const { repo } = await setup(); + await repo.saveOutputStream!("T", { k: 3 }, gen(new Uint8Array([1, 2, 3])), {}); + const ref = await repo.saveOutputStream!("T", { k: 3 }, gen(new Uint8Array([9])), {}); + expect(await collect(repo.getOutputStreamByRef!(ref)!)).toEqual([9]); + }); + + it("returns undefined from both readers for an unknown ref", async () => { + const { repo } = await setup(); + const ref = makeCacheRef({ $ref: "fsfolder://blobs/never-written.bin" }); + expect(await repo.getOutputByRef!(ref)).toBeUndefined(); + expect(repo.getOutputStreamByRef!(ref)).toBeUndefined(); + }); + + it("clear() makes previously written refs dangle", async () => { + const { repo } = await setup(); + const ref = await repo.saveOutputStream!("T", { k: 4 }, gen(new Uint8Array([1])), {}); + await repo.clear(); + expect(await 
repo.getOutputByRef!(ref)).toBeUndefined(); + expect(repo.getOutputStreamByRef!(ref)).toBeUndefined(); + }); + + it("a sibling instance over the same backing resolves the ref (cross-process)", async () => { + const { repo, sibling } = await setup(); + if (!sibling) return; // in-memory backings have no cross-instance story + const ref = await repo.saveOutputStream!("T", { k: 5 }, gen(new Uint8Array([4, 2])), {}); + const other = sibling(); + expect(await collect(other.getOutputStreamByRef!(ref)!)).toEqual([4, 2]); + const blob = await other.getOutputByRef!(ref); + expect(Array.from(new Uint8Array(await blob!.arrayBuffer()))).toEqual([4, 2]); + }); + }); +} + +runCacheStreamOutContractTests("StreamingMemoryRepo", async () => ({ + repo: new StreamingMemoryRepo({}), +})); + +runCacheStreamOutContractTests("FsFolderTaskOutputRepository", async () => { + const folder = await mkdtemp(join(tmpdir(), "wg-cache-streamout-")); + return { + repo: new FsFolderTaskOutputRepository(folder), + sibling: () => new FsFolderTaskOutputRepository(folder), + }; +}); + +describe("FsFolderTaskOutputRepository specifics", () => { + it("publishes atomically: no .tmp file remains and the blob is named by the ref", async () => { + const folder = await mkdtemp(join(tmpdir(), "wg-cache-streamout-")); + const repo = new FsFolderTaskOutputRepository(folder); + const ref = await repo.saveOutputStream!("My Task/v2", { k: 1 }, gen(new Uint8Array([1])), { + mime: "application/octet-stream", + }); + expect(ref.mime).toBe("application/octet-stream"); + const files = await readdir(join(folder, "blobs")); + expect(files.some((f) => f.endsWith(".tmp"))).toBe(false); + expect(files).toHaveLength(1); + expect(ref.$ref).toBe(`fsfolder://blobs/${files[0]}`); + expect(existsSync(join(folder, "blobs", files[0]))).toBe(true); + }); + + it("rejects path-traversal shaped refs", async () => { + const folder = await mkdtemp(join(tmpdir(), "wg-cache-streamout-")); + const repo = new 
FsFolderTaskOutputRepository(folder); + const evil = makeCacheRef({ $ref: "fsfolder://blobs/../../etc/passwd.bin" }); + expect(await repo.getOutputByRef!(evil)).toBeUndefined(); + expect(repo.getOutputStreamByRef!(evil)).toBeUndefined(); + }); + + it("clearOlderThan prunes blob files alongside rows", async () => { + const folder = await mkdtemp(join(tmpdir(), "wg-cache-streamout-")); + const repo = new FsFolderTaskOutputRepository(folder); + const ref = await repo.saveOutputStream!("T", { k: 1 }, gen(new Uint8Array([1])), {}); + // Negative age puts the cutoff in the future: everything is "older". + await repo.clearOlderThan(-60_000); + expect(repo.getOutputStreamByRef!(ref)).toBeUndefined(); + }); + + it("deleteByTaskTypePrefix cascades to blob files (run-private cleanup)", async () => { + const folder = await mkdtemp(join(tmpdir(), "wg-cache-streamout-")); + const repo = new FsFolderTaskOutputRepository(folder); + const runRef = await repo.saveOutputStream!( + "__run:abc::T", + { k: 1 }, + gen(new Uint8Array([1])), + {} + ); + const keepRef = await repo.saveOutputStream!("Other", { k: 1 }, gen(new Uint8Array([2])), {}); + await repo.deleteByTaskTypePrefix("__run:abc::"); + expect(repo.getOutputStreamByRef!(runRef)).toBeUndefined(); + expect(repo.getOutputStreamByRef!(keepRef)).toBeDefined(); + }); + + it("clearOlderThanWithTaskTypePrefix cascades to blob files by prefix and age", async () => { + const folder = await mkdtemp(join(tmpdir(), "wg-cache-streamout-")); + const repo = new FsFolderTaskOutputRepository(folder); + const staleRef = await repo.saveOutputStream!( + "__run:x::T", + { k: 1 }, + gen(new Uint8Array([1])), + {} + ); + const keepRef = await repo.saveOutputStream!("Other", { k: 1 }, gen(new Uint8Array([2])), {}); + await repo.clearOlderThanWithTaskTypePrefix("__run:", -60_000); + expect(repo.getOutputStreamByRef!(staleRef)).toBeUndefined(); + expect(repo.getOutputStreamByRef!(keepRef)).toBeDefined(); + }); + + it("task types with colliding sanitized 
names get distinct blobs", async () => { + const folder = await mkdtemp(join(tmpdir(), "wg-cache-streamout-")); + const repo = new FsFolderTaskOutputRepository(folder); + // Both sanitize to "My-Task"; same inputs — only the fingerprinted raw + // taskType keeps the blob files apart. + const refA = await repo.saveOutputStream!("My@Task", { k: 1 }, gen(new Uint8Array([1])), {}); + const refB = await repo.saveOutputStream!("My/Task", { k: 1 }, gen(new Uint8Array([2])), {}); + expect(refA.$ref).not.toBe(refB.$ref); + const a = new Uint8Array(await (await repo.getOutputByRef!(refA))!.arrayBuffer()); + const b = new Uint8Array(await (await repo.getOutputByRef!(refB))!.arrayBuffer()); + expect(Array.from(a)).toEqual([1]); + expect(Array.from(b)).toEqual([2]); + }); +}); diff --git a/packages/test/src/test/task-graph/Spec2QueueRowAndRehydrate.test.ts b/packages/test/src/test/task-graph/Spec2QueueRowAndRehydrate.test.ts new file mode 100644 index 000000000..febcd460c --- /dev/null +++ b/packages/test/src/test/task-graph/Spec2QueueRowAndRehydrate.test.ts @@ -0,0 +1,305 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { + CacheRef, + JobHandleLike, + StreamEvent, + TaskInput, + TaskOutput, +} from "@workglow/task-graph"; +import { + CACHE_REGISTRY, + DefaultCacheRegistry, + IExecuteContext, + isCacheRef, + makeCacheRef, + resolveJobOutput, + Task, + TaskOutputRepository, + TaskRegistry, +} from "@workglow/task-graph"; +import { Container, ServiceRegistry, sleep } from "@workglow/util"; +import { DataPortSchema } from "@workglow/util/schema"; +import { beforeAll, beforeEach, describe, expect, it } from "vitest"; + +type BinOut = { bytes: Blob }; + +/** + * Streaming memory cache that exposes both `saveOutputStream` (Spec 2 path, + * returns CacheRef + stores bytes in a side map) and `getOutputByRef` so the + * cross-process resolution test below can hydrate refs without touching the + * main `saveOutput` row. 
+ */ +class StreamingMemoryRepo extends TaskOutputRepository { + public readonly saved = new Map(); + public readonly streamed = new Map(); + override async saveOutput(t: string, i: TaskInput, o: TaskOutput): Promise { + this.saved.set(t + JSON.stringify(i), o); + } + override async getOutput(t: string, i: TaskInput): Promise { + return this.saved.get(t + JSON.stringify(i)); + } + override async clear(): Promise { + this.saved.clear(); + this.streamed.clear(); + } + override async size(): Promise { + return this.saved.size; + } + override async clearOlderThan(): Promise {} + override isDurable(): boolean { + return false; + } + override async saveOutputStream( + taskType: string, + inputs: TaskInput, + chunks: AsyncIterable, + _metadata: Record + ): Promise { + const parts: Uint8Array[] = []; + let size = 0; + for await (const c of chunks) { + parts.push(c); + size += c.byteLength; + } + const merged = new Uint8Array(size); + let off = 0; + for (const p of parts) { + merged.set(p, off); + off += p.byteLength; + } + const key = `inmem://${taskType}::${JSON.stringify(inputs)}`; + this.streamed.set(key, merged); + return makeCacheRef({ $ref: key, size, mime: "application/octet-stream" }); + } + override async getOutputByRef(ref: CacheRef): Promise { + const bytes = this.streamed.get(ref.$ref); + return bytes === undefined ? 
undefined : new Blob([bytes as unknown as BlobPart]); + } +} + +class NonStreamingMemoryRepo extends TaskOutputRepository { + public readonly saved = new Map(); + override async saveOutput(t: string, i: TaskInput, o: TaskOutput): Promise { + this.saved.set(t + JSON.stringify(i), o); + } + override async getOutput(t: string, i: TaskInput): Promise { + return this.saved.get(t + JSON.stringify(i)); + } + override async clear(): Promise { + this.saved.clear(); + } + override async size(): Promise { + return this.saved.size; + } + override async clearOlderThan(): Promise {} + override isDurable(): boolean { + return false; + } +} + +const CHUNK = 4 * 1024; // 4 KiB +const CHUNKS = 16; // 64 KiB total — large enough that inline-vs-ref is dramatic + +class BigBlobStreamTask extends Task, BinOut> { + public static override type = "Spec2QueueRowTest_BigBlobStream"; + public static override category = "Test"; + public static override cacheable = true; + + public static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + async *executeStream( + _input: Record, + _ctx: IExecuteContext + ): AsyncIterable> { + for (let i = 0; i < CHUNKS; i++) { + const chunk = new Uint8Array(CHUNK).fill(i & 0xff); + yield { type: "binary-delta", port: "bytes", binaryDelta: chunk }; + if (i % 4 === 3) await sleep(0); + } + yield { type: "finish", data: {} as BinOut }; + } +} + +type ArrayBufOut = { bytes: ArrayBuffer }; + +class BigArrayBufferStreamTask extends Task, ArrayBufOut> { + public static override type = "Spec2QueueRowTest_BigArrayBufferStream"; + public static override category = "Test"; + public static override cacheable = true; + + public static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { type: "object", format: "binary", "x-stream": "binary" } }, + 
additionalProperties: false, + } as const satisfies DataPortSchema; + } + + async *executeStream( + _input: Record, + _ctx: IExecuteContext + ): AsyncIterable> { + for (let i = 0; i < CHUNKS; i++) { + const chunk = new Uint8Array(CHUNK).fill(i & 0xff); + yield { type: "binary-delta", port: "bytes", binaryDelta: chunk }; + if (i % 4 === 3) await sleep(0); + } + yield { type: "finish", data: {} as ArrayBufOut }; + } +} + +beforeAll(() => { + TaskRegistry.registerTask(BigBlobStreamTask as any); + TaskRegistry.registerTask(BigArrayBufferStreamTask as any); +}); + +let services: ServiceRegistry; +let repo: StreamingMemoryRepo; +beforeEach(() => { + repo = new StreamingMemoryRepo({}); + services = new ServiceRegistry(new Container()); + services.registerInstance(CACHE_REGISTRY, new DefaultCacheRegistry({ deterministic: repo })); +}); + +/** + * The principal user value of Spec 2: the SAVED ROW in the cache (the same + * value job-queue would carry through `JobStorageFormat.output`) stays small + * regardless of payload size when the ref path is taken. These tests measure + * the wire size by JSON-serializing the saved output the way a real storage + * backend (Postgres/SQLite) would. + */ +describe("Spec 2 — saved-row size & cross-process rehydration", () => { + it("force-ref keeps the saved row tiny (CacheRef envelope only); bytes live in the streaming cache", async () => { + const task = new BigBlobStreamTask(); + const output = await task.run({}, { registry: services, referenceThresholdBytes: 0 }); + + // Wire shape of the cached small row. + expect(repo.saved.size).toBe(1); + const [savedOutput] = Array.from(repo.saved.values()) as Array>; + expect(isCacheRef(savedOutput.bytes)).toBe(true); + + const savedJson = JSON.stringify(savedOutput); + // CacheRef envelope is well under 1 KiB regardless of payload size. + expect(savedJson.length).toBeLessThan(1024); + + // Bytes are present in the streaming side of the cache (full size). 
+ const ref = savedOutput.bytes as CacheRef; + expect(ref.size).toBe(CHUNKS * CHUNK); + const hydrated = await repo.getOutputByRef(ref); + expect(hydrated).toBeInstanceOf(Blob); + expect(hydrated!.size).toBe(CHUNKS * CHUNK); + + // And Output's port slot is a ref (not a Blob). + expect(isCacheRef(output.bytes)).toBe(true); + }); + + it("contrast: a non-streaming cache embeds the full payload in the saved row (the bloat path)", async () => { + // Same task, but the cache cannot stream — the runner falls through to + // accumulation and the saved row contains the serialized payload as a + // JSON-safe BinaryPortWire envelope (the binary port codec), not a ref. + const nonStreamRepo = new NonStreamingMemoryRepo({}); + const altServices = new ServiceRegistry(new Container()); + altServices.registerInstance( + CACHE_REGISTRY, + new DefaultCacheRegistry({ deterministic: nonStreamRepo }) + ); + + const task = new BigBlobStreamTask(); + await task.run({}, { registry: altServices, referenceThresholdBytes: 0 }); + + expect(nonStreamRepo.saved.size).toBe(1); + const [savedOutput] = Array.from(nonStreamRepo.saved.values()) as Array< + Record + >; + expect(isCacheRef(savedOutput.bytes)).toBe(false); + // The observable point: this row carries the (encoded) artifact itself, + // not a reference — the full payload still bloats the row. + const wire = savedOutput.bytes as Record; + expect(wire.__binaryPortWire).toBe(1); + expect(wire.size).toBe(CHUNKS * CHUNK); + }); + + it("cross-process simulation: serialize the small row, deserialize elsewhere, resolveJobOutput against shared cache", async () => { + const task = new BigBlobStreamTask(); + await task.run({}, { registry: services, referenceThresholdBytes: 0 }); + const [savedOutput] = Array.from(repo.saved.values()) as Array>; + + // "Process A" → wire: serialize the small row to a string (what Postgres + // would store in JSONB / SQLite would store in TEXT for the JobStorageFormat + // output column). 
+ const wire = JSON.stringify(savedOutput); + expect(wire.length).toBeLessThan(1024); + + // "Process B" pulls the small row off the queue and reconstructs Output. + // The CacheRef survives JSON round-trip unchanged (just data). + const received = JSON.parse(wire) as { bytes: CacheRef }; + expect(isCacheRef(received.bytes)).toBe(true); + + // Process B resolves the ref against the SHARED cache (in real + // deployments: S3, networked FS, shared Postgres) — here `repo` is the + // shared backing for the test. resolveJobOutput is the queue-boundary + // bridge that callers wrap their JobHandle in. + const handle: JobHandleLike<{ bytes: Blob }> = { + waitFor: async () => received as unknown as { bytes: Blob }, + }; + const resolved = await resolveJobOutput(handle, repo); + + expect(resolved.bytes).toBeInstanceOf(Blob); + expect((resolved.bytes as Blob).size).toBe(CHUNKS * CHUNK); + }); + + it("rehydration below threshold inlines a Blob for format:'blob' (canonical BinaryFormat)", async () => { + const task = new BigBlobStreamTask(); + // Threshold above the full payload → rehydrate inline. 
+ const output = await task.run( + {}, + { + registry: services, + referenceThresholdBytes: CHUNKS * CHUNK + 1, + } + ); + expect(output.bytes).toBeInstanceOf(Blob); + expect((output.bytes as Blob).size).toBe(CHUNKS * CHUNK); + }); + + it("rehydration below threshold inlines an ArrayBuffer for format:'binary' (canonical BinaryFormat)", async () => { + const task = new BigArrayBufferStreamTask(); + const output = await task.run( + {}, + { + registry: services, + referenceThresholdBytes: CHUNKS * CHUNK + 1, + } + ); + expect(output.bytes).toBeInstanceOf(ArrayBuffer); + expect((output.bytes as ArrayBuffer).byteLength).toBe(CHUNKS * CHUNK); + }); + + it("dangling refs (cache cleared between save and read) resolve to undefined — best-effort contract", async () => { + const task = new BigBlobStreamTask(); + await task.run({}, { registry: services, referenceThresholdBytes: 0 }); + const [savedOutput] = Array.from(repo.saved.values()) as Array>; + + // Cache TTL expired / explicit clear / different deployment with no + // backing access — the ref now points nowhere. + await repo.clear(); + + const handle: JobHandleLike<{ bytes: Blob }> = { + waitFor: async () => savedOutput as unknown as { bytes: Blob }, + }; + const resolved = await resolveJobOutput(handle, repo); + // Per Spec 2 §2: best-effort, returns undefined on cache miss. 
/** Output shape shared by the test tasks below: a single binary `bytes` port. */
type BinOut = { bytes: Blob | ArrayBuffer };

/**
 * A streaming source task (binary mode) that yields two byte chunks and an
 * empty finish, mirroring how real binary producers emit `binary-delta` events.
 *
 * NOTE(review): the generic arguments on `Task`, `Record` and `AsyncIterable`
 * were reconstructed (the extracted text had them stripped) — confirm against
 * the original file.
 */
class BlobStreamTask extends Task<Record<string, unknown>, BinOut> {
  public static override type = "BlobStreamTask";
  public static override category = "Test";
  public static override cacheable = false;

  /** Declares `bytes` as a binary-streamed port materialized with format "blob". */
  public static override outputSchema(): DataPortSchema {
    return {
      type: "object",
      properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } },
      additionalProperties: false,
    } as const satisfies DataPortSchema;
  }

  /** Emits [1, 2], pauses briefly, emits [3, 4], then finishes with an empty payload. */
  async *executeStream(
    _input: Record<string, unknown>,
    _ctx: IExecuteContext
  ): AsyncIterable<StreamEvent<BinOut>> {
    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([1, 2]) };
    await sleep(2);
    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([3, 4]) };
    yield { type: "finish", data: {} as BinOut };
  }
}

/** Same stream as {@link BlobStreamTask}, but the port is declared with format "binary". */
class ArrayBufferStreamTask extends BlobStreamTask {
  public static override type = "ArrayBufferStreamTask";

  public static override outputSchema(): DataPortSchema {
    return {
      type: "object",
      properties: { bytes: { type: "object", format: "binary", "x-stream": "binary" } },
      additionalProperties: false,
    } as const satisfies DataPortSchema;
  }
}

/** Streams one delta, then supplies an explicit `bytes` value in the finish event. */
class BinaryFinishOverrideTask extends BlobStreamTask {
  public static override type = "BinaryFinishOverrideTask";

  override async *executeStream(
    _input: Record<string, unknown>,
    _ctx: IExecuteContext
  ): AsyncIterable<StreamEvent<BinOut>> {
    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([9, 9]) };
    // Explicit finish payload at the binary port must win over accumulation.
    yield { type: "finish", data: { bytes: new Blob([new Uint8Array([7])]) } as BinOut };
  }
}

describe("StreamProcessor binary accumulation", () => {
  beforeAll(() => {
    TaskRegistry.registerTask(BlobStreamTask);
    TaskRegistry.registerTask(ArrayBufferStreamTask);
    TaskRegistry.registerTask(BinaryFinishOverrideTask);
  });

  it("accumulates binary deltas into a Blob (format: blob)", async () => {
    const task = new BlobStreamTask({});
    const out = (await task.run()) as BinOut;
    expect(out.bytes).toBeInstanceOf(Blob);
    const buf = await (out.bytes as Blob).arrayBuffer();
    expect(Array.from(new Uint8Array(buf))).toEqual([1, 2, 3, 4]);
  });

  it("accumulates binary deltas into an ArrayBuffer (format: binary)", async () => {
    const task = new ArrayBufferStreamTask({});
    const out = (await task.run()) as BinOut;
    expect(out.bytes).toBeInstanceOf(ArrayBuffer);
    expect(Array.from(new Uint8Array(out.bytes as ArrayBuffer))).toEqual([1, 2, 3, 4]);
  });

  it("uses explicit finish payload at the binary port verbatim", async () => {
    const out = (await new BinaryFinishOverrideTask({}).run()) as BinOut;
    const buf = await (out.bytes as Blob).arrayBuffer();
    expect(Array.from(new Uint8Array(buf))).toEqual([7]); // not [9,9]
  });
});
-0,0 +1,488 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +/** + * StreamPump binary-stream behavior. + * + * C1 (regression guard): a binary source feeding a NON-binary consumer must + * MATERIALIZE across the edge — `edgeNeedsAccumulation(binary → non-stream)` is + * `true`, so the pump accumulates and the sink receives a finished `Blob`. + * + * C2 (cache-streaming decision): `StreamPump.canStreamBinaryToCache` is asserted + * directly, in isolation from a live run: `true` for a streaming-capable cache + + * binary-only leaf with no value-needing consumer; `false` for a buffered cache, + * for a downstream edge that needs the materialized value, and (defensively) for + * a cache that cannot report `supportsStreaming()`. + * + * The live byte delivery to `saveOutputStream` during a real run is owned by + * `StreamProcessor`'s `BinaryStreamRouter` and covered by + * `StreamProcessorBinaryRefSink.test.ts` / `TaskRunnerRefPath.test.ts`. + */ + +import type { CacheRef, StreamEvent, TaskInput, TaskOutput } from "@workglow/task-graph"; +import { + Dataflow, + IExecuteContext, + makeCacheRef, + StreamPump, + Task, + TaskGraph, + TaskGraphRunner, + TaskOutputRepository, + TaskStatus, +} from "@workglow/task-graph"; +import { setLogger, sleep } from "@workglow/util"; +import type { DataPortSchema } from "@workglow/util/schema"; +import { beforeEach, describe, expect, it } from "vitest"; +import { getTestingLogger } from "../../binding/TestingLogger"; + +setLogger(getTestingLogger()); + +// ============================================================================ +// Test tasks +// ============================================================================ + +type BinOut = { bytes: Blob | ArrayBuffer }; + +/** + * Binary streaming source: yields two `binary-delta` chunks then an empty + * `finish` (mirrors a real producer that does not re-buffer its output). 
/**
 * Binary streaming source: yields two `binary-delta` chunks then an empty
 * `finish` (mirrors a real producer that does not re-buffer its output).
 *
 * NOTE(review): generic arguments were reconstructed from a stripped extract —
 * confirm against the original file.
 */
class BinaryStreamSource extends Task<Record<string, unknown>, BinOut> {
  public static override type = "StreamBinaryPump_Source";
  public static override category = "Test";
  public static override cacheable = false;

  /** Takes no inputs. */
  public static override inputSchema(): DataPortSchema {
    return { type: "object", properties: {}, additionalProperties: false } as const;
  }

  /** Declares `bytes` as a binary-streamed port with format "blob". */
  public static override outputSchema(): DataPortSchema {
    return {
      type: "object",
      properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } },
      additionalProperties: false,
    } as const satisfies DataPortSchema;
  }

  /** Streams [1, 2] then [3, 4]; emits nothing if the run is already aborted. */
  async *executeStream(
    _input: Record<string, unknown>,
    context: IExecuteContext
  ): AsyncIterable<StreamEvent<BinOut>> {
    if (context.signal.aborted) return;
    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([1, 2]) };
    await sleep(2);
    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([3, 4]) };
    yield { type: "finish", data: {} as BinOut };
  }

  /**
   * Returns the same four bytes as a ready-made Blob.
   * Presumably the non-streaming fallback path — verify against `Task`.
   */
  override async execute(): Promise<BinOut> {
    return { bytes: new Blob([new Uint8Array([1, 2, 3, 4])]) };
  }
}

/**
 * A cacheable variant — needed to exercise the cache-streaming decision (the
 * cache is only consulted for cacheable tasks).
 */
class CacheableBinaryStreamSource extends BinaryStreamSource {
  public static override type = "StreamBinaryPump_CacheableSource";
  public static override cacheable = true;
}

/** Input of the consumer tasks: the bytes handed across the edge. */
type SinkInput = { bytes: Blob | ArrayBuffer };
/** What a consumer reports back about the value it received. */
type SinkOutput = { length: number; isBlob: boolean };
/**
 * Non-binary consumer: its `bytes` input port has NO `x-stream`, so a binary
 * source feeding it MUST materialize across the edge.
 *
 * NOTE(review): the `Task` / `Promise` type arguments were reconstructed from a
 * stripped extract — confirm against the original file.
 */
class BinarySinkTask extends Task<SinkInput, SinkOutput> {
  public static override type = "StreamBinaryPump_Sink";
  public static override category = "Test";
  public static override cacheable = false;

  /** The last `bytes` value passed to `execute`, kept for assertions. */
  public received: Blob | ArrayBuffer | undefined = undefined;

  public static override inputSchema(): DataPortSchema {
    // No `type` constraint (accepts the materialized Blob at runtime) and NO
    // `x-stream` ⇒ a non-streaming consumer that needs the value across the edge.
    return {
      type: "object",
      properties: { bytes: { title: "Bytes", description: "materialized binary" } },
      additionalProperties: false,
    } as const satisfies DataPortSchema;
  }

  public static override outputSchema(): DataPortSchema {
    return {
      type: "object",
      properties: {
        length: { type: "number" },
        isBlob: { type: "boolean" },
      },
      additionalProperties: false,
    } as const satisfies DataPortSchema;
  }

  /**
   * Records the input and reports its byte length and whether it was a Blob.
   * Reports `length: -1` when the value is neither a Blob nor an ArrayBuffer.
   */
  override async execute(input: SinkInput): Promise<SinkOutput> {
    this.received = input.bytes;
    if (input.bytes instanceof Blob) {
      return { length: input.bytes.size, isBlob: true };
    }
    if (input.bytes instanceof ArrayBuffer) {
      return { length: input.bytes.byteLength, isBlob: false };
    }
    return { length: -1, isBlob: false };
  }
}
/**
 * Records whether `saveOutputStream` (streaming) vs `saveOutput` (buffered) was
 * invoked, and every byte seen through the streaming path.
 *
 * NOTE(review): generic arguments (`Map`, `Promise`, `AsyncIterable`, `Record`)
 * were reconstructed from a stripped extract — confirm against the original.
 */
class StreamingMemoryRepo extends TaskOutputRepository {
  /** Number of buffered `saveOutput` calls. */
  public saveOutputCalls = 0;
  /** Number of streaming `saveOutputStream` calls. */
  public saveOutputStreamCalls = 0;
  /** Every byte delivered through `saveOutputStream`, in arrival order. */
  public streamedBytes: number[] = [];
  /** Buffered outputs keyed by task type + serialized inputs. */
  private store = new Map<string, TaskOutput>();

  constructor() {
    super({ outputCompression: false });
  }

  override async saveOutput(
    taskType: string,
    inputs: TaskInput,
    output: TaskOutput
  ): Promise<void> {
    this.saveOutputCalls++;
    this.store.set(taskType + JSON.stringify(inputs), output);
  }

  override async getOutput(taskType: string, inputs: TaskInput): Promise<TaskOutput | undefined> {
    return this.store.get(taskType + JSON.stringify(inputs));
  }

  override async clear(): Promise<void> {
    this.store.clear();
  }

  override async size(): Promise<number> {
    return this.store.size;
  }

  // Intentionally a no-op in this in-test repository.
  override async clearOlderThan(): Promise<void> {}

  override isDurable(): boolean {
    return false;
  }

  /**
   * Drains `chunks`, recording each byte, and returns a ref whose `size` is the
   * total byte count. The streamed bytes are NOT written to `store`.
   */
  override async saveOutputStream(
    taskType: string,
    inputs: TaskInput,
    chunks: AsyncIterable<Uint8Array>,
    _metadata: Record<string, unknown>
  ): Promise<CacheRef> {
    this.saveOutputStreamCalls++;
    let size = 0;
    for await (const c of chunks) {
      size += c.byteLength;
      for (const b of c) this.streamedBytes.push(b);
    }
    return makeCacheRef({ $ref: `inmem://${taskType}::${JSON.stringify(inputs)}`, size });
  }
}
/**
 * A buffered-only cache: extends the streaming repo but removes the streaming
 * capability so `supportsStreaming()` returns `false`.
 */
class BufferedMemoryRepo extends StreamingMemoryRepo {
  // Deliberately shadows the inherited method with `undefined`; the double
  // assertion only keeps the property's declared type compatible.
  public override saveOutputStream =
    undefined as unknown as StreamingMemoryRepo["saveOutputStream"];
}

// ============================================================================
// Helpers
// ============================================================================

/**
 * Extracts the `bytes` value from a `finish` event.
 *
 * @returns `undefined` when the event is missing or is not a `finish` event.
 *
 * NOTE(review): `Record<string, unknown>` was reconstructed from a stripped
 * extract — confirm against the original file.
 */
function blobFromFinish(event: StreamEvent | undefined): Blob | ArrayBuffer | undefined {
  if (!event || event.type !== "finish") return undefined;
  return (event.data as Record<string, unknown>)?.bytes as Blob | ArrayBuffer | undefined;
}

/** Wraps the given chunks in an async iterable, yielding them in order. */
async function* gen(...chunks: Uint8Array[]): AsyncIterable<Uint8Array> {
  for (const c of chunks) yield c;
}

// ============================================================================
// C1: regression guard — binary source materializes across a non-binary edge
// ============================================================================

describe("StreamBinaryPump — C1 binary source → non-binary consumer", () => {
  it("materializes a Blob across the edge (no production change)", async () => {
    const graph = new TaskGraph();
    const source = new BinaryStreamSource({ id: "source" });
    const sink = new BinarySinkTask({ id: "sink" });

    graph.addTasks([source, sink]);
    graph.addDataflow(new Dataflow("source", "bytes", "sink", "bytes"));

    const runner = new TaskGraphRunner(graph);
    const results = await runner.runGraph({});

    expect(source.status).toBe(TaskStatus.COMPLETED);
    expect(sink.status).toBe(TaskStatus.COMPLETED);

    // The sink received a materialized Blob with the concatenated bytes.
    expect(sink.received).toBeInstanceOf(Blob);
    const buf = await (sink.received as Blob).arrayBuffer();
    expect(Array.from(new Uint8Array(buf))).toEqual([1, 2, 3, 4]);

    const sinkResult = results.find((r) => r.id === "sink");
    expect(sinkResult).toBeDefined();
    expect((sinkResult!.data as SinkOutput).isBlob).toBe(true);
    expect((sinkResult!.data as SinkOutput).length).toBe(4);
  });
});

// ============================================================================
// C2: cache-streaming decision — asserted DIRECTLY via canStreamBinaryToCache
//
// These tests assert the DECISION in isolation, not a real-run outcome. We
// deliberately do NOT run a streaming-cache graph and assert "binary port absent
// from finish" as correct: in the reduced scope nothing drives saveOutputStream
// on a real run, so absent bytes there means SILENT DATA LOSS, not success. The
// live pipe (cache actually receiving the bytes on a real run) lands in Spec 2.
//
// NOTE(review): the "tees when a downstream edge needs materialized AND the
// cache can stream" test later in this file asserts saveOutputStream IS called
// on a real run — confirm whether the "nothing drives saveOutputStream"
// rationale above is still current.
// ============================================================================

describe("StreamBinaryPump.canStreamBinaryToCache — decision", () => {
  it("returns true: streaming cache + binary-only leaf + no value-needing consumer", () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    graph.addTask(source);

    expect(StreamPump.canStreamBinaryToCache(graph, source, new StreamingMemoryRepo())).toBe(true);
  });

  it("returns false: buffered (non-streaming) cache", () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    graph.addTask(source);

    const cache = new BufferedMemoryRepo();
    expect(cache.supportsStreaming()).toBe(false);
    expect(StreamPump.canStreamBinaryToCache(graph, source, cache)).toBe(false);
  });

  it("returns false: a downstream edge needs the materialized value", () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    const sink = new BinarySinkTask({ id: "sink" });
    graph.addTasks([source, sink]);
    graph.addDataflow(new Dataflow("source", "bytes", "sink", "bytes"));

    // Streaming-capable cache present, but the non-binary consumer needs the
    // value across the edge ⇒ must still accumulate.
    expect(StreamPump.canStreamBinaryToCache(graph, source, new StreamingMemoryRepo())).toBe(false);
  });

  it("returns false (defensive): a cache that cannot report supportsStreaming()", () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    graph.addTask(source);

    // A `{}`-style partial double with no `supportsStreaming` method: the guard
    // must treat anything that can't affirmatively report streaming support as
    // non-streaming, never optimistically piping.
    const partialCache = {} as unknown as TaskOutputRepository;
    expect(StreamPump.canStreamBinaryToCache(graph, source, partialCache)).toBe(false);
  });
});
/**
 * Streaming consumer: its `bytes` input port accepts the binary stream mode.
 *
 * NOTE(review): the `Task` / `Promise` type arguments were reconstructed from a
 * stripped extract — confirm against the original file.
 */
class BinaryStreamConsumer extends Task<SinkInput, SinkOutput> {
  public static override type = "StreamBinaryPump_StreamConsumer";
  public static override category = "Test";
  public static override cacheable = false;

  /** `bytes` carries `"x-stream": "binary"`, unlike `BinarySinkTask`'s input. */
  public static override inputSchema(): DataPortSchema {
    return {
      type: "object",
      properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } },
      additionalProperties: false,
    } as const satisfies DataPortSchema;
  }

  public static override outputSchema(): DataPortSchema {
    return {
      type: "object",
      properties: { length: { type: "number" }, isBlob: { type: "boolean" } },
      additionalProperties: false,
    } as const satisfies DataPortSchema;
  }

  /** Reports the byte size of the input (`-1` when `bytes` is absent) and whether it is a Blob. */
  override async execute(input: SinkInput): Promise<SinkOutput> {
    const size = input.bytes instanceof Blob ? input.bytes.size : (input.bytes?.byteLength ?? -1);
    return { length: size, isBlob: input.bytes instanceof Blob };
  }
}

describe("StreamPump.anyConsumerAcceptsBinaryStream", () => {
  it("returns true when an out-edge targets a binary-streaming input port", () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    const consumer = new BinaryStreamConsumer({ id: "consumer" });
    graph.addTasks([source, consumer]);
    graph.addDataflow(new Dataflow("source", "bytes", "consumer", "bytes"));

    expect(StreamPump.anyConsumerAcceptsBinaryStream(graph, source)).toBe(true);
  });

  it("returns false with no consumers", () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    graph.addTask(source);

    expect(StreamPump.anyConsumerAcceptsBinaryStream(graph, source)).toBe(false);
  });

  it("returns false when the only consumer needs a materialized value", () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    const sink = new BinarySinkTask({ id: "sink" });
    graph.addTasks([source, sink]);
    graph.addDataflow(new Dataflow("source", "bytes", "sink", "bytes"));

    expect(StreamPump.anyConsumerAcceptsBinaryStream(graph, source)).toBe(false);
  });

  it("returns false for * fan-out edges (consumers receive materialized values)", () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    const consumer = new BinaryStreamConsumer({ id: "consumer" });
    graph.addTasks([source, consumer]);
    graph.addDataflow(new Dataflow("source", "*", "consumer", "*"));

    expect(StreamPump.anyConsumerAcceptsBinaryStream(graph, source)).toBe(false);
  });

  it("returns true with mixed consumers (one streams, one materializes)", () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    const consumer = new BinaryStreamConsumer({ id: "consumer" });
    const sink = new BinarySinkTask({ id: "sink" });
    graph.addTasks([source, consumer, sink]);
    graph.addDataflow(new Dataflow("source", "bytes", "consumer", "bytes"));
    graph.addDataflow(new Dataflow("source", "bytes", "sink", "bytes"));

    expect(StreamPump.anyConsumerAcceptsBinaryStream(graph, source)).toBe(true);
  });
});
// ============================================================================
// C2: cache-streaming decision — observed on a real run via the source's finish.
//
// These run a real graph and assert the bytes ARE materialized (present) when the
// decision is "accumulate". They guard the POSITIVE outcome (bytes delivered), not
// the absence of bytes, so they do not bless data loss.
// ============================================================================

describe("StreamBinaryPump — C2 accumulation materializes bytes on a real run", () => {
  // Never reassigned, so `const` rather than `let`.
  const logger = getTestingLogger();
  setLogger(logger);

  it("DOES accumulate a leaf binary task when the cache cannot stream", async () => {
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    graph.addTask(source);
    const runner = new TaskGraphRunner(graph);

    // Capture only the `finish` events the source emits.
    const finishes: StreamEvent[] = [];
    source.on("stream_chunk", (e) => {
      if (e.type === "finish") finishes.push(e);
    });

    const cache = new BufferedMemoryRepo();
    expect(cache.supportsStreaming()).toBe(false);
    await runner.runGraph({}, { outputCache: cache });

    // Cache cannot stream ⇒ accumulate ⇒ enriched finish ⇒ binary port
    // materialized to a Blob.
    expect(finishes.length).toBe(1);
    const bytes = blobFromFinish(finishes[0]);
    expect(bytes).toBeInstanceOf(Blob);
    const buf = await (bytes as Blob).arrayBuffer();
    expect(Array.from(new Uint8Array(buf))).toEqual([1, 2, 3, 4]);
  });

  it("tees when a downstream edge needs materialized AND the cache can stream", async () => {
    // Spec 2 Phase E: cache-can-stream + downstream-needs-materialized used to
    // inhibit refs entirely. Now both paths fire — accumulator drives the
    // enriched finish event (Blob for the edge consumer) and the router
    // writes to the cache so the queue/cache row stays small.
    const graph = new TaskGraph();
    const source = new CacheableBinaryStreamSource({ id: "source" });
    const sink = new BinarySinkTask({ id: "sink" });
    graph.addTasks([source, sink]);
    graph.addDataflow(new Dataflow("source", "bytes", "sink", "bytes"));
    const runner = new TaskGraphRunner(graph);

    const finishes: StreamEvent[] = [];
    source.on("stream_chunk", (e) => {
      if (e.type === "finish") finishes.push(e);
    });

    const cache = new StreamingMemoryRepo();
    await runner.runGraph({}, { outputCache: cache });

    // Edge path: downstream still receives a materialized Blob.
    expect(finishes.length).toBe(1);
    const bytes = blobFromFinish(finishes[0]);
    expect(bytes).toBeInstanceOf(Blob);
    expect(sink.received).toBeInstanceOf(Blob);

    // Cache path: the streaming sink fired too (tee).
    expect(cache.saveOutputStreamCalls).toBeGreaterThanOrEqual(1);
  });
});

// ============================================================================
// Sanity: the in-test repos behave as expected
// ============================================================================

describe("StreamBinaryPump — repo capability sanity", () => {
  // A fresh repo per test so recorded bytes and counters do not leak across tests.
  let repo: StreamingMemoryRepo;
  beforeEach(() => {
    repo = new StreamingMemoryRepo();
  });

  it("streaming repo reports supportsStreaming() === true", () => {
    expect(repo.supportsStreaming()).toBe(true);
  });

  it("buffered repo reports supportsStreaming() === false", () => {
    expect(new BufferedMemoryRepo().supportsStreaming()).toBe(false);
  });

  it("saveOutputStream concatenates all delivered bytes", async () => {
    await repo.saveOutputStream(
      "T",
      { k: 1 },
      gen(new Uint8Array([1, 2]), new Uint8Array([3])),
      {}
    );
    expect(repo.streamedBytes).toEqual([1, 2, 3]);
  });
});
// Fixture schemas. Each declares a `bytes` port with `"x-stream": "binary"`;
// they differ only in the `format` annotation the helpers under test inspect.

/** Single binary port, format "blob". */
const binarySchema = {
  type: "object",
  properties: {
    bytes: { type: "object", format: "blob", "x-stream": "binary" },
  },
  additionalProperties: false,
} as const satisfies DataPortSchema;

/** One append-streamed text port plus one binary port with format "binary". */
const mixedSchema = {
  type: "object",
  properties: {
    text: { type: "string", "x-stream": "append" },
    bytes: { type: "object", format: "binary", "x-stream": "binary" },
  },
  additionalProperties: false,
} as const satisfies DataPortSchema;

/** Binary port with no `format` at all. */
const unannotatedBinarySchema = {
  type: "object",
  properties: {
    bytes: { type: "object", "x-stream": "binary" },
  },
  additionalProperties: false,
} as const satisfies DataPortSchema;

/** Binary port whose format is mis-cased ("Blob" instead of "blob"). */
const typoFormatSchema = {
  type: "object",
  properties: {
    bytes: { type: "object", format: "Blob", "x-stream": "binary" },
  },
  additionalProperties: false,
} as const satisfies DataPortSchema;

/** Binary port with a format value that is neither "blob" nor "binary". */
const unknownFormatSchema = {
  type: "object",
  properties: {
    bytes: { type: "object", format: "wat", "x-stream": "binary" },
  },
  additionalProperties: false,
} as const satisfies DataPortSchema;

// Mostly compile-time checks: the assignments below must type-check.
describe("StreamBinaryDelta type", () => {
  it("is assignable to StreamEvent and carries a Uint8Array delta", () => {
    const evt: StreamEvent = {
      type: "binary-delta",
      port: "bytes",
      binaryDelta: new Uint8Array([1, 2, 3]),
    } satisfies StreamBinaryDelta;
    expect(evt.type).toBe("binary-delta");
    // Narrow on the discriminant before touching `binaryDelta`.
    if (evt.type === "binary-delta") {
      expect(evt.binaryDelta).toBeInstanceOf(Uint8Array);
      expect(Array.from(evt.binaryDelta)).toEqual([1, 2, 3]);
    }
  });

  it("admits 'binary' as a StreamMode", () => {
    const mode: StreamMode = "binary";
    expect(mode).toBe("binary");
  });
});

// Schema-inspection helpers, exercised against the fixtures above.
describe("binary-aware port helpers", () => {
  it("getPortStreamMode returns 'binary'", () => {
    expect(getPortStreamMode(binarySchema, "bytes")).toBe("binary");
  });

  it("getStreamingPorts includes binary ports", () => {
    expect(getStreamingPorts(binarySchema)).toEqual([{ port: "bytes", mode: "binary" }]);
  });

  it("getOutputStreamMode returns 'binary' for a single binary port", () => {
    expect(getOutputStreamMode(binarySchema)).toBe("binary");
  });

  it("getOutputStreamMode returns 'mixed' for append + binary", () => {
    expect(getOutputStreamMode(mixedSchema)).toBe("mixed");
  });

  it("getBinaryPortId finds the first binary port", () => {
    expect(getBinaryPortId(binarySchema)).toBe("bytes");
    expect(getBinaryPortId(mixedSchema)).toBe("bytes");
  });

  it("getBinaryPortId returns undefined when no binary port", () => {
    const noBinary = {
      type: "object",
      properties: { text: { type: "string", "x-stream": "append" } },
    } as const satisfies DataPortSchema;
    expect(getBinaryPortId(noBinary)).toBeUndefined();
  });

  it("edgeNeedsAccumulation: binary source → non-binary target accumulates", () => {
    // Target port carries no `x-stream` annotation.
    const target = {
      type: "object",
      properties: { bytes: { type: "object" } },
    } as const satisfies DataPortSchema;
    expect(edgeNeedsAccumulation(binarySchema, "bytes", target, "bytes")).toBe(true);
  });

  it("edgeNeedsAccumulation: binary → binary passes through", () => {
    expect(edgeNeedsAccumulation(binarySchema, "bytes", binarySchema, "bytes")).toBe(false);
  });
});

// Format validation: accepts "blob" / "binary" / absent, rejects anything else.
describe("assertBinaryFormat", () => {
  it("returns 'blob' when format is 'blob'", () => {
    expect(assertBinaryFormat(binarySchema, "bytes")).toBe("blob");
  });

  it("returns 'binary' when format is 'binary'", () => {
    expect(assertBinaryFormat(mixedSchema, "bytes")).toBe("binary");
  });

  it("returns 'blob' for undefined / absent format (canonical default)", () => {
    expect(assertBinaryFormat(unannotatedBinarySchema, "bytes")).toBe("blob");
  });

  it("throws on a casing typo such as 'Blob'", () => {
    // The error message is expected to list the allowed values.
    expect(() => assertBinaryFormat(typoFormatSchema, "bytes")).toThrow(
      /Allowed: "blob" \| "binary"/
    );
  });

  it("throws on an unknown format value", () => {
    // The error message is expected to echo the offending value.
    expect(() => assertBinaryFormat(unknownFormatSchema, "bytes")).toThrow(/wat/);
  });
});

// Concatenation of accumulated chunks into the port's declared container type.
describe("materializeBinary", () => {
  const chunks = [new Uint8Array([1, 2]), new Uint8Array([3, 4, 5])];

  it("concatenates to an ArrayBuffer when format is 'binary'", async () => {
    const out = materializeBinary(chunks, "binary");
    expect(out).toBeInstanceOf(ArrayBuffer);
    expect(Array.from(new Uint8Array(out as ArrayBuffer))).toEqual([1, 2, 3, 4, 5]);
  });

  it("concatenates to a Blob when format is 'blob'", async () => {
    const out = materializeBinary(chunks, "blob");
    expect(out).toBeInstanceOf(Blob);
    const buf = await (out as Blob).arrayBuffer();
    expect(Array.from(new Uint8Array(buf))).toEqual([1, 2, 3, 4, 5]);
  });

  it("handles an empty chunk list", () => {
    expect(materializeBinary([], "binary")).toBeInstanceOf(ArrayBuffer);
    expect((materializeBinary([], "binary") as ArrayBuffer).byteLength).toBe(0);
  });
});
/** Output of the single-port test tasks. */
type BinOut = { bytes: Blob | ArrayBuffer };
/** Output of the two-port test task. */
type TwoBinOut = { audio: Blob; transcript: Blob };

/**
 * Streams [1, 2] then [3] on the binary `bytes` port and finishes with an
 * empty payload.
 *
 * NOTE(review): generic arguments throughout this block were reconstructed
 * from a stripped extract — confirm against the original file.
 */
class BlobStreamTask extends Task<Record<string, unknown>, BinOut> {
  public static override type = "BinaryRefSinkTest_BlobStream";
  public static override category = "Test";
  public static override cacheable = false;

  public static override outputSchema(): DataPortSchema {
    return {
      type: "object",
      properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } },
      additionalProperties: false,
    } as const satisfies DataPortSchema;
  }

  async *executeStream(
    _input: Record<string, unknown>,
    _ctx: IExecuteContext
  ): AsyncIterable<StreamEvent<BinOut>> {
    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([1, 2]) };
    await sleep(1);
    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([3]) };
    yield { type: "finish", data: {} as BinOut };
  }
}

/** Interleaves deltas on two binary ports: `audio` gets [10, 11, 12], `transcript` gets [20]. */
class TwoPortStreamTask extends Task<Record<string, unknown>, TwoBinOut> {
  public static override type = "BinaryRefSinkTest_TwoPort";
  public static override category = "Test";
  public static override cacheable = false;

  public static override outputSchema(): DataPortSchema {
    return {
      type: "object",
      properties: {
        audio: { type: "object", format: "blob", "x-stream": "binary" },
        transcript: { type: "object", format: "blob", "x-stream": "binary" },
      },
      additionalProperties: false,
    } as const satisfies DataPortSchema;
  }

  async *executeStream(
    _input: Record<string, unknown>,
    _ctx: IExecuteContext
  ): AsyncIterable<StreamEvent<TwoBinOut>> {
    yield { type: "binary-delta", port: "audio", binaryDelta: new Uint8Array([10, 11]) };
    yield { type: "binary-delta", port: "transcript", binaryDelta: new Uint8Array([20]) };
    yield { type: "binary-delta", port: "audio", binaryDelta: new Uint8Array([12]) };
    yield { type: "finish", data: {} as TwoBinOut };
  }
}

/** Streams [9, 9] but supplies an explicit Blob of [7] for `bytes` in the finish event. */
class ExplicitFinishPayloadTask extends BlobStreamTask {
  public static override type = "BinaryRefSinkTest_ExplicitFinish";

  override async *executeStream(
    _input: Record<string, unknown>,
    _ctx: IExecuteContext
  ): AsyncIterable<StreamEvent<BinOut>> {
    yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([9, 9]) };
    yield { type: "finish", data: { bytes: new Blob([new Uint8Array([7])]) } as BinOut };
  }
}

beforeAll(() => {
  TaskRegistry.registerTask(BlobStreamTask as any);
  TaskRegistry.registerTask(TwoPortStreamTask as any);
  TaskRegistry.registerTask(ExplicitFinishPayloadTask as any);
});

/**
 * Builds a sink that drains its chunk stream into a byte list and returns a
 * CacheRef with a random `inmem://test/…` `$ref`.
 *
 * @returns `sink` to hand to the processor, and `collected`, which resolves
 * with the ref and every byte seen once the stream ends, or rejects if
 * iterating the stream throws (the error is also rethrown from the sink).
 */
function makeSink(): {
  sink: BinaryRefSink;
  collected: Promise<{ ref: CacheRef; bytes: number[] }>;
} {
  const $ref = `inmem://test/${Math.random().toString(36).slice(2)}`;
  // Placeholders; the Promise executor below runs synchronously and replaces them.
  let resolveCollected: (v: { ref: CacheRef; bytes: number[] }) => void = () => {};
  let rejectCollected: (e: unknown) => void = () => {};
  const collected = new Promise<{ ref: CacheRef; bytes: number[] }>((res, rej) => {
    resolveCollected = res;
    rejectCollected = rej;
  });
  const sink: BinaryRefSink = async (chunks) => {
    const bytes: number[] = [];
    try {
      for await (const c of chunks) {
        for (const b of c) bytes.push(b);
      }
    } catch (err) {
      rejectCollected(err);
      throw err;
    }
    const ref = makeCacheRef({ $ref, size: bytes.length, mime: "application/octet-stream" });
    resolveCollected({ ref, bytes });
    return ref;
  };
  return { sink, collected };
}

/**
 * Drives a task's private stream processor directly with the given per-port
 * sinks injected, using the minimal ctx + deps the processor needs.
 *
 * Shared by every test below so the white-box harness lives in one place.
 */
async function runWithSinks(
  task: unknown,
  binaryRefSinks: Map<string, BinaryRefSink>
): Promise<unknown> {
  // White-box access: the processor hangs off the task's private runner.
  const processor = (task as any).runner.streamProcessor as {
    run(input: any, ctx: any, deps: any): Promise<unknown>;
  };

  const abortController = new AbortController();
  const ctx = {
    abortController,
    shouldAccumulate: true,
    registry: undefined,
    runId: undefined,
    runStartedAt: new Date(),
    runOutputData: {},
    telemetrySpan: undefined,
    dispose: () => {},
  } as any;

  return processor.run({}, ctx, {
    registry: undefined as any,
    resourceScope: undefined,
    inputStreams: undefined,
    onProgress: async () => {},
    own: <T>(t: T) => t,
    binaryRefSinks,
  });
}

describe("StreamProcessor — binaryRefSinks (direct deps wiring)", () => {
  it("routes a single binary port to its sink and produces CacheRef in Output", async () => {
    const task = new BlobStreamTask();
    const { sink, collected } = makeSink();

    const output = (await runWithSinks(task, new Map([["bytes", sink]]))) as BinOut;

    expect(output).toBeDefined();
    expect(isCacheRef((output as any).bytes)).toBe(true);
    const { ref, bytes } = await collected;
    expect(ref.size).toBe(3);
    expect(bytes).toEqual([1, 2, 3]);
    expect((output as any).bytes.$ref).toBe(ref.$ref);
  });

  it("routes only the configured port; other binary ports continue to accumulate", async () => {
    const task = new TwoPortStreamTask();
    const { sink: audioSink, collected: audioCollected } = makeSink();

    const output = (await runWithSinks(task, new Map([["audio", audioSink]]))) as TwoBinOut;

    expect(isCacheRef((output as any).audio)).toBe(true);
    expect((output as any).transcript).toBeInstanceOf(Blob);
    const { bytes: audioBytes } = await audioCollected;
    expect(audioBytes).toEqual([10, 11, 12]);
    const transcriptBytes = new Uint8Array(await (output.transcript as Blob).arrayBuffer());
    expect(Array.from(transcriptBytes)).toEqual([20]);
  });

  it("explicit binary finish payload wins over the sink's CacheRef (artifact precedence)", async () => {
    const task = new ExplicitFinishPayloadTask();
    const { sink, collected } = makeSink();

    const output = (await runWithSinks(task, new Map([["bytes", sink]]))) as BinOut;

    // The explicit finish payload (Blob of [7]) takes the slot, not the ref.
    expect(isCacheRef((output as any).bytes)).toBe(false);
    expect((output as any).bytes).toBeInstanceOf(Blob);
    const blobBytes = new Uint8Array(await (output.bytes as Blob).arrayBuffer());
    expect(Array.from(blobBytes)).toEqual([7]);
    // The sink still observed the deltas (just lost the race for the slot).
    const { bytes } = await collected;
    expect(bytes).toEqual([9, 9]);
  });
});
*/ -import type { CachePolicy, StreamEvent } from "@workglow/task-graph"; +import type { BinaryRefSink, CachePolicy, CacheRef, StreamEvent } from "@workglow/task-graph"; import { Dataflow, IExecuteContext, + isCacheRef, + makeCacheRef, Task, TaskGraph, TaskGraphRunner, @@ -314,6 +316,198 @@ describe("Streaming backpressure and stress", () => { }); }); + describe("binary backpressure", () => { + // Sized so a fast producer would overwhelm a slow sink: 100 chunks of 1 MiB + // each = 100 MiB total, sink consumes one chunk every 50 ms (~5 s end-to-end). + const CHUNK = 1024 * 1024; + const CHUNKS = 100; + const HIGH_WATER = 4 * 1024 * 1024; + + type BinOut = { bytes: Blob }; + + /** + * Streams `CHUNKS` × `CHUNK` bytes; awaits the producer-side `push` + * promise so the byte-bounded backpressure check actually parks when the + * router buffer reaches the high-water mark. + */ + class FastBinaryProducer extends Task, BinOut> { + public static override type = "StreamingBackpressure_FastBinaryProducer"; + public static override cachePolicy: CachePolicy = { kind: "none" }; + public static override cacheable = true; + + public static override inputSchema(): DataPortSchema { + return { type: "object", properties: {}, additionalProperties: false } as const; + } + public static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + async *executeStream( + _input: Record, + _ctx: IExecuteContext + ): AsyncIterable> { + for (let i = 0; i < CHUNKS; i++) { + const chunk = new Uint8Array(CHUNK).fill(i & 0xff); + yield { type: "binary-delta", port: "bytes", binaryDelta: chunk }; + } + yield { type: "finish", data: {} as BinOut }; + } + } + + it("keeps router buffer at-or-below the high-water mark while a slow sink drains", async () => { + const router = await import("@workglow/task-graph"); + const { 
BinaryStreamRouter } = router as unknown as { + BinaryStreamRouter: new ( + sink: BinaryRefSink, + highWaterMarkBytes: number + ) => { + push(chunk: Uint8Array): Promise; + end(): void; + ref(): Promise; + readonly _bufferedBytes: number; + }; + }; + + let observedPeak = 0; + const consumed: number[] = []; + const sink: BinaryRefSink = async (chunks) => { + for await (const c of chunks) { + consumed.push(c.byteLength); + // Slow consumer: 50 ms per chunk. + await new Promise((res) => setTimeout(res, 50)); + } + return makeCacheRef({ $ref: "inmem://bp", size: consumed.reduce((a, b) => a + b, 0) }); + }; + + const r = new BinaryStreamRouter(sink, HIGH_WATER); + for (let i = 0; i < CHUNKS; i++) { + const chunk = new Uint8Array(CHUNK).fill(i & 0xff); + await r.push(chunk); + observedPeak = Math.max(observedPeak, r._bufferedBytes); + } + r.end(); + const ref = await r.ref(); + + // Peak buffer stayed at-or-below the high-water mark + at most one chunk + // (the chunk that pushed us over the mark is counted before we park). + expect(observedPeak).toBeLessThanOrEqual(HIGH_WATER + CHUNK); + + // Every byte was delivered. + const totalDelivered = consumed.reduce((a, b) => a + b, 0); + expect(totalDelivered).toBe(CHUNKS * CHUNK); + expect(ref.size).toBe(CHUNKS * CHUNK); + expect(isCacheRef(ref)).toBe(true); + }, 30_000); + + it("releases a parked push() promise within 100ms when the router is ended (abort path)", async () => { + const router = await import("@workglow/task-graph"); + const { BinaryStreamRouter } = router as unknown as { + BinaryStreamRouter: new ( + sink: BinaryRefSink, + highWaterMarkBytes: number + ) => { + push(chunk: Uint8Array): Promise; + end(): void; + fail(err: Error): void; + ref(): Promise; + readonly _bufferedBytes: number; + }; + }; + + // Sink starts consuming but the gate keeps it parked so the producer + // buffer fills to the high-water mark. 
The orphaned-Promise bug pre-fix: + // `end()` did not release the parked producer, so the test would + // wait until the test-level timeout. + let gateRelease: (() => void) | undefined; + const gate = new Promise((res) => { + gateRelease = res; + }); + const seen: Uint8Array[] = []; + const sink: BinaryRefSink = async (chunks) => { + await gate; // park the sink so the buffer stays full + for await (const c of chunks) { + seen.push(c); + } + return makeCacheRef({ $ref: "inmem://parked", size: 0 }); + }; + + // High-water mark of 1 byte so a single non-empty chunk parks the next push. + const r = new BinaryStreamRouter(sink, 1); + // The first push parks immediately (1 byte >= 1-byte mark, no consumer). + const parked = r.push(new Uint8Array([0xff])); + + let parkedResolved = false; + parked.then(() => { + parkedResolved = true; + }); + + // Park before measuring. + await new Promise((res) => setTimeout(res, 10)); + expect(parkedResolved).toBe(false); + + const t0 = Date.now(); + r.end(); + await parked; + const elapsed = Date.now() - t0; + + expect(parkedResolved).toBe(true); + expect(elapsed).toBeLessThan(100); + + // Release the sink so the test exits cleanly. + gateRelease?.(); + await r.ref(); + void seen; + }, 10_000); + + it("runs a 100MiB stream end-to-end through StreamProcessor with byte-bounded backpressure", async () => { + const producer = new FastBinaryProducer({ id: "producer" }); + + let received = 0; + const sink: BinaryRefSink = async (chunks) => { + for await (const c of chunks) { + received += c.byteLength; + // Slow consumer — gives the producer time to outrun it without + // bound if backpressure didn't apply. The byte-bounded invariant + // itself is exercised directly in the BinaryStreamRouter unit + // test above; here we verify the full StreamProcessor path + // delivers every byte through the throttle without hangs or drops. 
+ await new Promise((res) => setTimeout(res, 2)); + } + return makeCacheRef({ $ref: "inmem://e2e", size: received }); + }; + + const processor = (producer as any).runner.streamProcessor as { + run(input: any, ctx: any, deps: any): Promise; + }; + const abortController = new AbortController(); + const ctx = { + abortController, + shouldAccumulate: false, + telemetrySpan: undefined, + dispose: () => {}, + } as any; + const sinks: ReadonlyMap = new Map([["bytes", sink]]); + + const output = (await processor.run({}, ctx, { + registry: undefined as any, + resourceScope: undefined, + inputStreams: undefined, + onProgress: async () => {}, + own: (t: T) => t, + binaryRefSinks: sinks, + binaryHighWaterBytes: HIGH_WATER, + })) as BinOut | undefined; + + expect(output).toBeDefined(); + // Every byte arrived; backpressure didn't drop chunks. + expect(received).toBe(CHUNKS * CHUNK); + }, 30_000); + }); + describe("StreamError propagation", () => { it("fails the source task on a StreamError event and keeps downstream from completing", async () => { const graph = new TaskGraph(); diff --git a/packages/test/src/test/task-graph/TaskOutputRepositoryStream.test.ts b/packages/test/src/test/task-graph/TaskOutputRepositoryStream.test.ts new file mode 100644 index 000000000..7e9980efd --- /dev/null +++ b/packages/test/src/test/task-graph/TaskOutputRepositoryStream.test.ts @@ -0,0 +1,106 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ +import { RunPrivateCacheRepo } from "@workglow/task-graph"; +import { describe, expect, it } from "vitest"; +import { NonStreamingMemoryRepo, StreamingMemoryRepo } from "../../binding/StreamingMemoryRepo"; + +async function* gen(...chunks: Uint8Array[]): AsyncIterable { + for (const c of chunks) yield c; +} + +describe("TaskOutputRepository.saveOutputStream", () => { + it("supportsStreaming reflects presence of saveOutputStream", () => { + expect(new StreamingMemoryRepo({}).supportsStreaming()).toBe(true); 
+ expect(new NonStreamingMemoryRepo({}).supportsStreaming()).toBe(false); + }); + + it("streams chunks and the total equals total bytes streamed", async () => { + const repo = new StreamingMemoryRepo({}); + await repo.saveOutputStream( + "T", + { k: 1 }, + gen(new Uint8Array([1, 2]), new Uint8Array([3])), + {} + ); + expect(Array.from(repo.streamed.get('T{"k":1}')!)).toEqual([1, 2, 3]); + }); + + it("an empty stream stores a zero-length Uint8Array", async () => { + const repo = new StreamingMemoryRepo({}); + await repo.saveOutputStream("T", { k: 1 }, gen(), {}); + const stored = repo.streamed.get('T{"k":1}')!; + expect(stored).toBeInstanceOf(Uint8Array); + expect(stored.byteLength).toBe(0); + }); + + it("passes the metadata arg through to the repo (side-band contract)", async () => { + const repo = new StreamingMemoryRepo({}); + const metadata = { contentType: "application/octet-stream", status: 200 }; + await repo.saveOutputStream("T", { k: 1 }, gen(new Uint8Array([9])), metadata); + expect(repo.streamedMetadata.get('T{"k":1}')).toEqual(metadata); + }); + + it("RunPrivateCacheRepo forwards streaming with namespaced taskType", async () => { + const backing = new StreamingMemoryRepo({}); + const wrapper = new RunPrivateCacheRepo({ backing, runId: "run-A" }); + + expect(wrapper.supportsStreaming()).toBe(true); + + await wrapper.saveOutputStream("T", { k: 1 }, gen(new Uint8Array([1, 2, 3])), {}); + + // taskType is namespaced exactly as saveOutput namespaces it. 
+ const namespacedKey = `__run:run-A::T${JSON.stringify({ k: 1 })}`; + expect(Array.from(backing.streamed.get(namespacedKey)!)).toEqual([1, 2, 3]); + expect(backing.streamed.has('T{"k":1}')).toBe(false); + }); + + it("RunPrivateCacheRepo.supportsStreaming() is false when backing lacks it", () => { + const backing = new NonStreamingMemoryRepo({}); + const wrapper = new RunPrivateCacheRepo({ backing, runId: "run-A" }); + expect(wrapper.supportsStreaming()).toBe(false); + }); + + it("saveOutputStream returns a CacheRef the same backing can resolve to bytes", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = await repo.saveOutputStream("T", { k: 1 }, gen(new Uint8Array([7, 8, 9])), {}); + expect(typeof ref.$ref).toBe("string"); + expect(ref.size).toBe(3); + const hydrated = await repo.getOutputByRef(ref); + expect(hydrated).toBeInstanceOf(Blob); + const bytes = new Uint8Array(await hydrated!.arrayBuffer()); + expect(Array.from(bytes)).toEqual([7, 8, 9]); + }); + + it("getOutputStreamByRef yields bytes for a saved ref", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = await repo.saveOutputStream("T", { k: 2 }, gen(new Uint8Array([4, 5])), {}); + const stream = repo.getOutputStreamByRef(ref); + expect(stream).toBeDefined(); + const collected: number[] = []; + for await (const chunk of stream!) 
{ + for (const b of chunk) collected.push(b); + } + expect(collected).toEqual([4, 5]); + }); + + it("getOutputByRef returns undefined after clear (dangling reference)", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = await repo.saveOutputStream("T", { k: 3 }, gen(new Uint8Array([1])), {}); + expect(await repo.getOutputByRef(ref)).toBeInstanceOf(Blob); + await repo.clear(); + expect(await repo.getOutputByRef(ref)).toBeUndefined(); + }); + + it("RunPrivateCacheRepo forwards getOutputByRef to backing", async () => { + const backing = new StreamingMemoryRepo({}); + const wrapper = new RunPrivateCacheRepo({ backing, runId: "run-B" }); + const ref = await wrapper.saveOutputStream("T", { k: 4 }, gen(new Uint8Array([42])), {}); + const hydrated = await wrapper.getOutputByRef(ref); + expect(hydrated).toBeInstanceOf(Blob); + const bytes = new Uint8Array(await hydrated!.arrayBuffer()); + expect(Array.from(bytes)).toEqual([42]); + }); +}); diff --git a/packages/test/src/test/task-graph/TaskRegistry.test.ts b/packages/test/src/test/task-graph/TaskRegistry.test.ts index 1a65cd88f..b0bdf7267 100644 --- a/packages/test/src/test/task-graph/TaskRegistry.test.ts +++ b/packages/test/src/test/task-graph/TaskRegistry.test.ts @@ -30,10 +30,58 @@ class TaskB extends Task { } } +// Binary-stream port format checks happen at registration time so a typo +// surfaces near the task definition rather than during a streaming run. 
+ +class BinaryPortTypoTask extends Task { + static override readonly type = "TaskRegistryTest_BinaryFormatTypo"; + static override inputSchema(): DataPortSchema { + return { type: "object", properties: {}, additionalProperties: false } as const; + } + static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { type: "object", format: "Blob", "x-stream": "binary" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } +} + +class BinaryPortValidBlobTask extends Task { + static override readonly type = "TaskRegistryTest_BinaryValidBlob"; + static override inputSchema(): DataPortSchema { + return { type: "object", properties: {}, additionalProperties: false } as const; + } + static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } +} + +class BinaryPortValidBinaryTask extends Task { + static override readonly type = "TaskRegistryTest_BinaryValidBinary"; + static override inputSchema(): DataPortSchema { + return { type: "object", properties: {}, additionalProperties: false } as const; + } + static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { type: "object", format: "binary", "x-stream": "binary" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } +} + describe("TaskRegistry", () => { afterEach(() => { // Clean up any registrations made during the test TaskRegistry.unregisterTask(TaskA.type); + TaskRegistry.unregisterTask(BinaryPortTypoTask.type); + TaskRegistry.unregisterTask(BinaryPortValidBlobTask.type); + TaskRegistry.unregisterTask(BinaryPortValidBinaryTask.type); }); it("registers a task constructor", () => { @@ -67,4 +115,22 @@ describe("TaskRegistry", () => { it("unregisterTask returns false when the type was not registered", () => { 
expect(TaskRegistry.unregisterTask("NonExistentType")).toBe(false); }); + + it("throws at registration when a binary port uses a typo format like 'Blob'", () => { + expect(() => TaskRegistry.registerTask(BinaryPortTypoTask)).toThrow( + /invalid binary stream port/ + ); + // And the task is NOT in the registry afterwards. + expect(TaskRegistry.all.get(BinaryPortTypoTask.type)).toBeUndefined(); + }); + + it("accepts a binary port with format 'blob'", () => { + expect(() => TaskRegistry.registerTask(BinaryPortValidBlobTask)).not.toThrow(); + expect(TaskRegistry.all.get(BinaryPortValidBlobTask.type)).toBe(BinaryPortValidBlobTask); + }); + + it("accepts a binary port with format 'binary'", () => { + expect(() => TaskRegistry.registerTask(BinaryPortValidBinaryTask)).not.toThrow(); + expect(TaskRegistry.all.get(BinaryPortValidBinaryTask.type)).toBe(BinaryPortValidBinaryTask); + }); }); diff --git a/packages/test/src/test/task-graph/TaskRunnerInputHydration.test.ts b/packages/test/src/test/task-graph/TaskRunnerInputHydration.test.ts new file mode 100644 index 000000000..07b1e443e --- /dev/null +++ b/packages/test/src/test/task-graph/TaskRunnerInputHydration.test.ts @@ -0,0 +1,109 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ +import { CACHE_REGISTRY, DefaultCacheRegistry, makeCacheRef, Task } from "@workglow/task-graph"; +import { Container, ServiceRegistry } from "@workglow/util"; +import type { DataPortSchema } from "@workglow/util/schema"; +import { beforeEach, describe, expect, it } from "vitest"; +import { StreamingMemoryRepo } from "../../binding/StreamingMemoryRepo"; + +async function* gen(...chunks: Uint8Array[]): AsyncIterable { + for (const c of chunks) yield c; +} + +type BlobInput = { bytes: unknown }; +type Out = { text: string }; + +class BlobInputTask extends Task { + public static override type = "InputHydration_BlobInput"; + public static override category = "Test"; + public static override cacheable = 
false; + + public received: unknown; + + public static override inputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { format: "blob", title: "Bytes" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + public static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { text: { type: "string" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + override async execute(input: BlobInput): Promise { + this.received = input.bytes; + return { text: "ok" }; + } +} + +class BinaryInputTask extends BlobInputTask { + public static override type = "InputHydration_BinaryInput"; + + public static override inputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { format: "binary", title: "Bytes" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } +} + +let repo: StreamingMemoryRepo; +let services: ServiceRegistry; + +beforeEach(() => { + repo = new StreamingMemoryRepo({}); + services = new ServiceRegistry(new Container()); + services.registerInstance(CACHE_REGISTRY, new DefaultCacheRegistry({ deterministic: repo })); +}); + +describe("TaskRunner input-side CacheRef hydration", () => { + it("hydrates a branded ref to a Blob for a format:'blob' input port", async () => { + const ref = await repo.saveOutputStream("Up", { n: 1 }, gen(new Uint8Array([1, 2, 3])), {}); + const task = new BlobInputTask(); + await task.run({ bytes: ref }, { registry: services }); + + expect(task.received).toBeInstanceOf(Blob); + const bytes = new Uint8Array(await (task.received as Blob).arrayBuffer()); + expect(Array.from(bytes)).toEqual([1, 2, 3]); + }); + + it("hydrates a branded ref to an ArrayBuffer for a format:'binary' input port", async () => { + const ref = await repo.saveOutputStream("Up", { n: 2 }, gen(new Uint8Array([4, 5])), {}); + const task = new BinaryInputTask(); + await task.run({ bytes: ref }, { registry: 
services }); + + expect(task.received).toBeInstanceOf(ArrayBuffer); + expect(Array.from(new Uint8Array(task.received as ArrayBuffer))).toEqual([4, 5]); + }); + + it("rejects with an error naming the port for a dangling ref", async () => { + const ref = makeCacheRef({ $ref: "inmem://missing" }); + const task = new BlobInputTask(); + await expect(task.run({ bytes: ref }, { registry: services })).rejects.toThrow(/"bytes"/); + }); + + it("passes ref-free inputs through untouched", async () => { + const blob = new Blob([new Uint8Array([9])]); + const task = new BlobInputTask(); + await task.run({ bytes: blob }, { registry: services }); + expect(task.received).toBe(blob); + }); + + it("leaves refs in place when no cache backing offers readers", async () => { + const bare = new ServiceRegistry(new Container()); + const ref = makeCacheRef({ $ref: "inmem://whatever" }); + const task = new BlobInputTask(); + await task.run({ bytes: ref }, { registry: bare }); + expect(task.received).toBe(ref); + }); +}); diff --git a/packages/test/src/test/task-graph/TaskRunnerRefPath.test.ts b/packages/test/src/test/task-graph/TaskRunnerRefPath.test.ts new file mode 100644 index 000000000..9cc216141 --- /dev/null +++ b/packages/test/src/test/task-graph/TaskRunnerRefPath.test.ts @@ -0,0 +1,236 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { CacheRef, StreamEvent, TaskInput, TaskOutput } from "@workglow/task-graph"; +import { + CACHE_REGISTRY, + DefaultCacheRegistry, + IExecuteContext, + isCacheRef, + makeCacheRef, + Task, + TaskOutputRepository, + TaskRegistry, +} from "@workglow/task-graph"; +import { Container, ServiceRegistry, sleep } from "@workglow/util"; +import { DataPortSchema } from "@workglow/util/schema"; +import { beforeAll, beforeEach, describe, expect, it } from "vitest"; + +type BinOut = { bytes: Blob | ArrayBuffer }; + +class StreamingMemoryRepo extends TaskOutputRepository { + public readonly saved = new 
Map(); + public readonly streamed = new Map(); + public saveOutputCalls = 0; + public saveOutputStreamCalls = 0; + + override async saveOutput(t: string, i: TaskInput, o: TaskOutput): Promise { + this.saveOutputCalls++; + this.saved.set(t + JSON.stringify(i), o); + } + override async getOutput(t: string, i: TaskInput): Promise { + return this.saved.get(t + JSON.stringify(i)); + } + override async clear(): Promise { + this.saved.clear(); + this.streamed.clear(); + } + override async size(): Promise { + return this.saved.size; + } + override async clearOlderThan(): Promise {} + override isDurable(): boolean { + return false; + } + override async saveOutputStream( + taskType: string, + inputs: TaskInput, + chunks: AsyncIterable, + _metadata: Record + ): Promise { + this.saveOutputStreamCalls++; + const parts: Uint8Array[] = []; + let size = 0; + for await (const c of chunks) { + parts.push(c); + size += c.byteLength; + } + const merged = new Uint8Array(size); + let off = 0; + for (const p of parts) { + merged.set(p, off); + off += p.byteLength; + } + const key = `inmem://${taskType}::${JSON.stringify(inputs)}`; + this.streamed.set(key, merged); + return makeCacheRef({ $ref: key, size, mime: "application/octet-stream" }); + } + override async getOutputByRef(ref: CacheRef): Promise { + const bytes = this.streamed.get(ref.$ref); + return bytes === undefined ? 
undefined : new Blob([bytes as unknown as BlobPart]); + } +} + +class BlobStreamTask extends Task, BinOut> { + public static override type = "TaskRunnerRefPathTest_BlobStream"; + public static override category = "Test"; + public static override cacheable = true; + + public static override outputSchema(): DataPortSchema { + return { + type: "object", + properties: { bytes: { type: "object", format: "blob", "x-stream": "binary" } }, + additionalProperties: false, + } as const satisfies DataPortSchema; + } + + async *executeStream( + _input: Record, + _ctx: IExecuteContext + ): AsyncIterable> { + yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([1, 2, 3]) }; + await sleep(1); + yield { type: "binary-delta", port: "bytes", binaryDelta: new Uint8Array([4, 5]) }; + yield { type: "finish", data: {} as BinOut }; + } +} + +class NonCacheableBlobStreamTask extends BlobStreamTask { + public static override type = "TaskRunnerRefPathTest_NonCacheableBlobStream"; + public static override cacheable = false; +} + +beforeAll(() => { + TaskRegistry.registerTask(BlobStreamTask as any); + TaskRegistry.registerTask(NonCacheableBlobStreamTask as any); +}); + +let repo: StreamingMemoryRepo; +let services: ServiceRegistry; +beforeEach(() => { + repo = new StreamingMemoryRepo({}); + services = new ServiceRegistry(new Container()); + services.registerInstance(CACHE_REGISTRY, new DefaultCacheRegistry({ deterministic: repo })); +}); + +describe("TaskRunner — referenceThresholdBytes: 0 (force-ref) ref path", () => { + it("Output carries a CacheRef at the binary port; bytes live in the streaming cache", async () => { + const task = new BlobStreamTask(); + const output = await task.run({}, { registry: services, referenceThresholdBytes: 0 }); + + expect(repo.saveOutputStreamCalls).toBe(1); + expect(isCacheRef(output.bytes)).toBe(true); + + const ref = output.bytes as unknown as CacheRef; + const hydrated = await repo.getOutputByRef(ref); + 
expect(hydrated).toBeInstanceOf(Blob); + const bytes = new Uint8Array(await hydrated!.arrayBuffer()); + expect(Array.from(bytes)).toEqual([1, 2, 3, 4, 5]); + expect(ref.size).toBe(5); + }); + + it("saveOutput still runs (small Output with embedded ref → small queue/cache row)", async () => { + const task = new BlobStreamTask(); + await task.run({}, { registry: services, referenceThresholdBytes: 0 }); + + expect(repo.saveOutputCalls).toBe(1); + // The cached small row contains the ref, NOT the bytes. + const [savedOutput] = Array.from(repo.saved.values()); + expect(isCacheRef((savedOutput as any).bytes)).toBe(true); + }); + + it("defaults (threshold 64 KiB) produce inline Blob in Output — small outputs rehydrate", async () => { + const task = new BlobStreamTask(); + const output = await task.run({}, { registry: services }); + + // D.4: sink runs unconditionally when cache supports streaming; the + // rehydrate step converts the ref back to an inline Blob because total + // bytes (5) is below the 64 KiB default threshold. + expect(repo.saveOutputStreamCalls).toBe(1); + expect(output.bytes).toBeInstanceOf(Blob); + const bytes = new Uint8Array(await (output.bytes as Blob).arrayBuffer()); + expect(Array.from(bytes)).toEqual([1, 2, 3, 4, 5]); + }); + + it("non-cacheable tasks fall through to accumulation even with threshold=0", async () => { + const task = new NonCacheableBlobStreamTask(); + const output = await task.run({}, { registry: services, referenceThresholdBytes: 0 }); + + expect(repo.saveOutputStreamCalls).toBe(0); + expect(output.bytes).toBeInstanceOf(Blob); + }); +}); + +describe("TaskRunner — Phase D.4 threshold-based size decision", () => { + it("output below threshold is rehydrated to an inline Blob (sink still ran for memory bound)", async () => { + const task = new BlobStreamTask(); + // Threshold well above the 5 bytes the task produces → rehydrate inline. 
+ const output = await task.run({}, { registry: services, referenceThresholdBytes: 100 }); + + expect(repo.saveOutputStreamCalls).toBe(1); // sink ran (memory-bounded write) + expect(output.bytes).toBeInstanceOf(Blob); // but the slot is now an inline Blob + const bytes = new Uint8Array(await (output.bytes as Blob).arrayBuffer()); + expect(Array.from(bytes)).toEqual([1, 2, 3, 4, 5]); + }); + + it("output at or above threshold keeps the CacheRef", async () => { + const task = new BlobStreamTask(); + // 5 bytes >= threshold 5 → ref survives. + const output = await task.run({}, { registry: services, referenceThresholdBytes: 5 }); + + expect(repo.saveOutputStreamCalls).toBe(1); + expect(isCacheRef(output.bytes)).toBe(true); + expect((output.bytes as unknown as CacheRef).size).toBe(5); + }); + + it("threshold=0 (force-ref) overrides the size check; the ref survives regardless", async () => { + const task = new BlobStreamTask(); + const output = await task.run({}, { registry: services, referenceThresholdBytes: 0 }); + + expect(repo.saveOutputStreamCalls).toBe(1); + expect(isCacheRef(output.bytes)).toBe(true); + }); + + it("default threshold (64 KiB) rehydrates the small-output path automatically", async () => { + const task = new BlobStreamTask(); + // No threshold specified → resolves to 64 KiB default; 5 bytes is below. 
+ const output = await task.run({}, { registry: services }); + + expect(repo.saveOutputStreamCalls).toBe(1); // sink now always runs when cache supports it + expect(output.bytes).toBeInstanceOf(Blob); + }); +}); + +describe("TaskRunner — orphan blob cleanup on row-save failure", () => { + it("deletes the streamed blob when saveOutput throws so the ref does not leak", async () => { + class FailingRepo extends StreamingMemoryRepo { + public deletedRefs: CacheRef[] = []; + override async saveOutput(): Promise { + throw new Error("row write failed"); + } + override async deleteOutputByRef(ref: CacheRef): Promise { + this.deletedRefs.push(ref); + this.streamed.delete(ref.$ref); + } + } + const failing = new FailingRepo({}); + const failingServices = new ServiceRegistry(new Container()); + failingServices.registerInstance( + CACHE_REGISTRY, + new DefaultCacheRegistry({ deterministic: failing }) + ); + const task = new BlobStreamTask(); + await expect( + task.run({}, { registry: failingServices, referenceThresholdBytes: 0 }) + ).rejects.toThrow("row write failed"); + + // Sink wrote the blob, but the row commit failed — the runner should + // have asked the repo to delete the orphan blob before re-throwing. 
+ expect(failing.saveOutputStreamCalls).toBe(1); + expect(failing.deletedRefs.length).toBe(1); + expect(failing.streamed.size).toBe(0); + }); +}); diff --git a/packages/test/src/test/task-graph/resolveJobOutput.test.ts b/packages/test/src/test/task-graph/resolveJobOutput.test.ts new file mode 100644 index 000000000..0f321a8c5 --- /dev/null +++ b/packages/test/src/test/task-graph/resolveJobOutput.test.ts @@ -0,0 +1,99 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { CacheRef, CacheRefResolver, JobHandleLike } from "@workglow/task-graph"; +import { makeCacheRef, resolveJobOutput } from "@workglow/task-graph"; +import { describe, expect, it, vi } from "vitest"; + +const handleOf = (value: T): JobHandleLike => ({ + waitFor: async () => value, +}); + +const ref = (key: string, size = 0): CacheRef => makeCacheRef({ $ref: key, size }); + +describe("resolveJobOutput", () => { + it("awaits the job and hydrates a top-level ref through a function resolver", async () => { + const blob = new Blob([new Uint8Array([1, 2, 3])]); + const resolver: CacheRefResolver = async (r) => (r.$ref === "cache://A" ? blob : undefined); + const handle = handleOf({ bytes: ref("cache://A", 3) as unknown as Blob }); + const out = await resolveJobOutput(handle, resolver); + expect(out.bytes).toBe(blob); + }); + + it("accepts an object with getOutputByRef (TaskOutputRepository shape)", async () => { + const blob = new Blob([new Uint8Array([7, 8])]); + const backing = { + getOutputByRef: async (r: CacheRef) => (r.$ref === "cache://B" ? 
blob : undefined), + }; + const handle = handleOf({ payload: ref("cache://B", 2) as unknown as Blob }); + const out = await resolveJobOutput(handle, backing); + expect(out.payload).toBe(blob); + }); + + it("returns the output unchanged when the backing has no getOutputByRef", async () => { + const original = { bytes: ref("cache://x", 1) as unknown as Blob }; + const handle = handleOf(original); + const out = await resolveJobOutput(handle, {}); + expect(out).toBe(original); + }); + + it("replaces refs with undefined on cache miss (best-effort)", async () => { + const handle = handleOf({ bytes: ref("cache://missing", 1) as unknown as Blob }); + const out = await resolveJobOutput(handle, async () => undefined); + expect(out.bytes).toBeUndefined(); + }); + + it("walks nested structures", async () => { + const blob = new Blob([new Uint8Array([42])]); + const handle = handleOf({ + meta: { lang: "en" }, + payload: { audio: ref("cache://A", 1) as unknown as Blob }, + }); + const out = await resolveJobOutput(handle, async () => blob); + expect(out.meta).toEqual({ lang: "en" }); + expect(out.payload.audio).toBe(blob); + }); + + it("propagates rejection from the underlying handle.waitFor()", async () => { + const handle: JobHandleLike = { + waitFor: async () => { + throw new Error("job failed"); + }, + }; + await expect(resolveJobOutput(handle, async () => undefined)).rejects.toThrow("job failed"); + }); + + it("never invokes getOutputByRef for attacker-supplied {$ref} shapes without the brand", async () => { + // Cross-tenant attack vector: a task output contains a metadata field whose + // shape collides with the legacy CacheRef discriminator + // (`{$ref: "cache://OTHER_RUN/secret"}`). With the literal `kind` brand, + // the resolver MUST NOT pass the unbranded value to `getOutputByRef`, so + // bytes from another run/tenant can't be read by shape collision alone. 
+ const getOutputByRef = vi.fn(async (_r: CacheRef) => new Blob([new Uint8Array([1, 2, 3])])); + const backing = { getOutputByRef }; + const handle = handleOf({ note: { $ref: "cache://OTHER_RUN/secret" } }); + const out = await resolveJobOutput(handle, backing); + expect(out.note).toEqual({ $ref: "cache://OTHER_RUN/secret" }); + expect(getOutputByRef).not.toHaveBeenCalled(); + }); + + it("forwards ResolveOutputOptions to the underlying walker", async () => { + let inFlight = 0; + let observedMax = 0; + const resolver: CacheRefResolver = async () => { + inFlight++; + observedMax = Math.max(observedMax, inFlight); + await new Promise((res) => setTimeout(res, 5)); + inFlight--; + return new Blob(); + }; + const handle = handleOf( + Array.from({ length: 6 }, (_, i) => ref(`cache://r${i}`, 1) as unknown as Blob) + ); + await resolveJobOutput(handle, resolver, { concurrency: 2 }); + expect(observedMax).toBeLessThanOrEqual(2); + }); +}); diff --git a/packages/test/src/test/task-graph/resolveJobOutputStream.test.ts b/packages/test/src/test/task-graph/resolveJobOutputStream.test.ts new file mode 100644 index 000000000..d2e34738c --- /dev/null +++ b/packages/test/src/test/task-graph/resolveJobOutputStream.test.ts @@ -0,0 +1,96 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ +import { + makeCacheRef, + makeJobOutputStreamResolver, + resolveJobOutputStream, +} from "@workglow/task-graph"; +import { describe, expect, it } from "vitest"; +import { StreamingMemoryRepo } from "../../binding/StreamingMemoryRepo"; + +async function* gen(...chunks: Uint8Array[]): AsyncIterable { + for (const c of chunks) yield c; +} + +async function collect(stream: AsyncIterable): Promise { + const out: number[] = []; + for await (const chunk of stream) for (const b of chunk) out.push(b); + return out; +} + +const handleFor = (output: T) => ({ waitFor: () => Promise.resolve(output) }); + +describe("resolveJobOutputStream", () => { + it("streams the 
ref at an explicit port", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = await repo.saveOutputStream("T", { a: 1 }, gen(new Uint8Array([1, 2, 3])), {}); + const handle = handleFor({ transcript: "hi", audio: ref }); + const stream = await resolveJobOutputStream(handle, repo, "audio"); + expect(stream).toBeDefined(); + expect(await collect(stream!)).toEqual([1, 2, 3]); + }); + + it("auto-discovers a single nested ref without a port", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = await repo.saveOutputStream("T", { a: 2 }, gen(new Uint8Array([9, 8])), {}); + const handle = handleFor({ meta: { inner: [ref] }, note: "x" }); + const stream = await resolveJobOutputStream(handle, repo); + expect(await collect(stream!)).toEqual([9, 8]); + }); + + it("resolves undefined when the output has no refs and no port given", async () => { + const repo = new StreamingMemoryRepo({}); + expect(await resolveJobOutputStream(handleFor({ text: "plain" }), repo)).toBeUndefined(); + }); + + it("throws for two refs without an explicit port", async () => { + const repo = new StreamingMemoryRepo({}); + const r1 = await repo.saveOutputStream("T", { a: 3 }, gen(new Uint8Array([1])), {}); + const r2 = await repo.saveOutputStream("T", { a: 4 }, gen(new Uint8Array([2])), {}); + await expect(resolveJobOutputStream(handleFor({ x: r1, y: r2 }), repo)).rejects.toThrow( + /explicit port/ + ); + }); + + it("adapts inline Blob / ArrayBuffer / Uint8Array values at a named port", async () => { + const repo = new StreamingMemoryRepo({}); + const blobStream = await resolveJobOutputStream( + handleFor({ data: new Blob([new Uint8Array([1, 2])]) }), + repo, + "data" + ); + expect(await collect(blobStream!)).toEqual([1, 2]); + + const abStream = await resolveJobOutputStream( + handleFor({ data: new Uint8Array([3, 4]).buffer }), + repo, + "data" + ); + expect(await collect(abStream!)).toEqual([3, 4]); + + const u8Stream = await resolveJobOutputStream( + handleFor({ 
data: new Uint8Array([5]) }), + repo, + "data" + ); + expect(await collect(u8Stream!)).toEqual([5]); + }); + + it("resolves undefined for a dangling ref", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = makeCacheRef({ $ref: "inmem://gone" }); + expect(await resolveJobOutputStream(handleFor({ data: ref }), repo, "data")).toBeUndefined(); + }); + + it("makeJobOutputStreamResolver closes over the backing", async () => { + const repo = new StreamingMemoryRepo({}); + const ref = await repo.saveOutputStream("T", { a: 5 }, gen(new Uint8Array([7, 7])), {}); + const resolver = makeJobOutputStreamResolver(repo); + const stream = await resolver({ file: ref }, "file"); + expect(await collect(stream!)).toEqual([7, 7]); + expect(await resolver({ file: "not-binary" }, "file")).toBeUndefined(); + }); +}); diff --git a/packages/test/src/test/task-graph/resolveOutput.test.ts b/packages/test/src/test/task-graph/resolveOutput.test.ts new file mode 100644 index 000000000..f9477462f --- /dev/null +++ b/packages/test/src/test/task-graph/resolveOutput.test.ts @@ -0,0 +1,195 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { CacheRef, CacheRefResolver } from "@workglow/task-graph"; +import { makeCacheRef, resolveOutput } from "@workglow/task-graph"; +import { describe, expect, it, vi } from "vitest"; + +const ref = (key: string, size?: number, mime?: string): CacheRef => + makeCacheRef({ $ref: key, size, mime }); + +const fakeResolver = + (table: Record): CacheRefResolver => + async (r) => + table[r.$ref]; + +describe("resolveOutput", () => { + it("returns primitives and non-ref objects unchanged", async () => { + const resolver = vi.fn(fakeResolver({})); + const input = { a: 1, b: "two", c: true, d: null }; + expect(await resolveOutput(input, resolver)).toEqual(input); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("does not walk JSON-Schema-shaped {$ref: string} objects (no brand)", async 
() => { + // Brand discrimination matters here: a JSON-Schema $ref embedded in + // metadata must NOT be passed to the cache resolver, since the cache + // backing would treat the JSON-Schema pointer as a cache key. Identity is + // preserved because the tree has no branded refs to resolve. + const resolver = vi.fn(); + const input = { schema: { $ref: "#/$defs/Foo" }, name: "ok" }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.schema).toBe(input.schema); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("resolves a top-level ref to bytes", async () => { + const blob = new Blob([new Uint8Array([1, 2, 3])]); + const table = { "cache://x": blob }; + const out = await resolveOutput(ref("cache://x") as unknown as Blob, fakeResolver(table)); + expect(out).toBe(blob); + }); + + it("resolves refs nested inside a plain object, leaving siblings alone", async () => { + const audio = new Blob([new Uint8Array([9, 9, 9])]); + const input = { + transcript: "hello", + audio: ref("cache://a", 3, "audio/wav") as unknown as Blob, + meta: { lang: "en" }, + }; + const out = await resolveOutput(input, fakeResolver({ "cache://a": audio })); + expect(out.transcript).toBe("hello"); + expect(out.audio).toBe(audio); + expect(out.meta).toEqual({ lang: "en" }); + }); + + it("resolves refs inside arrays", async () => { + const b1 = new Blob([new Uint8Array([1])]); + const b2 = new Blob([new Uint8Array([2])]); + const input = [ + ref("cache://1") as unknown as Blob, + "plain", + ref("cache://2") as unknown as Blob, + ]; + const out = await resolveOutput(input, fakeResolver({ "cache://1": b1, "cache://2": b2 })); + expect(out[0]).toBe(b1); + expect(out[1]).toBe("plain"); + expect(out[2]).toBe(b2); + }); + + it("treats Blob, ArrayBuffer, typed arrays, Date as opaque leaves (not walked)", async () => { + const blob = new Blob([new Uint8Array([1])]); + const ab = new ArrayBuffer(8); + const u8 = new Uint8Array([5, 6, 7]); + const date = new 
Date(2026, 0, 1); + const resolver = vi.fn(); + const input = { blob, ab, u8, date }; + const out = await resolveOutput(input, resolver); + expect(out.blob).toBe(blob); + expect(out.ab).toBe(ab); + expect(out.u8).toBe(u8); + expect(out.date).toBe(date); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("returns undefined for refs the resolver cannot resolve (best-effort)", async () => { + const input = { audio: ref("cache://missing") as unknown as Blob }; + const out = await resolveOutput(input, fakeResolver({})); + expect(out.audio).toBeUndefined(); + }); + + it("propagates resolver rejections (caller-controlled error policy)", async () => { + const failingResolver: CacheRefResolver = async () => { + throw new Error("backing down"); + }; + await expect( + resolveOutput({ x: ref("cache://k") as unknown as Blob }, failingResolver) + ).rejects.toThrow("backing down"); + }); + + it("resolves refs in deeply nested structures", async () => { + const b = new Blob([new Uint8Array([42])]); + const input = { + level1: { + level2: { + items: [{ payload: ref("cache://deep") as unknown as Blob }], + }, + }, + }; + const out = await resolveOutput(input, fakeResolver({ "cache://deep": b })); + expect(out.level1.level2.items[0].payload).toBe(b); + }); + + it("honors a concurrency bound: never exceeds the configured maximum in flight", async () => { + let inFlight = 0; + let observedMax = 0; + const resolver: CacheRefResolver = async (r) => { + inFlight++; + observedMax = Math.max(observedMax, inFlight); + await new Promise((res) => setTimeout(res, 5)); + inFlight--; + return new Blob([new Uint8Array([Number(r.$ref.slice(-1))])]); + }; + const refs = Array.from({ length: 8 }, (_, i) => ref(`cache://r${i}`)); + await resolveOutput(refs as unknown as Blob[], resolver, { concurrency: 2 }); + expect(observedMax).toBeLessThanOrEqual(2); + }); + + it("with concurrency undefined runs all resolutions in parallel", async () => { + let inFlight = 0; + let observedMax = 0; + const 
resolver: CacheRefResolver = async () => { + inFlight++; + observedMax = Math.max(observedMax, inFlight); + await new Promise((res) => setTimeout(res, 5)); + inFlight--; + return new Blob(); + }; + const refs = Array.from({ length: 6 }, (_, i) => ref(`cache://r${i}`)); + await resolveOutput(refs as unknown as Blob[], resolver); + expect(observedMax).toBe(6); + }); + + it("returns without overflow on a self-referential input (no refs)", async () => { + const resolver = vi.fn(); + const a: any = { name: "loop" }; + a.self = a; + const out: any = await resolveOutput(a, resolver); + expect(resolver).not.toHaveBeenCalled(); + // No refs reachable, so identity is preserved (including the cycle). + expect(out).toBe(a); + expect(out.self).toBe(out); + }); + + it("resolves refs reachable through a cycle without overflow", async () => { + const blob = new Blob([new Uint8Array([7])]); + const a: any = { payload: ref("cache://r1") as unknown as Blob }; + // Cycle pointing back to the root. + a.parent = a; + const out: any = await resolveOutput(a, fakeResolver({ "cache://r1": blob })); + expect(out.payload).toBe(blob); + // The cycle is preserved: `parent` resolves to the ORIGINAL input (the + // walker short-circuits a revisited object by reference rather than + // attempting to rewrite the back-edge). + expect(out.parent).toBe(a); + }); + + it("treats Error as an opaque leaf (own non-enumerable data preserved)", async () => { + const resolver = vi.fn(); + const err = new Error("boom"); + const input = { failure: err }; + const out = await resolveOutput(input, resolver); + // Identity preserved (no refs to resolve). 
+ expect(out).toBe(input); + expect(out.failure).toBe(err); + expect(out.failure.message).toBe("boom"); + expect(out.failure instanceof Error).toBe(true); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("treats URL as an opaque leaf (prototype accessors keep working)", async () => { + const resolver = vi.fn(); + const url = new URL("https://example.com/path?q=1"); + const input = { target: url }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.target).toBe(url); + expect(out.target.href).toBe("https://example.com/path?q=1"); + expect(out.target instanceof URL).toBe(true); + expect(resolver).not.toHaveBeenCalled(); + }); +}); diff --git a/packages/test/src/test/task-graph/resolveRef.opaque-types.test.ts b/packages/test/src/test/task-graph/resolveRef.opaque-types.test.ts new file mode 100644 index 000000000..5d7e09f47 --- /dev/null +++ b/packages/test/src/test/task-graph/resolveRef.opaque-types.test.ts @@ -0,0 +1,145 @@ +/** + * @license + * Copyright 2026 Steven Roussey + * SPDX-License-Identifier: Apache-2.0 + */ + +import type { CacheRef, CacheRefResolver } from "@workglow/task-graph"; +import { makeCacheRef, resolveOutput } from "@workglow/task-graph"; +import { describe, expect, it, vi } from "vitest"; + +const ref = (key: string): CacheRef => makeCacheRef({ $ref: key }); + +describe("resolveOutput opaque-type policy", () => { + it("treats Headers as opaque (preserves instance, never inspects keys)", async () => { + const resolver = vi.fn(); + const headers = new Headers({ "x-test": "1" }); + const input = { headers }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.headers).toBe(headers); + expect(out.headers.get("x-test")).toBe("1"); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("treats Request as opaque (Object.keys would drop everything)", async () => { + const resolver = vi.fn(); + const request = new Request("https://example.com/path", { method: "POST" 
}); + const input = { request }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.request).toBe(request); + expect(out.request.method).toBe("POST"); + expect(out.request instanceof Request).toBe(true); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("treats Response as opaque (body lives in a private slot)", async () => { + const resolver = vi.fn(); + const response = new Response("hello", { status: 201 }); + const input = { response }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.response).toBe(response); + expect(out.response.status).toBe(201); + expect(out.response instanceof Response).toBe(true); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("treats FormData as opaque (entries are not own enumerable properties)", async () => { + const resolver = vi.fn(); + const form = new FormData(); + form.set("k", "v"); + const input = { form }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.form).toBe(form); + expect(out.form.get("k")).toBe("v"); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("treats URLSearchParams as opaque", async () => { + const resolver = vi.fn(); + const params = new URLSearchParams("a=1&b=2"); + const input = { params }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.params).toBe(params); + expect(out.params.get("a")).toBe("1"); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("treats Error as opaque (preserves message/stack on prototype-resident slots)", async () => { + const resolver = vi.fn(); + const err = new Error("boom"); + const input = { err }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.err).toBe(err); + expect(out.err.message).toBe("boom"); + expect(out.err instanceof Error).toBe(true); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("treats URL as opaque", async () => { + 
const resolver = vi.fn(); + const url = new URL("https://example.com/path?q=1"); + const input = { url }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.url).toBe(url); + expect(out.url.href).toBe("https://example.com/path?q=1"); + expect(out.url instanceof URL).toBe(true); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("treats user-defined classes (including private fields) as opaque", async () => { + const resolver = vi.fn(); + class UserClass { + readonly #secret: number; + readonly label: string; + constructor(secret: number, label: string) { + this.#secret = secret; + this.label = label; + } + getSecret(): number { + return this.#secret; + } + } + const instance = new UserClass(42, "answer"); + const input = { instance }; + const out = await resolveOutput(input, resolver); + expect(out).toBe(input); + expect(out.instance).toBe(instance); + expect(out.instance.getSecret()).toBe(42); + expect(out.instance.label).toBe("answer"); + expect(out.instance instanceof UserClass).toBe(true); + expect(resolver).not.toHaveBeenCalled(); + }); + + it("treats null-prototype objects as plain (walked structurally)", async () => { + const blob = new Blob([new Uint8Array([7])]); + const resolver: CacheRefResolver = async () => blob; + const inner: Record = Object.create(null); + inner.payload = ref("cache://np"); + const out = await resolveOutput({ inner }, resolver); + expect(out.inner.payload).toBe(blob); + }); + + it("resolves a ref sibling of an opaque class instance without inspecting the instance", async () => { + const blob = new Blob([new Uint8Array([1])]); + const resolver = vi.fn(async (r) => + r.$ref === "cache://k" ? 
blob : undefined + ); + const headers = new Headers({ "x-not-touched": "1" }); + const input = { + headers, + payload: ref("cache://k"), + }; + const out = await resolveOutput(input, resolver); + expect(out.headers).toBe(headers); + expect(out.payload).toBe(blob); + // Only the branded ref triggered a resolver call; Headers was not walked. + expect(resolver).toHaveBeenCalledTimes(1); + }); +}); diff --git a/packages/util/src/json-schema/JsonSchema.ts b/packages/util/src/json-schema/JsonSchema.ts index 4b5a1d4c0..7d5d7774a 100644 --- a/packages/util/src/json-schema/JsonSchema.ts +++ b/packages/util/src/json-schema/JsonSchema.ts @@ -24,7 +24,7 @@ export type JsonSchemaCustomProps = { "x-ui"?: unknown; "x-ui-iteration"?: boolean; // marks property as iteration-injected (hidden from parent, read-only in subgraph) "x-auto-generated"?: boolean; // marks a primary key column as auto-generated by storage backend - "x-stream"?: "append" | "replace" | "object"; // streaming mode for this port (absent = none/non-streaming) + "x-stream"?: "append" | "replace" | "object" | "binary"; // streaming mode for this port (absent = none/non-streaming) "x-structured-output"?: boolean; // marks a port as requiring structured output from the AI provider };