From a3b070318b0db2efa70fde7673d0c259c93e277c Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 29 Apr 2026 10:24:36 -0600 Subject: [PATCH 01/21] implement canonical metrics, feature flagged to preserve legacy where legacy ones were released --- harness/main.ts | 9 +- harness/manifests/deployment.yaml | 2 + src/sdk/clients/worker/Poller.ts | 1 + src/sdk/clients/worker/TaskRunner.ts | 6 + .../clients/worker/events/EventDispatcher.ts | 13 + src/sdk/clients/worker/events/types.ts | 10 +- src/sdk/clients/worker/types.ts | 2 + src/sdk/clients/workflow/WorkflowExecutor.ts | 28 + .../helpers/fetchWithRetry.ts | 58 +- src/sdk/worker/core/TaskHandler.ts | 49 ++ src/sdk/worker/index.ts | 4 + .../metrics/CanonicalMetricsCollector.ts | 510 ++++++++++++++++++ .../metrics/CanonicalPrometheusRegistry.ts | 168 ++++++ ...Collector.ts => LegacyMetricsCollector.ts} | 121 ++--- .../metrics/MetricsCollectorInterface.ts | 48 ++ src/sdk/worker/metrics/MetricsServer.ts | 6 +- .../CanonicalMetricsCollector.test.ts | 321 +++++++++++ .../MetricsCollector.prometheus.test.ts | 12 +- .../__tests__/MetricsCollector.test.ts | 8 +- .../metrics/__tests__/MetricsServer.test.ts | 14 +- .../metrics/__tests__/metricsFactory.test.ts | 99 ++++ src/sdk/worker/metrics/accumulators.ts | 177 ++++++ src/sdk/worker/metrics/httpObserver.ts | 40 ++ src/sdk/worker/metrics/index.ts | 34 +- src/sdk/worker/metrics/metricsFactory.ts | 26 + 25 files changed, 1664 insertions(+), 102 deletions(-) create mode 100644 src/sdk/worker/metrics/CanonicalMetricsCollector.ts create mode 100644 src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts rename src/sdk/worker/metrics/{MetricsCollector.ts => LegacyMetricsCollector.ts} (85%) create mode 100644 src/sdk/worker/metrics/MetricsCollectorInterface.ts create mode 100644 src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts create mode 100644 src/sdk/worker/metrics/__tests__/metricsFactory.test.ts create mode 100644 src/sdk/worker/metrics/accumulators.ts create mode 100644 src/sdk/worker/metrics/httpObserver.ts create mode 100644 src/sdk/worker/metrics/metricsFactory.ts diff --git a/harness/main.ts b/harness/main.ts index 3240ce1a..b7aadeed 100644 --- a/harness/main.ts +++ b/harness/main.ts @@ -3,7 +3,7 @@ import { ConductorWorkflow, TaskHandler, simpleTask, - MetricsCollector, + createMetricsCollector, } from "../src/sdk"; import { MetadataResource } from "../src/open-api/generated"; import type { ConductorWorker } from "../src/sdk/clients/worker/types"; @@ -93,8 +93,11 @@ async function main(): Promise { }); const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991); - const metricsCollector = new MetricsCollector({ httpPort: metricsPort }); - console.log(`Prometheus metrics server started on port ${metricsPort}`); + const metricsCollector = createMetricsCollector({ httpPort: metricsPort }); + const variant = process.env.WORKER_CANONICAL_METRICS?.toLowerCase() === "true" + ? "canonical" + : "legacy"; + console.log(`Prometheus metrics server started on port ${metricsPort} (${variant} metrics)`); const handler = new TaskHandler({ client, diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml index 878d6728..95218587 100644 --- a/harness/manifests/deployment.yaml +++ b/harness/manifests/deployment.yaml @@ -40,6 +40,8 @@ spec: value: "20" - name: HARNESS_POLL_INTERVAL_MS value: "100" + - name: WORKER_CANONICAL_METRICS + value: "true" ports: - name: metrics containerPort: 9991 diff --git a/src/sdk/clients/worker/Poller.ts b/src/sdk/clients/worker/Poller.ts index a7fb1787..414552ac 100644 --- a/src/sdk/clients/worker/Poller.ts +++ b/src/sdk/clients/worker/Poller.ts @@ -189,6 +189,7 @@ export class Poller { this.logger.debug( `Worker ${this._pollerId} is paused, skipping poll` ); + this.options.onPaused?.(); await this.sleep( this.options.pollInterval ?? DEFAULT_POLL_INTERVAL ); diff --git a/src/sdk/clients/worker/TaskRunner.ts b/src/sdk/clients/worker/TaskRunner.ts index a836ad5b..cb637e48 100644 --- a/src/sdk/clients/worker/TaskRunner.ts +++ b/src/sdk/clients/worker/TaskRunner.ts @@ -96,6 +96,12 @@ export class TaskRunner { { concurrency: worker.concurrency ?? this.options.concurrency, pollInterval: worker.pollInterval ?? this.options.pollInterval, + onPaused: () => { + void this.eventDispatcher.publishTaskPaused({ + taskType: this.worker.taskDefName, + timestamp: new Date(), + }); + }, }, this.logger ); diff --git a/src/sdk/clients/worker/events/EventDispatcher.ts b/src/sdk/clients/worker/events/EventDispatcher.ts index e6e1557a..16a2cf3c 100644 --- a/src/sdk/clients/worker/events/EventDispatcher.ts +++ b/src/sdk/clients/worker/events/EventDispatcher.ts @@ -9,6 +9,7 @@ import type { TaskExecutionFailure, TaskUpdateCompleted, TaskUpdateFailure, + TaskPaused, } from "./types"; /** @@ -60,6 +61,11 @@ export interface TaskRunnerEventsListener { * This is a CRITICAL event that may require operational intervention. */ onTaskUpdateFailure?(event: TaskUpdateFailure): void | Promise; + + /** + * Called when a poll cycle is skipped because the worker is paused. + */ + onTaskPaused?(event: TaskPaused): void | Promise; } /** @@ -159,6 +165,13 @@ export class EventDispatcher { await this.publishEvent("onTaskUpdateFailure", event); } + /** + * Publish a TaskPaused event. + */ + async publishTaskPaused(event: TaskPaused): Promise { + await this.publishEvent("onTaskPaused", event); + } + /** * Internal method to publish events to all registered listeners. * Listener failures are caught and logged to prevent affecting task execution. diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts index 7f12b2bf..e252a25e 100644 --- a/src/sdk/clients/worker/events/types.ts +++ b/src/sdk/clients/worker/events/types.ts @@ -134,6 +134,13 @@ export interface TaskUpdateCompleted extends TaskRunnerEvent { durationMs: number; } +/** + * Event published when a poll cycle is skipped because the worker is paused. + */ +export interface TaskPaused extends TaskRunnerEvent { + // No additional fields — taskType is inherited from TaskRunnerEvent. +} + /** * Union type of all task runner events. */ @@ -145,4 +152,5 @@ export type TaskRunnerEventType = | TaskExecutionCompleted | TaskExecutionFailure | TaskUpdateCompleted - | TaskUpdateFailure; + | TaskUpdateFailure + | TaskPaused; diff --git a/src/sdk/clients/worker/types.ts b/src/sdk/clients/worker/types.ts index b56eed5c..3aa0db61 100644 --- a/src/sdk/clients/worker/types.ts +++ b/src/sdk/clients/worker/types.ts @@ -86,6 +86,8 @@ export interface PollerOptions { adaptiveBackoff?: boolean; /** Whether this poller is paused (default: false) */ paused?: boolean; + /** Callback invoked each time a poll cycle is skipped because the poller is paused */ + onPaused?: () => void; } /** diff --git a/src/sdk/clients/workflow/WorkflowExecutor.ts b/src/sdk/clients/workflow/WorkflowExecutor.ts index a34188e2..c527050e 100644 --- a/src/sdk/clients/workflow/WorkflowExecutor.ts +++ b/src/sdk/clients/workflow/WorkflowExecutor.ts @@ -30,6 +30,7 @@ import { enhanceSignalResponse } from "./helpers/enhanceSignalResponse"; import { reverseFind } from "./helpers/reverseFind"; import { isCompletedTaskMatchingType } from "./helpers/isCompletedTaskMatchingType"; import { RETRY_TIME_IN_MILLISECONDS } from "./constants"; +import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; export class WorkflowExecutor { public readonly _client: Client; @@ -71,6 +72,23 @@ export class WorkflowExecutor { public async startWorkflow( workflowRequest: StartWorkflowRequest ): Promise { + const observer = getHttpMetricsObserver(); + if (observer) { + try { + const inputBytes = workflowRequest.input + ? JSON.stringify(workflowRequest.input).length + : 0; + observer.recordWorkflowInputSize( + workflowRequest.name ?? "", + inputBytes, + workflowRequest.version != null + ? String(workflowRequest.version) + : undefined, + ); + } catch { + // Best-effort — don't let metrics break workflow start. + } + } try { const { data } = await WorkflowResource.startWorkflow({ body: workflowRequest, @@ -80,6 +98,16 @@ export class WorkflowExecutor { return data; } catch (error: unknown) { + if (observer) { + const excName = + error instanceof Error + ? error.name || error.constructor?.name || "Error" + : "Error"; + observer.recordWorkflowStartError( + workflowRequest.name ?? "", + excName, + ); + } handleSdkError(error, "Failed to start workflow"); } } diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts index 733313cd..1a49d509 100644 --- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts +++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts @@ -186,9 +186,61 @@ export const retryFetch = async ( export const wrapFetchWithRetry = ( fetchFn: typeof fetch, - options?: RetryFetchOptions + options?: RetryFetchOptions, ): typeof fetch => { - return (input: Input, init?: Init): Promise => { - return retryFetch(input, init, fetchFn, options); + return async (input: Input, init?: Init): Promise => { + const start = performance.now(); + let method = "GET"; + let uri = ""; + + try { + if (input instanceof Request) { + method = input.method; + uri = new URL(input.url).pathname; + } else if (typeof input === "string") { + method = init?.method ?? "GET"; + try { uri = new URL(input).pathname; } catch { uri = input; } + } else { + method = init?.method ?? "GET"; + try { uri = input.pathname; } catch { uri = String(input); } + } + } catch { + // Best-effort URI extraction + } + + try { + const response = await retryFetch(input, init, fetchFn, options); + const durationMs = performance.now() - start; + try { + const { getHttpMetricsObserver } = await import( + "../../worker/metrics/httpObserver.js" + ); + getHttpMetricsObserver()?.recordApiRequestTime( + method, + uri, + String(response.status), + durationMs, + ); + } catch { + // Metrics recording is best-effort + } + return response; + } catch (error) { + const durationMs = performance.now() - start; + try { + const { getHttpMetricsObserver } = await import( + "../../worker/metrics/httpObserver.js" + ); + getHttpMetricsObserver()?.recordApiRequestTime( + method, + uri, + "0", + durationMs, + ); + } catch { + // Metrics recording is best-effort + } + throw error; + } }; }; diff --git a/src/sdk/worker/core/TaskHandler.ts b/src/sdk/worker/core/TaskHandler.ts index 2c0dd083..b39a9b94 100644 --- a/src/sdk/worker/core/TaskHandler.ts +++ b/src/sdk/worker/core/TaskHandler.ts @@ -7,6 +7,7 @@ import { RESTART_BACKOFF_MAX_MS, } from "../../clients/worker/constants"; import type { TaskRunnerEventsListener } from "../../clients/worker/events"; +import type { MetricsCollectorInterface } from "../metrics/MetricsCollectorInterface"; import { TaskRunner } from "../../clients/worker/TaskRunner"; import type { ConductorWorker, HealthMonitorConfig } from "../../clients/worker/types"; import type { ConductorLogger } from "../../helpers/logger"; @@ -140,6 +141,10 @@ export class TaskHandler { private restartAttempts = new Map(); // runner index → attempt count private healthMonitorConfig: HealthMonitorConfig; + // Uncaught exception handlers (bound so we can remove them) + private _uncaughtHandler?: (err: Error) => void; + private _unhandledHandler?: (reason: unknown) => void; + /** * Create a TaskHandler instance with async module imports. * Use this instead of `new TaskHandler()` when using `importModules`. @@ -323,6 +328,7 @@ export class TaskHandler { this.isRunning = true; this.startHealthMonitor(); + this.wireUncaughtExceptionHandlers(); this.logger.info("All workers started successfully"); } @@ -341,6 +347,7 @@ export class TaskHandler { } this.stopHealthMonitor(); + this.unwireUncaughtExceptionHandlers(); this.logger.info(`Stopping ${this.taskRunners.length} worker(s)...`); const stopPromises = this.taskRunners.map(async (runner, index) => { @@ -528,6 +535,48 @@ export class TaskHandler { } } + // ── Uncaught Exception Handling ───────────────────────────────── + + private findMetricsCollector(): MetricsCollectorInterface | undefined { + const listeners = this.config.eventListeners ?? []; + return listeners.find( + (l): l is MetricsCollectorInterface => + typeof (l as MetricsCollectorInterface).recordUncaughtException === + "function", + ); + } + + private wireUncaughtExceptionHandlers(): void { + const collector = this.findMetricsCollector(); + if (!collector) return; + + this._uncaughtHandler = (err: Error) => { + const excName = err.name || err.constructor?.name || "Error"; + collector.recordUncaughtException(excName); + }; + this._unhandledHandler = (reason: unknown) => { + const excName = + reason instanceof Error + ? reason.name || reason.constructor?.name || "Error" + : "Error"; + collector.recordUncaughtException(excName); + }; + + process.on("uncaughtException", this._uncaughtHandler); + process.on("unhandledRejection", this._unhandledHandler); + } + + private unwireUncaughtExceptionHandlers(): void { + if (this._uncaughtHandler) { + process.removeListener("uncaughtException", this._uncaughtHandler); + this._uncaughtHandler = undefined; + } + if (this._unhandledHandler) { + process.removeListener("unhandledRejection", this._unhandledHandler); + this._unhandledHandler = undefined; + } + } + // ── Discovery Summary ─────────────────────────────────────────── /** diff --git a/src/sdk/worker/index.ts b/src/sdk/worker/index.ts index 35e1fed9..7605d589 100644 --- a/src/sdk/worker/index.ts +++ b/src/sdk/worker/index.ts @@ -17,7 +17,11 @@ export { TaskContext, getTaskContext } from "./context"; // Metrics export { MetricsCollector, + LegacyMetricsCollector, + CanonicalMetricsCollector, + createMetricsCollector, MetricsServer, + type MetricsCollectorInterface, type MetricsCollectorConfig, type WorkerMetrics, } from "./metrics"; diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts new file mode 100644 index 00000000..d12b8e7a --- /dev/null +++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts @@ -0,0 +1,510 @@ +import type { + PollStarted, + PollCompleted, + PollFailure, + TaskExecutionStarted, + TaskExecutionCompleted, + TaskExecutionFailure, + TaskUpdateCompleted, + TaskUpdateFailure, + TaskPaused, +} from "../../clients/worker/events"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; +import type { MetricsCollectorConfig } from "./LegacyMetricsCollector"; +import { setHttpMetricsObserver } from "./httpObserver"; +import { + HistogramAccumulator, + MultiLabelCounter, + GaugeMetric, + TIME_BUCKETS, + SIZE_BUCKETS, + exceptionLabel, +} from "./accumulators"; + +interface CanonicalMetricState { + // Counters (12) + taskPollTotal: MultiLabelCounter; + taskExecutionStartedTotal: MultiLabelCounter; + taskPollErrorTotal: MultiLabelCounter; + taskExecuteErrorTotal: MultiLabelCounter; + taskUpdateErrorTotal: MultiLabelCounter; + taskAckErrorTotal: MultiLabelCounter; + taskAckFailedTotal: MultiLabelCounter; + taskExecutionQueueFullTotal: MultiLabelCounter; + taskPausedTotal: MultiLabelCounter; + threadUncaughtExceptionsTotal: MultiLabelCounter; + externalPayloadUsedTotal: MultiLabelCounter; + workflowStartErrorTotal: MultiLabelCounter; + + // Time histograms (4) — seconds + taskPollTimeSeconds: HistogramAccumulator; + taskExecuteTimeSeconds: HistogramAccumulator; + taskUpdateTimeSeconds: HistogramAccumulator; + httpApiClientRequestSeconds: HistogramAccumulator; + + // Size histograms (2) — bytes + taskResultSizeBytes: HistogramAccumulator; + workflowInputSizeBytes: HistogramAccumulator; + + // Gauge (1) + activeWorkers: GaugeMetric; +} + +/** + * Canonical metrics collector. + * + * Emits unprefixed Prometheus metrics with camelCase labels, second-based + * time units, and Histogram type for distributions, per the cross-SDK + * canonical metric catalog. + * + * Selected when WORKER_CANONICAL_METRICS=true. + */ +export class CanonicalMetricsCollector implements MetricsCollectorInterface { + private state: CanonicalMetricState; + private _server?: import("./MetricsServer.js").MetricsServer; + private _fileTimer?: ReturnType; + private _promRegistry?: import("./CanonicalPrometheusRegistry.js").CanonicalPrometheusRegistry; + private readonly _usePromClient: boolean; + + constructor(config?: MetricsCollectorConfig) { + this.state = this.createEmpty(); + this._usePromClient = config?.usePromClient ?? false; + if (this._usePromClient) { + void this.initPromClient(); + } + if (config?.httpPort) { + void this.startServer(config.httpPort); + } + if (config?.filePath) { + this.startFileWriter( + config.filePath, + config.fileWriteIntervalMs ?? 5000, + ); + } + setHttpMetricsObserver(this); + } + + private async initPromClient(): Promise { + const { CanonicalPrometheusRegistry } = await import( + "./CanonicalPrometheusRegistry.js" + ); + this._promRegistry = new CanonicalPrometheusRegistry(); + await this._promRegistry.initialize(); + } + + private async startServer(port: number): Promise { + const { MetricsServer } = await import("./MetricsServer.js"); + this._server = new MetricsServer(this, port); + await this._server.start(); + } + + private startFileWriter(filePath: string, intervalMs: number): void { + const doWrite = async () => { + try { + const { writeFile } = await import("node:fs/promises"); + await writeFile(filePath, this.toPrometheusText(), "utf-8"); + } catch { + // Silently ignore file write errors + } + }; + void doWrite(); + this._fileTimer = setInterval(doWrite, intervalMs); + if (typeof this._fileTimer === "object" && "unref" in this._fileTimer) { + this._fileTimer.unref(); + } + } + + private createEmpty(): CanonicalMetricState { + return { + taskPollTotal: new MultiLabelCounter(), + taskExecutionStartedTotal: new MultiLabelCounter(), + taskPollErrorTotal: new MultiLabelCounter(), + taskExecuteErrorTotal: new MultiLabelCounter(), + taskUpdateErrorTotal: new MultiLabelCounter(), + taskAckErrorTotal: new MultiLabelCounter(), + taskAckFailedTotal: new MultiLabelCounter(), + taskExecutionQueueFullTotal: new MultiLabelCounter(), + taskPausedTotal: new MultiLabelCounter(), + threadUncaughtExceptionsTotal: new MultiLabelCounter(), + externalPayloadUsedTotal: new MultiLabelCounter(), + workflowStartErrorTotal: new MultiLabelCounter(), + + taskPollTimeSeconds: new HistogramAccumulator(TIME_BUCKETS), + taskExecuteTimeSeconds: new HistogramAccumulator(TIME_BUCKETS), + taskUpdateTimeSeconds: new HistogramAccumulator(TIME_BUCKETS), + httpApiClientRequestSeconds: new HistogramAccumulator(TIME_BUCKETS), + + taskResultSizeBytes: new HistogramAccumulator(SIZE_BUCKETS), + workflowInputSizeBytes: new HistogramAccumulator(SIZE_BUCKETS), + + activeWorkers: new GaugeMetric(), + }; + } + + // ── Event Listener Methods ────────────────────────────────────── + + onPollStarted(event: PollStarted): void { + this.state.taskPollTotal.increment({ taskType: event.taskType }); + this._promRegistry?.incrementCounter("task_poll_total", { + taskType: event.taskType, + }); + } + + onPollCompleted(event: PollCompleted): void { + const seconds = event.durationMs / 1000; + this.state.taskPollTimeSeconds.observe( + { taskType: event.taskType, status: "SUCCESS" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_poll_time_seconds", { + taskType: event.taskType, + status: "SUCCESS", + }, seconds); + } + + onPollFailure(event: PollFailure): void { + const excName = exceptionLabel(event.cause); + this.state.taskPollErrorTotal.increment({ + taskType: event.taskType, + exception: excName, + }); + this._promRegistry?.incrementCounter("task_poll_error_total", { + taskType: event.taskType, + exception: excName, + }); + + const seconds = event.durationMs / 1000; + this.state.taskPollTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_poll_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); + } + + onTaskExecutionStarted(event: TaskExecutionStarted): void { + this.state.taskExecutionStartedTotal.increment({ + taskType: event.taskType, + }); + this._promRegistry?.incrementCounter("task_execution_started_total", { + taskType: event.taskType, + }); + this.state.activeWorkers.inc({ taskType: event.taskType }); + this._promRegistry?.setGauge( + "active_workers", + { taskType: event.taskType }, + this.state.activeWorkers.getValue({ taskType: event.taskType }), + ); + } + + onTaskExecutionCompleted(event: TaskExecutionCompleted): void { + const seconds = event.durationMs / 1000; + this.state.taskExecuteTimeSeconds.observe( + { taskType: event.taskType, status: "SUCCESS" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_execute_time_seconds", { + taskType: event.taskType, + status: "SUCCESS", + }, seconds); + + if (event.outputSizeBytes !== undefined) { + this.state.taskResultSizeBytes.observe( + { taskType: event.taskType }, + event.outputSizeBytes, + ); + this._promRegistry?.observeHistogram("task_result_size_bytes", { + taskType: event.taskType, + }, event.outputSizeBytes); + } + + this.state.activeWorkers.dec({ taskType: event.taskType }); + this._promRegistry?.setGauge( + "active_workers", + { taskType: event.taskType }, + Math.max(0, this.state.activeWorkers.getValue({ taskType: event.taskType })), + ); + } + + onTaskExecutionFailure(event: TaskExecutionFailure): void { + const excName = exceptionLabel(event.cause); + this.state.taskExecuteErrorTotal.increment({ + taskType: event.taskType, + exception: excName, + }); + this._promRegistry?.incrementCounter("task_execute_error_total", { + taskType: event.taskType, + exception: excName, + }); + + const seconds = event.durationMs / 1000; + this.state.taskExecuteTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_execute_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); + + this.state.activeWorkers.dec({ taskType: event.taskType }); + this._promRegistry?.setGauge( + "active_workers", + { taskType: event.taskType }, + Math.max(0, this.state.activeWorkers.getValue({ taskType: event.taskType })), + ); + } + + onTaskUpdateCompleted(event: TaskUpdateCompleted): void { + const seconds = event.durationMs / 1000; + this.state.taskUpdateTimeSeconds.observe( + { taskType: event.taskType, status: "SUCCESS" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_update_time_seconds", { + taskType: event.taskType, + status: "SUCCESS", + }, seconds); + } + + onTaskUpdateFailure(event: TaskUpdateFailure): void { + const excName = exceptionLabel(event.cause); + this.state.taskUpdateErrorTotal.increment({ + taskType: event.taskType, + exception: excName, + }); + this._promRegistry?.incrementCounter("task_update_error_total", { + taskType: event.taskType, + exception: excName, + }); + } + + onTaskPaused(event: TaskPaused): void { + this.recordTaskPaused(event.taskType); + } + + // ── Direct Recording Methods ─────────────────────────────────── + + recordTaskExecutionQueueFull(taskType: string): void { + this.state.taskExecutionQueueFullTotal.increment({ taskType }); + this._promRegistry?.incrementCounter("task_execution_queue_full_total", { + taskType, + }); + } + + recordUncaughtException(exception?: string): void { + const excName = exception ?? "Error"; + this.state.threadUncaughtExceptionsTotal.increment({ + exception: excName, + }); + this._promRegistry?.incrementCounter( + "thread_uncaught_exceptions_total", + { exception: excName }, + ); + } + + recordWorkerRestart(): void { + // Noop: worker_restart_total is N/A for the JS SDK (single-process model) + } + + recordTaskPaused(taskType: string): void { + this.state.taskPausedTotal.increment({ taskType }); + this._promRegistry?.incrementCounter("task_paused_total", { taskType }); + } + + recordTaskAckError(taskType: string, exception?: string): void { + const excName = exception ?? "Error"; + this.state.taskAckErrorTotal.increment({ taskType, exception: excName }); + this._promRegistry?.incrementCounter("task_ack_error_total", { + taskType, + exception: excName, + }); + } + + recordTaskAckFailed(taskType: string): void { + this.state.taskAckFailedTotal.increment({ taskType }); + this._promRegistry?.incrementCounter("task_ack_failed_total", { + taskType, + }); + } + + recordWorkflowStartError(workflowType?: string, exception?: string): void { + const wfType = workflowType ?? ""; + const excName = exception ?? "Error"; + this.state.workflowStartErrorTotal.increment({ + workflowType: wfType, + exception: excName, + }); + this._promRegistry?.incrementCounter("workflow_start_error_total", { + workflowType: wfType, + exception: excName, + }); + } + + recordExternalPayloadUsed( + payloadType: string, + entityName?: string, + operation?: string, + ): void { + this.state.externalPayloadUsedTotal.increment({ + entityName: entityName ?? "", + operation: operation ?? "", + payloadType, + }); + this._promRegistry?.incrementCounter("external_payload_used_total", { + entityName: entityName ?? "", + operation: operation ?? "", + payloadType, + }); + } + + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + version?: string, + ): void { + this.state.workflowInputSizeBytes.observe( + { workflowType, version: version ?? "" }, + sizeBytes, + ); + this._promRegistry?.observeHistogram("workflow_input_size_bytes", { + workflowType, + version: version ?? "", + }, sizeBytes); + } + + recordApiRequestTime( + method: string, + uri: string, + status: number | string, + durationMs: number, + ): void { + const statusStr = String(status); + const seconds = durationMs / 1000; + this.state.httpApiClientRequestSeconds.observe( + { method, uri, status: statusStr }, + seconds, + ); + this._promRegistry?.observeHistogram("http_api_client_request_seconds", { + method, + uri, + status: statusStr, + }, seconds); + } + + // ── Public API ────────────────────────────────────────────────── + + getMetrics(): Readonly { + return this.state; + } + + reset(): void { + this.state = this.createEmpty(); + } + + async stop(): Promise { + setHttpMetricsObserver(undefined); + if (this._fileTimer) { + clearInterval(this._fileTimer); + this._fileTimer = undefined; + } + if (this._server) { + await this._server.stop(); + this._server = undefined; + } + } + + getContentType(): string { + return ( + this._promRegistry?.contentType ?? + "text/plain; version=0.0.4; charset=utf-8" + ); + } + + async toPrometheusTextAsync(): Promise { + if (this._promRegistry?.available) { + return this._promRegistry.metrics(); + } + return this.toPrometheusText(); + } + + toPrometheusText(_prefix?: string): string { + const lines: string[] = []; + + // ── Counters ── + const counterDefs: { + name: string; + help: string; + counter: MultiLabelCounter; + }[] = [ + { name: "task_poll_total", help: "Total number of task polls", counter: this.state.taskPollTotal }, + { name: "task_execution_started_total", help: "Total number of task executions started", counter: this.state.taskExecutionStartedTotal }, + { name: "task_poll_error_total", help: "Total number of task poll errors", counter: this.state.taskPollErrorTotal }, + { name: "task_execute_error_total", help: "Total number of task execution errors", counter: this.state.taskExecuteErrorTotal }, + { name: "task_update_error_total", help: "Total number of task update errors", counter: this.state.taskUpdateErrorTotal }, + { name: "task_ack_error_total", help: "Total number of task ack errors", counter: this.state.taskAckErrorTotal }, + { name: "task_ack_failed_total", help: "Total number of task ack failures (server declined)", counter: this.state.taskAckFailedTotal }, + { name: "task_execution_queue_full_total", help: "Total number of task execution queue full events", counter: this.state.taskExecutionQueueFullTotal }, + { name: "task_paused_total", help: "Total number of task paused events", counter: this.state.taskPausedTotal }, + { name: "thread_uncaught_exceptions_total", help: "Total uncaught exceptions", counter: this.state.threadUncaughtExceptionsTotal }, + { name: "external_payload_used_total", help: "Total external payload usage", counter: this.state.externalPayloadUsedTotal }, + { name: "workflow_start_error_total", help: "Total workflow start errors", counter: this.state.workflowStartErrorTotal }, + ]; + + for (const def of counterDefs) { + const rendered = def.counter.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + // ── Time histograms ── + const timeHistogramDefs: { + name: string; + help: string; + histogram: HistogramAccumulator; + }[] = [ + { name: "task_poll_time_seconds", help: "Task poll duration in seconds", histogram: this.state.taskPollTimeSeconds }, + { name: "task_execute_time_seconds", help: "Task execution duration in seconds", histogram: this.state.taskExecuteTimeSeconds }, + { name: "task_update_time_seconds", help: "Task update duration in seconds", histogram: this.state.taskUpdateTimeSeconds }, + { name: "http_api_client_request_seconds", help: "HTTP API client request duration in seconds", histogram: this.state.httpApiClientRequestSeconds }, + ]; + + for (const def of timeHistogramDefs) { + const rendered = def.histogram.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + // ── Size histograms ── + const sizeHistogramDefs: { + name: string; + help: string; + histogram: HistogramAccumulator; + }[] = [ + { name: "task_result_size_bytes", help: "Task result output size in bytes", histogram: this.state.taskResultSizeBytes }, + { name: "workflow_input_size_bytes", help: "Workflow input payload size in bytes", histogram: this.state.workflowInputSizeBytes }, + ]; + + for (const def of sizeHistogramDefs) { + const rendered = def.histogram.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + // ── Gauges ── + const gaugeDefs: { + name: string; + help: string; + gauge: GaugeMetric; + }[] = [ + { name: "active_workers", help: "Number of workers actively executing tasks", gauge: this.state.activeWorkers }, + ]; + + for (const def of gaugeDefs) { + const rendered = def.gauge.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + lines.push(""); // trailing newline + return lines.join("\n"); + } +} diff --git a/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts b/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts new file mode 100644 index 00000000..c7ced41d --- /dev/null +++ b/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts @@ -0,0 +1,168 @@ +/** + * Optional prom-client adapter for canonical metrics. + * + * Registers Prometheus Counter, Histogram, and Gauge objects using the + * canonical metric names (unprefixed, camelCase labels, seconds/bytes units). + */ + +import { TIME_BUCKETS, SIZE_BUCKETS } from "./accumulators"; + +interface PromCounter { + inc(labels: Record, value?: number): void; +} +interface PromHistogram { + observe(labels: Record, value: number): void; +} +interface PromGauge { + set(labels: Record, value: number): void; +} +interface PromRegistry { + metrics(): Promise; + contentType: string; +} + +export class CanonicalPrometheusRegistry { + private _counters = new Map(); + private _histograms = new Map(); + private _gauges = new Map(); + private _registry?: PromRegistry; + private _available = false; + + async initialize(): Promise { + try { + const promClient = await import("prom-client"); + this._registry = promClient.register; + this.createMetrics(promClient); + this._available = true; + return true; + } catch { + this._available = false; + return false; + } + } + + get available(): boolean { + return this._available; + } + + get contentType(): string { + return ( + this._registry?.contentType ?? + "text/plain; version=0.0.4; charset=utf-8" + ); + } + + async metrics(): Promise { + if (!this._registry) return ""; + return this._registry.metrics(); + } + + incrementCounter( + name: string, + labels: Record, + value = 1, + ): void { + this._counters.get(name)?.inc(labels, value); + } + + observeHistogram( + name: string, + labels: Record, + value: number, + ): void { + this._histograms.get(name)?.observe(labels, value); + } + + setGauge( + name: string, + labels: Record, + value: number, + ): void { + this._gauges.get(name)?.set(labels, value); + } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + private createMetrics(promClient: any): void { + const Counter = promClient.Counter; + const Histogram = promClient.Histogram; + const Gauge = promClient.Gauge; + + const counterDefs: { + name: string; + help: string; + labels: string[]; + }[] = [ + { name: "task_poll_total", help: "Total task polls", labels: ["taskType"] }, + { name: "task_execution_started_total", help: "Total task executions started", labels: ["taskType"] }, + { name: "task_poll_error_total", help: "Total task poll errors", labels: ["taskType", "exception"] }, + { name: "task_execute_error_total", help: "Total task execution errors", labels: ["taskType", "exception"] }, + { name: "task_update_error_total", help: "Total task update errors", labels: ["taskType", "exception"] }, + { name: "task_ack_error_total", help: "Total task ack errors", labels: ["taskType", "exception"] }, + { name: "task_ack_failed_total", help: "Total task ack failures", labels: ["taskType"] }, + { name: "task_execution_queue_full_total", help: "Task execution queue full", labels: ["taskType"] }, + { name: "task_paused_total", help: "Task paused events", labels: ["taskType"] }, + { name: "thread_uncaught_exceptions_total", help: "Uncaught exceptions", labels: ["exception"] }, + { name: "external_payload_used_total", help: "External payload used", labels: ["entityName", "operation", "payloadType"] }, + { name: "workflow_start_error_total", help: "Workflow start errors", labels: ["workflowType", "exception"] }, + ]; + + for (const def of counterDefs) { + this._counters.set( + def.name, + new Counter({ + name: def.name, + help: def.help, + labelNames: def.labels, + }), + ); + } + + const timeBuckets = [...TIME_BUCKETS]; + const sizeBuckets = [...SIZE_BUCKETS]; + + const histogramDefs: { + name: string; + help: string; + labels: string[]; + buckets: number[]; + }[] = [ + { name: "task_poll_time_seconds", help: "Task poll duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets }, + { name: "task_execute_time_seconds", help: "Task execution duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets }, + { name: "task_update_time_seconds", help: "Task update duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets }, + { name: "http_api_client_request_seconds", help: "HTTP API client request duration (seconds)", labels: ["method", "uri", "status"], buckets: timeBuckets }, + { name: "task_result_size_bytes", help: "Task result size (bytes)", labels: ["taskType"], buckets: sizeBuckets }, + { name: "workflow_input_size_bytes", help: "Workflow input size (bytes)", labels: ["workflowType", "version"], buckets: sizeBuckets }, + ]; + + for (const def of histogramDefs) { + this._histograms.set( + def.name, + new Histogram({ + name: def.name, + help: def.help, + labelNames: def.labels, + buckets: def.buckets, + }), + ); + } + + const gaugeDefs: { + name: string; + help: string; + labels: string[]; + }[] = [ + { name: "active_workers", help: "Workers actively executing tasks", labels: ["taskType"] }, + ]; + + for (const def of gaugeDefs) { + this._gauges.set( + def.name, + new Gauge({ + name: def.name, + help: def.help, + labelNames: def.labels, + }), + ); + } + } +} diff --git a/src/sdk/worker/metrics/MetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts similarity index 85% rename from src/sdk/worker/metrics/MetricsCollector.ts rename to src/sdk/worker/metrics/LegacyMetricsCollector.ts index ae3fe542..cbf1b3ac 100644 --- a/src/sdk/worker/metrics/MetricsCollector.ts +++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts @@ -1,5 +1,4 @@ import type { - TaskRunnerEventsListener, PollStarted, PollCompleted, PollFailure, @@ -9,6 +8,8 @@ import type { TaskUpdateCompleted, TaskUpdateFailure, } from "../../clients/worker/events"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; +import { setHttpMetricsObserver } from "./httpObserver"; /** * Configuration for MetricsCollector. @@ -35,7 +36,7 @@ export interface MetricsCollectorConfig { } /** - * Collected worker metrics. + * Collected worker metrics (legacy shape). */ export interface WorkerMetrics { /** Total polls by taskType */ @@ -78,9 +79,6 @@ export interface WorkerMetrics { const QUANTILES = [0.5, 0.75, 0.9, 0.95, 0.99] as const; -/** - * Calculate quantiles from sorted array using linear interpolation. - */ function computeQuantile(sorted: number[], q: number): number { if (sorted.length === 0) return 0; if (sorted.length === 1) return sorted[0]; @@ -92,26 +90,13 @@ function computeQuantile(sorted: number[], q: number): number { } /** - * Built-in metrics collector implementing TaskRunnerEventsListener. - * - * Collects 19 metric types matching the Python SDK's MetricsCollector, - * with sliding-window quantile support (p50, p75, p90, p95, p99). - * - * @example - * ```typescript - * const metrics = new MetricsCollector({ httpPort: 9090 }); - * - * const handler = new TaskHandler({ - * client, - * eventListeners: [metrics], - * }); + * Legacy metrics collector. * - * await handler.startWorkers(); - * // GET http://localhost:9090/metrics — Prometheus format - * // GET http://localhost:9090/health — {"status":"UP"} - * ``` + * Emits prefixed Prometheus metrics with `task_type` labels, millisecond + * time units, and Summary type for distributions. This is the default + * implementation during the deprecation transition period. */ -export class MetricsCollector implements TaskRunnerEventsListener { +export class LegacyMetricsCollector implements MetricsCollectorInterface { private metrics: WorkerMetrics; private readonly _prefix: string; private readonly _slidingWindowSize: number; @@ -132,9 +117,10 @@ export class MetricsCollector implements TaskRunnerEventsListener { if (config?.filePath) { this.startFileWriter( config.filePath, - config.fileWriteIntervalMs ?? 5000 + config.fileWriteIntervalMs ?? 5000, ); } + setHttpMetricsObserver(this); } private async initPromClient(): Promise { @@ -158,10 +144,8 @@ export class MetricsCollector implements TaskRunnerEventsListener { // Silently ignore file write errors } }; - // Immediate first write, then periodic void doWrite(); this._fileTimer = setInterval(doWrite, intervalMs); - // Unref so the timer doesn't prevent process exit if (typeof this._fileTimer === "object" && "unref" in this._fileTimer) { this._fileTimer.unref(); } @@ -197,7 +181,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { private observe( map: Map, key: string, - value: number + value: number, ): void { let arr = map.get(key); if (!arr) { @@ -205,7 +189,6 @@ export class MetricsCollector implements TaskRunnerEventsListener { map.set(key, arr); } arr.push(value); - // Sliding window: keep only the last N observations if (arr.length > this._slidingWindowSize) { arr.splice(0, arr.length - this._slidingWindowSize); } @@ -215,7 +198,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { map: Map, key: string, promKey: string, - labelName: string + labelName: string, ): void { this.increment(map, key); this._promRegistry?.incrementCounter(promKey, { [labelName]: key }); @@ -226,7 +209,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { key: string, value: number, promKey: string, - labelName: string + labelName: string, ): void { this.observe(map, key, value); this._promRegistry?.observeSummary(promKey, { [labelName]: key }, value); @@ -248,7 +231,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { } onTaskExecutionStarted(_event: TaskExecutionStarted): void { - // Counted on completion + // Legacy counts on completion, not start } onTaskExecutionCompleted(event: TaskExecutionCompleted): void { @@ -273,57 +256,65 @@ export class MetricsCollector implements TaskRunnerEventsListener { this.incrementCounter(this.metrics.taskUpdateFailureTotal, event.taskType, "update_error_total", "task_type"); } - // ── Direct Recording Methods (for code outside event system) ─── + // Canonical-only event — noop in legacy + onTaskPaused(): void { + // Noop: TaskPaused events are handled via recordTaskPaused() + } + + // ── Direct Recording Methods ─────────────────────────────────── - /** Record a task execution queue full event */ recordTaskExecutionQueueFull(taskType: string): void { this.incrementCounter(this.metrics.taskExecutionQueueFullTotal, taskType, "queue_full_total", "task_type"); } - /** Record an uncaught exception */ - recordUncaughtException(): void { + recordUncaughtException(_exception?: string): void { this.metrics.uncaughtExceptionTotal++; this._promRegistry?.incrementCounter("uncaught_total", {}); } - /** Record a worker restart */ recordWorkerRestart(): void { this.metrics.workerRestartTotal++; this._promRegistry?.incrementCounter("restart_total", {}); } - /** Record a task paused event */ recordTaskPaused(taskType: string): void { this.incrementCounter(this.metrics.taskPausedTotal, taskType, "paused_total", "task_type"); } - /** Record a task ack error */ - recordTaskAckError(taskType: string): void { + recordTaskAckError(taskType: string, _exception?: string): void { this.incrementCounter(this.metrics.taskAckErrorTotal, taskType, "ack_error_total", "task_type"); } - /** Record a workflow start error */ - recordWorkflowStartError(): void { + recordTaskAckFailed(_taskType: string): void { + // Noop: canonical-only metric (server declined ack) + } + + recordWorkflowStartError(_workflowType?: string, _exception?: string): void { this.metrics.workflowStartErrorTotal++; this._promRegistry?.incrementCounter("wf_start_error_total", {}); } - /** Record external payload usage */ - recordExternalPayloadUsed(payloadType: string): void { + recordExternalPayloadUsed( + payloadType: string, + _entityName?: string, + _operation?: string, + ): void { this.incrementCounter(this.metrics.externalPayloadUsedTotal, payloadType, "external_payload_total", "payload_type"); } - /** Record workflow input size */ - recordWorkflowInputSize(workflowType: string, sizeBytes: number): void { + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + _version?: string, + ): void { this.observeSummary(this.metrics.workflowInputSizeBytes, workflowType, sizeBytes, "wf_input_size", "workflow_type"); } - /** Record API request duration */ recordApiRequestTime( method: string, uri: string, - status: number, - durationMs: number + status: number | string, + durationMs: number, ): void { const key = `${method}:${uri}:${status}`; this.observeSummary(this.metrics.apiRequestDurationMs, key, durationMs, "api_request", "endpoint"); @@ -331,18 +322,16 @@ export class MetricsCollector implements TaskRunnerEventsListener { // ── Public API ────────────────────────────────────────────────── - /** Get a snapshot of all collected metrics */ getMetrics(): Readonly { return this.metrics; } - /** Reset all collected metrics */ reset(): void { this.metrics = this.createEmptyMetrics(); } - /** Stop the auto-started metrics HTTP server and file writer (if any) */ async stop(): Promise { + setHttpMetricsObserver(undefined); if (this._fileTimer) { clearInterval(this._fileTimer); this._fileTimer = undefined; @@ -353,27 +342,13 @@ export class MetricsCollector implements TaskRunnerEventsListener { } } - /** - * Get the content type for the Prometheus metrics endpoint. - * Returns prom-client's content type when available, otherwise standard Prometheus text format. - */ getContentType(): string { - return this._promRegistry?.contentType ?? "text/plain; version=0.0.4; charset=utf-8"; + return ( + this._promRegistry?.contentType ?? + "text/plain; version=0.0.4; charset=utf-8" + ); } - /** - * Render all collected metrics in Prometheus exposition format. - * If prom-client is available and `usePromClient: true`, delegates to prom-client's registry. - * Otherwise uses built-in rendering with p50/p75/p90/p95/p99 quantiles. - * - * @param prefix - Metric name prefix (defaults to constructor config or "conductor_worker") - * @returns Prometheus text format string - */ - /** - * Async version of toPrometheusText. - * When prom-client is available, returns its native registry output. - * Otherwise falls back to the built-in text format. - */ async toPrometheusTextAsync(): Promise { if (this._promRegistry?.available) { return this._promRegistry.metrics(); @@ -454,7 +429,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { lines.push(`# TYPE ${counter.name} counter`); for (const [label, value] of counter.data) { lines.push( - `${counter.name}{${counter.labelName}="${label}"} ${value}` + `${counter.name}{${counter.labelName}="${label}"} ${value}`, ); } } @@ -545,14 +520,14 @@ export class MetricsCollector implements TaskRunnerEventsListener { for (const q of QUANTILES) { const val = computeQuantile(sorted, q); lines.push( - `${summary.name}{${summary.labelName}="${label}",quantile="${q}"} ${val}` + `${summary.name}{${summary.labelName}="${label}",quantile="${q}"} ${val}`, ); } lines.push( - `${summary.name}_count{${summary.labelName}="${label}"} ${count}` + `${summary.name}_count{${summary.labelName}="${label}"} ${count}`, ); lines.push( - `${summary.name}_sum{${summary.labelName}="${label}"} ${sum}` + `${summary.name}_sum{${summary.labelName}="${label}"} ${sum}`, ); } } diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts new file mode 100644 index 00000000..78d0c0b0 --- /dev/null +++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts @@ -0,0 +1,48 @@ +import type { + TaskRunnerEventsListener, +} from "../../clients/worker/events"; + +/** + * Unified metrics collector interface. + * + * Both LegacyMetricsCollector and CanonicalMetricsCollector implement this + * interface so call sites never need to know which variant is active. + * Methods that only apply to one variant are noops in the other. + */ +export interface MetricsCollectorInterface extends TaskRunnerEventsListener { + // ── Direct recording methods (superset signatures) ───────────── + + recordTaskExecutionQueueFull(taskType: string): void; + recordUncaughtException(exception?: string): void; + recordWorkerRestart(): void; + recordTaskPaused(taskType: string): void; + recordTaskAckError(taskType: string, exception?: string): void; + /** Canonical-only: server declined ack (no exception). Legacy noops. */ + recordTaskAckFailed(taskType: string): void; + recordWorkflowStartError(workflowType?: string, exception?: string): void; + recordExternalPayloadUsed( + payloadType: string, + entityName?: string, + operation?: string, + ): void; + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + version?: string, + ): void; + recordApiRequestTime( + method: string, + uri: string, + status: number | string, + durationMs: number, + ): void; + + // ── Output / lifecycle ───────────────────────────────────────── + + getMetrics(): unknown; + reset(): void; + stop(): Promise; + getContentType(): string; + toPrometheusText(prefix?: string): string; + toPrometheusTextAsync(): Promise; +} diff --git a/src/sdk/worker/metrics/MetricsServer.ts b/src/sdk/worker/metrics/MetricsServer.ts index 62f1fbcb..d575901b 100644 --- a/src/sdk/worker/metrics/MetricsServer.ts +++ b/src/sdk/worker/metrics/MetricsServer.ts @@ -1,5 +1,5 @@ import { createServer, type Server, type IncomingMessage, type ServerResponse } from "node:http"; -import type { MetricsCollector } from "./MetricsCollector"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; /** * Lightweight HTTP server exposing Prometheus metrics and a health check endpoint. @@ -21,11 +21,11 @@ import type { MetricsCollector } from "./MetricsCollector"; * ``` */ export class MetricsServer { - private readonly _collector: MetricsCollector; + private readonly _collector: MetricsCollectorInterface; private readonly _port: number; private _server?: Server; - constructor(collector: MetricsCollector, port: number) { + constructor(collector: MetricsCollectorInterface, port: number) { this._collector = collector; this._port = port; } diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts new file mode 100644 index 00000000..6b616ce3 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts @@ -0,0 +1,321 @@ +import { describe, it, expect, beforeEach } from "@jest/globals"; +import { CanonicalMetricsCollector } from "../CanonicalMetricsCollector"; + +describe("CanonicalMetricsCollector", () => { + let collector: CanonicalMetricsCollector; + + beforeEach(() => { + collector = new CanonicalMetricsCollector(); + }); + + describe("poll metrics", () => { + it("should emit task_poll_total counter on onPollStarted", () => { + collector.onPollStarted({ + taskType: "task_a", + workerId: "w1", + pollCount: 5, + timestamp: new Date(), + }); + collector.onPollStarted({ + taskType: "task_a", + workerId: "w1", + pollCount: 5, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_poll_total counter"); + expect(text).toContain('task_poll_total{taskType="task_a"} 2'); + }); + + it("should emit task_poll_time_seconds histogram on onPollCompleted", () => { + collector.onPollCompleted({ + taskType: "task_a", + durationMs: 100, + tasksReceived: 3, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_poll_time_seconds histogram"); + expect(text).toContain('task_poll_time_seconds_bucket{taskType="task_a",status="SUCCESS",le="0.1"} 1'); + expect(text).toContain('task_poll_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.1'); + expect(text).toContain('task_poll_time_seconds_count{taskType="task_a",status="SUCCESS"} 1'); + }); + + it("should emit task_poll_error_total with exception label on onPollFailure", () => { + collector.onPollFailure({ + taskType: "task_a", + durationMs: 5000, + cause: new TypeError("timeout"), + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_poll_error_total{taskType="task_a",exception="TypeError"} 1'); + expect(text).toContain('task_poll_time_seconds_bucket{taskType="task_a",status="FAILURE"'); + }); + }); + + describe("execution metrics", () => { + it("should emit task_execution_started_total on onTaskExecutionStarted", () => { + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_execution_started_total{taskType="task_a"} 1'); + }); + + it("should track active_workers gauge", () => { + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + timestamp: new Date(), + }); + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t2", + workerId: "w1", + timestamp: new Date(), + }); + + let text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 2'); + + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 200, + timestamp: new Date(), + }); + + text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 1'); + }); + + it("should emit task_execute_time_seconds histogram on completion", () => { + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 500, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_execute_time_seconds histogram"); + expect(text).toContain('task_execute_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.5'); + }); + + it("should emit task_result_size_bytes histogram on completion", () => { + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 200, + outputSizeBytes: 5000, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_result_size_bytes histogram"); + expect(text).toContain('task_result_size_bytes_sum{taskType="task_a"} 5000'); + }); + + it("should emit task_execute_error_total with exception label on failure", () => { + const err = new TypeError("invalid input"); + collector.onTaskExecutionFailure({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + cause: err, + durationMs: 50, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_execute_error_total{taskType="task_a",exception="TypeError"} 1'); + expect(text).toContain('task_execute_time_seconds_bucket{taskType="task_a",status="FAILURE"'); + }); + }); + + describe("task update metrics", () => { + it("should emit task_update_time_seconds histogram on completion", () => { + collector.onTaskUpdateCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 25, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_update_time_seconds histogram"); + expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.025'); + }); + + it("should emit task_update_error_total with exception label on failure", () => { + collector.onTaskUpdateFailure({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + cause: new Error("server error"), + retryCount: 4, + taskResult: {}, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_update_error_total{taskType="task_a",exception="Error"} 1'); + }); + }); + + describe("direct recording methods", () => { + it("recordTaskExecutionQueueFull should emit counter", () => { + collector.recordTaskExecutionQueueFull("task_a"); + collector.recordTaskExecutionQueueFull("task_a"); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_execution_queue_full_total{taskType="task_a"} 2'); + }); + + it("recordUncaughtException should emit counter with exception label", () => { + collector.recordUncaughtException("RangeError"); + collector.recordUncaughtException("RangeError"); + collector.recordUncaughtException("TypeError"); + + const text = collector.toPrometheusText(); + expect(text).toContain('thread_uncaught_exceptions_total{exception="RangeError"} 2'); + expect(text).toContain('thread_uncaught_exceptions_total{exception="TypeError"} 1'); + }); + + it("recordWorkerRestart should be a noop (N/A for JS)", () => { + collector.recordWorkerRestart(); + const text = collector.toPrometheusText(); + expect(text).not.toContain("worker_restart"); + }); + + it("recordTaskPaused should emit counter", () => { + collector.recordTaskPaused("paused_task"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_paused_total{taskType="paused_task"} 1'); + }); + + it("recordTaskAckError should emit counter with exception label", () => { + collector.recordTaskAckError("task_a", "TimeoutError"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_ack_error_total{taskType="task_a",exception="TimeoutError"} 1'); + }); + + it("recordTaskAckFailed should emit counter", () => { + collector.recordTaskAckFailed("task_a"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_ack_failed_total{taskType="task_a"} 1'); + }); + + it("recordWorkflowStartError should emit counter with labels", () => { + collector.recordWorkflowStartError("my_workflow", "NetworkError"); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_start_error_total{workflowType="my_workflow",exception="NetworkError"} 1'); + }); + + it("recordExternalPayloadUsed should emit counter with labels", () => { + collector.recordExternalPayloadUsed("TASK_OUTPUT", "myEntity", "WRITE"); + const text = collector.toPrometheusText(); + expect(text).toContain('external_payload_used_total{entityName="myEntity",operation="WRITE",payloadType="TASK_OUTPUT"} 1'); + }); + + it("recordWorkflowInputSize should emit histogram", () => { + collector.recordWorkflowInputSize("order_flow", 50000, "1"); + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE workflow_input_size_bytes histogram"); + expect(text).toContain('workflow_input_size_bytes_sum{workflowType="order_flow",version="1"} 50000'); + }); + + it("recordApiRequestTime should emit histogram in seconds", () => { + collector.recordApiRequestTime("GET", "/api/workflow", 200, 45); + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE http_api_client_request_seconds histogram"); + expect(text).toContain('http_api_client_request_seconds_sum{method="GET",uri="/api/workflow",status="200"} 0.045'); + }); + }); + + describe("onTaskPaused event", () => { + it("should increment task_paused_total", () => { + collector.onTaskPaused({ + taskType: "paused_task", + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).toContain('task_paused_total{taskType="paused_task"} 1'); + }); + }); + + describe("reset", () => { + it("should clear all canonical metrics", () => { + collector.onPollStarted({ + taskType: "task_a", + workerId: "w1", + pollCount: 1, + timestamp: new Date(), + }); + collector.recordUncaughtException("Error"); + collector.recordTaskAckFailed("task_a"); + + collector.reset(); + + const text = collector.toPrometheusText(); + expect(text).toBe(""); + }); + }); + + describe("output format", () => { + it("should not apply a prefix to canonical metric names", () => { + collector.onPollStarted({ + taskType: "t", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).not.toContain("conductor_worker"); + expect(text).toContain("task_poll_total{"); + }); + + it("should use seconds for time histograms", () => { + collector.onPollCompleted({ + taskType: "t", + durationMs: 1000, + tasksReceived: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).toContain("task_poll_time_seconds"); + expect(text).toContain('_sum{taskType="t",status="SUCCESS"} 1'); + }); + + it("should use camelCase taskType label", () => { + collector.onPollStarted({ + taskType: "my_task", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).toContain("taskType="); + expect(text).not.toContain("task_type="); + }); + }); + + describe("stop", () => { + it("should not throw when no server is running", async () => { + await expect(collector.stop()).resolves.toBeUndefined(); + }); + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts index 3c61671a..ffe9f567 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts @@ -1,11 +1,11 @@ import { describe, it, expect, beforeEach } from "@jest/globals"; -import { MetricsCollector } from "../MetricsCollector"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; -describe("MetricsCollector - Prometheus features", () => { - let collector: MetricsCollector; +describe("LegacyMetricsCollector - Prometheus features", () => { + let collector: LegacyMetricsCollector; beforeEach(() => { - collector = new MetricsCollector({ slidingWindowSize: 1000 }); + collector = new LegacyMetricsCollector({ slidingWindowSize: 1000 }); }); // ── toPrometheusText() ────────────────────────────────────────── @@ -36,7 +36,7 @@ describe("MetricsCollector - Prometheus features", () => { }); it("should use custom prefix", () => { - const c = new MetricsCollector({ prefix: "myapp" }); + const c = new LegacyMetricsCollector({ prefix: "myapp" }); c.onPollStarted({ taskType: "t", workerId: "w", pollCount: 1, timestamp: new Date() }); const text = c.toPrometheusText(); expect(text).toContain("myapp_task_poll_total"); @@ -167,7 +167,7 @@ describe("MetricsCollector - Prometheus features", () => { describe("sliding window", () => { it("should trim observations beyond window size", () => { - const small = new MetricsCollector({ slidingWindowSize: 5 }); + const small = new LegacyMetricsCollector({ slidingWindowSize: 5 }); for (let i = 0; i < 10; i++) { small.onPollCompleted({ taskType: "t", workerId: "w", durationMs: i, pollCount: i, taskCount: 1, timestamp: new Date() }); } diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts index 691d9325..3e8d8ebb 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts @@ -1,11 +1,11 @@ import { describe, it, expect, beforeEach } from "@jest/globals"; -import { MetricsCollector } from "../MetricsCollector"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; -describe("MetricsCollector", () => { - let collector: MetricsCollector; +describe("LegacyMetricsCollector", () => { + let collector: LegacyMetricsCollector; beforeEach(() => { - collector = new MetricsCollector(); + collector = new LegacyMetricsCollector(); }); describe("poll metrics", () => { diff --git a/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts b/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts index bbe5153f..c0b423f7 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts @@ -1,6 +1,6 @@ import { describe, it, expect, afterEach } from "@jest/globals"; import { get as httpGet } from "node:http"; -import { MetricsCollector } from "../MetricsCollector"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; import { MetricsServer } from "../MetricsServer"; function fetchHttp(url: string): Promise<{ status: number; body: string; headers: Record }> { @@ -38,7 +38,7 @@ describe("MetricsServer", () => { it("should serve /metrics with Prometheus text format", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); collector.onPollStarted({ taskType: "test_task", workerId: "w", pollCount: 1, timestamp: new Date() }); server = new MetricsServer(collector, port); @@ -51,7 +51,7 @@ describe("MetricsServer", () => { it("should serve /health with JSON status", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); @@ -62,7 +62,7 @@ describe("MetricsServer", () => { it("should return 404 for unknown paths", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); @@ -72,7 +72,7 @@ describe("MetricsServer", () => { it("should stop cleanly after start", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); await server.stop(); @@ -80,7 +80,7 @@ describe("MetricsServer", () => { }); it("should not throw when stop() called without start()", async () => { - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, nextPort()); await server.stop(); server = undefined; @@ -88,7 +88,7 @@ describe("MetricsServer", () => { it("should not throw when start() called twice", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); await server.start(); // second call should be a no-op diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts new file mode 100644 index 00000000..a0149968 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts @@ -0,0 +1,99 @@ +import { describe, it, expect, afterEach } from "@jest/globals"; +import { createMetricsCollector } from "../metricsFactory"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; +import { CanonicalMetricsCollector } from "../CanonicalMetricsCollector"; + +describe("createMetricsCollector", () => { + const originalEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + it("should return LegacyMetricsCollector by default", () => { + delete process.env.WORKER_CANONICAL_METRICS; + delete process.env.WORKER_LEGACY_METRICS; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + }); + + it("should return LegacyMetricsCollector when WORKER_LEGACY_METRICS=true", () => { + process.env.WORKER_LEGACY_METRICS = "true"; + delete process.env.WORKER_CANONICAL_METRICS; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + }); + + it("should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS=true", () => { + process.env.WORKER_CANONICAL_METRICS = "true"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it("should prefer canonical when both env vars are true", () => { + process.env.WORKER_CANONICAL_METRICS = "true"; + process.env.WORKER_LEGACY_METRICS = "true"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it("should return LegacyMetricsCollector when WORKER_CANONICAL_METRICS=false", () => { + process.env.WORKER_CANONICAL_METRICS = "false"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + }); + + it("should be case-insensitive for env var value", () => { + process.env.WORKER_CANONICAL_METRICS = "TRUE"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it("should pass config through to the collector", () => { + delete process.env.WORKER_CANONICAL_METRICS; + + const collector = createMetricsCollector({ prefix: "custom_prefix" }); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + const text = collector.toPrometheusText(); + // No metrics recorded yet, so empty, but the collector was created successfully + expect(typeof text).toBe("string"); + }); + + it("both implementations satisfy MetricsCollectorInterface", () => { + const legacy = createMetricsCollector(); + const requiredMethods = [ + "recordTaskExecutionQueueFull", + "recordUncaughtException", + "recordWorkerRestart", + "recordTaskPaused", + "recordTaskAckError", + "recordTaskAckFailed", + "recordWorkflowStartError", + "recordExternalPayloadUsed", + "recordWorkflowInputSize", + "recordApiRequestTime", + "getMetrics", + "reset", + "stop", + "getContentType", + "toPrometheusText", + "toPrometheusTextAsync", + ]; + + for (const method of requiredMethods) { + expect(typeof (legacy as unknown as Record)[method]).toBe("function"); + } + + process.env.WORKER_CANONICAL_METRICS = "true"; + const canonical = createMetricsCollector(); + for (const method of requiredMethods) { + expect(typeof (canonical as unknown as Record)[method]).toBe("function"); + } + }); +}); diff --git a/src/sdk/worker/metrics/accumulators.ts b/src/sdk/worker/metrics/accumulators.ts new file mode 100644 index 00000000..de8d48ae --- /dev/null +++ b/src/sdk/worker/metrics/accumulators.ts @@ -0,0 +1,177 @@ +/** + * In-memory Prometheus-compatible metric accumulators used by the + * canonical metrics implementation. + */ + +export const TIME_BUCKETS = [ + 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, +] as const; + +export const SIZE_BUCKETS = [ + 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, +] as const; + +export function labelKey(labels: Record): string { + const keys = Object.keys(labels).sort(); + return keys.map((k) => `${k}=${labels[k]}`).join(","); +} + +export function renderLabels(labels: Record): string { + return Object.entries(labels) + .map(([k, v]) => `${k}="${v}"`) + .join(","); +} + +export function exceptionLabel(error: unknown): string { + if (error instanceof Error) { + return error.name || error.constructor?.name || "Error"; + } + return "Error"; +} + +// ── HistogramAccumulator ───────────────────────────────────────── + +interface HistogramSeries { + labels: Record; + buckets: number[]; + count: number; + sum: number; +} + +export class HistogramAccumulator { + private readonly _boundaries: readonly number[]; + private _series = new Map(); + + constructor(boundaries: readonly number[] = TIME_BUCKETS) { + this._boundaries = boundaries; + } + + observe(labels: Record, value: number): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { + labels, + buckets: new Array(this._boundaries.length).fill(0), + count: 0, + sum: 0, + }; + this._series.set(key, s); + } + for (let i = 0; i < this._boundaries.length; i++) { + if (value <= this._boundaries[i]) { + s.buckets[i]++; + } + } + s.count++; + s.sum += value; + } + + render(name: string, help: string): string { + if (this._series.size === 0) return ""; + const lines: string[] = []; + lines.push(`# HELP ${name} ${help}`); + lines.push(`# TYPE ${name} histogram`); + for (const s of this._series.values()) { + const lblStr = renderLabels(s.labels); + const sep = lblStr ? "," : ""; + for (let i = 0; i < this._boundaries.length; i++) { + lines.push( + `${name}_bucket{${lblStr}${sep}le="${this._boundaries[i]}"} ${s.buckets[i]}`, + ); + } + lines.push(`${name}_bucket{${lblStr}${sep}le="+Inf"} ${s.count}`); + lines.push(`${name}_sum{${lblStr}} ${s.sum}`); + lines.push(`${name}_count{${lblStr}} ${s.count}`); + } + return lines.join("\n"); + } +} + +// ── MultiLabelCounter ──────────────────────────────────────────── + +interface CounterSeries { + labels: Record; + value: number; +} + +export class MultiLabelCounter { + private _series = new Map(); + + increment(labels: Record, value = 1): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value += value; + } + + render(name: string, help: string): string { + if (this._series.size === 0) return ""; + const lines: string[] = []; + lines.push(`# HELP ${name} ${help}`); + lines.push(`# TYPE ${name} counter`); + for (const s of this._series.values()) { + lines.push(`${name}{${renderLabels(s.labels)}} ${s.value}`); + } + return lines.join("\n"); + } +} + +// ── GaugeMetric ────────────────────────────────────────────────── + +interface GaugeSeries { + labels: Record; + value: number; +} + +export class GaugeMetric { + private _series = new Map(); + + set(labels: Record, value: number): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value = value; + } + + inc(labels: Record, delta = 1): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value += delta; + } + + dec(labels: Record, delta = 1): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value -= delta; + } + + getValue(labels: Record): number { + return this._series.get(labelKey(labels))?.value ?? 0; + } + + render(name: string, help: string): string { + if (this._series.size === 0) return ""; + const lines: string[] = []; + lines.push(`# HELP ${name} ${help}`); + lines.push(`# TYPE ${name} gauge`); + for (const s of this._series.values()) { + lines.push(`${name}{${renderLabels(s.labels)}} ${s.value}`); + } + return lines.join("\n"); + } +} diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts new file mode 100644 index 00000000..fa2cbc76 --- /dev/null +++ b/src/sdk/worker/metrics/httpObserver.ts @@ -0,0 +1,40 @@ +/** + * Global metrics observer for recording API client request latency + * and workflow-level metrics from code outside the event system. + * + * Both LegacyMetricsCollector and CanonicalMetricsCollector register + * themselves here on construction; fetchWithRetry and WorkflowExecutor + * call the observer without needing a direct reference. + */ + +export interface HttpMetricsObserver { + recordApiRequestTime( + method: string, + uri: string, + status: number | string, + durationMs: number, + ): void; + + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + version?: string, + ): void; + + recordWorkflowStartError( + workflowType?: string, + exception?: string, + ): void; +} + +let _observer: HttpMetricsObserver | undefined; + +export function setHttpMetricsObserver( + observer: HttpMetricsObserver | undefined, +): void { + _observer = observer; +} + +export function getHttpMetricsObserver(): HttpMetricsObserver | undefined { + return _observer; +} diff --git a/src/sdk/worker/metrics/index.ts b/src/sdk/worker/metrics/index.ts index 9869a1c7..732e02f7 100644 --- a/src/sdk/worker/metrics/index.ts +++ b/src/sdk/worker/metrics/index.ts @@ -1,7 +1,37 @@ +// Interface +export type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; + +// Implementations export { - MetricsCollector, + LegacyMetricsCollector, type MetricsCollectorConfig, type WorkerMetrics, -} from "./MetricsCollector"; +} from "./LegacyMetricsCollector"; +export { CanonicalMetricsCollector } from "./CanonicalMetricsCollector"; + +// Backward-compat alias: MetricsCollector → LegacyMetricsCollector +export { LegacyMetricsCollector as MetricsCollector } from "./LegacyMetricsCollector"; + +// Factory +export { createMetricsCollector } from "./metricsFactory"; + +// HTTP observer +export { + type HttpMetricsObserver, + getHttpMetricsObserver, + setHttpMetricsObserver, +} from "./httpObserver"; + +// Server & registries export { MetricsServer } from "./MetricsServer"; export { PrometheusRegistry } from "./PrometheusRegistry"; +export { CanonicalPrometheusRegistry } from "./CanonicalPrometheusRegistry"; + +// Accumulators (exposed for advanced usage / testing) +export { + HistogramAccumulator, + MultiLabelCounter, + GaugeMetric, + TIME_BUCKETS, + SIZE_BUCKETS, +} from "./accumulators"; diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts new file mode 100644 index 00000000..870ef843 --- /dev/null +++ b/src/sdk/worker/metrics/metricsFactory.ts @@ -0,0 +1,26 @@ +import type { MetricsCollectorConfig } from "./LegacyMetricsCollector"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; +import { LegacyMetricsCollector } from "./LegacyMetricsCollector"; +import { CanonicalMetricsCollector } from "./CanonicalMetricsCollector"; + +/** + * Create the appropriate MetricsCollector based on environment variables. + * + * - WORKER_CANONICAL_METRICS=true -> CanonicalMetricsCollector (default: false) + * - WORKER_LEGACY_METRICS=true -> LegacyMetricsCollector (default: true) + * + * WORKER_CANONICAL_METRICS takes priority when both are set. + * During the deprecation transition period the default is legacy. + */ +export function createMetricsCollector( + config?: MetricsCollectorConfig, +): MetricsCollectorInterface { + const useCanonical = + process.env.WORKER_CANONICAL_METRICS?.toLowerCase() === "true"; + + if (useCanonical) { + return new CanonicalMetricsCollector(config); + } + + return new LegacyMetricsCollector(config); +} From 083f173273ff471f02f3650f4fe6010a04a971d6 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 29 Apr 2026 10:37:28 -0600 Subject: [PATCH 02/21] standardize on what truthiness means for env var --- harness/main.ts | 4 +++- .../metrics/__tests__/metricsFactory.test.ts | 14 ++++++++++++++ src/sdk/worker/metrics/metricsFactory.ts | 5 +++-- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/harness/main.ts b/harness/main.ts index b7aadeed..ad30b732 100644 --- a/harness/main.ts +++ b/harness/main.ts @@ -94,7 +94,9 @@ async function main(): Promise { const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991); const metricsCollector = createMetricsCollector({ httpPort: metricsPort }); - const variant = process.env.WORKER_CANONICAL_METRICS?.toLowerCase() === "true" + const variant = ["true", "1", "yes"].includes( + (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(), + ) ? "canonical" : "legacy"; console.log(`Prometheus metrics server started on port ${metricsPort} (${variant} metrics)`); diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts index a0149968..f5a28f91 100644 --- a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts +++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts @@ -55,6 +55,20 @@ describe("createMetricsCollector", () => { expect(collector).toBeInstanceOf(CanonicalMetricsCollector); }); + it('should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS="1"', () => { + process.env.WORKER_CANONICAL_METRICS = "1"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it('should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS="yes"', () => { + process.env.WORKER_CANONICAL_METRICS = "yes"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + it("should pass config through to the collector", () => { delete process.env.WORKER_CANONICAL_METRICS; diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts index 870ef843..bcb45588 100644 --- a/src/sdk/worker/metrics/metricsFactory.ts +++ b/src/sdk/worker/metrics/metricsFactory.ts @@ -15,8 +15,9 @@ import { CanonicalMetricsCollector } from "./CanonicalMetricsCollector"; export function createMetricsCollector( config?: MetricsCollectorConfig, ): MetricsCollectorInterface { - const useCanonical = - process.env.WORKER_CANONICAL_METRICS?.toLowerCase() === "true"; + const useCanonical = ["true", "1", "yes"].includes( + (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(), + ); if (useCanonical) { return new CanonicalMetricsCollector(config); From be00e0802d8e26fad419f8b20c76b04b5bb2ec80 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 5 May 2026 09:25:39 -0600 Subject: [PATCH 03/21] add some tests for new stuff --- jest.config.mjs | 1 + .../events/__tests__/EventDispatcher.test.ts | 10 + .../helpers/__tests__/fetchWithRetry.test.ts | 106 ++++++- .../CanonicalMetricsCollector.test.ts | 112 +++++++ .../CanonicalPrometheusRegistry.test.ts | 176 +++++++++++ .../metrics/__tests__/accumulators.test.ts | 283 ++++++++++++++++++ .../metrics/__tests__/httpObserver.test.ts | 58 ++++ 7 files changed, 745 insertions(+), 1 deletion(-) create mode 100644 src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts create mode 100644 src/sdk/worker/metrics/__tests__/accumulators.test.ts create mode 100644 src/sdk/worker/metrics/__tests__/httpObserver.test.ts diff --git a/jest.config.mjs b/jest.config.mjs index d8703a14..f8407f50 100644 --- a/jest.config.mjs +++ b/jest.config.mjs @@ -21,6 +21,7 @@ export default { "^@/(.*)$": "/src/$1", "^@open-api/(.*)$": "/src/open-api/$1", "^@test-utils/(.*)$": "/src/integration-tests/utils/$1", + "^(.*)\\.js$": "$1", }, transform: { "^.+\\.tsx?$": [ diff --git a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts index bc2f2dc6..b99310bf 100644 --- a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts +++ b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts @@ -9,6 +9,7 @@ import type { TaskExecutionFailure, TaskUpdateCompleted, TaskUpdateFailure, + TaskPaused, } from "../types"; describe("EventDispatcher", () => { @@ -143,6 +144,7 @@ describe("EventDispatcher", () => { onTaskExecutionFailure: jest.fn<() => void>(), onTaskUpdateCompleted: jest.fn<() => void>(), onTaskUpdateFailure: jest.fn<() => void>(), + onTaskPaused: jest.fn<() => void>(), }; dispatcher.register(listener); @@ -231,6 +233,14 @@ describe("EventDispatcher", () => { }; await dispatcher.publishTaskUpdateFailure(updateFailure); expect(listener.onTaskUpdateFailure).toHaveBeenCalledWith(updateFailure); + + // Test TaskPaused + const taskPaused: TaskPaused = { + taskType: "test-task", + timestamp: new Date(), + }; + await dispatcher.publishTaskPaused(taskPaused); + expect(listener.onTaskPaused).toHaveBeenCalledWith(taskPaused); }); test("should have zero overhead when no listeners registered", async () => { diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts index 76f73f9b..887c0665 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts @@ -1,5 +1,6 @@ -import { jest, expect, describe, it, beforeEach } from "@jest/globals"; +import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals"; import { retryFetch, wrapFetchWithRetry, applyTimeout } from "../fetchWithRetry"; +import * as httpObserver from "@/sdk/worker/metrics/httpObserver"; const createMockResponse = (status: number, body = ""): Response => new Response(body, { status, statusText: `Status ${status}` }); @@ -525,4 +526,107 @@ describe("fetchWithRetry", () => { expect(result.status).toBe(200); }); }); + + // ─── wrapFetchWithRetry metrics recording ─────────────────────────── + + describe("wrapFetchWithRetry metrics", () => { + const mockRecordApiRequestTime = jest.fn< + (m: string, u: string, s: string, d: number) => void + >(); + + beforeEach(() => { + mockRecordApiRequestTime.mockClear(); + httpObserver.setHttpMetricsObserver({ + recordApiRequestTime: mockRecordApiRequestTime, + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }); + }); + + afterEach(() => { + httpObserver.setHttpMetricsObserver(undefined); + }); + + it("should record API request time on successful response", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch("http://test.com/api/tasks", { method: "POST" }); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri, status, duration] = + mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("POST"); + expect(uri).toBe("/api/tasks"); + expect(status).toBe("200"); + expect(duration).toBeGreaterThanOrEqual(0); + }); + + it("should record status '0' on network failure", async () => { + mockFetch.mockRejectedValue(new Error("ECONNRESET")); + + const wrappedFetch = wrapFetchWithRetry(mockFetch, { + maxTransportRetries: 0, + }); + + await expect( + wrappedFetch("http://test.com/api/workflow") + ).rejects.toThrow("ECONNRESET"); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri, status] = + mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("GET"); + expect(uri).toBe("/api/workflow"); + expect(status).toBe("0"); + }); + + it("should extract method and URI from string URL", async () => { + mockFetch.mockResolvedValue(createMockResponse(201)); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch("http://example.com/tasks/123", { method: "PUT" }); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri] = mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("PUT"); + expect(uri).toBe("/tasks/123"); + }); + + it("should extract method and URI from Request object", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + const request = new Request("http://example.com/api/metadata", { + method: "DELETE", + }); + await wrappedFetch(request); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri] = mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("DELETE"); + expect(uri).toBe("/api/metadata"); + }); + + it("should not break fetch when no observer is registered", async () => { + httpObserver.setHttpMetricsObserver(undefined); + + mockFetch.mockResolvedValue(createMockResponse(200)); + const wrappedFetch = wrapFetchWithRetry(mockFetch); + const result = await wrappedFetch("http://test.com"); + + expect(result.status).toBe(200); + }); + + it("should default method to GET when init has no method", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch("http://test.com/path"); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method] = mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("GET"); + }); + }); }); diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts index 6b616ce3..369eeee3 100644 --- a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts +++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts @@ -318,4 +318,116 @@ describe("CanonicalMetricsCollector", () => { await expect(collector.stop()).resolves.toBeUndefined(); }); }); + + describe("default argument handling", () => { + it("recordUncaughtException with no arg should default to 'Error'", () => { + collector.recordUncaughtException(); + const text = collector.toPrometheusText(); + expect(text).toContain('thread_uncaught_exceptions_total{exception="Error"} 1'); + }); + + it("recordWorkflowStartError with no args should default both labels", () => { + collector.recordWorkflowStartError(); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_start_error_total{workflowType="",exception="Error"} 1'); + }); + + it("recordWorkflowStartError with only workflowType should default exception", () => { + collector.recordWorkflowStartError("my_wf"); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_start_error_total{workflowType="my_wf",exception="Error"} 1'); + }); + + it("recordExternalPayloadUsed with missing optional args should default to empty strings", () => { + collector.recordExternalPayloadUsed("TASK_OUTPUT"); + const text = collector.toPrometheusText(); + expect(text).toContain('external_payload_used_total{entityName="",operation="",payloadType="TASK_OUTPUT"} 1'); + }); + + it("recordTaskAckError with no exception arg should default to 'Error'", () => { + collector.recordTaskAckError("task_a"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_ack_error_total{taskType="task_a",exception="Error"} 1'); + }); + + it("recordWorkflowInputSize with no version should default to empty string", () => { + collector.recordWorkflowInputSize("my_wf", 1000); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_input_size_bytes_sum{workflowType="my_wf",version=""} 1000'); + }); + }); + + describe("execution completion without outputSizeBytes", () => { + it("should not emit task_result_size_bytes when outputSizeBytes is undefined", () => { + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 200, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("task_execute_time_seconds"); + expect(text).not.toContain("task_result_size_bytes"); + }); + }); + + describe("output helpers without prom-client", () => { + it("getContentType should return plain text fallback", () => { + expect(collector.getContentType()).toBe( + "text/plain; version=0.0.4; charset=utf-8" + ); + }); + + it("toPrometheusTextAsync should fall back to sync text", async () => { + collector.onPollStarted({ + taskType: "t", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const asyncText = await collector.toPrometheusTextAsync(); + const syncText = collector.toPrometheusText(); + expect(asyncText).toBe(syncText); + }); + + it("toPrometheusText should ignore _prefix argument", () => { + collector.onPollStarted({ + taskType: "t", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText("some_prefix"); + expect(text).not.toContain("some_prefix"); + expect(text).toContain("task_poll_total{"); + }); + }); + + describe("active_workers gauge on failure", () => { + it("should decrement active_workers on task execution failure", () => { + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + timestamp: new Date(), + }); + + let text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 1'); + + collector.onTaskExecutionFailure({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + cause: new Error("fail"), + durationMs: 100, + timestamp: new Date(), + }); + + text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 0'); + }); + }); }); diff --git a/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts new file mode 100644 index 00000000..449955eb --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts @@ -0,0 +1,176 @@ +import { describe, it, expect, beforeEach } from "@jest/globals"; +import { CanonicalPrometheusRegistry } from "../CanonicalPrometheusRegistry"; + +describe("CanonicalPrometheusRegistry", () => { + let registry: CanonicalPrometheusRegistry; + + beforeEach(async () => { + try { + const promClient = await import("prom-client"); + promClient.register.clear(); + } catch { + // prom-client not installed — skip cleanup + } + registry = new CanonicalPrometheusRegistry(); + }); + + describe("before initialization", () => { + it("should not be available", () => { + expect(registry.available).toBe(false); + }); + + it("should return default content type", () => { + expect(registry.contentType).toBe( + "text/plain; version=0.0.4; charset=utf-8" + ); + }); + + it("should return empty string from metrics()", async () => { + expect(await registry.metrics()).toBe(""); + }); + + it("should not throw when incrementCounter is called", () => { + expect(() => + registry.incrementCounter("task_poll_total", { taskType: "t" }) + ).not.toThrow(); + }); + + it("should not throw when observeHistogram is called", () => { + expect(() => + registry.observeHistogram("task_poll_time_seconds", { taskType: "t" }, 0.1) + ).not.toThrow(); + }); + + it("should not throw when setGauge is called", () => { + expect(() => + registry.setGauge("active_workers", { taskType: "t" }, 5) + ).not.toThrow(); + }); + }); + + describe("initialize()", () => { + it("should return true when prom-client is installed", async () => { + const result = await registry.initialize(); + expect(result).toBe(true); + expect(registry.available).toBe(true); + }); + + it("should set contentType from prom-client registry", async () => { + await registry.initialize(); + expect(registry.contentType).toBeDefined(); + expect(registry.contentType.length).toBeGreaterThan(0); + }); + }); + + describe("after initialization", () => { + beforeEach(async () => { + await registry.initialize(); + }); + + it("should be available", () => { + expect(registry.available).toBe(true); + }); + + it("should return metrics text from prom-client", async () => { + const text = await registry.metrics(); + expect(typeof text).toBe("string"); + }); + + it("should increment a counter and see it in metrics output", async () => { + registry.incrementCounter("task_poll_total", { taskType: "test_t" }); + registry.incrementCounter("task_poll_total", { taskType: "test_t" }); + const text = await registry.metrics(); + expect(text).toContain("task_poll_total"); + expect(text).toContain("test_t"); + }); + + it("should observe a histogram and see it in metrics output", async () => { + registry.observeHistogram( + "task_poll_time_seconds", + { taskType: "test_t", status: "SUCCESS" }, + 0.05 + ); + const text = await registry.metrics(); + expect(text).toContain("task_poll_time_seconds"); + expect(text).toContain("test_t"); + }); + + it("should set a gauge and see it in metrics output", async () => { + registry.setGauge("active_workers", { taskType: "test_t" }, 3); + const text = await registry.metrics(); + expect(text).toContain("active_workers"); + expect(text).toContain("test_t"); + }); + + it("should handle unknown counter key as no-op", () => { + expect(() => + registry.incrementCounter("nonexistent", { x: "y" }) + ).not.toThrow(); + }); + + it("should handle unknown histogram key as no-op", () => { + expect(() => + registry.observeHistogram("nonexistent", { x: "y" }, 1) + ).not.toThrow(); + }); + + it("should handle unknown gauge key as no-op", () => { + expect(() => + registry.setGauge("nonexistent", { x: "y" }, 1) + ).not.toThrow(); + }); + + it("should accept custom increment value for counter", () => { + expect(() => + registry.incrementCounter("task_poll_total", { taskType: "t" }, 5) + ).not.toThrow(); + }); + + it("should record all counter types", async () => { + registry.incrementCounter("task_poll_total", { taskType: "t" }); + registry.incrementCounter("task_execution_started_total", { taskType: "t" }); + registry.incrementCounter("task_poll_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_execute_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_update_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_ack_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_ack_failed_total", { taskType: "t" }); + registry.incrementCounter("task_execution_queue_full_total", { taskType: "t" }); + registry.incrementCounter("task_paused_total", { taskType: "t" }); + registry.incrementCounter("thread_uncaught_exceptions_total", { exception: "Error" }); + registry.incrementCounter("external_payload_used_total", { entityName: "e", operation: "o", payloadType: "p" }); + registry.incrementCounter("workflow_start_error_total", { workflowType: "w", exception: "Error" }); + + const text = await registry.metrics(); + expect(text).toContain("task_poll_total"); + expect(text).toContain("task_execution_started_total"); + expect(text).toContain("task_poll_error_total"); + expect(text).toContain("task_ack_failed_total"); + expect(text).toContain("thread_uncaught_exceptions_total"); + expect(text).toContain("external_payload_used_total"); + expect(text).toContain("workflow_start_error_total"); + }); + + it("should record all histogram types", async () => { + registry.observeHistogram("task_poll_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.1); + registry.observeHistogram("task_execute_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.5); + registry.observeHistogram("task_update_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.02); + registry.observeHistogram("http_api_client_request_seconds", { method: "GET", uri: "/api", status: "200" }, 0.1); + registry.observeHistogram("task_result_size_bytes", { taskType: "t" }, 1024); + registry.observeHistogram("workflow_input_size_bytes", { workflowType: "w", version: "1" }, 512); + + const text = await registry.metrics(); + expect(text).toContain("task_poll_time_seconds"); + expect(text).toContain("task_execute_time_seconds"); + expect(text).toContain("task_update_time_seconds"); + expect(text).toContain("http_api_client_request_seconds"); + expect(text).toContain("task_result_size_bytes"); + expect(text).toContain("workflow_input_size_bytes"); + }); + + it("should record gauge metric", async () => { + registry.setGauge("active_workers", { taskType: "t" }, 7); + const text = await registry.metrics(); + expect(text).toContain("active_workers"); + }); + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/accumulators.test.ts b/src/sdk/worker/metrics/__tests__/accumulators.test.ts new file mode 100644 index 00000000..d25c3181 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/accumulators.test.ts @@ -0,0 +1,283 @@ +import { describe, it, expect } from "@jest/globals"; +import { + labelKey, + renderLabels, + exceptionLabel, + HistogramAccumulator, + MultiLabelCounter, + GaugeMetric, + TIME_BUCKETS, + SIZE_BUCKETS, +} from "../accumulators"; + +describe("labelKey", () => { + it("should return empty string for empty labels", () => { + expect(labelKey({})).toBe(""); + }); + + it("should return key=value for a single label", () => { + expect(labelKey({ taskType: "my_task" })).toBe("taskType=my_task"); + }); + + it("should sort keys alphabetically", () => { + expect(labelKey({ z: "1", a: "2", m: "3" })).toBe("a=2,m=3,z=1"); + }); + + it("should produce the same key regardless of insertion order", () => { + const key1 = labelKey({ status: "SUCCESS", taskType: "t" }); + const key2 = labelKey({ taskType: "t", status: "SUCCESS" }); + expect(key1).toBe(key2); + }); +}); + +describe("renderLabels", () => { + it("should return empty string for empty labels", () => { + expect(renderLabels({})).toBe(""); + }); + + it("should format a single label with quotes", () => { + expect(renderLabels({ taskType: "my_task" })).toBe('taskType="my_task"'); + }); + + it("should format multiple labels comma-separated", () => { + const result = renderLabels({ method: "GET", uri: "/api" }); + expect(result).toBe('method="GET",uri="/api"'); + }); +}); + +describe("exceptionLabel", () => { + it("should return Error name for standard Error", () => { + expect(exceptionLabel(new Error("oops"))).toBe("Error"); + }); + + it("should return subclass name for TypeError", () => { + expect(exceptionLabel(new TypeError("bad type"))).toBe("TypeError"); + }); + + it("should return subclass name for RangeError", () => { + expect(exceptionLabel(new RangeError("out of range"))).toBe("RangeError"); + }); + + it("should return 'Error' for non-Error values", () => { + expect(exceptionLabel("string error")).toBe("Error"); + expect(exceptionLabel(42)).toBe("Error"); + expect(exceptionLabel(null)).toBe("Error"); + expect(exceptionLabel(undefined)).toBe("Error"); + expect(exceptionLabel({ message: "not an error" })).toBe("Error"); + }); + + it("should fall back to constructor name when .name is empty", () => { + const err = new TypeError("test"); + Object.defineProperty(err, "name", { value: "" }); + expect(exceptionLabel(err)).toBe("TypeError"); + }); +}); + +describe("HistogramAccumulator", () => { + it("should render empty string when no observations", () => { + const h = new HistogramAccumulator([1, 5, 10]); + expect(h.render("test_metric", "A test")).toBe(""); + }); + + it("should place value in correct buckets", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({ taskType: "t" }, 3); + + const text = h.render("req_time", "Request time"); + expect(text).toContain("# HELP req_time Request time"); + expect(text).toContain("# TYPE req_time histogram"); + expect(text).toContain('req_time_bucket{taskType="t",le="1"} 0'); + expect(text).toContain('req_time_bucket{taskType="t",le="5"} 1'); + expect(text).toContain('req_time_bucket{taskType="t",le="10"} 1'); + expect(text).toContain('req_time_bucket{taskType="t",le="+Inf"} 1'); + expect(text).toContain('req_time_sum{taskType="t"} 3'); + expect(text).toContain('req_time_count{taskType="t"} 1'); + }); + + it("should increment all buckets at or above the value boundary", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({}, 1); // exactly on boundary + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{le="1"} 1'); + expect(text).toContain('m_bucket{le="5"} 1'); + expect(text).toContain('m_bucket{le="10"} 1'); + }); + + it("should handle value above all boundaries", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({}, 100); + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{le="1"} 0'); + expect(text).toContain('m_bucket{le="5"} 0'); + expect(text).toContain('m_bucket{le="10"} 0'); + expect(text).toContain('m_bucket{le="+Inf"} 1'); + expect(text).toContain("m_sum{} 100"); + }); + + it("should accumulate multiple observations", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({ t: "a" }, 0.5); + h.observe({ t: "a" }, 3); + h.observe({ t: "a" }, 7); + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{t="a",le="1"} 1'); + expect(text).toContain('m_bucket{t="a",le="5"} 2'); + expect(text).toContain('m_bucket{t="a",le="10"} 3'); + expect(text).toContain('m_count{t="a"} 3'); + expect(text).toContain('m_sum{t="a"} 10.5'); + }); + + it("should track separate series for different label sets", () => { + const h = new HistogramAccumulator([10]); + h.observe({ status: "OK" }, 5); + h.observe({ status: "ERR" }, 15); + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{status="OK",le="10"} 1'); + expect(text).toContain('m_bucket{status="ERR",le="10"} 0'); + expect(text).toContain('m_count{status="OK"} 1'); + expect(text).toContain('m_count{status="ERR"} 1'); + }); + + it("should default to TIME_BUCKETS when no boundaries given", () => { + const h = new HistogramAccumulator(); + h.observe({}, 0.005); + const text = h.render("m", "help"); + // TIME_BUCKETS starts at 0.001, 0.005, ... + expect(text).toContain('le="0.001"'); + expect(text).toContain('le="0.005"'); + }); +}); + +describe("MultiLabelCounter", () => { + it("should render empty string when no increments", () => { + const c = new MultiLabelCounter(); + expect(c.render("test_counter", "A test")).toBe(""); + }); + + it("should increment and render a single-label counter", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "t" }); + c.increment({ taskType: "t" }); + + const text = c.render("poll_total", "Total polls"); + expect(text).toContain("# HELP poll_total Total polls"); + expect(text).toContain("# TYPE poll_total counter"); + expect(text).toContain('poll_total{taskType="t"} 2'); + }); + + it("should support custom increment values", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "t" }, 5); + c.increment({ taskType: "t" }, 3); + + const text = c.render("m", "help"); + expect(text).toContain('m{taskType="t"} 8'); + }); + + it("should track separate series for different label sets", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "a" }); + c.increment({ taskType: "b" }); + c.increment({ taskType: "a" }); + + const text = c.render("m", "help"); + expect(text).toContain('m{taskType="a"} 2'); + expect(text).toContain('m{taskType="b"} 1'); + }); + + it("should handle multi-label counters", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "t", exception: "TypeError" }); + + const text = c.render("err", "Errors"); + expect(text).toContain('err{taskType="t",exception="TypeError"} 1'); + }); +}); + +describe("GaugeMetric", () => { + it("should render empty string when no values set", () => { + const g = new GaugeMetric(); + expect(g.render("test_gauge", "A test")).toBe(""); + }); + + it("should set and render a gauge value", () => { + const g = new GaugeMetric(); + g.set({ taskType: "t" }, 42); + + const text = g.render("active", "Active workers"); + expect(text).toContain("# HELP active Active workers"); + expect(text).toContain("# TYPE active gauge"); + expect(text).toContain('active{taskType="t"} 42'); + }); + + it("should overwrite previous value on set", () => { + const g = new GaugeMetric(); + g.set({ taskType: "t" }, 10); + g.set({ taskType: "t" }, 99); + + const text = g.render("m", "help"); + expect(text).toContain('m{taskType="t"} 99'); + expect(text).not.toContain("10"); + }); + + it("should increment with inc()", () => { + const g = new GaugeMetric(); + g.inc({ taskType: "t" }); + g.inc({ taskType: "t" }); + g.inc({ taskType: "t" }, 3); + + expect(g.getValue({ taskType: "t" })).toBe(5); + }); + + it("should decrement with dec()", () => { + const g = new GaugeMetric(); + g.inc({ taskType: "t" }, 5); + g.dec({ taskType: "t" }); + g.dec({ taskType: "t" }, 2); + + expect(g.getValue({ taskType: "t" })).toBe(2); + }); + + it("should allow negative values after dec()", () => { + const g = new GaugeMetric(); + g.dec({ taskType: "t" }); + + expect(g.getValue({ taskType: "t" })).toBe(-1); + }); + + it("should return 0 for getValue on unknown labels", () => { + const g = new GaugeMetric(); + expect(g.getValue({ taskType: "unknown" })).toBe(0); + }); + + it("should track separate series for different label sets", () => { + const g = new GaugeMetric(); + g.set({ taskType: "a" }, 10); + g.set({ taskType: "b" }, 20); + + expect(g.getValue({ taskType: "a" })).toBe(10); + expect(g.getValue({ taskType: "b" })).toBe(20); + + const text = g.render("m", "help"); + expect(text).toContain('m{taskType="a"} 10'); + expect(text).toContain('m{taskType="b"} 20'); + }); +}); + +describe("bucket constants", () => { + it("TIME_BUCKETS should be sorted ascending", () => { + for (let i = 1; i < TIME_BUCKETS.length; i++) { + expect(TIME_BUCKETS[i]).toBeGreaterThan(TIME_BUCKETS[i - 1]); + } + }); + + it("SIZE_BUCKETS should be sorted ascending", () => { + for (let i = 1; i < SIZE_BUCKETS.length; i++) { + expect(SIZE_BUCKETS[i]).toBeGreaterThan(SIZE_BUCKETS[i - 1]); + } + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/httpObserver.test.ts b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts new file mode 100644 index 00000000..04da8fa6 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts @@ -0,0 +1,58 @@ +import { describe, it, expect, beforeEach } from "@jest/globals"; +import { + getHttpMetricsObserver, + setHttpMetricsObserver, + type HttpMetricsObserver, +} from "../httpObserver"; + +describe("httpObserver", () => { + beforeEach(() => { + setHttpMetricsObserver(undefined); + }); + + it("should return undefined by default", () => { + expect(getHttpMetricsObserver()).toBeUndefined(); + }); + + it("should return the observer after setting one", () => { + const observer: HttpMetricsObserver = { + recordApiRequestTime: () => {}, + recordWorkflowInputSize: () => {}, + recordWorkflowStartError: () => {}, + }; + setHttpMetricsObserver(observer); + expect(getHttpMetricsObserver()).toBe(observer); + }); + + it("should clear the observer when set to undefined", () => { + const observer: HttpMetricsObserver = { + recordApiRequestTime: () => {}, + recordWorkflowInputSize: () => {}, + recordWorkflowStartError: () => {}, + }; + setHttpMetricsObserver(observer); + expect(getHttpMetricsObserver()).toBe(observer); + + setHttpMetricsObserver(undefined); + expect(getHttpMetricsObserver()).toBeUndefined(); + }); + + it("should replace the observer on subsequent set calls", () => { + const observer1: HttpMetricsObserver = { + recordApiRequestTime: () => {}, + recordWorkflowInputSize: () => {}, + recordWorkflowStartError: () => {}, + }; + const observer2: HttpMetricsObserver = { + recordApiRequestTime: () => {}, + recordWorkflowInputSize: () => {}, + recordWorkflowStartError: () => {}, + }; + + setHttpMetricsObserver(observer1); + expect(getHttpMetricsObserver()).toBe(observer1); + + setHttpMetricsObserver(observer2); + expect(getHttpMetricsObserver()).toBe(observer2); + }); +}); From 2384d2dd4cc2d7e7c5cfcec6d4e3cdf9e65181f5 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 5 May 2026 09:33:41 -0600 Subject: [PATCH 04/21] delint --- .../LeaseExtension.validation.test.ts | 13 ++++------ .../builders/tasks/pullWorkflowMessages.ts | 2 +- .../worker/__tests__/LeaseTracker.test.ts | 4 +-- src/sdk/clients/worker/events/types.ts | 1 + .../metrics/__tests__/httpObserver.test.ts | 26 +++++++++---------- 5 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/integration-tests/LeaseExtension.validation.test.ts b/src/integration-tests/LeaseExtension.validation.test.ts index 015512c8..a977613b 100644 --- a/src/integration-tests/LeaseExtension.validation.test.ts +++ b/src/integration-tests/LeaseExtension.validation.test.ts @@ -36,7 +36,6 @@ import type { Client } from "../open-api"; import { MetadataClient, WorkflowExecutor, - TaskClient, orkesConductorClient, } from "../sdk"; import { LeaseTracker } from "../sdk/clients/worker/LeaseTracker"; @@ -56,7 +55,6 @@ describe("Lease Extension — end-to-end validation", () => { let client: Client; let executor: WorkflowExecutor; let metadataClient: MetadataClient; - let taskClient: TaskClient; const logger: ConductorLogger = new DefaultLogger(); const suffix = Date.now(); @@ -67,7 +65,6 @@ describe("Lease Extension — end-to-end validation", () => { client = await orkesConductorClient(); executor = new WorkflowExecutor(client); metadataClient = new MetadataClient(client); - taskClient = new TaskClient(client); await metadataClient.registerTask({ name: taskDefName, @@ -117,7 +114,7 @@ describe("Lease Extension — end-to-end validation", () => { const deadline = Date.now() + maxWaitMs; while (Date.now() < deadline) { const wf = await executor.getWorkflow(workflowId, false); - if (terminal.has(wf.status ?? "")) return wf.status!; + if (terminal.has(wf.status ?? "")) return wf.status ?? ""; await sleep(1_000); } return "STILL_RUNNING"; @@ -132,8 +129,8 @@ describe("Lease Extension — end-to-end validation", () => { const { data: tasks1 } = await TaskResource.batchPoll({ client, path: { tasktype: taskDefName }, query: { workerid: "val-worker-no-lease", count: 1, timeout: 200 } }); const [task] = tasks1 ?? []; expect(task).toBeDefined(); - const taskId1 = task.taskId!; - const wfId1 = task.workflowInstanceId!; + const taskId1 = task.taskId ?? ""; + const wfId1 = task.workflowInstanceId ?? ""; console.log(` Task polled: ${taskId1}`); console.log(` No LeaseTracker created — zero extendLease calls will be made`); @@ -177,8 +174,8 @@ describe("Lease Extension — end-to-end validation", () => { const { data: tasks2 } = await TaskResource.batchPoll({ client, path: { tasktype: taskDefName }, query: { workerid: "val-worker-with-lease", count: 1, timeout: 200 } }); const [task] = tasks2 ?? []; expect(task).toBeDefined(); - const taskId2 = task.taskId!; - const wfId2 = task.workflowInstanceId!; + const taskId2 = task.taskId ?? ""; + const wfId2 = task.workflowInstanceId ?? ""; console.log(` Task polled: ${taskId2}`); // Track heartbeat calls via a spy that ALSO sends the real heartbeat diff --git a/src/sdk/builders/tasks/pullWorkflowMessages.ts b/src/sdk/builders/tasks/pullWorkflowMessages.ts index e4b2ebae..e0bcb3c3 100644 --- a/src/sdk/builders/tasks/pullWorkflowMessages.ts +++ b/src/sdk/builders/tasks/pullWorkflowMessages.ts @@ -16,7 +16,7 @@ import { TaskType, PullWorkflowMessagesTaskDef } from "../../../open-api"; */ export const pullWorkflowMessages = ( taskReferenceName: string, - batchSize: number = 1, + batchSize = 1, optional?: boolean ): PullWorkflowMessagesTaskDef => ({ name: taskReferenceName, diff --git a/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts b/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts index 86cf4e3e..73b56823 100644 --- a/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts +++ b/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts @@ -1,5 +1,5 @@ -import { LeaseTracker, LeaseInfo } from "@/sdk/clients/worker/LeaseTracker"; -import { LEASE_EXTEND_DURATION_FACTOR, LEASE_EXTEND_RETRY_COUNT, HEARTBEAT_RETRY_DELAY_MS } from "@/sdk/clients/worker/constants"; +import { LeaseTracker } from "@/sdk/clients/worker/LeaseTracker"; +import { LEASE_EXTEND_RETRY_COUNT, HEARTBEAT_RETRY_DELAY_MS } from "@/sdk/clients/worker/constants"; import { afterEach, beforeEach, describe, expect, jest, test } from "@jest/globals"; import type { Task } from "@open-api/index"; import { mockLogger } from "@test-utils/mockLogger"; diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts index e252a25e..9c94af69 100644 --- a/src/sdk/clients/worker/events/types.ts +++ b/src/sdk/clients/worker/events/types.ts @@ -137,6 +137,7 @@ export interface TaskUpdateCompleted extends TaskRunnerEvent { /** * Event published when a poll cycle is skipped because the worker is paused. */ +// eslint-disable-next-line @typescript-eslint/no-empty-object-type export interface TaskPaused extends TaskRunnerEvent { // No additional fields — taskType is inherited from TaskRunnerEvent. } diff --git a/src/sdk/worker/metrics/__tests__/httpObserver.test.ts b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts index 04da8fa6..bf8adb42 100644 --- a/src/sdk/worker/metrics/__tests__/httpObserver.test.ts +++ b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts @@ -1,4 +1,4 @@ -import { describe, it, expect, beforeEach } from "@jest/globals"; +import { describe, it, expect, beforeEach, jest } from "@jest/globals"; import { getHttpMetricsObserver, setHttpMetricsObserver, @@ -16,9 +16,9 @@ describe("httpObserver", () => { it("should return the observer after setting one", () => { const observer: HttpMetricsObserver = { - recordApiRequestTime: () => {}, - recordWorkflowInputSize: () => {}, - recordWorkflowStartError: () => {}, + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), }; setHttpMetricsObserver(observer); expect(getHttpMetricsObserver()).toBe(observer); @@ -26,9 +26,9 @@ describe("httpObserver", () => { it("should clear the observer when set to undefined", () => { const observer: HttpMetricsObserver = { - recordApiRequestTime: () => {}, - recordWorkflowInputSize: () => {}, - recordWorkflowStartError: () => {}, + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), }; setHttpMetricsObserver(observer); expect(getHttpMetricsObserver()).toBe(observer); @@ -39,14 +39,14 @@ describe("httpObserver", () => { it("should replace the observer on subsequent set calls", () => { const observer1: HttpMetricsObserver = { - recordApiRequestTime: () => {}, - recordWorkflowInputSize: () => {}, - recordWorkflowStartError: () => {}, + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), }; const observer2: HttpMetricsObserver = { - recordApiRequestTime: () => {}, - recordWorkflowInputSize: () => {}, - recordWorkflowStartError: () => {}, + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), }; setHttpMetricsObserver(observer1); From 0b082d6c4d2d931c4f446ccd90c9e2f675f57a25 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 5 May 2026 11:02:22 -0600 Subject: [PATCH 05/21] add retry for 502,503,504 error codes --- .github/workflows/pull_request.yml | 4 +- .../helpers/__tests__/fetchWithRetry.test.ts | 74 +++++++++++++++++++ .../helpers/fetchWithRetry.ts | 12 +++ 3 files changed, 88 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 56599d35..c2271a6a 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -79,14 +79,14 @@ jobs: name: codecov-unit-node-${{ matrix.node-version }} fail_ci_if_error: false - # Integration tests (v5): one job at a time (max-parallel: 1) to avoid 502/503 on shared Conductor. + # Integration tests (v5): one shard at a time (max-parallel: 1) to avoid 502/503 on shared Conductor. # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests — keeps per-job under timeout. integration-tests: runs-on: ubuntu-latest timeout-minutes: 25 strategy: fail-fast: false - max-parallel: 2 + max-parallel: 1 matrix: node-version: [20, 22, 24] shard: [1, 2, 3] diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts index 887c0665..883c65b5 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts @@ -181,6 +181,80 @@ describe("fetchWithRetry", () => { }); }); + // ─── Server error (502/503/504) retry ─────────────────────────────── + + describe("server error (502/503/504) retry", () => { + it("should retry 502 and succeed on next attempt", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should retry 503 and succeed on next attempt", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(503, "Service Unavailable")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should retry 504 and succeed on next attempt", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(504, "Gateway Timeout")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should exhaust retries and return last 5xx response", async () => { + mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 2, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(502); + // 1 initial + 2 retries = 3 + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + + it("should handle transport error then 502 then success", async () => { + mockFetch + .mockRejectedValueOnce(new Error("ECONNRESET")) + .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + }); + // ─── Auth failure (401/403) retry ────────────────────────────────── describe("auth failure (401/403) retry", () => { diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts index 1a49d509..679a7439 100644 --- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts +++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts @@ -157,6 +157,18 @@ export const retryFetch = async ( return rateLimitResponse; } + // Gateway error retry (502, 503, 504) -- transient proxy/server errors + if (response.status >= 502 && response.status <= 504) { + lastError = new Error(`Server error: HTTP ${response.status}`); + if (transportAttempt < maxTransportRetries) { + await new Promise((resolve) => + setTimeout(resolve, withJitter(initialRetryDelay * (transportAttempt + 1))) + ); + continue; + } + return response; + } + // Auth failure retry (401/403) - only refresh+retry when the error is a token // problem (EXPIRED_TOKEN or INVALID_TOKEN). Permission errors should propagate // immediately without wasting a token refresh + retry round-trip. From 396fcc5efaf938685c4a75df5b4d499d0f873257 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 5 May 2026 11:13:25 -0600 Subject: [PATCH 06/21] experiments with flaky test runs ... --- .github/workflows/pull_request.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index c2271a6a..ddf85030 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -79,14 +79,16 @@ jobs: name: codecov-unit-node-${{ matrix.node-version }} fail_ci_if_error: false - # Integration tests (v5): one shard at a time (max-parallel: 1) to avoid 502/503 on shared Conductor. - # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests — keeps per-job under timeout. + # Integration tests (v5): lower max-parallel reduces 502/503 from the shared Conductor server + # but makes CI slower without eliminating flakes entirely — feel free to experiment - this test + # setup is insane. Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests. + # fetchWithRetry now retries 502/503/504, so higher parallelism is more viable than before. integration-tests: runs-on: ubuntu-latest timeout-minutes: 25 strategy: fail-fast: false - max-parallel: 1 + max-parallel: 3 matrix: node-version: [20, 22, 24] shard: [1, 2, 3] From bf71a1eed66367da5fc4a14c3ddf0155066c5108 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 5 May 2026 11:45:48 -0600 Subject: [PATCH 07/21] remove editorialization --- .github/workflows/pull_request.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index ddf85030..b835e6e5 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -80,8 +80,8 @@ jobs: fail_ci_if_error: false # Integration tests (v5): lower max-parallel reduces 502/503 from the shared Conductor server - # but makes CI slower without eliminating flakes entirely — feel free to experiment - this test - # setup is insane. Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests. + # but makes CI slower without eliminating flakes entirely — feel free to experiment. + # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests. # fetchWithRetry now retries 502/503/504, so higher parallelism is more viable than before. integration-tests: runs-on: ubuntu-latest From c8c4e9f288b33e3bc5afc2a88af77a9806dbd114 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 6 May 2026 09:37:01 -0600 Subject: [PATCH 08/21] cleaner reporting on which metrics implementation is used in the test harness worker --- harness/main.ts | 7 +------ .../worker/metrics/CanonicalMetricsCollector.ts | 4 ++++ src/sdk/worker/metrics/LegacyMetricsCollector.ts | 4 ++++ .../worker/metrics/MetricsCollectorInterface.ts | 1 + .../metrics/__tests__/metricsFactory.test.ts | 15 +++++++++++++++ 5 files changed, 25 insertions(+), 6 deletions(-) diff --git a/harness/main.ts b/harness/main.ts index ad30b732..081188d7 100644 --- a/harness/main.ts +++ b/harness/main.ts @@ -94,12 +94,7 @@ async function main(): Promise { const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991); const metricsCollector = createMetricsCollector({ httpPort: metricsPort }); - const variant = ["true", "1", "yes"].includes( - (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(), - ) - ? "canonical" - : "legacy"; - console.log(`Prometheus metrics server started on port ${metricsPort} (${variant} metrics)`); + console.log(`Prometheus metrics server started on port ${metricsPort} (${metricsCollector.collectorName()} metrics)`); const handler = new TaskHandler({ client, diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts index d12b8e7a..f0fa5314 100644 --- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts +++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts @@ -416,6 +416,10 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface { } } + collectorName(): string { + return "canonical"; + } + getContentType(): string { return ( this._promRegistry?.contentType ?? diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts index cbf1b3ac..381c14b5 100644 --- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts +++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts @@ -342,6 +342,10 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface { } } + collectorName(): string { + return "legacy"; + } + getContentType(): string { return ( this._promRegistry?.contentType ?? diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts index 78d0c0b0..4852b099 100644 --- a/src/sdk/worker/metrics/MetricsCollectorInterface.ts +++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts @@ -39,6 +39,7 @@ export interface MetricsCollectorInterface extends TaskRunnerEventsListener { // ── Output / lifecycle ───────────────────────────────────────── + collectorName(): string; getMetrics(): unknown; reset(): void; stop(): Promise; diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts index f5a28f91..529fad17 100644 --- a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts +++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts @@ -79,6 +79,20 @@ describe("createMetricsCollector", () => { expect(typeof text).toBe("string"); }); + it('legacy collector returns "legacy" from collectorName()', () => { + delete process.env.WORKER_CANONICAL_METRICS; + + const collector = createMetricsCollector(); + expect(collector.collectorName()).toBe("legacy"); + }); + + it('canonical collector returns "canonical" from collectorName()', () => { + process.env.WORKER_CANONICAL_METRICS = "true"; + + const collector = createMetricsCollector(); + expect(collector.collectorName()).toBe("canonical"); + }); + it("both implementations satisfy MetricsCollectorInterface", () => { const legacy = createMetricsCollector(); const requiredMethods = [ @@ -97,6 +111,7 @@ describe("createMetricsCollector", () => { "stop", "getContentType", "toPrometheusText", + "collectorName", "toPrometheusTextAsync", ]; From c3da9274c4ae09bac81725f3b28fca362b8ea2a9 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Wed, 6 May 2026 12:42:53 -0600 Subject: [PATCH 09/21] updates for metrics related documentation --- AGENTS.md | 10 +- METRICS.md | 485 ++++++++++++++++++------------ README.md | 10 +- SDK_COMPARISON.md | 27 +- SDK_DEVELOPMENT.md | 24 +- WORKER_ARCHITECTURE_COMPARISON.md | 27 +- 6 files changed, 347 insertions(+), 236 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 87f2aa09..afb1584e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -32,7 +32,7 @@ src/sdk/ # Main SDK source decorators/worker.ts # @worker decorator + dual-mode support decorators/registry.ts # Global registry (register/get/clear) context/TaskContext.ts # AsyncLocalStorage per-task context - metrics/ # MetricsCollector, MetricsServer, PrometheusRegistry + metrics/ # LegacyMetricsCollector, CanonicalMetricsCollector, metricsFactory, MetricsServer, PrometheusRegistry, CanonicalPrometheusRegistry, accumulators, httpObserver schema/ # jsonSchema, schemaField decorators generators/ # Legacy generators (pre-v3, still exported for compat) src/open-api/ # OpenAPI layer @@ -211,10 +211,10 @@ public async someMethod(args): Promise { ### Metrics Documentation (METRICS.md) -When adding, removing, or renaming metrics in `src/sdk/worker/metrics/MetricsCollector.ts`: -1. Update `METRICS.md` to reflect the change (name, type, labels, description) -2. Ensure both `MetricsCollector.toPrometheusText()` and `PrometheusRegistry.createMetrics()` are updated in sync — missing a summary/counter in either causes silent data loss -3. Update the metric count in the METRICS.md overview section +When adding, removing, or renaming metrics in `src/sdk/worker/metrics/`: +1. Update both `LegacyMetricsCollector.ts` and `CanonicalMetricsCollector.ts` (or add a no-op stub in the collector that does not emit the metric) +2. Ensure `toPrometheusText()` and the corresponding `PrometheusRegistry` / `CanonicalPrometheusRegistry` are updated in sync — missing a metric in either causes silent data loss +3. Update `METRICS.md` to reflect the change in both the legacy and canonical catalog tables 4. Add or update the corresponding direct recording method documentation if applicable ### SDK_NEW_LANGUAGE_GUIDE.md diff --git a/METRICS.md b/METRICS.md index d8770ed8..43c22252 100644 --- a/METRICS.md +++ b/METRICS.md @@ -1,259 +1,364 @@ -# Metrics Reference +# JavaScript SDK Metrics -The Conductor JavaScript SDK provides built-in Prometheus metrics for monitoring worker performance, API latency, and task execution. +The Conductor JavaScript SDK can expose Prometheus metrics for worker polling, +task execution, task updates, workflow starts, external payload usage, and +API-client HTTP calls. -## Overview +The SDK currently has two mutually exclusive metric surfaces: -`MetricsCollector` implements `TaskRunnerEventsListener` and records **18 metric types** (12 counters + 6 summaries). Metrics are exposed in [Prometheus exposition format](https://prometheus.io/docs/instrumenting/exposition_formats/). +- **Legacy metrics** are the default. They preserve the original JavaScript SDK + names and shapes, including a `conductor_worker_` prefix, `task_type` labels, + millisecond time units, and Summary type for distributions. +- **Canonical metrics** are opt-in with `WORKER_CANONICAL_METRICS=true`. They + use the cross-SDK canonical names, labels, units, and Prometheus histogram + shapes. -- **Default prefix:** `conductor_worker` -- **Quantiles:** p50, p75, p90, p95, p99 (computed from a sliding window) -- **Sliding window:** Last 1,000 observations (configurable) +Only one collector is active at a time. The SDK does not emit legacy and +canonical metrics at the same time. -## Quick Start +Metrics are created lazily. A metric appears in `/metrics` only after the +corresponding worker event or collector method records it. Some low-level +surface metrics, such as ack, queue-full, paused, and uncaught-exception +counters, may not appear in normal worker runs unless that path is exercised. -### HTTP Server +## Usage -```typescript -import { MetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; +Create a metrics collector, start a scrape server, and wire the collector into +`TaskHandler` as an event listener: -const metrics = new MetricsCollector({ httpPort: 9090 }); +```typescript +import { + createMetricsCollector, + MetricsServer, + TaskHandler, +} from "@io-orkes/conductor-javascript"; + +const metrics = createMetricsCollector(); +const server = new MetricsServer(metrics, 9090); +await server.start(); const handler = new TaskHandler({ client, - scanForDecorated: true, eventListeners: [metrics], + scanForDecorated: true, }); await handler.startWorkers(); // GET http://localhost:9090/metrics — Prometheus text format -// GET http://localhost:9090/health — { "status": "UP" } +// GET http://localhost:9090/health — {"status":"UP"} +``` + +`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` and returns either +a `LegacyMetricsCollector` or a `CanonicalMetricsCollector`. Both implement +`MetricsCollectorInterface`, so call sites never need to know which variant is +active. + +You can also construct a collector directly if you need to pass configuration: + +```typescript +import { LegacyMetricsCollector } from "@io-orkes/conductor-javascript"; + +const metrics = new LegacyMetricsCollector({ + httpPort: 9090, + filePath: "/tmp/conductor_metrics.prom", + fileWriteIntervalMs: 10000, + usePromClient: true, +}); ``` ### File Output ```typescript -const metrics = new MetricsCollector({ +const metrics = createMetricsCollector({ filePath: "/tmp/conductor_metrics.prom", - fileWriteIntervalMs: 10000, // write every 10s + fileWriteIntervalMs: 10000, }); ``` -The file writer performs an immediate first write, then writes periodically at the configured interval. The timer is unreferenced so it does not prevent Node.js process exit. +The file writer performs an immediate first write, then writes periodically at +the configured interval. The timer is unreferenced so it does not prevent +Node.js process exit. ### prom-client Integration ```typescript -const metrics = new MetricsCollector({ usePromClient: true }); +const metrics = createMetricsCollector({ usePromClient: true }); // Metrics are registered in prom-client's default registry. // Use prom-client's register.metrics() for native scraping. ``` -Requires `npm install prom-client`. Falls back to built-in text format if not installed. +Requires `npm install prom-client`. Falls back to built-in text format if +prom-client is not installed. -### All-in-One +## Selecting Canonical Metrics -```typescript -const metrics = new MetricsCollector({ - prefix: "myapp_worker", - httpPort: 9090, - filePath: "/tmp/metrics.prom", - fileWriteIntervalMs: 10000, - slidingWindowSize: 500, - usePromClient: true, -}); +Set `WORKER_CANONICAL_METRICS` before the worker starts: + +```shell +WORKER_CANONICAL_METRICS=true node my_worker.js ``` +Accepted true values are `true`, `1`, and `yes`, case-insensitive. Any other +value, or an unset variable, selects legacy metrics. The variable is read when +the metrics collector is created, so changing it requires a worker restart. + ## Configuration | Option | Type | Default | Description | -|--------|------|---------|-------------| -| `prefix` | `string` | `"conductor_worker"` | Prometheus metric name prefix | -| `httpPort` | `number` | — | Start built-in HTTP server on this port | -| `filePath` | `string` | — | Periodically write metrics to this file path | -| `fileWriteIntervalMs` | `number` | `5000` | File write interval in milliseconds | -| `slidingWindowSize` | `number` | `1000` | Max observations kept for quantile calculation | -| `usePromClient` | `boolean` | `false` | Use `prom-client` for native Prometheus integration | - ---- - -## Counter Metrics - -### Labeled by `task_type` - -| Prometheus Name | Internal Key | Description | -|----------------|-------------|-------------| -| `{prefix}_task_poll_total` | `pollTotal` | Total number of task polls initiated | -| `{prefix}_task_poll_error_total` | `pollErrorTotal` | Total number of failed task polls | -| `{prefix}_task_execute_total` | `taskExecutionTotal` | Total number of task executions completed | -| `{prefix}_task_execute_error_total` | `taskExecutionErrorTotal` | Total task execution errors. Label format: `taskType:ExceptionName` | -| `{prefix}_task_update_error_total` | `taskUpdateFailureTotal` | Total task result update failures (result lost from Conductor) | -| `{prefix}_task_ack_error_total` | `taskAckErrorTotal` | Total task acknowledgement errors | -| `{prefix}_task_execution_queue_full_total` | `taskExecutionQueueFullTotal` | Times the execution queue was full (concurrency limit reached) | -| `{prefix}_task_paused_total` | `taskPausedTotal` | Total task paused events | - -### Labeled by `payload_type` - -| Prometheus Name | Internal Key | Description | -|----------------|-------------|-------------| -| `{prefix}_external_payload_used_total` | `externalPayloadUsedTotal` | External payload storage usage (e.g., `"workflow_input"`, `"task_output"`) | - -### Global (no labels) - -| Prometheus Name | Internal Key | Description | -|----------------|-------------|-------------| -| `{prefix}_thread_uncaught_exceptions_total` | `uncaughtExceptionTotal` | Total uncaught exceptions in worker processes | -| `{prefix}_worker_restart_total` | `workerRestartTotal` | Total worker restart events | -| `{prefix}_workflow_start_error_total` | `workflowStartErrorTotal` | Total workflow start errors | - ---- - -## Summary Metrics - -Each summary emits quantile values, a count, and a sum: - -``` -{name}{task_type="myTask",quantile="0.5"} 12.3 -{name}{task_type="myTask",quantile="0.75"} 15.1 -{name}{task_type="myTask",quantile="0.9"} 18.7 -{name}{task_type="myTask",quantile="0.95"} 22.0 -{name}{task_type="myTask",quantile="0.99"} 45.2 -{name}_count{task_type="myTask"} 1000 -{name}_sum{task_type="myTask"} 14523.7 +|---|---|---|---| +| `prefix` | `string` | `"conductor_worker"` | Prometheus metric name prefix. Legacy only; canonical metrics are unprefixed. | +| `httpPort` | `number` | — | Start built-in HTTP server on this port. | +| `filePath` | `string` | — | Periodically write metrics to this file path. | +| `fileWriteIntervalMs` | `number` | `5000` | File write interval in milliseconds. | +| `slidingWindowSize` | `number` | `1000` | Max observations for quantile calculation. Legacy only; canonical uses histogram buckets. | +| `usePromClient` | `boolean` | `false` | Use `prom-client` for native Prometheus integration. | + +## Canonical Metrics + +Canonical timing values are seconds. Canonical size values are bytes. Label +names use camelCase. + +### Canonical Counters + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_total` | `taskType` | Incremented each time the worker issues a poll request. | +| `task_execution_started_total` | `taskType` | Incremented when a polled task is dispatched to the worker function. | +| `task_poll_error_total` | `taskType`, `exception` | Incremented when a poll request fails client-side. | +| `task_execute_error_total` | `taskType`, `exception` | Incremented when the worker function throws. | +| `task_update_error_total` | `taskType`, `exception` | Incremented when updating the task result fails. | +| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. The internal runner uses batch poll responses as ack and may not emit this during normal polling. | +| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. The internal runner uses batch poll responses as ack and may not emit this during normal polling. | +| `task_execution_queue_full_total` | `taskType` | Incremented when the worker execution queue is saturated. | +| `task_paused_total` | `taskType` | Incremented when a worker is paused and skips acting on a poll. | +| `thread_uncaught_exceptions_total` | `exception` | Incremented on uncaught exceptions in the worker process. | +| `external_payload_used_total` | `entityName`, `operation`, `payloadType` | Incremented when external payload storage is used for task or workflow payloads. | +| `workflow_start_error_total` | `workflowType`, `exception` | Incremented when starting a workflow fails client-side. | + +### Canonical Time Histograms + +All canonical time histograms use buckets: +`0.001`, `0.005`, `0.01`, `0.025`, `0.05`, `0.1`, `0.25`, `0.5`, `1`, `2.5`, +`5`, `10`. + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_time_seconds` | `taskType`, `status` | Poll request latency. `status` is `SUCCESS` or `FAILURE`. | +| `task_execute_time_seconds` | `taskType`, `status` | Worker function execution duration. `status` is `SUCCESS` or `FAILURE`. | +| `task_update_time_seconds` | `taskType`, `status` | Task-result update latency. `status` is `SUCCESS` or `FAILURE`. | +| `http_api_client_request_seconds` | `method`, `uri`, `status` | API-client HTTP request latency. `status` is the HTTP status code as a string, or `"0"` on network failure. | + +Each histogram exposes Prometheus series such as: + +```prometheus +task_execute_time_seconds_bucket{taskType="my_task",status="SUCCESS",le="0.1"} 42 +task_execute_time_seconds_count{taskType="my_task",status="SUCCESS"} 50 +task_execute_time_seconds_sum{taskType="my_task",status="SUCCESS"} 2.3 ``` -### Labeled by `task_type` +### Canonical Size Histograms -| Prometheus Name | Internal Key | Unit | Description | -|----------------|-------------|------|-------------| -| `{prefix}_task_poll_time` | `pollDurationMs` | ms | Task poll round-trip duration | -| `{prefix}_task_execute_time` | `executionDurationMs` | ms | Worker function execution duration | -| `{prefix}_task_update_time` | `updateDurationMs` | ms | Task result update (SDK to server) duration | -| `{prefix}_task_result_size_bytes` | `outputSizeBytes` | bytes | Task result output payload size | +All canonical size histograms use buckets: +`100`, `1000`, `10000`, `100000`, `1000000`, `10000000`. -### Labeled by `workflow_type` +| Metric | Labels | Description | +|---|---|---| +| `task_result_size_bytes` | `taskType` | Serialized task result output size. | +| `workflow_input_size_bytes` | `workflowType`, `version` | Serialized workflow input size. `version` is an empty string when not provided. | -| Prometheus Name | Internal Key | Unit | Description | -|----------------|-------------|------|-------------| -| `{prefix}_workflow_input_size_bytes` | `workflowInputSizeBytes` | bytes | Workflow input payload size | +### Canonical Gauges -### Labeled by `endpoint` +| Metric | Labels | Description | +|---|---|---| +| `active_workers` | `taskType` | Current number of workers actively executing tasks. | -| Prometheus Name | Internal Key | Unit | Description | -|----------------|-------------|------|-------------| -| `{prefix}_http_api_client_request` | `apiRequestDurationMs` | ms | API request duration. Label format: `METHOD:/api/path:STATUS` | +## Legacy Metrics ---- +Legacy mode is the default so existing dashboards and alerts continue to work. +The default metric name prefix is `conductor_worker`. The prefix is configurable +via the `prefix` option on `MetricsCollectorConfig`. -## Event Listener Methods +Distribution metrics are sliding-window summaries over the latest 1,000 +observations (configurable via `slidingWindowSize`), exposing quantiles at +p50, p75, p90, p95, and p99. Legacy distribution metrics also expose `_count` +and `_sum` series. -These methods are called automatically by the `TaskRunner` when `MetricsCollector` is registered as an event listener: +### Legacy Counters -| Method | Metrics Updated | -|--------|----------------| -| `onPollStarted(event)` | Increments `pollTotal` | -| `onPollCompleted(event)` | Records `pollDurationMs` | -| `onPollFailure(event)` | Increments `pollErrorTotal`, records `pollDurationMs` | -| `onTaskExecutionStarted(event)` | _(no-op, counted on completion)_ | -| `onTaskExecutionCompleted(event)` | Increments `taskExecutionTotal`, records `executionDurationMs` and `outputSizeBytes` | -| `onTaskExecutionFailure(event)` | Increments `taskExecutionErrorTotal`, records `executionDurationMs` | -| `onTaskUpdateCompleted(event)` | Records `updateDurationMs` | -| `onTaskUpdateFailure(event)` | Increments `taskUpdateFailureTotal` | +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_task_poll_total` | `task_type` | Incremented each time polling is done. | +| `conductor_worker_task_poll_error_total` | `task_type` | Incremented when a poll request fails. | +| `conductor_worker_task_execute_total` | `task_type` | Incremented when a task execution completes. | +| `conductor_worker_task_execute_error_total` | `task_type` | Task execution errors. Label format: `taskType:ExceptionName`. | +| `conductor_worker_task_update_error_total` | `task_type` | Incremented when updating the task result fails. | +| `conductor_worker_task_ack_error_total` | `task_type` | Collector surface for task ack errors. | +| `conductor_worker_task_execution_queue_full_total` | `task_type` | Incremented when the execution queue is saturated. | +| `conductor_worker_task_paused_total` | `task_type` | Incremented when a worker is paused and skips a poll. | +| `conductor_worker_external_payload_used_total` | `payload_type` | External payload storage usage. | +| `conductor_worker_thread_uncaught_exceptions_total` | none | Uncaught exceptions in the worker process. | +| `conductor_worker_worker_restart_total` | none | Worker restart events. | +| `conductor_worker_workflow_start_error_total` | none | Workflow start errors. | -## Direct Recording Methods +Legacy mode does not emit `task_execution_started_total`, +`task_ack_failed_total`, or `active_workers`. -For metrics outside the event listener system, call these methods directly: +### Legacy Time Metrics -```typescript -const collector = new MetricsCollector(); - -collector.recordTaskExecutionQueueFull("my_task"); -collector.recordUncaughtException(); -collector.recordWorkerRestart(); -collector.recordTaskPaused("my_task"); -collector.recordTaskAckError("my_task"); -collector.recordWorkflowStartError(); -collector.recordExternalPayloadUsed("task_output"); -collector.recordWorkflowInputSize("my_workflow", 2048); -collector.recordApiRequestTime("POST", "/api/tasks", 200, 35); -``` +Time values are milliseconds. Type is Summary. -## Exposition Formats +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_task_poll_time` | `task_type` | Poll round-trip duration. | +| `conductor_worker_task_execute_time` | `task_type` | Worker function execution duration. | +| `conductor_worker_task_update_time` | `task_type` | Task result update duration. | -### Built-in Prometheus Text +Each summary exposes quantile, count, and sum series: -```typescript -const text = collector.toPrometheusText(); -// Returns Prometheus text format (text/plain; version=0.0.4) +```prometheus +conductor_worker_task_execute_time{task_type="my_task",quantile="0.5"} 102 +conductor_worker_task_execute_time{task_type="my_task",quantile="0.95"} 250 +conductor_worker_task_execute_time_count{task_type="my_task"} 1000 +conductor_worker_task_execute_time_sum{task_type="my_task"} 120345 ``` -### Async (with prom-client support) - -```typescript -const text = await collector.toPrometheusTextAsync(); -// Uses prom-client registry when available, falls back to built-in -``` - -### HTTP Server (MetricsServer) - -```typescript -import { MetricsServer } from "@io-orkes/conductor-javascript"; - -const server = new MetricsServer(collector, 9090); -await server.start(); -// GET /metrics — Content-Type from collector.getContentType() -// GET /health — { "status": "UP" } -await server.stop(); +### Legacy Size Metrics + +Type is Summary. Values are bytes. + +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_task_result_size_bytes` | `task_type` | Task result output payload size. | +| `conductor_worker_workflow_input_size_bytes` | `workflow_type` | Workflow input payload size. | + +### Legacy HTTP Metrics + +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_http_api_client_request` | `endpoint` | API request duration in milliseconds. The `endpoint` label is a compound `"METHOD:/api/path:STATUS"` string. | + +## Labels + +| Label | Used by | Values | +|---|---|---| +| `task_type` | Legacy worker metrics | Task definition name. | +| `taskType` | Canonical worker metrics | Task definition name. | +| `workflowType` | Canonical workflow metrics | Workflow definition name. | +| `workflow_type` | Legacy `conductor_worker_workflow_input_size_bytes` | Workflow definition name. | +| `version` | Canonical `workflow_input_size_bytes` | Workflow version as a string. Empty string when not provided. | +| `status` | Canonical task time histograms | `SUCCESS` or `FAILURE`. For `http_api_client_request_seconds`, the HTTP status code as a string, or `"0"` on network failure. | +| `exception` | Canonical error counters | Exception type name, such as `TypeError`. Derived from `error.name` or `error.constructor.name`. | +| `entityName` | Canonical `external_payload_used_total` | Task type or workflow name associated with the external payload. | +| `operation` | Canonical `external_payload_used_total` | External payload operation, such as `READ` or `WRITE`. | +| `payload_type` | Legacy `conductor_worker_external_payload_used_total` | Payload type, such as `workflow_input` or `task_output`. | +| `payloadType` | Canonical `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. | +| `method` | Canonical HTTP metrics | HTTP verb. | +| `uri` | Canonical HTTP metrics | Request path. May contain interpolated identifiers. | +| `endpoint` | Legacy HTTP metrics | Compound `"METHOD:/api/path:STATUS"` string. | +| `quantile` | Legacy time and size metrics | `0.5`, `0.75`, `0.9`, `0.95`, or `0.99`. | + +## Migrating From Legacy to Canonical + +Canonical mode is opt-in during the deprecation period. Before switching a +production worker, update dashboards and alerts against a staging worker with +`WORKER_CANONICAL_METRICS=true`. + +Key changes: + +- The `conductor_worker_` prefix is removed. Canonical metric names are + unprefixed. +- Legacy task labels use `task_type`; canonical task labels use `taskType`. +- Legacy time metrics are millisecond summaries with quantiles. Canonical time + metrics are second-based histograms with bucket boundaries. Query `_bucket` + series with `histogram_quantile()` instead of reading `{quantile="..."}` + gauges. +- Legacy size metrics are summaries. Canonical size metrics are histograms. +- Canonical error counters add an `exception` label containing the exception + type name. +- Canonical time histograms add a `status` label (`SUCCESS` or `FAILURE`). +- Canonical mode adds metrics that legacy mode never emits: + `task_execution_started_total`, `task_ack_failed_total`, and + `active_workers`. +- Legacy `conductor_worker_worker_restart_total` is not emitted in canonical + mode (Node.js single-process model). +- Legacy uses `payload_type`; canonical uses `payloadType`. +- Legacy HTTP metrics use a compound `endpoint` label; canonical uses separate + `method`, `uri`, and `status` labels. +- Canonical and legacy collectors are mutually exclusive. During a migration, + compare scrape output by running separate worker instances or environments + with and without `WORKER_CANONICAL_METRICS=true`. + +Legacy-to-canonical replacements: + +| Legacy metric | Canonical replacement | +|---|---| +| `conductor_worker_task_poll_total{task_type}` | `task_poll_total{taskType}` | +| `conductor_worker_task_poll_error_total{task_type}` | `task_poll_error_total{taskType,exception}` | +| `conductor_worker_task_execute_total{task_type}` | `task_execute_time_seconds{taskType,status}` (count from histogram) and `task_execution_started_total{taskType}` | +| `conductor_worker_task_execute_error_total{task_type}` | `task_execute_error_total{taskType,exception}` | +| `conductor_worker_task_update_error_total{task_type}` | `task_update_error_total{taskType,exception}` | +| `conductor_worker_task_poll_time{task_type}` (summary, ms) | `task_poll_time_seconds{taskType,status}` (histogram, seconds) | +| `conductor_worker_task_execute_time{task_type}` (summary, ms) | `task_execute_time_seconds{taskType,status}` (histogram, seconds) | +| `conductor_worker_task_update_time{task_type}` (summary, ms) | `task_update_time_seconds{taskType,status}` (histogram, seconds) | +| `conductor_worker_task_result_size_bytes{task_type}` (summary) | `task_result_size_bytes{taskType}` (histogram) | +| `conductor_worker_workflow_input_size_bytes{workflow_type}` (summary) | `workflow_input_size_bytes{workflowType,version}` (histogram) | +| `conductor_worker_http_api_client_request{endpoint}` (summary, ms) | `http_api_client_request_seconds{method,uri,status}` (histogram, seconds) | +| `conductor_worker_external_payload_used_total{payload_type}` | `external_payload_used_total{entityName,operation,payloadType}` | +| `conductor_worker_workflow_start_error_total` (no labels) | `workflow_start_error_total{workflowType,exception}` | +| `conductor_worker_worker_restart_total` | — (not emitted in canonical mode) | + +Common PromQL replacements: + +| Legacy | Canonical | +|---|---| +| `conductor_worker_task_execute_time{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_execute_time_seconds_bucket[5m])))` | +| `conductor_worker_task_poll_time{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_poll_time_seconds_bucket[5m])))` | +| `conductor_worker_http_api_client_request{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, method, uri, status) (rate(http_api_client_request_seconds_bucket[5m])))` | +| `conductor_worker_task_result_size_bytes{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType) (rate(task_result_size_bytes_bucket[5m])))` | + +Average latency queries continue to use `_sum` divided by `_count`, but the +canonical series are cumulative histogram counters: + +```promql +sum(rate(task_execute_time_seconds_sum[5m])) by (taskType) +/ +sum(rate(task_execute_time_seconds_count[5m])) by (taskType) ``` -### File Output - -Configured via `filePath` in `MetricsCollectorConfig`. Writes `toPrometheusText()` output to disk. The file writer performs an immediate first write on construction, then writes periodically at the configured interval. - ---- - -## Sliding Window and Quantile Calculation - -Summary metrics use a **sliding window** (default: 1,000 observations) to calculate percentiles. This provides: +## Troubleshooting -- Accurate recent percentiles without unbounded memory growth -- No need to pre-configure histogram bucket boundaries -- Direct percentile values without interpolation artifacts +### Metrics Are Empty -Quantiles are computed on-demand using linear interpolation on sorted observations when `toPrometheusText()` is called. +- Verify that `createMetricsCollector()` or a collector constructor is called + and the collector is passed to `TaskHandler` via `eventListeners`. +- Verify workers have polled or executed tasks. Metrics are created lazily when + the relevant event occurs. +- Confirm the scrape endpoint is reachable at the expected host and port. -When using `prom-client` (`usePromClient: true`), summaries use prom-client's native implementation with `maxAgeSeconds: 600` and `ageBuckets: 5`. +### Missing HTTP or Workflow Metrics ---- +- `http_api_client_request_seconds` (canonical) or + `conductor_worker_http_api_client_request` (legacy) is recorded from the + `fetchWithRetry` HTTP layer. Verify the collector is constructed before HTTP + calls begin. +- `workflow_input_size_bytes` and `workflow_start_error_total` are recorded in + `WorkflowExecutor`. Verify the collector is active before starting workflows. -## Monitoring Best Practices +### High Cardinality -- **Use p95/p99 for SLO monitoring** rather than averages. Percentile-based thresholds better capture user-impacting performance variations. -- **Alert on `task_update_error_total`** — a rising count indicates task results are being lost and workers are failing to report back to the Conductor server. -- **Alert on `task_execution_queue_full_total`** — indicates the concurrency limit is consistently reached. Consider increasing worker `concurrency`. -- **Monitor `task_poll_time` p99** — high poll latency suggests network issues or server overload. -- **Monitor `task_execute_time` p95** — watch for execution time regression in worker functions. -- **File output interval**: 10-60 seconds recommended for production. Lower intervals increase disk I/O. -- **Clean metrics directory on startup** when using file output with multiprocess workers to avoid stale data. +- Watch the `uri` label (canonical) or `endpoint` label (legacy) on HTTP + metrics. The SDK records the interpolated request path, which may include + task type names or workflow IDs. +- Prefer canonical mode for bounded `exception` labels. Legacy error counters + encode exception names in the Map key, not as a proper Prometheus label. +- Avoid embedding user identifiers or unbounded values in task type, workflow + type, or external payload labels. ---- +### prom-client Issues -## Programmatic Access - -```typescript -const metrics = collector.getMetrics(); - -// Counter values -metrics.pollTotal.get("my_task"); // number -metrics.taskExecutionTotal.get("my_task"); // number - -// Summary observations (raw array) -metrics.pollDurationMs.get("my_task"); // number[] -metrics.executionDurationMs.get("my_task"); // number[] - -// Reset all metrics -collector.reset(); - -// Stop file writer and HTTP server -await collector.stop(); -``` +- `MetricsCollector` uses `await import("./MetricsServer.js")` internally. The + `.js` extension does not resolve under Jest's TypeScript transform. Test + `MetricsServer` by importing it directly, not via the `httpPort` config + option. +- When `usePromClient: true` is set but `prom-client` is not installed, the + collector falls back to the built-in text format silently. diff --git a/README.md b/README.md index 7880908a..f5479ac5 100644 --- a/README.md +++ b/README.md @@ -386,12 +386,12 @@ await handler.startWorkers(); ## Monitoring Workers -Enable Prometheus metrics with the built-in `MetricsCollector`: +Enable Prometheus metrics with the built-in metrics collector: ```typescript -import { MetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; +import { createMetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; -const metrics = new MetricsCollector(); +const metrics = createMetricsCollector(); const server = new MetricsServer(metrics, 9090); await server.start(); @@ -405,7 +405,7 @@ await handler.startWorkers(); // GET http://localhost:9090/health — {"status":"UP"} ``` -Collects 18 metric types: poll counts, execution durations, error rates, output sizes, and more — with p50/p75/p90/p95/p99 quantiles. See [METRICS.md](METRICS.md) for the full reference. +The SDK has two metric surfaces: **legacy** (default, prefixed `conductor_worker_` names, Summary type) and **canonical** (opt-in via `WORKER_CANONICAL_METRICS=true`, unprefixed names, Histogram type). See [METRICS.md](METRICS.md) for the full reference. ## Managing Workflow Executions @@ -542,7 +542,7 @@ See [examples/agentic-workflows/](examples/agentic-workflows/) for all examples. | Document | Description | |----------|-------------| | [SDK Development Guide](SDK_DEVELOPMENT.md) | Architecture, patterns, pitfalls, testing | -| [Metrics Reference](METRICS.md) | All 18 Prometheus metrics with descriptions | +| [Metrics Reference](METRICS.md) | Legacy and canonical Prometheus metrics with migration guide | | [Breaking Changes](BREAKING_CHANGES.md) | v3.x migration guide | | [Workflow Management](docs/api-reference/workflow-executor.md) | Start, pause, resume, terminate, retry, search, signal | | [Task Management](docs/api-reference/task-client.md) | Task operations, logs, queue management | diff --git a/SDK_COMPARISON.md b/SDK_COMPARISON.md index 881800bc..a2fdbfd6 100644 --- a/SDK_COMPARISON.md +++ b/SDK_COMPARISON.md @@ -4,7 +4,7 @@ This document provides a detailed comparison between the **Python SDK** (golden reference, `conductor-python`) and the **JavaScript SDK** (`@io-orkes/conductor-javascript` v3.0.0), covering API surface, architecture, worker system, and feature parity. -**Verdict Summary**: The JavaScript SDK has **near-complete parity** with the Python SDK (~98%). All client classes (including rate limit CRUD), worker features (TaskContext, adaptive backoff, 19 Prometheus metrics with quantiles and optional prom-client, decorator-based JSON schema generation), workflow DSL (`ConductorWorkflow` with `toSubWorkflowTask()`), and all 34 task type builders (including 13 LLM/AI + 2 MCP) are implemented. +**Verdict Summary**: The JavaScript SDK has **near-complete parity** with the Python SDK (~98%). All client classes (including rate limit CRUD), worker features (TaskContext, adaptive backoff, Prometheus metrics with legacy and canonical modes, decorator-based JSON schema generation), workflow DSL (`ConductorWorkflow` with `toSubWorkflowTask()`), and all 34 task type builders (including 13 LLM/AI + 2 MCP) are implemented. --- @@ -580,22 +580,23 @@ Features: #### Metrics / Observability +Both SDKs support legacy and canonical metric surfaces via `WORKER_CANONICAL_METRICS`. See [METRICS.md](METRICS.md) for the JavaScript SDK catalog. + | # | Feature | Python | JavaScript | Status | |---|---------|--------|-----------|--------| -| 1 | MetricsCollector | Yes (19 metrics) | Yes (19 metrics) | Full | +| 1 | Legacy / canonical collectors | `create_metrics_collector()` | `createMetricsCollector()` | Full | | 2 | HTTP `/metrics` endpoint | Yes | `MetricsServer` | Full | | 3 | HTTP `/health` endpoint | Yes | `MetricsServer` | Full | | 4 | Prometheus text format | Yes | `toPrometheusText()` | Full | | 5 | File-based metrics | Yes (`.prom` files) | Yes (`filePath` config option) | Full | -| 6 | API request metrics | `http_api_client_request` | `recordApiRequestTime()` | Full | -| 7 | Queue full metric | `task_execution_queue_full` | `recordTaskExecutionQueueFull()` | Full | -| 8 | Uncaught exception metric | `thread_uncaught_exceptions` | `recordUncaughtException()` | Full | -| 9 | Workflow start error | `workflow_start_error` | `recordWorkflowStartError()` | Full | -| 10 | External payload used | `external_payload_used` | `recordExternalPayloadUsed()` | Full | -| 11 | Worker restart metric | `worker_restart` | `recordWorkerRestart()` | Full | -| 12 | Quantile calculation (p50-p99) | Yes (sliding window 1000) | Yes (sliding window, configurable) | Full | -| 13 | `prometheus_client` integration | Yes (native prom-client) | Optional `prom-client` integration via `usePromClient: true` + `PrometheusRegistry` | Full (optional peer dep) | -| 14 | Auto-start via `httpPort` config | Yes | Yes | Full | +| 6 | API request metrics | Yes | `recordApiRequestTime()` | Full | +| 7 | Queue full metric | Yes | `recordTaskExecutionQueueFull()` | Full | +| 8 | Uncaught exception metric | Yes | `recordUncaughtException()` | Full | +| 9 | Workflow start error | Yes | `recordWorkflowStartError()` | Full | +| 10 | External payload used | Yes | `recordExternalPayloadUsed()` | Full | +| 11 | Worker restart metric | Yes (Python-only, multi-process) | Legacy only (single-process model) | Full | +| 12 | `prometheus_client` / `prom-client` | Yes (native) | Optional via `usePromClient: true` | Full | +| 13 | Auto-start via `httpPort` config | Yes | Yes | Full | #### JSON Schema Generation @@ -692,7 +693,7 @@ Both SDKs support identical environment variable formats for worker configuratio | **Worker system** | **98%** | TaskContext, adaptive backoff, auth backoff, events, schema | | **Task type builders** | **100%** | 34/34 builders including LLM, MCP, HTTP Poll, Webhook, GetDocument | | **Workflow DSL** | **95%** | Full `ConductorWorkflow` with `toSubWorkflowTask()`. Missing only `__call__` (language limitation) | -| **Metrics/Observability** | **98%** | 19 metrics, quantiles (p50-p99), HTTP + file export, optional prom-client integration | +| **Metrics/Observability** | **98%** | Legacy + canonical modes, HTTP + file export, optional prom-client integration | | **AI module** | **95%** | All LLM task builders + `LLMProvider`/`VectorDB` enums + `IntegrationConfig` types. Missing only `AIOrchestrator` | | **JSON schema** | **95%** | `jsonSchema()` declarative helper + `@schemaField()` decorator with `reflect-metadata` type inference | | **Scheduling** | **100%** | Full parity including tags | @@ -712,7 +713,7 @@ Both SDKs support identical environment variable formats for worker configuratio 7. **Worker configuration hierarchy**: Identical env var format and precedence 8. **`TaskHandler` orchestrator**: Same architecture as Python 9. **TaskContext**: Full async-local context with `addLog()`, `setCallbackAfter()` -10. **Prometheus metrics**: 19 metrics with quantiles (p50-p99), `MetricsServer` (HTTP + file export), optional `prom-client` integration +10. **Prometheus metrics**: Legacy and canonical modes with `createMetricsCollector()`, `MetricsServer` (HTTP + file export), optional `prom-client` integration 11. **JSON schema generation**: `jsonSchema()` declarative helper + `@schemaField()` decorator with `reflect-metadata` runtime type inference + `inputType`/`outputType` on `@worker` 12. **AI types**: `LLMProvider`, `VectorDB` enums + typed `IntegrationConfig` + `withPromptVariable()` helpers 13. **OpenAPI-generated types**: Strong TypeScript types from spec diff --git a/SDK_DEVELOPMENT.md b/SDK_DEVELOPMENT.md index b40fd2a6..05bfd277 100644 --- a/SDK_DEVELOPMENT.md +++ b/SDK_DEVELOPMENT.md @@ -119,9 +119,15 @@ src/ context/ TaskContext.ts # AsyncLocalStorage-based per-task context + getTaskContext() metrics/ - MetricsCollector.ts # TaskRunnerEventsListener impl, 19 metric types, quantiles - MetricsServer.ts # HTTP server: /metrics (Prometheus) + /health (JSON) - PrometheusRegistry.ts # Optional prom-client bridge (lazy loaded) + LegacyMetricsCollector.ts # Default: prefixed names, Summary type, ms units + CanonicalMetricsCollector.ts # Opt-in: unprefixed canonical names, Histogram type, seconds + metricsFactory.ts # createMetricsCollector() reads WORKER_CANONICAL_METRICS + MetricsServer.ts # HTTP server: /metrics (Prometheus) + /health (JSON) + MetricsCollectorInterface.ts # Shared interface for both collectors + PrometheusRegistry.ts # Optional prom-client bridge for legacy collector + CanonicalPrometheusRegistry.ts # Optional prom-client bridge for canonical collector + accumulators.ts # HistogramAccumulator, MultiLabelCounter, GaugeMetric + httpObserver.ts # Global singleton for API request metrics schema/ # jsonSchema(), schemaField() decorator, generateSchemaFromClass() config/ # Worker configuration resolution helpers generators/ # Legacy task generators (pre-v3, still exported for backward compat) @@ -268,15 +274,13 @@ Returns `undefined` outside task execution. All 16 methods: `getTaskId()`, `getW ### 7. Metrics +`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` and returns a legacy or canonical collector. See [METRICS.md](METRICS.md) for the full catalog and migration guide. + ```typescript -const metrics = new MetricsCollector({ prefix: "my_app", slidingWindowSize: 1000 }); +const metrics = createMetricsCollector(); const handler = new TaskHandler({ client, eventListeners: [metrics], scanForDecorated: true }); await handler.startWorkers(); -// Programmatic access -const m = metrics.getMetrics(); -console.log(m.pollTotal.get("my_task")); - // Prometheus text const text = metrics.toPrometheusText(); ``` @@ -291,7 +295,7 @@ await server.start(); await server.stop(); ``` -**Do not use `MetricsCollector({ httpPort })` in Jest** - it uses a dynamic import with `.js` extension that doesn't resolve under Jest's TS transform. +**Do not use `MetricsCollector({ httpPort })` in Jest** -- it uses a dynamic import with `.js` extension that doesn't resolve under Jest's TS transform. ## Known Pitfalls @@ -616,7 +620,7 @@ Node 18+ required. Dual ESM/CJS via `tsup` with `exports` field in `package.json | Workflow DSL | `ConductorWorkflow` | `ConductorWorkflow` | Fluent builder | | Worker decorator | `@worker_task` | `@worker` | TS decorator syntax | | Task context | `get_task_context()` | `getTaskContext()` | AsyncLocalStorage vs contextvars | -| Metrics | `MetricsCollector` | `MetricsCollector` | 19 metric types, quantiles | +| Metrics | `MetricsCollector` | `createMetricsCollector()` | Legacy + canonical modes, see [METRICS.md](METRICS.md) | | Metrics server | HTTP endpoint | `MetricsServer` | `/metrics` + `/health` | | Non-retryable | `NonRetryableError` | `NonRetryableException` | `FAILED_WITH_TERMINAL_ERROR` | | LLM builders | 13 builders | 13 builders | Full parity | diff --git a/WORKER_ARCHITECTURE_COMPARISON.md b/WORKER_ARCHITECTURE_COMPARISON.md index dd0701b5..528921ee 100644 --- a/WORKER_ARCHITECTURE_COMPARISON.md +++ b/WORKER_ARCHITECTURE_COMPARISON.md @@ -545,37 +545,38 @@ handler = TaskHandler( ### JavaScript: Built-in MetricsCollector -The JavaScript SDK provides a built-in `MetricsCollector` that implements `TaskRunnerEventsListener`: +The JavaScript SDK provides `createMetricsCollector()` which returns a legacy or canonical collector based on `WORKER_CANONICAL_METRICS`. See [METRICS.md](METRICS.md) for the full catalog. ```typescript -import { MetricsCollector, TaskHandler } from "@io-orkes/conductor-javascript/worker"; +import { createMetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; -const metrics = new MetricsCollector(); +const metrics = createMetricsCollector(); +const server = new MetricsServer(metrics, 9090); +await server.start(); const handler = new TaskHandler({ client, eventListeners: [metrics], }); -handler.startWorkers(); - -// Read metrics -const snapshot = metrics.getMetrics(); -console.log("Poll total:", snapshot.pollTotal); -console.log("Execution durations:", snapshot.executionDurationMs); +await handler.startWorkers(); +// GET http://localhost:9090/metrics -> Prometheus text +// GET http://localhost:9090/health -> {"status":"UP"} ``` ### Comparison | Feature | Python | JavaScript | |---------|--------|-----------| -| Built-in metrics collector | Yes (`MetricsCollector`) | Yes (`MetricsCollector`) | Parity | -| HTTP metrics endpoint | Yes (`/metrics`, `/health`) | No (use prom-client separately) | Python richer | -| File-based metrics | Yes (`.prom` files) | No | Python richer | +| Built-in metrics collector | Yes (`create_metrics_collector()`) | Yes (`createMetricsCollector()`) | Parity | +| Legacy / canonical modes | Yes (`WORKER_CANONICAL_METRICS`) | Yes (`WORKER_CANONICAL_METRICS`) | Parity | +| HTTP metrics endpoint | Yes (`/metrics`, `/health`) | Yes (`MetricsServer`) | Parity | +| File-based metrics | Yes (`.prom` files) | Yes (`filePath` config) | Parity | | Multiprocess aggregation | Yes (SQLite) | N/A (single process) | N/A | -| API request metrics | Yes (`http_api_client_request`) | No | Python richer | +| API request metrics | Yes | Yes (`recordApiRequestTime()`) | Parity | | Event-based architecture | Yes (MetricsCollector is listener) | Yes (MetricsCollector is listener) | Parity | | Custom metrics via events | Yes | Yes | Parity | +| Optional `prom-client` | Yes (native) | Yes (`usePromClient: true`) | Parity | --- From 19773f8c3ecbb69ae6b862d6717934e3233570b4 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Thu, 7 May 2026 11:15:05 -0600 Subject: [PATCH 10/21] add or update a changelog --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..4ecd25b2 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,26 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- **Metrics harmonization** - canonical metric surface aligned with the cross-SDK catalog, opt-in via `WORKER_CANONICAL_METRICS=true` + - New `CanonicalMetricsCollector` and optional `CanonicalPrometheusRegistry` (prom-client adapter) emit the harmonized cross-SDK catalog: 12 counters (e.g. `task_poll_total`, `task_execution_started_total`, `task_paused_total`, `external_payload_used_total{entityName,operation,payloadType}`, `workflow_start_error_total{workflowType,exception}`), 4 time histograms (`task_poll_time_seconds`, `task_execute_time_seconds`, `task_update_time_seconds`, `http_api_client_request_seconds{method,uri,status}`) with buckets `0.001…10s`, 2 size histograms (`task_result_size_bytes`, `workflow_input_size_bytes{workflowType,version}`) with buckets `100…10_000_000` bytes, and `active_workers` gauge. Labels are camelCase; names are unprefixed. + - `createMetricsCollector()` factory selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive). `WORKER_LEGACY_METRICS` is also recognized; canonical wins when both are set. + - `HttpMetricsObserver` plus `fetchWithRetry` instrumentation records `http_api_client_request_seconds`; `WorkflowExecutor` records `workflow_input_size_bytes` and `workflow_start_error_total`. + - `Poller`, `TaskRunner`, and `EventDispatcher` emit a new `taskAckFailed` event and propagate output size, exception cause, and status. + - `fetchWithRetry` now retries on HTTP 502/503/504. + - Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.ts` logs which collector is active. + +### Changed + +- **Metrics harmonization** - defaults preserved; legacy metrics emit unchanged when `WORKER_CANONICAL_METRICS` is unset + - `src/sdk/worker/metrics/MetricsCollector.ts` was renamed to `LegacyMetricsCollector.ts`. The public symbol is preserved via `export { LegacyMetricsCollector as MetricsCollector }` in `src/sdk/worker/metrics/index.ts`, so existing imports keep working. + - Default behavior is unchanged: with no env var set, the metric names, labels, and `conductor_worker_*` prefix from `v3.0.3` are preserved byte-for-byte. + - Rewrote `METRICS.md` with both surfaces, the env-var gate, side-by-side migration table with PromQL replacements, and troubleshooting. + - Updated `README.md`, `AGENTS.md`, `SDK_DEVELOPMENT.md`, `SDK_COMPARISON.md`, and `WORKER_ARCHITECTURE_COMPARISON.md` to reference `createMetricsCollector()` and the env var. From 6cd069f3a0fef6354eed66efc135e100c6a658bc Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Fri, 15 May 2026 11:18:28 -0600 Subject: [PATCH 11/21] adjustments to internal mechanics --- CHANGELOG.md | 4 +- METRICS.md | 24 +++++++++ jest.config.mjs | 1 - src/sdk/clients/worker/TaskRunner.ts | 5 +- .../events/__tests__/EventDispatcher.test.ts | 1 + src/sdk/clients/worker/events/types.ts | 2 + .../helpers/__tests__/fetchWithRetry.test.ts | 38 +++++++++++++ .../helpers/fetchWithRetry.ts | 53 +++++++++---------- src/sdk/worker/core/TaskHandler.ts | 48 ----------------- .../metrics/CanonicalMetricsCollector.ts | 11 +++- .../worker/metrics/LegacyMetricsCollector.ts | 2 +- .../CanonicalMetricsCollector.test.ts | 4 +- .../__tests__/MetricsCollector.test.ts | 4 +- src/sdk/worker/metrics/metricsFactory.ts | 10 ++-- 14 files changed, 118 insertions(+), 89 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ecd25b2..1dacb8b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,8 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - New `CanonicalMetricsCollector` and optional `CanonicalPrometheusRegistry` (prom-client adapter) emit the harmonized cross-SDK catalog: 12 counters (e.g. `task_poll_total`, `task_execution_started_total`, `task_paused_total`, `external_payload_used_total{entityName,operation,payloadType}`, `workflow_start_error_total{workflowType,exception}`), 4 time histograms (`task_poll_time_seconds`, `task_execute_time_seconds`, `task_update_time_seconds`, `http_api_client_request_seconds{method,uri,status}`) with buckets `0.001…10s`, 2 size histograms (`task_result_size_bytes`, `workflow_input_size_bytes{workflowType,version}`) with buckets `100…10_000_000` bytes, and `active_workers` gauge. Labels are camelCase; names are unprefixed. - `createMetricsCollector()` factory selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive). `WORKER_LEGACY_METRICS` is also recognized; canonical wins when both are set. - `HttpMetricsObserver` plus `fetchWithRetry` instrumentation records `http_api_client_request_seconds`; `WorkflowExecutor` records `workflow_input_size_bytes` and `workflow_start_error_total`. - - `Poller`, `TaskRunner`, and `EventDispatcher` emit a new `taskAckFailed` event and propagate output size, exception cause, and status. - - `fetchWithRetry` now retries on HTTP 502/503/504. + - `Poller`, `TaskRunner`, and `EventDispatcher` emit a new `taskPaused` event when a poll cycle is skipped because the worker is paused. + - `fetchWithRetry` now retries HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE). - Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.ts` logs which collector is active. ### Changed diff --git a/METRICS.md b/METRICS.md index 43c22252..7d43e0c2 100644 --- a/METRICS.md +++ b/METRICS.md @@ -354,6 +354,30 @@ sum(rate(task_execute_time_seconds_count[5m])) by (taskType) - Avoid embedding user identifiers or unbounded values in task type, workflow type, or external payload labels. +### Recording Uncaught Exceptions + +The `thread_uncaught_exceptions_total` metric is not wired automatically. In +Node.js, registering a `process.on("uncaughtException")` handler overrides the +default crash behavior, which can leave the process running in a corrupted +state. Instead, wire it yourself so you control the exit policy: + +```typescript +const metrics = createMetricsCollector(); + +process.on("uncaughtException", (err) => { + metrics.recordUncaughtException(err.name || "Error"); + console.error(err); + process.exit(1); +}); + +process.on("unhandledRejection", (reason) => { + const name = reason instanceof Error ? reason.name || "Error" : "Error"; + metrics.recordUncaughtException(name); + console.error(reason); + process.exit(1); +}); +``` + ### prom-client Issues - `MetricsCollector` uses `await import("./MetricsServer.js")` internally. The diff --git a/jest.config.mjs b/jest.config.mjs index f8407f50..d8703a14 100644 --- a/jest.config.mjs +++ b/jest.config.mjs @@ -21,7 +21,6 @@ export default { "^@/(.*)$": "/src/$1", "^@open-api/(.*)$": "/src/open-api/$1", "^@test-utils/(.*)$": "/src/integration-tests/utils/$1", - "^(.*)\\.js$": "$1", }, transform: { "^.+\\.tsx?$": [ diff --git a/src/sdk/clients/worker/TaskRunner.ts b/src/sdk/clients/worker/TaskRunner.ts index cb637e48..5be29c83 100644 --- a/src/sdk/clients/worker/TaskRunner.ts +++ b/src/sdk/clients/worker/TaskRunner.ts @@ -227,15 +227,16 @@ export class TaskRunner { const { workerID } = this.options; let retryCount = 0; let lastError: Error | null = null; + let lastAttemptDurationMs = 0; while (retryCount < this.maxRetries) { + const updateStart = Date.now(); try { if (process.env.CI) { console.log( `[TaskRunner] Submitting task result taskId=${taskResult.taskId} workflowId=${taskResult.workflowInstanceId} taskType=${this.worker.taskDefName} attempt=${retryCount + 1}/${this.maxRetries}` ); } - const updateStart = Date.now(); if (TaskRunner.updateV2Available === false) { // Already detected a legacy server — skip the probe, call legacy directly. @@ -334,6 +335,7 @@ export class TaskRunner { }); return nextTask ?? undefined; } catch (error: unknown) { + lastAttemptDurationMs = Date.now() - updateStart; lastError = error as Error; this.errorHandler(lastError, task); this.logger.error( @@ -364,6 +366,7 @@ export class TaskRunner { workerId: workerID, workflowInstanceId: taskResult.workflowInstanceId, cause: lastError ?? new Error("Task update failed after all retries"), + durationMs: lastAttemptDurationMs, retryCount, taskResult, timestamp: new Date(), diff --git a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts index b99310bf..c8b31634 100644 --- a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts +++ b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts @@ -227,6 +227,7 @@ describe("EventDispatcher", () => { workerId: "worker-1", workflowInstanceId: "workflow-1", cause: new Error("Update failed"), + durationMs: 500, retryCount: 4, taskResult: { status: "COMPLETED" }, timestamp: new Date(), diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts index 9c94af69..4bd5306c 100644 --- a/src/sdk/clients/worker/events/types.ts +++ b/src/sdk/clients/worker/events/types.ts @@ -114,6 +114,8 @@ export interface TaskUpdateFailure extends TaskRunnerEvent { workflowInstanceId?: string; /** The error that caused the final update failure */ cause: Error; + /** Time taken for the last update attempt in milliseconds */ + durationMs: number; /** Number of retry attempts made */ retryCount: number; /** The TaskResult object that failed to update (for recovery/logging) */ diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts index 883c65b5..84277b67 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts @@ -239,6 +239,44 @@ describe("fetchWithRetry", () => { expect(mockFetch).toHaveBeenCalledTimes(3); }); + it("should NOT retry 502 for POST requests (non-idempotent)", async () => { + mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway")); + + const result = await retryFetch("http://test.com", { method: "POST" }, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(502); + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("should NOT retry 503 for PATCH requests (non-idempotent)", async () => { + mockFetch.mockResolvedValue(createMockResponse(503, "Service Unavailable")); + + const result = await retryFetch("http://test.com", { method: "PATCH" }, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(503); + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("should retry 502 for PUT requests (idempotent)", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", { method: "PUT" }, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + it("should handle transport error then 502 then success", async () => { mockFetch .mockRejectedValueOnce(new Error("ECONNRESET")) diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts index 679a7439..693289df 100644 --- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts +++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts @@ -1,3 +1,5 @@ +import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; + type Input = Parameters[0]; type Init = Parameters[1]; @@ -95,6 +97,8 @@ const withJitter = (delayMs: number): number => { return Math.max(0, Math.round(delayMs + jitter)); }; +const IDEMPOTENT_METHODS = new Set(["GET", "HEAD", "OPTIONS", "PUT", "DELETE"]); + export const retryFetch = async ( input: Input, init: Init, @@ -157,10 +161,15 @@ export const retryFetch = async ( return rateLimitResponse; } - // Gateway error retry (502, 503, 504) -- transient proxy/server errors + // Gateway error retry (502, 503, 504) -- only for idempotent methods to + // avoid duplicate side effects when the upstream may have processed the request. if (response.status >= 502 && response.status <= 504) { - lastError = new Error(`Server error: HTTP ${response.status}`); - if (transportAttempt < maxTransportRetries) { + const reqMethod = (input instanceof Request + ? input.method + : init?.method ?? "GET" + ).toUpperCase(); + if (IDEMPOTENT_METHODS.has(reqMethod) && transportAttempt < maxTransportRetries) { + lastError = new Error(`Server error: HTTP ${response.status}`); await new Promise((resolve) => setTimeout(resolve, withJitter(initialRetryDelay * (transportAttempt + 1))) ); @@ -223,35 +232,21 @@ export const wrapFetchWithRetry = ( try { const response = await retryFetch(input, init, fetchFn, options); const durationMs = performance.now() - start; - try { - const { getHttpMetricsObserver } = await import( - "../../worker/metrics/httpObserver.js" - ); - getHttpMetricsObserver()?.recordApiRequestTime( - method, - uri, - String(response.status), - durationMs, - ); - } catch { - // Metrics recording is best-effort - } + getHttpMetricsObserver()?.recordApiRequestTime( + method, + uri, + String(response.status), + durationMs, + ); return response; } catch (error) { const durationMs = performance.now() - start; - try { - const { getHttpMetricsObserver } = await import( - "../../worker/metrics/httpObserver.js" - ); - getHttpMetricsObserver()?.recordApiRequestTime( - method, - uri, - "0", - durationMs, - ); - } catch { - // Metrics recording is best-effort - } + getHttpMetricsObserver()?.recordApiRequestTime( + method, + uri, + "0", + durationMs, + ); throw error; } }; diff --git a/src/sdk/worker/core/TaskHandler.ts b/src/sdk/worker/core/TaskHandler.ts index b39a9b94..5bfcda5f 100644 --- a/src/sdk/worker/core/TaskHandler.ts +++ b/src/sdk/worker/core/TaskHandler.ts @@ -7,7 +7,6 @@ import { RESTART_BACKOFF_MAX_MS, } from "../../clients/worker/constants"; import type { TaskRunnerEventsListener } from "../../clients/worker/events"; -import type { MetricsCollectorInterface } from "../metrics/MetricsCollectorInterface"; import { TaskRunner } from "../../clients/worker/TaskRunner"; import type { ConductorWorker, HealthMonitorConfig } from "../../clients/worker/types"; import type { ConductorLogger } from "../../helpers/logger"; @@ -141,9 +140,6 @@ export class TaskHandler { private restartAttempts = new Map(); // runner index → attempt count private healthMonitorConfig: HealthMonitorConfig; - // Uncaught exception handlers (bound so we can remove them) - private _uncaughtHandler?: (err: Error) => void; - private _unhandledHandler?: (reason: unknown) => void; /** * Create a TaskHandler instance with async module imports. @@ -328,7 +324,6 @@ export class TaskHandler { this.isRunning = true; this.startHealthMonitor(); - this.wireUncaughtExceptionHandlers(); this.logger.info("All workers started successfully"); } @@ -347,7 +342,6 @@ export class TaskHandler { } this.stopHealthMonitor(); - this.unwireUncaughtExceptionHandlers(); this.logger.info(`Stopping ${this.taskRunners.length} worker(s)...`); const stopPromises = this.taskRunners.map(async (runner, index) => { @@ -535,48 +529,6 @@ export class TaskHandler { } } - // ── Uncaught Exception Handling ───────────────────────────────── - - private findMetricsCollector(): MetricsCollectorInterface | undefined { - const listeners = this.config.eventListeners ?? []; - return listeners.find( - (l): l is MetricsCollectorInterface => - typeof (l as MetricsCollectorInterface).recordUncaughtException === - "function", - ); - } - - private wireUncaughtExceptionHandlers(): void { - const collector = this.findMetricsCollector(); - if (!collector) return; - - this._uncaughtHandler = (err: Error) => { - const excName = err.name || err.constructor?.name || "Error"; - collector.recordUncaughtException(excName); - }; - this._unhandledHandler = (reason: unknown) => { - const excName = - reason instanceof Error - ? reason.name || reason.constructor?.name || "Error" - : "Error"; - collector.recordUncaughtException(excName); - }; - - process.on("uncaughtException", this._uncaughtHandler); - process.on("unhandledRejection", this._unhandledHandler); - } - - private unwireUncaughtExceptionHandlers(): void { - if (this._uncaughtHandler) { - process.removeListener("uncaughtException", this._uncaughtHandler); - this._uncaughtHandler = undefined; - } - if (this._unhandledHandler) { - process.removeListener("unhandledRejection", this._unhandledHandler); - this._unhandledHandler = undefined; - } - } - // ── Discovery Summary ─────────────────────────────────────────── /** diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts index f0fa5314..6757bf08 100644 --- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts +++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts @@ -81,7 +81,6 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface { config.fileWriteIntervalMs ?? 5000, ); } - setHttpMetricsObserver(this); } private async initPromClient(): Promise { @@ -279,6 +278,16 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface { taskType: event.taskType, exception: excName, }); + + const seconds = event.durationMs / 1000; + this.state.taskUpdateTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_update_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); } onTaskPaused(event: TaskPaused): void { diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts index 381c14b5..8932a91b 100644 --- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts +++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts @@ -120,7 +120,6 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface { config.fileWriteIntervalMs ?? 5000, ); } - setHttpMetricsObserver(this); } private async initPromClient(): Promise { @@ -254,6 +253,7 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface { onTaskUpdateFailure(event: TaskUpdateFailure): void { this.incrementCounter(this.metrics.taskUpdateFailureTotal, event.taskType, "update_error_total", "task_type"); + this.observeSummary(this.metrics.updateDurationMs, event.taskType, event.durationMs, "update_time", "task_type"); } // Canonical-only event — noop in legacy diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts index 369eeee3..37c9300f 100644 --- a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts +++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts @@ -160,12 +160,13 @@ describe("CanonicalMetricsCollector", () => { expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.025'); }); - it("should emit task_update_error_total with exception label on failure", () => { + it("should emit task_update_error_total and task_update_time_seconds on failure", () => { collector.onTaskUpdateFailure({ taskType: "task_a", taskId: "t1", workerId: "w1", cause: new Error("server error"), + durationMs: 150, retryCount: 4, taskResult: {}, timestamp: new Date(), @@ -173,6 +174,7 @@ describe("CanonicalMetricsCollector", () => { const text = collector.toPrometheusText(); expect(text).toContain('task_update_error_total{taskType="task_a",exception="Error"} 1'); + expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="FAILURE"} 0.15'); }); }); diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts index 3e8d8ebb..33787620 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts @@ -133,13 +133,14 @@ describe("LegacyMetricsCollector", () => { expect(metrics.updateDurationMs.get("task_a")).toEqual([25, 30]); }); - it("should count update failures via onTaskUpdateFailure", () => { + it("should count update failures and record duration via onTaskUpdateFailure", () => { collector.onTaskUpdateFailure({ taskType: "task_a", taskId: "t1", workerId: "w1", workflowInstanceId: "wf1", cause: new Error("server error"), + durationMs: 200, retryCount: 4, taskResult: {}, timestamp: new Date(), @@ -147,6 +148,7 @@ describe("LegacyMetricsCollector", () => { const metrics = collector.getMetrics(); expect(metrics.taskUpdateFailureTotal.get("task_a")).toBe(1); + expect(metrics.updateDurationMs.get("task_a")).toEqual([200]); }); }); diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts index bcb45588..dabe4810 100644 --- a/src/sdk/worker/metrics/metricsFactory.ts +++ b/src/sdk/worker/metrics/metricsFactory.ts @@ -2,6 +2,7 @@ import type { MetricsCollectorConfig } from "./LegacyMetricsCollector"; import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; import { LegacyMetricsCollector } from "./LegacyMetricsCollector"; import { CanonicalMetricsCollector } from "./CanonicalMetricsCollector"; +import { setHttpMetricsObserver } from "./httpObserver"; /** * Create the appropriate MetricsCollector based on environment variables. @@ -19,9 +20,10 @@ export function createMetricsCollector( (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(), ); - if (useCanonical) { - return new CanonicalMetricsCollector(config); - } + const collector = useCanonical + ? new CanonicalMetricsCollector(config) + : new LegacyMetricsCollector(config); - return new LegacyMetricsCollector(config); + setHttpMetricsObserver(collector); + return collector; } From db1fd4c3cf35e6f773a223c42a3725041a4ba0b3 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Fri, 15 May 2026 12:05:24 -0600 Subject: [PATCH 12/21] harness improvements for generating traffic with uris in metrics --- harness/main.ts | 13 +++++++++++++ harness/workflowGovernor.ts | 10 +++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/harness/main.ts b/harness/main.ts index 081188d7..182b8d50 100644 --- a/harness/main.ts +++ b/harness/main.ts @@ -9,6 +9,7 @@ import { MetadataResource } from "../src/open-api/generated"; import type { ConductorWorker } from "../src/sdk/clients/worker/types"; import { SimulatedTaskWorker } from "./simulatedTaskWorker"; import { WorkflowGovernor } from "./workflowGovernor"; +import { WorkflowStatusProbe } from "./workflowStatusProbe"; const WORKFLOW_NAME = "js_simulated_tasks_workflow"; @@ -104,15 +105,27 @@ async function main(): Promise { }); await handler.startWorkers(); + const probeRate = envIntOrDefault("HARNESS_PROBE_RATE_PER_SEC", 0); + const probe = + probeRate > 0 ? new WorkflowStatusProbe(client, probeRate) : undefined; + const governor = new WorkflowGovernor( workflowClient, WORKFLOW_NAME, workflowsPerSec, + probe ? probe.offer.bind(probe) : undefined, ); governor.start(); + if (probe) { + probe.start(); + } + const shutdown = async () => { console.log("Shutting down..."); + if (probe) { + probe.stop(); + } governor.stop(); await handler.stopWorkers(); process.exit(0); diff --git a/harness/workflowGovernor.ts b/harness/workflowGovernor.ts index 9865f126..4ccd0d33 100644 --- a/harness/workflowGovernor.ts +++ b/harness/workflowGovernor.ts @@ -4,16 +4,19 @@ export class WorkflowGovernor { private readonly workflowExecutor: WorkflowExecutor; private readonly workflowName: string; private readonly workflowsPerSecond: number; + private readonly idSink?: (id: string) => void; private timer: ReturnType | undefined; constructor( workflowExecutor: WorkflowExecutor, workflowName: string, workflowsPerSecond: number, + idSink?: (id: string) => void, ) { this.workflowExecutor = workflowExecutor; this.workflowName = workflowName; this.workflowsPerSecond = workflowsPerSecond; + this.idSink = idSink; } start(): void { @@ -51,10 +54,15 @@ export class WorkflowGovernor { } Promise.all(promises) - .then(() => { + .then((ids) => { console.log( `Governor: started ${this.workflowsPerSecond} workflow(s)`, ); + if (this.idSink) { + for (const id of ids) { + this.idSink(id); + } + } }) .catch((err: unknown) => { console.error( From cb9717193ac90003f6f7e8089b411030fe398e52 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Fri, 15 May 2026 12:05:55 -0600 Subject: [PATCH 13/21] harness improvements for generating traffic with uris in metrics .. cont'd (missing file oops) --- harness/workflowStatusProbe.ts | 97 ++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 harness/workflowStatusProbe.ts diff --git a/harness/workflowStatusProbe.ts b/harness/workflowStatusProbe.ts new file mode 100644 index 00000000..741f4438 --- /dev/null +++ b/harness/workflowStatusProbe.ts @@ -0,0 +1,97 @@ +import type { Client } from "../src/open-api/generated/client"; +import { WorkflowResource } from "../src/open-api/generated"; + +const MAX_TRACKED_IDS = 256; + +/** + * Exercises UUID-bearing workflow lookup endpoints so + * http_api_client_request_seconds picks up entries with + * uri=/workflow/{workflowId} and uri=/workflow/{workflowId}/tasks. + * + * Default harness traffic only hits bounded, no-path-param URLs (poll/update), + * making the high-cardinality concern on the uri label invisible without this + * probe. + * + * Default off. Runs only when HARNESS_PROBE_RATE_PER_SEC > 0. + * Side-effect-free: only issues read calls (getExecutionStatus, + * getExecutionStatusTaskList). + * Self-bounded: fixed-size FIFO of workflow IDs. + */ +export class WorkflowStatusProbe { + private readonly client: Client; + private readonly callsPerSecond: number; + private readonly recentIDs: string[] = []; + private timer: ReturnType | undefined; + + constructor(client: Client, callsPerSecond: number) { + this.client = client; + this.callsPerSecond = callsPerSecond; + } + + offer(workflowId: string): void { + if (!workflowId) return; + this.recentIDs.push(workflowId); + if (this.recentIDs.length > MAX_TRACKED_IDS) { + this.recentIDs.splice(0, this.recentIDs.length - MAX_TRACKED_IDS); + } + } + + start(): void { + if (this.callsPerSecond <= 0) { + console.log( + "WorkflowStatusProbe disabled (HARNESS_PROBE_RATE_PER_SEC<=0)", + ); + return; + } + console.log( + `WorkflowStatusProbe started: rate=${this.callsPerSecond}/sec, retainedIds<=${MAX_TRACKED_IDS}`, + ); + + this.timer = setInterval(() => { + this.tick(); + }, 1000); + + if (this.timer && typeof this.timer === "object" && "unref" in this.timer) { + this.timer.unref(); + } + } + + stop(): void { + if (this.timer) { + clearInterval(this.timer); + this.timer = undefined; + } + console.log("WorkflowStatusProbe stopped"); + } + + private tick(): void { + const budget = Math.min(this.callsPerSecond, this.recentIDs.length); + if (budget === 0) return; + + const ids: string[] = []; + for (let i = 0; i < budget; i++) { + ids.push( + this.recentIDs[Math.floor(Math.random() * this.recentIDs.length)], + ); + } + + for (const id of ids) { + const call = + Math.random() < 0.5 + ? WorkflowResource.getExecutionStatus({ + client: this.client, + path: { workflowId: id }, + }) + : WorkflowResource.getExecutionStatusTaskList({ + client: this.client, + path: { workflowId: id }, + }); + + call.catch((err: unknown) => { + console.error( + `probe: ${id}: ${err instanceof Error ? err.message : String(err)}`, + ); + }); + } + } +} From 24b162c3a87997f0de5047fe925fe534752acc1b Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 18 May 2026 11:37:22 -0600 Subject: [PATCH 14/21] revisions to internal mechanics for http request metrics --- .../createConductorClient.ts | 5 + .../helpers/__tests__/fetchWithRetry.test.ts | 51 ++---- .../__tests__/metricsInterceptors.test.ts | 147 ++++++++++++++++++ .../helpers/fetchWithRetry.ts | 44 +++--- .../helpers/metricsInterceptors.ts | 67 ++++++++ .../metrics/CanonicalMetricsCollector.ts | 6 +- .../worker/metrics/LegacyMetricsCollector.ts | 1 + .../metrics/MetricsCollectorInterface.ts | 1 + .../CanonicalMetricsCollector.test.ts | 13 ++ .../MetricsCollector.prometheus.test.ts | 7 + src/sdk/worker/metrics/httpObserver.ts | 1 + 11 files changed, 280 insertions(+), 63 deletions(-) create mode 100644 src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts create mode 100644 src/sdk/createConductorClient/helpers/metricsInterceptors.ts diff --git a/src/sdk/createConductorClient/createConductorClient.ts b/src/sdk/createConductorClient/createConductorClient.ts index 51e460c7..c1d8970c 100644 --- a/src/sdk/createConductorClient/createConductorClient.ts +++ b/src/sdk/createConductorClient/createConductorClient.ts @@ -7,6 +7,7 @@ import { import type { OrkesApiConfig } from "../types"; import { createClient } from "../../open-api/generated/client"; import { addResourcesBackwardCompatibility } from "./helpers/addResourcesBackwardCompatibility"; +import { createMetricsInterceptors } from "./helpers/metricsInterceptors"; /** * Creates a Conductor client with authentication and configuration @@ -56,6 +57,10 @@ export const createConductorClient = async ( throwOnError: true, }); + const { onRequest, onResponse } = createMetricsInterceptors(); + openApiClient.interceptors.request.use(onRequest); + openApiClient.interceptors.response.use(onResponse); + let authResult: Awaited> | undefined; if (keyId && keySecret) { authResult = await handleAuth( diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts index 84277b67..111f0120 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts @@ -640,6 +640,10 @@ describe("fetchWithRetry", () => { }); // ─── wrapFetchWithRetry metrics recording ─────────────────────────── + // + // After the interceptor migration, wrapFetchWithRetry only records + // metrics on network errors (status "0") — the response interceptor + // handles all successful/HTTP-error cases. describe("wrapFetchWithRetry metrics", () => { const mockRecordApiRequestTime = jest.fn< @@ -659,19 +663,13 @@ describe("fetchWithRetry", () => { httpObserver.setHttpMetricsObserver(undefined); }); - it("should record API request time on successful response", async () => { + it("should NOT record metrics on successful response (interceptors handle it)", async () => { mockFetch.mockResolvedValue(createMockResponse(200)); const wrappedFetch = wrapFetchWithRetry(mockFetch); await wrappedFetch("http://test.com/api/tasks", { method: "POST" }); - expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); - const [method, uri, status, duration] = - mockRecordApiRequestTime.mock.calls[0]; - expect(method).toBe("POST"); - expect(uri).toBe("/api/tasks"); - expect(status).toBe("200"); - expect(duration).toBeGreaterThanOrEqual(0); + expect(mockRecordApiRequestTime).not.toHaveBeenCalled(); }); it("should record status '0' on network failure", async () => { @@ -693,31 +691,23 @@ describe("fetchWithRetry", () => { expect(status).toBe("0"); }); - it("should extract method and URI from string URL", async () => { - mockFetch.mockResolvedValue(createMockResponse(201)); - - const wrappedFetch = wrapFetchWithRetry(mockFetch); - await wrappedFetch("http://example.com/tasks/123", { method: "PUT" }); - - expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); - const [method, uri] = mockRecordApiRequestTime.mock.calls[0]; - expect(method).toBe("PUT"); - expect(uri).toBe("/tasks/123"); - }); - - it("should extract method and URI from Request object", async () => { - mockFetch.mockResolvedValue(createMockResponse(200)); + it("should extract method and URI from Request object on network failure", async () => { + mockFetch.mockRejectedValue(new Error("ECONNRESET")); - const wrappedFetch = wrapFetchWithRetry(mockFetch); + const wrappedFetch = wrapFetchWithRetry(mockFetch, { + maxTransportRetries: 0, + }); const request = new Request("http://example.com/api/metadata", { method: "DELETE", }); - await wrappedFetch(request); + + await expect(wrappedFetch(request)).rejects.toThrow("ECONNRESET"); expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); - const [method, uri] = mockRecordApiRequestTime.mock.calls[0]; + const [method, uri, status] = mockRecordApiRequestTime.mock.calls[0]; expect(method).toBe("DELETE"); expect(uri).toBe("/api/metadata"); + expect(status).toBe("0"); }); it("should not break fetch when no observer is registered", async () => { @@ -729,16 +719,5 @@ describe("fetchWithRetry", () => { expect(result.status).toBe(200); }); - - it("should default method to GET when init has no method", async () => { - mockFetch.mockResolvedValue(createMockResponse(200)); - - const wrappedFetch = wrapFetchWithRetry(mockFetch); - await wrappedFetch("http://test.com/path"); - - expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); - const [method] = mockRecordApiRequestTime.mock.calls[0]; - expect(method).toBe("GET"); - }); }); }); diff --git a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts new file mode 100644 index 00000000..77800cb3 --- /dev/null +++ b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts @@ -0,0 +1,147 @@ +import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals"; +import { createMetricsInterceptors } from "../metricsInterceptors"; +import * as httpObserver from "@/sdk/worker/metrics/httpObserver"; +import type { ResolvedRequestOptions } from "@/open-api/generated/client/types.gen"; + +const mockRecordApiRequestTime = jest.fn< + (m: string, u: string, s: string | number, d: number, mu?: string) => void +>(); + +const fakeOpts = (url: string): ResolvedRequestOptions => + ({ url } as unknown as ResolvedRequestOptions); + +describe("metricsInterceptors", () => { + beforeEach(() => { + mockRecordApiRequestTime.mockClear(); + httpObserver.setHttpMetricsObserver({ + recordApiRequestTime: mockRecordApiRequestTime, + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }); + }); + + afterEach(() => { + httpObserver.setHttpMetricsObserver(undefined); + }); + + it("should record method, interpolated uri, status, duration, and template metricUri", () => { + const { onRequest, onResponse } = createMetricsInterceptors(); + + const request = new Request("http://conductor.example.com/api/workflow/abc-123", { + method: "GET", + }); + const opts = fakeOpts("/api/workflow/{workflowId}"); + + onRequest(request, opts); + const response = new Response(null, { status: 200 }); + onResponse(response, request, opts); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri, status, duration, metricUri] = + mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("GET"); + expect(uri).toBe("/api/workflow/abc-123"); + expect(status).toBe("200"); + expect(duration).toBeGreaterThanOrEqual(0); + expect(metricUri).toBe("/workflow/{workflowId}"); + }); + + it("should strip /api prefix from path template", () => { + const { onRequest, onResponse } = createMetricsInterceptors(); + + const request = new Request("http://host/api/tasks/poll/myTask", { + method: "POST", + }); + const opts = fakeOpts("/api/tasks/poll/{taskType}"); + + onRequest(request, opts); + onResponse(new Response(null, { status: 200 }), request, opts); + + const metricUri = mockRecordApiRequestTime.mock.calls[0][4]; + expect(metricUri).toBe("/tasks/poll/{taskType}"); + }); + + it("should not strip prefix if path does not start with /api/", () => { + const { onRequest, onResponse } = createMetricsInterceptors(); + + const request = new Request("http://host/health", { method: "GET" }); + const opts = fakeOpts("/health"); + + onRequest(request, opts); + onResponse(new Response(null, { status: 200 }), request, opts); + + const metricUri = mockRecordApiRequestTime.mock.calls[0][4]; + expect(metricUri).toBe("/health"); + }); + + it("should compute duration from request interceptor start time", () => { + const { onRequest, onResponse } = createMetricsInterceptors(); + + const request = new Request("http://host/api/workflow", { method: "GET" }); + const opts = fakeOpts("/api/workflow"); + + onRequest(request, opts); + // Simulate a small delay so duration > 0 + const busyWaitUntil = performance.now() + 2; + while (performance.now() < busyWaitUntil) { /* spin */ } + onResponse(new Response(null, { status: 200 }), request, opts); + + const duration = mockRecordApiRequestTime.mock.calls[0][3] as number; + expect(duration).toBeGreaterThan(0); + }); + + it("should gracefully handle missing start time (duration defaults to 0)", () => { + const { onResponse } = createMetricsInterceptors(); + + const request = new Request("http://host/api/workflow", { method: "GET" }); + const opts = fakeOpts("/api/workflow"); + + onResponse(new Response(null, { status: 200 }), request, opts); + + const duration = mockRecordApiRequestTime.mock.calls[0][3] as number; + expect(duration).toBe(0); + }); + + it("should do nothing when no observer is registered", () => { + httpObserver.setHttpMetricsObserver(undefined); + + const { onRequest, onResponse } = createMetricsInterceptors(); + + const request = new Request("http://host/api/workflow", { method: "GET" }); + const opts = fakeOpts("/api/workflow"); + + onRequest(request, opts); + const response = new Response(null, { status: 200 }); + const result = onResponse(response, request, opts); + + expect(result).toBe(response); + expect(mockRecordApiRequestTime).not.toHaveBeenCalled(); + }); + + it("should return request and response unchanged", () => { + const { onRequest, onResponse } = createMetricsInterceptors(); + + const request = new Request("http://host/api/workflow", { method: "GET" }); + const opts = fakeOpts("/api/workflow"); + + const returnedRequest = onRequest(request, opts); + expect(returnedRequest).toBe(request); + + const response = new Response("body", { status: 201 }); + const returnedResponse = onResponse(response, request, opts); + expect(returnedResponse).toBe(response); + }); + + it("should record correct status for error responses", () => { + const { onRequest, onResponse } = createMetricsInterceptors(); + + const request = new Request("http://host/api/workflow", { method: "POST" }); + const opts = fakeOpts("/api/workflow"); + + onRequest(request, opts); + onResponse(new Response(null, { status: 500 }), request, opts); + + const status = mockRecordApiRequestTime.mock.calls[0][2]; + expect(status).toBe("500"); + }); +}); diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts index 693289df..69f0feea 100644 --- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts +++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts @@ -211,35 +211,29 @@ export const wrapFetchWithRetry = ( ): typeof fetch => { return async (input: Input, init?: Init): Promise => { const start = performance.now(); - let method = "GET"; - let uri = ""; try { - if (input instanceof Request) { - method = input.method; - uri = new URL(input.url).pathname; - } else if (typeof input === "string") { - method = init?.method ?? "GET"; - try { uri = new URL(input).pathname; } catch { uri = input; } - } else { - method = init?.method ?? "GET"; - try { uri = input.pathname; } catch { uri = String(input); } + return await retryFetch(input, init, fetchFn, options); + } catch (error) { + // Network-error fallback: the response interceptor never runs when + // fetch throws, so record a status="0" metric here as a safety net. + let method = "GET"; + let uri = ""; + try { + if (input instanceof Request) { + method = input.method; + uri = new URL(input.url).pathname; + } else if (typeof input === "string") { + method = init?.method ?? "GET"; + try { uri = new URL(input).pathname; } catch { uri = input; } + } else { + method = init?.method ?? "GET"; + try { uri = input.pathname; } catch { uri = String(input); } + } + } catch { + // Best-effort URI extraction } - } catch { - // Best-effort URI extraction - } - try { - const response = await retryFetch(input, init, fetchFn, options); - const durationMs = performance.now() - start; - getHttpMetricsObserver()?.recordApiRequestTime( - method, - uri, - String(response.status), - durationMs, - ); - return response; - } catch (error) { const durationMs = performance.now() - start; getHttpMetricsObserver()?.recordApiRequestTime( method, diff --git a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts new file mode 100644 index 00000000..f958ae9f --- /dev/null +++ b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts @@ -0,0 +1,67 @@ +import type { ResolvedRequestOptions } from "../../../open-api/generated/client/types.gen"; +import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; + +/** + * Strips the /api prefix that the OpenAPI spec bakes into every path. + * Other SDKs (Go, Java, Python, Rust, Ruby, C#) keep /api in the base URL + * and use clean paths like /workflow/{workflowId}. The JS SDK's generated + * code does it backwards: the base URL has /api stripped and every path + * template starts with /api/. This normalises back to the cross-SDK + * convention. + */ +const stripApiPrefix = (url: string): string => + url.startsWith("/api/") ? url.slice(4) : url; + +type OptsWithMetrics = ResolvedRequestOptions & { _metricsStart?: number }; + +/** + * Creates a matched pair of request/response interceptors that record + * http_api_client_request_seconds via the global HttpMetricsObserver. + * + * Both interceptors receive the same `opts` object for a given request + * (see client.gen.ts lines 89-102), so start time is stashed directly + * on opts — no WeakMap or side-channel needed. + * + * The response interceptor passes both the interpolated URI (for legacy + * metrics) and the bounded-cardinality path template (for canonical + * metrics) to the observer, letting each collector choose which to use. + */ +export function createMetricsInterceptors() { + const onRequest = ( + request: Request, + opts: ResolvedRequestOptions, + ): Request => { + (opts as OptsWithMetrics)._metricsStart = performance.now(); + return request; + }; + + const onResponse = ( + response: Response, + request: Request, + opts: ResolvedRequestOptions, + ): Response => { + const observer = getHttpMetricsObserver(); + if (!observer) return response; + + const start = (opts as OptsWithMetrics)._metricsStart; + const durationMs = start != null ? performance.now() - start : 0; + + const method = request.method; + const status = String(response.status); + + let uri = ""; + try { + uri = new URL(request.url).pathname; + } catch { + uri = String(request.url); + } + + const template = typeof opts.url === "string" ? stripApiPrefix(opts.url) : uri; + + observer.recordApiRequestTime(method, uri, status, durationMs, template); + + return response; + }; + + return { onRequest, onResponse }; +} diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts index 6757bf08..bf8b6ad3 100644 --- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts +++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts @@ -389,16 +389,18 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface { uri: string, status: number | string, durationMs: number, + metricUri?: string, ): void { + const resolvedUri = metricUri ?? uri; const statusStr = String(status); const seconds = durationMs / 1000; this.state.httpApiClientRequestSeconds.observe( - { method, uri, status: statusStr }, + { method, uri: resolvedUri, status: statusStr }, seconds, ); this._promRegistry?.observeHistogram("http_api_client_request_seconds", { method, - uri, + uri: resolvedUri, status: statusStr, }, seconds); } diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts index 8932a91b..32467ff4 100644 --- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts +++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts @@ -315,6 +315,7 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface { uri: string, status: number | string, durationMs: number, + _metricUri?: string, ): void { const key = `${method}:${uri}:${status}`; this.observeSummary(this.metrics.apiRequestDurationMs, key, durationMs, "api_request", "endpoint"); diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts index 4852b099..dd341994 100644 --- a/src/sdk/worker/metrics/MetricsCollectorInterface.ts +++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts @@ -35,6 +35,7 @@ export interface MetricsCollectorInterface extends TaskRunnerEventsListener { uri: string, status: number | string, durationMs: number, + metricUri?: string, ): void; // ── Output / lifecycle ───────────────────────────────────────── diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts index 37c9300f..aab49782 100644 --- a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts +++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts @@ -246,6 +246,19 @@ describe("CanonicalMetricsCollector", () => { expect(text).toContain("# TYPE http_api_client_request_seconds histogram"); expect(text).toContain('http_api_client_request_seconds_sum{method="GET",uri="/api/workflow",status="200"} 0.045'); }); + + it("recordApiRequestTime should prefer metricUri over uri when provided", () => { + collector.recordApiRequestTime("GET", "/api/workflow/abc-123", 200, 30, "/workflow/{workflowId}"); + const text = collector.toPrometheusText(); + expect(text).toContain('uri="/workflow/{workflowId}"'); + expect(text).not.toContain("abc-123"); + }); + + it("recordApiRequestTime should fall back to uri when metricUri is undefined", () => { + collector.recordApiRequestTime("POST", "/api/tasks", 201, 20); + const text = collector.toPrometheusText(); + expect(text).toContain('uri="/api/tasks"'); + }); }); describe("onTaskPaused event", () => { diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts index ffe9f567..424922a7 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts @@ -161,6 +161,13 @@ describe("LegacyMetricsCollector - Prometheus features", () => { const m = collector.getMetrics(); expect(m.apiRequestDurationMs.get("GET:/api/workflow:200")).toEqual([45, 55]); }); + + it("should ignore metricUri and use uri (legacy behavior preserved)", () => { + collector.recordApiRequestTime("GET", "/api/workflow/abc-123", 200, 45, "/workflow/{workflowId}"); + const m = collector.getMetrics(); + expect(m.apiRequestDurationMs.get("GET:/api/workflow/abc-123:200")).toEqual([45]); + expect(m.apiRequestDurationMs.get("GET:/workflow/{workflowId}:200")).toBeUndefined(); + }); }); // ── Sliding window ────────────────────────────────────────────── diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts index fa2cbc76..9366e783 100644 --- a/src/sdk/worker/metrics/httpObserver.ts +++ b/src/sdk/worker/metrics/httpObserver.ts @@ -13,6 +13,7 @@ export interface HttpMetricsObserver { uri: string, status: number | string, durationMs: number, + metricUri?: string, ): void; recordWorkflowInputSize( From c8e97afc45d69660b431a8cd1f2de9bbe4650e1e Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 18 May 2026 11:52:12 -0600 Subject: [PATCH 15/21] update to changelog and metrics docs --- CHANGELOG.md | 23 ++++++------ METRICS.md | 102 +++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 106 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dacb8b7..f541892d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,18 +9,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- **Metrics harmonization** - canonical metric surface aligned with the cross-SDK catalog, opt-in via `WORKER_CANONICAL_METRICS=true` - - New `CanonicalMetricsCollector` and optional `CanonicalPrometheusRegistry` (prom-client adapter) emit the harmonized cross-SDK catalog: 12 counters (e.g. `task_poll_total`, `task_execution_started_total`, `task_paused_total`, `external_payload_used_total{entityName,operation,payloadType}`, `workflow_start_error_total{workflowType,exception}`), 4 time histograms (`task_poll_time_seconds`, `task_execute_time_seconds`, `task_update_time_seconds`, `http_api_client_request_seconds{method,uri,status}`) with buckets `0.001…10s`, 2 size histograms (`task_result_size_bytes`, `workflow_input_size_bytes{workflowType,version}`) with buckets `100…10_000_000` bytes, and `active_workers` gauge. Labels are camelCase; names are unprefixed. - - `createMetricsCollector()` factory selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive). `WORKER_LEGACY_METRICS` is also recognized; canonical wins when both are set. - - `HttpMetricsObserver` plus `fetchWithRetry` instrumentation records `http_api_client_request_seconds`; `WorkflowExecutor` records `workflow_input_size_bytes` and `workflow_start_error_total`. - - `Poller`, `TaskRunner`, and `EventDispatcher` emit a new `taskPaused` event when a poll cycle is skipped because the worker is paused. - - `fetchWithRetry` now retries HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE). - - Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.ts` logs which collector is active. +- Canonical metrics: opt-in harmonized metric surface via `WORKER_CANONICAL_METRICS=true` -- see [METRICS.md](METRICS.md) for the full catalog, configuration, and migration guide +- Bounded `uri` label on `http_api_client_request_seconds`: canonical mode uses path templates (e.g. `/workflow/{workflowId}`) instead of fully-resolved paths, preventing metric cardinality explosion from dynamic IDs +- `WorkflowStatusProbe` in harness: opt-in probe (via `HARNESS_PROBE_RATE_PER_SEC`) that exercises UUID-bearing endpoints to validate template URI metrics +- `fetchWithRetry` now retries HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE) ### Changed -- **Metrics harmonization** - defaults preserved; legacy metrics emit unchanged when `WORKER_CANONICAL_METRICS` is unset - - `src/sdk/worker/metrics/MetricsCollector.ts` was renamed to `LegacyMetricsCollector.ts`. The public symbol is preserved via `export { LegacyMetricsCollector as MetricsCollector }` in `src/sdk/worker/metrics/index.ts`, so existing imports keep working. - - Default behavior is unchanged: with no env var set, the metric names, labels, and `conductor_worker_*` prefix from `v3.0.3` are preserved byte-for-byte. - - Rewrote `METRICS.md` with both surfaces, the env-var gate, side-by-side migration table with PromQL replacements, and troubleshooting. - - Updated `README.md`, `AGENTS.md`, `SDK_DEVELOPMENT.md`, `SDK_COMPARISON.md`, and `WORKER_ARCHITECTURE_COMPARISON.md` to reference `createMetricsCollector()` and the env var. +- Legacy metrics emit unchanged by default; no action required for existing deployments +- `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working +- HTTP metrics recording moved from `fetchWithRetry` to OpenAPI client interceptors -- [details](METRICS.md#detailed-technical-notes----unreleased) + +### Deprecated + +- Legacy metric names remain the default. Migration guidance is in [METRICS.md](METRICS.md#migrating-from-legacy-to-canonical). diff --git a/METRICS.md b/METRICS.md index 7d43e0c2..b687a0e1 100644 --- a/METRICS.md +++ b/METRICS.md @@ -253,7 +253,7 @@ Type is Summary. Values are bytes. | `payload_type` | Legacy `conductor_worker_external_payload_used_total` | Payload type, such as `workflow_input` or `task_output`. | | `payloadType` | Canonical `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. | | `method` | Canonical HTTP metrics | HTTP verb. | -| `uri` | Canonical HTTP metrics | Request path. May contain interpolated identifiers. | +| `uri` | Canonical HTTP metrics | Request path template (e.g. `/workflow/{workflowId}`). Uses bounded-cardinality templates, not interpolated paths. | | `endpoint` | Legacy HTTP metrics | Compound `"METHOD:/api/path:STATUS"` string. | | `quantile` | Legacy time and size metrics | `0.5`, `0.75`, `0.9`, `0.95`, or `0.99`. | @@ -338,17 +338,17 @@ sum(rate(task_execute_time_seconds_count[5m])) by (taskType) ### Missing HTTP or Workflow Metrics - `http_api_client_request_seconds` (canonical) or - `conductor_worker_http_api_client_request` (legacy) is recorded from the - `fetchWithRetry` HTTP layer. Verify the collector is constructed before HTTP - calls begin. + `conductor_worker_http_api_client_request` (legacy) is recorded by request / + response interceptors on the OpenAPI client, with a network-error fallback in + `fetchWithRetry`. Verify the collector is constructed before HTTP calls begin. - `workflow_input_size_bytes` and `workflow_start_error_total` are recorded in `WorkflowExecutor`. Verify the collector is active before starting workflows. ### High Cardinality -- Watch the `uri` label (canonical) or `endpoint` label (legacy) on HTTP - metrics. The SDK records the interpolated request path, which may include - task type names or workflow IDs. +- In canonical mode the `uri` label on `http_api_client_request_seconds` uses + path templates (e.g. `/workflow/{workflowId}`), giving bounded cardinality. + Legacy mode's `endpoint` label still contains fully interpolated paths. - Prefer canonical mode for bounded `exception` labels. Legacy error counters encode exception names in the Map key, not as a proper Prometheus label. - Avoid embedding user identifiers or unbounded values in task type, workflow @@ -386,3 +386,91 @@ process.on("unhandledRejection", (reason) => { option. - When `usePromClient: true` is set but `prom-client` is not installed, the collector falls back to the built-in text format silently. + +## Detailed Technical Notes -- Unreleased + +This section documents internal implementation details for developers reviewing +the metrics harmonization changes. It is not end-user-facing and will be +removed or folded into the relevant sections once the release is published. + +### Architecture + +The `MetricsCollectorInterface` (`src/sdk/worker/metrics/MetricsCollectorInterface.ts`) +defines the contract for recording SDK metrics. Two implementations exist: + +- `LegacyMetricsCollector` (`src/sdk/worker/metrics/LegacyMetricsCollector.ts`) + -- emits the original metric names and types (sliding-window Summaries for + timing, compound `endpoint` labels, `conductor_worker_` prefix). +- `CanonicalMetricsCollector` (`src/sdk/worker/metrics/CanonicalMetricsCollector.ts`) + -- emits the harmonized cross-SDK catalog (Histograms in seconds/bytes, + `_total` counters, bounded `exception` labels from `error.name`). + +`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` once and returns the +appropriate implementation. + +### HTTP request timing via interceptors + +`createMetricsInterceptors()` (`src/sdk/createConductorClient/helpers/metricsInterceptors.ts`) +returns a matched pair of request/response interceptors registered on the +OpenAPI generated client: + +- **Request interceptor** stashes `performance.now()` on the `opts` object + that the generated client passes through both interceptors. +- **Response interceptor** computes the request duration, extracts the + interpolated URI from `request.url`, extracts the path template from + `opts.url` (stripping the `/api/` prefix baked in by the OpenAPI spec), and + calls `observer.recordApiRequestTime(method, uri, status, durationMs, metricUri)`. + +Both interceptors receive the same `opts` reference (confirmed in +`client.gen.ts` lines 89-102), so no `WeakMap` or side-channel is needed. + +`wrapFetchWithRetry` retains a network-error fallback that records `status: "0"` +when `fetch` throws before any HTTP response (the response interceptor never +runs in that case). + +### Canonical vs. legacy URI label + +`recordApiRequestTime` accepts an optional `metricUri` parameter (the path +template). The canonical collector uses `metricUri ?? uri`; the legacy collector +ignores `metricUri` and uses the interpolated `uri`, preserving legacy output +byte-for-byte. + +The JavaScript SDK's OpenAPI spec bakes `/api/` into every path template +(`opts.url = "/api/workflow/{workflowId}"`). Other SDKs keep `/api` in the base +URL and use clean paths. The response interceptor strips this prefix so the +canonical `uri` label matches the cross-SDK convention (e.g. +`/workflow/{workflowId}`). + +### Payload size metrics + +`workflow_input_size_bytes` and `task_result_size_bytes` measure specific +sub-fields of the request/response payloads (`workflowRequest.input` and +`merged.outputData` respectively), not the full HTTP body. This requires a +separate `JSON.stringify` call on the sub-field, independent of the HTTP body +serialization performed by the OpenAPI client. + +This is an acceptable minor cost, consistent with how the Go, Java, and Python +SDKs measure the same metrics. The Go SDK provides `WORKER_METRICS_PAYLOAD_SIZE` +to opt out of this serialization; a similar env var can be added to the +JavaScript SDK in a future release if needed. + +### Metrics collector event surface + +`CanonicalMetricsCollector` and `LegacyMetricsCollector` both implement +`TaskRunnerEventsListener`, so they receive lifecycle events from the worker +infrastructure: + +- `onTaskPoll` / `onTaskPollError` / `onTaskExecutionStarted` / + `onTaskExecutionCompleted` / `onTaskExecutionFailed` / `onTaskUpdateSuccess` / + `onTaskUpdateError` / `onTaskAckError` / `onTaskAckFailed` / + `onTaskExecutionQueueFull` / `onTaskPaused` +- `Poller`, `TaskRunner`, and `EventDispatcher` emit a `taskPaused` event when a + poll cycle is skipped because the worker is paused. + +### Harness changes + +- Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`. +- `harness/main.ts` logs which collector is active. +- `WorkflowStatusProbe` (opt-in via `HARNESS_PROBE_RATE_PER_SEC`) exercises + UUID-bearing endpoints (`/api/workflow/{workflowId}`) to validate that the + `uri` label uses bounded templates. From 5b0becde22c8186e6418154ab8162304fe93e8ef Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 18 May 2026 12:54:21 -0600 Subject: [PATCH 16/21] address self-review concerns regarding backward compatibility --- .github/workflows/pull_request.yml | 5 ++- CHANGELOG.md | 20 +++++---- METRICS.md | 43 ++++++++++++++++--- src/sdk/clients/worker/events/types.ts | 2 +- src/sdk/clients/workflow/WorkflowExecutor.ts | 7 ++- .../createConductorClient.ts | 4 +- .../helpers/__tests__/fetchWithRetry.test.ts | 20 +++++++++ .../helpers/fetchWithRetry.ts | 8 ++-- .../helpers/metricsInterceptors.ts | 4 +- .../helpers/resolveOrkesConfig.ts | 3 ++ src/sdk/types.ts | 1 + .../metrics/CanonicalMetricsCollector.ts | 23 ++++++---- .../worker/metrics/LegacyMetricsCollector.ts | 18 ++++++-- .../metrics/MetricsCollectorInterface.ts | 4 ++ .../metrics/__tests__/metricsFactory.test.ts | 17 -------- src/sdk/worker/metrics/httpObserver.ts | 2 + src/sdk/worker/metrics/metricsFactory.ts | 9 ++-- 17 files changed, 132 insertions(+), 58 deletions(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index b835e6e5..92c3eda7 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -82,13 +82,12 @@ jobs: # Integration tests (v5): lower max-parallel reduces 502/503 from the shared Conductor server # but makes CI slower without eliminating flakes entirely — feel free to experiment. # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests. - # fetchWithRetry now retries 502/503/504, so higher parallelism is more viable than before. integration-tests: runs-on: ubuntu-latest timeout-minutes: 25 strategy: fail-fast: false - max-parallel: 3 + max-parallel: 2 matrix: node-version: [20, 22, 24] shard: [1, 2, 3] @@ -120,6 +119,7 @@ jobs: CONDUCTOR_AUTH_KEY: ${{ secrets.AUTH_KEY }} CONDUCTOR_AUTH_SECRET: ${{ secrets.AUTH_SECRET }} CONDUCTOR_REQUEST_TIMEOUT_MS: "120000" + CONDUCTOR_RETRY_SERVER_ERRORS: "true" JEST_JUNIT_OUTPUT_NAME: integration-v5-node-${{ matrix.node-version }}-shard-${{ matrix.shard }}-test-results.xml - name: Publish Test Results uses: dorny/test-reporter@v2 @@ -177,6 +177,7 @@ jobs: CONDUCTOR_AUTH_KEY: ${{ secrets.AUTH_KEY_V4 }} CONDUCTOR_AUTH_SECRET: ${{ secrets.AUTH_SECRET_V4 }} CONDUCTOR_REQUEST_TIMEOUT_MS: "120000" + CONDUCTOR_RETRY_SERVER_ERRORS: "true" JEST_JUNIT_OUTPUT_NAME: integration-v4-node-${{ matrix.node-version }}-shard-${{ matrix.shard }}-test-results.xml - name: Publish Test Results uses: dorny/test-reporter@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index f541892d..004f0a2a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,17 +9,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- Canonical metrics: opt-in harmonized metric surface via `WORKER_CANONICAL_METRICS=true` -- see [METRICS.md](METRICS.md) for the full catalog, configuration, and migration guide -- Bounded `uri` label on `http_api_client_request_seconds`: canonical mode uses path templates (e.g. `/workflow/{workflowId}`) instead of fully-resolved paths, preventing metric cardinality explosion from dynamic IDs -- `WorkflowStatusProbe` in harness: opt-in probe (via `HARNESS_PROBE_RATE_PER_SEC`) that exercises UUID-bearing endpoints to validate template URI metrics -- `fetchWithRetry` now retries HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE) +- **Canonical metrics** -- opt-in harmonized metric surface via `WORKER_CANONICAL_METRICS=true`. See [METRICS.md](METRICS.md) for the full catalog, configuration, and migration guide. +- Bounded `uri` label on `http_api_client_request_seconds`: canonical mode uses path templates (e.g. `/workflow/{workflowId}`) instead of fully-resolved paths, preventing metric cardinality explosion from dynamic IDs. +- `TaskPaused` event type and `PollerOptions.onPaused` callback: emitted when a poll cycle is skipped because the worker is paused. Canonical mode records `task_paused_total`; legacy mode does not (see Implementation Notes in METRICS.md). +- `measurePayloadSize` option in `MetricsCollectorConfig`: controls whether `workflow_input_size_bytes` is recorded via `JSON.stringify` on each `startWorkflow` call. Defaults to `true` for canonical, `false` for legacy. +- `retryServerErrors` option in `OrkesApiConfig` / `RetryFetchOptions` and `CONDUCTOR_RETRY_SERVER_ERRORS` env var: opt-in retry of HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE). Default `false`; set to `true` to enable. +- `WorkflowStatusProbe` in harness: opt-in probe (via `HARNESS_PROBE_RATE_PER_SEC`) that exercises UUID-bearing endpoints to validate template URI metrics. +- `WORKER_LEGACY_METRICS` is reserved for future use. Once canonical metrics become the default, setting `WORKER_LEGACY_METRICS=true` will re-activate the legacy surface. It is not read by the current implementation. ### Changed -- Legacy metrics emit unchanged by default; no action required for existing deployments -- `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working -- HTTP metrics recording moved from `fetchWithRetry` to OpenAPI client interceptors -- [details](METRICS.md#detailed-technical-notes----unreleased) +- Legacy metrics emit unchanged by default; no action required for existing deployments. +- `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working. +- HTTP metrics recording moved from `fetchWithRetry` to OpenAPI client interceptors -- [details](METRICS.md#implementation-notes). +- `TaskUpdateFailure.durationMs` is now optional to preserve backward compatibility for external event listener implementations. ### Deprecated -- Legacy metric names remain the default. Migration guidance is in [METRICS.md](METRICS.md#migrating-from-legacy-to-canonical). +- Legacy metric names remain the default during the transition period. Migration guidance is in [METRICS.md](METRICS.md#migrating-from-legacy-to-canonical). diff --git a/METRICS.md b/METRICS.md index b687a0e1..ee3c1637 100644 --- a/METRICS.md +++ b/METRICS.md @@ -387,6 +387,40 @@ process.on("unhandledRejection", (reason) => { - When `usePromClient: true` is set but `prom-client` is not installed, the collector falls back to the built-in text format silently. +## Implementation Notes + +### One Collector Per Process + +Only one metrics collector can be active at a time. Both +`LegacyMetricsCollector` and `CanonicalMetricsCollector` register themselves +as the global HTTP metrics observer on construction. Creating a second +collector replaces the first. Calling `stop()` clears the global observer. + +### Payload Size Measurement + +`workflow_input_size_bytes` is recorded by calling `JSON.stringify` on the +workflow input and measuring the resulting UTF-8 byte length with +`Buffer.byteLength`. This cost is controlled by the `measurePayloadSize` +config option: + +- **Canonical** (default `true`): measured on every `startWorkflow` call. + Set `measurePayloadSize: false` to disable. +- **Legacy** (default `false`): not measured unless you set + `measurePayloadSize: true`. + +### Legacy `task_paused_total` + +Legacy mode does not emit `task_paused_total`. This metric was defined but +never recorded in any prior release. It remains unrecorded in legacy mode to +preserve byte-for-byte output compatibility. Switch to canonical mode +(`WORKER_CANONICAL_METRICS=true`) to get paused metrics. + +### `WORKER_LEGACY_METRICS` (Reserved) + +`WORKER_LEGACY_METRICS` is reserved for future use. Once canonical metrics +become the default, setting `WORKER_LEGACY_METRICS=true` will re-activate +the legacy metric surface. It is not read by the current implementation. + ## Detailed Technical Notes -- Unreleased This section documents internal implementation details for developers reviewing @@ -415,7 +449,7 @@ returns a matched pair of request/response interceptors registered on the OpenAPI generated client: - **Request interceptor** stashes `performance.now()` on the `opts` object - that the generated client passes through both interceptors. + only when a metrics observer is active (no mutation otherwise). - **Response interceptor** computes the request duration, extracts the interpolated URI from `request.url`, extracts the path template from `opts.url` (stripping the `/api/` prefix baked in by the OpenAPI spec), and @@ -449,10 +483,9 @@ sub-fields of the request/response payloads (`workflowRequest.input` and separate `JSON.stringify` call on the sub-field, independent of the HTTP body serialization performed by the OpenAPI client. -This is an acceptable minor cost, consistent with how the Go, Java, and Python -SDKs measure the same metrics. The Go SDK provides `WORKER_METRICS_PAYLOAD_SIZE` -to opt out of this serialization; a similar env var can be added to the -JavaScript SDK in a future release if needed. +This cost is controlled by the `measurePayloadSize` config option (canonical +default `true`, legacy default `false`). The Go SDK provides +`WORKER_METRICS_PAYLOAD_SIZE` for the same purpose. ### Metrics collector event surface diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts index 4bd5306c..14554aff 100644 --- a/src/sdk/clients/worker/events/types.ts +++ b/src/sdk/clients/worker/events/types.ts @@ -115,7 +115,7 @@ export interface TaskUpdateFailure extends TaskRunnerEvent { /** The error that caused the final update failure */ cause: Error; /** Time taken for the last update attempt in milliseconds */ - durationMs: number; + durationMs?: number; /** Number of retry attempts made */ retryCount: number; /** The TaskResult object that failed to update (for recovery/logging) */ diff --git a/src/sdk/clients/workflow/WorkflowExecutor.ts b/src/sdk/clients/workflow/WorkflowExecutor.ts index c527050e..17cd25f7 100644 --- a/src/sdk/clients/workflow/WorkflowExecutor.ts +++ b/src/sdk/clients/workflow/WorkflowExecutor.ts @@ -73,11 +73,10 @@ export class WorkflowExecutor { workflowRequest: StartWorkflowRequest ): Promise { const observer = getHttpMetricsObserver(); - if (observer) { + if (observer?.measurePayloadSize && workflowRequest.input) { try { - const inputBytes = workflowRequest.input - ? JSON.stringify(workflowRequest.input).length - : 0; + const json = JSON.stringify(workflowRequest.input); + const inputBytes = Buffer.byteLength(json, "utf8"); observer.recordWorkflowInputSize( workflowRequest.name ?? "", inputBytes, diff --git a/src/sdk/createConductorClient/createConductorClient.ts b/src/sdk/createConductorClient/createConductorClient.ts index c1d8970c..e080fd93 100644 --- a/src/sdk/createConductorClient/createConductorClient.ts +++ b/src/sdk/createConductorClient/createConductorClient.ts @@ -35,6 +35,7 @@ export const createConductorClient = async ( proxyUrl, tlsInsecure, disableHttp2, + retryServerErrors, } = resolveOrkesConfig(config); if (!serverUrl) throw new Error("Conductor server URL is not set"); @@ -53,7 +54,7 @@ export const createConductorClient = async ( // Start with retry + timeout on fetch (no auth failure callback yet) const openApiClient = createClient({ baseUrl: serverUrl, - fetch: wrapFetchWithRetry(baseFetchFn, { requestTimeoutMs }), + fetch: wrapFetchWithRetry(baseFetchFn, { requestTimeoutMs, retryServerErrors }), throwOnError: true, }); @@ -79,6 +80,7 @@ export const createConductorClient = async ( fetch: wrapFetchWithRetry(baseFetchFn, { onAuthFailure: authResult.refreshToken, requestTimeoutMs, + retryServerErrors, }), }); } diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts index 111f0120..bfed241e 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts @@ -184,6 +184,18 @@ describe("fetchWithRetry", () => { // ─── Server error (502/503/504) retry ─────────────────────────────── describe("server error (502/503/504) retry", () => { + it("should NOT retry 502 by default (retryServerErrors unset)", async () => { + mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(502); + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + it("should retry 502 and succeed on next attempt", async () => { mockFetch .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) @@ -192,6 +204,7 @@ describe("fetchWithRetry", () => { const result = await retryFetch("http://test.com", {}, mockFetch, { maxTransportRetries: 3, initialRetryDelay: 1, + retryServerErrors: true, }); expect(result.status).toBe(200); @@ -206,6 +219,7 @@ describe("fetchWithRetry", () => { const result = await retryFetch("http://test.com", {}, mockFetch, { maxTransportRetries: 3, initialRetryDelay: 1, + retryServerErrors: true, }); expect(result.status).toBe(200); @@ -220,6 +234,7 @@ describe("fetchWithRetry", () => { const result = await retryFetch("http://test.com", {}, mockFetch, { maxTransportRetries: 3, initialRetryDelay: 1, + retryServerErrors: true, }); expect(result.status).toBe(200); @@ -232,6 +247,7 @@ describe("fetchWithRetry", () => { const result = await retryFetch("http://test.com", {}, mockFetch, { maxTransportRetries: 2, initialRetryDelay: 1, + retryServerErrors: true, }); expect(result.status).toBe(502); @@ -245,6 +261,7 @@ describe("fetchWithRetry", () => { const result = await retryFetch("http://test.com", { method: "POST" }, mockFetch, { maxTransportRetries: 3, initialRetryDelay: 1, + retryServerErrors: true, }); expect(result.status).toBe(502); @@ -257,6 +274,7 @@ describe("fetchWithRetry", () => { const result = await retryFetch("http://test.com", { method: "PATCH" }, mockFetch, { maxTransportRetries: 3, initialRetryDelay: 1, + retryServerErrors: true, }); expect(result.status).toBe(503); @@ -271,6 +289,7 @@ describe("fetchWithRetry", () => { const result = await retryFetch("http://test.com", { method: "PUT" }, mockFetch, { maxTransportRetries: 3, initialRetryDelay: 1, + retryServerErrors: true, }); expect(result.status).toBe(200); @@ -286,6 +305,7 @@ describe("fetchWithRetry", () => { const result = await retryFetch("http://test.com", {}, mockFetch, { maxTransportRetries: 3, initialRetryDelay: 1, + retryServerErrors: true, }); expect(result.status).toBe(200); diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts index 69f0feea..81a2b824 100644 --- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts +++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts @@ -9,6 +9,7 @@ export interface RetryFetchOptions { maxRateLimitRetries?: number; // default 5 maxTransportRetries?: number; // default 3 initialRetryDelay?: number; // default 1000ms + retryServerErrors?: boolean; // retry 502/503/504 for idempotent methods (default false) } const isTimeoutError = (error: unknown): boolean => { @@ -161,9 +162,10 @@ export const retryFetch = async ( return rateLimitResponse; } - // Gateway error retry (502, 503, 504) -- only for idempotent methods to - // avoid duplicate side effects when the upstream may have processed the request. - if (response.status >= 502 && response.status <= 504) { + // Gateway error retry (502, 503, 504) -- opt-in via retryServerErrors. + // Only retries idempotent methods to avoid duplicate side effects when + // the upstream may have processed the request. + if (options.retryServerErrors && response.status >= 502 && response.status <= 504) { const reqMethod = (input instanceof Request ? input.method : init?.method ?? "GET" diff --git a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts index f958ae9f..1e9a2d81 100644 --- a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts +++ b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts @@ -31,7 +31,9 @@ export function createMetricsInterceptors() { request: Request, opts: ResolvedRequestOptions, ): Request => { - (opts as OptsWithMetrics)._metricsStart = performance.now(); + if (getHttpMetricsObserver()) { + (opts as OptsWithMetrics)._metricsStart = performance.now(); + } return request; }; diff --git a/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts b/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts index b11c9def..42d74744 100644 --- a/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts +++ b/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts @@ -67,5 +67,8 @@ export const resolveOrkesConfig = (config?: Partial) => { disableHttp2: parseEnvBoolean(process.env.CONDUCTOR_DISABLE_HTTP2) ?? config?.disableHttp2, + retryServerErrors: + parseEnvBoolean(process.env.CONDUCTOR_RETRY_SERVER_ERRORS) ?? + config?.retryServerErrors, }; }; diff --git a/src/sdk/types.ts b/src/sdk/types.ts index 81f988c9..fc3beffd 100644 --- a/src/sdk/types.ts +++ b/src/sdk/types.ts @@ -20,4 +20,5 @@ export interface OrkesApiConfig { proxyUrl?: string; // HTTP/HTTPS proxy URL tlsInsecure?: boolean; // disable TLS certificate verification (for self-signed certs) disableHttp2?: boolean; // force HTTP/1.1 instead of HTTP/2 + retryServerErrors?: boolean; // retry 502/503/504 for idempotent methods (default false) } diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts index bf8b6ad3..03ebb5af 100644 --- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts +++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts @@ -65,10 +65,12 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface { private _fileTimer?: ReturnType; private _promRegistry?: import("./CanonicalPrometheusRegistry.js").CanonicalPrometheusRegistry; private readonly _usePromClient: boolean; + readonly measurePayloadSize: boolean; constructor(config?: MetricsCollectorConfig) { this.state = this.createEmpty(); this._usePromClient = config?.usePromClient ?? false; + this.measurePayloadSize = config?.measurePayloadSize ?? true; if (this._usePromClient) { void this.initPromClient(); } @@ -81,6 +83,7 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface { config.fileWriteIntervalMs ?? 5000, ); } + setHttpMetricsObserver(this); } private async initPromClient(): Promise { @@ -279,15 +282,17 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface { exception: excName, }); - const seconds = event.durationMs / 1000; - this.state.taskUpdateTimeSeconds.observe( - { taskType: event.taskType, status: "FAILURE" }, - seconds, - ); - this._promRegistry?.observeHistogram("task_update_time_seconds", { - taskType: event.taskType, - status: "FAILURE", - }, seconds); + if (event.durationMs != null) { + const seconds = event.durationMs / 1000; + this.state.taskUpdateTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_update_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); + } } onTaskPaused(event: TaskPaused): void { diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts index 32467ff4..60ac3b5a 100644 --- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts +++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts @@ -33,6 +33,11 @@ export interface MetricsCollectorConfig { * Falls back to custom text format if prom-client is not installed. */ usePromClient?: boolean; + /** + * Measure workflow input payload size via JSON.stringify on each + * `startWorkflow` call. Default: true for canonical, false for legacy. + */ + measurePayloadSize?: boolean; } /** @@ -103,11 +108,13 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface { private _server?: import("./MetricsServer.js").MetricsServer; private _fileTimer?: ReturnType; private _promRegistry?: import("./PrometheusRegistry.js").PrometheusRegistry; + readonly measurePayloadSize: boolean; constructor(config?: MetricsCollectorConfig) { this.metrics = this.createEmptyMetrics(); this._prefix = config?.prefix ?? "conductor_worker"; this._slidingWindowSize = config?.slidingWindowSize ?? 1000; + this.measurePayloadSize = config?.measurePayloadSize ?? false; if (config?.usePromClient) { void this.initPromClient(); } @@ -120,6 +127,7 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface { config.fileWriteIntervalMs ?? 5000, ); } + setHttpMetricsObserver(this); } private async initPromClient(): Promise { @@ -253,12 +261,16 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface { onTaskUpdateFailure(event: TaskUpdateFailure): void { this.incrementCounter(this.metrics.taskUpdateFailureTotal, event.taskType, "update_error_total", "task_type"); - this.observeSummary(this.metrics.updateDurationMs, event.taskType, event.durationMs, "update_time", "task_type"); + if (event.durationMs != null) { + this.observeSummary(this.metrics.updateDurationMs, event.taskType, event.durationMs, "update_time", "task_type"); + } } - // Canonical-only event — noop in legacy onTaskPaused(): void { - // Noop: TaskPaused events are handled via recordTaskPaused() + // Intentional no-op: task_paused_total was never emitted by the legacy + // metrics surface. Keeping it unrecorded preserves byte-for-byte output + // compatibility. Users who want paused metrics should switch to canonical + // mode (WORKER_CANONICAL_METRICS=true). } // ── Direct Recording Methods ─────────────────────────────────── diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts index dd341994..90ec8e8e 100644 --- a/src/sdk/worker/metrics/MetricsCollectorInterface.ts +++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts @@ -38,6 +38,10 @@ export interface MetricsCollectorInterface extends TaskRunnerEventsListener { metricUri?: string, ): void; + // ── Configuration ─────────────────────────────────────────────── + + readonly measurePayloadSize: boolean; + // ── Output / lifecycle ───────────────────────────────────────── collectorName(): string; diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts index 529fad17..02ca03c6 100644 --- a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts +++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts @@ -12,15 +12,6 @@ describe("createMetricsCollector", () => { it("should return LegacyMetricsCollector by default", () => { delete process.env.WORKER_CANONICAL_METRICS; - delete process.env.WORKER_LEGACY_METRICS; - - const collector = createMetricsCollector(); - expect(collector).toBeInstanceOf(LegacyMetricsCollector); - }); - - it("should return LegacyMetricsCollector when WORKER_LEGACY_METRICS=true", () => { - process.env.WORKER_LEGACY_METRICS = "true"; - delete process.env.WORKER_CANONICAL_METRICS; const collector = createMetricsCollector(); expect(collector).toBeInstanceOf(LegacyMetricsCollector); @@ -33,14 +24,6 @@ describe("createMetricsCollector", () => { expect(collector).toBeInstanceOf(CanonicalMetricsCollector); }); - it("should prefer canonical when both env vars are true", () => { - process.env.WORKER_CANONICAL_METRICS = "true"; - process.env.WORKER_LEGACY_METRICS = "true"; - - const collector = createMetricsCollector(); - expect(collector).toBeInstanceOf(CanonicalMetricsCollector); - }); - it("should return LegacyMetricsCollector when WORKER_CANONICAL_METRICS=false", () => { process.env.WORKER_CANONICAL_METRICS = "false"; diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts index 9366e783..ab2633b9 100644 --- a/src/sdk/worker/metrics/httpObserver.ts +++ b/src/sdk/worker/metrics/httpObserver.ts @@ -8,6 +8,8 @@ */ export interface HttpMetricsObserver { + readonly measurePayloadSize: boolean; + recordApiRequestTime( method: string, uri: string, diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts index dabe4810..86066895 100644 --- a/src/sdk/worker/metrics/metricsFactory.ts +++ b/src/sdk/worker/metrics/metricsFactory.ts @@ -7,11 +7,12 @@ import { setHttpMetricsObserver } from "./httpObserver"; /** * Create the appropriate MetricsCollector based on environment variables. * - * - WORKER_CANONICAL_METRICS=true -> CanonicalMetricsCollector (default: false) - * - WORKER_LEGACY_METRICS=true -> LegacyMetricsCollector (default: true) + * - WORKER_CANONICAL_METRICS=true -> CanonicalMetricsCollector + * - Unset / any other value -> LegacyMetricsCollector (default) * - * WORKER_CANONICAL_METRICS takes priority when both are set. - * During the deprecation transition period the default is legacy. + * WORKER_LEGACY_METRICS is reserved for future use: once canonical becomes + * the default, setting WORKER_LEGACY_METRICS=true will re-activate legacy + * metrics. It is not read by the current implementation. */ export function createMetricsCollector( config?: MetricsCollectorConfig, From 7a4c991c9d9236abcb86b083c2ca67a20895447e Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 18 May 2026 13:19:14 -0600 Subject: [PATCH 17/21] address backward compat concerned from self review --- CHANGELOG.md | 6 +++--- METRICS.md | 13 +++++++++---- .../createConductorClient/helpers/fetchWithRetry.ts | 5 ++++- src/sdk/worker/metrics/CanonicalMetricsCollector.ts | 6 ++++-- src/sdk/worker/metrics/LegacyMetricsCollector.ts | 7 ++++--- src/sdk/worker/metrics/httpObserver.ts | 11 ++++++++--- 6 files changed, 32 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 004f0a2a..b06338a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,10 +19,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed -- Legacy metrics emit unchanged by default; no action required for existing deployments. +- Legacy metrics emit unchanged when constructing `LegacyMetricsCollector` directly (the pre-existing pattern). Using `createMetricsCollector()` additionally enables automatic HTTP request timing via OpenAPI interceptors for both legacy and canonical modes; no other action required for existing deployments. - `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working. -- HTTP metrics recording moved from `fetchWithRetry` to OpenAPI client interceptors -- [details](METRICS.md#implementation-notes). -- `TaskUpdateFailure.durationMs` is now optional to preserve backward compatibility for external event listener implementations. +- `http_api_client_request` timing is now recorded automatically via OpenAPI client request/response interceptors when a metrics collector is active (via `createMetricsCollector()` or `setHttpMetricsObserver`). Previously, `recordApiRequestTime` existed but was not wired into the HTTP pipeline -- [details](METRICS.md#implementation-notes). +- Added optional `durationMs` field to `TaskUpdateFailure` event, recording the duration of the last update attempt. Declared optional so existing event listener implementations are unaffected. ### Deprecated diff --git a/METRICS.md b/METRICS.md index ee3c1637..d8def06a 100644 --- a/METRICS.md +++ b/METRICS.md @@ -493,10 +493,15 @@ default `true`, legacy default `false`). The Go SDK provides `TaskRunnerEventsListener`, so they receive lifecycle events from the worker infrastructure: -- `onTaskPoll` / `onTaskPollError` / `onTaskExecutionStarted` / - `onTaskExecutionCompleted` / `onTaskExecutionFailed` / `onTaskUpdateSuccess` / - `onTaskUpdateError` / `onTaskAckError` / `onTaskAckFailed` / - `onTaskExecutionQueueFull` / `onTaskPaused` +- Event listener methods (from `TaskRunnerEventsListener`): `onPollStarted` / + `onPollCompleted` / `onPollFailure` / `onTaskExecutionStarted` / + `onTaskExecutionCompleted` / `onTaskExecutionFailure` / + `onTaskUpdateCompleted` / `onTaskUpdateFailure` / `onTaskPaused` +- Direct recording methods (called outside the event system): + `recordTaskExecutionQueueFull` / `recordUncaughtException` / + `recordTaskAckError` / `recordTaskAckFailed` / `recordTaskPaused` / + `recordExternalPayloadUsed` / `recordWorkflowStartError` / + `recordWorkflowInputSize` / `recordApiRequestTime` - `Poller`, `TaskRunner`, and `EventDispatcher` emit a `taskPaused` event when a poll cycle is skipped because the worker is paused. diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts index 81a2b824..494930b8 100644 --- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts +++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts @@ -212,8 +212,11 @@ export const wrapFetchWithRetry = ( options?: RetryFetchOptions, ): typeof fetch => { return async (input: Input, init?: Init): Promise => { - const start = performance.now(); + if (!getHttpMetricsObserver()) { + return retryFetch(input, init, fetchFn, options); + } + const start = performance.now(); try { return await retryFetch(input, init, fetchFn, options); } catch (error) { diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts index 03ebb5af..212cc8b8 100644 --- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts +++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts @@ -11,7 +11,7 @@ import type { } from "../../clients/worker/events"; import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; import type { MetricsCollectorConfig } from "./LegacyMetricsCollector"; -import { setHttpMetricsObserver } from "./httpObserver"; +import { getHttpMetricsObserver, setHttpMetricsObserver } from "./httpObserver"; import { HistogramAccumulator, MultiLabelCounter, @@ -421,7 +421,9 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface { } async stop(): Promise { - setHttpMetricsObserver(undefined); + if (getHttpMetricsObserver() === this) { + setHttpMetricsObserver(undefined); + } if (this._fileTimer) { clearInterval(this._fileTimer); this._fileTimer = undefined; diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts index 60ac3b5a..237c3c8d 100644 --- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts +++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts @@ -9,7 +9,7 @@ import type { TaskUpdateFailure, } from "../../clients/worker/events"; import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; -import { setHttpMetricsObserver } from "./httpObserver"; +import { getHttpMetricsObserver, setHttpMetricsObserver } from "./httpObserver"; /** * Configuration for MetricsCollector. @@ -127,7 +127,6 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface { config.fileWriteIntervalMs ?? 5000, ); } - setHttpMetricsObserver(this); } private async initPromClient(): Promise { @@ -344,7 +343,9 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface { } async stop(): Promise { - setHttpMetricsObserver(undefined); + if (getHttpMetricsObserver() === this) { + setHttpMetricsObserver(undefined); + } if (this._fileTimer) { clearInterval(this._fileTimer); this._fileTimer = undefined; diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts index ab2633b9..311a69b7 100644 --- a/src/sdk/worker/metrics/httpObserver.ts +++ b/src/sdk/worker/metrics/httpObserver.ts @@ -2,9 +2,14 @@ * Global metrics observer for recording API client request latency * and workflow-level metrics from code outside the event system. * - * Both LegacyMetricsCollector and CanonicalMetricsCollector register - * themselves here on construction; fetchWithRetry and WorkflowExecutor - * call the observer without needing a direct reference. + * `createMetricsCollector()` and `CanonicalMetricsCollector` register + * themselves here on construction. `LegacyMetricsCollector` does NOT + * self-register so that direct construction preserves pre-metrics + * behavior; use `createMetricsCollector()` or call + * `setHttpMetricsObserver()` explicitly to opt in. + * + * fetchWithRetry and WorkflowExecutor call the observer without + * needing a direct reference. */ export interface HttpMetricsObserver { From e4d8e6776c533793d5d08099742939f149619217 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 18 May 2026 13:57:37 -0600 Subject: [PATCH 18/21] trying to settle on a better implementation of retry fetch wrapper with regards to how metrics are tracked --- CHANGELOG.md | 2 +- METRICS.md | 36 +++--- .../createConductorClient.ts | 3 +- .../helpers/__tests__/fetchWithRetry.test.ts | 79 +++++++++++- .../__tests__/metricsInterceptors.test.ts | 120 +++--------------- .../helpers/fetchWithRetry.ts | 72 ++++++----- .../helpers/metricsInterceptors.ts | 61 +++------ 7 files changed, 170 insertions(+), 203 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b06338a3..b713ac3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Legacy metrics emit unchanged when constructing `LegacyMetricsCollector` directly (the pre-existing pattern). Using `createMetricsCollector()` additionally enables automatic HTTP request timing via OpenAPI interceptors for both legacy and canonical modes; no other action required for existing deployments. - `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working. -- `http_api_client_request` timing is now recorded automatically via OpenAPI client request/response interceptors when a metrics collector is active (via `createMetricsCollector()` or `setHttpMetricsObserver`). Previously, `recordApiRequestTime` existed but was not wired into the HTTP pipeline -- [details](METRICS.md#implementation-notes). +- `http_api_client_request` timing is now recorded automatically by `wrapFetchWithRetry` when a metrics collector is active (via `createMetricsCollector()` or `setHttpMetricsObserver`), covering both successful responses and network-error fallback paths. A lightweight request interceptor captures OpenAPI path templates so the canonical `uri` label uses bounded-cardinality templates in all cases. Previously, `recordApiRequestTime` existed but was not wired into the HTTP pipeline -- [details](METRICS.md#implementation-notes). - Added optional `durationMs` field to `TaskUpdateFailure` event, recording the duration of the last update attempt. Declared optional so existing event listener implementations are unaffected. ### Deprecated diff --git a/METRICS.md b/METRICS.md index d8def06a..2c3058bb 100644 --- a/METRICS.md +++ b/METRICS.md @@ -338,9 +338,9 @@ sum(rate(task_execute_time_seconds_count[5m])) by (taskType) ### Missing HTTP or Workflow Metrics - `http_api_client_request_seconds` (canonical) or - `conductor_worker_http_api_client_request` (legacy) is recorded by request / - response interceptors on the OpenAPI client, with a network-error fallback in - `fetchWithRetry`. Verify the collector is constructed before HTTP calls begin. + `conductor_worker_http_api_client_request` (legacy) is recorded by the + `wrapFetchWithRetry` wrapper. Verify the collector is constructed before HTTP + calls begin. - `workflow_input_size_bytes` and `workflow_start_error_total` are recorded in `WorkflowExecutor`. Verify the collector is active before starting workflows. @@ -444,23 +444,19 @@ appropriate implementation. ### HTTP request timing via interceptors -`createMetricsInterceptors()` (`src/sdk/createConductorClient/helpers/metricsInterceptors.ts`) -returns a matched pair of request/response interceptors registered on the -OpenAPI generated client: - -- **Request interceptor** stashes `performance.now()` on the `opts` object - only when a metrics observer is active (no mutation otherwise). -- **Response interceptor** computes the request duration, extracts the - interpolated URI from `request.url`, extracts the path template from - `opts.url` (stripping the `/api/` prefix baked in by the OpenAPI spec), and - calls `observer.recordApiRequestTime(method, uri, status, durationMs, metricUri)`. - -Both interceptors receive the same `opts` reference (confirmed in -`client.gen.ts` lines 89-102), so no `WeakMap` or side-channel is needed. +A **request interceptor** (`createMetricsInterceptors()` in +`src/sdk/createConductorClient/helpers/metricsInterceptors.ts`) stashes the +OpenAPI path template (from `opts.url`) in a `WeakMap` keyed on the `Request` +object. The template is stripped of the `/api/` prefix baked in by the OpenAPI +spec so the canonical `uri` label matches the cross-SDK convention (e.g. +`/workflow/{workflowId}`). -`wrapFetchWithRetry` retains a network-error fallback that records `status: "0"` -when `fetch` throws before any HTTP response (the response interceptor never -runs in that case). +`wrapFetchWithRetry` reads the template from the `WeakMap` before calling +`retryFetch` and records HTTP metrics for both successful responses and +network errors (status `"0"`). All timing and metric recording is centralised +in the fetch wrapper; the interceptor's only job is to bridge the path +template from `opts` (available to interceptors) to the fetch layer (which +only receives the `Request` object). ### Canonical vs. legacy URI label @@ -471,7 +467,7 @@ byte-for-byte. The JavaScript SDK's OpenAPI spec bakes `/api/` into every path template (`opts.url = "/api/workflow/{workflowId}"`). Other SDKs keep `/api` in the base -URL and use clean paths. The response interceptor strips this prefix so the +URL and use clean paths. The request interceptor strips this prefix so the canonical `uri` label matches the cross-SDK convention (e.g. `/workflow/{workflowId}`). diff --git a/src/sdk/createConductorClient/createConductorClient.ts b/src/sdk/createConductorClient/createConductorClient.ts index e080fd93..e6815811 100644 --- a/src/sdk/createConductorClient/createConductorClient.ts +++ b/src/sdk/createConductorClient/createConductorClient.ts @@ -58,9 +58,8 @@ export const createConductorClient = async ( throwOnError: true, }); - const { onRequest, onResponse } = createMetricsInterceptors(); + const { onRequest } = createMetricsInterceptors(); openApiClient.interceptors.request.use(onRequest); - openApiClient.interceptors.response.use(onResponse); let authResult: Awaited> | undefined; if (keyId && keySecret) { diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts index bfed241e..1799cada 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts @@ -1,6 +1,7 @@ import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals"; import { retryFetch, wrapFetchWithRetry, applyTimeout } from "../fetchWithRetry"; import * as httpObserver from "@/sdk/worker/metrics/httpObserver"; +import { requestTemplateMap } from "../metricsInterceptors"; const createMockResponse = (status: number, body = ""): Response => new Response(body, { status, statusText: `Status ${status}` }); @@ -660,19 +661,16 @@ describe("fetchWithRetry", () => { }); // ─── wrapFetchWithRetry metrics recording ─────────────────────────── - // - // After the interceptor migration, wrapFetchWithRetry only records - // metrics on network errors (status "0") — the response interceptor - // handles all successful/HTTP-error cases. describe("wrapFetchWithRetry metrics", () => { const mockRecordApiRequestTime = jest.fn< - (m: string, u: string, s: string, d: number) => void + (m: string, u: string, s: string, d: number, t?: string) => void >(); beforeEach(() => { mockRecordApiRequestTime.mockClear(); httpObserver.setHttpMetricsObserver({ + measurePayloadSize: false, recordApiRequestTime: mockRecordApiRequestTime, recordWorkflowInputSize: jest.fn(), recordWorkflowStartError: jest.fn(), @@ -683,13 +681,19 @@ describe("fetchWithRetry", () => { httpObserver.setHttpMetricsObserver(undefined); }); - it("should NOT record metrics on successful response (interceptors handle it)", async () => { + it("should record metrics on successful response", async () => { mockFetch.mockResolvedValue(createMockResponse(200)); const wrappedFetch = wrapFetchWithRetry(mockFetch); await wrappedFetch("http://test.com/api/tasks", { method: "POST" }); - expect(mockRecordApiRequestTime).not.toHaveBeenCalled(); + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri, status, duration] = + mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("POST"); + expect(uri).toBe("/api/tasks"); + expect(status).toBe("200"); + expect(duration).toBeGreaterThanOrEqual(0); }); it("should record status '0' on network failure", async () => { @@ -730,6 +734,57 @@ describe("fetchWithRetry", () => { expect(status).toBe("0"); }); + it("should pass template from requestTemplateMap on success", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const request = new Request("http://host/api/workflow/abc-123", { + method: "GET", + }); + requestTemplateMap.set(request, "/workflow/{workflowId}"); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch(request); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [, , , , template] = mockRecordApiRequestTime.mock.calls[0]; + expect(template).toBe("/workflow/{workflowId}"); + }); + + it("should pass template from requestTemplateMap on network failure", async () => { + mockFetch.mockRejectedValue(new Error("ECONNRESET")); + + const request = new Request("http://host/api/workflow/abc-123", { + method: "GET", + }); + requestTemplateMap.set(request, "/workflow/{workflowId}"); + + const wrappedFetch = wrapFetchWithRetry(mockFetch, { + maxTransportRetries: 0, + }); + + await expect(wrappedFetch(request)).rejects.toThrow("ECONNRESET"); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [, , , , template] = mockRecordApiRequestTime.mock.calls[0]; + expect(template).toBe("/workflow/{workflowId}"); + }); + + it("should pass undefined template when requestTemplateMap has no entry", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const request = new Request("http://host/api/workflow/abc-123", { + method: "GET", + }); + // Do NOT set requestTemplateMap + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch(request); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [, , , , template] = mockRecordApiRequestTime.mock.calls[0]; + expect(template).toBeUndefined(); + }); + it("should not break fetch when no observer is registered", async () => { httpObserver.setHttpMetricsObserver(undefined); @@ -739,5 +794,15 @@ describe("fetchWithRetry", () => { expect(result.status).toBe(200); }); + + it("should not record metrics when no observer is registered", async () => { + httpObserver.setHttpMetricsObserver(undefined); + + mockFetch.mockResolvedValue(createMockResponse(200)); + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch("http://test.com"); + + expect(mockRecordApiRequestTime).not.toHaveBeenCalled(); + }); }); }); diff --git a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts index 77800cb3..421bb726 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts @@ -1,31 +1,16 @@ -import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals"; -import { createMetricsInterceptors } from "../metricsInterceptors"; -import * as httpObserver from "@/sdk/worker/metrics/httpObserver"; +import { expect, describe, it } from "@jest/globals"; +import { + createMetricsInterceptors, + requestTemplateMap, +} from "../metricsInterceptors"; import type { ResolvedRequestOptions } from "@/open-api/generated/client/types.gen"; -const mockRecordApiRequestTime = jest.fn< - (m: string, u: string, s: string | number, d: number, mu?: string) => void ->(); - const fakeOpts = (url: string): ResolvedRequestOptions => ({ url } as unknown as ResolvedRequestOptions); describe("metricsInterceptors", () => { - beforeEach(() => { - mockRecordApiRequestTime.mockClear(); - httpObserver.setHttpMetricsObserver({ - recordApiRequestTime: mockRecordApiRequestTime, - recordWorkflowInputSize: jest.fn(), - recordWorkflowStartError: jest.fn(), - }); - }); - - afterEach(() => { - httpObserver.setHttpMetricsObserver(undefined); - }); - - it("should record method, interpolated uri, status, duration, and template metricUri", () => { - const { onRequest, onResponse } = createMetricsInterceptors(); + it("should stash stripped template in requestTemplateMap", () => { + const { onRequest } = createMetricsInterceptors(); const request = new Request("http://conductor.example.com/api/workflow/abc-123", { method: "GET", @@ -33,21 +18,12 @@ describe("metricsInterceptors", () => { const opts = fakeOpts("/api/workflow/{workflowId}"); onRequest(request, opts); - const response = new Response(null, { status: 200 }); - onResponse(response, request, opts); - - expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); - const [method, uri, status, duration, metricUri] = - mockRecordApiRequestTime.mock.calls[0]; - expect(method).toBe("GET"); - expect(uri).toBe("/api/workflow/abc-123"); - expect(status).toBe("200"); - expect(duration).toBeGreaterThanOrEqual(0); - expect(metricUri).toBe("/workflow/{workflowId}"); + + expect(requestTemplateMap.get(request)).toBe("/workflow/{workflowId}"); }); it("should strip /api prefix from path template", () => { - const { onRequest, onResponse } = createMetricsInterceptors(); + const { onRequest } = createMetricsInterceptors(); const request = new Request("http://host/api/tasks/poll/myTask", { method: "POST", @@ -55,93 +31,39 @@ describe("metricsInterceptors", () => { const opts = fakeOpts("/api/tasks/poll/{taskType}"); onRequest(request, opts); - onResponse(new Response(null, { status: 200 }), request, opts); - const metricUri = mockRecordApiRequestTime.mock.calls[0][4]; - expect(metricUri).toBe("/tasks/poll/{taskType}"); + expect(requestTemplateMap.get(request)).toBe("/tasks/poll/{taskType}"); }); it("should not strip prefix if path does not start with /api/", () => { - const { onRequest, onResponse } = createMetricsInterceptors(); + const { onRequest } = createMetricsInterceptors(); const request = new Request("http://host/health", { method: "GET" }); const opts = fakeOpts("/health"); onRequest(request, opts); - onResponse(new Response(null, { status: 200 }), request, opts); - - const metricUri = mockRecordApiRequestTime.mock.calls[0][4]; - expect(metricUri).toBe("/health"); - }); - - it("should compute duration from request interceptor start time", () => { - const { onRequest, onResponse } = createMetricsInterceptors(); - - const request = new Request("http://host/api/workflow", { method: "GET" }); - const opts = fakeOpts("/api/workflow"); - - onRequest(request, opts); - // Simulate a small delay so duration > 0 - const busyWaitUntil = performance.now() + 2; - while (performance.now() < busyWaitUntil) { /* spin */ } - onResponse(new Response(null, { status: 200 }), request, opts); - - const duration = mockRecordApiRequestTime.mock.calls[0][3] as number; - expect(duration).toBeGreaterThan(0); - }); - - it("should gracefully handle missing start time (duration defaults to 0)", () => { - const { onResponse } = createMetricsInterceptors(); - - const request = new Request("http://host/api/workflow", { method: "GET" }); - const opts = fakeOpts("/api/workflow"); - - onResponse(new Response(null, { status: 200 }), request, opts); - const duration = mockRecordApiRequestTime.mock.calls[0][3] as number; - expect(duration).toBe(0); + expect(requestTemplateMap.get(request)).toBe("/health"); }); - it("should do nothing when no observer is registered", () => { - httpObserver.setHttpMetricsObserver(undefined); - - const { onRequest, onResponse } = createMetricsInterceptors(); + it("should return the request unchanged", () => { + const { onRequest } = createMetricsInterceptors(); const request = new Request("http://host/api/workflow", { method: "GET" }); const opts = fakeOpts("/api/workflow"); - onRequest(request, opts); - const response = new Response(null, { status: 200 }); - const result = onResponse(response, request, opts); - - expect(result).toBe(response); - expect(mockRecordApiRequestTime).not.toHaveBeenCalled(); + const returned = onRequest(request, opts); + expect(returned).toBe(request); }); - it("should return request and response unchanged", () => { - const { onRequest, onResponse } = createMetricsInterceptors(); + it("should not stash when opts.url is not a string", () => { + const { onRequest } = createMetricsInterceptors(); const request = new Request("http://host/api/workflow", { method: "GET" }); - const opts = fakeOpts("/api/workflow"); - - const returnedRequest = onRequest(request, opts); - expect(returnedRequest).toBe(request); - - const response = new Response("body", { status: 201 }); - const returnedResponse = onResponse(response, request, opts); - expect(returnedResponse).toBe(response); - }); - - it("should record correct status for error responses", () => { - const { onRequest, onResponse } = createMetricsInterceptors(); - - const request = new Request("http://host/api/workflow", { method: "POST" }); - const opts = fakeOpts("/api/workflow"); + const opts = {} as unknown as ResolvedRequestOptions; onRequest(request, opts); - onResponse(new Response(null, { status: 500 }), request, opts); - const status = mockRecordApiRequestTime.mock.calls[0][2]; - expect(status).toBe("500"); + expect(requestTemplateMap.has(request)).toBe(false); }); }); diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts index 494930b8..d56776d3 100644 --- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts +++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts @@ -1,4 +1,5 @@ import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; +import { requestTemplateMap } from "./metricsInterceptors"; type Input = Parameters[0]; type Init = Parameters[1]; @@ -207,46 +208,53 @@ export const retryFetch = async ( throw lastError ?? new Error("Fetch retry exhausted"); }; +const extractRequestInfo = (input: Input, init?: Init) => { + const method = input instanceof Request + ? input.method + : (init?.method ?? "GET"); + let uri = ""; + try { + uri = new URL( + input instanceof Request ? input.url : String(input), + ).pathname; + } catch { + uri = String(input); + } + return { method, uri }; +}; + export const wrapFetchWithRetry = ( fetchFn: typeof fetch, options?: RetryFetchOptions, ): typeof fetch => { - return async (input: Input, init?: Init): Promise => { - if (!getHttpMetricsObserver()) { + return (input: Input, init?: Init): Promise => { + const observer = getHttpMetricsObserver(); + if (!observer) { return retryFetch(input, init, fetchFn, options); } const start = performance.now(); - try { - return await retryFetch(input, init, fetchFn, options); - } catch (error) { - // Network-error fallback: the response interceptor never runs when - // fetch throws, so record a status="0" metric here as a safety net. - let method = "GET"; - let uri = ""; - try { - if (input instanceof Request) { - method = input.method; - uri = new URL(input.url).pathname; - } else if (typeof input === "string") { - method = init?.method ?? "GET"; - try { uri = new URL(input).pathname; } catch { uri = input; } - } else { - method = init?.method ?? "GET"; - try { uri = input.pathname; } catch { uri = String(input); } - } - } catch { - // Best-effort URI extraction - } + const template = input instanceof Request + ? requestTemplateMap.get(input) + : undefined; - const durationMs = performance.now() - start; - getHttpMetricsObserver()?.recordApiRequestTime( - method, - uri, - "0", - durationMs, - ); - throw error; - } + return retryFetch(input, init, fetchFn, options).then( + (response) => { + const durationMs = performance.now() - start; + const { method, uri } = extractRequestInfo(input, init); + observer.recordApiRequestTime( + method, uri, String(response.status), durationMs, template, + ); + return response; + }, + (error) => { + const durationMs = performance.now() - start; + const { method, uri } = extractRequestInfo(input, init); + observer.recordApiRequestTime( + method, uri, "0", durationMs, template, + ); + throw error; + }, + ); }; }; diff --git a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts index 1e9a2d81..c667ac31 100644 --- a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts +++ b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts @@ -1,5 +1,4 @@ import type { ResolvedRequestOptions } from "../../../open-api/generated/client/types.gen"; -import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; /** * Strips the /api prefix that the OpenAPI spec bakes into every path. @@ -12,58 +11,36 @@ import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; const stripApiPrefix = (url: string): string => url.startsWith("/api/") ? url.slice(4) : url; -type OptsWithMetrics = ResolvedRequestOptions & { _metricsStart?: number }; - /** - * Creates a matched pair of request/response interceptors that record - * http_api_client_request_seconds via the global HttpMetricsObserver. + * Maps each Request to its OpenAPI path template. The request interceptor + * stashes the template here so the fetch wrapper (which only receives the + * Request object) can pass bounded-cardinality URI labels to the metrics + * observer for both success and error paths. * - * Both interceptors receive the same `opts` object for a given request - * (see client.gen.ts lines 89-102), so start time is stashed directly - * on opts — no WeakMap or side-channel needed. + * Entries are garbage-collected with the Request. + */ +export const requestTemplateMap = new WeakMap(); + +/** + * Creates a request interceptor that captures the OpenAPI path template + * for each request. The template is stored in {@link requestTemplateMap} + * and read by `wrapFetchWithRetry` when recording HTTP metrics. * - * The response interceptor passes both the interpolated URI (for legacy - * metrics) and the bounded-cardinality path template (for canonical - * metrics) to the observer, letting each collector choose which to use. + * Timing and metric recording are handled entirely in the fetch wrapper; + * the interceptor's only job is to bridge the path template from `opts` + * (available to interceptors) to the fetch layer (which only sees the + * Request object). */ export function createMetricsInterceptors() { const onRequest = ( request: Request, opts: ResolvedRequestOptions, ): Request => { - if (getHttpMetricsObserver()) { - (opts as OptsWithMetrics)._metricsStart = performance.now(); + if (typeof opts.url === "string") { + requestTemplateMap.set(request, stripApiPrefix(opts.url)); } return request; }; - const onResponse = ( - response: Response, - request: Request, - opts: ResolvedRequestOptions, - ): Response => { - const observer = getHttpMetricsObserver(); - if (!observer) return response; - - const start = (opts as OptsWithMetrics)._metricsStart; - const durationMs = start != null ? performance.now() - start : 0; - - const method = request.method; - const status = String(response.status); - - let uri = ""; - try { - uri = new URL(request.url).pathname; - } catch { - uri = String(request.url); - } - - const template = typeof opts.url === "string" ? stripApiPrefix(opts.url) : uri; - - observer.recordApiRequestTime(method, uri, status, durationMs, template); - - return response; - }; - - return { onRequest, onResponse }; + return { onRequest }; } From a20cc3f0c2e8fba1c1026c0dafb468cf92855d4d Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Mon, 18 May 2026 15:02:53 -0600 Subject: [PATCH 19/21] fix for label defect in canonical metrics --- METRICS.md | 10 ++++--- harness/workflowGovernor.ts | 1 + src/sdk/clients/worker/TaskRunner.ts | 2 +- .../__tests__/metricsInterceptors.test.ts | 26 +++++++++++++++++++ .../helpers/metricsInterceptors.ts | 20 +++++++++++++- 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/METRICS.md b/METRICS.md index 2c3058bb..813753fb 100644 --- a/METRICS.md +++ b/METRICS.md @@ -391,10 +391,12 @@ process.on("unhandledRejection", (reason) => { ### One Collector Per Process -Only one metrics collector can be active at a time. Both -`LegacyMetricsCollector` and `CanonicalMetricsCollector` register themselves -as the global HTTP metrics observer on construction. Creating a second -collector replaces the first. Calling `stop()` clears the global observer. +Only one metrics collector can be active at a time. +`CanonicalMetricsCollector` registers itself as the global HTTP metrics +observer on construction. `LegacyMetricsCollector` does not self-register; +use `createMetricsCollector()` or call `setHttpMetricsObserver()` explicitly +to enable HTTP metrics in legacy mode. Creating a second collector replaces +the first. Calling `stop()` clears the global observer. ### Payload Size Measurement diff --git a/harness/workflowGovernor.ts b/harness/workflowGovernor.ts index 4ccd0d33..722e61f9 100644 --- a/harness/workflowGovernor.ts +++ b/harness/workflowGovernor.ts @@ -49,6 +49,7 @@ export class WorkflowGovernor { this.workflowExecutor.startWorkflow({ name: this.workflowName, version: 1, + input: { startedAt: Date.now() }, }), ); } diff --git a/src/sdk/clients/worker/TaskRunner.ts b/src/sdk/clients/worker/TaskRunner.ts index 5be29c83..26ffab4e 100644 --- a/src/sdk/clients/worker/TaskRunner.ts +++ b/src/sdk/clients/worker/TaskRunner.ts @@ -509,7 +509,7 @@ export class TaskRunner { // Calculate output size if possible const outputSizeBytes = merged.outputData - ? JSON.stringify(merged.outputData).length + ? Buffer.byteLength(JSON.stringify(merged.outputData), "utf8") : undefined; // Untrack immediately — execution done, update will follow diff --git a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts index 421bb726..24bfcdd8 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts @@ -56,6 +56,32 @@ describe("metricsInterceptors", () => { expect(returned).toBe(request); }); + it("should normalize lowercase {tasktype} to {taskType}", () => { + const { onRequest } = createMetricsInterceptors(); + + const request = new Request("http://host/api/tasks/poll/batch/my_task", { + method: "GET", + }); + const opts = fakeOpts("/api/tasks/poll/batch/{tasktype}"); + + onRequest(request, opts); + + expect(requestTemplateMap.get(request)).toBe("/tasks/poll/batch/{taskType}"); + }); + + it("should leave already-camelCase placeholders unchanged", () => { + const { onRequest } = createMetricsInterceptors(); + + const request = new Request("http://host/api/workflow/abc-123", { + method: "GET", + }); + const opts = fakeOpts("/api/workflow/{workflowId}"); + + onRequest(request, opts); + + expect(requestTemplateMap.get(request)).toBe("/workflow/{workflowId}"); + }); + it("should not stash when opts.url is not a string", () => { const { onRequest } = createMetricsInterceptors(); diff --git a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts index c667ac31..146bacdf 100644 --- a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts +++ b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts @@ -11,6 +11,21 @@ import type { ResolvedRequestOptions } from "../../../open-api/generated/client/ const stripApiPrefix = (url: string): string => url.startsWith("/api/") ? url.slice(4) : url; +/** + * The OpenAPI spec uses all-lowercase path parameter names in a few + * templates (e.g. `{tasktype}` instead of `{taskType}`). This map + * normalizes them to camelCase so the canonical `uri` metric label + * is consistent across all endpoints. + */ +const CANONICAL_PARAM_NAMES: Record = { + tasktype: "taskType", +}; + +const normalizePlaceholders = (url: string): string => + url.replace(/\{(\w+)\}/g, (_m, name: string) => + `{${CANONICAL_PARAM_NAMES[name] ?? name}}`, + ); + /** * Maps each Request to its OpenAPI path template. The request interceptor * stashes the template here so the fetch wrapper (which only receives the @@ -37,7 +52,10 @@ export function createMetricsInterceptors() { opts: ResolvedRequestOptions, ): Request => { if (typeof opts.url === "string") { - requestTemplateMap.set(request, stripApiPrefix(opts.url)); + requestTemplateMap.set( + request, + normalizePlaceholders(stripApiPrefix(opts.url)), + ); } return request; }; From f42bec9f8ee1243f631f9e0dcca2633779c58a4b Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 19 May 2026 09:17:56 -0600 Subject: [PATCH 20/21] give 1 flaky integration test more time and log it's progress --- src/integration-tests/E2EFiveTaskWorkflow.test.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/integration-tests/E2EFiveTaskWorkflow.test.ts b/src/integration-tests/E2EFiveTaskWorkflow.test.ts index 66aed967..1d6fc9af 100644 --- a/src/integration-tests/E2EFiveTaskWorkflow.test.ts +++ b/src/integration-tests/E2EFiveTaskWorkflow.test.ts @@ -120,10 +120,17 @@ describeForOrkesV5("E2E: 5-task workflow × 50 executions", () => { expect(workflowIds.length).toBe(WORKFLOW_COUNT); - // Wait for all 50 to complete (600s in CI-friendly timeout, poll every 2s) + // Wait for all 50 to complete (600s per workflow, poll every 2s) + let completed = 0; const results = await Promise.all( workflowIds.map((id) => - waitForWorkflowStatus(executor, id, "COMPLETED", 600_000, 2000) + waitForWorkflowStatus(executor, id, "COMPLETED", 600_000, 2000).then( + (wf) => { + completed++; + console.log(`[E2E] workflow ${completed}/${WORKFLOW_COUNT} completed (${id})`); + return wf; + } + ) ) ); @@ -168,7 +175,6 @@ describeForOrkesV5("E2E: 5-task workflow × 50 executions", () => { workflows: [{ name: workflowName, version: 1 }], tasks: taskNames, }); - }, - 330_000 // 5.5 minute timeout for the entire test + } ); }); From 01ca97fa7ef8aeed27fe8f6d03d4ef581950d086 Mon Sep 17 00:00:00 2001 From: Chris Hagglund Date: Tue, 19 May 2026 12:41:21 -0600 Subject: [PATCH 21/21] test out pre-existing prom-client renderer --- harness/main.ts | 6 ++++-- src/sdk/worker/metrics/CanonicalMetricsCollector.ts | 4 +--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/harness/main.ts b/harness/main.ts index 182b8d50..6ed6be8c 100644 --- a/harness/main.ts +++ b/harness/main.ts @@ -94,8 +94,10 @@ async function main(): Promise { }); const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991); - const metricsCollector = createMetricsCollector({ httpPort: metricsPort }); - console.log(`Prometheus metrics server started on port ${metricsPort} (${metricsCollector.collectorName()} metrics)`); + const usePromClient = process.env.HARNESS_USE_PROM_CLIENT === "true"; + const metricsCollector = createMetricsCollector({ httpPort: metricsPort, usePromClient }); + const backend = usePromClient ? "prom-client" : "built-in"; + console.log(`Prometheus metrics server started on port ${metricsPort} (${metricsCollector.collectorName()} metrics, ${backend} backend)`); const handler = new TaskHandler({ client, diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts index 212cc8b8..c5df73c0 100644 --- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts +++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts @@ -64,14 +64,12 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface { private _server?: import("./MetricsServer.js").MetricsServer; private _fileTimer?: ReturnType; private _promRegistry?: import("./CanonicalPrometheusRegistry.js").CanonicalPrometheusRegistry; - private readonly _usePromClient: boolean; readonly measurePayloadSize: boolean; constructor(config?: MetricsCollectorConfig) { this.state = this.createEmpty(); - this._usePromClient = config?.usePromClient ?? false; this.measurePayloadSize = config?.measurePayloadSize ?? true; - if (this._usePromClient) { + if (config?.usePromClient) { void this.initPromClient(); } if (config?.httpPort) {