From a3b070318b0db2efa70fde7673d0c259c93e277c Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Wed, 29 Apr 2026 10:24:36 -0600
Subject: [PATCH 01/21] implement canonical metrics, feature flagged to
 preserve legacy where legacy ones were released

---
 harness/main.ts                               |   9 +-
 harness/manifests/deployment.yaml             |   2 +
 src/sdk/clients/worker/Poller.ts              |   1 +
 src/sdk/clients/worker/TaskRunner.ts          |   6 +
 .../clients/worker/events/EventDispatcher.ts  |  13 +
 src/sdk/clients/worker/events/types.ts        |  10 +-
 src/sdk/clients/worker/types.ts               |   2 +
 src/sdk/clients/workflow/WorkflowExecutor.ts  |  28 +
 .../helpers/fetchWithRetry.ts                 |  58 +-
 src/sdk/worker/core/TaskHandler.ts            |  49 ++
 src/sdk/worker/index.ts                       |   4 +
 .../metrics/CanonicalMetricsCollector.ts      | 510 ++++++++++++++++++
 .../metrics/CanonicalPrometheusRegistry.ts    | 168 ++++++
 ...Collector.ts => LegacyMetricsCollector.ts} | 121 ++---
 .../metrics/MetricsCollectorInterface.ts      |  48 ++
 src/sdk/worker/metrics/MetricsServer.ts       |   6 +-
 .../CanonicalMetricsCollector.test.ts         | 321 +++++++++++
 .../MetricsCollector.prometheus.test.ts       |  12 +-
 .../__tests__/MetricsCollector.test.ts        |   8 +-
 .../metrics/__tests__/MetricsServer.test.ts   |  14 +-
 .../metrics/__tests__/metricsFactory.test.ts  |  99 ++++
 src/sdk/worker/metrics/accumulators.ts        | 177 ++++++
 src/sdk/worker/metrics/httpObserver.ts        |  40 ++
 src/sdk/worker/metrics/index.ts               |  34 +-
 src/sdk/worker/metrics/metricsFactory.ts      |  26 +
 25 files changed, 1664 insertions(+), 102 deletions(-)
 create mode 100644 src/sdk/worker/metrics/CanonicalMetricsCollector.ts
 create mode 100644 src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts
 rename src/sdk/worker/metrics/{MetricsCollector.ts => LegacyMetricsCollector.ts} (85%)
 create mode 100644 src/sdk/worker/metrics/MetricsCollectorInterface.ts
 create mode 100644 src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
 create mode 100644 src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
 create mode 100644 src/sdk/worker/metrics/accumulators.ts
 create mode 100644 src/sdk/worker/metrics/httpObserver.ts
 create mode 100644 src/sdk/worker/metrics/metricsFactory.ts
diff --git a/harness/main.ts b/harness/main.ts
index 3240ce1a..b7aadeed 100644
--- a/harness/main.ts
+++ b/harness/main.ts
@@ -3,7 +3,7 @@ import {
   ConductorWorkflow,
   TaskHandler,
   simpleTask,
-  MetricsCollector,
+  createMetricsCollector,
 } from "../src/sdk";
 import { MetadataResource } from "../src/open-api/generated";
 import type { ConductorWorker } from "../src/sdk/clients/worker/types";
@@ -93,8 +93,11 @@ async function main(): Promise<void> {
   });
 
   const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991);
-  const metricsCollector = new MetricsCollector({ httpPort: metricsPort });
-  console.log(`Prometheus metrics server started on port ${metricsPort}`);
+  const metricsCollector = createMetricsCollector({ httpPort: metricsPort });
+  const variant = process.env.WORKER_CANONICAL_METRICS?.toLowerCase() === "true"
+    ? "canonical"
+    : "legacy";
+  console.log(`Prometheus metrics server started on port ${metricsPort} (${variant} metrics)`);
 
   const handler = new TaskHandler({
     client,
diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml
index 878d6728..95218587 100644
--- a/harness/manifests/deployment.yaml
+++ b/harness/manifests/deployment.yaml
@@ -40,6 +40,8 @@ spec:
           value: "20"
         - name: HARNESS_POLL_INTERVAL_MS
           value: "100"
+        - name: WORKER_CANONICAL_METRICS
+          value: "true"
         ports:
         - name: metrics
           containerPort: 9991
diff --git a/src/sdk/clients/worker/Poller.ts b/src/sdk/clients/worker/Poller.ts
index a7fb1787..414552ac 100644
--- a/src/sdk/clients/worker/Poller.ts
+++ b/src/sdk/clients/worker/Poller.ts
@@ -189,6 +189,7 @@ export class Poller<T> {
           this.logger.debug(
             `Worker ${this._pollerId} is paused, skipping poll`
           );
+          this.options.onPaused?.();
           await this.sleep(
             this.options.pollInterval ?? DEFAULT_POLL_INTERVAL
           );
diff --git a/src/sdk/clients/worker/TaskRunner.ts b/src/sdk/clients/worker/TaskRunner.ts
index a836ad5b..cb637e48 100644
--- a/src/sdk/clients/worker/TaskRunner.ts
+++ b/src/sdk/clients/worker/TaskRunner.ts
@@ -96,6 +96,12 @@ export class TaskRunner {
       {
         concurrency: worker.concurrency ?? this.options.concurrency,
         pollInterval: worker.pollInterval ?? this.options.pollInterval,
+        onPaused: () => {
+          void this.eventDispatcher.publishTaskPaused({
+            taskType: this.worker.taskDefName,
+            timestamp: new Date(),
+          });
+        },
       },
       this.logger
     );
diff --git a/src/sdk/clients/worker/events/EventDispatcher.ts b/src/sdk/clients/worker/events/EventDispatcher.ts
index e6e1557a..16a2cf3c 100644
--- a/src/sdk/clients/worker/events/EventDispatcher.ts
+++ b/src/sdk/clients/worker/events/EventDispatcher.ts
@@ -9,6 +9,7 @@ import type {
   TaskExecutionFailure,
   TaskUpdateCompleted,
   TaskUpdateFailure,
+  TaskPaused,
 } from "./types";
 
 /**
@@ -60,6 +61,11 @@ export interface TaskRunnerEventsListener {
    * This is a CRITICAL event that may require operational intervention.
    */
   onTaskUpdateFailure?(event: TaskUpdateFailure): void | Promise<void>;
+
+  /**
+   * Called when a poll cycle is skipped because the worker is paused.
+   */
+  onTaskPaused?(event: TaskPaused): void | Promise<void>;
 }
 
 /**
@@ -159,6 +165,13 @@ export class EventDispatcher {
     await this.publishEvent("onTaskUpdateFailure", event);
   }
 
+  /**
+   * Publish a TaskPaused event.
+   */
+  async publishTaskPaused(event: TaskPaused): Promise<void> {
+    await this.publishEvent("onTaskPaused", event);
+  }
+
   /**
    * Internal method to publish events to all registered listeners.
    * Listener failures are caught and logged to prevent affecting task execution.
diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts
index 7f12b2bf..e252a25e 100644
--- a/src/sdk/clients/worker/events/types.ts
+++ b/src/sdk/clients/worker/events/types.ts
@@ -134,6 +134,13 @@ export interface TaskUpdateCompleted extends TaskRunnerEvent {
   durationMs: number;
 }
 
+/**
+ * Event published when a poll cycle is skipped because the worker is paused.
+ */
+export interface TaskPaused extends TaskRunnerEvent {
+  // No additional fields — taskType is inherited from TaskRunnerEvent.
+}
+
 /**
  * Union type of all task runner events.
  */
@@ -145,4 +152,5 @@ export type TaskRunnerEventType =
   | TaskExecutionCompleted
   | TaskExecutionFailure
   | TaskUpdateCompleted
-  | TaskUpdateFailure;
+  | TaskUpdateFailure
+  | TaskPaused;
diff --git a/src/sdk/clients/worker/types.ts b/src/sdk/clients/worker/types.ts
index b56eed5c..3aa0db61 100644
--- a/src/sdk/clients/worker/types.ts
+++ b/src/sdk/clients/worker/types.ts
@@ -86,6 +86,8 @@ export interface PollerOptions {
   adaptiveBackoff?: boolean;
   /** Whether this poller is paused (default: false) */
   paused?: boolean;
+  /** Callback invoked each time a poll cycle is skipped because the poller is paused */
+  onPaused?: () => void;
 }
 
 /**
diff --git a/src/sdk/clients/workflow/WorkflowExecutor.ts b/src/sdk/clients/workflow/WorkflowExecutor.ts
index a34188e2..c527050e 100644
--- a/src/sdk/clients/workflow/WorkflowExecutor.ts
+++ b/src/sdk/clients/workflow/WorkflowExecutor.ts
@@ -30,6 +30,7 @@ import { enhanceSignalResponse } from "./helpers/enhanceSignalResponse";
 import { reverseFind } from "./helpers/reverseFind";
 import { isCompletedTaskMatchingType } from "./helpers/isCompletedTaskMatchingType";
 import { RETRY_TIME_IN_MILLISECONDS } from "./constants";
+import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver";
 
 export class WorkflowExecutor {
   public readonly _client: Client;
@@ -71,6 +72,23 @@ export class WorkflowExecutor {
   public async startWorkflow(
     workflowRequest: StartWorkflowRequest
   ): Promise<string> {
+    const observer = getHttpMetricsObserver();
+    if (observer) {
+      try {
+        const inputBytes = workflowRequest.input
+          ? JSON.stringify(workflowRequest.input).length
+          : 0;
+        observer.recordWorkflowInputSize(
+          workflowRequest.name ?? "",
+          inputBytes,
+          workflowRequest.version != null
+            ? String(workflowRequest.version)
+            : undefined,
+        );
+      } catch {
+        // Best-effort — don't let metrics break workflow start.
+      }
+    }
     try {
       const { data } = await WorkflowResource.startWorkflow({
         body: workflowRequest,
@@ -80,6 +98,16 @@ export class WorkflowExecutor {
 
       return data;
     } catch (error: unknown) {
+      if (observer) {
+        const excName =
+          error instanceof Error
+            ? error.name || error.constructor?.name || "Error"
+            : "Error";
+        observer.recordWorkflowStartError(
+          workflowRequest.name ?? "",
+          excName,
+        );
+      }
       handleSdkError(error, "Failed to start workflow");
     }
   }
diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
index 733313cd..1a49d509 100644
--- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
+++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
@@ -186,9 +186,61 @@ export const retryFetch = async (
 
 export const wrapFetchWithRetry = (
   fetchFn: typeof fetch,
-  options?: RetryFetchOptions
+  options?: RetryFetchOptions,
 ): typeof fetch => {
-  return (input: Input, init?: Init): Promise<Response> => {
-    return retryFetch(input, init, fetchFn, options);
+  return async (input: Input, init?: Init): Promise<Response> => {
+    const start = performance.now();
+    let method = "GET";
+    let uri = "";
+
+    try {
+      if (input instanceof Request) {
+        method = input.method;
+        uri = new URL(input.url).pathname;
+      } else if (typeof input === "string") {
+        method = init?.method ?? "GET";
+        try { uri = new URL(input).pathname; } catch { uri = input; }
+      } else {
+        method = init?.method ?? "GET";
+        try { uri = input.pathname; } catch { uri = String(input); }
+      }
+    } catch {
+      // Best-effort URI extraction
+    }
+
+    try {
+      const response = await retryFetch(input, init, fetchFn, options);
+      const durationMs = performance.now() - start;
+      try {
+        const { getHttpMetricsObserver } = await import(
+          "../../worker/metrics/httpObserver.js"
+        );
+        getHttpMetricsObserver()?.recordApiRequestTime(
+          method,
+          uri,
+          String(response.status),
+          durationMs,
+        );
+      } catch {
+        // Metrics recording is best-effort
+      }
+      return response;
+    } catch (error) {
+      const durationMs = performance.now() - start;
+      try {
+        const { getHttpMetricsObserver } = await import(
+          "../../worker/metrics/httpObserver.js"
+        );
+        getHttpMetricsObserver()?.recordApiRequestTime(
+          method,
+          uri,
+          "0",
+          durationMs,
+        );
+      } catch {
+        // Metrics recording is best-effort
+      }
+      throw error;
+    }
   };
 };
diff --git a/src/sdk/worker/core/TaskHandler.ts b/src/sdk/worker/core/TaskHandler.ts
index 2c0dd083..b39a9b94 100644
--- a/src/sdk/worker/core/TaskHandler.ts
+++ b/src/sdk/worker/core/TaskHandler.ts
@@ -7,6 +7,7 @@ import {
   RESTART_BACKOFF_MAX_MS,
 } from "../../clients/worker/constants";
 import type { TaskRunnerEventsListener } from "../../clients/worker/events";
+import type { MetricsCollectorInterface } from "../metrics/MetricsCollectorInterface";
 import { TaskRunner } from "../../clients/worker/TaskRunner";
 import type { ConductorWorker, HealthMonitorConfig } from "../../clients/worker/types";
 import type { ConductorLogger } from "../../helpers/logger";
@@ -140,6 +141,10 @@ export class TaskHandler {
   private restartAttempts = new Map<number, number>(); // runner index → attempt count
   private healthMonitorConfig: HealthMonitorConfig;
 
+  // Uncaught exception handlers (bound so we can remove them)
+  private _uncaughtHandler?: (err: Error) => void;
+  private _unhandledHandler?: (reason: unknown) => void;
+
   /**
    * Create a TaskHandler instance with async module imports.
    * Use this instead of `new TaskHandler()` when using `importModules`.
@@ -323,6 +328,7 @@ export class TaskHandler {
 
     this.isRunning = true;
     this.startHealthMonitor();
+    this.wireUncaughtExceptionHandlers();
     this.logger.info("All workers started successfully");
   }
 
@@ -341,6 +347,7 @@ export class TaskHandler {
     }
 
     this.stopHealthMonitor();
+    this.unwireUncaughtExceptionHandlers();
     this.logger.info(`Stopping ${this.taskRunners.length} worker(s)...`);
 
     const stopPromises = this.taskRunners.map(async (runner, index) => {
@@ -528,6 +535,48 @@ export class TaskHandler {
     }
   }
 
+  // ── Uncaught Exception Handling ─────────────────────────────────
+
+  private findMetricsCollector(): MetricsCollectorInterface | undefined {
+    const listeners = this.config.eventListeners ?? [];
+    return listeners.find(
+      (l): l is MetricsCollectorInterface =>
+        typeof (l as MetricsCollectorInterface).recordUncaughtException ===
+        "function",
+    );
+  }
+
+  private wireUncaughtExceptionHandlers(): void {
+    const collector = this.findMetricsCollector();
+    if (!collector) return;
+
+    this._uncaughtHandler = (err: Error) => {
+      const excName = err.name || err.constructor?.name || "Error";
+      collector.recordUncaughtException(excName);
+    };
+    this._unhandledHandler = (reason: unknown) => {
+      const excName =
+        reason instanceof Error
+          ? reason.name || reason.constructor?.name || "Error"
+          : "Error";
+      collector.recordUncaughtException(excName);
+    };
+
+    process.on("uncaughtException", this._uncaughtHandler);
+    process.on("unhandledRejection", this._unhandledHandler);
+  }
+
+  private unwireUncaughtExceptionHandlers(): void {
+    if (this._uncaughtHandler) {
+      process.removeListener("uncaughtException", this._uncaughtHandler);
+      this._uncaughtHandler = undefined;
+    }
+    if (this._unhandledHandler) {
+      process.removeListener("unhandledRejection", this._unhandledHandler);
+      this._unhandledHandler = undefined;
+    }
+  }
+
   // ── Discovery Summary ───────────────────────────────────────────
 
   /**
diff --git a/src/sdk/worker/index.ts b/src/sdk/worker/index.ts
index 35e1fed9..7605d589 100644
--- a/src/sdk/worker/index.ts
+++ b/src/sdk/worker/index.ts
@@ -17,7 +17,11 @@ export { TaskContext, getTaskContext } from "./context";
 // Metrics
 export {
   MetricsCollector,
+  LegacyMetricsCollector,
+  CanonicalMetricsCollector,
+  createMetricsCollector,
   MetricsServer,
+  type MetricsCollectorInterface,
   type MetricsCollectorConfig,
   type WorkerMetrics,
 } from "./metrics";
diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
new file mode 100644
index 00000000..d12b8e7a
--- /dev/null
+++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
@@ -0,0 +1,510 @@
+import type {
+  PollStarted,
+  PollCompleted,
+  PollFailure,
+  TaskExecutionStarted,
+  TaskExecutionCompleted,
+  TaskExecutionFailure,
+  TaskUpdateCompleted,
+  TaskUpdateFailure,
+  TaskPaused,
+} from "../../clients/worker/events";
+import type { MetricsCollectorInterface } from "./MetricsCollectorInterface";
+import type { MetricsCollectorConfig } from "./LegacyMetricsCollector";
+import { setHttpMetricsObserver } from "./httpObserver";
+import {
+  HistogramAccumulator,
+  MultiLabelCounter,
+  GaugeMetric,
+  TIME_BUCKETS,
+  SIZE_BUCKETS,
+  exceptionLabel,
+} from "./accumulators";
+
+interface CanonicalMetricState {
+  // Counters (12)
+  taskPollTotal: MultiLabelCounter;
+  taskExecutionStartedTotal: MultiLabelCounter;
+  taskPollErrorTotal: MultiLabelCounter;
+  taskExecuteErrorTotal: MultiLabelCounter;
+  taskUpdateErrorTotal: MultiLabelCounter;
+  taskAckErrorTotal: MultiLabelCounter;
+  taskAckFailedTotal: MultiLabelCounter;
+  taskExecutionQueueFullTotal: MultiLabelCounter;
+  taskPausedTotal: MultiLabelCounter;
+  threadUncaughtExceptionsTotal: MultiLabelCounter;
+  externalPayloadUsedTotal: MultiLabelCounter;
+  workflowStartErrorTotal: MultiLabelCounter;
+
+  // Time histograms (4) — seconds
+  taskPollTimeSeconds: HistogramAccumulator;
+  taskExecuteTimeSeconds: HistogramAccumulator;
+  taskUpdateTimeSeconds: HistogramAccumulator;
+  httpApiClientRequestSeconds: HistogramAccumulator;
+
+  // Size histograms (2) — bytes
+  taskResultSizeBytes: HistogramAccumulator;
+  workflowInputSizeBytes: HistogramAccumulator;
+
+  // Gauge (1)
+  activeWorkers: GaugeMetric;
+}
+
+/**
+ * Canonical metrics collector.
+ *
+ * Emits unprefixed Prometheus metrics with camelCase labels, second-based
+ * time units, and Histogram type for distributions, per the cross-SDK
+ * canonical metric catalog.
+ *
+ * Selected when WORKER_CANONICAL_METRICS=true.
+ */
+export class CanonicalMetricsCollector implements MetricsCollectorInterface {
+  private state: CanonicalMetricState;
+  private _server?: import("./MetricsServer.js").MetricsServer;
+  private _fileTimer?: ReturnType<typeof setInterval>;
+  private _promRegistry?: import("./CanonicalPrometheusRegistry.js").CanonicalPrometheusRegistry;
+  private readonly _usePromClient: boolean;
+
+  constructor(config?: MetricsCollectorConfig) {
+    this.state = this.createEmpty();
+    this._usePromClient = config?.usePromClient ?? false;
+    if (this._usePromClient) {
+      void this.initPromClient();
+    }
+    if (config?.httpPort) {
+      void this.startServer(config.httpPort);
+    }
+    if (config?.filePath) {
+      this.startFileWriter(
+        config.filePath,
+        config.fileWriteIntervalMs ?? 5000,
+      );
+    }
+    setHttpMetricsObserver(this);
+  }
+
+  private async initPromClient(): Promise<void> {
+    const { CanonicalPrometheusRegistry } = await import(
+      "./CanonicalPrometheusRegistry.js"
+    );
+    this._promRegistry = new CanonicalPrometheusRegistry();
+    await this._promRegistry.initialize();
+  }
+
+  private async startServer(port: number): Promise<void> {
+    const { MetricsServer } = await import("./MetricsServer.js");
+    this._server = new MetricsServer(this, port);
+    await this._server.start();
+  }
+
+  private startFileWriter(filePath: string, intervalMs: number): void {
+    const doWrite = async () => {
+      try {
+        const { writeFile } = await import("node:fs/promises");
+        await writeFile(filePath, this.toPrometheusText(), "utf-8");
+      } catch {
+        // Silently ignore file write errors
+      }
+    };
+    void doWrite();
+    this._fileTimer = setInterval(doWrite, intervalMs);
+    if (typeof this._fileTimer === "object" && "unref" in this._fileTimer) {
+      this._fileTimer.unref();
+    }
+  }
+
+  private createEmpty(): CanonicalMetricState {
+    return {
+      taskPollTotal: new MultiLabelCounter(),
+      taskExecutionStartedTotal: new MultiLabelCounter(),
+      taskPollErrorTotal: new MultiLabelCounter(),
+      taskExecuteErrorTotal: new MultiLabelCounter(),
+      taskUpdateErrorTotal: new MultiLabelCounter(),
+      taskAckErrorTotal: new MultiLabelCounter(),
+      taskAckFailedTotal: new MultiLabelCounter(),
+      taskExecutionQueueFullTotal: new MultiLabelCounter(),
+      taskPausedTotal: new MultiLabelCounter(),
+      threadUncaughtExceptionsTotal: new MultiLabelCounter(),
+      externalPayloadUsedTotal: new MultiLabelCounter(),
+      workflowStartErrorTotal: new MultiLabelCounter(),
+
+      taskPollTimeSeconds: new HistogramAccumulator(TIME_BUCKETS),
+      taskExecuteTimeSeconds: new HistogramAccumulator(TIME_BUCKETS),
+      taskUpdateTimeSeconds: new HistogramAccumulator(TIME_BUCKETS),
+      httpApiClientRequestSeconds: new HistogramAccumulator(TIME_BUCKETS),
+
+      taskResultSizeBytes: new HistogramAccumulator(SIZE_BUCKETS),
+      workflowInputSizeBytes: new HistogramAccumulator(SIZE_BUCKETS),
+
+      activeWorkers: new GaugeMetric(),
+    };
+  }
+
+  // ── Event Listener Methods ──────────────────────────────────────
+
+  onPollStarted(event: PollStarted): void {
+    this.state.taskPollTotal.increment({ taskType: event.taskType });
+    this._promRegistry?.incrementCounter("task_poll_total", {
+      taskType: event.taskType,
+    });
+  }
+
+  onPollCompleted(event: PollCompleted): void {
+    const seconds = event.durationMs / 1000;
+    this.state.taskPollTimeSeconds.observe(
+      { taskType: event.taskType, status: "SUCCESS" },
+      seconds,
+    );
+    this._promRegistry?.observeHistogram("task_poll_time_seconds", {
+      taskType: event.taskType,
+      status: "SUCCESS",
+    }, seconds);
+  }
+
+  onPollFailure(event: PollFailure): void {
+    const excName = exceptionLabel(event.cause);
+    this.state.taskPollErrorTotal.increment({
+      taskType: event.taskType,
+      exception: excName,
+    });
+    this._promRegistry?.incrementCounter("task_poll_error_total", {
+      taskType: event.taskType,
+      exception: excName,
+    });
+
+    const seconds = event.durationMs / 1000;
+    this.state.taskPollTimeSeconds.observe(
+      { taskType: event.taskType, status: "FAILURE" },
+      seconds,
+    );
+    this._promRegistry?.observeHistogram("task_poll_time_seconds", {
+      taskType: event.taskType,
+      status: "FAILURE",
+    }, seconds);
+  }
+
+  onTaskExecutionStarted(event: TaskExecutionStarted): void {
+    this.state.taskExecutionStartedTotal.increment({
+      taskType: event.taskType,
+    });
+    this._promRegistry?.incrementCounter("task_execution_started_total", {
+      taskType: event.taskType,
+    });
+    this.state.activeWorkers.inc({ taskType: event.taskType });
+    this._promRegistry?.setGauge(
+      "active_workers",
+      { taskType: event.taskType },
+      this.state.activeWorkers.getValue({ taskType: event.taskType }),
+    );
+  }
+
+  onTaskExecutionCompleted(event: TaskExecutionCompleted): void {
+    const seconds = event.durationMs / 1000;
+    this.state.taskExecuteTimeSeconds.observe(
+      { taskType: event.taskType, status: "SUCCESS" },
+      seconds,
+    );
+    this._promRegistry?.observeHistogram("task_execute_time_seconds", {
+      taskType: event.taskType,
+      status: "SUCCESS",
+    }, seconds);
+
+    if (event.outputSizeBytes !== undefined) {
+      this.state.taskResultSizeBytes.observe(
+        { taskType: event.taskType },
+        event.outputSizeBytes,
+      );
+      this._promRegistry?.observeHistogram("task_result_size_bytes", {
+        taskType: event.taskType,
+      }, event.outputSizeBytes);
+    }
+
+    this.state.activeWorkers.dec({ taskType: event.taskType });
+    this._promRegistry?.setGauge(
+      "active_workers",
+      { taskType: event.taskType },
+      Math.max(0, this.state.activeWorkers.getValue({ taskType: event.taskType })),
+    );
+  }
+
+  onTaskExecutionFailure(event: TaskExecutionFailure): void {
+    const excName = exceptionLabel(event.cause);
+    this.state.taskExecuteErrorTotal.increment({
+      taskType: event.taskType,
+      exception: excName,
+    });
+    this._promRegistry?.incrementCounter("task_execute_error_total", {
+      taskType: event.taskType,
+      exception: excName,
+    });
+
+    const seconds = event.durationMs / 1000;
+    this.state.taskExecuteTimeSeconds.observe(
+      { taskType: event.taskType, status: "FAILURE" },
+      seconds,
+    );
+    this._promRegistry?.observeHistogram("task_execute_time_seconds", {
+      taskType: event.taskType,
+      status: "FAILURE",
+    }, seconds);
+
+    this.state.activeWorkers.dec({ taskType: event.taskType });
+    this._promRegistry?.setGauge(
+      "active_workers",
+      { taskType: event.taskType },
+      Math.max(0, this.state.activeWorkers.getValue({ taskType: event.taskType })),
+    );
+  }
+
+  onTaskUpdateCompleted(event: TaskUpdateCompleted): void {
+    const seconds = event.durationMs / 1000;
+    this.state.taskUpdateTimeSeconds.observe(
+      { taskType: event.taskType, status: "SUCCESS" },
+      seconds,
+    );
+    this._promRegistry?.observeHistogram("task_update_time_seconds", {
+      taskType: event.taskType,
+      status: "SUCCESS",
+    }, seconds);
+  }
+
+  onTaskUpdateFailure(event: TaskUpdateFailure): void {
+    const excName = exceptionLabel(event.cause);
+    this.state.taskUpdateErrorTotal.increment({
+      taskType: event.taskType,
+      exception: excName,
+    });
+    this._promRegistry?.incrementCounter("task_update_error_total", {
+      taskType: event.taskType,
+      exception: excName,
+    });
+  }
+
+  onTaskPaused(event: TaskPaused): void {
+    this.recordTaskPaused(event.taskType);
+  }
+
+  // ── Direct Recording Methods ───────────────────────────────────
+
+  recordTaskExecutionQueueFull(taskType: string): void {
+    this.state.taskExecutionQueueFullTotal.increment({ taskType });
+    this._promRegistry?.incrementCounter("task_execution_queue_full_total", {
+      taskType,
+    });
+  }
+
+  recordUncaughtException(exception?: string): void {
+    const excName = exception ?? "Error";
+    this.state.threadUncaughtExceptionsTotal.increment({
+      exception: excName,
+    });
+    this._promRegistry?.incrementCounter(
+      "thread_uncaught_exceptions_total",
+      { exception: excName },
+    );
+  }
+
+  recordWorkerRestart(): void {
+    // Noop: worker_restart_total is N/A for the JS SDK (single-process model)
+  }
+
+  recordTaskPaused(taskType: string): void {
+    this.state.taskPausedTotal.increment({ taskType });
+    this._promRegistry?.incrementCounter("task_paused_total", { taskType });
+  }
+
+  recordTaskAckError(taskType: string, exception?: string): void {
+    const excName = exception ?? "Error";
+    this.state.taskAckErrorTotal.increment({ taskType, exception: excName });
+    this._promRegistry?.incrementCounter("task_ack_error_total", {
+      taskType,
+      exception: excName,
+    });
+  }
+
+  recordTaskAckFailed(taskType: string): void {
+    this.state.taskAckFailedTotal.increment({ taskType });
+    this._promRegistry?.incrementCounter("task_ack_failed_total", {
+      taskType,
+    });
+  }
+
+  recordWorkflowStartError(workflowType?: string, exception?: string): void {
+    const wfType = workflowType ?? "";
+    const excName = exception ?? "Error";
+    this.state.workflowStartErrorTotal.increment({
+      workflowType: wfType,
+      exception: excName,
+    });
+    this._promRegistry?.incrementCounter("workflow_start_error_total", {
+      workflowType: wfType,
+      exception: excName,
+    });
+  }
+
+  recordExternalPayloadUsed(
+    payloadType: string,
+    entityName?: string,
+    operation?: string,
+  ): void {
+    this.state.externalPayloadUsedTotal.increment({
+      entityName: entityName ?? "",
+      operation: operation ?? "",
+      payloadType,
+    });
+    this._promRegistry?.incrementCounter("external_payload_used_total", {
+      entityName: entityName ?? "",
+      operation: operation ?? "",
+      payloadType,
+    });
+  }
+
+  recordWorkflowInputSize(
+    workflowType: string,
+    sizeBytes: number,
+    version?: string,
+  ): void {
+    this.state.workflowInputSizeBytes.observe(
+      { workflowType, version: version ?? "" },
+      sizeBytes,
+    );
+    this._promRegistry?.observeHistogram("workflow_input_size_bytes", {
+      workflowType,
+      version: version ?? "",
+    }, sizeBytes);
+  }
+
+  recordApiRequestTime(
+    method: string,
+    uri: string,
+    status: number | string,
+    durationMs: number,
+  ): void {
+    const statusStr = String(status);
+    const seconds = durationMs / 1000;
+    this.state.httpApiClientRequestSeconds.observe(
+      { method, uri, status: statusStr },
+      seconds,
+    );
+    this._promRegistry?.observeHistogram("http_api_client_request_seconds", {
+      method,
+      uri,
+      status: statusStr,
+    }, seconds);
+  }
+
+  // ── Public API ──────────────────────────────────────────────────
+
+  getMetrics(): Readonly<CanonicalMetricState> {
+    return this.state;
+  }
+
+  reset(): void {
+    this.state = this.createEmpty();
+  }
+
+  async stop(): Promise<void> {
+    setHttpMetricsObserver(undefined);
+    if (this._fileTimer) {
+      clearInterval(this._fileTimer);
+      this._fileTimer = undefined;
+    }
+    if (this._server) {
+      await this._server.stop();
+      this._server = undefined;
+    }
+  }
+
+  getContentType(): string {
+    return (
+      this._promRegistry?.contentType ??
+      "text/plain; version=0.0.4; charset=utf-8"
+    );
+  }
+
+  async toPrometheusTextAsync(): Promise<string> {
+    if (this._promRegistry?.available) {
+      return this._promRegistry.metrics();
+    }
+    return this.toPrometheusText();
+  }
+
+  toPrometheusText(_prefix?: string): string {
+    const lines: string[] = [];
+
+    // ── Counters ──
+    const counterDefs: {
+      name: string;
+      help: string;
+      counter: MultiLabelCounter;
+    }[] = [
+      { name: "task_poll_total", help: "Total number of task polls", counter: this.state.taskPollTotal },
+      { name: "task_execution_started_total", help: "Total number of task executions started", counter: this.state.taskExecutionStartedTotal },
+      { name: "task_poll_error_total", help: "Total number of task poll errors", counter: this.state.taskPollErrorTotal },
+      { name: "task_execute_error_total", help: "Total number of task execution errors", counter: this.state.taskExecuteErrorTotal },
+      { name: "task_update_error_total", help: "Total number of task update errors", counter: this.state.taskUpdateErrorTotal },
+      { name: "task_ack_error_total", help: "Total number of task ack errors", counter: this.state.taskAckErrorTotal },
+      { name: "task_ack_failed_total", help: "Total number of task ack failures (server declined)", counter: this.state.taskAckFailedTotal },
+      { name: "task_execution_queue_full_total", help: "Total number of task execution queue full events", counter: this.state.taskExecutionQueueFullTotal },
+      { name: "task_paused_total", help: "Total number of task paused events", counter: this.state.taskPausedTotal },
+      { name: "thread_uncaught_exceptions_total", help: "Total uncaught exceptions", counter: this.state.threadUncaughtExceptionsTotal },
+      { name: "external_payload_used_total", help: "Total external payload usage", counter: this.state.externalPayloadUsedTotal },
+      { name: "workflow_start_error_total", help: "Total workflow start errors", counter: this.state.workflowStartErrorTotal },
+    ];
+
+    for (const def of counterDefs) {
+      const rendered = def.counter.render(def.name, def.help);
+      if (rendered) lines.push(rendered);
+    }
+
+    // ── Time histograms ──
+    const timeHistogramDefs: {
+      name: string;
+      help: string;
+      histogram: HistogramAccumulator;
+    }[] = [
+      { name: "task_poll_time_seconds", help: "Task poll duration in seconds", histogram: this.state.taskPollTimeSeconds },
+      { name: "task_execute_time_seconds", help: "Task execution duration in seconds", histogram: this.state.taskExecuteTimeSeconds },
+      { name: "task_update_time_seconds", help: "Task update duration in seconds", histogram: this.state.taskUpdateTimeSeconds },
+      { name: "http_api_client_request_seconds", help: "HTTP API client request duration in seconds", histogram: this.state.httpApiClientRequestSeconds },
+    ];
+
+    for (const def of timeHistogramDefs) {
+      const rendered = def.histogram.render(def.name, def.help);
+      if (rendered) lines.push(rendered);
+    }
+
+    // ── Size histograms ──
+    const sizeHistogramDefs: {
+      name: string;
+      help: string;
+      histogram: HistogramAccumulator;
+    }[] = [
+      { name: "task_result_size_bytes", help: "Task result output size in bytes", histogram: this.state.taskResultSizeBytes },
+      { name: "workflow_input_size_bytes", help: "Workflow input payload size in bytes", histogram: this.state.workflowInputSizeBytes },
+    ];
+
+    for (const def of sizeHistogramDefs) {
+      const rendered = def.histogram.render(def.name, def.help);
+      if (rendered) lines.push(rendered);
+    }
+
+    // ── Gauges ──
+    const gaugeDefs: {
+      name: string;
+      help: string;
+      gauge: GaugeMetric;
+    }[] = [
+      { name: "active_workers", help: "Number of workers actively executing tasks", gauge: this.state.activeWorkers },
+    ];
+
+    for (const def of gaugeDefs) {
+      const rendered = def.gauge.render(def.name, def.help);
+      if (rendered) lines.push(rendered);
+    }
+
+    lines.push(""); // trailing newline
+    return lines.join("\n");
+  }
+}
diff --git a/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts b/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts
new file mode 100644
index 00000000..c7ced41d
--- /dev/null
+++ b/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts
@@ -0,0 +1,168 @@
+/**
+ * Optional prom-client adapter for canonical metrics.
+ *
+ * Registers Prometheus Counter, Histogram, and Gauge objects using the
+ * canonical metric names (unprefixed, camelCase labels, seconds/bytes units).
+ */
+
+import { TIME_BUCKETS, SIZE_BUCKETS } from "./accumulators";
+
+interface PromCounter {
+  inc(labels: Record<string, string>, value?: number): void;
+}
+interface PromHistogram {
+  observe(labels: Record<string, string>, value: number): void;
+}
+interface PromGauge {
+  set(labels: Record<string, string>, value: number): void;
+}
+interface PromRegistry {
+  metrics(): Promise<string>;
+  contentType: string;
+}
+
+export class CanonicalPrometheusRegistry {
+  private _counters = new Map<string, PromCounter>();
+  private _histograms = new Map<string, PromHistogram>();
+  private _gauges = new Map<string, PromGauge>();
+  private _registry?: PromRegistry;
+  private _available = false;
+
+  async initialize(): Promise<boolean> {
+    try {
+      const promClient = await import("prom-client");
+      this._registry = promClient.register;
+      this.createMetrics(promClient);
+      this._available = true;
+      return true;
+    } catch {
+      this._available = false;
+      return false;
+    }
+  }
+
+  get available(): boolean {
+    return this._available;
+  }
+
+  get contentType(): string {
+    return (
+      this._registry?.contentType ??
+      "text/plain; version=0.0.4; charset=utf-8"
+    );
+  }
+
+  async metrics(): Promise<string> {
+    if (!this._registry) return "";
+    return this._registry.metrics();
+  }
+
+  incrementCounter(
+    name: string,
+    labels: Record<string, string>,
+    value = 1,
+  ): void {
+    this._counters.get(name)?.inc(labels, value);
+  }
+
+  observeHistogram(
+    name: string,
+    labels: Record<string, string>,
+    value: number,
+  ): void {
+    this._histograms.get(name)?.observe(labels, value);
+  }
+
+  setGauge(
+    name: string,
+    labels: Record<string, string>,
+    value: number,
+  ): void {
+    this._gauges.get(name)?.set(labels, value);
+  }
+
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  private createMetrics(promClient: any): void {
+    const Counter = promClient.Counter;
+    const Histogram = promClient.Histogram;
+    const Gauge = promClient.Gauge;
+
+    const counterDefs: {
+      name: string;
+      help: string;
+      labels: string[];
+    }[] = [
+      { name: "task_poll_total", help: "Total task polls", labels: ["taskType"] },
+      { name: "task_execution_started_total", help: "Total task executions started", labels: ["taskType"] },
+      { name: "task_poll_error_total", help: "Total task poll errors", labels: ["taskType", "exception"] },
+      { name: "task_execute_error_total", help: "Total task execution errors", labels: ["taskType", "exception"] },
+      { name: "task_update_error_total", help: "Total task update errors", labels: ["taskType", "exception"] },
+      { name: "task_ack_error_total", help: "Total task ack errors", labels: ["taskType", "exception"] },
+      { name: "task_ack_failed_total", help: "Total task ack failures", labels: ["taskType"] },
+      { name: "task_execution_queue_full_total", help: "Task execution queue full", labels: ["taskType"] },
+      { name: "task_paused_total", help: "Task paused events", labels: ["taskType"] },
+      { name: "thread_uncaught_exceptions_total", help: "Uncaught exceptions", labels: ["exception"] },
+      { name: "external_payload_used_total", help: "External payload used", labels: ["entityName", "operation", "payloadType"] },
+      { name: "workflow_start_error_total", help: "Workflow start errors", labels: ["workflowType", "exception"] },
+    ];
+
+    for (const def of counterDefs) {
+      this._counters.set(
+        def.name,
+        new Counter({
+          name: def.name,
+          help: def.help,
+          labelNames: def.labels,
+        }),
+      );
+    }
+
+    const timeBuckets = [...TIME_BUCKETS];
+    const sizeBuckets = [...SIZE_BUCKETS];
+
+    const histogramDefs: {
+      name: string;
+      help: string;
+      labels: string[];
+      buckets: number[];
+    }[] = [
+      { name: "task_poll_time_seconds", help: "Task poll duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets },
+      { name: "task_execute_time_seconds", help: "Task execution duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets },
+      { name: "task_update_time_seconds", help: "Task update duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets },
+      { name: "http_api_client_request_seconds", help: "HTTP API client request duration (seconds)", labels: ["method", "uri", "status"], buckets: timeBuckets },
+      { name: "task_result_size_bytes", help: "Task result size (bytes)", labels: ["taskType"], buckets: sizeBuckets },
+      { name: "workflow_input_size_bytes", help: "Workflow input size (bytes)", labels: ["workflowType", "version"], buckets: sizeBuckets },
+    ];
+
+    for (const def of histogramDefs) {
+      this._histograms.set(
+        def.name,
+        new Histogram({
+          name: def.name,
+          help: def.help,
+          labelNames: def.labels,
+          buckets: def.buckets,
+        }),
+      );
+    }
+
+    const gaugeDefs: {
+      name: string;
+      help: string;
+      labels: string[];
+    }[] = [
+      { name: "active_workers", help: "Workers actively executing tasks", labels: ["taskType"] },
+    ];
+
+    for (const def of gaugeDefs) {
+      this._gauges.set(
+        def.name,
+        new Gauge({
+          name: def.name,
+          help: def.help,
+          labelNames: def.labels,
+        }),
+      );
+    }
+  }
+}
diff --git a/src/sdk/worker/metrics/MetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
similarity index 85%
rename from src/sdk/worker/metrics/MetricsCollector.ts
rename to src/sdk/worker/metrics/LegacyMetricsCollector.ts
index ae3fe542..cbf1b3ac 100644
--- a/src/sdk/worker/metrics/MetricsCollector.ts
+++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
@@ -1,5 +1,4 @@
 import type {
-  TaskRunnerEventsListener,
   PollStarted,
   PollCompleted,
   PollFailure,
@@ -9,6 +8,8 @@ import type {
   TaskUpdateCompleted,
   TaskUpdateFailure,
 } from "../../clients/worker/events";
+import type { MetricsCollectorInterface } from "./MetricsCollectorInterface";
+import { setHttpMetricsObserver } from "./httpObserver";
 
 /**
  * Configuration for MetricsCollector.
@@ -35,7 +36,7 @@ export interface MetricsCollectorConfig {
 }
 
 /**
- * Collected worker metrics.
+ * Collected worker metrics (legacy shape).
  */
 export interface WorkerMetrics {
   /** Total polls by taskType */
@@ -78,9 +79,6 @@ export interface WorkerMetrics {
 
 const QUANTILES = [0.5, 0.75, 0.9, 0.95, 0.99] as const;
 
-/**
- * Calculate quantiles from sorted array using linear interpolation.
- */
 function computeQuantile(sorted: number[], q: number): number {
   if (sorted.length === 0) return 0;
   if (sorted.length === 1) return sorted[0];
@@ -92,26 +90,13 @@ function computeQuantile(sorted: number[], q: number): number {
 }
 
 /**
- * Built-in metrics collector implementing TaskRunnerEventsListener.
- *
- * Collects 19 metric types matching the Python SDK's MetricsCollector,
- * with sliding-window quantile support (p50, p75, p90, p95, p99).
- *
- * @example
- * ```typescript
- * const metrics = new MetricsCollector({ httpPort: 9090 });
- *
- * const handler = new TaskHandler({
- *   client,
- *   eventListeners: [metrics],
- * });
+ * Legacy metrics collector.
  *
- * await handler.startWorkers();
- * // GET http://localhost:9090/metrics  — Prometheus format
- * // GET http://localhost:9090/health   — {"status":"UP"}
- * ```
+ * Emits prefixed Prometheus metrics with `task_type` labels, millisecond
+ * time units, and Summary type for distributions. This is the default
+ * implementation during the deprecation transition period.
  */
-export class MetricsCollector implements TaskRunnerEventsListener {
+export class LegacyMetricsCollector implements MetricsCollectorInterface {
   private metrics: WorkerMetrics;
   private readonly _prefix: string;
   private readonly _slidingWindowSize: number;
@@ -132,9 +117,10 @@ export class MetricsCollector implements TaskRunnerEventsListener {
     if (config?.filePath) {
       this.startFileWriter(
         config.filePath,
-        config.fileWriteIntervalMs ?? 5000
+        config.fileWriteIntervalMs ?? 5000,
       );
     }
+    setHttpMetricsObserver(this);
   }
 
   private async initPromClient(): Promise<void> {
@@ -158,10 +144,8 @@ export class MetricsCollector implements TaskRunnerEventsListener {
         // Silently ignore file write errors
       }
     };
-    // Immediate first write, then periodic
     void doWrite();
     this._fileTimer = setInterval(doWrite, intervalMs);
-    // Unref so the timer doesn't prevent process exit
     if (typeof this._fileTimer === "object" && "unref" in this._fileTimer) {
       this._fileTimer.unref();
     }
@@ -197,7 +181,7 @@ export class MetricsCollector implements TaskRunnerEventsListener {
   private observe(
     map: Map<string, number[]>,
     key: string,
-    value: number
+    value: number,
   ): void {
     let arr = map.get(key);
     if (!arr) {
@@ -205,7 +189,6 @@ export class MetricsCollector implements TaskRunnerEventsListener {
       map.set(key, arr);
     }
     arr.push(value);
-    // Sliding window: keep only the last N observations
     if (arr.length > this._slidingWindowSize) {
       arr.splice(0, arr.length - this._slidingWindowSize);
     }
@@ -215,7 +198,7 @@ export class MetricsCollector implements TaskRunnerEventsListener {
     map: Map<string, number>,
     key: string,
     promKey: string,
-    labelName: string
+    labelName: string,
   ): void {
     this.increment(map, key);
     this._promRegistry?.incrementCounter(promKey, { [labelName]: key });
@@ -226,7 +209,7 @@ export class MetricsCollector implements TaskRunnerEventsListener {
     key: string,
     value: number,
     promKey: string,
-    labelName: string
+    labelName: string,
   ): void {
     this.observe(map, key, value);
     this._promRegistry?.observeSummary(promKey, { [labelName]: key }, value);
@@ -248,7 +231,7 @@ export class MetricsCollector implements TaskRunnerEventsListener {
   }
 
   onTaskExecutionStarted(_event: TaskExecutionStarted): void {
-    // Counted on completion
+    // Legacy counts on completion, not start
   }
 
   onTaskExecutionCompleted(event: TaskExecutionCompleted): void {
@@ -273,57 +256,65 @@ export class MetricsCollector implements TaskRunnerEventsListener {
     this.incrementCounter(this.metrics.taskUpdateFailureTotal, event.taskType, "update_error_total", "task_type");
   }
 
-  // ── Direct Recording Methods (for code outside event system) ───
+  // Canonical-only event — noop in legacy
+  onTaskPaused(): void {
+    // Noop: TaskPaused events are handled via recordTaskPaused()
+  }
+
+  // ── Direct Recording Methods ───────────────────────────────────
 
-  /** Record a task execution queue full event */
   recordTaskExecutionQueueFull(taskType: string): void {
     this.incrementCounter(this.metrics.taskExecutionQueueFullTotal, taskType, "queue_full_total", "task_type");
   }
 
-  /** Record an uncaught exception */
-  recordUncaughtException(): void {
+  recordUncaughtException(_exception?: string): void {
     this.metrics.uncaughtExceptionTotal++;
     this._promRegistry?.incrementCounter("uncaught_total", {});
   }
 
-  /** Record a worker restart */
   recordWorkerRestart(): void {
     this.metrics.workerRestartTotal++;
     this._promRegistry?.incrementCounter("restart_total", {});
   }
 
-  /** Record a task paused event */
   recordTaskPaused(taskType: string): void {
     this.incrementCounter(this.metrics.taskPausedTotal, taskType, "paused_total", "task_type");
   }
 
-  /** Record a task ack error */
-  recordTaskAckError(taskType: string): void {
+  recordTaskAckError(taskType: string, _exception?: string): void {
     this.incrementCounter(this.metrics.taskAckErrorTotal, taskType, "ack_error_total", "task_type");
   }
 
-  /** Record a workflow start error */
-  recordWorkflowStartError(): void {
+  recordTaskAckFailed(_taskType: string): void {
+    // Noop: canonical-only metric (server declined ack)
+  }
+
+  recordWorkflowStartError(_workflowType?: string, _exception?: string): void {
     this.metrics.workflowStartErrorTotal++;
     this._promRegistry?.incrementCounter("wf_start_error_total", {});
   }
 
-  /** Record external payload usage */
-  recordExternalPayloadUsed(payloadType: string): void {
+  recordExternalPayloadUsed(
+    payloadType: string,
+    _entityName?: string,
+    _operation?: string,
+  ): void {
     this.incrementCounter(this.metrics.externalPayloadUsedTotal, payloadType, "external_payload_total", "payload_type");
   }
 
-  /** Record workflow input size */
-  recordWorkflowInputSize(workflowType: string, sizeBytes: number): void {
+  recordWorkflowInputSize(
+    workflowType: string,
+    sizeBytes: number,
+    _version?: string,
+  ): void {
     this.observeSummary(this.metrics.workflowInputSizeBytes, workflowType, sizeBytes, "wf_input_size", "workflow_type");
   }
 
-  /** Record API request duration */
   recordApiRequestTime(
     method: string,
     uri: string,
-    status: number,
-    durationMs: number
+    status: number | string,
+    durationMs: number,
   ): void {
     const key = `${method}:${uri}:${status}`;
     this.observeSummary(this.metrics.apiRequestDurationMs, key, durationMs, "api_request", "endpoint");
@@ -331,18 +322,16 @@ export class MetricsCollector implements TaskRunnerEventsListener {
 
   // ── Public API ──────────────────────────────────────────────────
 
-  /** Get a snapshot of all collected metrics */
   getMetrics(): Readonly<WorkerMetrics> {
     return this.metrics;
   }
 
-  /** Reset all collected metrics */
   reset(): void {
     this.metrics = this.createEmptyMetrics();
   }
 
-  /** Stop the auto-started metrics HTTP server and file writer (if any) */
   async stop(): Promise<void> {
+    setHttpMetricsObserver(undefined);
     if (this._fileTimer) {
       clearInterval(this._fileTimer);
       this._fileTimer = undefined;
@@ -353,27 +342,13 @@ export class MetricsCollector implements TaskRunnerEventsListener {
     }
   }
 
-  /**
-   * Get the content type for the Prometheus metrics endpoint.
-   * Returns prom-client's content type when available, otherwise standard Prometheus text format.
-   */
   getContentType(): string {
-    return this._promRegistry?.contentType ?? "text/plain; version=0.0.4; charset=utf-8";
+    return (
+      this._promRegistry?.contentType ??
+      "text/plain; version=0.0.4; charset=utf-8"
+    );
   }
 
-  /**
-   * Render all collected metrics in Prometheus exposition format.
-   * If prom-client is available and `usePromClient: true`, delegates to prom-client's registry.
-   * Otherwise uses built-in rendering with p50/p75/p90/p95/p99 quantiles.
-   *
-   * @param prefix - Metric name prefix (defaults to constructor config or "conductor_worker")
-   * @returns Prometheus text format string
-   */
-  /**
-   * Async version of toPrometheusText.
-   * When prom-client is available, returns its native registry output.
-   * Otherwise falls back to the built-in text format.
-   */
   async toPrometheusTextAsync(): Promise<string> {
     if (this._promRegistry?.available) {
       return this._promRegistry.metrics();
@@ -454,7 +429,7 @@ export class MetricsCollector implements TaskRunnerEventsListener {
       lines.push(`# TYPE ${counter.name} counter`);
       for (const [label, value] of counter.data) {
         lines.push(
-          `${counter.name}{${counter.labelName}="${label}"} ${value}`
+          `${counter.name}{${counter.labelName}="${label}"} ${value}`,
         );
       }
     }
@@ -545,14 +520,14 @@ export class MetricsCollector implements TaskRunnerEventsListener {
         for (const q of QUANTILES) {
           const val = computeQuantile(sorted, q);
           lines.push(
-            `${summary.name}{${summary.labelName}="${label}",quantile="${q}"} ${val}`
+            `${summary.name}{${summary.labelName}="${label}",quantile="${q}"} ${val}`,
           );
         }
         lines.push(
-          `${summary.name}_count{${summary.labelName}="${label}"} ${count}`
+          `${summary.name}_count{${summary.labelName}="${label}"} ${count}`,
         );
         lines.push(
-          `${summary.name}_sum{${summary.labelName}="${label}"} ${sum}`
+          `${summary.name}_sum{${summary.labelName}="${label}"} ${sum}`,
         );
       }
     }
diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts
new file mode 100644
index 00000000..78d0c0b0
--- /dev/null
+++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts
@@ -0,0 +1,48 @@
+import type {
+  TaskRunnerEventsListener,
+} from "../../clients/worker/events";
+
+/**
+ * Unified metrics collector interface.
+ *
+ * Both LegacyMetricsCollector and CanonicalMetricsCollector implement this
+ * interface so call sites never need to know which variant is active.
+ * Methods that only apply to one variant are noops in the other.
+ */
+export interface MetricsCollectorInterface extends TaskRunnerEventsListener {
+  // ── Direct recording methods (superset signatures) ─────────────
+
+  recordTaskExecutionQueueFull(taskType: string): void;
+  recordUncaughtException(exception?: string): void;
+  recordWorkerRestart(): void;
+  recordTaskPaused(taskType: string): void;
+  recordTaskAckError(taskType: string, exception?: string): void;
+  /** Canonical-only: server declined ack (no exception). Legacy noops. */
+  recordTaskAckFailed(taskType: string): void;
+  recordWorkflowStartError(workflowType?: string, exception?: string): void;
+  recordExternalPayloadUsed(
+    payloadType: string,
+    entityName?: string,
+    operation?: string,
+  ): void;
+  recordWorkflowInputSize(
+    workflowType: string,
+    sizeBytes: number,
+    version?: string,
+  ): void;
+  recordApiRequestTime(
+    method: string,
+    uri: string,
+    status: number | string,
+    durationMs: number,
+  ): void;
+
+  // ── Output / lifecycle ─────────────────────────────────────────
+
+  getMetrics(): unknown;
+  reset(): void;
+  stop(): Promise<void>;
+  getContentType(): string;
+  toPrometheusText(prefix?: string): string;
+  toPrometheusTextAsync(): Promise<string>;
+}
diff --git a/src/sdk/worker/metrics/MetricsServer.ts b/src/sdk/worker/metrics/MetricsServer.ts
index 62f1fbcb..d575901b 100644
--- a/src/sdk/worker/metrics/MetricsServer.ts
+++ b/src/sdk/worker/metrics/MetricsServer.ts
@@ -1,5 +1,5 @@
 import { createServer, type Server, type IncomingMessage, type ServerResponse } from "node:http";
-import type { MetricsCollector } from "./MetricsCollector";
+import type { MetricsCollectorInterface } from "./MetricsCollectorInterface";
 
 /**
  * Lightweight HTTP server exposing Prometheus metrics and a health check endpoint.
@@ -21,11 +21,11 @@ import type { MetricsCollector } from "./MetricsCollector";
  * ```
  */
 export class MetricsServer {
-  private readonly _collector: MetricsCollector;
+  private readonly _collector: MetricsCollectorInterface;
   private readonly _port: number;
   private _server?: Server;
 
-  constructor(collector: MetricsCollector, port: number) {
+  constructor(collector: MetricsCollectorInterface, port: number) {
     this._collector = collector;
     this._port = port;
   }
diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
new file mode 100644
index 00000000..6b616ce3
--- /dev/null
+++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
@@ -0,0 +1,321 @@
+import { describe, it, expect, beforeEach } from "@jest/globals";
+import { CanonicalMetricsCollector } from "../CanonicalMetricsCollector";
+
+describe("CanonicalMetricsCollector", () => {
+  let collector: CanonicalMetricsCollector;
+
+  beforeEach(() => {
+    collector = new CanonicalMetricsCollector();
+  });
+
+  describe("poll metrics", () => {
+    it("should emit task_poll_total counter on onPollStarted", () => {
+      collector.onPollStarted({
+        taskType: "task_a",
+        workerId: "w1",
+        pollCount: 5,
+        timestamp: new Date(),
+      });
+      collector.onPollStarted({
+        taskType: "task_a",
+        workerId: "w1",
+        pollCount: 5,
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain("# TYPE task_poll_total counter");
+      expect(text).toContain('task_poll_total{taskType="task_a"} 2');
+    });
+
+    it("should emit task_poll_time_seconds histogram on onPollCompleted", () => {
+      collector.onPollCompleted({
+        taskType: "task_a",
+        durationMs: 100,
+        tasksReceived: 3,
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain("# TYPE task_poll_time_seconds histogram");
+      expect(text).toContain('task_poll_time_seconds_bucket{taskType="task_a",status="SUCCESS",le="0.1"} 1');
+      expect(text).toContain('task_poll_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.1');
+      expect(text).toContain('task_poll_time_seconds_count{taskType="task_a",status="SUCCESS"} 1');
+    });
+
+    it("should emit task_poll_error_total with exception label on onPollFailure", () => {
+      collector.onPollFailure({
+        taskType: "task_a",
+        durationMs: 5000,
+        cause: new TypeError("timeout"),
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_poll_error_total{taskType="task_a",exception="TypeError"} 1');
+      expect(text).toContain('task_poll_time_seconds_bucket{taskType="task_a",status="FAILURE"');
+    });
+  });
+
+  describe("execution metrics", () => {
+    it("should emit task_execution_started_total on onTaskExecutionStarted", () => {
+      collector.onTaskExecutionStarted({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_execution_started_total{taskType="task_a"} 1');
+    });
+
+    it("should track active_workers gauge", () => {
+      collector.onTaskExecutionStarted({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        timestamp: new Date(),
+      });
+      collector.onTaskExecutionStarted({
+        taskType: "task_a",
+        taskId: "t2",
+        workerId: "w1",
+        timestamp: new Date(),
+      });
+
+      let text = collector.toPrometheusText();
+      expect(text).toContain('active_workers{taskType="task_a"} 2');
+
+      collector.onTaskExecutionCompleted({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        durationMs: 200,
+        timestamp: new Date(),
+      });
+
+      text = collector.toPrometheusText();
+      expect(text).toContain('active_workers{taskType="task_a"} 1');
+    });
+
+    it("should emit task_execute_time_seconds histogram on completion", () => {
+      collector.onTaskExecutionCompleted({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        durationMs: 500,
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain("# TYPE task_execute_time_seconds histogram");
+      expect(text).toContain('task_execute_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.5');
+    });
+
+    it("should emit task_result_size_bytes histogram on completion", () => {
+      collector.onTaskExecutionCompleted({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        durationMs: 200,
+        outputSizeBytes: 5000,
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain("# TYPE task_result_size_bytes histogram");
+      expect(text).toContain('task_result_size_bytes_sum{taskType="task_a"} 5000');
+    });
+
+    it("should emit task_execute_error_total with exception label on failure", () => {
+      const err = new TypeError("invalid input");
+      collector.onTaskExecutionFailure({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        cause: err,
+        durationMs: 50,
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_execute_error_total{taskType="task_a",exception="TypeError"} 1');
+      expect(text).toContain('task_execute_time_seconds_bucket{taskType="task_a",status="FAILURE"');
+    });
+  });
+
+  describe("task update metrics", () => {
+    it("should emit task_update_time_seconds histogram on completion", () => {
+      collector.onTaskUpdateCompleted({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        durationMs: 25,
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain("# TYPE task_update_time_seconds histogram");
+      expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.025');
+    });
+
+    it("should emit task_update_error_total with exception label on failure", () => {
+      collector.onTaskUpdateFailure({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        cause: new Error("server error"),
+        retryCount: 4,
+        taskResult: {},
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_update_error_total{taskType="task_a",exception="Error"} 1');
+    });
+  });
+
+  describe("direct recording methods", () => {
+    it("recordTaskExecutionQueueFull should emit counter", () => {
+      collector.recordTaskExecutionQueueFull("task_a");
+      collector.recordTaskExecutionQueueFull("task_a");
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_execution_queue_full_total{taskType="task_a"} 2');
+    });
+
+    it("recordUncaughtException should emit counter with exception label", () => {
+      collector.recordUncaughtException("RangeError");
+      collector.recordUncaughtException("RangeError");
+      collector.recordUncaughtException("TypeError");
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain('thread_uncaught_exceptions_total{exception="RangeError"} 2');
+      expect(text).toContain('thread_uncaught_exceptions_total{exception="TypeError"} 1');
+    });
+
+    it("recordWorkerRestart should be a noop (N/A for JS)", () => {
+      collector.recordWorkerRestart();
+      const text = collector.toPrometheusText();
+      expect(text).not.toContain("worker_restart");
+    });
+
+    it("recordTaskPaused should emit counter", () => {
+      collector.recordTaskPaused("paused_task");
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_paused_total{taskType="paused_task"} 1');
+    });
+
+    it("recordTaskAckError should emit counter with exception label", () => {
+      collector.recordTaskAckError("task_a", "TimeoutError");
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_ack_error_total{taskType="task_a",exception="TimeoutError"} 1');
+    });
+
+    it("recordTaskAckFailed should emit counter", () => {
+      collector.recordTaskAckFailed("task_a");
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_ack_failed_total{taskType="task_a"} 1');
+    });
+
+    it("recordWorkflowStartError should emit counter with labels", () => {
+      collector.recordWorkflowStartError("my_workflow", "NetworkError");
+      const text = collector.toPrometheusText();
+      expect(text).toContain('workflow_start_error_total{workflowType="my_workflow",exception="NetworkError"} 1');
+    });
+
+    it("recordExternalPayloadUsed should emit counter with labels", () => {
+      collector.recordExternalPayloadUsed("TASK_OUTPUT", "myEntity", "WRITE");
+      const text = collector.toPrometheusText();
+      expect(text).toContain('external_payload_used_total{entityName="myEntity",operation="WRITE",payloadType="TASK_OUTPUT"} 1');
+    });
+
+    it("recordWorkflowInputSize should emit histogram", () => {
+      collector.recordWorkflowInputSize("order_flow", 50000, "1");
+      const text = collector.toPrometheusText();
+      expect(text).toContain("# TYPE workflow_input_size_bytes histogram");
+      expect(text).toContain('workflow_input_size_bytes_sum{workflowType="order_flow",version="1"} 50000');
+    });
+
+    it("recordApiRequestTime should emit histogram in seconds", () => {
+      collector.recordApiRequestTime("GET", "/api/workflow", 200, 45);
+      const text = collector.toPrometheusText();
+      expect(text).toContain("# TYPE http_api_client_request_seconds histogram");
+      expect(text).toContain('http_api_client_request_seconds_sum{method="GET",uri="/api/workflow",status="200"} 0.045');
+    });
+  });
+
+  describe("onTaskPaused event", () => {
+    it("should increment task_paused_total", () => {
+      collector.onTaskPaused({
+        taskType: "paused_task",
+        timestamp: new Date(),
+      });
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_paused_total{taskType="paused_task"} 1');
+    });
+  });
+
+  describe("reset", () => {
+    it("should clear all canonical metrics", () => {
+      collector.onPollStarted({
+        taskType: "task_a",
+        workerId: "w1",
+        pollCount: 1,
+        timestamp: new Date(),
+      });
+      collector.recordUncaughtException("Error");
+      collector.recordTaskAckFailed("task_a");
+
+      collector.reset();
+
+      const text = collector.toPrometheusText();
+      expect(text).toBe("");
+    });
+  });
+
+  describe("output format", () => {
+    it("should not apply a prefix to canonical metric names", () => {
+      collector.onPollStarted({
+        taskType: "t",
+        workerId: "w",
+        pollCount: 1,
+        timestamp: new Date(),
+      });
+      const text = collector.toPrometheusText();
+      expect(text).not.toContain("conductor_worker");
+      expect(text).toContain("task_poll_total{");
+    });
+
+    it("should use seconds for time histograms", () => {
+      collector.onPollCompleted({
+        taskType: "t",
+        durationMs: 1000,
+        tasksReceived: 1,
+        timestamp: new Date(),
+      });
+      const text = collector.toPrometheusText();
+      expect(text).toContain("task_poll_time_seconds");
+      expect(text).toContain('_sum{taskType="t",status="SUCCESS"} 1');
+    });
+
+    it("should use camelCase taskType label", () => {
+      collector.onPollStarted({
+        taskType: "my_task",
+        workerId: "w",
+        pollCount: 1,
+        timestamp: new Date(),
+      });
+      const text = collector.toPrometheusText();
+      expect(text).toContain("taskType=");
+      expect(text).not.toContain("task_type=");
+    });
+  });
+
+  describe("stop", () => {
+    it("should not throw when no server is running", async () => {
+      await expect(collector.stop()).resolves.toBeUndefined();
+    });
+  });
+});
diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts
index 3c61671a..ffe9f567 100644
--- a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts
+++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts
@@ -1,11 +1,11 @@
 import { describe, it, expect, beforeEach } from "@jest/globals";
-import { MetricsCollector } from "../MetricsCollector";
+import { LegacyMetricsCollector } from "../LegacyMetricsCollector";
 
-describe("MetricsCollector - Prometheus features", () => {
-  let collector: MetricsCollector;
+describe("LegacyMetricsCollector - Prometheus features", () => {
+  let collector: LegacyMetricsCollector;
 
   beforeEach(() => {
-    collector = new MetricsCollector({ slidingWindowSize: 1000 });
+    collector = new LegacyMetricsCollector({ slidingWindowSize: 1000 });
   });
 
   // ── toPrometheusText() ──────────────────────────────────────────
@@ -36,7 +36,7 @@ describe("MetricsCollector - Prometheus features", () => {
     });
 
     it("should use custom prefix", () => {
-      const c = new MetricsCollector({ prefix: "myapp" });
+      const c = new LegacyMetricsCollector({ prefix: "myapp" });
       c.onPollStarted({ taskType: "t", workerId: "w", pollCount: 1, timestamp: new Date() });
       const text = c.toPrometheusText();
       expect(text).toContain("myapp_task_poll_total");
@@ -167,7 +167,7 @@ describe("MetricsCollector - Prometheus features", () => {
 
   describe("sliding window", () => {
     it("should trim observations beyond window size", () => {
-      const small = new MetricsCollector({ slidingWindowSize: 5 });
+      const small = new LegacyMetricsCollector({ slidingWindowSize: 5 });
       for (let i = 0; i < 10; i++) {
         small.onPollCompleted({ taskType: "t", workerId: "w", durationMs: i, pollCount: i, taskCount: 1, timestamp: new Date() });
       }
diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts
index 691d9325..3e8d8ebb 100644
--- a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts
+++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts
@@ -1,11 +1,11 @@
 import { describe, it, expect, beforeEach } from "@jest/globals";
-import { MetricsCollector } from "../MetricsCollector";
+import { LegacyMetricsCollector } from "../LegacyMetricsCollector";
 
-describe("MetricsCollector", () => {
-  let collector: MetricsCollector;
+describe("LegacyMetricsCollector", () => {
+  let collector: LegacyMetricsCollector;
 
   beforeEach(() => {
-    collector = new MetricsCollector();
+    collector = new LegacyMetricsCollector();
   });
 
   describe("poll metrics", () => {
diff --git a/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts b/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts
index bbe5153f..c0b423f7 100644
--- a/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts
+++ b/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts
@@ -1,6 +1,6 @@
 import { describe, it, expect, afterEach } from "@jest/globals";
 import { get as httpGet } from "node:http";
-import { MetricsCollector } from "../MetricsCollector";
+import { LegacyMetricsCollector } from "../LegacyMetricsCollector";
 import { MetricsServer } from "../MetricsServer";
 
 function fetchHttp(url: string): Promise<{ status: number; body: string; headers: Record<string, string | string[] | undefined> }> {
@@ -38,7 +38,7 @@ describe("MetricsServer", () => {
 
   it("should serve /metrics with Prometheus text format", async () => {
     const port = nextPort();
-    const collector = new MetricsCollector();
+    const collector = new LegacyMetricsCollector();
     collector.onPollStarted({ taskType: "test_task", workerId: "w", pollCount: 1, timestamp: new Date() });
 
     server = new MetricsServer(collector, port);
@@ -51,7 +51,7 @@ describe("MetricsServer", () => {
 
   it("should serve /health with JSON status", async () => {
     const port = nextPort();
-    const collector = new MetricsCollector();
+    const collector = new LegacyMetricsCollector();
     server = new MetricsServer(collector, port);
     await server.start();
 
@@ -62,7 +62,7 @@ describe("MetricsServer", () => {
 
   it("should return 404 for unknown paths", async () => {
     const port = nextPort();
-    const collector = new MetricsCollector();
+    const collector = new LegacyMetricsCollector();
     server = new MetricsServer(collector, port);
     await server.start();
 
@@ -72,7 +72,7 @@ describe("MetricsServer", () => {
 
   it("should stop cleanly after start", async () => {
     const port = nextPort();
-    const collector = new MetricsCollector();
+    const collector = new LegacyMetricsCollector();
     server = new MetricsServer(collector, port);
     await server.start();
     await server.stop();
@@ -80,7 +80,7 @@ describe("MetricsServer", () => {
   });
 
   it("should not throw when stop() called without start()", async () => {
-    const collector = new MetricsCollector();
+    const collector = new LegacyMetricsCollector();
     server = new MetricsServer(collector, nextPort());
     await server.stop();
     server = undefined;
@@ -88,7 +88,7 @@ describe("MetricsServer", () => {
 
   it("should not throw when start() called twice", async () => {
     const port = nextPort();
-    const collector = new MetricsCollector();
+    const collector = new LegacyMetricsCollector();
     server = new MetricsServer(collector, port);
     await server.start();
     await server.start(); // second call should be a no-op
diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
new file mode 100644
index 00000000..a0149968
--- /dev/null
+++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
@@ -0,0 +1,99 @@
+import { describe, it, expect, afterEach } from "@jest/globals";
+import { createMetricsCollector } from "../metricsFactory";
+import { LegacyMetricsCollector } from "../LegacyMetricsCollector";
+import { CanonicalMetricsCollector } from "../CanonicalMetricsCollector";
+
+describe("createMetricsCollector", () => {
+  const originalEnv = { ...process.env };
+
+  afterEach(() => {
+    process.env = { ...originalEnv };
+  });
+
+  it("should return LegacyMetricsCollector by default", () => {
+    delete process.env.WORKER_CANONICAL_METRICS;
+    delete process.env.WORKER_LEGACY_METRICS;
+
+    const collector = createMetricsCollector();
+    expect(collector).toBeInstanceOf(LegacyMetricsCollector);
+  });
+
+  it("should return LegacyMetricsCollector when WORKER_LEGACY_METRICS=true", () => {
+    process.env.WORKER_LEGACY_METRICS = "true";
+    delete process.env.WORKER_CANONICAL_METRICS;
+
+    const collector = createMetricsCollector();
+    expect(collector).toBeInstanceOf(LegacyMetricsCollector);
+  });
+
+  it("should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS=true", () => {
+    process.env.WORKER_CANONICAL_METRICS = "true";
+
+    const collector = createMetricsCollector();
+    expect(collector).toBeInstanceOf(CanonicalMetricsCollector);
+  });
+
+  it("should prefer canonical when both env vars are true", () => {
+    process.env.WORKER_CANONICAL_METRICS = "true";
+    process.env.WORKER_LEGACY_METRICS = "true";
+
+    const collector = createMetricsCollector();
+    expect(collector).toBeInstanceOf(CanonicalMetricsCollector);
+  });
+
+  it("should return LegacyMetricsCollector when WORKER_CANONICAL_METRICS=false", () => {
+    process.env.WORKER_CANONICAL_METRICS = "false";
+
+    const collector = createMetricsCollector();
+    expect(collector).toBeInstanceOf(LegacyMetricsCollector);
+  });
+
+  it("should be case-insensitive for env var value", () => {
+    process.env.WORKER_CANONICAL_METRICS = "TRUE";
+
+    const collector = createMetricsCollector();
+    expect(collector).toBeInstanceOf(CanonicalMetricsCollector);
+  });
+
+  it("should pass config through to the collector", () => {
+    delete process.env.WORKER_CANONICAL_METRICS;
+
+    const collector = createMetricsCollector({ prefix: "custom_prefix" });
+    expect(collector).toBeInstanceOf(LegacyMetricsCollector);
+    const text = collector.toPrometheusText();
+    // No metrics recorded yet, so empty, but the collector was created successfully
+    expect(typeof text).toBe("string");
+  });
+
+  it("both implementations satisfy MetricsCollectorInterface", () => {
+    const legacy = createMetricsCollector();
+    const requiredMethods = [
+      "recordTaskExecutionQueueFull",
+      "recordUncaughtException",
+      "recordWorkerRestart",
+      "recordTaskPaused",
+      "recordTaskAckError",
+      "recordTaskAckFailed",
+      "recordWorkflowStartError",
+      "recordExternalPayloadUsed",
+      "recordWorkflowInputSize",
+      "recordApiRequestTime",
+      "getMetrics",
+      "reset",
+      "stop",
+      "getContentType",
+      "toPrometheusText",
+      "toPrometheusTextAsync",
+    ];
+
+    for (const method of requiredMethods) {
+      expect(typeof (legacy as unknown as Record<string, unknown>)[method]).toBe("function");
+    }
+
+    process.env.WORKER_CANONICAL_METRICS = "true";
+    const canonical = createMetricsCollector();
+    for (const method of requiredMethods) {
+      expect(typeof (canonical as unknown as Record<string, unknown>)[method]).toBe("function");
+    }
+  });
+});
diff --git a/src/sdk/worker/metrics/accumulators.ts b/src/sdk/worker/metrics/accumulators.ts
new file mode 100644
index 00000000..de8d48ae
--- /dev/null
+++ b/src/sdk/worker/metrics/accumulators.ts
@@ -0,0 +1,177 @@
+/**
+ * In-memory Prometheus-compatible metric accumulators used by the
+ * canonical metrics implementation.
+ */
+
+export const TIME_BUCKETS = [
+  0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10,
+] as const;
+
+export const SIZE_BUCKETS = [
+  100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000,
+] as const;
+
+export function labelKey(labels: Record<string, string>): string {
+  const keys = Object.keys(labels).sort();
+  return keys.map((k) => `${k}=${labels[k]}`).join(",");
+}
+
+export function renderLabels(labels: Record<string, string>): string {
+  return Object.entries(labels)
+    .map(([k, v]) => `${k}="${v}"`)
+    .join(",");
+}
+
+export function exceptionLabel(error: unknown): string {
+  if (error instanceof Error) {
+    return error.name || error.constructor?.name || "Error";
+  }
+  return "Error";
+}
+
+// ── HistogramAccumulator ─────────────────────────────────────────
+
+interface HistogramSeries {
+  labels: Record<string, string>;
+  buckets: number[];
+  count: number;
+  sum: number;
+}
+
+export class HistogramAccumulator {
+  private readonly _boundaries: readonly number[];
+  private _series = new Map<string, HistogramSeries>();
+
+  constructor(boundaries: readonly number[] = TIME_BUCKETS) {
+    this._boundaries = boundaries;
+  }
+
+  observe(labels: Record<string, string>, value: number): void {
+    const key = labelKey(labels);
+    let s = this._series.get(key);
+    if (!s) {
+      s = {
+        labels,
+        buckets: new Array(this._boundaries.length).fill(0),
+        count: 0,
+        sum: 0,
+      };
+      this._series.set(key, s);
+    }
+    for (let i = 0; i < this._boundaries.length; i++) {
+      if (value <= this._boundaries[i]) {
+        s.buckets[i]++;
+      }
+    }
+    s.count++;
+    s.sum += value;
+  }
+
+  render(name: string, help: string): string {
+    if (this._series.size === 0) return "";
+    const lines: string[] = [];
+    lines.push(`# HELP ${name} ${help}`);
+    lines.push(`# TYPE ${name} histogram`);
+    for (const s of this._series.values()) {
+      const lblStr = renderLabels(s.labels);
+      const sep = lblStr ? "," : "";
+      for (let i = 0; i < this._boundaries.length; i++) {
+        lines.push(
+          `${name}_bucket{${lblStr}${sep}le="${this._boundaries[i]}"} ${s.buckets[i]}`,
+        );
+      }
+      lines.push(`${name}_bucket{${lblStr}${sep}le="+Inf"} ${s.count}`);
+      lines.push(`${name}_sum{${lblStr}} ${s.sum}`);
+      lines.push(`${name}_count{${lblStr}} ${s.count}`);
+    }
+    return lines.join("\n");
+  }
+}
+
+// ── MultiLabelCounter ────────────────────────────────────────────
+
+interface CounterSeries {
+  labels: Record<string, string>;
+  value: number;
+}
+
+export class MultiLabelCounter {
+  private _series = new Map<string, CounterSeries>();
+
+  increment(labels: Record<string, string>, value = 1): void {
+    const key = labelKey(labels);
+    let s = this._series.get(key);
+    if (!s) {
+      s = { labels, value: 0 };
+      this._series.set(key, s);
+    }
+    s.value += value;
+  }
+
+  render(name: string, help: string): string {
+    if (this._series.size === 0) return "";
+    const lines: string[] = [];
+    lines.push(`# HELP ${name} ${help}`);
+    lines.push(`# TYPE ${name} counter`);
+    for (const s of this._series.values()) {
+      lines.push(`${name}{${renderLabels(s.labels)}} ${s.value}`);
+    }
+    return lines.join("\n");
+  }
+}
+
+// ── GaugeMetric ──────────────────────────────────────────────────
+
+interface GaugeSeries {
+  labels: Record<string, string>;
+  value: number;
+}
+
+export class GaugeMetric {
+  private _series = new Map<string, GaugeSeries>();
+
+  set(labels: Record<string, string>, value: number): void {
+    const key = labelKey(labels);
+    let s = this._series.get(key);
+    if (!s) {
+      s = { labels, value: 0 };
+      this._series.set(key, s);
+    }
+    s.value = value;
+  }
+
+  inc(labels: Record<string, string>, delta = 1): void {
+    const key = labelKey(labels);
+    let s = this._series.get(key);
+    if (!s) {
+      s = { labels, value: 0 };
+      this._series.set(key, s);
+    }
+    s.value += delta;
+  }
+
+  dec(labels: Record<string, string>, delta = 1): void {
+    const key = labelKey(labels);
+    let s = this._series.get(key);
+    if (!s) {
+      s = { labels, value: 0 };
+      this._series.set(key, s);
+    }
+    s.value -= delta;
+  }
+
+  getValue(labels: Record<string, string>): number {
+    return this._series.get(labelKey(labels))?.value ?? 0;
+  }
+
+  render(name: string, help: string): string {
+    if (this._series.size === 0) return "";
+    const lines: string[] = [];
+    lines.push(`# HELP ${name} ${help}`);
+    lines.push(`# TYPE ${name} gauge`);
+    for (const s of this._series.values()) {
+      lines.push(`${name}{${renderLabels(s.labels)}} ${s.value}`);
+    }
+    return lines.join("\n");
+  }
+}
diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts
new file mode 100644
index 00000000..fa2cbc76
--- /dev/null
+++ b/src/sdk/worker/metrics/httpObserver.ts
@@ -0,0 +1,40 @@
+/**
+ * Global metrics observer for recording API client request latency
+ * and workflow-level metrics from code outside the event system.
+ *
+ * Both LegacyMetricsCollector and CanonicalMetricsCollector register
+ * themselves here on construction; fetchWithRetry and WorkflowExecutor
+ * call the observer without needing a direct reference.
+ */
+
+export interface HttpMetricsObserver {
+  recordApiRequestTime(
+    method: string,
+    uri: string,
+    status: number | string,
+    durationMs: number,
+  ): void;
+
+  recordWorkflowInputSize(
+    workflowType: string,
+    sizeBytes: number,
+    version?: string,
+  ): void;
+
+  recordWorkflowStartError(
+    workflowType?: string,
+    exception?: string,
+  ): void;
+}
+
+let _observer: HttpMetricsObserver | undefined;
+
+export function setHttpMetricsObserver(
+  observer: HttpMetricsObserver | undefined,
+): void {
+  _observer = observer;
+}
+
+export function getHttpMetricsObserver(): HttpMetricsObserver | undefined {
+  return _observer;
+}
diff --git a/src/sdk/worker/metrics/index.ts b/src/sdk/worker/metrics/index.ts
index 9869a1c7..732e02f7 100644
--- a/src/sdk/worker/metrics/index.ts
+++ b/src/sdk/worker/metrics/index.ts
@@ -1,7 +1,37 @@
+// Interface
+export type { MetricsCollectorInterface } from "./MetricsCollectorInterface";
+
+// Implementations
 export {
-  MetricsCollector,
+  LegacyMetricsCollector,
   type MetricsCollectorConfig,
   type WorkerMetrics,
-} from "./MetricsCollector";
+} from "./LegacyMetricsCollector";
+export { CanonicalMetricsCollector } from "./CanonicalMetricsCollector";
+
+// Backward-compat alias: MetricsCollector → LegacyMetricsCollector
+export { LegacyMetricsCollector as MetricsCollector } from "./LegacyMetricsCollector";
+
+// Factory
+export { createMetricsCollector } from "./metricsFactory";
+
+// HTTP observer
+export {
+  type HttpMetricsObserver,
+  getHttpMetricsObserver,
+  setHttpMetricsObserver,
+} from "./httpObserver";
+
+// Server & registries
 export { MetricsServer } from "./MetricsServer";
 export { PrometheusRegistry } from "./PrometheusRegistry";
+export { CanonicalPrometheusRegistry } from "./CanonicalPrometheusRegistry";
+
+// Accumulators (exposed for advanced usage / testing)
+export {
+  HistogramAccumulator,
+  MultiLabelCounter,
+  GaugeMetric,
+  TIME_BUCKETS,
+  SIZE_BUCKETS,
+} from "./accumulators";
diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts
new file mode 100644
index 00000000..870ef843
--- /dev/null
+++ b/src/sdk/worker/metrics/metricsFactory.ts
@@ -0,0 +1,26 @@
+import type { MetricsCollectorConfig } from "./LegacyMetricsCollector";
+import type { MetricsCollectorInterface } from "./MetricsCollectorInterface";
+import { LegacyMetricsCollector } from "./LegacyMetricsCollector";
+import { CanonicalMetricsCollector } from "./CanonicalMetricsCollector";
+
+/**
+ * Create the appropriate MetricsCollector based on environment variables.
+ *
+ * - WORKER_CANONICAL_METRICS=true  -> CanonicalMetricsCollector  (default: false)
+ * - WORKER_LEGACY_METRICS=true     -> LegacyMetricsCollector     (default: true)
+ *
+ * WORKER_CANONICAL_METRICS takes priority when both are set.
+ * During the deprecation transition period the default is legacy.
+ */
+export function createMetricsCollector(
+  config?: MetricsCollectorConfig,
+): MetricsCollectorInterface {
+  const useCanonical =
+    process.env.WORKER_CANONICAL_METRICS?.toLowerCase() === "true";
+
+  if (useCanonical) {
+    return new CanonicalMetricsCollector(config);
+  }
+
+  return new LegacyMetricsCollector(config);
+}

From 083f173273ff471f02f3650f4fe6010a04a971d6 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Wed, 29 Apr 2026 10:37:28 -0600
Subject: [PATCH 02/21] standardize on what truthiness means for env var

---
 harness/main.ts                                    |  4 +++-
 .../metrics/__tests__/metricsFactory.test.ts       | 14 ++++++++++++++
 src/sdk/worker/metrics/metricsFactory.ts           |  5 +++--
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/harness/main.ts b/harness/main.ts
index b7aadeed..ad30b732 100644
--- a/harness/main.ts
+++ b/harness/main.ts
@@ -94,7 +94,9 @@ async function main(): Promise<void> {
 
   const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991);
   const metricsCollector = createMetricsCollector({ httpPort: metricsPort });
-  const variant = process.env.WORKER_CANONICAL_METRICS?.toLowerCase() === "true"
+  const variant = ["true", "1", "yes"].includes(
+    (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(),
+  )
     ? "canonical"
     : "legacy";
   console.log(`Prometheus metrics server started on port ${metricsPort} (${variant} metrics)`);
diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
index a0149968..f5a28f91 100644
--- a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
+++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
@@ -55,6 +55,20 @@ describe("createMetricsCollector", () => {
     expect(collector).toBeInstanceOf(CanonicalMetricsCollector);
   });
 
+  it('should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS="1"', () => {
+    process.env.WORKER_CANONICAL_METRICS = "1";
+
+    const collector = createMetricsCollector();
+    expect(collector).toBeInstanceOf(CanonicalMetricsCollector);
+  });
+
+  it('should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS="yes"', () => {
+    process.env.WORKER_CANONICAL_METRICS = "yes";
+
+    const collector = createMetricsCollector();
+    expect(collector).toBeInstanceOf(CanonicalMetricsCollector);
+  });
+
   it("should pass config through to the collector", () => {
     delete process.env.WORKER_CANONICAL_METRICS;
 
diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts
index 870ef843..bcb45588 100644
--- a/src/sdk/worker/metrics/metricsFactory.ts
+++ b/src/sdk/worker/metrics/metricsFactory.ts
@@ -15,8 +15,9 @@ import { CanonicalMetricsCollector } from "./CanonicalMetricsCollector";
 export function createMetricsCollector(
   config?: MetricsCollectorConfig,
 ): MetricsCollectorInterface {
-  const useCanonical =
-    process.env.WORKER_CANONICAL_METRICS?.toLowerCase() === "true";
+  const useCanonical = ["true", "1", "yes"].includes(
+    (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(),
+  );
 
   if (useCanonical) {
     return new CanonicalMetricsCollector(config);

From be00e0802d8e26fad419f8b20c76b04b5bb2ec80 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Tue, 5 May 2026 09:25:39 -0600
Subject: [PATCH 03/21] add some tests for new stuff

---
 jest.config.mjs                               |   1 +
 .../events/__tests__/EventDispatcher.test.ts  |  10 +
 .../helpers/__tests__/fetchWithRetry.test.ts  | 106 ++++++-
 .../CanonicalMetricsCollector.test.ts         | 112 +++++++
 .../CanonicalPrometheusRegistry.test.ts       | 176 +++++++++++
 .../metrics/__tests__/accumulators.test.ts    | 283 ++++++++++++++++++
 .../metrics/__tests__/httpObserver.test.ts    |  58 ++++
 7 files changed, 745 insertions(+), 1 deletion(-)
 create mode 100644 src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts
 create mode 100644 src/sdk/worker/metrics/__tests__/accumulators.test.ts
 create mode 100644 src/sdk/worker/metrics/__tests__/httpObserver.test.ts

diff --git a/jest.config.mjs b/jest.config.mjs
index d8703a14..f8407f50 100644
--- a/jest.config.mjs
+++ b/jest.config.mjs
@@ -21,6 +21,7 @@ export default {
     "^@/(.*)$": "<rootDir>/src/$1",
     "^@open-api/(.*)$": "<rootDir>/src/open-api/$1",
     "^@test-utils/(.*)$": "<rootDir>/src/integration-tests/utils/$1",
+    "^(.*)\\.js$": "$1",
   },
   transform: {
     "^.+\\.tsx?$": [
diff --git a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts
index bc2f2dc6..b99310bf 100644
--- a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts
+++ b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts
@@ -9,6 +9,7 @@ import type {
   TaskExecutionFailure,
   TaskUpdateCompleted,
   TaskUpdateFailure,
+  TaskPaused,
 } from "../types";
 
 describe("EventDispatcher", () => {
@@ -143,6 +144,7 @@ describe("EventDispatcher", () => {
       onTaskExecutionFailure: jest.fn<() => void>(),
       onTaskUpdateCompleted: jest.fn<() => void>(),
       onTaskUpdateFailure: jest.fn<() => void>(),
+      onTaskPaused: jest.fn<() => void>(),
     };
 
     dispatcher.register(listener);
@@ -231,6 +233,14 @@ describe("EventDispatcher", () => {
     };
     await dispatcher.publishTaskUpdateFailure(updateFailure);
     expect(listener.onTaskUpdateFailure).toHaveBeenCalledWith(updateFailure);
+
+    // Test TaskPaused
+    const taskPaused: TaskPaused = {
+      taskType: "test-task",
+      timestamp: new Date(),
+    };
+    await dispatcher.publishTaskPaused(taskPaused);
+    expect(listener.onTaskPaused).toHaveBeenCalledWith(taskPaused);
   });
 
   test("should have zero overhead when no listeners registered", async () => {
diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
index 76f73f9b..887c0665 100644
--- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
+++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
@@ -1,5 +1,6 @@
-import { jest, expect, describe, it, beforeEach } from "@jest/globals";
+import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals";
 import { retryFetch, wrapFetchWithRetry, applyTimeout } from "../fetchWithRetry";
+import * as httpObserver from "@/sdk/worker/metrics/httpObserver";
 
 const createMockResponse = (status: number, body = ""): Response =>
   new Response(body, { status, statusText: `Status ${status}` });
@@ -525,4 +526,107 @@ describe("fetchWithRetry", () => {
       expect(result.status).toBe(200);
     });
   });
+
+  // ─── wrapFetchWithRetry metrics recording ───────────────────────────
+
+  describe("wrapFetchWithRetry metrics", () => {
+    const mockRecordApiRequestTime = jest.fn<
+      (m: string, u: string, s: string, d: number) => void
+    >();
+
+    beforeEach(() => {
+      mockRecordApiRequestTime.mockClear();
+      httpObserver.setHttpMetricsObserver({
+        recordApiRequestTime: mockRecordApiRequestTime,
+        recordWorkflowInputSize: jest.fn(),
+        recordWorkflowStartError: jest.fn(),
+      });
+    });
+
+    afterEach(() => {
+      httpObserver.setHttpMetricsObserver(undefined);
+    });
+
+    it("should record API request time on successful response", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(200));
+
+      const wrappedFetch = wrapFetchWithRetry(mockFetch);
+      await wrappedFetch("http://test.com/api/tasks", { method: "POST" });
+
+      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+      const [method, uri, status, duration] =
+        mockRecordApiRequestTime.mock.calls[0];
+      expect(method).toBe("POST");
+      expect(uri).toBe("/api/tasks");
+      expect(status).toBe("200");
+      expect(duration).toBeGreaterThanOrEqual(0);
+    });
+
+    it("should record status '0' on network failure", async () => {
+      mockFetch.mockRejectedValue(new Error("ECONNRESET"));
+
+      const wrappedFetch = wrapFetchWithRetry(mockFetch, {
+        maxTransportRetries: 0,
+      });
+
+      await expect(
+        wrappedFetch("http://test.com/api/workflow")
+      ).rejects.toThrow("ECONNRESET");
+
+      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+      const [method, uri, status] =
+        mockRecordApiRequestTime.mock.calls[0];
+      expect(method).toBe("GET");
+      expect(uri).toBe("/api/workflow");
+      expect(status).toBe("0");
+    });
+
+    it("should extract method and URI from string URL", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(201));
+
+      const wrappedFetch = wrapFetchWithRetry(mockFetch);
+      await wrappedFetch("http://example.com/tasks/123", { method: "PUT" });
+
+      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+      const [method, uri] = mockRecordApiRequestTime.mock.calls[0];
+      expect(method).toBe("PUT");
+      expect(uri).toBe("/tasks/123");
+    });
+
+    it("should extract method and URI from Request object", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(200));
+
+      const wrappedFetch = wrapFetchWithRetry(mockFetch);
+      const request = new Request("http://example.com/api/metadata", {
+        method: "DELETE",
+      });
+      await wrappedFetch(request);
+
+      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+      const [method, uri] = mockRecordApiRequestTime.mock.calls[0];
+      expect(method).toBe("DELETE");
+      expect(uri).toBe("/api/metadata");
+    });
+
+    it("should not break fetch when no observer is registered", async () => {
+      httpObserver.setHttpMetricsObserver(undefined);
+
+      mockFetch.mockResolvedValue(createMockResponse(200));
+      const wrappedFetch = wrapFetchWithRetry(mockFetch);
+      const result = await wrappedFetch("http://test.com");
+
+      expect(result.status).toBe(200);
+    });
+
+    it("should default method to GET when init has no method", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(200));
+
+      const wrappedFetch = wrapFetchWithRetry(mockFetch);
+      await wrappedFetch("http://test.com/path");
+
+      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+      const [method] = mockRecordApiRequestTime.mock.calls[0];
+      expect(method).toBe("GET");
+    });
+  });
 });
diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
index 6b616ce3..369eeee3 100644
--- a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
+++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
@@ -318,4 +318,116 @@ describe("CanonicalMetricsCollector", () => {
       await expect(collector.stop()).resolves.toBeUndefined();
     });
   });
+
+  describe("default argument handling", () => {
+    it("recordUncaughtException with no arg should default to 'Error'", () => {
+      collector.recordUncaughtException();
+      const text = collector.toPrometheusText();
+      expect(text).toContain('thread_uncaught_exceptions_total{exception="Error"} 1');
+    });
+
+    it("recordWorkflowStartError with no args should default both labels", () => {
+      collector.recordWorkflowStartError();
+      const text = collector.toPrometheusText();
+      expect(text).toContain('workflow_start_error_total{workflowType="",exception="Error"} 1');
+    });
+
+    it("recordWorkflowStartError with only workflowType should default exception", () => {
+      collector.recordWorkflowStartError("my_wf");
+      const text = collector.toPrometheusText();
+      expect(text).toContain('workflow_start_error_total{workflowType="my_wf",exception="Error"} 1');
+    });
+
+    it("recordExternalPayloadUsed with missing optional args should default to empty strings", () => {
+      collector.recordExternalPayloadUsed("TASK_OUTPUT");
+      const text = collector.toPrometheusText();
+      expect(text).toContain('external_payload_used_total{entityName="",operation="",payloadType="TASK_OUTPUT"} 1');
+    });
+
+    it("recordTaskAckError with no exception arg should default to 'Error'", () => {
+      collector.recordTaskAckError("task_a");
+      const text = collector.toPrometheusText();
+      expect(text).toContain('task_ack_error_total{taskType="task_a",exception="Error"} 1');
+    });
+
+    it("recordWorkflowInputSize with no version should default to empty string", () => {
+      collector.recordWorkflowInputSize("my_wf", 1000);
+      const text = collector.toPrometheusText();
+      expect(text).toContain('workflow_input_size_bytes_sum{workflowType="my_wf",version=""} 1000');
+    });
+  });
+
+  describe("execution completion without outputSizeBytes", () => {
+    it("should not emit task_result_size_bytes when outputSizeBytes is undefined", () => {
+      collector.onTaskExecutionCompleted({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        durationMs: 200,
+        timestamp: new Date(),
+      });
+
+      const text = collector.toPrometheusText();
+      expect(text).toContain("task_execute_time_seconds");
+      expect(text).not.toContain("task_result_size_bytes");
+    });
+  });
+
+  describe("output helpers without prom-client", () => {
+    it("getContentType should return plain text fallback", () => {
+      expect(collector.getContentType()).toBe(
+        "text/plain; version=0.0.4; charset=utf-8"
+      );
+    });
+
+    it("toPrometheusTextAsync should fall back to sync text", async () => {
+      collector.onPollStarted({
+        taskType: "t",
+        workerId: "w",
+        pollCount: 1,
+        timestamp: new Date(),
+      });
+      const asyncText = await collector.toPrometheusTextAsync();
+      const syncText = collector.toPrometheusText();
+      expect(asyncText).toBe(syncText);
+    });
+
+    it("toPrometheusText should ignore _prefix argument", () => {
+      collector.onPollStarted({
+        taskType: "t",
+        workerId: "w",
+        pollCount: 1,
+        timestamp: new Date(),
+      });
+      const text = collector.toPrometheusText("some_prefix");
+      expect(text).not.toContain("some_prefix");
+      expect(text).toContain("task_poll_total{");
+    });
+  });
+
+  describe("active_workers gauge on failure", () => {
+    it("should decrement active_workers on task execution failure", () => {
+      collector.onTaskExecutionStarted({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        timestamp: new Date(),
+      });
+
+      let text = collector.toPrometheusText();
+      expect(text).toContain('active_workers{taskType="task_a"} 1');
+
+      collector.onTaskExecutionFailure({
+        taskType: "task_a",
+        taskId: "t1",
+        workerId: "w1",
+        cause: new Error("fail"),
+        durationMs: 100,
+        timestamp: new Date(),
+      });
+
+      text = collector.toPrometheusText();
+      expect(text).toContain('active_workers{taskType="task_a"} 0');
+    });
+  });
 });
diff --git a/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts
new file mode 100644
index 00000000..449955eb
--- /dev/null
+++ b/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts
@@ -0,0 +1,176 @@
+import { describe, it, expect, beforeEach } from "@jest/globals";
+import { CanonicalPrometheusRegistry } from "../CanonicalPrometheusRegistry";
+
+describe("CanonicalPrometheusRegistry", () => {
+  let registry: CanonicalPrometheusRegistry;
+
+  beforeEach(async () => {
+    try {
+      const promClient = await import("prom-client");
+      promClient.register.clear();
+    } catch {
+      // prom-client not installed — skip cleanup
+    }
+    registry = new CanonicalPrometheusRegistry();
+  });
+
+  describe("before initialization", () => {
+    it("should not be available", () => {
+      expect(registry.available).toBe(false);
+    });
+
+    it("should return default content type", () => {
+      expect(registry.contentType).toBe(
+        "text/plain; version=0.0.4; charset=utf-8"
+      );
+    });
+
+    it("should return empty string from metrics()", async () => {
+      expect(await registry.metrics()).toBe("");
+    });
+
+    it("should not throw when incrementCounter is called", () => {
+      expect(() =>
+        registry.incrementCounter("task_poll_total", { taskType: "t" })
+      ).not.toThrow();
+    });
+
+    it("should not throw when observeHistogram is called", () => {
+      expect(() =>
+        registry.observeHistogram("task_poll_time_seconds", { taskType: "t" }, 0.1)
+      ).not.toThrow();
+    });
+
+    it("should not throw when setGauge is called", () => {
+      expect(() =>
+        registry.setGauge("active_workers", { taskType: "t" }, 5)
+      ).not.toThrow();
+    });
+  });
+
+  describe("initialize()", () => {
+    it("should return true when prom-client is installed", async () => {
+      const result = await registry.initialize();
+      expect(result).toBe(true);
+      expect(registry.available).toBe(true);
+    });
+
+    it("should set contentType from prom-client registry", async () => {
+      await registry.initialize();
+      expect(registry.contentType).toBeDefined();
+      expect(registry.contentType.length).toBeGreaterThan(0);
+    });
+  });
+
+  describe("after initialization", () => {
+    beforeEach(async () => {
+      await registry.initialize();
+    });
+
+    it("should be available", () => {
+      expect(registry.available).toBe(true);
+    });
+
+    it("should return metrics text from prom-client", async () => {
+      const text = await registry.metrics();
+      expect(typeof text).toBe("string");
+    });
+
+    it("should increment a counter and see it in metrics output", async () => {
+      registry.incrementCounter("task_poll_total", { taskType: "test_t" });
+      registry.incrementCounter("task_poll_total", { taskType: "test_t" });
+      const text = await registry.metrics();
+      expect(text).toContain("task_poll_total");
+      expect(text).toContain("test_t");
+    });
+
+    it("should observe a histogram and see it in metrics output", async () => {
+      registry.observeHistogram(
+        "task_poll_time_seconds",
+        { taskType: "test_t", status: "SUCCESS" },
+        0.05
+      );
+      const text = await registry.metrics();
+      expect(text).toContain("task_poll_time_seconds");
+      expect(text).toContain("test_t");
+    });
+
+    it("should set a gauge and see it in metrics output", async () => {
+      registry.setGauge("active_workers", { taskType: "test_t" }, 3);
+      const text = await registry.metrics();
+      expect(text).toContain("active_workers");
+      expect(text).toContain("test_t");
+    });
+
+    it("should handle unknown counter key as no-op", () => {
+      expect(() =>
+        registry.incrementCounter("nonexistent", { x: "y" })
+      ).not.toThrow();
+    });
+
+    it("should handle unknown histogram key as no-op", () => {
+      expect(() =>
+        registry.observeHistogram("nonexistent", { x: "y" }, 1)
+      ).not.toThrow();
+    });
+
+    it("should handle unknown gauge key as no-op", () => {
+      expect(() =>
+        registry.setGauge("nonexistent", { x: "y" }, 1)
+      ).not.toThrow();
+    });
+
+    it("should accept custom increment value for counter", () => {
+      expect(() =>
+        registry.incrementCounter("task_poll_total", { taskType: "t" }, 5)
+      ).not.toThrow();
+    });
+
+    it("should record all counter types", async () => {
+      registry.incrementCounter("task_poll_total", { taskType: "t" });
+      registry.incrementCounter("task_execution_started_total", { taskType: "t" });
+      registry.incrementCounter("task_poll_error_total", { taskType: "t", exception: "Error" });
+      registry.incrementCounter("task_execute_error_total", { taskType: "t", exception: "Error" });
+      registry.incrementCounter("task_update_error_total", { taskType: "t", exception: "Error" });
+      registry.incrementCounter("task_ack_error_total", { taskType: "t", exception: "Error" });
+      registry.incrementCounter("task_ack_failed_total", { taskType: "t" });
+      registry.incrementCounter("task_execution_queue_full_total", { taskType: "t" });
+      registry.incrementCounter("task_paused_total", { taskType: "t" });
+      registry.incrementCounter("thread_uncaught_exceptions_total", { exception: "Error" });
+      registry.incrementCounter("external_payload_used_total", { entityName: "e", operation: "o", payloadType: "p" });
+      registry.incrementCounter("workflow_start_error_total", { workflowType: "w", exception: "Error" });
+
+      const text = await registry.metrics();
+      expect(text).toContain("task_poll_total");
+      expect(text).toContain("task_execution_started_total");
+      expect(text).toContain("task_poll_error_total");
+      expect(text).toContain("task_ack_failed_total");
+      expect(text).toContain("thread_uncaught_exceptions_total");
+      expect(text).toContain("external_payload_used_total");
+      expect(text).toContain("workflow_start_error_total");
+    });
+
+    it("should record all histogram types", async () => {
+      registry.observeHistogram("task_poll_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.1);
+      registry.observeHistogram("task_execute_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.5);
+      registry.observeHistogram("task_update_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.02);
+      registry.observeHistogram("http_api_client_request_seconds", { method: "GET", uri: "/api", status: "200" }, 0.1);
+      registry.observeHistogram("task_result_size_bytes", { taskType: "t" }, 1024);
+      registry.observeHistogram("workflow_input_size_bytes", { workflowType: "w", version: "1" }, 512);
+
+      const text = await registry.metrics();
+      expect(text).toContain("task_poll_time_seconds");
+      expect(text).toContain("task_execute_time_seconds");
+      expect(text).toContain("task_update_time_seconds");
+      expect(text).toContain("http_api_client_request_seconds");
+      expect(text).toContain("task_result_size_bytes");
+      expect(text).toContain("workflow_input_size_bytes");
+    });
+
+    it("should record gauge metric", async () => {
+      registry.setGauge("active_workers", { taskType: "t" }, 7);
+      const text = await registry.metrics();
+      expect(text).toContain("active_workers");
+    });
+  });
+});
diff --git a/src/sdk/worker/metrics/__tests__/accumulators.test.ts b/src/sdk/worker/metrics/__tests__/accumulators.test.ts
new file mode 100644
index 00000000..d25c3181
--- /dev/null
+++ b/src/sdk/worker/metrics/__tests__/accumulators.test.ts
@@ -0,0 +1,283 @@
+import { describe, it, expect } from "@jest/globals";
+import {
+  labelKey,
+  renderLabels,
+  exceptionLabel,
+  HistogramAccumulator,
+  MultiLabelCounter,
+  GaugeMetric,
+  TIME_BUCKETS,
+  SIZE_BUCKETS,
+} from "../accumulators";
+
+describe("labelKey", () => {
+  it("should return empty string for empty labels", () => {
+    expect(labelKey({})).toBe("");
+  });
+
+  it("should return key=value for a single label", () => {
+    expect(labelKey({ taskType: "my_task" })).toBe("taskType=my_task");
+  });
+
+  it("should sort keys alphabetically", () => {
+    expect(labelKey({ z: "1", a: "2", m: "3" })).toBe("a=2,m=3,z=1");
+  });
+
+  it("should produce the same key regardless of insertion order", () => {
+    const key1 = labelKey({ status: "SUCCESS", taskType: "t" });
+    const key2 = labelKey({ taskType: "t", status: "SUCCESS" });
+    expect(key1).toBe(key2);
+  });
+});
+
+describe("renderLabels", () => {
+  it("should return empty string for empty labels", () => {
+    expect(renderLabels({})).toBe("");
+  });
+
+  it("should format a single label with quotes", () => {
+    expect(renderLabels({ taskType: "my_task" })).toBe('taskType="my_task"');
+  });
+
+  it("should format multiple labels comma-separated", () => {
+    const result = renderLabels({ method: "GET", uri: "/api" });
+    expect(result).toBe('method="GET",uri="/api"');
+  });
+});
+
+describe("exceptionLabel", () => {
+  it("should return Error name for standard Error", () => {
+    expect(exceptionLabel(new Error("oops"))).toBe("Error");
+  });
+
+  it("should return subclass name for TypeError", () => {
+    expect(exceptionLabel(new TypeError("bad type"))).toBe("TypeError");
+  });
+
+  it("should return subclass name for RangeError", () => {
+    expect(exceptionLabel(new RangeError("out of range"))).toBe("RangeError");
+  });
+
+  it("should return 'Error' for non-Error values", () => {
+    expect(exceptionLabel("string error")).toBe("Error");
+    expect(exceptionLabel(42)).toBe("Error");
+    expect(exceptionLabel(null)).toBe("Error");
+    expect(exceptionLabel(undefined)).toBe("Error");
+    expect(exceptionLabel({ message: "not an error" })).toBe("Error");
+  });
+
+  it("should fall back to constructor name when .name is empty", () => {
+    const err = new TypeError("test");
+    Object.defineProperty(err, "name", { value: "" });
+    expect(exceptionLabel(err)).toBe("TypeError");
+  });
+});
+
+describe("HistogramAccumulator", () => {
+  it("should render empty string when no observations", () => {
+    const h = new HistogramAccumulator([1, 5, 10]);
+    expect(h.render("test_metric", "A test")).toBe("");
+  });
+
+  it("should place value in correct buckets", () => {
+    const h = new HistogramAccumulator([1, 5, 10]);
+    h.observe({ taskType: "t" }, 3);
+
+    const text = h.render("req_time", "Request time");
+    expect(text).toContain("# HELP req_time Request time");
+    expect(text).toContain("# TYPE req_time histogram");
+    expect(text).toContain('req_time_bucket{taskType="t",le="1"} 0');
+    expect(text).toContain('req_time_bucket{taskType="t",le="5"} 1');
+    expect(text).toContain('req_time_bucket{taskType="t",le="10"} 1');
+    expect(text).toContain('req_time_bucket{taskType="t",le="+Inf"} 1');
+    expect(text).toContain('req_time_sum{taskType="t"} 3');
+    expect(text).toContain('req_time_count{taskType="t"} 1');
+  });
+
+  it("should increment all buckets at or above the value boundary", () => {
+    const h = new HistogramAccumulator([1, 5, 10]);
+    h.observe({}, 1); // exactly on boundary
+
+    const text = h.render("m", "help");
+    expect(text).toContain('m_bucket{le="1"} 1');
+    expect(text).toContain('m_bucket{le="5"} 1');
+    expect(text).toContain('m_bucket{le="10"} 1');
+  });
+
+  it("should handle value above all boundaries", () => {
+    const h = new HistogramAccumulator([1, 5, 10]);
+    h.observe({}, 100);
+
+    const text = h.render("m", "help");
+    expect(text).toContain('m_bucket{le="1"} 0');
+    expect(text).toContain('m_bucket{le="5"} 0');
+    expect(text).toContain('m_bucket{le="10"} 0');
+    expect(text).toContain('m_bucket{le="+Inf"} 1');
+    expect(text).toContain("m_sum{} 100");
+  });
+
+  it("should accumulate multiple observations", () => {
+    const h = new HistogramAccumulator([1, 5, 10]);
+    h.observe({ t: "a" }, 0.5);
+    h.observe({ t: "a" }, 3);
+    h.observe({ t: "a" }, 7);
+
+    const text = h.render("m", "help");
+    expect(text).toContain('m_bucket{t="a",le="1"} 1');
+    expect(text).toContain('m_bucket{t="a",le="5"} 2');
+    expect(text).toContain('m_bucket{t="a",le="10"} 3');
+    expect(text).toContain('m_count{t="a"} 3');
+    expect(text).toContain('m_sum{t="a"} 10.5');
+  });
+
+  it("should track separate series for different label sets", () => {
+    const h = new HistogramAccumulator([10]);
+    h.observe({ status: "OK" }, 5);
+    h.observe({ status: "ERR" }, 15);
+
+    const text = h.render("m", "help");
+    expect(text).toContain('m_bucket{status="OK",le="10"} 1');
+    expect(text).toContain('m_bucket{status="ERR",le="10"} 0');
+    expect(text).toContain('m_count{status="OK"} 1');
+    expect(text).toContain('m_count{status="ERR"} 1');
+  });
+
+  it("should default to TIME_BUCKETS when no boundaries given", () => {
+    const h = new HistogramAccumulator();
+    h.observe({}, 0.005);
+    const text = h.render("m", "help");
+    // TIME_BUCKETS starts at 0.001, 0.005, ...
+    expect(text).toContain('le="0.001"');
+    expect(text).toContain('le="0.005"');
+  });
+});
+
+describe("MultiLabelCounter", () => {
+  it("should render empty string when no increments", () => {
+    const c = new MultiLabelCounter();
+    expect(c.render("test_counter", "A test")).toBe("");
+  });
+
+  it("should increment and render a single-label counter", () => {
+    const c = new MultiLabelCounter();
+    c.increment({ taskType: "t" });
+    c.increment({ taskType: "t" });
+
+    const text = c.render("poll_total", "Total polls");
+    expect(text).toContain("# HELP poll_total Total polls");
+    expect(text).toContain("# TYPE poll_total counter");
+    expect(text).toContain('poll_total{taskType="t"} 2');
+  });
+
+  it("should support custom increment values", () => {
+    const c = new MultiLabelCounter();
+    c.increment({ taskType: "t" }, 5);
+    c.increment({ taskType: "t" }, 3);
+
+    const text = c.render("m", "help");
+    expect(text).toContain('m{taskType="t"} 8');
+  });
+
+  it("should track separate series for different label sets", () => {
+    const c = new MultiLabelCounter();
+    c.increment({ taskType: "a" });
+    c.increment({ taskType: "b" });
+    c.increment({ taskType: "a" });
+
+    const text = c.render("m", "help");
+    expect(text).toContain('m{taskType="a"} 2');
+    expect(text).toContain('m{taskType="b"} 1');
+  });
+
+  it("should handle multi-label counters", () => {
+    const c = new MultiLabelCounter();
+    c.increment({ taskType: "t", exception: "TypeError" });
+
+    const text = c.render("err", "Errors");
+    expect(text).toContain('err{taskType="t",exception="TypeError"} 1');
+  });
+});
+
+describe("GaugeMetric", () => {
+  it("should render empty string when no values set", () => {
+    const g = new GaugeMetric();
+    expect(g.render("test_gauge", "A test")).toBe("");
+  });
+
+  it("should set and render a gauge value", () => {
+    const g = new GaugeMetric();
+    g.set({ taskType: "t" }, 42);
+
+    const text = g.render("active", "Active workers");
+    expect(text).toContain("# HELP active Active workers");
+    expect(text).toContain("# TYPE active gauge");
+    expect(text).toContain('active{taskType="t"} 42');
+  });
+
+  it("should overwrite previous value on set", () => {
+    const g = new GaugeMetric();
+    g.set({ taskType: "t" }, 10);
+    g.set({ taskType: "t" }, 99);
+
+    const text = g.render("m", "help");
+    expect(text).toContain('m{taskType="t"} 99');
+    expect(text).not.toContain("10");
+  });
+
+  it("should increment with inc()", () => {
+    const g = new GaugeMetric();
+    g.inc({ taskType: "t" });
+    g.inc({ taskType: "t" });
+    g.inc({ taskType: "t" }, 3);
+
+    expect(g.getValue({ taskType: "t" })).toBe(5);
+  });
+
+  it("should decrement with dec()", () => {
+    const g = new GaugeMetric();
+    g.inc({ taskType: "t" }, 5);
+    g.dec({ taskType: "t" });
+    g.dec({ taskType: "t" }, 2);
+
+    expect(g.getValue({ taskType: "t" })).toBe(2);
+  });
+
+  it("should allow negative values after dec()", () => {
+    const g = new GaugeMetric();
+    g.dec({ taskType: "t" });
+
+    expect(g.getValue({ taskType: "t" })).toBe(-1);
+  });
+
+  it("should return 0 for getValue on unknown labels", () => {
+    const g = new GaugeMetric();
+    expect(g.getValue({ taskType: "unknown" })).toBe(0);
+  });
+
+  it("should track separate series for different label sets", () => {
+    const g = new GaugeMetric();
+    g.set({ taskType: "a" }, 10);
+    g.set({ taskType: "b" }, 20);
+
+    expect(g.getValue({ taskType: "a" })).toBe(10);
+    expect(g.getValue({ taskType: "b" })).toBe(20);
+
+    const text = g.render("m", "help");
+    expect(text).toContain('m{taskType="a"} 10');
+    expect(text).toContain('m{taskType="b"} 20');
+  });
+});
+
+describe("bucket constants", () => {
+  it("TIME_BUCKETS should be sorted ascending", () => {
+    for (let i = 1; i < TIME_BUCKETS.length; i++) {
+      expect(TIME_BUCKETS[i]).toBeGreaterThan(TIME_BUCKETS[i - 1]);
+    }
+  });
+
+  it("SIZE_BUCKETS should be sorted ascending", () => {
+    for (let i = 1; i < SIZE_BUCKETS.length; i++) {
+      expect(SIZE_BUCKETS[i]).toBeGreaterThan(SIZE_BUCKETS[i - 1]);
+    }
+  });
+});
diff --git a/src/sdk/worker/metrics/__tests__/httpObserver.test.ts b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts
new file mode 100644
index 00000000..04da8fa6
--- /dev/null
+++ b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts
@@ -0,0 +1,58 @@
+import { describe, it, expect, beforeEach } from "@jest/globals";
+import {
+  getHttpMetricsObserver,
+  setHttpMetricsObserver,
+  type HttpMetricsObserver,
+} from "../httpObserver";
+
+describe("httpObserver", () => {
+  beforeEach(() => {
+    setHttpMetricsObserver(undefined);
+  });
+
+  it("should return undefined by default", () => {
+    expect(getHttpMetricsObserver()).toBeUndefined();
+  });
+
+  it("should return the observer after setting one", () => {
+    const observer: HttpMetricsObserver = {
+      recordApiRequestTime: () => {},
+      recordWorkflowInputSize: () => {},
+      recordWorkflowStartError: () => {},
+    };
+    setHttpMetricsObserver(observer);
+    expect(getHttpMetricsObserver()).toBe(observer);
+  });
+
+  it("should clear the observer when set to undefined", () => {
+    const observer: HttpMetricsObserver = {
+      recordApiRequestTime: () => {},
+      recordWorkflowInputSize: () => {},
+      recordWorkflowStartError: () => {},
+    };
+    setHttpMetricsObserver(observer);
+    expect(getHttpMetricsObserver()).toBe(observer);
+
+    setHttpMetricsObserver(undefined);
+    expect(getHttpMetricsObserver()).toBeUndefined();
+  });
+
+  it("should replace the observer on subsequent set calls", () => {
+    const observer1: HttpMetricsObserver = {
+      recordApiRequestTime: () => {},
+      recordWorkflowInputSize: () => {},
+      recordWorkflowStartError: () => {},
+    };
+    const observer2: HttpMetricsObserver = {
+      recordApiRequestTime: () => {},
+      recordWorkflowInputSize: () => {},
+      recordWorkflowStartError: () => {},
+    };
+
+    setHttpMetricsObserver(observer1);
+    expect(getHttpMetricsObserver()).toBe(observer1);
+
+    setHttpMetricsObserver(observer2);
+    expect(getHttpMetricsObserver()).toBe(observer2);
+  });
+});

From 2384d2dd4cc2d7e7c5cfcec6d4e3cdf9e65181f5 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Tue, 5 May 2026 09:33:41 -0600
Subject: [PATCH 04/21] delint

---
 .../LeaseExtension.validation.test.ts         | 13 ++++------
 .../builders/tasks/pullWorkflowMessages.ts    |  2 +-
 .../worker/__tests__/LeaseTracker.test.ts     |  4 +--
 src/sdk/clients/worker/events/types.ts        |  1 +
 .../metrics/__tests__/httpObserver.test.ts    | 26 +++++++++----------
 5 files changed, 22 insertions(+), 24 deletions(-)

diff --git a/src/integration-tests/LeaseExtension.validation.test.ts b/src/integration-tests/LeaseExtension.validation.test.ts
index 015512c8..a977613b 100644
--- a/src/integration-tests/LeaseExtension.validation.test.ts
+++ b/src/integration-tests/LeaseExtension.validation.test.ts
@@ -36,7 +36,6 @@ import type { Client } from "../open-api";
 import {
   MetadataClient,
   WorkflowExecutor,
-  TaskClient,
   orkesConductorClient,
 } from "../sdk";
 import { LeaseTracker } from "../sdk/clients/worker/LeaseTracker";
@@ -56,7 +55,6 @@ describe("Lease Extension — end-to-end validation", () => {
   let client: Client;
   let executor: WorkflowExecutor;
   let metadataClient: MetadataClient;
-  let taskClient: TaskClient;
   const logger: ConductorLogger = new DefaultLogger();
 
   const suffix      = Date.now();
@@ -67,7 +65,6 @@ describe("Lease Extension — end-to-end validation", () => {
     client         = await orkesConductorClient();
     executor       = new WorkflowExecutor(client);
     metadataClient = new MetadataClient(client);
-    taskClient     = new TaskClient(client);
 
     await metadataClient.registerTask({
       name:                   taskDefName,
@@ -117,7 +114,7 @@ describe("Lease Extension — end-to-end validation", () => {
     const deadline = Date.now() + maxWaitMs;
     while (Date.now() < deadline) {
       const wf = await executor.getWorkflow(workflowId, false);
-      if (terminal.has(wf.status ?? "")) return wf.status!;
+      if (terminal.has(wf.status ?? "")) return wf.status ?? "";
       await sleep(1_000);
     }
     return "STILL_RUNNING";
@@ -132,8 +129,8 @@ describe("Lease Extension — end-to-end validation", () => {
     const { data: tasks1 } = await TaskResource.batchPoll({ client, path: { tasktype: taskDefName }, query: { workerid: "val-worker-no-lease", count: 1, timeout: 200 } });
     const [task] = tasks1 ?? [];
     expect(task).toBeDefined();
-    const taskId1 = task.taskId!;
-    const wfId1   = task.workflowInstanceId!;
+    const taskId1 = task.taskId ?? "";
+    const wfId1   = task.workflowInstanceId ?? "";
     console.log(`   Task polled: ${taskId1}`);
     console.log(`   No LeaseTracker created — zero extendLease calls will be made`);
 
@@ -177,8 +174,8 @@ describe("Lease Extension — end-to-end validation", () => {
     const { data: tasks2 } = await TaskResource.batchPoll({ client, path: { tasktype: taskDefName }, query: { workerid: "val-worker-with-lease", count: 1, timeout: 200 } });
     const [task] = tasks2 ?? [];
     expect(task).toBeDefined();
-    const taskId2 = task.taskId!;
-    const wfId2   = task.workflowInstanceId!;
+    const taskId2 = task.taskId ?? "";
+    const wfId2   = task.workflowInstanceId ?? "";
     console.log(`   Task polled: ${taskId2}`);
 
     // Track heartbeat calls via a spy that ALSO sends the real heartbeat
diff --git a/src/sdk/builders/tasks/pullWorkflowMessages.ts b/src/sdk/builders/tasks/pullWorkflowMessages.ts
index e4b2ebae..e0bcb3c3 100644
--- a/src/sdk/builders/tasks/pullWorkflowMessages.ts
+++ b/src/sdk/builders/tasks/pullWorkflowMessages.ts
@@ -16,7 +16,7 @@ import { TaskType, PullWorkflowMessagesTaskDef } from "../../../open-api";
  */
 export const pullWorkflowMessages = (
   taskReferenceName: string,
-  batchSize: number = 1,
+  batchSize = 1,
   optional?: boolean
 ): PullWorkflowMessagesTaskDef => ({
   name: taskReferenceName,
diff --git a/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts b/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts
index 86cf4e3e..73b56823 100644
--- a/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts
+++ b/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts
@@ -1,5 +1,5 @@
-import { LeaseTracker, LeaseInfo } from "@/sdk/clients/worker/LeaseTracker";
-import { LEASE_EXTEND_DURATION_FACTOR, LEASE_EXTEND_RETRY_COUNT, HEARTBEAT_RETRY_DELAY_MS } from "@/sdk/clients/worker/constants";
+import { LeaseTracker } from "@/sdk/clients/worker/LeaseTracker";
+import { LEASE_EXTEND_RETRY_COUNT, HEARTBEAT_RETRY_DELAY_MS } from "@/sdk/clients/worker/constants";
 import { afterEach, beforeEach, describe, expect, jest, test } from "@jest/globals";
 import type { Task } from "@open-api/index";
 import { mockLogger } from "@test-utils/mockLogger";
diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts
index e252a25e..9c94af69 100644
--- a/src/sdk/clients/worker/events/types.ts
+++ b/src/sdk/clients/worker/events/types.ts
@@ -137,6 +137,7 @@ export interface TaskUpdateCompleted extends TaskRunnerEvent {
 /**
  * Event published when a poll cycle is skipped because the worker is paused.
  */
+// eslint-disable-next-line @typescript-eslint/no-empty-object-type
 export interface TaskPaused extends TaskRunnerEvent {
   // No additional fields — taskType is inherited from TaskRunnerEvent.
 }
diff --git a/src/sdk/worker/metrics/__tests__/httpObserver.test.ts b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts
index 04da8fa6..bf8adb42 100644
--- a/src/sdk/worker/metrics/__tests__/httpObserver.test.ts
+++ b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts
@@ -1,4 +1,4 @@
-import { describe, it, expect, beforeEach } from "@jest/globals";
+import { describe, it, expect, beforeEach, jest } from "@jest/globals";
 import {
   getHttpMetricsObserver,
   setHttpMetricsObserver,
@@ -16,9 +16,9 @@ describe("httpObserver", () => {
 
   it("should return the observer after setting one", () => {
     const observer: HttpMetricsObserver = {
-      recordApiRequestTime: () => {},
-      recordWorkflowInputSize: () => {},
-      recordWorkflowStartError: () => {},
+      recordApiRequestTime: jest.fn(),
+      recordWorkflowInputSize: jest.fn(),
+      recordWorkflowStartError: jest.fn(),
     };
     setHttpMetricsObserver(observer);
     expect(getHttpMetricsObserver()).toBe(observer);
@@ -26,9 +26,9 @@ describe("httpObserver", () => {
 
   it("should clear the observer when set to undefined", () => {
     const observer: HttpMetricsObserver = {
-      recordApiRequestTime: () => {},
-      recordWorkflowInputSize: () => {},
-      recordWorkflowStartError: () => {},
+      recordApiRequestTime: jest.fn(),
+      recordWorkflowInputSize: jest.fn(),
+      recordWorkflowStartError: jest.fn(),
     };
     setHttpMetricsObserver(observer);
     expect(getHttpMetricsObserver()).toBe(observer);
@@ -39,14 +39,14 @@ describe("httpObserver", () => {
 
   it("should replace the observer on subsequent set calls", () => {
     const observer1: HttpMetricsObserver = {
-      recordApiRequestTime: () => {},
-      recordWorkflowInputSize: () => {},
-      recordWorkflowStartError: () => {},
+      recordApiRequestTime: jest.fn(),
+      recordWorkflowInputSize: jest.fn(),
+      recordWorkflowStartError: jest.fn(),
     };
     const observer2: HttpMetricsObserver = {
-      recordApiRequestTime: () => {},
-      recordWorkflowInputSize: () => {},
-      recordWorkflowStartError: () => {},
+      recordApiRequestTime: jest.fn(),
+      recordWorkflowInputSize: jest.fn(),
+      recordWorkflowStartError: jest.fn(),
     };
 
     setHttpMetricsObserver(observer1);

From 0b082d6c4d2d931c4f446ccd90c9e2f675f57a25 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Tue, 5 May 2026 11:02:22 -0600
Subject: [PATCH 05/21] add retry for 502,503,504 error codes

---
 .github/workflows/pull_request.yml            |  4 +-
 .../helpers/__tests__/fetchWithRetry.test.ts  | 74 +++++++++++++++++++
 .../helpers/fetchWithRetry.ts                 | 12 +++
 3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index 56599d35..c2271a6a 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -79,14 +79,14 @@ jobs:
           name: codecov-unit-node-${{ matrix.node-version }}
           fail_ci_if_error: false
 
-  # Integration tests (v5): one job at a time (max-parallel: 1) to avoid 502/503 on shared Conductor.
+  # Integration tests (v5): one shard at a time (max-parallel: 1) to avoid 502/503 on shared Conductor.
   # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests — keeps per-job under timeout.
   integration-tests:
     runs-on: ubuntu-latest
     timeout-minutes: 25
     strategy:
       fail-fast: false
-      max-parallel: 2
+      max-parallel: 1
       matrix:
         node-version: [20, 22, 24]
         shard: [1, 2, 3]
diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
index 887c0665..883c65b5 100644
--- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
+++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
@@ -181,6 +181,80 @@ describe("fetchWithRetry", () => {
     });
   });
 
+  // ─── Server error (502/503/504) retry ───────────────────────────────
+
+  describe("server error (502/503/504) retry", () => {
+    it("should retry 502 and succeed on next attempt", async () => {
+      mockFetch
+        .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway"))
+        .mockResolvedValueOnce(createMockResponse(200, "ok"));
+
+      const result = await retryFetch("http://test.com", {}, mockFetch, {
+        maxTransportRetries: 3,
+        initialRetryDelay: 1,
+      });
+
+      expect(result.status).toBe(200);
+      expect(mockFetch).toHaveBeenCalledTimes(2);
+    });
+
+    it("should retry 503 and succeed on next attempt", async () => {
+      mockFetch
+        .mockResolvedValueOnce(createMockResponse(503, "Service Unavailable"))
+        .mockResolvedValueOnce(createMockResponse(200, "ok"));
+
+      const result = await retryFetch("http://test.com", {}, mockFetch, {
+        maxTransportRetries: 3,
+        initialRetryDelay: 1,
+      });
+
+      expect(result.status).toBe(200);
+      expect(mockFetch).toHaveBeenCalledTimes(2);
+    });
+
+    it("should retry 504 and succeed on next attempt", async () => {
+      mockFetch
+        .mockResolvedValueOnce(createMockResponse(504, "Gateway Timeout"))
+        .mockResolvedValueOnce(createMockResponse(200, "ok"));
+
+      const result = await retryFetch("http://test.com", {}, mockFetch, {
+        maxTransportRetries: 3,
+        initialRetryDelay: 1,
+      });
+
+      expect(result.status).toBe(200);
+      expect(mockFetch).toHaveBeenCalledTimes(2);
+    });
+
+    it("should exhaust retries and return last 5xx response", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway"));
+
+      const result = await retryFetch("http://test.com", {}, mockFetch, {
+        maxTransportRetries: 2,
+        initialRetryDelay: 1,
+      });
+
+      expect(result.status).toBe(502);
+      // 1 initial + 2 retries = 3
+      expect(mockFetch).toHaveBeenCalledTimes(3);
+    });
+
+    it("should handle transport error then 502 then success", async () => {
+      mockFetch
+        .mockRejectedValueOnce(new Error("ECONNRESET"))
+        .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway"))
+        .mockResolvedValueOnce(createMockResponse(200, "ok"));
+
+      const result = await retryFetch("http://test.com", {}, mockFetch, {
+        maxTransportRetries: 3,
+        initialRetryDelay: 1,
+      });
+
+      expect(result.status).toBe(200);
+      expect(mockFetch).toHaveBeenCalledTimes(3);
+    });
+  });
+
   // ─── Auth failure (401/403) retry ──────────────────────────────────
 
   describe("auth failure (401/403) retry", () => {
diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
index 1a49d509..679a7439 100644
--- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
+++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
@@ -157,6 +157,18 @@ export const retryFetch = async (
       return rateLimitResponse;
     }
 
+    // Gateway error retry (502, 503, 504) -- transient proxy/server errors
+    if (response.status >= 502 && response.status <= 504) {
+      lastError = new Error(`Server error: HTTP ${response.status}`);
+      if (transportAttempt < maxTransportRetries) {
+        await new Promise((resolve) =>
+          setTimeout(resolve, withJitter(initialRetryDelay * (transportAttempt + 1)))
+        );
+        continue;
+      }
+      return response;
+    }
+
     // Auth failure retry (401/403) - only refresh+retry when the error is a token
     // problem (EXPIRED_TOKEN or INVALID_TOKEN). Permission errors should propagate
     // immediately without wasting a token refresh + retry round-trip.

From 396fcc5efaf938685c4a75df5b4d499d0f873257 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Tue, 5 May 2026 11:13:25 -0600
Subject: [PATCH 06/21] experiments with flaky test runs ...

---
 .github/workflows/pull_request.yml | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index c2271a6a..ddf85030 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -79,14 +79,16 @@ jobs:
           name: codecov-unit-node-${{ matrix.node-version }}
           fail_ci_if_error: false
 
-  # Integration tests (v5): one shard at a time (max-parallel: 1) to avoid 502/503 on shared Conductor.
-  # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests — keeps per-job under timeout.
+  # Integration tests (v5): lower max-parallel reduces 502/503 from the shared Conductor server
+  # but makes CI slower without eliminating flakes entirely — feel free to experiment - this test
+  # setup is insane. Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests.
+  # fetchWithRetry now retries 502/503/504, so higher parallelism is more viable than before.
   integration-tests:
     runs-on: ubuntu-latest
     timeout-minutes: 25
     strategy:
       fail-fast: false
-      max-parallel: 1
+      max-parallel: 3
       matrix:
         node-version: [20, 22, 24]
         shard: [1, 2, 3]

From bf71a1eed66367da5fc4a14c3ddf0155066c5108 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Tue, 5 May 2026 11:45:48 -0600
Subject: [PATCH 07/21] remove editorialization

---
 .github/workflows/pull_request.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index ddf85030..b835e6e5 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -80,8 +80,8 @@ jobs:
           fail_ci_if_error: false
 
   # Integration tests (v5): lower max-parallel reduces 502/503 from the shared Conductor server
-  # but makes CI slower without eliminating flakes entirely — feel free to experiment - this test
-  # setup is insane. Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests.
+  # but makes CI slower without eliminating flakes entirely — feel free to experiment.
+  # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests.
   # fetchWithRetry now retries 502/503/504, so higher parallelism is more viable than before.
   integration-tests:
     runs-on: ubuntu-latest

From c8c4e9f288b33e3bc5afc2a88af77a9806dbd114 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Wed, 6 May 2026 09:37:01 -0600
Subject: [PATCH 08/21] cleaner reporting on which metrics implementation is
 used in the test harness worker

---
 harness/main.ts                                   |  7 +------
 .../worker/metrics/CanonicalMetricsCollector.ts   |  4 ++++
 src/sdk/worker/metrics/LegacyMetricsCollector.ts  |  4 ++++
 .../worker/metrics/MetricsCollectorInterface.ts   |  1 +
 .../metrics/__tests__/metricsFactory.test.ts      | 15 +++++++++++++++
 5 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/harness/main.ts b/harness/main.ts
index ad30b732..081188d7 100644
--- a/harness/main.ts
+++ b/harness/main.ts
@@ -94,12 +94,7 @@ async function main(): Promise<void> {
 
   const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991);
   const metricsCollector = createMetricsCollector({ httpPort: metricsPort });
-  const variant = ["true", "1", "yes"].includes(
-    (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(),
-  )
-    ? "canonical"
-    : "legacy";
-  console.log(`Prometheus metrics server started on port ${metricsPort} (${variant} metrics)`);
+  console.log(`Prometheus metrics server started on port ${metricsPort} (${metricsCollector.collectorName()} metrics)`);
 
   const handler = new TaskHandler({
     client,
diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
index d12b8e7a..f0fa5314 100644
--- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
+++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
@@ -416,6 +416,10 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface {
     }
   }
 
+  collectorName(): string {
+    return "canonical";
+  }
+
   getContentType(): string {
     return (
       this._promRegistry?.contentType ??
diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
index cbf1b3ac..381c14b5 100644
--- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts
+++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
@@ -342,6 +342,10 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface {
     }
   }
 
+  collectorName(): string {
+    return "legacy";
+  }
+
   getContentType(): string {
     return (
       this._promRegistry?.contentType ??
diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts
index 78d0c0b0..4852b099 100644
--- a/src/sdk/worker/metrics/MetricsCollectorInterface.ts
+++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts
@@ -39,6 +39,7 @@ export interface MetricsCollectorInterface extends TaskRunnerEventsListener {
 
   // ── Output / lifecycle ─────────────────────────────────────────
 
+  collectorName(): string;
   getMetrics(): unknown;
   reset(): void;
   stop(): Promise<void>;
diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
index f5a28f91..529fad17 100644
--- a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
+++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
@@ -79,6 +79,20 @@ describe("createMetricsCollector", () => {
     expect(typeof text).toBe("string");
   });
 
+  it('legacy collector returns "legacy" from collectorName()', () => {
+    delete process.env.WORKER_CANONICAL_METRICS;
+
+    const collector = createMetricsCollector();
+    expect(collector.collectorName()).toBe("legacy");
+  });
+
+  it('canonical collector returns "canonical" from collectorName()', () => {
+    process.env.WORKER_CANONICAL_METRICS = "true";
+
+    const collector = createMetricsCollector();
+    expect(collector.collectorName()).toBe("canonical");
+  });
+
   it("both implementations satisfy MetricsCollectorInterface", () => {
     const legacy = createMetricsCollector();
     const requiredMethods = [
@@ -97,6 +111,7 @@ describe("createMetricsCollector", () => {
       "stop",
       "getContentType",
       "toPrometheusText",
+      "collectorName",
       "toPrometheusTextAsync",
     ];
 

From c3da9274c4ae09bac81725f3b28fca362b8ea2a9 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Wed, 6 May 2026 12:42:53 -0600
Subject: [PATCH 09/21] updates for metrics related documentation

---
 AGENTS.md                         |  10 +-
 METRICS.md                        | 485 ++++++++++++++++++------------
 README.md                         |  10 +-
 SDK_COMPARISON.md                 |  27 +-
 SDK_DEVELOPMENT.md                |  24 +-
 WORKER_ARCHITECTURE_COMPARISON.md |  27 +-
 6 files changed, 347 insertions(+), 236 deletions(-)

diff --git a/AGENTS.md b/AGENTS.md
index 87f2aa09..afb1584e 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -32,7 +32,7 @@ src/sdk/                         # Main SDK source
     decorators/worker.ts         # @worker decorator + dual-mode support
     decorators/registry.ts       # Global registry (register/get/clear)
     context/TaskContext.ts        # AsyncLocalStorage per-task context
-    metrics/                     # MetricsCollector, MetricsServer, PrometheusRegistry
+    metrics/                     # LegacyMetricsCollector, CanonicalMetricsCollector, metricsFactory, MetricsServer, PrometheusRegistry, CanonicalPrometheusRegistry, accumulators, httpObserver
     schema/                      # jsonSchema, schemaField decorators
   generators/                    # Legacy generators (pre-v3, still exported for compat)
 src/open-api/                    # OpenAPI layer
@@ -211,10 +211,10 @@ public async someMethod(args): Promise<T> {
 
 ### Metrics Documentation (METRICS.md)
 
-When adding, removing, or renaming metrics in `src/sdk/worker/metrics/MetricsCollector.ts`:
-1. Update `METRICS.md` to reflect the change (name, type, labels, description)
-2. Ensure both `MetricsCollector.toPrometheusText()` and `PrometheusRegistry.createMetrics()` are updated in sync — missing a summary/counter in either causes silent data loss
-3. Update the metric count in the METRICS.md overview section
+When adding, removing, or renaming metrics in `src/sdk/worker/metrics/`:
+1. Update both `LegacyMetricsCollector.ts` and `CanonicalMetricsCollector.ts` (or add a no-op stub in the collector that does not emit the metric)
+2. Ensure `toPrometheusText()` and the corresponding `PrometheusRegistry` / `CanonicalPrometheusRegistry` are updated in sync — missing a metric in either causes silent data loss
+3. Update `METRICS.md` to reflect the change in both the legacy and canonical catalog tables
 4. Add or update the corresponding direct recording method documentation if applicable
 
 ### SDK_NEW_LANGUAGE_GUIDE.md
diff --git a/METRICS.md b/METRICS.md
index d8770ed8..43c22252 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -1,259 +1,364 @@
-# Metrics Reference
+# JavaScript SDK Metrics
 
-The Conductor JavaScript SDK provides built-in Prometheus metrics for monitoring worker performance, API latency, and task execution.
+The Conductor JavaScript SDK can expose Prometheus metrics for worker polling,
+task execution, task updates, workflow starts, external payload usage, and
+API-client HTTP calls.
 
-## Overview
+The SDK currently has two mutually exclusive metric surfaces:
 
-`MetricsCollector` implements `TaskRunnerEventsListener` and records **18 metric types** (12 counters + 6 summaries). Metrics are exposed in [Prometheus exposition format](https://prometheus.io/docs/instrumenting/exposition_formats/).
+- **Legacy metrics** are the default. They preserve the original JavaScript SDK
+  names and shapes, including a `conductor_worker_` prefix, `task_type` labels,
+  millisecond time units, and Summary type for distributions.
+- **Canonical metrics** are opt-in with `WORKER_CANONICAL_METRICS=true`. They
+  use the cross-SDK canonical names, labels, units, and Prometheus histogram
+  shapes.
 
-- **Default prefix:** `conductor_worker`
-- **Quantiles:** p50, p75, p90, p95, p99 (computed from a sliding window)
-- **Sliding window:** Last 1,000 observations (configurable)
+Only one collector is active at a time. The SDK does not emit legacy and
+canonical metrics at the same time.
 
-## Quick Start
+Metrics are created lazily. A metric appears in `/metrics` only after the
+corresponding worker event or collector method records it. Some low-level
+surface metrics, such as ack, queue-full, paused, and uncaught-exception
+counters, may not appear in normal worker runs unless that path is exercised.
 
-### HTTP Server
+## Usage
 
-```typescript
-import { MetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript";
+Create a metrics collector, start a scrape server, and wire the collector into
+`TaskHandler` as an event listener:
 
-const metrics = new MetricsCollector({ httpPort: 9090 });
+```typescript
+import {
+  createMetricsCollector,
+  MetricsServer,
+  TaskHandler,
+} from "@io-orkes/conductor-javascript";
+
+const metrics = createMetricsCollector();
+const server = new MetricsServer(metrics, 9090);
+await server.start();
 
 const handler = new TaskHandler({
   client,
-  scanForDecorated: true,
   eventListeners: [metrics],
+  scanForDecorated: true,
 });
 
 await handler.startWorkers();
 // GET http://localhost:9090/metrics  — Prometheus text format
-// GET http://localhost:9090/health   — { "status": "UP" }
+// GET http://localhost:9090/health   — {"status":"UP"}
+```
+
+`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` and returns either
+a `LegacyMetricsCollector` or a `CanonicalMetricsCollector`. Both implement
+`MetricsCollectorInterface`, so call sites never need to know which variant is
+active.
+
+You can also construct a collector directly if you need to pass configuration:
+
+```typescript
+import { LegacyMetricsCollector } from "@io-orkes/conductor-javascript";
+
+const metrics = new LegacyMetricsCollector({
+  httpPort: 9090,
+  filePath: "/tmp/conductor_metrics.prom",
+  fileWriteIntervalMs: 10000,
+  usePromClient: true,
+});
 ```
 
 ### File Output
 
 ```typescript
-const metrics = new MetricsCollector({
+const metrics = createMetricsCollector({
   filePath: "/tmp/conductor_metrics.prom",
-  fileWriteIntervalMs: 10000, // write every 10s
+  fileWriteIntervalMs: 10000,
 });
 ```
 
-The file writer performs an immediate first write, then writes periodically at the configured interval. The timer is unreferenced so it does not prevent Node.js process exit.
+The file writer performs an immediate first write, then writes periodically at
+the configured interval. The timer is unreferenced so it does not prevent
+Node.js process exit.
 
 ### prom-client Integration
 
 ```typescript
-const metrics = new MetricsCollector({ usePromClient: true });
+const metrics = createMetricsCollector({ usePromClient: true });
 // Metrics are registered in prom-client's default registry.
 // Use prom-client's register.metrics() for native scraping.
 ```
 
-Requires `npm install prom-client`. Falls back to built-in text format if not installed.
+Requires `npm install prom-client`. Falls back to built-in text format if
+prom-client is not installed.
 
-### All-in-One
+## Selecting Canonical Metrics
 
-```typescript
-const metrics = new MetricsCollector({
-  prefix: "myapp_worker",
-  httpPort: 9090,
-  filePath: "/tmp/metrics.prom",
-  fileWriteIntervalMs: 10000,
-  slidingWindowSize: 500,
-  usePromClient: true,
-});
+Set `WORKER_CANONICAL_METRICS` before the worker starts:
+
+```shell
+WORKER_CANONICAL_METRICS=true node my_worker.js
 ```
 
+Accepted true values are `true`, `1`, and `yes`, case-insensitive. Any other
+value, or an unset variable, selects legacy metrics. The variable is read when
+the metrics collector is created, so changing it requires a worker restart.
+
 ## Configuration
 
 | Option | Type | Default | Description |
-|--------|------|---------|-------------|
-| `prefix` | `string` | `"conductor_worker"` | Prometheus metric name prefix |
-| `httpPort` | `number` | — | Start built-in HTTP server on this port |
-| `filePath` | `string` | — | Periodically write metrics to this file path |
-| `fileWriteIntervalMs` | `number` | `5000` | File write interval in milliseconds |
-| `slidingWindowSize` | `number` | `1000` | Max observations kept for quantile calculation |
-| `usePromClient` | `boolean` | `false` | Use `prom-client` for native Prometheus integration |
-
----
-
-## Counter Metrics
-
-### Labeled by `task_type`
-
-| Prometheus Name | Internal Key | Description |
-|----------------|-------------|-------------|
-| `{prefix}_task_poll_total` | `pollTotal` | Total number of task polls initiated |
-| `{prefix}_task_poll_error_total` | `pollErrorTotal` | Total number of failed task polls |
-| `{prefix}_task_execute_total` | `taskExecutionTotal` | Total number of task executions completed |
-| `{prefix}_task_execute_error_total` | `taskExecutionErrorTotal` | Total task execution errors. Label format: `taskType:ExceptionName` |
-| `{prefix}_task_update_error_total` | `taskUpdateFailureTotal` | Total task result update failures (result lost from Conductor) |
-| `{prefix}_task_ack_error_total` | `taskAckErrorTotal` | Total task acknowledgement errors |
-| `{prefix}_task_execution_queue_full_total` | `taskExecutionQueueFullTotal` | Times the execution queue was full (concurrency limit reached) |
-| `{prefix}_task_paused_total` | `taskPausedTotal` | Total task paused events |
-
-### Labeled by `payload_type`
-
-| Prometheus Name | Internal Key | Description |
-|----------------|-------------|-------------|
-| `{prefix}_external_payload_used_total` | `externalPayloadUsedTotal` | External payload storage usage (e.g., `"workflow_input"`, `"task_output"`) |
-
-### Global (no labels)
-
-| Prometheus Name | Internal Key | Description |
-|----------------|-------------|-------------|
-| `{prefix}_thread_uncaught_exceptions_total` | `uncaughtExceptionTotal` | Total uncaught exceptions in worker processes |
-| `{prefix}_worker_restart_total` | `workerRestartTotal` | Total worker restart events |
-| `{prefix}_workflow_start_error_total` | `workflowStartErrorTotal` | Total workflow start errors |
-
----
-
-## Summary Metrics
-
-Each summary emits quantile values, a count, and a sum:
-
-```
-{name}{task_type="myTask",quantile="0.5"} 12.3
-{name}{task_type="myTask",quantile="0.75"} 15.1
-{name}{task_type="myTask",quantile="0.9"} 18.7
-{name}{task_type="myTask",quantile="0.95"} 22.0
-{name}{task_type="myTask",quantile="0.99"} 45.2
-{name}_count{task_type="myTask"} 1000
-{name}_sum{task_type="myTask"} 14523.7
+|---|---|---|---|
+| `prefix` | `string` | `"conductor_worker"` | Prometheus metric name prefix. Legacy only; canonical metrics are unprefixed. |
+| `httpPort` | `number` | — | Start built-in HTTP server on this port. |
+| `filePath` | `string` | — | Periodically write metrics to this file path. |
+| `fileWriteIntervalMs` | `number` | `5000` | File write interval in milliseconds. |
+| `slidingWindowSize` | `number` | `1000` | Max observations for quantile calculation. Legacy only; canonical uses histogram buckets. |
+| `usePromClient` | `boolean` | `false` | Use `prom-client` for native Prometheus integration. |
+
+## Canonical Metrics
+
+Canonical timing values are seconds. Canonical size values are bytes. Label
+names use camelCase.
+
+### Canonical Counters
+
+| Metric | Labels | Description |
+|---|---|---|
+| `task_poll_total` | `taskType` | Incremented each time the worker issues a poll request. |
+| `task_execution_started_total` | `taskType` | Incremented when a polled task is dispatched to the worker function. |
+| `task_poll_error_total` | `taskType`, `exception` | Incremented when a poll request fails client-side. |
+| `task_execute_error_total` | `taskType`, `exception` | Incremented when the worker function throws. |
+| `task_update_error_total` | `taskType`, `exception` | Incremented when updating the task result fails. |
+| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. The internal runner uses batch poll responses as ack and may not emit this during normal polling. |
+| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. The internal runner uses batch poll responses as ack and may not emit this during normal polling. |
+| `task_execution_queue_full_total` | `taskType` | Incremented when the worker execution queue is saturated. |
+| `task_paused_total` | `taskType` | Incremented when a worker is paused and skips acting on a poll. |
+| `thread_uncaught_exceptions_total` | `exception` | Incremented on uncaught exceptions in the worker process. |
+| `external_payload_used_total` | `entityName`, `operation`, `payloadType` | Incremented when external payload storage is used for task or workflow payloads. |
+| `workflow_start_error_total` | `workflowType`, `exception` | Incremented when starting a workflow fails client-side. |
+
+### Canonical Time Histograms
+
+All canonical time histograms use buckets:
+`0.001`, `0.005`, `0.01`, `0.025`, `0.05`, `0.1`, `0.25`, `0.5`, `1`, `2.5`,
+`5`, `10`.
+
+| Metric | Labels | Description |
+|---|---|---|
+| `task_poll_time_seconds` | `taskType`, `status` | Poll request latency. `status` is `SUCCESS` or `FAILURE`. |
+| `task_execute_time_seconds` | `taskType`, `status` | Worker function execution duration. `status` is `SUCCESS` or `FAILURE`. |
+| `task_update_time_seconds` | `taskType`, `status` | Task-result update latency. `status` is `SUCCESS` or `FAILURE`. |
+| `http_api_client_request_seconds` | `method`, `uri`, `status` | API-client HTTP request latency. `status` is the HTTP status code as a string, or `"0"` on network failure. |
+
+Each histogram exposes Prometheus series such as:
+
+```prometheus
+task_execute_time_seconds_bucket{taskType="my_task",status="SUCCESS",le="0.1"} 42
+task_execute_time_seconds_count{taskType="my_task",status="SUCCESS"} 50
+task_execute_time_seconds_sum{taskType="my_task",status="SUCCESS"} 2.3
 ```
 
-### Labeled by `task_type`
+### Canonical Size Histograms
 
-| Prometheus Name | Internal Key | Unit | Description |
-|----------------|-------------|------|-------------|
-| `{prefix}_task_poll_time` | `pollDurationMs` | ms | Task poll round-trip duration |
-| `{prefix}_task_execute_time` | `executionDurationMs` | ms | Worker function execution duration |
-| `{prefix}_task_update_time` | `updateDurationMs` | ms | Task result update (SDK to server) duration |
-| `{prefix}_task_result_size_bytes` | `outputSizeBytes` | bytes | Task result output payload size |
+All canonical size histograms use buckets:
+`100`, `1000`, `10000`, `100000`, `1000000`, `10000000`.
 
-### Labeled by `workflow_type`
+| Metric | Labels | Description |
+|---|---|---|
+| `task_result_size_bytes` | `taskType` | Serialized task result output size. |
+| `workflow_input_size_bytes` | `workflowType`, `version` | Serialized workflow input size. `version` is an empty string when not provided. |
 
-| Prometheus Name | Internal Key | Unit | Description |
-|----------------|-------------|------|-------------|
-| `{prefix}_workflow_input_size_bytes` | `workflowInputSizeBytes` | bytes | Workflow input payload size |
+### Canonical Gauges
 
-### Labeled by `endpoint`
+| Metric | Labels | Description |
+|---|---|---|
+| `active_workers` | `taskType` | Current number of workers actively executing tasks. |
 
-| Prometheus Name | Internal Key | Unit | Description |
-|----------------|-------------|------|-------------|
-| `{prefix}_http_api_client_request` | `apiRequestDurationMs` | ms | API request duration. Label format: `METHOD:/api/path:STATUS` |
+## Legacy Metrics
 
----
+Legacy mode is the default so existing dashboards and alerts continue to work.
+The default metric name prefix is `conductor_worker`. The prefix is configurable
+via the `prefix` option on `MetricsCollectorConfig`.
 
-## Event Listener Methods
+Distribution metrics are sliding-window summaries over the latest 1,000
+observations (configurable via `slidingWindowSize`), exposing quantiles at
+p50, p75, p90, p95, and p99. Legacy distribution metrics also expose `_count`
+and `_sum` series.
 
-These methods are called automatically by the `TaskRunner` when `MetricsCollector` is registered as an event listener:
+### Legacy Counters
 
-| Method | Metrics Updated |
-|--------|----------------|
-| `onPollStarted(event)` | Increments `pollTotal` |
-| `onPollCompleted(event)` | Records `pollDurationMs` |
-| `onPollFailure(event)` | Increments `pollErrorTotal`, records `pollDurationMs` |
-| `onTaskExecutionStarted(event)` | _(no-op, counted on completion)_ |
-| `onTaskExecutionCompleted(event)` | Increments `taskExecutionTotal`, records `executionDurationMs` and `outputSizeBytes` |
-| `onTaskExecutionFailure(event)` | Increments `taskExecutionErrorTotal`, records `executionDurationMs` |
-| `onTaskUpdateCompleted(event)` | Records `updateDurationMs` |
-| `onTaskUpdateFailure(event)` | Increments `taskUpdateFailureTotal` |
+| Metric | Labels | Description |
+|---|---|---|
+| `conductor_worker_task_poll_total` | `task_type` | Incremented each time polling is done. |
+| `conductor_worker_task_poll_error_total` | `task_type` | Incremented when a poll request fails. |
+| `conductor_worker_task_execute_total` | `task_type` | Incremented when a task execution completes. |
+| `conductor_worker_task_execute_error_total` | `task_type` | Task execution errors. Label format: `taskType:ExceptionName`. |
+| `conductor_worker_task_update_error_total` | `task_type` | Incremented when updating the task result fails. |
+| `conductor_worker_task_ack_error_total` | `task_type` | Collector surface for task ack errors. |
+| `conductor_worker_task_execution_queue_full_total` | `task_type` | Incremented when the execution queue is saturated. |
+| `conductor_worker_task_paused_total` | `task_type` | Incremented when a worker is paused and skips a poll. |
+| `conductor_worker_external_payload_used_total` | `payload_type` | External payload storage usage. |
+| `conductor_worker_thread_uncaught_exceptions_total` | none | Uncaught exceptions in the worker process. |
+| `conductor_worker_worker_restart_total` | none | Worker restart events. |
+| `conductor_worker_workflow_start_error_total` | none | Workflow start errors. |
 
-## Direct Recording Methods
+Legacy mode does not emit `task_execution_started_total`,
+`task_ack_failed_total`, or `active_workers`.
 
-For metrics outside the event listener system, call these methods directly:
+### Legacy Time Metrics
 
-```typescript
-const collector = new MetricsCollector();
-
-collector.recordTaskExecutionQueueFull("my_task");
-collector.recordUncaughtException();
-collector.recordWorkerRestart();
-collector.recordTaskPaused("my_task");
-collector.recordTaskAckError("my_task");
-collector.recordWorkflowStartError();
-collector.recordExternalPayloadUsed("task_output");
-collector.recordWorkflowInputSize("my_workflow", 2048);
-collector.recordApiRequestTime("POST", "/api/tasks", 200, 35);
-```
+Time values are milliseconds. Type is Summary.
 
-## Exposition Formats
+| Metric | Labels | Description |
+|---|---|---|
+| `conductor_worker_task_poll_time` | `task_type` | Poll round-trip duration. |
+| `conductor_worker_task_execute_time` | `task_type` | Worker function execution duration. |
+| `conductor_worker_task_update_time` | `task_type` | Task result update duration. |
 
-### Built-in Prometheus Text
+Each summary exposes quantile, count, and sum series:
 
-```typescript
-const text = collector.toPrometheusText();
-// Returns Prometheus text format (text/plain; version=0.0.4)
+```prometheus
+conductor_worker_task_execute_time{task_type="my_task",quantile="0.5"} 102
+conductor_worker_task_execute_time{task_type="my_task",quantile="0.95"} 250
+conductor_worker_task_execute_time_count{task_type="my_task"} 1000
+conductor_worker_task_execute_time_sum{task_type="my_task"} 120345
 ```
 
-### Async (with prom-client support)
-
-```typescript
-const text = await collector.toPrometheusTextAsync();
-// Uses prom-client registry when available, falls back to built-in
-```
-
-### HTTP Server (MetricsServer)
-
-```typescript
-import { MetricsServer } from "@io-orkes/conductor-javascript";
-
-const server = new MetricsServer(collector, 9090);
-await server.start();
-// GET /metrics — Content-Type from collector.getContentType()
-// GET /health  — { "status": "UP" }
-await server.stop();
+### Legacy Size Metrics
+
+Type is Summary. Values are bytes.
+
+| Metric | Labels | Description |
+|---|---|---|
+| `conductor_worker_task_result_size_bytes` | `task_type` | Task result output payload size. |
+| `conductor_worker_workflow_input_size_bytes` | `workflow_type` | Workflow input payload size. |
+
+### Legacy HTTP Metrics
+
+| Metric | Labels | Description |
+|---|---|---|
+| `conductor_worker_http_api_client_request` | `endpoint` | API request duration in milliseconds. The `endpoint` label is a compound `"METHOD:/api/path:STATUS"` string. |
+
+## Labels
+
+| Label | Used by | Values |
+|---|---|---|
+| `task_type` | Legacy worker metrics | Task definition name. |
+| `taskType` | Canonical worker metrics | Task definition name. |
+| `workflowType` | Canonical workflow metrics | Workflow definition name. |
+| `workflow_type` | Legacy `conductor_worker_workflow_input_size_bytes` | Workflow definition name. |
+| `version` | Canonical `workflow_input_size_bytes` | Workflow version as a string. Empty string when not provided. |
+| `status` | Canonical task time histograms | `SUCCESS` or `FAILURE`. For `http_api_client_request_seconds`, the HTTP status code as a string, or `"0"` on network failure. |
+| `exception` | Canonical error counters | Exception type name, such as `TypeError`. Derived from `error.name` or `error.constructor.name`. |
+| `entityName` | Canonical `external_payload_used_total` | Task type or workflow name associated with the external payload. |
+| `operation` | Canonical `external_payload_used_total` | External payload operation, such as `READ` or `WRITE`. |
+| `payload_type` | Legacy `conductor_worker_external_payload_used_total` | Payload type, such as `workflow_input` or `task_output`. |
+| `payloadType` | Canonical `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. |
+| `method` | Canonical HTTP metrics | HTTP verb. |
+| `uri` | Canonical HTTP metrics | Request path. May contain interpolated identifiers. |
+| `endpoint` | Legacy HTTP metrics | Compound `"METHOD:/api/path:STATUS"` string. |
+| `quantile` | Legacy time and size metrics | `0.5`, `0.75`, `0.9`, `0.95`, or `0.99`. |
+
+## Migrating From Legacy to Canonical
+
+Canonical mode is opt-in during the deprecation period. Before switching a
+production worker, update dashboards and alerts against a staging worker with
+`WORKER_CANONICAL_METRICS=true`.
+
+Key changes:
+
+- The `conductor_worker_` prefix is removed. Canonical metric names are
+  unprefixed.
+- Legacy task labels use `task_type`; canonical task labels use `taskType`.
+- Legacy time metrics are millisecond summaries with quantiles. Canonical time
+  metrics are second-based histograms with bucket boundaries. Query `_bucket`
+  series with `histogram_quantile()` instead of reading `{quantile="..."}`
+  gauges.
+- Legacy size metrics are summaries. Canonical size metrics are histograms.
+- Canonical error counters add an `exception` label containing the exception
+  type name.
+- Canonical time histograms add a `status` label (`SUCCESS` or `FAILURE`).
+- Canonical mode adds metrics that legacy mode never emits:
+  `task_execution_started_total`, `task_ack_failed_total`, and
+  `active_workers`.
+- Legacy `conductor_worker_worker_restart_total` is not emitted in canonical
+  mode (Node.js single-process model).
+- Legacy uses `payload_type`; canonical uses `payloadType`.
+- Legacy HTTP metrics use a compound `endpoint` label; canonical uses separate
+  `method`, `uri`, and `status` labels.
+- Canonical and legacy collectors are mutually exclusive. During a migration,
+  compare scrape output by running separate worker instances or environments
+  with and without `WORKER_CANONICAL_METRICS=true`.
+
+Legacy-to-canonical replacements:
+
+| Legacy metric | Canonical replacement |
+|---|---|
+| `conductor_worker_task_poll_total{task_type}` | `task_poll_total{taskType}` |
+| `conductor_worker_task_poll_error_total{task_type}` | `task_poll_error_total{taskType,exception}` |
+| `conductor_worker_task_execute_total{task_type}` | `task_execute_time_seconds{taskType,status}` (count from histogram) and `task_execution_started_total{taskType}` |
+| `conductor_worker_task_execute_error_total{task_type}` | `task_execute_error_total{taskType,exception}` |
+| `conductor_worker_task_update_error_total{task_type}` | `task_update_error_total{taskType,exception}` |
+| `conductor_worker_task_poll_time{task_type}` (summary, ms) | `task_poll_time_seconds{taskType,status}` (histogram, seconds) |
+| `conductor_worker_task_execute_time{task_type}` (summary, ms) | `task_execute_time_seconds{taskType,status}` (histogram, seconds) |
+| `conductor_worker_task_update_time{task_type}` (summary, ms) | `task_update_time_seconds{taskType,status}` (histogram, seconds) |
+| `conductor_worker_task_result_size_bytes{task_type}` (summary) | `task_result_size_bytes{taskType}` (histogram) |
+| `conductor_worker_workflow_input_size_bytes{workflow_type}` (summary) | `workflow_input_size_bytes{workflowType,version}` (histogram) |
+| `conductor_worker_http_api_client_request{endpoint}` (summary, ms) | `http_api_client_request_seconds{method,uri,status}` (histogram, seconds) |
+| `conductor_worker_external_payload_used_total{payload_type}` | `external_payload_used_total{entityName,operation,payloadType}` |
+| `conductor_worker_workflow_start_error_total` (no labels) | `workflow_start_error_total{workflowType,exception}` |
+| `conductor_worker_worker_restart_total` | — (not emitted in canonical mode) |
+
+Common PromQL replacements:
+
+| Legacy | Canonical |
+|---|---|
+| `conductor_worker_task_execute_time{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_execute_time_seconds_bucket[5m])))` |
+| `conductor_worker_task_poll_time{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_poll_time_seconds_bucket[5m])))` |
+| `conductor_worker_http_api_client_request{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, method, uri, status) (rate(http_api_client_request_seconds_bucket[5m])))` |
+| `conductor_worker_task_result_size_bytes{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType) (rate(task_result_size_bytes_bucket[5m])))` |
+
+Average latency queries continue to use `_sum` divided by `_count`, but the
+canonical series are cumulative histogram counters:
+
+```promql
+sum(rate(task_execute_time_seconds_sum[5m])) by (taskType)
+/
+sum(rate(task_execute_time_seconds_count[5m])) by (taskType)
 ```
 
-### File Output
-
-Configured via `filePath` in `MetricsCollectorConfig`. Writes `toPrometheusText()` output to disk. The file writer performs an immediate first write on construction, then writes periodically at the configured interval.
-
----
-
-## Sliding Window and Quantile Calculation
-
-Summary metrics use a **sliding window** (default: 1,000 observations) to calculate percentiles. This provides:
+## Troubleshooting
 
-- Accurate recent percentiles without unbounded memory growth
-- No need to pre-configure histogram bucket boundaries
-- Direct percentile values without interpolation artifacts
+### Metrics Are Empty
 
-Quantiles are computed on-demand using linear interpolation on sorted observations when `toPrometheusText()` is called.
+- Verify that `createMetricsCollector()` or a collector constructor is called
+  and the collector is passed to `TaskHandler` via `eventListeners`.
+- Verify workers have polled or executed tasks. Metrics are created lazily when
+  the relevant event occurs.
+- Confirm the scrape endpoint is reachable at the expected host and port.
 
-When using `prom-client` (`usePromClient: true`), summaries use prom-client's native implementation with `maxAgeSeconds: 600` and `ageBuckets: 5`.
+### Missing HTTP or Workflow Metrics
 
----
+- `http_api_client_request_seconds` (canonical) or
+  `conductor_worker_http_api_client_request` (legacy) is recorded from the
+  `fetchWithRetry` HTTP layer. Verify the collector is constructed before HTTP
+  calls begin.
+- `workflow_input_size_bytes` and `workflow_start_error_total` are recorded in
+  `WorkflowExecutor`. Verify the collector is active before starting workflows.
 
-## Monitoring Best Practices
+### High Cardinality
 
-- **Use p95/p99 for SLO monitoring** rather than averages. Percentile-based thresholds better capture user-impacting performance variations.
-- **Alert on `task_update_error_total`** — a rising count indicates task results are being lost and workers are failing to report back to the Conductor server.
-- **Alert on `task_execution_queue_full_total`** — indicates the concurrency limit is consistently reached. Consider increasing worker `concurrency`.
-- **Monitor `task_poll_time` p99** — high poll latency suggests network issues or server overload.
-- **Monitor `task_execute_time` p95** — watch for execution time regression in worker functions.
-- **File output interval**: 10-60 seconds recommended for production. Lower intervals increase disk I/O.
-- **Clean metrics directory on startup** when using file output with multiprocess workers to avoid stale data.
+- Watch the `uri` label (canonical) or `endpoint` label (legacy) on HTTP
+  metrics. The SDK records the interpolated request path, which may include
+  task type names or workflow IDs.
+- Prefer canonical mode for bounded `exception` labels. Legacy error counters
+  encode exception names in the Map key, not as a proper Prometheus label.
+- Avoid embedding user identifiers or unbounded values in task type, workflow
+  type, or external payload labels.
 
----
+### prom-client Issues
 
-## Programmatic Access
-
-```typescript
-const metrics = collector.getMetrics();
-
-// Counter values
-metrics.pollTotal.get("my_task");           // number
-metrics.taskExecutionTotal.get("my_task");  // number
-
-// Summary observations (raw array)
-metrics.pollDurationMs.get("my_task");      // number[]
-metrics.executionDurationMs.get("my_task"); // number[]
-
-// Reset all metrics
-collector.reset();
-
-// Stop file writer and HTTP server
-await collector.stop();
-```
+- `MetricsCollector` uses `await import("./MetricsServer.js")` internally. The
+  `.js` extension does not resolve under Jest's TypeScript transform. Test
+  `MetricsServer` by importing it directly, not via the `httpPort` config
+  option.
+- When `usePromClient: true` is set but `prom-client` is not installed, the
+  collector falls back to the built-in text format silently.
diff --git a/README.md b/README.md
index 7880908a..f5479ac5 100644
--- a/README.md
+++ b/README.md
@@ -386,12 +386,12 @@ await handler.startWorkers();
 
 ## Monitoring Workers
 
-Enable Prometheus metrics with the built-in `MetricsCollector`:
+Enable Prometheus metrics with the built-in metrics collector:
 
 ```typescript
-import { MetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript";
+import { createMetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript";
 
-const metrics = new MetricsCollector();
+const metrics = createMetricsCollector();
 const server = new MetricsServer(metrics, 9090);
 await server.start();
 
@@ -405,7 +405,7 @@ await handler.startWorkers();
 // GET http://localhost:9090/health  — {"status":"UP"}
 ```
 
-Collects 18 metric types: poll counts, execution durations, error rates, output sizes, and more — with p50/p75/p90/p95/p99 quantiles. See [METRICS.md](METRICS.md) for the full reference.
+The SDK has two metric surfaces: **legacy** (default, prefixed `conductor_worker_` names, Summary type) and **canonical** (opt-in via `WORKER_CANONICAL_METRICS=true`, unprefixed names, Histogram type). See [METRICS.md](METRICS.md) for the full reference.
 
 ## Managing Workflow Executions
 
@@ -542,7 +542,7 @@ See [examples/agentic-workflows/](examples/agentic-workflows/) for all examples.
 | Document | Description |
 |----------|-------------|
 | [SDK Development Guide](SDK_DEVELOPMENT.md) | Architecture, patterns, pitfalls, testing |
-| [Metrics Reference](METRICS.md) | All 18 Prometheus metrics with descriptions |
+| [Metrics Reference](METRICS.md) | Legacy and canonical Prometheus metrics with migration guide |
 | [Breaking Changes](BREAKING_CHANGES.md) | v3.x migration guide |
 | [Workflow Management](docs/api-reference/workflow-executor.md) | Start, pause, resume, terminate, retry, search, signal |
 | [Task Management](docs/api-reference/task-client.md) | Task operations, logs, queue management |
diff --git a/SDK_COMPARISON.md b/SDK_COMPARISON.md
index 881800bc..a2fdbfd6 100644
--- a/SDK_COMPARISON.md
+++ b/SDK_COMPARISON.md
@@ -4,7 +4,7 @@
 
 This document provides a detailed comparison between the **Python SDK** (golden reference, `conductor-python`) and the **JavaScript SDK** (`@io-orkes/conductor-javascript` v3.0.0), covering API surface, architecture, worker system, and feature parity.
 
-**Verdict Summary**: The JavaScript SDK has **near-complete parity** with the Python SDK (~98%). All client classes (including rate limit CRUD), worker features (TaskContext, adaptive backoff, 19 Prometheus metrics with quantiles and optional prom-client, decorator-based JSON schema generation), workflow DSL (`ConductorWorkflow` with `toSubWorkflowTask()`), and all 34 task type builders (including 13 LLM/AI + 2 MCP) are implemented.
+**Verdict Summary**: The JavaScript SDK has **near-complete parity** with the Python SDK (~98%). All client classes (including rate limit CRUD), worker features (TaskContext, adaptive backoff, Prometheus metrics with legacy and canonical modes, decorator-based JSON schema generation), workflow DSL (`ConductorWorkflow` with `toSubWorkflowTask()`), and all 34 task type builders (including 13 LLM/AI + 2 MCP) are implemented.
 
 ---
 
@@ -580,22 +580,23 @@ Features:
 
 #### Metrics / Observability
 
+Both SDKs support legacy and canonical metric surfaces via `WORKER_CANONICAL_METRICS`. See [METRICS.md](METRICS.md) for the JavaScript SDK catalog.
+
 | # | Feature | Python | JavaScript | Status |
 |---|---------|--------|-----------|--------|
-| 1 | MetricsCollector | Yes (19 metrics) | Yes (19 metrics) | Full |
+| 1 | Legacy / canonical collectors | `create_metrics_collector()` | `createMetricsCollector()` | Full |
 | 2 | HTTP `/metrics` endpoint | Yes | `MetricsServer` | Full |
 | 3 | HTTP `/health` endpoint | Yes | `MetricsServer` | Full |
 | 4 | Prometheus text format | Yes | `toPrometheusText()` | Full |
 | 5 | File-based metrics | Yes (`.prom` files) | Yes (`filePath` config option) | Full |
-| 6 | API request metrics | `http_api_client_request` | `recordApiRequestTime()` | Full |
-| 7 | Queue full metric | `task_execution_queue_full` | `recordTaskExecutionQueueFull()` | Full |
-| 8 | Uncaught exception metric | `thread_uncaught_exceptions` | `recordUncaughtException()` | Full |
-| 9 | Workflow start error | `workflow_start_error` | `recordWorkflowStartError()` | Full |
-| 10 | External payload used | `external_payload_used` | `recordExternalPayloadUsed()` | Full |
-| 11 | Worker restart metric | `worker_restart` | `recordWorkerRestart()` | Full |
-| 12 | Quantile calculation (p50-p99) | Yes (sliding window 1000) | Yes (sliding window, configurable) | Full |
-| 13 | `prometheus_client` integration | Yes (native prom-client) | Optional `prom-client` integration via `usePromClient: true` + `PrometheusRegistry` | Full (optional peer dep) |
-| 14 | Auto-start via `httpPort` config | Yes | Yes | Full |
+| 6 | API request metrics | Yes | `recordApiRequestTime()` | Full |
+| 7 | Queue full metric | Yes | `recordTaskExecutionQueueFull()` | Full |
+| 8 | Uncaught exception metric | Yes | `recordUncaughtException()` | Full |
+| 9 | Workflow start error | Yes | `recordWorkflowStartError()` | Full |
+| 10 | External payload used | Yes | `recordExternalPayloadUsed()` | Full |
+| 11 | Worker restart metric | Yes (Python-only, multi-process) | Legacy only (single-process model) | Full |
+| 12 | `prometheus_client` / `prom-client` | Yes (native) | Optional via `usePromClient: true` | Full |
+| 13 | Auto-start via `httpPort` config | Yes | Yes | Full |
 
 #### JSON Schema Generation
 
@@ -692,7 +693,7 @@ Both SDKs support identical environment variable formats for worker configuratio
 | **Worker system** | **98%** | TaskContext, adaptive backoff, auth backoff, events, schema |
 | **Task type builders** | **100%** | 34/34 builders including LLM, MCP, HTTP Poll, Webhook, GetDocument |
 | **Workflow DSL** | **95%** | Full `ConductorWorkflow` with `toSubWorkflowTask()`. Missing only `__call__` (language limitation) |
-| **Metrics/Observability** | **98%** | 19 metrics, quantiles (p50-p99), HTTP + file export, optional prom-client integration |
+| **Metrics/Observability** | **98%** | Legacy + canonical modes, HTTP + file export, optional prom-client integration |
 | **AI module** | **95%** | All LLM task builders + `LLMProvider`/`VectorDB` enums + `IntegrationConfig` types. Missing only `AIOrchestrator` |
 | **JSON schema** | **95%** | `jsonSchema()` declarative helper + `@schemaField()` decorator with `reflect-metadata` type inference |
 | **Scheduling** | **100%** | Full parity including tags |
@@ -712,7 +713,7 @@ Both SDKs support identical environment variable formats for worker configuratio
 7. **Worker configuration hierarchy**: Identical env var format and precedence
 8. **`TaskHandler` orchestrator**: Same architecture as Python
 9. **TaskContext**: Full async-local context with `addLog()`, `setCallbackAfter()`
-10. **Prometheus metrics**: 19 metrics with quantiles (p50-p99), `MetricsServer` (HTTP + file export), optional `prom-client` integration
+10. **Prometheus metrics**: Legacy and canonical modes with `createMetricsCollector()`, `MetricsServer` (HTTP + file export), optional `prom-client` integration
 11. **JSON schema generation**: `jsonSchema()` declarative helper + `@schemaField()` decorator with `reflect-metadata` runtime type inference + `inputType`/`outputType` on `@worker`
 12. **AI types**: `LLMProvider`, `VectorDB` enums + typed `IntegrationConfig` + `withPromptVariable()` helpers
 13. **OpenAPI-generated types**: Strong TypeScript types from spec
diff --git a/SDK_DEVELOPMENT.md b/SDK_DEVELOPMENT.md
index b40fd2a6..05bfd277 100644
--- a/SDK_DEVELOPMENT.md
+++ b/SDK_DEVELOPMENT.md
@@ -119,9 +119,15 @@ src/
       context/
         TaskContext.ts            # AsyncLocalStorage-based per-task context + getTaskContext()
       metrics/
-        MetricsCollector.ts      # TaskRunnerEventsListener impl, 19 metric types, quantiles
-        MetricsServer.ts         # HTTP server: /metrics (Prometheus) + /health (JSON)
-        PrometheusRegistry.ts    # Optional prom-client bridge (lazy loaded)
+        LegacyMetricsCollector.ts       # Default: prefixed names, Summary type, ms units
+        CanonicalMetricsCollector.ts    # Opt-in: unprefixed canonical names, Histogram type, seconds
+        metricsFactory.ts               # createMetricsCollector() reads WORKER_CANONICAL_METRICS
+        MetricsServer.ts                # HTTP server: /metrics (Prometheus) + /health (JSON)
+        MetricsCollectorInterface.ts    # Shared interface for both collectors
+        PrometheusRegistry.ts           # Optional prom-client bridge for legacy collector
+        CanonicalPrometheusRegistry.ts  # Optional prom-client bridge for canonical collector
+        accumulators.ts                 # HistogramAccumulator, MultiLabelCounter, GaugeMetric
+        httpObserver.ts                 # Global singleton for API request metrics
       schema/                    # jsonSchema(), schemaField() decorator, generateSchemaFromClass()
       config/                    # Worker configuration resolution helpers
     generators/                   # Legacy task generators (pre-v3, still exported for backward compat)
@@ -268,15 +274,13 @@ Returns `undefined` outside task execution. All 16 methods: `getTaskId()`, `getW
 
 ### 7. Metrics
 
+`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` and returns a legacy or canonical collector. See [METRICS.md](METRICS.md) for the full catalog and migration guide.
+
 ```typescript
-const metrics = new MetricsCollector({ prefix: "my_app", slidingWindowSize: 1000 });
+const metrics = createMetricsCollector();
 const handler = new TaskHandler({ client, eventListeners: [metrics], scanForDecorated: true });
 await handler.startWorkers();
 
-// Programmatic access
-const m = metrics.getMetrics();
-console.log(m.pollTotal.get("my_task"));
-
 // Prometheus text
 const text = metrics.toPrometheusText();
 ```
@@ -291,7 +295,7 @@ await server.start();
 await server.stop();
 ```
 
-**Do not use `MetricsCollector({ httpPort })` in Jest** - it uses a dynamic import with `.js` extension that doesn't resolve under Jest's TS transform.
+**Do not use `MetricsCollector({ httpPort })` in Jest** -- it uses a dynamic import with `.js` extension that doesn't resolve under Jest's TS transform.
 
 ## Known Pitfalls
 
@@ -616,7 +620,7 @@ Node 18+ required. Dual ESM/CJS via `tsup` with `exports` field in `package.json
 | Workflow DSL | `ConductorWorkflow` | `ConductorWorkflow` | Fluent builder |
 | Worker decorator | `@worker_task` | `@worker` | TS decorator syntax |
 | Task context | `get_task_context()` | `getTaskContext()` | AsyncLocalStorage vs contextvars |
-| Metrics | `MetricsCollector` | `MetricsCollector` | 19 metric types, quantiles |
+| Metrics | `MetricsCollector` | `createMetricsCollector()` | Legacy + canonical modes, see [METRICS.md](METRICS.md) |
 | Metrics server | HTTP endpoint | `MetricsServer` | `/metrics` + `/health` |
 | Non-retryable | `NonRetryableError` | `NonRetryableException` | `FAILED_WITH_TERMINAL_ERROR` |
 | LLM builders | 13 builders | 13 builders | Full parity |
diff --git a/WORKER_ARCHITECTURE_COMPARISON.md b/WORKER_ARCHITECTURE_COMPARISON.md
index dd0701b5..528921ee 100644
--- a/WORKER_ARCHITECTURE_COMPARISON.md
+++ b/WORKER_ARCHITECTURE_COMPARISON.md
@@ -545,37 +545,38 @@ handler = TaskHandler(
 
 ### JavaScript: Built-in MetricsCollector
 
-The JavaScript SDK provides a built-in `MetricsCollector` that implements `TaskRunnerEventsListener`:
+The JavaScript SDK provides `createMetricsCollector()` which returns a legacy or canonical collector based on `WORKER_CANONICAL_METRICS`. See [METRICS.md](METRICS.md) for the full catalog.
 
 ```typescript
-import { MetricsCollector, TaskHandler } from "@io-orkes/conductor-javascript/worker";
+import { createMetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript";
 
-const metrics = new MetricsCollector();
+const metrics = createMetricsCollector();
+const server = new MetricsServer(metrics, 9090);
+await server.start();
 
 const handler = new TaskHandler({
     client,
     eventListeners: [metrics],
 });
 
-handler.startWorkers();
-
-// Read metrics
-const snapshot = metrics.getMetrics();
-console.log("Poll total:", snapshot.pollTotal);
-console.log("Execution durations:", snapshot.executionDurationMs);
+await handler.startWorkers();
+// GET http://localhost:9090/metrics -> Prometheus text
+// GET http://localhost:9090/health  -> {"status":"UP"}
 ```
 
 ### Comparison
 
 | Feature | Python | JavaScript |
 |---------|--------|-----------|
-| Built-in metrics collector | Yes (`MetricsCollector`) | Yes (`MetricsCollector`) | Parity |
-| HTTP metrics endpoint | Yes (`/metrics`, `/health`) | No (use prom-client separately) | Python richer |
-| File-based metrics | Yes (`.prom` files) | No | Python richer |
+| Built-in metrics collector | Yes (`create_metrics_collector()`) | Yes (`createMetricsCollector()`) | Parity |
+| Legacy / canonical modes | Yes (`WORKER_CANONICAL_METRICS`) | Yes (`WORKER_CANONICAL_METRICS`) | Parity |
+| HTTP metrics endpoint | Yes (`/metrics`, `/health`) | Yes (`MetricsServer`) | Parity |
+| File-based metrics | Yes (`.prom` files) | Yes (`filePath` config) | Parity |
 | Multiprocess aggregation | Yes (SQLite) | N/A (single process) | N/A |
-| API request metrics | Yes (`http_api_client_request`) | No | Python richer |
+| API request metrics | Yes | Yes (`recordApiRequestTime()`) | Parity |
 | Event-based architecture | Yes (MetricsCollector is listener) | Yes (MetricsCollector is listener) | Parity |
 | Custom metrics via events | Yes | Yes | Parity |
+| Optional `prom-client` | Yes (native) | Yes (`usePromClient: true`) | Parity |
 
 ---
 

From 19773f8c3ecbb69ae6b862d6717934e3233570b4 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Thu, 7 May 2026 11:15:05 -0600
Subject: [PATCH 10/21] add or update a changelog

---
 CHANGELOG.md | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 CHANGELOG.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 00000000..4ecd25b2
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,26 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+
+- **Metrics harmonization** - canonical metric surface aligned with the cross-SDK catalog, opt-in via `WORKER_CANONICAL_METRICS=true`
+  - New `CanonicalMetricsCollector` and optional `CanonicalPrometheusRegistry` (prom-client adapter) emit the harmonized cross-SDK catalog: 12 counters (e.g. `task_poll_total`, `task_execution_started_total`, `task_paused_total`, `external_payload_used_total{entityName,operation,payloadType}`, `workflow_start_error_total{workflowType,exception}`), 4 time histograms (`task_poll_time_seconds`, `task_execute_time_seconds`, `task_update_time_seconds`, `http_api_client_request_seconds{method,uri,status}`) with buckets `0.001…10s`, 2 size histograms (`task_result_size_bytes`, `workflow_input_size_bytes{workflowType,version}`) with buckets `100…10_000_000` bytes, and `active_workers` gauge. Labels are camelCase; names are unprefixed.
+  - `createMetricsCollector()` factory selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive). `WORKER_LEGACY_METRICS` is also recognized; canonical wins when both are set.
+  - `HttpMetricsObserver` plus `fetchWithRetry` instrumentation records `http_api_client_request_seconds`; `WorkflowExecutor` records `workflow_input_size_bytes` and `workflow_start_error_total`.
+  - `Poller`, `TaskRunner`, and `EventDispatcher` emit a new `taskAckFailed` event and propagate output size, exception cause, and status.
+  - `fetchWithRetry` now retries on HTTP 502/503/504.
+  - Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.ts` logs which collector is active.
+
+### Changed
+
+- **Metrics harmonization** - defaults preserved; legacy metrics emit unchanged when `WORKER_CANONICAL_METRICS` is unset
+  - `src/sdk/worker/metrics/MetricsCollector.ts` was renamed to `LegacyMetricsCollector.ts`. The public symbol is preserved via `export { LegacyMetricsCollector as MetricsCollector }` in `src/sdk/worker/metrics/index.ts`, so existing imports keep working.
+  - Default behavior is unchanged: with no env var set, the metric names, labels, and `conductor_worker_*` prefix from `v3.0.3` are preserved byte-for-byte.
+  - Rewrote `METRICS.md` with both surfaces, the env-var gate, side-by-side migration table with PromQL replacements, and troubleshooting.
+  - Updated `README.md`, `AGENTS.md`, `SDK_DEVELOPMENT.md`, `SDK_COMPARISON.md`, and `WORKER_ARCHITECTURE_COMPARISON.md` to reference `createMetricsCollector()` and the env var.

From 6cd069f3a0fef6354eed66efc135e100c6a658bc Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Fri, 15 May 2026 11:18:28 -0600
Subject: [PATCH 11/21] adjustments to internal mechanics

---
 CHANGELOG.md                                  |  4 +-
 METRICS.md                                    | 24 +++++++++
 jest.config.mjs                               |  1 -
 src/sdk/clients/worker/TaskRunner.ts          |  5 +-
 .../events/__tests__/EventDispatcher.test.ts  |  1 +
 src/sdk/clients/worker/events/types.ts        |  2 +
 .../helpers/__tests__/fetchWithRetry.test.ts  | 38 +++++++++++++
 .../helpers/fetchWithRetry.ts                 | 53 +++++++++----------
 src/sdk/worker/core/TaskHandler.ts            | 48 -----------------
 .../metrics/CanonicalMetricsCollector.ts      | 11 +++-
 .../worker/metrics/LegacyMetricsCollector.ts  |  2 +-
 .../CanonicalMetricsCollector.test.ts         |  4 +-
 .../__tests__/MetricsCollector.test.ts        |  4 +-
 src/sdk/worker/metrics/metricsFactory.ts      | 10 ++--
 14 files changed, 118 insertions(+), 89 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ecd25b2..1dacb8b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -13,8 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - New `CanonicalMetricsCollector` and optional `CanonicalPrometheusRegistry` (prom-client adapter) emit the harmonized cross-SDK catalog: 12 counters (e.g. `task_poll_total`, `task_execution_started_total`, `task_paused_total`, `external_payload_used_total{entityName,operation,payloadType}`, `workflow_start_error_total{workflowType,exception}`), 4 time histograms (`task_poll_time_seconds`, `task_execute_time_seconds`, `task_update_time_seconds`, `http_api_client_request_seconds{method,uri,status}`) with buckets `0.001…10s`, 2 size histograms (`task_result_size_bytes`, `workflow_input_size_bytes{workflowType,version}`) with buckets `100…10_000_000` bytes, and `active_workers` gauge. Labels are camelCase; names are unprefixed.
   - `createMetricsCollector()` factory selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive). `WORKER_LEGACY_METRICS` is also recognized; canonical wins when both are set.
   - `HttpMetricsObserver` plus `fetchWithRetry` instrumentation records `http_api_client_request_seconds`; `WorkflowExecutor` records `workflow_input_size_bytes` and `workflow_start_error_total`.
-  - `Poller`, `TaskRunner`, and `EventDispatcher` emit a new `taskAckFailed` event and propagate output size, exception cause, and status.
-  - `fetchWithRetry` now retries on HTTP 502/503/504.
+  - `Poller`, `TaskRunner`, and `EventDispatcher` emit a new `taskPaused` event when a poll cycle is skipped because the worker is paused.
+  - `fetchWithRetry` now retries HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE).
   - Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.ts` logs which collector is active.
 
 ### Changed
diff --git a/METRICS.md b/METRICS.md
index 43c22252..7d43e0c2 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -354,6 +354,30 @@ sum(rate(task_execute_time_seconds_count[5m])) by (taskType)
 - Avoid embedding user identifiers or unbounded values in task type, workflow
   type, or external payload labels.
 
+### Recording Uncaught Exceptions
+
+The `thread_uncaught_exceptions_total` metric is not wired automatically. In
+Node.js, registering a `process.on("uncaughtException")` handler overrides the
+default crash behavior, which can leave the process running in a corrupted
+state. Instead, wire it yourself so you control the exit policy:
+
+```typescript
+const metrics = createMetricsCollector();
+
+process.on("uncaughtException", (err) => {
+  metrics.recordUncaughtException(err.name || "Error");
+  console.error(err);
+  process.exit(1);
+});
+
+process.on("unhandledRejection", (reason) => {
+  const name = reason instanceof Error ? reason.name || "Error" : "Error";
+  metrics.recordUncaughtException(name);
+  console.error(reason);
+  process.exit(1);
+});
+```
+
 ### prom-client Issues
 
 - `MetricsCollector` uses `await import("./MetricsServer.js")` internally. The
diff --git a/jest.config.mjs b/jest.config.mjs
index f8407f50..d8703a14 100644
--- a/jest.config.mjs
+++ b/jest.config.mjs
@@ -21,7 +21,6 @@ export default {
     "^@/(.*)$": "<rootDir>/src/$1",
     "^@open-api/(.*)$": "<rootDir>/src/open-api/$1",
     "^@test-utils/(.*)$": "<rootDir>/src/integration-tests/utils/$1",
-    "^(.*)\\.js$": "$1",
   },
   transform: {
     "^.+\\.tsx?$": [
diff --git a/src/sdk/clients/worker/TaskRunner.ts b/src/sdk/clients/worker/TaskRunner.ts
index cb637e48..5be29c83 100644
--- a/src/sdk/clients/worker/TaskRunner.ts
+++ b/src/sdk/clients/worker/TaskRunner.ts
@@ -227,15 +227,16 @@ export class TaskRunner {
     const { workerID } = this.options;
     let retryCount = 0;
     let lastError: Error | null = null;
+    let lastAttemptDurationMs = 0;
 
     while (retryCount < this.maxRetries) {
+      const updateStart = Date.now();
       try {
         if (process.env.CI) {
           console.log(
             `[TaskRunner] Submitting task result taskId=${taskResult.taskId} workflowId=${taskResult.workflowInstanceId} taskType=${this.worker.taskDefName} attempt=${retryCount + 1}/${this.maxRetries}`
           );
         }
-        const updateStart = Date.now();
 
         if (TaskRunner.updateV2Available === false) {
           // Already detected a legacy server — skip the probe, call legacy directly.
@@ -334,6 +335,7 @@ export class TaskRunner {
         });
         return nextTask ?? undefined;
       } catch (error: unknown) {
+        lastAttemptDurationMs = Date.now() - updateStart;
         lastError = error as Error;
         this.errorHandler(lastError, task);
         this.logger.error(
@@ -364,6 +366,7 @@ export class TaskRunner {
       workerId: workerID,
       workflowInstanceId: taskResult.workflowInstanceId,
       cause: lastError ?? new Error("Task update failed after all retries"),
+      durationMs: lastAttemptDurationMs,
       retryCount,
       taskResult,
       timestamp: new Date(),
diff --git a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts
index b99310bf..c8b31634 100644
--- a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts
+++ b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts
@@ -227,6 +227,7 @@ describe("EventDispatcher", () => {
       workerId: "worker-1",
       workflowInstanceId: "workflow-1",
       cause: new Error("Update failed"),
+      durationMs: 500,
       retryCount: 4,
       taskResult: { status: "COMPLETED" },
       timestamp: new Date(),
diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts
index 9c94af69..4bd5306c 100644
--- a/src/sdk/clients/worker/events/types.ts
+++ b/src/sdk/clients/worker/events/types.ts
@@ -114,6 +114,8 @@ export interface TaskUpdateFailure extends TaskRunnerEvent {
   workflowInstanceId?: string;
   /** The error that caused the final update failure */
   cause: Error;
+  /** Time taken for the last update attempt in milliseconds */
+  durationMs: number;
   /** Number of retry attempts made */
   retryCount: number;
   /** The TaskResult object that failed to update (for recovery/logging) */
diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
index 883c65b5..84277b67 100644
--- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
+++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
@@ -239,6 +239,44 @@ describe("fetchWithRetry", () => {
       expect(mockFetch).toHaveBeenCalledTimes(3);
     });
 
+    it("should NOT retry 502 for POST requests (non-idempotent)", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway"));
+
+      const result = await retryFetch("http://test.com", { method: "POST" }, mockFetch, {
+        maxTransportRetries: 3,
+        initialRetryDelay: 1,
+      });
+
+      expect(result.status).toBe(502);
+      expect(mockFetch).toHaveBeenCalledTimes(1);
+    });
+
+    it("should NOT retry 503 for PATCH requests (non-idempotent)", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(503, "Service Unavailable"));
+
+      const result = await retryFetch("http://test.com", { method: "PATCH" }, mockFetch, {
+        maxTransportRetries: 3,
+        initialRetryDelay: 1,
+      });
+
+      expect(result.status).toBe(503);
+      expect(mockFetch).toHaveBeenCalledTimes(1);
+    });
+
+    it("should retry 502 for PUT requests (idempotent)", async () => {
+      mockFetch
+        .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway"))
+        .mockResolvedValueOnce(createMockResponse(200, "ok"));
+
+      const result = await retryFetch("http://test.com", { method: "PUT" }, mockFetch, {
+        maxTransportRetries: 3,
+        initialRetryDelay: 1,
+      });
+
+      expect(result.status).toBe(200);
+      expect(mockFetch).toHaveBeenCalledTimes(2);
+    });
+
     it("should handle transport error then 502 then success", async () => {
       mockFetch
         .mockRejectedValueOnce(new Error("ECONNRESET"))
diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
index 679a7439..693289df 100644
--- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
+++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
@@ -1,3 +1,5 @@
+import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver";
+
 type Input = Parameters<typeof fetch>[0];
 type Init = Parameters<typeof fetch>[1];
 
@@ -95,6 +97,8 @@ const withJitter = (delayMs: number): number => {
   return Math.max(0, Math.round(delayMs + jitter));
 };
 
+const IDEMPOTENT_METHODS = new Set(["GET", "HEAD", "OPTIONS", "PUT", "DELETE"]);
+
 export const retryFetch = async (
   input: Input,
   init: Init,
@@ -157,10 +161,15 @@ export const retryFetch = async (
       return rateLimitResponse;
     }
 
-    // Gateway error retry (502, 503, 504) -- transient proxy/server errors
+    // Gateway error retry (502, 503, 504) -- only for idempotent methods to
+    // avoid duplicate side effects when the upstream may have processed the request.
     if (response.status >= 502 && response.status <= 504) {
-      lastError = new Error(`Server error: HTTP ${response.status}`);
-      if (transportAttempt < maxTransportRetries) {
+      const reqMethod = (input instanceof Request
+        ? input.method
+        : init?.method ?? "GET"
+      ).toUpperCase();
+      if (IDEMPOTENT_METHODS.has(reqMethod) && transportAttempt < maxTransportRetries) {
+        lastError = new Error(`Server error: HTTP ${response.status}`);
         await new Promise((resolve) =>
           setTimeout(resolve, withJitter(initialRetryDelay * (transportAttempt + 1)))
         );
@@ -223,35 +232,21 @@ export const wrapFetchWithRetry = (
     try {
       const response = await retryFetch(input, init, fetchFn, options);
       const durationMs = performance.now() - start;
-      try {
-        const { getHttpMetricsObserver } = await import(
-          "../../worker/metrics/httpObserver.js"
-        );
-        getHttpMetricsObserver()?.recordApiRequestTime(
-          method,
-          uri,
-          String(response.status),
-          durationMs,
-        );
-      } catch {
-        // Metrics recording is best-effort
-      }
+      getHttpMetricsObserver()?.recordApiRequestTime(
+        method,
+        uri,
+        String(response.status),
+        durationMs,
+      );
       return response;
     } catch (error) {
       const durationMs = performance.now() - start;
-      try {
-        const { getHttpMetricsObserver } = await import(
-          "../../worker/metrics/httpObserver.js"
-        );
-        getHttpMetricsObserver()?.recordApiRequestTime(
-          method,
-          uri,
-          "0",
-          durationMs,
-        );
-      } catch {
-        // Metrics recording is best-effort
-      }
+      getHttpMetricsObserver()?.recordApiRequestTime(
+        method,
+        uri,
+        "0",
+        durationMs,
+      );
       throw error;
     }
   };
diff --git a/src/sdk/worker/core/TaskHandler.ts b/src/sdk/worker/core/TaskHandler.ts
index b39a9b94..5bfcda5f 100644
--- a/src/sdk/worker/core/TaskHandler.ts
+++ b/src/sdk/worker/core/TaskHandler.ts
@@ -7,7 +7,6 @@ import {
   RESTART_BACKOFF_MAX_MS,
 } from "../../clients/worker/constants";
 import type { TaskRunnerEventsListener } from "../../clients/worker/events";
-import type { MetricsCollectorInterface } from "../metrics/MetricsCollectorInterface";
 import { TaskRunner } from "../../clients/worker/TaskRunner";
 import type { ConductorWorker, HealthMonitorConfig } from "../../clients/worker/types";
 import type { ConductorLogger } from "../../helpers/logger";
@@ -141,9 +140,6 @@ export class TaskHandler {
   private restartAttempts = new Map<number, number>(); // runner index → attempt count
   private healthMonitorConfig: HealthMonitorConfig;
 
-  // Uncaught exception handlers (bound so we can remove them)
-  private _uncaughtHandler?: (err: Error) => void;
-  private _unhandledHandler?: (reason: unknown) => void;
 
   /**
    * Create a TaskHandler instance with async module imports.
@@ -328,7 +324,6 @@ export class TaskHandler {
 
     this.isRunning = true;
     this.startHealthMonitor();
-    this.wireUncaughtExceptionHandlers();
     this.logger.info("All workers started successfully");
   }
 
@@ -347,7 +342,6 @@ export class TaskHandler {
     }
 
     this.stopHealthMonitor();
-    this.unwireUncaughtExceptionHandlers();
     this.logger.info(`Stopping ${this.taskRunners.length} worker(s)...`);
 
     const stopPromises = this.taskRunners.map(async (runner, index) => {
@@ -535,48 +529,6 @@ export class TaskHandler {
     }
   }
 
-  // ── Uncaught Exception Handling ─────────────────────────────────
-
-  private findMetricsCollector(): MetricsCollectorInterface | undefined {
-    const listeners = this.config.eventListeners ?? [];
-    return listeners.find(
-      (l): l is MetricsCollectorInterface =>
-        typeof (l as MetricsCollectorInterface).recordUncaughtException ===
-        "function",
-    );
-  }
-
-  private wireUncaughtExceptionHandlers(): void {
-    const collector = this.findMetricsCollector();
-    if (!collector) return;
-
-    this._uncaughtHandler = (err: Error) => {
-      const excName = err.name || err.constructor?.name || "Error";
-      collector.recordUncaughtException(excName);
-    };
-    this._unhandledHandler = (reason: unknown) => {
-      const excName =
-        reason instanceof Error
-          ? reason.name || reason.constructor?.name || "Error"
-          : "Error";
-      collector.recordUncaughtException(excName);
-    };
-
-    process.on("uncaughtException", this._uncaughtHandler);
-    process.on("unhandledRejection", this._unhandledHandler);
-  }
-
-  private unwireUncaughtExceptionHandlers(): void {
-    if (this._uncaughtHandler) {
-      process.removeListener("uncaughtException", this._uncaughtHandler);
-      this._uncaughtHandler = undefined;
-    }
-    if (this._unhandledHandler) {
-      process.removeListener("unhandledRejection", this._unhandledHandler);
-      this._unhandledHandler = undefined;
-    }
-  }
-
   // ── Discovery Summary ───────────────────────────────────────────
 
   /**
diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
index f0fa5314..6757bf08 100644
--- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
+++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
@@ -81,7 +81,6 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface {
         config.fileWriteIntervalMs ?? 5000,
       );
     }
-    setHttpMetricsObserver(this);
   }
 
   private async initPromClient(): Promise<void> {
@@ -279,6 +278,16 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface {
       taskType: event.taskType,
       exception: excName,
     });
+
+    const seconds = event.durationMs / 1000;
+    this.state.taskUpdateTimeSeconds.observe(
+      { taskType: event.taskType, status: "FAILURE" },
+      seconds,
+    );
+    this._promRegistry?.observeHistogram("task_update_time_seconds", {
+      taskType: event.taskType,
+      status: "FAILURE",
+    }, seconds);
   }
 
   onTaskPaused(event: TaskPaused): void {
diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
index 381c14b5..8932a91b 100644
--- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts
+++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
@@ -120,7 +120,6 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface {
         config.fileWriteIntervalMs ?? 5000,
       );
     }
-    setHttpMetricsObserver(this);
   }
 
   private async initPromClient(): Promise<void> {
@@ -254,6 +253,7 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface {
 
   onTaskUpdateFailure(event: TaskUpdateFailure): void {
     this.incrementCounter(this.metrics.taskUpdateFailureTotal, event.taskType, "update_error_total", "task_type");
+    this.observeSummary(this.metrics.updateDurationMs, event.taskType, event.durationMs, "update_time", "task_type");
   }
 
   // Canonical-only event — noop in legacy
diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
index 369eeee3..37c9300f 100644
--- a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
+++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
@@ -160,12 +160,13 @@ describe("CanonicalMetricsCollector", () => {
       expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.025');
     });
 
-    it("should emit task_update_error_total with exception label on failure", () => {
+    it("should emit task_update_error_total and task_update_time_seconds on failure", () => {
       collector.onTaskUpdateFailure({
         taskType: "task_a",
         taskId: "t1",
         workerId: "w1",
         cause: new Error("server error"),
+        durationMs: 150,
         retryCount: 4,
         taskResult: {},
         timestamp: new Date(),
@@ -173,6 +174,7 @@ describe("CanonicalMetricsCollector", () => {
 
       const text = collector.toPrometheusText();
       expect(text).toContain('task_update_error_total{taskType="task_a",exception="Error"} 1');
+      expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="FAILURE"} 0.15');
     });
   });
 
diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts
index 3e8d8ebb..33787620 100644
--- a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts
+++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts
@@ -133,13 +133,14 @@ describe("LegacyMetricsCollector", () => {
       expect(metrics.updateDurationMs.get("task_a")).toEqual([25, 30]);
     });
 
-    it("should count update failures via onTaskUpdateFailure", () => {
+    it("should count update failures and record duration via onTaskUpdateFailure", () => {
       collector.onTaskUpdateFailure({
         taskType: "task_a",
         taskId: "t1",
         workerId: "w1",
         workflowInstanceId: "wf1",
         cause: new Error("server error"),
+        durationMs: 200,
         retryCount: 4,
         taskResult: {},
         timestamp: new Date(),
@@ -147,6 +148,7 @@ describe("LegacyMetricsCollector", () => {
 
       const metrics = collector.getMetrics();
       expect(metrics.taskUpdateFailureTotal.get("task_a")).toBe(1);
+      expect(metrics.updateDurationMs.get("task_a")).toEqual([200]);
     });
   });
 
diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts
index bcb45588..dabe4810 100644
--- a/src/sdk/worker/metrics/metricsFactory.ts
+++ b/src/sdk/worker/metrics/metricsFactory.ts
@@ -2,6 +2,7 @@ import type { MetricsCollectorConfig } from "./LegacyMetricsCollector";
 import type { MetricsCollectorInterface } from "./MetricsCollectorInterface";
 import { LegacyMetricsCollector } from "./LegacyMetricsCollector";
 import { CanonicalMetricsCollector } from "./CanonicalMetricsCollector";
+import { setHttpMetricsObserver } from "./httpObserver";
 
 /**
  * Create the appropriate MetricsCollector based on environment variables.
@@ -19,9 +20,10 @@ export function createMetricsCollector(
     (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(),
   );
 
-  if (useCanonical) {
-    return new CanonicalMetricsCollector(config);
-  }
+  const collector = useCanonical
+    ? new CanonicalMetricsCollector(config)
+    : new LegacyMetricsCollector(config);
 
-  return new LegacyMetricsCollector(config);
+  setHttpMetricsObserver(collector);
+  return collector;
 }

From db1fd4c3cf35e6f773a223c42a3725041a4ba0b3 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Fri, 15 May 2026 12:05:24 -0600
Subject: [PATCH 12/21] harness improvements for generating traffic with uris
 in metrics

---
 harness/main.ts             | 13 +++++++++++++
 harness/workflowGovernor.ts | 10 +++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/harness/main.ts b/harness/main.ts
index 081188d7..182b8d50 100644
--- a/harness/main.ts
+++ b/harness/main.ts
@@ -9,6 +9,7 @@ import { MetadataResource } from "../src/open-api/generated";
 import type { ConductorWorker } from "../src/sdk/clients/worker/types";
 import { SimulatedTaskWorker } from "./simulatedTaskWorker";
 import { WorkflowGovernor } from "./workflowGovernor";
+import { WorkflowStatusProbe } from "./workflowStatusProbe";
 
 const WORKFLOW_NAME = "js_simulated_tasks_workflow";
 
@@ -104,15 +105,27 @@ async function main(): Promise<void> {
   });
   await handler.startWorkers();
 
+  const probeRate = envIntOrDefault("HARNESS_PROBE_RATE_PER_SEC", 0);
+  const probe =
+    probeRate > 0 ? new WorkflowStatusProbe(client, probeRate) : undefined;
+
   const governor = new WorkflowGovernor(
     workflowClient,
     WORKFLOW_NAME,
     workflowsPerSec,
+    probe ? probe.offer.bind(probe) : undefined,
   );
   governor.start();
 
+  if (probe) {
+    probe.start();
+  }
+
   const shutdown = async () => {
     console.log("Shutting down...");
+    if (probe) {
+      probe.stop();
+    }
     governor.stop();
     await handler.stopWorkers();
     process.exit(0);
diff --git a/harness/workflowGovernor.ts b/harness/workflowGovernor.ts
index 9865f126..4ccd0d33 100644
--- a/harness/workflowGovernor.ts
+++ b/harness/workflowGovernor.ts
@@ -4,16 +4,19 @@ export class WorkflowGovernor {
   private readonly workflowExecutor: WorkflowExecutor;
   private readonly workflowName: string;
   private readonly workflowsPerSecond: number;
+  private readonly idSink?: (id: string) => void;
   private timer: ReturnType<typeof setInterval> | undefined;
 
   constructor(
     workflowExecutor: WorkflowExecutor,
     workflowName: string,
     workflowsPerSecond: number,
+    idSink?: (id: string) => void,
   ) {
     this.workflowExecutor = workflowExecutor;
     this.workflowName = workflowName;
     this.workflowsPerSecond = workflowsPerSecond;
+    this.idSink = idSink;
   }
 
   start(): void {
@@ -51,10 +54,15 @@ export class WorkflowGovernor {
     }
 
     Promise.all(promises)
-      .then(() => {
+      .then((ids) => {
         console.log(
           `Governor: started ${this.workflowsPerSecond} workflow(s)`,
         );
+        if (this.idSink) {
+          for (const id of ids) {
+            this.idSink(id);
+          }
+        }
       })
       .catch((err: unknown) => {
         console.error(

From cb9717193ac90003f6f7e8089b411030fe398e52 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Fri, 15 May 2026 12:05:55 -0600
Subject: [PATCH 13/21] harness improvements for generating traffic with uris
 in metrics .. cont'd (missing file oops)

---
 harness/workflowStatusProbe.ts | 97 ++++++++++++++++++++++++++++++++++
 1 file changed, 97 insertions(+)
 create mode 100644 harness/workflowStatusProbe.ts

diff --git a/harness/workflowStatusProbe.ts b/harness/workflowStatusProbe.ts
new file mode 100644
index 00000000..741f4438
--- /dev/null
+++ b/harness/workflowStatusProbe.ts
@@ -0,0 +1,97 @@
+import type { Client } from "../src/open-api/generated/client";
+import { WorkflowResource } from "../src/open-api/generated";
+
+const MAX_TRACKED_IDS = 256;
+
+/**
+ * Exercises UUID-bearing workflow lookup endpoints so
+ * http_api_client_request_seconds picks up entries with
+ * uri=/workflow/{workflowId} and uri=/workflow/{workflowId}/tasks.
+ *
+ * Default harness traffic only hits bounded, no-path-param URLs (poll/update),
+ * making the high-cardinality concern on the uri label invisible without this
+ * probe.
+ *
+ * Default off. Runs only when HARNESS_PROBE_RATE_PER_SEC > 0.
+ * Side-effect-free: only issues read calls (getExecutionStatus,
+ * getExecutionStatusTaskList).
+ * Self-bounded: fixed-size FIFO of workflow IDs.
+ */
+export class WorkflowStatusProbe {
+  private readonly client: Client;
+  private readonly callsPerSecond: number;
+  private readonly recentIDs: string[] = [];
+  private timer: ReturnType<typeof setInterval> | undefined;
+
+  constructor(client: Client, callsPerSecond: number) {
+    this.client = client;
+    this.callsPerSecond = callsPerSecond;
+  }
+
+  offer(workflowId: string): void {
+    if (!workflowId) return;
+    this.recentIDs.push(workflowId);
+    if (this.recentIDs.length > MAX_TRACKED_IDS) {
+      this.recentIDs.splice(0, this.recentIDs.length - MAX_TRACKED_IDS);
+    }
+  }
+
+  start(): void {
+    if (this.callsPerSecond <= 0) {
+      console.log(
+        "WorkflowStatusProbe disabled (HARNESS_PROBE_RATE_PER_SEC<=0)",
+      );
+      return;
+    }
+    console.log(
+      `WorkflowStatusProbe started: rate=${this.callsPerSecond}/sec, retainedIds<=${MAX_TRACKED_IDS}`,
+    );
+
+    this.timer = setInterval(() => {
+      this.tick();
+    }, 1000);
+
+    if (this.timer && typeof this.timer === "object" && "unref" in this.timer) {
+      this.timer.unref();
+    }
+  }
+
+  stop(): void {
+    if (this.timer) {
+      clearInterval(this.timer);
+      this.timer = undefined;
+    }
+    console.log("WorkflowStatusProbe stopped");
+  }
+
+  private tick(): void {
+    const budget = Math.min(this.callsPerSecond, this.recentIDs.length);
+    if (budget === 0) return;
+
+    const ids: string[] = [];
+    for (let i = 0; i < budget; i++) {
+      ids.push(
+        this.recentIDs[Math.floor(Math.random() * this.recentIDs.length)],
+      );
+    }
+
+    for (const id of ids) {
+      const call =
+        Math.random() < 0.5
+          ? WorkflowResource.getExecutionStatus({
+              client: this.client,
+              path: { workflowId: id },
+            })
+          : WorkflowResource.getExecutionStatusTaskList({
+              client: this.client,
+              path: { workflowId: id },
+            });
+
+      call.catch((err: unknown) => {
+        console.error(
+          `probe: ${id}: ${err instanceof Error ? err.message : String(err)}`,
+        );
+      });
+    }
+  }
+}

From 24b162c3a87997f0de5047fe925fe534752acc1b Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Mon, 18 May 2026 11:37:22 -0600
Subject: [PATCH 14/21] revisions to internal mechanics for http request
 metrics

---
 .../createConductorClient.ts                  |   5 +
 .../helpers/__tests__/fetchWithRetry.test.ts  |  51 ++----
 .../__tests__/metricsInterceptors.test.ts     | 147 ++++++++++++++++++
 .../helpers/fetchWithRetry.ts                 |  44 +++---
 .../helpers/metricsInterceptors.ts            |  67 ++++++++
 .../metrics/CanonicalMetricsCollector.ts      |   6 +-
 .../worker/metrics/LegacyMetricsCollector.ts  |   1 +
 .../metrics/MetricsCollectorInterface.ts      |   1 +
 .../CanonicalMetricsCollector.test.ts         |  13 ++
 .../MetricsCollector.prometheus.test.ts       |   7 +
 src/sdk/worker/metrics/httpObserver.ts        |   1 +
 11 files changed, 280 insertions(+), 63 deletions(-)
 create mode 100644 src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts
 create mode 100644 src/sdk/createConductorClient/helpers/metricsInterceptors.ts

diff --git a/src/sdk/createConductorClient/createConductorClient.ts b/src/sdk/createConductorClient/createConductorClient.ts
index 51e460c7..c1d8970c 100644
--- a/src/sdk/createConductorClient/createConductorClient.ts
+++ b/src/sdk/createConductorClient/createConductorClient.ts
@@ -7,6 +7,7 @@ import {
 import type { OrkesApiConfig } from "../types";
 import { createClient } from "../../open-api/generated/client";
 import { addResourcesBackwardCompatibility } from "./helpers/addResourcesBackwardCompatibility";
+import { createMetricsInterceptors } from "./helpers/metricsInterceptors";
 
 /**
  * Creates a Conductor client with authentication and configuration
@@ -56,6 +57,10 @@ export const createConductorClient = async (
     throwOnError: true,
   });
 
+  const { onRequest, onResponse } = createMetricsInterceptors();
+  openApiClient.interceptors.request.use(onRequest);
+  openApiClient.interceptors.response.use(onResponse);
+
   let authResult: Awaited<ReturnType<typeof handleAuth>> | undefined;
   if (keyId && keySecret) {
     authResult = await handleAuth(
diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
index 84277b67..111f0120 100644
--- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
+++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
@@ -640,6 +640,10 @@ describe("fetchWithRetry", () => {
   });
 
   // ─── wrapFetchWithRetry metrics recording ───────────────────────────
+  //
+  // After the interceptor migration, wrapFetchWithRetry only records
+  // metrics on network errors (status "0") — the response interceptor
+  // handles all successful/HTTP-error cases.
 
   describe("wrapFetchWithRetry metrics", () => {
     const mockRecordApiRequestTime = jest.fn<
@@ -659,19 +663,13 @@ describe("fetchWithRetry", () => {
       httpObserver.setHttpMetricsObserver(undefined);
     });
 
-    it("should record API request time on successful response", async () => {
+    it("should NOT record metrics on successful response (interceptors handle it)", async () => {
       mockFetch.mockResolvedValue(createMockResponse(200));
 
       const wrappedFetch = wrapFetchWithRetry(mockFetch);
       await wrappedFetch("http://test.com/api/tasks", { method: "POST" });
 
-      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
-      const [method, uri, status, duration] =
-        mockRecordApiRequestTime.mock.calls[0];
-      expect(method).toBe("POST");
-      expect(uri).toBe("/api/tasks");
-      expect(status).toBe("200");
-      expect(duration).toBeGreaterThanOrEqual(0);
+      expect(mockRecordApiRequestTime).not.toHaveBeenCalled();
     });
 
     it("should record status '0' on network failure", async () => {
@@ -693,31 +691,23 @@ describe("fetchWithRetry", () => {
       expect(status).toBe("0");
     });
 
-    it("should extract method and URI from string URL", async () => {
-      mockFetch.mockResolvedValue(createMockResponse(201));
-
-      const wrappedFetch = wrapFetchWithRetry(mockFetch);
-      await wrappedFetch("http://example.com/tasks/123", { method: "PUT" });
-
-      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
-      const [method, uri] = mockRecordApiRequestTime.mock.calls[0];
-      expect(method).toBe("PUT");
-      expect(uri).toBe("/tasks/123");
-    });
-
-    it("should extract method and URI from Request object", async () => {
-      mockFetch.mockResolvedValue(createMockResponse(200));
+    it("should extract method and URI from Request object on network failure", async () => {
+      mockFetch.mockRejectedValue(new Error("ECONNRESET"));
 
-      const wrappedFetch = wrapFetchWithRetry(mockFetch);
+      const wrappedFetch = wrapFetchWithRetry(mockFetch, {
+        maxTransportRetries: 0,
+      });
       const request = new Request("http://example.com/api/metadata", {
         method: "DELETE",
       });
-      await wrappedFetch(request);
+
+      await expect(wrappedFetch(request)).rejects.toThrow("ECONNRESET");
 
       expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
-      const [method, uri] = mockRecordApiRequestTime.mock.calls[0];
+      const [method, uri, status] = mockRecordApiRequestTime.mock.calls[0];
       expect(method).toBe("DELETE");
       expect(uri).toBe("/api/metadata");
+      expect(status).toBe("0");
     });
 
     it("should not break fetch when no observer is registered", async () => {
@@ -729,16 +719,5 @@ describe("fetchWithRetry", () => {
 
       expect(result.status).toBe(200);
     });
-
-    it("should default method to GET when init has no method", async () => {
-      mockFetch.mockResolvedValue(createMockResponse(200));
-
-      const wrappedFetch = wrapFetchWithRetry(mockFetch);
-      await wrappedFetch("http://test.com/path");
-
-      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
-      const [method] = mockRecordApiRequestTime.mock.calls[0];
-      expect(method).toBe("GET");
-    });
   });
 });
diff --git a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts
new file mode 100644
index 00000000..77800cb3
--- /dev/null
+++ b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts
@@ -0,0 +1,147 @@
+import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals";
+import { createMetricsInterceptors } from "../metricsInterceptors";
+import * as httpObserver from "@/sdk/worker/metrics/httpObserver";
+import type { ResolvedRequestOptions } from "@/open-api/generated/client/types.gen";
+
+const mockRecordApiRequestTime = jest.fn<
+  (m: string, u: string, s: string | number, d: number, mu?: string) => void
+>();
+
+const fakeOpts = (url: string): ResolvedRequestOptions =>
+  ({ url } as unknown as ResolvedRequestOptions);
+
+describe("metricsInterceptors", () => {
+  beforeEach(() => {
+    mockRecordApiRequestTime.mockClear();
+    httpObserver.setHttpMetricsObserver({
+      recordApiRequestTime: mockRecordApiRequestTime,
+      recordWorkflowInputSize: jest.fn(),
+      recordWorkflowStartError: jest.fn(),
+    });
+  });
+
+  afterEach(() => {
+    httpObserver.setHttpMetricsObserver(undefined);
+  });
+
+  it("should record method, interpolated uri, status, duration, and template metricUri", () => {
+    const { onRequest, onResponse } = createMetricsInterceptors();
+
+    const request = new Request("http://conductor.example.com/api/workflow/abc-123", {
+      method: "GET",
+    });
+    const opts = fakeOpts("/api/workflow/{workflowId}");
+
+    onRequest(request, opts);
+    const response = new Response(null, { status: 200 });
+    onResponse(response, request, opts);
+
+    expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+    const [method, uri, status, duration, metricUri] =
+      mockRecordApiRequestTime.mock.calls[0];
+    expect(method).toBe("GET");
+    expect(uri).toBe("/api/workflow/abc-123");
+    expect(status).toBe("200");
+    expect(duration).toBeGreaterThanOrEqual(0);
+    expect(metricUri).toBe("/workflow/{workflowId}");
+  });
+
+  it("should strip /api prefix from path template", () => {
+    const { onRequest, onResponse } = createMetricsInterceptors();
+
+    const request = new Request("http://host/api/tasks/poll/myTask", {
+      method: "POST",
+    });
+    const opts = fakeOpts("/api/tasks/poll/{taskType}");
+
+    onRequest(request, opts);
+    onResponse(new Response(null, { status: 200 }), request, opts);
+
+    const metricUri = mockRecordApiRequestTime.mock.calls[0][4];
+    expect(metricUri).toBe("/tasks/poll/{taskType}");
+  });
+
+  it("should not strip prefix if path does not start with /api/", () => {
+    const { onRequest, onResponse } = createMetricsInterceptors();
+
+    const request = new Request("http://host/health", { method: "GET" });
+    const opts = fakeOpts("/health");
+
+    onRequest(request, opts);
+    onResponse(new Response(null, { status: 200 }), request, opts);
+
+    const metricUri = mockRecordApiRequestTime.mock.calls[0][4];
+    expect(metricUri).toBe("/health");
+  });
+
+  it("should compute duration from request interceptor start time", () => {
+    const { onRequest, onResponse } = createMetricsInterceptors();
+
+    const request = new Request("http://host/api/workflow", { method: "GET" });
+    const opts = fakeOpts("/api/workflow");
+
+    onRequest(request, opts);
+    // Simulate a small delay so duration > 0
+    const busyWaitUntil = performance.now() + 2;
+    while (performance.now() < busyWaitUntil) { /* spin */ }
+    onResponse(new Response(null, { status: 200 }), request, opts);
+
+    const duration = mockRecordApiRequestTime.mock.calls[0][3] as number;
+    expect(duration).toBeGreaterThan(0);
+  });
+
+  it("should gracefully handle missing start time (duration defaults to 0)", () => {
+    const { onResponse } = createMetricsInterceptors();
+
+    const request = new Request("http://host/api/workflow", { method: "GET" });
+    const opts = fakeOpts("/api/workflow");
+
+    onResponse(new Response(null, { status: 200 }), request, opts);
+
+    const duration = mockRecordApiRequestTime.mock.calls[0][3] as number;
+    expect(duration).toBe(0);
+  });
+
+  it("should do nothing when no observer is registered", () => {
+    httpObserver.setHttpMetricsObserver(undefined);
+
+    const { onRequest, onResponse } = createMetricsInterceptors();
+
+    const request = new Request("http://host/api/workflow", { method: "GET" });
+    const opts = fakeOpts("/api/workflow");
+
+    onRequest(request, opts);
+    const response = new Response(null, { status: 200 });
+    const result = onResponse(response, request, opts);
+
+    expect(result).toBe(response);
+    expect(mockRecordApiRequestTime).not.toHaveBeenCalled();
+  });
+
+  it("should return request and response unchanged", () => {
+    const { onRequest, onResponse } = createMetricsInterceptors();
+
+    const request = new Request("http://host/api/workflow", { method: "GET" });
+    const opts = fakeOpts("/api/workflow");
+
+    const returnedRequest = onRequest(request, opts);
+    expect(returnedRequest).toBe(request);
+
+    const response = new Response("body", { status: 201 });
+    const returnedResponse = onResponse(response, request, opts);
+    expect(returnedResponse).toBe(response);
+  });
+
+  it("should record correct status for error responses", () => {
+    const { onRequest, onResponse } = createMetricsInterceptors();
+
+    const request = new Request("http://host/api/workflow", { method: "POST" });
+    const opts = fakeOpts("/api/workflow");
+
+    onRequest(request, opts);
+    onResponse(new Response(null, { status: 500 }), request, opts);
+
+    const status = mockRecordApiRequestTime.mock.calls[0][2];
+    expect(status).toBe("500");
+  });
+});
diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
index 693289df..69f0feea 100644
--- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
+++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
@@ -211,35 +211,29 @@ export const wrapFetchWithRetry = (
 ): typeof fetch => {
   return async (input: Input, init?: Init): Promise<Response> => {
     const start = performance.now();
-    let method = "GET";
-    let uri = "";
 
     try {
-      if (input instanceof Request) {
-        method = input.method;
-        uri = new URL(input.url).pathname;
-      } else if (typeof input === "string") {
-        method = init?.method ?? "GET";
-        try { uri = new URL(input).pathname; } catch { uri = input; }
-      } else {
-        method = init?.method ?? "GET";
-        try { uri = input.pathname; } catch { uri = String(input); }
+      return await retryFetch(input, init, fetchFn, options);
+    } catch (error) {
+      // Network-error fallback: the response interceptor never runs when
+      // fetch throws, so record a status="0" metric here as a safety net.
+      let method = "GET";
+      let uri = "";
+      try {
+        if (input instanceof Request) {
+          method = input.method;
+          uri = new URL(input.url).pathname;
+        } else if (typeof input === "string") {
+          method = init?.method ?? "GET";
+          try { uri = new URL(input).pathname; } catch { uri = input; }
+        } else {
+          method = init?.method ?? "GET";
+          try { uri = input.pathname; } catch { uri = String(input); }
+        }
+      } catch {
+        // Best-effort URI extraction
       }
-    } catch {
-      // Best-effort URI extraction
-    }
 
-    try {
-      const response = await retryFetch(input, init, fetchFn, options);
-      const durationMs = performance.now() - start;
-      getHttpMetricsObserver()?.recordApiRequestTime(
-        method,
-        uri,
-        String(response.status),
-        durationMs,
-      );
-      return response;
-    } catch (error) {
       const durationMs = performance.now() - start;
       getHttpMetricsObserver()?.recordApiRequestTime(
         method,
diff --git a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
new file mode 100644
index 00000000..f958ae9f
--- /dev/null
+++ b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
@@ -0,0 +1,67 @@
+import type { ResolvedRequestOptions } from "../../../open-api/generated/client/types.gen";
+import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver";
+
+/**
+ * Strips the /api prefix that the OpenAPI spec bakes into every path.
+ * Other SDKs (Go, Java, Python, Rust, Ruby, C#) keep /api in the base URL
+ * and use clean paths like /workflow/{workflowId}. The JS SDK's generated
+ * code does it backwards: the base URL has /api stripped and every path
+ * template starts with /api/. This normalises back to the cross-SDK
+ * convention.
+ */
+const stripApiPrefix = (url: string): string =>
+  url.startsWith("/api/") ? url.slice(4) : url;
+
+type OptsWithMetrics = ResolvedRequestOptions & { _metricsStart?: number };
+
+/**
+ * Creates a matched pair of request/response interceptors that record
+ * http_api_client_request_seconds via the global HttpMetricsObserver.
+ *
+ * Both interceptors receive the same `opts` object for a given request
+ * (see client.gen.ts lines 89-102), so start time is stashed directly
+ * on opts — no WeakMap or side-channel needed.
+ *
+ * The response interceptor passes both the interpolated URI (for legacy
+ * metrics) and the bounded-cardinality path template (for canonical
+ * metrics) to the observer, letting each collector choose which to use.
+ */
+export function createMetricsInterceptors() {
+  const onRequest = (
+    request: Request,
+    opts: ResolvedRequestOptions,
+  ): Request => {
+    (opts as OptsWithMetrics)._metricsStart = performance.now();
+    return request;
+  };
+
+  const onResponse = (
+    response: Response,
+    request: Request,
+    opts: ResolvedRequestOptions,
+  ): Response => {
+    const observer = getHttpMetricsObserver();
+    if (!observer) return response;
+
+    const start = (opts as OptsWithMetrics)._metricsStart;
+    const durationMs = start != null ? performance.now() - start : 0;
+
+    const method = request.method;
+    const status = String(response.status);
+
+    let uri = "";
+    try {
+      uri = new URL(request.url).pathname;
+    } catch {
+      uri = String(request.url);
+    }
+
+    const template = typeof opts.url === "string" ? stripApiPrefix(opts.url) : uri;
+
+    observer.recordApiRequestTime(method, uri, status, durationMs, template);
+
+    return response;
+  };
+
+  return { onRequest, onResponse };
+}
diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
index 6757bf08..bf8b6ad3 100644
--- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
+++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
@@ -389,16 +389,18 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface {
     uri: string,
     status: number | string,
     durationMs: number,
+    metricUri?: string,
   ): void {
+    const resolvedUri = metricUri ?? uri;
     const statusStr = String(status);
     const seconds = durationMs / 1000;
     this.state.httpApiClientRequestSeconds.observe(
-      { method, uri, status: statusStr },
+      { method, uri: resolvedUri, status: statusStr },
       seconds,
     );
     this._promRegistry?.observeHistogram("http_api_client_request_seconds", {
       method,
-      uri,
+      uri: resolvedUri,
       status: statusStr,
     }, seconds);
   }
diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
index 8932a91b..32467ff4 100644
--- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts
+++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
@@ -315,6 +315,7 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface {
     uri: string,
     status: number | string,
     durationMs: number,
+    _metricUri?: string,
   ): void {
     const key = `${method}:${uri}:${status}`;
     this.observeSummary(this.metrics.apiRequestDurationMs, key, durationMs, "api_request", "endpoint");
diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts
index 4852b099..dd341994 100644
--- a/src/sdk/worker/metrics/MetricsCollectorInterface.ts
+++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts
@@ -35,6 +35,7 @@ export interface MetricsCollectorInterface extends TaskRunnerEventsListener {
     uri: string,
     status: number | string,
     durationMs: number,
+    metricUri?: string,
   ): void;
 
   // ── Output / lifecycle ─────────────────────────────────────────
diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
index 37c9300f..aab49782 100644
--- a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
+++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts
@@ -246,6 +246,19 @@ describe("CanonicalMetricsCollector", () => {
       expect(text).toContain("# TYPE http_api_client_request_seconds histogram");
       expect(text).toContain('http_api_client_request_seconds_sum{method="GET",uri="/api/workflow",status="200"} 0.045');
     });
+
+    it("recordApiRequestTime should prefer metricUri over uri when provided", () => {
+      collector.recordApiRequestTime("GET", "/api/workflow/abc-123", 200, 30, "/workflow/{workflowId}");
+      const text = collector.toPrometheusText();
+      expect(text).toContain('uri="/workflow/{workflowId}"');
+      expect(text).not.toContain("abc-123");
+    });
+
+    it("recordApiRequestTime should fall back to uri when metricUri is undefined", () => {
+      collector.recordApiRequestTime("POST", "/api/tasks", 201, 20);
+      const text = collector.toPrometheusText();
+      expect(text).toContain('uri="/api/tasks"');
+    });
   });
 
   describe("onTaskPaused event", () => {
diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts
index ffe9f567..424922a7 100644
--- a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts
+++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts
@@ -161,6 +161,13 @@ describe("LegacyMetricsCollector - Prometheus features", () => {
       const m = collector.getMetrics();
       expect(m.apiRequestDurationMs.get("GET:/api/workflow:200")).toEqual([45, 55]);
     });
+
+    it("should ignore metricUri and use uri (legacy behavior preserved)", () => {
+      collector.recordApiRequestTime("GET", "/api/workflow/abc-123", 200, 45, "/workflow/{workflowId}");
+      const m = collector.getMetrics();
+      expect(m.apiRequestDurationMs.get("GET:/api/workflow/abc-123:200")).toEqual([45]);
+      expect(m.apiRequestDurationMs.get("GET:/workflow/{workflowId}:200")).toBeUndefined();
+    });
   });
 
   // ── Sliding window ──────────────────────────────────────────────
diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts
index fa2cbc76..9366e783 100644
--- a/src/sdk/worker/metrics/httpObserver.ts
+++ b/src/sdk/worker/metrics/httpObserver.ts
@@ -13,6 +13,7 @@ export interface HttpMetricsObserver {
     uri: string,
     status: number | string,
     durationMs: number,
+    metricUri?: string,
   ): void;
 
   recordWorkflowInputSize(

From c8e97afc45d69660b431a8cd1f2de9bbe4650e1e Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Mon, 18 May 2026 11:52:12 -0600
Subject: [PATCH 15/21] update to changelog and metrics docs

---
 CHANGELOG.md |  23 ++++++------
 METRICS.md   | 102 +++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 106 insertions(+), 19 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1dacb8b7..f541892d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,18 +9,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- **Metrics harmonization** - canonical metric surface aligned with the cross-SDK catalog, opt-in via `WORKER_CANONICAL_METRICS=true`
-  - New `CanonicalMetricsCollector` and optional `CanonicalPrometheusRegistry` (prom-client adapter) emit the harmonized cross-SDK catalog: 12 counters (e.g. `task_poll_total`, `task_execution_started_total`, `task_paused_total`, `external_payload_used_total{entityName,operation,payloadType}`, `workflow_start_error_total{workflowType,exception}`), 4 time histograms (`task_poll_time_seconds`, `task_execute_time_seconds`, `task_update_time_seconds`, `http_api_client_request_seconds{method,uri,status}`) with buckets `0.001…10s`, 2 size histograms (`task_result_size_bytes`, `workflow_input_size_bytes{workflowType,version}`) with buckets `100…10_000_000` bytes, and `active_workers` gauge. Labels are camelCase; names are unprefixed.
-  - `createMetricsCollector()` factory selects `LegacyMetricsCollector` (default) or `CanonicalMetricsCollector` based on `WORKER_CANONICAL_METRICS` (truthy: `true`, `1`, `yes`, case-insensitive). `WORKER_LEGACY_METRICS` is also recognized; canonical wins when both are set.
-  - `HttpMetricsObserver` plus `fetchWithRetry` instrumentation records `http_api_client_request_seconds`; `WorkflowExecutor` records `workflow_input_size_bytes` and `workflow_start_error_total`.
-  - `Poller`, `TaskRunner`, and `EventDispatcher` emit a new `taskPaused` event when a poll cycle is skipped because the worker is paused.
-  - `fetchWithRetry` now retries HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE).
-  - Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`; `harness/main.ts` logs which collector is active.
+- Canonical metrics: opt-in harmonized metric surface via `WORKER_CANONICAL_METRICS=true` -- see [METRICS.md](METRICS.md) for the full catalog, configuration, and migration guide
+- Bounded `uri` label on `http_api_client_request_seconds`: canonical mode uses path templates (e.g. `/workflow/{workflowId}`) instead of fully-resolved paths, preventing metric cardinality explosion from dynamic IDs
+- `WorkflowStatusProbe` in harness: opt-in probe (via `HARNESS_PROBE_RATE_PER_SEC`) that exercises UUID-bearing endpoints to validate template URI metrics
+- `fetchWithRetry` now retries HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE)
 
 ### Changed
 
-- **Metrics harmonization** - defaults preserved; legacy metrics emit unchanged when `WORKER_CANONICAL_METRICS` is unset
-  - `src/sdk/worker/metrics/MetricsCollector.ts` was renamed to `LegacyMetricsCollector.ts`. The public symbol is preserved via `export { LegacyMetricsCollector as MetricsCollector }` in `src/sdk/worker/metrics/index.ts`, so existing imports keep working.
-  - Default behavior is unchanged: with no env var set, the metric names, labels, and `conductor_worker_*` prefix from `v3.0.3` are preserved byte-for-byte.
-  - Rewrote `METRICS.md` with both surfaces, the env-var gate, side-by-side migration table with PromQL replacements, and troubleshooting.
-  - Updated `README.md`, `AGENTS.md`, `SDK_DEVELOPMENT.md`, `SDK_COMPARISON.md`, and `WORKER_ARCHITECTURE_COMPARISON.md` to reference `createMetricsCollector()` and the env var.
+- Legacy metrics emit unchanged by default; no action required for existing deployments
+- `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working
+- HTTP metrics recording moved from `fetchWithRetry` to OpenAPI client interceptors -- [details](METRICS.md#detailed-technical-notes----unreleased)
+
+### Deprecated
+
+- Legacy metric names remain the default. Migration guidance is in [METRICS.md](METRICS.md#migrating-from-legacy-to-canonical).
diff --git a/METRICS.md b/METRICS.md
index 7d43e0c2..b687a0e1 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -253,7 +253,7 @@ Type is Summary. Values are bytes.
 | `payload_type` | Legacy `conductor_worker_external_payload_used_total` | Payload type, such as `workflow_input` or `task_output`. |
 | `payloadType` | Canonical `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. |
 | `method` | Canonical HTTP metrics | HTTP verb. |
-| `uri` | Canonical HTTP metrics | Request path. May contain interpolated identifiers. |
+| `uri` | Canonical HTTP metrics | Request path template (e.g. `/workflow/{workflowId}`). Uses bounded-cardinality templates, not interpolated paths. |
 | `endpoint` | Legacy HTTP metrics | Compound `"METHOD:/api/path:STATUS"` string. |
 | `quantile` | Legacy time and size metrics | `0.5`, `0.75`, `0.9`, `0.95`, or `0.99`. |
 
@@ -338,17 +338,17 @@ sum(rate(task_execute_time_seconds_count[5m])) by (taskType)
 ### Missing HTTP or Workflow Metrics
 
 - `http_api_client_request_seconds` (canonical) or
-  `conductor_worker_http_api_client_request` (legacy) is recorded from the
-  `fetchWithRetry` HTTP layer. Verify the collector is constructed before HTTP
-  calls begin.
+  `conductor_worker_http_api_client_request` (legacy) is recorded by request /
+  response interceptors on the OpenAPI client, with a network-error fallback in
+  `fetchWithRetry`. Verify the collector is constructed before HTTP calls begin.
 - `workflow_input_size_bytes` and `workflow_start_error_total` are recorded in
   `WorkflowExecutor`. Verify the collector is active before starting workflows.
 
 ### High Cardinality
 
-- Watch the `uri` label (canonical) or `endpoint` label (legacy) on HTTP
-  metrics. The SDK records the interpolated request path, which may include
-  task type names or workflow IDs.
+- In canonical mode the `uri` label on `http_api_client_request_seconds` uses
+  path templates (e.g. `/workflow/{workflowId}`), giving bounded cardinality.
+  Legacy mode's `endpoint` label still contains fully interpolated paths.
 - Prefer canonical mode for bounded `exception` labels. Legacy error counters
   encode exception names in the Map key, not as a proper Prometheus label.
 - Avoid embedding user identifiers or unbounded values in task type, workflow
@@ -386,3 +386,91 @@ process.on("unhandledRejection", (reason) => {
   option.
 - When `usePromClient: true` is set but `prom-client` is not installed, the
   collector falls back to the built-in text format silently.
+
+## Detailed Technical Notes -- Unreleased
+
+This section documents internal implementation details for developers reviewing
+the metrics harmonization changes. It is not end-user-facing and will be
+removed or folded into the relevant sections once the release is published.
+
+### Architecture
+
+The `MetricsCollectorInterface` (`src/sdk/worker/metrics/MetricsCollectorInterface.ts`)
+defines the contract for recording SDK metrics. Two implementations exist:
+
+- `LegacyMetricsCollector` (`src/sdk/worker/metrics/LegacyMetricsCollector.ts`)
+  -- emits the original metric names and types (sliding-window Summaries for
+  timing, compound `endpoint` labels, `conductor_worker_` prefix).
+- `CanonicalMetricsCollector` (`src/sdk/worker/metrics/CanonicalMetricsCollector.ts`)
+  -- emits the harmonized cross-SDK catalog (Histograms in seconds/bytes,
+  `_total` counters, bounded `exception` labels from `error.name`).
+
+`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` once and returns the
+appropriate implementation.
+
+### HTTP request timing via interceptors
+
+`createMetricsInterceptors()` (`src/sdk/createConductorClient/helpers/metricsInterceptors.ts`)
+returns a matched pair of request/response interceptors registered on the
+OpenAPI generated client:
+
+- **Request interceptor** stashes `performance.now()` on the `opts` object
+  that the generated client passes through both interceptors.
+- **Response interceptor** computes the request duration, extracts the
+  interpolated URI from `request.url`, extracts the path template from
+  `opts.url` (stripping the `/api/` prefix baked in by the OpenAPI spec), and
+  calls `observer.recordApiRequestTime(method, uri, status, durationMs, metricUri)`.
+
+Both interceptors receive the same `opts` reference (confirmed in
+`client.gen.ts` lines 89-102), so no `WeakMap` or side-channel is needed.
+
+`wrapFetchWithRetry` retains a network-error fallback that records `status: "0"`
+when `fetch` throws before any HTTP response (the response interceptor never
+runs in that case).
+
+### Canonical vs. legacy URI label
+
+`recordApiRequestTime` accepts an optional `metricUri` parameter (the path
+template). The canonical collector uses `metricUri ?? uri`; the legacy collector
+ignores `metricUri` and uses the interpolated `uri`, preserving legacy output
+byte-for-byte.
+
+The JavaScript SDK's OpenAPI spec bakes `/api/` into every path template
+(`opts.url = "/api/workflow/{workflowId}"`). Other SDKs keep `/api` in the base
+URL and use clean paths. The response interceptor strips this prefix so the
+canonical `uri` label matches the cross-SDK convention (e.g.
+`/workflow/{workflowId}`).
+
+### Payload size metrics
+
+`workflow_input_size_bytes` and `task_result_size_bytes` measure specific
+sub-fields of the request/response payloads (`workflowRequest.input` and
+`merged.outputData` respectively), not the full HTTP body. This requires a
+separate `JSON.stringify` call on the sub-field, independent of the HTTP body
+serialization performed by the OpenAPI client.
+
+This is an acceptable minor cost, consistent with how the Go, Java, and Python
+SDKs measure the same metrics. The Go SDK provides `WORKER_METRICS_PAYLOAD_SIZE`
+to opt out of this serialization; a similar env var can be added to the
+JavaScript SDK in a future release if needed.
+
+### Metrics collector event surface
+
+`CanonicalMetricsCollector` and `LegacyMetricsCollector` both implement
+`TaskRunnerEventsListener`, so they receive lifecycle events from the worker
+infrastructure:
+
+- `onTaskPoll` / `onTaskPollError` / `onTaskExecutionStarted` /
+  `onTaskExecutionCompleted` / `onTaskExecutionFailed` / `onTaskUpdateSuccess` /
+  `onTaskUpdateError` / `onTaskAckError` / `onTaskAckFailed` /
+  `onTaskExecutionQueueFull` / `onTaskPaused`
+- `Poller`, `TaskRunner`, and `EventDispatcher` emit a `taskPaused` event when a
+  poll cycle is skipped because the worker is paused.
+
+### Harness changes
+
+- Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`.
+- `harness/main.ts` logs which collector is active.
+- `WorkflowStatusProbe` (opt-in via `HARNESS_PROBE_RATE_PER_SEC`) exercises
+  UUID-bearing endpoints (`/api/workflow/{workflowId}`) to validate that the
+  `uri` label uses bounded templates.

From 5b0becde22c8186e6418154ab8162304fe93e8ef Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Mon, 18 May 2026 12:54:21 -0600
Subject: [PATCH 16/21] address self-review concerns regarding backward
 compatibility

---
 .github/workflows/pull_request.yml            |  5 ++-
 CHANGELOG.md                                  | 20 +++++----
 METRICS.md                                    | 43 ++++++++++++++++---
 src/sdk/clients/worker/events/types.ts        |  2 +-
 src/sdk/clients/workflow/WorkflowExecutor.ts  |  7 ++-
 .../createConductorClient.ts                  |  4 +-
 .../helpers/__tests__/fetchWithRetry.test.ts  | 20 +++++++++
 .../helpers/fetchWithRetry.ts                 |  8 ++--
 .../helpers/metricsInterceptors.ts            |  4 +-
 .../helpers/resolveOrkesConfig.ts             |  3 ++
 src/sdk/types.ts                              |  1 +
 .../metrics/CanonicalMetricsCollector.ts      | 23 ++++++----
 .../worker/metrics/LegacyMetricsCollector.ts  | 18 ++++++--
 .../metrics/MetricsCollectorInterface.ts      |  4 ++
 .../metrics/__tests__/metricsFactory.test.ts  | 17 --------
 src/sdk/worker/metrics/httpObserver.ts        |  2 +
 src/sdk/worker/metrics/metricsFactory.ts      |  9 ++--
 17 files changed, 132 insertions(+), 58 deletions(-)

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index b835e6e5..92c3eda7 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -82,13 +82,12 @@ jobs:
   # Integration tests (v5): lower max-parallel reduces 502/503 from the shared Conductor server
   # but makes CI slower without eliminating flakes entirely — feel free to experiment.
   # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests.
-  # fetchWithRetry now retries 502/503/504, so higher parallelism is more viable than before.
   integration-tests:
     runs-on: ubuntu-latest
     timeout-minutes: 25
     strategy:
       fail-fast: false
-      max-parallel: 3
+      max-parallel: 2
       matrix:
         node-version: [20, 22, 24]
         shard: [1, 2, 3]
@@ -120,6 +119,7 @@ jobs:
           CONDUCTOR_AUTH_KEY: ${{ secrets.AUTH_KEY }}
           CONDUCTOR_AUTH_SECRET: ${{ secrets.AUTH_SECRET }}
           CONDUCTOR_REQUEST_TIMEOUT_MS: "120000"
+          CONDUCTOR_RETRY_SERVER_ERRORS: "true"
           JEST_JUNIT_OUTPUT_NAME: integration-v5-node-${{ matrix.node-version }}-shard-${{ matrix.shard }}-test-results.xml
       - name: Publish Test Results
         uses: dorny/test-reporter@v2
@@ -177,6 +177,7 @@ jobs:
           CONDUCTOR_AUTH_KEY: ${{ secrets.AUTH_KEY_V4 }}
           CONDUCTOR_AUTH_SECRET: ${{ secrets.AUTH_SECRET_V4 }}
           CONDUCTOR_REQUEST_TIMEOUT_MS: "120000"
+          CONDUCTOR_RETRY_SERVER_ERRORS: "true"
           JEST_JUNIT_OUTPUT_NAME: integration-v4-node-${{ matrix.node-version }}-shard-${{ matrix.shard }}-test-results.xml
       - name: Publish Test Results
         uses: dorny/test-reporter@v2
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f541892d..004f0a2a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,17 +9,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
-- Canonical metrics: opt-in harmonized metric surface via `WORKER_CANONICAL_METRICS=true` -- see [METRICS.md](METRICS.md) for the full catalog, configuration, and migration guide
-- Bounded `uri` label on `http_api_client_request_seconds`: canonical mode uses path templates (e.g. `/workflow/{workflowId}`) instead of fully-resolved paths, preventing metric cardinality explosion from dynamic IDs
-- `WorkflowStatusProbe` in harness: opt-in probe (via `HARNESS_PROBE_RATE_PER_SEC`) that exercises UUID-bearing endpoints to validate template URI metrics
-- `fetchWithRetry` now retries HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE)
+- **Canonical metrics** -- opt-in harmonized metric surface via `WORKER_CANONICAL_METRICS=true`. See [METRICS.md](METRICS.md) for the full catalog, configuration, and migration guide.
+- Bounded `uri` label on `http_api_client_request_seconds`: canonical mode uses path templates (e.g. `/workflow/{workflowId}`) instead of fully-resolved paths, preventing metric cardinality explosion from dynamic IDs.
+- `TaskPaused` event type and `PollerOptions.onPaused` callback: emitted when a poll cycle is skipped because the worker is paused. Canonical mode records `task_paused_total`; legacy mode does not (see Implementation Notes in METRICS.md).
+- `measurePayloadSize` option in `MetricsCollectorConfig`: controls whether `workflow_input_size_bytes` is recorded via `JSON.stringify` on each `startWorkflow` call. Defaults to `true` for canonical, `false` for legacy.
+- `retryServerErrors` option in `OrkesApiConfig` / `RetryFetchOptions` and `CONDUCTOR_RETRY_SERVER_ERRORS` env var: opt-in retry of HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE). Default `false`; set to `true` to enable.
+- `WorkflowStatusProbe` in harness: opt-in probe (via `HARNESS_PROBE_RATE_PER_SEC`) that exercises UUID-bearing endpoints to validate template URI metrics.
+- `WORKER_LEGACY_METRICS` is reserved for future use. Once canonical metrics become the default, setting `WORKER_LEGACY_METRICS=true` will re-activate the legacy surface. It is not read by the current implementation.
 
 ### Changed
 
-- Legacy metrics emit unchanged by default; no action required for existing deployments
-- `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working
-- HTTP metrics recording moved from `fetchWithRetry` to OpenAPI client interceptors -- [details](METRICS.md#detailed-technical-notes----unreleased)
+- Legacy metrics emit unchanged by default; no action required for existing deployments.
+- `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working.
+- HTTP metrics recording moved from `fetchWithRetry` to OpenAPI client interceptors -- [details](METRICS.md#implementation-notes).
+- `TaskUpdateFailure.durationMs` is now optional to preserve backward compatibility for external event listener implementations.
 
 ### Deprecated
 
-- Legacy metric names remain the default. Migration guidance is in [METRICS.md](METRICS.md#migrating-from-legacy-to-canonical).
+- Legacy metric names remain the default during the transition period. Migration guidance is in [METRICS.md](METRICS.md#migrating-from-legacy-to-canonical).
diff --git a/METRICS.md b/METRICS.md
index b687a0e1..ee3c1637 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -387,6 +387,40 @@ process.on("unhandledRejection", (reason) => {
 - When `usePromClient: true` is set but `prom-client` is not installed, the
   collector falls back to the built-in text format silently.
 
+## Implementation Notes
+
+### One Collector Per Process
+
+Only one metrics collector can be active at a time. Both
+`LegacyMetricsCollector` and `CanonicalMetricsCollector` register themselves
+as the global HTTP metrics observer on construction. Creating a second
+collector replaces the first. Calling `stop()` clears the global observer.
+
+### Payload Size Measurement
+
+`workflow_input_size_bytes` is recorded by calling `JSON.stringify` on the
+workflow input and measuring the resulting UTF-8 byte length with
+`Buffer.byteLength`. This cost is controlled by the `measurePayloadSize`
+config option:
+
+- **Canonical** (default `true`): measured on every `startWorkflow` call.
+  Set `measurePayloadSize: false` to disable.
+- **Legacy** (default `false`): not measured unless you set
+  `measurePayloadSize: true`.
+
+### Legacy `task_paused_total`
+
+Legacy mode does not emit `task_paused_total`. This metric was defined but
+never recorded in any prior release. It remains unrecorded in legacy mode to
+preserve byte-for-byte output compatibility. Switch to canonical mode
+(`WORKER_CANONICAL_METRICS=true`) to get paused metrics.
+
+### `WORKER_LEGACY_METRICS` (Reserved)
+
+`WORKER_LEGACY_METRICS` is reserved for future use. Once canonical metrics
+become the default, setting `WORKER_LEGACY_METRICS=true` will re-activate
+the legacy metric surface. It is not read by the current implementation.
+
 ## Detailed Technical Notes -- Unreleased
 
 This section documents internal implementation details for developers reviewing
@@ -415,7 +449,7 @@ returns a matched pair of request/response interceptors registered on the
 OpenAPI generated client:
 
 - **Request interceptor** stashes `performance.now()` on the `opts` object
-  that the generated client passes through both interceptors.
+  only when a metrics observer is active (no mutation otherwise).
 - **Response interceptor** computes the request duration, extracts the
   interpolated URI from `request.url`, extracts the path template from
   `opts.url` (stripping the `/api/` prefix baked in by the OpenAPI spec), and
@@ -449,10 +483,9 @@ sub-fields of the request/response payloads (`workflowRequest.input` and
 separate `JSON.stringify` call on the sub-field, independent of the HTTP body
 serialization performed by the OpenAPI client.
 
-This is an acceptable minor cost, consistent with how the Go, Java, and Python
-SDKs measure the same metrics. The Go SDK provides `WORKER_METRICS_PAYLOAD_SIZE`
-to opt out of this serialization; a similar env var can be added to the
-JavaScript SDK in a future release if needed.
+This cost is controlled by the `measurePayloadSize` config option (canonical
+default `true`, legacy default `false`). The Go SDK provides
+`WORKER_METRICS_PAYLOAD_SIZE` for the same purpose.
 
 ### Metrics collector event surface
 
diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts
index 4bd5306c..14554aff 100644
--- a/src/sdk/clients/worker/events/types.ts
+++ b/src/sdk/clients/worker/events/types.ts
@@ -115,7 +115,7 @@ export interface TaskUpdateFailure extends TaskRunnerEvent {
   /** The error that caused the final update failure */
   cause: Error;
   /** Time taken for the last update attempt in milliseconds */
-  durationMs: number;
+  durationMs?: number;
   /** Number of retry attempts made */
   retryCount: number;
   /** The TaskResult object that failed to update (for recovery/logging) */
diff --git a/src/sdk/clients/workflow/WorkflowExecutor.ts b/src/sdk/clients/workflow/WorkflowExecutor.ts
index c527050e..17cd25f7 100644
--- a/src/sdk/clients/workflow/WorkflowExecutor.ts
+++ b/src/sdk/clients/workflow/WorkflowExecutor.ts
@@ -73,11 +73,10 @@ export class WorkflowExecutor {
     workflowRequest: StartWorkflowRequest
   ): Promise<string> {
     const observer = getHttpMetricsObserver();
-    if (observer) {
+    if (observer?.measurePayloadSize && workflowRequest.input) {
       try {
-        const inputBytes = workflowRequest.input
-          ? JSON.stringify(workflowRequest.input).length
-          : 0;
+        const json = JSON.stringify(workflowRequest.input);
+        const inputBytes = Buffer.byteLength(json, "utf8");
         observer.recordWorkflowInputSize(
           workflowRequest.name ?? "",
           inputBytes,
diff --git a/src/sdk/createConductorClient/createConductorClient.ts b/src/sdk/createConductorClient/createConductorClient.ts
index c1d8970c..e080fd93 100644
--- a/src/sdk/createConductorClient/createConductorClient.ts
+++ b/src/sdk/createConductorClient/createConductorClient.ts
@@ -35,6 +35,7 @@ export const createConductorClient = async (
     proxyUrl,
     tlsInsecure,
     disableHttp2,
+    retryServerErrors,
   } = resolveOrkesConfig(config);
 
   if (!serverUrl) throw new Error("Conductor server URL is not set");
@@ -53,7 +54,7 @@ export const createConductorClient = async (
   // Start with retry + timeout on fetch (no auth failure callback yet)
   const openApiClient = createClient({
     baseUrl: serverUrl,
-    fetch: wrapFetchWithRetry(baseFetchFn, { requestTimeoutMs }),
+    fetch: wrapFetchWithRetry(baseFetchFn, { requestTimeoutMs, retryServerErrors }),
     throwOnError: true,
   });
 
@@ -79,6 +80,7 @@ export const createConductorClient = async (
       fetch: wrapFetchWithRetry(baseFetchFn, {
         onAuthFailure: authResult.refreshToken,
         requestTimeoutMs,
+        retryServerErrors,
       }),
     });
   }
diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
index 111f0120..bfed241e 100644
--- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
+++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
@@ -184,6 +184,18 @@ describe("fetchWithRetry", () => {
   // ─── Server error (502/503/504) retry ───────────────────────────────
 
   describe("server error (502/503/504) retry", () => {
+    it("should NOT retry 502 by default (retryServerErrors unset)", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway"));
+
+      const result = await retryFetch("http://test.com", {}, mockFetch, {
+        maxTransportRetries: 3,
+        initialRetryDelay: 1,
+      });
+
+      expect(result.status).toBe(502);
+      expect(mockFetch).toHaveBeenCalledTimes(1);
+    });
+
     it("should retry 502 and succeed on next attempt", async () => {
       mockFetch
         .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway"))
@@ -192,6 +204,7 @@ describe("fetchWithRetry", () => {
       const result = await retryFetch("http://test.com", {}, mockFetch, {
         maxTransportRetries: 3,
         initialRetryDelay: 1,
+        retryServerErrors: true,
       });
 
       expect(result.status).toBe(200);
@@ -206,6 +219,7 @@ describe("fetchWithRetry", () => {
       const result = await retryFetch("http://test.com", {}, mockFetch, {
         maxTransportRetries: 3,
         initialRetryDelay: 1,
+        retryServerErrors: true,
       });
 
       expect(result.status).toBe(200);
@@ -220,6 +234,7 @@ describe("fetchWithRetry", () => {
       const result = await retryFetch("http://test.com", {}, mockFetch, {
         maxTransportRetries: 3,
         initialRetryDelay: 1,
+        retryServerErrors: true,
       });
 
       expect(result.status).toBe(200);
@@ -232,6 +247,7 @@ describe("fetchWithRetry", () => {
       const result = await retryFetch("http://test.com", {}, mockFetch, {
         maxTransportRetries: 2,
         initialRetryDelay: 1,
+        retryServerErrors: true,
       });
 
       expect(result.status).toBe(502);
@@ -245,6 +261,7 @@ describe("fetchWithRetry", () => {
       const result = await retryFetch("http://test.com", { method: "POST" }, mockFetch, {
         maxTransportRetries: 3,
         initialRetryDelay: 1,
+        retryServerErrors: true,
       });
 
       expect(result.status).toBe(502);
@@ -257,6 +274,7 @@ describe("fetchWithRetry", () => {
       const result = await retryFetch("http://test.com", { method: "PATCH" }, mockFetch, {
         maxTransportRetries: 3,
         initialRetryDelay: 1,
+        retryServerErrors: true,
       });
 
       expect(result.status).toBe(503);
@@ -271,6 +289,7 @@ describe("fetchWithRetry", () => {
       const result = await retryFetch("http://test.com", { method: "PUT" }, mockFetch, {
         maxTransportRetries: 3,
         initialRetryDelay: 1,
+        retryServerErrors: true,
       });
 
       expect(result.status).toBe(200);
@@ -286,6 +305,7 @@ describe("fetchWithRetry", () => {
       const result = await retryFetch("http://test.com", {}, mockFetch, {
         maxTransportRetries: 3,
         initialRetryDelay: 1,
+        retryServerErrors: true,
       });
 
       expect(result.status).toBe(200);
diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
index 69f0feea..81a2b824 100644
--- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
+++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
@@ -9,6 +9,7 @@ export interface RetryFetchOptions {
   maxRateLimitRetries?: number; // default 5
   maxTransportRetries?: number; // default 3
   initialRetryDelay?: number; // default 1000ms
+  retryServerErrors?: boolean; // retry 502/503/504 for idempotent methods (default false)
 }
 
 const isTimeoutError = (error: unknown): boolean => {
@@ -161,9 +162,10 @@ export const retryFetch = async (
       return rateLimitResponse;
     }
 
-    // Gateway error retry (502, 503, 504) -- only for idempotent methods to
-    // avoid duplicate side effects when the upstream may have processed the request.
-    if (response.status >= 502 && response.status <= 504) {
+    // Gateway error retry (502, 503, 504) -- opt-in via retryServerErrors.
+    // Only retries idempotent methods to avoid duplicate side effects when
+    // the upstream may have processed the request.
+    if (options.retryServerErrors && response.status >= 502 && response.status <= 504) {
       const reqMethod = (input instanceof Request
         ? input.method
         : init?.method ?? "GET"
diff --git a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
index f958ae9f..1e9a2d81 100644
--- a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
+++ b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
@@ -31,7 +31,9 @@ export function createMetricsInterceptors() {
     request: Request,
     opts: ResolvedRequestOptions,
   ): Request => {
-    (opts as OptsWithMetrics)._metricsStart = performance.now();
+    if (getHttpMetricsObserver()) {
+      (opts as OptsWithMetrics)._metricsStart = performance.now();
+    }
     return request;
   };
 
diff --git a/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts b/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts
index b11c9def..42d74744 100644
--- a/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts
+++ b/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts
@@ -67,5 +67,8 @@ export const resolveOrkesConfig = (config?: Partial<OrkesApiConfig>) => {
     disableHttp2:
       parseEnvBoolean(process.env.CONDUCTOR_DISABLE_HTTP2) ??
       config?.disableHttp2,
+    retryServerErrors:
+      parseEnvBoolean(process.env.CONDUCTOR_RETRY_SERVER_ERRORS) ??
+      config?.retryServerErrors,
   };
 };
diff --git a/src/sdk/types.ts b/src/sdk/types.ts
index 81f988c9..fc3beffd 100644
--- a/src/sdk/types.ts
+++ b/src/sdk/types.ts
@@ -20,4 +20,5 @@ export interface OrkesApiConfig {
   proxyUrl?: string; // HTTP/HTTPS proxy URL
   tlsInsecure?: boolean; // disable TLS certificate verification (for self-signed certs)
   disableHttp2?: boolean; // force HTTP/1.1 instead of HTTP/2
+  retryServerErrors?: boolean; // retry 502/503/504 for idempotent methods (default false)
 }
diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
index bf8b6ad3..03ebb5af 100644
--- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
+++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
@@ -65,10 +65,12 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface {
   private _fileTimer?: ReturnType<typeof setInterval>;
   private _promRegistry?: import("./CanonicalPrometheusRegistry.js").CanonicalPrometheusRegistry;
   private readonly _usePromClient: boolean;
+  readonly measurePayloadSize: boolean;
 
   constructor(config?: MetricsCollectorConfig) {
     this.state = this.createEmpty();
     this._usePromClient = config?.usePromClient ?? false;
+    this.measurePayloadSize = config?.measurePayloadSize ?? true;
     if (this._usePromClient) {
       void this.initPromClient();
     }
@@ -81,6 +83,7 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface {
         config.fileWriteIntervalMs ?? 5000,
       );
     }
+    setHttpMetricsObserver(this);
   }
 
   private async initPromClient(): Promise<void> {
@@ -279,15 +282,17 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface {
       exception: excName,
     });
 
-    const seconds = event.durationMs / 1000;
-    this.state.taskUpdateTimeSeconds.observe(
-      { taskType: event.taskType, status: "FAILURE" },
-      seconds,
-    );
-    this._promRegistry?.observeHistogram("task_update_time_seconds", {
-      taskType: event.taskType,
-      status: "FAILURE",
-    }, seconds);
+    if (event.durationMs != null) {
+      const seconds = event.durationMs / 1000;
+      this.state.taskUpdateTimeSeconds.observe(
+        { taskType: event.taskType, status: "FAILURE" },
+        seconds,
+      );
+      this._promRegistry?.observeHistogram("task_update_time_seconds", {
+        taskType: event.taskType,
+        status: "FAILURE",
+      }, seconds);
+    }
   }
 
   onTaskPaused(event: TaskPaused): void {
diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
index 32467ff4..60ac3b5a 100644
--- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts
+++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
@@ -33,6 +33,11 @@ export interface MetricsCollectorConfig {
    * Falls back to custom text format if prom-client is not installed.
    */
   usePromClient?: boolean;
+  /**
+   * Measure workflow input payload size via JSON.stringify on each
+   * `startWorkflow` call.  Default: true for canonical, false for legacy.
+   */
+  measurePayloadSize?: boolean;
 }
 
 /**
@@ -103,11 +108,13 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface {
   private _server?: import("./MetricsServer.js").MetricsServer;
   private _fileTimer?: ReturnType<typeof setInterval>;
   private _promRegistry?: import("./PrometheusRegistry.js").PrometheusRegistry;
+  readonly measurePayloadSize: boolean;
 
   constructor(config?: MetricsCollectorConfig) {
     this.metrics = this.createEmptyMetrics();
     this._prefix = config?.prefix ?? "conductor_worker";
     this._slidingWindowSize = config?.slidingWindowSize ?? 1000;
+    this.measurePayloadSize = config?.measurePayloadSize ?? false;
     if (config?.usePromClient) {
       void this.initPromClient();
     }
@@ -120,6 +127,7 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface {
         config.fileWriteIntervalMs ?? 5000,
       );
     }
+    setHttpMetricsObserver(this);
   }
 
   private async initPromClient(): Promise<void> {
@@ -253,12 +261,16 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface {
 
   onTaskUpdateFailure(event: TaskUpdateFailure): void {
     this.incrementCounter(this.metrics.taskUpdateFailureTotal, event.taskType, "update_error_total", "task_type");
-    this.observeSummary(this.metrics.updateDurationMs, event.taskType, event.durationMs, "update_time", "task_type");
+    if (event.durationMs != null) {
+      this.observeSummary(this.metrics.updateDurationMs, event.taskType, event.durationMs, "update_time", "task_type");
+    }
   }
 
-  // Canonical-only event — noop in legacy
   onTaskPaused(): void {
-    // Noop: TaskPaused events are handled via recordTaskPaused()
+    // Intentional no-op: task_paused_total was never emitted by the legacy
+    // metrics surface. Keeping it unrecorded preserves byte-for-byte output
+    // compatibility. Users who want paused metrics should switch to canonical
+    // mode (WORKER_CANONICAL_METRICS=true).
   }
 
   // ── Direct Recording Methods ───────────────────────────────────
diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts
index dd341994..90ec8e8e 100644
--- a/src/sdk/worker/metrics/MetricsCollectorInterface.ts
+++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts
@@ -38,6 +38,10 @@ export interface MetricsCollectorInterface extends TaskRunnerEventsListener {
     metricUri?: string,
   ): void;
 
+  // ── Configuration ───────────────────────────────────────────────
+
+  readonly measurePayloadSize: boolean;
+
   // ── Output / lifecycle ─────────────────────────────────────────
 
   collectorName(): string;
diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
index 529fad17..02ca03c6 100644
--- a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
+++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts
@@ -12,15 +12,6 @@ describe("createMetricsCollector", () => {
 
   it("should return LegacyMetricsCollector by default", () => {
     delete process.env.WORKER_CANONICAL_METRICS;
-    delete process.env.WORKER_LEGACY_METRICS;
-
-    const collector = createMetricsCollector();
-    expect(collector).toBeInstanceOf(LegacyMetricsCollector);
-  });
-
-  it("should return LegacyMetricsCollector when WORKER_LEGACY_METRICS=true", () => {
-    process.env.WORKER_LEGACY_METRICS = "true";
-    delete process.env.WORKER_CANONICAL_METRICS;
 
     const collector = createMetricsCollector();
     expect(collector).toBeInstanceOf(LegacyMetricsCollector);
@@ -33,14 +24,6 @@ describe("createMetricsCollector", () => {
     expect(collector).toBeInstanceOf(CanonicalMetricsCollector);
   });
 
-  it("should prefer canonical when both env vars are true", () => {
-    process.env.WORKER_CANONICAL_METRICS = "true";
-    process.env.WORKER_LEGACY_METRICS = "true";
-
-    const collector = createMetricsCollector();
-    expect(collector).toBeInstanceOf(CanonicalMetricsCollector);
-  });
-
   it("should return LegacyMetricsCollector when WORKER_CANONICAL_METRICS=false", () => {
     process.env.WORKER_CANONICAL_METRICS = "false";
 
diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts
index 9366e783..ab2633b9 100644
--- a/src/sdk/worker/metrics/httpObserver.ts
+++ b/src/sdk/worker/metrics/httpObserver.ts
@@ -8,6 +8,8 @@
  */
 
 export interface HttpMetricsObserver {
+  readonly measurePayloadSize: boolean;
+
   recordApiRequestTime(
     method: string,
     uri: string,
diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts
index dabe4810..86066895 100644
--- a/src/sdk/worker/metrics/metricsFactory.ts
+++ b/src/sdk/worker/metrics/metricsFactory.ts
@@ -7,11 +7,12 @@ import { setHttpMetricsObserver } from "./httpObserver";
 /**
  * Create the appropriate MetricsCollector based on environment variables.
  *
- * - WORKER_CANONICAL_METRICS=true  -> CanonicalMetricsCollector  (default: false)
- * - WORKER_LEGACY_METRICS=true     -> LegacyMetricsCollector     (default: true)
+ * - WORKER_CANONICAL_METRICS=true  -> CanonicalMetricsCollector
+ * - Unset / any other value        -> LegacyMetricsCollector (default)
  *
- * WORKER_CANONICAL_METRICS takes priority when both are set.
- * During the deprecation transition period the default is legacy.
+ * WORKER_LEGACY_METRICS is reserved for future use: once canonical becomes
+ * the default, setting WORKER_LEGACY_METRICS=true will re-activate legacy
+ * metrics.  It is not read by the current implementation.
  */
 export function createMetricsCollector(
   config?: MetricsCollectorConfig,

From 7a4c991c9d9236abcb86b083c2ca67a20895447e Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Mon, 18 May 2026 13:19:14 -0600
Subject: [PATCH 17/21] address backward compat concerned from self review

---
 CHANGELOG.md                                        |  6 +++---
 METRICS.md                                          | 13 +++++++++----
 .../createConductorClient/helpers/fetchWithRetry.ts |  5 ++++-
 src/sdk/worker/metrics/CanonicalMetricsCollector.ts |  6 ++++--
 src/sdk/worker/metrics/LegacyMetricsCollector.ts    |  7 ++++---
 src/sdk/worker/metrics/httpObserver.ts              | 11 ++++++++---
 6 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 004f0a2a..b06338a3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,10 +19,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Changed
 
-- Legacy metrics emit unchanged by default; no action required for existing deployments.
+- Legacy metrics emit unchanged when constructing `LegacyMetricsCollector` directly (the pre-existing pattern). Using `createMetricsCollector()` additionally enables automatic HTTP request timing via OpenAPI interceptors for both legacy and canonical modes; no other action required for existing deployments.
 - `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working.
-- HTTP metrics recording moved from `fetchWithRetry` to OpenAPI client interceptors -- [details](METRICS.md#implementation-notes).
-- `TaskUpdateFailure.durationMs` is now optional to preserve backward compatibility for external event listener implementations.
+- `http_api_client_request` timing is now recorded automatically via OpenAPI client request/response interceptors when a metrics collector is active (via `createMetricsCollector()` or `setHttpMetricsObserver`). Previously, `recordApiRequestTime` existed but was not wired into the HTTP pipeline -- [details](METRICS.md#implementation-notes).
+- Added optional `durationMs` field to `TaskUpdateFailure` event, recording the duration of the last update attempt. Declared optional so existing event listener implementations are unaffected.
 
 ### Deprecated
 
diff --git a/METRICS.md b/METRICS.md
index ee3c1637..d8def06a 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -493,10 +493,15 @@ default `true`, legacy default `false`). The Go SDK provides
 `TaskRunnerEventsListener`, so they receive lifecycle events from the worker
 infrastructure:
 
-- `onTaskPoll` / `onTaskPollError` / `onTaskExecutionStarted` /
-  `onTaskExecutionCompleted` / `onTaskExecutionFailed` / `onTaskUpdateSuccess` /
-  `onTaskUpdateError` / `onTaskAckError` / `onTaskAckFailed` /
-  `onTaskExecutionQueueFull` / `onTaskPaused`
+- Event listener methods (from `TaskRunnerEventsListener`): `onPollStarted` /
+  `onPollCompleted` / `onPollFailure` / `onTaskExecutionStarted` /
+  `onTaskExecutionCompleted` / `onTaskExecutionFailure` /
+  `onTaskUpdateCompleted` / `onTaskUpdateFailure` / `onTaskPaused`
+- Direct recording methods (called outside the event system):
+  `recordTaskExecutionQueueFull` / `recordUncaughtException` /
+  `recordTaskAckError` / `recordTaskAckFailed` / `recordTaskPaused` /
+  `recordExternalPayloadUsed` / `recordWorkflowStartError` /
+  `recordWorkflowInputSize` / `recordApiRequestTime`
 - `Poller`, `TaskRunner`, and `EventDispatcher` emit a `taskPaused` event when a
   poll cycle is skipped because the worker is paused.
 
diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
index 81a2b824..494930b8 100644
--- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
+++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
@@ -212,8 +212,11 @@ export const wrapFetchWithRetry = (
   options?: RetryFetchOptions,
 ): typeof fetch => {
   return async (input: Input, init?: Init): Promise<Response> => {
-    const start = performance.now();
+    if (!getHttpMetricsObserver()) {
+      return retryFetch(input, init, fetchFn, options);
+    }
 
+    const start = performance.now();
     try {
       return await retryFetch(input, init, fetchFn, options);
     } catch (error) {
diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
index 03ebb5af..212cc8b8 100644
--- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
+++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
@@ -11,7 +11,7 @@ import type {
 } from "../../clients/worker/events";
 import type { MetricsCollectorInterface } from "./MetricsCollectorInterface";
 import type { MetricsCollectorConfig } from "./LegacyMetricsCollector";
-import { setHttpMetricsObserver } from "./httpObserver";
+import { getHttpMetricsObserver, setHttpMetricsObserver } from "./httpObserver";
 import {
   HistogramAccumulator,
   MultiLabelCounter,
@@ -421,7 +421,9 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface {
   }
 
   async stop(): Promise<void> {
-    setHttpMetricsObserver(undefined);
+    if (getHttpMetricsObserver() === this) {
+      setHttpMetricsObserver(undefined);
+    }
     if (this._fileTimer) {
       clearInterval(this._fileTimer);
       this._fileTimer = undefined;
diff --git a/src/sdk/worker/metrics/LegacyMetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
index 60ac3b5a..237c3c8d 100644
--- a/src/sdk/worker/metrics/LegacyMetricsCollector.ts
+++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts
@@ -9,7 +9,7 @@ import type {
   TaskUpdateFailure,
 } from "../../clients/worker/events";
 import type { MetricsCollectorInterface } from "./MetricsCollectorInterface";
-import { setHttpMetricsObserver } from "./httpObserver";
+import { getHttpMetricsObserver, setHttpMetricsObserver } from "./httpObserver";
 
 /**
  * Configuration for MetricsCollector.
@@ -127,7 +127,6 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface {
         config.fileWriteIntervalMs ?? 5000,
       );
     }
-    setHttpMetricsObserver(this);
   }
 
   private async initPromClient(): Promise<void> {
@@ -344,7 +343,9 @@ export class LegacyMetricsCollector implements MetricsCollectorInterface {
   }
 
   async stop(): Promise<void> {
-    setHttpMetricsObserver(undefined);
+    if (getHttpMetricsObserver() === this) {
+      setHttpMetricsObserver(undefined);
+    }
     if (this._fileTimer) {
       clearInterval(this._fileTimer);
       this._fileTimer = undefined;
diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts
index ab2633b9..311a69b7 100644
--- a/src/sdk/worker/metrics/httpObserver.ts
+++ b/src/sdk/worker/metrics/httpObserver.ts
@@ -2,9 +2,14 @@
  * Global metrics observer for recording API client request latency
  * and workflow-level metrics from code outside the event system.
  *
- * Both LegacyMetricsCollector and CanonicalMetricsCollector register
- * themselves here on construction; fetchWithRetry and WorkflowExecutor
- * call the observer without needing a direct reference.
+ * `createMetricsCollector()` and `CanonicalMetricsCollector` register
+ * themselves here on construction. `LegacyMetricsCollector` does NOT
+ * self-register so that direct construction preserves pre-metrics
+ * behavior; use `createMetricsCollector()` or call
+ * `setHttpMetricsObserver()` explicitly to opt in.
+ *
+ * fetchWithRetry and WorkflowExecutor call the observer without
+ * needing a direct reference.
  */
 
 export interface HttpMetricsObserver {

From e4d8e6776c533793d5d08099742939f149619217 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Mon, 18 May 2026 13:57:37 -0600
Subject: [PATCH 18/21] trying to settle on a better implementation of retry
 fetch wrapper with regards to how metrics are tracked

---
 CHANGELOG.md                                  |   2 +-
 METRICS.md                                    |  36 +++---
 .../createConductorClient.ts                  |   3 +-
 .../helpers/__tests__/fetchWithRetry.test.ts  |  79 +++++++++++-
 .../__tests__/metricsInterceptors.test.ts     | 120 +++---------------
 .../helpers/fetchWithRetry.ts                 |  72 ++++++-----
 .../helpers/metricsInterceptors.ts            |  61 +++------
 7 files changed, 170 insertions(+), 203 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b06338a3..b713ac3a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -21,7 +21,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Legacy metrics emit unchanged when constructing `LegacyMetricsCollector` directly (the pre-existing pattern). Using `createMetricsCollector()` additionally enables automatic HTTP request timing via OpenAPI interceptors for both legacy and canonical modes; no other action required for existing deployments.
 - `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working.
-- `http_api_client_request` timing is now recorded automatically via OpenAPI client request/response interceptors when a metrics collector is active (via `createMetricsCollector()` or `setHttpMetricsObserver`). Previously, `recordApiRequestTime` existed but was not wired into the HTTP pipeline -- [details](METRICS.md#implementation-notes).
+- `http_api_client_request` timing is now recorded automatically by `wrapFetchWithRetry` when a metrics collector is active (via `createMetricsCollector()` or `setHttpMetricsObserver`), covering both successful responses and network-error fallback paths. A lightweight request interceptor captures OpenAPI path templates so the canonical `uri` label uses bounded-cardinality templates in all cases. Previously, `recordApiRequestTime` existed but was not wired into the HTTP pipeline -- [details](METRICS.md#implementation-notes).
 - Added optional `durationMs` field to `TaskUpdateFailure` event, recording the duration of the last update attempt. Declared optional so existing event listener implementations are unaffected.
 
 ### Deprecated
diff --git a/METRICS.md b/METRICS.md
index d8def06a..2c3058bb 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -338,9 +338,9 @@ sum(rate(task_execute_time_seconds_count[5m])) by (taskType)
 ### Missing HTTP or Workflow Metrics
 
 - `http_api_client_request_seconds` (canonical) or
-  `conductor_worker_http_api_client_request` (legacy) is recorded by request /
-  response interceptors on the OpenAPI client, with a network-error fallback in
-  `fetchWithRetry`. Verify the collector is constructed before HTTP calls begin.
+  `conductor_worker_http_api_client_request` (legacy) is recorded by the
+  `wrapFetchWithRetry` wrapper. Verify the collector is constructed before HTTP
+  calls begin.
 - `workflow_input_size_bytes` and `workflow_start_error_total` are recorded in
   `WorkflowExecutor`. Verify the collector is active before starting workflows.
 
@@ -444,23 +444,19 @@ appropriate implementation.
 
 ### HTTP request timing via interceptors
 
-`createMetricsInterceptors()` (`src/sdk/createConductorClient/helpers/metricsInterceptors.ts`)
-returns a matched pair of request/response interceptors registered on the
-OpenAPI generated client:
-
-- **Request interceptor** stashes `performance.now()` on the `opts` object
-  only when a metrics observer is active (no mutation otherwise).
-- **Response interceptor** computes the request duration, extracts the
-  interpolated URI from `request.url`, extracts the path template from
-  `opts.url` (stripping the `/api/` prefix baked in by the OpenAPI spec), and
-  calls `observer.recordApiRequestTime(method, uri, status, durationMs, metricUri)`.
-
-Both interceptors receive the same `opts` reference (confirmed in
-`client.gen.ts` lines 89-102), so no `WeakMap` or side-channel is needed.
+A **request interceptor** (`createMetricsInterceptors()` in
+`src/sdk/createConductorClient/helpers/metricsInterceptors.ts`) stashes the
+OpenAPI path template (from `opts.url`) in a `WeakMap` keyed on the `Request`
+object. The template is stripped of the `/api/` prefix baked in by the OpenAPI
+spec so the canonical `uri` label matches the cross-SDK convention (e.g.
+`/workflow/{workflowId}`).
 
-`wrapFetchWithRetry` retains a network-error fallback that records `status: "0"`
-when `fetch` throws before any HTTP response (the response interceptor never
-runs in that case).
+`wrapFetchWithRetry` reads the template from the `WeakMap` before calling
+`retryFetch` and records HTTP metrics for both successful responses and
+network errors (status `"0"`). All timing and metric recording is centralised
+in the fetch wrapper; the interceptor's only job is to bridge the path
+template from `opts` (available to interceptors) to the fetch layer (which
+only receives the `Request` object).
 
 ### Canonical vs. legacy URI label
 
@@ -471,7 +467,7 @@ byte-for-byte.
 
 The JavaScript SDK's OpenAPI spec bakes `/api/` into every path template
 (`opts.url = "/api/workflow/{workflowId}"`). Other SDKs keep `/api` in the base
-URL and use clean paths. The response interceptor strips this prefix so the
+URL and use clean paths. The request interceptor strips this prefix so the
 canonical `uri` label matches the cross-SDK convention (e.g.
 `/workflow/{workflowId}`).
 
diff --git a/src/sdk/createConductorClient/createConductorClient.ts b/src/sdk/createConductorClient/createConductorClient.ts
index e080fd93..e6815811 100644
--- a/src/sdk/createConductorClient/createConductorClient.ts
+++ b/src/sdk/createConductorClient/createConductorClient.ts
@@ -58,9 +58,8 @@ export const createConductorClient = async (
     throwOnError: true,
   });
 
-  const { onRequest, onResponse } = createMetricsInterceptors();
+  const { onRequest } = createMetricsInterceptors();
   openApiClient.interceptors.request.use(onRequest);
-  openApiClient.interceptors.response.use(onResponse);
 
   let authResult: Awaited<ReturnType<typeof handleAuth>> | undefined;
   if (keyId && keySecret) {
diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
index bfed241e..1799cada 100644
--- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
+++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts
@@ -1,6 +1,7 @@
 import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals";
 import { retryFetch, wrapFetchWithRetry, applyTimeout } from "../fetchWithRetry";
 import * as httpObserver from "@/sdk/worker/metrics/httpObserver";
+import { requestTemplateMap } from "../metricsInterceptors";
 
 const createMockResponse = (status: number, body = ""): Response =>
   new Response(body, { status, statusText: `Status ${status}` });
@@ -660,19 +661,16 @@ describe("fetchWithRetry", () => {
   });
 
   // ─── wrapFetchWithRetry metrics recording ───────────────────────────
-  //
-  // After the interceptor migration, wrapFetchWithRetry only records
-  // metrics on network errors (status "0") — the response interceptor
-  // handles all successful/HTTP-error cases.
 
   describe("wrapFetchWithRetry metrics", () => {
     const mockRecordApiRequestTime = jest.fn<
-      (m: string, u: string, s: string, d: number) => void
+      (m: string, u: string, s: string, d: number, t?: string) => void
     >();
 
     beforeEach(() => {
       mockRecordApiRequestTime.mockClear();
       httpObserver.setHttpMetricsObserver({
+        measurePayloadSize: false,
         recordApiRequestTime: mockRecordApiRequestTime,
         recordWorkflowInputSize: jest.fn(),
         recordWorkflowStartError: jest.fn(),
@@ -683,13 +681,19 @@ describe("fetchWithRetry", () => {
       httpObserver.setHttpMetricsObserver(undefined);
     });
 
-    it("should NOT record metrics on successful response (interceptors handle it)", async () => {
+    it("should record metrics on successful response", async () => {
       mockFetch.mockResolvedValue(createMockResponse(200));
 
       const wrappedFetch = wrapFetchWithRetry(mockFetch);
       await wrappedFetch("http://test.com/api/tasks", { method: "POST" });
 
-      expect(mockRecordApiRequestTime).not.toHaveBeenCalled();
+      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+      const [method, uri, status, duration] =
+        mockRecordApiRequestTime.mock.calls[0];
+      expect(method).toBe("POST");
+      expect(uri).toBe("/api/tasks");
+      expect(status).toBe("200");
+      expect(duration).toBeGreaterThanOrEqual(0);
     });
 
     it("should record status '0' on network failure", async () => {
@@ -730,6 +734,57 @@ describe("fetchWithRetry", () => {
       expect(status).toBe("0");
     });
 
+    it("should pass template from requestTemplateMap on success", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(200));
+
+      const request = new Request("http://host/api/workflow/abc-123", {
+        method: "GET",
+      });
+      requestTemplateMap.set(request, "/workflow/{workflowId}");
+
+      const wrappedFetch = wrapFetchWithRetry(mockFetch);
+      await wrappedFetch(request);
+
+      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+      const [, , , , template] = mockRecordApiRequestTime.mock.calls[0];
+      expect(template).toBe("/workflow/{workflowId}");
+    });
+
+    it("should pass template from requestTemplateMap on network failure", async () => {
+      mockFetch.mockRejectedValue(new Error("ECONNRESET"));
+
+      const request = new Request("http://host/api/workflow/abc-123", {
+        method: "GET",
+      });
+      requestTemplateMap.set(request, "/workflow/{workflowId}");
+
+      const wrappedFetch = wrapFetchWithRetry(mockFetch, {
+        maxTransportRetries: 0,
+      });
+
+      await expect(wrappedFetch(request)).rejects.toThrow("ECONNRESET");
+
+      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+      const [, , , , template] = mockRecordApiRequestTime.mock.calls[0];
+      expect(template).toBe("/workflow/{workflowId}");
+    });
+
+    it("should pass undefined template when requestTemplateMap has no entry", async () => {
+      mockFetch.mockResolvedValue(createMockResponse(200));
+
+      const request = new Request("http://host/api/workflow/abc-123", {
+        method: "GET",
+      });
+      // Do NOT set requestTemplateMap
+
+      const wrappedFetch = wrapFetchWithRetry(mockFetch);
+      await wrappedFetch(request);
+
+      expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
+      const [, , , , template] = mockRecordApiRequestTime.mock.calls[0];
+      expect(template).toBeUndefined();
+    });
+
     it("should not break fetch when no observer is registered", async () => {
       httpObserver.setHttpMetricsObserver(undefined);
 
@@ -739,5 +794,15 @@ describe("fetchWithRetry", () => {
 
       expect(result.status).toBe(200);
     });
+
+    it("should not record metrics when no observer is registered", async () => {
+      httpObserver.setHttpMetricsObserver(undefined);
+
+      mockFetch.mockResolvedValue(createMockResponse(200));
+      const wrappedFetch = wrapFetchWithRetry(mockFetch);
+      await wrappedFetch("http://test.com");
+
+      expect(mockRecordApiRequestTime).not.toHaveBeenCalled();
+    });
   });
 });
diff --git a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts
index 77800cb3..421bb726 100644
--- a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts
+++ b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts
@@ -1,31 +1,16 @@
-import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals";
-import { createMetricsInterceptors } from "../metricsInterceptors";
-import * as httpObserver from "@/sdk/worker/metrics/httpObserver";
+import { expect, describe, it } from "@jest/globals";
+import {
+  createMetricsInterceptors,
+  requestTemplateMap,
+} from "../metricsInterceptors";
 import type { ResolvedRequestOptions } from "@/open-api/generated/client/types.gen";
 
-const mockRecordApiRequestTime = jest.fn<
-  (m: string, u: string, s: string | number, d: number, mu?: string) => void
->();
-
 const fakeOpts = (url: string): ResolvedRequestOptions =>
   ({ url } as unknown as ResolvedRequestOptions);
 
 describe("metricsInterceptors", () => {
-  beforeEach(() => {
-    mockRecordApiRequestTime.mockClear();
-    httpObserver.setHttpMetricsObserver({
-      recordApiRequestTime: mockRecordApiRequestTime,
-      recordWorkflowInputSize: jest.fn(),
-      recordWorkflowStartError: jest.fn(),
-    });
-  });
-
-  afterEach(() => {
-    httpObserver.setHttpMetricsObserver(undefined);
-  });
-
-  it("should record method, interpolated uri, status, duration, and template metricUri", () => {
-    const { onRequest, onResponse } = createMetricsInterceptors();
+  it("should stash stripped template in requestTemplateMap", () => {
+    const { onRequest } = createMetricsInterceptors();
 
     const request = new Request("http://conductor.example.com/api/workflow/abc-123", {
       method: "GET",
@@ -33,21 +18,12 @@ describe("metricsInterceptors", () => {
     const opts = fakeOpts("/api/workflow/{workflowId}");
 
     onRequest(request, opts);
-    const response = new Response(null, { status: 200 });
-    onResponse(response, request, opts);
-
-    expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1);
-    const [method, uri, status, duration, metricUri] =
-      mockRecordApiRequestTime.mock.calls[0];
-    expect(method).toBe("GET");
-    expect(uri).toBe("/api/workflow/abc-123");
-    expect(status).toBe("200");
-    expect(duration).toBeGreaterThanOrEqual(0);
-    expect(metricUri).toBe("/workflow/{workflowId}");
+
+    expect(requestTemplateMap.get(request)).toBe("/workflow/{workflowId}");
   });
 
   it("should strip /api prefix from path template", () => {
-    const { onRequest, onResponse } = createMetricsInterceptors();
+    const { onRequest } = createMetricsInterceptors();
 
     const request = new Request("http://host/api/tasks/poll/myTask", {
       method: "POST",
@@ -55,93 +31,39 @@ describe("metricsInterceptors", () => {
     const opts = fakeOpts("/api/tasks/poll/{taskType}");
 
     onRequest(request, opts);
-    onResponse(new Response(null, { status: 200 }), request, opts);
 
-    const metricUri = mockRecordApiRequestTime.mock.calls[0][4];
-    expect(metricUri).toBe("/tasks/poll/{taskType}");
+    expect(requestTemplateMap.get(request)).toBe("/tasks/poll/{taskType}");
   });
 
   it("should not strip prefix if path does not start with /api/", () => {
-    const { onRequest, onResponse } = createMetricsInterceptors();
+    const { onRequest } = createMetricsInterceptors();
 
     const request = new Request("http://host/health", { method: "GET" });
     const opts = fakeOpts("/health");
 
     onRequest(request, opts);
-    onResponse(new Response(null, { status: 200 }), request, opts);
-
-    const metricUri = mockRecordApiRequestTime.mock.calls[0][4];
-    expect(metricUri).toBe("/health");
-  });
-
-  it("should compute duration from request interceptor start time", () => {
-    const { onRequest, onResponse } = createMetricsInterceptors();
-
-    const request = new Request("http://host/api/workflow", { method: "GET" });
-    const opts = fakeOpts("/api/workflow");
-
-    onRequest(request, opts);
-    // Simulate a small delay so duration > 0
-    const busyWaitUntil = performance.now() + 2;
-    while (performance.now() < busyWaitUntil) { /* spin */ }
-    onResponse(new Response(null, { status: 200 }), request, opts);
-
-    const duration = mockRecordApiRequestTime.mock.calls[0][3] as number;
-    expect(duration).toBeGreaterThan(0);
-  });
-
-  it("should gracefully handle missing start time (duration defaults to 0)", () => {
-    const { onResponse } = createMetricsInterceptors();
-
-    const request = new Request("http://host/api/workflow", { method: "GET" });
-    const opts = fakeOpts("/api/workflow");
-
-    onResponse(new Response(null, { status: 200 }), request, opts);
 
-    const duration = mockRecordApiRequestTime.mock.calls[0][3] as number;
-    expect(duration).toBe(0);
+    expect(requestTemplateMap.get(request)).toBe("/health");
   });
 
-  it("should do nothing when no observer is registered", () => {
-    httpObserver.setHttpMetricsObserver(undefined);
-
-    const { onRequest, onResponse } = createMetricsInterceptors();
+  it("should return the request unchanged", () => {
+    const { onRequest } = createMetricsInterceptors();
 
     const request = new Request("http://host/api/workflow", { method: "GET" });
     const opts = fakeOpts("/api/workflow");
 
-    onRequest(request, opts);
-    const response = new Response(null, { status: 200 });
-    const result = onResponse(response, request, opts);
-
-    expect(result).toBe(response);
-    expect(mockRecordApiRequestTime).not.toHaveBeenCalled();
+    const returned = onRequest(request, opts);
+    expect(returned).toBe(request);
   });
 
-  it("should return request and response unchanged", () => {
-    const { onRequest, onResponse } = createMetricsInterceptors();
+  it("should not stash when opts.url is not a string", () => {
+    const { onRequest } = createMetricsInterceptors();
 
     const request = new Request("http://host/api/workflow", { method: "GET" });
-    const opts = fakeOpts("/api/workflow");
-
-    const returnedRequest = onRequest(request, opts);
-    expect(returnedRequest).toBe(request);
-
-    const response = new Response("body", { status: 201 });
-    const returnedResponse = onResponse(response, request, opts);
-    expect(returnedResponse).toBe(response);
-  });
-
-  it("should record correct status for error responses", () => {
-    const { onRequest, onResponse } = createMetricsInterceptors();
-
-    const request = new Request("http://host/api/workflow", { method: "POST" });
-    const opts = fakeOpts("/api/workflow");
+    const opts = {} as unknown as ResolvedRequestOptions;
 
     onRequest(request, opts);
-    onResponse(new Response(null, { status: 500 }), request, opts);
 
-    const status = mockRecordApiRequestTime.mock.calls[0][2];
-    expect(status).toBe("500");
+    expect(requestTemplateMap.has(request)).toBe(false);
   });
 });
diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
index 494930b8..d56776d3 100644
--- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
+++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts
@@ -1,4 +1,5 @@
 import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver";
+import { requestTemplateMap } from "./metricsInterceptors";
 
 type Input = Parameters<typeof fetch>[0];
 type Init = Parameters<typeof fetch>[1];
@@ -207,46 +208,53 @@ export const retryFetch = async (
   throw lastError ?? new Error("Fetch retry exhausted");
 };
 
+const extractRequestInfo = (input: Input, init?: Init) => {
+  const method = input instanceof Request
+    ? input.method
+    : (init?.method ?? "GET");
+  let uri = "";
+  try {
+    uri = new URL(
+      input instanceof Request ? input.url : String(input),
+    ).pathname;
+  } catch {
+    uri = String(input);
+  }
+  return { method, uri };
+};
+
 export const wrapFetchWithRetry = (
   fetchFn: typeof fetch,
   options?: RetryFetchOptions,
 ): typeof fetch => {
-  return async (input: Input, init?: Init): Promise<Response> => {
-    if (!getHttpMetricsObserver()) {
+  return (input: Input, init?: Init): Promise<Response> => {
+    const observer = getHttpMetricsObserver();
+    if (!observer) {
       return retryFetch(input, init, fetchFn, options);
     }
 
     const start = performance.now();
-    try {
-      return await retryFetch(input, init, fetchFn, options);
-    } catch (error) {
-      // Network-error fallback: the response interceptor never runs when
-      // fetch throws, so record a status="0" metric here as a safety net.
-      let method = "GET";
-      let uri = "";
-      try {
-        if (input instanceof Request) {
-          method = input.method;
-          uri = new URL(input.url).pathname;
-        } else if (typeof input === "string") {
-          method = init?.method ?? "GET";
-          try { uri = new URL(input).pathname; } catch { uri = input; }
-        } else {
-          method = init?.method ?? "GET";
-          try { uri = input.pathname; } catch { uri = String(input); }
-        }
-      } catch {
-        // Best-effort URI extraction
-      }
+    const template = input instanceof Request
+      ? requestTemplateMap.get(input)
+      : undefined;
 
-      const durationMs = performance.now() - start;
-      getHttpMetricsObserver()?.recordApiRequestTime(
-        method,
-        uri,
-        "0",
-        durationMs,
-      );
-      throw error;
-    }
+    return retryFetch(input, init, fetchFn, options).then(
+      (response) => {
+        const durationMs = performance.now() - start;
+        const { method, uri } = extractRequestInfo(input, init);
+        observer.recordApiRequestTime(
+          method, uri, String(response.status), durationMs, template,
+        );
+        return response;
+      },
+      (error) => {
+        const durationMs = performance.now() - start;
+        const { method, uri } = extractRequestInfo(input, init);
+        observer.recordApiRequestTime(
+          method, uri, "0", durationMs, template,
+        );
+        throw error;
+      },
+    );
   };
 };
diff --git a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
index 1e9a2d81..c667ac31 100644
--- a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
+++ b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
@@ -1,5 +1,4 @@
 import type { ResolvedRequestOptions } from "../../../open-api/generated/client/types.gen";
-import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver";
 
 /**
  * Strips the /api prefix that the OpenAPI spec bakes into every path.
@@ -12,58 +11,36 @@ import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver";
 const stripApiPrefix = (url: string): string =>
   url.startsWith("/api/") ? url.slice(4) : url;
 
-type OptsWithMetrics = ResolvedRequestOptions & { _metricsStart?: number };
-
 /**
- * Creates a matched pair of request/response interceptors that record
- * http_api_client_request_seconds via the global HttpMetricsObserver.
+ * Maps each Request to its OpenAPI path template. The request interceptor
+ * stashes the template here so the fetch wrapper (which only receives the
+ * Request object) can pass bounded-cardinality URI labels to the metrics
+ * observer for both success and error paths.
  *
- * Both interceptors receive the same `opts` object for a given request
- * (see client.gen.ts lines 89-102), so start time is stashed directly
- * on opts — no WeakMap or side-channel needed.
+ * Entries are garbage-collected with the Request.
+ */
+export const requestTemplateMap = new WeakMap<Request, string>();
+
+/**
+ * Creates a request interceptor that captures the OpenAPI path template
+ * for each request. The template is stored in {@link requestTemplateMap}
+ * and read by `wrapFetchWithRetry` when recording HTTP metrics.
  *
- * The response interceptor passes both the interpolated URI (for legacy
- * metrics) and the bounded-cardinality path template (for canonical
- * metrics) to the observer, letting each collector choose which to use.
+ * Timing and metric recording are handled entirely in the fetch wrapper;
+ * the interceptor's only job is to bridge the path template from `opts`
+ * (available to interceptors) to the fetch layer (which only sees the
+ * Request object).
  */
 export function createMetricsInterceptors() {
   const onRequest = (
     request: Request,
     opts: ResolvedRequestOptions,
   ): Request => {
-    if (getHttpMetricsObserver()) {
-      (opts as OptsWithMetrics)._metricsStart = performance.now();
+    if (typeof opts.url === "string") {
+      requestTemplateMap.set(request, stripApiPrefix(opts.url));
     }
     return request;
   };
 
-  const onResponse = (
-    response: Response,
-    request: Request,
-    opts: ResolvedRequestOptions,
-  ): Response => {
-    const observer = getHttpMetricsObserver();
-    if (!observer) return response;
-
-    const start = (opts as OptsWithMetrics)._metricsStart;
-    const durationMs = start != null ? performance.now() - start : 0;
-
-    const method = request.method;
-    const status = String(response.status);
-
-    let uri = "";
-    try {
-      uri = new URL(request.url).pathname;
-    } catch {
-      uri = String(request.url);
-    }
-
-    const template = typeof opts.url === "string" ? stripApiPrefix(opts.url) : uri;
-
-    observer.recordApiRequestTime(method, uri, status, durationMs, template);
-
-    return response;
-  };
-
-  return { onRequest, onResponse };
+  return { onRequest };
 }

From a20cc3f0c2e8fba1c1026c0dafb468cf92855d4d Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Mon, 18 May 2026 15:02:53 -0600
Subject: [PATCH 19/21] fix for label defect in canonical metrics

---
 METRICS.md                                    | 10 ++++---
 harness/workflowGovernor.ts                   |  1 +
 src/sdk/clients/worker/TaskRunner.ts          |  2 +-
 .../__tests__/metricsInterceptors.test.ts     | 26 +++++++++++++++++++
 .../helpers/metricsInterceptors.ts            | 20 +++++++++++++-
 5 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/METRICS.md b/METRICS.md
index 2c3058bb..813753fb 100644
--- a/METRICS.md
+++ b/METRICS.md
@@ -391,10 +391,12 @@ process.on("unhandledRejection", (reason) => {
 
 ### One Collector Per Process
 
-Only one metrics collector can be active at a time. Both
-`LegacyMetricsCollector` and `CanonicalMetricsCollector` register themselves
-as the global HTTP metrics observer on construction. Creating a second
-collector replaces the first. Calling `stop()` clears the global observer.
+Only one metrics collector can be active at a time.
+`CanonicalMetricsCollector` registers itself as the global HTTP metrics
+observer on construction. `LegacyMetricsCollector` does not self-register;
+use `createMetricsCollector()` or call `setHttpMetricsObserver()` explicitly
+to enable HTTP metrics in legacy mode. Creating a second collector replaces
+the first. Calling `stop()` clears the global observer.
 
 ### Payload Size Measurement
 
diff --git a/harness/workflowGovernor.ts b/harness/workflowGovernor.ts
index 4ccd0d33..722e61f9 100644
--- a/harness/workflowGovernor.ts
+++ b/harness/workflowGovernor.ts
@@ -49,6 +49,7 @@ export class WorkflowGovernor {
         this.workflowExecutor.startWorkflow({
           name: this.workflowName,
           version: 1,
+          input: { startedAt: Date.now() },
         }),
       );
     }
diff --git a/src/sdk/clients/worker/TaskRunner.ts b/src/sdk/clients/worker/TaskRunner.ts
index 5be29c83..26ffab4e 100644
--- a/src/sdk/clients/worker/TaskRunner.ts
+++ b/src/sdk/clients/worker/TaskRunner.ts
@@ -509,7 +509,7 @@ export class TaskRunner {
 
       // Calculate output size if possible
       const outputSizeBytes = merged.outputData
-        ? JSON.stringify(merged.outputData).length
+        ? Buffer.byteLength(JSON.stringify(merged.outputData), "utf8")
         : undefined;
 
       // Untrack immediately — execution done, update will follow
diff --git a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts
index 421bb726..24bfcdd8 100644
--- a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts
+++ b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts
@@ -56,6 +56,32 @@ describe("metricsInterceptors", () => {
     expect(returned).toBe(request);
   });
 
+  it("should normalize lowercase {tasktype} to {taskType}", () => {
+    const { onRequest } = createMetricsInterceptors();
+
+    const request = new Request("http://host/api/tasks/poll/batch/my_task", {
+      method: "GET",
+    });
+    const opts = fakeOpts("/api/tasks/poll/batch/{tasktype}");
+
+    onRequest(request, opts);
+
+    expect(requestTemplateMap.get(request)).toBe("/tasks/poll/batch/{taskType}");
+  });
+
+  it("should leave already-camelCase placeholders unchanged", () => {
+    const { onRequest } = createMetricsInterceptors();
+
+    const request = new Request("http://host/api/workflow/abc-123", {
+      method: "GET",
+    });
+    const opts = fakeOpts("/api/workflow/{workflowId}");
+
+    onRequest(request, opts);
+
+    expect(requestTemplateMap.get(request)).toBe("/workflow/{workflowId}");
+  });
+
   it("should not stash when opts.url is not a string", () => {
     const { onRequest } = createMetricsInterceptors();
 
diff --git a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
index c667ac31..146bacdf 100644
--- a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
+++ b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts
@@ -11,6 +11,21 @@ import type { ResolvedRequestOptions } from "../../../open-api/generated/client/
 const stripApiPrefix = (url: string): string =>
   url.startsWith("/api/") ? url.slice(4) : url;
 
+/**
+ * The OpenAPI spec uses all-lowercase path parameter names in a few
+ * templates (e.g. `{tasktype}` instead of `{taskType}`). This map
+ * normalizes them to camelCase so the canonical `uri` metric label
+ * is consistent across all endpoints.
+ */
+const CANONICAL_PARAM_NAMES: Record<string, string> = {
+  tasktype: "taskType",
+};
+
+const normalizePlaceholders = (url: string): string =>
+  url.replace(/\{(\w+)\}/g, (_m, name: string) =>
+    `{${CANONICAL_PARAM_NAMES[name] ?? name}}`,
+  );
+
 /**
  * Maps each Request to its OpenAPI path template. The request interceptor
  * stashes the template here so the fetch wrapper (which only receives the
@@ -37,7 +52,10 @@ export function createMetricsInterceptors() {
     opts: ResolvedRequestOptions,
   ): Request => {
     if (typeof opts.url === "string") {
-      requestTemplateMap.set(request, stripApiPrefix(opts.url));
+      requestTemplateMap.set(
+        request,
+        normalizePlaceholders(stripApiPrefix(opts.url)),
+      );
     }
     return request;
   };

From f42bec9f8ee1243f631f9e0dcca2633779c58a4b Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Tue, 19 May 2026 09:17:56 -0600
Subject: [PATCH 20/21] give 1 flaky integration test more time and log it's
 progress

---
 src/integration-tests/E2EFiveTaskWorkflow.test.ts | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/integration-tests/E2EFiveTaskWorkflow.test.ts b/src/integration-tests/E2EFiveTaskWorkflow.test.ts
index 66aed967..1d6fc9af 100644
--- a/src/integration-tests/E2EFiveTaskWorkflow.test.ts
+++ b/src/integration-tests/E2EFiveTaskWorkflow.test.ts
@@ -120,10 +120,17 @@ describeForOrkesV5("E2E: 5-task workflow × 50 executions", () => {
 
       expect(workflowIds.length).toBe(WORKFLOW_COUNT);
 
-      // Wait for all 50 to complete (600s in CI-friendly timeout, poll every 2s)
+      // Wait for all 50 to complete (600s per workflow, poll every 2s)
+      let completed = 0;
       const results = await Promise.all(
         workflowIds.map((id) =>
-          waitForWorkflowStatus(executor, id, "COMPLETED", 600_000, 2000)
+          waitForWorkflowStatus(executor, id, "COMPLETED", 600_000, 2000).then(
+            (wf) => {
+              completed++;
+              console.log(`[E2E] workflow ${completed}/${WORKFLOW_COUNT} completed (${id})`);
+              return wf;
+            }
+          )
         )
       );
 
@@ -168,7 +175,6 @@ describeForOrkesV5("E2E: 5-task workflow × 50 executions", () => {
         workflows: [{ name: workflowName, version: 1 }],
         tasks: taskNames,
       });
-    },
-    330_000 // 5.5 minute timeout for the entire test
+    }
   );
 });

From 01ca97fa7ef8aeed27fe8f6d03d4ef581950d086 Mon Sep 17 00:00:00 2001
From: Chris Hagglund <chris.hagglund@orkes.io>
Date: Tue, 19 May 2026 12:41:21 -0600
Subject: [PATCH 21/21] test out pre-existing prom-client renderer

---
 harness/main.ts                                     | 6 ++++--
 src/sdk/worker/metrics/CanonicalMetricsCollector.ts | 4 +---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/harness/main.ts b/harness/main.ts
index 182b8d50..6ed6be8c 100644
--- a/harness/main.ts
+++ b/harness/main.ts
@@ -94,8 +94,10 @@ async function main(): Promise<void> {
   });
 
   const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991);
-  const metricsCollector = createMetricsCollector({ httpPort: metricsPort });
-  console.log(`Prometheus metrics server started on port ${metricsPort} (${metricsCollector.collectorName()} metrics)`);
+  const usePromClient = process.env.HARNESS_USE_PROM_CLIENT === "true";
+  const metricsCollector = createMetricsCollector({ httpPort: metricsPort, usePromClient });
+  const backend = usePromClient ? "prom-client" : "built-in";
+  console.log(`Prometheus metrics server started on port ${metricsPort} (${metricsCollector.collectorName()} metrics, ${backend} backend)`);
 
   const handler = new TaskHandler({
     client,
diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
index 212cc8b8..c5df73c0 100644
--- a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
+++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts
@@ -64,14 +64,12 @@ export class CanonicalMetricsCollector implements MetricsCollectorInterface {
   private _server?: import("./MetricsServer.js").MetricsServer;
   private _fileTimer?: ReturnType<typeof setInterval>;
   private _promRegistry?: import("./CanonicalPrometheusRegistry.js").CanonicalPrometheusRegistry;
-  private readonly _usePromClient: boolean;
   readonly measurePayloadSize: boolean;
 
   constructor(config?: MetricsCollectorConfig) {
     this.state = this.createEmpty();
-    this._usePromClient = config?.usePromClient ?? false;
     this.measurePayloadSize = config?.measurePayloadSize ?? true;
-    if (this._usePromClient) {
+    if (config?.usePromClient) {
       void this.initPromClient();
     }
     if (config?.httpPort) {