diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 56599d35..92c3eda7 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -79,8 +79,9 @@ jobs: name: codecov-unit-node-${{ matrix.node-version }} fail_ci_if_error: false - # Integration tests (v5): one job at a time (max-parallel: 1) to avoid 502/503 on shared Conductor. - # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests — keeps per-job under timeout. + # Integration tests (v5): lower max-parallel reduces 502/503 from the shared Conductor server + # but makes CI slower without eliminating flakes entirely — feel free to experiment. + # Sharding (--shard i/N) splits the suite so each job runs ~1/N of tests. integration-tests: runs-on: ubuntu-latest timeout-minutes: 25 @@ -118,6 +119,7 @@ jobs: CONDUCTOR_AUTH_KEY: ${{ secrets.AUTH_KEY }} CONDUCTOR_AUTH_SECRET: ${{ secrets.AUTH_SECRET }} CONDUCTOR_REQUEST_TIMEOUT_MS: "120000" + CONDUCTOR_RETRY_SERVER_ERRORS: "true" JEST_JUNIT_OUTPUT_NAME: integration-v5-node-${{ matrix.node-version }}-shard-${{ matrix.shard }}-test-results.xml - name: Publish Test Results uses: dorny/test-reporter@v2 @@ -175,6 +177,7 @@ jobs: CONDUCTOR_AUTH_KEY: ${{ secrets.AUTH_KEY_V4 }} CONDUCTOR_AUTH_SECRET: ${{ secrets.AUTH_SECRET_V4 }} CONDUCTOR_REQUEST_TIMEOUT_MS: "120000" + CONDUCTOR_RETRY_SERVER_ERRORS: "true" JEST_JUNIT_OUTPUT_NAME: integration-v4-node-${{ matrix.node-version }}-shard-${{ matrix.shard }}-test-results.xml - name: Publish Test Results uses: dorny/test-reporter@v2 diff --git a/AGENTS.md b/AGENTS.md index 87f2aa09..afb1584e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -32,7 +32,7 @@ src/sdk/ # Main SDK source decorators/worker.ts # @worker decorator + dual-mode support decorators/registry.ts # Global registry (register/get/clear) context/TaskContext.ts # AsyncLocalStorage per-task context - metrics/ # MetricsCollector, MetricsServer, PrometheusRegistry + metrics/ # LegacyMetricsCollector, CanonicalMetricsCollector, metricsFactory, MetricsServer, PrometheusRegistry, CanonicalPrometheusRegistry, accumulators, httpObserver schema/ # jsonSchema, schemaField decorators generators/ # Legacy generators (pre-v3, still exported for compat) src/open-api/ # OpenAPI layer @@ -211,10 +211,10 @@ public async someMethod(args): Promise { ### Metrics Documentation (METRICS.md) -When adding, removing, or renaming metrics in `src/sdk/worker/metrics/MetricsCollector.ts`: -1. Update `METRICS.md` to reflect the change (name, type, labels, description) -2. Ensure both `MetricsCollector.toPrometheusText()` and `PrometheusRegistry.createMetrics()` are updated in sync — missing a summary/counter in either causes silent data loss -3. Update the metric count in the METRICS.md overview section +When adding, removing, or renaming metrics in `src/sdk/worker/metrics/`: +1. Update both `LegacyMetricsCollector.ts` and `CanonicalMetricsCollector.ts` (or add a no-op stub in the collector that does not emit the metric) +2. Ensure `toPrometheusText()` and the corresponding `PrometheusRegistry` / `CanonicalPrometheusRegistry` are updated in sync — missing a metric in either causes silent data loss +3. Update `METRICS.md` to reflect the change in both the legacy and canonical catalog tables 4. Add or update the corresponding direct recording method documentation if applicable ### SDK_NEW_LANGUAGE_GUIDE.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..b713ac3a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,29 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- **Canonical metrics** -- opt-in harmonized metric surface via `WORKER_CANONICAL_METRICS=true`. See [METRICS.md](METRICS.md) for the full catalog, configuration, and migration guide. +- Bounded `uri` label on `http_api_client_request_seconds`: canonical mode uses path templates (e.g. `/workflow/{workflowId}`) instead of fully-resolved paths, preventing metric cardinality explosion from dynamic IDs. +- `TaskPaused` event type and `PollerOptions.onPaused` callback: emitted when a poll cycle is skipped because the worker is paused. Canonical mode records `task_paused_total`; legacy mode does not (see Implementation Notes in METRICS.md). +- `measurePayloadSize` option in `MetricsCollectorConfig`: controls whether `workflow_input_size_bytes` is recorded via `JSON.stringify` on each `startWorkflow` call. Defaults to `true` for canonical, `false` for legacy. +- `retryServerErrors` option in `OrkesApiConfig` / `RetryFetchOptions` and `CONDUCTOR_RETRY_SERVER_ERRORS` env var: opt-in retry of HTTP 502/503/504 for idempotent methods (GET, HEAD, OPTIONS, PUT, DELETE). Default `false`; set to `true` to enable. +- `WorkflowStatusProbe` in harness: opt-in probe (via `HARNESS_PROBE_RATE_PER_SEC`) that exercises UUID-bearing endpoints to validate template URI metrics. +- `WORKER_LEGACY_METRICS` is reserved for future use. Once canonical metrics become the default, setting `WORKER_LEGACY_METRICS=true` will re-activate the legacy surface. It is not read by the current implementation. + +### Changed + +- Legacy metrics emit unchanged when constructing `LegacyMetricsCollector` directly (the pre-existing pattern). Using `createMetricsCollector()` additionally enables automatic HTTP request timing via OpenAPI interceptors for both legacy and canonical modes; no other action required for existing deployments. +- `MetricsCollector.ts` renamed to `LegacyMetricsCollector.ts`; the public symbol is preserved via re-export so existing imports keep working. +- `http_api_client_request` timing is now recorded automatically by `wrapFetchWithRetry` when a metrics collector is active (via `createMetricsCollector()` or `setHttpMetricsObserver`), covering both successful responses and network-error fallback paths. A lightweight request interceptor captures OpenAPI path templates so the canonical `uri` label uses bounded-cardinality templates in all cases. Previously, `recordApiRequestTime` existed but was not wired into the HTTP pipeline -- [details](METRICS.md#implementation-notes). +- Added optional `durationMs` field to `TaskUpdateFailure` event, recording the duration of the last update attempt. Declared optional so existing event listener implementations are unaffected. + +### Deprecated + +- Legacy metric names remain the default during the transition period. Migration guidance is in [METRICS.md](METRICS.md#migrating-from-legacy-to-canonical). diff --git a/METRICS.md b/METRICS.md index d8770ed8..813753fb 100644 --- a/METRICS.md +++ b/METRICS.md @@ -1,259 +1,512 @@ -# Metrics Reference +# JavaScript SDK Metrics -The Conductor JavaScript SDK provides built-in Prometheus metrics for monitoring worker performance, API latency, and task execution. +The Conductor JavaScript SDK can expose Prometheus metrics for worker polling, +task execution, task updates, workflow starts, external payload usage, and +API-client HTTP calls. -## Overview +The SDK currently has two mutually exclusive metric surfaces: -`MetricsCollector` implements `TaskRunnerEventsListener` and records **18 metric types** (12 counters + 6 summaries). Metrics are exposed in [Prometheus exposition format](https://prometheus.io/docs/instrumenting/exposition_formats/). +- **Legacy metrics** are the default. They preserve the original JavaScript SDK + names and shapes, including a `conductor_worker_` prefix, `task_type` labels, + millisecond time units, and Summary type for distributions. +- **Canonical metrics** are opt-in with `WORKER_CANONICAL_METRICS=true`. They + use the cross-SDK canonical names, labels, units, and Prometheus histogram + shapes. -- **Default prefix:** `conductor_worker` -- **Quantiles:** p50, p75, p90, p95, p99 (computed from a sliding window) -- **Sliding window:** Last 1,000 observations (configurable) +Only one collector is active at a time. The SDK does not emit legacy and +canonical metrics at the same time. -## Quick Start +Metrics are created lazily. A metric appears in `/metrics` only after the +corresponding worker event or collector method records it. Some low-level +surface metrics, such as ack, queue-full, paused, and uncaught-exception +counters, may not appear in normal worker runs unless that path is exercised. -### HTTP Server +## Usage -```typescript -import { MetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; +Create a metrics collector, start a scrape server, and wire the collector into +`TaskHandler` as an event listener: -const metrics = new MetricsCollector({ httpPort: 9090 }); +```typescript +import { + createMetricsCollector, + MetricsServer, + TaskHandler, +} from "@io-orkes/conductor-javascript"; + +const metrics = createMetricsCollector(); +const server = new MetricsServer(metrics, 9090); +await server.start(); const handler = new TaskHandler({ client, - scanForDecorated: true, eventListeners: [metrics], + scanForDecorated: true, }); await handler.startWorkers(); // GET http://localhost:9090/metrics — Prometheus text format -// GET http://localhost:9090/health — { "status": "UP" } +// GET http://localhost:9090/health — {"status":"UP"} +``` + +`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` and returns either +a `LegacyMetricsCollector` or a `CanonicalMetricsCollector`. Both implement +`MetricsCollectorInterface`, so call sites never need to know which variant is +active. + +You can also construct a collector directly if you need to pass configuration: + +```typescript +import { LegacyMetricsCollector } from "@io-orkes/conductor-javascript"; + +const metrics = new LegacyMetricsCollector({ + httpPort: 9090, + filePath: "/tmp/conductor_metrics.prom", + fileWriteIntervalMs: 10000, + usePromClient: true, +}); ``` ### File Output ```typescript -const metrics = new MetricsCollector({ +const metrics = createMetricsCollector({ filePath: "/tmp/conductor_metrics.prom", - fileWriteIntervalMs: 10000, // write every 10s + fileWriteIntervalMs: 10000, }); ``` -The file writer performs an immediate first write, then writes periodically at the configured interval. The timer is unreferenced so it does not prevent Node.js process exit. +The file writer performs an immediate first write, then writes periodically at +the configured interval. The timer is unreferenced so it does not prevent +Node.js process exit. ### prom-client Integration ```typescript -const metrics = new MetricsCollector({ usePromClient: true }); +const metrics = createMetricsCollector({ usePromClient: true }); // Metrics are registered in prom-client's default registry. // Use prom-client's register.metrics() for native scraping. ``` -Requires `npm install prom-client`. Falls back to built-in text format if not installed. +Requires `npm install prom-client`. Falls back to built-in text format if +prom-client is not installed. -### All-in-One +## Selecting Canonical Metrics -```typescript -const metrics = new MetricsCollector({ - prefix: "myapp_worker", - httpPort: 9090, - filePath: "/tmp/metrics.prom", - fileWriteIntervalMs: 10000, - slidingWindowSize: 500, - usePromClient: true, -}); +Set `WORKER_CANONICAL_METRICS` before the worker starts: + +```shell +WORKER_CANONICAL_METRICS=true node my_worker.js ``` +Accepted true values are `true`, `1`, and `yes`, case-insensitive. Any other +value, or an unset variable, selects legacy metrics. The variable is read when +the metrics collector is created, so changing it requires a worker restart. + ## Configuration | Option | Type | Default | Description | -|--------|------|---------|-------------| -| `prefix` | `string` | `"conductor_worker"` | Prometheus metric name prefix | -| `httpPort` | `number` | — | Start built-in HTTP server on this port | -| `filePath` | `string` | — | Periodically write metrics to this file path | -| `fileWriteIntervalMs` | `number` | `5000` | File write interval in milliseconds | -| `slidingWindowSize` | `number` | `1000` | Max observations kept for quantile calculation | -| `usePromClient` | `boolean` | `false` | Use `prom-client` for native Prometheus integration | - ---- - -## Counter Metrics - -### Labeled by `task_type` - -| Prometheus Name | Internal Key | Description | -|----------------|-------------|-------------| -| `{prefix}_task_poll_total` | `pollTotal` | Total number of task polls initiated | -| `{prefix}_task_poll_error_total` | `pollErrorTotal` | Total number of failed task polls | -| `{prefix}_task_execute_total` | `taskExecutionTotal` | Total number of task executions completed | -| `{prefix}_task_execute_error_total` | `taskExecutionErrorTotal` | Total task execution errors. Label format: `taskType:ExceptionName` | -| `{prefix}_task_update_error_total` | `taskUpdateFailureTotal` | Total task result update failures (result lost from Conductor) | -| `{prefix}_task_ack_error_total` | `taskAckErrorTotal` | Total task acknowledgement errors | -| `{prefix}_task_execution_queue_full_total` | `taskExecutionQueueFullTotal` | Times the execution queue was full (concurrency limit reached) | -| `{prefix}_task_paused_total` | `taskPausedTotal` | Total task paused events | - -### Labeled by `payload_type` - -| Prometheus Name | Internal Key | Description | -|----------------|-------------|-------------| -| `{prefix}_external_payload_used_total` | `externalPayloadUsedTotal` | External payload storage usage (e.g., `"workflow_input"`, `"task_output"`) | - -### Global (no labels) - -| Prometheus Name | Internal Key | Description | -|----------------|-------------|-------------| -| `{prefix}_thread_uncaught_exceptions_total` | `uncaughtExceptionTotal` | Total uncaught exceptions in worker processes | -| `{prefix}_worker_restart_total` | `workerRestartTotal` | Total worker restart events | -| `{prefix}_workflow_start_error_total` | `workflowStartErrorTotal` | Total workflow start errors | - ---- - -## Summary Metrics - -Each summary emits quantile values, a count, and a sum: - -``` -{name}{task_type="myTask",quantile="0.5"} 12.3 -{name}{task_type="myTask",quantile="0.75"} 15.1 -{name}{task_type="myTask",quantile="0.9"} 18.7 -{name}{task_type="myTask",quantile="0.95"} 22.0 -{name}{task_type="myTask",quantile="0.99"} 45.2 -{name}_count{task_type="myTask"} 1000 -{name}_sum{task_type="myTask"} 14523.7 +|---|---|---|---| +| `prefix` | `string` | `"conductor_worker"` | Prometheus metric name prefix. Legacy only; canonical metrics are unprefixed. | +| `httpPort` | `number` | — | Start built-in HTTP server on this port. | +| `filePath` | `string` | — | Periodically write metrics to this file path. | +| `fileWriteIntervalMs` | `number` | `5000` | File write interval in milliseconds. | +| `slidingWindowSize` | `number` | `1000` | Max observations for quantile calculation. Legacy only; canonical uses histogram buckets. | +| `usePromClient` | `boolean` | `false` | Use `prom-client` for native Prometheus integration. | + +## Canonical Metrics + +Canonical timing values are seconds. Canonical size values are bytes. Label +names use camelCase. + +### Canonical Counters + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_total` | `taskType` | Incremented each time the worker issues a poll request. | +| `task_execution_started_total` | `taskType` | Incremented when a polled task is dispatched to the worker function. | +| `task_poll_error_total` | `taskType`, `exception` | Incremented when a poll request fails client-side. | +| `task_execute_error_total` | `taskType`, `exception` | Incremented when the worker function throws. | +| `task_update_error_total` | `taskType`, `exception` | Incremented when updating the task result fails. | +| `task_ack_error_total` | `taskType`, `exception` | Collector surface for task ack errors. The internal runner uses batch poll responses as ack and may not emit this during normal polling. | +| `task_ack_failed_total` | `taskType` | Collector surface for failed task ack responses. The internal runner uses batch poll responses as ack and may not emit this during normal polling. | +| `task_execution_queue_full_total` | `taskType` | Incremented when the worker execution queue is saturated. | +| `task_paused_total` | `taskType` | Incremented when a worker is paused and skips acting on a poll. | +| `thread_uncaught_exceptions_total` | `exception` | Incremented on uncaught exceptions in the worker process. | +| `external_payload_used_total` | `entityName`, `operation`, `payloadType` | Incremented when external payload storage is used for task or workflow payloads. | +| `workflow_start_error_total` | `workflowType`, `exception` | Incremented when starting a workflow fails client-side. | + +### Canonical Time Histograms + +All canonical time histograms use buckets: +`0.001`, `0.005`, `0.01`, `0.025`, `0.05`, `0.1`, `0.25`, `0.5`, `1`, `2.5`, +`5`, `10`. + +| Metric | Labels | Description | +|---|---|---| +| `task_poll_time_seconds` | `taskType`, `status` | Poll request latency. `status` is `SUCCESS` or `FAILURE`. | +| `task_execute_time_seconds` | `taskType`, `status` | Worker function execution duration. `status` is `SUCCESS` or `FAILURE`. | +| `task_update_time_seconds` | `taskType`, `status` | Task-result update latency. `status` is `SUCCESS` or `FAILURE`. | +| `http_api_client_request_seconds` | `method`, `uri`, `status` | API-client HTTP request latency. `status` is the HTTP status code as a string, or `"0"` on network failure. | + +Each histogram exposes Prometheus series such as: + +```prometheus +task_execute_time_seconds_bucket{taskType="my_task",status="SUCCESS",le="0.1"} 42 +task_execute_time_seconds_count{taskType="my_task",status="SUCCESS"} 50 +task_execute_time_seconds_sum{taskType="my_task",status="SUCCESS"} 2.3 ``` -### Labeled by `task_type` +### Canonical Size Histograms -| Prometheus Name | Internal Key | Unit | Description | -|----------------|-------------|------|-------------| -| `{prefix}_task_poll_time` | `pollDurationMs` | ms | Task poll round-trip duration | -| `{prefix}_task_execute_time` | `executionDurationMs` | ms | Worker function execution duration | -| `{prefix}_task_update_time` | `updateDurationMs` | ms | Task result update (SDK to server) duration | -| `{prefix}_task_result_size_bytes` | `outputSizeBytes` | bytes | Task result output payload size | +All canonical size histograms use buckets: +`100`, `1000`, `10000`, `100000`, `1000000`, `10000000`. -### Labeled by `workflow_type` +| Metric | Labels | Description | +|---|---|---| +| `task_result_size_bytes` | `taskType` | Serialized task result output size. | +| `workflow_input_size_bytes` | `workflowType`, `version` | Serialized workflow input size. `version` is an empty string when not provided. | -| Prometheus Name | Internal Key | Unit | Description | -|----------------|-------------|------|-------------| -| `{prefix}_workflow_input_size_bytes` | `workflowInputSizeBytes` | bytes | Workflow input payload size | +### Canonical Gauges -### Labeled by `endpoint` +| Metric | Labels | Description | +|---|---|---| +| `active_workers` | `taskType` | Current number of workers actively executing tasks. | -| Prometheus Name | Internal Key | Unit | Description | -|----------------|-------------|------|-------------| -| `{prefix}_http_api_client_request` | `apiRequestDurationMs` | ms | API request duration. Label format: `METHOD:/api/path:STATUS` | +## Legacy Metrics ---- +Legacy mode is the default so existing dashboards and alerts continue to work. +The default metric name prefix is `conductor_worker`. The prefix is configurable +via the `prefix` option on `MetricsCollectorConfig`. -## Event Listener Methods +Distribution metrics are sliding-window summaries over the latest 1,000 +observations (configurable via `slidingWindowSize`), exposing quantiles at +p50, p75, p90, p95, and p99. Legacy distribution metrics also expose `_count` +and `_sum` series. -These methods are called automatically by the `TaskRunner` when `MetricsCollector` is registered as an event listener: +### Legacy Counters -| Method | Metrics Updated | -|--------|----------------| -| `onPollStarted(event)` | Increments `pollTotal` | -| `onPollCompleted(event)` | Records `pollDurationMs` | -| `onPollFailure(event)` | Increments `pollErrorTotal`, records `pollDurationMs` | -| `onTaskExecutionStarted(event)` | _(no-op, counted on completion)_ | -| `onTaskExecutionCompleted(event)` | Increments `taskExecutionTotal`, records `executionDurationMs` and `outputSizeBytes` | -| `onTaskExecutionFailure(event)` | Increments `taskExecutionErrorTotal`, records `executionDurationMs` | -| `onTaskUpdateCompleted(event)` | Records `updateDurationMs` | -| `onTaskUpdateFailure(event)` | Increments `taskUpdateFailureTotal` | +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_task_poll_total` | `task_type` | Incremented each time polling is done. | +| `conductor_worker_task_poll_error_total` | `task_type` | Incremented when a poll request fails. | +| `conductor_worker_task_execute_total` | `task_type` | Incremented when a task execution completes. | +| `conductor_worker_task_execute_error_total` | `task_type` | Task execution errors. Label format: `taskType:ExceptionName`. | +| `conductor_worker_task_update_error_total` | `task_type` | Incremented when updating the task result fails. | +| `conductor_worker_task_ack_error_total` | `task_type` | Collector surface for task ack errors. | +| `conductor_worker_task_execution_queue_full_total` | `task_type` | Incremented when the execution queue is saturated. | +| `conductor_worker_task_paused_total` | `task_type` | Incremented when a worker is paused and skips a poll. | +| `conductor_worker_external_payload_used_total` | `payload_type` | External payload storage usage. | +| `conductor_worker_thread_uncaught_exceptions_total` | none | Uncaught exceptions in the worker process. | +| `conductor_worker_worker_restart_total` | none | Worker restart events. | +| `conductor_worker_workflow_start_error_total` | none | Workflow start errors. | -## Direct Recording Methods +Legacy mode does not emit `task_execution_started_total`, +`task_ack_failed_total`, or `active_workers`. -For metrics outside the event listener system, call these methods directly: +### Legacy Time Metrics -```typescript -const collector = new MetricsCollector(); - -collector.recordTaskExecutionQueueFull("my_task"); -collector.recordUncaughtException(); -collector.recordWorkerRestart(); -collector.recordTaskPaused("my_task"); -collector.recordTaskAckError("my_task"); -collector.recordWorkflowStartError(); -collector.recordExternalPayloadUsed("task_output"); -collector.recordWorkflowInputSize("my_workflow", 2048); -collector.recordApiRequestTime("POST", "/api/tasks", 200, 35); -``` +Time values are milliseconds. Type is Summary. -## Exposition Formats +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_task_poll_time` | `task_type` | Poll round-trip duration. | +| `conductor_worker_task_execute_time` | `task_type` | Worker function execution duration. | +| `conductor_worker_task_update_time` | `task_type` | Task result update duration. | -### Built-in Prometheus Text +Each summary exposes quantile, count, and sum series: -```typescript -const text = collector.toPrometheusText(); -// Returns Prometheus text format (text/plain; version=0.0.4) +```prometheus +conductor_worker_task_execute_time{task_type="my_task",quantile="0.5"} 102 +conductor_worker_task_execute_time{task_type="my_task",quantile="0.95"} 250 +conductor_worker_task_execute_time_count{task_type="my_task"} 1000 +conductor_worker_task_execute_time_sum{task_type="my_task"} 120345 ``` -### Async (with prom-client support) - -```typescript -const text = await collector.toPrometheusTextAsync(); -// Uses prom-client registry when available, falls back to built-in +### Legacy Size Metrics + +Type is Summary. Values are bytes. + +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_task_result_size_bytes` | `task_type` | Task result output payload size. | +| `conductor_worker_workflow_input_size_bytes` | `workflow_type` | Workflow input payload size. | + +### Legacy HTTP Metrics + +| Metric | Labels | Description | +|---|---|---| +| `conductor_worker_http_api_client_request` | `endpoint` | API request duration in milliseconds. The `endpoint` label is a compound `"METHOD:/api/path:STATUS"` string. | + +## Labels + +| Label | Used by | Values | +|---|---|---| +| `task_type` | Legacy worker metrics | Task definition name. | +| `taskType` | Canonical worker metrics | Task definition name. | +| `workflowType` | Canonical workflow metrics | Workflow definition name. | +| `workflow_type` | Legacy `conductor_worker_workflow_input_size_bytes` | Workflow definition name. | +| `version` | Canonical `workflow_input_size_bytes` | Workflow version as a string. Empty string when not provided. | +| `status` | Canonical task time histograms | `SUCCESS` or `FAILURE`. For `http_api_client_request_seconds`, the HTTP status code as a string, or `"0"` on network failure. | +| `exception` | Canonical error counters | Exception type name, such as `TypeError`. Derived from `error.name` or `error.constructor.name`. | +| `entityName` | Canonical `external_payload_used_total` | Task type or workflow name associated with the external payload. | +| `operation` | Canonical `external_payload_used_total` | External payload operation, such as `READ` or `WRITE`. | +| `payload_type` | Legacy `conductor_worker_external_payload_used_total` | Payload type, such as `workflow_input` or `task_output`. | +| `payloadType` | Canonical `external_payload_used_total` | Payload type, such as `TASK_INPUT`, `TASK_OUTPUT`, `WORKFLOW_INPUT`, or `WORKFLOW_OUTPUT`. | +| `method` | Canonical HTTP metrics | HTTP verb. | +| `uri` | Canonical HTTP metrics | Request path template (e.g. `/workflow/{workflowId}`). Uses bounded-cardinality templates, not interpolated paths. | +| `endpoint` | Legacy HTTP metrics | Compound `"METHOD:/api/path:STATUS"` string. | +| `quantile` | Legacy time and size metrics | `0.5`, `0.75`, `0.9`, `0.95`, or `0.99`. | + +## Migrating From Legacy to Canonical + +Canonical mode is opt-in during the deprecation period. Before switching a +production worker, update dashboards and alerts against a staging worker with +`WORKER_CANONICAL_METRICS=true`. + +Key changes: + +- The `conductor_worker_` prefix is removed. Canonical metric names are + unprefixed. +- Legacy task labels use `task_type`; canonical task labels use `taskType`. +- Legacy time metrics are millisecond summaries with quantiles. Canonical time + metrics are second-based histograms with bucket boundaries. Query `_bucket` + series with `histogram_quantile()` instead of reading `{quantile="..."}` + gauges. +- Legacy size metrics are summaries. Canonical size metrics are histograms. +- Canonical error counters add an `exception` label containing the exception + type name. +- Canonical time histograms add a `status` label (`SUCCESS` or `FAILURE`). +- Canonical mode adds metrics that legacy mode never emits: + `task_execution_started_total`, `task_ack_failed_total`, and + `active_workers`. +- Legacy `conductor_worker_worker_restart_total` is not emitted in canonical + mode (Node.js single-process model). +- Legacy uses `payload_type`; canonical uses `payloadType`. +- Legacy HTTP metrics use a compound `endpoint` label; canonical uses separate + `method`, `uri`, and `status` labels. +- Canonical and legacy collectors are mutually exclusive. During a migration, + compare scrape output by running separate worker instances or environments + with and without `WORKER_CANONICAL_METRICS=true`. + +Legacy-to-canonical replacements: + +| Legacy metric | Canonical replacement | +|---|---| +| `conductor_worker_task_poll_total{task_type}` | `task_poll_total{taskType}` | +| `conductor_worker_task_poll_error_total{task_type}` | `task_poll_error_total{taskType,exception}` | +| `conductor_worker_task_execute_total{task_type}` | `task_execute_time_seconds{taskType,status}` (count from histogram) and `task_execution_started_total{taskType}` | +| `conductor_worker_task_execute_error_total{task_type}` | `task_execute_error_total{taskType,exception}` | +| `conductor_worker_task_update_error_total{task_type}` | `task_update_error_total{taskType,exception}` | +| `conductor_worker_task_poll_time{task_type}` (summary, ms) | `task_poll_time_seconds{taskType,status}` (histogram, seconds) | +| `conductor_worker_task_execute_time{task_type}` (summary, ms) | `task_execute_time_seconds{taskType,status}` (histogram, seconds) | +| `conductor_worker_task_update_time{task_type}` (summary, ms) | `task_update_time_seconds{taskType,status}` (histogram, seconds) | +| `conductor_worker_task_result_size_bytes{task_type}` (summary) | `task_result_size_bytes{taskType}` (histogram) | +| `conductor_worker_workflow_input_size_bytes{workflow_type}` (summary) | `workflow_input_size_bytes{workflowType,version}` (histogram) | +| `conductor_worker_http_api_client_request{endpoint}` (summary, ms) | `http_api_client_request_seconds{method,uri,status}` (histogram, seconds) | +| `conductor_worker_external_payload_used_total{payload_type}` | `external_payload_used_total{entityName,operation,payloadType}` | +| `conductor_worker_workflow_start_error_total` (no labels) | `workflow_start_error_total{workflowType,exception}` | +| `conductor_worker_worker_restart_total` | — (not emitted in canonical mode) | + +Common PromQL replacements: + +| Legacy | Canonical | +|---|---| +| `conductor_worker_task_execute_time{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_execute_time_seconds_bucket[5m])))` | +| `conductor_worker_task_poll_time{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType, status) (rate(task_poll_time_seconds_bucket[5m])))` | +| `conductor_worker_http_api_client_request{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, method, uri, status) (rate(http_api_client_request_seconds_bucket[5m])))` | +| `conductor_worker_task_result_size_bytes{quantile="0.95"}` | `histogram_quantile(0.95, sum by (le, taskType) (rate(task_result_size_bytes_bucket[5m])))` | + +Average latency queries continue to use `_sum` divided by `_count`, but the +canonical series are cumulative histogram counters: + +```promql +sum(rate(task_execute_time_seconds_sum[5m])) by (taskType) +/ +sum(rate(task_execute_time_seconds_count[5m])) by (taskType) ``` -### HTTP Server (MetricsServer) +## Troubleshooting -```typescript -import { MetricsServer } from "@io-orkes/conductor-javascript"; +### Metrics Are Empty -const server = new MetricsServer(collector, 9090); -await server.start(); -// GET /metrics — Content-Type from collector.getContentType() -// GET /health — { "status": "UP" } -await server.stop(); -``` - -### File Output +- Verify that `createMetricsCollector()` or a collector constructor is called + and the collector is passed to `TaskHandler` via `eventListeners`. +- Verify workers have polled or executed tasks. Metrics are created lazily when + the relevant event occurs. +- Confirm the scrape endpoint is reachable at the expected host and port. -Configured via `filePath` in `MetricsCollectorConfig`. Writes `toPrometheusText()` output to disk. The file writer performs an immediate first write on construction, then writes periodically at the configured interval. +### Missing HTTP or Workflow Metrics ---- +- `http_api_client_request_seconds` (canonical) or + `conductor_worker_http_api_client_request` (legacy) is recorded by the + `wrapFetchWithRetry` wrapper. Verify the collector is constructed before HTTP + calls begin. +- `workflow_input_size_bytes` and `workflow_start_error_total` are recorded in + `WorkflowExecutor`. Verify the collector is active before starting workflows. -## Sliding Window and Quantile Calculation +### High Cardinality -Summary metrics use a **sliding window** (default: 1,000 observations) to calculate percentiles. This provides: +- In canonical mode the `uri` label on `http_api_client_request_seconds` uses + path templates (e.g. `/workflow/{workflowId}`), giving bounded cardinality. + Legacy mode's `endpoint` label still contains fully interpolated paths. +- Prefer canonical mode for bounded `exception` labels. Legacy error counters + encode exception names in the Map key, not as a proper Prometheus label. +- Avoid embedding user identifiers or unbounded values in task type, workflow + type, or external payload labels. -- Accurate recent percentiles without unbounded memory growth -- No need to pre-configure histogram bucket boundaries -- Direct percentile values without interpolation artifacts +### Recording Uncaught Exceptions -Quantiles are computed on-demand using linear interpolation on sorted observations when `toPrometheusText()` is called. +The `thread_uncaught_exceptions_total` metric is not wired automatically. In +Node.js, registering a `process.on("uncaughtException")` handler overrides the +default crash behavior, which can leave the process running in a corrupted +state. Instead, wire it yourself so you control the exit policy: -When using `prom-client` (`usePromClient: true`), summaries use prom-client's native implementation with `maxAgeSeconds: 600` and `ageBuckets: 5`. +```typescript +const metrics = createMetricsCollector(); ---- +process.on("uncaughtException", (err) => { + metrics.recordUncaughtException(err.name || "Error"); + console.error(err); + process.exit(1); +}); -## Monitoring Best Practices +process.on("unhandledRejection", (reason) => { + const name = reason instanceof Error ? reason.name || "Error" : "Error"; + metrics.recordUncaughtException(name); + console.error(reason); + process.exit(1); +}); +``` -- **Use p95/p99 for SLO monitoring** rather than averages. Percentile-based thresholds better capture user-impacting performance variations. -- **Alert on `task_update_error_total`** — a rising count indicates task results are being lost and workers are failing to report back to the Conductor server. -- **Alert on `task_execution_queue_full_total`** — indicates the concurrency limit is consistently reached. Consider increasing worker `concurrency`. -- **Monitor `task_poll_time` p99** — high poll latency suggests network issues or server overload. -- **Monitor `task_execute_time` p95** — watch for execution time regression in worker functions. -- **File output interval**: 10-60 seconds recommended for production. Lower intervals increase disk I/O. -- **Clean metrics directory on startup** when using file output with multiprocess workers to avoid stale data. +### prom-client Issues ---- +- `MetricsCollector` uses `await import("./MetricsServer.js")` internally. The + `.js` extension does not resolve under Jest's TypeScript transform. Test + `MetricsServer` by importing it directly, not via the `httpPort` config + option. +- When `usePromClient: true` is set but `prom-client` is not installed, the + collector falls back to the built-in text format silently. -## Programmatic Access +## Implementation Notes -```typescript -const metrics = collector.getMetrics(); +### One Collector Per Process -// Counter values -metrics.pollTotal.get("my_task"); // number -metrics.taskExecutionTotal.get("my_task"); // number +Only one metrics collector can be active at a time. +`CanonicalMetricsCollector` registers itself as the global HTTP metrics +observer on construction. `LegacyMetricsCollector` does not self-register; +use `createMetricsCollector()` or call `setHttpMetricsObserver()` explicitly +to enable HTTP metrics in legacy mode. Creating a second collector replaces +the first. Calling `stop()` clears the global observer. -// Summary observations (raw array) -metrics.pollDurationMs.get("my_task"); // number[] -metrics.executionDurationMs.get("my_task"); // number[] +### Payload Size Measurement -// Reset all metrics -collector.reset(); +`workflow_input_size_bytes` is recorded by calling `JSON.stringify` on the +workflow input and measuring the resulting UTF-8 byte length with +`Buffer.byteLength`. This cost is controlled by the `measurePayloadSize` +config option: -// Stop file writer and HTTP server -await collector.stop(); -``` +- **Canonical** (default `true`): measured on every `startWorkflow` call. + Set `measurePayloadSize: false` to disable. +- **Legacy** (default `false`): not measured unless you set + `measurePayloadSize: true`. + +### Legacy `task_paused_total` + +Legacy mode does not emit `task_paused_total`. This metric was defined but +never recorded in any prior release. It remains unrecorded in legacy mode to +preserve byte-for-byte output compatibility. Switch to canonical mode +(`WORKER_CANONICAL_METRICS=true`) to get paused metrics. + +### `WORKER_LEGACY_METRICS` (Reserved) + +`WORKER_LEGACY_METRICS` is reserved for future use. Once canonical metrics +become the default, setting `WORKER_LEGACY_METRICS=true` will re-activate +the legacy metric surface. It is not read by the current implementation. + +## Detailed Technical Notes -- Unreleased + +This section documents internal implementation details for developers reviewing +the metrics harmonization changes. It is not end-user-facing and will be +removed or folded into the relevant sections once the release is published. + +### Architecture + +The `MetricsCollectorInterface` (`src/sdk/worker/metrics/MetricsCollectorInterface.ts`) +defines the contract for recording SDK metrics. Two implementations exist: + +- `LegacyMetricsCollector` (`src/sdk/worker/metrics/LegacyMetricsCollector.ts`) + -- emits the original metric names and types (sliding-window Summaries for + timing, compound `endpoint` labels, `conductor_worker_` prefix). +- `CanonicalMetricsCollector` (`src/sdk/worker/metrics/CanonicalMetricsCollector.ts`) + -- emits the harmonized cross-SDK catalog (Histograms in seconds/bytes, + `_total` counters, bounded `exception` labels from `error.name`). + +`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` once and returns the +appropriate implementation. + +### HTTP request timing via interceptors + +A **request interceptor** (`createMetricsInterceptors()` in +`src/sdk/createConductorClient/helpers/metricsInterceptors.ts`) stashes the +OpenAPI path template (from `opts.url`) in a `WeakMap` keyed on the `Request` +object. The template is stripped of the `/api/` prefix baked in by the OpenAPI +spec so the canonical `uri` label matches the cross-SDK convention (e.g. +`/workflow/{workflowId}`). + +`wrapFetchWithRetry` reads the template from the `WeakMap` before calling +`retryFetch` and records HTTP metrics for both successful responses and +network errors (status `"0"`). All timing and metric recording is centralised +in the fetch wrapper; the interceptor's only job is to bridge the path +template from `opts` (available to interceptors) to the fetch layer (which +only receives the `Request` object). + +### Canonical vs. legacy URI label + +`recordApiRequestTime` accepts an optional `metricUri` parameter (the path +template). The canonical collector uses `metricUri ?? uri`; the legacy collector +ignores `metricUri` and uses the interpolated `uri`, preserving legacy output +byte-for-byte. + +The JavaScript SDK's OpenAPI spec bakes `/api/` into every path template +(`opts.url = "/api/workflow/{workflowId}"`). Other SDKs keep `/api` in the base +URL and use clean paths. The request interceptor strips this prefix so the +canonical `uri` label matches the cross-SDK convention (e.g. +`/workflow/{workflowId}`). + +### Payload size metrics + +`workflow_input_size_bytes` and `task_result_size_bytes` measure specific +sub-fields of the request/response payloads (`workflowRequest.input` and +`merged.outputData` respectively), not the full HTTP body. This requires a +separate `JSON.stringify` call on the sub-field, independent of the HTTP body +serialization performed by the OpenAPI client. + +This cost is controlled by the `measurePayloadSize` config option (canonical +default `true`, legacy default `false`). The Go SDK provides +`WORKER_METRICS_PAYLOAD_SIZE` for the same purpose. + +### Metrics collector event surface + +`CanonicalMetricsCollector` and `LegacyMetricsCollector` both implement +`TaskRunnerEventsListener`, so they receive lifecycle events from the worker +infrastructure: + +- Event listener methods (from `TaskRunnerEventsListener`): `onPollStarted` / + `onPollCompleted` / `onPollFailure` / `onTaskExecutionStarted` / + `onTaskExecutionCompleted` / `onTaskExecutionFailure` / + `onTaskUpdateCompleted` / `onTaskUpdateFailure` / `onTaskPaused` +- Direct recording methods (called outside the event system): + `recordTaskExecutionQueueFull` / `recordUncaughtException` / + `recordTaskAckError` / `recordTaskAckFailed` / `recordTaskPaused` / + `recordExternalPayloadUsed` / `recordWorkflowStartError` / + `recordWorkflowInputSize` / `recordApiRequestTime` +- `Poller`, `TaskRunner`, and `EventDispatcher` emit a `taskPaused` event when a + poll cycle is skipped because the worker is paused. + +### Harness changes + +- Harness deployment manifest sets `WORKER_CANONICAL_METRICS=true`. +- `harness/main.ts` logs which collector is active. +- `WorkflowStatusProbe` (opt-in via `HARNESS_PROBE_RATE_PER_SEC`) exercises + UUID-bearing endpoints (`/api/workflow/{workflowId}`) to validate that the + `uri` label uses bounded templates. diff --git a/README.md b/README.md index 7880908a..f5479ac5 100644 --- a/README.md +++ b/README.md @@ -386,12 +386,12 @@ await handler.startWorkers(); ## Monitoring Workers -Enable Prometheus metrics with the built-in `MetricsCollector`: +Enable Prometheus metrics with the built-in metrics collector: ```typescript -import { MetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; +import { createMetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; -const metrics = new MetricsCollector(); +const metrics = createMetricsCollector(); const server = new MetricsServer(metrics, 9090); await server.start(); @@ -405,7 +405,7 @@ await handler.startWorkers(); // GET http://localhost:9090/health — {"status":"UP"} ``` -Collects 18 metric types: poll counts, execution durations, error rates, output sizes, and more — with p50/p75/p90/p95/p99 quantiles. See [METRICS.md](METRICS.md) for the full reference. +The SDK has two metric surfaces: **legacy** (default, prefixed `conductor_worker_` names, Summary type) and **canonical** (opt-in via `WORKER_CANONICAL_METRICS=true`, unprefixed names, Histogram type). See [METRICS.md](METRICS.md) for the full reference. ## Managing Workflow Executions @@ -542,7 +542,7 @@ See [examples/agentic-workflows/](examples/agentic-workflows/) for all examples. | Document | Description | |----------|-------------| | [SDK Development Guide](SDK_DEVELOPMENT.md) | Architecture, patterns, pitfalls, testing | -| [Metrics Reference](METRICS.md) | All 18 Prometheus metrics with descriptions | +| [Metrics Reference](METRICS.md) | Legacy and canonical Prometheus metrics with migration guide | | [Breaking Changes](BREAKING_CHANGES.md) | v3.x migration guide | | [Workflow Management](docs/api-reference/workflow-executor.md) | Start, pause, resume, terminate, retry, search, signal | | [Task Management](docs/api-reference/task-client.md) | Task operations, logs, queue management | diff --git a/SDK_COMPARISON.md b/SDK_COMPARISON.md index 881800bc..a2fdbfd6 100644 --- a/SDK_COMPARISON.md +++ b/SDK_COMPARISON.md @@ -4,7 +4,7 @@ This document provides a detailed comparison between the **Python SDK** (golden reference, `conductor-python`) and the **JavaScript SDK** (`@io-orkes/conductor-javascript` v3.0.0), covering API surface, architecture, worker system, and feature parity. -**Verdict Summary**: The JavaScript SDK has **near-complete parity** with the Python SDK (~98%). All client classes (including rate limit CRUD), worker features (TaskContext, adaptive backoff, 19 Prometheus metrics with quantiles and optional prom-client, decorator-based JSON schema generation), workflow DSL (`ConductorWorkflow` with `toSubWorkflowTask()`), and all 34 task type builders (including 13 LLM/AI + 2 MCP) are implemented. +**Verdict Summary**: The JavaScript SDK has **near-complete parity** with the Python SDK (~98%). All client classes (including rate limit CRUD), worker features (TaskContext, adaptive backoff, Prometheus metrics with legacy and canonical modes, decorator-based JSON schema generation), workflow DSL (`ConductorWorkflow` with `toSubWorkflowTask()`), and all 34 task type builders (including 13 LLM/AI + 2 MCP) are implemented. --- @@ -580,22 +580,23 @@ Features: #### Metrics / Observability +Both SDKs support legacy and canonical metric surfaces via `WORKER_CANONICAL_METRICS`. See [METRICS.md](METRICS.md) for the JavaScript SDK catalog. + | # | Feature | Python | JavaScript | Status | |---|---------|--------|-----------|--------| -| 1 | MetricsCollector | Yes (19 metrics) | Yes (19 metrics) | Full | +| 1 | Legacy / canonical collectors | `create_metrics_collector()` | `createMetricsCollector()` | Full | | 2 | HTTP `/metrics` endpoint | Yes | `MetricsServer` | Full | | 3 | HTTP `/health` endpoint | Yes | `MetricsServer` | Full | | 4 | Prometheus text format | Yes | `toPrometheusText()` | Full | | 5 | File-based metrics | Yes (`.prom` files) | Yes (`filePath` config option) | Full | -| 6 | API request metrics | `http_api_client_request` | `recordApiRequestTime()` | Full | -| 7 | Queue full metric | `task_execution_queue_full` | `recordTaskExecutionQueueFull()` | Full | -| 8 | Uncaught exception metric | `thread_uncaught_exceptions` | `recordUncaughtException()` | Full | -| 9 | Workflow start error | `workflow_start_error` | `recordWorkflowStartError()` | Full | -| 10 | External payload used | `external_payload_used` | `recordExternalPayloadUsed()` | Full | -| 11 | Worker restart metric | `worker_restart` | `recordWorkerRestart()` | Full | -| 12 | Quantile calculation (p50-p99) | Yes (sliding window 1000) | Yes (sliding window, configurable) | Full | -| 13 | `prometheus_client` integration | Yes (native prom-client) | Optional `prom-client` integration via `usePromClient: true` + `PrometheusRegistry` | Full (optional peer dep) | -| 14 | Auto-start via `httpPort` config | Yes | Yes | Full | +| 6 | API request metrics | Yes | `recordApiRequestTime()` | Full | +| 7 | Queue full metric | Yes | `recordTaskExecutionQueueFull()` | Full | +| 8 | Uncaught exception metric | Yes | `recordUncaughtException()` | Full | +| 9 | Workflow start error | Yes | `recordWorkflowStartError()` | Full | +| 10 | External payload used | Yes | `recordExternalPayloadUsed()` | Full | +| 11 | Worker restart metric | Yes (Python-only, multi-process) | Legacy only (single-process model) | Full | +| 12 | `prometheus_client` / `prom-client` | Yes (native) | Optional via `usePromClient: true` | Full | +| 13 | Auto-start via `httpPort` config | Yes | Yes | Full | #### JSON Schema Generation @@ -692,7 +693,7 @@ Both SDKs support identical environment variable formats for worker configuratio | **Worker system** | **98%** | TaskContext, adaptive backoff, auth backoff, events, schema | | **Task type builders** | **100%** | 34/34 builders including LLM, MCP, HTTP Poll, Webhook, GetDocument | | **Workflow DSL** | **95%** | Full `ConductorWorkflow` with `toSubWorkflowTask()`. Missing only `__call__` (language limitation) | -| **Metrics/Observability** | **98%** | 19 metrics, quantiles (p50-p99), HTTP + file export, optional prom-client integration | +| **Metrics/Observability** | **98%** | Legacy + canonical modes, HTTP + file export, optional prom-client integration | | **AI module** | **95%** | All LLM task builders + `LLMProvider`/`VectorDB` enums + `IntegrationConfig` types. Missing only `AIOrchestrator` | | **JSON schema** | **95%** | `jsonSchema()` declarative helper + `@schemaField()` decorator with `reflect-metadata` type inference | | **Scheduling** | **100%** | Full parity including tags | @@ -712,7 +713,7 @@ Both SDKs support identical environment variable formats for worker configuratio 7. **Worker configuration hierarchy**: Identical env var format and precedence 8. **`TaskHandler` orchestrator**: Same architecture as Python 9. **TaskContext**: Full async-local context with `addLog()`, `setCallbackAfter()` -10. **Prometheus metrics**: 19 metrics with quantiles (p50-p99), `MetricsServer` (HTTP + file export), optional `prom-client` integration +10. **Prometheus metrics**: Legacy and canonical modes with `createMetricsCollector()`, `MetricsServer` (HTTP + file export), optional `prom-client` integration 11. **JSON schema generation**: `jsonSchema()` declarative helper + `@schemaField()` decorator with `reflect-metadata` runtime type inference + `inputType`/`outputType` on `@worker` 12. **AI types**: `LLMProvider`, `VectorDB` enums + typed `IntegrationConfig` + `withPromptVariable()` helpers 13. **OpenAPI-generated types**: Strong TypeScript types from spec diff --git a/SDK_DEVELOPMENT.md b/SDK_DEVELOPMENT.md index b40fd2a6..05bfd277 100644 --- a/SDK_DEVELOPMENT.md +++ b/SDK_DEVELOPMENT.md @@ -119,9 +119,15 @@ src/ context/ TaskContext.ts # AsyncLocalStorage-based per-task context + getTaskContext() metrics/ - MetricsCollector.ts # TaskRunnerEventsListener impl, 19 metric types, quantiles - MetricsServer.ts # HTTP server: /metrics (Prometheus) + /health (JSON) - PrometheusRegistry.ts # Optional prom-client bridge (lazy loaded) + LegacyMetricsCollector.ts # Default: prefixed names, Summary type, ms units + CanonicalMetricsCollector.ts # Opt-in: unprefixed canonical names, Histogram type, seconds + metricsFactory.ts # createMetricsCollector() reads WORKER_CANONICAL_METRICS + MetricsServer.ts # HTTP server: /metrics (Prometheus) + /health (JSON) + MetricsCollectorInterface.ts # Shared interface for both collectors + PrometheusRegistry.ts # Optional prom-client bridge for legacy collector + CanonicalPrometheusRegistry.ts # Optional prom-client bridge for canonical collector + accumulators.ts # HistogramAccumulator, MultiLabelCounter, GaugeMetric + httpObserver.ts # Global singleton for API request metrics schema/ # jsonSchema(), schemaField() decorator, generateSchemaFromClass() config/ # Worker configuration resolution helpers generators/ # Legacy task generators (pre-v3, still exported for backward compat) @@ -268,15 +274,13 @@ Returns `undefined` outside task execution. All 16 methods: `getTaskId()`, `getW ### 7. Metrics +`createMetricsCollector()` reads `WORKER_CANONICAL_METRICS` and returns a legacy or canonical collector. See [METRICS.md](METRICS.md) for the full catalog and migration guide. + ```typescript -const metrics = new MetricsCollector({ prefix: "my_app", slidingWindowSize: 1000 }); +const metrics = createMetricsCollector(); const handler = new TaskHandler({ client, eventListeners: [metrics], scanForDecorated: true }); await handler.startWorkers(); -// Programmatic access -const m = metrics.getMetrics(); -console.log(m.pollTotal.get("my_task")); - // Prometheus text const text = metrics.toPrometheusText(); ``` @@ -291,7 +295,7 @@ await server.start(); await server.stop(); ``` -**Do not use `MetricsCollector({ httpPort })` in Jest** - it uses a dynamic import with `.js` extension that doesn't resolve under Jest's TS transform. +**Do not use `MetricsCollector({ httpPort })` in Jest** -- it uses a dynamic import with `.js` extension that doesn't resolve under Jest's TS transform. ## Known Pitfalls @@ -616,7 +620,7 @@ Node 18+ required. Dual ESM/CJS via `tsup` with `exports` field in `package.json | Workflow DSL | `ConductorWorkflow` | `ConductorWorkflow` | Fluent builder | | Worker decorator | `@worker_task` | `@worker` | TS decorator syntax | | Task context | `get_task_context()` | `getTaskContext()` | AsyncLocalStorage vs contextvars | -| Metrics | `MetricsCollector` | `MetricsCollector` | 19 metric types, quantiles | +| Metrics | `MetricsCollector` | `createMetricsCollector()` | Legacy + canonical modes, see [METRICS.md](METRICS.md) | | Metrics server | HTTP endpoint | `MetricsServer` | `/metrics` + `/health` | | Non-retryable | `NonRetryableError` | `NonRetryableException` | `FAILED_WITH_TERMINAL_ERROR` | | LLM builders | 13 builders | 13 builders | Full parity | diff --git a/WORKER_ARCHITECTURE_COMPARISON.md b/WORKER_ARCHITECTURE_COMPARISON.md index dd0701b5..528921ee 100644 --- a/WORKER_ARCHITECTURE_COMPARISON.md +++ b/WORKER_ARCHITECTURE_COMPARISON.md @@ -545,37 +545,38 @@ handler = TaskHandler( ### JavaScript: Built-in MetricsCollector -The JavaScript SDK provides a built-in `MetricsCollector` that implements `TaskRunnerEventsListener`: +The JavaScript SDK provides `createMetricsCollector()` which returns a legacy or canonical collector based on `WORKER_CANONICAL_METRICS`. See [METRICS.md](METRICS.md) for the full catalog. ```typescript -import { MetricsCollector, TaskHandler } from "@io-orkes/conductor-javascript/worker"; +import { createMetricsCollector, MetricsServer, TaskHandler } from "@io-orkes/conductor-javascript"; -const metrics = new MetricsCollector(); +const metrics = createMetricsCollector(); +const server = new MetricsServer(metrics, 9090); +await server.start(); const handler = new TaskHandler({ client, eventListeners: [metrics], }); -handler.startWorkers(); - -// Read metrics -const snapshot = metrics.getMetrics(); -console.log("Poll total:", snapshot.pollTotal); -console.log("Execution durations:", snapshot.executionDurationMs); +await handler.startWorkers(); +// GET http://localhost:9090/metrics -> Prometheus text +// GET http://localhost:9090/health -> {"status":"UP"} ``` ### Comparison | Feature | Python | JavaScript | |---------|--------|-----------| -| Built-in metrics collector | Yes (`MetricsCollector`) | Yes (`MetricsCollector`) | Parity | -| HTTP metrics endpoint | Yes (`/metrics`, `/health`) | No (use prom-client separately) | Python richer | -| File-based metrics | Yes (`.prom` files) | No | Python richer | +| Built-in metrics collector | Yes (`create_metrics_collector()`) | Yes (`createMetricsCollector()`) | Parity | +| Legacy / canonical modes | Yes (`WORKER_CANONICAL_METRICS`) | Yes (`WORKER_CANONICAL_METRICS`) | Parity | +| HTTP metrics endpoint | Yes (`/metrics`, `/health`) | Yes (`MetricsServer`) | Parity | +| File-based metrics | Yes (`.prom` files) | Yes (`filePath` config) | Parity | | Multiprocess aggregation | Yes (SQLite) | N/A (single process) | N/A | -| API request metrics | Yes (`http_api_client_request`) | No | Python richer | +| API request metrics | Yes | Yes (`recordApiRequestTime()`) | Parity | | Event-based architecture | Yes (MetricsCollector is listener) | Yes (MetricsCollector is listener) | Parity | | Custom metrics via events | Yes | Yes | Parity | +| Optional `prom-client` | Yes (native) | Yes (`usePromClient: true`) | Parity | --- diff --git a/harness/main.ts b/harness/main.ts index 3240ce1a..6ed6be8c 100644 --- a/harness/main.ts +++ b/harness/main.ts @@ -3,12 +3,13 @@ import { ConductorWorkflow, TaskHandler, simpleTask, - MetricsCollector, + createMetricsCollector, } from "../src/sdk"; import { MetadataResource } from "../src/open-api/generated"; import type { ConductorWorker } from "../src/sdk/clients/worker/types"; import { SimulatedTaskWorker } from "./simulatedTaskWorker"; import { WorkflowGovernor } from "./workflowGovernor"; +import { WorkflowStatusProbe } from "./workflowStatusProbe"; const WORKFLOW_NAME = "js_simulated_tasks_workflow"; @@ -93,8 +94,10 @@ async function main(): Promise { }); const metricsPort = envIntOrDefault("HARNESS_METRICS_PORT", 9991); - const metricsCollector = new MetricsCollector({ httpPort: metricsPort }); - console.log(`Prometheus metrics server started on port ${metricsPort}`); + const usePromClient = process.env.HARNESS_USE_PROM_CLIENT === "true"; + const metricsCollector = createMetricsCollector({ httpPort: metricsPort, usePromClient }); + const backend = usePromClient ? "prom-client" : "built-in"; + console.log(`Prometheus metrics server started on port ${metricsPort} (${metricsCollector.collectorName()} metrics, ${backend} backend)`); const handler = new TaskHandler({ client, @@ -104,15 +107,27 @@ async function main(): Promise { }); await handler.startWorkers(); + const probeRate = envIntOrDefault("HARNESS_PROBE_RATE_PER_SEC", 0); + const probe = + probeRate > 0 ? new WorkflowStatusProbe(client, probeRate) : undefined; + const governor = new WorkflowGovernor( workflowClient, WORKFLOW_NAME, workflowsPerSec, + probe ? probe.offer.bind(probe) : undefined, ); governor.start(); + if (probe) { + probe.start(); + } + const shutdown = async () => { console.log("Shutting down..."); + if (probe) { + probe.stop(); + } governor.stop(); await handler.stopWorkers(); process.exit(0); diff --git a/harness/manifests/deployment.yaml b/harness/manifests/deployment.yaml index 878d6728..95218587 100644 --- a/harness/manifests/deployment.yaml +++ b/harness/manifests/deployment.yaml @@ -40,6 +40,8 @@ spec: value: "20" - name: HARNESS_POLL_INTERVAL_MS value: "100" + - name: WORKER_CANONICAL_METRICS + value: "true" ports: - name: metrics containerPort: 9991 diff --git a/harness/workflowGovernor.ts b/harness/workflowGovernor.ts index 9865f126..722e61f9 100644 --- a/harness/workflowGovernor.ts +++ b/harness/workflowGovernor.ts @@ -4,16 +4,19 @@ export class WorkflowGovernor { private readonly workflowExecutor: WorkflowExecutor; private readonly workflowName: string; private readonly workflowsPerSecond: number; + private readonly idSink?: (id: string) => void; private timer: ReturnType | undefined; constructor( workflowExecutor: WorkflowExecutor, workflowName: string, workflowsPerSecond: number, + idSink?: (id: string) => void, ) { this.workflowExecutor = workflowExecutor; this.workflowName = workflowName; this.workflowsPerSecond = workflowsPerSecond; + this.idSink = idSink; } start(): void { @@ -46,15 +49,21 @@ export class WorkflowGovernor { this.workflowExecutor.startWorkflow({ name: this.workflowName, version: 1, + input: { startedAt: Date.now() }, }), ); } Promise.all(promises) - .then(() => { + .then((ids) => { console.log( `Governor: started ${this.workflowsPerSecond} workflow(s)`, ); + if (this.idSink) { + for (const id of ids) { + this.idSink(id); + } + } }) .catch((err: unknown) => { console.error( diff --git a/harness/workflowStatusProbe.ts b/harness/workflowStatusProbe.ts new file mode 100644 index 00000000..741f4438 --- /dev/null +++ b/harness/workflowStatusProbe.ts @@ -0,0 +1,97 @@ +import type { Client } from "../src/open-api/generated/client"; +import { WorkflowResource } from "../src/open-api/generated"; + +const MAX_TRACKED_IDS = 256; + +/** + * Exercises UUID-bearing workflow lookup endpoints so + * http_api_client_request_seconds picks up entries with + * uri=/workflow/{workflowId} and uri=/workflow/{workflowId}/tasks. + * + * Default harness traffic only hits bounded, no-path-param URLs (poll/update), + * making the high-cardinality concern on the uri label invisible without this + * probe. + * + * Default off. Runs only when HARNESS_PROBE_RATE_PER_SEC > 0. + * Side-effect-free: only issues read calls (getExecutionStatus, + * getExecutionStatusTaskList). + * Self-bounded: fixed-size FIFO of workflow IDs. + */ +export class WorkflowStatusProbe { + private readonly client: Client; + private readonly callsPerSecond: number; + private readonly recentIDs: string[] = []; + private timer: ReturnType | undefined; + + constructor(client: Client, callsPerSecond: number) { + this.client = client; + this.callsPerSecond = callsPerSecond; + } + + offer(workflowId: string): void { + if (!workflowId) return; + this.recentIDs.push(workflowId); + if (this.recentIDs.length > MAX_TRACKED_IDS) { + this.recentIDs.splice(0, this.recentIDs.length - MAX_TRACKED_IDS); + } + } + + start(): void { + if (this.callsPerSecond <= 0) { + console.log( + "WorkflowStatusProbe disabled (HARNESS_PROBE_RATE_PER_SEC<=0)", + ); + return; + } + console.log( + `WorkflowStatusProbe started: rate=${this.callsPerSecond}/sec, retainedIds<=${MAX_TRACKED_IDS}`, + ); + + this.timer = setInterval(() => { + this.tick(); + }, 1000); + + if (this.timer && typeof this.timer === "object" && "unref" in this.timer) { + this.timer.unref(); + } + } + + stop(): void { + if (this.timer) { + clearInterval(this.timer); + this.timer = undefined; + } + console.log("WorkflowStatusProbe stopped"); + } + + private tick(): void { + const budget = Math.min(this.callsPerSecond, this.recentIDs.length); + if (budget === 0) return; + + const ids: string[] = []; + for (let i = 0; i < budget; i++) { + ids.push( + this.recentIDs[Math.floor(Math.random() * this.recentIDs.length)], + ); + } + + for (const id of ids) { + const call = + Math.random() < 0.5 + ? WorkflowResource.getExecutionStatus({ + client: this.client, + path: { workflowId: id }, + }) + : WorkflowResource.getExecutionStatusTaskList({ + client: this.client, + path: { workflowId: id }, + }); + + call.catch((err: unknown) => { + console.error( + `probe: ${id}: ${err instanceof Error ? err.message : String(err)}`, + ); + }); + } + } +} diff --git a/src/integration-tests/E2EFiveTaskWorkflow.test.ts b/src/integration-tests/E2EFiveTaskWorkflow.test.ts index 66aed967..1d6fc9af 100644 --- a/src/integration-tests/E2EFiveTaskWorkflow.test.ts +++ b/src/integration-tests/E2EFiveTaskWorkflow.test.ts @@ -120,10 +120,17 @@ describeForOrkesV5("E2E: 5-task workflow × 50 executions", () => { expect(workflowIds.length).toBe(WORKFLOW_COUNT); - // Wait for all 50 to complete (600s in CI-friendly timeout, poll every 2s) + // Wait for all 50 to complete (600s per workflow, poll every 2s) + let completed = 0; const results = await Promise.all( workflowIds.map((id) => - waitForWorkflowStatus(executor, id, "COMPLETED", 600_000, 2000) + waitForWorkflowStatus(executor, id, "COMPLETED", 600_000, 2000).then( + (wf) => { + completed++; + console.log(`[E2E] workflow ${completed}/${WORKFLOW_COUNT} completed (${id})`); + return wf; + } + ) ) ); @@ -168,7 +175,6 @@ describeForOrkesV5("E2E: 5-task workflow × 50 executions", () => { workflows: [{ name: workflowName, version: 1 }], tasks: taskNames, }); - }, - 330_000 // 5.5 minute timeout for the entire test + } ); }); diff --git a/src/integration-tests/LeaseExtension.validation.test.ts b/src/integration-tests/LeaseExtension.validation.test.ts index 015512c8..a977613b 100644 --- a/src/integration-tests/LeaseExtension.validation.test.ts +++ b/src/integration-tests/LeaseExtension.validation.test.ts @@ -36,7 +36,6 @@ import type { Client } from "../open-api"; import { MetadataClient, WorkflowExecutor, - TaskClient, orkesConductorClient, } from "../sdk"; import { LeaseTracker } from "../sdk/clients/worker/LeaseTracker"; @@ -56,7 +55,6 @@ describe("Lease Extension — end-to-end validation", () => { let client: Client; let executor: WorkflowExecutor; let metadataClient: MetadataClient; - let taskClient: TaskClient; const logger: ConductorLogger = new DefaultLogger(); const suffix = Date.now(); @@ -67,7 +65,6 @@ describe("Lease Extension — end-to-end validation", () => { client = await orkesConductorClient(); executor = new WorkflowExecutor(client); metadataClient = new MetadataClient(client); - taskClient = new TaskClient(client); await metadataClient.registerTask({ name: taskDefName, @@ -117,7 +114,7 @@ describe("Lease Extension — end-to-end validation", () => { const deadline = Date.now() + maxWaitMs; while (Date.now() < deadline) { const wf = await executor.getWorkflow(workflowId, false); - if (terminal.has(wf.status ?? "")) return wf.status!; + if (terminal.has(wf.status ?? "")) return wf.status ?? ""; await sleep(1_000); } return "STILL_RUNNING"; @@ -132,8 +129,8 @@ describe("Lease Extension — end-to-end validation", () => { const { data: tasks1 } = await TaskResource.batchPoll({ client, path: { tasktype: taskDefName }, query: { workerid: "val-worker-no-lease", count: 1, timeout: 200 } }); const [task] = tasks1 ?? []; expect(task).toBeDefined(); - const taskId1 = task.taskId!; - const wfId1 = task.workflowInstanceId!; + const taskId1 = task.taskId ?? ""; + const wfId1 = task.workflowInstanceId ?? ""; console.log(` Task polled: ${taskId1}`); console.log(` No LeaseTracker created — zero extendLease calls will be made`); @@ -177,8 +174,8 @@ describe("Lease Extension — end-to-end validation", () => { const { data: tasks2 } = await TaskResource.batchPoll({ client, path: { tasktype: taskDefName }, query: { workerid: "val-worker-with-lease", count: 1, timeout: 200 } }); const [task] = tasks2 ?? []; expect(task).toBeDefined(); - const taskId2 = task.taskId!; - const wfId2 = task.workflowInstanceId!; + const taskId2 = task.taskId ?? ""; + const wfId2 = task.workflowInstanceId ?? ""; console.log(` Task polled: ${taskId2}`); // Track heartbeat calls via a spy that ALSO sends the real heartbeat diff --git a/src/sdk/builders/tasks/pullWorkflowMessages.ts b/src/sdk/builders/tasks/pullWorkflowMessages.ts index e4b2ebae..e0bcb3c3 100644 --- a/src/sdk/builders/tasks/pullWorkflowMessages.ts +++ b/src/sdk/builders/tasks/pullWorkflowMessages.ts @@ -16,7 +16,7 @@ import { TaskType, PullWorkflowMessagesTaskDef } from "../../../open-api"; */ export const pullWorkflowMessages = ( taskReferenceName: string, - batchSize: number = 1, + batchSize = 1, optional?: boolean ): PullWorkflowMessagesTaskDef => ({ name: taskReferenceName, diff --git a/src/sdk/clients/worker/Poller.ts b/src/sdk/clients/worker/Poller.ts index a7fb1787..414552ac 100644 --- a/src/sdk/clients/worker/Poller.ts +++ b/src/sdk/clients/worker/Poller.ts @@ -189,6 +189,7 @@ export class Poller { this.logger.debug( `Worker ${this._pollerId} is paused, skipping poll` ); + this.options.onPaused?.(); await this.sleep( this.options.pollInterval ?? DEFAULT_POLL_INTERVAL ); diff --git a/src/sdk/clients/worker/TaskRunner.ts b/src/sdk/clients/worker/TaskRunner.ts index a836ad5b..26ffab4e 100644 --- a/src/sdk/clients/worker/TaskRunner.ts +++ b/src/sdk/clients/worker/TaskRunner.ts @@ -96,6 +96,12 @@ export class TaskRunner { { concurrency: worker.concurrency ?? this.options.concurrency, pollInterval: worker.pollInterval ?? this.options.pollInterval, + onPaused: () => { + void this.eventDispatcher.publishTaskPaused({ + taskType: this.worker.taskDefName, + timestamp: new Date(), + }); + }, }, this.logger ); @@ -221,15 +227,16 @@ export class TaskRunner { const { workerID } = this.options; let retryCount = 0; let lastError: Error | null = null; + let lastAttemptDurationMs = 0; while (retryCount < this.maxRetries) { + const updateStart = Date.now(); try { if (process.env.CI) { console.log( `[TaskRunner] Submitting task result taskId=${taskResult.taskId} workflowId=${taskResult.workflowInstanceId} taskType=${this.worker.taskDefName} attempt=${retryCount + 1}/${this.maxRetries}` ); } - const updateStart = Date.now(); if (TaskRunner.updateV2Available === false) { // Already detected a legacy server — skip the probe, call legacy directly. @@ -328,6 +335,7 @@ export class TaskRunner { }); return nextTask ?? undefined; } catch (error: unknown) { + lastAttemptDurationMs = Date.now() - updateStart; lastError = error as Error; this.errorHandler(lastError, task); this.logger.error( @@ -358,6 +366,7 @@ export class TaskRunner { workerId: workerID, workflowInstanceId: taskResult.workflowInstanceId, cause: lastError ?? new Error("Task update failed after all retries"), + durationMs: lastAttemptDurationMs, retryCount, taskResult, timestamp: new Date(), @@ -500,7 +509,7 @@ export class TaskRunner { // Calculate output size if possible const outputSizeBytes = merged.outputData - ? JSON.stringify(merged.outputData).length + ? Buffer.byteLength(JSON.stringify(merged.outputData), "utf8") : undefined; // Untrack immediately — execution done, update will follow diff --git a/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts b/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts index 86cf4e3e..73b56823 100644 --- a/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts +++ b/src/sdk/clients/worker/__tests__/LeaseTracker.test.ts @@ -1,5 +1,5 @@ -import { LeaseTracker, LeaseInfo } from "@/sdk/clients/worker/LeaseTracker"; -import { LEASE_EXTEND_DURATION_FACTOR, LEASE_EXTEND_RETRY_COUNT, HEARTBEAT_RETRY_DELAY_MS } from "@/sdk/clients/worker/constants"; +import { LeaseTracker } from "@/sdk/clients/worker/LeaseTracker"; +import { LEASE_EXTEND_RETRY_COUNT, HEARTBEAT_RETRY_DELAY_MS } from "@/sdk/clients/worker/constants"; import { afterEach, beforeEach, describe, expect, jest, test } from "@jest/globals"; import type { Task } from "@open-api/index"; import { mockLogger } from "@test-utils/mockLogger"; diff --git a/src/sdk/clients/worker/events/EventDispatcher.ts b/src/sdk/clients/worker/events/EventDispatcher.ts index e6e1557a..16a2cf3c 100644 --- a/src/sdk/clients/worker/events/EventDispatcher.ts +++ b/src/sdk/clients/worker/events/EventDispatcher.ts @@ -9,6 +9,7 @@ import type { TaskExecutionFailure, TaskUpdateCompleted, TaskUpdateFailure, + TaskPaused, } from "./types"; /** @@ -60,6 +61,11 @@ export interface TaskRunnerEventsListener { * This is a CRITICAL event that may require operational intervention. */ onTaskUpdateFailure?(event: TaskUpdateFailure): void | Promise; + + /** + * Called when a poll cycle is skipped because the worker is paused. + */ + onTaskPaused?(event: TaskPaused): void | Promise; } /** @@ -159,6 +165,13 @@ export class EventDispatcher { await this.publishEvent("onTaskUpdateFailure", event); } + /** + * Publish a TaskPaused event. + */ + async publishTaskPaused(event: TaskPaused): Promise { + await this.publishEvent("onTaskPaused", event); + } + /** * Internal method to publish events to all registered listeners. * Listener failures are caught and logged to prevent affecting task execution. diff --git a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts index bc2f2dc6..c8b31634 100644 --- a/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts +++ b/src/sdk/clients/worker/events/__tests__/EventDispatcher.test.ts @@ -9,6 +9,7 @@ import type { TaskExecutionFailure, TaskUpdateCompleted, TaskUpdateFailure, + TaskPaused, } from "../types"; describe("EventDispatcher", () => { @@ -143,6 +144,7 @@ describe("EventDispatcher", () => { onTaskExecutionFailure: jest.fn<() => void>(), onTaskUpdateCompleted: jest.fn<() => void>(), onTaskUpdateFailure: jest.fn<() => void>(), + onTaskPaused: jest.fn<() => void>(), }; dispatcher.register(listener); @@ -225,12 +227,21 @@ describe("EventDispatcher", () => { workerId: "worker-1", workflowInstanceId: "workflow-1", cause: new Error("Update failed"), + durationMs: 500, retryCount: 4, taskResult: { status: "COMPLETED" }, timestamp: new Date(), }; await dispatcher.publishTaskUpdateFailure(updateFailure); expect(listener.onTaskUpdateFailure).toHaveBeenCalledWith(updateFailure); + + // Test TaskPaused + const taskPaused: TaskPaused = { + taskType: "test-task", + timestamp: new Date(), + }; + await dispatcher.publishTaskPaused(taskPaused); + expect(listener.onTaskPaused).toHaveBeenCalledWith(taskPaused); }); test("should have zero overhead when no listeners registered", async () => { diff --git a/src/sdk/clients/worker/events/types.ts b/src/sdk/clients/worker/events/types.ts index 7f12b2bf..14554aff 100644 --- a/src/sdk/clients/worker/events/types.ts +++ b/src/sdk/clients/worker/events/types.ts @@ -114,6 +114,8 @@ export interface TaskUpdateFailure extends TaskRunnerEvent { workflowInstanceId?: string; /** The error that caused the final update failure */ cause: Error; + /** Time taken for the last update attempt in milliseconds */ + durationMs?: number; /** Number of retry attempts made */ retryCount: number; /** The TaskResult object that failed to update (for recovery/logging) */ @@ -134,6 +136,14 @@ export interface TaskUpdateCompleted extends TaskRunnerEvent { durationMs: number; } +/** + * Event published when a poll cycle is skipped because the worker is paused. + */ +// eslint-disable-next-line @typescript-eslint/no-empty-object-type +export interface TaskPaused extends TaskRunnerEvent { + // No additional fields — taskType is inherited from TaskRunnerEvent. +} + /** * Union type of all task runner events. */ @@ -145,4 +155,5 @@ export type TaskRunnerEventType = | TaskExecutionCompleted | TaskExecutionFailure | TaskUpdateCompleted - | TaskUpdateFailure; + | TaskUpdateFailure + | TaskPaused; diff --git a/src/sdk/clients/worker/types.ts b/src/sdk/clients/worker/types.ts index b56eed5c..3aa0db61 100644 --- a/src/sdk/clients/worker/types.ts +++ b/src/sdk/clients/worker/types.ts @@ -86,6 +86,8 @@ export interface PollerOptions { adaptiveBackoff?: boolean; /** Whether this poller is paused (default: false) */ paused?: boolean; + /** Callback invoked each time a poll cycle is skipped because the poller is paused */ + onPaused?: () => void; } /** diff --git a/src/sdk/clients/workflow/WorkflowExecutor.ts b/src/sdk/clients/workflow/WorkflowExecutor.ts index a34188e2..17cd25f7 100644 --- a/src/sdk/clients/workflow/WorkflowExecutor.ts +++ b/src/sdk/clients/workflow/WorkflowExecutor.ts @@ -30,6 +30,7 @@ import { enhanceSignalResponse } from "./helpers/enhanceSignalResponse"; import { reverseFind } from "./helpers/reverseFind"; import { isCompletedTaskMatchingType } from "./helpers/isCompletedTaskMatchingType"; import { RETRY_TIME_IN_MILLISECONDS } from "./constants"; +import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; export class WorkflowExecutor { public readonly _client: Client; @@ -71,6 +72,22 @@ export class WorkflowExecutor { public async startWorkflow( workflowRequest: StartWorkflowRequest ): Promise { + const observer = getHttpMetricsObserver(); + if (observer?.measurePayloadSize && workflowRequest.input) { + try { + const json = JSON.stringify(workflowRequest.input); + const inputBytes = Buffer.byteLength(json, "utf8"); + observer.recordWorkflowInputSize( + workflowRequest.name ?? "", + inputBytes, + workflowRequest.version != null + ? String(workflowRequest.version) + : undefined, + ); + } catch { + // Best-effort — don't let metrics break workflow start. + } + } try { const { data } = await WorkflowResource.startWorkflow({ body: workflowRequest, @@ -80,6 +97,16 @@ export class WorkflowExecutor { return data; } catch (error: unknown) { + if (observer) { + const excName = + error instanceof Error + ? error.name || error.constructor?.name || "Error" + : "Error"; + observer.recordWorkflowStartError( + workflowRequest.name ?? "", + excName, + ); + } handleSdkError(error, "Failed to start workflow"); } } diff --git a/src/sdk/createConductorClient/createConductorClient.ts b/src/sdk/createConductorClient/createConductorClient.ts index 51e460c7..e6815811 100644 --- a/src/sdk/createConductorClient/createConductorClient.ts +++ b/src/sdk/createConductorClient/createConductorClient.ts @@ -7,6 +7,7 @@ import { import type { OrkesApiConfig } from "../types"; import { createClient } from "../../open-api/generated/client"; import { addResourcesBackwardCompatibility } from "./helpers/addResourcesBackwardCompatibility"; +import { createMetricsInterceptors } from "./helpers/metricsInterceptors"; /** * Creates a Conductor client with authentication and configuration @@ -34,6 +35,7 @@ export const createConductorClient = async ( proxyUrl, tlsInsecure, disableHttp2, + retryServerErrors, } = resolveOrkesConfig(config); if (!serverUrl) throw new Error("Conductor server URL is not set"); @@ -52,10 +54,13 @@ export const createConductorClient = async ( // Start with retry + timeout on fetch (no auth failure callback yet) const openApiClient = createClient({ baseUrl: serverUrl, - fetch: wrapFetchWithRetry(baseFetchFn, { requestTimeoutMs }), + fetch: wrapFetchWithRetry(baseFetchFn, { requestTimeoutMs, retryServerErrors }), throwOnError: true, }); + const { onRequest } = createMetricsInterceptors(); + openApiClient.interceptors.request.use(onRequest); + let authResult: Awaited> | undefined; if (keyId && keySecret) { authResult = await handleAuth( @@ -74,6 +79,7 @@ export const createConductorClient = async ( fetch: wrapFetchWithRetry(baseFetchFn, { onAuthFailure: authResult.refreshToken, requestTimeoutMs, + retryServerErrors, }), }); } diff --git a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts index 76f73f9b..1799cada 100644 --- a/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts +++ b/src/sdk/createConductorClient/helpers/__tests__/fetchWithRetry.test.ts @@ -1,5 +1,7 @@ -import { jest, expect, describe, it, beforeEach } from "@jest/globals"; +import { jest, expect, describe, it, beforeEach, afterEach } from "@jest/globals"; import { retryFetch, wrapFetchWithRetry, applyTimeout } from "../fetchWithRetry"; +import * as httpObserver from "@/sdk/worker/metrics/httpObserver"; +import { requestTemplateMap } from "../metricsInterceptors"; const createMockResponse = (status: number, body = ""): Response => new Response(body, { status, statusText: `Status ${status}` }); @@ -180,6 +182,138 @@ describe("fetchWithRetry", () => { }); }); + // ─── Server error (502/503/504) retry ─────────────────────────────── + + describe("server error (502/503/504) retry", () => { + it("should NOT retry 502 by default (retryServerErrors unset)", async () => { + mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + }); + + expect(result.status).toBe(502); + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("should retry 502 and succeed on next attempt", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + retryServerErrors: true, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should retry 503 and succeed on next attempt", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(503, "Service Unavailable")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + retryServerErrors: true, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should retry 504 and succeed on next attempt", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(504, "Gateway Timeout")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + retryServerErrors: true, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should exhaust retries and return last 5xx response", async () => { + mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 2, + initialRetryDelay: 1, + retryServerErrors: true, + }); + + expect(result.status).toBe(502); + // 1 initial + 2 retries = 3 + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + + it("should NOT retry 502 for POST requests (non-idempotent)", async () => { + mockFetch.mockResolvedValue(createMockResponse(502, "Bad Gateway")); + + const result = await retryFetch("http://test.com", { method: "POST" }, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + retryServerErrors: true, + }); + + expect(result.status).toBe(502); + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("should NOT retry 503 for PATCH requests (non-idempotent)", async () => { + mockFetch.mockResolvedValue(createMockResponse(503, "Service Unavailable")); + + const result = await retryFetch("http://test.com", { method: "PATCH" }, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + retryServerErrors: true, + }); + + expect(result.status).toBe(503); + expect(mockFetch).toHaveBeenCalledTimes(1); + }); + + it("should retry 502 for PUT requests (idempotent)", async () => { + mockFetch + .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", { method: "PUT" }, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + retryServerErrors: true, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(2); + }); + + it("should handle transport error then 502 then success", async () => { + mockFetch + .mockRejectedValueOnce(new Error("ECONNRESET")) + .mockResolvedValueOnce(createMockResponse(502, "Bad Gateway")) + .mockResolvedValueOnce(createMockResponse(200, "ok")); + + const result = await retryFetch("http://test.com", {}, mockFetch, { + maxTransportRetries: 3, + initialRetryDelay: 1, + retryServerErrors: true, + }); + + expect(result.status).toBe(200); + expect(mockFetch).toHaveBeenCalledTimes(3); + }); + }); + // ─── Auth failure (401/403) retry ────────────────────────────────── describe("auth failure (401/403) retry", () => { @@ -525,4 +659,150 @@ describe("fetchWithRetry", () => { expect(result.status).toBe(200); }); }); + + // ─── wrapFetchWithRetry metrics recording ─────────────────────────── + + describe("wrapFetchWithRetry metrics", () => { + const mockRecordApiRequestTime = jest.fn< + (m: string, u: string, s: string, d: number, t?: string) => void + >(); + + beforeEach(() => { + mockRecordApiRequestTime.mockClear(); + httpObserver.setHttpMetricsObserver({ + measurePayloadSize: false, + recordApiRequestTime: mockRecordApiRequestTime, + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }); + }); + + afterEach(() => { + httpObserver.setHttpMetricsObserver(undefined); + }); + + it("should record metrics on successful response", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch("http://test.com/api/tasks", { method: "POST" }); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri, status, duration] = + mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("POST"); + expect(uri).toBe("/api/tasks"); + expect(status).toBe("200"); + expect(duration).toBeGreaterThanOrEqual(0); + }); + + it("should record status '0' on network failure", async () => { + mockFetch.mockRejectedValue(new Error("ECONNRESET")); + + const wrappedFetch = wrapFetchWithRetry(mockFetch, { + maxTransportRetries: 0, + }); + + await expect( + wrappedFetch("http://test.com/api/workflow") + ).rejects.toThrow("ECONNRESET"); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri, status] = + mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("GET"); + expect(uri).toBe("/api/workflow"); + expect(status).toBe("0"); + }); + + it("should extract method and URI from Request object on network failure", async () => { + mockFetch.mockRejectedValue(new Error("ECONNRESET")); + + const wrappedFetch = wrapFetchWithRetry(mockFetch, { + maxTransportRetries: 0, + }); + const request = new Request("http://example.com/api/metadata", { + method: "DELETE", + }); + + await expect(wrappedFetch(request)).rejects.toThrow("ECONNRESET"); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [method, uri, status] = mockRecordApiRequestTime.mock.calls[0]; + expect(method).toBe("DELETE"); + expect(uri).toBe("/api/metadata"); + expect(status).toBe("0"); + }); + + it("should pass template from requestTemplateMap on success", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const request = new Request("http://host/api/workflow/abc-123", { + method: "GET", + }); + requestTemplateMap.set(request, "/workflow/{workflowId}"); + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch(request); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [, , , , template] = mockRecordApiRequestTime.mock.calls[0]; + expect(template).toBe("/workflow/{workflowId}"); + }); + + it("should pass template from requestTemplateMap on network failure", async () => { + mockFetch.mockRejectedValue(new Error("ECONNRESET")); + + const request = new Request("http://host/api/workflow/abc-123", { + method: "GET", + }); + requestTemplateMap.set(request, "/workflow/{workflowId}"); + + const wrappedFetch = wrapFetchWithRetry(mockFetch, { + maxTransportRetries: 0, + }); + + await expect(wrappedFetch(request)).rejects.toThrow("ECONNRESET"); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [, , , , template] = mockRecordApiRequestTime.mock.calls[0]; + expect(template).toBe("/workflow/{workflowId}"); + }); + + it("should pass undefined template when requestTemplateMap has no entry", async () => { + mockFetch.mockResolvedValue(createMockResponse(200)); + + const request = new Request("http://host/api/workflow/abc-123", { + method: "GET", + }); + // Do NOT set requestTemplateMap + + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch(request); + + expect(mockRecordApiRequestTime).toHaveBeenCalledTimes(1); + const [, , , , template] = mockRecordApiRequestTime.mock.calls[0]; + expect(template).toBeUndefined(); + }); + + it("should not break fetch when no observer is registered", async () => { + httpObserver.setHttpMetricsObserver(undefined); + + mockFetch.mockResolvedValue(createMockResponse(200)); + const wrappedFetch = wrapFetchWithRetry(mockFetch); + const result = await wrappedFetch("http://test.com"); + + expect(result.status).toBe(200); + }); + + it("should not record metrics when no observer is registered", async () => { + httpObserver.setHttpMetricsObserver(undefined); + + mockFetch.mockResolvedValue(createMockResponse(200)); + const wrappedFetch = wrapFetchWithRetry(mockFetch); + await wrappedFetch("http://test.com"); + + expect(mockRecordApiRequestTime).not.toHaveBeenCalled(); + }); + }); }); diff --git a/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts new file mode 100644 index 00000000..24bfcdd8 --- /dev/null +++ b/src/sdk/createConductorClient/helpers/__tests__/metricsInterceptors.test.ts @@ -0,0 +1,95 @@ +import { expect, describe, it } from "@jest/globals"; +import { + createMetricsInterceptors, + requestTemplateMap, +} from "../metricsInterceptors"; +import type { ResolvedRequestOptions } from "@/open-api/generated/client/types.gen"; + +const fakeOpts = (url: string): ResolvedRequestOptions => + ({ url } as unknown as ResolvedRequestOptions); + +describe("metricsInterceptors", () => { + it("should stash stripped template in requestTemplateMap", () => { + const { onRequest } = createMetricsInterceptors(); + + const request = new Request("http://conductor.example.com/api/workflow/abc-123", { + method: "GET", + }); + const opts = fakeOpts("/api/workflow/{workflowId}"); + + onRequest(request, opts); + + expect(requestTemplateMap.get(request)).toBe("/workflow/{workflowId}"); + }); + + it("should strip /api prefix from path template", () => { + const { onRequest } = createMetricsInterceptors(); + + const request = new Request("http://host/api/tasks/poll/myTask", { + method: "POST", + }); + const opts = fakeOpts("/api/tasks/poll/{taskType}"); + + onRequest(request, opts); + + expect(requestTemplateMap.get(request)).toBe("/tasks/poll/{taskType}"); + }); + + it("should not strip prefix if path does not start with /api/", () => { + const { onRequest } = createMetricsInterceptors(); + + const request = new Request("http://host/health", { method: "GET" }); + const opts = fakeOpts("/health"); + + onRequest(request, opts); + + expect(requestTemplateMap.get(request)).toBe("/health"); + }); + + it("should return the request unchanged", () => { + const { onRequest } = createMetricsInterceptors(); + + const request = new Request("http://host/api/workflow", { method: "GET" }); + const opts = fakeOpts("/api/workflow"); + + const returned = onRequest(request, opts); + expect(returned).toBe(request); + }); + + it("should normalize lowercase {tasktype} to {taskType}", () => { + const { onRequest } = createMetricsInterceptors(); + + const request = new Request("http://host/api/tasks/poll/batch/my_task", { + method: "GET", + }); + const opts = fakeOpts("/api/tasks/poll/batch/{tasktype}"); + + onRequest(request, opts); + + expect(requestTemplateMap.get(request)).toBe("/tasks/poll/batch/{taskType}"); + }); + + it("should leave already-camelCase placeholders unchanged", () => { + const { onRequest } = createMetricsInterceptors(); + + const request = new Request("http://host/api/workflow/abc-123", { + method: "GET", + }); + const opts = fakeOpts("/api/workflow/{workflowId}"); + + onRequest(request, opts); + + expect(requestTemplateMap.get(request)).toBe("/workflow/{workflowId}"); + }); + + it("should not stash when opts.url is not a string", () => { + const { onRequest } = createMetricsInterceptors(); + + const request = new Request("http://host/api/workflow", { method: "GET" }); + const opts = {} as unknown as ResolvedRequestOptions; + + onRequest(request, opts); + + expect(requestTemplateMap.has(request)).toBe(false); + }); +}); diff --git a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts index 733313cd..d56776d3 100644 --- a/src/sdk/createConductorClient/helpers/fetchWithRetry.ts +++ b/src/sdk/createConductorClient/helpers/fetchWithRetry.ts @@ -1,3 +1,6 @@ +import { getHttpMetricsObserver } from "../../worker/metrics/httpObserver"; +import { requestTemplateMap } from "./metricsInterceptors"; + type Input = Parameters[0]; type Init = Parameters[1]; @@ -7,6 +10,7 @@ export interface RetryFetchOptions { maxRateLimitRetries?: number; // default 5 maxTransportRetries?: number; // default 3 initialRetryDelay?: number; // default 1000ms + retryServerErrors?: boolean; // retry 502/503/504 for idempotent methods (default false) } const isTimeoutError = (error: unknown): boolean => { @@ -95,6 +99,8 @@ const withJitter = (delayMs: number): number => { return Math.max(0, Math.round(delayMs + jitter)); }; +const IDEMPOTENT_METHODS = new Set(["GET", "HEAD", "OPTIONS", "PUT", "DELETE"]); + export const retryFetch = async ( input: Input, init: Init, @@ -157,6 +163,24 @@ export const retryFetch = async ( return rateLimitResponse; } + // Gateway error retry (502, 503, 504) -- opt-in via retryServerErrors. + // Only retries idempotent methods to avoid duplicate side effects when + // the upstream may have processed the request. + if (options.retryServerErrors && response.status >= 502 && response.status <= 504) { + const reqMethod = (input instanceof Request + ? input.method + : init?.method ?? "GET" + ).toUpperCase(); + if (IDEMPOTENT_METHODS.has(reqMethod) && transportAttempt < maxTransportRetries) { + lastError = new Error(`Server error: HTTP ${response.status}`); + await new Promise((resolve) => + setTimeout(resolve, withJitter(initialRetryDelay * (transportAttempt + 1))) + ); + continue; + } + return response; + } + // Auth failure retry (401/403) - only refresh+retry when the error is a token // problem (EXPIRED_TOKEN or INVALID_TOKEN). Permission errors should propagate // immediately without wasting a token refresh + retry round-trip. @@ -184,11 +208,53 @@ export const retryFetch = async ( throw lastError ?? new Error("Fetch retry exhausted"); }; +const extractRequestInfo = (input: Input, init?: Init) => { + const method = input instanceof Request + ? input.method + : (init?.method ?? "GET"); + let uri = ""; + try { + uri = new URL( + input instanceof Request ? input.url : String(input), + ).pathname; + } catch { + uri = String(input); + } + return { method, uri }; +}; + export const wrapFetchWithRetry = ( fetchFn: typeof fetch, - options?: RetryFetchOptions + options?: RetryFetchOptions, ): typeof fetch => { return (input: Input, init?: Init): Promise => { - return retryFetch(input, init, fetchFn, options); + const observer = getHttpMetricsObserver(); + if (!observer) { + return retryFetch(input, init, fetchFn, options); + } + + const start = performance.now(); + const template = input instanceof Request + ? requestTemplateMap.get(input) + : undefined; + + return retryFetch(input, init, fetchFn, options).then( + (response) => { + const durationMs = performance.now() - start; + const { method, uri } = extractRequestInfo(input, init); + observer.recordApiRequestTime( + method, uri, String(response.status), durationMs, template, + ); + return response; + }, + (error) => { + const durationMs = performance.now() - start; + const { method, uri } = extractRequestInfo(input, init); + observer.recordApiRequestTime( + method, uri, "0", durationMs, template, + ); + throw error; + }, + ); }; }; diff --git a/src/sdk/createConductorClient/helpers/metricsInterceptors.ts b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts new file mode 100644 index 00000000..146bacdf --- /dev/null +++ b/src/sdk/createConductorClient/helpers/metricsInterceptors.ts @@ -0,0 +1,64 @@ +import type { ResolvedRequestOptions } from "../../../open-api/generated/client/types.gen"; + +/** + * Strips the /api prefix that the OpenAPI spec bakes into every path. + * Other SDKs (Go, Java, Python, Rust, Ruby, C#) keep /api in the base URL + * and use clean paths like /workflow/{workflowId}. The JS SDK's generated + * code does it backwards: the base URL has /api stripped and every path + * template starts with /api/. This normalises back to the cross-SDK + * convention. + */ +const stripApiPrefix = (url: string): string => + url.startsWith("/api/") ? url.slice(4) : url; + +/** + * The OpenAPI spec uses all-lowercase path parameter names in a few + * templates (e.g. `{tasktype}` instead of `{taskType}`). This map + * normalizes them to camelCase so the canonical `uri` metric label + * is consistent across all endpoints. + */ +const CANONICAL_PARAM_NAMES: Record = { + tasktype: "taskType", +}; + +const normalizePlaceholders = (url: string): string => + url.replace(/\{(\w+)\}/g, (_m, name: string) => + `{${CANONICAL_PARAM_NAMES[name] ?? name}}`, + ); + +/** + * Maps each Request to its OpenAPI path template. The request interceptor + * stashes the template here so the fetch wrapper (which only receives the + * Request object) can pass bounded-cardinality URI labels to the metrics + * observer for both success and error paths. + * + * Entries are garbage-collected with the Request. + */ +export const requestTemplateMap = new WeakMap(); + +/** + * Creates a request interceptor that captures the OpenAPI path template + * for each request. The template is stored in {@link requestTemplateMap} + * and read by `wrapFetchWithRetry` when recording HTTP metrics. + * + * Timing and metric recording are handled entirely in the fetch wrapper; + * the interceptor's only job is to bridge the path template from `opts` + * (available to interceptors) to the fetch layer (which only sees the + * Request object). + */ +export function createMetricsInterceptors() { + const onRequest = ( + request: Request, + opts: ResolvedRequestOptions, + ): Request => { + if (typeof opts.url === "string") { + requestTemplateMap.set( + request, + normalizePlaceholders(stripApiPrefix(opts.url)), + ); + } + return request; + }; + + return { onRequest }; +} diff --git a/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts b/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts index b11c9def..42d74744 100644 --- a/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts +++ b/src/sdk/createConductorClient/helpers/resolveOrkesConfig.ts @@ -67,5 +67,8 @@ export const resolveOrkesConfig = (config?: Partial) => { disableHttp2: parseEnvBoolean(process.env.CONDUCTOR_DISABLE_HTTP2) ?? config?.disableHttp2, + retryServerErrors: + parseEnvBoolean(process.env.CONDUCTOR_RETRY_SERVER_ERRORS) ?? + config?.retryServerErrors, }; }; diff --git a/src/sdk/types.ts b/src/sdk/types.ts index 81f988c9..fc3beffd 100644 --- a/src/sdk/types.ts +++ b/src/sdk/types.ts @@ -20,4 +20,5 @@ export interface OrkesApiConfig { proxyUrl?: string; // HTTP/HTTPS proxy URL tlsInsecure?: boolean; // disable TLS certificate verification (for self-signed certs) disableHttp2?: boolean; // force HTTP/1.1 instead of HTTP/2 + retryServerErrors?: boolean; // retry 502/503/504 for idempotent methods (default false) } diff --git a/src/sdk/worker/core/TaskHandler.ts b/src/sdk/worker/core/TaskHandler.ts index 2c0dd083..5bfcda5f 100644 --- a/src/sdk/worker/core/TaskHandler.ts +++ b/src/sdk/worker/core/TaskHandler.ts @@ -140,6 +140,7 @@ export class TaskHandler { private restartAttempts = new Map(); // runner index → attempt count private healthMonitorConfig: HealthMonitorConfig; + /** * Create a TaskHandler instance with async module imports. * Use this instead of `new TaskHandler()` when using `importModules`. diff --git a/src/sdk/worker/index.ts b/src/sdk/worker/index.ts index 35e1fed9..7605d589 100644 --- a/src/sdk/worker/index.ts +++ b/src/sdk/worker/index.ts @@ -17,7 +17,11 @@ export { TaskContext, getTaskContext } from "./context"; // Metrics export { MetricsCollector, + LegacyMetricsCollector, + CanonicalMetricsCollector, + createMetricsCollector, MetricsServer, + type MetricsCollectorInterface, type MetricsCollectorConfig, type WorkerMetrics, } from "./metrics"; diff --git a/src/sdk/worker/metrics/CanonicalMetricsCollector.ts b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts new file mode 100644 index 00000000..c5df73c0 --- /dev/null +++ b/src/sdk/worker/metrics/CanonicalMetricsCollector.ts @@ -0,0 +1,530 @@ +import type { + PollStarted, + PollCompleted, + PollFailure, + TaskExecutionStarted, + TaskExecutionCompleted, + TaskExecutionFailure, + TaskUpdateCompleted, + TaskUpdateFailure, + TaskPaused, +} from "../../clients/worker/events"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; +import type { MetricsCollectorConfig } from "./LegacyMetricsCollector"; +import { getHttpMetricsObserver, setHttpMetricsObserver } from "./httpObserver"; +import { + HistogramAccumulator, + MultiLabelCounter, + GaugeMetric, + TIME_BUCKETS, + SIZE_BUCKETS, + exceptionLabel, +} from "./accumulators"; + +interface CanonicalMetricState { + // Counters (12) + taskPollTotal: MultiLabelCounter; + taskExecutionStartedTotal: MultiLabelCounter; + taskPollErrorTotal: MultiLabelCounter; + taskExecuteErrorTotal: MultiLabelCounter; + taskUpdateErrorTotal: MultiLabelCounter; + taskAckErrorTotal: MultiLabelCounter; + taskAckFailedTotal: MultiLabelCounter; + taskExecutionQueueFullTotal: MultiLabelCounter; + taskPausedTotal: MultiLabelCounter; + threadUncaughtExceptionsTotal: MultiLabelCounter; + externalPayloadUsedTotal: MultiLabelCounter; + workflowStartErrorTotal: MultiLabelCounter; + + // Time histograms (4) — seconds + taskPollTimeSeconds: HistogramAccumulator; + taskExecuteTimeSeconds: HistogramAccumulator; + taskUpdateTimeSeconds: HistogramAccumulator; + httpApiClientRequestSeconds: HistogramAccumulator; + + // Size histograms (2) — bytes + taskResultSizeBytes: HistogramAccumulator; + workflowInputSizeBytes: HistogramAccumulator; + + // Gauge (1) + activeWorkers: GaugeMetric; +} + +/** + * Canonical metrics collector. + * + * Emits unprefixed Prometheus metrics with camelCase labels, second-based + * time units, and Histogram type for distributions, per the cross-SDK + * canonical metric catalog. + * + * Selected when WORKER_CANONICAL_METRICS=true. + */ +export class CanonicalMetricsCollector implements MetricsCollectorInterface { + private state: CanonicalMetricState; + private _server?: import("./MetricsServer.js").MetricsServer; + private _fileTimer?: ReturnType; + private _promRegistry?: import("./CanonicalPrometheusRegistry.js").CanonicalPrometheusRegistry; + readonly measurePayloadSize: boolean; + + constructor(config?: MetricsCollectorConfig) { + this.state = this.createEmpty(); + this.measurePayloadSize = config?.measurePayloadSize ?? true; + if (config?.usePromClient) { + void this.initPromClient(); + } + if (config?.httpPort) { + void this.startServer(config.httpPort); + } + if (config?.filePath) { + this.startFileWriter( + config.filePath, + config.fileWriteIntervalMs ?? 5000, + ); + } + setHttpMetricsObserver(this); + } + + private async initPromClient(): Promise { + const { CanonicalPrometheusRegistry } = await import( + "./CanonicalPrometheusRegistry.js" + ); + this._promRegistry = new CanonicalPrometheusRegistry(); + await this._promRegistry.initialize(); + } + + private async startServer(port: number): Promise { + const { MetricsServer } = await import("./MetricsServer.js"); + this._server = new MetricsServer(this, port); + await this._server.start(); + } + + private startFileWriter(filePath: string, intervalMs: number): void { + const doWrite = async () => { + try { + const { writeFile } = await import("node:fs/promises"); + await writeFile(filePath, this.toPrometheusText(), "utf-8"); + } catch { + // Silently ignore file write errors + } + }; + void doWrite(); + this._fileTimer = setInterval(doWrite, intervalMs); + if (typeof this._fileTimer === "object" && "unref" in this._fileTimer) { + this._fileTimer.unref(); + } + } + + private createEmpty(): CanonicalMetricState { + return { + taskPollTotal: new MultiLabelCounter(), + taskExecutionStartedTotal: new MultiLabelCounter(), + taskPollErrorTotal: new MultiLabelCounter(), + taskExecuteErrorTotal: new MultiLabelCounter(), + taskUpdateErrorTotal: new MultiLabelCounter(), + taskAckErrorTotal: new MultiLabelCounter(), + taskAckFailedTotal: new MultiLabelCounter(), + taskExecutionQueueFullTotal: new MultiLabelCounter(), + taskPausedTotal: new MultiLabelCounter(), + threadUncaughtExceptionsTotal: new MultiLabelCounter(), + externalPayloadUsedTotal: new MultiLabelCounter(), + workflowStartErrorTotal: new MultiLabelCounter(), + + taskPollTimeSeconds: new HistogramAccumulator(TIME_BUCKETS), + taskExecuteTimeSeconds: new HistogramAccumulator(TIME_BUCKETS), + taskUpdateTimeSeconds: new HistogramAccumulator(TIME_BUCKETS), + httpApiClientRequestSeconds: new HistogramAccumulator(TIME_BUCKETS), + + taskResultSizeBytes: new HistogramAccumulator(SIZE_BUCKETS), + workflowInputSizeBytes: new HistogramAccumulator(SIZE_BUCKETS), + + activeWorkers: new GaugeMetric(), + }; + } + + // ── Event Listener Methods ────────────────────────────────────── + + onPollStarted(event: PollStarted): void { + this.state.taskPollTotal.increment({ taskType: event.taskType }); + this._promRegistry?.incrementCounter("task_poll_total", { + taskType: event.taskType, + }); + } + + onPollCompleted(event: PollCompleted): void { + const seconds = event.durationMs / 1000; + this.state.taskPollTimeSeconds.observe( + { taskType: event.taskType, status: "SUCCESS" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_poll_time_seconds", { + taskType: event.taskType, + status: "SUCCESS", + }, seconds); + } + + onPollFailure(event: PollFailure): void { + const excName = exceptionLabel(event.cause); + this.state.taskPollErrorTotal.increment({ + taskType: event.taskType, + exception: excName, + }); + this._promRegistry?.incrementCounter("task_poll_error_total", { + taskType: event.taskType, + exception: excName, + }); + + const seconds = event.durationMs / 1000; + this.state.taskPollTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_poll_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); + } + + onTaskExecutionStarted(event: TaskExecutionStarted): void { + this.state.taskExecutionStartedTotal.increment({ + taskType: event.taskType, + }); + this._promRegistry?.incrementCounter("task_execution_started_total", { + taskType: event.taskType, + }); + this.state.activeWorkers.inc({ taskType: event.taskType }); + this._promRegistry?.setGauge( + "active_workers", + { taskType: event.taskType }, + this.state.activeWorkers.getValue({ taskType: event.taskType }), + ); + } + + onTaskExecutionCompleted(event: TaskExecutionCompleted): void { + const seconds = event.durationMs / 1000; + this.state.taskExecuteTimeSeconds.observe( + { taskType: event.taskType, status: "SUCCESS" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_execute_time_seconds", { + taskType: event.taskType, + status: "SUCCESS", + }, seconds); + + if (event.outputSizeBytes !== undefined) { + this.state.taskResultSizeBytes.observe( + { taskType: event.taskType }, + event.outputSizeBytes, + ); + this._promRegistry?.observeHistogram("task_result_size_bytes", { + taskType: event.taskType, + }, event.outputSizeBytes); + } + + this.state.activeWorkers.dec({ taskType: event.taskType }); + this._promRegistry?.setGauge( + "active_workers", + { taskType: event.taskType }, + Math.max(0, this.state.activeWorkers.getValue({ taskType: event.taskType })), + ); + } + + onTaskExecutionFailure(event: TaskExecutionFailure): void { + const excName = exceptionLabel(event.cause); + this.state.taskExecuteErrorTotal.increment({ + taskType: event.taskType, + exception: excName, + }); + this._promRegistry?.incrementCounter("task_execute_error_total", { + taskType: event.taskType, + exception: excName, + }); + + const seconds = event.durationMs / 1000; + this.state.taskExecuteTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_execute_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); + + this.state.activeWorkers.dec({ taskType: event.taskType }); + this._promRegistry?.setGauge( + "active_workers", + { taskType: event.taskType }, + Math.max(0, this.state.activeWorkers.getValue({ taskType: event.taskType })), + ); + } + + onTaskUpdateCompleted(event: TaskUpdateCompleted): void { + const seconds = event.durationMs / 1000; + this.state.taskUpdateTimeSeconds.observe( + { taskType: event.taskType, status: "SUCCESS" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_update_time_seconds", { + taskType: event.taskType, + status: "SUCCESS", + }, seconds); + } + + onTaskUpdateFailure(event: TaskUpdateFailure): void { + const excName = exceptionLabel(event.cause); + this.state.taskUpdateErrorTotal.increment({ + taskType: event.taskType, + exception: excName, + }); + this._promRegistry?.incrementCounter("task_update_error_total", { + taskType: event.taskType, + exception: excName, + }); + + if (event.durationMs != null) { + const seconds = event.durationMs / 1000; + this.state.taskUpdateTimeSeconds.observe( + { taskType: event.taskType, status: "FAILURE" }, + seconds, + ); + this._promRegistry?.observeHistogram("task_update_time_seconds", { + taskType: event.taskType, + status: "FAILURE", + }, seconds); + } + } + + onTaskPaused(event: TaskPaused): void { + this.recordTaskPaused(event.taskType); + } + + // ── Direct Recording Methods ─────────────────────────────────── + + recordTaskExecutionQueueFull(taskType: string): void { + this.state.taskExecutionQueueFullTotal.increment({ taskType }); + this._promRegistry?.incrementCounter("task_execution_queue_full_total", { + taskType, + }); + } + + recordUncaughtException(exception?: string): void { + const excName = exception ?? "Error"; + this.state.threadUncaughtExceptionsTotal.increment({ + exception: excName, + }); + this._promRegistry?.incrementCounter( + "thread_uncaught_exceptions_total", + { exception: excName }, + ); + } + + recordWorkerRestart(): void { + // Noop: worker_restart_total is N/A for the JS SDK (single-process model) + } + + recordTaskPaused(taskType: string): void { + this.state.taskPausedTotal.increment({ taskType }); + this._promRegistry?.incrementCounter("task_paused_total", { taskType }); + } + + recordTaskAckError(taskType: string, exception?: string): void { + const excName = exception ?? "Error"; + this.state.taskAckErrorTotal.increment({ taskType, exception: excName }); + this._promRegistry?.incrementCounter("task_ack_error_total", { + taskType, + exception: excName, + }); + } + + recordTaskAckFailed(taskType: string): void { + this.state.taskAckFailedTotal.increment({ taskType }); + this._promRegistry?.incrementCounter("task_ack_failed_total", { + taskType, + }); + } + + recordWorkflowStartError(workflowType?: string, exception?: string): void { + const wfType = workflowType ?? ""; + const excName = exception ?? "Error"; + this.state.workflowStartErrorTotal.increment({ + workflowType: wfType, + exception: excName, + }); + this._promRegistry?.incrementCounter("workflow_start_error_total", { + workflowType: wfType, + exception: excName, + }); + } + + recordExternalPayloadUsed( + payloadType: string, + entityName?: string, + operation?: string, + ): void { + this.state.externalPayloadUsedTotal.increment({ + entityName: entityName ?? "", + operation: operation ?? "", + payloadType, + }); + this._promRegistry?.incrementCounter("external_payload_used_total", { + entityName: entityName ?? "", + operation: operation ?? "", + payloadType, + }); + } + + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + version?: string, + ): void { + this.state.workflowInputSizeBytes.observe( + { workflowType, version: version ?? "" }, + sizeBytes, + ); + this._promRegistry?.observeHistogram("workflow_input_size_bytes", { + workflowType, + version: version ?? "", + }, sizeBytes); + } + + recordApiRequestTime( + method: string, + uri: string, + status: number | string, + durationMs: number, + metricUri?: string, + ): void { + const resolvedUri = metricUri ?? uri; + const statusStr = String(status); + const seconds = durationMs / 1000; + this.state.httpApiClientRequestSeconds.observe( + { method, uri: resolvedUri, status: statusStr }, + seconds, + ); + this._promRegistry?.observeHistogram("http_api_client_request_seconds", { + method, + uri: resolvedUri, + status: statusStr, + }, seconds); + } + + // ── Public API ────────────────────────────────────────────────── + + getMetrics(): Readonly { + return this.state; + } + + reset(): void { + this.state = this.createEmpty(); + } + + async stop(): Promise { + if (getHttpMetricsObserver() === this) { + setHttpMetricsObserver(undefined); + } + if (this._fileTimer) { + clearInterval(this._fileTimer); + this._fileTimer = undefined; + } + if (this._server) { + await this._server.stop(); + this._server = undefined; + } + } + + collectorName(): string { + return "canonical"; + } + + getContentType(): string { + return ( + this._promRegistry?.contentType ?? + "text/plain; version=0.0.4; charset=utf-8" + ); + } + + async toPrometheusTextAsync(): Promise { + if (this._promRegistry?.available) { + return this._promRegistry.metrics(); + } + return this.toPrometheusText(); + } + + toPrometheusText(_prefix?: string): string { + const lines: string[] = []; + + // ── Counters ── + const counterDefs: { + name: string; + help: string; + counter: MultiLabelCounter; + }[] = [ + { name: "task_poll_total", help: "Total number of task polls", counter: this.state.taskPollTotal }, + { name: "task_execution_started_total", help: "Total number of task executions started", counter: this.state.taskExecutionStartedTotal }, + { name: "task_poll_error_total", help: "Total number of task poll errors", counter: this.state.taskPollErrorTotal }, + { name: "task_execute_error_total", help: "Total number of task execution errors", counter: this.state.taskExecuteErrorTotal }, + { name: "task_update_error_total", help: "Total number of task update errors", counter: this.state.taskUpdateErrorTotal }, + { name: "task_ack_error_total", help: "Total number of task ack errors", counter: this.state.taskAckErrorTotal }, + { name: "task_ack_failed_total", help: "Total number of task ack failures (server declined)", counter: this.state.taskAckFailedTotal }, + { name: "task_execution_queue_full_total", help: "Total number of task execution queue full events", counter: this.state.taskExecutionQueueFullTotal }, + { name: "task_paused_total", help: "Total number of task paused events", counter: this.state.taskPausedTotal }, + { name: "thread_uncaught_exceptions_total", help: "Total uncaught exceptions", counter: this.state.threadUncaughtExceptionsTotal }, + { name: "external_payload_used_total", help: "Total external payload usage", counter: this.state.externalPayloadUsedTotal }, + { name: "workflow_start_error_total", help: "Total workflow start errors", counter: this.state.workflowStartErrorTotal }, + ]; + + for (const def of counterDefs) { + const rendered = def.counter.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + // ── Time histograms ── + const timeHistogramDefs: { + name: string; + help: string; + histogram: HistogramAccumulator; + }[] = [ + { name: "task_poll_time_seconds", help: "Task poll duration in seconds", histogram: this.state.taskPollTimeSeconds }, + { name: "task_execute_time_seconds", help: "Task execution duration in seconds", histogram: this.state.taskExecuteTimeSeconds }, + { name: "task_update_time_seconds", help: "Task update duration in seconds", histogram: this.state.taskUpdateTimeSeconds }, + { name: "http_api_client_request_seconds", help: "HTTP API client request duration in seconds", histogram: this.state.httpApiClientRequestSeconds }, + ]; + + for (const def of timeHistogramDefs) { + const rendered = def.histogram.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + // ── Size histograms ── + const sizeHistogramDefs: { + name: string; + help: string; + histogram: HistogramAccumulator; + }[] = [ + { name: "task_result_size_bytes", help: "Task result output size in bytes", histogram: this.state.taskResultSizeBytes }, + { name: "workflow_input_size_bytes", help: "Workflow input payload size in bytes", histogram: this.state.workflowInputSizeBytes }, + ]; + + for (const def of sizeHistogramDefs) { + const rendered = def.histogram.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + // ── Gauges ── + const gaugeDefs: { + name: string; + help: string; + gauge: GaugeMetric; + }[] = [ + { name: "active_workers", help: "Number of workers actively executing tasks", gauge: this.state.activeWorkers }, + ]; + + for (const def of gaugeDefs) { + const rendered = def.gauge.render(def.name, def.help); + if (rendered) lines.push(rendered); + } + + lines.push(""); // trailing newline + return lines.join("\n"); + } +} diff --git a/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts b/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts new file mode 100644 index 00000000..c7ced41d --- /dev/null +++ b/src/sdk/worker/metrics/CanonicalPrometheusRegistry.ts @@ -0,0 +1,168 @@ +/** + * Optional prom-client adapter for canonical metrics. + * + * Registers Prometheus Counter, Histogram, and Gauge objects using the + * canonical metric names (unprefixed, camelCase labels, seconds/bytes units). + */ + +import { TIME_BUCKETS, SIZE_BUCKETS } from "./accumulators"; + +interface PromCounter { + inc(labels: Record, value?: number): void; +} +interface PromHistogram { + observe(labels: Record, value: number): void; +} +interface PromGauge { + set(labels: Record, value: number): void; +} +interface PromRegistry { + metrics(): Promise; + contentType: string; +} + +export class CanonicalPrometheusRegistry { + private _counters = new Map(); + private _histograms = new Map(); + private _gauges = new Map(); + private _registry?: PromRegistry; + private _available = false; + + async initialize(): Promise { + try { + const promClient = await import("prom-client"); + this._registry = promClient.register; + this.createMetrics(promClient); + this._available = true; + return true; + } catch { + this._available = false; + return false; + } + } + + get available(): boolean { + return this._available; + } + + get contentType(): string { + return ( + this._registry?.contentType ?? + "text/plain; version=0.0.4; charset=utf-8" + ); + } + + async metrics(): Promise { + if (!this._registry) return ""; + return this._registry.metrics(); + } + + incrementCounter( + name: string, + labels: Record, + value = 1, + ): void { + this._counters.get(name)?.inc(labels, value); + } + + observeHistogram( + name: string, + labels: Record, + value: number, + ): void { + this._histograms.get(name)?.observe(labels, value); + } + + setGauge( + name: string, + labels: Record, + value: number, + ): void { + this._gauges.get(name)?.set(labels, value); + } + + // eslint-disable-next-line @typescript-eslint/no-explicit-any + private createMetrics(promClient: any): void { + const Counter = promClient.Counter; + const Histogram = promClient.Histogram; + const Gauge = promClient.Gauge; + + const counterDefs: { + name: string; + help: string; + labels: string[]; + }[] = [ + { name: "task_poll_total", help: "Total task polls", labels: ["taskType"] }, + { name: "task_execution_started_total", help: "Total task executions started", labels: ["taskType"] }, + { name: "task_poll_error_total", help: "Total task poll errors", labels: ["taskType", "exception"] }, + { name: "task_execute_error_total", help: "Total task execution errors", labels: ["taskType", "exception"] }, + { name: "task_update_error_total", help: "Total task update errors", labels: ["taskType", "exception"] }, + { name: "task_ack_error_total", help: "Total task ack errors", labels: ["taskType", "exception"] }, + { name: "task_ack_failed_total", help: "Total task ack failures", labels: ["taskType"] }, + { name: "task_execution_queue_full_total", help: "Task execution queue full", labels: ["taskType"] }, + { name: "task_paused_total", help: "Task paused events", labels: ["taskType"] }, + { name: "thread_uncaught_exceptions_total", help: "Uncaught exceptions", labels: ["exception"] }, + { name: "external_payload_used_total", help: "External payload used", labels: ["entityName", "operation", "payloadType"] }, + { name: "workflow_start_error_total", help: "Workflow start errors", labels: ["workflowType", "exception"] }, + ]; + + for (const def of counterDefs) { + this._counters.set( + def.name, + new Counter({ + name: def.name, + help: def.help, + labelNames: def.labels, + }), + ); + } + + const timeBuckets = [...TIME_BUCKETS]; + const sizeBuckets = [...SIZE_BUCKETS]; + + const histogramDefs: { + name: string; + help: string; + labels: string[]; + buckets: number[]; + }[] = [ + { name: "task_poll_time_seconds", help: "Task poll duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets }, + { name: "task_execute_time_seconds", help: "Task execution duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets }, + { name: "task_update_time_seconds", help: "Task update duration (seconds)", labels: ["taskType", "status"], buckets: timeBuckets }, + { name: "http_api_client_request_seconds", help: "HTTP API client request duration (seconds)", labels: ["method", "uri", "status"], buckets: timeBuckets }, + { name: "task_result_size_bytes", help: "Task result size (bytes)", labels: ["taskType"], buckets: sizeBuckets }, + { name: "workflow_input_size_bytes", help: "Workflow input size (bytes)", labels: ["workflowType", "version"], buckets: sizeBuckets }, + ]; + + for (const def of histogramDefs) { + this._histograms.set( + def.name, + new Histogram({ + name: def.name, + help: def.help, + labelNames: def.labels, + buckets: def.buckets, + }), + ); + } + + const gaugeDefs: { + name: string; + help: string; + labels: string[]; + }[] = [ + { name: "active_workers", help: "Workers actively executing tasks", labels: ["taskType"] }, + ]; + + for (const def of gaugeDefs) { + this._gauges.set( + def.name, + new Gauge({ + name: def.name, + help: def.help, + labelNames: def.labels, + }), + ); + } + } +} diff --git a/src/sdk/worker/metrics/MetricsCollector.ts b/src/sdk/worker/metrics/LegacyMetricsCollector.ts similarity index 85% rename from src/sdk/worker/metrics/MetricsCollector.ts rename to src/sdk/worker/metrics/LegacyMetricsCollector.ts index ae3fe542..237c3c8d 100644 --- a/src/sdk/worker/metrics/MetricsCollector.ts +++ b/src/sdk/worker/metrics/LegacyMetricsCollector.ts @@ -1,5 +1,4 @@ import type { - TaskRunnerEventsListener, PollStarted, PollCompleted, PollFailure, @@ -9,6 +8,8 @@ import type { TaskUpdateCompleted, TaskUpdateFailure, } from "../../clients/worker/events"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; +import { getHttpMetricsObserver, setHttpMetricsObserver } from "./httpObserver"; /** * Configuration for MetricsCollector. @@ -32,10 +33,15 @@ export interface MetricsCollectorConfig { * Falls back to custom text format if prom-client is not installed. */ usePromClient?: boolean; + /** + * Measure workflow input payload size via JSON.stringify on each + * `startWorkflow` call. Default: true for canonical, false for legacy. + */ + measurePayloadSize?: boolean; } /** - * Collected worker metrics. + * Collected worker metrics (legacy shape). */ export interface WorkerMetrics { /** Total polls by taskType */ @@ -78,9 +84,6 @@ export interface WorkerMetrics { const QUANTILES = [0.5, 0.75, 0.9, 0.95, 0.99] as const; -/** - * Calculate quantiles from sorted array using linear interpolation. - */ function computeQuantile(sorted: number[], q: number): number { if (sorted.length === 0) return 0; if (sorted.length === 1) return sorted[0]; @@ -92,37 +95,26 @@ function computeQuantile(sorted: number[], q: number): number { } /** - * Built-in metrics collector implementing TaskRunnerEventsListener. - * - * Collects 19 metric types matching the Python SDK's MetricsCollector, - * with sliding-window quantile support (p50, p75, p90, p95, p99). - * - * @example - * ```typescript - * const metrics = new MetricsCollector({ httpPort: 9090 }); + * Legacy metrics collector. * - * const handler = new TaskHandler({ - * client, - * eventListeners: [metrics], - * }); - * - * await handler.startWorkers(); - * // GET http://localhost:9090/metrics — Prometheus format - * // GET http://localhost:9090/health — {"status":"UP"} - * ``` + * Emits prefixed Prometheus metrics with `task_type` labels, millisecond + * time units, and Summary type for distributions. This is the default + * implementation during the deprecation transition period. */ -export class MetricsCollector implements TaskRunnerEventsListener { +export class LegacyMetricsCollector implements MetricsCollectorInterface { private metrics: WorkerMetrics; private readonly _prefix: string; private readonly _slidingWindowSize: number; private _server?: import("./MetricsServer.js").MetricsServer; private _fileTimer?: ReturnType; private _promRegistry?: import("./PrometheusRegistry.js").PrometheusRegistry; + readonly measurePayloadSize: boolean; constructor(config?: MetricsCollectorConfig) { this.metrics = this.createEmptyMetrics(); this._prefix = config?.prefix ?? "conductor_worker"; this._slidingWindowSize = config?.slidingWindowSize ?? 1000; + this.measurePayloadSize = config?.measurePayloadSize ?? false; if (config?.usePromClient) { void this.initPromClient(); } @@ -132,7 +124,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { if (config?.filePath) { this.startFileWriter( config.filePath, - config.fileWriteIntervalMs ?? 5000 + config.fileWriteIntervalMs ?? 5000, ); } } @@ -158,10 +150,8 @@ export class MetricsCollector implements TaskRunnerEventsListener { // Silently ignore file write errors } }; - // Immediate first write, then periodic void doWrite(); this._fileTimer = setInterval(doWrite, intervalMs); - // Unref so the timer doesn't prevent process exit if (typeof this._fileTimer === "object" && "unref" in this._fileTimer) { this._fileTimer.unref(); } @@ -197,7 +187,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { private observe( map: Map, key: string, - value: number + value: number, ): void { let arr = map.get(key); if (!arr) { @@ -205,7 +195,6 @@ export class MetricsCollector implements TaskRunnerEventsListener { map.set(key, arr); } arr.push(value); - // Sliding window: keep only the last N observations if (arr.length > this._slidingWindowSize) { arr.splice(0, arr.length - this._slidingWindowSize); } @@ -215,7 +204,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { map: Map, key: string, promKey: string, - labelName: string + labelName: string, ): void { this.increment(map, key); this._promRegistry?.incrementCounter(promKey, { [labelName]: key }); @@ -226,7 +215,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { key: string, value: number, promKey: string, - labelName: string + labelName: string, ): void { this.observe(map, key, value); this._promRegistry?.observeSummary(promKey, { [labelName]: key }, value); @@ -248,7 +237,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { } onTaskExecutionStarted(_event: TaskExecutionStarted): void { - // Counted on completion + // Legacy counts on completion, not start } onTaskExecutionCompleted(event: TaskExecutionCompleted): void { @@ -271,59 +260,73 @@ export class MetricsCollector implements TaskRunnerEventsListener { onTaskUpdateFailure(event: TaskUpdateFailure): void { this.incrementCounter(this.metrics.taskUpdateFailureTotal, event.taskType, "update_error_total", "task_type"); + if (event.durationMs != null) { + this.observeSummary(this.metrics.updateDurationMs, event.taskType, event.durationMs, "update_time", "task_type"); + } } - // ── Direct Recording Methods (for code outside event system) ─── + onTaskPaused(): void { + // Intentional no-op: task_paused_total was never emitted by the legacy + // metrics surface. Keeping it unrecorded preserves byte-for-byte output + // compatibility. Users who want paused metrics should switch to canonical + // mode (WORKER_CANONICAL_METRICS=true). + } + + // ── Direct Recording Methods ─────────────────────────────────── - /** Record a task execution queue full event */ recordTaskExecutionQueueFull(taskType: string): void { this.incrementCounter(this.metrics.taskExecutionQueueFullTotal, taskType, "queue_full_total", "task_type"); } - /** Record an uncaught exception */ - recordUncaughtException(): void { + recordUncaughtException(_exception?: string): void { this.metrics.uncaughtExceptionTotal++; this._promRegistry?.incrementCounter("uncaught_total", {}); } - /** Record a worker restart */ recordWorkerRestart(): void { this.metrics.workerRestartTotal++; this._promRegistry?.incrementCounter("restart_total", {}); } - /** Record a task paused event */ recordTaskPaused(taskType: string): void { this.incrementCounter(this.metrics.taskPausedTotal, taskType, "paused_total", "task_type"); } - /** Record a task ack error */ - recordTaskAckError(taskType: string): void { + recordTaskAckError(taskType: string, _exception?: string): void { this.incrementCounter(this.metrics.taskAckErrorTotal, taskType, "ack_error_total", "task_type"); } - /** Record a workflow start error */ - recordWorkflowStartError(): void { + recordTaskAckFailed(_taskType: string): void { + // Noop: canonical-only metric (server declined ack) + } + + recordWorkflowStartError(_workflowType?: string, _exception?: string): void { this.metrics.workflowStartErrorTotal++; this._promRegistry?.incrementCounter("wf_start_error_total", {}); } - /** Record external payload usage */ - recordExternalPayloadUsed(payloadType: string): void { + recordExternalPayloadUsed( + payloadType: string, + _entityName?: string, + _operation?: string, + ): void { this.incrementCounter(this.metrics.externalPayloadUsedTotal, payloadType, "external_payload_total", "payload_type"); } - /** Record workflow input size */ - recordWorkflowInputSize(workflowType: string, sizeBytes: number): void { + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + _version?: string, + ): void { this.observeSummary(this.metrics.workflowInputSizeBytes, workflowType, sizeBytes, "wf_input_size", "workflow_type"); } - /** Record API request duration */ recordApiRequestTime( method: string, uri: string, - status: number, - durationMs: number + status: number | string, + durationMs: number, + _metricUri?: string, ): void { const key = `${method}:${uri}:${status}`; this.observeSummary(this.metrics.apiRequestDurationMs, key, durationMs, "api_request", "endpoint"); @@ -331,18 +334,18 @@ export class MetricsCollector implements TaskRunnerEventsListener { // ── Public API ────────────────────────────────────────────────── - /** Get a snapshot of all collected metrics */ getMetrics(): Readonly { return this.metrics; } - /** Reset all collected metrics */ reset(): void { this.metrics = this.createEmptyMetrics(); } - /** Stop the auto-started metrics HTTP server and file writer (if any) */ async stop(): Promise { + if (getHttpMetricsObserver() === this) { + setHttpMetricsObserver(undefined); + } if (this._fileTimer) { clearInterval(this._fileTimer); this._fileTimer = undefined; @@ -353,27 +356,17 @@ export class MetricsCollector implements TaskRunnerEventsListener { } } - /** - * Get the content type for the Prometheus metrics endpoint. - * Returns prom-client's content type when available, otherwise standard Prometheus text format. - */ + collectorName(): string { + return "legacy"; + } + getContentType(): string { - return this._promRegistry?.contentType ?? "text/plain; version=0.0.4; charset=utf-8"; + return ( + this._promRegistry?.contentType ?? + "text/plain; version=0.0.4; charset=utf-8" + ); } - /** - * Render all collected metrics in Prometheus exposition format. - * If prom-client is available and `usePromClient: true`, delegates to prom-client's registry. - * Otherwise uses built-in rendering with p50/p75/p90/p95/p99 quantiles. - * - * @param prefix - Metric name prefix (defaults to constructor config or "conductor_worker") - * @returns Prometheus text format string - */ - /** - * Async version of toPrometheusText. - * When prom-client is available, returns its native registry output. - * Otherwise falls back to the built-in text format. - */ async toPrometheusTextAsync(): Promise { if (this._promRegistry?.available) { return this._promRegistry.metrics(); @@ -454,7 +447,7 @@ export class MetricsCollector implements TaskRunnerEventsListener { lines.push(`# TYPE ${counter.name} counter`); for (const [label, value] of counter.data) { lines.push( - `${counter.name}{${counter.labelName}="${label}"} ${value}` + `${counter.name}{${counter.labelName}="${label}"} ${value}`, ); } } @@ -545,14 +538,14 @@ export class MetricsCollector implements TaskRunnerEventsListener { for (const q of QUANTILES) { const val = computeQuantile(sorted, q); lines.push( - `${summary.name}{${summary.labelName}="${label}",quantile="${q}"} ${val}` + `${summary.name}{${summary.labelName}="${label}",quantile="${q}"} ${val}`, ); } lines.push( - `${summary.name}_count{${summary.labelName}="${label}"} ${count}` + `${summary.name}_count{${summary.labelName}="${label}"} ${count}`, ); lines.push( - `${summary.name}_sum{${summary.labelName}="${label}"} ${sum}` + `${summary.name}_sum{${summary.labelName}="${label}"} ${sum}`, ); } } diff --git a/src/sdk/worker/metrics/MetricsCollectorInterface.ts b/src/sdk/worker/metrics/MetricsCollectorInterface.ts new file mode 100644 index 00000000..90ec8e8e --- /dev/null +++ b/src/sdk/worker/metrics/MetricsCollectorInterface.ts @@ -0,0 +1,54 @@ +import type { + TaskRunnerEventsListener, +} from "../../clients/worker/events"; + +/** + * Unified metrics collector interface. + * + * Both LegacyMetricsCollector and CanonicalMetricsCollector implement this + * interface so call sites never need to know which variant is active. + * Methods that only apply to one variant are noops in the other. + */ +export interface MetricsCollectorInterface extends TaskRunnerEventsListener { + // ── Direct recording methods (superset signatures) ───────────── + + recordTaskExecutionQueueFull(taskType: string): void; + recordUncaughtException(exception?: string): void; + recordWorkerRestart(): void; + recordTaskPaused(taskType: string): void; + recordTaskAckError(taskType: string, exception?: string): void; + /** Canonical-only: server declined ack (no exception). Legacy noops. */ + recordTaskAckFailed(taskType: string): void; + recordWorkflowStartError(workflowType?: string, exception?: string): void; + recordExternalPayloadUsed( + payloadType: string, + entityName?: string, + operation?: string, + ): void; + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + version?: string, + ): void; + recordApiRequestTime( + method: string, + uri: string, + status: number | string, + durationMs: number, + metricUri?: string, + ): void; + + // ── Configuration ─────────────────────────────────────────────── + + readonly measurePayloadSize: boolean; + + // ── Output / lifecycle ───────────────────────────────────────── + + collectorName(): string; + getMetrics(): unknown; + reset(): void; + stop(): Promise; + getContentType(): string; + toPrometheusText(prefix?: string): string; + toPrometheusTextAsync(): Promise; +} diff --git a/src/sdk/worker/metrics/MetricsServer.ts b/src/sdk/worker/metrics/MetricsServer.ts index 62f1fbcb..d575901b 100644 --- a/src/sdk/worker/metrics/MetricsServer.ts +++ b/src/sdk/worker/metrics/MetricsServer.ts @@ -1,5 +1,5 @@ import { createServer, type Server, type IncomingMessage, type ServerResponse } from "node:http"; -import type { MetricsCollector } from "./MetricsCollector"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; /** * Lightweight HTTP server exposing Prometheus metrics and a health check endpoint. @@ -21,11 +21,11 @@ import type { MetricsCollector } from "./MetricsCollector"; * ``` */ export class MetricsServer { - private readonly _collector: MetricsCollector; + private readonly _collector: MetricsCollectorInterface; private readonly _port: number; private _server?: Server; - constructor(collector: MetricsCollector, port: number) { + constructor(collector: MetricsCollectorInterface, port: number) { this._collector = collector; this._port = port; } diff --git a/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts new file mode 100644 index 00000000..aab49782 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/CanonicalMetricsCollector.test.ts @@ -0,0 +1,448 @@ +import { describe, it, expect, beforeEach } from "@jest/globals"; +import { CanonicalMetricsCollector } from "../CanonicalMetricsCollector"; + +describe("CanonicalMetricsCollector", () => { + let collector: CanonicalMetricsCollector; + + beforeEach(() => { + collector = new CanonicalMetricsCollector(); + }); + + describe("poll metrics", () => { + it("should emit task_poll_total counter on onPollStarted", () => { + collector.onPollStarted({ + taskType: "task_a", + workerId: "w1", + pollCount: 5, + timestamp: new Date(), + }); + collector.onPollStarted({ + taskType: "task_a", + workerId: "w1", + pollCount: 5, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_poll_total counter"); + expect(text).toContain('task_poll_total{taskType="task_a"} 2'); + }); + + it("should emit task_poll_time_seconds histogram on onPollCompleted", () => { + collector.onPollCompleted({ + taskType: "task_a", + durationMs: 100, + tasksReceived: 3, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_poll_time_seconds histogram"); + expect(text).toContain('task_poll_time_seconds_bucket{taskType="task_a",status="SUCCESS",le="0.1"} 1'); + expect(text).toContain('task_poll_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.1'); + expect(text).toContain('task_poll_time_seconds_count{taskType="task_a",status="SUCCESS"} 1'); + }); + + it("should emit task_poll_error_total with exception label on onPollFailure", () => { + collector.onPollFailure({ + taskType: "task_a", + durationMs: 5000, + cause: new TypeError("timeout"), + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_poll_error_total{taskType="task_a",exception="TypeError"} 1'); + expect(text).toContain('task_poll_time_seconds_bucket{taskType="task_a",status="FAILURE"'); + }); + }); + + describe("execution metrics", () => { + it("should emit task_execution_started_total on onTaskExecutionStarted", () => { + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_execution_started_total{taskType="task_a"} 1'); + }); + + it("should track active_workers gauge", () => { + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + timestamp: new Date(), + }); + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t2", + workerId: "w1", + timestamp: new Date(), + }); + + let text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 2'); + + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 200, + timestamp: new Date(), + }); + + text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 1'); + }); + + it("should emit task_execute_time_seconds histogram on completion", () => { + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 500, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_execute_time_seconds histogram"); + expect(text).toContain('task_execute_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.5'); + }); + + it("should emit task_result_size_bytes histogram on completion", () => { + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 200, + outputSizeBytes: 5000, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_result_size_bytes histogram"); + expect(text).toContain('task_result_size_bytes_sum{taskType="task_a"} 5000'); + }); + + it("should emit task_execute_error_total with exception label on failure", () => { + const err = new TypeError("invalid input"); + collector.onTaskExecutionFailure({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + cause: err, + durationMs: 50, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_execute_error_total{taskType="task_a",exception="TypeError"} 1'); + expect(text).toContain('task_execute_time_seconds_bucket{taskType="task_a",status="FAILURE"'); + }); + }); + + describe("task update metrics", () => { + it("should emit task_update_time_seconds histogram on completion", () => { + collector.onTaskUpdateCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 25, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE task_update_time_seconds histogram"); + expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="SUCCESS"} 0.025'); + }); + + it("should emit task_update_error_total and task_update_time_seconds on failure", () => { + collector.onTaskUpdateFailure({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + cause: new Error("server error"), + durationMs: 150, + retryCount: 4, + taskResult: {}, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_update_error_total{taskType="task_a",exception="Error"} 1'); + expect(text).toContain('task_update_time_seconds_sum{taskType="task_a",status="FAILURE"} 0.15'); + }); + }); + + describe("direct recording methods", () => { + it("recordTaskExecutionQueueFull should emit counter", () => { + collector.recordTaskExecutionQueueFull("task_a"); + collector.recordTaskExecutionQueueFull("task_a"); + + const text = collector.toPrometheusText(); + expect(text).toContain('task_execution_queue_full_total{taskType="task_a"} 2'); + }); + + it("recordUncaughtException should emit counter with exception label", () => { + collector.recordUncaughtException("RangeError"); + collector.recordUncaughtException("RangeError"); + collector.recordUncaughtException("TypeError"); + + const text = collector.toPrometheusText(); + expect(text).toContain('thread_uncaught_exceptions_total{exception="RangeError"} 2'); + expect(text).toContain('thread_uncaught_exceptions_total{exception="TypeError"} 1'); + }); + + it("recordWorkerRestart should be a noop (N/A for JS)", () => { + collector.recordWorkerRestart(); + const text = collector.toPrometheusText(); + expect(text).not.toContain("worker_restart"); + }); + + it("recordTaskPaused should emit counter", () => { + collector.recordTaskPaused("paused_task"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_paused_total{taskType="paused_task"} 1'); + }); + + it("recordTaskAckError should emit counter with exception label", () => { + collector.recordTaskAckError("task_a", "TimeoutError"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_ack_error_total{taskType="task_a",exception="TimeoutError"} 1'); + }); + + it("recordTaskAckFailed should emit counter", () => { + collector.recordTaskAckFailed("task_a"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_ack_failed_total{taskType="task_a"} 1'); + }); + + it("recordWorkflowStartError should emit counter with labels", () => { + collector.recordWorkflowStartError("my_workflow", "NetworkError"); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_start_error_total{workflowType="my_workflow",exception="NetworkError"} 1'); + }); + + it("recordExternalPayloadUsed should emit counter with labels", () => { + collector.recordExternalPayloadUsed("TASK_OUTPUT", "myEntity", "WRITE"); + const text = collector.toPrometheusText(); + expect(text).toContain('external_payload_used_total{entityName="myEntity",operation="WRITE",payloadType="TASK_OUTPUT"} 1'); + }); + + it("recordWorkflowInputSize should emit histogram", () => { + collector.recordWorkflowInputSize("order_flow", 50000, "1"); + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE workflow_input_size_bytes histogram"); + expect(text).toContain('workflow_input_size_bytes_sum{workflowType="order_flow",version="1"} 50000'); + }); + + it("recordApiRequestTime should emit histogram in seconds", () => { + collector.recordApiRequestTime("GET", "/api/workflow", 200, 45); + const text = collector.toPrometheusText(); + expect(text).toContain("# TYPE http_api_client_request_seconds histogram"); + expect(text).toContain('http_api_client_request_seconds_sum{method="GET",uri="/api/workflow",status="200"} 0.045'); + }); + + it("recordApiRequestTime should prefer metricUri over uri when provided", () => { + collector.recordApiRequestTime("GET", "/api/workflow/abc-123", 200, 30, "/workflow/{workflowId}"); + const text = collector.toPrometheusText(); + expect(text).toContain('uri="/workflow/{workflowId}"'); + expect(text).not.toContain("abc-123"); + }); + + it("recordApiRequestTime should fall back to uri when metricUri is undefined", () => { + collector.recordApiRequestTime("POST", "/api/tasks", 201, 20); + const text = collector.toPrometheusText(); + expect(text).toContain('uri="/api/tasks"'); + }); + }); + + describe("onTaskPaused event", () => { + it("should increment task_paused_total", () => { + collector.onTaskPaused({ + taskType: "paused_task", + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).toContain('task_paused_total{taskType="paused_task"} 1'); + }); + }); + + describe("reset", () => { + it("should clear all canonical metrics", () => { + collector.onPollStarted({ + taskType: "task_a", + workerId: "w1", + pollCount: 1, + timestamp: new Date(), + }); + collector.recordUncaughtException("Error"); + collector.recordTaskAckFailed("task_a"); + + collector.reset(); + + const text = collector.toPrometheusText(); + expect(text).toBe(""); + }); + }); + + describe("output format", () => { + it("should not apply a prefix to canonical metric names", () => { + collector.onPollStarted({ + taskType: "t", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).not.toContain("conductor_worker"); + expect(text).toContain("task_poll_total{"); + }); + + it("should use seconds for time histograms", () => { + collector.onPollCompleted({ + taskType: "t", + durationMs: 1000, + tasksReceived: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).toContain("task_poll_time_seconds"); + expect(text).toContain('_sum{taskType="t",status="SUCCESS"} 1'); + }); + + it("should use camelCase taskType label", () => { + collector.onPollStarted({ + taskType: "my_task", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText(); + expect(text).toContain("taskType="); + expect(text).not.toContain("task_type="); + }); + }); + + describe("stop", () => { + it("should not throw when no server is running", async () => { + await expect(collector.stop()).resolves.toBeUndefined(); + }); + }); + + describe("default argument handling", () => { + it("recordUncaughtException with no arg should default to 'Error'", () => { + collector.recordUncaughtException(); + const text = collector.toPrometheusText(); + expect(text).toContain('thread_uncaught_exceptions_total{exception="Error"} 1'); + }); + + it("recordWorkflowStartError with no args should default both labels", () => { + collector.recordWorkflowStartError(); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_start_error_total{workflowType="",exception="Error"} 1'); + }); + + it("recordWorkflowStartError with only workflowType should default exception", () => { + collector.recordWorkflowStartError("my_wf"); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_start_error_total{workflowType="my_wf",exception="Error"} 1'); + }); + + it("recordExternalPayloadUsed with missing optional args should default to empty strings", () => { + collector.recordExternalPayloadUsed("TASK_OUTPUT"); + const text = collector.toPrometheusText(); + expect(text).toContain('external_payload_used_total{entityName="",operation="",payloadType="TASK_OUTPUT"} 1'); + }); + + it("recordTaskAckError with no exception arg should default to 'Error'", () => { + collector.recordTaskAckError("task_a"); + const text = collector.toPrometheusText(); + expect(text).toContain('task_ack_error_total{taskType="task_a",exception="Error"} 1'); + }); + + it("recordWorkflowInputSize with no version should default to empty string", () => { + collector.recordWorkflowInputSize("my_wf", 1000); + const text = collector.toPrometheusText(); + expect(text).toContain('workflow_input_size_bytes_sum{workflowType="my_wf",version=""} 1000'); + }); + }); + + describe("execution completion without outputSizeBytes", () => { + it("should not emit task_result_size_bytes when outputSizeBytes is undefined", () => { + collector.onTaskExecutionCompleted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + durationMs: 200, + timestamp: new Date(), + }); + + const text = collector.toPrometheusText(); + expect(text).toContain("task_execute_time_seconds"); + expect(text).not.toContain("task_result_size_bytes"); + }); + }); + + describe("output helpers without prom-client", () => { + it("getContentType should return plain text fallback", () => { + expect(collector.getContentType()).toBe( + "text/plain; version=0.0.4; charset=utf-8" + ); + }); + + it("toPrometheusTextAsync should fall back to sync text", async () => { + collector.onPollStarted({ + taskType: "t", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const asyncText = await collector.toPrometheusTextAsync(); + const syncText = collector.toPrometheusText(); + expect(asyncText).toBe(syncText); + }); + + it("toPrometheusText should ignore _prefix argument", () => { + collector.onPollStarted({ + taskType: "t", + workerId: "w", + pollCount: 1, + timestamp: new Date(), + }); + const text = collector.toPrometheusText("some_prefix"); + expect(text).not.toContain("some_prefix"); + expect(text).toContain("task_poll_total{"); + }); + }); + + describe("active_workers gauge on failure", () => { + it("should decrement active_workers on task execution failure", () => { + collector.onTaskExecutionStarted({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + timestamp: new Date(), + }); + + let text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 1'); + + collector.onTaskExecutionFailure({ + taskType: "task_a", + taskId: "t1", + workerId: "w1", + cause: new Error("fail"), + durationMs: 100, + timestamp: new Date(), + }); + + text = collector.toPrometheusText(); + expect(text).toContain('active_workers{taskType="task_a"} 0'); + }); + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts b/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts new file mode 100644 index 00000000..449955eb --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/CanonicalPrometheusRegistry.test.ts @@ -0,0 +1,176 @@ +import { describe, it, expect, beforeEach } from "@jest/globals"; +import { CanonicalPrometheusRegistry } from "../CanonicalPrometheusRegistry"; + +describe("CanonicalPrometheusRegistry", () => { + let registry: CanonicalPrometheusRegistry; + + beforeEach(async () => { + try { + const promClient = await import("prom-client"); + promClient.register.clear(); + } catch { + // prom-client not installed — skip cleanup + } + registry = new CanonicalPrometheusRegistry(); + }); + + describe("before initialization", () => { + it("should not be available", () => { + expect(registry.available).toBe(false); + }); + + it("should return default content type", () => { + expect(registry.contentType).toBe( + "text/plain; version=0.0.4; charset=utf-8" + ); + }); + + it("should return empty string from metrics()", async () => { + expect(await registry.metrics()).toBe(""); + }); + + it("should not throw when incrementCounter is called", () => { + expect(() => + registry.incrementCounter("task_poll_total", { taskType: "t" }) + ).not.toThrow(); + }); + + it("should not throw when observeHistogram is called", () => { + expect(() => + registry.observeHistogram("task_poll_time_seconds", { taskType: "t" }, 0.1) + ).not.toThrow(); + }); + + it("should not throw when setGauge is called", () => { + expect(() => + registry.setGauge("active_workers", { taskType: "t" }, 5) + ).not.toThrow(); + }); + }); + + describe("initialize()", () => { + it("should return true when prom-client is installed", async () => { + const result = await registry.initialize(); + expect(result).toBe(true); + expect(registry.available).toBe(true); + }); + + it("should set contentType from prom-client registry", async () => { + await registry.initialize(); + expect(registry.contentType).toBeDefined(); + expect(registry.contentType.length).toBeGreaterThan(0); + }); + }); + + describe("after initialization", () => { + beforeEach(async () => { + await registry.initialize(); + }); + + it("should be available", () => { + expect(registry.available).toBe(true); + }); + + it("should return metrics text from prom-client", async () => { + const text = await registry.metrics(); + expect(typeof text).toBe("string"); + }); + + it("should increment a counter and see it in metrics output", async () => { + registry.incrementCounter("task_poll_total", { taskType: "test_t" }); + registry.incrementCounter("task_poll_total", { taskType: "test_t" }); + const text = await registry.metrics(); + expect(text).toContain("task_poll_total"); + expect(text).toContain("test_t"); + }); + + it("should observe a histogram and see it in metrics output", async () => { + registry.observeHistogram( + "task_poll_time_seconds", + { taskType: "test_t", status: "SUCCESS" }, + 0.05 + ); + const text = await registry.metrics(); + expect(text).toContain("task_poll_time_seconds"); + expect(text).toContain("test_t"); + }); + + it("should set a gauge and see it in metrics output", async () => { + registry.setGauge("active_workers", { taskType: "test_t" }, 3); + const text = await registry.metrics(); + expect(text).toContain("active_workers"); + expect(text).toContain("test_t"); + }); + + it("should handle unknown counter key as no-op", () => { + expect(() => + registry.incrementCounter("nonexistent", { x: "y" }) + ).not.toThrow(); + }); + + it("should handle unknown histogram key as no-op", () => { + expect(() => + registry.observeHistogram("nonexistent", { x: "y" }, 1) + ).not.toThrow(); + }); + + it("should handle unknown gauge key as no-op", () => { + expect(() => + registry.setGauge("nonexistent", { x: "y" }, 1) + ).not.toThrow(); + }); + + it("should accept custom increment value for counter", () => { + expect(() => + registry.incrementCounter("task_poll_total", { taskType: "t" }, 5) + ).not.toThrow(); + }); + + it("should record all counter types", async () => { + registry.incrementCounter("task_poll_total", { taskType: "t" }); + registry.incrementCounter("task_execution_started_total", { taskType: "t" }); + registry.incrementCounter("task_poll_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_execute_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_update_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_ack_error_total", { taskType: "t", exception: "Error" }); + registry.incrementCounter("task_ack_failed_total", { taskType: "t" }); + registry.incrementCounter("task_execution_queue_full_total", { taskType: "t" }); + registry.incrementCounter("task_paused_total", { taskType: "t" }); + registry.incrementCounter("thread_uncaught_exceptions_total", { exception: "Error" }); + registry.incrementCounter("external_payload_used_total", { entityName: "e", operation: "o", payloadType: "p" }); + registry.incrementCounter("workflow_start_error_total", { workflowType: "w", exception: "Error" }); + + const text = await registry.metrics(); + expect(text).toContain("task_poll_total"); + expect(text).toContain("task_execution_started_total"); + expect(text).toContain("task_poll_error_total"); + expect(text).toContain("task_ack_failed_total"); + expect(text).toContain("thread_uncaught_exceptions_total"); + expect(text).toContain("external_payload_used_total"); + expect(text).toContain("workflow_start_error_total"); + }); + + it("should record all histogram types", async () => { + registry.observeHistogram("task_poll_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.1); + registry.observeHistogram("task_execute_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.5); + registry.observeHistogram("task_update_time_seconds", { taskType: "t", status: "SUCCESS" }, 0.02); + registry.observeHistogram("http_api_client_request_seconds", { method: "GET", uri: "/api", status: "200" }, 0.1); + registry.observeHistogram("task_result_size_bytes", { taskType: "t" }, 1024); + registry.observeHistogram("workflow_input_size_bytes", { workflowType: "w", version: "1" }, 512); + + const text = await registry.metrics(); + expect(text).toContain("task_poll_time_seconds"); + expect(text).toContain("task_execute_time_seconds"); + expect(text).toContain("task_update_time_seconds"); + expect(text).toContain("http_api_client_request_seconds"); + expect(text).toContain("task_result_size_bytes"); + expect(text).toContain("workflow_input_size_bytes"); + }); + + it("should record gauge metric", async () => { + registry.setGauge("active_workers", { taskType: "t" }, 7); + const text = await registry.metrics(); + expect(text).toContain("active_workers"); + }); + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts index 3c61671a..424922a7 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.prometheus.test.ts @@ -1,11 +1,11 @@ import { describe, it, expect, beforeEach } from "@jest/globals"; -import { MetricsCollector } from "../MetricsCollector"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; -describe("MetricsCollector - Prometheus features", () => { - let collector: MetricsCollector; +describe("LegacyMetricsCollector - Prometheus features", () => { + let collector: LegacyMetricsCollector; beforeEach(() => { - collector = new MetricsCollector({ slidingWindowSize: 1000 }); + collector = new LegacyMetricsCollector({ slidingWindowSize: 1000 }); }); // ── toPrometheusText() ────────────────────────────────────────── @@ -36,7 +36,7 @@ describe("MetricsCollector - Prometheus features", () => { }); it("should use custom prefix", () => { - const c = new MetricsCollector({ prefix: "myapp" }); + const c = new LegacyMetricsCollector({ prefix: "myapp" }); c.onPollStarted({ taskType: "t", workerId: "w", pollCount: 1, timestamp: new Date() }); const text = c.toPrometheusText(); expect(text).toContain("myapp_task_poll_total"); @@ -161,13 +161,20 @@ describe("MetricsCollector - Prometheus features", () => { const m = collector.getMetrics(); expect(m.apiRequestDurationMs.get("GET:/api/workflow:200")).toEqual([45, 55]); }); + + it("should ignore metricUri and use uri (legacy behavior preserved)", () => { + collector.recordApiRequestTime("GET", "/api/workflow/abc-123", 200, 45, "/workflow/{workflowId}"); + const m = collector.getMetrics(); + expect(m.apiRequestDurationMs.get("GET:/api/workflow/abc-123:200")).toEqual([45]); + expect(m.apiRequestDurationMs.get("GET:/workflow/{workflowId}:200")).toBeUndefined(); + }); }); // ── Sliding window ────────────────────────────────────────────── describe("sliding window", () => { it("should trim observations beyond window size", () => { - const small = new MetricsCollector({ slidingWindowSize: 5 }); + const small = new LegacyMetricsCollector({ slidingWindowSize: 5 }); for (let i = 0; i < 10; i++) { small.onPollCompleted({ taskType: "t", workerId: "w", durationMs: i, pollCount: i, taskCount: 1, timestamp: new Date() }); } diff --git a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts index 691d9325..33787620 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsCollector.test.ts @@ -1,11 +1,11 @@ import { describe, it, expect, beforeEach } from "@jest/globals"; -import { MetricsCollector } from "../MetricsCollector"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; -describe("MetricsCollector", () => { - let collector: MetricsCollector; +describe("LegacyMetricsCollector", () => { + let collector: LegacyMetricsCollector; beforeEach(() => { - collector = new MetricsCollector(); + collector = new LegacyMetricsCollector(); }); describe("poll metrics", () => { @@ -133,13 +133,14 @@ describe("MetricsCollector", () => { expect(metrics.updateDurationMs.get("task_a")).toEqual([25, 30]); }); - it("should count update failures via onTaskUpdateFailure", () => { + it("should count update failures and record duration via onTaskUpdateFailure", () => { collector.onTaskUpdateFailure({ taskType: "task_a", taskId: "t1", workerId: "w1", workflowInstanceId: "wf1", cause: new Error("server error"), + durationMs: 200, retryCount: 4, taskResult: {}, timestamp: new Date(), @@ -147,6 +148,7 @@ describe("MetricsCollector", () => { const metrics = collector.getMetrics(); expect(metrics.taskUpdateFailureTotal.get("task_a")).toBe(1); + expect(metrics.updateDurationMs.get("task_a")).toEqual([200]); }); }); diff --git a/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts b/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts index bbe5153f..c0b423f7 100644 --- a/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts +++ b/src/sdk/worker/metrics/__tests__/MetricsServer.test.ts @@ -1,6 +1,6 @@ import { describe, it, expect, afterEach } from "@jest/globals"; import { get as httpGet } from "node:http"; -import { MetricsCollector } from "../MetricsCollector"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; import { MetricsServer } from "../MetricsServer"; function fetchHttp(url: string): Promise<{ status: number; body: string; headers: Record }> { @@ -38,7 +38,7 @@ describe("MetricsServer", () => { it("should serve /metrics with Prometheus text format", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); collector.onPollStarted({ taskType: "test_task", workerId: "w", pollCount: 1, timestamp: new Date() }); server = new MetricsServer(collector, port); @@ -51,7 +51,7 @@ describe("MetricsServer", () => { it("should serve /health with JSON status", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); @@ -62,7 +62,7 @@ describe("MetricsServer", () => { it("should return 404 for unknown paths", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); @@ -72,7 +72,7 @@ describe("MetricsServer", () => { it("should stop cleanly after start", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); await server.stop(); @@ -80,7 +80,7 @@ describe("MetricsServer", () => { }); it("should not throw when stop() called without start()", async () => { - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, nextPort()); await server.stop(); server = undefined; @@ -88,7 +88,7 @@ describe("MetricsServer", () => { it("should not throw when start() called twice", async () => { const port = nextPort(); - const collector = new MetricsCollector(); + const collector = new LegacyMetricsCollector(); server = new MetricsServer(collector, port); await server.start(); await server.start(); // second call should be a no-op diff --git a/src/sdk/worker/metrics/__tests__/accumulators.test.ts b/src/sdk/worker/metrics/__tests__/accumulators.test.ts new file mode 100644 index 00000000..d25c3181 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/accumulators.test.ts @@ -0,0 +1,283 @@ +import { describe, it, expect } from "@jest/globals"; +import { + labelKey, + renderLabels, + exceptionLabel, + HistogramAccumulator, + MultiLabelCounter, + GaugeMetric, + TIME_BUCKETS, + SIZE_BUCKETS, +} from "../accumulators"; + +describe("labelKey", () => { + it("should return empty string for empty labels", () => { + expect(labelKey({})).toBe(""); + }); + + it("should return key=value for a single label", () => { + expect(labelKey({ taskType: "my_task" })).toBe("taskType=my_task"); + }); + + it("should sort keys alphabetically", () => { + expect(labelKey({ z: "1", a: "2", m: "3" })).toBe("a=2,m=3,z=1"); + }); + + it("should produce the same key regardless of insertion order", () => { + const key1 = labelKey({ status: "SUCCESS", taskType: "t" }); + const key2 = labelKey({ taskType: "t", status: "SUCCESS" }); + expect(key1).toBe(key2); + }); +}); + +describe("renderLabels", () => { + it("should return empty string for empty labels", () => { + expect(renderLabels({})).toBe(""); + }); + + it("should format a single label with quotes", () => { + expect(renderLabels({ taskType: "my_task" })).toBe('taskType="my_task"'); + }); + + it("should format multiple labels comma-separated", () => { + const result = renderLabels({ method: "GET", uri: "/api" }); + expect(result).toBe('method="GET",uri="/api"'); + }); +}); + +describe("exceptionLabel", () => { + it("should return Error name for standard Error", () => { + expect(exceptionLabel(new Error("oops"))).toBe("Error"); + }); + + it("should return subclass name for TypeError", () => { + expect(exceptionLabel(new TypeError("bad type"))).toBe("TypeError"); + }); + + it("should return subclass name for RangeError", () => { + expect(exceptionLabel(new RangeError("out of range"))).toBe("RangeError"); + }); + + it("should return 'Error' for non-Error values", () => { + expect(exceptionLabel("string error")).toBe("Error"); + expect(exceptionLabel(42)).toBe("Error"); + expect(exceptionLabel(null)).toBe("Error"); + expect(exceptionLabel(undefined)).toBe("Error"); + expect(exceptionLabel({ message: "not an error" })).toBe("Error"); + }); + + it("should fall back to constructor name when .name is empty", () => { + const err = new TypeError("test"); + Object.defineProperty(err, "name", { value: "" }); + expect(exceptionLabel(err)).toBe("TypeError"); + }); +}); + +describe("HistogramAccumulator", () => { + it("should render empty string when no observations", () => { + const h = new HistogramAccumulator([1, 5, 10]); + expect(h.render("test_metric", "A test")).toBe(""); + }); + + it("should place value in correct buckets", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({ taskType: "t" }, 3); + + const text = h.render("req_time", "Request time"); + expect(text).toContain("# HELP req_time Request time"); + expect(text).toContain("# TYPE req_time histogram"); + expect(text).toContain('req_time_bucket{taskType="t",le="1"} 0'); + expect(text).toContain('req_time_bucket{taskType="t",le="5"} 1'); + expect(text).toContain('req_time_bucket{taskType="t",le="10"} 1'); + expect(text).toContain('req_time_bucket{taskType="t",le="+Inf"} 1'); + expect(text).toContain('req_time_sum{taskType="t"} 3'); + expect(text).toContain('req_time_count{taskType="t"} 1'); + }); + + it("should increment all buckets at or above the value boundary", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({}, 1); // exactly on boundary + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{le="1"} 1'); + expect(text).toContain('m_bucket{le="5"} 1'); + expect(text).toContain('m_bucket{le="10"} 1'); + }); + + it("should handle value above all boundaries", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({}, 100); + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{le="1"} 0'); + expect(text).toContain('m_bucket{le="5"} 0'); + expect(text).toContain('m_bucket{le="10"} 0'); + expect(text).toContain('m_bucket{le="+Inf"} 1'); + expect(text).toContain("m_sum{} 100"); + }); + + it("should accumulate multiple observations", () => { + const h = new HistogramAccumulator([1, 5, 10]); + h.observe({ t: "a" }, 0.5); + h.observe({ t: "a" }, 3); + h.observe({ t: "a" }, 7); + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{t="a",le="1"} 1'); + expect(text).toContain('m_bucket{t="a",le="5"} 2'); + expect(text).toContain('m_bucket{t="a",le="10"} 3'); + expect(text).toContain('m_count{t="a"} 3'); + expect(text).toContain('m_sum{t="a"} 10.5'); + }); + + it("should track separate series for different label sets", () => { + const h = new HistogramAccumulator([10]); + h.observe({ status: "OK" }, 5); + h.observe({ status: "ERR" }, 15); + + const text = h.render("m", "help"); + expect(text).toContain('m_bucket{status="OK",le="10"} 1'); + expect(text).toContain('m_bucket{status="ERR",le="10"} 0'); + expect(text).toContain('m_count{status="OK"} 1'); + expect(text).toContain('m_count{status="ERR"} 1'); + }); + + it("should default to TIME_BUCKETS when no boundaries given", () => { + const h = new HistogramAccumulator(); + h.observe({}, 0.005); + const text = h.render("m", "help"); + // TIME_BUCKETS starts at 0.001, 0.005, ... + expect(text).toContain('le="0.001"'); + expect(text).toContain('le="0.005"'); + }); +}); + +describe("MultiLabelCounter", () => { + it("should render empty string when no increments", () => { + const c = new MultiLabelCounter(); + expect(c.render("test_counter", "A test")).toBe(""); + }); + + it("should increment and render a single-label counter", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "t" }); + c.increment({ taskType: "t" }); + + const text = c.render("poll_total", "Total polls"); + expect(text).toContain("# HELP poll_total Total polls"); + expect(text).toContain("# TYPE poll_total counter"); + expect(text).toContain('poll_total{taskType="t"} 2'); + }); + + it("should support custom increment values", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "t" }, 5); + c.increment({ taskType: "t" }, 3); + + const text = c.render("m", "help"); + expect(text).toContain('m{taskType="t"} 8'); + }); + + it("should track separate series for different label sets", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "a" }); + c.increment({ taskType: "b" }); + c.increment({ taskType: "a" }); + + const text = c.render("m", "help"); + expect(text).toContain('m{taskType="a"} 2'); + expect(text).toContain('m{taskType="b"} 1'); + }); + + it("should handle multi-label counters", () => { + const c = new MultiLabelCounter(); + c.increment({ taskType: "t", exception: "TypeError" }); + + const text = c.render("err", "Errors"); + expect(text).toContain('err{taskType="t",exception="TypeError"} 1'); + }); +}); + +describe("GaugeMetric", () => { + it("should render empty string when no values set", () => { + const g = new GaugeMetric(); + expect(g.render("test_gauge", "A test")).toBe(""); + }); + + it("should set and render a gauge value", () => { + const g = new GaugeMetric(); + g.set({ taskType: "t" }, 42); + + const text = g.render("active", "Active workers"); + expect(text).toContain("# HELP active Active workers"); + expect(text).toContain("# TYPE active gauge"); + expect(text).toContain('active{taskType="t"} 42'); + }); + + it("should overwrite previous value on set", () => { + const g = new GaugeMetric(); + g.set({ taskType: "t" }, 10); + g.set({ taskType: "t" }, 99); + + const text = g.render("m", "help"); + expect(text).toContain('m{taskType="t"} 99'); + expect(text).not.toContain("10"); + }); + + it("should increment with inc()", () => { + const g = new GaugeMetric(); + g.inc({ taskType: "t" }); + g.inc({ taskType: "t" }); + g.inc({ taskType: "t" }, 3); + + expect(g.getValue({ taskType: "t" })).toBe(5); + }); + + it("should decrement with dec()", () => { + const g = new GaugeMetric(); + g.inc({ taskType: "t" }, 5); + g.dec({ taskType: "t" }); + g.dec({ taskType: "t" }, 2); + + expect(g.getValue({ taskType: "t" })).toBe(2); + }); + + it("should allow negative values after dec()", () => { + const g = new GaugeMetric(); + g.dec({ taskType: "t" }); + + expect(g.getValue({ taskType: "t" })).toBe(-1); + }); + + it("should return 0 for getValue on unknown labels", () => { + const g = new GaugeMetric(); + expect(g.getValue({ taskType: "unknown" })).toBe(0); + }); + + it("should track separate series for different label sets", () => { + const g = new GaugeMetric(); + g.set({ taskType: "a" }, 10); + g.set({ taskType: "b" }, 20); + + expect(g.getValue({ taskType: "a" })).toBe(10); + expect(g.getValue({ taskType: "b" })).toBe(20); + + const text = g.render("m", "help"); + expect(text).toContain('m{taskType="a"} 10'); + expect(text).toContain('m{taskType="b"} 20'); + }); +}); + +describe("bucket constants", () => { + it("TIME_BUCKETS should be sorted ascending", () => { + for (let i = 1; i < TIME_BUCKETS.length; i++) { + expect(TIME_BUCKETS[i]).toBeGreaterThan(TIME_BUCKETS[i - 1]); + } + }); + + it("SIZE_BUCKETS should be sorted ascending", () => { + for (let i = 1; i < SIZE_BUCKETS.length; i++) { + expect(SIZE_BUCKETS[i]).toBeGreaterThan(SIZE_BUCKETS[i - 1]); + } + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/httpObserver.test.ts b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts new file mode 100644 index 00000000..bf8adb42 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/httpObserver.test.ts @@ -0,0 +1,58 @@ +import { describe, it, expect, beforeEach, jest } from "@jest/globals"; +import { + getHttpMetricsObserver, + setHttpMetricsObserver, + type HttpMetricsObserver, +} from "../httpObserver"; + +describe("httpObserver", () => { + beforeEach(() => { + setHttpMetricsObserver(undefined); + }); + + it("should return undefined by default", () => { + expect(getHttpMetricsObserver()).toBeUndefined(); + }); + + it("should return the observer after setting one", () => { + const observer: HttpMetricsObserver = { + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }; + setHttpMetricsObserver(observer); + expect(getHttpMetricsObserver()).toBe(observer); + }); + + it("should clear the observer when set to undefined", () => { + const observer: HttpMetricsObserver = { + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }; + setHttpMetricsObserver(observer); + expect(getHttpMetricsObserver()).toBe(observer); + + setHttpMetricsObserver(undefined); + expect(getHttpMetricsObserver()).toBeUndefined(); + }); + + it("should replace the observer on subsequent set calls", () => { + const observer1: HttpMetricsObserver = { + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }; + const observer2: HttpMetricsObserver = { + recordApiRequestTime: jest.fn(), + recordWorkflowInputSize: jest.fn(), + recordWorkflowStartError: jest.fn(), + }; + + setHttpMetricsObserver(observer1); + expect(getHttpMetricsObserver()).toBe(observer1); + + setHttpMetricsObserver(observer2); + expect(getHttpMetricsObserver()).toBe(observer2); + }); +}); diff --git a/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts new file mode 100644 index 00000000..02ca03c6 --- /dev/null +++ b/src/sdk/worker/metrics/__tests__/metricsFactory.test.ts @@ -0,0 +1,111 @@ +import { describe, it, expect, afterEach } from "@jest/globals"; +import { createMetricsCollector } from "../metricsFactory"; +import { LegacyMetricsCollector } from "../LegacyMetricsCollector"; +import { CanonicalMetricsCollector } from "../CanonicalMetricsCollector"; + +describe("createMetricsCollector", () => { + const originalEnv = { ...process.env }; + + afterEach(() => { + process.env = { ...originalEnv }; + }); + + it("should return LegacyMetricsCollector by default", () => { + delete process.env.WORKER_CANONICAL_METRICS; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + }); + + it("should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS=true", () => { + process.env.WORKER_CANONICAL_METRICS = "true"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it("should return LegacyMetricsCollector when WORKER_CANONICAL_METRICS=false", () => { + process.env.WORKER_CANONICAL_METRICS = "false"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + }); + + it("should be case-insensitive for env var value", () => { + process.env.WORKER_CANONICAL_METRICS = "TRUE"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it('should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS="1"', () => { + process.env.WORKER_CANONICAL_METRICS = "1"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it('should return CanonicalMetricsCollector when WORKER_CANONICAL_METRICS="yes"', () => { + process.env.WORKER_CANONICAL_METRICS = "yes"; + + const collector = createMetricsCollector(); + expect(collector).toBeInstanceOf(CanonicalMetricsCollector); + }); + + it("should pass config through to the collector", () => { + delete process.env.WORKER_CANONICAL_METRICS; + + const collector = createMetricsCollector({ prefix: "custom_prefix" }); + expect(collector).toBeInstanceOf(LegacyMetricsCollector); + const text = collector.toPrometheusText(); + // No metrics recorded yet, so empty, but the collector was created successfully + expect(typeof text).toBe("string"); + }); + + it('legacy collector returns "legacy" from collectorName()', () => { + delete process.env.WORKER_CANONICAL_METRICS; + + const collector = createMetricsCollector(); + expect(collector.collectorName()).toBe("legacy"); + }); + + it('canonical collector returns "canonical" from collectorName()', () => { + process.env.WORKER_CANONICAL_METRICS = "true"; + + const collector = createMetricsCollector(); + expect(collector.collectorName()).toBe("canonical"); + }); + + it("both implementations satisfy MetricsCollectorInterface", () => { + const legacy = createMetricsCollector(); + const requiredMethods = [ + "recordTaskExecutionQueueFull", + "recordUncaughtException", + "recordWorkerRestart", + "recordTaskPaused", + "recordTaskAckError", + "recordTaskAckFailed", + "recordWorkflowStartError", + "recordExternalPayloadUsed", + "recordWorkflowInputSize", + "recordApiRequestTime", + "getMetrics", + "reset", + "stop", + "getContentType", + "toPrometheusText", + "collectorName", + "toPrometheusTextAsync", + ]; + + for (const method of requiredMethods) { + expect(typeof (legacy as unknown as Record)[method]).toBe("function"); + } + + process.env.WORKER_CANONICAL_METRICS = "true"; + const canonical = createMetricsCollector(); + for (const method of requiredMethods) { + expect(typeof (canonical as unknown as Record)[method]).toBe("function"); + } + }); +}); diff --git a/src/sdk/worker/metrics/accumulators.ts b/src/sdk/worker/metrics/accumulators.ts new file mode 100644 index 00000000..de8d48ae --- /dev/null +++ b/src/sdk/worker/metrics/accumulators.ts @@ -0,0 +1,177 @@ +/** + * In-memory Prometheus-compatible metric accumulators used by the + * canonical metrics implementation. + */ + +export const TIME_BUCKETS = [ + 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, +] as const; + +export const SIZE_BUCKETS = [ + 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000, +] as const; + +export function labelKey(labels: Record): string { + const keys = Object.keys(labels).sort(); + return keys.map((k) => `${k}=${labels[k]}`).join(","); +} + +export function renderLabels(labels: Record): string { + return Object.entries(labels) + .map(([k, v]) => `${k}="${v}"`) + .join(","); +} + +export function exceptionLabel(error: unknown): string { + if (error instanceof Error) { + return error.name || error.constructor?.name || "Error"; + } + return "Error"; +} + +// ── HistogramAccumulator ───────────────────────────────────────── + +interface HistogramSeries { + labels: Record; + buckets: number[]; + count: number; + sum: number; +} + +export class HistogramAccumulator { + private readonly _boundaries: readonly number[]; + private _series = new Map(); + + constructor(boundaries: readonly number[] = TIME_BUCKETS) { + this._boundaries = boundaries; + } + + observe(labels: Record, value: number): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { + labels, + buckets: new Array(this._boundaries.length).fill(0), + count: 0, + sum: 0, + }; + this._series.set(key, s); + } + for (let i = 0; i < this._boundaries.length; i++) { + if (value <= this._boundaries[i]) { + s.buckets[i]++; + } + } + s.count++; + s.sum += value; + } + + render(name: string, help: string): string { + if (this._series.size === 0) return ""; + const lines: string[] = []; + lines.push(`# HELP ${name} ${help}`); + lines.push(`# TYPE ${name} histogram`); + for (const s of this._series.values()) { + const lblStr = renderLabels(s.labels); + const sep = lblStr ? "," : ""; + for (let i = 0; i < this._boundaries.length; i++) { + lines.push( + `${name}_bucket{${lblStr}${sep}le="${this._boundaries[i]}"} ${s.buckets[i]}`, + ); + } + lines.push(`${name}_bucket{${lblStr}${sep}le="+Inf"} ${s.count}`); + lines.push(`${name}_sum{${lblStr}} ${s.sum}`); + lines.push(`${name}_count{${lblStr}} ${s.count}`); + } + return lines.join("\n"); + } +} + +// ── MultiLabelCounter ──────────────────────────────────────────── + +interface CounterSeries { + labels: Record; + value: number; +} + +export class MultiLabelCounter { + private _series = new Map(); + + increment(labels: Record, value = 1): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value += value; + } + + render(name: string, help: string): string { + if (this._series.size === 0) return ""; + const lines: string[] = []; + lines.push(`# HELP ${name} ${help}`); + lines.push(`# TYPE ${name} counter`); + for (const s of this._series.values()) { + lines.push(`${name}{${renderLabels(s.labels)}} ${s.value}`); + } + return lines.join("\n"); + } +} + +// ── GaugeMetric ────────────────────────────────────────────────── + +interface GaugeSeries { + labels: Record; + value: number; +} + +export class GaugeMetric { + private _series = new Map(); + + set(labels: Record, value: number): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value = value; + } + + inc(labels: Record, delta = 1): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value += delta; + } + + dec(labels: Record, delta = 1): void { + const key = labelKey(labels); + let s = this._series.get(key); + if (!s) { + s = { labels, value: 0 }; + this._series.set(key, s); + } + s.value -= delta; + } + + getValue(labels: Record): number { + return this._series.get(labelKey(labels))?.value ?? 0; + } + + render(name: string, help: string): string { + if (this._series.size === 0) return ""; + const lines: string[] = []; + lines.push(`# HELP ${name} ${help}`); + lines.push(`# TYPE ${name} gauge`); + for (const s of this._series.values()) { + lines.push(`${name}{${renderLabels(s.labels)}} ${s.value}`); + } + return lines.join("\n"); + } +} diff --git a/src/sdk/worker/metrics/httpObserver.ts b/src/sdk/worker/metrics/httpObserver.ts new file mode 100644 index 00000000..311a69b7 --- /dev/null +++ b/src/sdk/worker/metrics/httpObserver.ts @@ -0,0 +1,48 @@ +/** + * Global metrics observer for recording API client request latency + * and workflow-level metrics from code outside the event system. + * + * `createMetricsCollector()` and `CanonicalMetricsCollector` register + * themselves here on construction. `LegacyMetricsCollector` does NOT + * self-register so that direct construction preserves pre-metrics + * behavior; use `createMetricsCollector()` or call + * `setHttpMetricsObserver()` explicitly to opt in. + * + * fetchWithRetry and WorkflowExecutor call the observer without + * needing a direct reference. + */ + +export interface HttpMetricsObserver { + readonly measurePayloadSize: boolean; + + recordApiRequestTime( + method: string, + uri: string, + status: number | string, + durationMs: number, + metricUri?: string, + ): void; + + recordWorkflowInputSize( + workflowType: string, + sizeBytes: number, + version?: string, + ): void; + + recordWorkflowStartError( + workflowType?: string, + exception?: string, + ): void; +} + +let _observer: HttpMetricsObserver | undefined; + +export function setHttpMetricsObserver( + observer: HttpMetricsObserver | undefined, +): void { + _observer = observer; +} + +export function getHttpMetricsObserver(): HttpMetricsObserver | undefined { + return _observer; +} diff --git a/src/sdk/worker/metrics/index.ts b/src/sdk/worker/metrics/index.ts index 9869a1c7..732e02f7 100644 --- a/src/sdk/worker/metrics/index.ts +++ b/src/sdk/worker/metrics/index.ts @@ -1,7 +1,37 @@ +// Interface +export type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; + +// Implementations export { - MetricsCollector, + LegacyMetricsCollector, type MetricsCollectorConfig, type WorkerMetrics, -} from "./MetricsCollector"; +} from "./LegacyMetricsCollector"; +export { CanonicalMetricsCollector } from "./CanonicalMetricsCollector"; + +// Backward-compat alias: MetricsCollector → LegacyMetricsCollector +export { LegacyMetricsCollector as MetricsCollector } from "./LegacyMetricsCollector"; + +// Factory +export { createMetricsCollector } from "./metricsFactory"; + +// HTTP observer +export { + type HttpMetricsObserver, + getHttpMetricsObserver, + setHttpMetricsObserver, +} from "./httpObserver"; + +// Server & registries export { MetricsServer } from "./MetricsServer"; export { PrometheusRegistry } from "./PrometheusRegistry"; +export { CanonicalPrometheusRegistry } from "./CanonicalPrometheusRegistry"; + +// Accumulators (exposed for advanced usage / testing) +export { + HistogramAccumulator, + MultiLabelCounter, + GaugeMetric, + TIME_BUCKETS, + SIZE_BUCKETS, +} from "./accumulators"; diff --git a/src/sdk/worker/metrics/metricsFactory.ts b/src/sdk/worker/metrics/metricsFactory.ts new file mode 100644 index 00000000..86066895 --- /dev/null +++ b/src/sdk/worker/metrics/metricsFactory.ts @@ -0,0 +1,30 @@ +import type { MetricsCollectorConfig } from "./LegacyMetricsCollector"; +import type { MetricsCollectorInterface } from "./MetricsCollectorInterface"; +import { LegacyMetricsCollector } from "./LegacyMetricsCollector"; +import { CanonicalMetricsCollector } from "./CanonicalMetricsCollector"; +import { setHttpMetricsObserver } from "./httpObserver"; + +/** + * Create the appropriate MetricsCollector based on environment variables. + * + * - WORKER_CANONICAL_METRICS=true -> CanonicalMetricsCollector + * - Unset / any other value -> LegacyMetricsCollector (default) + * + * WORKER_LEGACY_METRICS is reserved for future use: once canonical becomes + * the default, setting WORKER_LEGACY_METRICS=true will re-activate legacy + * metrics. It is not read by the current implementation. + */ +export function createMetricsCollector( + config?: MetricsCollectorConfig, +): MetricsCollectorInterface { + const useCanonical = ["true", "1", "yes"].includes( + (process.env.WORKER_CANONICAL_METRICS ?? "").toLowerCase(), + ); + + const collector = useCanonical + ? new CanonicalMetricsCollector(config) + : new LegacyMetricsCollector(config); + + setHttpMetricsObserver(collector); + return collector; +}