diff --git a/.changeset/add-debounce-maxdelay.md b/.changeset/add-debounce-maxdelay.md deleted file mode 100644 index a70b95d4765..00000000000 --- a/.changeset/add-debounce-maxdelay.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -"@trigger.dev/core": patch -"@trigger.dev/sdk": patch ---- - -Add `maxDelay` option to debounce feature. This allows setting a maximum time limit for how long a debounced run can be delayed, ensuring execution happens within a specified window even with continuous triggers. - -```typescript -await myTask.trigger(payload, { - debounce: { - key: "my-key", - delay: "5s", - maxDelay: "30m", // Execute within 30 minutes regardless of continuous triggers - }, -}); -``` diff --git a/.changeset/agent-skills.md b/.changeset/agent-skills.md new file mode 100644 index 00000000000..5ed3b11fc2f --- /dev/null +++ b/.changeset/agent-skills.md @@ -0,0 +1,16 @@ +--- +"@trigger.dev/sdk": patch +"@trigger.dev/core": patch +"@trigger.dev/build": patch +"trigger.dev": patch +--- + +Add Agent Skills for `chat.agent`. Drop a folder with a `SKILL.md` and any helper scripts/references next to your task code, register it with `skills.define({ id, path })`, and the CLI bundles it into the deploy image automatically — no `trigger.config.ts` changes. The agent gets a one-line summary in its system prompt and discovers full instructions on demand via `loadSkill`, with `bash` and `readFile` tools scoped per-skill (path-traversal guards, output caps, abort-signal propagation). + +```ts +const pdfSkill = skills.define({ id: "pdf-extract", path: "./skills/pdf-extract" }); + +chat.skills.set([await pdfSkill.local()]); +``` + +Built on the [AI SDK cookbook pattern](https://ai-sdk.dev/cookbook/guides/agent-skills) — portable across providers. SDK + CLI only for now; dashboard-editable `SKILL.md` text is on the roadmap. diff --git a/.changeset/ai-prompts.md b/.changeset/ai-prompts.md new file mode 100644 index 00000000000..511aa303097 --- /dev/null +++ b/.changeset/ai-prompts.md @@ -0,0 +1,52 @@ +--- +"@trigger.dev/sdk": minor +--- + +**AI Prompts** — define prompt templates as code alongside your tasks, version them on deploy, and override the text or model from the dashboard without redeploying. Prompts integrate with the Vercel AI SDK via `toAISDKTelemetry()` (links every generation span back to the prompt) and with `chat.agent` via `chat.prompt.set()` + `chat.toStreamTextOptions()`. + +```ts +import { prompts } from "@trigger.dev/sdk"; +import { generateText } from "ai"; +import { openai } from "@ai-sdk/openai"; +import { z } from "zod"; + +export const supportPrompt = prompts.define({ + id: "customer-support", + model: "gpt-4o", + config: { temperature: 0.7 }, + variables: z.object({ + customerName: z.string(), + plan: z.string(), + issue: z.string(), + }), + content: `You are a support agent for Acme. + +Customer: {{customerName}} ({{plan}} plan) +Issue: {{issue}}`, +}); + +const resolved = await supportPrompt.resolve({ + customerName: "Alice", + plan: "Pro", + issue: "Can't access billing", +}); + +const result = await generateText({ + model: openai(resolved.model ?? "gpt-4o"), + system: resolved.text, + prompt: "Can't access billing", + ...resolved.toAISDKTelemetry(), +}); +``` + +**What you get:** + +- **Code-defined, deploy-versioned templates** — define with `prompts.define({ id, model, config, variables, content })`. Every deploy creates a new version visible in the dashboard. Mustache-style placeholders (`{{var}}`, `{{#cond}}...{{/cond}}`) with Zod / ArkType / Valibot-typed variables. +- **Dashboard overrides** — change a prompt's text or model from the dashboard without redeploying. Overrides take priority over the deployed "current" version and are environment-scoped (dev / staging / production independent). +- **Resolve API** — `prompt.resolve(vars, { version?, label? })` returns the compiled `text`, resolved `model`, `version`, and labels. Standalone `prompts.resolve(slug, vars)` for cross-file resolution with full type inference on slug and variable shape. +- **AI SDK integration** — spread `resolved.toAISDKTelemetry({ ...extra })` into any `generateText` / `streamText` call and every generation span links to the prompt in the dashboard alongside its input variables, model, tokens, and cost. +- **`chat.agent` integration** — `chat.prompt.set(resolved)` stores the resolved prompt run-scoped; `chat.toStreamTextOptions({ registry })` pulls `system`, `model` (resolved via the AI SDK provider registry), `temperature` / `maxTokens` / etc., and telemetry into a single spread for `streamText`. +- **Management SDK** — `prompts.list()`, `prompts.versions(slug)`, `prompts.promote(slug, version)`, `prompts.createOverride(slug, body)`, `prompts.updateOverride(slug, body)`, `prompts.removeOverride(slug)`, `prompts.reactivateOverride(slug, version)`. +- **Dashboard** — prompts list with per-prompt usage sparklines; per-prompt detail with Template / Details / Versions / Generations / Metrics tabs. AI generation spans get a custom inspector showing the linked prompt's metadata, input variables, and template content alongside model, tokens, cost, and the message thread. + +See [/docs/ai/prompts](https://trigger.dev/docs/ai/prompts) for the full reference — template syntax, version resolution order, override workflow, and type utilities (`PromptHandle`, `PromptIdentifier`, `PromptVariables`). diff --git a/.changeset/ai-tool-helpers.md b/.changeset/ai-tool-helpers.md new file mode 100644 index 00000000000..09e3b612ada --- /dev/null +++ b/.changeset/ai-tool-helpers.md @@ -0,0 +1,15 @@ +--- +"@trigger.dev/sdk": patch +--- + +Add `ai.toolExecute(task)` so you can wire a Trigger subtask in as the `execute` handler of an AI SDK `tool()` while defining `description` and `inputSchema` yourself — useful when you want full control over the tool surface and just need Trigger's subtask machinery for the body. + +```ts +const myTool = tool({ + description: "...", + inputSchema: z.object({ ... }), + execute: ai.toolExecute(mySubtask), +}); +``` + +`ai.tool(task)` (`toolFromTask`) keeps doing the all-in-one wrap and now aligns its return type with AI SDK's `ToolSet`. Minimum `ai` peer raised to `^6.0.116` to avoid cross-version `ToolSet` mismatches in monorepos. diff --git a/.changeset/calm-hooks-wait.md b/.changeset/calm-hooks-wait.md deleted file mode 100644 index 02a83fac31c..00000000000 --- a/.changeset/calm-hooks-wait.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@trigger.dev/react-hooks": patch ---- - -Fix `onComplete` callback firing prematurely when the realtime stream disconnects before the run finishes. diff --git a/.changeset/cap-idempotency-key-length.md b/.changeset/cap-idempotency-key-length.md new file mode 100644 index 00000000000..d1360369148 --- /dev/null +++ b/.changeset/cap-idempotency-key-length.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Reject overlong `idempotencyKey` values at the API boundary so they no longer trip an internal size limit on the underlying unique index and surface as a generic 500. Inputs are capped at 2048 characters — well above what `idempotencyKeys.create()` produces (a 64-character hash) and above any realistic raw key. Applies to `tasks.trigger`, `tasks.batchTrigger`, `batch.create` (Phase 1 streaming batches), `wait.createToken`, `wait.forDuration`, and the input/session stream waitpoint endpoints. Over-limit requests now return a structured 400 instead. diff --git a/.changeset/chat-agent-on-boot-hook.md b/.changeset/chat-agent-on-boot-hook.md new file mode 100644 index 00000000000..5eaa078e65e --- /dev/null +++ b/.changeset/chat-agent-on-boot-hook.md @@ -0,0 +1,21 @@ +--- +"@trigger.dev/sdk": minor +--- + +Adds `onBoot` to `chat.agent` — a lifecycle hook that fires once per worker process picking up the chat. Runs for the initial run, preloaded runs, AND reactive continuation runs (post-cancel, crash, `endRun`, `requestUpgrade`, OOM retry), before any other hook. Use it to initialize `chat.local`, open per-process resources, or re-hydrate state from your DB on continuation — anywhere the SAME run picking up after suspend/resume isn't enough. + +```ts +const userContext = chat.local<{ name: string; plan: string }>({ id: "userContext" }); + +export const myChat = chat.agent({ + id: "my-chat", + onBoot: async ({ clientData, continuation }) => { + const user = await db.user.findUnique({ where: { id: clientData.userId } }); + userContext.init({ name: user.name, plan: user.plan }); + }, + run: async ({ messages, signal }) => + streamText({ model: openai("gpt-4o"), messages, abortSignal: signal }), +}); +``` + +Use `onBoot` (not `onChatStart`) for state setup that must run every time a worker picks up the chat — `onChatStart` fires once per chat and won't run on continuation, leaving `chat.local` uninitialized when `run()` tries to use it. diff --git a/.changeset/chat-agent.md b/.changeset/chat-agent.md new file mode 100644 index 00000000000..733a8ab22e4 --- /dev/null +++ b/.changeset/chat-agent.md @@ -0,0 +1,44 @@ +--- +"@trigger.dev/sdk": minor +"@trigger.dev/core": patch +--- + +**AI Agents** — run AI SDK chat completions as durable Trigger.dev agents instead of fragile API routes. Define an agent in one function, point `useChat` at it from React, and the conversation survives page refreshes, network blips, and process restarts. + +```ts +import { chat } from "@trigger.dev/sdk/ai"; +import { streamText } from "ai"; +import { openai } from "@ai-sdk/openai"; + +export const myChat = chat.agent({ + id: "my-chat", + run: async ({ messages, signal }) => + streamText({ model: openai("gpt-4o"), messages, abortSignal: signal }), +}); +``` + +```tsx +import { useChat } from "@ai-sdk/react"; +import { useTriggerChatTransport } from "@trigger.dev/sdk/chat/react"; + +const transport = useTriggerChatTransport({ task: "my-chat", accessToken, startSession }); +const { messages, sendMessage } = useChat({ transport }); +``` + +**What you get:** + +- **AI SDK `useChat` integration** — a custom [`ChatTransport`](https://sdk.vercel.ai/docs/ai-sdk-ui/transport) (`useTriggerChatTransport`) plugs straight into Vercel AI SDK's `useChat` hook. Text streaming, tool calls, reasoning, and `data-*` parts all work natively over Trigger.dev's realtime streams. No custom API routes needed. +- **First-turn fast path (`chat.headStart`)** — opt-in handler that runs the first turn's `streamText` step in your warm server process while the agent run boots in parallel, cutting cold-start TTFC by roughly half (measured 2801ms → 1218ms on `claude-sonnet-4-6`). The agent owns step 2+ (tool execution, persistence, hooks) so heavy deps stay where they belong. Web Fetch handler works natively in Next.js, Hono, SvelteKit, Remix, Workers, etc.; bridge to Express/Fastify/Koa via `chat.toNodeListener`. New `@trigger.dev/sdk/chat-server` subpath. +- **Multi-turn durability via Sessions** — every chat is backed by a durable Session that outlives any individual run. Conversations resume across page refreshes, idle timeout, crashes, and deploys; `resume: true` reconnects via `lastEventId` so clients only see new chunks. `sessions.list` enumerates chats for inbox-style UIs. +- **Auto-accumulated history, delta-only wire** — the backend accumulates the full conversation across turns; clients only ship the new message each turn. Long chats never hit the 512 KiB body cap. Register `hydrateMessages` to be the source of truth yourself. +- **Lifecycle hooks** — `onPreload`, `onChatStart`, `onValidateMessages`, `hydrateMessages`, `onTurnStart`, `onBeforeTurnComplete`, `onTurnComplete`, `onChatSuspend`, `onChatResume` — for persistence, validation, and post-turn work. +- **Stop generation** — client-driven `transport.stopGeneration(chatId)` aborts mid-stream; the run stays alive for the next message, partial response is captured, and aborted parts (stuck `partial-call` tools, in-progress reasoning) are auto-cleaned. +- **Tool approvals (HITL)** — tools with `needsApproval: true` pause until the user approves or denies via `addToolApprovalResponse`. The runtime reconciles the updated assistant message by ID and continues `streamText`. +- **Steering and background injection** — `pendingMessages` injects user messages between tool-call steps so users can steer the agent mid-execution; `chat.inject()` + `chat.defer()` adds context from background work (self-review, RAG, safety checks) between turns. +- **Actions** — non-turn frontend commands (undo, rollback, regenerate, edit) sent via `transport.sendAction`. Fire `hydrateMessages` + `onAction` only — no turn hooks, no `run()`. `onAction` can return a `StreamTextResult` for a model response, or `void` for side-effect-only. +- **Typed state primitives** — `chat.local` for per-run state accessible from hooks, `run()`, tools, and subtasks (auto-serialized through `ai.toolExecute`); `chat.store` for typed shared data between agent and client; `chat.history` for reading and mutating the message chain; `clientDataSchema` for typed `clientData` in every hook. +- **`chat.toStreamTextOptions()`** — one spread into `streamText` wires up versioned system [Prompts](https://trigger.dev/docs/ai/prompts), model resolution, telemetry metadata, compaction, steering, and background injection. +- **Multi-tab coordination** — `multiTab: true` + `useMultiTabChat` prevents duplicate sends and syncs state across browser tabs via `BroadcastChannel`. Non-active tabs go read-only with live updates. +- **Network resilience** — built-in indefinite retry with bounded backoff, reconnect on `online` / tab refocus / bfcache restore, `Last-Event-ID` mid-stream resume. No app code needed. + +See [/docs/ai-chat](https://trigger.dev/docs/ai-chat/overview) for the full surface — quick start, three backend approaches (`chat.agent`, `chat.createSession`, raw task), persistence and code-sandbox patterns, type-level guides, and API reference. diff --git a/.changeset/chat-history-read-primitives.md b/.changeset/chat-history-read-primitives.md new file mode 100644 index 00000000000..fd26ad8548b --- /dev/null +++ b/.changeset/chat-history-read-primitives.md @@ -0,0 +1,21 @@ +--- +"@trigger.dev/sdk": minor +--- + +Add read primitives to `chat.history` for HITL flows: `getPendingToolCalls()`, `getResolvedToolCalls()`, `extractNewToolResults(message)`, `getChain()`, and `findMessage(messageId)`. These lift the accumulator-walking logic that customers building human-in-the-loop tools were re-implementing into the SDK. + +Use `getPendingToolCalls()` to gate fresh user turns while a tool call is awaiting an answer. Use `extractNewToolResults(message)` to dedup tool results when persisting to your own store — the helper returns only the parts whose `toolCallId` is not already resolved on the chain. + +```ts +const pending = chat.history.getPendingToolCalls(); +if (pending.length > 0) { + // an addToolOutput is expected before a new user message +} + +onTurnComplete: async ({ responseMessage }) => { + const newResults = chat.history.extractNewToolResults(responseMessage); + for (const r of newResults) { + await db.toolResults.upsert({ id: r.toolCallId, output: r.output, errorText: r.errorText }); + } +}; +``` diff --git a/.changeset/chat-session-attributes.md b/.changeset/chat-session-attributes.md new file mode 100644 index 00000000000..ec4c6a54076 --- /dev/null +++ b/.changeset/chat-session-attributes.md @@ -0,0 +1,6 @@ +--- +"@trigger.dev/sdk": patch +"@trigger.dev/core": patch +--- + +Stamp `gen_ai.conversation.id` (the chat id) on every span and metric emitted from inside a `chat.task` or `chat.agent` run. Lets you filter dashboard spans, runs, and metrics by the chat conversation that produced them — independent of the run boundary, so multi-run chats correlate cleanly. No code changes required on the user side. diff --git a/.changeset/cli-deploy-skip-rewrite-timestamp.md b/.changeset/cli-deploy-skip-rewrite-timestamp.md new file mode 100644 index 00000000000..60e82732dce --- /dev/null +++ b/.changeset/cli-deploy-skip-rewrite-timestamp.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +Add `TRIGGER_BUILD_SKIP_REWRITE_TIMESTAMP=1` escape hatch for local self-hosted builds whose buildx driver doesn't support `rewrite-timestamp` alongside push (e.g. orbstack's default `docker` driver). diff --git a/.changeset/consistent-stream-targets.md b/.changeset/consistent-stream-targets.md deleted file mode 100644 index 5fbb5780303..00000000000 --- a/.changeset/consistent-stream-targets.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@trigger.dev/sdk": patch ---- - -Aligned the SDK's `getRunIdForOptions` logic with the Core package to handle semantic targets (`root`, `parent`) in root tasks. diff --git a/.changeset/export-start-attempt-hook-type.md b/.changeset/export-start-attempt-hook-type.md deleted file mode 100644 index bad7c5258b7..00000000000 --- a/.changeset/export-start-attempt-hook-type.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"@trigger.dev/sdk": patch ---- - -Export `AnyOnStartAttemptHookFunction` type to allow defining `onStartAttempt` hooks for individual tasks. diff --git a/.changeset/fix-dead-process-execute-hang.md b/.changeset/fix-dead-process-execute-hang.md deleted file mode 100644 index fa96e9c88c3..00000000000 --- a/.changeset/fix-dead-process-execute-hang.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -"trigger.dev": patch ---- - -Fix runner getting stuck indefinitely when `execute()` is called on a dead child process. diff --git a/.changeset/locals-key-dual-package-fix.md b/.changeset/locals-key-dual-package-fix.md new file mode 100644 index 00000000000..38d42e19dfb --- /dev/null +++ b/.changeset/locals-key-dual-package-fix.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Fix `LocalsKey` type incompatibility across dual-package builds. The phantom value-type brand no longer uses a module-level `unique symbol`, so a single TypeScript compilation that resolves the type from both the ESM and CJS outputs (which can happen under certain pnpm hoisting layouts) no longer sees two structurally-incompatible variants of the same type. diff --git a/.changeset/mcp-agent-chat-sessions.md b/.changeset/mcp-agent-chat-sessions.md new file mode 100644 index 00000000000..c3f01aebf28 --- /dev/null +++ b/.changeset/mcp-agent-chat-sessions.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +The CLI MCP server's agent-chat tools (`start_agent_chat`, `send_agent_message`, `close_agent_chat`) now run on the new Sessions primitive, so AI assistants driving a `chat.agent` get the same idempotent-by-`chatId`, durable-across-runs behavior the browser transport gets. Required PAT scopes go from `write:inputStreams` to `read:sessions` + `write:sessions`. diff --git a/.changeset/mcp-list-runs-region.md b/.changeset/mcp-list-runs-region.md new file mode 100644 index 00000000000..b72cfb23c97 --- /dev/null +++ b/.changeset/mcp-list-runs-region.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +MCP `list_runs` tool: add a `region` filter input and surface each run's executing region in the formatted summary. diff --git a/.changeset/mock-chat-agent-test-harness.md b/.changeset/mock-chat-agent-test-harness.md new file mode 100644 index 00000000000..9876e56a9f7 --- /dev/null +++ b/.changeset/mock-chat-agent-test-harness.md @@ -0,0 +1,8 @@ +--- +"@trigger.dev/sdk": patch +"@trigger.dev/core": patch +--- + +Unit-test `chat.agent` definitions offline with `mockChatAgent` from `@trigger.dev/sdk/ai/test`. Drives a real agent's turn loop in-process — no network, no task runtime — so you can send messages, actions, and stop signals via driver methods, inspect captured output chunks, and verify hooks fire. Pairs with `MockLanguageModelV3` from `ai/test` for model mocking. `setupLocals` lets you pre-seed `locals` (DB clients, service stubs) before `run()` starts. + +The broader `runInMockTaskContext` harness it's built on lives at `@trigger.dev/core/v3/test` — useful for unit-testing any task code, not just chat. diff --git a/.changeset/mollifier-redis-worker-primitives.md b/.changeset/mollifier-redis-worker-primitives.md new file mode 100644 index 00000000000..a209e530c24 --- /dev/null +++ b/.changeset/mollifier-redis-worker-primitives.md @@ -0,0 +1,9 @@ +--- +"@trigger.dev/redis-worker": patch +--- + +Add MollifierBuffer and MollifierDrainer primitives for trigger burst smoothing. + +MollifierBuffer (`accept`, `pop`, `ack`, `requeue`, `fail`, `evaluateTrip`) is a per-env FIFO over Redis with atomic Lua transitions for status tracking. `evaluateTrip` is a sliding-window trip evaluator the webapp gate uses to detect per-env trigger bursts. + +MollifierDrainer pops entries through a polling loop with a user-supplied handler. The loop survives transient Redis errors via capped exponential backoff (up to 5s), and per-env pop failures don't poison the rest of the batch — one env's blip is logged and counted as failed for that tick. Rotation is two-level: orgs at the top, envs within each org. The buffer maintains `mollifier:orgs` and `mollifier:org-envs:${orgId}` atomically with per-env queues, so the drainer walks orgs → envs directly without an in-memory cache. The `maxOrgsPerTick` option (default 500) caps how many orgs are scheduled per tick; for each picked org, one env is popped (rotating round-robin within the org). An org with N envs gets the same per-tick scheduling slot as an org with 1 env, so tenant-level drainage throughput is determined by org count rather than env count. diff --git a/.changeset/plugin-auth-path.md b/.changeset/plugin-auth-path.md new file mode 100644 index 00000000000..7ce08b71a33 --- /dev/null +++ b/.changeset/plugin-auth-path.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/plugins": patch +--- + +The public interfaces for a plugin system. Initially consolidated authentication and authorization interfaces. diff --git a/.changeset/pre.json b/.changeset/pre.json new file mode 100644 index 00000000000..a5d1b75f8c7 --- /dev/null +++ b/.changeset/pre.json @@ -0,0 +1,22 @@ +{ + "mode": "pre", + "tag": "rc", + "initialVersions": { + "coordinator": "0.0.1", + "docker-provider": "0.0.1", + "kubernetes-provider": "0.0.1", + "supervisor": "0.0.1", + "webapp": "1.0.0", + "@trigger.dev/build": "4.4.6", + "trigger.dev": "4.4.6", + "@trigger.dev/core": "4.4.6", + "@trigger.dev/plugins": "4.4.6", + "@trigger.dev/python": "4.4.6", + "@trigger.dev/react-hooks": "4.4.6", + "@trigger.dev/redis-worker": "4.4.6", + "@trigger.dev/rsc": "4.4.6", + "@trigger.dev/schema-to-json": "4.4.6", + "@trigger.dev/sdk": "4.4.6" + }, + "changesets": [] +} diff --git a/.changeset/retry-sigsegv.md b/.changeset/retry-sigsegv.md new file mode 100644 index 00000000000..5a53c351efe --- /dev/null +++ b/.changeset/retry-sigsegv.md @@ -0,0 +1,5 @@ +--- +"@trigger.dev/core": patch +--- + +Retry `TASK_PROCESS_SIGSEGV` task crashes under the user's retry policy instead of failing the run on the first segfault. SIGSEGV in Node tasks is frequently non-deterministic (native addon races, JIT/GC interaction, near-OOM in native code, host issues), so retrying on a fresh process often succeeds. The retry is gated by the task's existing `retry` config + `maxAttempts` — same path `TASK_PROCESS_SIGTERM` and uncaught exceptions already use — so tasks without a retry policy still fail fast. diff --git a/.changeset/runs-list-region-filter.md b/.changeset/runs-list-region-filter.md new file mode 100644 index 00000000000..c487e2d632c --- /dev/null +++ b/.changeset/runs-list-region-filter.md @@ -0,0 +1,6 @@ +--- +"@trigger.dev/core": patch +"@trigger.dev/sdk": patch +--- + +Add `region` to the runs list / retrieve API: filter runs by region (`runs.list({ region: "..." })` / `filter[region]=`) and read each run's executing region from the new `region` field on the response. diff --git a/.changeset/sessions-primitive.md b/.changeset/sessions-primitive.md new file mode 100644 index 00000000000..79a6ca48f65 --- /dev/null +++ b/.changeset/sessions-primitive.md @@ -0,0 +1,26 @@ +--- +"@trigger.dev/sdk": minor +"@trigger.dev/core": patch +--- + +**Sessions** — a durable, run-aware stream channel keyed on a stable `externalId`. A Session is the unit of state that owns a multi-run conversation: messages flow through `.in`, responses through `.out`, both survive run boundaries. Sessions back the new `chat.agent` runtime, and you can build on them directly for any pattern that needs durable bi-directional streaming across runs. + +```ts +import { sessions, tasks } from "@trigger.dev/sdk"; + +// Trigger a task and subscribe to its session output in one call +const { runId, stream } = await tasks.triggerAndSubscribe("my-task", payload, { + externalId: "user-456", +}); + +for await (const chunk of stream) { + // ... +} + +// Enumerate existing sessions (powers inbox-style UIs without a separate index) +for await (const s of sessions.list({ type: "chat.agent", tag: "user:user-456" })) { + console.log(s.id, s.externalId, s.createdAt, s.closedAt); +} +``` + +See [/docs/ai-chat/overview](https://trigger.dev/docs/ai-chat/overview) for the full surface — Sessions powers the durable, resumable chat runtime described there. diff --git a/.changeset/vendor-superjson-esm-fix.md b/.changeset/vendor-superjson-esm-fix.md deleted file mode 100644 index ef04201d2c9..00000000000 --- a/.changeset/vendor-superjson-esm-fix.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -"@trigger.dev/core": patch ---- - -fix: vendor superjson to fix ESM/CJS compatibility - -Bundle superjson during build to avoid `ERR_REQUIRE_ESM` errors on Node.js versions that don't support `require(ESM)` by default (< 22.12.0) and AWS Lambda which intentionally disables it. diff --git a/.claude/REVIEW.md b/.claude/REVIEW.md new file mode 100644 index 00000000000..19edf00a52e --- /dev/null +++ b/.claude/REVIEW.md @@ -0,0 +1,50 @@ +# REVIEW.md — Trigger.dev OSS + +Repo-specific signal for anyone (human or agent) reviewing a PR in this codebase. Calibrates what counts as critical, what to always check, and what to skip. + +## What makes a 🔴 Important finding here + +Reserve 🔴 for things that would page someone or block a rollback. In this codebase, that means: + +- **Rolling-deploy breakage.** Old and new versions of the webapp/supervisor run side-by-side during deploys. A change is broken if: + - A Lua script's behavior changes for a given key set without versioning (rename the script with a behavior-descriptive suffix like `Tracked` rather than `V2` — both versions must coexist safely). + - A Redis data shape used by both versions changes in place. New shapes need a new key namespace. + - A migration is not backward-compatible with the prior image. +- **Schema / migration safety.** Prisma migrations must be backward-compatible with the prior deploy. Adding NOT NULL without a default, dropping a column an old image still reads, renaming a column — all 🔴. +- **ClickHouse migration ordering + idempotency.** Goose runs in strict mode in the deploy pipeline and refuses to apply a missing version below the current version — slotting a new file in below the latest already-applied version blocks the deploy. New ClickHouse migration files MUST use the next available number (`max(files in internal-packages/clickhouse/schema/) + 1`); if main has added migrations while you've been on a branch, renumber yours. DDL must also be idempotent (`ADD COLUMN IF NOT EXISTS`, `DROP COLUMN IF EXISTS`, `CREATE TABLE IF NOT EXISTS`, `ADD INDEX IF NOT EXISTS`) so a partial / `--allow-missing` apply elsewhere doesn't fail on retry. Either fault is 🔴 — both break test/prod deploys. Rules live in `internal-packages/clickhouse/CLAUDE.md`. +- **Queue / concurrency correctness.** RunQueue, MarQS (V1, legacy), redis-worker — any change to enqueue / dequeue / locking semantics. Re-derive the invariant on paper before flagging or accepting. +- **Missing index on a hot table.** New Prisma queries against `TaskRun`, `TaskRunExecutionSnapshot`, `JobRun`, `Project`, etc. must use an existing index. Check `internal-packages/database/prisma/schema.prisma` for the relevant `@@index` lines — don't guess and don't propose `EXPLAIN`. +- **Recovery-path queries.** Any `TaskRun.findFirst` / `findMany` added to a schedule, run-recovery, or restart loop. Recovery fan-outs (Redis crash, restart storms) turn "rare indexed query" into a DB incident. 🔴 even if indexed. +- **Aggregations on hot tables.** No `COUNT` / `GROUP BY` on `TaskRun` or other multi-million-row tables. Use Redis or ClickHouse for counts. +- **Prod Redis blast-radius.** New code paths that `SCAN` with broad patterns (`*foo*`) on prod-shaped Redis, or `EVAL` Lua with `SCAN` loops inside. Both are 🔴. +- **`@trigger.dev/core` direct import** from anywhere outside the SDK package. Always import from `@trigger.dev/sdk`. Core direct imports are 🔴 — they break the public API contract. +- **Heavy execute-deps imported into request-handler bundles.** Specifically `chat.handover` and similar split-bundle entry points must not transitively import the agent task's execute path. Watch for new imports added at module top-level of route files. +- **V1 engine code modified in a "V2 only" PR.** The `apps/webapp/app/v3/` directory contains both. If the PR description says V2-only but it touches `triggerTaskV1`, `cancelTaskRunV1`, `MarQS`, etc. — 🔴. + +## Always check + +- **Tests use testcontainers, not mocks.** Vitest with `redisTest` / `postgresTest` / `containerTest` from `@internal/testcontainers`. Any new `vi.mock(...)` on Redis, Postgres, BullMQ, or other infra is wrong here — 🔴 if added in production-path tests, 🟡 if isolated unit test. +- **Public-package changes have a changeset.** `pnpm run changeset:add` produces `.changeset/*.md`. Required for any edit under `packages/*`. Missing → 🟡; missing on a breaking change → 🔴. +- **Server-only changes have `.server-changes/*.md`.** Required for `apps/webapp/`, `apps/supervisor/` edits with no public-package change. Body should be 1-2 sentences (it has to fit as one bullet in a future changelog). Missing → 🟡. +- **Lua script naming.** Coexisting scripts use behavior-descriptive suffixes (`Tracked`), never `V2`. Old name must keep working until the next deploy clears it. +- **RunQueue payload shape.** V2 run-queue payload's `projectId` is consumed by `workerQueueResolver` for override matching. If a PR drops it from the payload, 🔴. +- **`safeSend` scope.** Defensive IPC wrappers belong on loop / interval / handler contexts, not one-shot terminal sends. If the PR adds `safeSend` to a single terminal call for consistency, 🟡 with a "remove this" suggestion. +- **Zod version.** Pinned to `3.25.76` monorepo-wide. New package adding zod with a different version or range — 🔴. + +## Skip (do NOT flag) + +- Anything Prettier / ESLint catches. CI runs both. +- TypeScript style preferences (`type` vs `interface`) — already covered by repo standards. +- Test coverage exhortations as a generic suggestion. Only flag missing tests when a specific code path is genuinely untested and the path has prior incidents. +- `agentcrumbs` markers (`// @crumbs`, `// #region @crumbs`) and `agentcrumbs` imports — these are temporary debug instrumentation stripped before merge. +- `// removed comments for removed code`, renamed `_unused` vars, re-exported types as "backwards compatibility shims" — also covered by repo standards. +- Suggestions to "add error handling" without naming a specific scenario that breaks. +- Documentation prose nitpicks in `docs/*` MDX files unless factually wrong. + +## Things V1/legacy that should NOT block a PR + +The `apps/webapp/app/v3/` directory name is misleading — most code there is V2. Only specific files are V1-only legacy: `MarQS` queue, `triggerTaskV1`, `cancelTaskRunV1`, and a handful of others (see `apps/webapp/CLAUDE.md` for the exact list). Don't flag "you should refactor this to use V2" on those — they're frozen. + +## Confidence calibration for this repo + +The most common false-positive pattern: speculating about race conditions in code paths the agent doesn't have runtime visibility into. If the only evidence is "this *could* race", drop it. If you can point to a specific interleaving with file:line for each step, surface it. diff --git a/.claude/review-guides/chat-agent-sessions-row-agnostic.md b/.claude/review-guides/chat-agent-sessions-row-agnostic.md new file mode 100644 index 00000000000..7fb9851f308 --- /dev/null +++ b/.claude/review-guides/chat-agent-sessions-row-agnostic.md @@ -0,0 +1,287 @@ +# Review guide — chat.agent on Sessions, row-agnostic addressing + +Scope: the 12 uncommitted files. **No new behaviour beyond the public surface +already on this branch** — this is plumbing cleanup that: + +1. Eliminates the transport's session-creation step +2. Makes `chatId` the universal addressing string everywhere +3. Makes the server-side stream/append/wait routes row-agnostic + +## The two design moves + +**Move 1 — agent owns session lifecycle.** `chat.agent` and +`chat.customAgent` upsert the backing `Session` row at bind, fire-and-forget, +keyed on `externalId = payload.chatId`. The transport, server-side +`AgentChat`, and `chat.createTriggerAction` no longer create sessions at all. +Browsers cannot mint sessions either (`POST /api/v1/sessions` is now +secret-key-only). One owner, one path. + +**Move 2 — `chatId` is the only address.** The transport, server-side +`AgentChat`, JWT scopes, and S2 stream paths all use `chatId` directly. The +Session's friendlyId is informational. To make this safe, the three stream +routes (`.in/.out` PUT, GET, POST append, plus the run-engine `wait` +endpoint) became "row-optional" and derive a *canonical addressing key* +(`row.externalId ?? row.friendlyId`, fallback to the URL param when the row +hasn't been upserted yet). Same canonical key is used to build the S2 stream +path, the waitpoint cache key, and the JWT resource set — so any caller +addressing by either form converges on the same physical stream. + +Together these remove an entire class of "did the row land yet?" races. The +transport can subscribe to `/sessions/{chatId}/out` before the agent boots, +the agent's `void sessions.create({externalId: chatId})` lands a moment +later, and any earlier reads/writes are already on the right S2 key. + +--- + +## Read in this order + +### 1. `apps/webapp/app/services/realtime/sessions.server.ts` (+34 lines) + +The new primitive. Two helpers: + +- `isSessionFriendlyIdForm(value)` — `value.startsWith("session_")`. Used to + decide whether a missing row is a hard 404 (opaque friendlyId) or a soft + "row will land later" (externalId form). +- `canonicalSessionAddressingKey(row, paramSession)` — `row.externalId ?? + row.friendlyId` if the row exists, else `paramSession`. **This is the load- + bearing function.** Read its docstring. + +**Question to ask:** can two callers addressing the "same" session ever get +different canonical keys? Only if the row exists for one and not the other, +*and* the URL forms differ — but in that case the row-less caller used the +externalId form (friendlyId-form would have 404'd earlier), and the row-ful +caller computes `row.externalId ?? row.friendlyId`. If the row's externalId +matches the URL, they converge. If it doesn't, there's no row to find by +that string anyway. The interesting edge is "row exists with no externalId", +addressed via friendlyId — both sides read `row.friendlyId`. ✓ + +### 2. `apps/webapp/app/routes/realtime.v1.sessions.$session.$io.ts` (+47/-12) + +PUT initialize + GET subscribe (SSE). Both use the helper. The interesting +part is the loader's `findResource` + `authorization.resource`: + +```ts +findResource: async (params, auth) => { + const row = await resolveSessionByIdOrExternalId(...); + if (!row && isSessionFriendlyIdForm(params.session)) return undefined; // 404 + return { row, addressingKey: canonicalSessionAddressingKey(row, params.session) }; +}, +authorization: { + resource: ({ row, addressingKey }) => { + const ids = new Set([addressingKey]); + if (row) { + ids.add(row.friendlyId); + if (row.externalId) ids.add(row.externalId); + } + return { sessions: [...ids] }; + }, + superScopes: ["read:sessions", "read:all", "admin"], +}, +``` + +**Why three IDs in the resource set?** `checkAuthorization` is "any-match" +across the resource values. We want a JWT scoped to *either* form to +authorize *either* URL form. Smoke test verified the 4-cell matrix passes. + +**The PUT path** (action handler) is simpler — it just resolves the row, +builds an addressing key, and hands it to `initializeSessionStream`. Worth +noting the `closedAt` check is now `maybeSession?.closedAt` — no row means +no closedAt to enforce. + +### 3. `apps/webapp/app/routes/realtime.v1.sessions.$session.$io.append.ts` (+22/-13) + +POST append (browser writes a record to `.in` or server writes to `.out`). +Same row-optional pattern. Both the S2 append and the waitpoint drain use +`addressingKey`. + +**Question to ask:** what fires the waitpoint? An agent's +`session.in.wait()` registers a waitpoint keyed on `(addressingKey, io)` via +the wait endpoint (file 4). The append handler drains by the *same* key — +even if the agent registered with externalId form and the transport +appended via friendlyId form, both compute the same canonical key, so they +converge. ✓ + +### 4. `apps/webapp/app/routes/api.v1.runs.$runFriendlyId.session-streams.wait.ts` (+18/-13) + +The agent's `.in.wait()` endpoint. Run-engine creates the waitpoint, then +registers it in Redis under `(addressingKey, io)`. The race-check that runs +right after creation reads from S2 by the same key. Three call sites — +`addSessionStreamWaitpoint`, `readSessionStreamRecords`, +`removeSessionStreamWaitpoint` — all consistent. + +### 5. `apps/webapp/app/routes/api.v1.sessions.ts` (+4/-2) + +**Security tightening.** Removed `allowJWT: true` and `corsStrategy: "all"` +from the `POST /api/v1/sessions` action — secret-key only now. + +**Question to ask:** was the JWT path actually used? Until this branch, the +transport called it via `ensureSession` (now deleted). After this branch, +nobody reaches it from the browser. `chat.createTriggerAction` (server +secret key) is the only browser-adjacent path. + +### 6. `packages/trigger-sdk/src/v3/ai.ts` (+62/-39) + +Two near-identical edits — one in `chatAgent`, one in `chatCustomAgent`. +Both bind on `payload.chatId` and fire-and-forget the upsert: + +```ts +locals.set(chatSessionHandleKey, sessions.open(payload.chatId)); +void sessions + .create({ type: "chat.agent", externalId: payload.chatId }) + .catch(() => { /* best effort */ }); +``` + +**Question to ask:** why `void`-and-`catch`? Awaiting the upsert would gate +the agent's bind on a network round-trip that doesn't unblock anything +user-visible — `.in/.out` routes are row-agnostic and the waitpoint cache +is keyed on the addressing string, not the row id. If the upsert genuinely +fails, the next bind retries the same idempotent call (`sessions.create` +upserts on `externalId`, so concurrent triggers on one chatId converge to +one row). The row matters for downstream metadata + listing, not for live +addressing. + +The PAT scope minting in `chatAgent` (two call sites — preload and +sendMessage) now uses `payload.chatId` for the `sessions:` resource. That +matches what the transport/AgentChat use as the JWT resource and what the +JWT's resource set in the loader includes. Cross-form addressing works +either way (smoke-tested), but using `chatId` keeps the chain tight. + +`createChatTriggerAction` is the most visibly trimmed: no pre-create, no +threading `sessionId` into payload, scope mint uses `chatId`. Return type +no longer carries `sessionId` — note `TriggerChatTaskResult.sessionId` was +already declared optional, so this isn't a public-API break. + +**Stale docstring to flag:** `chat.ts:59` and `chat.ts:112` still describe +PAT scopes as `read:sessions:{sessionId}` and +`write:sessions:{sessionId}`. Functionally either ID works (row lookup +canonicalises), but the doc text is now out of date — it should say +`{chatId}`. Worth a tidy-up before merge but not blocking. + +### 7. `packages/trigger-sdk/src/v3/chat.ts` (+63/-117) + +**The biggest mechanical edit.** Net -54 lines from deleting `ensureSession` +and untangling its callers. + +What disappeared: +- `private async ensureSession(chatId)` — gone +- The "lazy upsert from the browser if no triggerTask callback" branch in + `sendMessages` and `preload` — gone +- The "throw if neither path surfaced a sessionId" guard — gone +- All `state.sessionId` URL params replaced with `chatId` +- `subscribeToSessionStream`'s `chatId?` (optional) is now `chatId` (required) + +What stayed: +- `state.sessionId` in `ChatSessionState` — optional, informational +- The `restore from external storage` branch in the constructor still + hydrates `sessionId` if persisted, just doesn't *require* it +- `notifySessionChange` still surfaces `sessionId` if known + +**Question to ask:** does the transport ever still need the friendlyId? The +only place is the `onSessionChange` callback's payload (so consumers +persisting state can save it for later display). The transport itself never +puts it in a URL or a waitpoint key. + +The `sendMessages` path is worth re-reading: when state.runId is set, it +appends to `.in/append` and subscribes to `.out`. If the append fails with +a non-auth error, it falls through to triggering a new run (legacy "run is +dead" detection — unchanged from pre-Sessions, doesn't depend on +addressing). + +### 8. `packages/trigger-sdk/src/v3/chat-client.ts` (+34/-33) + +Server-side `AgentChat`. Mirrors the transport changes — every URL uses +`this.chatId`. `triggerNewRun` no longer pre-creates a session. `ChatSession` +and internal `SessionState` types now have optional `sessionId`. + +The shape of the diff is identical to the transport: delete the upsert, +swap addressing identifiers, optionalise the friendlyId. If you've read +`chat.ts` carefully, this one is mostly mechanical confirmation that both +client surfaces (browser transport + server-side AgentChat) speak the same +addressing protocol. + +### 9. Test infrastructure — `sessions.ts` (+18) + `mock-chat-agent.ts` (+25) + +`__setSessionCreateImplForTests` mirrors the existing +`__setSessionOpenImplForTests`. `mockChatAgent` installs a no-op create stub +returning a synthetic `CreatedSessionResponseBody` so the agent's bind-time +`void sessions.create(...)` doesn't try to hit a real API. Cleanup runs in +the same `.finally` as the open override. + +**Question to ask:** is the synthetic response shape correct? It mirrors +`CreatedSessionResponseBody` — `id`, `externalId`, `type`, `tags`, +`metadata`, `closedAt`, `closedReason`, `expiresAt`, `createdAt`, +`updatedAt`, `isCached`. Tests don't currently assert on this object, so +the bar is "doesn't crash + matches the type". Met. + +### 10. `packages/trigger-sdk/src/v3/chat.test.ts` (+13/-12) + +Three classes of test edits, all consequences: + +- Stream URL assertion: `chat-1` (the chatId) instead of + `session_streamurl` (the friendlyId) +- `renewRunAccessToken` callback: `sessionId: undefined` (was + `DEFAULT_SESSION_ID` because the mocked trigger doesn't surface it) +- Token resolve count: `1` (was `2` — second resolve was for `ensureSession`) +- One `onSessionChange` matchObject loses `sessionId` + +### 11. `apps/webapp/app/routes/_app.../playground/.../route.tsx` (1 line) + +`sessionId: string` → `sessionId?: string` in the playground sidebar prop +to track the transport type change. + +--- + +## Edge cases I checked, so you don't have to + +- **Cross-form JWT auth (curl matrix).** JWT scoped to externalId can call + externalId URL ✓ and friendlyId URL ✓. JWT scoped to friendlyId can call + externalId URL ✓ and friendlyId URL ✓. Smoke-tested. +- **Row materialises after subscribe.** Transport opens + `GET /sessions/{chatId}/out` before agent's bind upsert lands → 200 OK, + `addressingKey = chatId` (paramSession fallback). Once the row lands + with `externalId = chatId`, addressingKey resolves to the same value via + `row.externalId`. Same S2 key throughout. +- **Concurrent triggers on one chatId.** Two browser tabs trigger two runs + → two binds → two `sessions.create({externalId: chatId})` calls. Upsert + semantics: both return the same row. +- **Closed session enforcement.** Still enforced when a row exists. + `maybeSession?.closedAt` is null-safe; no row = no close-state to honour. +- **Agent run cancellation.** Frontend doesn't auto-detect — unchanged from + pre-Sessions; messages sit in S2 until the next trigger (the existing + run-PAT auth-error path is the only reaper). Out of scope for this branch. +- **Idle timeout in dev.** Runs stay `EXECUTING_WITH_WAITPOINTS` past the + configured idle because dev runs don't snapshot/restore; the in-process + idle clock advances locally without touching the row. Expected, not a + regression. + +## Things explicitly **not** in this branch + +- Run-state subscription on the transport side (the "run died, re-trigger + silently" UX gap) +- Session auto-close on agent exit (still client-driven by design) +- Any change to `Session` schema, `sessions.create` semantics, or + `chatAccessTokenTTL` +- Docstring updates for `read:sessions:{sessionId}` / `write:sessions:{sessionId}` + in `chat.ts:59` and `chat.ts:112` (functional but textually stale — + follow-up nit) + +--- + +## What I'd be ready to answer cold + +- Why fire-and-forget upsert (vs. `await`) in the agent's bind step +- Why the route's authorization resource set has three IDs (cross-form JWT + auth) +- Why `POST /api/v1/sessions` lost `allowJWT` (security tightening — no + caller needs it after the transport's `ensureSession` is gone) +- What converges two callers using different URL forms onto the same S2 + stream (`canonicalSessionAddressingKey`, identical computation on both + sides for any given row) +- What makes `sessions.create` race-safe under concurrent triggers + (`externalId` upsert) +- Why `state.sessionId` stayed on `ChatSessionState` at all (pure + informational, surfaced via `onSessionChange` for consumer persistence; + zero addressing role) +- Why the chat-client (server-side AgentChat) and chat (transport) edits + look near-identical (they implement the same client protocol against the + same row-agnostic routes) diff --git a/.claude/rules/database-safety.md b/.claude/rules/database-safety.md new file mode 100644 index 00000000000..14a6523595b --- /dev/null +++ b/.claude/rules/database-safety.md @@ -0,0 +1,13 @@ +--- +paths: + - "internal-packages/database/**" +--- + +# Database Migration Safety + +- When adding indexes to **existing tables**, use `CREATE INDEX CONCURRENTLY IF NOT EXISTS` to avoid table locks. These must be in their own separate migration file (one index per file). +- Indexes on **newly created tables** (same migration as `CREATE TABLE`) do not need CONCURRENTLY. +- When indexing a **new column on an existing table**, split into two migrations: first `ADD COLUMN IF NOT EXISTS`, then `CREATE INDEX CONCURRENTLY IF NOT EXISTS` in a separate file. +- After generating a migration with Prisma, remove extraneous lines for: `_BackgroundWorkerToBackgroundWorkerFile`, `_BackgroundWorkerToTaskQueue`, `_TaskRunToTaskRunTag`, `_WaitpointRunConnections`, `_completedWaitpoints`, `SecretStore_key_idx`, and unrelated TaskRun indexes. +- Never drop columns or tables without explicit approval. +- New code should target `RunEngineVersion.V2` only. diff --git a/.claude/rules/docs-writing.md b/.claude/rules/docs-writing.md new file mode 100644 index 00000000000..bbfb471368e --- /dev/null +++ b/.claude/rules/docs-writing.md @@ -0,0 +1,14 @@ +--- +paths: + - "docs/**" +--- + +# Documentation Writing Rules + +- Use Mintlify MDX format. Frontmatter: `title`, `description`, `sidebarTitle` (optional). +- After creating a new page, add it to `docs.json` navigation under the correct group. +- Use Mintlify components: ``, ``, ``, ``, ``, ``, ``/``. +- Code examples should be complete and runnable where possible. +- Always import from `@trigger.dev/sdk`, never `@trigger.dev/sdk/v3`. +- Keep paragraphs short. Use headers to break up content. +- Link to related pages using relative paths (e.g., `[Tasks](/tasks/overview)`). diff --git a/.claude/rules/legacy-v3-code.md b/.claude/rules/legacy-v3-code.md new file mode 100644 index 00000000000..6fd8d9402c2 --- /dev/null +++ b/.claude/rules/legacy-v3-code.md @@ -0,0 +1,33 @@ +--- +paths: + - "apps/webapp/app/v3/**" +--- + +# Legacy V1 Engine Code in `app/v3/` + +The `v3/` directory name is misleading - most code here is actively used by the current V2 engine. Only the specific files below are legacy V1-only code. + +## V1-Only Files - Never Modify + +- `marqs/` directory (entire MarQS queue system: sharedQueueConsumer, devQueueConsumer, fairDequeuingStrategy, devPubSub) +- `legacyRunEngineWorker.server.ts` (V1 background job worker) +- `services/triggerTaskV1.server.ts` (deprecated V1 task triggering) +- `services/cancelTaskRunV1.server.ts` (deprecated V1 cancellation) +- `authenticatedSocketConnection.server.ts` (V1 dev WebSocket using DevQueueConsumer) +- `sharedSocketConnection.ts` (V1 shared queue socket using SharedQueueConsumer) + +## V1/V2 Branching Pattern + +Some services act as routers that branch on `RunEngineVersion`: +- `services/cancelTaskRun.server.ts` - calls V1 service or `engine.cancelRun()` for V2 +- `services/batchTriggerV3.server.ts` - uses marqs for V1 path, run-engine for V2 + +When editing these shared services, only modify V2 code paths. + +## V2 Modern Stack + +- **Run lifecycle**: `@internal/run-engine` (internal-packages/run-engine) +- **Background jobs**: `@trigger.dev/redis-worker` (not graphile-worker/zodworker) +- **Queue operations**: RunQueue inside run-engine (not MarQS) +- **V2 engine singleton**: `runEngine.server.ts`, `runEngineHandlers.server.ts` +- **V2 workers**: `commonWorker.server.ts`, `alertsWorker.server.ts`, `batchTriggerWorker.server.ts` diff --git a/.claude/rules/package-installation.md b/.claude/rules/package-installation.md new file mode 100644 index 00000000000..310074823c5 --- /dev/null +++ b/.claude/rules/package-installation.md @@ -0,0 +1,22 @@ +--- +paths: + - "**/package.json" +--- + +# Installing Packages + +When adding a new dependency to any package.json in the monorepo: + +1. **Look up the latest version** on npm before adding: + ```bash + pnpm view version + ``` + If unsure which version to use (e.g. major version compatibility), confirm with the user. + +2. **Edit the package.json directly** — do NOT use `pnpm add` as it can cause issues in the monorepo. Add the dependency with the correct version range (typically `^x.y.z`). + +3. **Run `pnpm i` from the repo root** after editing to install and update the lockfile: + ```bash + pnpm i + ``` + Always run from the repo root, not from the package directory. diff --git a/.claude/rules/sdk-packages.md b/.claude/rules/sdk-packages.md new file mode 100644 index 00000000000..549eb341809 --- /dev/null +++ b/.claude/rules/sdk-packages.md @@ -0,0 +1,12 @@ +--- +paths: + - "packages/**" +--- + +# Public Package Rules + +- Changes to `packages/` are **customer-facing**. Always add a changeset: `pnpm run changeset:add` +- Default to **patch**. Get maintainer approval for minor. Never select major without explicit approval. +- `@trigger.dev/core`: **Never import the root**. Always use subpath imports (e.g., `@trigger.dev/core/v3`). +- Do NOT update `rules/` or `.claude/skills/trigger-dev-tasks/` unless explicitly asked. These are maintained in separate dedicated passes. +- Test changes using `references/hello-world` reference project. diff --git a/.claude/rules/server-apps.md b/.claude/rules/server-apps.md new file mode 100644 index 00000000000..4d46789701c --- /dev/null +++ b/.claude/rules/server-apps.md @@ -0,0 +1,23 @@ +--- +paths: + - "apps/**" +--- + +# Server App Changes + +When modifying server apps (webapp, supervisor, coordinator, etc.) with **no package changes**, add a `.server-changes/` file instead of a changeset: + +```bash +cat > .server-changes/descriptive-name.md << 'EOF' +--- +area: webapp +type: fix +--- + +Brief description of what changed and why. +EOF +``` + +- **area**: `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- **type**: `feature` | `fix` | `improvement` | `breaking` +- If the PR also touches `packages/`, just the changeset is sufficient (no `.server-changes/` needed). diff --git a/.claude/skills/span-timeline-events/SKILL.md b/.claude/skills/span-timeline-events/SKILL.md new file mode 100644 index 00000000000..122f49912d7 --- /dev/null +++ b/.claude/skills/span-timeline-events/SKILL.md @@ -0,0 +1,78 @@ +--- +name: span-timeline-events +description: Use when adding, modifying, or debugging OTel span timeline events in the trace view. Covers event structure, ClickHouse storage constraints, rendering in SpanTimeline component, admin visibility, and the step-by-step process for adding new events. +allowed-tools: Read, Write, Edit, Glob, Grep, Bash +--- + +# Span Timeline Events + +The trace view's right panel shows a timeline of events for the selected span. These are OTel span events rendered by `app/utils/timelineSpanEvents.ts` and the `SpanTimeline` component. + +## How They Work + +1. **Span events** in OTel are attached to a parent span. In ClickHouse, they're stored as separate rows with `kind: "SPAN_EVENT"` sharing the parent span's `span_id`. The `#mergeRecordsIntoSpanDetail` method reassembles them into the span's `events` array at query time. +2. The timeline only renders events whose `name` starts with `trigger.dev/` - all others are silently filtered out. +3. The **display name** comes from `properties.event` (not the span event name), mapped through `getFriendlyNameForEvent()`. +4. Events are shown on the **span they belong to** - events on one span don't appear in another span's timeline. + +## ClickHouse Storage Constraint + +When events are written to ClickHouse, `spanEventsToTaskEventV1Input()` filters out events whose `start_time` is not greater than the parent span's `startTime`. Events at or before the span start are silently dropped. This means span events must have timestamps strictly after the span's own `startTimeUnixNano`. + +## Timeline Rendering (SpanTimeline component) + +The `SpanTimeline` component in `app/components/run/RunTimeline.tsx` renders: + +1. **Events** (thin 1px line with hollow dots) - all events from `createTimelineSpanEventsFromSpanEvents()` +2. **"Started"** marker (thick cap) - at the span's `startTime` +3. **Duration bar** (thick 7px line) - from "Started" to "Finished" +4. **"Finished"** marker (thick cap) - at `startTime + duration` + +The thin line before "Started" only appears when there are events with timestamps between the span start and the first child span. For the Attempt span this works well (Dequeued -> Pod scheduled -> Launched -> etc. all happen before execution starts). Events all get `lineVariant: "light"` (thin) while the execution bar gets `variant: "normal"` (thick). + +## Trace View Sort Order + +Sibling spans (same parent) are sorted by `start_time ASC` from the ClickHouse query. The `createTreeFromFlatItems` function preserves this order. Event timestamps don't affect sort order - only the span's own `start_time`. + +## Event Structure + +```typescript +// OTel span event format +{ + name: "trigger.dev/run", // Must start with "trigger.dev/" to render + timeUnixNano: "1711200000000000000", + attributes: [ + { key: "event", value: { stringValue: "dequeue" } }, // The actual event type + { key: "duration", value: { intValue: 150 } }, // Optional: duration in ms + ] +} +``` + +## Admin-Only Events + +`getAdminOnlyForEvent()` controls visibility. Events default to **admin-only** (`true`). + +| Event | Admin-only | Friendly name | +|-------|-----------|---------------| +| `dequeue` | No | Dequeued | +| `fork` | No | Launched | +| `import` | No (if no fork event) | Importing task file | +| `create_attempt` | Yes | Attempt created | +| `lazy_payload` | Yes | Lazy attempt initialized | +| `pod_scheduled` | Yes | Pod scheduled | +| (default) | Yes | (raw event name) | + +## Adding New Timeline Events + +1. Add OTLP span event with `name: "trigger.dev/"` and `properties.event: ""` +2. Event timestamp must be strictly after the parent span's `startTimeUnixNano` (ClickHouse drops earlier events) +3. Add friendly name in `getFriendlyNameForEvent()` in `app/utils/timelineSpanEvents.ts` +4. Set admin visibility in `getAdminOnlyForEvent()` +5. Optionally add help text in `getHelpTextForEvent()` + +## Key Files + +- `app/utils/timelineSpanEvents.ts` - filtering, naming, admin logic +- `app/components/run/RunTimeline.tsx` - `SpanTimeline` component (thin line + thick bar rendering) +- `app/presenters/v3/SpanPresenter.server.ts` - loads span data including events +- `app/v3/eventRepository/clickhouseEventRepository.server.ts` - `spanEventsToTaskEventV1Input()` (storage filter), `#mergeRecordsIntoSpanDetail` (reassembly) diff --git a/.cursor/rules/webapp.mdc b/.cursor/rules/webapp.mdc index a362f14fe12..f1333febdc0 100644 --- a/.cursor/rules/webapp.mdc +++ b/.cursor/rules/webapp.mdc @@ -4,7 +4,7 @@ globs: apps/webapp/**/*.tsx,apps/webapp/**/*.ts alwaysApply: false --- -The main trigger.dev webapp, which powers it's API and dashboard and makes up the docker image that is produced as an OSS image, is a Remix 2.1.0 app that uses an express server, written in TypeScript. The following subsystems are either included in the webapp or are used by the webapp in another part of the monorepo: +The main trigger.dev webapp, which powers it's API and dashboard and makes up the docker image that is produced as an OSS image, is a Remix 2.17.4 app that uses an express server, written in TypeScript. The following subsystems are either included in the webapp or are used by the webapp in another part of the monorepo: - `@trigger.dev/database` exports a Prisma 6.14.0 client that is used extensively in the webapp to access a PostgreSQL instance. The schema file is [schema.prisma](mdc:internal-packages/database/prisma/schema.prisma) - `@trigger.dev/core` is a published package and is used to share code between the `@trigger.dev/sdk` and the webapp. It includes functionality but also a load of Zod schemas for data validation. When importing from `@trigger.dev/core` in the webapp, we never import the root `@trigger.dev/core` path, instead we favor one of the subpath exports that you can find in [package.json](mdc:packages/core/package.json) diff --git a/.cursor/rules/writing-tasks.mdc b/.cursor/rules/writing-tasks.mdc index 5116d083e23..359ed5d4733 100644 --- a/.cursor/rules/writing-tasks.mdc +++ b/.cursor/rules/writing-tasks.mdc @@ -14,7 +14,7 @@ alwaysApply: false ## Essential requirements when generating task code -1. You MUST use `@trigger.dev/sdk/v3` +1. You MUST import from `@trigger.dev/sdk` (NEVER `@trigger.dev/sdk/v3`) 2. You MUST NEVER use `client.defineJob` 3. YOU MUST `export` every task, including subtasks 4. If you are able to generate an example payload for a task, do so. @@ -53,7 +53,7 @@ Instead, you MUST ALWAYS generate ONLY this pattern: ```ts // ✅ ALWAYS GENERATE THIS EXACT PATTERN -import { task } from "@trigger.dev/sdk/v3"; +import { task } from "@trigger.dev/sdk"; //1. You need to export each task, even if it's a subtask export const helloWorld = task({ @@ -71,7 +71,7 @@ export const helloWorld = task({ A task is a function that can run for a long time with resilience to failure: ```ts -import { task } from "@trigger.dev/sdk/v3"; +import { task } from "@trigger.dev/sdk"; export const helloWorld = task({ id: "hello-world", @@ -271,7 +271,7 @@ Global lifecycle hooks can also be defined in `trigger.config.ts` to apply to al ## Correct Schedules task (cron) implementations ```ts -import { schedules } from "@trigger.dev/sdk/v3"; +import { schedules } from "@trigger.dev/sdk"; export const firstScheduledTask = schedules.task({ id: "first-scheduled-task", @@ -312,7 +312,7 @@ export const firstScheduledTask = schedules.task({ ### Attach a Declarative schedule ```ts -import { schedules } from "@trigger.dev/sdk/v3"; +import { schedules } from "@trigger.dev/sdk"; // Sepcify a cron pattern (UTC) export const firstScheduledTask = schedules.task({ @@ -326,7 +326,7 @@ export const firstScheduledTask = schedules.task({ ``` ```ts -import { schedules } from "@trigger.dev/sdk/v3"; +import { schedules } from "@trigger.dev/sdk"; // Specify a specific timezone like this: export const secondScheduledTask = schedules.task({ @@ -375,7 +375,7 @@ const createdSchedule = await schedules.create({ Schema tasks validate payloads against a schema before execution: ```ts -import { schemaTask } from "@trigger.dev/sdk/v3"; +import { schemaTask } from "@trigger.dev/sdk"; import { z } from "zod"; const myTask = schemaTask({ @@ -400,7 +400,7 @@ When you trigger a task from your backend code, you need to set the `TRIGGER_SEC Triggers a single run of a task with specified payload and options without importing the task. Use type-only imports for full type checking. ```ts -import { tasks } from "@trigger.dev/sdk/v3"; +import { tasks } from "@trigger.dev/sdk"; import type { emailSequence } from "~/trigger/emails"; export async function POST(request: Request) { @@ -418,7 +418,7 @@ export async function POST(request: Request) { Triggers multiple runs of a single task with different payloads without importing the task. ```ts -import { tasks } from "@trigger.dev/sdk/v3"; +import { tasks } from "@trigger.dev/sdk"; import type { emailSequence } from "~/trigger/emails"; export async function POST(request: Request) { @@ -436,7 +436,7 @@ export async function POST(request: Request) { Triggers multiple runs of different tasks at once, useful when you need to execute multiple tasks simultaneously. ```ts -import { batch } from "@trigger.dev/sdk/v3"; +import { batch } from "@trigger.dev/sdk"; import type { myTask1, myTask2 } from "~/trigger/myTasks"; export async function POST(request: Request) { @@ -621,7 +621,7 @@ const handle = await myTask.trigger( Access metadata inside a run: ```ts -import { task, metadata } from "@trigger.dev/sdk/v3"; +import { task, metadata } from "@trigger.dev/sdk"; export const myTask = task({ id: "my-task", @@ -713,7 +713,7 @@ Trigger.dev Realtime enables subscribing to runs for real-time updates on run st Subscribe to a run after triggering a task: ```ts -import { runs, tasks } from "@trigger.dev/sdk/v3"; +import { runs, tasks } from "@trigger.dev/sdk"; async function myBackend() { const handle = await tasks.trigger("my-task", { some: "data" }); @@ -735,7 +735,7 @@ async function myBackend() { You can infer types of run's payload and output by passing the task type: ```ts -import { runs } from "@trigger.dev/sdk/v3"; +import { runs } from "@trigger.dev/sdk"; import type { myTask } from "./trigger/my-task"; for await (const run of runs.subscribeToRun(handle.id)) { @@ -752,7 +752,7 @@ for await (const run of runs.subscribeToRun(handle.id)) { Stream data in realtime from inside your tasks using the metadata system: ```ts -import { task, metadata } from "@trigger.dev/sdk/v3"; +import { task, metadata } from "@trigger.dev/sdk"; import OpenAI from "openai"; export type STREAMS = { @@ -947,7 +947,7 @@ For most use cases, Realtime hooks are preferred over SWR hooks with polling due For client-side usage, generate a public access token with appropriate scopes: ```ts -import { auth } from "@trigger.dev/sdk/v3"; +import { auth } from "@trigger.dev/sdk"; const publicToken = await auth.createPublicToken({ scopes: { @@ -967,7 +967,7 @@ Idempotency ensures that an operation produces the same result when called multi Provide an `idempotencyKey` when triggering a task to ensure it runs only once with that key: ```ts -import { idempotencyKeys, task } from "@trigger.dev/sdk/v3"; +import { idempotencyKeys, task } from "@trigger.dev/sdk"; export const myTask = task({ id: "my-task", @@ -1058,7 +1058,7 @@ function hash(payload: any): string { ```ts // onFailure executes after all retries are exhausted; use for notifications, logging, or side effects on final failure: -import { task, logger } from "@trigger.dev/sdk/v3"; +import { task, logger } from "@trigger.dev/sdk"; export const loggingExample = task({ id: "logging-example", @@ -1078,7 +1078,7 @@ export const loggingExample = task({ The `trigger.config.ts` file configures your Trigger.dev project, specifying task locations, retry settings, telemetry, and build options. ```ts -import { defineConfig } from "@trigger.dev/sdk/v3"; +import { defineConfig } from "@trigger.dev/sdk"; export default defineConfig({ project: "", @@ -1226,7 +1226,7 @@ await myTask.trigger({ name: "Alice", age: 30 }); Before generating any code, you MUST verify: -1. Are you importing from `@trigger.dev/sdk/v3`? If not, STOP and FIX. +1. Are you importing from `@trigger.dev/sdk` (NOT `@trigger.dev/sdk/v3`)? If not, STOP and FIX. 2. Have you exported every task? If not, STOP and FIX. 3. Have you generated any DEPRECATED code patterns? If yes, STOP and FIX. diff --git a/.env.example b/.env.example index 35c8c976ff6..8fef8f9f171 100644 --- a/.env.example +++ b/.env.example @@ -29,6 +29,52 @@ REDIS_TLS_DISABLED="true" DEV_OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:3030/otel" DEV_OTEL_BATCH_PROCESSING_ENABLED="0" +# Realtime streams v2 (Sessions, chat.agent, large stream backfills) backed +# by S2 (https://s2.dev). The `s2` service in docker/docker-compose.yml runs +# the open-source s2-lite binary and pre-creates a basin named `trigger-local` +# (see docker/config/s2-spec.json). Comment these out to fall back to v1 +# (Redis-only) streams; Sessions and chat.agent then become unavailable. +REALTIME_STREAMS_S2_BASIN=trigger-local +REALTIME_STREAMS_S2_ACCESS_TOKEN=ignored +REALTIME_STREAMS_S2_ENDPOINT=http://localhost:4566/v1 +REALTIME_STREAMS_S2_SKIP_ACCESS_TOKENS=true +REALTIME_STREAMS_DEFAULT_VERSION=v2 + +# Running multiple instances side by side (worktrees, branch experiments) +# +# Every host port in docker/docker-compose.yml is `${VAR:-default}` and the +# project name comes from `COMPOSE_PROJECT_NAME`. To stand up a second stack +# alongside the default one, uncomment the block below in this clone's `.env` +# (pick any offset that doesn't clash with anything else running), then update +# the URL/PORT vars further up to match. Default values are commented for +# reference. +# +# --- core (pnpm run docker) --- +# COMPOSE_PROJECT_NAME=triggerdotdev-docker-alt +# CONTAINER_PREFIX=alt- +# POSTGRES_HOST_PORT=15432 # default 5432 +# REDIS_HOST_PORT=16379 # default 6379 +# ELECTRIC_HOST_PORT=13060 # default 3060 +# MINIO_API_HOST_PORT=19005 # default 9005 +# MINIO_CONSOLE_HOST_PORT=19006 # default 9006 +# CLICKHOUSE_HTTP_HOST_PORT=18123 # default 8123 +# CLICKHOUSE_TCP_HOST_PORT=19000 # default 9000 +# S2_HOST_PORT=14566 # default 4566 +# REMIX_APP_PORT=13030 # default 3030 +# --- extras (only needed if you also run `pnpm run docker:full`) --- +# ELECTRIC_SHARD_1_HOST_PORT=13061 # default 3061 +# CH_UI_HOST_PORT=15521 # default 5521 +# TOXIPROXY_PROXY_HOST_PORT=40303 # default 30303 +# TOXIPROXY_API_HOST_PORT=18474 # default 8474 +# NGINX_H2_HOST_PORT=18443 # default 8443 +# OTEL_GRPC_HOST_PORT=14317 # default 4317 +# OTEL_HTTP_HOST_PORT=14318 # default 4318 +# OTEL_PROMETHEUS_HOST_PORT=18889 # default 8889 +# PROMETHEUS_HOST_PORT=19090 # default 9090 +# GRAFANA_HOST_PORT=13001 # default 3001 +# (and update DATABASE_URL / CLICKHOUSE_URL / REDIS_PORT / APP_ORIGIN / +# LOGIN_ORIGIN / ELECTRIC_ORIGIN / REALTIME_STREAMS_S2_ENDPOINT to match) + # When the domain is set to `localhost` the CLI deploy command will only --load the image by default and not --push it DEPLOY_REGISTRY_HOST=localhost:5000 @@ -77,9 +123,28 @@ POSTHOG_PROJECT_KEY= # DEPOT_TOKEN= # DEV_OTEL_EXPORTER_OTLP_ENDPOINT="http://0.0.0.0:4318" # These are needed for the object store (for handling large payloads/outputs) -# OBJECT_STORE_BASE_URL="https://{bucket}.{accountId}.r2.cloudflarestorage.com" -# OBJECT_STORE_ACCESS_KEY_ID= -# OBJECT_STORE_SECRET_ACCESS_KEY= +# +# Default provider +# OBJECT_STORE_BASE_URL=http://localhost:9005 +# OBJECT_STORE_BUCKET=packets +# OBJECT_STORE_ACCESS_KEY_ID=minioadmin +# OBJECT_STORE_SECRET_ACCESS_KEY=minioadmin +# OBJECT_STORE_REGION=us-east-1 +# OBJECT_STORE_SERVICE=s3 +# +# OBJECT_STORE_DEFAULT_PROTOCOL=s3 # Only specify this if you're going to migrate object storage and set protocol values below +# Named providers (protocol-prefixed data) - optional for multi-provider support +# OBJECT_STORE_S3_BASE_URL=https://s3.amazonaws.com +# OBJECT_STORE_S3_ACCESS_KEY_ID= +# OBJECT_STORE_S3_SECRET_ACCESS_KEY= +# OBJECT_STORE_S3_REGION=us-east-1 +# OBJECT_STORE_S3_SERVICE=s3 +# +# OBJECT_STORE_R2_BASE_URL=https://{bucket}.{accountId}.r2.cloudflarestorage.com +# OBJECT_STORE_R2_ACCESS_KEY_ID= +# OBJECT_STORE_R2_SECRET_ACCESS_KEY= +# OBJECT_STORE_R2_REGION=auto +# OBJECT_STORE_R2_SERVICE=s3 # CHECKPOINT_THRESHOLD_IN_MS=10000 # These control the server-side internal telemetry @@ -87,7 +152,7 @@ POSTHOG_PROJECT_KEY= # INTERNAL_OTEL_TRACE_LOGGING_ENABLED=1 # INTERNAL_OTEL_TRACE_INSTRUMENT_PRISMA_ENABLED=0 -# Enable local observability stack (requires `pnpm run docker` to start otel-collector) +# Enable local observability stack (requires `pnpm run docker:full` to bring up otel-collector + prometheus + grafana) # Uncomment these to send metrics to the local Prometheus via OTEL Collector: # INTERNAL_OTEL_METRIC_EXPORTER_ENABLED=1 # INTERNAL_OTEL_METRIC_EXPORTER_URL=http://localhost:4318/v1/metrics diff --git a/.github/ISSUE_TEMPLATE/vouch-request.yml b/.github/ISSUE_TEMPLATE/vouch-request.yml new file mode 100644 index 00000000000..9ffe04a8984 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/vouch-request.yml @@ -0,0 +1,28 @@ +name: Vouch Request +description: Request to be vouched as a contributor +labels: ["vouch-request"] +body: + - type: markdown + attributes: + value: | + ## Vouch Request + + We use [vouch](https://github.com/mitchellh/vouch) to manage contributor trust. PRs from unvouched users are automatically closed. + + To get vouched, fill out this form. A maintainer will review your request and vouch for you by commenting on this issue. + - type: textarea + id: context + attributes: + label: Why do you want to contribute? + description: Tell us a bit about yourself and what you'd like to work on. + placeholder: "I'd like to fix a bug I found in..." + validations: + required: true + - type: textarea + id: prior-work + attributes: + label: Prior contributions or relevant experience + description: Links to previous open source work, relevant projects, or anything that helps us understand your background. + placeholder: "https://github.com/..." + validations: + required: false diff --git a/.github/VOUCHED.td b/.github/VOUCHED.td new file mode 100644 index 00000000000..a257444cad6 --- /dev/null +++ b/.github/VOUCHED.td @@ -0,0 +1,24 @@ +# Vouched contributors for Trigger.dev +# See: https://github.com/mitchellh/vouch +# +# Org members +0ski +D-K-P +ericallam +matt-aitken +mpcgrid +myftija +nicktrn +samejr +isshaddad +# Bots +devin-ai-integration[bot] +dependabot[bot] +# Outside contributors +gautamsi +capaj +chengzp +bharathkumar39293 +bhekanik +jrossi +ThullyoCunha \ No newline at end of file diff --git a/.github/actions/get-image-tag/action.yml b/.github/actions/get-image-tag/action.yml index e0646230463..7f1505a0c11 100644 --- a/.github/actions/get-image-tag/action.yml +++ b/.github/actions/get-image-tag/action.yml @@ -23,35 +23,37 @@ runs: id: get_tag shell: bash run: | - if [[ -n "${{ inputs.tag }}" ]]; then - tag="${{ inputs.tag }}" - elif [[ "${{ github.ref_type }}" == "tag" ]]; then - if [[ "${{ github.ref_name }}" == infra-*-* ]]; then - env=$(echo ${{ github.ref_name }} | cut -d- -f2) - sha=$(echo ${{ github.sha }} | head -c7) + if [[ -n "${INPUTS_TAG}" ]]; then + tag="${INPUTS_TAG}" + elif [[ "${GITHUB_REF_TYPE}" == "tag" ]]; then + if [[ "${GITHUB_REF_NAME}" == infra-*-* ]]; then + env=$(echo ${GITHUB_REF_NAME} | cut -d- -f2) + sha=$(echo "${GITHUB_SHA}" | head -c7) ts=$(date +%s) tag=${env}-${sha}-${ts} - elif [[ "${{ github.ref_name }}" == re2-*-* ]]; then - env=$(echo ${{ github.ref_name }} | cut -d- -f2) - sha=$(echo ${{ github.sha }} | head -c7) + elif [[ "${GITHUB_REF_NAME}" == re2-*-* ]]; then + env=$(echo ${GITHUB_REF_NAME} | cut -d- -f2) + sha=$(echo "${GITHUB_SHA}" | head -c7) ts=$(date +%s) tag=${env}-${sha}-${ts} - elif [[ "${{ github.ref_name }}" == v.docker.* ]]; then + elif [[ "${GITHUB_REF_NAME}" == v.docker.* ]]; then version="${GITHUB_REF_NAME#v.docker.}" tag="v${version}" - elif [[ "${{ github.ref_name }}" == build-* ]]; then + elif [[ "${GITHUB_REF_NAME}" == build-* ]]; then tag="${GITHUB_REF_NAME#build-}" else - echo "Invalid git tag: ${{ github.ref_name }}" + echo "Invalid git tag: ${GITHUB_REF_NAME}" exit 1 fi - elif [[ "${{ github.ref_name }}" == "main" ]]; then + elif [[ "${GITHUB_REF_NAME}" == "main" ]]; then tag="main" else - echo "Invalid git ref: ${{ github.ref }}" + echo "Invalid git ref: ${GITHUB_REF}" exit 1 fi echo "tag=${tag}" >> "$GITHUB_OUTPUT" + env: + INPUTS_TAG: ${{ inputs.tag }} - name: 🔍 Check for validity id: check_validity diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000000..7bb64f36744 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,12 @@ +version: 2 +updates: + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + cooldown: + default-days: 7 + groups: + github-actions: + patterns: + - "*" diff --git a/.github/workflows/changesets-pr.yml b/.github/workflows/changesets-pr.yml index e2fdc187614..01c303a95ca 100644 --- a/.github/workflows/changesets-pr.yml +++ b/.github/workflows/changesets-pr.yml @@ -7,6 +7,7 @@ on: paths: - "packages/**" - ".changeset/**" + - ".server-changes/**" - "package.json" - "pnpm-lock.yaml" @@ -24,15 +25,15 @@ jobs: if: github.repository == 'triggerdotdev/trigger.dev' steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 # zizmor: ignore[artipacked] changesets/action pushes the release branch; no artifact upload here so no leak path with: fetch-depth: 0 - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 - name: Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" @@ -42,7 +43,7 @@ jobs: - name: Create release PR id: changesets - uses: changesets/action@v1 + uses: changesets/action@6a0a831ff30acef54f2c6aa1cbbc1096b066edaf # v1.7.0 with: version: pnpm run changeset:version commit: "chore: release" @@ -50,7 +51,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - name: Update PR title with version + - name: Update PR title and enhance body if: steps.changesets.outputs.published != 'true' env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -61,42 +62,13 @@ jobs: # we arbitrarily reference the version of the cli package here; it is the same for all package releases VERSION=$(git show origin/changeset-release/main:packages/cli-v3/package.json | jq -r '.version') gh pr edit "$PR_NUMBER" --title "chore: release v$VERSION" - fi - - update-lockfile: - name: Update lockfile on release PR - runs-on: ubuntu-latest - needs: release-pr - permissions: - contents: write - steps: - - name: Checkout release branch - uses: actions/checkout@v4 - with: - ref: changeset-release/main - - - name: Setup pnpm - uses: pnpm/action-setup@v4 - with: - version: 10.23.0 - - - name: Setup node - uses: buildjet/setup-node@v4 - with: - node-version: 20.20.0 - - name: Install and update lockfile - run: pnpm install --no-frozen-lockfile - - - name: Commit and push lockfile - run: | - set -e - if git diff --quiet pnpm-lock.yaml; then - echo "No lockfile changes" - else - git config user.name "github-actions[bot]" - git config user.email "github-actions[bot]@users.noreply.github.com" - git add pnpm-lock.yaml - git commit -m "chore: update lockfile for release" - git push origin changeset-release/main + # Enhance the PR body with a clean, deduplicated summary + RAW_BODY=$(gh pr view "$PR_NUMBER" --json body --jq '.body') + ENHANCED_BODY=$(CHANGESET_PR_BODY="$RAW_BODY" node scripts/enhance-release-pr.mjs "$VERSION") + if [ -n "$ENHANCED_BODY" ]; then + gh api repos/triggerdotdev/trigger.dev/pulls/"$PR_NUMBER" \ + -X PATCH \ + -f body="$ENHANCED_BODY" + fi fi diff --git a/.github/workflows/check-review-md.yml b/.github/workflows/check-review-md.yml new file mode 100644 index 00000000000..9b5b69d9d6c --- /dev/null +++ b/.github/workflows/check-review-md.yml @@ -0,0 +1,93 @@ +name: 🔎 REVIEW.md Drift Audit + +on: + pull_request: + types: [opened, ready_for_review, synchronize] + paths-ignore: + - "docs/**" + - ".changeset/**" + - ".server-changes/**" + - "references/**" + +concurrency: + group: review-md-drift-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + audit: + if: >- + github.event.pull_request.draft == false && + github.event.pull_request.head.repo.full_name == github.repository + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + id-token: write + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@fefa07e9c665b7320f08c3b525980457f22f58aa # v1.0.111 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + use_sticky_comment: true + allowed_bots: "devin-ai-integration[bot]" + + claude_args: | + --max-turns 30 + --allowedTools "Read,Glob,Grep,Bash(git diff:*)" + + prompt: | + You are auditing this PR for drift against `.claude/REVIEW.md`. + + ## Context + + `.claude/REVIEW.md` is the repo's source of truth for what AI / agent code reviewers should treat as critical findings (rolling-deploy safety, hot-table indexes, recovery-path queries, testcontainers usage, Lua versioning, etc.). It is consumed by review agents to calibrate severity. If REVIEW.md goes stale, every future agent review degrades. + + ## Strategy — read this first + + You have a hard turn budget. Spend it on signal, not coverage. The audit is allowed to miss things; it is NOT allowed to time out. + + 1. Read `.claude/REVIEW.md` once, in full. + 2. Run `git diff origin/main...HEAD --name-only` to get the list of changed files. Do NOT read the diff content yet. + 3. Scan the file-list for relevance to REVIEW.md scope. Relevance signals: changes to Prisma schema, Redis / queue / Lua code, hot tables, recovery / restart loops, new packages, deletions of paths REVIEW.md cites. Skim everything else. + 4. Open at most **5 files** total — only the ones most likely to surface a real signal. If nothing in the file-list looks relevant to any REVIEW.md rule, do NOT read any files; go straight to the verdict. + 5. Form a verdict and stop. Do not exhaust the turn budget exploring. + + Large PRs (>50 files changed) are a strong signal to be MORE selective, not more thorough. Pick 3-5 files at most. + + ## What to look for + + - **Stale references** — does any REVIEW.md rule cite a file, directory, function, table, Prisma model, or package name that has been removed or renamed in this PR (or is already gone from `main`)? + - **Contradictions** — does code in this PR clearly violate a current REVIEW.md rule? (Don't re-review the PR. Only flag if REVIEW.md and the PR plainly disagree.) + - **Missing rules** — does this PR introduce a new pattern future reviewers should know about? Examples: a new hot table, a new Lua-script versioning convention, a new safety wrapper, a new "must always check" invariant. + - **Obsolete rules** — has the repo moved past a constraint REVIEW.md still asserts? (e.g. a deprecated path is gone, a pattern is now linted, V1 code is deleted.) + + ## Response format + + If nothing needs changing: + + ✅ REVIEW.md looks current for this PR. + + Otherwise: + + 📝 **REVIEW.md updates suggested:** + + - **[stale]** `` — + - **[contradiction]** `` — + - **[missing]** under `##
` — + - **[obsolete]** `` — + + ## Rules + + - Maximum 3 suggestions per audit. Pick the highest-signal ones. + - Only flag things that would actually mislead a future reviewer. Style and wording do not count. + - Do NOT review the PR itself. Do NOT propose rules outside REVIEW.md's existing sections. + - Do NOT propose rules for one-off PR specifics that don't generalize to future PRs. + - If REVIEW.md does not exist in the repo, respond with `(skip)` and stop. + - When in doubt between "one more file read" and "finish now" — finish now. diff --git a/.github/workflows/claude-md-audit.yml b/.github/workflows/claude-md-audit.yml new file mode 100644 index 00000000000..01b1185cf16 --- /dev/null +++ b/.github/workflows/claude-md-audit.yml @@ -0,0 +1,73 @@ +name: 📝 CLAUDE.md Audit + +on: + pull_request: + types: [opened, ready_for_review, synchronize] + paths-ignore: + - "docs/**" + - ".changeset/**" + - ".server-changes/**" + - "**/*.md" + - "references/**" + +concurrency: + group: claude-md-audit-${{ github.event.pull_request.number }} + cancel-in-progress: true + +jobs: + audit: + if: >- + github.event.pull_request.draft == false && + github.event.pull_request.head.repo.full_name == github.repository + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + issues: write + id-token: write + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + persist-credentials: false + + - name: Run Claude Code + id: claude + uses: anthropics/claude-code-action@fefa07e9c665b7320f08c3b525980457f22f58aa # v1.0.111 + with: + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} + use_sticky_comment: true + allowed_bots: "devin-ai-integration[bot]" + + claude_args: | + --max-turns 15 + --allowedTools "Read,Glob,Grep,Bash(git diff:*)" + + prompt: | + You are reviewing a PR to check whether any CLAUDE.md files or .claude/rules/ files need updating. + + ## Your task + + 1. Run `git diff origin/main...HEAD --name-only` to see which files changed in this PR. + 2. For each changed directory, check if there's a CLAUDE.md in that directory or a parent directory. + 3. Determine if any CLAUDE.md or .claude/rules/ file should be updated based on the changes. Consider: + - New files/directories that aren't covered by existing documentation + - Changed architecture or patterns that contradict current CLAUDE.md guidance + - New dependencies, services, or infrastructure that Claude should know about + - Renamed or moved files that are referenced in CLAUDE.md + - Changes to build commands, test patterns, or development workflows + + ## Response format + + If NO updates are needed, respond with exactly: + ✅ CLAUDE.md files look current for this PR. + + If updates ARE needed, respond with a short list: + 📝 **CLAUDE.md updates suggested:** + - `path/to/CLAUDE.md`: [what should be added/changed] + - `.claude/rules/file.md`: [what should be added/changed] + + Keep suggestions specific and brief. Only flag things that would actually mislead Claude in future sessions. + Do NOT suggest updates for trivial changes (bug fixes, small refactors within existing patterns). + Do NOT suggest creating new CLAUDE.md files - only updates to existing ones. diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml index cadbe31773f..96a3ec96385 100644 --- a/.github/workflows/claude.yml +++ b/.github/workflows/claude.yml @@ -19,24 +19,25 @@ jobs: (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude'))) runs-on: ubuntu-latest permissions: - contents: read - pull-requests: read - issues: read + contents: write + pull-requests: write + issues: write id-token: write actions: read # Required for Claude to read CI results on PRs steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 1 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" @@ -49,9 +50,9 @@ jobs: - name: Run Claude Code id: claude - uses: anthropics/claude-code-action@v1 + uses: anthropics/claude-code-action@fefa07e9c665b7320f08c3b525980457f22f58aa # v1.0.111 with: - claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }} # This is an optional setting that allows Claude to read CI results on PRs additional_permissions: | diff --git a/.github/workflows/dependabot-weekly-summary.yml b/.github/workflows/dependabot-weekly-summary.yml new file mode 100644 index 00000000000..fb2717e2fb0 --- /dev/null +++ b/.github/workflows/dependabot-weekly-summary.yml @@ -0,0 +1,206 @@ +name: Dependabot Weekly Summary + +on: + schedule: + - cron: "0 8 * * 1" # Mon 08:00 UTC + workflow_dispatch: + +# Single-purpose monitoring workflow; serialise on workflow name only - we never +# want two concurrent summary runs racing to post the same digest. +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +permissions: + contents: read # gh CLI baseline + pull-requests: read # gh pr list (open dependabot PRs) + actions: read # gh run list / view (parse latest dependabot run logs) + +jobs: + summary: + name: Post weekly Dependabot summary + runs-on: ubuntu-latest + environment: dependabot-summary + env: + # Severities surface in the actions list when their remaining TTR drops + # below this many days. Override via repo/env var ACTION_THRESHOLD_DAYS. + THRESHOLD_DAYS: ${{ vars.ACTION_THRESHOLD_DAYS || '7' }} + steps: + - name: Fetch alerts and compute summaries + id: alerts + env: + GH_TOKEN: ${{ secrets.DEPENDABOT_ALERTS_TOKEN }} + REPO: ${{ github.repository }} + run: | + if ! gh api -X GET "/repos/$REPO/dependabot/alerts" --paginate > pages.json 2> err.txt; then + echo "total=?" >> "$GITHUB_OUTPUT" + ERR=$(head -c 200 err.txt | tr '\n' ' ') + echo "by_severity=:x: _failed to fetch alerts: ${ERR}_" >> "$GITHUB_OUTPUT" + echo "actions=:x: _alerts unavailable_" >> "$GITHUB_OUTPUT" + exit 0 + fi + jq -s '[.[][] | select(.state == "open")]' pages.json > open.json + + TOTAL=$(jq 'length' open.json) + echo "total=$TOTAL" >> "$GITHUB_OUTPUT" + + if [ "$TOTAL" = "0" ]; then + echo "by_severity=:white_check_mark: No open alerts." >> "$GITHUB_OUTPUT" + echo "actions=_None_" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Severity breakdown - real newlines so jq --arg in the payload + # builder encodes them as proper \n in JSON (Slack renders as breaks). + BY_SEV=$(jq -r ' + group_by(.security_advisory.severity) + | map({sev: .[0].security_advisory.severity, + count: length, + weight: ({"critical":0,"high":1,"medium":2,"low":3}[.[0].security_advisory.severity])}) + | sort_by(.weight) + | map("• *\(.count)* \(.sev)") + | join("\n") + ' open.json) + { + echo "by_severity<> "$GITHUB_OUTPUT" + + # Actions: alerts within THRESHOLD_DAYS of their TTR (P0=7d, P1=30d, P2=90d, P3=no deadline) + # Grouped by (package, severity); shows earliest deadline per group. + ACTIONS=$(jq -r --argjson threshold "$THRESHOLD_DAYS" ' + [.[] + | (.security_advisory.severity) as $sev + | ({"critical":7,"high":30,"medium":90,"low":null}[$sev]) as $ttr + | select($ttr != null) + | ((now - (.created_at | fromdateiso8601)) / 86400 | floor) as $age + | {pkg: .dependency.package.name, sev: $sev, remaining: ($ttr - $age)} + ] + | group_by([.pkg, .sev]) + | map({pkg: .[0].pkg, sev: .[0].sev, count: length, min_remaining: ([.[].remaining] | min)}) + | map(select(.min_remaining < $threshold)) + | sort_by(.min_remaining) + | if length == 0 then "_None_" + else (map( + "• *\(.pkg)* (\(.sev))" + + (if .count > 1 then " ×\(.count)" else "" end) + " - " + + (if .min_remaining < 0 then "*OVERDUE* by \(-.min_remaining)d" + else "\(.min_remaining)d remaining" end) + ) | join("\n")) + end + ' open.json) + { + echo "actions<> "$GITHUB_OUTPUT" + + - name: Fetch open dependabot PRs + id: prs + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + REPO_URL: https://github.com/${{ github.repository }} + run: | + if ! PR_JSON=$(gh pr list --repo "$REPO" --state open --author "app/dependabot" --json number,title 2> err.txt); then + ERR=$(head -c 200 err.txt | tr '\n' ' ') + echo "list=:x: _failed to fetch PRs: ${ERR}_" >> "$GITHUB_OUTPUT" + exit 0 + fi + LIST=$(echo "$PR_JSON" | jq -r --arg url "$REPO_URL" ' + if length == 0 then "_None_" + else (map("• <\($url)/pull/\(.number)|#\(.number)> \(.title)") | join("\n")) + end + ') + { + echo "list<> "$GITHUB_OUTPUT" + + - name: Find latest npm dependabot run + id: latest + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + run: | + # Repos without a dependabot.yml have no "Dependabot Updates" workflow; + # treat the lookup failure as "no recent run found" rather than failing. + if ! RUN_ID=$(gh run list --repo "$REPO" --workflow "Dependabot Updates" --status success --limit 30 --json databaseId,name --jq 'first(.[] | select(.name | startswith("npm_and_yarn")) | .databaseId) // empty' 2>/dev/null); then + RUN_ID="" + fi + echo "run_id=$RUN_ID" >> "$GITHUB_OUTPUT" + + - name: Extract stuck deps (only if actions pending) + id: stuck + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + RUN_ID: ${{ steps.latest.outputs.run_id }} + ACTIONS: ${{ steps.alerts.outputs.actions }} + run: | + # Skip the stuck section entirely when nothing in the actions list + # - keeps the digest tidy when there's nothing to actually act on. + if [ "$ACTIONS" = "_None_" ]; then + echo "section=" >> "$GITHUB_OUTPUT" + exit 0 + fi + HEADER=$'\n\n*Couldn\'t auto-fix (need manual `pnpm.overrides`):*\n' + if [ -z "$RUN_ID" ]; then + { + echo "section<> "$GITHUB_OUTPUT" + exit 0 + fi + gh run view "$RUN_ID" --repo "$REPO" --log > log.txt 2>&1 || true + STUCK=$(grep -oE "No update possible for [^[:space:]]+ [0-9][^[:space:]]*" log.txt | sed 's/No update possible for //' | sort -u || true) + if [ -z "$STUCK" ]; then + { + echo "section<> "$GITHUB_OUTPUT" + exit 0 + fi + LIST=$(echo "$STUCK" | awk 'NR>1{printf "\n"} {printf "• *%s* %s", $1, $2}') + { + echo "section<> "$GITHUB_OUTPUT" + + - name: Build Slack payload + env: + REPO: ${{ github.repository }} + CHANNEL: ${{ vars.SLACK_CHANNEL_ID }} + TOTAL: ${{ steps.alerts.outputs.total }} + BY_SEVERITY: ${{ steps.alerts.outputs.by_severity }} + PRS_LIST: ${{ steps.prs.outputs.list }} + ACTIONS: ${{ steps.alerts.outputs.actions }} + STUCK: ${{ steps.stuck.outputs.section }} + run: | + # Build payload via jq so PR titles or error strings containing + # quotes/backslashes/newlines can't break the JSON. + jq -n \ + --arg channel "$CHANNEL" \ + --arg repo "$REPO" \ + --arg total "$TOTAL" \ + --arg by_severity "$BY_SEVERITY" \ + --arg prs_list "$PRS_LIST" \ + --arg actions "$ACTIONS" \ + --arg stuck "$STUCK" \ + --arg threshold "$THRESHOLD_DAYS" \ + '{ + channel: $channel, + text: ":calendar: *Weekly Dependabot summary* - `\($repo)`\n\n*Open alerts (\($total)):*\n\($by_severity)\n\n*Open Dependabot PRs:*\n\($prs_list)\n\n*Actions needed (<\($threshold)d remaining):*\n\($actions)\($stuck)\n\n" + }' > payload.json + + - name: Post Slack summary + uses: slackapi/slack-github-action@45a88b9581bfab2566dc881e2cd66d334e621e2c # v3.0.3 + with: + method: chat.postMessage + token: ${{ secrets.SLACK_BOT_TOKEN }} + payload-file-path: payload.json diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index bef575c353a..0cac7c8595f 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -26,10 +26,12 @@ jobs: working-directory: ./docs steps: - name: 📥 Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: 📦 Cache npm - uses: actions/cache@v4 + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 with: path: | ~/.npm diff --git a/.github/workflows/e2e-webapp-auth-full.yml b/.github/workflows/e2e-webapp-auth-full.yml new file mode 100644 index 00000000000..a00ca7a4195 --- /dev/null +++ b/.github/workflows/e2e-webapp-auth-full.yml @@ -0,0 +1,120 @@ +name: "🛡️ E2E Tests: Webapp Auth (full)" + +# Comprehensive RBAC auth test suite — see TRI-8731. Runs separately from +# the smoke e2e-webapp.yml because it covers every route family with a +# pass/fail matrix and would otherwise dominate per-PR CI time. +# +# Triggered: +# - Manually via workflow_dispatch. +# - Nightly via schedule. +# - On pull requests touching auth-relevant files only (paths filter). + +permissions: + contents: read + +on: + workflow_dispatch: + schedule: + - cron: "0 4 * * *" # 04:00 UTC daily + pull_request: + paths: + - "apps/webapp/app/services/routeBuilders/**" + - "apps/webapp/app/services/rbac.server.ts" + - "apps/webapp/app/services/apiAuth.server.ts" + - "apps/webapp/app/services/personalAccessToken.server.ts" + - "apps/webapp/app/services/sessionStorage.server.ts" + - "apps/webapp/app/routes/api.v*.**" + - "apps/webapp/app/routes/realtime.v*.**" + - "apps/webapp/test/**/*.e2e.full.test.ts" + - "apps/webapp/test/setup/global-e2e-full-setup.ts" + - "apps/webapp/test/helpers/sharedTestServer.ts" + - "apps/webapp/test/helpers/seedTestSession.ts" + - "apps/webapp/vitest.e2e.full.config.ts" + - "internal-packages/rbac/**" + - "packages/plugins/**" + - ".github/workflows/e2e-webapp-auth-full.yml" + +jobs: + e2eAuthFull: + name: "🛡️ E2E Auth Tests (full)" + runs-on: ubuntu-latest + timeout-minutes: 30 + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + steps: + - name: 🔧 Disable IPv6 + run: | + sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1 + + - name: 🔧 Configure docker address pool + run: | + CONFIG='{ + "default-address-pools" : [ + { + "base" : "172.17.0.0/12", + "size" : 20 + }, + { + "base" : "192.168.0.0/16", + "size" : 24 + } + ] + }' + mkdir -p /etc/docker + echo "$CONFIG" | sudo tee /etc/docker/daemon.json + + - name: 🔧 Restart docker daemon + run: sudo systemctl restart docker + + - name: ⬇️ Checkout repo + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + # Don't leave the GITHUB_TOKEN in .git/config — this job + # doesn't need to push and the persisted creds would be + # readable from any subsequent step (zizmor/artipacked). + persist-credentials: false + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 + with: + version: 10.33.2 + + - name: ⎔ Setup node + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: 20.20.0 + cache: "pnpm" + + - name: 🐳 Login to DockerHub + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: 🐳 Skipping DockerHub login (no secrets available) + if: ${{ !env.DOCKERHUB_USERNAME }} + run: echo "DockerHub login skipped because secrets are not available." + + - name: 🐳 Pre-pull testcontainer images + if: ${{ env.DOCKERHUB_USERNAME }} + run: | + docker pull postgres:14 + docker pull redis:7.2 + docker pull testcontainers/ryuk:0.11.0 + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🏗️ Build Webapp + run: pnpm run build --filter webapp + + - name: 🛡️ Run Webapp Full Auth E2E Tests + run: cd apps/webapp && pnpm exec vitest run --config vitest.e2e.full.config.ts --reporter=default + env: + WEBAPP_TEST_VERBOSE: "1" diff --git a/.github/workflows/e2e-webapp.yml b/.github/workflows/e2e-webapp.yml new file mode 100644 index 00000000000..307898facd4 --- /dev/null +++ b/.github/workflows/e2e-webapp.yml @@ -0,0 +1,97 @@ +name: "🧪 E2E Tests: Webapp" + +permissions: + contents: read + +on: + workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false + +jobs: + e2eTests: + name: "🧪 E2E Tests: Webapp" + runs-on: ubuntu-latest + timeout-minutes: 20 + env: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + steps: + - name: 🔧 Disable IPv6 + run: | + sudo sysctl -w net.ipv6.conf.all.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.default.disable_ipv6=1 + sudo sysctl -w net.ipv6.conf.lo.disable_ipv6=1 + + - name: 🔧 Configure docker address pool + run: | + CONFIG='{ + "default-address-pools" : [ + { + "base" : "172.17.0.0/12", + "size" : 20 + }, + { + "base" : "192.168.0.0/16", + "size" : 24 + } + ] + }' + mkdir -p /etc/docker + echo "$CONFIG" | sudo tee /etc/docker/daemon.json + + - name: 🔧 Restart docker daemon + run: sudo systemctl restart docker + + - name: ⬇️ Checkout repo + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + fetch-depth: 0 + persist-credentials: false + + - name: ⎔ Setup pnpm + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 + with: + version: 10.33.2 + + - name: ⎔ Setup node + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 + with: + node-version: 20.20.0 + cache: "pnpm" + + # ..to avoid rate limits when pulling images + - name: 🐳 Login to DockerHub + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: 🐳 Skipping DockerHub login (no secrets available) + if: ${{ !env.DOCKERHUB_USERNAME }} + run: echo "DockerHub login skipped because secrets are not available." + + - name: 🐳 Pre-pull testcontainer images + if: ${{ env.DOCKERHUB_USERNAME }} + run: | + echo "Pre-pulling Docker images with authenticated session..." + docker pull postgres:14 + docker pull redis:7.2 + docker pull testcontainers/ryuk:0.11.0 + echo "Image pre-pull complete" + + - name: 📥 Download deps + run: pnpm install --frozen-lockfile + + - name: 📀 Generate Prisma Client + run: pnpm run generate + + - name: 🏗️ Build Webapp + run: pnpm run build --filter webapp + + - name: 🧪 Run Webapp E2E Tests + run: cd apps/webapp && pnpm exec vitest run --config vitest.e2e.config.ts --reporter=default + env: + WEBAPP_TEST_VERBOSE: "1" diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml index 9518ca6157c..b9d1e19c6be 100644 --- a/.github/workflows/e2e.yml +++ b/.github/workflows/e2e.yml @@ -24,17 +24,18 @@ jobs: package-manager: ["npm", "pnpm"] steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 @@ -48,7 +49,7 @@ jobs: run: pnpm run build --filter trigger.dev^... - name: 🔧 Build worker template files - run: pnpm --filter trigger.dev run build:workers + run: pnpm --filter trigger.dev run --if-present build:workers - name: Enable corepack run: corepack enable diff --git a/.github/workflows/helm-pr-prerelease.yml b/.github/workflows/helm-pr-prerelease.yml deleted file mode 100644 index 8df045945e6..00000000000 --- a/.github/workflows/helm-pr-prerelease.yml +++ /dev/null @@ -1,138 +0,0 @@ -name: 🧭 Helm Chart PR Prerelease - -on: - pull_request: - types: [opened, synchronize, reopened] - paths: - - "hosting/k8s/helm/**" - -concurrency: - group: helm-prerelease-${{ github.event.pull_request.number }} - cancel-in-progress: true - -env: - REGISTRY: ghcr.io - CHART_NAME: trigger - -jobs: - lint-and-test: - runs-on: ubuntu-latest - permissions: - contents: read - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Helm - uses: azure/setup-helm@v4 - with: - version: "3.18.3" - - - name: Build dependencies - run: helm dependency build ./hosting/k8s/helm/ - - - name: Extract dependency charts - run: | - cd ./hosting/k8s/helm/ - for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done - - - name: Lint Helm Chart - run: | - helm lint ./hosting/k8s/helm/ - - - name: Render templates - run: | - helm template test-release ./hosting/k8s/helm/ \ - --values ./hosting/k8s/helm/values.yaml \ - --output-dir ./helm-output - - - name: Validate manifests - uses: docker://ghcr.io/yannh/kubeconform:v0.7.0 - with: - entrypoint: "/kubeconform" - args: "-summary -output json ./helm-output" - - prerelease: - needs: lint-and-test - runs-on: ubuntu-latest - permissions: - contents: read - packages: write - pull-requests: write - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Set up Helm - uses: azure/setup-helm@v4 - with: - version: "3.18.3" - - - name: Build dependencies - run: helm dependency build ./hosting/k8s/helm/ - - - name: Extract dependency charts - run: | - cd ./hosting/k8s/helm/ - for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done - - - name: Log in to Container Registry - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - - name: Generate prerelease version - id: version - run: | - BASE_VERSION=$(grep '^version:' ./hosting/k8s/helm/Chart.yaml | awk '{print $2}') - PR_NUMBER=${{ github.event.pull_request.number }} - SHORT_SHA=$(echo "${{ github.event.pull_request.head.sha }}" | cut -c1-7) - PRERELEASE_VERSION="${BASE_VERSION}-pr${PR_NUMBER}.${SHORT_SHA}" - echo "version=$PRERELEASE_VERSION" >> $GITHUB_OUTPUT - echo "Prerelease version: $PRERELEASE_VERSION" - - - name: Update Chart.yaml with prerelease version - run: | - sed -i "s/^version:.*/version: ${{ steps.version.outputs.version }}/" ./hosting/k8s/helm/Chart.yaml - - - name: Package Helm Chart - run: | - helm package ./hosting/k8s/helm/ --destination /tmp/ - - - name: Push Helm Chart to GHCR - run: | - VERSION="${{ steps.version.outputs.version }}" - CHART_PACKAGE="/tmp/${{ env.CHART_NAME }}-${VERSION}.tgz" - - # Push to GHCR OCI registry - helm push "$CHART_PACKAGE" "oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts" - - - name: Find existing comment - uses: peter-evans/find-comment@v3 - id: find-comment - with: - issue-number: ${{ github.event.pull_request.number }} - comment-author: "github-actions[bot]" - body-includes: "Helm Chart Prerelease Published" - - - name: Create or update PR comment - uses: peter-evans/create-or-update-comment@v4 - with: - comment-id: ${{ steps.find-comment.outputs.comment-id }} - issue-number: ${{ github.event.pull_request.number }} - body: | - ### 🧭 Helm Chart Prerelease Published - - **Version:** `${{ steps.version.outputs.version }}` - - **Install:** - ```bash - helm upgrade --install trigger \ - oci://ghcr.io/${{ github.repository_owner }}/charts/trigger \ - --version "${{ steps.version.outputs.version }}" - ``` - - > ⚠️ This is a prerelease for testing. Do not use in production. - edit-mode: replace diff --git a/.github/workflows/helm-prerelease.yml b/.github/workflows/helm-prerelease.yml new file mode 100644 index 00000000000..dd58fbb3551 --- /dev/null +++ b/.github/workflows/helm-prerelease.yml @@ -0,0 +1,200 @@ +name: 🧭 Helm Chart Prerelease + +on: + pull_request: + types: [opened, synchronize, reopened] + paths: + - "hosting/k8s/helm/**" + push: + branches: + - main + paths: + - "hosting/k8s/helm/**" + workflow_dispatch: + inputs: + app_version: + description: "Override appVersion (e.g. 'main', 'v4.4.4'). Leave empty to keep Chart.yaml value." + required: false + type: string + default: "" + +concurrency: + group: helm-prerelease-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + REGISTRY: ghcr.io + CHART_NAME: trigger + +jobs: + lint-and-test: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Set up Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + with: + version: "3.18.3" + + - name: Build dependencies + run: helm dependency build ./hosting/k8s/helm/ + + - name: Extract dependency charts + run: | + cd ./hosting/k8s/helm/ + for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done + + - name: Lint Helm Chart + run: | + helm lint ./hosting/k8s/helm/ + + - name: Render templates + run: | + helm template test-release ./hosting/k8s/helm/ \ + --values ./hosting/k8s/helm/values.yaml \ + --output-dir ./helm-output + + - name: Validate manifests + uses: docker://ghcr.io/yannh/kubeconform:v0.7.0@sha256:85dbef6b4b312b99133decc9c6fc9495e9fc5f92293d4ff3b7e1b30f5611823c + with: + entrypoint: "/kubeconform" + args: "-summary -output json ./helm-output" + + prerelease: + needs: lint-and-test + if: | + (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) || + github.event_name == 'push' || + github.event_name == 'workflow_dispatch' + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + pull-requests: write + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Set up Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + with: + version: "3.18.3" + + - name: Build dependencies + run: helm dependency build ./hosting/k8s/helm/ + + - name: Extract dependency charts + run: | + cd ./hosting/k8s/helm/ + for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done + + - name: Log in to Container Registry + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Generate prerelease version + id: version + run: | + BASE_VERSION=$(grep '^version:' ./hosting/k8s/helm/Chart.yaml | awk '{print $2}') + if [[ "${{ github.event_name }}" == "pull_request" ]]; then + PR_NUMBER=${{ github.event.pull_request.number }} + SHORT_SHA=$(echo "${{ github.event.pull_request.head.sha }}" | cut -c1-7) + PRERELEASE_VERSION="${BASE_VERSION}-pr${PR_NUMBER}.${SHORT_SHA}" + elif [[ "${{ github.event_name }}" == "push" ]]; then + SHORT_SHA=$(echo "${GITHUB_SHA}" | cut -c1-7) + PRERELEASE_VERSION="${BASE_VERSION}-main.${SHORT_SHA}" + else + SHORT_SHA=$(echo "${GITHUB_SHA}" | cut -c1-7) + REF_SLUG=$(echo "${GITHUB_REF_NAME}" | tr '/' '-' | tr -cd 'a-zA-Z0-9-') + if [[ -z "$REF_SLUG" ]]; then + REF_SLUG="manual" + fi + PRERELEASE_VERSION="${BASE_VERSION}-${REF_SLUG}.${SHORT_SHA}" + fi + echo "version=$PRERELEASE_VERSION" >> "$GITHUB_OUTPUT" + echo "Prerelease version: $PRERELEASE_VERSION" + + - name: Update Chart.yaml with prerelease version + run: | + sed -i "s/^version:.*/version: ${STEPS_VERSION_OUTPUTS_VERSION}/" ./hosting/k8s/helm/Chart.yaml + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} + + - name: Override appVersion + if: github.event_name == 'workflow_dispatch' && inputs.app_version != '' + env: + APP_VERSION: ${{ inputs.app_version }} + run: | + yq -i '.appVersion = strenv(APP_VERSION)' ./hosting/k8s/helm/Chart.yaml + + - name: Package Helm Chart + run: | + helm package ./hosting/k8s/helm/ --destination /tmp/ + + - name: Push Helm Chart to GHCR + run: | + VERSION="${STEPS_VERSION_OUTPUTS_VERSION}" + CHART_PACKAGE="/tmp/${{ env.CHART_NAME }}-${VERSION}.tgz" + + # Push to GHCR OCI registry + helm push "$CHART_PACKAGE" "oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts" + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} + + - name: Write run summary + run: | + { + echo "### 🧭 Helm Chart Prerelease Published" + echo "" + echo "**Version:** \`${STEPS_VERSION_OUTPUTS_VERSION}\`" + echo "" + echo "**Install:**" + echo '```bash' + echo "helm upgrade --install trigger \\" + echo " oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts/${{ env.CHART_NAME }} \\" + echo " --version \"${STEPS_VERSION_OUTPUTS_VERSION}\"" + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} + + - name: Find existing comment + if: github.event_name == 'pull_request' + uses: peter-evans/find-comment@b30e6a3c0ed37e7c023ccd3f1db5c6c0b0c23aad # v4.0.0 + id: find-comment + with: + issue-number: ${{ github.event.pull_request.number }} + comment-author: "github-actions[bot]" + body-includes: "Helm Chart Prerelease Published" + + - name: Create or update PR comment + if: github.event_name == 'pull_request' + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 # v5.0.0 + with: + comment-id: ${{ steps.find-comment.outputs.comment-id }} + issue-number: ${{ github.event.pull_request.number }} + body: | + ### 🧭 Helm Chart Prerelease Published + + **Version:** `${{ steps.version.outputs.version }}` + + **Install:** + ```bash + helm upgrade --install trigger \ + oci://ghcr.io/${{ github.repository_owner }}/charts/trigger \ + --version "${{ steps.version.outputs.version }}" + ``` + + > ⚠️ This is a prerelease for testing. Do not use in production. + edit-mode: replace diff --git a/.github/workflows/pr_checks.yml b/.github/workflows/pr_checks.yml index dab18223e35..dfc0081d2df 100644 --- a/.github/workflows/pr_checks.yml +++ b/.github/workflows/pr_checks.yml @@ -3,9 +3,6 @@ name: 🤖 PR Checks on: pull_request: types: [opened, synchronize, reopened] - paths-ignore: - - "docs/**" - - ".changeset/**" concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} @@ -13,23 +10,170 @@ concurrency: permissions: contents: read - id-token: write + pull-requests: read jobs: + changes: + name: Detect changes + runs-on: ubuntu-latest + outputs: + code: ${{ steps.code_filter.outputs.code }} + typecheck_self: ${{ steps.filter.outputs.typecheck_self }} + webapp: ${{ steps.filter.outputs.webapp }} + packages: ${{ steps.filter.outputs.packages }} + internal: ${{ steps.filter.outputs.internal }} + cli: ${{ steps.filter.outputs.cli }} + sdk: ${{ steps.filter.outputs.sdk }} + steps: + # `code` uses `every` semantics so the negation patterns actually subtract. + # With the default `some` quantifier, `**` matches every file and the + # subsequent `!...` patterns are no-ops (each pattern is OR'd, not AND'd). + - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 + id: code_filter + with: + predicate-quantifier: every + filters: | + code: + - '**' + - '!docs/**' + - '!.changeset/**' + - '!hosting/**' + - '!.github/**' + - '!references/**' + - '!**/*.md' + - '!**/.env.example' + - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 + id: filter + with: + filters: | + typecheck_self: + - '.github/workflows/pr_checks.yml' + - '.github/workflows/typecheck.yml' + webapp: + - 'apps/webapp/**' + - 'packages/**' + - 'internal-packages/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/unit-tests-webapp.yml' + - '.github/workflows/e2e-webapp.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + packages: + - 'packages/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/unit-tests-packages.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + internal: + - 'internal-packages/**' + - 'packages/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/unit-tests-internal.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + cli: + - 'packages/cli-v3/**' + - 'packages/build/**' + - 'packages/core/**' + - 'packages/schema-to-json/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/e2e.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + sdk: + - 'packages/trigger-sdk/**' + - 'packages/core/**' + - '.github/workflows/pr_checks.yml' + - '.github/workflows/sdk-compat.yml' + - '.configs/**' + - 'package.json' + - 'pnpm-lock.yaml' + - 'pnpm-workspace.yaml' + - 'turbo.json' + typecheck: + needs: changes + if: needs.changes.outputs.code == 'true' || needs.changes.outputs.typecheck_self == 'true' uses: ./.github/workflows/typecheck.yml - secrets: inherit - units: - uses: ./.github/workflows/unit-tests.yml - secrets: inherit + webapp: + needs: changes + if: needs.changes.outputs.webapp == 'true' + uses: ./.github/workflows/unit-tests-webapp.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + + e2e-webapp: + needs: changes + if: needs.changes.outputs.webapp == 'true' + uses: ./.github/workflows/e2e-webapp.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + + packages: + needs: changes + if: needs.changes.outputs.packages == 'true' + uses: ./.github/workflows/unit-tests-packages.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + + internal: + needs: changes + if: needs.changes.outputs.internal == 'true' + uses: ./.github/workflows/unit-tests-internal.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} e2e: + needs: changes + if: needs.changes.outputs.cli == 'true' uses: ./.github/workflows/e2e.yml with: package: cli-v3 - secrets: inherit sdk-compat: + needs: changes + if: needs.changes.outputs.sdk == 'true' uses: ./.github/workflows/sdk-compat.yml - secrets: inherit + + all-checks: + name: All PR Checks + needs: + - changes + - typecheck + - webapp + - e2e-webapp + - packages + - internal + - e2e + - sdk-compat + if: always() + runs-on: ubuntu-latest + steps: + - name: Verify all checks + run: | + if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then + echo "One or more checks failed" + exit 1 + fi + if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then + echo "One or more checks were cancelled" + exit 1 + fi + echo "All checks passed or were skipped due to path filters" diff --git a/.github/workflows/publish-webapp.yml b/.github/workflows/publish-webapp.yml index 6fcc30209ab..466eaf855c0 100644 --- a/.github/workflows/publish-webapp.yml +++ b/.github/workflows/publish-webapp.yml @@ -4,6 +4,7 @@ permissions: contents: read packages: write id-token: write + attestations: write on: workflow_call: @@ -13,6 +14,9 @@ on: type: string required: false default: "" + secrets: + SENTRY_AUTH_TOKEN: + required: false jobs: publish: @@ -24,12 +28,13 @@ jobs: short_sha: ${{ steps.get_commit.outputs.sha_short }} steps: - name: 🏭 Setup Depot CLI - uses: depot/setup-action@v1 + uses: depot/setup-action@15c09a5f77a0840ad4bce955686522a257853461 # v1.7.1 - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: submodules: recursive + persist-credentials: false - name: "#️⃣ Get the image tag" id: get_tag @@ -40,42 +45,52 @@ jobs: - name: 🔢 Get the commit hash id: get_commit run: | - echo "sha_short=$(echo ${{ github.sha }} | cut -c1-7)" >> "$GITHUB_OUTPUT" + echo "sha_short=$(echo "${GITHUB_SHA}" | cut -c1-7)" >> "$GITHUB_OUTPUT" - name: 📛 Set the tags id: set_tags run: | ref_without_tag=ghcr.io/triggerdotdev/trigger.dev - image_tags=$ref_without_tag:${{ steps.get_tag.outputs.tag }} + image_tags=$ref_without_tag:${STEPS_GET_TAG_OUTPUTS_TAG} - # if tag is a semver, also tag it as v4 - if [[ "${{ steps.get_tag.outputs.is_semver }}" == true ]]; then - # TODO: switch to v4 tag on GA - image_tags=$image_tags,$ref_without_tag:v4-beta + # when pushing the mutable main tag, also push an immutable-by-convention + # full-commit-sha tag so a commit can be resolved to a specific digest + if [[ "${STEPS_GET_TAG_OUTPUTS_TAG}" == "main" ]]; then + image_tags=$image_tags,$ref_without_tag:${GITHUB_SHA} fi echo "image_tags=${image_tags}" >> "$GITHUB_OUTPUT" + env: + STEPS_GET_TAG_OUTPUTS_TAG: ${{ steps.get_tag.outputs.tag }} + STEPS_GET_TAG_OUTPUTS_IS_SEMVER: ${{ steps.get_tag.outputs.is_semver }} - name: 📝 Set the build info id: set_build_info run: | - tag=${{ steps.get_tag.outputs.tag }} - if [[ "${{ steps.get_tag.outputs.is_semver }}" == true ]]; then - echo "BUILD_APP_VERSION=${tag}" >> "$GITHUB_OUTPUT" - fi - echo "BUILD_GIT_SHA=${{ github.sha }}" >> "$GITHUB_OUTPUT" - echo "BUILD_GIT_REF_NAME=${{ github.ref_name }}" >> "$GITHUB_OUTPUT" - echo "BUILD_TIMESTAMP_SECONDS=$(date +%s)" >> "$GITHUB_OUTPUT" + { + tag="${STEPS_GET_TAG_OUTPUTS_TAG}" + if [[ "${STEPS_GET_TAG_OUTPUTS_IS_SEMVER}" == true ]]; then + echo "BUILD_APP_VERSION=${tag}" + fi + echo "BUILD_GIT_SHA=${GITHUB_SHA}" + echo "BUILD_GIT_REF_NAME=${GITHUB_REF_NAME}" + echo "BUILD_TIMESTAMP_SECONDS=$(date +%s)" + echo "BUILD_TIMESTAMP_RFC3339=$(date -u +%Y-%m-%dT%H:%M:%SZ)" + } >> "$GITHUB_OUTPUT" + env: + STEPS_GET_TAG_OUTPUTS_TAG: ${{ steps.get_tag.outputs.tag }} + STEPS_GET_TAG_OUTPUTS_IS_SEMVER: ${{ steps.get_tag.outputs.is_semver }} - name: 🐙 Login to GitHub Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: 🐳 Build image and push to GitHub Container Registry - uses: depot/build-push-action@v1 + id: build_push + uses: depot/build-push-action@5f3b3c2e5a00f0093de47f657aeaefcedff27d18 # v1.17.0 with: file: ./docker/Dockerfile platforms: linux/amd64,linux/arm64 @@ -86,8 +101,20 @@ jobs: BUILD_GIT_SHA=${{ steps.set_build_info.outputs.BUILD_GIT_SHA }} BUILD_GIT_REF_NAME=${{ steps.set_build_info.outputs.BUILD_GIT_REF_NAME }} BUILD_TIMESTAMP_SECONDS=${{ steps.set_build_info.outputs.BUILD_TIMESTAMP_SECONDS }} + BUILD_TIMESTAMP_RFC3339=${{ steps.set_build_info.outputs.BUILD_TIMESTAMP_RFC3339 }} SENTRY_RELEASE=${{ steps.set_build_info.outputs.BUILD_GIT_SHA }} SENTRY_ORG=triggerdev SENTRY_PROJECT=trigger-cloud secrets: | sentry_auth_token=${{ secrets.SENTRY_AUTH_TOKEN }} + + - name: 🪪 Attest build provenance + # Image is already pushed by this point — don't fail releases (and the + # downstream publish-helm job) on a Sigstore/GHCR-referrer hiccup. Real + # config errors still surface as a step warning in the workflow run. + continue-on-error: true + uses: actions/attest-build-provenance@a2bbfa25375fe432b6a289bc6b6cd05ecd0c4c32 # v4.1.0 + with: + subject-name: ghcr.io/triggerdotdev/trigger.dev + subject-digest: ${{ steps.build_push.outputs.digest }} + push-to-registry: true diff --git a/.github/workflows/publish-worker-v4.yml b/.github/workflows/publish-worker-v4.yml index 4a2853da081..6ed490c9471 100644 --- a/.github/workflows/publish-worker-v4.yml +++ b/.github/workflows/publish-worker-v4.yml @@ -37,19 +37,22 @@ jobs: DOCKER_BUILDKIT: "1" steps: - name: 🏭 Setup Depot CLI - uses: depot/setup-action@v1 + uses: depot/setup-action@15c09a5f77a0840ad4bce955686522a257853461 # v1.7.1 - name: ⬇️ Checkout git repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: 📦 Get image repo id: get_repository + env: + PACKAGE: ${{ matrix.package }} run: | - if [[ "${{ matrix.package }}" == *-provider ]]; then - provider_type=$(echo "${{ matrix.package }}" | cut -d- -f1) - repo=provider/${provider_type} + if [[ "$PACKAGE" == *-provider ]]; then + repo="provider/${PACKAGE%-provider}" else - repo="${{ matrix.package }}" + repo="$PACKAGE" fi echo "repo=${repo}" >> "$GITHUB_OUTPUT" @@ -62,26 +65,24 @@ jobs: - name: 📛 Set tags to push id: set_tags run: | - ref_without_tag=ghcr.io/triggerdotdev/${{ steps.get_repository.outputs.repo }} - image_tags=$ref_without_tag:${{ steps.get_tag.outputs.tag }} - - # if tag is a semver, also tag it as v4 - if [[ "${{ steps.get_tag.outputs.is_semver }}" == true ]]; then - # TODO: switch to v4 tag on GA - image_tags=$image_tags,$ref_without_tag:v4-beta - fi + ref_without_tag=ghcr.io/triggerdotdev/${STEPS_GET_REPOSITORY_OUTPUTS_REPO} + image_tags=$ref_without_tag:${STEPS_GET_TAG_OUTPUTS_TAG} echo "image_tags=${image_tags}" >> "$GITHUB_OUTPUT" + env: + STEPS_GET_REPOSITORY_OUTPUTS_REPO: ${{ steps.get_repository.outputs.repo }} + STEPS_GET_TAG_OUTPUTS_TAG: ${{ steps.get_tag.outputs.tag }} + STEPS_GET_TAG_OUTPUTS_IS_SEMVER: ${{ steps.get_tag.outputs.is_semver }} - name: 🐙 Login to GitHub Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 with: registry: ghcr.io username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - name: 🐳 Build image and push to GitHub Container Registry - uses: depot/build-push-action@v1 + uses: depot/build-push-action@5f3b3c2e5a00f0093de47f657aeaefcedff27d18 # v1.17.0 with: file: ./apps/${{ matrix.package }}/Containerfile platforms: linux/amd64,linux/arm64 diff --git a/.github/workflows/publish-worker.yml b/.github/workflows/publish-worker.yml index 74a70d83667..d7e0c79ddd2 100644 --- a/.github/workflows/publish-worker.yml +++ b/.github/workflows/publish-worker.yml @@ -8,6 +8,11 @@ on: type: string required: false default: "" + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false push: tags: - "infra-dev-*" @@ -26,18 +31,22 @@ jobs: runs-on: ubuntu-latest env: DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} steps: - name: ⬇️ Checkout git repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: 📦 Get image repo id: get_repository + env: + PACKAGE: ${{ matrix.package }} run: | - if [[ "${{ matrix.package }}" == *-provider ]]; then - provider_type=$(echo "${{ matrix.package }}" | cut -d- -f1) - repo=provider/${provider_type} + if [[ "$PACKAGE" == *-provider ]]; then + repo="provider/${PACKAGE%-provider}" else - repo="${{ matrix.package }}" + repo="$PACKAGE" fi echo "repo=${repo}" >> "$GITHUB_OUTPUT" @@ -47,11 +56,12 @@ jobs: tag: ${{ inputs.image_tag }} - name: 🐋 Set up Docker Buildx - uses: docker/setup-buildx-action@v3 + uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # v4.0.0 # ..to avoid rate limits when pulling images - name: 🐳 Login to DockerHub - uses: docker/login-action@v3 + if: ${{ env.DOCKERHUB_USERNAME }} + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -62,7 +72,7 @@ jobs: # ..to push image - name: 🐙 Login to GitHub Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 with: registry: ghcr.io username: ${{ github.repository_owner }} diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 6213499c5ad..a238395c8c0 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -8,6 +8,13 @@ on: description: The image tag to publish required: true type: string + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false + SENTRY_AUTH_TOKEN: + required: false push: branches: - main @@ -37,8 +44,6 @@ on: - "tests/**" permissions: - id-token: write - packages: write contents: read concurrency: @@ -50,29 +55,44 @@ env: jobs: typecheck: uses: ./.github/workflows/typecheck.yml - secrets: inherit units: uses: ./.github/workflows/unit-tests.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} publish-webapp: needs: [typecheck] + permissions: + contents: read + packages: write + id-token: write + attestations: write uses: ./.github/workflows/publish-webapp.yml - secrets: inherit + secrets: + SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} with: image_tag: ${{ inputs.image_tag }} publish-worker: needs: [typecheck] + permissions: + contents: read + packages: write uses: ./.github/workflows/publish-worker.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} with: image_tag: ${{ inputs.image_tag }} publish-worker-v4: needs: [typecheck] + permissions: + contents: read + packages: write + id-token: write uses: ./.github/workflows/publish-worker-v4.yml - secrets: inherit with: image_tag: ${{ inputs.image_tag }} diff --git a/.github/workflows/release-helm.yml b/.github/workflows/release-helm.yml index c6efd382ff6..65e846d0d39 100644 --- a/.github/workflows/release-helm.yml +++ b/.github/workflows/release-helm.yml @@ -4,6 +4,12 @@ on: push: tags: - 'helm-v*' + workflow_call: + inputs: + chart_version: + description: 'Chart version to release' + required: true + type: string workflow_dispatch: inputs: chart_version: @@ -22,10 +28,12 @@ jobs: contents: read steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: Set up Helm - uses: azure/setup-helm@v4 + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 with: version: "3.18.3" @@ -48,7 +56,7 @@ jobs: --output-dir ./helm-output - name: Validate manifests - uses: docker://ghcr.io/yannh/kubeconform:v0.7.0 + uses: docker://ghcr.io/yannh/kubeconform:v0.7.0@sha256:85dbef6b4b312b99133decc9c6fc9495e9fc5f92293d4ff3b7e1b30f5611823c with: entrypoint: '/kubeconform' args: "-summary -output json ./helm-output" @@ -61,10 +69,12 @@ jobs: packages: write steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false - name: Set up Helm - uses: azure/setup-helm@v4 + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 with: version: "3.18.3" @@ -77,7 +87,7 @@ jobs: for file in ./charts/*.tgz; do echo "Extracting $file"; tar -xzf "$file" -C ./charts; done - name: Log in to Container Registry - uses: docker/login-action@v3 + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} @@ -86,18 +96,20 @@ jobs: - name: Extract version from tag or input id: version run: | - if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then - VERSION="${{ github.event.inputs.chart_version }}" + if [ -n "${INPUTS_CHART_VERSION}" ]; then + VERSION="${INPUTS_CHART_VERSION}" else - VERSION="${{ github.ref_name }}" + VERSION="${GITHUB_REF_NAME}" VERSION="${VERSION#helm-v}" fi - echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "version=$VERSION" >> "$GITHUB_OUTPUT" echo "Releasing version: $VERSION" + env: + INPUTS_CHART_VERSION: ${{ inputs.chart_version }} - name: Check Chart.yaml version matches release version run: | - VERSION="${{ steps.version.outputs.version }}" + VERSION="${STEPS_VERSION_OUTPUTS_VERSION}" CHART_VERSION=$(grep '^version:' ./hosting/k8s/helm/Chart.yaml | awk '{print $2}') echo "Chart.yaml version: $CHART_VERSION" echo "Release version: $VERSION" @@ -106,6 +118,8 @@ jobs: exit 1 fi echo "✅ Chart.yaml version matches release version." + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} - name: Package Helm Chart run: | @@ -113,18 +127,19 @@ jobs: - name: Push Helm Chart to GHCR run: | - VERSION="${{ steps.version.outputs.version }}" + VERSION="${STEPS_VERSION_OUTPUTS_VERSION}" CHART_PACKAGE="/tmp/${{ env.CHART_NAME }}-${VERSION}.tgz" # Push to GHCR OCI registry helm push "$CHART_PACKAGE" "oci://${{ env.REGISTRY }}/${{ github.repository_owner }}/charts" + env: + STEPS_VERSION_OUTPUTS_VERSION: ${{ steps.version.outputs.version }} - name: Create GitHub Release id: release - uses: softprops/action-gh-release@v1 - if: github.event_name == 'push' + uses: softprops/action-gh-release@b4309332981a82ec1c5618f44dd2e27cc8bfbfda # v3.0.0 with: - tag_name: ${{ github.ref_name }} + tag_name: helm-v${{ steps.version.outputs.version }} name: "Helm Chart ${{ steps.version.outputs.version }}" body: | ### Installation diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ca0f0ebf16b..8ab9a4e3207 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,6 +33,7 @@ jobs: show-release-summary: name: 📋 Release Summary runs-on: ubuntu-latest + permissions: {} if: | github.repository == 'triggerdotdev/trigger.dev' && github.event_name == 'pull_request' && @@ -43,7 +44,7 @@ jobs: env: PR_BODY: ${{ github.event.pull_request.body }} run: | - echo "$PR_BODY" | sed -n '/^# Releases/,$p' >> $GITHUB_STEP_SUMMARY + echo "$PR_BODY" | sed -n '/^# Releases/,$p' >> "$GITHUB_STEP_SUMMARY" release: name: 🚀 Release npm packages @@ -63,9 +64,10 @@ jobs: published: ${{ steps.changesets.outputs.published }} published_packages: ${{ steps.changesets.outputs.publishedPackages }} published_package_version: ${{ steps.get_version.outputs.package_version }} + is_prerelease: ${{ steps.get_version.outputs.is_prerelease }} steps: - name: Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 # zizmor: ignore[artipacked] needs persisted git creds for tag push; no artifact upload here so no leak path with: fetch-depth: 0 ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.ref || github.sha }} @@ -73,18 +75,20 @@ jobs: - name: Verify ref is on main if: github.event_name == 'workflow_dispatch' run: | - if ! git merge-base --is-ancestor ${{ github.event.inputs.ref }} origin/main; then + if ! git merge-base --is-ancestor "${GITHUB_EVENT_INPUTS_REF}" origin/main; then echo "Error: ref must be an ancestor of main (i.e., already merged)" exit 1 fi + env: + GITHUB_EVENT_INPUTS_REF: ${{ github.event.inputs.ref }} - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" @@ -108,10 +112,10 @@ jobs: - name: Publish id: changesets - uses: changesets/action@v1 + uses: changesets/action@6a0a831ff30acef54f2c6aa1cbbc1096b066edaf # v1.7.0 with: publish: pnpm run changeset:release - createGithubReleases: true + createGithubReleases: false env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} @@ -119,16 +123,145 @@ jobs: if: steps.changesets.outputs.published == 'true' id: get_version run: | - package_version=$(echo '${{ steps.changesets.outputs.publishedPackages }}' | jq -r '.[0].version') + package_version=$(echo "${STEPS_CHANGESETS_OUTPUTS_PUBLISHEDPACKAGES}" | jq -r '.[0].version') echo "package_version=${package_version}" >> "$GITHUB_OUTPUT" + # Any semver with a hyphen is a prerelease (e.g. 4.5.0-rc.0, 0.0.0-snapshot-...) + if [[ "${package_version}" == *-* ]]; then + echo "is_prerelease=true" >> "$GITHUB_OUTPUT" + else + echo "is_prerelease=false" >> "$GITHUB_OUTPUT" + fi + env: + STEPS_CHANGESETS_OUTPUTS_PUBLISHEDPACKAGES: ${{ steps.changesets.outputs.publishedPackages }} + + - name: Create unified GitHub release + if: steps.changesets.outputs.published == 'true' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + RELEASE_PR_BODY: ${{ github.event.pull_request.body }} + STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION: ${{ steps.get_version.outputs.package_version }} + STEPS_GET_VERSION_OUTPUTS_IS_PRERELEASE: ${{ steps.get_version.outputs.is_prerelease }} + run: | + VERSION="${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" + node scripts/generate-github-release.mjs "$VERSION" > /tmp/release-body.md + PRERELEASE_FLAG="" + if [ "${STEPS_GET_VERSION_OUTPUTS_IS_PRERELEASE}" = "true" ]; then + PRERELEASE_FLAG="--prerelease" + fi + gh release create "v${VERSION}" \ + --title "trigger.dev v${VERSION}" \ + --notes-file /tmp/release-body.md \ + --target main \ + $PRERELEASE_FLAG - # this triggers the publish workflow for the docker images - name: Create and push Docker tag if: steps.changesets.outputs.published == 'true' run: | set -e - git tag "v.docker.${{ steps.get_version.outputs.package_version }}" - git push origin "v.docker.${{ steps.get_version.outputs.package_version }}" + git tag "v.docker.${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" + git push origin "v.docker.${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" + env: + STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION: ${{ steps.get_version.outputs.package_version }} + + - name: Create and push Helm chart tag + if: steps.changesets.outputs.published == 'true' + run: | + set -e + git tag "helm-v${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" + git push origin "helm-v${STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION}" + env: + STEPS_GET_VERSION_OUTPUTS_PACKAGE_VERSION: ${{ steps.get_version.outputs.package_version }} + + # Trigger Docker builds directly via workflow_call since tags pushed with + # GITHUB_TOKEN don't trigger other workflows (GitHub Actions limitation). + publish-docker: + name: 🐳 Publish Docker images + needs: release + if: needs.release.outputs.published == 'true' + permissions: + contents: read + packages: write + id-token: write + attestations: write + uses: ./.github/workflows/publish.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} + with: + image_tag: v${{ needs.release.outputs.published_package_version }} + + # Trigger Helm chart release directly via workflow_call (same GITHUB_TOKEN + # limitation as the Docker path). Runs after Docker images are published so + # the chart never references images that don't exist yet. + publish-helm: + name: 🧭 Publish Helm chart + needs: [release, publish-docker] + if: needs.release.outputs.published == 'true' + permissions: + contents: write + packages: write + uses: ./.github/workflows/release-helm.yml + with: + chart_version: ${{ needs.release.outputs.published_package_version }} + + # After Docker images are published, update the GitHub release with the exact GHCR tag URL. + # The GHCR package version ID is only known after the image is pushed, so we query for it here. + update-release: + name: 🔗 Update release Docker link + needs: [release, publish-docker] + if: needs.release.outputs.published == 'true' + runs-on: ubuntu-latest + permissions: + contents: write + packages: read + steps: + - name: Update GitHub release with Docker image link + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + NEEDS_RELEASE_OUTPUTS_PUBLISHED_PACKAGE_VERSION: ${{ needs.release.outputs.published_package_version }} + run: | + set -e + VERSION="${NEEDS_RELEASE_OUTPUTS_PUBLISHED_PACKAGE_VERSION}" + TAG="v${VERSION}" + + # Query GHCR for the version ID matching this tag + VERSION_ID=$(gh api --paginate -H "Accept: application/vnd.github+json" \ + /orgs/triggerdotdev/packages/container/trigger.dev/versions \ + --jq ".[] | select(.metadata.container.tags[] == \"${TAG}\") | .id" \ + | head -1) + + if [ -z "$VERSION_ID" ]; then + echo "Warning: Could not find GHCR version ID for tag ${TAG}, skipping update" + exit 0 + fi + + DOCKER_URL="https://github.com/triggerdotdev/trigger.dev/pkgs/container/trigger.dev/${VERSION_ID}?tag=${TAG}" + GENERIC_URL="https://github.com/triggerdotdev/trigger.dev/pkgs/container/trigger.dev" + + # Get current release body and replace the generic link with the tag-specific one. + # Use word boundary after GENERIC_URL (closing paren) to avoid matching URLs that + # already have a version ID appended (idempotent on re-runs). + gh release view "${TAG}" --repo triggerdotdev/trigger.dev --json body --jq '.body' > /tmp/release-body.md + sed -i "s|${GENERIC_URL})|${DOCKER_URL})|g" /tmp/release-body.md + + gh release edit "${TAG}" --repo triggerdotdev/trigger.dev --notes-file /tmp/release-body.md + + # Dispatch changelog entry creation to the marketing site repo. + # Runs after update-release so the GitHub release body already has the exact Docker image URL. + dispatch-changelog: + name: 📝 Dispatch changelog PR + needs: [release, update-release] + if: needs.release.outputs.published == 'true' && needs.release.outputs.is_prerelease != 'true' + runs-on: ubuntu-latest + permissions: {} + steps: + - uses: peter-evans/repository-dispatch@28959ce8df70de7be546dd1250a005dd32156697 # v4.0.1 + with: + token: ${{ secrets.CROSS_REPO_PAT }} + repository: triggerdotdev/trigger.dev-site-v3 + event-type: new-release + client-payload: '{"version": "${{ needs.release.outputs.published_package_version }}"}' # The prerelease job needs to be on the same workflow file due to a limitation related to how npm verifies OIDC claims. prerelease: @@ -141,18 +274,19 @@ jobs: if: github.repository == 'triggerdotdev/trigger.dev' && github.event_name == 'workflow_dispatch' && github.event.inputs.type == 'prerelease' steps: - name: Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 ref: ${{ github.event.inputs.ref }} + persist-credentials: false - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" @@ -168,10 +302,18 @@ jobs: - name: Generate Prisma Client run: pnpm run generate + - name: Exit changeset pre mode (if active) + run: | + if [ -f .changeset/pre.json ]; then + echo "Repo is in changeset pre mode; exiting so snapshot release can run" + pnpm exec changeset pre exit + fi + - name: Snapshot version - run: pnpm exec changeset version --snapshot ${{ github.event.inputs.prerelease_tag }} + run: pnpm exec changeset version --snapshot "${GITHUB_EVENT_INPUTS_PRERELEASE_TAG}" env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_EVENT_INPUTS_PRERELEASE_TAG: ${{ github.event.inputs.prerelease_tag }} - name: Clean run: pnpm run clean --filter "@trigger.dev/*" --filter "trigger.dev" @@ -180,6 +322,7 @@ jobs: run: pnpm run build --filter "@trigger.dev/*" --filter "trigger.dev" - name: Publish prerelease - run: pnpm exec changeset publish --no-git-tag --snapshot --tag ${{ github.event.inputs.prerelease_tag }} + run: pnpm exec changeset publish --no-git-tag --snapshot --tag "${GITHUB_EVENT_INPUTS_PRERELEASE_TAG}" env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_EVENT_INPUTS_PRERELEASE_TAG: ${{ github.event.inputs.prerelease_tag }} diff --git a/.github/workflows/sdk-compat.yml b/.github/workflows/sdk-compat.yml index eb347c0f771..1940504e3f8 100644 --- a/.github/workflows/sdk-compat.yml +++ b/.github/workflows/sdk-compat.yml @@ -18,17 +18,18 @@ jobs: steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: ${{ matrix.node }} cache: "pnpm" @@ -56,23 +57,24 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" - name: 🥟 Setup Bun - uses: oven-sh/setup-bun@v2 + uses: oven-sh/setup-bun@0c5077e51419868618aeaa5fe8019c62421857d6 # v2.2.0 with: bun-version: latest @@ -97,23 +99,24 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" - name: 🦕 Setup Deno - uses: denoland/setup-deno@v2 + uses: denoland/setup-deno@667a34cdef165d8d2b2e98dde39547c9daac7282 # v2.0.4 with: deno-version: v2.x @@ -142,17 +145,18 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" diff --git a/.github/workflows/typecheck.yml b/.github/workflows/typecheck.yml index 665d54b2563..199af9f741a 100644 --- a/.github/workflows/typecheck.yml +++ b/.github/workflows/typecheck.yml @@ -12,17 +12,18 @@ jobs: steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" diff --git a/.github/workflows/unit-tests-internal.yml b/.github/workflows/unit-tests-internal.yml index 92b951e8aa0..97ba202fcb3 100644 --- a/.github/workflows/unit-tests-internal.yml +++ b/.github/workflows/unit-tests-internal.yml @@ -5,6 +5,11 @@ permissions: on: workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false jobs: unitTests: @@ -46,17 +51,18 @@ jobs: run: sudo systemctl restart docker - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" @@ -64,7 +70,7 @@ jobs: # ..to avoid rate limits when pulling images - name: 🐳 Login to DockerHub if: ${{ env.DOCKERHUB_USERNAME }} - uses: docker/login-action@v3 + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -101,7 +107,7 @@ jobs: - name: Upload blob reports to GitHub Actions Artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: internal-blob-report-${{ matrix.shardIndex }} path: .vitest-reports/* @@ -115,23 +121,24 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 # no cache enabled, we're not installing deps - name: Download blob reports from GitHub Actions Artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: path: .vitest-reports pattern: internal-blob-report-* diff --git a/.github/workflows/unit-tests-packages.yml b/.github/workflows/unit-tests-packages.yml index 78474e03f27..fb3d513aecb 100644 --- a/.github/workflows/unit-tests-packages.yml +++ b/.github/workflows/unit-tests-packages.yml @@ -5,6 +5,11 @@ permissions: on: workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false jobs: unitTests: @@ -46,17 +51,18 @@ jobs: run: sudo systemctl restart docker - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" @@ -64,7 +70,7 @@ jobs: # ..to avoid rate limits when pulling images - name: 🐳 Login to DockerHub if: ${{ env.DOCKERHUB_USERNAME }} - uses: docker/login-action@v3 + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -101,7 +107,7 @@ jobs: - name: Upload blob reports to GitHub Actions Artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: packages-blob-report-${{ matrix.shardIndex }} path: .vitest-reports/* @@ -115,23 +121,24 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 # no cache enabled, we're not installing deps - name: Download blob reports from GitHub Actions Artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: path: .vitest-reports pattern: packages-blob-report-* diff --git a/.github/workflows/unit-tests-webapp.yml b/.github/workflows/unit-tests-webapp.yml index 523a1887db8..79445503669 100644 --- a/.github/workflows/unit-tests-webapp.yml +++ b/.github/workflows/unit-tests-webapp.yml @@ -5,6 +5,11 @@ permissions: on: workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false jobs: unitTests: @@ -46,17 +51,18 @@ jobs: run: sudo systemctl restart docker - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 cache: "pnpm" @@ -64,7 +70,7 @@ jobs: # ..to avoid rate limits when pulling images - name: 🐳 Login to DockerHub if: ${{ env.DOCKERHUB_USERNAME }} - uses: docker/login-action@v3 + uses: docker/login-action@4907a6ddec9925e35a0a9e82d7399ccc52663121 # v4.1.0 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -109,7 +115,7 @@ jobs: - name: Upload blob reports to GitHub Actions Artifacts if: ${{ !cancelled() }} - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: webapp-blob-report-${{ matrix.shardIndex }} path: .vitest-reports/* @@ -123,23 +129,24 @@ jobs: runs-on: ubuntu-latest steps: - name: ⬇️ Checkout repo - uses: actions/checkout@v4 + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 with: fetch-depth: 0 + persist-credentials: false - name: ⎔ Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v5.0.0 with: - version: 10.23.0 + version: 10.33.2 - name: ⎔ Setup node - uses: buildjet/setup-node@v4 + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0 with: node-version: 20.20.0 # no cache enabled, we're not installing deps - name: Download blob reports from GitHub Actions Artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1 with: path: .vitest-reports pattern: webapp-blob-report-* diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 7c90a5a30ad..96e76279c82 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -5,14 +5,30 @@ permissions: on: workflow_call: + secrets: + DOCKERHUB_USERNAME: + required: false + DOCKERHUB_TOKEN: + required: false jobs: webapp: uses: ./.github/workflows/unit-tests-webapp.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} + e2e-webapp: + uses: ./.github/workflows/e2e-webapp.yml + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} packages: uses: ./.github/workflows/unit-tests-packages.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} internal: uses: ./.github/workflows/unit-tests-internal.yml - secrets: inherit + secrets: + DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }} + DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }} diff --git a/.github/workflows/vouch-check-pr.yml b/.github/workflows/vouch-check-pr.yml new file mode 100644 index 00000000000..29090296bb0 --- /dev/null +++ b/.github/workflows/vouch-check-pr.yml @@ -0,0 +1,49 @@ +name: Vouch - Check PR + +on: + pull_request_target: # zizmor: ignore[dangerous-triggers] needed to comment/close fork PRs; safe because we never check out PR HEAD ref so no fork-controlled code runs + types: [opened, reopened] + +permissions: {} + +jobs: + check-vouch: + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write # auto-close unvouched PRs + issues: read + steps: + - uses: mitchellh/vouch/action/check-pr@c6d80ead49839655b61b422700b7a3bc9d0804a9 # v1.4.2 + with: + pr-number: ${{ github.event.pull_request.number }} + auto-close: true + require-vouch: true + env: + GH_TOKEN: ${{ github.token }} + + require-draft: + needs: check-vouch + permissions: + pull-requests: write # close non-draft PRs with a comment + if: > + github.event.pull_request.draft == false && + github.event.pull_request.author_association != 'MEMBER' && + github.event.pull_request.author_association != 'OWNER' && + github.event.pull_request.author_association != 'COLLABORATOR' && + github.event.pull_request.user.login != 'devin-ai-integration[bot]' && + github.event.pull_request.user.login != 'dependabot[bot]' + runs-on: ubuntu-latest + steps: + - name: Close non-draft PR + env: + GH_TOKEN: ${{ github.token }} + run: | + STATE=$(gh pr view ${{ github.event.pull_request.number }} --repo ${{ github.repository }} --json state -q '.state') + if [ "$STATE" != "OPEN" ]; then + echo "PR is already closed, skipping." + exit 0 + fi + gh pr close ${{ github.event.pull_request.number }} \ + --repo ${{ github.repository }} \ + --comment "Thanks for your contribution! We require all external PRs to be opened in **draft** status first so you can address CodeRabbit review comments and ensure CI passes before requesting a review. Please re-open this PR as a draft. See [CONTRIBUTING.md](https://github.com/${{ github.repository }}/blob/main/CONTRIBUTING.md#pr-workflow) for details." diff --git a/.github/workflows/vouch-manage-by-issue.yml b/.github/workflows/vouch-manage-by-issue.yml new file mode 100644 index 00000000000..51bce367b3e --- /dev/null +++ b/.github/workflows/vouch-manage-by-issue.yml @@ -0,0 +1,24 @@ +name: Vouch - Manage by Issue + +on: + issue_comment: + types: [created] + +permissions: + contents: write + issues: write + +jobs: + manage: + runs-on: ubuntu-latest + if: >- + contains(github.event.comment.body, 'vouch') || + contains(github.event.comment.body, 'denounce') || + contains(github.event.comment.body, 'unvouch') + steps: + - uses: mitchellh/vouch/action/manage-by-issue@c6d80ead49839655b61b422700b7a3bc9d0804a9 # v1.4.2 + with: + comment-id: ${{ github.event.comment.id }} + issue-id: ${{ github.event.issue.number }} + env: + GH_TOKEN: ${{ github.token }} diff --git a/.github/workflows/workflow-checks.yml b/.github/workflows/workflow-checks.yml new file mode 100644 index 00000000000..2e4d50cd9ed --- /dev/null +++ b/.github/workflows/workflow-checks.yml @@ -0,0 +1,51 @@ +name: Workflow Checks + +on: + push: + branches: [main] + paths: + - '.github/workflows/**' + - '.github/actions/**' + - '.github/zizmor.yml' + pull_request: + paths: + - '.github/workflows/**' + - '.github/actions/**' + - '.github/zizmor.yml' + +permissions: {} + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + actionlint: + name: Actionlint + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Run actionlint + uses: docker://rhysd/actionlint:1.7.12@sha256:b1934ee5f1c509618f2508e6eb47ee0d3520686341fec936f3b79331f9315667 + + zizmor: + name: Zizmor + runs-on: ubuntu-latest + permissions: + security-events: write # Upload SARIF to GitHub Security tab + contents: read # Read workflow files for analysis + actions: read # Read workflow run metadata + steps: + - name: Checkout repository + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Run zizmor + uses: zizmorcore/zizmor-action@b1d7e1fb5de872772f31590499237e7cce841e8e # v0.5.3 diff --git a/.github/zizmor.yml b/.github/zizmor.yml new file mode 100644 index 00000000000..2fcbb540127 --- /dev/null +++ b/.github/zizmor.yml @@ -0,0 +1,5 @@ +rules: + unpinned-uses: + config: + policies: + '*': hash-pin diff --git a/.gitignore b/.gitignore index 071b9b59035..d071d5ae4e3 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,11 @@ apps/**/public/build /packages/trigger-sdk/src/package.json /packages/python/src/package.json **/.claude/settings.local.json +.claude/architecture/ +.claude/docs-plans/ +.claude/review-guides/ +.claude/scheduled_tasks.lock .mcp.log .mcp.json -.cursor/debug.log \ No newline at end of file +.cursor/debug.log +ailogger-output.log \ No newline at end of file diff --git a/.server-changes/.gitkeep b/.server-changes/.gitkeep new file mode 100644 index 00000000000..e69de29bb2d diff --git a/.server-changes/README.md b/.server-changes/README.md new file mode 100644 index 00000000000..82716de981c --- /dev/null +++ b/.server-changes/README.md @@ -0,0 +1,81 @@ +# Server Changes + +This directory tracks changes to server-only components (webapp, supervisor, coordinator, etc.) that are not captured by changesets. Changesets only track published npm packages — server changes would otherwise go undocumented. + +## When to add a file + +**Server-only PRs**: If your PR only changes `apps/webapp/`, `apps/supervisor/`, `apps/coordinator/`, or other server components (and does NOT change anything in `packages/`), add a `.server-changes/` file. + +**Mixed PRs** (both packages and server): Just add a changeset as usual. No `.server-changes/` file needed — the changeset covers it. + +**Package-only PRs**: Just add a changeset as usual. + +## File format + +Create a markdown file with a descriptive name: + +``` +.server-changes/fix-batch-queue-stalls.md +``` + +With this format: + +```markdown +--- +area: webapp +type: fix +--- + +Speed up batch queue processing by removing stalls and fixing retry race +``` + +### Fields + +- **area** (required): `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- **type** (required): `feature` | `fix` | `improvement` | `breaking` + +### Description + +The body text (below the frontmatter) is a one-line description of the change. Keep it concise — it will appear in release notes. + +## Lifecycle + +1. Engineer adds a `.server-changes/` file in their PR +2. Files accumulate on `main` as PRs merge +3. The changeset release PR includes these in its summary +4. After the release merges, CI cleans up the consumed files + +## Examples + +**New feature:** + +```markdown +--- +area: webapp +type: feature +--- + +TRQL query language and the Query page +``` + +**Bug fix:** + +```markdown +--- +area: webapp +type: fix +--- + +Fix schedule limit counting for orgs with custom limits +``` + +**Improvement:** + +```markdown +--- +area: webapp +type: improvement +--- + +Use the replica for API auth queries to reduce primary load +``` diff --git a/.server-changes/admin-tabs-preserve-search.md b/.server-changes/admin-tabs-preserve-search.md new file mode 100644 index 00000000000..7caaa642626 --- /dev/null +++ b/.server-changes/admin-tabs-preserve-search.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Preserve search string when switching between the Users and Organizations tabs in the admin dashboard. diff --git a/.server-changes/agent-playground.md b/.server-changes/agent-playground.md new file mode 100644 index 00000000000..f2e0852add7 --- /dev/null +++ b/.server-changes/agent-playground.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +New Agent Playground for testing `chat.agent` tasks interactively — multi-turn chat with tool-call visualization, a side panel for payload / schema / clientData configuration, and trigger-config controls for `maxDuration`, version pin, and region. diff --git a/.server-changes/agents-dashboard.md b/.server-changes/agents-dashboard.md new file mode 100644 index 00000000000..1aca65320bb --- /dev/null +++ b/.server-changes/agents-dashboard.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +New Agents page in the dashboard listing every `chat.agent` task in the environment with active/inactive status and run counts, plus fuzzy search for navigating large agent catalogs. diff --git a/.server-changes/ai-span-inspector.md b/.server-changes/ai-span-inspector.md new file mode 100644 index 00000000000..41f7a5dea90 --- /dev/null +++ b/.server-changes/ai-span-inspector.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +AI generation spans in the run trace get a dedicated inspector showing model, provider, token counts, cost, token speed, finish reason, service tier, tool count, and a link to the prompt version that produced the generation. diff --git a/.server-changes/dev-cli-disconnect-md b/.server-changes/dev-cli-disconnect-md new file mode 100644 index 00000000000..a0790d70765 --- /dev/null +++ b/.server-changes/dev-cli-disconnect-md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Added `/engine/v1/dev/disconnect` endpoint to auto-cancel runs when the CLI disconnects. Maximum of 500 runs can be cancelled. Uses the bulk action system when there are more than 25 runs to cancel. \ No newline at end of file diff --git a/.server-changes/fix-ck-queue-length-cap-and-dashboard.md b/.server-changes/fix-ck-queue-length-cap-and-dashboard.md new file mode 100644 index 00000000000..9b225b29d92 --- /dev/null +++ b/.server-changes/fix-ck-queue-length-cap-and-dashboard.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Per-queue length limits and the dashboard's "Queued | Running" columns now reflect the true total across all concurrency-key variants. Previously both read 0 for any queue that used concurrency keys, allowing the per-queue cap to be bypassed. diff --git a/.server-changes/fix-worker-deployment-version-race.md b/.server-changes/fix-worker-deployment-version-race.md new file mode 100644 index 00000000000..b0ad7de9e89 --- /dev/null +++ b/.server-changes/fix-worker-deployment-version-race.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Retry on unique-constraint collisions when assigning the next worker deployment version so concurrent deploys to the same environment no longer fail with P2002. diff --git a/.server-changes/google-auth-conflict-warn.md b/.server-changes/google-auth-conflict-warn.md new file mode 100644 index 00000000000..4e6b630ab21 --- /dev/null +++ b/.server-changes/google-auth-conflict-warn.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Downgrade the "Google auth conflict" log from `error` to `warn`. This branch handles an expected user-state mismatch (Google ID belongs to one user, email is on another) by returning the existing auth user — there's no exception to chase, so it shouldn't page on the Sentry error channel. diff --git a/.server-changes/magic-link-email-validation.md b/.server-changes/magic-link-email-validation.md new file mode 100644 index 00000000000..f91ad60e94a --- /dev/null +++ b/.server-changes/magic-link-email-validation.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Validate email format on the magic link login form. diff --git a/.server-changes/models-registry.md b/.server-changes/models-registry.md new file mode 100644 index 00000000000..ee87f625868 --- /dev/null +++ b/.server-changes/models-registry.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +New Models page in the dashboard: a provider-grouped catalog of LLMs (OpenAI, Anthropic, Google, etc.) with pricing, capabilities, and cross-tenant usage metrics, plus per-model detail pages with token / cost / latency charts and a side-by-side compare panel. diff --git a/.server-changes/mollifier-burst-protection.md b/.server-changes/mollifier-burst-protection.md new file mode 100644 index 00000000000..182811d68fd --- /dev/null +++ b/.server-changes/mollifier-burst-protection.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Lay the groundwork for an opt-in burst-protection layer on the trigger hot path. This release ships **monitoring only** — operators can observe per-env trigger storms via two opt-in modes, but no trigger calls are diverted or rate-limited yet (active burst smoothing follows in a later release). All new env vars are prefixed `TRIGGER_MOLLIFIER_*` and default off, so existing deployments see no behaviour change. With `TRIGGER_MOLLIFIER_SHADOW_MODE=1`, each trigger evaluates a per-env rate counter and logs `mollifier.would_mollify` when the threshold is crossed. With `TRIGGER_MOLLIFIER_ENABLED=1` plus a per-org `mollifierEnabled` flag, over-threshold triggers are also recorded in a Redis audit buffer alongside the normal `engine.trigger` call, drained by a background no-op consumer. The drainer has its own switch (`TRIGGER_MOLLIFIER_DRAINER_ENABLED`) so multi-replica deployments can pin the polling loop to a single worker service while every replica still produces into the buffer; unset, it inherits `TRIGGER_MOLLIFIER_ENABLED` so single-container self-hosters need only one flag. Drainer misconfiguration (shutdown-timeout reconciliation against `GRACEFUL_SHUTDOWN_TIMEOUT`, or `TRIGGER_MOLLIFIER_ENABLED=1` with no buffer Redis) now throws `MollifierConfigurationError` at boot and crashes the process, so the misconfig surfaces to the orchestrator instead of disappearing into a log line; transient init failures (Redis blip) are still logged-and-swallowed. Emits the `mollifier.decisions` OTel counter for per-env rate visibility. diff --git a/.server-changes/otel-attribute-utf16-sanitization.md b/.server-changes/otel-attribute-utf16-sanitization.md new file mode 100644 index 00000000000..941c7185bc8 --- /dev/null +++ b/.server-changes/otel-attribute-utf16-sanitization.md @@ -0,0 +1,23 @@ +--- +area: webapp +type: fix +--- + +Recover from ClickHouse `JSONEachRow` parse failures caused by lone +UTF-16 surrogates in OTel attribute strings (`Cannot parse JSON object +here ... ParallelParsingBlockInputFormat`). + +`ClickhouseEventRepository.#flushBatch` and `#flushLlmMetricsBatch` now +retry once after sanitizing every row in the batch: any string value +containing a lone surrogate is replaced with `"[invalid-utf16]"`. If +the sanitizer touched no fields (the parse error isn't a surrogate +issue) or the retry still fails, the batch is dropped without further +ClickHouse round-trips, `permanentlyDroppedBatches` increments, and an +error log with a 1KB sample row is emitted. Non-parse errors propagate +unchanged. + +Detection reuses `detectBadJsonStrings` via `JSON.stringify(value)`, +with a latent regex bug fixed: the low-surrogate hex nibble matched +`[cd]` instead of `[c-f]`, missing the U+DE00–U+DFFF half of the range +and false-flagging common emoji pairs. Healthy batches pay zero scan +cost — the check only runs when ClickHouse has already rejected. diff --git a/.server-changes/plugin-auth-path.md b/.server-changes/plugin-auth-path.md new file mode 100644 index 00000000000..c8269125ffc --- /dev/null +++ b/.server-changes/plugin-auth-path.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Webapp now supports a plugin system. Initially consolidates authentication and authorization paths. diff --git a/.server-changes/prompts-dashboard.md b/.server-changes/prompts-dashboard.md new file mode 100644 index 00000000000..10397b9da22 --- /dev/null +++ b/.server-changes/prompts-dashboard.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +New Prompts page in the dashboard: list view with per-prompt usage sparklines, detail view with the template alongside Generations / Metrics / Versions tabs, and a dashboard override UI for changing the template text or model without redeploying. diff --git a/.server-changes/realtimestreams-dedupe.md b/.server-changes/realtimestreams-dedupe.md new file mode 100644 index 00000000000..69987f7b4b6 --- /dev/null +++ b/.server-changes/realtimestreams-dedupe.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Dedupe the `realtimeStreams` array push on `PUT /realtime/v1/streams/:runId/:target/:streamId` so repeat stream-init calls for the same `(run, streamId)` skip the row UPDATE, mirroring the existing append handler. diff --git a/.server-changes/run-agent-view.md b/.server-changes/run-agent-view.md new file mode 100644 index 00000000000..570351f89ed --- /dev/null +++ b/.server-changes/run-agent-view.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Run detail page gains an Agent view alongside the Trace view, rendering the agent's `UIMessage` conversation in real time from the backing Session for any run whose `taskKind` is `AGENT`. diff --git a/.server-changes/runs-task-source-filter.md b/.server-changes/runs-task-source-filter.md new file mode 100644 index 00000000000..70c8e2ff895 --- /dev/null +++ b/.server-changes/runs-task-source-filter.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +Task source filter on the Runs list — slice runs by Standard, Scheduled, or Agent so agent runs can be separated from mixed workloads at a glance. diff --git a/.server-changes/s2-access-token-cache-ops-fingerprint.md b/.server-changes/s2-access-token-cache-ops-fingerprint.md new file mode 100644 index 00000000000..21937c341fd --- /dev/null +++ b/.server-changes/s2-access-token-cache-ops-fingerprint.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Include the S2 access-token scope fingerprint in its cache key so a scope change in code (e.g. adding a new op) auto-invalidates pre-deploy cached tokens instead of returning stale ones for up to 24h. diff --git a/.server-changes/sanitize-loader-action-leaks.md b/.server-changes/sanitize-loader-action-leaks.md new file mode 100644 index 00000000000..f4cb871f4cf --- /dev/null +++ b/.server-changes/sanitize-loader-action-leaks.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: fix +--- + +Expand API error response sanitization to additional loaders and actions so internal exception messages (Prisma errors, etc.) no longer leak to callers via 5xx response bodies. diff --git a/.server-changes/sessions-dashboard.md b/.server-changes/sessions-dashboard.md new file mode 100644 index 00000000000..7adc299aec6 --- /dev/null +++ b/.server-changes/sessions-dashboard.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: feature +--- + +New Sessions page in the dashboard for inspecting `chat.agent` Session rows alongside their underlying runs, with filters by status, type, task identifier, and period, and a detail view that streams the live conversation from the backing Session's `.out` and `.in` channels. diff --git a/.server-changes/streamdown-v2-upgrade.md b/.server-changes/streamdown-v2-upgrade.md new file mode 100644 index 00000000000..8a0b3f17af0 --- /dev/null +++ b/.server-changes/streamdown-v2-upgrade.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Upgrade streamdown from v1.4.0 to v2.5.0. Custom Shiki syntax highlighting theme matching our CodeMirror dark theme colors. Consolidate duplicated lazy StreamdownRenderer into a shared component. diff --git a/.server-changes/supervisor-wide-events.md b/.server-changes/supervisor-wide-events.md new file mode 100644 index 00000000000..49b73668ddb --- /dev/null +++ b/.server-changes/supervisor-wide-events.md @@ -0,0 +1,6 @@ +--- +area: supervisor +type: feature +--- + +Wide-event observability for the dequeue loop, workload-server routes, and run socket lifecycle. Off by default behind `TRIGGER_WIDE_EVENTS_ENABLED`. diff --git a/.server-changes/task-metadata-cache.md b/.server-changes/task-metadata-cache.md new file mode 100644 index 00000000000..a71bbdf347b --- /dev/null +++ b/.server-changes/task-metadata-cache.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Cache task defaults in Redis so the trigger API skips per-request database lookups, restoring the fast trigger path when callers pass queue and TTL options. diff --git a/.server-changes/vercel-atomic-disable-clear-trigger-version.md b/.server-changes/vercel-atomic-disable-clear-trigger-version.md new file mode 100644 index 00000000000..3483a080c8b --- /dev/null +++ b/.server-changes/vercel-atomic-disable-clear-trigger-version.md @@ -0,0 +1,9 @@ +--- +area: webapp +type: feature +--- + +Show the currently pinned `TRIGGER_VERSION` under the Atomic deployments toggle on the Vercel +integration settings, and prompt the user to clear it from Vercel production when they disable +atomic deployments. Also mark `TRIGGER_SECRET_KEY` writes to Vercel as `sensitive` so the value +cannot be read back from the Vercel dashboard or API once written. diff --git a/.server-changes/webapp-sentry-fingerprint-p1001.md b/.server-changes/webapp-sentry-fingerprint-p1001.md new file mode 100644 index 00000000000..dd2f1ecc55d --- /dev/null +++ b/.server-changes/webapp-sentry-fingerprint-p1001.md @@ -0,0 +1,6 @@ +--- +area: webapp +type: improvement +--- + +Group Prisma P1001 ("Can't reach database server") errors into a single Sentry issue via a `beforeSend` fingerprint rule, so DB outages no longer fan out into hundreds of distinct issues that bury other alerts. Adds a small extensible rule table for future collapsing rules. diff --git a/.vouch.yml b/.vouch.yml new file mode 100644 index 00000000000..228b51ab2fe --- /dev/null +++ b/.vouch.yml @@ -0,0 +1,3 @@ +vouch: + - github: edosrecki + - github: GautamBytes diff --git a/.vscode/launch.json b/.vscode/launch.json index d135aa70a20..71a76904a2b 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -31,6 +31,15 @@ "cwd": "${workspaceFolder}/apps/webapp", "sourceMaps": true }, + { + "type": "node-terminal", + "request": "launch", + "name": "Debug opened test file", + "command": "pnpm run test -- ./${relativeFile}", + "envFile": "${workspaceFolder}/.env", + "cwd": "${workspaceFolder}", + "sourceMaps": true + }, { "type": "chrome", "request": "launch", diff --git a/.vscode/settings.json b/.vscode/settings.json index 12aefeb358f..fd9f3dcde0c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,5 +7,5 @@ "packages/cli-v3/e2e": true }, "vitest.disableWorkspaceWarning": true, - "typescript.experimental.useTsgo": false + "chat.agent.maxRequests": 10000 } diff --git a/AGENTS.md b/AGENTS.md index 99496f91bde..d3f23dfaf2c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -13,12 +13,13 @@ This repository is a pnpm monorepo managed with Turbo. It contains multiple apps See `ai/references/repo.md` for a more complete explanation of the workspaces. ## Development setup -1. Install dependencies with `pnpm i` (pnpm `10.23.0` and Node.js `20.20.0` are required). +1. Install dependencies with `pnpm i` (pnpm `10.33.2` and Node.js `20.20.0` are required). 2. Copy `.env.example` to `.env` and generate a random 16 byte hex string for `ENCRYPTION_KEY` (`openssl rand -hex 16`). Update other secrets if needed. 3. Start the local services with Docker: ```bash pnpm run docker ``` + Add `:full` (`pnpm run docker:full`) for the optional observability + chaos tooling. See `docker/docker-compose.extras.yml`. 4. Run database migrations: ```bash pnpm run db:migrate diff --git a/CHANGESETS.md b/CHANGESETS.md index 722fe64eb4c..2e225b9ad34 100644 --- a/CHANGESETS.md +++ b/CHANGESETS.md @@ -1,24 +1,49 @@ -# Changesets +# Changesets and Server Changes -Trigger.dev uses [changesets](https://github.com/changesets/changesets) to manage updated our packages and releasing them to npm. +Trigger.dev uses [changesets](https://github.com/changesets/changesets) to manage package versions and releasing them to npm. For server-only changes, we use a lightweight `.server-changes/` convention. -## Adding a changeset +## Adding a changeset (package changes) To add a changeset, use `pnpm run changeset:add` and follow the instructions [here](https://github.com/changesets/changesets/blob/main/docs/adding-a-changeset.md). Please only ever select one of our public packages when adding a changeset. -## Release instructions (local only) +## Adding a server change (server-only changes) -Based on the instructions [here](https://github.com/changesets/changesets/blob/main/docs/intro-to-using-changesets.md) +If your PR only changes server components (`apps/webapp/`, `apps/supervisor/`, etc.) and does NOT change any published packages, add a `.server-changes/` file instead of a changeset: -1. Run `pnpm run changeset:version` -2. Run `pnpm run changeset:release` +```sh +cat > .server-changes/fix-batch-queue-stalls.md << 'EOF' +--- +area: webapp +type: fix +--- + +Speed up batch queue processing by removing stalls and fixing retry race +EOF +``` + +- `area`: `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- `type`: `feature` | `fix` | `improvement` | `breaking` + +For **mixed PRs** (both packages and server): just add a changeset. No `.server-changes/` file needed. + +See `.server-changes/README.md` for full documentation. + +## When to add which + +| PR changes | What to add | +|---|---| +| Only packages (`packages/`) | Changeset (`pnpm run changeset:add`) | +| Only server (`apps/`) | `.server-changes/` file | +| Both packages and server | Just the changeset | ## Release instructions (CI) Please follow the best-practice of adding changesets in the same commit as the code making the change with `pnpm run changeset:add`, as it will allow our release.yml CI workflow to function properly: -- Anytime new changesets are added in a commit in the `main` branch, the [release.yml](./.github/workflows/release.yml) workflow will run and will automatically create/update a PR with a fresh run of `pnpm run changeset:version`. -- When the version PR is merged into `main`, the release.yml workflow will automatically run `pnpm run changeset:release` to build and release packages to npm. +- Anytime new changesets are added in a commit in the `main` branch, the [changesets-pr.yml](./.github/workflows/changesets-pr.yml) workflow will run and will automatically create/update a PR with a fresh run of `pnpm run changeset:version`. +- The release PR body is automatically enhanced with a clean, deduplicated summary that includes both package changes and `.server-changes/` entries. +- Consumed `.server-changes/` files are removed on the `changeset-release/main` branch — the same way changesets deletes `.changeset/*.md` files. When the release PR merges, they're gone from main. +- When the version PR is merged into `main`, the [release.yml](./.github/workflows/release.yml) workflow will automatically build, release packages to npm, and create a single unified GitHub release. ## Pre-release instructions diff --git a/CLAUDE.md b/CLAUDE.md index acc59359707..99b5c4c4033 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,73 +1,75 @@ # CLAUDE.md -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. +This file provides guidance to Claude Code when working with this repository. Subdirectory CLAUDE.md files provide deeper context when you navigate into specific areas. ## Build and Development Commands -This is a pnpm 10.23.0 monorepo using Turborepo. Run commands from root with `pnpm run`. +This is a pnpm 10.33.2 monorepo using Turborepo. Run commands from root with `pnpm run`. -### Essential Commands +**Adding dependencies:** Edit `package.json` directly instead of using `pnpm add`, then run `pnpm i` from the repo root. See `.claude/rules/package-installation.md` for the full process. ```bash -# Start Docker services (PostgreSQL, Redis, Electric) -pnpm run docker - -# Run database migrations -pnpm run db:migrate - -# Seed the database (required for reference projects) -pnpm run db:seed +pnpm run docker # Core dev services (Postgres, Redis, Electric, MinIO, ClickHouse, s2-lite) +# pnpm run docker:full # Same + observability stack (Prometheus, Grafana, OTEL) and chaos tooling +pnpm run db:migrate # Run database migrations +pnpm run db:seed # Seed the database (required for reference projects) # Build packages (required before running) pnpm run build --filter webapp && pnpm run build --filter trigger.dev && pnpm run build --filter @trigger.dev/sdk -# Run webapp in development mode (http://localhost:3030) -pnpm run dev --filter webapp +pnpm run dev --filter webapp # Run webapp (http://localhost:3030) +pnpm run dev --filter trigger.dev --filter "@trigger.dev/*" # Watch CLI and packages +``` + +### Verifying Changes + +The verification command depends on where the change lives: + +- **Apps and internal packages** (`apps/*`, `internal-packages/*`): Use `typecheck`. **Never use `build`** for these — building proves almost nothing about correctness. +- **Public packages** (`packages/*`): Use `build`. -# Build and watch for changes (CLI and packages) -pnpm run dev --filter trigger.dev --filter "@trigger.dev/*" +```bash +# Apps and internal packages — use typecheck +pnpm run typecheck --filter webapp # ~1-2 minutes +pnpm run typecheck --filter @internal/run-engine + +# Public packages — use build +pnpm run build --filter @trigger.dev/sdk +pnpm run build --filter @trigger.dev/core ``` -### Testing +Only run typecheck/build after major changes (new files, significant refactors, schema changes). For small edits, trust the types and let CI catch issues. + +## Testing We use vitest exclusively. **Never mock anything** - use testcontainers instead. ```bash -# Run all tests for a package -pnpm run test --filter webapp - -# Run a single test file (preferred - cd into directory first) +pnpm run test --filter webapp # All tests for a package cd internal-packages/run-engine -pnpm run test ./src/engine/tests/ttl.test.ts --run - -# May need to build dependencies first -pnpm run build --filter @internal/run-engine +pnpm run test ./src/engine/tests/ttl.test.ts --run # Single test file +pnpm run build --filter @internal/run-engine # May need to build deps first ``` -Test files go next to source files (e.g., `MyService.ts` → `MyService.test.ts`). +Test files go next to source files (e.g., `MyService.ts` -> `MyService.test.ts`). -#### Testcontainers for Redis/PostgreSQL +### Testcontainers for Redis/PostgreSQL ```typescript import { redisTest, postgresTest, containerTest } from "@internal/testcontainers"; -// Redis only redisTest("should use redis", async ({ redisOptions }) => { /* ... */ }); - -// PostgreSQL only postgresTest("should use postgres", async ({ prisma }) => { /* ... */ }); - -// Both Redis and PostgreSQL containerTest("should use both", async ({ prisma, redisOptions }) => { /* ... */ }); ``` -### Changesets +## Changesets and Server Changes When modifying any public package (`packages/*` or `integrations/*`), add a changeset: @@ -77,227 +79,172 @@ pnpm run changeset:add - Default to **patch** for bug fixes and minor changes - Confirm with maintainers before selecting **minor** (new features) -- **Never** select major (breaking changes) without explicit approval +- **Never** select major without explicit approval -## Architecture Overview +When modifying only server components (`apps/webapp/`, `apps/supervisor/`, etc.) with no package changes, add a `.server-changes/` file instead. See `.server-changes/README.md` for format and documentation. -### Apps +## Dependency Pinning -- **apps/webapp**: Remix 2.1.0 app - main API, dashboard, and Docker image. Uses Express server. -- **apps/supervisor**: Node.js app handling task execution, interfacing with Docker/Kubernetes. +Zod is pinned to a single version across the entire monorepo (currently `3.25.76`). When adding zod to a new or existing package, use the **exact same version** as the rest of the repo - never a different version or a range. Mismatched zod versions cause runtime type incompatibilities (e.g., schemas from one package can't be used as body validators in another). -### Public Packages +## Architecture Overview -- **packages/trigger-sdk** (`@trigger.dev/sdk`): Main SDK -- **packages/cli-v3** (`trigger.dev`): CLI package -- **packages/core** (`@trigger.dev/core`): Shared code between SDK and webapp. Import subpaths only (never root). -- **packages/build**: Build extensions and types -- **packages/react-hooks**: React hooks for realtime and triggering -- **packages/redis-worker** (`@trigger.dev/redis-worker`): Custom Redis-based background job system +### Request Flow -### Internal Packages +User API call -> Webapp routes -> Services -> RunEngine -> Redis Queue -> Supervisor -> Container execution -> Results back through RunEngine -> ClickHouse (analytics) + PostgreSQL (state) -- **internal-packages/database** (`@trigger.dev/database`): Prisma 6.14.0 client and schema -- **internal-packages/clickhouse** (`@internal/clickhouse`): ClickHouse client and schema migrations -- **internal-packages/run-engine** (`@internal/run-engine`): "Run Engine 2.0" - run lifecycle management -- **internal-packages/redis** (`@internal/redis`): Redis client creation utilities -- **internal-packages/testcontainers** (`@internal/testcontainers`): Test helpers for Redis/PostgreSQL containers -- **internal-packages/zodworker** (`@internal/zodworker`): Graphile-worker wrapper (being replaced by redis-worker) +### Apps -### Reference Projects +- **apps/webapp**: Remix 2.17.4 app - main API, dashboard, orchestration. Uses Express server. +- **apps/supervisor**: Manages task execution containers (Docker/Kubernetes). -The `references/` directory contains test workspaces for developing and testing new SDK and platform features. Use these projects (e.g., `references/hello-world`) to manually test changes to the CLI, SDK, core packages, and webapp before submitting PRs. +### Public Packages -## Webapp Development +- **packages/trigger-sdk** (`@trigger.dev/sdk`): Main SDK for writing tasks +- **packages/cli-v3** (`trigger.dev`): CLI - also bundles code that goes into customer task images +- **packages/core** (`@trigger.dev/core`): Shared types. **Import subpaths only** (never root). +- **packages/build** (`@trigger.dev/build`): Build extensions and types +- **packages/react-hooks**: React hooks for realtime and triggering +- **packages/redis-worker** (`@trigger.dev/redis-worker`): Redis-based background job system -### Key Locations +### Internal Packages -- Trigger API: `apps/webapp/app/routes/api.v1.tasks.$taskId.trigger.ts` -- Batch trigger: `apps/webapp/app/routes/api.v1.tasks.batch.ts` -- Prisma setup: `apps/webapp/app/db.server.ts` -- Run engine config: `apps/webapp/app/v3/runEngine.server.ts` -- Services: `apps/webapp/app/v3/services/**/*.server.ts` -- Presenters: `apps/webapp/app/v3/presenters/**/*.server.ts` -- OTEL endpoints: `apps/webapp/app/routes/otel.v1.logs.ts`, `otel.v1.traces.ts` +- **internal-packages/database**: Prisma 6.14.0 client and schema (PostgreSQL) +- **internal-packages/clickhouse**: ClickHouse client, schema migrations, analytics queries +- **internal-packages/run-engine**: "Run Engine 2.0" - core run lifecycle management +- **internal-packages/redis**: Redis client creation utilities (ioredis) +- **internal-packages/testcontainers**: Test helpers for Redis/PostgreSQL containers +- **internal-packages/schedule-engine**: Durable cron scheduling +- **internal-packages/zodworker**: Graphile-worker wrapper (DEPRECATED - use redis-worker) -### Environment Variables +### Legacy V1 Engine Code -Access via `env` export from `apps/webapp/app/env.server.ts`, never `process.env` directly. +The `apps/webapp/app/v3/` directory name is misleading - most code there is actively used by V2. Only specific files are V1-only legacy (MarQS queue, triggerTaskV1, cancelTaskRunV1, etc.). See `apps/webapp/CLAUDE.md` for the exact list. When you encounter V1/V2 branching in services, only modify V2 code paths. All new work uses Run Engine 2.0 (`@internal/run-engine`) and redis-worker. -For testable code, **never import env.server.ts** in test files. Pass configuration as options instead. Example pattern: +### Documentation -- `realtimeClient.server.ts` (testable service) -- `realtimeClientGlobal.server.ts` (configuration) +Docs live in `docs/` as a Mintlify site (MDX format). See `docs/CLAUDE.md` for conventions. -### Legacy vs Run Engine 2.0 +### Reference Projects -The codebase is transitioning from the "legacy run engine" (spread across codebase) to "Run Engine 2.0" (`@internal/run-engine`). Focus on Run Engine 2.0 for new work. +The `references/` directory contains test workspaces for testing SDK and platform features. Use `references/hello-world` to manually test changes before submitting PRs. ## Docker Image Guidelines -When updating Docker image references in `docker/Dockerfile` or other container files: +When updating Docker image references: - **Always use multiplatform/index digests**, not architecture-specific digests -- Architecture-specific digests (e.g., for `linux/amd64` only) will cause CI failures on different build environments -- On Docker Hub, the multiplatform digest is shown on the main image page, while architecture-specific digests are listed under "OS/ARCH" -- Example: Use `node:20.20-bullseye-slim@sha256:abc123...` where the digest is from the multiplatform index, not from a specific OS/ARCH variant - -## Database Migrations (PostgreSQL) - -1. Edit `internal-packages/database/prisma/schema.prisma` -2. Create migration: - ```bash - cd internal-packages/database - pnpm run db:migrate:dev:create --name "add_new_column" - ``` -3. **Important**: Generated migration includes extraneous changes. Remove lines related to: - - `_BackgroundWorkerToBackgroundWorkerFile` - - `_BackgroundWorkerToTaskQueue` - - `_TaskRunToTaskRunTag` - - `_WaitpointRunConnections` - - `_completedWaitpoints` - - `SecretStore_key_idx` - - Various `TaskRun` indexes unless you added them -4. Apply migration: - ```bash - pnpm run db:migrate:deploy && pnpm run generate - ``` - -### Index Migration Rules - -- Indexes **must use CONCURRENTLY** to avoid table locks -- **CONCURRENTLY indexes must be in their own separate migration file** - they cannot be combined with other schema changes - -## ClickHouse Migrations - -ClickHouse migrations use Goose format and live in `internal-packages/clickhouse/schema/`. - -1. Create a new numbered SQL file (e.g., `010_add_new_column.sql`) -2. Use Goose markers: - - ```sql - -- +goose Up - ALTER TABLE trigger_dev.your_table - ADD COLUMN new_column String DEFAULT ''; - - -- +goose Down - ALTER TABLE trigger_dev.your_table - DROP COLUMN new_column; - ``` - -Follow naming conventions in `internal-packages/clickhouse/README.md`: - -- `raw_` prefix for input tables -- `_v1`, `_v2` suffixes for versioning -- `_mv_v1` suffix for materialized views +- Architecture-specific digests cause CI failures on different build environments +- Use the digest from the main Docker Hub page, not from a specific OS/ARCH variant ## Writing Trigger.dev Tasks -Always import from `@trigger.dev/sdk`. Never use `@trigger.dev/sdk/v3` or deprecated `client.defineJob` pattern. +Always import from `@trigger.dev/sdk`. Never use `@trigger.dev/sdk/v3` or deprecated `client.defineJob`. ```typescript import { task } from "@trigger.dev/sdk"; -// Every task must be exported export const myTask = task({ - id: "my-task", // Unique ID + id: "my-task", run: async (payload: { message: string }) => { - // Task logic - no timeouts + // Task logic }, }); ``` ### SDK Documentation Rules -The `rules/` directory contains versioned documentation for writing Trigger.dev tasks, distributed to users via the SDK installer. Current version is defined in `rules/manifest.json`. - -- `rules/4.3.0/` - Latest: batch trigger v2 (1,000 items, 3MB payloads), debouncing -- `rules/4.1.0/` - Realtime streams v2, updated config -- `rules/4.0.0/` - Base v4 SDK documentation - -When adding new SDK features, create a new version directory with only the files that changed from the previous version. Update `manifest.json` to point unchanged files to previous versions. - -### Claude Code Skill - -The `.claude/skills/trigger-dev-tasks/` skill provides Claude Code with Trigger.dev task expertise. It includes: - -- `SKILL.md` - Core instructions and patterns -- Reference files for basic tasks, advanced tasks, scheduled tasks, realtime, and config - -Keep the skill in sync with the latest rules version when SDK features change. +The `rules/` directory contains versioned SDK documentation distributed via the SDK installer. Current version: `rules/manifest.json`. Do NOT update `rules/` or `.claude/skills/trigger-dev-tasks/` unless explicitly asked - these are maintained in separate dedicated passes. ## Testing with hello-world Reference Project First-time setup: -1. Run `pnpm run db:seed` to seed the database (creates the hello-world project) +1. `pnpm run db:seed` to seed the database 2. Build CLI: `pnpm run build --filter trigger.dev && pnpm i` -3. Authorize CLI: `cd references/hello-world && pnpm exec trigger login -a http://localhost:3030` +3. Authorize: `cd references/hello-world && pnpm exec trigger login -a http://localhost:3030` -Running: - -```bash -cd references/hello-world -pnpm exec trigger dev # or with --log-level debug -``` +Running: `cd references/hello-world && pnpm exec trigger dev` ## Local Task Testing Workflow -This workflow enables Claude Code to run the webapp and trigger dev simultaneously, trigger tasks, and inspect results for testing code changes. - ### Step 1: Start Webapp in Background ```bash # Run from repo root with run_in_background: true pnpm run dev --filter webapp -``` - -Verify webapp is running: - -```bash -curl -s http://localhost:3030/healthcheck # Should return 200 +curl -s http://localhost:3030/healthcheck # Verify running ``` ### Step 2: Start Trigger Dev in Background ```bash -# Run from hello-world directory with run_in_background: true cd references/hello-world && pnpm exec trigger dev +# Wait for "Local worker ready [node]" ``` -The worker will build and register tasks. Check output for "Local worker ready [node]" message. - ### Step 3: Trigger and Monitor Tasks via MCP -Use the Trigger.dev MCP tools to interact with tasks: - ``` -# Get current worker and registered tasks mcp__trigger__get_current_worker(projectRef: "proj_rrkpdguyagvsoktglnod", environment: "dev") - -# Trigger a task -mcp__trigger__trigger_task( - projectRef: "proj_rrkpdguyagvsoktglnod", - environment: "dev", - taskId: "hello-world", - payload: {"message": "Hello from Claude"} -) - -# List runs to see status -mcp__trigger__list_runs( - projectRef: "proj_rrkpdguyagvsoktglnod", - environment: "dev", - taskIdentifier: "hello-world", - limit: 5 -) +mcp__trigger__trigger_task(projectRef: "proj_rrkpdguyagvsoktglnod", environment: "dev", taskId: "hello-world", payload: {"message": "Hello"}) +mcp__trigger__list_runs(projectRef: "proj_rrkpdguyagvsoktglnod", environment: "dev", taskIdentifier: "hello-world", limit: 5) ``` -### Step 4: Monitor Execution +Dashboard: http://localhost:3030/orgs/references-9dfd/projects/hello-world-97DT/env/dev/runs + + + +# Skill mappings — when working in these areas, load the linked skill file into context. + +skills: + +- task: "Using agentcrumbs for debug tracing, adding crumbs, trails, markers, querying traces, or stripping debug code before merge" + load: "node_modules/agentcrumbs/skills/agentcrumbs/SKILL.md" +- task: "Setting up agentcrumbs in the project, initializing namespace catalog, running crumbs init" +load: "node_modules/agentcrumbs/skills/agentcrumbs/init/SKILL.md" + + +## agentcrumbs + +Add crumbs as you write code — not just when debugging. Mark lines with +`// @crumbs` or wrap blocks in `// #region @crumbs`. They stay on the +branch throughout development and are stripped by `agentcrumbs strip` +before merge. -- Check trigger dev output file for real-time execution logs -- Successful runs show: `Task | Run ID | Success (Xms)` -- Dashboard available at: http://localhost:3030/orgs/references-9dfd/projects/hello-world-97DT/env/dev/runs +### Namespaces -### Key Project Refs +| Namespace | Description | Path | +| ----------------- | ---------------------------------------------- | ----------------------------------- | +| `webapp` | Main Remix app — API, dashboard, orchestration | `apps/webapp` | +| `supervisor` | Task execution container manager | `apps/supervisor` | +| `sdk` | Main SDK for writing tasks | `packages/trigger-sdk` | +| `cli` | CLI and task image bundling | `packages/cli-v3` | +| `core` | Shared types and subpath exports | `packages/core` | +| `build` | Build extensions and types | `packages/build` | +| `react-hooks` | React hooks for realtime/triggering | `packages/react-hooks` | +| `redis-worker` | Redis-based background job system | `packages/redis-worker` | +| `run-engine` | Run Engine 2.0 — core run lifecycle | `internal-packages/run-engine` | +| `database` | Prisma client and schema | `internal-packages/database` | +| `clickhouse` | ClickHouse client, migrations, analytics | `internal-packages/clickhouse` | +| `schedule-engine` | Durable cron scheduling | `internal-packages/schedule-engine` | +| `run-queue` | Run queue management | `internal-packages/run-queue` | + +Do not invent new namespaces — pick from this table or ask first. + +### For PR reviewers + +Do not flag `// @crumbs` markers, `#region @crumbs` blocks, or agentcrumbs +imports in reviews. These are temporary debug instrumentation stripped +before merge. Data logged in crumbs (IDs, names, values) never reaches +production. + +### CLI + +```bash +pnpm exec agentcrumbs collect # start collector (multi-service) +pnpm exec agentcrumbs tail --app trigger # live tail +pnpm exec agentcrumbs clear --app trigger # remove crumbs before merge +``` -- hello-world: `proj_rrkpdguyagvsoktglnod` +The preferred way to query for crumbs is to use `pnpm exec agentcrumbs query --app trigger` with the `--limit` option and cursor pagination, and clear existing crumbs before reproducing a bug via `pnpm exec agentcrumbs clear --app trigger`. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0162350ffc1..22cc7847f4b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,10 +2,25 @@ Thank you for taking the time to contribute to Trigger.dev. Your involvement is not just welcomed, but we encourage it! 🚀 -Please take some time to read this guide to understand contributing best practices for Trigger.dev. +Please take some time to read this guide to understand contributing best practices for Trigger.dev. Note that we use [vouch](https://github.com/mitchellh/vouch) to manage contributor trust, so you'll need to be vouched before opening a PR. Thank you for helping us make Trigger.dev even better! 🤩 +> **Important:** We only accept PRs that address a single issue. Please do not submit PRs containing multiple unrelated fixes or features. If you have multiple contributions, open a separate PR for each one. + +## Getting vouched (required before opening a PR) + +We use [vouch](https://github.com/mitchellh/vouch) to manage contributor trust. **PRs from unvouched users are automatically closed.** + +Before you open your first pull request, you need to be vouched by a maintainer. Here's how: + +1. Open a [Vouch Request](https://github.com/triggerdotdev/trigger.dev/issues/new?template=vouch-request.yml) issue. +2. Tell us what you'd like to work on and share any relevant background. +3. A maintainer will review your request and vouch for you by commenting on the issue. +4. Once vouched, your PRs will be accepted normally. + +If you're unsure whether you're already vouched, go ahead and open a PR — the check will tell you. + ## Developing The development branch is `main`. This is the branch that all pull @@ -15,7 +30,7 @@ branch are tagged into a release periodically. ### Prerequisites - [Node.js](https://nodejs.org/en) version 20.20.0 -- [pnpm package manager](https://pnpm.io/installation) version 10.23.0 +- [pnpm package manager](https://pnpm.io/installation) version 10.33.2 - [Docker](https://www.docker.com/get-started/) - [protobuf](https://github.com/protocolbuffers/protobuf) @@ -36,7 +51,7 @@ branch are tagged into a release periodically. ``` 3. Ensure you are on the correct version of Node.js (20.20.0). If you are using `nvm`, there is an `.nvmrc` file that will automatically select the correct version of Node.js when you navigate to the repository. -4. Run `corepack enable` to use the correct version of pnpm (`10.23.0`) as specified in the root `package.json` file. +4. Run `corepack enable` to use the correct version of pnpm (`10.33.2`) as specified in the root `package.json` file. 5. Install the required packages using pnpm. ``` @@ -56,21 +71,27 @@ branch are tagged into a release periodically. Feel free to update `SESSION_SECRET` and `MAGIC_LINK_SECRET` as well using the same method. -8. Start Docker. This starts the required services like Postgres & Redis. If this is your first time using Docker, consider going through this [guide](DOCKER_INSTALLATION.md) +8. Start Docker. This starts the core dev services (Postgres, Redis, Electric, MinIO, ClickHouse, s2-lite) and runs the ClickHouse migrator once on first start. If this is your first time using Docker, consider going through this [guide](DOCKER_INSTALLATION.md). ``` pnpm run docker ``` + For the observability stack (Prometheus, Grafana, OTEL collector) and other optional tooling (Toxiproxy, nginx-h2, ch-ui, extra electric shard), use `pnpm run docker:full` instead. See `docker/docker-compose.extras.yml` for the full list. + 9. Migrate the database ``` pnpm run db:migrate ``` -10. Build everything +10. Build the webapp, CLI, and SDK + ``` + pnpm run build --filter webapp --filter trigger.dev --filter @trigger.dev/sdk + ``` +11. Seed the database. This creates a local user, a `References` org, and the reference projects (including `hello-world`) with stable IDs. ``` - pnpm run build --filter webapp && pnpm run build --filter trigger.dev && pnpm run build --filter @trigger.dev/sdk + pnpm run db:seed ``` -11. Run the app. See the section below. +12. Run the app. See the section below. ## Running @@ -90,22 +111,17 @@ We use the `/references/hello-world` subdirectory as a staging ground for ### First-time setup -First, make sure you are running the webapp according to the instructions above. Then: - -1. Visit http://localhost:3030 in your browser and create a new project called "hello-world". - -2. In Postgres go to the "Projects" table and for the project you create change the `externalRef` to `proj_rrkpdguyagvsoktglnod`. +First, make sure you are running the webapp according to the instructions above. The seed step from setup already created a `hello-world` project under the `References` org with the stable ref `proj_rrkpdguyagvsoktglnod` — log in at http://localhost:3030 with any email to access it. Then: -3. Build the CLI +1. Build the CLI (skip if you already ran the build step in setup) ```sh -# Build the CLI pnpm run build --filter trigger.dev # Make it accessible to `pnpm exec` pnpm i ``` -4. Change into the `/references/hello-world` directory and authorize the CLI to the local server: +2. Change into the `/references/hello-world` directory and authorize the CLI to the local server: ```sh cd references/hello-world @@ -153,24 +169,24 @@ If you want additional debug logging, you can use the `--log-level debug` flag: pnpm exec trigger dev --log-level debug ``` -6. If you make any changes in the CLI/Core/SDK, you'll need to `CTRL+C` to exit the `dev` command and restart it to pickup changes. Any changes to the files inside of the `hello-world/src/trigger` dir will automatically be rebuilt by the `dev` command. +5. If you make any changes in the CLI/Core/SDK, you'll need to `CTRL+C` to exit the `dev` command and restart it to pickup changes. Any changes to the files inside of the `hello-world/src/trigger` dir will automatically be rebuilt by the `dev` command. -7. Navigate to the `hello-world` project in your local dashboard at localhost:3030 and you should see the list of tasks. +6. Navigate to the `hello-world` project in your local dashboard at localhost:3030 and you should see the list of tasks. -8. Go to the "Test" page in the sidebar and select a task. Then enter a payload and click "Run test". You can tell what the payloads should be by looking at the relevant task file inside the `/references/hello-world/src/trigger` folder. Many of them accept an empty payload. +7. Go to the "Test" page in the sidebar and select a task. Then enter a payload and click "Run test". You can tell what the payloads should be by looking at the relevant task file inside the `/references/hello-world/src/trigger` folder. Many of them accept an empty payload. -9. Feel free to add additional files in `hello-world/src/trigger` to test out specific aspects of the system, or add in edge cases. +8. Feel free to add additional files in `hello-world/src/trigger` to test out specific aspects of the system, or add in edge cases. ## Adding and running migrations -1. Modify internal-packages/database/prisma/schema.prisma file -2. Change directory to the packages/database folder +1. Modify `internal-packages/database/prisma/schema.prisma`. +2. Change directory to the database package: ```sh - cd packages/database + cd internal-packages/database ``` -3. Create a migration +3. Create a migration: ``` pnpm run db:migrate:dev:create @@ -178,58 +194,39 @@ pnpm exec trigger dev --log-level debug This creates a migration file. Check the migration file does only what you want. If you're adding any database indexes they must use `CONCURRENTLY`, otherwise they'll lock the table when executed. -4. Run the migration. +4. Run the migration: -``` -pnpm run db:migrate:deploy -pnpm run generate -``` - -This executes the migrations against your database and applies changes to the database schema(s), and then regenerates the Prisma client. - -4. Commit generated migrations as well as changes to the schema.prisma file -5. If you're using VSCode you may need to restart the Typescript server in the webapp to get updated type inference. Open a TypeScript file, then open the Command Palette (View > Command Palette) and run `TypeScript: Restart TS server`. - -## Add sample jobs - -The [references/job-catalog](./references/job-catalog/) project defines simple jobs you can get started with. - -1. `cd` into `references/job-catalog` -2. Create a `.env` file with the following content, - replacing `` with an actual key: + ``` + pnpm run db:migrate:deploy + pnpm run generate + ``` -```env -TRIGGER_API_KEY=[TRIGGER_DEV_API_KEY] -TRIGGER_API_URL=http://localhost:3030 -``` + This executes the migrations against your database and applies changes to the database schema(s), and then regenerates the Prisma client. -`TRIGGER_API_URL` is used to configure the URL for your Trigger.dev instance, -where the jobs will be registered. +5. Commit the generated migration files as well as the changes to `schema.prisma`. +6. If you're using VSCode you may need to restart the TypeScript server in the webapp to get updated type inference. Open a TypeScript file, then open the Command Palette (View > Command Palette) and run `TypeScript: Restart TS server`. -3. Run one of the the `job-catalog` files: +## Making a pull request -```sh -pnpm run events -``` +**If you get errors, be sure to fix them before committing.** -This will open up a local server using `express` on port 8080. Then in a new terminal window you can run the trigger-cli dev command: +> **Note:** We may close PRs if we decide that the cost of integrating the change outweighs the benefits. To improve the chances of your PR getting accepted, follow the guidelines below. -```sh -pnpm run dev:trigger -``` +### PR workflow -See the [Job Catalog](./references/job-catalog/README.md) file for more. +1. **Always open your PR in draft status first.** Do not mark it as "Ready for Review" until the steps below are complete. +2. **Address all CodeRabbit code review comments.** Our CI runs an automated code review via CodeRabbit. Go through each comment and either fix the issue or resolve it with a comment explaining why no change is needed. +3. **Wait for all CI checks to pass.** Do not mark the PR as "Ready for Review" until every check is green. +4. **Then mark the PR as "Ready for Review"** so a maintainer can take a look. -4. Navigate to your trigger.dev instance ([http://localhost:3030](http://localhost:3030/)), to see the jobs. - You can use the test feature to trigger them. +### Cost/benefit analysis for risky changes -## Making a pull request +If your change touches core infrastructure, modifies widely-used code paths, or could introduce regressions, consider doing a brief cost/benefit analysis and including it in the PR description. Explain what the benefit is to users and why the risk is worth it. This goes a long way toward helping maintainers evaluate your contribution. -**If you get errors, be sure to fix them before committing.** +### General guidelines -- Be sure to [check the "Allow edits from maintainers" option](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork) while creating you PR. -- If your PR refers to or fixes an issue, be sure to add `refs #XXX` or `fixes #XXX` to the PR description. Replacing `XXX` with the respective issue number. See more about [Linking a pull request to an issue - ](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). +- Be sure to [check the "Allow edits from maintainers" option](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/allowing-changes-to-a-pull-request-branch-created-from-a-fork) while creating your PR. +- If your PR refers to or fixes an issue, be sure to add `refs #XXX` or `fixes #XXX` to the PR description. Replacing `XXX` with the respective issue number. See more about [Linking a pull request to an issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue). - Be sure to fill the PR Template accordingly. ## Adding changesets @@ -252,6 +249,39 @@ You will be prompted to select which packages to include in the changeset. Only Most of the time the changes you'll make are likely to be categorized as patch releases. If you feel like there is the need for a minor or major release of the package based on the changes being made, add the changeset as such and it will be discussed during PR review. +## Adding server changes + +Changesets only track published npm packages. If your PR only changes server components (`apps/webapp/`, `apps/supervisor/`, `apps/coordinator/`, etc.) with no package changes, add a `.server-changes/` file so the change appears in release notes. + +Create a markdown file with a descriptive name: + +```sh +cat > .server-changes/fix-batch-queue-stalls.md << 'EOF' +--- +area: webapp +type: fix +--- + +Speed up batch queue processing by removing stalls and fixing retry race +EOF +``` + +**Fields:** +- `area` (required): `webapp` | `supervisor` | `coordinator` | `kubernetes-provider` | `docker-provider` +- `type` (required): `feature` | `fix` | `improvement` | `breaking` + +The body text (below the frontmatter) is a one-line description of the change. Keep it concise — it will appear in release notes. + +**When to add which:** + +| PR changes | What to add | +|---|---| +| Only packages (`packages/`) | Changeset | +| Only server (`apps/`) | `.server-changes/` file | +| Both packages and server | Just the changeset | + +See `.server-changes/README.md` for more details. + ## Troubleshooting ### EADDRINUSE: address already in use :::3030 @@ -272,3 +302,7 @@ The process running on port `3030` should be destroyed. ```sh sudo kill -9 ``` + +### Running two clones side by side (worktree, branch experiment) + +The default `pnpm run docker` uses the project name `triggerdotdev-docker` and the standard host ports (5432, 6379, 3060, 4566, 8123, 9000, 9005, 9006). To stand up a second instance in another clone without clashing, set a different `COMPOSE_PROJECT_NAME` and the offset host ports in that clone's `.env`. The "Running multiple instances side by side" block in `.env.example` lists every overridable env var with its default for reference; uncomment the lines you need and update `DATABASE_URL` / `CLICKHOUSE_URL` / `REDIS_PORT` / `APP_ORIGIN` / `LOGIN_ORIGIN` / `ELECTRIC_ORIGIN` / `REALTIME_STREAMS_S2_ENDPOINT` to match. diff --git a/RELEASE.md b/RELEASE.md index 1b9273cb142..8ba3ecb5007 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,5 +1,28 @@ ## Guide on releasing a new version +### Automated release (v4+) + +Releases are fully automated via CI: + +1. PRs merge to `main` with changesets (for package changes) and/or `.server-changes/` files (for server-only changes). +2. The [changesets-pr.yml](./.github/workflows/changesets-pr.yml) workflow automatically creates/updates the `changeset-release/main` PR with version bumps and an enhanced summary of all changes. Consumed `.server-changes/` files are removed on the release branch (same approach changesets uses for `.changeset/` files — they're deleted on the branch, so merging the PR cleans them up). +3. When ready to release, merge the changeset release PR into `main`. +4. The [release.yml](./.github/workflows/release.yml) workflow automatically: + - Publishes all packages to npm + - Creates a single unified GitHub release (e.g., "trigger.dev v4.3.4") + - Tags and triggers Docker image builds + - After Docker images are pushed, updates the GitHub release with the exact GHCR tag link + +### What engineers need to do + +- **Package changes**: Add a changeset with `pnpm run changeset:add` +- **Server-only changes**: Add a `.server-changes/` file (see `.server-changes/README.md`) +- **Mixed PRs**: Just the changeset is enough + +See `CHANGESETS.md` for full details on changesets and server changes. + +### Legacy release (v3) + 1. Merge in the changeset PR into main, making sure to cancel both the release and publish github actions from that merge. 2. Pull the changes locally into main 3. Run `pnpm i` which will update the pnpm lock file with the new versions diff --git a/ai/references/repo.md b/ai/references/repo.md index 4f67bde2b4b..6e0ff056716 100644 --- a/ai/references/repo.md +++ b/ai/references/repo.md @@ -1,6 +1,6 @@ ## Repo Overview -This is a pnpm 10.23.0 monorepo that uses turborepo @turbo.json. The following workspaces are relevant +This is a pnpm 10.33.2 monorepo that uses turborepo @turbo.json. The following workspaces are relevant ## Apps diff --git a/ailogger-output.log b/ailogger-output.log new file mode 100644 index 00000000000..e69de29bb2d diff --git a/apps/coordinator/Containerfile b/apps/coordinator/Containerfile index 4e7b89e0af1..9e973675ab9 100644 --- a/apps/coordinator/Containerfile +++ b/apps/coordinator/Containerfile @@ -35,7 +35,7 @@ COPY --from=pruner --chown=node:node /app/out/full/ . COPY --from=dev-deps --chown=node:node /app/ . COPY --chown=node:node turbo.json turbo.json -RUN pnpm run -r --filter coordinator build:bundle +RUN pnpm run -r --filter @trigger.dev/core bundle-vendor && pnpm run -r --filter coordinator build:bundle FROM alpine AS cri-tools diff --git a/apps/docker-provider/Containerfile b/apps/docker-provider/Containerfile index bea730bda80..42a7ac23092 100644 --- a/apps/docker-provider/Containerfile +++ b/apps/docker-provider/Containerfile @@ -31,7 +31,7 @@ COPY --from=pruner --chown=node:node /app/out/full/ . COPY --from=dev-deps --chown=node:node /app/ . COPY --chown=node:node turbo.json turbo.json -RUN pnpm run -r --filter docker-provider build:bundle +RUN pnpm run -r --filter @trigger.dev/core bundle-vendor && pnpm run -r --filter docker-provider build:bundle FROM base AS runner diff --git a/apps/kubernetes-provider/Containerfile b/apps/kubernetes-provider/Containerfile index fb96304c26b..b46b9943275 100644 --- a/apps/kubernetes-provider/Containerfile +++ b/apps/kubernetes-provider/Containerfile @@ -31,7 +31,7 @@ COPY --from=pruner --chown=node:node /app/out/full/ . COPY --from=dev-deps --chown=node:node /app/ . COPY --chown=node:node turbo.json turbo.json -RUN pnpm run -r --filter kubernetes-provider build:bundle +RUN pnpm run -r --filter @trigger.dev/core bundle-vendor && pnpm run -r --filter kubernetes-provider build:bundle FROM base AS runner diff --git a/apps/supervisor/CLAUDE.md b/apps/supervisor/CLAUDE.md new file mode 100644 index 00000000000..ded836c6069 --- /dev/null +++ b/apps/supervisor/CLAUDE.md @@ -0,0 +1,20 @@ +# Supervisor + +Node.js app that manages task execution containers. Receives work from the platform, starts Docker/Kubernetes containers, monitors execution, and reports results. + +## Key Directories + +- `src/services/` - Core service logic +- `src/workloadManager/` - Container orchestration abstraction (Docker or Kubernetes) +- `src/workloadServer/` - HTTP server for workload communication (heartbeats, snapshots) +- `src/clients/` - Platform communication (webapp/coordinator) +- `src/env.ts` - Environment configuration + +## Architecture + +- **WorkloadManager**: Abstracts Docker vs Kubernetes execution +- **SupervisorSession**: Manages the dequeue loop with EWMA-based dynamic scaling +- **ResourceMonitor**: Tracks CPU/memory during execution +- **PodCleaner/FailedPodHandler**: Kubernetes-specific cleanup + +Communicates with the platform via Socket.io and HTTP. Receives task assignments through the dequeue protocol from the webapp. diff --git a/apps/supervisor/Containerfile b/apps/supervisor/Containerfile index d5bb5862e96..5b3b148a7cb 100644 --- a/apps/supervisor/Containerfile +++ b/apps/supervisor/Containerfile @@ -16,7 +16,7 @@ COPY --from=pruner --chown=node:node /app/out/json/ . COPY --from=pruner --chown=node:node /app/out/pnpm-lock.yaml ./pnpm-lock.yaml COPY --from=pruner --chown=node:node /app/out/pnpm-workspace.yaml ./pnpm-workspace.yaml -RUN corepack enable && corepack prepare pnpm@10.23.0 --activate +RUN corepack enable && corepack prepare pnpm@10.33.2 --activate FROM base AS deps-fetcher RUN apk add --no-cache python3-dev py3-setuptools make g++ gcc linux-headers diff --git a/apps/supervisor/package.json b/apps/supervisor/package.json index e9609bf1541..7456d421850 100644 --- a/apps/supervisor/package.json +++ b/apps/supervisor/package.json @@ -14,9 +14,11 @@ }, "dependencies": { "@aws-sdk/client-ecr": "^3.839.0", + "@internal/compute": "workspace:*", "@kubernetes/client-node": "^1.0.0", "@trigger.dev/core": "workspace:*", "dockerode": "^4.0.6", + "p-limit": "^6.2.0", "prom-client": "^15.1.0", "socket.io": "4.7.4", "std-env": "^3.8.0", diff --git a/apps/supervisor/src/env.ts b/apps/supervisor/src/env.ts index 9ef0cff2537..071de4cc814 100644 --- a/apps/supervisor/src/env.ts +++ b/apps/supervisor/src/env.ts @@ -3,137 +3,288 @@ import { env as stdEnv } from "std-env"; import { z } from "zod"; import { AdditionalEnvVars, BoolEnv } from "./envUtil.js"; -const Env = z.object({ - // This will come from `spec.nodeName` in k8s - TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()), - TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), - - // Required settings - TRIGGER_API_URL: z.string().url(), - TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file - MANAGED_WORKER_SECRET: z.string(), - OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners - - // Workload API settings (coordinator mode) - the workload API is what the run controller connects to - TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true), - TRIGGER_WORKLOAD_API_PROTOCOL: z - .string() - .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase())) - .default("http"), - TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default - TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"), - TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on - TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller - - // Runner settings - RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), - RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), - RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) - RUNNER_PRETTY_LOGS: BoolEnv.default(false), - - // Dequeue settings (provider mode) - TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), - TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), - TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), - TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), - TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1), - TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10), - TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"), - TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds - TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds - TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer) - TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current) - TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms) - TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) - - // Optional services - TRIGGER_WARM_START_URL: z.string().optional(), - TRIGGER_CHECKPOINT_URL: z.string().optional(), - TRIGGER_METADATA_URL: z.string().optional(), - - // Used by the resource monitor - RESOURCE_MONITOR_ENABLED: BoolEnv.default(false), - RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(), - RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(), - - // Docker settings - DOCKER_API_VERSION: z.string().optional(), - DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64 - DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true), - DOCKER_REGISTRY_USERNAME: z.string().optional(), - DOCKER_REGISTRY_PASSWORD: z.string().optional(), - DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1 - DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true), - DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true), - /** - * Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:`. - * Any other value is taken as a custom network's name to which all runners should connect to. - * - * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation. - * - * **WARNING**: Specifying multiple networks will slightly increase startup times. - * - * @default "host" - */ - DOCKER_RUNNER_NETWORKS: z.string().default("host"), - - // Kubernetes settings - KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), - KUBERNETES_NAMESPACE: z.string().default("default"), - KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"), - KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv - KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"), - KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"), - KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false), - KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0), - KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit - KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0), - KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit - - // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO - KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), - - // Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO - KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), - KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), - - KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB - KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods - KUBERNETES_LARGE_MACHINE_POOL_LABEL: z.string().optional(), // if set, large-* presets affinity for machinepool= - - // Placement tags settings - PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), - PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"), - - // Metrics - METRICS_ENABLED: BoolEnv.default(true), - METRICS_COLLECT_DEFAULTS: BoolEnv.default(true), - METRICS_HOST: z.string().default("127.0.0.1"), - METRICS_PORT: z.coerce.number().int().default(9090), - - // Pod cleaner - POD_CLEANER_ENABLED: BoolEnv.default(true), - POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000), - POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500), - - // Failed pod handler - FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true), - FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000), - - // Debug - DEBUG: BoolEnv.default(false), - SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), -}); +const Env = z + .object({ + // This will come from `spec.nodeName` in k8s + TRIGGER_WORKER_INSTANCE_NAME: z.string().default(randomUUID()), + TRIGGER_WORKER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().default(30), + + // Required settings + TRIGGER_API_URL: z.string().url(), + TRIGGER_WORKER_TOKEN: z.string(), // accepts file:// path to read from a file + MANAGED_WORKER_SECRET: z.string(), + OTEL_EXPORTER_OTLP_ENDPOINT: z.string().url(), // set on the runners + + // Workload API settings (coordinator mode) - the workload API is what the run controller connects to + TRIGGER_WORKLOAD_API_ENABLED: BoolEnv.default(true), + TRIGGER_WORKLOAD_API_PROTOCOL: z + .string() + .transform((s) => z.enum(["http", "https"]).parse(s.toLowerCase())) + .default("http"), + TRIGGER_WORKLOAD_API_DOMAIN: z.string().optional(), // If unset, will use orchestrator-specific default + TRIGGER_WORKLOAD_API_HOST_INTERNAL: z.string().default("0.0.0.0"), + TRIGGER_WORKLOAD_API_PORT_INTERNAL: z.coerce.number().default(8020), // This is the port the workload API listens on + TRIGGER_WORKLOAD_API_PORT_EXTERNAL: z.coerce.number().default(8020), // This is the exposed port passed to the run controller + + // Runner settings + RUNNER_HEARTBEAT_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_SNAPSHOT_POLL_INTERVAL_SECONDS: z.coerce.number().optional(), + RUNNER_ADDITIONAL_ENV_VARS: AdditionalEnvVars, // optional (csv) + RUNNER_PRETTY_LOGS: BoolEnv.default(false), + + // Dequeue settings (provider mode) + TRIGGER_DEQUEUE_ENABLED: BoolEnv.default(true), + TRIGGER_DEQUEUE_INTERVAL_MS: z.coerce.number().int().default(250), + TRIGGER_DEQUEUE_IDLE_INTERVAL_MS: z.coerce.number().int().default(1000), + TRIGGER_DEQUEUE_MAX_RUN_COUNT: z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MIN_CONSUMER_COUNT: z.coerce.number().int().default(1), + TRIGGER_DEQUEUE_MAX_CONSUMER_COUNT: z.coerce.number().int().default(10), + TRIGGER_DEQUEUE_SCALING_STRATEGY: z.enum(["none", "smooth", "aggressive"]).default("none"), + TRIGGER_DEQUEUE_SCALING_UP_COOLDOWN_MS: z.coerce.number().int().default(5000), // 5 seconds + TRIGGER_DEQUEUE_SCALING_DOWN_COOLDOWN_MS: z.coerce.number().int().default(30000), // 30 seconds + TRIGGER_DEQUEUE_SCALING_TARGET_RATIO: z.coerce.number().default(1.0), // Target ratio of queue items to consumers (1.0 = 1 item per consumer) + TRIGGER_DEQUEUE_SCALING_EWMA_ALPHA: z.coerce.number().min(0).max(1).default(0.3), // Smooths queue length measurements (0=historical, 1=current) + TRIGGER_DEQUEUE_SCALING_BATCH_WINDOW_MS: z.coerce.number().int().positive().default(1000), // Batch window for metrics processing (ms) + TRIGGER_DEQUEUE_SCALING_DAMPING_FACTOR: z.coerce.number().min(0).max(1).default(0.7), // Smooths consumer count changes after EWMA (0=no scaling, 1=immediate) + + // Optional services + TRIGGER_WARM_START_URL: z.string().optional(), + TRIGGER_CHECKPOINT_URL: z.string().optional(), + TRIGGER_METADATA_URL: z.string().optional(), + + // Used by the resource monitor + RESOURCE_MONITOR_ENABLED: BoolEnv.default(false), + RESOURCE_MONITOR_OVERRIDE_CPU_TOTAL: z.coerce.number().optional(), + RESOURCE_MONITOR_OVERRIDE_MEMORY_TOTAL_GB: z.coerce.number().optional(), + + // Docker settings + DOCKER_API_VERSION: z.string().optional(), + DOCKER_PLATFORM: z.string().optional(), // e.g. linux/amd64, linux/arm64 + DOCKER_STRIP_IMAGE_DIGEST: BoolEnv.default(true), + DOCKER_REGISTRY_USERNAME: z.string().optional(), + DOCKER_REGISTRY_PASSWORD: z.string().optional(), + DOCKER_REGISTRY_URL: z.string().optional(), // e.g. https://index.docker.io/v1 + DOCKER_ENFORCE_MACHINE_PRESETS: BoolEnv.default(true), + DOCKER_AUTOREMOVE_EXITED_CONTAINERS: BoolEnv.default(true), + /** + * Network mode to use for all runners. Supported standard values are: `bridge`, `host`, `none`, and `container:`. + * Any other value is taken as a custom network's name to which all runners should connect to. + * + * Accepts a list of comma-separated values to attach to multiple networks. Additional networks are interpreted as network names and will be attached after container creation. + * + * **WARNING**: Specifying multiple networks will slightly increase startup times. + * + * @default "host" + */ + DOCKER_RUNNER_NETWORKS: z.string().default("host"), + + // Compute settings + COMPUTE_GATEWAY_URL: z.string().url().optional(), + COMPUTE_GATEWAY_AUTH_TOKEN: z.string().optional(), + COMPUTE_GATEWAY_TIMEOUT_MS: z.coerce.number().int().default(30_000), + COMPUTE_SNAPSHOTS_ENABLED: BoolEnv.default(false), + COMPUTE_TRACE_SPANS_ENABLED: BoolEnv.default(true), + COMPUTE_TRACE_OTLP_ENDPOINT: z.string().url().optional(), // Override for span export (derived from TRIGGER_API_URL if unset) + COMPUTE_SNAPSHOT_DELAY_MS: z.coerce.number().int().min(0).max(60_000).default(5_000), + COMPUTE_SNAPSHOT_DISPATCH_LIMIT: z.coerce.number().int().min(1).max(100).default(10), + + // Kubernetes settings + KUBERNETES_FORCE_ENABLED: BoolEnv.default(false), + KUBERNETES_NAMESPACE: z.string().default("default"), + KUBERNETES_WORKER_NODETYPE_LABEL: z.string().default("v4-worker"), + KUBERNETES_IMAGE_PULL_SECRETS: z.string().optional(), // csv + KUBERNETES_EPHEMERAL_STORAGE_SIZE_LIMIT: z.string().default("10Gi"), + KUBERNETES_EPHEMERAL_STORAGE_SIZE_REQUEST: z.string().default("2Gi"), + KUBERNETES_STRIP_IMAGE_DIGEST: BoolEnv.default(false), + KUBERNETES_CPU_REQUEST_MIN_CORES: z.coerce.number().min(0).default(0), + KUBERNETES_CPU_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(0.75), // Ratio of CPU limit, so 0.75 = 75% of CPU limit + KUBERNETES_MEMORY_REQUEST_MIN_GB: z.coerce.number().min(0).default(0), + KUBERNETES_MEMORY_REQUEST_RATIO: z.coerce.number().min(0).max(1).default(1), // Ratio of memory limit, so 1 = 100% of memory limit + + // Per-preset overrides of the global KUBERNETES_CPU_REQUEST_RATIO + KUBERNETES_CPU_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_CPU_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + // Per-preset overrides of the global KUBERNETES_MEMORY_REQUEST_RATIO + KUBERNETES_MEMORY_REQUEST_RATIO_MICRO: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_SMALL_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_MEDIUM_2X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_1X: z.coerce.number().min(0).max(1).optional(), + KUBERNETES_MEMORY_REQUEST_RATIO_LARGE_2X: z.coerce.number().min(0).max(1).optional(), + + KUBERNETES_MEMORY_OVERHEAD_GB: z.coerce.number().min(0).optional(), // Optional memory overhead to add to the limit in GB + KUBERNETES_SCHEDULER_NAME: z.string().optional(), // Custom scheduler name for pods + + // Pod DNS config — override the cluster default ndots to `KUBERNETES_POD_DNS_NDOTS`. + // Default k8s ndots is 5: any name with fewer than 5 dots (e.g. `api.example.com`, 2 dots) is first walked + // through every entry in the cluster search list (`.svc.cluster.local`, `svc.cluster.local`, `cluster.local`) + // before being tried as-is, turning one resolution into 4+ CoreDNS queries (×2 with A+AAAA). + // Overriding the default can be useful to cut CoreDNS query amplification for external domains. + // Note: before enabling, make sure no code path relies on search-list expansion for names with dots ≥ the value + // set here — those names will now hit their as-is form first and could resolve externally before falling back. + KUBERNETES_POD_DNS_NDOTS_OVERRIDE_ENABLED: BoolEnv.default(false), + KUBERNETES_POD_DNS_NDOTS: z.coerce.number().int().min(1).max(15).default(2), + // Large machine affinity settings - large-* presets prefer a dedicated pool + KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("large-machines"), + KUBERNETES_LARGE_MACHINE_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(100), + + // Project affinity settings - pods from the same project prefer the same node + KUBERNETES_PROJECT_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_PROJECT_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(50), + KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY: z + .string() + .trim() + .min(1) + .default("kubernetes.io/hostname"), + + // Schedule affinity settings - runs from schedule trees prefer a dedicated pool + KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED: BoolEnv.default(false), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY: z + .string() + .trim() + .min(1) + .default("node.cluster.x-k8s.io/machinepool"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE: z + .string() + .trim() + .min(1) + .default("scheduled-runs"), + KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT: z.coerce.number().int().min(1).max(100).default(80), + KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT: z.coerce + .number() + .int() + .min(1) + .max(100) + .default(20), + + // Schedule toleration settings - scheduled runs tolerate taints on the dedicated pool + // Comma-separated list of tolerations in the format: key=value:effect + // For Exists operator (no value): key:effect + KUBERNETES_SCHEDULED_RUN_TOLERATIONS: z + .string() + .transform((val, ctx) => { + const tolerations = val + .split(",") + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0) + .map((entry) => { + const colonIdx = entry.lastIndexOf(":"); + if (colonIdx === -1) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (missing effect): "${entry}"`, + }); + return z.NEVER; + } + + const effect = entry.slice(colonIdx + 1); + const validEffects = ["NoSchedule", "NoExecute", "PreferNoSchedule"]; + if (!validEffects.includes(effect)) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration effect "${effect}" in "${entry}". Must be one of: ${validEffects.join( + ", " + )}`, + }); + return z.NEVER; + } + + const keyValue = entry.slice(0, colonIdx); + const eqIdx = keyValue.indexOf("="); + const key = eqIdx === -1 ? keyValue : keyValue.slice(0, eqIdx); + + if (!key) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: `Invalid toleration format (empty key): "${entry}"`, + }); + return z.NEVER; + } + + if (eqIdx === -1) { + return { key, operator: "Exists" as const, effect }; + } + + return { + key, + operator: "Equal" as const, + value: keyValue.slice(eqIdx + 1), + effect, + }; + }); + + return tolerations; + }) + .optional(), + + // Placement tags settings + PLACEMENT_TAGS_ENABLED: BoolEnv.default(false), + PLACEMENT_TAGS_PREFIX: z.string().default("node.cluster.x-k8s.io"), + + // Metrics + METRICS_ENABLED: BoolEnv.default(true), + METRICS_COLLECT_DEFAULTS: BoolEnv.default(true), + METRICS_HOST: z.string().default("127.0.0.1"), + METRICS_PORT: z.coerce.number().int().default(9090), + + // Pod cleaner + POD_CLEANER_ENABLED: BoolEnv.default(true), + POD_CLEANER_INTERVAL_MS: z.coerce.number().int().default(10000), + POD_CLEANER_BATCH_SIZE: z.coerce.number().int().default(500), + + // Failed pod handler + FAILED_POD_HANDLER_ENABLED: BoolEnv.default(true), + FAILED_POD_HANDLER_RECONNECT_INTERVAL_MS: z.coerce.number().int().default(1000), + + // Debug + DEBUG: BoolEnv.default(false), + SEND_RUN_DEBUG_LOGS: BoolEnv.default(false), + + // Wide-event observability - off by default. Emits one flat-keyed JSON + // line per natural unit of work (dequeue iteration, HTTP request, socket + // lifecycle). High-QPS hotpath, so the kill switch must be honoured. + TRIGGER_WIDE_EVENTS_ENABLED: BoolEnv.default(false), + // When true, also emit wide events for high-frequency HTTP routes + // (heartbeat, snapshots-since, logs/debug). Off in prod to keep event + // volume manageable; on in test environments for full-fidelity debugging. + TRIGGER_WIDE_EVENTS_NOISY_ROUTES: BoolEnv.default(false), + }) + .superRefine((data, ctx) => { + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_METADATA_URL) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_METADATA_URL is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_METADATA_URL"], + }); + } + if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_WORKLOAD_API_DOMAIN) { + ctx.addIssue({ + code: z.ZodIssueCode.custom, + message: "TRIGGER_WORKLOAD_API_DOMAIN is required when COMPUTE_SNAPSHOTS_ENABLED is true", + path: ["TRIGGER_WORKLOAD_API_DOMAIN"], + }); + } + }) + .transform((data) => ({ + ...data, + COMPUTE_TRACE_OTLP_ENDPOINT: data.COMPUTE_TRACE_OTLP_ENDPOINT ?? `${data.TRIGGER_API_URL}/otel`, + })); export const env = Env.parse(stdEnv); diff --git a/apps/supervisor/src/index.ts b/apps/supervisor/src/index.ts index 0e274b30390..e2c16c4b1df 100644 --- a/apps/supervisor/src/index.ts +++ b/apps/supervisor/src/index.ts @@ -14,6 +14,7 @@ import { } from "./resourceMonitor.js"; import { KubernetesWorkloadManager } from "./workloadManager/kubernetes.js"; import { DockerWorkloadManager } from "./workloadManager/docker.js"; +import { ComputeWorkloadManager } from "./workloadManager/compute.js"; import { HttpServer, CheckpointClient, @@ -25,6 +26,16 @@ import { register } from "./metrics.js"; import { PodCleaner } from "./services/podCleaner.js"; import { FailedPodHandler } from "./services/failedPodHandler.js"; import { getWorkerToken } from "./workerToken.js"; +import { OtlpTraceService } from "./services/otlpTraceService.js"; +import { extractTraceparent, getRestoreRunnerId } from "./util.js"; +import { + fromContext, + recordPhaseSince, + runWideEvent, + setExtra, + setMeta, + type WideEventOptions, +} from "./wideEvents/index.js"; if (env.METRICS_COLLECT_DEFAULTS) { collectDefaultMetrics({ register }); @@ -35,18 +46,32 @@ class ManagedSupervisor { private readonly metricsServer?: HttpServer; private readonly workloadServer: WorkloadServer; private readonly workloadManager: WorkloadManager; + private readonly computeManager?: ComputeWorkloadManager; private readonly logger = new SimpleStructuredLogger("managed-supervisor"); private readonly resourceMonitor: ResourceMonitor; private readonly checkpointClient?: CheckpointClient; private readonly podCleaner?: PodCleaner; private readonly failedPodHandler?: FailedPodHandler; + private readonly tracing?: OtlpTraceService; private readonly isKubernetes = isKubernetesEnvironment(env.KUBERNETES_FORCE_ENABLED); private readonly warmStartUrl = env.TRIGGER_WARM_START_URL; + private readonly wideEventOpts: WideEventOptions = { + service: "supervisor", + env: { nodeId: env.TRIGGER_WORKER_INSTANCE_NAME }, + enabled: env.TRIGGER_WIDE_EVENTS_ENABLED, + }; + private readonly wideEventsNoisyRoutes = env.TRIGGER_WIDE_EVENTS_NOISY_ROUTES; + constructor() { - const { TRIGGER_WORKER_TOKEN, MANAGED_WORKER_SECRET, ...envWithoutSecrets } = env; + const { + TRIGGER_WORKER_TOKEN, + MANAGED_WORKER_SECRET, + COMPUTE_GATEWAY_AUTH_TOKEN, + ...envWithoutSecrets + } = env; if (env.DEBUG) { this.logger.debug("Starting up", { envWithoutSecrets }); @@ -77,9 +102,46 @@ class ManagedSupervisor { : new DockerResourceMonitor(new Docker()) : new NoopResourceMonitor(); - this.workloadManager = this.isKubernetes - ? new KubernetesWorkloadManager(workloadManagerOptions) - : new DockerWorkloadManager(workloadManagerOptions); + if (env.COMPUTE_GATEWAY_URL) { + if (!env.TRIGGER_WORKLOAD_API_DOMAIN) { + throw new Error("TRIGGER_WORKLOAD_API_DOMAIN is not set, cannot create compute manager"); + } + + const callbackUrl = `${env.TRIGGER_WORKLOAD_API_PROTOCOL}://${env.TRIGGER_WORKLOAD_API_DOMAIN}:${env.TRIGGER_WORKLOAD_API_PORT_EXTERNAL}/api/v1/compute/snapshot-complete`; + + if (env.COMPUTE_TRACE_SPANS_ENABLED) { + this.tracing = new OtlpTraceService({ + endpointUrl: env.COMPUTE_TRACE_OTLP_ENDPOINT, + }); + } + + const computeManager = new ComputeWorkloadManager({ + ...workloadManagerOptions, + gateway: { + url: env.COMPUTE_GATEWAY_URL, + authToken: env.COMPUTE_GATEWAY_AUTH_TOKEN, + timeoutMs: env.COMPUTE_GATEWAY_TIMEOUT_MS, + }, + snapshots: { + enabled: env.COMPUTE_SNAPSHOTS_ENABLED, + delayMs: env.COMPUTE_SNAPSHOT_DELAY_MS, + dispatchLimit: env.COMPUTE_SNAPSHOT_DISPATCH_LIMIT, + callbackUrl, + }, + tracing: this.tracing, + runner: { + instanceName: env.TRIGGER_WORKER_INSTANCE_NAME, + otelEndpoint: env.OTEL_EXPORTER_OTLP_ENDPOINT, + prettyLogs: env.RUNNER_PRETTY_LOGS, + }, + }); + this.computeManager = computeManager; + this.workloadManager = computeManager; + } else { + this.workloadManager = this.isKubernetes + ? new KubernetesWorkloadManager(workloadManagerOptions) + : new DockerWorkloadManager(workloadManagerOptions); + } if (this.isKubernetes) { if (env.POD_CLEANER_ENABLED) { @@ -182,102 +244,214 @@ class ManagedSupervisor { } this.workerSession.on("runNotification", async ({ time, run }) => { - this.logger.log("runNotification", { time, run }); + this.logger.verbose("runNotification", { time, run }); this.workloadServer.notifyRun({ run }); }); - this.workerSession.on("runQueueMessage", async ({ time, message }) => { - this.logger.log(`Received message with timestamp ${time.toLocaleString()}`, message); - - if (message.completedWaitpoints.length > 0) { - this.logger.debug("Run has completed waitpoints", { - runId: message.run.id, - completedWaitpoints: message.completedWaitpoints.length, - }); - } - - if (!message.image) { - this.logger.error("Run has no image", { runId: message.run.id }); - return; - } - - const { checkpoint, ...rest } = message; - - if (checkpoint) { - this.logger.log("Restoring run", { runId: message.run.id }); - - if (!this.checkpointClient) { - this.logger.error("No checkpoint client", { runId: message.run.id }); - return; - } - - try { - const didRestore = await this.checkpointClient.restoreRun({ - runFriendlyId: message.run.friendlyId, - snapshotFriendlyId: message.snapshot.friendlyId, - body: { - ...rest, - checkpoint, + this.workerSession.on( + "runQueueMessage", + async ({ time, message, dequeueResponseMs, pollingIntervalMs }) => { + this.logger.verbose(`Received message with timestamp ${time.toLocaleString()}`, message); + + const traceparent = extractTraceparent(message.run.traceContext); + + await runWideEvent( + { + ...this.wideEventOpts, + traceparent, + setup: (state) => { + setMeta(state, "run_id", message.run.friendlyId); + setMeta(state, "env_id", message.environment.id); + setMeta(state, "org_id", message.organization.id); + setMeta(state, "project_id", message.project.id); + if (message.deployment.friendlyId) { + setMeta(state, "deployment_id", message.deployment.friendlyId); + } + setMeta(state, "machine_preset", message.run.machine.name); + state.extras.iteration = "dequeue"; + state.extras.dequeue_response_ms = dequeueResponseMs; + state.extras.polling_interval_ms = pollingIntervalMs; + state.extras.completed_waitpoints = message.completedWaitpoints.length; }, - }); - - if (didRestore) { - this.logger.log("Restore successful", { runId: message.run.id }); - } else { - this.logger.error("Restore failed", { runId: message.run.id }); + }, + async () => { + if (message.completedWaitpoints.length > 0) { + this.logger.debug("Run has completed waitpoints", { + runId: message.run.id, + completedWaitpoints: message.completedWaitpoints.length, + }); + } + + if (!message.image) { + setExtra(fromContext(), "path_taken", "skipped_no_image"); + this.logger.error("Run has no image", { runId: message.run.id }); + return; + } + + const { checkpoint, ...rest } = message; + + // Register trace context early so snapshot spans work for all paths + // (cold create, restore, warm start). Re-registration on restore is safe + // since dequeue always provides fresh context. + if (this.computeManager?.traceSpansEnabled && traceparent) { + this.workloadServer.registerRunTraceContext(message.run.friendlyId, { + traceparent, + envId: message.environment.id, + orgId: message.organization.id, + projectId: message.project.id, + }); + } + + if (checkpoint) { + setExtra(fromContext(), "path_taken", "restore"); + this.logger.debug("Restoring run", { runId: message.run.id }); + + if (this.computeManager) { + const restoreStart = performance.now(); + try { + const runnerId = getRestoreRunnerId(message.run.friendlyId, checkpoint.id); + + const didRestore = await this.computeManager.restore({ + snapshotId: checkpoint.location, + runnerId, + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + machine: message.run.machine, + traceContext: message.run.traceContext, + envId: message.environment.id, + orgId: message.organization.id, + projectId: message.project.id, + dequeuedAt: message.dequeuedAt, + }); + recordPhaseSince("restore", restoreStart, undefined); + setExtra(fromContext(), "did_restore", didRestore); + + if (didRestore) { + this.logger.debug("Compute restore successful", { + runId: message.run.id, + runnerId, + }); + } else { + this.logger.error("Compute restore failed", { + runId: message.run.id, + runnerId, + }); + } + } catch (error) { + recordPhaseSince( + "restore", + restoreStart, + error instanceof Error ? error : new Error(String(error)) + ); + this.logger.error("Failed to restore run (compute)", { error }); + } + + return; + } + + if (!this.checkpointClient) { + this.logger.error("No checkpoint client", { runId: message.run.id }); + return; + } + + const restoreStart = performance.now(); + try { + const didRestore = await this.checkpointClient.restoreRun({ + runFriendlyId: message.run.friendlyId, + snapshotFriendlyId: message.snapshot.friendlyId, + body: { + ...rest, + checkpoint, + }, + }); + recordPhaseSince("restore", restoreStart, undefined); + setExtra(fromContext(), "did_restore", didRestore); + + if (didRestore) { + this.logger.debug("Restore successful", { runId: message.run.id }); + } else { + this.logger.error("Restore failed", { runId: message.run.id }); + } + } catch (error) { + recordPhaseSince( + "restore", + restoreStart, + error instanceof Error ? error : new Error(String(error)) + ); + this.logger.error("Failed to restore run", { error }); + } + + return; + } + + this.logger.debug("Scheduling run", { runId: message.run.id }); + + const warmStartStart = performance.now(); + const didWarmStart = await this.tryWarmStart(message, traceparent); + const warmStartCheckMs = Math.round(performance.now() - warmStartStart); + recordPhaseSince("warm_start", warmStartStart, undefined); + setExtra(fromContext(), "did_warm_start", didWarmStart); + + if (didWarmStart) { + setExtra(fromContext(), "path_taken", "warm_start"); + this.logger.debug("Warm start successful", { runId: message.run.id }); + return; + } + + setExtra(fromContext(), "path_taken", "cold_create"); + + const createStart = performance.now(); + try { + if (!message.deployment.friendlyId) { + // mostly a type guard, deployments always exists for deployed environments + // a proper fix would be to use a discriminated union schema to differentiate between dequeued runs in dev and in deployed environments. + throw new Error("Deployment is missing"); + } + + await this.workloadManager.create({ + dequeuedAt: message.dequeuedAt, + dequeueResponseMs, + pollingIntervalMs, + warmStartCheckMs, + envId: message.environment.id, + envType: message.environment.type, + image: message.image, + machine: message.run.machine, + orgId: message.organization.id, + projectId: message.project.id, + deploymentFriendlyId: message.deployment.friendlyId, + deploymentVersion: message.backgroundWorker.version, + runId: message.run.id, + runFriendlyId: message.run.friendlyId, + version: message.version, + nextAttemptNumber: message.run.attemptNumber, + snapshotId: message.snapshot.id, + snapshotFriendlyId: message.snapshot.friendlyId, + placementTags: message.placementTags, + traceContext: message.run.traceContext, + annotations: message.run.annotations, + hasPrivateLink: message.organization.hasPrivateLink, + }); + recordPhaseSince("workload_create", createStart, undefined); + + // Disabled for now + // this.resourceMonitor.blockResources({ + // cpu: message.run.machine.cpu, + // memory: message.run.machine.memory, + // }); + } catch (error) { + recordPhaseSince( + "workload_create", + createStart, + error instanceof Error ? error : new Error(String(error)) + ); + this.logger.error("Failed to create workload", { error }); + } } - } catch (error) { - this.logger.error("Failed to restore run", { error }); - } - - return; - } - - this.logger.log("Scheduling run", { runId: message.run.id }); - - const didWarmStart = await this.tryWarmStart(message); - - if (didWarmStart) { - this.logger.log("Warm start successful", { runId: message.run.id }); - return; + ); } - - try { - if (!message.deployment.friendlyId) { - // mostly a type guard, deployments always exists for deployed environments - // a proper fix would be to use a discriminated union schema to differentiate between dequeued runs in dev and in deployed environments. - throw new Error("Deployment is missing"); - } - - await this.workloadManager.create({ - dequeuedAt: message.dequeuedAt, - envId: message.environment.id, - envType: message.environment.type, - image: message.image, - machine: message.run.machine, - orgId: message.organization.id, - projectId: message.project.id, - deploymentFriendlyId: message.deployment.friendlyId, - deploymentVersion: message.backgroundWorker.version, - runId: message.run.id, - runFriendlyId: message.run.friendlyId, - version: message.version, - nextAttemptNumber: message.run.attemptNumber, - snapshotId: message.snapshot.id, - snapshotFriendlyId: message.snapshot.friendlyId, - placementTags: message.placementTags, - }); - - // Disabled for now - // this.resourceMonitor.blockResources({ - // cpu: message.run.machine.cpu, - // memory: message.run.machine.memory, - // }); - } catch (error) { - this.logger.error("Failed to create workload", { error }); - } - }); + ); if (env.METRICS_ENABLED) { this.metricsServer = new HttpServer({ @@ -296,6 +470,10 @@ class ManagedSupervisor { host: env.TRIGGER_WORKLOAD_API_HOST_INTERNAL, workerClient: this.workerSession.httpClient, checkpointClient: this.checkpointClient, + computeManager: this.computeManager, + tracing: this.tracing, + wideEventOpts: this.wideEventOpts, + wideEventsNoisyRoutes: this.wideEventsNoisyRoutes, }); this.workloadServer.on("runConnected", this.onRunConnected.bind(this)); @@ -312,19 +490,31 @@ class ManagedSupervisor { this.workerSession.unsubscribeFromRunNotifications([run.friendlyId]); } - private async tryWarmStart(dequeuedMessage: DequeuedMessage): Promise { + private async tryWarmStart( + dequeuedMessage: DequeuedMessage, + traceparent: string | undefined + ): Promise { if (!this.warmStartUrl) { return false; } const warmStartUrlWithPath = new URL("/warm-start", this.warmStartUrl); + const headers: Record = { + "Content-Type": "application/json", + }; + // Propagate the inbound W3C traceparent so the upstream warm-start + // receiver continues the same trace instead of minting a new one. Gated + // by the same kill switch as the wide-event emission so the whole PR is + // a no-op on the wire when disabled. + if (this.wideEventOpts.enabled && traceparent) { + headers.traceparent = traceparent; + } + try { const res = await fetch(warmStartUrlWithPath.href, { method: "POST", - headers: { - "Content-Type": "application/json", - }, + headers, body: JSON.stringify({ dequeuedMessage }), }); @@ -380,6 +570,7 @@ class ManagedSupervisor { async stop() { this.logger.log("Shutting down"); + await this.workloadServer.stop(); await this.workerSession.stop(); // Optional services diff --git a/apps/supervisor/src/services/computeSnapshotService.ts b/apps/supervisor/src/services/computeSnapshotService.ts new file mode 100644 index 00000000000..041e2902c75 --- /dev/null +++ b/apps/supervisor/src/services/computeSnapshotService.ts @@ -0,0 +1,242 @@ +import pLimit from "p-limit"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { parseTraceparent } from "@trigger.dev/core/v3/isomorphic"; +import type { SupervisorHttpClient } from "@trigger.dev/core/v3/workers"; +import { type SnapshotCallbackPayload } from "@internal/compute"; +import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; +import { TimerWheel } from "./timerWheel.js"; +import type { OtlpTraceService } from "./otlpTraceService.js"; + +type DelayedSnapshot = { + runnerId: string; + runFriendlyId: string; + snapshotFriendlyId: string; +}; + +export type RunTraceContext = { + traceparent: string; + envId: string; + orgId: string; + projectId: string; +}; + +export type ComputeSnapshotServiceOptions = { + computeManager: ComputeWorkloadManager; + workerClient: SupervisorHttpClient; + tracing?: OtlpTraceService; +}; + +export class ComputeSnapshotService { + private readonly logger = new SimpleStructuredLogger("compute-snapshot-service"); + + private static readonly MAX_TRACE_CONTEXTS = 10_000; + private readonly runTraceContexts = new Map(); + private readonly timerWheel: TimerWheel; + private readonly dispatchLimit: ReturnType; + + private readonly computeManager: ComputeWorkloadManager; + private readonly workerClient: SupervisorHttpClient; + private readonly tracing?: OtlpTraceService; + + constructor(opts: ComputeSnapshotServiceOptions) { + this.computeManager = opts.computeManager; + this.workerClient = opts.workerClient; + this.tracing = opts.tracing; + + this.dispatchLimit = pLimit(this.computeManager.snapshotDispatchLimit); + this.timerWheel = new TimerWheel({ + delayMs: this.computeManager.snapshotDelayMs, + onExpire: (item) => { + this.dispatchLimit(() => this.dispatch(item.data)).catch((error) => { + this.logger.error("Snapshot dispatch failed", { + runId: item.data.runFriendlyId, + runnerId: item.data.runnerId, + error, + }); + }); + }, + }); + this.timerWheel.start(); + } + + /** Schedule a delayed snapshot for a run. Replaces any pending snapshot for the same run. */ + schedule(runFriendlyId: string, data: DelayedSnapshot) { + this.timerWheel.submit(runFriendlyId, data); + this.logger.debug("Snapshot scheduled", { + runFriendlyId, + snapshotFriendlyId: data.snapshotFriendlyId, + delayMs: this.computeManager.snapshotDelayMs, + }); + } + + /** Cancel a pending delayed snapshot. Returns true if one was cancelled. */ + cancel(runFriendlyId: string): boolean { + const cancelled = this.timerWheel.cancel(runFriendlyId); + if (cancelled) { + this.logger.debug("Snapshot cancelled", { runFriendlyId }); + } + return cancelled; + } + + /** Handle the callback from the gateway after a snapshot completes or fails. */ + async handleCallback(body: SnapshotCallbackPayload) { + const snapshotId = body.status === "completed" ? body.snapshot_id : undefined; + + this.logger.debug("Snapshot callback", { + snapshotId, + instanceId: body.instance_id, + status: body.status, + error: body.status === "failed" ? body.error : undefined, + metadata: body.metadata, + durationMs: body.duration_ms, + }); + + const runId = body.metadata?.runId; + const snapshotFriendlyId = body.metadata?.snapshotFriendlyId; + + if (!runId || !snapshotFriendlyId) { + this.logger.error("Snapshot callback missing metadata", { body }); + return { ok: false as const, status: 400 }; + } + + this.#emitSnapshotSpan(runId, body.duration_ms, snapshotId); + + if (body.status === "completed") { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + snapshotId: snapshotFriendlyId, + body: { + success: true, + checkpoint: { + type: "COMPUTE", + location: body.snapshot_id, + }, + }, + }); + + if (result.success) { + this.logger.debug("Suspend completion submitted", { + runId, + instanceId: body.instance_id, + snapshotId: body.snapshot_id, + }); + } else { + this.logger.error("Failed to submit suspend completion", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } else { + const result = await this.workerClient.submitSuspendCompletion({ + runId, + snapshotId: snapshotFriendlyId, + body: { + success: false, + error: body.error ?? "Snapshot failed", + }, + }); + + if (!result.success) { + this.logger.error("Failed to submit suspend failure", { + runId, + snapshotFriendlyId, + error: result.error, + }); + } + } + + return { ok: true as const, status: 200 }; + } + + registerTraceContext(runFriendlyId: string, ctx: RunTraceContext) { + // Evict oldest entries if we've hit the cap. This is best-effort: on a busy + // supervisor, entries for long-lived runs may be evicted before their snapshot + // callback arrives, causing those snapshot spans to be silently dropped. + // That's acceptable - trace spans are observability sugar, not correctness. + if (this.runTraceContexts.size >= ComputeSnapshotService.MAX_TRACE_CONTEXTS) { + const firstKey = this.runTraceContexts.keys().next().value; + if (firstKey) { + this.runTraceContexts.delete(firstKey); + } + } + + this.runTraceContexts.set(runFriendlyId, ctx); + } + + /** Stop the timer wheel, dropping pending snapshots. */ + stop(): string[] { + // Intentionally drop pending snapshots rather than dispatching them. The supervisor + // is shutting down, so our callback URL will be dead by the time the gateway responds. + // Runners detect the supervisor is gone and reconnect to a new instance, which + // re-triggers the snapshot workflow. Snapshots are an optimization, not a correctness + // requirement - runs continue fine without them. + const remaining = this.timerWheel.stop(); + const droppedRuns = remaining.map((item) => item.key); + + if (droppedRuns.length > 0) { + this.logger.info("Stopped, dropped pending snapshots", { count: droppedRuns.length }); + this.logger.debug("Dropped snapshot details", { runs: droppedRuns }); + } + + return droppedRuns; + } + + /** Dispatch a snapshot request to the gateway. */ + private async dispatch(snapshot: DelayedSnapshot): Promise { + const result = await this.computeManager.snapshot({ + runnerId: snapshot.runnerId, + metadata: { + runId: snapshot.runFriendlyId, + snapshotFriendlyId: snapshot.snapshotFriendlyId, + }, + }); + + if (!result) { + this.logger.error("Failed to request snapshot", { + runId: snapshot.runFriendlyId, + runnerId: snapshot.runnerId, + }); + } + } + + #emitSnapshotSpan(runFriendlyId: string, durationMs?: number, snapshotId?: string) { + if (!this.tracing) return; + + const ctx = this.runTraceContexts.get(runFriendlyId); + if (!ctx) return; + + const parsed = parseTraceparent(ctx.traceparent); + if (!parsed) return; + + const endEpochMs = Date.now(); + const startEpochMs = durationMs ? endEpochMs - durationMs : endEpochMs; + + const spanAttributes: Record = { + "compute.type": "snapshot", + }; + + if (durationMs !== undefined) { + spanAttributes["compute.total_ms"] = durationMs; + } + + if (snapshotId) { + spanAttributes["compute.snapshot_id"] = snapshotId; + } + + this.tracing.emit({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.snapshot", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": ctx.envId, + "ctx.organization.id": ctx.orgId, + "ctx.project.id": ctx.projectId, + "ctx.run.id": runFriendlyId, + }, + spanAttributes, + }); + } +} diff --git a/apps/supervisor/src/services/failedPodHandler.test.ts b/apps/supervisor/src/services/failedPodHandler.test.ts index d05783288ed..4dbfda16f43 100644 --- a/apps/supervisor/src/services/failedPodHandler.test.ts +++ b/apps/supervisor/src/services/failedPodHandler.test.ts @@ -1,10 +1,11 @@ import { describe, it, expect, beforeAll, afterEach } from "vitest"; import { FailedPodHandler } from "./failedPodHandler.js"; -import { K8sApi, createK8sApi } from "../clients/kubernetes.js"; +import { type K8sApi, createK8sApi } from "../clients/kubernetes.js"; import { Registry } from "prom-client"; import { setTimeout } from "timers/promises"; -describe("FailedPodHandler Integration Tests", () => { +// These tests require live K8s cluster credentials - skip by default +describe.skipIf(!process.env.K8S_INTEGRATION_TESTS)("FailedPodHandler Integration Tests", () => { const k8s = createK8sApi(); const namespace = "integration-test"; const register = new Registry(); diff --git a/apps/supervisor/src/services/failedPodHandler.ts b/apps/supervisor/src/services/failedPodHandler.ts index 07217243769..3d56c92b213 100644 --- a/apps/supervisor/src/services/failedPodHandler.ts +++ b/apps/supervisor/src/services/failedPodHandler.ts @@ -151,7 +151,7 @@ export class FailedPodHandler { } private async onPodCompleted(pod: V1Pod) { - this.logger.info("pod-completed", this.podSummary(pod)); + this.logger.debug("pod-completed", this.podSummary(pod)); this.informerEventsTotal.inc({ namespace: this.namespace, verb: "add" }); if (!pod.metadata?.name) { @@ -165,7 +165,7 @@ export class FailedPodHandler { } if (pod.metadata?.deletionTimestamp) { - this.logger.info("pod-completed: pod is being deleted", this.podSummary(pod)); + this.logger.verbose("pod-completed: pod is being deleted", this.podSummary(pod)); return; } @@ -188,7 +188,7 @@ export class FailedPodHandler { } private async onPodSucceeded(pod: V1Pod) { - this.logger.info("pod-succeeded", this.podSummary(pod)); + this.logger.debug("pod-succeeded", this.podSummary(pod)); this.processedPodsTotal.inc({ namespace: this.namespace, status: this.podStatus(pod), @@ -196,7 +196,7 @@ export class FailedPodHandler { } private async onPodFailed(pod: V1Pod) { - this.logger.info("pod-failed", this.podSummary(pod)); + this.logger.debug("pod-failed", this.podSummary(pod)); try { await this.processFailedPod(pod); @@ -208,7 +208,7 @@ export class FailedPodHandler { } private async processFailedPod(pod: V1Pod) { - this.logger.info("pod-failed: processing pod", this.podSummary(pod)); + this.logger.verbose("pod-failed: processing pod", this.podSummary(pod)); const mainContainer = pod.status?.containerStatuses?.find((c) => c.name === "run-controller"); @@ -231,7 +231,7 @@ export class FailedPodHandler { } private async deletePod(pod: V1Pod) { - this.logger.info("pod-failed: deleting pod", this.podSummary(pod)); + this.logger.verbose("pod-failed: deleting pod", this.podSummary(pod)); try { await this.k8s.core.deleteNamespacedPod({ name: pod.metadata!.name!, diff --git a/apps/supervisor/src/services/otlpTraceService.test.ts b/apps/supervisor/src/services/otlpTraceService.test.ts new file mode 100644 index 00000000000..baf3bd90306 --- /dev/null +++ b/apps/supervisor/src/services/otlpTraceService.test.ts @@ -0,0 +1,179 @@ +import { describe, it, expect } from "vitest"; +import { buildPayload } from "./otlpTraceService.js"; + +describe("buildPayload", () => { + it("builds valid OTLP JSON with timing attributes", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + parentSpanId: "1234567890abcdef", + spanName: "compute.provision", + startTimeMs: 1000, + endTimeMs: 1250, + resourceAttributes: { + "ctx.environment.id": "env_123", + "ctx.organization.id": "org_456", + "ctx.project.id": "proj_789", + "ctx.run.id": "run_abc", + }, + spanAttributes: { + "compute.total_ms": 250, + "compute.gateway.schedule_ms": 1, + "compute.cache.image_cached": true, + }, + }); + + expect(payload.resourceSpans).toHaveLength(1); + + const resourceSpan = payload.resourceSpans[0]!; + + // $trigger=true so the webapp accepts it + const triggerAttr = resourceSpan.resource.attributes.find((a) => a.key === "$trigger"); + expect(triggerAttr).toEqual({ key: "$trigger", value: { boolValue: true } }); + + // Resource attributes + const envAttr = resourceSpan.resource.attributes.find( + (a) => a.key === "ctx.environment.id" + ); + expect(envAttr).toEqual({ + key: "ctx.environment.id", + value: { stringValue: "env_123" }, + }); + + // Span basics + const span = resourceSpan.scopeSpans[0]!.spans[0]!; + expect(span.name).toBe("compute.provision"); + expect(span.traceId).toBe("abcd1234abcd1234abcd1234abcd1234"); + expect(span.parentSpanId).toBe("1234567890abcdef"); + + // Integer attribute + const totalMs = span.attributes.find((a) => a.key === "compute.total_ms"); + expect(totalMs).toEqual({ key: "compute.total_ms", value: { intValue: 250 } }); + + // Boolean attribute + const cached = span.attributes.find((a) => a.key === "compute.cache.image_cached"); + expect(cached).toEqual({ key: "compute.cache.image_cached", value: { boolValue: true } }); + }); + + it("generates a valid 16-char hex span ID", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.spanId).toMatch(/^[0-9a-f]{16}$/); + }); + + it("converts timestamps to nanoseconds", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1250, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1000000000"); + expect(span.endTimeUnixNano).toBe("1250000000"); + }); + + it("converts real epoch timestamps without precision loss", () => { + // Date.now() values exceed Number.MAX_SAFE_INTEGER when multiplied by 1e6 + const startMs = 1711929600000; // 2024-04-01T00:00:00Z + const endMs = 1711929600250; + + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: startMs, + endTimeMs: endMs, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1711929600000000000"); + expect(span.endTimeUnixNano).toBe("1711929600250000000"); + }); + + it("preserves sub-millisecond precision from performance.now() arithmetic", () => { + // provisionStartEpochMs = Date.now() - (performance.now() - startMs) produces fractional ms. + // Use small epoch + fraction to avoid IEEE 754 noise in the fractional part. + const startMs = 1000.322; + const endMs = 1045.789; + + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: startMs, + endTimeMs: endMs, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.startTimeUnixNano).toBe("1000322000"); + expect(span.endTimeUnixNano).toBe("1045789000"); + }); + + it("sub-ms precision affects ordering for real epoch values", () => { + // Two spans within the same millisecond should have different nanosecond timestamps + const spanA = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "a", + startTimeMs: 1711929600000.3, + endTimeMs: 1711929600001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const spanB = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "b", + startTimeMs: 1711929600000.7, + endTimeMs: 1711929600001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const startA = BigInt(spanA.resourceSpans[0]!.scopeSpans[0]!.spans[0]!.startTimeUnixNano); + const startB = BigInt(spanB.resourceSpans[0]!.scopeSpans[0]!.spans[0]!.startTimeUnixNano); + // A should sort before B (both in the same ms but different sub-ms positions) + expect(startA).toBeLessThan(startB); + }); + + it("omits parentSpanId when not provided", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: {}, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + expect(span.parentSpanId).toBeUndefined(); + }); + + it("handles double values for non-integer numbers", () => { + const payload = buildPayload({ + traceId: "abcd1234abcd1234abcd1234abcd1234", + spanName: "test", + startTimeMs: 1000, + endTimeMs: 1001, + resourceAttributes: {}, + spanAttributes: { "compute.cpu": 0.25 }, + }); + + const span = payload.resourceSpans[0]!.scopeSpans[0]!.spans[0]!; + const cpu = span.attributes.find((a) => a.key === "compute.cpu"); + expect(cpu).toEqual({ key: "compute.cpu", value: { doubleValue: 0.25 } }); + }); +}); diff --git a/apps/supervisor/src/services/otlpTraceService.ts b/apps/supervisor/src/services/otlpTraceService.ts new file mode 100644 index 00000000000..da3310711d0 --- /dev/null +++ b/apps/supervisor/src/services/otlpTraceService.ts @@ -0,0 +1,104 @@ +import { randomBytes } from "crypto"; +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; + +export type OtlpTraceServiceOptions = { + endpointUrl: string; + timeoutMs?: number; +}; + +export type OtlpTraceSpan = { + traceId: string; + parentSpanId?: string; + spanName: string; + startTimeMs: number; + endTimeMs: number; + resourceAttributes: Record; + spanAttributes: Record; +}; + +export class OtlpTraceService { + private readonly logger = new SimpleStructuredLogger("otlp-trace"); + + constructor(private opts: OtlpTraceServiceOptions) {} + + /** Fire-and-forget: build payload and send to the configured OTLP endpoint */ + emit(span: OtlpTraceSpan): void { + const payload = buildPayload(span); + + fetch(`${this.opts.endpointUrl}/v1/traces`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + signal: AbortSignal.timeout(this.opts.timeoutMs ?? 5_000), + }).catch((err) => { + this.logger.warn("failed to send compute trace span", { + error: err instanceof Error ? err.message : String(err), + }); + }); + } +} + +// ── Payload builder (internal) ─────────────────────────────────────────────── + +/** @internal Exported for tests only */ +export function buildPayload(span: OtlpTraceSpan) { + const spanId = randomBytes(8).toString("hex"); + + return { + resourceSpans: [ + { + resource: { + attributes: [ + { key: "$trigger", value: { boolValue: true } }, + ...toOtlpAttributes(span.resourceAttributes), + ], + }, + scopeSpans: [ + { + scope: { name: "supervisor.compute" }, + spans: [ + { + traceId: span.traceId, + spanId, + parentSpanId: span.parentSpanId, + name: span.spanName, + kind: 3, // SPAN_KIND_CLIENT + startTimeUnixNano: msToNano(span.startTimeMs), + endTimeUnixNano: msToNano(span.endTimeMs), + attributes: toOtlpAttributes(span.spanAttributes), + status: { code: 1 }, // STATUS_CODE_OK + }, + ], + }, + ], + }, + ], + }; +} + +function toOtlpAttributes( + attrs: Record +): Array<{ key: string; value: Record }> { + return Object.entries(attrs).map(([key, value]) => ({ + key, + value: toOtlpValue(value), + })); +} + +function toOtlpValue(value: string | number | boolean): Record { + if (typeof value === "string") return { stringValue: value }; + if (typeof value === "boolean") return { boolValue: value }; + if (Number.isInteger(value)) return { intValue: value }; + return { doubleValue: value }; +} + +/** + * Convert epoch milliseconds to nanosecond string, preserving sub-ms precision. + * Fractional ms from performance.now() arithmetic carry meaningful microsecond + * data that affects span sort ordering when events happen within the same ms. + */ +function msToNano(ms: number): string { + const wholeMs = Math.trunc(ms); + const fracNs = Math.round((ms - wholeMs) * 1_000_000); + return String(BigInt(wholeMs) * 1_000_000n + BigInt(fracNs)); +} diff --git a/apps/supervisor/src/services/podCleaner.test.ts b/apps/supervisor/src/services/podCleaner.test.ts index 36bb5de6d1f..d6ed2bb737f 100644 --- a/apps/supervisor/src/services/podCleaner.test.ts +++ b/apps/supervisor/src/services/podCleaner.test.ts @@ -1,10 +1,11 @@ import { PodCleaner } from "./podCleaner.js"; -import { K8sApi, createK8sApi } from "../clients/kubernetes.js"; +import { type K8sApi, createK8sApi } from "../clients/kubernetes.js"; import { setTimeout } from "timers/promises"; import { describe, it, expect, beforeAll, afterEach } from "vitest"; import { Registry } from "prom-client"; -describe("PodCleaner Integration Tests", () => { +// These tests require live K8s cluster credentials - skip by default +describe.skipIf(!process.env.K8S_INTEGRATION_TESTS)("PodCleaner Integration Tests", () => { const k8s = createK8sApi(); const namespace = "integration-test"; const register = new Registry(); diff --git a/apps/supervisor/src/services/podCleaner.ts b/apps/supervisor/src/services/podCleaner.ts index 56eaaeb88af..3ac5da293df 100644 --- a/apps/supervisor/src/services/podCleaner.ts +++ b/apps/supervisor/src/services/podCleaner.ts @@ -90,7 +90,7 @@ export class PodCleaner { status: "succeeded", }); - this.logger.info("Deleted batch of pods", { continuationToken }); + this.logger.debug("Deleted batch of pods", { continuationToken }); } catch (err) { this.logger.error("Failed to delete batch of pods", { err: err instanceof Error ? err.message : String(err), diff --git a/apps/supervisor/src/services/timerWheel.test.ts b/apps/supervisor/src/services/timerWheel.test.ts new file mode 100644 index 00000000000..3f6bb9aa19b --- /dev/null +++ b/apps/supervisor/src/services/timerWheel.test.ts @@ -0,0 +1,254 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from "vitest"; +import { TimerWheel } from "./timerWheel.js"; + +describe("TimerWheel", () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.useRealTimers(); + }); + + it("dispatches item after delay", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "snapshot-data"); + + // Not yet + vi.advanceTimersByTime(2900); + expect(dispatched).toEqual([]); + + // After delay + vi.advanceTimersByTime(200); + expect(dispatched).toEqual(["run-1"]); + + wheel.stop(); + }); + + it("cancels item before it fires", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + vi.advanceTimersByTime(1000); + expect(wheel.cancel("run-1")).toBe(true); + + vi.advanceTimersByTime(5000); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("cancel returns false for unknown key", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + expect(wheel.cancel("nonexistent")).toBe(false); + }); + + it("deduplicates: resubmitting same key replaces the entry", () => { + const dispatched: { key: string; data: string }[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push({ key: item.key, data: item.data }), + }); + + wheel.start(); + wheel.submit("run-1", "old-data"); + + vi.advanceTimersByTime(1000); + wheel.submit("run-1", "new-data"); + + // Original would have fired at t=3000, but was replaced + // New one fires at t=1000+3000=4000 + vi.advanceTimersByTime(2100); + expect(dispatched).toEqual([]); + + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual([{ key: "run-1", data: "new-data" }]); + + wheel.stop(); + }); + + it("handles many concurrent items", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + for (let i = 0; i < 1000; i++) { + wheel.submit(`run-${i}`, `data-${i}`); + } + expect(wheel.size).toBe(1000); + + vi.advanceTimersByTime(3100); + expect(dispatched.length).toBe(1000); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("handles items submitted at different times", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + wheel.submit("run-1", "data"); + vi.advanceTimersByTime(1000); + wheel.submit("run-2", "data"); + vi.advanceTimersByTime(1000); + wheel.submit("run-3", "data"); + + // t=2000: nothing yet + expect(dispatched).toEqual([]); + + // t=3100: run-1 fires + vi.advanceTimersByTime(1100); + expect(dispatched).toEqual(["run-1"]); + + // t=4100: run-2 fires + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual(["run-1", "run-2"]); + + // t=5100: run-3 fires + vi.advanceTimersByTime(1000); + expect(dispatched).toEqual(["run-1", "run-2", "run-3"]); + + wheel.stop(); + }); + + it("setDelay changes delay for new items only", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + + wheel.submit("run-1", "data"); // 3s delay + + vi.advanceTimersByTime(500); + wheel.setDelay(1000); + wheel.submit("run-2", "data"); // 1s delay + + // t=1500: run-2 should have fired (submitted at t=500 with 1s delay) + vi.advanceTimersByTime(1100); + expect(dispatched).toEqual(["run-2"]); + + // t=3100: run-1 fires at its original 3s delay + vi.advanceTimersByTime(1500); + expect(dispatched).toEqual(["run-2", "run-1"]); + + wheel.stop(); + }); + + it("stop returns unprocessed items", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data-1"); + wheel.submit("run-2", "data-2"); + wheel.submit("run-3", "data-3"); + + const remaining = wheel.stop(); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + expect(remaining.length).toBe(3); + expect(remaining.map((r) => r.key).sort()).toEqual(["run-1", "run-2", "run-3"]); + expect(remaining.find((r) => r.key === "run-1")?.data).toBe("data-1"); + }); + + it("after stop, new submissions are silently dropped", () => { + const dispatched: string[] = []; + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.stop(); + + wheel.submit("run-late", "data"); + expect(dispatched).toEqual([]); + expect(wheel.size).toBe(0); + }); + + it("tracks size correctly through submit/cancel/dispatch", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + + wheel.start(); + + wheel.submit("a", "data"); + wheel.submit("b", "data"); + expect(wheel.size).toBe(2); + + wheel.cancel("a"); + expect(wheel.size).toBe(1); + + vi.advanceTimersByTime(3100); + expect(wheel.size).toBe(0); + + wheel.stop(); + }); + + it("clamps delay to valid range", () => { + const dispatched: string[] = []; + + // Very small delay (should be at least 1 tick = 100ms) + const wheel = new TimerWheel({ + delayMs: 0, + onExpire: (item) => dispatched.push(item.key), + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + vi.advanceTimersByTime(200); + expect(dispatched).toEqual(["run-1"]); + + wheel.stop(); + }); + + it("multiple cancel calls are safe", () => { + const wheel = new TimerWheel({ + delayMs: 3000, + onExpire: () => {}, + }); + + wheel.start(); + wheel.submit("run-1", "data"); + + expect(wheel.cancel("run-1")).toBe(true); + expect(wheel.cancel("run-1")).toBe(false); + + wheel.stop(); + }); +}); diff --git a/apps/supervisor/src/services/timerWheel.ts b/apps/supervisor/src/services/timerWheel.ts new file mode 100644 index 00000000000..9584423824d --- /dev/null +++ b/apps/supervisor/src/services/timerWheel.ts @@ -0,0 +1,160 @@ +/** + * TimerWheel implements a hashed timer wheel for efficiently managing large numbers + * of delayed operations with O(1) submit, cancel, and per-item dispatch. + * + * Used by the supervisor to delay snapshot requests so that short-lived waitpoints + * (e.g. triggerAndWait that resolves in <3s) skip the snapshot entirely. + * + * The wheel is a ring buffer of slots. A single setInterval advances a cursor. + * When the cursor reaches a slot, all items in that slot are dispatched. + * + * Fixed capacity: 600 slots at 100ms tick = 60s max delay. + */ + +const TICK_MS = 100; +const NUM_SLOTS = 600; // 60s max delay at 100ms tick + +export type TimerWheelItem = { + key: string; + data: T; +}; + +export type TimerWheelOptions = { + /** Called when an item's delay expires. */ + onExpire: (item: TimerWheelItem) => void; + /** Delay in milliseconds before items fire. Clamped to [100, 60000]. */ + delayMs: number; +}; + +type Entry = { + key: string; + data: T; + slotIndex: number; +}; + +export class TimerWheel { + private slots: Set[]; + private entries: Map>; + private cursor: number; + private intervalId: ReturnType | null; + private onExpire: (item: TimerWheelItem) => void; + private delaySlots: number; + + constructor(opts: TimerWheelOptions) { + this.slots = Array.from({ length: NUM_SLOTS }, () => new Set()); + this.entries = new Map(); + this.cursor = 0; + this.intervalId = null; + this.onExpire = opts.onExpire; + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.ceil(opts.delayMs / TICK_MS))); + } + + /** Start the timer wheel. Must be called before submitting items. */ + start(): void { + if (this.intervalId) return; + this.intervalId = setInterval(() => this.tick(), TICK_MS); + // Don't hold the process open just for the timer wheel + if (this.intervalId && typeof this.intervalId === "object" && "unref" in this.intervalId) { + this.intervalId.unref(); + } + } + + /** + * Stop the timer wheel and return all unprocessed items. + * The wheel keeps running normally during graceful shutdown - call stop() + * only when you're ready to tear down. Caller decides what to do with leftovers. + */ + stop(): TimerWheelItem[] { + if (this.intervalId) { + clearInterval(this.intervalId); + this.intervalId = null; + } + + const remaining: TimerWheelItem[] = []; + for (const [key, entry] of this.entries) { + remaining.push({ key, data: entry.data }); + } + + for (const slot of this.slots) { + slot.clear(); + } + this.entries.clear(); + + return remaining; + } + + /** + * Update the delay for future submissions. Already-queued items keep their original timing. + * Clamped to [TICK_MS, 60000ms]. + */ + setDelay(delayMs: number): void { + this.delaySlots = Math.max(1, Math.min(NUM_SLOTS, Math.ceil(delayMs / TICK_MS))); + } + + /** + * Submit an item to be dispatched after the configured delay. + * If an item with the same key already exists, it is replaced (dedup). + * No-op if the wheel is stopped. + */ + submit(key: string, data: T): void { + if (!this.intervalId) return; + + // Dedup: remove existing entry for this key + this.cancel(key); + + const slotIndex = (this.cursor + this.delaySlots) % NUM_SLOTS; + const entry: Entry = { key, data, slotIndex }; + + this.entries.set(key, entry); + this.slot(slotIndex).add(key); + } + + /** + * Cancel a pending item. Returns true if the item was found and removed. + */ + cancel(key: string): boolean { + const entry = this.entries.get(key); + if (!entry) return false; + + this.slot(entry.slotIndex).delete(key); + this.entries.delete(key); + return true; + } + + /** Number of pending items in the wheel. */ + get size(): number { + return this.entries.size; + } + + /** Whether the wheel is running. */ + get running(): boolean { + return this.intervalId !== null; + } + + /** Get a slot by index. The array is fully initialized so this always returns a Set. */ + private slot(index: number): Set { + const s = this.slots[index]; + if (!s) throw new Error(`TimerWheel: invalid slot index ${index}`); + return s; + } + + /** Advance the cursor and dispatch all items in the current slot. */ + private tick(): void { + this.cursor = (this.cursor + 1) % NUM_SLOTS; + const slot = this.slot(this.cursor); + + if (slot.size === 0) return; + + // Collect items to dispatch (copy keys since we mutate during iteration) + const keys = [...slot]; + slot.clear(); + + for (const key of keys) { + const entry = this.entries.get(key); + if (!entry) continue; + + this.entries.delete(key); + this.onExpire({ key, data: entry.data }); + } + } +} diff --git a/apps/supervisor/src/util.ts b/apps/supervisor/src/util.ts index 4fcda27b2aa..d14dd99bfe1 100644 --- a/apps/supervisor/src/util.ts +++ b/apps/supervisor/src/util.ts @@ -14,6 +14,18 @@ export function getDockerHostDomain() { return isMacOS || isWindows ? "host.docker.internal" : "localhost"; } +/** Extract the W3C traceparent string from an untyped trace context record */ +export function extractTraceparent(traceContext?: Record): string | undefined { + if ( + traceContext && + "traceparent" in traceContext && + typeof traceContext.traceparent === "string" + ) { + return traceContext.traceparent; + } + return undefined; +} + export function getRunnerId(runId: string, attemptNumber?: number) { const parts = ["runner", runId.replace("run_", "")]; @@ -23,3 +35,10 @@ export function getRunnerId(runId: string, attemptNumber?: number) { return parts.join("-"); } + +/** Derive a unique runnerId for a restore cycle using the checkpoint suffix */ +export function getRestoreRunnerId(runFriendlyId: string, checkpointId: string) { + const runIdShort = runFriendlyId.replace("run_", ""); + const checkpointSuffix = checkpointId.slice(-8); + return `runner-${runIdShort}-${checkpointSuffix}`; +} diff --git a/apps/supervisor/src/wideEvents/context.ts b/apps/supervisor/src/wideEvents/context.ts new file mode 100644 index 00000000000..a89859c2707 --- /dev/null +++ b/apps/supervisor/src/wideEvents/context.ts @@ -0,0 +1,14 @@ +import { AsyncLocalStorage } from "node:async_hooks"; +import type { State } from "./state.js"; + +/** + * AsyncLocalStorage threading per-operation `State` through the call stack. + * Wrappers enter a state via `wideEventStorage.run(state, () => fn())` and + * any code in the async call tree retrieves it via `fromContext()`. + */ +export const wideEventStorage = new AsyncLocalStorage(); + +/** Returns the State attached to the current async context, or null. */ +export function fromContext(): State | null { + return wideEventStorage.getStore() ?? null; +} diff --git a/apps/supervisor/src/wideEvents/emit.test.ts b/apps/supervisor/src/wideEvents/emit.test.ts new file mode 100644 index 00000000000..4e778d3e992 --- /dev/null +++ b/apps/supervisor/src/wideEvents/emit.test.ts @@ -0,0 +1,114 @@ +import { describe, it, expect } from "vitest"; +import { emit, EmitMessage } from "./emit.js"; +import { newState } from "./new.js"; + +function captureEmit(state: Parameters[0]): Record { + const captured: string[] = []; + const origWrite = process.stdout.write; + process.stdout.write = ((chunk: unknown) => { + captured.push(String(chunk)); + return true; + }) as typeof process.stdout.write; + try { + emit(state); + } finally { + process.stdout.write = origWrite; + } + expect(captured).toHaveLength(1); + const line = captured[0]; + if (!line) throw new Error("no captured line"); + return JSON.parse(line) as Record; +} + +describe("emit", () => { + it("emits a single line with the stable message + request_id", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + s.durationMs = 5; + const out = captureEmit(s); + expect(out.msg).toBe(EmitMessage); + expect(out.request_id).toBe(s.requestId); + expect(out.service).toBe("supervisor"); + expect(out.ok).toBe(true); + expect(out.status).toBe(200); + expect(out.duration_ms).toBe(5); + }); + + it("omits empty optional fields", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + const out = captureEmit(s); + expect(out).not.toHaveProperty("trace_id"); + expect(out).not.toHaveProperty("version"); + expect(out).not.toHaveProperty("commit_sha"); + expect(out).not.toHaveProperty("error.code"); + }); + + it("flattens meta keys as meta.", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + s.meta.run_id = "run_abc"; + s.meta.deployment_id = "dep_xyz"; + const out = captureEmit(s); + expect(out["meta.run_id"]).toBe("run_abc"); + expect(out["meta.deployment_id"]).toBe("dep_xyz"); + expect(out).not.toHaveProperty("meta"); + }); + + it("flattens phases as phase..", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + s.phases.push({ name: "warm_start", durationMs: 12, ok: true, attempts: 1 }); + s.phases.push({ + name: "workload_create", + durationMs: 3, + ok: false, + attempts: 2, + errorCode: "Error", + errorMsg: "boom", + sub: { create_ms: 1 }, + }); + const out = captureEmit(s); + expect(out["phase.warm_start.duration_ms"]).toBe(12); + expect(out["phase.warm_start.ok"]).toBe(true); + expect(out["phase.warm_start.attempts"]).toBe(1); + expect(out["phase.workload_create.duration_ms"]).toBe(3); + expect(out["phase.workload_create.ok"]).toBe(false); + expect(out["phase.workload_create.attempts"]).toBe(2); + expect(out["phase.workload_create.error_code"]).toBe("Error"); + expect(out["phase.workload_create.error_message"]).toBe("boom"); + expect(out["phase.workload_create.create_ms"]).toBe(1); + }); + + it("includes error.code/message/kind when state.error is set", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 500; + s.error = { code: "InternalError", message: "kaboom", kind: "internal" }; + const out = captureEmit(s); + expect(out["error.code"]).toBe("InternalError"); + expect(out["error.message"]).toBe("kaboom"); + expect(out["error.kind"]).toBe("internal"); + }); + + it("truncates very long error messages", () => { + const s = newState({ service: "supervisor", env: {} }); + s.error = { code: "Big", message: "x".repeat(2000), kind: "internal" }; + const out = captureEmit(s); + expect((out["error.message"] as string).length).toBe(512); + }); + + it("flattens extras at the top level", () => { + const s = newState({ service: "supervisor", env: {} }); + s.statusCode = 200; + s.ok = true; + s.extras.route = "/health"; + s.extras["dispatch.result"] = "hit"; + const out = captureEmit(s); + expect(out.route).toBe("/health"); + expect(out["dispatch.result"]).toBe("hit"); + }); +}); diff --git a/apps/supervisor/src/wideEvents/emit.ts b/apps/supervisor/src/wideEvents/emit.ts new file mode 100644 index 00000000000..a1644237d5d --- /dev/null +++ b/apps/supervisor/src/wideEvents/emit.ts @@ -0,0 +1,71 @@ +import type { State } from "./state.js"; + +/** + * Stable slog message string for every wide event. Downstream filters (jq, + * Axiom queries, Vector pipelines) pin to this constant. The `service` field + * disambiguates which service emitted it. + */ +export const EmitMessage = "wide_event"; + +const MAX_ERROR_MSG_BYTES = 512; + +/** + * Serializes a State as a single flat-keyed JSON line on stdout. Keys are + * flat (no nested objects) to keep jq filtering and Axiom indexing cheap. + * Empty optional fields are omitted. + */ +export function emit(state: State): void { + const out: Record = { + msg: EmitMessage, + request_id: state.requestId, + }; + + if (state.traceId) out.trace_id = state.traceId; + appendIfSet(out, "service", state.service); + appendIfSet(out, "version", state.version); + appendIfSet(out, "commit_sha", state.commitSha); + appendIfSet(out, "region", state.region); + appendIfSet(out, "node_id", state.nodeId); + + out.ok = state.ok; + if (state.statusCode !== 0) out.status = state.statusCode; + out.duration_ms = state.durationMs; + + if (state.error) { + appendIfSet(out, "error.code", state.error.code); + const msg = + state.error.message.length > MAX_ERROR_MSG_BYTES + ? state.error.message.slice(0, MAX_ERROR_MSG_BYTES) + : state.error.message; + appendIfSet(out, "error.message", msg); + appendIfSet(out, "error.kind", state.error.kind); + } + + for (const [k, v] of Object.entries(state.meta)) { + out["meta." + k] = v; + } + + for (const p of state.phases) { + const prefix = "phase." + p.name + "."; + out[prefix + "duration_ms"] = p.durationMs; + out[prefix + "ok"] = p.ok; + out[prefix + "attempts"] = p.attempts; + if (p.errorCode) out[prefix + "error_code"] = p.errorCode; + if (p.errorMsg) out[prefix + "error_message"] = p.errorMsg; + if (p.sub) { + for (const [sk, sv] of Object.entries(p.sub)) { + out[prefix + sk] = sv; + } + } + } + + for (const [k, v] of Object.entries(state.extras)) { + out[k] = v; + } + + process.stdout.write(JSON.stringify(out) + "\n"); +} + +function appendIfSet(out: Record, key: string, value: string | undefined): void { + if (value) out[key] = value; +} diff --git a/apps/supervisor/src/wideEvents/index.ts b/apps/supervisor/src/wideEvents/index.ts new file mode 100644 index 00000000000..742d5c018e8 --- /dev/null +++ b/apps/supervisor/src/wideEvents/index.ts @@ -0,0 +1,28 @@ +/** + * Wide-event observability surface for the supervisor. One flat-keyed JSON + * line per natural unit of work (HTTP request, dequeue iteration, socket + * lifecycle event). Events join across services via `trace_id` (parsed from + * the inbound W3C `traceparent`) and `meta.run_id`. + * + * Off by default behind a kill switch - the dispatch hotpath runs at high + * QPS, so logging pressure must be cleanly removable. + */ +export { type Env, isValidRequestId, newState, type NewStateOptions } from "./new.js"; +export { emit, EmitMessage } from "./emit.js"; +export { parseTraceId } from "./traceparent.js"; +export { fromContext, wideEventStorage } from "./context.js"; +export { + type PhaseOpt, + recordPhase, + recordPhaseSince, + timePhase, +} from "./record.js"; +export { + emitOneShot, + runWideEvent, + setExtra, + setMeta, + type WideEventLifecycleOptions, + type WideEventOptions, +} from "./middleware.js"; +export type { ErrorInfo, PhaseRecord, State } from "./state.js"; diff --git a/apps/supervisor/src/wideEvents/middleware.test.ts b/apps/supervisor/src/wideEvents/middleware.test.ts new file mode 100644 index 00000000000..18a6469d7b8 --- /dev/null +++ b/apps/supervisor/src/wideEvents/middleware.test.ts @@ -0,0 +1,207 @@ +import { describe, it, expect } from "vitest"; +import { fromContext } from "./context.js"; +import { emitOneShot, runWideEvent, setMeta } from "./middleware.js"; + +function captureStdout(fn: () => Promise | unknown): Promise { + const captured: string[] = []; + const orig = process.stdout.write; + process.stdout.write = ((chunk: unknown) => { + captured.push(String(chunk)); + return true; + }) as typeof process.stdout.write; + return Promise.resolve(fn()) + .finally(() => { + process.stdout.write = orig; + }) + .then(() => captured); +} + +describe("runWideEvent", () => { + it("emits one event with ok=true when no statusCode is set", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true, route: "/x", method: "POST" }, + async () => undefined + ); + }); + expect(lines).toHaveLength(1); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(true); + expect(ev.service).toBe("supervisor"); + expect(ev.route).toBe("/x"); + expect(ev.method).toBe("POST"); + expect(typeof ev.duration_ms).toBe("number"); + expect(typeof ev.request_id).toBe("string"); + }); + + it("derives ok from statusCode set via finalize", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true }, + async () => undefined, + (state) => { + state.statusCode = 200; + } + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(true); + expect(ev.status).toBe(200); + }); + + it("treats 4xx as ok=false", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true }, + async () => undefined, + (state) => { + state.statusCode = 400; + } + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(false); + expect(ev.status).toBe(400); + }); + + it("emits ok=false with error.kind=internal on throw", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true }, + async () => { + throw new Error("boom"); + } + ).catch(() => undefined); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(false); + expect(ev.status).toBe(500); + expect(ev["error.kind"]).toBe("internal"); + expect(ev["error.message"]).toBe("boom"); + }); + + it("threads state through AsyncLocalStorage", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true }, + async () => { + setMeta(fromContext(), "run_id", "run_abc"); + } + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev["meta.run_id"]).toBe("run_abc"); + expect(ev.ok).toBe(true); + }); + + it("picks up inbound traceparent for trace_id", async () => { + const tp = "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"; + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: true, traceparent: tp }, + async () => undefined + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.trace_id).toBe("4bf92f3577b34da6a3ce929d0e0e4736"); + }); + + it("honours setup() to attach meta and extras before fn runs", async () => { + const lines = await captureStdout(async () => { + await runWideEvent( + { + service: "supervisor", + env: {}, + enabled: true, + setup: (state) => { + state.meta.run_id = "run_abc"; + state.extras.iteration = "dequeue"; + }, + }, + async () => undefined + ); + }); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev["meta.run_id"]).toBe("run_abc"); + expect(ev.iteration).toBe("dequeue"); + }); + + it("short-circuits to pass-through when enabled=false", async () => { + let seenState: ReturnType = null; + const lines = await captureStdout(async () => { + await runWideEvent( + { service: "supervisor", env: {}, enabled: false }, + async () => { + seenState = fromContext(); + } + ); + }); + expect(lines).toHaveLength(0); + expect(seenState).toBe(null); + }); + + it("isolates state across concurrent invocations", async () => { + const lines = await captureStdout(async () => { + await Promise.all( + ["a", "b", "c"].map((tag) => + runWideEvent( + { service: "supervisor", env: {}, enabled: true }, + async () => { + const s = fromContext(); + if (!s) throw new Error("no state"); + s.meta.tag = tag; + await new Promise((r) => setTimeout(r, 5)); + expect(s.meta.tag).toBe(tag); + } + ) + ) + ); + }); + const tags = lines.map((l) => (JSON.parse(l) as Record)["meta.tag"]); + expect(tags.sort()).toEqual(["a", "b", "c"]); + }); +}); + +describe("emitOneShot", () => { + it("emits a single event with populated meta when enabled", async () => { + const lines = await captureStdout(() => { + emitOneShot({ + service: "supervisor", + env: {}, + enabled: true, + populate: (s) => { + s.meta.run_id = "run_abc"; + s.extras.event = "run:start"; + }, + }); + }); + expect(lines).toHaveLength(1); + const line = lines[0]; + if (!line) throw new Error("no line"); + const ev = JSON.parse(line) as Record; + expect(ev.ok).toBe(true); + expect(ev["meta.run_id"]).toBe("run_abc"); + expect(ev.event).toBe("run:start"); + }); + + it("emits nothing when disabled", async () => { + const lines = await captureStdout(() => { + emitOneShot({ service: "supervisor", env: {}, enabled: false }); + }); + expect(lines).toHaveLength(0); + }); +}); diff --git a/apps/supervisor/src/wideEvents/middleware.ts b/apps/supervisor/src/wideEvents/middleware.ts new file mode 100644 index 00000000000..a87a1885d56 --- /dev/null +++ b/apps/supervisor/src/wideEvents/middleware.ts @@ -0,0 +1,122 @@ +import { emit } from "./emit.js"; +import { newState, type Env } from "./new.js"; +import { wideEventStorage } from "./context.js"; +import type { State } from "./state.js"; + +/** Options common to every wide-event lifecycle. */ +export type WideEventOptions = { + service: string; + env: Env; + /** + * Kill switch. When false, lifecycles degenerate into transparent + * pass-through - no State allocation, no AsyncLocalStorage run, no emit. + * Important for the dispatch hotpath where logging pressure must be + * cleanly removable. + */ + enabled: boolean; +}; + +/** Per-invocation options layered on top of `WideEventOptions`. */ +export type WideEventLifecycleOptions = WideEventOptions & { + /** Route template (HTTP only) captured into `extras.route`. */ + route?: string; + /** HTTP method captured into `extras.method`. */ + method?: string; + /** Inbound W3C traceparent (HTTP header, queue message field). */ + traceparent?: string; + /** Inbound request id (e.g. `x-request-id` header). */ + inboundRequestId?: string; + /** Runs after the state is built, before the wrapped fn. Use to attach meta. */ + setup?: (state: State) => void; +}; + +/** + * Runs `fn` inside an AsyncLocalStorage state and emits one wide event on + * completion or error. `finalize` runs after `fn` returns but before emit - + * use it to read out-of-band outcome info (e.g. `res.statusCode` for an HTTP + * route) and assign to `state.statusCode`. The wrapper computes `ok` from + * `statusCode` if it's set; otherwise it defaults to true on success. + * + * Returns the original `fn` result. When `enabled=false`, `fn` runs unchanged + * with no event emitted. + */ +export async function runWideEvent( + opts: WideEventLifecycleOptions, + fn: () => Promise | T, + finalize?: (state: State) => void +): Promise { + if (!opts.enabled) { + return fn(); + } + + const state = newState({ + service: opts.service, + env: opts.env, + inboundRequestId: opts.inboundRequestId, + traceparent: opts.traceparent, + }); + if (opts.route) state.extras.route = opts.route; + if (opts.method) state.extras.method = opts.method; + + const start = performance.now(); + try { + if (opts.setup) opts.setup(state); + const result = await wideEventStorage.run(state, () => Promise.resolve(fn())); + state.durationMs = Math.round(performance.now() - start); + if (finalize) finalize(state); + if (state.statusCode !== 0) { + state.ok = state.statusCode >= 200 && state.statusCode < 300; + } else { + state.ok = true; + } + emit(state); + return result; + } catch (err) { + state.durationMs = Math.round(performance.now() - start); + const e = err instanceof Error ? err : new Error(String(err)); + if (state.statusCode === 0) state.statusCode = 500; + state.ok = false; + state.error = { + code: e.name || "Error", + message: e.message, + kind: "internal", + }; + emit(state); + throw err; + } +} + +/** + * One-shot wide event with no wrapped operation. Use for socket lifecycle + * events (`run:start`, `run:stop`) where there is no surrounding async unit + * of work to time. `populate` runs synchronously to attach meta/extras + * before emit. + */ +export function emitOneShot( + opts: WideEventOptions & { + traceparent?: string; + populate?: (state: State) => void; + } +): void { + if (!opts.enabled) return; + const state = newState({ + service: opts.service, + env: opts.env, + traceparent: opts.traceparent, + }); + if (opts.populate) opts.populate(state); + state.ok = true; + emit(state); +} + +/** Convenience accessor for in-handler meta mutation. */ +export function setMeta(state: State | null, key: string, value: string): void { + if (!state) return; + state.meta[key] = value; +} + +/** Convenience for free-form fields (did_warm_start, dispatch.result, ...). */ +export function setExtra(state: State | null, key: string, value: unknown): void { + if (!state) return; + state.extras[key] = value; +} diff --git a/apps/supervisor/src/wideEvents/new.test.ts b/apps/supervisor/src/wideEvents/new.test.ts new file mode 100644 index 00000000000..476c49c3d0e --- /dev/null +++ b/apps/supervisor/src/wideEvents/new.test.ts @@ -0,0 +1,81 @@ +import { describe, it, expect } from "vitest"; +import { isValidRequestId, newState } from "./new.js"; + +describe("isValidRequestId", () => { + it("accepts visible ASCII", () => { + expect(isValidRequestId("req-abc-123_456.7")).toBe(true); + expect(isValidRequestId("a")).toBe(true); + }); + + it("rejects empty string", () => { + expect(isValidRequestId("")).toBe(false); + }); + + it("rejects overlong strings (>128 bytes)", () => { + expect(isValidRequestId("a".repeat(128))).toBe(true); + expect(isValidRequestId("a".repeat(129))).toBe(false); + }); + + it("rejects whitespace, newlines, control chars", () => { + expect(isValidRequestId("has space")).toBe(false); + expect(isValidRequestId("has\ttab")).toBe(false); + expect(isValidRequestId("has\nnewline")).toBe(false); + expect(isValidRequestId("\x00null")).toBe(false); + }); + + it("rejects high-bit / non-ASCII", () => { + expect(isValidRequestId("café")).toBe(false); + expect(isValidRequestId("a\x7f")).toBe(false); + }); +}); + +describe("newState", () => { + const env = { version: "1.0.0", commitSha: "abc123", region: "us-east-1", nodeId: "node-1" }; + + it("populates service identity from env", () => { + const s = newState({ service: "supervisor", env }); + expect(s.service).toBe("supervisor"); + expect(s.version).toBe("1.0.0"); + expect(s.commitSha).toBe("abc123"); + expect(s.region).toBe("us-east-1"); + expect(s.nodeId).toBe("node-1"); + }); + + it("mints a fresh request id when none provided", () => { + const s = newState({ service: "test", env: {} }); + expect(s.requestId).toMatch(/^req-[0-9a-f]{32}$/); + }); + + it("honours a valid inbound request id", () => { + const s = newState({ service: "test", env: {}, inboundRequestId: "trace-abc-123" }); + expect(s.requestId).toBe("trace-abc-123"); + }); + + it("rejects unsafe inbound request id and mints a fresh one", () => { + const s = newState({ service: "test", env: {}, inboundRequestId: "has space" }); + expect(s.requestId).toMatch(/^req-[0-9a-f]{32}$/); + }); + + it("parses traceparent into traceId and preserves the raw header", () => { + const tp = "00-4bf92f3577b34da6a3ce929d0e0e4736-00f067aa0ba902b7-01"; + const s = newState({ service: "test", env: {}, traceparent: tp }); + expect(s.traceId).toBe("4bf92f3577b34da6a3ce929d0e0e4736"); + expect(s.traceparent).toBe(tp); + }); + + it("leaves traceId empty when no traceparent provided", () => { + const s = newState({ service: "test", env: {} }); + expect(s.traceId).toBe(""); + expect(s.traceparent).toBe(""); + }); + + it("initialises empty meta/extras/phases", () => { + const s = newState({ service: "test", env: {} }); + expect(s.meta).toEqual({}); + expect(s.extras).toEqual({}); + expect(s.phases).toEqual([]); + expect(s.ok).toBe(false); + expect(s.statusCode).toBe(0); + expect(s.durationMs).toBe(0); + }); +}); diff --git a/apps/supervisor/src/wideEvents/new.ts b/apps/supervisor/src/wideEvents/new.ts new file mode 100644 index 00000000000..2b7c88b7cc9 --- /dev/null +++ b/apps/supervisor/src/wideEvents/new.ts @@ -0,0 +1,76 @@ +import { randomBytes } from "node:crypto"; +import { parseTraceId } from "./traceparent.js"; +import type { State } from "./state.js"; + +const MAX_REQUEST_ID_LEN = 128; + +/** + * Validates an inbound request id. Non-empty, no longer than 128 bytes, + * composed entirely of visible ASCII (0x21..0x7E). Rejects newlines, control + * characters, whitespace, DEL, high-bit bytes - any of which could poison the + * log pipeline if echoed back verbatim. + */ +export function isValidRequestId(s: string): boolean { + if (s.length === 0 || s.length > MAX_REQUEST_ID_LEN) return false; + for (let i = 0; i < s.length; i++) { + const c = s.charCodeAt(i); + if (c < 0x21 || c > 0x7e) return false; + } + return true; +} + +/** + * Service-level identity that's constant for the lifetime of the process. + * Populated once at startup, copied into every State. + */ +export type Env = { + version?: string; + commitSha?: string; + region?: string; + nodeId?: string; +}; + +export type NewStateOptions = { + service: string; + env: Env; + /** Optional inbound request id (e.g. from `x-request-id`). If unsafe or absent, a fresh `req-` is minted. */ + inboundRequestId?: string; + /** Optional inbound W3C traceparent (HTTP header, queue message field). */ + traceparent?: string; +}; + +/** + * Builds a State for a wide-event lifecycle. + * + * - requestId: honours `inboundRequestId` if present and safe; otherwise + * mints a fresh `req-` id. + * - traceId: parsed from the provided traceparent (graceful empty if + * absent or malformed). + * - traceparent: preserved verbatim for downstream propagation. + */ +export function newState(opts: NewStateOptions): State { + const traceparent = opts.traceparent ?? ""; + const inbound = opts.inboundRequestId ?? ""; + const requestId = isValidRequestId(inbound) ? inbound : newRequestId(); + + return { + requestId, + traceId: parseTraceId(traceparent), + traceparent, + service: opts.service, + version: opts.env.version, + commitSha: opts.env.commitSha, + region: opts.env.region, + nodeId: opts.env.nodeId, + meta: {}, + phases: [], + ok: false, + statusCode: 0, + durationMs: 0, + extras: {}, + }; +} + +function newRequestId(): string { + return "req-" + randomBytes(16).toString("hex"); +} diff --git a/apps/supervisor/src/wideEvents/record.test.ts b/apps/supervisor/src/wideEvents/record.test.ts new file mode 100644 index 00000000000..beeb0fff221 --- /dev/null +++ b/apps/supervisor/src/wideEvents/record.test.ts @@ -0,0 +1,112 @@ +import { describe, it, expect } from "vitest"; +import { fromContext, wideEventStorage } from "./context.js"; +import { recordPhase, recordPhaseSince, timePhase } from "./record.js"; +import { newState } from "./new.js"; +import type { State } from "./state.js"; + +function makeState(): State { + return newState({ service: "test", env: {} }); +} + +describe("recordPhase", () => { + it("appends a successful phase", () => { + const s = makeState(); + recordPhase(s, "lookup", performance.now() - 50, undefined); + expect(s.phases).toHaveLength(1); + const phase = s.phases[0]; + if (!phase) throw new Error("missing phase"); + expect(phase.name).toBe("lookup"); + expect(phase.ok).toBe(true); + expect(phase.attempts).toBe(1); + expect(phase.durationMs).toBeGreaterThanOrEqual(45); + }); + + it("appends a failed phase with error code/message", () => { + const s = makeState(); + recordPhase(s, "dispatch", performance.now(), new Error("nope")); + const phase = s.phases[0]; + if (!phase) throw new Error("missing phase"); + expect(phase.ok).toBe(false); + expect(phase.errorCode).toBe("Error"); + expect(phase.errorMsg).toBe("nope"); + }); + + it("truncates very long error messages", () => { + const s = makeState(); + recordPhase(s, "x", performance.now(), new Error("y".repeat(2000))); + const phase = s.phases[0]; + if (!phase) throw new Error("missing phase"); + expect(phase.errorMsg?.length).toBe(512); + }); + + it("honours opts.attempts", () => { + const s = makeState(); + recordPhase(s, "retry", performance.now(), undefined, { attempts: 3 }); + expect(s.phases[0]?.attempts).toBe(3); + }); + + it("attaches sub-timings", () => { + const s = makeState(); + recordPhase(s, "complex", performance.now(), undefined, { sub: { setup_ms: 10, work_ms: 5 } }); + expect(s.phases[0]?.sub).toEqual({ setup_ms: 10, work_ms: 5 }); + }); + + it("is a no-op when state is null", () => { + expect(() => recordPhase(null, "x", performance.now(), undefined)).not.toThrow(); + }); +}); + +describe("timePhase + AsyncLocalStorage threading", () => { + it("records via fromContext on success", async () => { + const s = makeState(); + const value = await wideEventStorage.run(s, () => timePhase("work", async () => 42)); + expect(value).toBe(42); + expect(s.phases).toHaveLength(1); + expect(s.phases[0]?.ok).toBe(true); + }); + + it("records via fromContext on error and rethrows", async () => { + const s = makeState(); + await expect( + wideEventStorage.run(s, () => + timePhase("work", async () => { + throw new Error("boom"); + }) + ) + ).rejects.toThrow("boom"); + expect(s.phases).toHaveLength(1); + expect(s.phases[0]?.ok).toBe(false); + expect(s.phases[0]?.errorMsg).toBe("boom"); + }); + + it("runs fn unchanged when no state on context", async () => { + const value = await timePhase("work", async () => "ok"); + expect(value).toBe("ok"); + }); +}); + +describe("recordPhaseSince", () => { + it("records using a caller-captured start time", async () => { + const s = makeState(); + await wideEventStorage.run(s, async () => { + const start = performance.now(); + await new Promise((r) => setTimeout(r, 10)); + recordPhaseSince("spanning", start, undefined); + }); + expect(s.phases).toHaveLength(1); + expect(s.phases[0]?.durationMs).toBeGreaterThanOrEqual(8); + }); +}); + +describe("fromContext", () => { + it("returns null when no state attached", () => { + expect(fromContext()).toBe(null); + }); + + it("returns the state when inside wideEventStorage.run", () => { + const s = makeState(); + wideEventStorage.run(s, () => { + expect(fromContext()).toBe(s); + }); + }); +}); diff --git a/apps/supervisor/src/wideEvents/record.ts b/apps/supervisor/src/wideEvents/record.ts new file mode 100644 index 00000000000..b37c7246a00 --- /dev/null +++ b/apps/supervisor/src/wideEvents/record.ts @@ -0,0 +1,82 @@ +import { fromContext } from "./context.js"; +import type { PhaseRecord, State } from "./state.js"; + +const MAX_ERROR_MSG_BYTES = 512; + +/** Optional knobs for a phase record. */ +export type PhaseOpt = { + /** Attempt count for the phase (default 1). */ + attempts?: number; + /** Sub-timings to fold into `phase..`. */ + sub?: Record; +}; + +/** + * Appends a phase outcome to `state.phases`. Safe to call on success + * (`err === undefined`) and error paths. `errorMsg` is truncated to 512 bytes + * to keep the wide event compact. No-op if state is null. + */ +export function recordPhase( + state: State | null, + name: string, + startMs: number, + err: Error | undefined, + opts: PhaseOpt = {} +): void { + if (!state) return; + const p: PhaseRecord = { + name, + durationMs: Math.round(performance.now() - startMs), + ok: err === undefined, + attempts: opts.attempts ?? 1, + }; + if (err) { + p.errorCode = err.name || "Error"; + const msg = err.message; + p.errorMsg = msg.length > MAX_ERROR_MSG_BYTES ? msg.slice(0, MAX_ERROR_MSG_BYTES) : msg; + } + if (opts.sub) p.sub = opts.sub; + state.phases.push(p); +} + +/** + * Runs `fn` and appends a phase outcome to the State attached to the current + * async context. If no state is on context (test paths, background work), + * `fn` runs unchanged. The phase is recorded on both success and error paths + * so failed phases still appear in the wide event with duration_ms + + * error_code. + */ +export async function timePhase( + name: string, + fn: () => Promise | T, + opts: PhaseOpt = {} +): Promise { + const start = performance.now(); + try { + const result = await fn(); + recordPhase(fromContext(), name, start, undefined, opts); + return result; + } catch (err) { + recordPhase(fromContext(), name, start, asError(err), opts); + throw err; + } +} + +/** + * Appends a phase outcome to the State attached to the current async context + * using a `startMs` captured by the caller. Use when the phase boundary spans + * multiple calls with intermediate error handling that can't fit inside a + * single `timePhase` closure. Nil-state safe. + */ +export function recordPhaseSince( + name: string, + startMs: number, + err: Error | undefined, + opts: PhaseOpt = {} +): void { + recordPhase(fromContext(), name, startMs, err, opts); +} + +function asError(e: unknown): Error { + return e instanceof Error ? e : new Error(String(e)); +} diff --git a/apps/supervisor/src/wideEvents/state.ts b/apps/supervisor/src/wideEvents/state.ts new file mode 100644 index 00000000000..1929e5e9806 --- /dev/null +++ b/apps/supervisor/src/wideEvents/state.ts @@ -0,0 +1,61 @@ +/** + * Per-event accumulator backing a single wide event. The supervisor emits one + * flat-keyed JSON line per natural unit of work (dequeue iteration, HTTP + * request, socket lifecycle event). Optional fields are omitted on emit so + * events stay compact. + */ +export type State = { + // Cross-stack correlation. + requestId: string; + traceId: string; + /** + * Raw inbound W3C `traceparent`, preserved verbatim so outbound calls can + * propagate the same trace context without losing the parent span-id. + * Empty when no inbound traceparent was set. + */ + traceparent: string; + + // Service identity (set by `newState` from Env). + service: string; + version?: string; + commitSha?: string; + region?: string; + nodeId?: string; + + // Caller-attached opaque metadata, flattened to `meta.` on emit. + meta: Record; + + // Per-phase outcomes, in completion order. + phases: PhaseRecord[]; + + // Top-level outcome (set after the wrapped operation returns). + ok: boolean; + statusCode: number; + durationMs: number; + error?: ErrorInfo; + + // Free-form ad-hoc additions (route, method, did_warm_start, ...). + extras: Record; +}; + +/** + * Single named phase outcome. Retries collapse into `attempts > 1` with the + * last error reflected in errorCode/errorMsg. + */ +export type PhaseRecord = { + name: string; + durationMs: number; + ok: boolean; + attempts: number; + errorCode?: string; + errorMsg?: string; + sub?: Record; +}; + +/** Top-level error summary for a failed operation. */ +export type ErrorInfo = { + code: string; + message: string; + /** Coarse classification - "client" | "upstream" | "internal" | "timeout". */ + kind: string; +}; diff --git a/apps/supervisor/src/wideEvents/traceparent.test.ts b/apps/supervisor/src/wideEvents/traceparent.test.ts new file mode 100644 index 00000000000..85ed31c3b6f --- /dev/null +++ b/apps/supervisor/src/wideEvents/traceparent.test.ts @@ -0,0 +1,43 @@ +import { describe, it, expect } from "vitest"; +import { parseTraceId } from "./traceparent.js"; + +describe("parseTraceId", () => { + const validTraceId = "4bf92f3577b34da6a3ce929d0e0e4736"; + const validHeader = `00-${validTraceId}-00f067aa0ba902b7-01`; + + it("extracts the trace-id from a valid W3C traceparent", () => { + expect(parseTraceId(validHeader)).toBe(validTraceId); + }); + + it("returns empty string for empty/null/undefined input", () => { + expect(parseTraceId("")).toBe(""); + expect(parseTraceId(null)).toBe(""); + expect(parseTraceId(undefined)).toBe(""); + }); + + it("returns empty for wrong segment count", () => { + expect(parseTraceId("00-abc-def")).toBe(""); + expect(parseTraceId("00-abc-def-01-extra")).toBe(""); + }); + + it("returns empty for non-zero version byte", () => { + expect(parseTraceId(`01-${validTraceId}-00f067aa0ba902b7-01`)).toBe(""); + }); + + it("returns empty for wrong-length trace-id", () => { + expect(parseTraceId("00-abc-00f067aa0ba902b7-01")).toBe(""); + }); + + it("returns empty for non-hex trace-id", () => { + expect(parseTraceId("00-zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz-00f067aa0ba902b7-01")).toBe(""); + }); + + it("returns empty for all-zero trace-id", () => { + expect(parseTraceId("00-00000000000000000000000000000000-00f067aa0ba902b7-01")).toBe(""); + }); + + it("accepts uppercase hex", () => { + const tid = "4BF92F3577B34DA6A3CE929D0E0E4736"; + expect(parseTraceId(`00-${tid}-00f067aa0ba902b7-01`)).toBe(tid); + }); +}); diff --git a/apps/supervisor/src/wideEvents/traceparent.ts b/apps/supervisor/src/wideEvents/traceparent.ts new file mode 100644 index 00000000000..9e84294f067 --- /dev/null +++ b/apps/supervisor/src/wideEvents/traceparent.ts @@ -0,0 +1,39 @@ +/** + * Extracts the trace-id from a W3C `traceparent` header. Returns "" when the + * header is absent, malformed, or carries an all-zero trace-id. + * + * Format: `---` + * version : 2 hex chars, must be "00" + * trace-id: 32 hex chars, non-zero + * span-id : 16 hex chars (not validated - we only need trace-id) + * flags : 2 hex chars (not validated) + */ +export function parseTraceId(header: string | null | undefined): string { + if (!header) return ""; + const parts = header.split("-"); + if (parts.length !== 4) return ""; + if (parts[0] !== "00") return ""; + const tid = parts[1]; + if (!tid || tid.length !== 32) return ""; + if (!isHex(tid)) return ""; + if (isAllZero(tid)) return ""; + return tid; +} + +function isHex(s: string): boolean { + for (let i = 0; i < s.length; i++) { + const c = s.charCodeAt(i); + const isDigit = c >= 0x30 && c <= 0x39; + const isLower = c >= 0x61 && c <= 0x66; + const isUpper = c >= 0x41 && c <= 0x46; + if (!isDigit && !isLower && !isUpper) return false; + } + return true; +} + +function isAllZero(s: string): boolean { + for (let i = 0; i < s.length; i++) { + if (s.charCodeAt(i) !== 0x30) return false; + } + return true; +} diff --git a/apps/supervisor/src/workloadManager/compute.ts b/apps/supervisor/src/workloadManager/compute.ts new file mode 100644 index 00000000000..1c00f33aad3 --- /dev/null +++ b/apps/supervisor/src/workloadManager/compute.ts @@ -0,0 +1,374 @@ +import { SimpleStructuredLogger } from "@trigger.dev/core/v3/utils/structuredLogger"; +import { parseTraceparent } from "@trigger.dev/core/v3/isomorphic"; +import { flattenAttributes } from "@trigger.dev/core/v3/utils/flattenAttributes"; +import { + type WorkloadManager, + type WorkloadManagerCreateOptions, + type WorkloadManagerOptions, +} from "./types.js"; +import { ComputeClient, stripImageDigest } from "@internal/compute"; +import { extractTraceparent, getRunnerId } from "../util.js"; +import type { OtlpTraceService } from "../services/otlpTraceService.js"; +import { tryCatch } from "@trigger.dev/core"; + +type ComputeWorkloadManagerOptions = WorkloadManagerOptions & { + gateway: { + url: string; + authToken?: string; + timeoutMs: number; + }; + snapshots: { + enabled: boolean; + delayMs: number; + dispatchLimit: number; + callbackUrl: string; + }; + tracing?: OtlpTraceService; + runner: { + instanceName: string; + otelEndpoint: string; + prettyLogs: boolean; + }; +}; + +export class ComputeWorkloadManager implements WorkloadManager { + private readonly logger = new SimpleStructuredLogger("compute-workload-manager"); + private readonly compute: ComputeClient; + + constructor(private opts: ComputeWorkloadManagerOptions) { + if (opts.workloadApiDomain) { + this.logger.warn("⚠️ Custom workload API domain", { + domain: opts.workloadApiDomain, + }); + } + + this.compute = new ComputeClient({ + gatewayUrl: opts.gateway.url, + authToken: opts.gateway.authToken, + timeoutMs: opts.gateway.timeoutMs, + }); + } + + get snapshotsEnabled(): boolean { + return this.opts.snapshots.enabled; + } + + get snapshotDelayMs(): number { + return this.opts.snapshots.delayMs; + } + + get snapshotDispatchLimit(): number { + return this.opts.snapshots.dispatchLimit; + } + + get traceSpansEnabled(): boolean { + return !!this.opts.tracing; + } + + async create(opts: WorkloadManagerCreateOptions) { + const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); + + const envVars: Record = { + OTEL_EXPORTER_OTLP_ENDPOINT: this.opts.runner.otelEndpoint, + TRIGGER_DEQUEUED_AT_MS: String(opts.dequeuedAt.getTime()), + TRIGGER_POD_SCHEDULED_AT_MS: String(Date.now()), + TRIGGER_ENV_ID: opts.envId, + TRIGGER_DEPLOYMENT_ID: opts.deploymentFriendlyId, + TRIGGER_DEPLOYMENT_VERSION: opts.deploymentVersion, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? "", + TRIGGER_WORKER_INSTANCE_NAME: this.opts.runner.instanceName, + TRIGGER_RUNNER_ID: runnerId, + TRIGGER_MACHINE_CPU: String(opts.machine.cpu), + TRIGGER_MACHINE_MEMORY: String(opts.machine.memory), + PRETTY_LOGS: String(this.opts.runner.prettyLogs), + }; + + if (this.opts.warmStartUrl) { + envVars.TRIGGER_WARM_START_URL = this.opts.warmStartUrl; + } + + if (this.snapshotsEnabled && this.opts.metadataUrl) { + envVars.TRIGGER_METADATA_URL = this.opts.metadataUrl; + } + + if (this.opts.heartbeatIntervalSeconds) { + envVars.TRIGGER_HEARTBEAT_INTERVAL_SECONDS = String(this.opts.heartbeatIntervalSeconds); + } + + if (this.opts.snapshotPollIntervalSeconds) { + envVars.TRIGGER_SNAPSHOT_POLL_INTERVAL_SECONDS = String( + this.opts.snapshotPollIntervalSeconds + ); + } + + if (this.opts.additionalEnvVars) { + Object.assign(envVars, this.opts.additionalEnvVars); + } + + // Strip image digest - resolve by tag, not digest + const imageRef = stripImageDigest(opts.image); + + // Wide event: single canonical log line emitted in finally + const event: Record = { + // High-cardinality identifiers + runId: opts.runFriendlyId, + runnerId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + // Environment + instanceName: this.opts.runner.instanceName, + // Supervisor timing + dequeueResponseMs: opts.dequeueResponseMs, + pollingIntervalMs: opts.pollingIntervalMs, + warmStartCheckMs: opts.warmStartCheckMs, + // Request + image: imageRef, + }; + + const startMs = performance.now(); + + try { + const [error, data] = await tryCatch( + this.compute.instances.create({ + name: runnerId, + image: imageRef, + env: envVars, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, + metadata: { + runId: opts.runFriendlyId, + envId: opts.envId, + envType: opts.envType, + orgId: opts.orgId, + projectId: opts.projectId, + deploymentVersion: opts.deploymentVersion, + machine: opts.machine.name, + }, + }) + ); + + if (error) { + event.error = error instanceof Error ? error.message : String(error); + event.errorType = + error instanceof DOMException && error.name === "TimeoutError" ? "timeout" : "fetch"; + // Intentional: errors are captured in the wide event, not thrown. This matches + // the Docker/K8s managers. The run will eventually time out if scheduling fails. + return; + } + + event.instanceId = data.id; + event.ok = true; + + // Parse timing data from compute response (optional - requires gateway timing flag) + if (data._timing) { + event.timing = data._timing; + } + + this.#emitProvisionSpan(opts, startMs, data._timing); + } finally { + event.durationMs = Math.round(performance.now() - startMs); + event.ok ??= false; + this.logger.debug("create instance", event); + } + } + + async snapshot(opts: { runnerId: string; metadata: Record }): Promise { + const [error] = await tryCatch( + this.compute.instances.snapshot(opts.runnerId, { + callback: { + url: this.opts.snapshots.callbackUrl, + metadata: opts.metadata, + }, + }) + ); + + if (error) { + this.logger.error("snapshot request failed", { + runnerId: opts.runnerId, + error: error instanceof Error ? error.message : String(error), + }); + return false; + } + + this.logger.debug("snapshot request accepted", { runnerId: opts.runnerId }); + return true; + } + + async deleteInstance(runnerId: string): Promise { + const [error] = await tryCatch(this.compute.instances.delete(runnerId)); + + if (error) { + this.logger.error("delete instance failed", { + runnerId, + error: error instanceof Error ? error.message : String(error), + }); + return false; + } + + this.logger.debug("delete instance success", { runnerId }); + return true; + } + + #emitProvisionSpan(opts: WorkloadManagerCreateOptions, startMs: number, timing?: unknown) { + if (!this.traceSpansEnabled) return; + + const parsed = parseTraceparent(extractTraceparent(opts.traceContext)); + if (!parsed) return; + + const endMs = performance.now(); + const now = Date.now(); + const provisionStartEpochMs = now - (endMs - startMs); + const endEpochMs = now; + + // Span starts at dequeue time so events (dequeue) render in the thin-line section + // before "Started". The actual provision call time is in provisionStartEpochMs. + // Subtract 1ms so compute span always sorts before the attempt span (same dequeue time) + const startEpochMs = opts.dequeuedAt.getTime() - 1; + + const spanAttributes: Record = { + "compute.type": "create", + "compute.provision_start_ms": provisionStartEpochMs, + ...(timing + ? (flattenAttributes(timing, "compute") as Record) + : {}), + }; + + if (opts.dequeueResponseMs !== undefined) { + spanAttributes["supervisor.dequeue_response_ms"] = opts.dequeueResponseMs; + } + if (opts.warmStartCheckMs !== undefined) { + spanAttributes["supervisor.warm_start_check_ms"] = opts.warmStartCheckMs; + } + + // Use the platform API URL, not the runner OTLP endpoint (which may be a VM gateway IP) + this.opts.tracing?.emit({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.provision", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": opts.envId, + "ctx.organization.id": opts.orgId, + "ctx.project.id": opts.projectId, + "ctx.run.id": opts.runFriendlyId, + }, + spanAttributes, + }); + } + + async restore(opts: { + snapshotId: string; + runnerId: string; + runFriendlyId: string; + snapshotFriendlyId: string; + machine: { cpu: number; memory: number }; + // Trace context for OTel span emission + traceContext?: Record; + envId?: string; + orgId?: string; + projectId?: string; + dequeuedAt?: Date; + }): Promise { + const metadata: Record = { + TRIGGER_RUNNER_ID: opts.runnerId, + TRIGGER_RUN_ID: opts.runFriendlyId, + TRIGGER_SNAPSHOT_ID: opts.snapshotFriendlyId, + TRIGGER_SUPERVISOR_API_PROTOCOL: this.opts.workloadApiProtocol, + TRIGGER_SUPERVISOR_API_PORT: String(this.opts.workloadApiPort), + TRIGGER_SUPERVISOR_API_DOMAIN: this.opts.workloadApiDomain ?? "", + TRIGGER_WORKER_INSTANCE_NAME: this.opts.runner.instanceName, + }; + + this.logger.verbose("restore request body", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + }); + + const startMs = performance.now(); + + const [error] = await tryCatch( + this.compute.snapshots.restore(opts.snapshotId, { + name: opts.runnerId, + metadata, + cpu: opts.machine.cpu, + memory_gb: opts.machine.memory, + }) + ); + + const durationMs = Math.round(performance.now() - startMs); + + if (error) { + this.logger.error("restore request failed", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + error: error instanceof Error ? error.message : String(error), + durationMs, + }); + return false; + } + + this.logger.debug("restore request success", { + snapshotId: opts.snapshotId, + runnerId: opts.runnerId, + durationMs, + }); + + this.#emitRestoreSpan(opts, startMs); + + return true; + } + + #emitRestoreSpan( + opts: { + snapshotId: string; + runnerId: string; + runFriendlyId: string; + traceContext?: Record; + envId?: string; + orgId?: string; + projectId?: string; + dequeuedAt?: Date; + }, + startMs: number + ) { + if (!this.traceSpansEnabled) return; + + const parsed = parseTraceparent(extractTraceparent(opts.traceContext)); + if (!parsed || !opts.envId || !opts.orgId || !opts.projectId) return; + + const endMs = performance.now(); + const now = Date.now(); + const restoreStartEpochMs = now - (endMs - startMs); + const endEpochMs = now; + + // Subtract 1ms so restore span always sorts before the attempt span + const startEpochMs = (opts.dequeuedAt?.getTime() ?? restoreStartEpochMs) - 1; + + this.opts.tracing?.emit({ + traceId: parsed.traceId, + parentSpanId: parsed.spanId, + spanName: "compute.restore", + startTimeMs: startEpochMs, + endTimeMs: endEpochMs, + resourceAttributes: { + "ctx.environment.id": opts.envId, + "ctx.organization.id": opts.orgId, + "ctx.project.id": opts.projectId, + "ctx.run.id": opts.runFriendlyId, + }, + spanAttributes: { + "compute.type": "restore", + "compute.snapshot_id": opts.snapshotId, + }, + }); + } +} diff --git a/apps/supervisor/src/workloadManager/docker.ts b/apps/supervisor/src/workloadManager/docker.ts index d6651d325a2..66405df9ba5 100644 --- a/apps/supervisor/src/workloadManager/docker.ts +++ b/apps/supervisor/src/workloadManager/docker.ts @@ -62,7 +62,7 @@ export class DockerWorkloadManager implements WorkloadManager { } async create(opts: WorkloadManagerCreateOptions) { - this.logger.log("create()", { opts }); + this.logger.verbose("create()", { opts }); const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); diff --git a/apps/supervisor/src/workloadManager/kubernetes.ts b/apps/supervisor/src/workloadManager/kubernetes.ts index a725971a845..b2ed05c9f11 100644 --- a/apps/supervisor/src/workloadManager/kubernetes.ts +++ b/apps/supervisor/src/workloadManager/kubernetes.ts @@ -100,7 +100,7 @@ export class KubernetesWorkloadManager implements WorkloadManager { } async create(opts: WorkloadManagerCreateOptions) { - this.logger.log("[KubernetesWorkloadManager] Creating container", { opts }); + this.logger.verbose("[KubernetesWorkloadManager] Creating container", { opts }); const runnerId = getRunnerId(opts.runFriendlyId, opts.nextAttemptNumber); @@ -120,7 +120,8 @@ export class KubernetesWorkloadManager implements WorkloadManager { }, spec: { ...this.addPlacementTags(this.#defaultPodSpec, opts.placementTags), - affinity: this.#getNodeAffinity(opts.machine), + affinity: this.#getAffinity(opts), + tolerations: this.#getScheduleTolerations(this.#isScheduledRun(opts)), terminationGracePeriodSeconds: 60 * 60, containers: [ { @@ -320,6 +321,13 @@ export class KubernetesWorkloadManager implements WorkloadManager { }, } : {}), + ...(env.KUBERNETES_POD_DNS_NDOTS_OVERRIDE_ENABLED + ? { + dnsConfig: { + options: [{ name: "ndots", value: `${env.KUBERNETES_POD_DNS_NDOTS}` }], + }, + } + : {}), }; } @@ -335,14 +343,30 @@ export class KubernetesWorkloadManager implements WorkloadManager { }; } + #isScheduledRun(opts: WorkloadManagerCreateOptions): boolean { + return opts.annotations?.rootTriggerSource === "schedule"; + } + #getSharedLabels(opts: WorkloadManagerCreateOptions): Record { - return { + const labels: Record = { env: opts.envId, envtype: this.#envTypeToLabelValue(opts.envType), org: opts.orgId, project: opts.projectId, machine: opts.machine.name, + // We intentionally use a boolean label rather than exposing the full trigger source + // (e.g. sdk, api, cli, mcp, schedule) to keep label cardinality low in metrics. + // The schedule vs non-schedule distinction is all we need for the current metrics + // and pool-level scheduling decisions; finer-grained source breakdowns live in run annotations. + scheduled: String(this.#isScheduledRun(opts)), }; + + // Add privatelink label for CiliumNetworkPolicy matching + if (opts.hasPrivateLink) { + labels.privatelink = opts.orgId; + } + + return labels; } #getResourceRequestsForMachine(preset: MachinePreset): ResourceQuantities { @@ -390,50 +414,159 @@ export class KubernetesWorkloadManager implements WorkloadManager { return preset.name.startsWith("large-"); } - #getNodeAffinity(preset: MachinePreset): k8s.V1Affinity | undefined { - if (!env.KUBERNETES_LARGE_MACHINE_POOL_LABEL) { + #getAffinity(opts: WorkloadManagerCreateOptions): k8s.V1Affinity | undefined { + const largeNodeAffinity = this.#getNodeAffinityRules(opts.machine); + const scheduleNodeAffinity = this.#getScheduleNodeAffinityRules(this.#isScheduledRun(opts)); + const podAffinity = this.#getProjectPodAffinity(opts.projectId); + + // Merge node affinity rules from multiple sources + const preferred = [ + ...(largeNodeAffinity?.preferredDuringSchedulingIgnoredDuringExecution ?? []), + ...(scheduleNodeAffinity?.preferredDuringSchedulingIgnoredDuringExecution ?? []), + ]; + // Only large machine affinity produces hard requirements (non-large runs must stay off the large pool). + // Schedule affinity is soft both ways. + const required = [ + ...(largeNodeAffinity?.requiredDuringSchedulingIgnoredDuringExecution?.nodeSelectorTerms ?? []), + ]; + + const hasNodeAffinity = preferred.length > 0 || required.length > 0; + + if (!hasNodeAffinity && !podAffinity) { + return undefined; + } + + return { + ...(hasNodeAffinity && { + nodeAffinity: { + ...(preferred.length > 0 && { preferredDuringSchedulingIgnoredDuringExecution: preferred }), + ...(required.length > 0 && { + requiredDuringSchedulingIgnoredDuringExecution: { nodeSelectorTerms: required }, + }), + }, + }), + ...(podAffinity && { podAffinity }), + }; + } + + #getNodeAffinityRules(preset: MachinePreset): k8s.V1NodeAffinity | undefined { + if (!env.KUBERNETES_LARGE_MACHINE_AFFINITY_ENABLED) { return undefined; } if (this.#isLargeMachine(preset)) { // soft preference for the large-machine pool, falls back to standard if unavailable return { - nodeAffinity: { - preferredDuringSchedulingIgnoredDuringExecution: [ - { - weight: 100, - preference: { - matchExpressions: [ - { - key: "node.cluster.x-k8s.io/machinepool", - operator: "In", - values: [env.KUBERNETES_LARGE_MACHINE_POOL_LABEL], - }, - ], - }, + preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_LARGE_MACHINE_AFFINITY_WEIGHT, + preference: { + matchExpressions: [ + { + key: env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY, + operator: "In", + values: [env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE], + }, + ], }, - ], - }, + }, + ], }; } // not schedulable in the large-machine pool return { - nodeAffinity: { - requiredDuringSchedulingIgnoredDuringExecution: { - nodeSelectorTerms: [ - { + requiredDuringSchedulingIgnoredDuringExecution: { + nodeSelectorTerms: [ + { + matchExpressions: [ + { + key: env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_KEY, + operator: "NotIn", + values: [env.KUBERNETES_LARGE_MACHINE_AFFINITY_POOL_LABEL_VALUE], + }, + ], + }, + ], + }, + }; + } + + #getScheduleNodeAffinityRules(isScheduledRun: boolean): k8s.V1NodeAffinity | undefined { + if (!env.KUBERNETES_SCHEDULED_RUN_AFFINITY_ENABLED || !env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE) { + return undefined; + } + + if (isScheduledRun) { + // soft preference for the schedule pool + return { + preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_WEIGHT, + preference: { matchExpressions: [ { - key: "node.cluster.x-k8s.io/machinepool", - operator: "NotIn", - values: [env.KUBERNETES_LARGE_MACHINE_POOL_LABEL], + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, + operator: "In", + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], }, ], }, - ], + }, + ], + }; + } + + // soft anti-affinity: non-schedule runs prefer to avoid the schedule pool + return { + preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_SCHEDULED_RUN_ANTI_AFFINITY_WEIGHT, + preference: { + matchExpressions: [ + { + key: env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_KEY, + operator: "NotIn", + values: [env.KUBERNETES_SCHEDULED_RUN_AFFINITY_POOL_LABEL_VALUE], + }, + ], + }, }, - }, + ], + }; + } + + #getScheduleTolerations(isScheduledRun: boolean): k8s.V1Toleration[] | undefined { + if (!isScheduledRun || !env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS?.length) { + return undefined; + } + + return env.KUBERNETES_SCHEDULED_RUN_TOLERATIONS; + } + + #getProjectPodAffinity(projectId: string): k8s.V1PodAffinity | undefined { + if (!env.KUBERNETES_PROJECT_AFFINITY_ENABLED) { + return undefined; + } + + return { + preferredDuringSchedulingIgnoredDuringExecution: [ + { + weight: env.KUBERNETES_PROJECT_AFFINITY_WEIGHT, + podAffinityTerm: { + labelSelector: { + matchExpressions: [ + { + key: "project", + operator: "In", + values: [projectId], + }, + ], + }, + topologyKey: env.KUBERNETES_PROJECT_AFFINITY_TOPOLOGY_KEY, + }, + }, + ], }; } } diff --git a/apps/supervisor/src/workloadManager/types.ts b/apps/supervisor/src/workloadManager/types.ts index 90b61957795..86199afe469 100644 --- a/apps/supervisor/src/workloadManager/types.ts +++ b/apps/supervisor/src/workloadManager/types.ts @@ -1,4 +1,4 @@ -import type { EnvironmentType, MachinePreset, PlacementTag } from "@trigger.dev/core/v3"; +import type { EnvironmentType, MachinePreset, PlacementTag, RunAnnotations } from "@trigger.dev/core/v3"; export interface WorkloadManagerOptions { workloadApiProtocol: "http" | "https"; @@ -24,6 +24,10 @@ export interface WorkloadManagerCreateOptions { nextAttemptNumber?: number; dequeuedAt: Date; placementTags?: PlacementTag[]; + // Timing context (populated by supervisor handler, included in wide event) + dequeueResponseMs?: number; + pollingIntervalMs?: number; + warmStartCheckMs?: number; // identifiers envId: string; envType: EnvironmentType; @@ -35,4 +39,9 @@ export interface WorkloadManagerCreateOptions { runFriendlyId: string; snapshotId: string; snapshotFriendlyId: string; + // Trace context for OTel span emission (W3C format: { traceparent: "00-...", tracestate?: "..." }) + traceContext?: Record; + annotations?: RunAnnotations; + // private networking + hasPrivateLink?: boolean; } diff --git a/apps/supervisor/src/workloadServer/index.ts b/apps/supervisor/src/workloadServer/index.ts index 35d53d36099..ea7728e6376 100644 --- a/apps/supervisor/src/workloadServer/index.ts +++ b/apps/supervisor/src/workloadServer/index.ts @@ -24,6 +24,21 @@ import { HttpServer, type CheckpointClient } from "@trigger.dev/core/v3/serverOn import { type IncomingMessage } from "node:http"; import { register } from "../metrics.js"; import { env } from "../env.js"; +import { SnapshotCallbackPayloadSchema } from "@internal/compute"; +import { + ComputeSnapshotService, + type RunTraceContext, +} from "../services/computeSnapshotService.js"; +import type { ComputeWorkloadManager } from "../workloadManager/compute.js"; +import type { OtlpTraceService } from "../services/otlpTraceService.js"; +import type { ServerResponse } from "node:http"; +import { + emitOneShot, + runWideEvent, + setMeta, + type State, + type WideEventOptions, +} from "../wideEvents/index.js"; // Use the official export when upgrading to socket.io@4.8.0 interface DefaultEventsMap { @@ -58,12 +73,20 @@ type WorkloadServerOptions = { host?: string; workerClient: SupervisorHttpClient; checkpointClient?: CheckpointClient; + computeManager?: ComputeWorkloadManager; + tracing?: OtlpTraceService; + wideEventOpts: WideEventOptions; + /** When true, high-frequency HTTP routes also emit wide events. */ + wideEventsNoisyRoutes: boolean; }; export class WorkloadServer extends EventEmitter { private checkpointClient?: CheckpointClient; + private readonly snapshotService?: ComputeSnapshotService; private readonly logger = new SimpleStructuredLogger("workload-server"); + private readonly wideEventOpts: WideEventOptions; + private readonly wideEventsNoisyRoutes: boolean; private readonly httpServer: HttpServer; private readonly websocketServer: Namespace< @@ -93,6 +116,16 @@ export class WorkloadServer extends EventEmitter { this.workerClient = opts.workerClient; this.checkpointClient = opts.checkpointClient; + this.wideEventOpts = opts.wideEventOpts; + this.wideEventsNoisyRoutes = opts.wideEventsNoisyRoutes; + + if (opts.computeManager?.snapshotsEnabled) { + this.snapshotService = new ComputeSnapshotService({ + computeManager: opts.computeManager, + workerClient: opts.workerClient, + tracing: opts.tracing, + }); + } this.httpServer = this.createHttpServer({ host, port }); this.websocketServer = this.createWebsocketServer(); @@ -124,6 +157,56 @@ export class WorkloadServer extends EventEmitter { return this.headerValueFromRequest(req, WORKLOAD_HEADERS.PROJECT_REF); } + /** + * Sets common route meta on the wide-event state from URL params. + */ + private attachRouteMeta(state: State, params: unknown): void { + if (!params || typeof params !== "object") return; + const p = params as Record; + if (typeof p.runFriendlyId === "string") setMeta(state, "run_id", p.runFriendlyId); + if (typeof p.snapshotFriendlyId === "string") { + setMeta(state, "snapshot_id", p.snapshotFriendlyId); + } + if (typeof p.deploymentId === "string") setMeta(state, "deployment_id", p.deploymentId); + } + + /** + * Wraps an HTTP route handler body with the wide-event lifecycle. Reads + * `traceparent` and `x-request-id` from `req.headers`, attaches `run_id` / + * `snapshot_id` / `deployment_id` meta from `params` when present, and + * captures the response status from `res.statusCode` after `fn` returns. + * + * Pass `highFrequency: true` for noisy routes (heartbeat, polling). Those + * still go through the wrapper but only emit when + * `TRIGGER_WIDE_EVENTS_NOISY_ROUTES` is on, so prod can keep them dark + * while test envs capture full-fidelity traffic for debugging. + */ + private wideRoute( + ctx: { req: IncomingMessage; res: ServerResponse; params?: unknown }, + route: string, + method: string, + fn: () => Promise | T, + routeOpts: { highFrequency?: boolean } = {} + ): Promise { + const enabled = + this.wideEventOpts.enabled && (!routeOpts.highFrequency || this.wideEventsNoisyRoutes); + return runWideEvent( + { + ...this.wideEventOpts, + enabled, + route, + method, + traceparent: this.headerValueFromRequest(ctx.req, "traceparent"), + inboundRequestId: this.headerValueFromRequest(ctx.req, "x-request-id"), + setup: (state) => this.attachRouteMeta(state, ctx.params), + }, + fn, + (state) => { + state.statusCode = ctx.res.statusCode; + } + ); + } + private createHttpServer({ host, port }: { host: string; port: number }) { const httpServer = new HttpServer({ port, @@ -144,26 +227,33 @@ export class WorkloadServer extends EventEmitter { { paramsSchema: WorkloadActionParams, bodySchema: WorkloadRunAttemptStartRequestBody, - handler: async ({ req, reply, params, body }) => { - const startResponse = await this.workerClient.startRunAttempt( - params.runFriendlyId, - params.snapshotFriendlyId, - body, - this.runnerIdFromRequest(req) - ); - - if (!startResponse.success) { - this.logger.error("Failed to start run", { - params, - error: startResponse.error, - }); - reply.empty(500); - return; - } - - reply.json(startResponse.data satisfies WorkloadRunAttemptStartResponseBody); - return; - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/attempts/start", + "POST", + async () => { + const { req, reply, params, body } = ctx; + const startResponse = await this.workerClient.startRunAttempt( + params.runFriendlyId, + params.snapshotFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + + if (!startResponse.success) { + this.logger.error("Failed to start run", { + params, + error: startResponse.error, + }); + reply.empty(500); + return; + } + + reply.json(startResponse.data satisfies WorkloadRunAttemptStartResponseBody); + return; + } + ), } ) .route( @@ -172,26 +262,35 @@ export class WorkloadServer extends EventEmitter { { paramsSchema: WorkloadActionParams, bodySchema: WorkloadRunAttemptCompleteRequestBody, - handler: async ({ req, reply, params, body }) => { - const completeResponse = await this.workerClient.completeRunAttempt( - params.runFriendlyId, - params.snapshotFriendlyId, - body, - this.runnerIdFromRequest(req) - ); - - if (!completeResponse.success) { - this.logger.error("Failed to complete run", { - params, - error: completeResponse.error, - }); - reply.empty(500); - return; - } - - reply.json(completeResponse.data satisfies WorkloadRunAttemptCompleteResponseBody); - return; - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/attempts/complete", + "POST", + async () => { + const { req, reply, params, body } = ctx; + const completeResponse = await this.workerClient.completeRunAttempt( + params.runFriendlyId, + params.snapshotFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + + if (!completeResponse.success) { + this.logger.error("Failed to complete run", { + params, + error: completeResponse.error, + }); + reply.empty(500); + return; + } + + reply.json( + completeResponse.data satisfies WorkloadRunAttemptCompleteResponseBody + ); + return; + } + ), } ) .route( @@ -200,27 +299,35 @@ export class WorkloadServer extends EventEmitter { { paramsSchema: WorkloadActionParams, bodySchema: WorkloadHeartbeatRequestBody, - handler: async ({ req, reply, params, body }) => { - const heartbeatResponse = await this.workerClient.heartbeatRun( - params.runFriendlyId, - params.snapshotFriendlyId, - body, - this.runnerIdFromRequest(req) - ); - - if (!heartbeatResponse.success) { - this.logger.error("Failed to heartbeat run", { - params, - error: heartbeatResponse.error, - }); - reply.empty(500); - return; - } - - reply.json({ - ok: true, - } satisfies WorkloadHeartbeatResponseBody); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/heartbeat", + "POST", + async () => { + const { req, reply, params, body } = ctx; + const heartbeatResponse = await this.workerClient.heartbeatRun( + params.runFriendlyId, + params.snapshotFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + + if (!heartbeatResponse.success) { + this.logger.error("Failed to heartbeat run", { + params, + error: heartbeatResponse.error, + }); + reply.empty(500); + return; + } + + reply.json({ + ok: true, + } satisfies WorkloadHeartbeatResponseBody); + }, + { highFrequency: true } + ), } ) .route( @@ -228,66 +335,94 @@ export class WorkloadServer extends EventEmitter { "GET", { paramsSchema: WorkloadActionParams, - handler: async ({ reply, params, req }) => { - this.logger.debug("Suspend request", { params, headers: req.headers }); - - if (!this.checkpointClient) { - reply.json( - { - ok: false, - error: "Checkpoints disabled", - } satisfies WorkloadSuspendRunResponseBody, - false, - 400 - ); - return; - } - - const runnerId = this.runnerIdFromRequest(req); - const deploymentVersion = this.deploymentVersionFromRequest(req); - const projectRef = this.projectRefFromRequest(req); - - if (!runnerId || !deploymentVersion || !projectRef) { - this.logger.error("Invalid headers for suspend request", { - ...params, - headers: req.headers, - }); - reply.json( - { - ok: false, - error: "Invalid headers", - } satisfies WorkloadSuspendRunResponseBody, - false, - 400 - ); - return; - } - - reply.json( - { - ok: true, - } satisfies WorkloadSuspendRunResponseBody, - false, - 202 - ); - - const suspendResult = await this.checkpointClient.suspendRun({ - runFriendlyId: params.runFriendlyId, - snapshotFriendlyId: params.snapshotFriendlyId, - body: { - runnerId, - runId: params.runFriendlyId, - snapshotId: params.snapshotFriendlyId, - projectRef, - deploymentVersion, - }, - }); - - if (!suspendResult) { - this.logger.error("Failed to suspend run", { params }); - return; - } - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/suspend", + "GET", + async () => { + const { reply, params, req } = ctx; + const runnerId = this.runnerIdFromRequest(req); + const deploymentVersion = this.deploymentVersionFromRequest(req); + const projectRef = this.projectRefFromRequest(req); + + this.logger.debug("Suspend request", { + params, + runnerId, + deploymentVersion, + projectRef, + }); + + if (!runnerId || !deploymentVersion || !projectRef) { + this.logger.error("Invalid headers for suspend request", { + ...params, + runnerId, + deploymentVersion, + projectRef, + }); + reply.json( + { + ok: false, + error: "Invalid headers", + } satisfies WorkloadSuspendRunResponseBody, + false, + 400 + ); + return; + } + + if (this.snapshotService) { + // Compute mode: delay snapshot to avoid wasted work on short-lived waitpoints. + // If the run continues before the delay expires, the snapshot is cancelled. + reply.json({ ok: true } satisfies WorkloadSuspendRunResponseBody, false, 202); + + this.snapshotService.schedule(params.runFriendlyId, { + runnerId, + runFriendlyId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, + }); + + return; + } + + if (!this.checkpointClient) { + reply.json( + { + ok: false, + error: "Checkpoints disabled", + } satisfies WorkloadSuspendRunResponseBody, + false, + 400 + ); + return; + } + + reply.json( + { + ok: true, + } satisfies WorkloadSuspendRunResponseBody, + false, + 202 + ); + + const suspendResult = await this.checkpointClient.suspendRun({ + runFriendlyId: params.runFriendlyId, + snapshotFriendlyId: params.snapshotFriendlyId, + body: { + runnerId, + runId: params.runFriendlyId, + snapshotId: params.snapshotFriendlyId, + projectRef, + deploymentVersion, + }, + }); + + if (!suspendResult) { + this.logger.error("Failed to suspend run", { params }); + return; + } + } + ), } ) .route( @@ -295,30 +430,40 @@ export class WorkloadServer extends EventEmitter { "GET", { paramsSchema: WorkloadActionParams, - handler: async ({ req, reply, params }) => { - this.logger.debug("Run continuation request", { params }); - - const continuationResult = await this.workerClient.continueRunExecution( - params.runFriendlyId, - params.snapshotFriendlyId, - this.runnerIdFromRequest(req) - ); - - if (!continuationResult.success) { - this.logger.error("Failed to continue run execution", { params }); - reply.json( - { - ok: false, - error: "Failed to continue run execution", - }, - false, - 400 - ); - return; - } - - reply.json(continuationResult.data as WorkloadContinueRunExecutionResponseBody); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/:snapshotFriendlyId/continue", + "GET", + async () => { + const { req, reply, params } = ctx; + this.logger.debug("Run continuation request", { params }); + + // Cancel any pending delayed snapshot for this run + this.snapshotService?.cancel(params.runFriendlyId); + + const continuationResult = await this.workerClient.continueRunExecution( + params.runFriendlyId, + params.snapshotFriendlyId, + this.runnerIdFromRequest(req) + ); + + if (!continuationResult.success) { + this.logger.error("Failed to continue run execution", { params }); + reply.json( + { + ok: false, + error: "Failed to continue run execution", + }, + false, + 400 + ); + return; + } + + reply.json(continuationResult.data as WorkloadContinueRunExecutionResponseBody); + } + ), } ) .route( @@ -326,24 +471,34 @@ export class WorkloadServer extends EventEmitter { "GET", { paramsSchema: WorkloadActionParams, - handler: async ({ req, reply, params }) => { - const sinceSnapshotResponse = await this.workerClient.getSnapshotsSince( - params.runFriendlyId, - params.snapshotFriendlyId, - this.runnerIdFromRequest(req) - ); - - if (!sinceSnapshotResponse.success) { - this.logger.error("Failed to get snapshots since", { - runId: params.runFriendlyId, - error: sinceSnapshotResponse.error, - }); - reply.empty(500); - return; - } - - reply.json(sinceSnapshotResponse.data satisfies WorkloadRunSnapshotsSinceResponseBody); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "/api/v1/workload-actions/runs/:runFriendlyId/snapshots/since/:snapshotFriendlyId", + "GET", + async () => { + const { req, reply, params } = ctx; + const sinceSnapshotResponse = await this.workerClient.getSnapshotsSince( + params.runFriendlyId, + params.snapshotFriendlyId, + this.runnerIdFromRequest(req) + ); + + if (!sinceSnapshotResponse.success) { + this.logger.error("Failed to get snapshots since", { + runId: params.runFriendlyId, + error: sinceSnapshotResponse.error, + }); + reply.empty(500); + return; + } + + reply.json( + sinceSnapshotResponse.data satisfies WorkloadRunSnapshotsSinceResponseBody + ); + }, + { highFrequency: true } + ), } ) .route("/api/v1/workload-actions/deployments/:deploymentId/dequeue", "GET", { @@ -351,49 +506,87 @@ export class WorkloadServer extends EventEmitter { deploymentId: z.string(), }), - handler: async ({ req, reply, params }) => { - const dequeueResponse = await this.workerClient.dequeueFromVersion( - params.deploymentId, - 1, - this.runnerIdFromRequest(req) - ); - - if (!dequeueResponse.success) { - this.logger.error("Failed to get latest snapshot", { - deploymentId: params.deploymentId, - error: dequeueResponse.error, - }); - reply.empty(500); - return; - } + handler: async (ctx) => + this.wideRoute( + ctx, + "/api/v1/workload-actions/deployments/:deploymentId/dequeue", + "GET", + async () => { + const { req, reply, params } = ctx; + const dequeueResponse = await this.workerClient.dequeueFromVersion( + params.deploymentId, + 1, + this.runnerIdFromRequest(req) + ); - reply.json(dequeueResponse.data satisfies WorkloadDequeueFromVersionResponseBody); - }, + if (!dequeueResponse.success) { + this.logger.error("Failed to get latest snapshot", { + deploymentId: params.deploymentId, + error: dequeueResponse.error, + }); + reply.empty(500); + return; + } + + reply.json(dequeueResponse.data satisfies WorkloadDequeueFromVersionResponseBody); + } + ), }); if (env.SEND_RUN_DEBUG_LOGS) { httpServer.route("/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", "POST", { paramsSchema: WorkloadActionParams.pick({ runFriendlyId: true }), bodySchema: WorkloadDebugLogRequestBody, - handler: async ({ req, reply, params, body }) => { - reply.empty(204); - - await this.workerClient.sendDebugLog( - params.runFriendlyId, - body, - this.runnerIdFromRequest(req) - ); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", + "POST", + async () => { + const { req, reply, params, body } = ctx; + reply.empty(204); + + await this.workerClient.sendDebugLog( + params.runFriendlyId, + body, + this.runnerIdFromRequest(req) + ); + }, + { highFrequency: true } + ), }); } else { // Lightweight mock route without schemas httpServer.route("/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", "POST", { - handler: async ({ reply }) => { - reply.empty(204); - }, + handler: async (ctx) => + this.wideRoute( + ctx, + "/api/v1/workload-actions/runs/:runFriendlyId/logs/debug", + "POST", + async () => { + ctx.reply.empty(204); + }, + { highFrequency: true } + ), }); } + // Snapshot callback endpoint (inbound from compute path) + httpServer.route("/api/v1/compute/snapshot-complete", "POST", { + bodySchema: SnapshotCallbackPayloadSchema, + handler: async (ctx) => + this.wideRoute(ctx, "/api/v1/compute/snapshot-complete", "POST", async () => { + const { reply, body } = ctx; + if (!this.snapshotService) { + reply.empty(404); + return; + } + + const result = await this.snapshotService.handleCallback(body); + reply.empty(result.status); + }), + }); + return httpServer; } @@ -408,7 +601,7 @@ export class WorkloadServer extends EventEmitter { > = io.of("/workload"); websocketServer.on("disconnect", (socket) => { - this.logger.log("[WS] disconnect", socket.id); + this.logger.verbose("[WS] disconnect", socket.id); }); websocketServer.use(async (socket, next) => { const setSocketDataFromHeader = ( @@ -464,6 +657,26 @@ export class WorkloadServer extends EventEmitter { }; }; + const emitSocketLifecycle = ( + event: "run_connected" | "run_disconnected", + friendlyId: string, + disconnectReason?: string + ) => { + emitOneShot({ + ...this.wideEventOpts, + populate: (state) => { + state.extras.event = event; + setMeta(state, "run_id", friendlyId); + if (socket.data.deploymentId) { + setMeta(state, "deployment_id", socket.data.deploymentId); + } + if (socket.data.runnerId) setMeta(state, "runner_id", socket.data.runnerId); + state.extras.socket_id = socket.id; + if (disconnectReason) state.extras.disconnect_reason = disconnectReason; + }, + }); + }; + const runConnected = (friendlyId: string) => { socketLogger.debug("runConnected", { ...getSocketMetadata() }); @@ -474,23 +687,25 @@ export class WorkloadServer extends EventEmitter { newRunId: friendlyId, oldRunId: socket.data.runFriendlyId, }); - runDisconnected(socket.data.runFriendlyId); + runDisconnected(socket.data.runFriendlyId, "socket_run_replaced"); } this.runSockets.set(friendlyId, socket); this.emit("runConnected", { run: { friendlyId } }); socket.data.runFriendlyId = friendlyId; + emitSocketLifecycle("run_connected", friendlyId); }; - const runDisconnected = (friendlyId: string) => { + const runDisconnected = (friendlyId: string, reason: string) => { socketLogger.debug("runDisconnected", { ...getSocketMetadata() }); this.runSockets.delete(friendlyId); this.emit("runDisconnected", { run: { friendlyId } }); socket.data.runFriendlyId = undefined; + emitSocketLifecycle("run_disconnected", friendlyId, reason); }; - socketLogger.log("wsServer socket connected", { ...getSocketMetadata() }); + socketLogger.debug("wsServer socket connected", { ...getSocketMetadata() }); // FIXME: where does this get set? if (socket.data.runFriendlyId) { @@ -498,15 +713,19 @@ export class WorkloadServer extends EventEmitter { } socket.on("disconnecting", (reason, description) => { - socketLogger.log("Socket disconnecting", { ...getSocketMetadata(), reason, description }); + socketLogger.verbose("Socket disconnecting", { + ...getSocketMetadata(), + reason, + description, + }); if (socket.data.runFriendlyId) { - runDisconnected(socket.data.runFriendlyId); + runDisconnected(socket.data.runFriendlyId, `socket_disconnecting:${reason}`); } }); socket.on("disconnect", (reason, description) => { - socketLogger.log("Socket disconnected", { ...getSocketMetadata(), reason, description }); + socketLogger.debug("Socket disconnected", { ...getSocketMetadata(), reason, description }); }); socket.on("error", (error) => { @@ -527,7 +746,7 @@ export class WorkloadServer extends EventEmitter { ...message, }); - log.log("Handling run:start"); + log.debug("Handling run:start"); try { runConnected(message.run.friendlyId); @@ -543,10 +762,13 @@ export class WorkloadServer extends EventEmitter { ...message, }); - log.log("Handling run:stop"); + log.debug("Handling run:stop"); try { - runDisconnected(message.run.friendlyId); + runDisconnected(message.run.friendlyId, "run_stop_message"); + // Don't delete trace context here - run:stop fires after each snapshot/shutdown + // but the run may be restored on a new VM and snapshot again. Trace context is + // re-populated on dequeue, and entries are small (4 strings per run). } catch (error) { log.error("run:stop error", { error }); } @@ -588,11 +810,16 @@ export class WorkloadServer extends EventEmitter { } } + registerRunTraceContext(runFriendlyId: string, ctx: RunTraceContext) { + this.snapshotService?.registerTraceContext(runFriendlyId, ctx); + } + async start() { await this.httpServer.start(); } async stop() { + this.snapshotService?.stop(); await this.httpServer.stop(); } } diff --git a/apps/webapp/CLAUDE.md b/apps/webapp/CLAUDE.md new file mode 100644 index 00000000000..a4de6ab57b7 --- /dev/null +++ b/apps/webapp/CLAUDE.md @@ -0,0 +1,131 @@ +# Webapp + +Remix 2.17.4 app serving as the main API, dashboard, and orchestration engine. Uses an Express server (`server.ts`). + +## Verifying Changes + +**Never run `pnpm run build --filter webapp` to verify changes.** Building proves almost nothing about correctness. The webapp is an app, not a public package — use typecheck from the repo root: + +```bash +pnpm run typecheck --filter webapp # ~1-2 minutes +``` + +Only run typecheck after major changes (new files, significant refactors, schema changes). For small edits, trust the types and let CI catch issues. + +Note: Public packages (`packages/*`) use `build` instead. See the root CLAUDE.md for details. + +## Testing Dashboard Changes with Chrome DevTools MCP + +Use the `chrome-devtools` MCP server to visually verify local dashboard changes. The webapp must be running (`pnpm run dev --filter webapp` from repo root). + +### Login + +``` +1. mcp__chrome-devtools__new_page(url: "http://localhost:3030") + → Redirects to /login +2. mcp__chrome-devtools__click the "Continue with Email" link +3. mcp__chrome-devtools__fill the email field with "local@trigger.dev" +4. mcp__chrome-devtools__click "Send a magic link" + → Auto-logs in and redirects to the dashboard (no email verification needed locally) +``` + +### Navigating and Verifying + +- **take_snapshot**: Get an a11y tree of the page (text content, element UIDs for interaction). Prefer this over screenshots for understanding page structure. +- **take_screenshot**: Capture what the page looks like visually. Use to verify styling, layout, and visual changes. +- **navigate_page**: Go to specific URLs, e.g. `http://localhost:3030/orgs/references-bc08/projects/hello-world-SiWs/env/dev/runs` +- **click / fill**: Interact with elements using UIDs from `take_snapshot`. +- **evaluate_script**: Run JS in the browser console for debugging. +- **list_console_messages**: Check for console errors after navigating. + +### Tips + +- Snapshots can be very large on complex pages (200K+ chars). Use `take_screenshot` first to orient, then `take_snapshot` only when you need element UIDs to interact. +- The local seeded user email is `local@trigger.dev`. +- Dashboard URL pattern: `http://localhost:3030/orgs/{orgSlug}/projects/{projectSlug}/env/{envSlug}/{section}` + +## Key File Locations + +- **Trigger API**: `app/routes/api.v1.tasks.$taskId.trigger.ts` +- **Batch trigger**: `app/routes/api.v1.tasks.batch.ts` +- **OTEL endpoints**: `app/routes/otel.v1.logs.ts`, `app/routes/otel.v1.traces.ts` +- **Prisma setup**: `app/db.server.ts` +- **Run engine config**: `app/v3/runEngine.server.ts` +- **Services**: `app/v3/services/**/*.server.ts` +- **Presenters**: `app/v3/presenters/**/*.server.ts` + +## Route Convention + +Routes use Remix flat-file convention with dot-separated segments: +`api.v1.tasks.$taskId.trigger.ts` -> `/api/v1/tasks/:taskId/trigger` + +## Abort Signals + +**Never use `request.signal`** for detecting client disconnects. It is broken due to a Node.js bug ([nodejs/node#55428](https://github.com/nodejs/node/issues/55428)) where the AbortSignal chain is severed when Remix internally clones the Request object. Instead, use `getRequestAbortSignal()` from `app/services/httpAsyncStorage.server.ts`, which is wired directly to Express `res.on("close")` and fires reliably. + +```typescript +import { getRequestAbortSignal } from "~/services/httpAsyncStorage.server"; + +// In route handlers, SSE streams, or any server-side code: +const signal = getRequestAbortSignal(); +``` + +## Environment Variables + +Access via `env` export from `app/env.server.ts`. **Never use `process.env` directly.** + +For testable code, **never import env.server.ts** in test files. Pass configuration as options instead: +- `realtimeClient.server.ts` (testable service, takes config as constructor arg) +- `realtimeClientGlobal.server.ts` (creates singleton with env config) + +## Run Engine 2.0 + +The webapp integrates `@internal/run-engine` via `app/v3/runEngine.server.ts`. This is the singleton engine instance. Services in `app/v3/services/` call engine methods for all run lifecycle operations (triggering, completing, cancelling, etc.). + +The `engineVersion.server.ts` file determines V1 vs V2 for a given environment. New code should always target V2. + +## Background Workers + +Background job workers use `@trigger.dev/redis-worker`: +- `app/v3/commonWorker.server.ts` +- `app/v3/alertsWorker.server.ts` +- `app/v3/batchTriggerWorker.server.ts` + +Do NOT add new jobs using zodworker/graphile-worker (legacy). + +## Real-time + +- Socket.io: `app/v3/handleSocketIo.server.ts`, `app/v3/handleWebsockets.server.ts` +- Electric SQL: Powers real-time data sync for the dashboard + +## Legacy V1 Code + +The `app/v3/` directory name is misleading - most code is actively used by V2. Only these specific files are V1-only legacy: +- `app/v3/marqs/` (old MarQS queue system) +- `app/v3/legacyRunEngineWorker.server.ts` +- `app/v3/services/triggerTaskV1.server.ts` +- `app/v3/services/cancelTaskRunV1.server.ts` +- `app/v3/authenticatedSocketConnection.server.ts` +- `app/v3/sharedSocketConnection.ts` + +Some services (e.g., `cancelTaskRun.server.ts`, `batchTriggerV3.server.ts`) branch on `RunEngineVersion` to support both V1 and V2. When editing these, only modify V2 code paths. + +## Performance: Trigger Hot Path + +The `triggerTask.server.ts` service is the **highest-throughput code path** in the system. Every API trigger call goes through it. Keep it fast: + +- **Do NOT add database queries** to `triggerTask.server.ts` or `batchTriggerV3.server.ts`. Task defaults (TTL, etc.) are resolved via `backgroundWorkerTask.findFirst()` in the queue concern (`queues.server.ts`) - one query per request, in mutually exclusive branches depending on locked/non-locked path. Piggyback on the existing query instead of adding new ones. +- **Two-stage resolution pattern**: Task metadata is resolved in two stages by design: + 1. **Trigger time** (`triggerTask.server.ts`): Only TTL is resolved from task defaults. Everything else uses whatever the caller provides. + 2. **Dequeue time** (`dequeueSystem.ts`): Full `BackgroundWorkerTask` is loaded and retry config, machine config, maxDuration, etc. are resolved against task defaults. +- If you need to add a new task-level default, **add it to the existing `select` clause** in the `backgroundWorkerTask.findFirst()` query — do NOT add a second query. If the default doesn't need to be known at trigger time, resolve it at dequeue time instead. +- Batch triggers (`batchTriggerV3.server.ts`) follow the same pattern — keep batch paths equally fast. + +## Prisma Query Patterns + +- **Always use `findFirst` instead of `findUnique`.** Prisma's `findUnique` has an implicit DataLoader that batches concurrent calls into a single `IN` query. This batching cannot be disabled and has active bugs even in Prisma 6.x: uppercase UUIDs returning null (#25484, confirmed 6.4.1), composite key SQL correctness issues (#22202), and 5-10x worse performance than manual DataLoader (#6573, open since 2021). `findFirst` is never batched and avoids this entire class of issues. + +## React Patterns + +- Only use `useCallback`/`useMemo` for context provider values, expensive derived data that is a dependency elsewhere, or stable refs required by a dependency array. Don't wrap ordinary event handlers or trivial computations. +- Use named constants for sentinel/placeholder values (e.g. `const UNSET_VALUE = "__unset__"`) instead of raw string literals scattered across comparisons. diff --git a/apps/webapp/app/assets/icons/AIMetricsIcon.tsx b/apps/webapp/app/assets/icons/AIMetricsIcon.tsx new file mode 100644 index 00000000000..038eea70b49 --- /dev/null +++ b/apps/webapp/app/assets/icons/AIMetricsIcon.tsx @@ -0,0 +1,16 @@ +export function AIMetricsIcon({ className }: { className?: string }) { + return ( + + + + + ); +} diff --git a/apps/webapp/app/assets/icons/AIPromptsIcon.tsx b/apps/webapp/app/assets/icons/AIPromptsIcon.tsx new file mode 100644 index 00000000000..dd434df9931 --- /dev/null +++ b/apps/webapp/app/assets/icons/AIPromptsIcon.tsx @@ -0,0 +1,10 @@ +export function AIPromptsIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/AiProviderIcons.tsx b/apps/webapp/app/assets/icons/AiProviderIcons.tsx new file mode 100644 index 00000000000..85a01b98d63 --- /dev/null +++ b/apps/webapp/app/assets/icons/AiProviderIcons.tsx @@ -0,0 +1,177 @@ +type IconProps = { className?: string }; + +export function OpenAIIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function AnthropicIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function GeminiIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function LlamaIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function DeepseekIcon({ className }: IconProps) { + return ( + + + + + + + + + + + ); +} + +export function XAIIcon({ className }: IconProps) { + return ( + + + + + + + ); +} + +export function PerplexityIcon({ className }: IconProps) { + return ( + + + + ); +} + +export function CerebrasIcon({ className }: IconProps) { + return ( + + + + + + + + ); +} + +export function MistralIcon({ className }: IconProps) { + return ( + + + + + + + + + + + + + ); +} + +export function AzureIcon({ className }: IconProps) { + return ( + + + + ); +} + diff --git a/apps/webapp/app/assets/icons/AnthropicLogoIcon.tsx b/apps/webapp/app/assets/icons/AnthropicLogoIcon.tsx new file mode 100644 index 00000000000..3e647284cce --- /dev/null +++ b/apps/webapp/app/assets/icons/AnthropicLogoIcon.tsx @@ -0,0 +1,12 @@ +export function AnthropicLogoIcon({ className }: { className?: string }) { + return ( + + + + ); +} diff --git a/apps/webapp/app/assets/icons/SlackMonoIcon.tsx b/apps/webapp/app/assets/icons/SlackMonoIcon.tsx new file mode 100644 index 00000000000..666393a229d --- /dev/null +++ b/apps/webapp/app/assets/icons/SlackMonoIcon.tsx @@ -0,0 +1,10 @@ +export function SlackMonoIcon({ className }: { className?: string }) { + return ( + + + + + + + ); +} diff --git a/apps/webapp/app/clientBeforeFirstRender.ts b/apps/webapp/app/clientBeforeFirstRender.ts new file mode 100644 index 00000000000..3275c54423a --- /dev/null +++ b/apps/webapp/app/clientBeforeFirstRender.ts @@ -0,0 +1,38 @@ +/** + * Runs once on the client, synchronously, before React hydrates the app. + * Reserved for housekeeping that must happen before any component mounts. + */ +export function clientBeforeFirstRender() { + cleanupLegacyResizablePanelStorage(); +} + +/** + * Earlier versions of the resizable panel library wrote a per-session + * localStorage entry for every PanelGroup, including ones without an + * `autosaveId`. The keys look like `panel-group-react-aria-::` + * and accumulate without bound across sessions until they exhaust the + * ~5 MB origin quota and break subsequent `setItem` calls. + * + * The library no longer behaves this way, but existing users still carry + * the residue. Evict it (plus the orphaned `panel-run-parent-v2` key from + * the v2→v3 autosaveId bump) once on load. + */ +function cleanupLegacyResizablePanelStorage() { + try { + const toRemove: string[] = []; + for (let i = 0; i < window.localStorage.length; i++) { + const key = window.localStorage.key(i); + if ( + key && + (key.startsWith("panel-group-react-aria") || key === "panel-run-parent-v2") + ) { + toRemove.push(key); + } + } + for (const key of toRemove) { + window.localStorage.removeItem(key); + } + } catch { + // localStorage may be disabled (private browsing, security policy) + } +} diff --git a/apps/webapp/app/components/AlphaBadge.tsx b/apps/webapp/app/components/AlphaBadge.tsx index 58da1a994cd..0a1c4a7fc9a 100644 --- a/apps/webapp/app/components/AlphaBadge.tsx +++ b/apps/webapp/app/components/AlphaBadge.tsx @@ -30,3 +30,32 @@ export function AlphaTitle({ children }: { children: React.ReactNode }) { ); } + +export function BetaBadge({ + inline = false, + className, +}: { + inline?: boolean; + className?: string; +}) { + return ( + + Beta + + } + content="This feature is in Beta." + disableHoverableContent + /> + ); +} + +export function BetaTitle({ children }: { children: React.ReactNode }) { + return ( + <> + {children} + + + ); +} diff --git a/apps/webapp/app/components/AskAI.tsx b/apps/webapp/app/components/AskAI.tsx index bc55469b84a..814d4649c8f 100644 --- a/apps/webapp/app/components/AskAI.tsx +++ b/apps/webapp/app/components/AskAI.tsx @@ -118,30 +118,31 @@ function AskAIProvider({ websiteId, isCollapsed = false }: AskAIProviderProps) { -
- + + - -
+ + Ask AI - +
diff --git a/apps/webapp/app/components/BackgroundWrapper.tsx b/apps/webapp/app/components/BackgroundWrapper.tsx index ecff3af6dd4..aaf06d56aaf 100644 --- a/apps/webapp/app/components/BackgroundWrapper.tsx +++ b/apps/webapp/app/components/BackgroundWrapper.tsx @@ -5,10 +5,9 @@ import blurredDashboardBackgroundTable from "~/assets/images/blurred-dashboard-b export function BackgroundWrapper({ children }: { children: ReactNode }) { return ( -
- {/* Left menu top background - fixed width 260px, maintains aspect ratio */} +
- {/* Left menu bottom background - fixed width 260px, maintains aspect ratio */}
- {/* Right table background - fixed width 2000px, positioned next to menu */}
- {/* Content layer */}
{children}
); diff --git a/apps/webapp/app/components/BlankStatePanels.tsx b/apps/webapp/app/components/BlankStatePanels.tsx index 7423bd61ac8..9a4884e09d3 100644 --- a/apps/webapp/app/components/BlankStatePanels.tsx +++ b/apps/webapp/app/components/BlankStatePanels.tsx @@ -1,4 +1,5 @@ import { + ArrowsRightLeftIcon, BeakerIcon, BellAlertIcon, BookOpenIcon, @@ -8,10 +9,10 @@ import { QuestionMarkCircleIcon, RectangleGroupIcon, RectangleStackIcon, - ServerStackIcon, Squares2X2Icon, } from "@heroicons/react/20/solid"; import { useLocation } from "react-use"; +import { AIPromptsIcon } from "~/assets/icons/AIPromptsIcon"; import { BranchEnvironmentIconSmall } from "~/assets/icons/EnvironmentIcons"; import { WaitpointTokenIcon } from "~/assets/icons/WaitpointTokenIcon"; import openBulkActionsPanel from "~/assets/images/open-bulk-actions-panel.png"; @@ -23,21 +24,28 @@ import { useOrganization } from "~/hooks/useOrganizations"; import { useProject } from "~/hooks/useProject"; import { type MinimumEnvironment } from "~/presenters/SelectBestEnvironmentPresenter.server"; import { NewBranchPanel } from "~/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.branches/route"; +import { GitHubSettingsPanel } from "~/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.github"; import { docsPath, v3BillingPath, v3CreateBulkActionPath, v3EnvironmentPath, - v3EnvironmentVariablesPath, v3NewProjectAlertPath, v3NewSchedulePath, } from "~/utils/pathBuilder"; import { AskAI } from "./AskAI"; +import { CodeBlock } from "./code/CodeBlock"; import { InlineCode } from "./code/InlineCode"; import { environmentFullTitle, EnvironmentIcon } from "./environments/EnvironmentLabel"; import { Feedback } from "./Feedback"; import { EnvironmentSelector } from "./navigation/EnvironmentSelector"; import { Button, LinkButton } from "./primitives/Buttons"; +import { + ClientTabs, + ClientTabsContent, + ClientTabsList, + ClientTabsTrigger, +} from "./primitives/ClientTabs"; import { Header1 } from "./primitives/Headers"; import { InfoPanel } from "./primitives/InfoPanel"; import { Paragraph } from "./primitives/Paragraph"; @@ -52,13 +60,6 @@ import { } from "./SetupCommands"; import { StepContentContainer } from "./StepContentContainer"; import { V4Badge } from "./V4Badge"; -import { - ClientTabs, - ClientTabsContent, - ClientTabsList, - ClientTabsTrigger, -} from "./primitives/ClientTabs"; -import { GitHubSettingsPanel } from "~/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.github"; export function HasNoTasksDev() { return ( @@ -189,6 +190,28 @@ export function BatchesNone() { ); } +export function SessionsNone() { + return ( + + Sessions docs + + } + > + + You have no sessions in this environment. Sessions are durable, typed, bidirectional I/O + primitives that outlive a single run — used by chat.agent and any + long-running task that needs streaming input and output. + + + ); +} + export function TestHasNoTasks() { const organization = useOrganization(); const project = useProject(); @@ -601,7 +624,9 @@ function DeploymentOnboardingSteps() {
- Deploy your tasks to {environmentFullTitle(environment)} + + Deploy your tasks to {environmentFullTitle(environment)} +
@@ -686,3 +711,50 @@ function DeploymentOnboardingSteps() { ); } + +export function PromptsNone() { + return ( + + Prompts docs + + } + > + + Managed prompts let you define AI prompts in code with typesafe variables, then edit and + version them from the dashboard without redeploying. + + + Add a prompt to your project using prompts.define() + : + + + + Deploy your project and your prompts will appear here with version history and a live + editor. + + + ); +} diff --git a/apps/webapp/app/components/BulkActionFilterSummary.tsx b/apps/webapp/app/components/BulkActionFilterSummary.tsx index 073940d7d0a..a2eabc879de 100644 --- a/apps/webapp/app/components/BulkActionFilterSummary.tsx +++ b/apps/webapp/app/components/BulkActionFilterSummary.tsx @@ -215,6 +215,19 @@ export function BulkActionFilterSummary({ /> ); } + case "regions": { + const values = Array.isArray(value) ? value : [`${value}`]; + return ( + + ); + } case "machines": { const values = Array.isArray(value) ? value : [`${value}`]; return ( @@ -228,6 +241,31 @@ export function BulkActionFilterSummary({ /> ); } + case "errorId": { + return ( + + ); + } + case "sources": { + const values = Array.isArray(value) ? value : [`${value}`]; + return ( + + ); + } default: { assertNever(typedKey); } diff --git a/apps/webapp/app/components/GitHubLoginButton.tsx b/apps/webapp/app/components/GitHubLoginButton.tsx index 87238db087e..76a494927cd 100644 --- a/apps/webapp/app/components/GitHubLoginButton.tsx +++ b/apps/webapp/app/components/GitHubLoginButton.tsx @@ -32,8 +32,6 @@ export function OctoKitty({ className }: { className?: string }) { baseProfile="tiny" id="Layer_1" xmlns="http://www.w3.org/2000/svg" - x="0px" - y="0px" viewBox="0 0 2350 2314.8" xmlSpace="preserve" fill="currentColor" diff --git a/apps/webapp/app/components/GitMetadata.tsx b/apps/webapp/app/components/GitMetadata.tsx index efe3fb0efb7..fb53ee6bfea 100644 --- a/apps/webapp/app/components/GitMetadata.tsx +++ b/apps/webapp/app/components/GitMetadata.tsx @@ -25,9 +25,10 @@ export function GitMetadataBranch({ } + leadingIconClassName="group-hover/table-row:text-text-bright" iconSpacing="gap-x-1" to={git.branchUrl} - className="pl-1" + className="pl-1 duration-0 [&_span]:duration-0 [&_span]:group-hover/table-row:text-text-bright" > {git.branchName} @@ -49,8 +50,9 @@ export function GitMetadataCommit({ variant="minimal/small" to={git.commitUrl} LeadingIcon={} + leadingIconClassName="group-hover/table-row:text-text-bright" iconSpacing="gap-x-1" - className="pl-1" + className="pl-1 duration-0 [&_span]:duration-0 [&_span]:group-hover/table-row:text-text-bright" > {`${git.shortSha} / ${git.commitMessage}`} @@ -74,8 +76,9 @@ export function GitMetadataPullRequest({ variant="minimal/small" to={git.pullRequestUrl} LeadingIcon={} + leadingIconClassName="group-hover/table-row:text-text-bright" iconSpacing="gap-x-1" - className="pl-1" + className="pl-1 duration-0 [&_span]:duration-0 [&_span]:group-hover/table-row:text-text-bright" > #{git.pullRequestNumber} {git.pullRequestTitle} diff --git a/apps/webapp/app/components/LogLevelTooltipInfo.tsx b/apps/webapp/app/components/LogLevelTooltipInfo.tsx index 6f967af70d1..2a8093af066 100644 --- a/apps/webapp/app/components/LogLevelTooltipInfo.tsx +++ b/apps/webapp/app/components/LogLevelTooltipInfo.tsx @@ -1,7 +1,6 @@ -import { BookOpenIcon } from "@heroicons/react/20/solid"; -import { LinkButton } from "./primitives/Buttons"; import { Header3 } from "./primitives/Headers"; import { Paragraph } from "./primitives/Paragraph"; +import { LogLevel } from "./logs/LogLevel"; export function LogLevelTooltipInfo() { return ( @@ -13,51 +12,45 @@ export function LogLevelTooltipInfo() {
-
- Info +
+ +
+ + Traces and spans representing the execution flow of your tasks. + +
+
+
+
General informational messages about task execution.
-
- Warn +
+
Warning messages indicating potential issues that don't prevent execution.
-
- Error +
+
Error messages for failures and exceptions during task execution.
-
- Debug +
+
Detailed diagnostic information for development and debugging.
-
- Tracing & Spans - - Automatically track the flow of your code through task triggers, attempts, and HTTP - requests. Create custom traces to monitor specific operations. - -
- - Read docs -
); } diff --git a/apps/webapp/app/components/LoginPageLayout.tsx b/apps/webapp/app/components/LoginPageLayout.tsx index d9ac7ceb4d7..3e42cd6894f 100644 --- a/apps/webapp/app/components/LoginPageLayout.tsx +++ b/apps/webapp/app/components/LoginPageLayout.tsx @@ -46,10 +46,10 @@ export function LoginPageLayout({ children }: { children: React.ReactNode }) { }, []); return ( -
-
-
-
+
+
+
+
@@ -63,12 +63,12 @@ export function LoginPageLayout({ children }: { children: React.ReactNode }) {
{children}
- Having login issues? Email us{" "} + Having login issues? Email us{" "} or ask us in Discord
-
+
{randomQuote?.quote} diff --git a/apps/webapp/app/components/MachineLabelCombo.tsx b/apps/webapp/app/components/MachineLabelCombo.tsx index 3d22ca527d0..485f6094cf0 100644 --- a/apps/webapp/app/components/MachineLabelCombo.tsx +++ b/apps/webapp/app/components/MachineLabelCombo.tsx @@ -31,7 +31,9 @@ export function MachineLabel({ className?: string; }) { return ( - {formatMachinePresetName(preset)} + + {formatMachinePresetName(preset)} + ); } diff --git a/apps/webapp/app/components/Shortcuts.tsx b/apps/webapp/app/components/Shortcuts.tsx index e3e4d6fe957..2decc82c914 100644 --- a/apps/webapp/app/components/Shortcuts.tsx +++ b/apps/webapp/app/components/Shortcuts.tsx @@ -76,7 +76,8 @@ function ShortcutContent() { - + + @@ -138,8 +139,8 @@ function ShortcutContent() { - - + + @@ -192,6 +193,12 @@ function ShortcutContent() {
+
+ Metrics page + + + +
Schedules page diff --git a/apps/webapp/app/components/TimezoneSetter.tsx b/apps/webapp/app/components/TimezoneSetter.tsx new file mode 100644 index 00000000000..3481af6571d --- /dev/null +++ b/apps/webapp/app/components/TimezoneSetter.tsx @@ -0,0 +1,30 @@ +import { useFetcher } from "@remix-run/react"; +import { useEffect, useRef } from "react"; +import { useTypedLoaderData } from "remix-typedjson"; +import type { loader } from "~/root"; + +export function TimezoneSetter() { + const { timezone: storedTimezone } = useTypedLoaderData(); + const fetcher = useFetcher(); + const hasSetTimezone = useRef(false); + + useEffect(() => { + if (hasSetTimezone.current) return; + + const browserTimezone = Intl.DateTimeFormat().resolvedOptions().timeZone; + + if (browserTimezone && browserTimezone !== storedTimezone) { + hasSetTimezone.current = true; + fetcher.submit( + { timezone: browserTimezone }, + { + method: "POST", + action: "/resources/timezone", + encType: "application/json", + } + ); + } + }, [storedTimezone, fetcher]); + + return null; +} diff --git a/apps/webapp/app/components/admin/FeatureFlagsDialog.tsx b/apps/webapp/app/components/admin/FeatureFlagsDialog.tsx new file mode 100644 index 00000000000..df8669d36dd --- /dev/null +++ b/apps/webapp/app/components/admin/FeatureFlagsDialog.tsx @@ -0,0 +1,290 @@ +import { useFetcher } from "@remix-run/react"; +import { useEffect, useState } from "react"; +import stableStringify from "json-stable-stringify"; +import { + Dialog, + DialogContent, + DialogHeader, + DialogDescription, + DialogFooter, +} from "~/components/primitives/Dialog"; +import { Button } from "~/components/primitives/Buttons"; +import { Callout } from "~/components/primitives/Callout"; +import { LockClosedIcon } from "@heroicons/react/20/solid"; +import { CheckboxWithLabel } from "~/components/primitives/Checkbox"; +import { cn } from "~/utils/cn"; +import { FEATURE_FLAG, ORG_LOCKED_FLAGS, type FlagControlType } from "~/v3/featureFlags"; +import { + UNSET_VALUE, + BooleanControl, + EnumControl, + StringControl, + WorkerGroupControl, + type WorkerGroup, +} from "./FlagControls"; + +type LoaderData = { + org: { id: string; title: string; slug: string }; + orgFlags: Record; + globalFlags: Record; + controlTypes: Record; + workerGroupName?: string; + workerGroups?: WorkerGroup[]; + isManagedCloud?: boolean; +}; + +type ActionData = { + success?: boolean; + error?: string; +}; + +type FeatureFlagsDialogProps = { + orgId: string | null; + orgTitle: string; + open: boolean; + onOpenChange: (open: boolean) => void; +}; + +export function FeatureFlagsDialog({ + orgId, + orgTitle, + open, + onOpenChange, +}: FeatureFlagsDialogProps) { + const loadFetcher = useFetcher(); + const saveFetcher = useFetcher(); + + const [overrides, setOverrides] = useState>({}); + const [initialOverrides, setInitialOverrides] = useState>({}); + const [saveError, setSaveError] = useState(null); + const [unlocked, setUnlocked] = useState(false); + + const isLocked = (key: string) => !unlocked && ORG_LOCKED_FLAGS.includes(key); + + useEffect(() => { + if (open && orgId) { + setSaveError(null); + setOverrides({}); + setInitialOverrides({}); + loadFetcher.load(`/admin/api/v2/orgs/${orgId}/feature-flags`); + } + }, [open, orgId]); + + useEffect(() => { + if (loadFetcher.data) { + const loaded = loadFetcher.data.orgFlags ?? {}; + setOverrides({ ...loaded }); + setInitialOverrides({ ...loaded }); + } + }, [loadFetcher.data]); + + useEffect(() => { + if (saveFetcher.data?.success) { + onOpenChange(false); + } else if (saveFetcher.data?.error) { + setSaveError(saveFetcher.data.error); + } + }, [saveFetcher.data]); + + const isDirty = stableStringify(overrides) !== stableStringify(initialOverrides); + + const setFlagValue = (key: string, value: unknown) => { + setOverrides((prev) => ({ ...prev, [key]: value })); + }; + + const unsetFlag = (key: string) => { + setOverrides((prev) => { + const next = { ...prev }; + delete next[key]; + return next; + }); + }; + + const handleSave = () => { + if (!orgId) return; + const body = Object.keys(overrides).length === 0 ? null : overrides; + saveFetcher.submit(JSON.stringify(body), { + method: "POST", + action: `/admin/api/v2/orgs/${orgId}/feature-flags`, + encType: "application/json", + }); + }; + + const data = loadFetcher.data; + const isLoading = loadFetcher.state === "loading"; + const isSaving = saveFetcher.state === "submitting"; + + const jsonPreview = + Object.keys(overrides).length === 0 ? "null" : JSON.stringify(overrides, null, 2); + + const sortedFlagKeys = data ? Object.keys(data.controlTypes).sort() : []; + + return ( + + + Feature flags - {orgTitle} + + Org-level overrides. Unset flags inherit from global defaults. + + + {data && ( +
+ +
+ )} + +
+ {isLoading ? ( +
Loading flags...
+ ) : data ? ( +
+ {sortedFlagKeys.map((key) => { + const control = data.controlTypes[key]; + const locked = isLocked(key); + const globalValue = data.globalFlags[key as keyof typeof data.globalFlags]; + const isWorkerGroup = key === FEATURE_FLAG.defaultWorkerInstanceGroupId; + const globalDisplay = + isWorkerGroup && data.workerGroupName && globalValue !== undefined + ? `${data.workerGroupName} (${String(globalValue).slice(0, 8)}...)` + : globalValue !== undefined + ? String(globalValue) + : "unset"; + + if (locked) { + return ( +
+
+
{key}
+
global: {globalDisplay}
+
+ +
+ ); + } + + const isOverridden = key in overrides; + + return ( +
+
+
+ {key} +
+
global: {globalDisplay}
+
+ +
+ + + {isWorkerGroup && data.workerGroups ? ( + { + if (val === UNSET_VALUE) { + unsetFlag(key); + } else { + setFlagValue(key, val); + } + }} + dimmed={!isOverridden} + /> + ) : control.type === "boolean" ? ( + setFlagValue(key, val)} + dimmed={!isOverridden} + /> + ) : control.type === "enum" ? ( + { + if (val === UNSET_VALUE) { + unsetFlag(key); + } else { + setFlagValue(key, val); + } + }} + dimmed={!isOverridden} + /> + ) : control.type === "string" ? ( + { + if (val === "") { + unsetFlag(key); + } else { + setFlagValue(key, val); + } + }} + dimmed={!isOverridden} + /> + ) : null} +
+
+ ); + })} +
+ ) : null} +
+ + {data && ( +
+ + Preview JSON + +
+              {jsonPreview}
+            
+
+ )} + + {saveError && {saveError}} + + + + + +
+
+ ); +} diff --git a/apps/webapp/app/components/admin/FlagControls.tsx b/apps/webapp/app/components/admin/FlagControls.tsx new file mode 100644 index 00000000000..b08f925dd90 --- /dev/null +++ b/apps/webapp/app/components/admin/FlagControls.tsx @@ -0,0 +1,120 @@ +import { Switch } from "~/components/primitives/Switch"; +import { Select, SelectItem } from "~/components/primitives/Select"; +import { Input } from "~/components/primitives/Input"; +import { cn } from "~/utils/cn"; + +export const UNSET_VALUE = "__unset__"; + +export function BooleanControl({ + value, + onChange, + dimmed, +}: { + value: boolean | undefined; + onChange: (val: boolean) => void; + dimmed: boolean; +}) { + return ( + + ); +} + +export function EnumControl({ + value, + options, + onChange, + dimmed, +}: { + value: string | undefined; + options: string[]; + onChange: (val: string) => void; + dimmed: boolean; +}) { + const items = [UNSET_VALUE, ...options]; + + return ( + + ); +} + +export type WorkerGroup = { id: string; name: string }; + +export function WorkerGroupControl({ + value, + workerGroups, + onChange, + dimmed, +}: { + value: string | undefined; + workerGroups: WorkerGroup[]; + onChange: (val: string) => void; + dimmed: boolean; +}) { + const items = [UNSET_VALUE, ...workerGroups.map((wg) => wg.id)]; + + return ( + + ); +} + +export function StringControl({ + value, + onChange, + dimmed, +}: { + value: string; + onChange: (val: string) => void; + dimmed: boolean; +}) { + return ( + onChange(e.target.value)} + placeholder="unset" + className={cn("w-40", dimmed && "opacity-50")} + /> + ); +} diff --git a/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts new file mode 100644 index 00000000000..4855c4c2465 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.server.ts @@ -0,0 +1,55 @@ +import { prisma } from "~/db.server"; +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { type Duration } from "~/services/rateLimiter.server"; +import { API_RATE_LIMIT_INTENT } from "./ApiRateLimitSection"; +import { + handleRateLimitAction, + resolveEffectiveRateLimit, + type RateLimitActionResult, + type RateLimitDomain, +} from "./RateLimitSection.server"; +import type { EffectiveRateLimit } from "./RateLimitSection"; + +export const apiRateLimitDomain: RateLimitDomain = { + intent: API_RATE_LIMIT_INTENT, + systemDefault: () => ({ + type: "tokenBucket", + refillRate: env.API_RATE_LIMIT_REFILL_RATE, + interval: env.API_RATE_LIMIT_REFILL_INTERVAL as Duration, + maxTokens: env.API_RATE_LIMIT_MAX, + }), + apply: async (orgId, next, adminUserId) => { + const existing = await prisma.organization.findFirst({ + where: { id: orgId }, + select: { apiRateLimiterConfig: true }, + }); + if (!existing) { + throw new Response(null, { status: 404 }); + } + await prisma.organization.update({ + where: { id: orgId }, + data: { apiRateLimiterConfig: next as any }, + }); + logger.info("admin.backOffice.apiRateLimit", { + adminUserId, + orgId, + previous: existing.apiRateLimiterConfig, + next, + }); + }, +}; + +export function resolveEffectiveApiRateLimit( + override: unknown +): EffectiveRateLimit { + return resolveEffectiveRateLimit(override, apiRateLimitDomain); +} + +export function handleApiRateLimitAction( + formData: FormData, + orgId: string, + adminUserId: string +): Promise { + return handleRateLimitAction(formData, orgId, adminUserId, apiRateLimitDomain); +} diff --git a/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.tsx b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.tsx new file mode 100644 index 00000000000..b27956f4360 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/ApiRateLimitSection.tsx @@ -0,0 +1,17 @@ +import { + RateLimitSection, + type RateLimitWrapperProps, +} from "./RateLimitSection"; + +export const API_RATE_LIMIT_INTENT = "set-rate-limit"; +export const API_RATE_LIMIT_SAVED_VALUE = "rate-limit"; + +export function ApiRateLimitSection(props: RateLimitWrapperProps) { + return ( + + ); +} diff --git a/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts new file mode 100644 index 00000000000..83a368094a9 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.server.ts @@ -0,0 +1,55 @@ +import { prisma } from "~/db.server"; +import { env } from "~/env.server"; +import { logger } from "~/services/logger.server"; +import { type Duration } from "~/services/rateLimiter.server"; +import { BATCH_RATE_LIMIT_INTENT } from "./BatchRateLimitSection"; +import { + handleRateLimitAction, + resolveEffectiveRateLimit, + type RateLimitActionResult, + type RateLimitDomain, +} from "./RateLimitSection.server"; +import type { EffectiveRateLimit } from "./RateLimitSection"; + +export const batchRateLimitDomain: RateLimitDomain = { + intent: BATCH_RATE_LIMIT_INTENT, + systemDefault: () => ({ + type: "tokenBucket", + refillRate: env.BATCH_RATE_LIMIT_REFILL_RATE, + interval: env.BATCH_RATE_LIMIT_REFILL_INTERVAL as Duration, + maxTokens: env.BATCH_RATE_LIMIT_MAX, + }), + apply: async (orgId, next, adminUserId) => { + const existing = await prisma.organization.findFirst({ + where: { id: orgId }, + select: { batchRateLimitConfig: true }, + }); + if (!existing) { + throw new Response(null, { status: 404 }); + } + await prisma.organization.update({ + where: { id: orgId }, + data: { batchRateLimitConfig: next as any }, + }); + logger.info("admin.backOffice.batchRateLimit", { + adminUserId, + orgId, + previous: existing.batchRateLimitConfig, + next, + }); + }, +}; + +export function resolveEffectiveBatchRateLimit( + override: unknown +): EffectiveRateLimit { + return resolveEffectiveRateLimit(override, batchRateLimitDomain); +} + +export function handleBatchRateLimitAction( + formData: FormData, + orgId: string, + adminUserId: string +): Promise { + return handleRateLimitAction(formData, orgId, adminUserId, batchRateLimitDomain); +} diff --git a/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.tsx b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.tsx new file mode 100644 index 00000000000..0e52124d290 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/BatchRateLimitSection.tsx @@ -0,0 +1,17 @@ +import { + RateLimitSection, + type RateLimitWrapperProps, +} from "./RateLimitSection"; + +export const BATCH_RATE_LIMIT_INTENT = "set-batch-rate-limit"; +export const BATCH_RATE_LIMIT_SAVED_VALUE = "batch-rate-limit"; + +export function BatchRateLimitSection(props: RateLimitWrapperProps) { + return ( + + ); +} diff --git a/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.server.ts b/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.server.ts new file mode 100644 index 00000000000..ec27234a306 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.server.ts @@ -0,0 +1,48 @@ +import { z } from "zod"; +import { prisma } from "~/db.server"; +import { logger } from "~/services/logger.server"; +import { MAX_PROJECTS_INTENT } from "./MaxProjectsSection"; + +const SetMaxProjectsSchema = z.object({ + intent: z.literal(MAX_PROJECTS_INTENT), + // Capped at PostgreSQL INTEGER max (Prisma Int) so oversized input fails + // validation cleanly instead of crashing the update. + maximumProjectCount: z.coerce.number().int().min(1).max(2_147_483_647), +}); + +export type MaxProjectsActionResult = + | { ok: true } + | { ok: false; errors: Record }; + +export async function handleMaxProjectsAction( + formData: FormData, + orgId: string, + adminUserId: string +): Promise { + const submission = SetMaxProjectsSchema.safeParse(Object.fromEntries(formData)); + if (!submission.success) { + return { ok: false, errors: submission.error.flatten().fieldErrors }; + } + + const existing = await prisma.organization.findFirst({ + where: { id: orgId }, + select: { maximumProjectCount: true }, + }); + if (!existing) { + throw new Response(null, { status: 404 }); + } + + await prisma.organization.update({ + where: { id: orgId }, + data: { maximumProjectCount: submission.data.maximumProjectCount }, + }); + + logger.info("admin.backOffice.maxProjects", { + adminUserId, + orgId, + previous: existing.maximumProjectCount, + next: submission.data.maximumProjectCount, + }); + + return { ok: true }; +} diff --git a/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.tsx b/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.tsx new file mode 100644 index 00000000000..bf8ecf83161 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/MaxProjectsSection.tsx @@ -0,0 +1,115 @@ +import { Form } from "@remix-run/react"; +import { useEffect, useState } from "react"; +import { Button } from "~/components/primitives/Buttons"; +import { FormError } from "~/components/primitives/FormError"; +import { Header2 } from "~/components/primitives/Headers"; +import { Input } from "~/components/primitives/Input"; +import { Label } from "~/components/primitives/Label"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import * as Property from "~/components/primitives/PropertyTable"; + +export const MAX_PROJECTS_INTENT = "set-max-projects"; +export const MAX_PROJECTS_SAVED_VALUE = "max-projects"; + +type FieldErrors = Record | null; + +type Props = { + maximumProjectCount: number; + errors: FieldErrors; + savedJustNow: boolean; + isSubmitting: boolean; +}; + +export function MaxProjectsSection({ + maximumProjectCount, + errors, + savedJustNow, + isSubmitting, +}: Props) { + const hasFieldErrors = !!errors && Object.keys(errors).length > 0; + const fieldError = (field: string) => + errors && field in errors ? errors[field]?.[0] : undefined; + + const [isEditing, setIsEditing] = useState(hasFieldErrors); + const [value, setValue] = useState(String(maximumProjectCount)); + + useEffect(() => { + if (hasFieldErrors) setIsEditing(true); + }, [hasFieldErrors]); + + useEffect(() => { + if (savedJustNow && !hasFieldErrors) setIsEditing(false); + }, [savedJustNow, hasFieldErrors]); + + return ( +
+
+ Maximum projects + {!isEditing && ( + + )} +
+ + {savedJustNow && ( +
+ + Saved. + +
+ )} + + {!isEditing ? ( + + + Limit + + {maximumProjectCount.toLocaleString()} + + + + ) : ( +
+ +
+ + setValue(e.target.value)} + required + /> + {fieldError("maximumProjectCount")} +
+
+ + +
+
+ )} +
+ ); +} diff --git a/apps/webapp/app/components/admin/backOffice/RateLimitSection.server.ts b/apps/webapp/app/components/admin/backOffice/RateLimitSection.server.ts new file mode 100644 index 00000000000..799fc3605df --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/RateLimitSection.server.ts @@ -0,0 +1,76 @@ +import { z } from "zod"; +import { + RateLimitTokenBucketConfig, + RateLimiterConfig, +} from "~/services/authorizationRateLimitMiddleware.server"; +import { + parseDurationToMs, + type EffectiveRateLimit, +} from "./RateLimitSection"; + +export type RateLimitDomain = { + intent: string; + systemDefault: () => RateLimiterConfig; + apply: ( + orgId: string, + next: RateLimitTokenBucketConfig, + adminUserId: string + ) => Promise; +}; + +export function resolveEffectiveRateLimit( + override: unknown, + domain: RateLimitDomain +): EffectiveRateLimit { + if (override == null) { + return { source: "default", config: domain.systemDefault() }; + } + const parsed = RateLimiterConfig.safeParse(override); + if (parsed.success) { + return { source: "override", config: parsed.data }; + } + // Column holds malformed JSON — fall back silently. Admin must investigate + // at the DB level; this UI can't recover it. + return { source: "default", config: domain.systemDefault() }; +} + +export type RateLimitActionResult = + | { ok: true } + | { ok: false; errors: Record }; + +export async function handleRateLimitAction( + formData: FormData, + orgId: string, + adminUserId: string, + domain: RateLimitDomain +): Promise { + const schema = z.object({ + intent: z.literal(domain.intent), + refillRate: z.coerce.number().int().min(1), + interval: z + .string() + .trim() + .refine((v) => parseDurationToMs(v) > 0, { + message: "Must be a duration like 10s, 1m, 500ms.", + }), + maxTokens: z.coerce.number().int().min(1), + }); + + const submission = schema.safeParse(Object.fromEntries(formData)); + if (!submission.success) { + return { ok: false, errors: submission.error.flatten().fieldErrors }; + } + + const built = RateLimitTokenBucketConfig.safeParse({ + type: "tokenBucket", + refillRate: submission.data.refillRate, + interval: submission.data.interval, + maxTokens: submission.data.maxTokens, + }); + if (!built.success) { + return { ok: false, errors: built.error.flatten().fieldErrors }; + } + + await domain.apply(orgId, built.data, adminUserId); + return { ok: true }; +} diff --git a/apps/webapp/app/components/admin/backOffice/RateLimitSection.tsx b/apps/webapp/app/components/admin/backOffice/RateLimitSection.tsx new file mode 100644 index 00000000000..1af8abab3d9 --- /dev/null +++ b/apps/webapp/app/components/admin/backOffice/RateLimitSection.tsx @@ -0,0 +1,306 @@ +import { Form } from "@remix-run/react"; +import { useEffect, useState } from "react"; +import { Button } from "~/components/primitives/Buttons"; +import { FormError } from "~/components/primitives/FormError"; +import { Header2 } from "~/components/primitives/Headers"; +import { Input } from "~/components/primitives/Input"; +import { Label } from "~/components/primitives/Label"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import * as Property from "~/components/primitives/PropertyTable"; + +// Local shape mirrors the server-side discriminated union just enough for this +// view. Decoupled from the .server module so the component stays client-safe. +// Duration fields are always suffixed strings — the server's DurationSchema +// rejects anything else, so non-string overrides fall back to the default. +export type RateLimitConfig = + | { + type: "tokenBucket"; + refillRate: number; + interval: string; + maxTokens: number; + } + | { + type: "fixedWindow" | "slidingWindow"; + window: string; + tokens: number; + }; + +export type EffectiveRateLimit = { + source: "override" | "default"; + config: RateLimitConfig; +}; + +export type FieldErrors = Record | null; + +// Props shared by every per-domain wrapper (Api / Batch / future ones). +export type RateLimitWrapperProps = { + effective: EffectiveRateLimit; + errors: FieldErrors; + savedJustNow: boolean; + isSubmitting: boolean; +}; + +type Props = RateLimitWrapperProps & { + title: string; + intent: string; +}; + +export function RateLimitSection({ + title, + intent, + effective, + errors, + savedJustNow, + isSubmitting, +}: Props) { + const hasFieldErrors = !!errors && Object.keys(errors).length > 0; + const fieldError = (field: string) => + errors && field in errors ? errors[field]?.[0] : undefined; + + const current = + effective.config.type === "tokenBucket" ? effective.config : null; + + const [isEditing, setIsEditing] = useState(hasFieldErrors); + const [refillRate, setRefillRate] = useState( + current ? String(current.refillRate) : "" + ); + const [intervalStr, setIntervalStr] = useState( + current ? String(current.interval) : "" + ); + const [maxTokens, setMaxTokens] = useState( + current ? String(current.maxTokens) : "" + ); + + useEffect(() => { + if (hasFieldErrors) setIsEditing(true); + }, [hasFieldErrors]); + + useEffect(() => { + if (savedJustNow && !hasFieldErrors) setIsEditing(false); + }, [savedJustNow, hasFieldErrors]); + + const currentDescription = current + ? describeRateLimit( + current.refillRate, + parseDurationToMs(String(current.interval)), + current.maxTokens + ) + : null; + + const previewDescription = describeRateLimit( + Number(refillRate) || 0, + parseDurationToMs(intervalStr), + Number(maxTokens) || 0 + ); + + const cancelEdit = () => { + setRefillRate(current ? String(current.refillRate) : ""); + setIntervalStr(current ? String(current.interval) : ""); + setMaxTokens(current ? String(current.maxTokens) : ""); + setIsEditing(false); + }; + + return ( +
+
+ {title} + {!isEditing && ( + + )} +
+ + {savedJustNow && ( +
+ + Saved. + +
+ )} + + + Status:{" "} + {effective.source === "override" + ? "Custom override active." + : "Using system default."} + + + {!isEditing ? ( + <> + + {effective.config.type === "tokenBucket" ? ( + currentDescription ? ( + <> + + Sustained rate + + {currentDescription.sustained} + + + + Burst allowance + {currentDescription.burst} + + + ) : ( + + + Invalid interval on the stored config. + + + ) + ) : ( + <> + + Type + {effective.config.type} + + + Window + {String(effective.config.window)} + + + Tokens + + {effective.config.tokens.toLocaleString()} + + + + )} + + {effective.config.type !== "tokenBucket" && ( + + This override is a {effective.config.type} limit and can't be + edited from this form. Change it in the database directly. + + )} + + ) : ( +
+ + +
+ + setRefillRate(e.target.value)} + required + /> + {fieldError("refillRate")} +
+ +
+ + setIntervalStr(e.target.value)} + required + /> + {fieldError("interval")} +
+ +
+ + setMaxTokens(e.target.value)} + required + /> + {fieldError("maxTokens")} +
+ + + {previewDescription + ? `Preview: ${previewDescription.sustained} · ${previewDescription.burst}.` + : "Preview: enter valid values to see the effective limit."} + + +
+ + +
+
+ )} +
+ ); +} + +export function parseDurationToMs(duration: string): number { + const match = duration.trim().match(/^(\d+)\s*(ms|s|m|h|d)$/); + if (!match) return 0; + const value = parseInt(match[1], 10); + switch (match[2]) { + case "ms": + return value; + case "s": + return value * 1_000; + case "m": + return value * 60_000; + case "h": + return value * 3_600_000; + case "d": + return value * 86_400_000; + default: + return 0; + } +} + +function describeRateLimit( + refillRate: number, + intervalMs: number, + maxTokens: number +): { sustained: string; burst: string } | null { + if (refillRate <= 0 || intervalMs <= 0 || maxTokens <= 0) return null; + const perMin = (refillRate * 60_000) / intervalMs; + let sustained: string; + if (perMin >= 1) { + sustained = `${formatRateValue(perMin)} requests per minute`; + } else { + const perHour = perMin * 60; + if (perHour >= 1) { + sustained = `${formatRateValue(perHour)} requests per hour`; + } else { + const perDay = perHour * 24; + sustained = `${formatRateValue(perDay)} requests per day`; + } + } + return { + sustained, + burst: `${maxTokens.toLocaleString()} request burst allowance`, + }; +} + +function formatRateValue(value: number): string { + return value >= 10 ? Math.round(value).toLocaleString() : value.toFixed(1); +} diff --git a/apps/webapp/app/components/code/AIQueryInput.tsx b/apps/webapp/app/components/code/AIQueryInput.tsx index 38d0c9b21b1..cd5e9db3bd8 100644 --- a/apps/webapp/app/components/code/AIQueryInput.tsx +++ b/apps/webapp/app/components/code/AIQueryInput.tsx @@ -1,20 +1,9 @@ -import { PencilSquareIcon, PlusIcon, SparklesIcon } from "@heroicons/react/20/solid"; +import { CheckIcon, PencilSquareIcon, PlusIcon, XMarkIcon } from "@heroicons/react/20/solid"; import { AnimatePresence, motion } from "framer-motion"; -import { Suspense, lazy, useCallback, useEffect, useRef, useState } from "react"; -import { AISparkleIcon } from "~/assets/icons/AISparkleIcon"; - -// Lazy load streamdown components to avoid SSR issues -const StreamdownRenderer = lazy(() => - import("streamdown").then((mod) => ({ - default: ({ children, isAnimating }: { children: string; isAnimating: boolean }) => ( - - {children} - - ), - })) -); +import { Suspense, useCallback, useEffect, useRef, useState } from "react"; import { Button } from "~/components/primitives/Buttons"; import { Spinner } from "~/components/primitives/Spinner"; +import { StreamdownRenderer } from "~/components/code/StreamdownRenderer"; import { useEnvironment } from "~/hooks/useEnvironment"; import { useOrganization } from "~/hooks/useOrganizations"; import { useProject } from "~/hooks/useProject"; @@ -179,21 +168,7 @@ export function AIQueryInput({ setThinking((prev) => prev + event.content); break; case "tool_call": - if (event.tool === "setTimeFilter") { - setThinking((prev) => { - if (prev.trimEnd().endsWith("Setting time filter...")) { - return prev; - } - return prev + `\nSetting time filter...\n`; - }); - } else { - setThinking((prev) => { - if (prev.trimEnd().endsWith("Validating query...")) { - return prev; - } - return prev + `\nValidating query...\n`; - }); - } + // Tool calls are handled silently — no UI text needed break; case "time_filter": // Apply time filter immediately when the AI sets it @@ -262,13 +237,13 @@ export function AIQueryInput({ }, [error]); return ( -
+
{/* Gradient border wrapper like the schedules AI input */}
-
+