From 1f6260788c1c4eff36adf9afb54c1398ac9d8e77 Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Wed, 10 Jun 2026 02:04:21 +0200 Subject: [PATCH 01/11] fix(execution-worker): connect to TEMPORAL_ADDRESS instead of implicit localhost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Worker.create without an explicit connection dials 127.0.0.1:7233, ignoring the TEMPORAL_ADDRESS env var — correct in local dev, broken in any deployment where Temporal is not on loopback. --- apps/execution-worker/src/engines/temporal/worker.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/apps/execution-worker/src/engines/temporal/worker.ts b/apps/execution-worker/src/engines/temporal/worker.ts index 326ed7c5a..49ecd295e 100644 --- a/apps/execution-worker/src/engines/temporal/worker.ts +++ b/apps/execution-worker/src/engines/temporal/worker.ts @@ -1,4 +1,4 @@ -import { Worker } from '@temporalio/worker'; +import { NativeConnection, Worker } from '@temporalio/worker'; import 'dotenv/config'; import { fileURLToPath } from 'node:url'; @@ -42,7 +42,12 @@ const activities = { }, }; +// Without an explicit connection the worker silently dials 127.0.0.1:7233, +// ignoring TEMPORAL_ADDRESS — correct in local dev, wrong everywhere else. 
+const connection = await NativeConnection.connect({ address: env.TEMPORAL_ADDRESS }); + const worker = await Worker.create({ + connection, workflowsPath: fileURLToPath(new URL('workflows/run-workflow.ts', import.meta.url)), activities, taskQueue, From ccf7375b43ea77b21bdd0c2f0dfc7b47e6c34b4d Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Wed, 10 Jun 2026 02:04:35 +0200 Subject: [PATCH 02/11] fix(execution-worker): treat empty-conditions decision branch as catch-all MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The no_branch_matched error message and the Sales Inquiry reference template both treat a branch with no conditions as the explicit catch-all, but branchMatches returned false for it — any input classified outside the keyword branches failed the whole run. Supersedes the empty-conditions bullet of packages/execution-core/decision-no-match.decision-log.md (the strict fail-fast core of that decision is unchanged); see apps/execution-worker/decision-catch-all.decision-log.md. 
--- .../decision-catch-all.decision-log.md | 74 +++++++++++++++++++ .../src/executors/decision.test.ts | 19 +++-- .../src/executors/decision.ts | 6 +- 3 files changed, 92 insertions(+), 7 deletions(-) create mode 100644 apps/execution-worker/decision-catch-all.decision-log.md diff --git a/apps/execution-worker/decision-catch-all.decision-log.md b/apps/execution-worker/decision-catch-all.decision-log.md new file mode 100644 index 000000000..fef839295 --- /dev/null +++ b/apps/execution-worker/decision-catch-all.decision-log.md @@ -0,0 +1,74 @@ +### Title: Decision branch with no conditions is the explicit catch-all + +### Proposed by: Jan Librowski + +### Date: 10.06.2026 + +## Context + +End-to-end verification of the WB-229 demo deployment failed on the +reference workload: the Sales Inquiry Pipeline's classifier returned +`**Type:** general`, no conditional branch matched, and the run ended in +`execution_failed` — despite the template shipping a 'General' branch with +`conditions: []` as its designed fallback. + +The codebase contradicted itself on what a catch-all is: + +- `decision-no-match.decision-log.md` (execution-core, 29.04.2026) decided + **strict fail-fast on no match** — correct and kept — but its Cons section + declared an empty-conditions branch non-matching, requiring a + tautological condition (`x === x`) as the catch-all idiom. A unit test + pinned that. +- The executor's own `no_branch_matched` error message instructed the + opposite: _"Add an explicit catch-all branch with no conditions."_ +- The reference template (`sales-inquiry-flow.ts`) followed the error + message, not the test — and was broken for any input classified outside + its keyword branches. Local demos always matched 'pricing'/'technical', so + this never surfaced until a different model classified an input as + 'general'. + +Three artifacts said "empty = catch-all", one said the opposite; the +user-facing ones (error message, reference template) all pointed one way. 
+ +## Decision + +`branchMatches` in `apps/execution-worker/src/executors/decision.ts` now +returns `true` for an empty `conditions[]`. First-match order is preserved, +so a catch-all only fires when placed after the conditional branches. The +strict throw from the original decision is untouched: a decision node whose +branches all have conditions and none match still fails with +`no_branch_matched`. + +This supersedes the "empty conditions are non-matching" bullet (and the +test pinning it) from `decision-no-match.decision-log.md`. The fail-fast +core of that decision stands. + +## Alternative Options Considered + +- **Keep the semantics, fix the template with a tautological condition** — + rejected: every UI author following the error message's instruction would + keep hitting the same failure, and `isEqual 'a' 'a'` as the blessed + catch-all idiom is noise a property panel can't explain. +- **`isDefault: true` flag on a designated branch** — still the cleaner + long-term UX (already noted in the original log); still deferred for the + same reason: type + Zod schema + properties-panel changes, separate + ticket. + +## Consequences + +- **Pros** + - The shipped reference template and the executor's error message are now + both true. + - Catch-all is expressible in the UI as-is (an empty branch), no magic + conditions. +- **Cons** + - Semantics change: a flow that contained an empty-conditions branch and + relied on the node failing now routes through that branch. No known + flow does this — the only shipped example wanted the opposite. + - A _misplaced_ empty branch (before conditional ones) silently wins due + to first-match order; the matched branch is visible in the + `matchedBranch` output and event log. 
+ +## Status + +Accepted diff --git a/apps/execution-worker/src/executors/decision.test.ts b/apps/execution-worker/src/executors/decision.test.ts index ac1f98de2..2d4b31242 100644 --- a/apps/execution-worker/src/executors/decision.test.ts +++ b/apps/execution-worker/src/executors/decision.test.ts @@ -82,17 +82,24 @@ describe('executeDecision', () => { } }); - it('treats a branch with no conditions as non-matching (so callers must throw or use explicit operators)', () => { - // Empty conditions array — branchMatches returns false, so this is NOT - // a default. If someone wants a default, they need a branch whose - // conditions evaluate to true (e.g. isEqual 'x' 'x'). + it('treats a branch with no conditions as the catch-all', () => { + // The contract the no_branch_matched error instructs authors to use, and + // what the reference Sales Inquiry template relies on for its 'General' + // branch. First-match order applies: a catch-all placed after conditional + // branches only fires when none of them matched. 
const node = decisionNode([ { - sourceHandle: 'empty', + sourceHandle: 'no', + conditions: [{ x: 'a', y: 'b', comparisonOperator: 'isEqual' }], + }, + { + sourceHandle: 'fallback', conditions: [], }, ]); - expect(() => executeDecision(node, context())).toThrowError(NodeExecutionError); + const result = executeDecision(node, context()); + + expect(result.nextPort).toBe('fallback'); }); }); diff --git a/apps/execution-worker/src/executors/decision.ts b/apps/execution-worker/src/executors/decision.ts index 847fa2db5..0fbccb140 100644 --- a/apps/execution-worker/src/executors/decision.ts +++ b/apps/execution-worker/src/executors/decision.ts @@ -28,7 +28,11 @@ export function executeDecision(node: DecisionNode, context: ExecutionContext): } function branchMatches(conditions: DecisionBranchCondition[], context: ExecutionContext): boolean { - if (conditions.length === 0) return false; + // A branch with no conditions is the explicit catch-all — the contract the + // error above instructs authors to use, and what the reference Sales + // Inquiry template ships ('General' branch). First-match order still + // applies, so a catch-all only fires when placed after conditional branches. + if (conditions.length === 0) return true; let result = evaluateCondition(conditions[0]!, context); for (let index = 1; index < conditions.length; index++) { From 7ca365ed9bc2106e38ad3645f81949d0e6d69472 Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Wed, 10 Jun 2026 02:04:48 +0200 Subject: [PATCH 03/11] feat(backend): add per-ip rate limit on the execute route MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed-window, in-memory limiter (WB-229 abuse gate). Disabled unless RATE_LIMIT_EXECUTE_PER_MINUTE / RATE_LIMIT_EXECUTE_PER_DAY are set, so local dev is unaffected. TRUST_PROXY=true reads the client IP from X-Forwarded-For — only enable behind a proxy that sets it. 
--- apps/backend/src/env.ts | 5 + .../backend/src/middleware/rate-limit.test.ts | 134 ++++++++++++++++++ apps/backend/src/middleware/rate-limit.ts | 121 ++++++++++++++++ apps/backend/src/server.ts | 17 +++ 4 files changed, 277 insertions(+) create mode 100644 apps/backend/src/middleware/rate-limit.test.ts create mode 100644 apps/backend/src/middleware/rate-limit.ts diff --git a/apps/backend/src/env.ts b/apps/backend/src/env.ts index 12645d6d6..349b5602f 100644 --- a/apps/backend/src/env.ts +++ b/apps/backend/src/env.ts @@ -12,4 +12,9 @@ export const env = { HOST: envOr('HOST', '127.0.0.1'), DATABASE_URL: envOr('DATABASE_URL', 'postgresql://wb:wb@127.0.0.1:5432/workflow_builder'), TEMPORAL_ADDRESS: envOr('TEMPORAL_ADDRESS', '127.0.0.1:7233'), + // Per-IP limits on the execute route. 0 = disabled (local dev default); + // the production compose in deploy/ai-studio sets both. + RATE_LIMIT_EXECUTE_PER_MINUTE: Number(envOr('RATE_LIMIT_EXECUTE_PER_MINUTE', '0')), + RATE_LIMIT_EXECUTE_PER_DAY: Number(envOr('RATE_LIMIT_EXECUTE_PER_DAY', '0')), + TRUST_PROXY: envOr('TRUST_PROXY', 'false') === 'true', }; diff --git a/apps/backend/src/middleware/rate-limit.test.ts b/apps/backend/src/middleware/rate-limit.test.ts new file mode 100644 index 000000000..f3f78b52b --- /dev/null +++ b/apps/backend/src/middleware/rate-limit.test.ts @@ -0,0 +1,134 @@ +import { Hono } from 'hono'; +import { describe, expect, it } from 'vitest'; + +import { type RateLimitOptions, createRateLimitMiddleware } from './rate-limit'; + +const MINUTE_MS = 60_000; +const DAY_MS = 24 * 60 * 60 * 1000; + +/** + * Build a Hono app mirroring the production wiring in `server.ts`: the + * limiter guards a single execute-shaped route. Tests drive the clock through + * the injectable `now` and identify callers via X-Forwarded-For (trustProxy), + * since `app.request()` has no underlying socket. 
+ */ +function makeApp(overrides: Partial<RateLimitOptions> = {}) { + let timestamp = 0; + const app = new Hono(); + app.use( + '/api/workflows/:id/execute', + createRateLimitMiddleware({ + perMinute: 2, + perDay: 5, + trustProxy: true, + now: () => timestamp, + ...overrides, + }), + ); + app.post('/api/workflows/:id/execute', (c) => c.json({ ok: true }, 202)); + + return { + app, + advance(ms: number) { + timestamp += ms; + }, + execute(ip = '203.0.113.7') { + return app.request('/api/workflows/wf-1/execute', { + method: 'POST', + headers: { 'x-forwarded-for': ip }, + }); + }, + }; +} + +describe('createRateLimitMiddleware', () => { + it('allows requests under the limit', async () => { + const { execute } = makeApp(); + + const first = await execute(); + const second = await execute(); + expect(first.status).toBe(202); + expect(second.status).toBe(202); + }); + + it('rejects with 429 and Retry-After once the minute limit is hit', async () => { + const { execute, advance } = makeApp(); + + await execute(); + await execute(); + advance(10_000); + + const response = await execute(); + expect(response.status).toBe(429); + expect(response.headers.get('Retry-After')).toBe('50'); + expect(await response.json()).toMatchObject({ code: 'rate_limited', retryAfterSeconds: 50 }); + }); + + it('tracks each IP independently', async () => { + const { execute } = makeApp(); + + await execute('203.0.113.7'); + await execute('203.0.113.7'); + const blocked = await execute('203.0.113.7'); + const otherIp = await execute('198.51.100.9'); + expect(blocked.status).toBe(429); + expect(otherIp.status).toBe(202); + }); + + it('resets the minute window after it elapses', async () => { + const { execute, advance } = makeApp(); + + await execute(); + await execute(); + const blocked = await execute(); + expect(blocked.status).toBe(429); + + advance(MINUTE_MS); + const allowedAgain = await execute(); + expect(allowedAgain.status).toBe(202); + }); + + it('enforces the day limit across minute windows', async () 
=> { + const { execute, advance } = makeApp(); + + for (let index = 0; index < 5; index++) { + const allowed = await execute(); + expect(allowed.status).toBe(202); + advance(MINUTE_MS); + } + + const response = await execute(); + expect(response.status).toBe(429); + // 5 minutes into the day window -> retry once the remaining day elapses + expect(response.headers.get('Retry-After')).toBe(String((DAY_MS - 5 * MINUTE_MS) / 1000)); + }); + + it('resets the day window after it elapses', async () => { + const { execute, advance } = makeApp({ perMinute: 0 }); + + for (let index = 0; index < 5; index++) { + await execute(); + } + const blocked = await execute(); + expect(blocked.status).toBe(429); + + advance(DAY_MS); + const allowedAgain = await execute(); + expect(allowedAgain.status).toBe(202); + }); + + it('uses the first X-Forwarded-For hop as the client identity', async () => { + const { app } = makeApp(); + + const request = (chain: string) => + app.request('/api/workflows/wf-1/execute', { + method: 'POST', + headers: { 'x-forwarded-for': chain }, + }); + + await request('203.0.113.7, 10.0.0.1'); + await request('203.0.113.7, 10.0.0.2'); + const blocked = await request('203.0.113.7, 10.0.0.3'); + expect(blocked.status).toBe(429); + }); +}); diff --git a/apps/backend/src/middleware/rate-limit.ts b/apps/backend/src/middleware/rate-limit.ts new file mode 100644 index 000000000..8496a94f1 --- /dev/null +++ b/apps/backend/src/middleware/rate-limit.ts @@ -0,0 +1,121 @@ +import { getConnInfo } from '@hono/node-server/conninfo'; +import type { Context, MiddlewareHandler } from 'hono'; + +export type RateLimitOptions = { + /** Max requests per IP per minute. 0 disables the minute window. */ + perMinute: number; + /** Max requests per IP per day. 0 disables the day window. */ + perDay: number; + /** + * Read the client IP from X-Forwarded-For. 
Only enable when the backend is + * reachable exclusively through a proxy that sets the header (the deploy + * nginx does) — a directly-reachable backend would let clients spoof it. + */ + trustProxy: boolean; + /** Injectable clock for tests. */ + now?: () => number; +}; + +type WindowState = { + windowStart: number; + count: number; +}; + +type IpState = { + minute: WindowState; + day: WindowState; +}; + +const MINUTE_MS = 60_000; +const DAY_MS = 24 * 60 * 60 * 1000; +const SWEEP_INTERVAL_MS = 10 * MINUTE_MS; + +function clientIp(c: Context, trustProxy: boolean): string { + if (trustProxy) { + const forwardedFor = c.req.header('x-forwarded-for'); + const first = forwardedFor?.split(',')[0]?.trim(); + if (first) { + return first; + } + } + try { + return getConnInfo(c).remote.address ?? 'unknown'; + } catch { + // No underlying socket (e.g. app.request() in tests) + return 'unknown'; + } +} + +function hitWindow(state: WindowState, limit: number, durationMs: number, now: number): number | null { + if (limit <= 0) { + return null; + } + if (now - state.windowStart >= durationMs) { + state.windowStart = now; + state.count = 0; + } + if (state.count >= limit) { + return state.windowStart + durationMs - now; + } + return null; +} + +/** + * Fixed-window, in-memory, per-IP rate limiter for the execute route. + * + * Deliberately process-local (WB-229 lean MVP runs a single backend + * replica): counters reset on restart and are not shared across replicas. + * The OpenRouter account Guardrail is the independent hard spend cap; this + * gate only stops a single IP from burning the daily budget. + */ +export function createRateLimitMiddleware(options: RateLimitOptions): MiddlewareHandler { + const { perMinute, perDay, trustProxy } = options; + const now = options.now ?? 
Date.now; + const states = new Map<string, IpState>(); + let lastSweep = now(); + + return async (c, next) => { + const timestamp = now(); + + if (timestamp - lastSweep >= SWEEP_INTERVAL_MS) { + lastSweep = timestamp; + for (const [ip, state] of states) { + if (timestamp - state.day.windowStart >= DAY_MS && timestamp - state.minute.windowStart >= MINUTE_MS) { + states.delete(ip); + } + } + } + + const ip = clientIp(c, trustProxy); + let state = states.get(ip); + if (!state) { + state = { + minute: { windowStart: timestamp, count: 0 }, + day: { windowStart: timestamp, count: 0 }, + }; + states.set(ip, state); + } + + const minuteRetry = hitWindow(state.minute, perMinute, MINUTE_MS, timestamp); + const dayRetry = hitWindow(state.day, perDay, DAY_MS, timestamp); + const retryAfterMs = Math.max(minuteRetry ?? 0, dayRetry ?? 0); + + if (retryAfterMs > 0) { + const retryAfterSeconds = Math.ceil(retryAfterMs / 1000); + c.header('Retry-After', String(retryAfterSeconds)); + return c.json( + { + code: 'rate_limited', + message: 'Too many workflow executions from this address — try again later', + retryAfterSeconds, + }, + 429, + ); + } + + state.minute.count += 1; + state.day.count += 1; + + await next(); + }; +} diff --git a/apps/backend/src/server.ts b/apps/backend/src/server.ts index 61a36f952..10e52c1d9 100644 --- a/apps/backend/src/server.ts +++ b/apps/backend/src/server.ts @@ -14,6 +14,7 @@ import { } from './auth'; import { env } from './env'; import { logger } from './logger'; +import { createRateLimitMiddleware } from './middleware/rate-limit'; import { createExecutionsRoutes } from './routes/executions'; import { createWorkflowsRoutes } from './routes/workflows'; @@ -52,6 +53,22 @@ app.get('/api/health', (c) => c.json({ status: 'ok' })); app.use('/api/*', createAuthMiddleware(authPort)); +if (env.RATE_LIMIT_EXECUTE_PER_MINUTE > 0 || env.RATE_LIMIT_EXECUTE_PER_DAY > 0) { + app.use( + '/api/workflows/:id/execute', + createRateLimitMiddleware({ + perMinute: 
env.RATE_LIMIT_EXECUTE_PER_MINUTE, + perDay: env.RATE_LIMIT_EXECUTE_PER_DAY, + trustProxy: env.TRUST_PROXY, + }), + ); + logger.info('execute rate limit enabled', { + perMinute: env.RATE_LIMIT_EXECUTE_PER_MINUTE, + perDay: env.RATE_LIMIT_EXECUTE_PER_DAY, + trustProxy: env.TRUST_PROXY, + }); +} + app.route('/api/workflows', createWorkflowsRoutes(assertAuthorized)); app.route('/api/executions', createExecutionsRoutes(assertAuthorized)); From b27b90291e01657c72d8b8ebecdfd1f256bda3c5 Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Wed, 10 Jun 2026 02:05:06 +0200 Subject: [PATCH 04/11] feat(deploy): containerize ai studio stack for production deployment deploy/ai-studio/: multi-target Dockerfile (runtime/migrate/web), production docker-compose (only nginx public, pinned images, automatic migrations), nginx SPA+API proxy with SSE tuning and per-request DNS re-resolution, .env.example with Mistral Small 3.2 default, DevOps README, and a decision log covering the architecture choices. tsx becomes a real dependency of backend and worker (start:prod runs without an .env file); .dockerignore now keeps **/.env out of build contexts. Verified end-to-end: Sales Inquiry Pipeline to execution_completed with live SSE through nginx; rate limiter returns 429 past the budget. 
--- .dockerignore | 16 +- CLAUDE.md | 2 + apps/backend/package.json | 2 + apps/execution-worker/package.json | 4 +- deploy/ai-studio/.env.example | 36 ++++ deploy/ai-studio/Dockerfile | 73 ++++++++ deploy/ai-studio/README.md | 106 +++++++++++ .../ai-studio-deployment.decision-log.md | 140 +++++++++++++++ deploy/ai-studio/docker-compose.yml | 164 ++++++++++++++++++ deploy/ai-studio/nginx/default.conf | 63 +++++++ pnpm-lock.yaml | 6 + 11 files changed, 609 insertions(+), 3 deletions(-) create mode 100644 deploy/ai-studio/.env.example create mode 100644 deploy/ai-studio/Dockerfile create mode 100644 deploy/ai-studio/README.md create mode 100644 deploy/ai-studio/ai-studio-deployment.decision-log.md create mode 100644 deploy/ai-studio/docker-compose.yml create mode 100644 deploy/ai-studio/nginx/default.conf diff --git a/.dockerignore b/.dockerignore index 84a60d2c3..899987c08 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,18 +2,30 @@ .git .vscode .idea +.claude # external dependencies node_modules +**/node_modules # docker files docker-compose*.yml **/Dockerfile* +# build artifacts +dist/ +**/dist +coverage/ +**/coverage + # not needed files README.md tools/ !tools/deployment/nginx .gitignore -.env -coverage/ + +# env files hold secrets (e.g. 
OPENROUTER_API_KEY) and must never enter the +# build context — runtime config is injected via docker-compose `environment` +**/.env +**/.env.* +!**/.env.example diff --git a/CLAUDE.md b/CLAUDE.md index 5dd077b75..74d0ed0dd 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -42,6 +42,8 @@ Long-running processes already emit stable log lines that scripts and agents can ``` tools/ - Root dev scripts: preflight, setup:env, infra wait +deploy/ + ai-studio/ - Production deployment: Dockerfile (runtime/migrate/web), compose, nginx, README apps/ demo/ - Reference app consuming the SDK (React + Vite, port 4200) ai-studio/ - Reference AI workflow product (React + Vite, port 4201) diff --git a/apps/backend/package.json b/apps/backend/package.json index fec96ac60..3f1e1fb1a 100644 --- a/apps/backend/package.json +++ b/apps/backend/package.json @@ -6,6 +6,7 @@ "scripts": { "dev": "tsx watch --env-file=.env ./src/server.ts", "start": "tsx --env-file=.env ./src/server.ts", + "start:prod": "tsx ./src/server.ts", "typecheck": "tsc --noEmit", "lint": "eslint", "lint:fix": "eslint --fix", @@ -24,6 +25,7 @@ "drizzle-orm": "^0.44.0", "hono": "^4.7.0", "postgres": "^3.4.5", + "tsx": "^4.19.3", "zod": "^4.3.6" }, "devDependencies": { diff --git a/apps/execution-worker/package.json b/apps/execution-worker/package.json index 2a3d9aa58..1d394656e 100644 --- a/apps/execution-worker/package.json +++ b/apps/execution-worker/package.json @@ -6,6 +6,7 @@ "scripts": { "dev": "tsx watch --env-file=.env ./src/engines/temporal/worker.ts", "start": "tsx --env-file=.env ./src/engines/temporal/worker.ts", + "start:prod": "tsx ./src/engines/temporal/worker.ts", "typecheck": "tsc --noEmit", "lint": "eslint", "lint:fix": "eslint --fix", @@ -19,7 +20,8 @@ "@workflow-builder/execution-core": "workspace:*", "ai": "^6.0.0", "dotenv": "^17.4.2", - "postgres": "^3.4.5" + "postgres": "^3.4.5", + "tsx": "^4.19.3" }, "devDependencies": { "@types/node": "^22.12.0", diff --git a/deploy/ai-studio/.env.example 
b/deploy/ai-studio/.env.example new file mode 100644 index 000000000..b80d85932 --- /dev/null +++ b/deploy/ai-studio/.env.example @@ -0,0 +1,36 @@ +# Copy to .env next to docker-compose.yml and fill in. Everything except +# OPENROUTER_API_KEY has a working default. + +# --- required --------------------------------------------------------------- + +# Server-side only; never reaches the browser. Pair it with an OpenRouter +# account Guardrail (hard $/day ceiling) — see README "Spend safety". +OPENROUTER_API_KEY= + +# --- LLM -------------------------------------------------------------------- + +# WB-229 demo model. Cheap, EU-hosted, solid tool calling. +# ~$0.075/M input + $0.20/M output => ~$0.0004 per 3-call template run. +AI_MODEL=mistralai/mistral-small-3.2-24b-instruct + +# --- abuse gate (per-IP, execute route) --------------------------------------- + +RATE_LIMIT_EXECUTE_PER_MINUTE=10 +RATE_LIMIT_EXECUTE_PER_DAY=50 + +# --- network ------------------------------------------------------------------ + +# Where the web container publishes. Put your TLS terminator in front of it +# (or bind 127.0.0.1 and proxy from a host nginx/caddy). +WEB_BIND=0.0.0.0 +WEB_PORT=8080 + +# Leave empty: the SPA then calls /api on its own origin and nginx proxies +# it to the backend (no CORS, SSE intact). Only set this if the frontend is +# served from a different host than the backend. 
+VITE_BACKEND_URL= + +# --- databases (internal network only, not published) ------------------------- + +APP_DB_PASSWORD=wb +TEMPORAL_DB_PASSWORD=temporal diff --git a/deploy/ai-studio/Dockerfile b/deploy/ai-studio/Dockerfile new file mode 100644 index 000000000..a6fc07f67 --- /dev/null +++ b/deploy/ai-studio/Dockerfile @@ -0,0 +1,73 @@ +# syntax=docker/dockerfile:1 + +# AI Studio execution stack — single Dockerfile, multiple targets: +# +# runtime -> backend + execution-worker (command chosen per compose service) +# migrate -> one-shot Drizzle migration runner (needs backend devDependencies) +# web -> nginx serving the AI Studio SPA + reverse proxy to the backend +# +# Build context must be the repo root (workspace packages are linked via +# pnpm `workspace:*`), e.g.: +# +# docker build -f deploy/ai-studio/Dockerfile --target runtime . +# +# Node is pinned to the exact engines.node version because the workspace sets +# engineStrict=true. pnpm is installed via npm, not corepack — the corepack +# bundled with this Node release fails to load pnpm 10 +# (ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING) and ships stale signature keys. +# Keep the version in sync with `packageManager` in the root package.json. +FROM node:22.12.0-bookworm-slim AS base +ENV PNPM_HOME=/pnpm \ + PATH="/pnpm:$PATH" \ + # root `prepare` script runs husky, which needs the .git dir that is + # deliberately excluded from the build context + HUSKY=0 \ + npm_config_store_dir=/pnpm/store \ + CI=true +RUN npm install -g pnpm@10.17.0 +WORKDIR /app + +# Download every dependency from the lockfile alone, then bring in the +# source. Any source change invalidates only the layers below the COPY — +# the package store survives in the cache mount, so reinstalls are cheap. +FROM base AS source +COPY pnpm-lock.yaml pnpm-workspace.yaml ./ +RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store pnpm fetch +COPY . . + +# Production deps for backend + worker and their workspace dependencies +# (execution-core, types). 
Both apps run TS directly through tsx — the +# Temporal worker additionally requires its workflow TS source on disk at +# runtime (the workflow sandbox bundles it from source), so there is no +# build step to get wrong. +FROM source AS runtime +# root `prepare` runs husky, a devDependency that a --prod install doesn't +# have — drop the script inside the image (root scripts are unused at runtime) +RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ + npm pkg delete scripts.prepare && \ + pnpm install --frozen-lockfile --prefer-offline --prod \ + --filter backend... --filter execution-worker... +# command supplied by docker-compose: +# backend: pnpm --filter backend start:prod +# worker: pnpm --filter execution-worker start:prod + +# Migrations need drizzle-kit, a backend devDependency — hence a separate +# target with a dev install. Runs as a one-shot service before the backend. +FROM source AS migrate +RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ + pnpm install --frozen-lockfile --prefer-offline --filter backend... +CMD ["pnpm", "--filter", "backend", "db:migrate"] + +# The SPA build imports the SDK from source (vite alias), so this needs the +# full frontend dependency tree. VITE_BACKEND_URL is baked in at build time; +# the default (empty) makes the app call /api on its own origin, which the +# web target's nginx proxies to the backend. +FROM source AS frontend-build +ARG VITE_BACKEND_URL= +RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ + pnpm install --frozen-lockfile --prefer-offline --filter @workflow-builder/ai-studio... 
+RUN VITE_BACKEND_URL=$VITE_BACKEND_URL pnpm build:ai-studio + +FROM nginx:1.31-alpine AS web +COPY deploy/ai-studio/nginx/default.conf /etc/nginx/conf.d/default.conf +COPY --from=frontend-build /app/dist/apps/ai-studio /usr/share/nginx/html diff --git a/deploy/ai-studio/README.md b/deploy/ai-studio/README.md new file mode 100644 index 000000000..351e031de --- /dev/null +++ b/deploy/ai-studio/README.md @@ -0,0 +1,106 @@ +# Deploying AI Studio + +Self-contained, portable deployment of the AI Studio stack (WB-229). Runs on +any Docker host — an Azure VM, AWS, on-prem — with no cloud-specific glue. + +## What runs + +| Service | Image | Role | Exposed | +| ------------- | ------------------------------ | ----------------------------------------------- | ------------------------ | +| `web` | `ai-studio-web` (nginx) | Serves the SPA, proxies `/api` to the backend | `${WEB_PORT}` (only one) | +| `backend` | `ai-studio-runtime` | Hono REST + SSE event stream | internal | +| `worker` | `ai-studio-runtime` | Temporal worker, makes the OpenRouter LLM calls | internal | +| `migrate` | `ai-studio-migrate` | One-shot Drizzle migrations, then exits | internal | +| `temporal` | `temporalio/auto-setup` pinned | Workflow engine | internal | +| `app-db` | `postgres:16` | Workflow snapshots + execution events | internal | +| `temporal-db` | `postgres:16` | Temporal's own state store | internal | +| `temporal-ui` | `temporalio/ui` pinned | Debug only (`--profile debug`) | `127.0.0.1:8233` | + +All images build from one Dockerfile (`deploy/ai-studio/Dockerfile`) with the +repo root as context. Backend and worker share a single image and differ only +in the compose `command`. + +## Quick start + +```bash +cd deploy/ai-studio +cp .env.example .env # set OPENROUTER_API_KEY +docker compose up -d --build +``` + +First boot: migrations run automatically (`migrate` exits 0, then the backend +starts). 
The worker crash-loops for ~30s until Temporal finishes auto-setup — +that's expected, `restart: unless-stopped` converges it. + +Verify: + +```bash +curl -s http://localhost:8080/api/health # {"status":"ok"} +# open http://localhost:8080, run the "Sales Inquiry Pipeline" template +``` + +## Spend safety (do not skip) + +Two independent controls; both must be in place before the URL goes public: + +1. **OpenRouter Guardrail** (hard $/day ceiling, no code involved): + [openrouter.ai](https://openrouter.ai) → Settings → Guardrails → daily + spend limit, e.g. **$5/day** (resets 00:00 UTC). When hit, OpenRouter + rejects calls and the demo pauses — it cannot overspend. Keep the account + balance low (~$20) as the absolute ceiling. +2. **Per-IP rate limit** (already on in this compose): defaults to 10 + executions/min and 50/day per IP, tunable via + `RATE_LIMIT_EXECUTE_PER_MINUTE` / `RATE_LIMIT_EXECUTE_PER_DAY`. In-memory, + single-replica by design; counters reset on backend restart. + +With the example $5/day Guardrail, a worst-case day that exhausts it costs $5; a typical +3-LLM-call template run on Mistral Small 3.2 costs ~$0.0004. + +## TLS / going public + +The `web` container speaks plain HTTP on the internal port. Pick one: + +- **Existing ingress** (Azure Application Gateway / Front Door, an nginx that + already routes your other web apps, …): point it at `WEB_PORT`, set + `WEB_BIND=127.0.0.1` if the ingress runs on the same host. SSE caveat: the + ingress must not buffer `/api/executions/*/stream` responses and needs a + read timeout above 60s (the stream heartbeats every 15s). +- **Standalone VM**: run a host-level [Caddy](https://caddyserver.com) + (`reverse_proxy localhost:8080` — automatic Let's Encrypt, SSE-safe out of + the box) or certbot'd nginx in front, and firewall everything except + 80/443. + +Keep 8233 (Temporal UI) and the Postgres ports unreachable from outside — +this compose never publishes them; don't undo that. 
+ +## Configuration + +See [.env.example](.env.example) — every variable is documented there. +Swapping the LLM is a one-liner: change `AI_MODEL` to any +[OpenRouter model id](https://openrouter.ai/models) and +`docker compose up -d worker`. + +## Operations + +```bash +docker compose logs -f backend worker # tail the apps +docker compose --profile debug up -d # Temporal UI on 127.0.0.1:8233 +docker compose up -d --build # deploy a new version (re-runs migrations) +docker compose down # stop (volumes survive) +docker exec ai-studio-app-db-1 pg_dump -U wb workflow_builder > backup.sql +``` + +Workflow data is treated as ephemeral for the public demo — losing the +volumes is acceptable; there is nothing precious in them. + +## Known limitations (accepted for the lean MVP) + +- **No login.** The API is open (`WB_AUTH_PORT=allow-all`); anyone with the + URL can create and run workflows within the rate limits. The SDK has an + `AuthPort` seam for wiring real auth later. +- **Single backend replica.** The rate limiter is process-local. Scaling out + needs a shared store (Redis) — deferred to the scale-ready task. +- **`temporalio/auto-setup` is dev-grade.** Fine for a demo; move to Temporal + Cloud or an operated cluster for sustained load. +- **Anyone-can-edit demo content.** Visitors share one workspace; data is + wiped whenever you decide to recreate the volumes. 
diff --git a/deploy/ai-studio/ai-studio-deployment.decision-log.md b/deploy/ai-studio/ai-studio-deployment.decision-log.md new file mode 100644 index 000000000..a56e411f3 --- /dev/null +++ b/deploy/ai-studio/ai-studio-deployment.decision-log.md @@ -0,0 +1,140 @@ +### Title: Containerized AI Studio deployment — portable compose stack + +### Proposed by: Jan Librowski + +### Date: 10.06.2026 + +## Context + +WB-229 (lean public demo on an Azure VM) and its parent WB-155 (deployment +preparations) needed a production deployment story for the AI Studio +execution stack: backend (Hono), execution-worker (Temporal), two Postgres +instances, a Temporal server, and the static SPA. Until now only `pnpm dev` +plus an infra-only compose existed — no Dockerfiles for any app. + +Constraints that shaped the design: + +- **Portability over Azure ergonomics.** Workflow Builder is sold to external + customers; whatever ships here must run on AWS / GCP / on-prem / bare + Docker without re-architecting. DevOps asked for containerization + specifically for ease of portability and setup. +- **Surprise bills must be impossible** (WB-229): a hard OpenRouter spend cap + (dashboard Guardrail) plus an in-app per-IP abuse gate. +- **The local dev flow must survive** (`pnpm dev:ai-studio` + `pnpm +infra:up`) — contributors rely on it; nothing in dev changes. +- The repo pins Node 22.12.0 + pnpm 10.17.0 with `engineStrict`, and the + Temporal worker bundles its workflow entrypoint **from TS source at + runtime**, so the source tree must be present in the worker container. + +## Decision + +Everything lives in `deploy/ai-studio/`: one multi-target Dockerfile, a +production `docker-compose.yml`, the nginx config, `.env.example`, and a +DevOps-facing README. + +1. **One Dockerfile, three targets** (`runtime`, `migrate`, `web`), built + with the repo root as context (pnpm `workspace:*` links require it). 
A + shared `source` stage does `pnpm fetch` against a BuildKit cache mount, so + per-target installs are store-hits. +2. **tsx in production, no build step.** Backend and worker run TS through + `tsx` exactly as in dev — `tsx` moved from a hoisted root devDependency to + a real dependency of both apps, plus `start:prod` scripts (the existing + `start` scripts hard-require a `.env` file; containers inject env + directly). This sidesteps the Temporal-sandbox-needs-source constraint + entirely — there is no bundling step to get wrong. +3. **One shared `runtime` image for backend and worker**; the compose + `command` picks the entrypoint. One image to build, push, and version. +4. **Migrations as a one-shot compose service** (`migrate` target, carries + drizzle-kit as a backend devDependency). `depends_on: +service_completed_successfully` gates the backend, so `docker compose up` + is a complete first boot. Same answer works as a k8s Job / ACA job if a + customer reshapes the topology. +5. **nginx is the only public surface.** It serves the SPA and proxies + `/api` to the backend on the internal network; the SSE stream route gets + `proxy_buffering off` + long read timeout. The backend container is + reached through Docker's embedded DNS **re-resolved per request** + (`resolver 127.0.0.11` + variable `proxy_pass`) — a statically resolved + upstream 502s after the backend container is recreated on redeploy. + Postgres ×2, Temporal, and the backend publish no host ports; Temporal UI + is opt-in behind a `debug` profile bound to loopback. TLS terminates in + front (existing ingress or host-level Caddy/certbot — documented in the + README, deliberately not baked into the stack). +6. **Same-origin frontend.** `VITE_BACKEND_URL` is baked empty at build time; + the SPA calls `/api` on its own origin. No CORS, no second hostname, SSE + intact. +7. 
**pnpm installed via `npm i -g pnpm@10.17.0` in images, not corepack.** + The corepack bundled with Node 22.12.0 cannot load pnpm 10 + (`ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING`) and ships stale signature + keys. Version is duplicated in the Dockerfile — keep in sync with + `packageManager`. +8. **Installs use `--prefer-offline`, not `--offline`**: pnpm propagates + offline mode to lifecycle scripts, and `apps/icons` `prepare` shells out + to `npx @svgr/cli`, which then refuses the network (`ENOTCACHED`). +9. **Per-IP rate limit on the execute route** (`apps/backend`): + fixed-window, in-memory, env-gated (`RATE_LIMIT_EXECUTE_PER_MINUTE/DAY`, + default off so dev is untouched; compose sets 10/min, 50/day). + `TRUST_PROXY=true` makes it read the client from `X-Forwarded-For`, which + only our nginx can set. This is the abuse gate; the money cap is the + OpenRouter account Guardrail — two independent controls. +10. **Model pinned per environment, not in code**: compose defaults + `AI_MODEL=mistralai/mistral-small-3.2-24b-instruct` (price re-verified + 2026-06-10 against the OpenRouter API: $0.075/$0.20 per Mtok ≈ $0.0004 + per 3-call template run). Swapping models is an env change. +11. **Pinned images, no `:latest`**: `temporalio/auto-setup:1.29.6.1`, + `temporalio/ui:2.51.0`, `nginx:1.31-alpine`, `node:22.12.0-bookworm-slim` + (exact pin because `engineStrict` rejects any other 22.x). + +Found and fixed during end-to-end verification: the worker ignored +`TEMPORAL_ADDRESS` (`Worker.create` without an explicit connection dials +`127.0.0.1:7233` — invisible in local dev, fatal in containers). + +## Alternative Options Considered + +- **`pnpm deploy` to materialize standalone app bundles** — rejected: pnpm 10 + requires `inject-workspace-packages` or a legacy-mode flag, adding workspace + config churn for no benefit over running from the installed workspace. 
+- **Compile step (tsc/tsup/esbuild) + plain `node`** — rejected for the MVP: + the worker needs its TS source on disk for Temporal's runtime bundling + anyway, so compilation only helps the backend while doubling the ways the + artifact can diverge from dev. Revisit if image size or cold-start matters. +- **Azure-specific artifacts (Container Apps / AKS manifests, Key Vault + wiring)** — deferred deliberately: WB-229 targets a single VM, and the + portability requirement says external customers must not inherit Azure + glue. The compose file is the customer-facing artifact; platform topology + can wrap it later. +- **Separate Dockerfiles per app** — rejected: three near-identical + install stages to keep in sync; the multi-target file shares layers. +- **Rate limiting in nginx (`limit_req`)** — rejected: the limit is + per-execute-route and needs structured JSON 429s consistent with the + backend's error contract; nginx zones would split the policy across two + layers. nginx stays dumb, policy lives where the route lives. +- **Redis-backed rate limiter** — deferred to the scale-ready task (WB-229 + explicitly accepts single-replica in-memory for the MVP). + +## Consequences + +- **Pros** + - `cp .env.example .env && docker compose up -d --build` is the whole + deployment; verified end-to-end (Sales Inquiry Pipeline to + `execution_completed` with live SSE through nginx, rate limiter returning + 429s past the budget). + - The artifact is platform-neutral: any Docker host, no cloud SDK anywhere. + - Secrets only travel through compose `environment`; `.dockerignore` now + excludes `**/.env*` so keys cannot be baked into images (previously + `apps/*/.env` files would have been copied into the build context). + - Dev flow untouched; rate limiter is inert without its env vars. +- **Cons** + - `runtime` image is ~1.9 GB (full source tree + pnpm store hardlinks + + Temporal native bridge). 
Acceptable for a demo VM; a compile step or + `pnpm deploy` bundle is the known optimization path. + - Any source change invalidates the `COPY . .` layer and reinstalls + (mitigated by the store cache mount; rebuilds take minutes, not tens of minutes). + - `temporalio/auto-setup` is dev-grade by Temporal's own docs — accepted + for the demo, swap for Temporal Cloud / operated cluster under sustained + load (the apps only consume `TEMPORAL_ADDRESS`). + - pnpm version is pinned in two places (root `packageManager` + + Dockerfile). + +## Status + +Accepted diff --git a/deploy/ai-studio/docker-compose.yml b/deploy/ai-studio/docker-compose.yml new file mode 100644 index 000000000..52d7dff1a --- /dev/null +++ b/deploy/ai-studio/docker-compose.yml @@ -0,0 +1,164 @@ +# AI Studio execution stack — production-shaped compose (WB-229 lean MVP). +# +# cp .env.example .env # set OPENROUTER_API_KEY +# docker compose up -d --build +# +# Only the `web` service publishes a port. Postgres, Temporal and the +# backend stay on the internal network. Temporal UI is opt-in via the +# `debug` profile and binds to loopback only. + +name: ai-studio + +x-runtime-build: &runtime-build + context: ../..
+ dockerfile: deploy/ai-studio/Dockerfile + target: runtime + +services: + app-db: + image: postgres:16 + environment: + POSTGRES_DB: workflow_builder + POSTGRES_USER: wb + POSTGRES_PASSWORD: ${APP_DB_PASSWORD:-wb} + volumes: + - app-db-data:/var/lib/postgresql/data + healthcheck: + test: ['CMD', 'pg_isready', '-U', 'wb', '-d', 'workflow_builder'] + interval: 5s + timeout: 3s + retries: 12 + restart: unless-stopped + + temporal-db: + image: postgres:16 + environment: + POSTGRES_DB: temporal + POSTGRES_USER: temporal + POSTGRES_PASSWORD: ${TEMPORAL_DB_PASSWORD:-temporal} + volumes: + - temporal-db-data:/var/lib/postgresql/data + healthcheck: + test: ['CMD', 'pg_isready', '-U', 'temporal', '-d', 'temporal'] + interval: 5s + timeout: 3s + retries: 12 + restart: unless-stopped + + # auto-setup is Temporal's dev-grade single-binary image. Accepted for the + # demo (WB-229); a sustained-load deployment should move to Temporal Cloud + # or a properly operated self-hosted cluster — the apps only consume + # TEMPORAL_ADDRESS and don't care which. + temporal: + image: temporalio/auto-setup:1.29.6.1 + depends_on: + temporal-db: + condition: service_healthy + environment: + DB: postgres12 + DB_PORT: 5432 + POSTGRES_USER: temporal + POSTGRES_PWD: ${TEMPORAL_DB_PASSWORD:-temporal} + POSTGRES_SEEDS: temporal-db + restart: unless-stopped + + temporal-ui: + image: temporalio/ui:2.51.0 + profiles: [debug] + depends_on: + - temporal + environment: + TEMPORAL_ADDRESS: temporal:7233 + ports: + - '127.0.0.1:8233:8080' + restart: unless-stopped + + migrate: + image: ai-studio-migrate + build: + context: ../.. 
+ dockerfile: deploy/ai-studio/Dockerfile + target: migrate + environment: + DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder + depends_on: + app-db: + condition: service_healthy + restart: 'no' + + backend: + image: ai-studio-runtime + build: *runtime-build + command: ['pnpm', '--filter', 'backend', 'start:prod'] + environment: + HOST: 0.0.0.0 + PORT: 3001 + DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder + TEMPORAL_ADDRESS: temporal:7233 + # Reference deployment has no user accounts; the explicit opt-in keeps + # a forgotten env var from silently exposing an unauthenticated API. + WB_AUTH_PORT: allow-all + # Backend is only reachable through the web service's nginx, which + # sets X-Forwarded-For — safe to trust for per-IP rate limiting. + TRUST_PROXY: 'true' + RATE_LIMIT_EXECUTE_PER_MINUTE: ${RATE_LIMIT_EXECUTE_PER_MINUTE:-10} + RATE_LIMIT_EXECUTE_PER_DAY: ${RATE_LIMIT_EXECUTE_PER_DAY:-50} + depends_on: + app-db: + condition: service_healthy + migrate: + condition: service_completed_successfully + temporal: + condition: service_started + healthcheck: + test: + [ + 'CMD', + 'node', + '-e', + "fetch('http://127.0.0.1:3001/api/health').then((r) => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))", + ] + interval: 10s + timeout: 5s + retries: 6 + start_period: 15s + restart: unless-stopped + + # Crash-loops until Temporal answers on 7233 (auto-setup has no usable + # healthcheck); `restart: unless-stopped` converges it. 
+ worker: + image: ai-studio-runtime + build: *runtime-build + command: ['pnpm', '--filter', 'execution-worker', 'start:prod'] + environment: + DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder + TEMPORAL_ADDRESS: temporal:7233 + OPENROUTER_API_KEY: ${OPENROUTER_API_KEY:?set OPENROUTER_API_KEY in deploy/ai-studio/.env} + AI_MODEL: ${AI_MODEL:-mistralai/mistral-small-3.2-24b-instruct} + depends_on: + app-db: + condition: service_healthy + migrate: + condition: service_completed_successfully + temporal: + condition: service_started + restart: unless-stopped + + web: + image: ai-studio-web + build: + context: ../.. + dockerfile: deploy/ai-studio/Dockerfile + target: web + args: + # Empty -> SPA calls /api on its own origin, proxied by this nginx. + VITE_BACKEND_URL: ${VITE_BACKEND_URL:-} + ports: + - '${WEB_BIND:-0.0.0.0}:${WEB_PORT:-8080}:80' + depends_on: + - backend + restart: unless-stopped + +volumes: + app-db-data: + temporal-db-data: diff --git a/deploy/ai-studio/nginx/default.conf b/deploy/ai-studio/nginx/default.conf new file mode 100644 index 000000000..396c1dd6c --- /dev/null +++ b/deploy/ai-studio/nginx/default.conf @@ -0,0 +1,63 @@ +# AI Studio — SPA + API reverse proxy. +# +# This container is the only public surface of the stack. TLS is expected to +# terminate in front of it (cloud ingress / load balancer / a host-level +# certbot'd nginx) — see deploy/ai-studio/README.md for the options. + +server { + listen 80; + server_name _; + + root /usr/share/nginx/html; + index index.html; + + # Resolve the backend through Docker's embedded DNS on every request + # (via the variable indirection below) instead of once at startup — + # otherwise recreating the backend container leaves nginx proxying to a + # stale IP and every /api call 502s until this container restarts too. 
+ resolver 127.0.0.11 valid=10s ipv6=off; + set $backend_upstream http://backend:3001; + + gzip on; + gzip_types text/css application/javascript application/json image/svg+xml; + + # Backend caps request bodies at 1 MB and answers with a structured + # error; keep nginx's own limit above it so the backend owns that path. + client_max_body_size 2m; + + # Live execution streams over SSE: hold the connection open, never + # buffer, and outlast the backend's 15s heartbeat interval. + location ~ ^/api/executions/.+/stream$ { + proxy_pass $backend_upstream; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Connection ''; + proxy_buffering off; + proxy_cache off; + proxy_read_timeout 1h; + gzip off; + } + + location /api/ { + proxy_pass $backend_upstream; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # Vite emits content-hashed filenames under /assets — cache forever. 
+ location /assets/ { + add_header Cache-Control "public, max-age=31536000, immutable"; + try_files $uri =404; + } + + # SPA fallback + location / { + try_files $uri /index.html; + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 247fdd644..8b5384f7c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -174,6 +174,9 @@ importers: postgres: specifier: ^3.4.5 version: 3.4.9 + tsx: + specifier: ^4.19.3 + version: 4.21.0 zod: specifier: ^4.3.6 version: 4.3.6 @@ -366,6 +369,9 @@ importers: postgres: specifier: ^3.4.5 version: 3.4.9 + tsx: + specifier: ^4.19.3 + version: 4.21.0 devDependencies: '@types/node': specifier: ^22.12.0 From 1cfb024b51f66043800326aa15ca07119c984662 Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Wed, 10 Jun 2026 03:15:15 +0200 Subject: [PATCH 05/11] feat(deploy): add swarm overlay aligned with workflow-builder infra MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tools/deployment/ mirrors the workflow-builder repo's deployment path (build-docker.sh, deploy.sh, ansible deploy-application playbook) and consumes the same three images from deploy/ai-studio/Dockerfile — only the orchestration layer differs. Deviations forced by AI Studio being stateful: node-pinned volumes for Postgres/Temporal, post-deploy migration step (Swarm ignores depends_on), attachable internal network with short DNS aliases, and an AUTH_ENABLED-gated gatekeeper so the public demo stays login-free. Stack template render-verified in both auth modes; status 'Proposed' pending the DevOps conversation. 
--- CLAUDE.md | 1 + deploy/ai-studio/README.md | 4 + tools/deployment/README.md | 78 ++++++ .../ansible/deploy-application/main.yml | 241 ++++++++++++++++++ tools/deployment/scripts/build-docker.sh | 38 +++ tools/deployment/scripts/deploy.sh | 11 + .../swarm-alignment.decision-log.md | 84 ++++++ 7 files changed, 457 insertions(+) create mode 100644 tools/deployment/README.md create mode 100644 tools/deployment/ansible/deploy-application/main.yml create mode 100755 tools/deployment/scripts/build-docker.sh create mode 100755 tools/deployment/scripts/deploy.sh create mode 100644 tools/deployment/swarm-alignment.decision-log.md diff --git a/CLAUDE.md b/CLAUDE.md index 74d0ed0dd..2a2e2bb31 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -42,6 +42,7 @@ Long-running processes already emit stable log lines that scripts and agents can ``` tools/ - Root dev scripts: preflight, setup:env, infra wait + deployment/ - Swarm/Ansible deploy path mirroring the workflow-builder repo (ACR, Traefik) deploy/ ai-studio/ - Production deployment: Dockerfile (runtime/migrate/web), compose, nginx, README apps/ diff --git a/deploy/ai-studio/README.md b/deploy/ai-studio/README.md index 351e031de..cc149a182 100644 --- a/deploy/ai-studio/README.md +++ b/deploy/ai-studio/README.md @@ -3,6 +3,10 @@ Self-contained, portable deployment of the AI Studio stack (WB-229). Runs on any Docker host — an Azure VM, AWS, on-prem — with no cloud-specific glue. +> Deploying onto the company Swarm cluster instead? See +> [`tools/deployment/`](../../tools/deployment/README.md) — same images, +> Traefik/ACR/Ansible orchestration aligned with the workflow-builder repo. 
+ ## What runs | Service | Image | Role | Exposed | diff --git a/tools/deployment/README.md b/tools/deployment/README.md new file mode 100644 index 000000000..432e90faa --- /dev/null +++ b/tools/deployment/README.md @@ -0,0 +1,78 @@ +# Swarm deployment (workflow-builder-aligned) + +Deploys AI Studio onto the company Docker Swarm cluster on Azure, following +the same layout, scripts, and Ansible flow as the `workflow-builder` repo's +`tools/deployment/` — so DevOps operates one familiar shape. + +This is an **orchestration overlay, not a second deployment**: it consumes +the exact same three images (`runtime`, `migrate`, `web`) built from +[`deploy/ai-studio/Dockerfile`](../../deploy/ai-studio/Dockerfile). The +compose file in `deploy/ai-studio/` remains the portable, customer-facing +artifact and the local full-stack runner; this directory adds the +ACR + Traefik + Ansible path for our own infrastructure. + +``` +tools/deployment/ +├── scripts/ +│ ├── build-docker.sh # build all 3 targets, tag for ACR, push (CI-gated) +│ └── deploy.sh # run the Ansible playbook (CI image or workstation) +└── ansible/deploy-application/ + └── main.yml # writes the Swarm stack file on the master + deploys + migrates +``` + +## Usage + +```bash +# build + push images (from repo root) +DEPLOY_ENV=dev ./tools/deployment/scripts/build-docker.sh + +# deploy the stack (needs az login + ansible inventory with the `master` host) +DEPLOY_ENV=dev DEPLOYMENT_URL=ai-studio.example.com OPENROUTER_API_KEY=sk-... \ + ./tools/deployment/scripts/deploy.sh +``` + +Bitbucket-style variables (`BITBUCKET_COMMIT`, `BITBUCKET_DEPLOYMENT_ENVIRONMENT`, +`TAG_PREFIX`) take precedence when present, so the scripts drop into the +existing CI pattern unchanged; the fallbacks (`git rev-parse`, `DEPLOY_ENV`) +make them runnable from a workstation or GitHub Actions. 
+ +## Configuration + +| Variable | Required | Default | Purpose | +| ---------------------------------------------------------------------------------------- | --------- | ------------------------------------------ | ------------------------------------------------------- | +| `DEPLOYMENT_URL` | yes | — | Public hostname, drives Traefik routing + TLS | +| `OPENROUTER_API_KEY` | yes | — | Worker-side LLM key (pair with an OpenRouter Guardrail) | +| `DEPLOY_ENV` / `BITBUCKET_DEPLOYMENT_ENVIRONMENT` | no | `dev` | Stack/environment suffix | +| `AI_MODEL` | no | `mistralai/mistral-small-3.2-24b-instruct` | OpenRouter model id | +| `RATE_LIMIT_EXECUTE_PER_MINUTE` / `_DAY` | no | `10` / `50` | Per-IP abuse gate | +| `APP_DB_PASSWORD`, `TEMPORAL_DB_PASSWORD` | no | dev defaults | Internal-network Postgres credentials | +| `AUTH_ENABLED` | no | `false` | Put the gatekeeper OIDC proxy in front (internal envs) | +| `AUTH_DISCOVERY_URL`, `AUTH_CLIENT_ID`, `AUTH_SECRET`, `AUTH_COOKIE_SECRET`, `AUTH_ROLE` | when auth | — | Gatekeeper config, same names as workflow-builder | +| `REGISTRY` | no | `synergycodes.azurecr.io` | Image registry | + +## What differs from the workflow-builder playbook (and why) + +| Deviation | Reason | +| ------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | +| Postgres ×2 + Temporal services with named volumes, pinned via `node.labels.ai-studio-data==true` | AI Studio is stateful; Swarm volumes are node-local. 
**One-time setup:** `docker node update --label-add ai-studio-data=true <node>` | +| Migrations run post-deploy as a one-shot `docker run` on the stack network (with retries) | Swarm ignores compose `depends_on` conditions | +| `internal` network is `attachable: true` | Lets the migrate container join the overlay | +| Services carry short DNS aliases (`backend`, `app-db`, `temporal`, …) | The web image's nginx proxies to `http://backend:3001`; aliases keep the images and env defaults identical between compose and Swarm | +| Gatekeeper is conditional (`AUTH_ENABLED`) | The WB-229 public demo is deliberately login-free; internal instances can keep SSO | + +SSE note: Traefik streams responses by default, so the live execution stream +works without special ingress config; the 15 s backend heartbeat keeps the +connection alive. + +## Open items for DevOps + +- **Stateful workloads on the cluster** — this would be the first; the + alternative is a dedicated VM running `deploy/ai-studio/docker-compose.yml` + as-is, or managed Azure Postgres. +- **CI home** — this repo lives on GitHub; the existing deploy machinery + (deployment CI image, `setup-az.sh`, Ansible inventory) is Bitbucket-side. + First deploys can run from a workstation. +- **Secrets in the stack file** — the playbook writes env values (incl. the + OpenRouter key) into the stack yml on the Swarm master, same as the + existing workflow-builder flow. Docker Swarm secrets would be stricter; + kept aligned for now. diff --git a/tools/deployment/ansible/deploy-application/main.yml b/tools/deployment/ansible/deploy-application/main.yml new file mode 100644 index 000000000..2d8a4fc2d --- /dev/null +++ b/tools/deployment/ansible/deploy-application/main.yml @@ -0,0 +1,241 @@ +--- +# Deploys the AI Studio execution stack to the Docker Swarm cluster, +# following the workflow-builder repo's deploy-application playbook.
The +# images are the same three targets the local compose builds +# (deploy/ai-studio/Dockerfile); only the orchestration differs. +# +# Differences from the workflow-builder playbook, all forced by AI Studio +# being stateful: +# - Postgres x2 + Temporal services with named volumes, pinned to the node +# labeled `ai-studio-data=true` (Swarm volumes are node-local). +# - Migrations run as a one-shot container after stack deploy — Swarm +# ignores compose depends_on conditions, so ordering lives here. +# - The `internal` network is attachable so the migrate container can join. +# - Services get short DNS aliases (backend, app-db, temporal, ...) so the +# same images and env defaults work under compose and Swarm. +# - Gatekeeper is optional (AUTH_ENABLED=true): the WB-229 public demo is +# deliberately login-free; internal stage/dev instances can enable it. + +- hosts: master + + vars: + deployment_environment: "{{ lookup('env', 'BITBUCKET_DEPLOYMENT_ENVIRONMENT') or lookup('env', 'DEPLOY_ENV') or 'dev' }}" + tag_prefix: "{{ lookup('env', 'TAG_PREFIX') }}" + bb_commit: "{{ lookup('env', 'BITBUCKET_COMMIT') or lookup('pipe', 'git rev-parse HEAD') }}" + app_name: ai-studio + registry: "{{ lookup('env', 'REGISTRY') or 'synergycodes.azurecr.io' }}" + deployment_url: "{{ lookup('env', 'DEPLOYMENT_URL') }}" + image_tag: '{{ tag_prefix }}{{ bb_commit }}' + stack_name: '{{ app_name }}--{{ deployment_environment }}' + auth_enabled: "{{ (lookup('env', 'AUTH_ENABLED') or 'false') | bool }}" + openrouter_api_key: "{{ lookup('env', 'OPENROUTER_API_KEY') }}" + ai_model: "{{ lookup('env', 'AI_MODEL') or 'mistralai/mistral-small-3.2-24b-instruct' }}" + app_db_password: "{{ lookup('env', 'APP_DB_PASSWORD') or 'wb' }}" + temporal_db_password: "{{ lookup('env', 'TEMPORAL_DB_PASSWORD') or 'temporal' }}" + database_url: 'postgresql://wb:{{ app_db_password }}@app-db:5432/workflow_builder' + + tasks: + - name: Check required configuration + assert: + that: + - deployment_url | length > 0 
+ - openrouter_api_key | length > 0 + fail_msg: 'DEPLOYMENT_URL and OPENROUTER_API_KEY must be set' + + - name: Create directory for service data + file: + path: '/mnt/docker-swarm-storage/stacks/{{ stack_name }}' + state: directory + + - name: Create stack definition + copy: + dest: '/mnt/docker-swarm-storage/stacks/{{ stack_name }}/{{ app_name }}.stack.yml' + content: | + services: + {% if auth_enabled %} + ai-studio-gatekeeper--{{ deployment_environment }}: + image: '{{ registry }}/gatekeeper:2.1.1' + environment: + PROXY_LISTEN: :4200 + PROXY_UPSTREAM_URL: http://web + PROXY_DISCOVERY_URL: "{{ lookup('env', 'AUTH_DISCOVERY_URL') }}" + PROXY_CLIENT_ID: "{{ lookup('env', 'AUTH_CLIENT_ID') }}" + PROXY_CLIENT_SECRET: "{{ lookup('env', 'AUTH_SECRET') }}" + PROXY_ENCRYPTION_KEY: "{{ lookup('env', 'AUTH_COOKIE_SECRET') }}" + PROXY_REDIRECTION_URL: 'https://{{ deployment_url }}' + command: + - '-enable-default-deny=false' + - "-resources=uri=/*|roles={{ lookup('env', 'AUTH_ROLE') }}" + networks: + traefik-host-external: + internal: + aliases: [gatekeeper] + deploy: + placement: + constraints: + - node.role==worker + labels: + - 'traefik.enable=true' + - 'traefik.docker.network=traefik-host-external' + - 'traefik.http.routers.{{ stack_name }}-http.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)' + - 'traefik.http.routers.{{ stack_name }}-http.entrypoints=http' + - 'traefik.http.routers.{{ stack_name }}-http.middlewares=https-redirect' + - 'traefik.http.routers.{{ stack_name }}-https.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)' + - 'traefik.http.routers.{{ stack_name }}-https.entrypoints=https' + - 'traefik.http.routers.{{ stack_name }}-https.tls=true' + - 'traefik.http.routers.{{ stack_name }}-https.tls.certresolver=le' + - 'traefik.http.services.{{ stack_name }}.loadbalancer.server.port=4200' + {% endif %} + + ai-studio-web--{{ deployment_environment }}: + image: '{{ registry }}/{{ app_name }}:web-{{ image_tag }}' + networks: + {% if not auth_enabled %} + 
traefik-host-external: + {% endif %} + internal: + aliases: [web] + deploy: + placement: + constraints: + - node.role==worker + {% if not auth_enabled %} + labels: + - 'traefik.enable=true' + - 'traefik.docker.network=traefik-host-external' + - 'traefik.http.routers.{{ stack_name }}-http.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)' + - 'traefik.http.routers.{{ stack_name }}-http.entrypoints=http' + - 'traefik.http.routers.{{ stack_name }}-http.middlewares=https-redirect' + - 'traefik.http.routers.{{ stack_name }}-https.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)' + - 'traefik.http.routers.{{ stack_name }}-https.entrypoints=https' + - 'traefik.http.routers.{{ stack_name }}-https.tls=true' + - 'traefik.http.routers.{{ stack_name }}-https.tls.certresolver=le' + - 'traefik.http.services.{{ stack_name }}.loadbalancer.server.port=80' + {% endif %} + + ai-studio-backend--{{ deployment_environment }}: + image: '{{ registry }}/{{ app_name }}:runtime-{{ image_tag }}' + command: ['pnpm', '--filter', 'backend', 'start:prod'] + environment: + HOST: 0.0.0.0 + PORT: 3001 + DATABASE_URL: '{{ database_url }}' + TEMPORAL_ADDRESS: temporal:7233 + WB_AUTH_PORT: allow-all + TRUST_PROXY: 'true' + RATE_LIMIT_EXECUTE_PER_MINUTE: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_MINUTE') or '10' }}" + RATE_LIMIT_EXECUTE_PER_DAY: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_DAY') or '50' }}" + networks: + internal: + # the web image's nginx proxies to http://backend:3001 + aliases: [backend] + deploy: + placement: + constraints: + - node.role==worker + + ai-studio-worker--{{ deployment_environment }}: + image: '{{ registry }}/{{ app_name }}:runtime-{{ image_tag }}' + command: ['pnpm', '--filter', 'execution-worker', 'start:prod'] + environment: + DATABASE_URL: '{{ database_url }}' + TEMPORAL_ADDRESS: temporal:7233 + OPENROUTER_API_KEY: '{{ openrouter_api_key }}' + AI_MODEL: '{{ ai_model }}' + networks: + internal: + deploy: + placement: + constraints: + - node.role==worker + + 
ai-studio-app-db--{{ deployment_environment }}: + image: 'postgres:16' + environment: + POSTGRES_DB: workflow_builder + POSTGRES_USER: wb + POSTGRES_PASSWORD: '{{ app_db_password }}' + volumes: + - app-db-data:/var/lib/postgresql/data + networks: + internal: + aliases: [app-db] + deploy: + placement: + constraints: + - node.labels.ai-studio-data==true + + ai-studio-temporal-db--{{ deployment_environment }}: + image: 'postgres:16' + environment: + POSTGRES_DB: temporal + POSTGRES_USER: temporal + POSTGRES_PASSWORD: '{{ temporal_db_password }}' + volumes: + - temporal-db-data:/var/lib/postgresql/data + networks: + internal: + aliases: [temporal-db] + deploy: + placement: + constraints: + - node.labels.ai-studio-data==true + + ai-studio-temporal--{{ deployment_environment }}: + image: 'temporalio/auto-setup:1.29.6.1' + environment: + DB: postgres12 + DB_PORT: 5432 + POSTGRES_USER: temporal + POSTGRES_PWD: '{{ temporal_db_password }}' + POSTGRES_SEEDS: temporal-db + networks: + internal: + aliases: [temporal] + deploy: + placement: + constraints: + - node.role==worker + + volumes: + app-db-data: + temporal-db-data: + + networks: + internal: + # attachable so the one-shot migrate container below can join + attachable: true + traefik-host-external: + external: true + + - name: Ensure Azure CLI is setup + shell: /var/az-autologin.sh + + - name: Ensure jsondiff is installed (required by community.docker.docker_stack) + ansible.builtin.pip: + name: jsondiff + + - name: Deploy stack + community.docker.docker_stack: + state: present + name: '{{ stack_name }}' + resolve_image: 'always' + prune: true + with_registry_auth: yes + compose: + - '/mnt/docker-swarm-storage/stacks/{{ stack_name }}/{{ app_name }}.stack.yml' + + # Swarm has no depends_on / one-shot service semantics: run Drizzle + # migrations as a plain container on the stack's attachable overlay + # network. Retries cover app-db still starting up on first deploy. 
+ - name: Run database migrations + command: > + docker run --rm + --network {{ stack_name }}_internal + -e DATABASE_URL={{ database_url }} + {{ registry }}/{{ app_name }}:migrate-{{ image_tag }} + pnpm --filter backend db:migrate + register: migrate_result + retries: 10 + delay: 6 + until: migrate_result.rc == 0 diff --git a/tools/deployment/scripts/build-docker.sh b/tools/deployment/scripts/build-docker.sh new file mode 100755 index 000000000..8295b05d5 --- /dev/null +++ b/tools/deployment/scripts/build-docker.sh @@ -0,0 +1,38 @@ +#!/bin/sh +# Build + push the AI Studio images to ACR, mirroring the workflow-builder +# repo's tools/deployment/scripts/build-docker.sh. All three images come from +# the same multi-target Dockerfile in deploy/ai-studio/ — this script only +# adds registry tagging; the images are identical to the local-compose ones. +# +# Bitbucket-style env vars are honored when present (TAG_PREFIX, +# BITBUCKET_COMMIT, BITBUCKET_DEPLOYMENT_ENVIRONMENT) and fall back to git + +# DEPLOY_ENV so the script also runs from a workstation or GitHub Actions. +set -eu + +APP_NAME="ai-studio" +REGISTRY="${REGISTRY:-synergycodes.azurecr.io}" +COMMIT="${BITBUCKET_COMMIT:-$(git rev-parse HEAD)}" +ENVIRONMENT="${BITBUCKET_DEPLOYMENT_ENVIRONMENT:-${DEPLOY_ENV:-}}" +export IMAGE_TAG="${TAG_PREFIX:-}$COMMIT" + +for TARGET in runtime migrate web; do + TAG="$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG" + docker build \ + -f ./deploy/ai-studio/Dockerfile \ + --target "$TARGET" \ + -t "$TAG" \ + . +done + +ALLOWED_ENVIRONMENTS="stage dev prod" + +if echo "$ALLOWED_ENVIRONMENTS" | grep -w "$ENVIRONMENT" > /dev/null; then + # setup-az.sh exists in the deployment CI image; logging in by other means + # (az acr login / docker login) is fine when running elsewhere + [ -f /var/setup-az.sh ] && . 
/var/setup-az.sh + for TARGET in runtime migrate web; do + docker push "$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG" + done +else + echo "Environment '$ENVIRONMENT' is not configured for image push. Skipping." +fi diff --git a/tools/deployment/scripts/deploy.sh b/tools/deployment/scripts/deploy.sh new file mode 100755 index 000000000..1ac078e57 --- /dev/null +++ b/tools/deployment/scripts/deploy.sh @@ -0,0 +1,11 @@ +#!/bin/sh +# Deploy the AI Studio stack to the Docker Swarm cluster, mirroring the +# workflow-builder repo's tools/deployment/scripts/deploy.sh. The setup +# scripts are baked into the synergycodes deployment CI image; guards let the +# playbook also run from a workstation with az + ansible already configured. +set -eu + +[ -f /var/setup-az.sh ] && . /var/setup-az.sh +[ -f /var/setup-ansible.sh ] && . /var/setup-ansible.sh + +ansible-playbook ./tools/deployment/ansible/deploy-application/main.yml diff --git a/tools/deployment/swarm-alignment.decision-log.md b/tools/deployment/swarm-alignment.decision-log.md new file mode 100644 index 000000000..44049bbaa --- /dev/null +++ b/tools/deployment/swarm-alignment.decision-log.md @@ -0,0 +1,84 @@ +### Title: Swarm overlay aligned with the workflow-builder deployment + +### Proposed by: Jan Librowski + +### Date: 10.06.2026 + +## Context + +The compose-based deployment in `deploy/ai-studio/` (see its decision log) +targets a single Docker host and ships TLS as a bring-your-own concern. The +company's actual Azure footprint, found in the `workflow-builder` repo's +`tools/deployment/`, is different: a self-managed Docker Swarm cluster with +Traefik (Let's Encrypt, host-based routing), images in ACR tagged by commit, +deployment via an Ansible playbook that writes a stack file onto the Swarm +master, and an optional gatekeeper OIDC proxy for internal apps. DevOps +operates that machinery daily. 
+ +Rather than choosing one target, the compose branch is kept as a snapshot +(`WB-229-ai-studio-deployment`) and this branch adds the Swarm-aligned path +on top of it. + +## Decision + +Add `tools/deployment/` mirroring the workflow-builder repo's structure — +`scripts/build-docker.sh`, `scripts/deploy.sh`, +`ansible/deploy-application/main.yml` — with the same conventions: ACR +commit-tagged images, per-environment stack names (`ai-studio--dev`), +Traefik labels copied from the existing stack, Bitbucket-style env variables +honored with workstation fallbacks. + +**The images are shared, not duplicated.** Both paths build the same three +targets from `deploy/ai-studio/Dockerfile`; the overlay only changes +orchestration. Four deliberate deviations from the workflow-builder +playbook, all forced by AI Studio being stateful where the editor demo was +a static frontend: + +1. Database/Temporal services with named volumes pinned to a labeled node + (`node.labels.ai-studio-data==true`) — Swarm volumes are node-local. +2. Migrations as a post-deploy one-shot `docker run` with retries — Swarm + ignores compose `depends_on` conditions, so the ordering that compose + expressed declaratively lives in the playbook. +3. An `attachable` internal network plus short DNS aliases (`backend`, + `app-db`, `temporal`) so the unmodified web image's nginx upstream and + the compose env defaults resolve identically under Swarm. +4. Gatekeeper made conditional (`AUTH_ENABLED`, default off) — the public + demo is login-free by design; internal stage/dev instances can keep SSO. + +## Alternative Options Considered + +- **Compose on a dedicated VM only** (the snapshot branch) — fully working + and remains the customer-facing artifact; rejected as the _only_ path + because it adds a second ops surface (new VM, separate TLS) when a + maintained cluster exists. 
+- **Kubernetes/AKS manifests** — nothing in the org runs on k8s per the + available evidence; would be infrastructure invention, not alignment. +- **Managed Azure Postgres instead of in-cluster databases** — cleaner + state story, but contradicts the near-zero-cost requirement for a demo + whose data is explicitly ephemeral; revisit for sustained load. +- **Swarm secrets for the OpenRouter key** — stricter than env-in-stack-file, + but diverges from how the existing playbook handles `AUTH_SECRET`; + consistency won for now, flagged in the README. + +## Consequences + +- **Pros** + - DevOps sees the exact shape they already operate; review is a diff + against a known playbook, not a new system. + - TLS, registry auth, and routing are inherited from cluster-level + Traefik instead of being re-solved per deployment. + - Stack template render-verified in both auth modes (YAML parses; correct + public surface and Traefik port in each). +- **Cons** + - Not exercised against a real cluster yet — inventory, ACR push rights, + the `ai-studio-data` node label, and the first stateful workload on the + cluster all need DevOps sign-off. + - The rate limiter's `X-Forwarded-For` trust now spans Traefik (and + optionally gatekeeper) before nginx; the first-hop assumption should be + verified on the real cluster. + - Secrets land in a stack file on the Swarm master's disk (inherited + trade-off from the existing flow). + +## Status + +Proposed — pending the DevOps conversation From 3d10ff45865cf6d165eaf6419b26228d90b2aa69 Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Thu, 11 Jun 2026 11:25:08 +0200 Subject: [PATCH 06/11] feat(backend): apply drizzle migrations on boot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drizzle-orm's programmatic migrator runs the SQL files from apps/backend/drizzle/ before the server accepts traffic. 
A failure (database still starting) exits the process; container restart policies retry until it converges. drizzle-kit stays a devDependency — db:migrate remains available for out-of-band use. --- apps/backend/src/db/migrate.ts | 23 +++++++++++++++++++++++ apps/backend/src/server.ts | 7 +++++++ 2 files changed, 30 insertions(+) create mode 100644 apps/backend/src/db/migrate.ts diff --git a/apps/backend/src/db/migrate.ts b/apps/backend/src/db/migrate.ts new file mode 100644 index 000000000..005b661d0 --- /dev/null +++ b/apps/backend/src/db/migrate.ts @@ -0,0 +1,23 @@ +import { drizzle } from 'drizzle-orm/postgres-js'; +import { migrate } from 'drizzle-orm/postgres-js/migrator'; +import { fileURLToPath } from 'node:url'; +import postgres from 'postgres'; + +import { env } from '../env'; + +// Programmatic equivalent of `pnpm db:migrate`, reading the same SQL files +// from apps/backend/drizzle/. Runs at backend boot so deployments need no +// separate migration step or image — and drizzle-kit can stay a +// devDependency. Single-replica assumption (WB-229): concurrent backends +// would race the migrator. +export async function runMigrations(): Promise<void> { + const migrationsFolder = fileURLToPath(new URL('../../drizzle', import.meta.url)); + // Dedicated throwaway connection — the app pool in client.ts outlives this, + // but the migrator's connection must not linger once it finishes.
+ const sql = postgres(env.DATABASE_URL, { max: 1 }); + try { + await migrate(drizzle(sql), { migrationsFolder }); + } finally { + await sql.end(); + } +} diff --git a/apps/backend/src/server.ts b/apps/backend/src/server.ts index 10e52c1d9..a2b81ea20 100644 --- a/apps/backend/src/server.ts +++ b/apps/backend/src/server.ts @@ -12,6 +12,7 @@ import { createAuthMiddleware, makeAssertAuthorized, } from './auth'; +import { runMigrations } from './db/migrate'; import { env } from './env'; import { logger } from './logger'; import { createRateLimitMiddleware } from './middleware/rate-limit'; @@ -72,6 +73,12 @@ if (env.RATE_LIMIT_EXECUTE_PER_MINUTE > 0 || env.RATE_LIMIT_EXECUTE_PER_DAY > 0) app.route('/api/workflows', createWorkflowsRoutes(assertAuthorized)); app.route('/api/executions', createExecutionsRoutes(assertAuthorized)); +// Apply pending migrations before accepting traffic. A failure (e.g. the +// database still starting) exits the process — the container restart policy +// retries until it converges, so deployments need no separate migration step. +await runMigrations(); +logger.info('database migrations applied'); + serve({ fetch: app.fetch, port: env.PORT, hostname: env.HOST }, () => { logger.info('backend listening', { url: `http://${env.HOST}:${env.PORT}` }); }); From 7e272213aa1041268ed2ca4071c6244e10409406 Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Thu, 11 Jun 2026 11:25:24 +0200 Subject: [PATCH 07/11] refactor(deploy): drop the migrate service and image The backend migrates itself at boot, so the migrate Dockerfile target, compose service, and the Swarm playbook's post-deploy migration task (plus its attachable-network requirement) all go away. Two images remain: runtime and web. The worker now waits for the backend healthcheck so it never touches a pre-migration schema. 
Verified on a wiped stack: virgin database boots, backend logs 'database migrations applied' before listening, Sales Inquiry Pipeline runs to execution_completed over live SSE, rate limiter returns 429 past the budget. --- CLAUDE.md | 6 ++--- deploy/ai-studio/Dockerfile | 12 +++------ deploy/ai-studio/README.md | 15 ++++++----- .../ai-studio-deployment.decision-log.md | 20 +++++++++----- deploy/ai-studio/docker-compose.yml | 22 ++++------------ tools/deployment/README.md | 17 ++++++------ .../ansible/deploy-application/main.yml | 26 +++---------------- tools/deployment/scripts/build-docker.sh | 6 ++--- .../swarm-alignment.decision-log.md | 18 ++++++++----- 9 files changed, 61 insertions(+), 81 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 2a2e2bb31..d2932a89f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -11,7 +11,7 @@ Three onboarding paths (A, B local-run; C docs-only). README "Get started" is th | `pnpm preflight` | both | Verify Node / pnpm / Docker / ports / `.env` files. Add `--json` for agents | | `pnpm dev` / `pnpm dev:demo` | A | Demo (UI only, port 4200). No backend, no Docker | | `pnpm infra:up` | B | Start Postgres + Temporal in Docker. Required before backend/worker | -| `pnpm -F backend db:migrate` | B | Apply Drizzle migrations. First run, or after schema changes | +| `pnpm -F backend db:migrate` | B | Apply Drizzle migrations out-of-band (backend also auto-migrates on boot) | | `pnpm dev:ai-studio` | B | Full stack: infra + backend (3001) + worker + AI Studio frontend (4201) | | `pnpm dev:backend` | B | Backend only (debug). Needs infra up | | `pnpm dev:worker` | B | Execution worker only (debug). Needs infra up | @@ -22,7 +22,7 @@ Three onboarding paths (A, B local-run; C docs-only). README "Get started" is th | `pnpm test` | - | Run tests in `packages/sdk` and `packages/execution-core` | | `pnpm check` | - | Lint + typecheck + format + knip | -Path A is UI-only and does not need Docker. 
Path B requires `pnpm infra:up` before backend/worker can start, and `db:migrate` on the first run. +Path A is UI-only and does not need Docker. Path B requires `pnpm infra:up` before backend/worker can start; the backend applies pending migrations automatically at boot. ### Agent signals @@ -44,7 +44,7 @@ Long-running processes already emit stable log lines that scripts and agents can tools/ - Root dev scripts: preflight, setup:env, infra wait deployment/ - Swarm/Ansible deploy path mirroring the workflow-builder repo (ACR, Traefik) deploy/ - ai-studio/ - Production deployment: Dockerfile (runtime/migrate/web), compose, nginx, README + ai-studio/ - Production deployment: Dockerfile (runtime/web), compose, nginx, README apps/ demo/ - Reference app consuming the SDK (React + Vite, port 4200) ai-studio/ - Reference AI workflow product (React + Vite, port 4201) diff --git a/deploy/ai-studio/Dockerfile b/deploy/ai-studio/Dockerfile index a6fc07f67..5c9834cbd 100644 --- a/deploy/ai-studio/Dockerfile +++ b/deploy/ai-studio/Dockerfile @@ -3,9 +3,12 @@ # AI Studio execution stack — single Dockerfile, multiple targets: # # runtime -> backend + execution-worker (command chosen per compose service) -# migrate -> one-shot Drizzle migration runner (needs backend devDependencies) # web -> nginx serving the AI Studio SPA + reverse proxy to the backend # +# Database migrations run inside the backend at boot (drizzle-orm's +# programmatic migrator over apps/backend/drizzle/), so there is no separate +# migration image or deploy step. +# # Build context must be the repo root (workspace packages are linked via # pnpm `workspace:*`), e.g.: # @@ -51,13 +54,6 @@ RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ # backend: pnpm --filter backend start:prod # worker: pnpm --filter execution-worker start:prod -# Migrations need drizzle-kit, a backend devDependency — hence a separate -# target with a dev install. Runs as a one-shot service before the backend. 
-FROM source AS migrate -RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ - pnpm install --frozen-lockfile --prefer-offline --filter backend... -CMD ["pnpm", "--filter", "backend", "db:migrate"] - # The SPA build imports the SDK from source (vite alias), so this needs the # full frontend dependency tree. VITE_BACKEND_URL is baked in at build time; # the default (empty) makes the app call /api on its own origin, which the diff --git a/deploy/ai-studio/README.md b/deploy/ai-studio/README.md index cc149a182..d1f4c4959 100644 --- a/deploy/ai-studio/README.md +++ b/deploy/ai-studio/README.md @@ -14,15 +14,16 @@ any Docker host — an Azure VM, AWS, on-prem — with no cloud-specific glue. | `web` | `ai-studio-web` (nginx) | Serves the SPA, proxies `/api` to the backend | `${WEB_PORT}` (only one) | | `backend` | `ai-studio-runtime` | Hono REST + SSE event stream | internal | | `worker` | `ai-studio-runtime` | Temporal worker, makes the OpenRouter LLM calls | internal | -| `migrate` | `ai-studio-migrate` | One-shot Drizzle migrations, then exits | internal | | `temporal` | `temporalio/auto-setup` pinned | Workflow engine | internal | | `app-db` | `postgres:16` | Workflow snapshots + execution events | internal | | `temporal-db` | `postgres:16` | Temporal's own state store | internal | | `temporal-ui` | `temporalio/ui` pinned | Debug only (`--profile debug`) | `127.0.0.1:8233` | -All images build from one Dockerfile (`deploy/ai-studio/Dockerfile`) with the +Both images build from one Dockerfile (`deploy/ai-studio/Dockerfile`) with the repo root as context. Backend and worker share a single image and differ only -in the compose `command`. +in the compose `command`. Database migrations are applied by the backend at +boot (drizzle-orm's programmatic migrator) — there is no separate migration +service or step. 
## Quick start @@ -32,9 +33,9 @@ cp .env.example .env # set OPENROUTER_API_KEY docker compose up -d --build ``` -First boot: migrations run automatically (`migrate` exits 0, then the backend -starts). The worker crash-loops for ~30s until Temporal finishes auto-setup — -that's expected, `restart: unless-stopped` converges it. +First boot: the backend applies migrations and only then starts serving (its +healthcheck gates the worker). The worker crash-loops for ~30s until Temporal +finishes auto-setup — that's expected, `restart: unless-stopped` converges it. Verify: @@ -89,7 +90,7 @@ Swapping the LLM is a one-liner: change `AI_MODEL` to any ```bash docker compose logs -f backend worker # tail the apps docker compose --profile debug up -d # Temporal UI on 127.0.0.1:8233 -docker compose up -d --build # deploy a new version (re-runs migrations) +docker compose up -d --build # deploy a new version (backend re-applies migrations at boot) docker compose down # stop (volumes survive) docker exec ai-studio-app-db-1 pg_dump -U wb workflow_builder > backup.sql ``` diff --git a/deploy/ai-studio/ai-studio-deployment.decision-log.md b/deploy/ai-studio/ai-studio-deployment.decision-log.md index a56e411f3..40e1fc6ac 100644 --- a/deploy/ai-studio/ai-studio-deployment.decision-log.md +++ b/deploy/ai-studio/ai-studio-deployment.decision-log.md @@ -32,7 +32,7 @@ Everything lives in `deploy/ai-studio/`: one multi-target Dockerfile, a production `docker-compose.yml`, the nginx config, `.env.example`, and a DevOps-facing README. -1. **One Dockerfile, three targets** (`runtime`, `migrate`, `web`), built +1. **One Dockerfile, two targets** (`runtime`, `web`), built with the repo root as context (pnpm `workspace:*` links require it). A shared `source` stage does `pnpm fetch` against a BuildKit cache mount, so per-target installs are store-hits. @@ -44,11 +44,13 @@ DevOps-facing README. entirely — there is no bundling step to get wrong. 3. 
**One shared `runtime` image for backend and worker**; the compose `command` picks the entrypoint. One image to build, push, and version. -4. **Migrations as a one-shot compose service** (`migrate` target, carries - drizzle-kit as a backend devDependency). `depends_on: -service_completed_successfully` gates the backend, so `docker compose up` - is a complete first boot. Same answer works as a k8s Job / ACA job if a - customer reshapes the topology. +4. **Migrations on backend boot** (revised 11.06.2026 — originally a + one-shot `migrate` compose service). The backend applies pending Drizzle + migrations via drizzle-orm's programmatic migrator before accepting + traffic; on failure it exits and the restart policy retries until + Postgres answers. One less image, no orchestrator-specific ordering — + the same behavior on compose, Swarm, or anything else. Single-replica + assumption: concurrent backends would race the migrator. 5. **nginx is the only public surface.** It serves the SPA and proxies `/api` to the backend on the internal network; the SSE stream route gets `proxy_buffering off` + long read timeout. The backend container is @@ -135,6 +137,12 @@ Found and fixed during end-to-end verification: the worker ignored - pnpm version is pinned in two places (root `packageManager` + Dockerfile). +## Revisions + +- **11.06.2026** — `migrate` target and service removed; the backend now + migrates itself at boot (Jan's simplification request during WB-229 + review). Dockerfile is down to two targets (`runtime`, `web`). + ## Status Accepted diff --git a/deploy/ai-studio/docker-compose.yml b/deploy/ai-studio/docker-compose.yml index 52d7dff1a..c4f827984 100644 --- a/deploy/ai-studio/docker-compose.yml +++ b/deploy/ai-studio/docker-compose.yml @@ -73,19 +73,8 @@ services: - '127.0.0.1:8233:8080' restart: unless-stopped - migrate: - image: ai-studio-migrate - build: - context: ../.. 
- dockerfile: deploy/ai-studio/Dockerfile - target: migrate - environment: - DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder - depends_on: - app-db: - condition: service_healthy - restart: 'no' - + # Applies Drizzle migrations at boot, before accepting traffic. A failure + # (e.g. Postgres still starting) exits the process and `restart` retries. backend: image: ai-studio-runtime build: *runtime-build @@ -106,8 +95,6 @@ services: depends_on: app-db: condition: service_healthy - migrate: - condition: service_completed_successfully temporal: condition: service_started healthcheck: @@ -138,8 +125,9 @@ services: depends_on: app-db: condition: service_healthy - migrate: - condition: service_completed_successfully + # healthy = migrations applied — the worker writes to the same schema + backend: + condition: service_healthy temporal: condition: service_started restart: unless-stopped diff --git a/tools/deployment/README.md b/tools/deployment/README.md index 432e90faa..f8f99fd2c 100644 --- a/tools/deployment/README.md +++ b/tools/deployment/README.md @@ -5,7 +5,7 @@ the same layout, scripts, and Ansible flow as the `workflow-builder` repo's `tools/deployment/` — so DevOps operates one familiar shape. This is an **orchestration overlay, not a second deployment**: it consumes -the exact same three images (`runtime`, `migrate`, `web`) built from +the exact same two images (`runtime`, `web`) built from [`deploy/ai-studio/Dockerfile`](../../deploy/ai-studio/Dockerfile). 
The compose file in `deploy/ai-studio/` remains the portable, customer-facing artifact and the local full-stack runner; this directory adds the @@ -17,7 +17,7 @@ tools/deployment/ │ ├── build-docker.sh # build all 3 targets, tag for ACR, push (CI-gated) │ └── deploy.sh # run the Ansible playbook (CI image or workstation) └── ansible/deploy-application/ - └── main.yml # writes the Swarm stack file on the master + deploys + migrates + └── main.yml # writes the Swarm stack file on the master + deploys ``` ## Usage @@ -52,13 +52,12 @@ make them runnable from a workstation or GitHub Actions. ## What differs from the workflow-builder playbook (and why) -| Deviation | Reason | -| ------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| Postgres ×2 + Temporal services with named volumes, pinned via `node.labels.ai-studio-data==true` | AI Studio is stateful; Swarm volumes are node-local. 
**One-time setup:** `docker node update --label-add ai-studio-data=true ` | -| Migrations run post-deploy as a one-shot `docker run` on the stack network (with retries) | Swarm ignores compose `depends_on` conditions | -| `internal` network is `attachable: true` | Lets the migrate container join the overlay | -| Services carry short DNS aliases (`backend`, `app-db`, `temporal`, …) | The web image's nginx proxies to `http://backend:3001`; aliases keep the images and env defaults identical between compose and Swarm | -| Gatekeeper is conditional (`AUTH_ENABLED`) | The WB-229 public demo is deliberately login-free; internal instances can keep SSO | +| Deviation | Reason | +| ------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------ | +| Postgres ×2 + Temporal services with named volumes, pinned via `node.labels.ai-studio-data==true` | AI Studio is stateful; Swarm volumes are node-local. 
**One-time setup:** `docker node update --label-add ai-studio-data=true ` | +| No migration step — the backend applies Drizzle migrations at boot and restarts until Postgres answers | Swarm ignores compose `depends_on` conditions, so ordering must not rely on them | +| Services carry short DNS aliases (`backend`, `app-db`, `temporal`, …) | The web image's nginx proxies to `http://backend:3001`; aliases keep the images and env defaults identical between compose and Swarm | +| Gatekeeper is conditional (`AUTH_ENABLED`) | The WB-229 public demo is deliberately login-free; internal instances can keep SSO | SSE note: Traefik streams responses by default, so the live execution stream works without special ingress config; the 15 s backend heartbeat keeps the diff --git a/tools/deployment/ansible/deploy-application/main.yml b/tools/deployment/ansible/deploy-application/main.yml index 2d8a4fc2d..f83dc6ddb 100644 --- a/tools/deployment/ansible/deploy-application/main.yml +++ b/tools/deployment/ansible/deploy-application/main.yml @@ -1,16 +1,15 @@ --- # Deploys the AI Studio execution stack to the Docker Swarm cluster, # following the workflow-builder repo's deploy-application playbook. The -# images are the same three targets the local compose builds -# (deploy/ai-studio/Dockerfile); only the orchestration differs. +# images are the same two targets the local compose builds +# (deploy/ai-studio/Dockerfile); only the orchestration differs. Database +# migrations run inside the backend at boot, so there is no migration step +# here — the backend restarts until Postgres answers, then migrates itself. # # Differences from the workflow-builder playbook, all forced by AI Studio # being stateful: # - Postgres x2 + Temporal services with named volumes, pinned to the node # labeled `ai-studio-data=true` (Swarm volumes are node-local). -# - Migrations run as a one-shot container after stack deploy — Swarm -# ignores compose depends_on conditions, so ordering lives here. 
-# - The `internal` network is attachable so the migrate container can join. # - Services get short DNS aliases (backend, app-db, temporal, ...) so the # same images and env defaults work under compose and Swarm. # - Gatekeeper is optional (AUTH_ENABLED=true): the WB-229 public demo is @@ -203,8 +202,6 @@ networks: internal: - # attachable so the one-shot migrate container below can join - attachable: true traefik-host-external: external: true @@ -224,18 +221,3 @@ with_registry_auth: yes compose: - '/mnt/docker-swarm-storage/stacks/{{ stack_name }}/{{ app_name }}.stack.yml' - - # Swarm has no depends_on / one-shot service semantics: run Drizzle - # migrations as a plain container on the stack's attachable overlay - # network. Retries cover app-db still starting up on first deploy. - - name: Run database migrations - command: > - docker run --rm - --network {{ stack_name }}_internal - -e DATABASE_URL={{ database_url }} - {{ registry }}/{{ app_name }}:migrate-{{ image_tag }} - pnpm --filter backend db:migrate - register: migrate_result - retries: 10 - delay: 6 - until: migrate_result.rc == 0 diff --git a/tools/deployment/scripts/build-docker.sh b/tools/deployment/scripts/build-docker.sh index 8295b05d5..10e90db1e 100755 --- a/tools/deployment/scripts/build-docker.sh +++ b/tools/deployment/scripts/build-docker.sh @@ -1,6 +1,6 @@ #!/bin/sh # Build + push the AI Studio images to ACR, mirroring the workflow-builder -# repo's tools/deployment/scripts/build-docker.sh. All three images come from +# repo's tools/deployment/scripts/build-docker.sh. Both images come from # the same multi-target Dockerfile in deploy/ai-studio/ — this script only # adds registry tagging; the images are identical to the local-compose ones. 
# @@ -15,7 +15,7 @@ COMMIT="${BITBUCKET_COMMIT:-$(git rev-parse HEAD)}" ENVIRONMENT="${BITBUCKET_DEPLOYMENT_ENVIRONMENT:-${DEPLOY_ENV:-}}" export IMAGE_TAG="${TAG_PREFIX:-}$COMMIT" -for TARGET in runtime migrate web; do +for TARGET in runtime web; do TAG="$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG" docker build \ -f ./deploy/ai-studio/Dockerfile \ @@ -30,7 +30,7 @@ if echo "$ALLOWED_ENVIRONMENTS" | grep -w "$ENVIRONMENT" > /dev/null; then # setup-az.sh exists in the deployment CI image; logging in by other means # (az acr login / docker login) is fine when running elsewhere [ -f /var/setup-az.sh ] && . /var/setup-az.sh - for TARGET in runtime migrate web; do + for TARGET in runtime web; do docker push "$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG" done else diff --git a/tools/deployment/swarm-alignment.decision-log.md b/tools/deployment/swarm-alignment.decision-log.md index 44049bbaa..83d58de78 100644 --- a/tools/deployment/swarm-alignment.decision-log.md +++ b/tools/deployment/swarm-alignment.decision-log.md @@ -36,12 +36,12 @@ a static frontend: 1. Database/Temporal services with named volumes pinned to a labeled node (`node.labels.ai-studio-data==true`) — Swarm volumes are node-local. -2. Migrations as a post-deploy one-shot `docker run` with retries — Swarm - ignores compose `depends_on` conditions, so the ordering that compose - expressed declaratively lives in the playbook. -3. An `attachable` internal network plus short DNS aliases (`backend`, - `app-db`, `temporal`) so the unmodified web image's nginx upstream and - the compose env defaults resolve identically under Swarm. +2. No migration step (revised 11.06.2026) — the backend applies Drizzle + migrations at boot and restarts until Postgres answers, which sidesteps + Swarm's lack of `depends_on` ordering entirely. +3. Short DNS aliases (`backend`, `app-db`, `temporal`) so the unmodified + web image's nginx upstream and the compose env defaults resolve + identically under Swarm. 4. 
Gatekeeper made conditional (`AUTH_ENABLED`, default off) — the public demo is login-free by design; internal stage/dev instances can keep SSO. @@ -79,6 +79,12 @@ a static frontend: - Secrets land in a stack file on the Swarm master's disk (inherited trade-off from the existing flow). +## Revisions + +- **11.06.2026** — playbook migration task and the `attachable` network + removed; the backend migrates itself at boot. Image set is down to + `runtime` + `web`. + ## Status Proposed — pending the DevOps conversation From 1e95bb0d731a3b73af72bb01bc72a1f3bc99dd24 Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Thu, 11 Jun 2026 14:15:21 +0200 Subject: [PATCH 08/11] style: trim comments to the non-obvious Keep only what the code cannot say itself: traps (Worker.create's implicit localhost, corepack/pnpm 10 failure, offline mode leaking into lifecycle scripts), constraints (single-replica migrator, X-Forwarded-For trust), and magic values. Drop the narration. --- .dockerignore | 3 +- apps/backend/src/db/migrate.ts | 9 +--- apps/backend/src/env.ts | 3 +- .../backend/src/middleware/rate-limit.test.ts | 6 --- apps/backend/src/middleware/rate-limit.ts | 23 +++------ apps/backend/src/server.ts | 4 +- .../src/engines/temporal/worker.ts | 3 +- .../src/executors/decision.test.ts | 4 -- .../src/executors/decision.ts | 5 +- deploy/ai-studio/Dockerfile | 51 +++++-------------- deploy/ai-studio/docker-compose.yml | 33 ++++-------- deploy/ai-studio/nginx/default.conf | 15 ++---- .../ansible/deploy-application/main.yml | 20 ++------ tools/deployment/scripts/build-docker.sh | 14 ++--- tools/deployment/scripts/deploy.sh | 6 +-- 15 files changed, 54 insertions(+), 145 deletions(-) diff --git a/.dockerignore b/.dockerignore index 899987c08..e0fcd955c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -24,8 +24,7 @@ tools/ !tools/deployment/nginx .gitignore -# env files hold secrets (e.g. 
OPENROUTER_API_KEY) and must never enter the -# build context — runtime config is injected via docker-compose `environment` +# env files hold secrets — never in a build context **/.env **/.env.* !**/.env.example diff --git a/apps/backend/src/db/migrate.ts b/apps/backend/src/db/migrate.ts index 005b661d0..6d7d4c68d 100644 --- a/apps/backend/src/db/migrate.ts +++ b/apps/backend/src/db/migrate.ts @@ -5,15 +5,10 @@ import postgres from 'postgres'; import { env } from '../env'; -// Programmatic equivalent of `pnpm db:migrate`, reading the same SQL files -// from apps/backend/drizzle/. Runs at backend boot so deployments need no -// separate migration step or image — and drizzle-kit can stay a -// devDependency. Single-replica assumption (WB-229): concurrent backends -// would race the migrator. +// Same SQL files as `pnpm db:migrate`. Concurrent backends would race the +// migrator — single replica assumed. export async function runMigrations(): Promise<void> { const migrationsFolder = fileURLToPath(new URL('../../drizzle', import.meta.url)); - // Dedicated throwaway connection — the app pool in client.ts outlives this, - // but the migrator's connection must not linger once it finishes. const sql = postgres(env.DATABASE_URL, { max: 1 }); try { await migrate(drizzle(sql), { migrationsFolder }); diff --git a/apps/backend/src/env.ts b/apps/backend/src/env.ts index 349b5602f..a813c14f6 100644 --- a/apps/backend/src/env.ts +++ b/apps/backend/src/env.ts @@ -12,8 +12,7 @@ export const env = { HOST: envOr('HOST', '127.0.0.1'), DATABASE_URL: envOr('DATABASE_URL', 'postgresql://wb:wb@127.0.0.1:5432/workflow_builder'), TEMPORAL_ADDRESS: envOr('TEMPORAL_ADDRESS', '127.0.0.1:7233'), - // Per-IP limits on the execute route. 0 = disabled (local dev default); - // the production compose in deploy/ai-studio sets both.
+ // 0 disables (dev default); the deploy compose sets both RATE_LIMIT_EXECUTE_PER_MINUTE: Number(envOr('RATE_LIMIT_EXECUTE_PER_MINUTE', '0')), RATE_LIMIT_EXECUTE_PER_DAY: Number(envOr('RATE_LIMIT_EXECUTE_PER_DAY', '0')), TRUST_PROXY: envOr('TRUST_PROXY', 'false') === 'true', diff --git a/apps/backend/src/middleware/rate-limit.test.ts b/apps/backend/src/middleware/rate-limit.test.ts index f3f78b52b..7b9196358 100644 --- a/apps/backend/src/middleware/rate-limit.test.ts +++ b/apps/backend/src/middleware/rate-limit.test.ts @@ -6,12 +6,6 @@ import { type RateLimitOptions, createRateLimitMiddleware } from './rate-limit'; const MINUTE_MS = 60_000; const DAY_MS = 24 * 60 * 60 * 1000; -/** - * Build a Hono app mirroring the production wiring in `server.ts`: the - * limiter guards a single execute-shaped route. Tests drive the clock through - * the injectable `now` and identify callers via X-Forwarded-For (trustProxy), - * since `app.request()` has no underlying socket. - */ function makeApp(overrides: Partial<RateLimitOptions> = {}) { let timestamp = 0; const app = new Hono(); diff --git a/apps/backend/src/middleware/rate-limit.ts b/apps/backend/src/middleware/rate-limit.ts index 8496a94f1..2a5b4c70f 100644 --- a/apps/backend/src/middleware/rate-limit.ts +++ b/apps/backend/src/middleware/rate-limit.ts @@ -2,17 +2,12 @@ import { getConnInfo } from '@hono/node-server/conninfo'; import type { Context, MiddlewareHandler } from 'hono'; export type RateLimitOptions = { - /** Max requests per IP per minute. 0 disables the minute window. */ + // 0 disables a window perMinute: number; - /** Max requests per IP per day. 0 disables the day window. */ perDay: number; - /** - * Read the client IP from X-Forwarded-For. Only enable when the backend is - * reachable exclusively through a proxy that sets the header (the deploy - * nginx does) — a directly-reachable backend would let clients spoof it.
- */ + // only safe when the backend is reachable exclusively through a proxy that + // sets X-Forwarded-For — a directly reachable backend lets clients spoof it trustProxy: boolean; - /** Injectable clock for tests. */ now?: () => number; }; @@ -41,7 +36,7 @@ function clientIp(c: Context, trustProxy: boolean): string { try { return getConnInfo(c).remote.address ?? 'unknown'; } catch { - // No underlying socket (e.g. app.request() in tests) + // no underlying socket (app.request() in tests) return 'unknown'; } } @@ -60,14 +55,8 @@ function hitWindow(state: WindowState, limit: number, durationMs: number, now: n return null; } -/** - * Fixed-window, in-memory, per-IP rate limiter for the execute route. - * - * Deliberately process-local (WB-229 lean MVP runs a single backend - * replica): counters reset on restart and are not shared across replicas. - * The OpenRouter account Guardrail is the independent hard spend cap; this - * gate only stops a single IP from burning the daily budget. - */ +// In-memory fixed windows: counters reset on restart and are not shared +// across replicas — fine for the single-replica demo deployment. export function createRateLimitMiddleware(options: RateLimitOptions): MiddlewareHandler { const { perMinute, perDay, trustProxy } = options; const now = options.now ?? Date.now; diff --git a/apps/backend/src/server.ts b/apps/backend/src/server.ts index a2b81ea20..7b33a7f96 100644 --- a/apps/backend/src/server.ts +++ b/apps/backend/src/server.ts @@ -73,9 +73,7 @@ if (env.RATE_LIMIT_EXECUTE_PER_MINUTE > 0 || env.RATE_LIMIT_EXECUTE_PER_DAY > 0) app.route('/api/workflows', createWorkflowsRoutes(assertAuthorized)); app.route('/api/executions', createExecutionsRoutes(assertAuthorized)); -// Apply pending migrations before accepting traffic. A failure (e.g. the -// database still starting) exits the process — the container restart policy -// retries until it converges, so deployments need no separate migration step. 
+// a failure (DB still starting) exits the process; the container restart policy retries await runMigrations(); logger.info('database migrations applied'); diff --git a/apps/execution-worker/src/engines/temporal/worker.ts b/apps/execution-worker/src/engines/temporal/worker.ts index 49ecd295e..401c60e93 100644 --- a/apps/execution-worker/src/engines/temporal/worker.ts +++ b/apps/execution-worker/src/engines/temporal/worker.ts @@ -42,8 +42,7 @@ const activities = { }, }; -// Without an explicit connection the worker silently dials 127.0.0.1:7233, -// ignoring TEMPORAL_ADDRESS — correct in local dev, wrong everywhere else. +// without an explicit connection, Worker.create dials 127.0.0.1:7233 and ignores TEMPORAL_ADDRESS const connection = await NativeConnection.connect({ address: env.TEMPORAL_ADDRESS }); const worker = await Worker.create({ diff --git a/apps/execution-worker/src/executors/decision.test.ts b/apps/execution-worker/src/executors/decision.test.ts index 2d4b31242..2f11c3fbd 100644 --- a/apps/execution-worker/src/executors/decision.test.ts +++ b/apps/execution-worker/src/executors/decision.test.ts @@ -83,10 +83,6 @@ describe('executeDecision', () => { }); it('treats a branch with no conditions as the catch-all', () => { - // The contract the no_branch_matched error instructs authors to use, and - // what the reference Sales Inquiry template relies on for its 'General' - // branch. First-match order applies: a catch-all placed after conditional - // branches only fires when none of them matched. 
const node = decisionNode([ { sourceHandle: 'no', diff --git a/apps/execution-worker/src/executors/decision.ts b/apps/execution-worker/src/executors/decision.ts index 0fbccb140..d9e8eefe9 100644 --- a/apps/execution-worker/src/executors/decision.ts +++ b/apps/execution-worker/src/executors/decision.ts @@ -28,10 +28,7 @@ export function executeDecision(node: DecisionNode, context: ExecutionContext): } function branchMatches(conditions: DecisionBranchCondition[], context: ExecutionContext): boolean { - // A branch with no conditions is the explicit catch-all — the contract the - // error above instructs authors to use, and what the reference Sales - // Inquiry template ships ('General' branch). First-match order still - // applies, so a catch-all only fires when placed after conditional branches. + // no conditions = the explicit catch-all the no_branch_matched error instructs authors to add if (conditions.length === 0) return true; let result = evaluateCondition(conditions[0]!, context); diff --git a/deploy/ai-studio/Dockerfile b/deploy/ai-studio/Dockerfile index 5c9834cbd..664c6d6ea 100644 --- a/deploy/ai-studio/Dockerfile +++ b/deploy/ai-studio/Dockerfile @@ -1,63 +1,40 @@ # syntax=docker/dockerfile:1 -# AI Studio execution stack — single Dockerfile, multiple targets: +# Targets: runtime (backend + worker, command chosen per compose service), +# web (nginx, SPA + /api proxy). Build context must be the repo root — +# workspace packages are linked via pnpm `workspace:*`. # -# runtime -> backend + execution-worker (command chosen per compose service) -# web -> nginx serving the AI Studio SPA + reverse proxy to the backend -# -# Database migrations run inside the backend at boot (drizzle-orm's -# programmatic migrator over apps/backend/drizzle/), so there is no separate -# migration image or deploy step. 
-# -# Build context must be the repo root (workspace packages are linked via -# pnpm `workspace:*`), e.g.: -# -# docker build -f deploy/ai-studio/Dockerfile --target runtime . -# -# Node is pinned to the exact engines.node version because the workspace sets -# engineStrict=true. pnpm is installed via npm, not corepack — the corepack -# bundled with this Node release fails to load pnpm 10 -# (ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING) and ships stale signature keys. -# Keep the version in sync with `packageManager` in the root package.json. +# Exact Node pin: engineStrict rejects any other version. pnpm via npm, not +# corepack — this Node's corepack cannot load pnpm 10 +# (ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING). Keep in sync with `packageManager`. FROM node:22.12.0-bookworm-slim AS base ENV PNPM_HOME=/pnpm \ PATH="/pnpm:$PATH" \ - # root `prepare` script runs husky, which needs the .git dir that is - # deliberately excluded from the build context + # husky needs the .git dir that the build context excludes HUSKY=0 \ npm_config_store_dir=/pnpm/store \ CI=true RUN npm install -g pnpm@10.17.0 WORKDIR /app -# Download every dependency from the lockfile alone, then bring in the -# source. Any source change invalidates only the layers below the COPY — -# the package store survives in the cache mount, so reinstalls are cheap. FROM base AS source COPY pnpm-lock.yaml pnpm-workspace.yaml ./ RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store pnpm fetch COPY . . -# Production deps for backend + worker and their workspace dependencies -# (execution-core, types). Both apps run TS directly through tsx — the -# Temporal worker additionally requires its workflow TS source on disk at -# runtime (the workflow sandbox bundles it from source), so there is no -# build step to get wrong. +# tsx runs TS directly — required anyway for the worker, whose workflow +# sandbox bundles from TS source on disk at runtime. 
+# --prefer-offline (not --offline): offline mode leaks into lifecycle +# scripts and breaks the icons build, which shells out to npx. FROM source AS runtime -# root `prepare` runs husky, a devDependency that a --prod install doesn't -# have — drop the script inside the image (root scripts are unused at runtime) RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ + # `prepare` runs husky, absent from a --prod install npm pkg delete scripts.prepare && \ pnpm install --frozen-lockfile --prefer-offline --prod \ --filter backend... --filter execution-worker... -# command supplied by docker-compose: -# backend: pnpm --filter backend start:prod -# worker: pnpm --filter execution-worker start:prod -# The SPA build imports the SDK from source (vite alias), so this needs the -# full frontend dependency tree. VITE_BACKEND_URL is baked in at build time; -# the default (empty) makes the app call /api on its own origin, which the -# web target's nginx proxies to the backend. +# VITE_BACKEND_URL is baked at build time; empty = same-origin /api, +# proxied by the web target's nginx. FROM source AS frontend-build ARG VITE_BACKEND_URL= RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ diff --git a/deploy/ai-studio/docker-compose.yml b/deploy/ai-studio/docker-compose.yml index c4f827984..b891cea66 100644 --- a/deploy/ai-studio/docker-compose.yml +++ b/deploy/ai-studio/docker-compose.yml @@ -1,11 +1,6 @@ -# AI Studio execution stack — production-shaped compose (WB-229 lean MVP). -# -# cp .env.example .env # set OPENROUTER_API_KEY -# docker compose up -d --build -# -# Only the `web` service publishes a port. Postgres, Temporal and the -# backend stay on the internal network. Temporal UI is opt-in via the -# `debug` profile and binds to loopback only. +# AI Studio production stack (WB-229). Usage: cp .env.example .env, set +# OPENROUTER_API_KEY, then `docker compose up -d --build`. Only `web` +# publishes a port. 
name: ai-studio @@ -45,10 +40,8 @@ services: retries: 12 restart: unless-stopped - # auto-setup is Temporal's dev-grade single-binary image. Accepted for the - # demo (WB-229); a sustained-load deployment should move to Temporal Cloud - # or a properly operated self-hosted cluster — the apps only consume - # TEMPORAL_ADDRESS and don't care which. + # auto-setup is dev-grade; sustained load should move to Temporal Cloud + # or an operated cluster — the apps only consume TEMPORAL_ADDRESS temporal: image: temporalio/auto-setup:1.29.6.1 depends_on: @@ -73,8 +66,7 @@ services: - '127.0.0.1:8233:8080' restart: unless-stopped - # Applies Drizzle migrations at boot, before accepting traffic. A failure - # (e.g. Postgres still starting) exits the process and `restart` retries. + # applies migrations at boot; on failure exits and `restart` retries backend: image: ai-studio-runtime build: *runtime-build @@ -84,11 +76,9 @@ services: PORT: 3001 DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder TEMPORAL_ADDRESS: temporal:7233 - # Reference deployment has no user accounts; the explicit opt-in keeps - # a forgotten env var from silently exposing an unauthenticated API. + # explicit opt-in — a forgotten env var fails loudly instead of exposing the API WB_AUTH_PORT: allow-all - # Backend is only reachable through the web service's nginx, which - # sets X-Forwarded-For — safe to trust for per-IP rate limiting. + # only nginx can reach the backend, so X-Forwarded-For is trustworthy TRUST_PROXY: 'true' RATE_LIMIT_EXECUTE_PER_MINUTE: ${RATE_LIMIT_EXECUTE_PER_MINUTE:-10} RATE_LIMIT_EXECUTE_PER_DAY: ${RATE_LIMIT_EXECUTE_PER_DAY:-50} @@ -111,8 +101,7 @@ services: start_period: 15s restart: unless-stopped - # Crash-loops until Temporal answers on 7233 (auto-setup has no usable - # healthcheck); `restart: unless-stopped` converges it. 
+ # crash-loops until Temporal answers (no usable healthcheck); restart converges it worker: image: ai-studio-runtime build: *runtime-build @@ -125,7 +114,7 @@ services: depends_on: app-db: condition: service_healthy - # healthy = migrations applied — the worker writes to the same schema + # backend healthy = migrations applied backend: condition: service_healthy temporal: @@ -139,7 +128,7 @@ services: dockerfile: deploy/ai-studio/Dockerfile target: web args: - # Empty -> SPA calls /api on its own origin, proxied by this nginx. + # empty -> SPA calls /api on its own origin via this nginx VITE_BACKEND_URL: ${VITE_BACKEND_URL:-} ports: - '${WEB_BIND:-0.0.0.0}:${WEB_PORT:-8080}:80' diff --git a/deploy/ai-studio/nginx/default.conf b/deploy/ai-studio/nginx/default.conf index 396c1dd6c..501c28d4f 100644 --- a/deploy/ai-studio/nginx/default.conf +++ b/deploy/ai-studio/nginx/default.conf @@ -1,8 +1,5 @@ -# AI Studio — SPA + API reverse proxy. -# -# This container is the only public surface of the stack. TLS is expected to -# terminate in front of it (cloud ingress / load balancer / a host-level -# certbot'd nginx) — see deploy/ai-studio/README.md for the options. +# AI Studio — SPA + /api reverse proxy; the stack's only public surface. +# TLS terminates in front (see README.md). server { listen 80; @@ -21,12 +18,10 @@ server { gzip on; gzip_types text/css application/javascript application/json image/svg+xml; - # Backend caps request bodies at 1 MB and answers with a structured - # error; keep nginx's own limit above it so the backend owns that path. + # backend enforces 1 MB with a structured error — stay above it client_max_body_size 2m; - # Live execution streams over SSE: hold the connection open, never - # buffer, and outlast the backend's 15s heartbeat interval. 
+ # SSE: never buffer, outlast the 15s heartbeat location ~ ^/api/executions/.+/stream$ { proxy_pass $backend_upstream; proxy_http_version 1.1; @@ -50,7 +45,7 @@ server { proxy_set_header X-Forwarded-Proto $scheme; } - # Vite emits content-hashed filenames under /assets — cache forever. + # content-hashed filenames — cache forever location /assets/ { add_header Cache-Control "public, max-age=31536000, immutable"; try_files $uri =404; diff --git a/tools/deployment/ansible/deploy-application/main.yml b/tools/deployment/ansible/deploy-application/main.yml index f83dc6ddb..0fd88edf3 100644 --- a/tools/deployment/ansible/deploy-application/main.yml +++ b/tools/deployment/ansible/deploy-application/main.yml @@ -1,19 +1,9 @@ --- -# Deploys the AI Studio execution stack to the Docker Swarm cluster, -# following the workflow-builder repo's deploy-application playbook. The -# images are the same two targets the local compose builds -# (deploy/ai-studio/Dockerfile); only the orchestration differs. Database -# migrations run inside the backend at boot, so there is no migration step -# here — the backend restarts until Postgres answers, then migrates itself. -# -# Differences from the workflow-builder playbook, all forced by AI Studio -# being stateful: -# - Postgres x2 + Temporal services with named volumes, pinned to the node -# labeled `ai-studio-data=true` (Swarm volumes are node-local). -# - Services get short DNS aliases (backend, app-db, temporal, ...) so the -# same images and env defaults work under compose and Swarm. -# - Gatekeeper is optional (AUTH_ENABLED=true): the WB-229 public demo is -# deliberately login-free; internal stage/dev instances can enable it. +# AI Studio on the Swarm cluster, following the workflow-builder repo's +# deploy-application playbook with the same images as deploy/ai-studio. +# No migration step: the backend migrates itself at boot. 
DB/Temporal +# volumes are pinned via node.labels.ai-studio-data (Swarm volumes are +# node-local); gatekeeper is optional (AUTH_ENABLED). - hosts: master diff --git a/tools/deployment/scripts/build-docker.sh b/tools/deployment/scripts/build-docker.sh index 10e90db1e..2e97e77b2 100755 --- a/tools/deployment/scripts/build-docker.sh +++ b/tools/deployment/scripts/build-docker.sh @@ -1,12 +1,7 @@ #!/bin/sh -# Build + push the AI Studio images to ACR, mirroring the workflow-builder -# repo's tools/deployment/scripts/build-docker.sh. Both images come from -# the same multi-target Dockerfile in deploy/ai-studio/ — this script only -# adds registry tagging; the images are identical to the local-compose ones. -# -# Bitbucket-style env vars are honored when present (TAG_PREFIX, -# BITBUCKET_COMMIT, BITBUCKET_DEPLOYMENT_ENVIRONMENT) and fall back to git + -# DEPLOY_ENV so the script also runs from a workstation or GitHub Actions. +# Build + push the AI Studio images (deploy/ai-studio/Dockerfile) to ACR, +# mirroring workflow-builder's build-docker.sh. Bitbucket CI vars win when +# present; git/DEPLOY_ENV fallbacks keep it runnable from a workstation. set -eu APP_NAME="ai-studio" @@ -27,8 +22,7 @@ done ALLOWED_ENVIRONMENTS="stage dev prod" if echo "$ALLOWED_ENVIRONMENTS" | grep -w "$ENVIRONMENT" > /dev/null; then - # setup-az.sh exists in the deployment CI image; logging in by other means - # (az acr login / docker login) is fine when running elsewhere + # setup-az.sh only exists in the deployment CI image [ -f /var/setup-az.sh ] && . 
/var/setup-az.sh for TARGET in runtime web; do docker push "$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG" diff --git a/tools/deployment/scripts/deploy.sh b/tools/deployment/scripts/deploy.sh index 1ac078e57..fac8f5be4 100755 --- a/tools/deployment/scripts/deploy.sh +++ b/tools/deployment/scripts/deploy.sh @@ -1,8 +1,6 @@ #!/bin/sh -# Deploy the AI Studio stack to the Docker Swarm cluster, mirroring the -# workflow-builder repo's tools/deployment/scripts/deploy.sh. The setup -# scripts are baked into the synergycodes deployment CI image; guards let the -# playbook also run from a workstation with az + ansible already configured. +# Deploy the AI Studio stack to Swarm, mirroring workflow-builder's +# deploy.sh. The setup scripts exist only in the deployment CI image. set -eu [ -f /var/setup-az.sh ] && . /var/setup-az.sh From 130fdc7958788829ce67768f789974c557a1f71d Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Thu, 11 Jun 2026 14:31:47 +0200 Subject: [PATCH 09/11] revert(execution-worker): move the decision catch-all fix to its own pr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts the empty-conditions-as-catch-all change (ccf7375) and its decision log. It changes execution semantics and supersedes a clause of decision-no-match.decision-log.md — that deserves a focused review, not a ride-along in a deployment PR. 
--- .../decision-catch-all.decision-log.md | 74 ------------------- .../src/executors/decision.test.ts | 15 ++-- .../src/executors/decision.ts | 3 +- .../swarm-alignment.decision-log.md | 2 +- 4 files changed, 8 insertions(+), 86 deletions(-) delete mode 100644 apps/execution-worker/decision-catch-all.decision-log.md diff --git a/apps/execution-worker/decision-catch-all.decision-log.md b/apps/execution-worker/decision-catch-all.decision-log.md deleted file mode 100644 index fef839295..000000000 --- a/apps/execution-worker/decision-catch-all.decision-log.md +++ /dev/null @@ -1,74 +0,0 @@ -### Title: Decision branch with no conditions is the explicit catch-all - -### Proposed by: Jan Librowski - -### Date: 10.06.2026 - -## Context - -End-to-end verification of the WB-229 demo deployment failed on the -reference workload: the Sales Inquiry Pipeline's classifier returned -`**Type:** general`, no conditional branch matched, and the run ended in -`execution_failed` — despite the template shipping a 'General' branch with -`conditions: []` as its designed fallback. - -The codebase contradicted itself on what a catch-all is: - -- `decision-no-match.decision-log.md` (execution-core, 29.04.2026) decided - **strict fail-fast on no match** — correct and kept — but its Cons section - declared an empty-conditions branch non-matching, requiring a - tautological condition (`x === x`) as the catch-all idiom. A unit test - pinned that. -- The executor's own `no_branch_matched` error message instructed the - opposite: _"Add an explicit catch-all branch with no conditions."_ -- The reference template (`sales-inquiry-flow.ts`) followed the error - message, not the test — and was broken for any input classified outside - its keyword branches. Local demos always matched 'pricing'/'technical', so - this never surfaced until a different model classified an input as - 'general'. 
- -Three artifacts said "empty = catch-all", one said the opposite; the -user-facing ones (error message, reference template) all pointed one way. - -## Decision - -`branchMatches` in `apps/execution-worker/src/executors/decision.ts` now -returns `true` for an empty `conditions[]`. First-match order is preserved, -so a catch-all only fires when placed after the conditional branches. The -strict throw from the original decision is untouched: a decision node whose -branches all have conditions and none match still fails with -`no_branch_matched`. - -This supersedes the "empty conditions are non-matching" bullet (and the -test pinning it) from `decision-no-match.decision-log.md`. The fail-fast -core of that decision stands. - -## Alternative Options Considered - -- **Keep the semantics, fix the template with a tautological condition** — - rejected: every UI author following the error message's instruction would - keep hitting the same failure, and `isEqual 'a' 'a'` as the blessed - catch-all idiom is noise a property panel can't explain. -- **`isDefault: true` flag on a designated branch** — still the cleaner - long-term UX (already noted in the original log); still deferred for the - same reason: type + Zod schema + properties-panel changes, separate - ticket. - -## Consequences - -- **Pros** - - The shipped reference template and the executor's error message are now - both true. - - Catch-all is expressible in the UI as-is (an empty branch), no magic - conditions. -- **Cons** - - Semantics change: a flow that contained an empty-conditions branch and - relied on the node failing now routes through that branch. No known - flow does this — the only shipped example wanted the opposite. - - A _misplaced_ empty branch (before conditional ones) silently wins due - to first-match order; the matched branch is visible in the - `matchedBranch` output and event log. 
- -## Status - -Accepted diff --git a/apps/execution-worker/src/executors/decision.test.ts b/apps/execution-worker/src/executors/decision.test.ts index 2f11c3fbd..ac1f98de2 100644 --- a/apps/execution-worker/src/executors/decision.test.ts +++ b/apps/execution-worker/src/executors/decision.test.ts @@ -82,20 +82,17 @@ describe('executeDecision', () => { } }); - it('treats a branch with no conditions as the catch-all', () => { + it('treats a branch with no conditions as non-matching (so callers must throw or use explicit operators)', () => { + // Empty conditions array — branchMatches returns false, so this is NOT + // a default. If someone wants a default, they need a branch whose + // conditions evaluate to true (e.g. isEqual 'x' 'x'). const node = decisionNode([ { - sourceHandle: 'no', - conditions: [{ x: 'a', y: 'b', comparisonOperator: 'isEqual' }], - }, - { - sourceHandle: 'fallback', + sourceHandle: 'empty', conditions: [], }, ]); - const result = executeDecision(node, context()); - - expect(result.nextPort).toBe('fallback'); + expect(() => executeDecision(node, context())).toThrowError(NodeExecutionError); }); }); diff --git a/apps/execution-worker/src/executors/decision.ts b/apps/execution-worker/src/executors/decision.ts index d9e8eefe9..847fa2db5 100644 --- a/apps/execution-worker/src/executors/decision.ts +++ b/apps/execution-worker/src/executors/decision.ts @@ -28,8 +28,7 @@ export function executeDecision(node: DecisionNode, context: ExecutionContext): } function branchMatches(conditions: DecisionBranchCondition[], context: ExecutionContext): boolean { - // no conditions = the explicit catch-all the no_branch_matched error instructs authors to add - if (conditions.length === 0) return true; + if (conditions.length === 0) return false; let result = evaluateCondition(conditions[0]!, context); for (let index = 1; index < conditions.length; index++) { diff --git a/tools/deployment/swarm-alignment.decision-log.md 
b/tools/deployment/swarm-alignment.decision-log.md index 83d58de78..c60d051e7 100644 --- a/tools/deployment/swarm-alignment.decision-log.md +++ b/tools/deployment/swarm-alignment.decision-log.md @@ -6,7 +6,7 @@ ## Context -The compose-based deployment in `deploy/ai-studio/` (see its decision log) +The compose-based deployment in `deploy/ai-studio/` targets a single Docker host and ships TLS as a bring-your-own concern. The company's actual Azure footprint, found in the `workflow-builder` repo's `tools/deployment/`, is different: a self-managed Docker Swarm cluster with From a03b9a2fc8e9a94f103fe1a96444b57e8a7984ff Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Thu, 11 Jun 2026 14:32:05 +0200 Subject: [PATCH 10/11] docs(deploy): drop the deployment decision log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AI Studio is a POC — the README and the comments on the non-obvious pieces carry what operators need; full architecture rationale is premature at this stage. --- .../ai-studio-deployment.decision-log.md | 148 ------------------ 1 file changed, 148 deletions(-) delete mode 100644 deploy/ai-studio/ai-studio-deployment.decision-log.md diff --git a/deploy/ai-studio/ai-studio-deployment.decision-log.md b/deploy/ai-studio/ai-studio-deployment.decision-log.md deleted file mode 100644 index 40e1fc6ac..000000000 --- a/deploy/ai-studio/ai-studio-deployment.decision-log.md +++ /dev/null @@ -1,148 +0,0 @@ -### Title: Containerized AI Studio deployment — portable compose stack - -### Proposed by: Jan Librowski - -### Date: 10.06.2026 - -## Context - -WB-229 (lean public demo on an Azure VM) and its parent WB-155 (deployment -preparations) needed a production deployment story for the AI Studio -execution stack: backend (Hono), execution-worker (Temporal), two Postgres -instances, a Temporal server, and the static SPA. Until now only `pnpm dev` -plus an infra-only compose existed — no Dockerfiles for any app. 
- -Constraints that shaped the design: - -- **Portability over Azure ergonomics.** Workflow Builder is sold to external - customers; whatever ships here must run on AWS / GCP / on-prem / bare - Docker without re-architecting. DevOps asked for containerization - specifically for ease of portability and setup. -- **Surprise bills must be impossible** (WB-229): a hard OpenRouter spend cap - (dashboard Guardrail) plus an in-app per-IP abuse gate. -- **The local dev flow must survive** (`pnpm dev:ai-studio` + `pnpm -infra:up`) — contributors rely on it; nothing in dev changes. -- The repo pins Node 22.12.0 + pnpm 10.17.0 with `engineStrict`, and the - Temporal worker bundles its workflow entrypoint **from TS source at - runtime**, so the source tree must be present in the worker container. - -## Decision - -Everything lives in `deploy/ai-studio/`: one multi-target Dockerfile, a -production `docker-compose.yml`, the nginx config, `.env.example`, and a -DevOps-facing README. - -1. **One Dockerfile, two targets** (`runtime`, `web`), built - with the repo root as context (pnpm `workspace:*` links require it). A - shared `source` stage does `pnpm fetch` against a BuildKit cache mount, so - per-target installs are store-hits. -2. **tsx in production, no build step.** Backend and worker run TS through - `tsx` exactly as in dev — `tsx` moved from a hoisted root devDependency to - a real dependency of both apps, plus `start:prod` scripts (the existing - `start` scripts hard-require a `.env` file; containers inject env - directly). This sidesteps the Temporal-sandbox-needs-source constraint - entirely — there is no bundling step to get wrong. -3. **One shared `runtime` image for backend and worker**; the compose - `command` picks the entrypoint. One image to build, push, and version. -4. **Migrations on backend boot** (revised 11.06.2026 — originally a - one-shot `migrate` compose service). 
The backend applies pending Drizzle - migrations via drizzle-orm's programmatic migrator before accepting - traffic; on failure it exits and the restart policy retries until - Postgres answers. One less image, no orchestrator-specific ordering — - the same behavior on compose, Swarm, or anything else. Single-replica - assumption: concurrent backends would race the migrator. -5. **nginx is the only public surface.** It serves the SPA and proxies - `/api` to the backend on the internal network; the SSE stream route gets - `proxy_buffering off` + long read timeout. The backend container is - reached through Docker's embedded DNS **re-resolved per request** - (`resolver 127.0.0.11` + variable `proxy_pass`) — a statically resolved - upstream 502s after the backend container is recreated on redeploy. - Postgres ×2, Temporal, and the backend publish no host ports; Temporal UI - is opt-in behind a `debug` profile bound to loopback. TLS terminates in - front (existing ingress or host-level Caddy/certbot — documented in the - README, deliberately not baked into the stack). -6. **Same-origin frontend.** `VITE_BACKEND_URL` is baked empty at build time; - the SPA calls `/api` on its own origin. No CORS, no second hostname, SSE - intact. -7. **pnpm installed via `npm i -g pnpm@10.17.0` in images, not corepack.** - The corepack bundled with Node 22.12.0 cannot load pnpm 10 - (`ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING`) and ships stale signature - keys. Version is duplicated in the Dockerfile — keep in sync with - `packageManager`. -8. **Installs use `--prefer-offline`, not `--offline`**: pnpm propagates - offline mode to lifecycle scripts, and `apps/icons` `prepare` shells out - to `npx @svgr/cli`, which then refuses the network (`ENOTCACHED`). -9. **Per-IP rate limit on the execute route** (`apps/backend`): - fixed-window, in-memory, env-gated (`RATE_LIMIT_EXECUTE_PER_MINUTE/DAY`, - default off so dev is untouched; compose sets 10/min, 50/day). 
- `TRUST_PROXY=true` makes it read the client from `X-Forwarded-For`, which - only our nginx can set. This is the abuse gate; the money cap is the - OpenRouter account Guardrail — two independent controls. -10. **Model pinned per environment, not in code**: compose defaults - `AI_MODEL=mistralai/mistral-small-3.2-24b-instruct` (price re-verified - 2026-06-10 against the OpenRouter API: $0.075/$0.20 per Mtok ≈ $0.0004 - per 3-call template run). Swapping models is an env change. -11. **Pinned images, no `:latest`**: `temporalio/auto-setup:1.29.6.1`, - `temporalio/ui:2.51.0`, `nginx:1.31-alpine`, `node:22.12.0-bookworm-slim` - (exact pin because `engineStrict` rejects any other 22.x). - -Found and fixed during end-to-end verification: the worker ignored -`TEMPORAL_ADDRESS` (`Worker.create` without an explicit connection dials -`127.0.0.1:7233` — invisible in local dev, fatal in containers). - -## Alternative Options Considered - -- **`pnpm deploy` to materialize standalone app bundles** — rejected: pnpm 10 - requires `inject-workspace-packages` or a legacy-mode flag, adding workspace - config churn for no benefit over running from the installed workspace. -- **Compile step (tsc/tsup/esbuild) + plain `node`** — rejected for the MVP: - the worker needs its TS source on disk for Temporal's runtime bundling - anyway, so compilation only helps the backend while doubling the ways the - artifact can diverge from dev. Revisit if image size or cold-start matters. -- **Azure-specific artifacts (Container Apps / AKS manifests, Key Vault - wiring)** — deferred deliberately: WB-229 targets a single VM, and the - portability requirement says external customers must not inherit Azure - glue. The compose file is the customer-facing artifact; platform topology - can wrap it later. -- **Separate Dockerfiles per app** — rejected: three near-identical - install stages to keep in sync; the multi-target file shares layers. 
-- **Rate limiting in nginx (`limit_req`)** — rejected: the limit is - per-execute-route and needs structured JSON 429s consistent with the - backend's error contract; nginx zones would split the policy across two - layers. nginx stays dumb, policy lives where the route lives. -- **Redis-backed rate limiter** — deferred to the scale-ready task (WB-229 - explicitly accepts single-replica in-memory for the MVP). - -## Consequences - -- **Pros** - - `cp .env.example .env && docker compose up -d --build` is the whole - deployment; verified end-to-end (Sales Inquiry Pipeline to - `execution_completed` with live SSE through nginx, rate limiter returning - 429s past the budget). - - The artifact is platform-neutral: any Docker host, no cloud SDK anywhere. - - Secrets only travel through compose `environment`; `.dockerignore` now - excludes `**/.env*` so keys cannot be baked into images (previously - `apps/*/.env` files would have been copied into the build context). - - Dev flow untouched; rate limiter is inert without its env vars. -- **Cons** - - `runtime` image is ~1.9 GB (full source tree + pnpm store hardlinks + - Temporal native bridge). Acceptable for a demo VM; a compile step or - `pnpm deploy` bundle is the known optimization path. - - Any source change invalidates the `COPY . .` layer and reinstalls - (mitigated by the store cache mount; rebuilds are minutes, not tens of). - - `temporalio/auto-setup` is dev-grade by Temporal's own docs — accepted - for the demo, swap for Temporal Cloud / operated cluster under sustained - load (the apps only consume `TEMPORAL_ADDRESS`). - - pnpm version is pinned in two places (root `packageManager` + - Dockerfile). - -## Revisions - -- **11.06.2026** — `migrate` target and service removed; the backend now - migrates itself at boot (Jan's simplification request during WB-229 - review). Dockerfile is down to two targets (`runtime`, `web`). 
- -## Status - -Accepted From d8d80a06079396e62d7f850ed803363754469cb5 Mon Sep 17 00:00:00 2001 From: Jan Librowski Date: Mon, 15 Jun 2026 15:08:15 +0200 Subject: [PATCH 11/11] fix(deploy): add healthcheck, restart policies, and overlay driver to swarm stack MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backend healthcheck (fetch /api/health) lets Swarm detect when migrations are done — without it the worker can hit a pre-migration schema. Explicit restart_policy on every service replaces the implicit Swarm default; crash-looping services (worker, temporal) get max_attempts. Internal network gets driver: overlay for clarity. --- .../ansible/deploy-application/main.yml | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tools/deployment/ansible/deploy-application/main.yml b/tools/deployment/ansible/deploy-application/main.yml index 0fd88edf3..a5a41f350 100644 --- a/tools/deployment/ansible/deploy-application/main.yml +++ b/tools/deployment/ansible/deploy-application/main.yml @@ -114,11 +114,20 @@ TRUST_PROXY: 'true' RATE_LIMIT_EXECUTE_PER_MINUTE: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_MINUTE') or '10' }}" RATE_LIMIT_EXECUTE_PER_DAY: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_DAY') or '50' }}" + healthcheck: + test: ['CMD', 'node', '-e', "fetch('http://127.0.0.1:3001/api/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"] + interval: 10s + timeout: 5s + retries: 6 + start_period: 15s networks: internal: - # the web image's nginx proxies to http://backend:3001 aliases: [backend] deploy: + restart_policy: + condition: any + delay: 5s + max_attempts: 10 placement: constraints: - node.role==worker @@ -134,6 +143,10 @@ networks: internal: deploy: + restart_policy: + condition: any + delay: 5s + max_attempts: 20 placement: constraints: - node.role==worker @@ -150,6 +163,9 @@ internal: aliases: [app-db] deploy: + restart_policy: + condition: any + delay: 5s placement: 
constraints: - node.labels.ai-studio-data==true @@ -166,6 +182,9 @@ internal: aliases: [temporal-db] deploy: + restart_policy: + condition: any + delay: 5s placement: constraints: - node.labels.ai-studio-data==true @@ -182,6 +201,10 @@ internal: aliases: [temporal] deploy: + restart_policy: + condition: any + delay: 5s + max_attempts: 20 placement: constraints: - node.role==worker @@ -192,6 +215,7 @@ networks: internal: + driver: overlay traefik-host-external: external: true