From 1f6260788c1c4eff36adf9afb54c1398ac9d8e77 Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Wed, 10 Jun 2026 02:04:21 +0200
Subject: [PATCH 01/11] fix(execution-worker): connect to TEMPORAL_ADDRESS
 instead of implicit localhost
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Worker.create without an explicit connection dials 127.0.0.1:7233,
ignoring the TEMPORAL_ADDRESS env var — correct in local dev, broken in
any deployment where Temporal is not on loopback.
---
 apps/execution-worker/src/engines/temporal/worker.ts | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/apps/execution-worker/src/engines/temporal/worker.ts b/apps/execution-worker/src/engines/temporal/worker.ts
index 326ed7c5a..49ecd295e 100644
--- a/apps/execution-worker/src/engines/temporal/worker.ts
+++ b/apps/execution-worker/src/engines/temporal/worker.ts
@@ -1,4 +1,4 @@
-import { Worker } from '@temporalio/worker';
+import { NativeConnection, Worker } from '@temporalio/worker';
 import 'dotenv/config';
 import { fileURLToPath } from 'node:url';
 
@@ -42,7 +42,12 @@ const activities = {
   },
 };
 
+// Without an explicit connection the worker silently dials 127.0.0.1:7233,
+// ignoring TEMPORAL_ADDRESS — correct in local dev, wrong everywhere else.
+const connection = await NativeConnection.connect({ address: env.TEMPORAL_ADDRESS });
+
 const worker = await Worker.create({
+  connection,
   workflowsPath: fileURLToPath(new URL('workflows/run-workflow.ts', import.meta.url)),
   activities,
   taskQueue,

From ccf7375b43ea77b21bdd0c2f0dfc7b47e6c34b4d Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Wed, 10 Jun 2026 02:04:35 +0200
Subject: [PATCH 02/11] fix(execution-worker): treat empty-conditions decision
 branch as catch-all
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The no_branch_matched error message and the Sales Inquiry reference
template both treat a branch with no conditions as the explicit
catch-all, but branchMatches returned false for it — any input
classified outside the keyword branches failed the whole run.

Supersedes the empty-conditions bullet of
packages/execution-core/decision-no-match.decision-log.md (the strict
fail-fast core of that decision is unchanged); see
apps/execution-worker/decision-catch-all.decision-log.md.
---
 .../decision-catch-all.decision-log.md        | 74 +++++++++++++++++++
 .../src/executors/decision.test.ts            | 19 +++--
 .../src/executors/decision.ts                 |  6 +-
 3 files changed, 92 insertions(+), 7 deletions(-)
 create mode 100644 apps/execution-worker/decision-catch-all.decision-log.md

diff --git a/apps/execution-worker/decision-catch-all.decision-log.md b/apps/execution-worker/decision-catch-all.decision-log.md
new file mode 100644
index 000000000..fef839295
--- /dev/null
+++ b/apps/execution-worker/decision-catch-all.decision-log.md
@@ -0,0 +1,74 @@
+### Title: Decision branch with no conditions is the explicit catch-all
+
+### Proposed by: Jan Librowski
+
+### Date: 10.06.2026
+
+## Context
+
+End-to-end verification of the WB-229 demo deployment failed on the
+reference workload: the Sales Inquiry Pipeline's classifier returned
+`**Type:** general`, no conditional branch matched, and the run ended in
+`execution_failed` — despite the template shipping a 'General' branch with
+`conditions: []` as its designed fallback.
+
+The codebase contradicted itself on what a catch-all is:
+
+- `decision-no-match.decision-log.md` (execution-core, 29.04.2026) decided
+  **strict fail-fast on no match** — correct and kept — but its Cons section
+  declared an empty-conditions branch non-matching, requiring a
+  tautological condition (`x === x`) as the catch-all idiom. A unit test
+  pinned that.
+- The executor's own `no_branch_matched` error message instructed the
+  opposite: _"Add an explicit catch-all branch with no conditions."_
+- The reference template (`sales-inquiry-flow.ts`) followed the error
+  message, not the test — and was broken for any input classified outside
+  its keyword branches. Local demos always matched 'pricing'/'technical', so
+  this never surfaced until a different model classified an input as
+  'general'.
+
+The decision log and its test said "empty = non-matching"; both
+user-facing artifacts (error message, reference template) said the opposite.
+
+## Decision
+
+`branchMatches` in `apps/execution-worker/src/executors/decision.ts` now
+returns `true` for an empty `conditions[]`. First-match order is preserved,
+so a catch-all only fires when placed after the conditional branches. The
+strict throw from the original decision is untouched: a decision node whose
+branches all have conditions and none match still fails with
+`no_branch_matched`.
+
+This supersedes the "empty conditions are non-matching" bullet (and the
+test pinning it) from `decision-no-match.decision-log.md`. The fail-fast
+core of that decision stands.
+
+## Alternative Options Considered
+
+- **Keep the semantics, fix the template with a tautological condition** —
+  rejected: every UI author following the error message's instruction would
+  keep hitting the same failure, and `isEqual 'a' 'a'` as the blessed
+  catch-all idiom is noise a property panel can't explain.
+- **`isDefault: true` flag on a designated branch** — still the cleaner
+  long-term UX (already noted in the original log); still deferred for the
+  same reason: type + Zod schema + properties-panel changes, separate
+  ticket.
+
+## Consequences
+
+- **Pros**
+  - The shipped reference template and the executor's error message are now
+    both true.
+  - Catch-all is expressible in the UI as-is (an empty branch), no magic
+    conditions.
+- **Cons**
+  - Semantics change: a flow that contained an empty-conditions branch and
+    relied on the node failing now routes through that branch. No known
+    flow does this — the only shipped example wanted the opposite.
+  - A _misplaced_ empty branch (before conditional ones) silently wins due
+    to first-match order; the matched branch is visible in the
+    `matchedBranch` output and event log.
+
+## Status
+
+Accepted
diff --git a/apps/execution-worker/src/executors/decision.test.ts b/apps/execution-worker/src/executors/decision.test.ts
index ac1f98de2..2d4b31242 100644
--- a/apps/execution-worker/src/executors/decision.test.ts
+++ b/apps/execution-worker/src/executors/decision.test.ts
@@ -82,17 +82,24 @@ describe('executeDecision', () => {
     }
   });
 
-  it('treats a branch with no conditions as non-matching (so callers must throw or use explicit operators)', () => {
-    // Empty conditions array — branchMatches returns false, so this is NOT
-    // a default. If someone wants a default, they need a branch whose
-    // conditions evaluate to true (e.g. isEqual 'x' 'x').
+  it('treats a branch with no conditions as the catch-all', () => {
+    // The contract the no_branch_matched error instructs authors to use, and
+    // what the reference Sales Inquiry template relies on for its 'General'
+    // branch. First-match order applies: a catch-all placed after conditional
+    // branches only fires when none of them matched.
     const node = decisionNode([
       {
-        sourceHandle: 'empty',
+        sourceHandle: 'no',
+        conditions: [{ x: 'a', y: 'b', comparisonOperator: 'isEqual' }],
+      },
+      {
+        sourceHandle: 'fallback',
         conditions: [],
       },
     ]);
 
-    expect(() => executeDecision(node, context())).toThrowError(NodeExecutionError);
+    const result = executeDecision(node, context());
+
+    expect(result.nextPort).toBe('fallback');
   });
 });
diff --git a/apps/execution-worker/src/executors/decision.ts b/apps/execution-worker/src/executors/decision.ts
index 847fa2db5..0fbccb140 100644
--- a/apps/execution-worker/src/executors/decision.ts
+++ b/apps/execution-worker/src/executors/decision.ts
@@ -28,7 +28,11 @@ export function executeDecision(node: DecisionNode, context: ExecutionContext):
 }
 
 function branchMatches(conditions: DecisionBranchCondition[], context: ExecutionContext): boolean {
-  if (conditions.length === 0) return false;
+  // A branch with no conditions is the explicit catch-all — the contract the
+  // error above instructs authors to use, and what the reference Sales
+  // Inquiry template ships ('General' branch). First-match order still
+  // applies, so a catch-all only fires when placed after conditional branches.
+  if (conditions.length === 0) return true;
 
   let result = evaluateCondition(conditions[0]!, context);
   for (let index = 1; index < conditions.length; index++) {

From 7ca365ed9bc2106e38ad3645f81949d0e6d69472 Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Wed, 10 Jun 2026 02:04:48 +0200
Subject: [PATCH 03/11] feat(backend): add per-ip rate limit on the execute
 route
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixed-window, in-memory limiter (WB-229 abuse gate). Disabled unless
RATE_LIMIT_EXECUTE_PER_MINUTE / RATE_LIMIT_EXECUTE_PER_DAY are set, so
local dev is unaffected. TRUST_PROXY=true reads the client IP from
X-Forwarded-For — only enable behind a proxy that sets it.
---
 apps/backend/src/env.ts                       |   5 +
 .../backend/src/middleware/rate-limit.test.ts | 134 ++++++++++++++++++
 apps/backend/src/middleware/rate-limit.ts     | 121 ++++++++++++++++
 apps/backend/src/server.ts                    |  17 +++
 4 files changed, 277 insertions(+)
 create mode 100644 apps/backend/src/middleware/rate-limit.test.ts
 create mode 100644 apps/backend/src/middleware/rate-limit.ts

diff --git a/apps/backend/src/env.ts b/apps/backend/src/env.ts
index 12645d6d6..349b5602f 100644
--- a/apps/backend/src/env.ts
+++ b/apps/backend/src/env.ts
@@ -12,4 +12,9 @@ export const env = {
   HOST: envOr('HOST', '127.0.0.1'),
   DATABASE_URL: envOr('DATABASE_URL', 'postgresql://wb:wb@127.0.0.1:5432/workflow_builder'),
   TEMPORAL_ADDRESS: envOr('TEMPORAL_ADDRESS', '127.0.0.1:7233'),
+  // Per-IP limits on the execute route. 0 = disabled (local dev default);
+  // the production compose in deploy/ai-studio sets both.
+  RATE_LIMIT_EXECUTE_PER_MINUTE: Number(envOr('RATE_LIMIT_EXECUTE_PER_MINUTE', '0')),
+  RATE_LIMIT_EXECUTE_PER_DAY: Number(envOr('RATE_LIMIT_EXECUTE_PER_DAY', '0')),
+  TRUST_PROXY: envOr('TRUST_PROXY', 'false') === 'true',
 };
diff --git a/apps/backend/src/middleware/rate-limit.test.ts b/apps/backend/src/middleware/rate-limit.test.ts
new file mode 100644
index 000000000..f3f78b52b
--- /dev/null
+++ b/apps/backend/src/middleware/rate-limit.test.ts
@@ -0,0 +1,134 @@
+import { Hono } from 'hono';
+import { describe, expect, it } from 'vitest';
+
+import { type RateLimitOptions, createRateLimitMiddleware } from './rate-limit';
+
+const MINUTE_MS = 60_000;
+const DAY_MS = 24 * 60 * 60 * 1000;
+
+/**
+ * Build a Hono app mirroring the production wiring in `server.ts`: the
+ * limiter guards a single execute-shaped route. Tests drive the clock through
+ * the injectable `now` and identify callers via X-Forwarded-For (trustProxy),
+ * since `app.request()` has no underlying socket.
+ */
+function makeApp(overrides: Partial<RateLimitOptions> = {}) {
+  let timestamp = 0;
+  const app = new Hono();
+  app.use(
+    '/api/workflows/:id/execute',
+    createRateLimitMiddleware({
+      perMinute: 2,
+      perDay: 5,
+      trustProxy: true,
+      now: () => timestamp,
+      ...overrides,
+    }),
+  );
+  app.post('/api/workflows/:id/execute', (c) => c.json({ ok: true }, 202));
+
+  return {
+    app,
+    advance(ms: number) {
+      timestamp += ms;
+    },
+    execute(ip = '203.0.113.7') {
+      return app.request('/api/workflows/wf-1/execute', {
+        method: 'POST',
+        headers: { 'x-forwarded-for': ip },
+      });
+    },
+  };
+}
+
+describe('createRateLimitMiddleware', () => {
+  it('allows requests under the limit', async () => {
+    const { execute } = makeApp();
+
+    const first = await execute();
+    const second = await execute();
+    expect(first.status).toBe(202);
+    expect(second.status).toBe(202);
+  });
+
+  it('rejects with 429 and Retry-After once the minute limit is hit', async () => {
+    const { execute, advance } = makeApp();
+
+    await execute();
+    await execute();
+    advance(10_000);
+
+    const response = await execute();
+    expect(response.status).toBe(429);
+    expect(response.headers.get('Retry-After')).toBe('50');
+    expect(await response.json()).toMatchObject({ code: 'rate_limited', retryAfterSeconds: 50 });
+  });
+
+  it('tracks each IP independently', async () => {
+    const { execute } = makeApp();
+
+    await execute('203.0.113.7');
+    await execute('203.0.113.7');
+    const blocked = await execute('203.0.113.7');
+    const otherIp = await execute('198.51.100.9');
+    expect(blocked.status).toBe(429);
+    expect(otherIp.status).toBe(202);
+  });
+
+  it('resets the minute window after it elapses', async () => {
+    const { execute, advance } = makeApp();
+
+    await execute();
+    await execute();
+    const blocked = await execute();
+    expect(blocked.status).toBe(429);
+
+    advance(MINUTE_MS);
+    const allowedAgain = await execute();
+    expect(allowedAgain.status).toBe(202);
+  });
+
+  it('enforces the day limit across minute windows', async () => {
+    const { execute, advance } = makeApp();
+
+    for (let index = 0; index < 5; index++) {
+      const allowed = await execute();
+      expect(allowed.status).toBe(202);
+      advance(MINUTE_MS);
+    }
+
+    const response = await execute();
+    expect(response.status).toBe(429);
+    // 5 minutes into the day window -> retry once the remaining day elapses
+    expect(response.headers.get('Retry-After')).toBe(String((DAY_MS - 5 * MINUTE_MS) / 1000));
+  });
+
+  it('resets the day window after it elapses', async () => {
+    const { execute, advance } = makeApp({ perMinute: 0 });
+
+    for (let index = 0; index < 5; index++) {
+      await execute();
+    }
+    const blocked = await execute();
+    expect(blocked.status).toBe(429);
+
+    advance(DAY_MS);
+    const allowedAgain = await execute();
+    expect(allowedAgain.status).toBe(202);
+  });
+
+  it('uses the first X-Forwarded-For hop as the client identity', async () => {
+    const { app } = makeApp();
+
+    const request = (chain: string) =>
+      app.request('/api/workflows/wf-1/execute', {
+        method: 'POST',
+        headers: { 'x-forwarded-for': chain },
+      });
+
+    await request('203.0.113.7, 10.0.0.1');
+    await request('203.0.113.7, 10.0.0.2');
+    const blocked = await request('203.0.113.7, 10.0.0.3');
+    expect(blocked.status).toBe(429);
+  });
+});
diff --git a/apps/backend/src/middleware/rate-limit.ts b/apps/backend/src/middleware/rate-limit.ts
new file mode 100644
index 000000000..8496a94f1
--- /dev/null
+++ b/apps/backend/src/middleware/rate-limit.ts
@@ -0,0 +1,121 @@
+import { getConnInfo } from '@hono/node-server/conninfo';
+import type { Context, MiddlewareHandler } from 'hono';
+
+export type RateLimitOptions = {
+  /** Max requests per IP per minute. 0 disables the minute window. */
+  perMinute: number;
+  /** Max requests per IP per day. 0 disables the day window. */
+  perDay: number;
+  /**
+   * Use the first X-Forwarded-For entry as the client IP. Only enable behind
+   * a proxy that sets the header (the deploy nginx does) — a directly-reachable
+   * backend, or a proxy that appends to a client-sent value, allows spoofing.
+   */
+  trustProxy: boolean;
+  /** Injectable clock for tests. */
+  now?: () => number;
+};
+
+type WindowState = {
+  windowStart: number;
+  count: number;
+};
+
+type IpState = {
+  minute: WindowState;
+  day: WindowState;
+};
+
+const MINUTE_MS = 60_000;
+const DAY_MS = 24 * 60 * 60 * 1000;
+const SWEEP_INTERVAL_MS = 10 * MINUTE_MS;
+
+function clientIp(c: Context, trustProxy: boolean): string {
+  if (trustProxy) {
+    const forwardedFor = c.req.header('x-forwarded-for');
+    const first = forwardedFor?.split(',')[0]?.trim();
+    if (first) {
+      return first;
+    }
+  }
+  try {
+    return getConnInfo(c).remote.address ?? 'unknown';
+  } catch {
+    // No underlying socket (e.g. app.request() in tests)
+    return 'unknown';
+  }
+}
+
+function hitWindow(state: WindowState, limit: number, durationMs: number, now: number): number | null {
+  if (limit <= 0) {
+    return null;
+  }
+  if (now - state.windowStart >= durationMs) {
+    state.windowStart = now;
+    state.count = 0;
+  }
+  if (state.count >= limit) {
+    return state.windowStart + durationMs - now;
+  }
+  return null;
+}
+
+/**
+ * Fixed-window, in-memory, per-IP rate limiter for the execute route.
+ *
+ * Deliberately process-local (WB-229 lean MVP runs a single backend
+ * replica): counters reset on restart and are not shared across replicas.
+ * The OpenRouter account Guardrail is the independent hard spend cap; this
+ * gate only stops a single IP from burning the daily budget.
+ */
+export function createRateLimitMiddleware(options: RateLimitOptions): MiddlewareHandler {
+  const { perMinute, perDay, trustProxy } = options;
+  const now = options.now ?? Date.now;
+  const states = new Map<string, IpState>();
+  let lastSweep = now();
+
+  return async (c, next) => {
+    const timestamp = now();
+
+    if (timestamp - lastSweep >= SWEEP_INTERVAL_MS) {
+      lastSweep = timestamp;
+      for (const [ip, state] of states) {
+        if (timestamp - state.day.windowStart >= DAY_MS && timestamp - state.minute.windowStart >= MINUTE_MS) {
+          states.delete(ip);
+        }
+      }
+    }
+
+    const ip = clientIp(c, trustProxy);
+    let state = states.get(ip);
+    if (!state) {
+      state = {
+        minute: { windowStart: timestamp, count: 0 },
+        day: { windowStart: timestamp, count: 0 },
+      };
+      states.set(ip, state);
+    }
+
+    const minuteRetry = hitWindow(state.minute, perMinute, MINUTE_MS, timestamp);
+    const dayRetry = hitWindow(state.day, perDay, DAY_MS, timestamp);
+    const retryAfterMs = Math.max(minuteRetry ?? 0, dayRetry ?? 0);
+
+    if (retryAfterMs > 0) {
+      const retryAfterSeconds = Math.ceil(retryAfterMs / 1000);
+      c.header('Retry-After', String(retryAfterSeconds));
+      return c.json(
+        {
+          code: 'rate_limited',
+          message: 'Too many workflow executions from this address — try again later',
+          retryAfterSeconds,
+        },
+        429,
+      );
+    }
+
+    state.minute.count += 1;
+    state.day.count += 1;
+
+    await next();
+  };
+}
diff --git a/apps/backend/src/server.ts b/apps/backend/src/server.ts
index 61a36f952..10e52c1d9 100644
--- a/apps/backend/src/server.ts
+++ b/apps/backend/src/server.ts
@@ -14,6 +14,7 @@ import {
 } from './auth';
 import { env } from './env';
 import { logger } from './logger';
+import { createRateLimitMiddleware } from './middleware/rate-limit';
 import { createExecutionsRoutes } from './routes/executions';
 import { createWorkflowsRoutes } from './routes/workflows';
 
@@ -52,6 +53,22 @@ app.get('/api/health', (c) => c.json({ status: 'ok' }));
 
 app.use('/api/*', createAuthMiddleware(authPort));
 
+if (env.RATE_LIMIT_EXECUTE_PER_MINUTE > 0 || env.RATE_LIMIT_EXECUTE_PER_DAY > 0) {
+  app.use(
+    '/api/workflows/:id/execute',
+    createRateLimitMiddleware({
+      perMinute: env.RATE_LIMIT_EXECUTE_PER_MINUTE,
+      perDay: env.RATE_LIMIT_EXECUTE_PER_DAY,
+      trustProxy: env.TRUST_PROXY,
+    }),
+  );
+  logger.info('execute rate limit enabled', {
+    perMinute: env.RATE_LIMIT_EXECUTE_PER_MINUTE,
+    perDay: env.RATE_LIMIT_EXECUTE_PER_DAY,
+    trustProxy: env.TRUST_PROXY,
+  });
+}
+
 app.route('/api/workflows', createWorkflowsRoutes(assertAuthorized));
 app.route('/api/executions', createExecutionsRoutes(assertAuthorized));
 

From b27b90291e01657c72d8b8ebecdfd1f256bda3c5 Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Wed, 10 Jun 2026 02:05:06 +0200
Subject: [PATCH 04/11] feat(deploy): containerize ai studio stack for
 production deployment

deploy/ai-studio/: multi-target Dockerfile (runtime/migrate/web),
production docker-compose (only nginx public, pinned images, automatic
migrations), nginx SPA+API proxy with SSE tuning and per-request DNS
re-resolution, .env.example with Mistral Small 3.2 default, DevOps
README, and a decision log covering the architecture choices.

tsx becomes a real dependency of backend and worker (start:prod runs
without an .env file); .dockerignore now keeps **/.env out of build
contexts.

Verified end-to-end: Sales Inquiry Pipeline to execution_completed with
live SSE through nginx; rate limiter returns 429 past the budget.
---
 .dockerignore                                 |  16 +-
 CLAUDE.md                                     |   2 +
 apps/backend/package.json                     |   2 +
 apps/execution-worker/package.json            |   4 +-
 deploy/ai-studio/.env.example                 |  36 ++++
 deploy/ai-studio/Dockerfile                   |  73 ++++++++
 deploy/ai-studio/README.md                    | 106 +++++++++++
 .../ai-studio-deployment.decision-log.md      | 140 +++++++++++++++
 deploy/ai-studio/docker-compose.yml           | 164 ++++++++++++++++++
 deploy/ai-studio/nginx/default.conf           |  63 +++++++
 pnpm-lock.yaml                                |   6 +
 11 files changed, 609 insertions(+), 3 deletions(-)
 create mode 100644 deploy/ai-studio/.env.example
 create mode 100644 deploy/ai-studio/Dockerfile
 create mode 100644 deploy/ai-studio/README.md
 create mode 100644 deploy/ai-studio/ai-studio-deployment.decision-log.md
 create mode 100644 deploy/ai-studio/docker-compose.yml
 create mode 100644 deploy/ai-studio/nginx/default.conf

diff --git a/.dockerignore b/.dockerignore
index 84a60d2c3..899987c08 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -2,18 +2,30 @@
 .git
 .vscode
 .idea
+.claude
 
 # external dependencies
 node_modules
+**/node_modules
 
 # docker files
 docker-compose*.yml
 **/Dockerfile*
 
+# build artifacts
+dist/
+**/dist
+coverage/
+**/coverage
+
 # not needed files
 README.md
 tools/
 !tools/deployment/nginx
 .gitignore
-.env
-coverage/
+
+# env files hold secrets (e.g. OPENROUTER_API_KEY) and must never enter the
+# build context — runtime config is injected via docker-compose `environment`
+**/.env
+**/.env.*
+!**/.env.example
diff --git a/CLAUDE.md b/CLAUDE.md
index 5dd077b75..74d0ed0dd 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -42,6 +42,8 @@ Long-running processes already emit stable log lines that scripts and agents can
 
 ```
 tools/              - Root dev scripts: preflight, setup:env, infra wait
+deploy/
+  ai-studio/        - Production deployment: Dockerfile (runtime/migrate/web), compose, nginx, README
 apps/
   demo/             - Reference app consuming the SDK (React + Vite, port 4200)
   ai-studio/        - Reference AI workflow product (React + Vite, port 4201)
diff --git a/apps/backend/package.json b/apps/backend/package.json
index fec96ac60..3f1e1fb1a 100644
--- a/apps/backend/package.json
+++ b/apps/backend/package.json
@@ -6,6 +6,7 @@
   "scripts": {
     "dev": "tsx watch --env-file=.env ./src/server.ts",
     "start": "tsx --env-file=.env ./src/server.ts",
+    "start:prod": "tsx ./src/server.ts",
     "typecheck": "tsc --noEmit",
     "lint": "eslint",
     "lint:fix": "eslint --fix",
@@ -24,6 +25,7 @@
     "drizzle-orm": "^0.44.0",
     "hono": "^4.7.0",
     "postgres": "^3.4.5",
+    "tsx": "^4.19.3",
     "zod": "^4.3.6"
   },
   "devDependencies": {
diff --git a/apps/execution-worker/package.json b/apps/execution-worker/package.json
index 2a3d9aa58..1d394656e 100644
--- a/apps/execution-worker/package.json
+++ b/apps/execution-worker/package.json
@@ -6,6 +6,7 @@
   "scripts": {
     "dev": "tsx watch --env-file=.env ./src/engines/temporal/worker.ts",
     "start": "tsx --env-file=.env ./src/engines/temporal/worker.ts",
+    "start:prod": "tsx ./src/engines/temporal/worker.ts",
     "typecheck": "tsc --noEmit",
     "lint": "eslint",
     "lint:fix": "eslint --fix",
@@ -19,7 +20,8 @@
     "@workflow-builder/execution-core": "workspace:*",
     "ai": "^6.0.0",
     "dotenv": "^17.4.2",
-    "postgres": "^3.4.5"
+    "postgres": "^3.4.5",
+    "tsx": "^4.19.3"
   },
   "devDependencies": {
     "@types/node": "^22.12.0",
diff --git a/deploy/ai-studio/.env.example b/deploy/ai-studio/.env.example
new file mode 100644
index 000000000..b80d85932
--- /dev/null
+++ b/deploy/ai-studio/.env.example
@@ -0,0 +1,36 @@
+# Copy to .env next to docker-compose.yml and fill in. Everything except
+# OPENROUTER_API_KEY has a working default.
+
+# --- required ---------------------------------------------------------------
+
+# Server-side only; never reaches the browser. Pair it with an OpenRouter
+# account Guardrail (hard $/day ceiling) — see README "Spend safety".
+OPENROUTER_API_KEY=
+
+# --- LLM --------------------------------------------------------------------
+
+# WB-229 demo model. Cheap, EU-hosted, solid tool calling.
+# ~$0.075/M input + $0.20/M output => ~$0.0004 per 3-call template run.
+AI_MODEL=mistralai/mistral-small-3.2-24b-instruct
+
+# --- abuse gate (per-IP, execute route) ---------------------------------------
+
+RATE_LIMIT_EXECUTE_PER_MINUTE=10
+RATE_LIMIT_EXECUTE_PER_DAY=50
+
+# --- network ------------------------------------------------------------------
+
+# Where the web container publishes. Put your TLS terminator in front of it
+# (or bind 127.0.0.1 and proxy from a host nginx/caddy).
+WEB_BIND=0.0.0.0
+WEB_PORT=8080
+
+# Leave empty: the SPA then calls /api on its own origin and nginx proxies
+# it to the backend (no CORS, SSE intact). Only set this if the frontend is
+# served from a different host than the backend.
+VITE_BACKEND_URL=
+
+# --- databases (internal network only, not published) -------------------------
+
+APP_DB_PASSWORD=wb
+TEMPORAL_DB_PASSWORD=temporal
diff --git a/deploy/ai-studio/Dockerfile b/deploy/ai-studio/Dockerfile
new file mode 100644
index 000000000..a6fc07f67
--- /dev/null
+++ b/deploy/ai-studio/Dockerfile
@@ -0,0 +1,73 @@
+# syntax=docker/dockerfile:1
+
+# AI Studio execution stack — single Dockerfile, multiple targets:
+#
+#   runtime  -> backend + execution-worker (command chosen per compose service)
+#   migrate  -> one-shot Drizzle migration runner (needs backend devDependencies)
+#   web      -> nginx serving the AI Studio SPA + reverse proxy to the backend
+#
+# Build context must be the repo root (workspace packages are linked via
+# pnpm `workspace:*`), e.g.:
+#
+#   docker build -f deploy/ai-studio/Dockerfile --target runtime .
+#
+# Node is pinned to the exact engines.node version because the workspace sets
+# engineStrict=true. pnpm is installed via npm, not corepack — the corepack
+# bundled with this Node release fails to load pnpm 10
+# (ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING) and ships stale signature keys.
+# Keep the version in sync with `packageManager` in the root package.json.
+FROM node:22.12.0-bookworm-slim AS base
+ENV PNPM_HOME=/pnpm \
+    PATH="/pnpm:$PATH" \
+    # root `prepare` script runs husky, which needs the .git dir that is
+    # deliberately excluded from the build context
+    HUSKY=0 \
+    npm_config_store_dir=/pnpm/store \
+    CI=true
+RUN npm install -g pnpm@10.17.0
+WORKDIR /app
+
+# Download every dependency from the lockfile alone, then bring in the
+# source. Any source change invalidates only the layers below the COPY —
+# the package store survives in the cache mount, so reinstalls are cheap.
+FROM base AS source
+COPY pnpm-lock.yaml pnpm-workspace.yaml ./
+RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store pnpm fetch
+COPY . .
+
+# Production deps for backend + worker and their workspace dependencies
+# (execution-core, types). Both apps run TS directly through tsx — the
+# Temporal worker additionally requires its workflow TS source on disk at
+# runtime (the workflow sandbox bundles it from source), so there is no
+# build step to get wrong.
+FROM source AS runtime
+# root `prepare` runs husky, a devDependency that a --prod install doesn't
+# have — drop the script inside the image (root scripts are unused at runtime)
+RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \
+    npm pkg delete scripts.prepare && \
+    pnpm install --frozen-lockfile --prefer-offline --prod \
+    --filter backend... --filter execution-worker...
+# command supplied by docker-compose:
+#   backend: pnpm --filter backend start:prod
+#   worker:  pnpm --filter execution-worker start:prod
+
+# Migrations need drizzle-kit, a backend devDependency — hence a separate
+# target with a dev install. Runs as a one-shot service before the backend.
+FROM source AS migrate
+RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \
+    pnpm install --frozen-lockfile --prefer-offline --filter backend...
+CMD ["pnpm", "--filter", "backend", "db:migrate"]
+
+# The SPA build imports the SDK from source (vite alias), so this needs the
+# full frontend dependency tree. VITE_BACKEND_URL is baked in at build time;
+# the default (empty) makes the app call /api on its own origin, which the
+# web target's nginx proxies to the backend.
+FROM source AS frontend-build
+ARG VITE_BACKEND_URL=
+RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \
+    pnpm install --frozen-lockfile --prefer-offline --filter @workflow-builder/ai-studio...
+RUN VITE_BACKEND_URL=$VITE_BACKEND_URL pnpm build:ai-studio
+
+FROM nginx:1.31-alpine AS web
+COPY deploy/ai-studio/nginx/default.conf /etc/nginx/conf.d/default.conf
+COPY --from=frontend-build /app/dist/apps/ai-studio /usr/share/nginx/html
diff --git a/deploy/ai-studio/README.md b/deploy/ai-studio/README.md
new file mode 100644
index 000000000..351e031de
--- /dev/null
+++ b/deploy/ai-studio/README.md
@@ -0,0 +1,106 @@
+# Deploying AI Studio
+
+Self-contained, portable deployment of the AI Studio stack (WB-229). Runs on
+any Docker host — an Azure VM, AWS, on-prem — with no cloud-specific glue.
+
+## What runs
+
+| Service       | Image                          | Role                                            | Exposed                  |
+| ------------- | ------------------------------ | ----------------------------------------------- | ------------------------ |
+| `web`         | `ai-studio-web` (nginx)        | Serves the SPA, proxies `/api` to the backend   | `${WEB_PORT}` (only one) |
+| `backend`     | `ai-studio-runtime`            | Hono REST + SSE event stream                    | internal                 |
+| `worker`      | `ai-studio-runtime`            | Temporal worker, makes the OpenRouter LLM calls | internal                 |
+| `migrate`     | `ai-studio-migrate`            | One-shot Drizzle migrations, then exits         | internal                 |
+| `temporal`    | `temporalio/auto-setup` pinned | Workflow engine                                 | internal                 |
+| `app-db`      | `postgres:16`                  | Workflow snapshots + execution events           | internal                 |
+| `temporal-db` | `postgres:16`                  | Temporal's own state store                      | internal                 |
+| `temporal-ui` | `temporalio/ui` pinned         | Debug only (`--profile debug`)                  | `127.0.0.1:8233`         |
+
+All images build from one Dockerfile (`deploy/ai-studio/Dockerfile`) with the
+repo root as context. Backend and worker share a single image and differ only
+in the compose `command`.
+
+## Quick start
+
+```bash
+cd deploy/ai-studio
+cp .env.example .env        # set OPENROUTER_API_KEY
+docker compose up -d --build
+```
+
+First boot: migrations run automatically (`migrate` exits 0, then the backend
+starts). The worker crash-loops for ~30s until Temporal finishes auto-setup —
+that's expected, `restart: unless-stopped` converges it.
+
+Verify:
+
+```bash
+curl -s http://localhost:8080/api/health   # {"status":"ok"}
+# open http://localhost:8080, run the "Sales Inquiry Pipeline" template
+```
+
+## Spend safety (do not skip)
+
+Two independent controls; both must be in place before the URL goes public:
+
+1. **OpenRouter Guardrail** (hard $/day ceiling, no code involved):
+   [openrouter.ai](https://openrouter.ai) → Settings → Guardrails → daily
+   spend limit, e.g. **$5/day** (resets 00:00 UTC). When hit, OpenRouter
+   rejects calls and the demo pauses — it cannot overspend. Keep the account
+   balance low (~$20) as the absolute ceiling.
+2. **Per-IP rate limit** (already on in this compose): defaults to 10
+   executions/min and 50/day per IP, tunable via
+   `RATE_LIMIT_EXECUTE_PER_MINUTE` / `RATE_LIMIT_EXECUTE_PER_DAY`. In-memory,
+   single-replica by design; counters reset on backend restart.
+
+At the defaults, a worst-case day (Guardrail fully spent) costs $5; a typical
+3-LLM-call template run on Mistral Small 3.2 costs ~$0.0004.
+
+## TLS / going public
+
+The `web` container speaks plain HTTP on the internal port. Pick one:
+
+- **Existing ingress** (Azure Application Gateway / Front Door, an nginx that
+  already routes your other web apps, …): point it at `WEB_PORT`, set
+  `WEB_BIND=127.0.0.1` if the ingress runs on the same host. SSE caveat: the
+  ingress must not buffer `/api/executions/*/stream` responses and needs a
+  read timeout above 60s (the stream heartbeats every 15s).
+- **Standalone VM**: run a host-level [Caddy](https://caddyserver.com)
+  (`reverse_proxy localhost:8080` — automatic Let's Encrypt, SSE-safe out of
+  the box) or certbot'd nginx in front, and firewall everything except
+  80/443.
+
+Keep 8233 (Temporal UI) and the Postgres ports unreachable from outside —
+this compose never publishes them beyond loopback; don't undo that.
+
+## Configuration
+
+See [.env.example](.env.example) — every variable is documented there.
+Swapping the LLM is a one-liner: change `AI_MODEL` to any
+[OpenRouter model id](https://openrouter.ai/models) and
+`docker compose up -d worker`.
+
+## Operations
+
+```bash
+docker compose logs -f backend worker        # tail the apps
+docker compose --profile debug up -d         # Temporal UI on 127.0.0.1:8233
+docker compose up -d --build                 # deploy a new version (re-runs migrations)
+docker compose down                          # stop (volumes survive)
+docker exec ai-studio-app-db-1 pg_dump -U wb workflow_builder > backup.sql
+```
+
+Workflow data is treated as ephemeral for the public demo — losing the
+volumes is acceptable; there is nothing precious in them.
+
+## Known limitations (accepted for the lean MVP)
+
+- **No login.** The API is open (`WB_AUTH_PORT=allow-all`); anyone with the
+  URL can create and run workflows within the rate limits. The SDK has an
+  `AuthPort` seam for wiring real auth later.
+- **Single backend replica.** The rate limiter is process-local. Scaling out
+  needs a shared store (Redis) — deferred to the scale-ready task.
+- **`temporalio/auto-setup` is dev-grade.** Fine for a demo; move to Temporal
+  Cloud or an operated cluster for sustained load.
+- **Anyone-can-edit demo content.** Visitors share one workspace; data is
+  wiped whenever you decide to recreate the volumes.
diff --git a/deploy/ai-studio/ai-studio-deployment.decision-log.md b/deploy/ai-studio/ai-studio-deployment.decision-log.md
new file mode 100644
index 000000000..a56e411f3
--- /dev/null
+++ b/deploy/ai-studio/ai-studio-deployment.decision-log.md
@@ -0,0 +1,140 @@
+### Title: Containerized AI Studio deployment — portable compose stack
+
+### Proposed by: Jan Librowski
+
+### Date: 10.06.2026
+
+## Context
+
+WB-229 (lean public demo on an Azure VM) and its parent WB-155 (deployment
+preparations) needed a production deployment story for the AI Studio
+execution stack: backend (Hono), execution-worker (Temporal), two Postgres
+instances, a Temporal server, and the static SPA. Until now only `pnpm dev`
+plus an infra-only compose existed — no Dockerfiles for any app.
+
+Constraints that shaped the design:
+
+- **Portability over Azure ergonomics.** Workflow Builder is sold to external
+  customers; whatever ships here must run on AWS / GCP / on-prem / bare
+  Docker without re-architecting. DevOps asked for containerization
+  specifically for ease of portability and setup.
+- **Surprise bills must be impossible** (WB-229): a hard OpenRouter spend cap
+  (dashboard Guardrail) plus an in-app per-IP abuse gate.
+- **The local dev flow must survive** (`pnpm dev:ai-studio` +
+  `pnpm infra:up`) — contributors rely on it; nothing in dev changes.
+- The repo pins Node 22.12.0 + pnpm 10.17.0 with `engineStrict`, and the
+  Temporal worker bundles its workflow entrypoint **from TS source at
+  runtime**, so the source tree must be present in the worker container.
+
+## Decision
+
+Everything lives in `deploy/ai-studio/`: one multi-target Dockerfile, a
+production `docker-compose.yml`, the nginx config, `.env.example`, and a
+DevOps-facing README.
+
+1. **One Dockerfile, three targets** (`runtime`, `migrate`, `web`), built
+   with the repo root as context (pnpm `workspace:*` links require it). A
+   shared `source` stage does `pnpm fetch` against a BuildKit cache mount, so
+   per-target installs are store-hits.
+2. **tsx in production, no build step.** Backend and worker run TS through
+   `tsx` exactly as in dev — `tsx` moved from a hoisted root devDependency to
+   a real dependency of both apps, plus `start:prod` scripts (the existing
+   `start` scripts hard-require a `.env` file; containers inject env
+   directly). This sidesteps the Temporal-sandbox-needs-source constraint
+   entirely — there is no bundling step to get wrong.
+3. **One shared `runtime` image for backend and worker**; the compose
+   `command` picks the entrypoint. One image to build, push, and version.
+4. **Migrations as a one-shot compose service** (`migrate` target, carries
+   drizzle-kit as a backend devDependency).
+   `depends_on: service_completed_successfully` gates the backend, so
+   `docker compose up` is a complete first boot. Same answer works as a
+   k8s Job / ACA job if a customer reshapes the topology.
+5. **nginx is the only public surface.** It serves the SPA and proxies
+   `/api` to the backend on the internal network; the SSE stream route gets
+   `proxy_buffering off` + long read timeout. The backend container is
+   reached through Docker's embedded DNS **re-resolved per request**
+   (`resolver 127.0.0.11` + variable `proxy_pass`) — a statically resolved
+   upstream 502s after the backend container is recreated on redeploy.
+   Postgres ×2, Temporal, and the backend publish no host ports; Temporal UI
+   is opt-in behind a `debug` profile bound to loopback. TLS terminates in
+   front (existing ingress or host-level Caddy/certbot — documented in the
+   README, deliberately not baked into the stack).
+6. **Same-origin frontend.** `VITE_BACKEND_URL` is baked empty at build time;
+   the SPA calls `/api` on its own origin. No CORS, no second hostname, SSE
+   intact.
+7. **pnpm installed via `npm i -g pnpm@10.17.0` in images, not corepack.**
+   The corepack bundled with Node 22.12.0 cannot load pnpm 10
+   (`ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING`) and ships stale signature
+   keys. Version is duplicated in the Dockerfile — keep in sync with
+   `packageManager`.
+8. **Installs use `--prefer-offline`, not `--offline`**: pnpm propagates
+   offline mode to lifecycle scripts, and `apps/icons` `prepare` shells out
+   to `npx @svgr/cli`, which then refuses the network (`ENOTCACHED`).
+9. **Per-IP rate limit on the execute route** (`apps/backend`):
+   fixed-window, in-memory, env-gated (`RATE_LIMIT_EXECUTE_PER_MINUTE/DAY`,
+   default off so dev is untouched; compose sets 10/min, 50/day).
+   `TRUST_PROXY=true` makes it read the client from `X-Forwarded-For`, whose
+   last entry only our nginx can set. This is the abuse gate; the money cap
+   is the OpenRouter account Guardrail — two independent controls.
+10. **Model pinned per environment, not in code**: compose defaults
+    `AI_MODEL=mistralai/mistral-small-3.2-24b-instruct` (price re-verified
+    2026-06-10 against the OpenRouter API: $0.075/$0.20 per Mtok ≈ $0.0004
+    per 3-call template run). Swapping models is an env change.
+11. **Pinned images, no `:latest`**: `temporalio/auto-setup:1.29.6.1`,
+    `temporalio/ui:2.51.0`, `nginx:1.31-alpine`, `node:22.12.0-bookworm-slim`
+    (exact pin because `engineStrict` rejects any other 22.x).
+
+Found and fixed during end-to-end verification: the worker ignored
+`TEMPORAL_ADDRESS` (`Worker.create` without an explicit connection dials
+`127.0.0.1:7233` — invisible in local dev, fatal in containers).
+
+## Alternative Options Considered
+
+- **`pnpm deploy` to materialize standalone app bundles** — rejected: pnpm 10
+  requires `inject-workspace-packages` or a legacy-mode flag, adding workspace
+  config churn for no benefit over running from the installed workspace.
+- **Compile step (tsc/tsup/esbuild) + plain `node`** — rejected for the MVP:
+  the worker needs its TS source on disk for Temporal's runtime bundling
+  anyway, so compilation only helps the backend while doubling the ways the
+  artifact can diverge from dev. Revisit if image size or cold-start matters.
+- **Azure-specific artifacts (Container Apps / AKS manifests, Key Vault
+  wiring)** — deferred deliberately: WB-229 targets a single VM, and the
+  portability requirement says external customers must not inherit Azure
+  glue. The compose file is the customer-facing artifact; platform topology
+  can wrap it later.
+- **Separate Dockerfiles per app** — rejected: three near-identical
+  install stages to keep in sync; the multi-target file shares layers.
+- **Rate limiting in nginx (`limit_req`)** — rejected: the limit is
+  per-execute-route and needs structured JSON 429s consistent with the
+  backend's error contract; nginx zones would split the policy across two
+  layers. nginx stays dumb, policy lives where the route lives.
+- **Redis-backed rate limiter** — deferred to the scale-ready task (WB-229
+  explicitly accepts single-replica in-memory for the MVP).
+
+## Consequences
+
+- **Pros**
+  - `cp .env.example .env && docker compose up -d --build` is the whole
+    deployment; verified end-to-end (Sales Inquiry Pipeline to
+    `execution_completed` with live SSE through nginx, rate limiter returning
+    429s past the budget).
+  - The artifact is platform-neutral: any Docker host, no cloud SDK anywhere.
+  - Secrets only travel through compose `environment`; `.dockerignore` now
+    excludes `**/.env*` so keys cannot be baked into images (previously
+    `apps/*/.env` files would have been copied into the build context).
+  - Dev flow untouched; rate limiter is inert without its env vars.
+- **Cons**
+  - `runtime` image is ~1.9 GB (full source tree + pnpm store hardlinks +
+    Temporal native bridge). Acceptable for a demo VM; a compile step or
+    `pnpm deploy` bundle is the known optimization path.
+  - Any source change invalidates the `COPY . .` layer and reinstalls (mitigated
+    by the store cache mount; rebuilds take minutes, not tens of minutes).
+  - `temporalio/auto-setup` is dev-grade by Temporal's own docs — accepted
+    for the demo, swap for Temporal Cloud / operated cluster under sustained
+    load (the apps only consume `TEMPORAL_ADDRESS`).
+  - pnpm version is pinned in two places (root `packageManager` +
+    Dockerfile).
+
+## Status
+
+Accepted
diff --git a/deploy/ai-studio/docker-compose.yml b/deploy/ai-studio/docker-compose.yml
new file mode 100644
index 000000000..52d7dff1a
--- /dev/null
+++ b/deploy/ai-studio/docker-compose.yml
@@ -0,0 +1,164 @@
+# AI Studio execution stack — production-shaped compose (WB-229 lean MVP).
+#
+#   cp .env.example .env   # set OPENROUTER_API_KEY
+#   docker compose up -d --build
+#
+# Only the `web` service publishes a port. Postgres, Temporal and the
+# backend stay on the internal network. Temporal UI is opt-in via the
+# `debug` profile and binds to loopback only.
+
+name: ai-studio
+
+x-runtime-build: &runtime-build
+  context: ../..
+  dockerfile: deploy/ai-studio/Dockerfile
+  target: runtime
+
+services:
+  app-db:
+    image: postgres:16
+    environment:
+      POSTGRES_DB: workflow_builder
+      POSTGRES_USER: wb
+      POSTGRES_PASSWORD: ${APP_DB_PASSWORD:-wb}
+    volumes:
+      - app-db-data:/var/lib/postgresql/data
+    healthcheck:
+      test: ['CMD', 'pg_isready', '-U', 'wb', '-d', 'workflow_builder']
+      interval: 5s
+      timeout: 3s
+      retries: 12
+    restart: unless-stopped
+
+  temporal-db:
+    image: postgres:16
+    environment:
+      POSTGRES_DB: temporal
+      POSTGRES_USER: temporal
+      POSTGRES_PASSWORD: ${TEMPORAL_DB_PASSWORD:-temporal}
+    volumes:
+      - temporal-db-data:/var/lib/postgresql/data
+    healthcheck:
+      test: ['CMD', 'pg_isready', '-U', 'temporal', '-d', 'temporal']
+      interval: 5s
+      timeout: 3s
+      retries: 12
+    restart: unless-stopped
+
+  # auto-setup is Temporal's dev-grade single-binary image. Accepted for the
+  # demo (WB-229); a sustained-load deployment should move to Temporal Cloud
+  # or a properly operated self-hosted cluster — the apps only consume
+  # TEMPORAL_ADDRESS and don't care which.
+  temporal:
+    image: temporalio/auto-setup:1.29.6.1
+    depends_on:
+      temporal-db:
+        condition: service_healthy
+    environment:
+      DB: postgres12
+      DB_PORT: 5432
+      POSTGRES_USER: temporal
+      POSTGRES_PWD: ${TEMPORAL_DB_PASSWORD:-temporal}
+      POSTGRES_SEEDS: temporal-db
+    restart: unless-stopped
+
+  temporal-ui:
+    image: temporalio/ui:2.51.0
+    profiles: [debug]
+    depends_on:
+      - temporal
+    environment:
+      TEMPORAL_ADDRESS: temporal:7233
+    ports:
+      - '127.0.0.1:8233:8080'
+    restart: unless-stopped
+
+  migrate:
+    image: ai-studio-migrate
+    build:
+      context: ../..
+      dockerfile: deploy/ai-studio/Dockerfile
+      target: migrate
+    environment:
+      DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder
+    depends_on:
+      app-db:
+        condition: service_healthy
+    restart: 'no'
+
+  backend:
+    image: ai-studio-runtime
+    build: *runtime-build
+    command: ['pnpm', '--filter', 'backend', 'start:prod']
+    environment:
+      HOST: 0.0.0.0
+      PORT: 3001
+      DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder
+      TEMPORAL_ADDRESS: temporal:7233
+      # Reference deployment has no user accounts; the explicit opt-in keeps
+      # a forgotten env var from silently exposing an unauthenticated API.
+      WB_AUTH_PORT: allow-all
+      # Backend is only reachable through the web service's nginx, which
+      # sets X-Forwarded-For — safe to trust for per-IP rate limiting.
+      TRUST_PROXY: 'true'
+      RATE_LIMIT_EXECUTE_PER_MINUTE: ${RATE_LIMIT_EXECUTE_PER_MINUTE:-10}
+      RATE_LIMIT_EXECUTE_PER_DAY: ${RATE_LIMIT_EXECUTE_PER_DAY:-50}
+    depends_on:
+      app-db:
+        condition: service_healthy
+      migrate:
+        condition: service_completed_successfully
+      temporal:
+        condition: service_started
+    healthcheck:
+      test:
+        [
+          'CMD',
+          'node',
+          '-e',
+          "fetch('http://127.0.0.1:3001/api/health').then((r) => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))",
+        ]
+      interval: 10s
+      timeout: 5s
+      retries: 6
+      start_period: 15s
+    restart: unless-stopped
+
+  # Crash-loops until Temporal answers on 7233 (auto-setup has no usable
+  # healthcheck); `restart: unless-stopped` converges it.
+  worker:
+    image: ai-studio-runtime
+    build: *runtime-build
+    command: ['pnpm', '--filter', 'execution-worker', 'start:prod']
+    environment:
+      DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder
+      TEMPORAL_ADDRESS: temporal:7233
+      OPENROUTER_API_KEY: ${OPENROUTER_API_KEY:?set OPENROUTER_API_KEY in deploy/ai-studio/.env}
+      AI_MODEL: ${AI_MODEL:-mistralai/mistral-small-3.2-24b-instruct}
+    depends_on:
+      app-db:
+        condition: service_healthy
+      migrate:
+        condition: service_completed_successfully
+      temporal:
+        condition: service_started
+    restart: unless-stopped
+
+  web:
+    image: ai-studio-web
+    build:
+      context: ../..
+      dockerfile: deploy/ai-studio/Dockerfile
+      target: web
+      args:
+        # Empty -> SPA calls /api on its own origin, proxied by this nginx.
+        VITE_BACKEND_URL: ${VITE_BACKEND_URL:-}
+    ports:
+      - '${WEB_BIND:-0.0.0.0}:${WEB_PORT:-8080}:80'
+    depends_on:
+      - backend
+    restart: unless-stopped
+
+volumes:
+  app-db-data:
+  temporal-db-data:
diff --git a/deploy/ai-studio/nginx/default.conf b/deploy/ai-studio/nginx/default.conf
new file mode 100644
index 000000000..396c1dd6c
--- /dev/null
+++ b/deploy/ai-studio/nginx/default.conf
@@ -0,0 +1,63 @@
+# AI Studio — SPA + API reverse proxy.
+#
+# This container is the only public surface of the stack. TLS is expected to
+# terminate in front of it (cloud ingress / load balancer / a host-level
+# certbot'd nginx) — see deploy/ai-studio/README.md for the options.
+
+server {
+    listen 80;
+    server_name _;
+
+    root /usr/share/nginx/html;
+    index index.html;
+
+    # Resolve the backend through Docker's embedded DNS on every request
+    # (via the variable indirection below) instead of once at startup —
+    # otherwise recreating the backend container leaves nginx proxying to a
+    # stale IP and every /api call 502s until this container restarts too.
+    resolver 127.0.0.11 valid=10s ipv6=off;
+    set $backend_upstream http://backend:3001;
+
+    gzip on;
+    gzip_types text/css application/javascript application/json image/svg+xml;
+
+    # Backend caps request bodies at 1 MB and answers with a structured
+    # error; keep nginx's own limit above it so the backend owns that path.
+    client_max_body_size 2m;
+
+    # Live execution streams over SSE: hold the connection open, never
+    # buffer, and outlast the backend's 15s heartbeat interval.
+    location ~ ^/api/executions/.+/stream$ {
+        proxy_pass $backend_upstream;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+        proxy_set_header Connection '';
+        proxy_buffering off;
+        proxy_cache off;
+        proxy_read_timeout 1h;
+        gzip off;
+    }
+
+    location /api/ {
+        proxy_pass $backend_upstream;
+        proxy_http_version 1.1;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+        proxy_set_header X-Forwarded-Proto $scheme;
+    }
+
+    # Vite emits content-hashed filenames under /assets — cache forever.
+    location /assets/ {
+        add_header Cache-Control "public, max-age=31536000, immutable";
+        try_files $uri =404;
+    }
+
+    # SPA fallback
+    location / {
+        try_files $uri /index.html;
+    }
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 247fdd644..8b5384f7c 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -174,6 +174,9 @@ importers:
       postgres:
         specifier: ^3.4.5
         version: 3.4.9
+      tsx:
+        specifier: ^4.19.3
+        version: 4.21.0
       zod:
         specifier: ^4.3.6
         version: 4.3.6
@@ -366,6 +369,9 @@ importers:
       postgres:
         specifier: ^3.4.5
         version: 3.4.9
+      tsx:
+        specifier: ^4.19.3
+        version: 4.21.0
     devDependencies:
       '@types/node':
         specifier: ^22.12.0

From 1cfb024b51f66043800326aa15ca07119c984662 Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Wed, 10 Jun 2026 03:15:15 +0200
Subject: [PATCH 05/11] feat(deploy): add swarm overlay aligned with
 workflow-builder infra
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tools/deployment/ mirrors the workflow-builder repo's deployment path
(build-docker.sh, deploy.sh, ansible deploy-application playbook) and
consumes the same three images from deploy/ai-studio/Dockerfile — only
the orchestration layer differs. Deviations forced by AI Studio being
stateful: node-pinned volumes for Postgres/Temporal, post-deploy
migration step (Swarm ignores depends_on), attachable internal network
with short DNS aliases, and an AUTH_ENABLED-gated gatekeeper so the
public demo stays login-free.

Stack template render-verified in both auth modes; status 'Proposed'
pending the DevOps conversation.
---
 CLAUDE.md                                     |   1 +
 deploy/ai-studio/README.md                    |   4 +
 tools/deployment/README.md                    |  78 ++++++
 .../ansible/deploy-application/main.yml       | 241 ++++++++++++++++++
 tools/deployment/scripts/build-docker.sh      |  38 +++
 tools/deployment/scripts/deploy.sh            |  11 +
 .../swarm-alignment.decision-log.md           |  84 ++++++
 7 files changed, 457 insertions(+)
 create mode 100644 tools/deployment/README.md
 create mode 100644 tools/deployment/ansible/deploy-application/main.yml
 create mode 100755 tools/deployment/scripts/build-docker.sh
 create mode 100755 tools/deployment/scripts/deploy.sh
 create mode 100644 tools/deployment/swarm-alignment.decision-log.md

diff --git a/CLAUDE.md b/CLAUDE.md
index 74d0ed0dd..2a2e2bb31 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -42,6 +42,7 @@ Long-running processes already emit stable log lines that scripts and agents can
 
 ```
 tools/              - Root dev scripts: preflight, setup:env, infra wait
+  deployment/       - Swarm/Ansible deploy path mirroring the workflow-builder repo (ACR, Traefik)
 deploy/
   ai-studio/        - Production deployment: Dockerfile (runtime/migrate/web), compose, nginx, README
 apps/
diff --git a/deploy/ai-studio/README.md b/deploy/ai-studio/README.md
index 351e031de..cc149a182 100644
--- a/deploy/ai-studio/README.md
+++ b/deploy/ai-studio/README.md
@@ -3,6 +3,10 @@
 Self-contained, portable deployment of the AI Studio stack (WB-229). Runs on
 any Docker host — an Azure VM, AWS, on-prem — with no cloud-specific glue.
 
+> Deploying onto the company Swarm cluster instead? See
+> [`tools/deployment/`](../../tools/deployment/README.md) — same images,
+> Traefik/ACR/Ansible orchestration aligned with the workflow-builder repo.
+
 ## What runs
 
 | Service       | Image                          | Role                                            | Exposed                  |
diff --git a/tools/deployment/README.md b/tools/deployment/README.md
new file mode 100644
index 000000000..432e90faa
--- /dev/null
+++ b/tools/deployment/README.md
@@ -0,0 +1,78 @@
+# Swarm deployment (workflow-builder-aligned)
+
+Deploys AI Studio onto the company Docker Swarm cluster on Azure, following
+the same layout, scripts, and Ansible flow as the `workflow-builder` repo's
+`tools/deployment/` — so DevOps operates one familiar shape.
+
+This is an **orchestration overlay, not a second deployment**: it consumes
+the exact same three images (`runtime`, `migrate`, `web`) built from
+[`deploy/ai-studio/Dockerfile`](../../deploy/ai-studio/Dockerfile). The
+compose file in `deploy/ai-studio/` remains the portable, customer-facing
+artifact and the local full-stack runner; this directory adds the
+ACR + Traefik + Ansible path for our own infrastructure.
+
+```
+tools/deployment/
+├── scripts/
+│   ├── build-docker.sh    # build all 3 targets, tag for ACR, push (CI-gated)
+│   └── deploy.sh          # run the Ansible playbook (CI image or workstation)
+└── ansible/deploy-application/
+    └── main.yml           # writes the Swarm stack file on the master + deploys + migrates
+```
+
+## Usage
+
+```bash
+# build + push images (from repo root)
+DEPLOY_ENV=dev ./tools/deployment/scripts/build-docker.sh
+
+# deploy the stack (needs az login + ansible inventory with the `master` host)
+DEPLOY_ENV=dev DEPLOYMENT_URL=ai-studio.example.com OPENROUTER_API_KEY=sk-... \
+  ./tools/deployment/scripts/deploy.sh
+```
+
+Bitbucket-style variables (`BITBUCKET_COMMIT`, `BITBUCKET_DEPLOYMENT_ENVIRONMENT`,
+`TAG_PREFIX`) take precedence when present, so the scripts drop into the
+existing CI pattern unchanged; the fallbacks (`git rev-parse`, `DEPLOY_ENV`)
+make them runnable from a workstation or GitHub Actions.
+
+## Configuration
+
+| Variable                                                                                 | Required  | Default                                    | Purpose                                                 |
+| ---------------------------------------------------------------------------------------- | --------- | ------------------------------------------ | ------------------------------------------------------- |
+| `DEPLOYMENT_URL`                                                                         | yes       | —                                          | Public hostname, drives Traefik routing + TLS           |
+| `OPENROUTER_API_KEY`                                                                     | yes       | —                                          | Worker-side LLM key (pair with an OpenRouter Guardrail) |
+| `DEPLOY_ENV` / `BITBUCKET_DEPLOYMENT_ENVIRONMENT`                                        | no        | `dev`                                      | Stack/environment suffix                                |
+| `AI_MODEL`                                                                               | no        | `mistralai/mistral-small-3.2-24b-instruct` | OpenRouter model id                                     |
+| `RATE_LIMIT_EXECUTE_PER_MINUTE` / `_DAY`                                                 | no        | `10` / `50`                                | Per-IP abuse gate                                       |
+| `APP_DB_PASSWORD`, `TEMPORAL_DB_PASSWORD`                                                | no        | dev defaults                               | Internal-network Postgres credentials                   |
+| `AUTH_ENABLED`                                                                           | no        | `false`                                    | Put the gatekeeper OIDC proxy in front (internal envs)  |
+| `AUTH_DISCOVERY_URL`, `AUTH_CLIENT_ID`, `AUTH_SECRET`, `AUTH_COOKIE_SECRET`, `AUTH_ROLE` | when auth | —                                          | Gatekeeper config, same names as workflow-builder       |
+| `REGISTRY`                                                                               | no        | `synergycodes.azurecr.io`                  | Image registry                                          |
+
+## What differs from the workflow-builder playbook (and why)
+
+| Deviation                                                                                         | Reason                                                                                                                               |
+| ------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
+| Postgres ×2 + Temporal services with named volumes, pinned via `node.labels.ai-studio-data==true` | AI Studio is stateful; Swarm volumes are node-local. **One-time setup:** `docker node update --label-add ai-studio-data=true <node>` |
+| Migrations run post-deploy as a one-shot `docker run` on the stack network (with retries)         | Swarm ignores compose `depends_on` conditions                                                                                        |
+| `internal` network is `attachable: true`                                                          | Lets the migrate container join the overlay                                                                                          |
+| Services carry short DNS aliases (`backend`, `app-db`, `temporal`, …)                             | The web image's nginx proxies to `http://backend:3001`; aliases keep the images and env defaults identical between compose and Swarm |
+| Gatekeeper is conditional (`AUTH_ENABLED`)                                                        | The WB-229 public demo is deliberately login-free; internal instances can keep SSO                                                   |
+
+SSE note: Traefik streams responses by default, so the live execution stream
+works without special ingress config; the 15 s backend heartbeat keeps the
+connection alive.
+
+## Open items for DevOps
+
+- **Stateful workloads on the cluster** — this would be the first; the
+  alternative is a dedicated VM running `deploy/ai-studio/docker-compose.yml`
+  as-is, or managed Azure Postgres.
+- **CI home** — this repo lives on GitHub; the existing deploy machinery
+  (deployment CI image, `setup-az.sh`, Ansible inventory) is Bitbucket-side.
+  First deploys can run from a workstation.
+- **Secrets in the stack file** — the playbook writes env values (incl. the
+  OpenRouter key) into the stack yml on the Swarm master, same as the
+  existing workflow-builder flow. Docker Swarm secrets would be stricter;
+  kept aligned for now.
diff --git a/tools/deployment/ansible/deploy-application/main.yml b/tools/deployment/ansible/deploy-application/main.yml
new file mode 100644
index 000000000..2d8a4fc2d
--- /dev/null
+++ b/tools/deployment/ansible/deploy-application/main.yml
@@ -0,0 +1,241 @@
+---
+# Deploys the AI Studio execution stack to the Docker Swarm cluster,
+# following the workflow-builder repo's deploy-application playbook. The
+# images are the same three targets the local compose builds
+# (deploy/ai-studio/Dockerfile); only the orchestration differs.
+#
+# Differences from the workflow-builder playbook, all forced by AI Studio
+# being stateful:
+#   - Postgres x2 + Temporal services with named volumes, pinned to the node
+#     labeled `ai-studio-data=true` (Swarm volumes are node-local).
+#   - Migrations run as a one-shot container after stack deploy — Swarm
+#     ignores compose depends_on conditions, so ordering lives here.
+#   - The `internal` network is attachable so the migrate container can join.
+#   - Services get short DNS aliases (backend, app-db, temporal, ...) so the
+#     same images and env defaults work under compose and Swarm.
+#   - Gatekeeper is optional (AUTH_ENABLED=true): the WB-229 public demo is
+#     deliberately login-free; internal stage/dev instances can enable it.
+
+- hosts: master
+
+  vars:
+    deployment_environment: "{{ lookup('env', 'BITBUCKET_DEPLOYMENT_ENVIRONMENT') or lookup('env', 'DEPLOY_ENV') or 'dev' }}"
+    tag_prefix: "{{ lookup('env', 'TAG_PREFIX') }}"
+    bb_commit: "{{ lookup('env', 'BITBUCKET_COMMIT') or lookup('pipe', 'git rev-parse HEAD') }}"
+    app_name: ai-studio
+    registry: "{{ lookup('env', 'REGISTRY') or 'synergycodes.azurecr.io' }}"
+    deployment_url: "{{ lookup('env', 'DEPLOYMENT_URL') }}"
+    image_tag: '{{ tag_prefix }}{{ bb_commit }}'
+    stack_name: '{{ app_name }}--{{ deployment_environment }}'
+    auth_enabled: "{{ (lookup('env', 'AUTH_ENABLED') or 'false') | bool }}"
+    openrouter_api_key: "{{ lookup('env', 'OPENROUTER_API_KEY') }}"
+    ai_model: "{{ lookup('env', 'AI_MODEL') or 'mistralai/mistral-small-3.2-24b-instruct' }}"
+    app_db_password: "{{ lookup('env', 'APP_DB_PASSWORD') or 'wb' }}"
+    temporal_db_password: "{{ lookup('env', 'TEMPORAL_DB_PASSWORD') or 'temporal' }}"
+    database_url: "postgresql://wb:{{ app_db_password | urlencode | replace('/', '%2F') }}@app-db:5432/workflow_builder" # password is percent-encoded so @ : / ? # cannot break the URL
+
+  tasks:
+    - name: Check required configuration
+      assert:
+        that:
+          - deployment_url | length > 0
+          - openrouter_api_key | length > 0
+        fail_msg: 'DEPLOYMENT_URL and OPENROUTER_API_KEY must be set'
+
+    - name: Create directory for service data
+      file:
+        path: '/mnt/docker-swarm-storage/stacks/{{ stack_name }}'
+        state: directory
+
+    - name: Create stack definition
+      copy:
+        dest: '/mnt/docker-swarm-storage/stacks/{{ stack_name }}/{{ app_name }}.stack.yml'
+        content: |
+          services:
+          {% if auth_enabled %}
+            ai-studio-gatekeeper--{{ deployment_environment }}:
+              image: '{{ registry }}/gatekeeper:2.1.1'
+              environment:
+                PROXY_LISTEN: :4200
+                PROXY_UPSTREAM_URL: http://web
+                PROXY_DISCOVERY_URL: "{{ lookup('env', 'AUTH_DISCOVERY_URL') }}"
+                PROXY_CLIENT_ID: "{{ lookup('env', 'AUTH_CLIENT_ID') }}"
+                PROXY_CLIENT_SECRET: "{{ lookup('env', 'AUTH_SECRET') }}"
+                PROXY_ENCRYPTION_KEY: "{{ lookup('env', 'AUTH_COOKIE_SECRET') }}"
+                PROXY_REDIRECTION_URL: 'https://{{ deployment_url }}'
+              command:
+                - '-enable-default-deny=false'
+                - "-resources=uri=/*|roles={{ lookup('env', 'AUTH_ROLE') }}"
+              networks:
+                traefik-host-external:
+                internal:
+                  aliases: [gatekeeper]
+              deploy:
+                placement:
+                  constraints:
+                    - node.role==worker
+                labels:
+                  - 'traefik.enable=true'
+                  - 'traefik.docker.network=traefik-host-external'
+                  - 'traefik.http.routers.{{ stack_name }}-http.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)'
+                  - 'traefik.http.routers.{{ stack_name }}-http.entrypoints=http'
+                  - 'traefik.http.routers.{{ stack_name }}-http.middlewares=https-redirect'
+                  - 'traefik.http.routers.{{ stack_name }}-https.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)'
+                  - 'traefik.http.routers.{{ stack_name }}-https.entrypoints=https'
+                  - 'traefik.http.routers.{{ stack_name }}-https.tls=true'
+                  - 'traefik.http.routers.{{ stack_name }}-https.tls.certresolver=le'
+                  - 'traefik.http.services.{{ stack_name }}.loadbalancer.server.port=4200'
+          {% endif %}
+
+            ai-studio-web--{{ deployment_environment }}:
+              image: '{{ registry }}/{{ app_name }}:web-{{ image_tag }}'
+              networks:
+          {% if not auth_enabled %}
+                traefik-host-external:
+          {% endif %}
+                internal:
+                  aliases: [web]
+              deploy:
+                placement:
+                  constraints:
+                    - node.role==worker
+          {% if not auth_enabled %}
+                labels:
+                  - 'traefik.enable=true'
+                  - 'traefik.docker.network=traefik-host-external'
+                  - 'traefik.http.routers.{{ stack_name }}-http.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)'
+                  - 'traefik.http.routers.{{ stack_name }}-http.entrypoints=http'
+                  - 'traefik.http.routers.{{ stack_name }}-http.middlewares=https-redirect'
+                  - 'traefik.http.routers.{{ stack_name }}-https.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)'
+                  - 'traefik.http.routers.{{ stack_name }}-https.entrypoints=https'
+                  - 'traefik.http.routers.{{ stack_name }}-https.tls=true'
+                  - 'traefik.http.routers.{{ stack_name }}-https.tls.certresolver=le'
+                  - 'traefik.http.services.{{ stack_name }}.loadbalancer.server.port=80'
+          {% endif %}
+
+            ai-studio-backend--{{ deployment_environment }}:
+              image: '{{ registry }}/{{ app_name }}:runtime-{{ image_tag }}'
+              command: ['pnpm', '--filter', 'backend', 'start:prod']
+              environment:
+                HOST: 0.0.0.0
+                PORT: 3001
+                DATABASE_URL: '{{ database_url }}'
+                TEMPORAL_ADDRESS: temporal:7233
+                WB_AUTH_PORT: allow-all
+                TRUST_PROXY: 'true'
+                RATE_LIMIT_EXECUTE_PER_MINUTE: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_MINUTE') or '10' }}"
+                RATE_LIMIT_EXECUTE_PER_DAY: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_DAY') or '50' }}"
+              networks:
+                internal:
+                  # the web image's nginx proxies to http://backend:3001
+                  aliases: [backend]
+              deploy:
+                placement:
+                  constraints:
+                    - node.role==worker
+
+            ai-studio-worker--{{ deployment_environment }}:
+              image: '{{ registry }}/{{ app_name }}:runtime-{{ image_tag }}'
+              command: ['pnpm', '--filter', 'execution-worker', 'start:prod']
+              environment:
+                DATABASE_URL: '{{ database_url }}'
+                TEMPORAL_ADDRESS: temporal:7233
+                OPENROUTER_API_KEY: '{{ openrouter_api_key }}'
+                AI_MODEL: '{{ ai_model }}'
+              networks:
+                internal:
+              deploy:
+                placement:
+                  constraints:
+                    - node.role==worker
+
+            ai-studio-app-db--{{ deployment_environment }}:
+              image: 'postgres:16'
+              environment:
+                POSTGRES_DB: workflow_builder
+                POSTGRES_USER: wb
+                POSTGRES_PASSWORD: '{{ app_db_password }}'
+              volumes:
+                - app-db-data:/var/lib/postgresql/data
+              networks:
+                internal:
+                  aliases: [app-db]
+              deploy:
+                placement:
+                  constraints:
+                    - node.labels.ai-studio-data==true
+
+            ai-studio-temporal-db--{{ deployment_environment }}:
+              image: 'postgres:16'
+              environment:
+                POSTGRES_DB: temporal
+                POSTGRES_USER: temporal
+                POSTGRES_PASSWORD: '{{ temporal_db_password }}'
+              volumes:
+                - temporal-db-data:/var/lib/postgresql/data
+              networks:
+                internal:
+                  aliases: [temporal-db]
+              deploy:
+                placement:
+                  constraints:
+                    - node.labels.ai-studio-data==true
+
+            ai-studio-temporal--{{ deployment_environment }}:
+              image: 'temporalio/auto-setup:1.29.6.1'
+              environment:
+                DB: postgres12
+                DB_PORT: 5432
+                POSTGRES_USER: temporal
+                POSTGRES_PWD: '{{ temporal_db_password }}'
+                POSTGRES_SEEDS: temporal-db
+              networks:
+                internal:
+                  aliases: [temporal]
+              deploy:
+                placement:
+                  constraints:
+                    - node.role==worker
+
+          volumes:
+            app-db-data:
+            temporal-db-data:
+
+          networks:
+            internal:
+              # attachable so the one-shot migrate container below can join
+              attachable: true
+            traefik-host-external:
+              external: true
+
+    - name: Ensure Azure CLI is setup
+      shell: /var/az-autologin.sh
+
+    - name: Ensure jsondiff is installed (required by community.docker.docker_stack)
+      ansible.builtin.pip:
+        name: jsondiff
+
+    - name: Deploy stack
+      community.docker.docker_stack:
+        state: present
+        name: '{{ stack_name }}'
+        resolve_image: 'always'
+        prune: true
+        with_registry_auth: yes
+        compose:
+          - '/mnt/docker-swarm-storage/stacks/{{ stack_name }}/{{ app_name }}.stack.yml'
+
+    # Swarm has no depends_on / one-shot service semantics: run Drizzle
+    # migrations as a plain container on the stack's attachable overlay
+    # network. Retries cover app-db still starting up on first deploy.
+    - name: Run database migrations
+      command: >
+        docker run --rm
+        --network {{ stack_name }}_internal
+        -e DATABASE_URL={{ database_url }}
+        {{ registry }}/{{ app_name }}:migrate-{{ image_tag }}
+        pnpm --filter backend db:migrate
+      register: migrate_result
+      retries: 10
+      delay: 6
+      until: migrate_result.rc == 0
diff --git a/tools/deployment/scripts/build-docker.sh b/tools/deployment/scripts/build-docker.sh
new file mode 100755
index 000000000..8295b05d5
--- /dev/null
+++ b/tools/deployment/scripts/build-docker.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+# Build + push the AI Studio images to ACR, mirroring the workflow-builder
+# repo's tools/deployment/scripts/build-docker.sh. All three images come from
+# the same multi-target Dockerfile in deploy/ai-studio/ — this script only
+# adds registry tagging; the images are identical to the local-compose ones.
+#
+# Bitbucket-style env vars are honored when present (TAG_PREFIX,
+# BITBUCKET_COMMIT, BITBUCKET_DEPLOYMENT_ENVIRONMENT) and fall back to git +
+# DEPLOY_ENV so the script also runs from a workstation or GitHub Actions.
+set -eu
+
+APP_NAME="ai-studio"
+REGISTRY="${REGISTRY:-synergycodes.azurecr.io}"
+COMMIT="${BITBUCKET_COMMIT:-$(git rev-parse HEAD)}"
+ENVIRONMENT="${BITBUCKET_DEPLOYMENT_ENVIRONMENT:-${DEPLOY_ENV:-}}"
+export IMAGE_TAG="${TAG_PREFIX:-}$COMMIT"
+
+for TARGET in runtime migrate web; do
+  TAG="$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG"
+  docker build \
+    -f ./deploy/ai-studio/Dockerfile \
+    --target "$TARGET" \
+    -t "$TAG" \
+    .
+done
+
+ALLOWED_ENVIRONMENTS="stage dev prod"
+# Exact fixed-string match, one entry per line: empty, partial or regex-like values never push.
+if echo "$ALLOWED_ENVIRONMENTS" | tr ' ' '\n' | grep -qxF -- "$ENVIRONMENT"; then
+  # setup-az.sh exists in the deployment CI image; logging in by other means
+  # (az acr login / docker login) is fine when running elsewhere
+  [ -f /var/setup-az.sh ] && . /var/setup-az.sh
+  for TARGET in runtime migrate web; do
+    docker push "$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG"
+  done
+else
+  echo "Environment '$ENVIRONMENT' is not configured for image push. Skipping."
+fi
diff --git a/tools/deployment/scripts/deploy.sh b/tools/deployment/scripts/deploy.sh
new file mode 100755
index 000000000..1ac078e57
--- /dev/null
+++ b/tools/deployment/scripts/deploy.sh
@@ -0,0 +1,11 @@
+#!/bin/sh
+# Deploy the AI Studio stack to the Docker Swarm cluster, mirroring the
+# workflow-builder repo's tools/deployment/scripts/deploy.sh. The setup
+# scripts are baked into the synergycodes deployment CI image; guards let the
+# playbook also run from a workstation with az + ansible already configured.
+set -eu
+
+[ -f /var/setup-az.sh ] && . /var/setup-az.sh
+[ -f /var/setup-ansible.sh ] && . /var/setup-ansible.sh
+
+ansible-playbook ./tools/deployment/ansible/deploy-application/main.yml
diff --git a/tools/deployment/swarm-alignment.decision-log.md b/tools/deployment/swarm-alignment.decision-log.md
new file mode 100644
index 000000000..44049bbaa
--- /dev/null
+++ b/tools/deployment/swarm-alignment.decision-log.md
@@ -0,0 +1,84 @@
+### Title: Swarm overlay aligned with the workflow-builder deployment
+
+### Proposed by: Jan Librowski
+
+### Date: 10.06.2026
+
+## Context
+
+The compose-based deployment in `deploy/ai-studio/` (see its decision log)
+targets a single Docker host and ships TLS as a bring-your-own concern. The
+company's actual Azure footprint, found in the `workflow-builder` repo's
+`tools/deployment/`, is different: a self-managed Docker Swarm cluster with
+Traefik (Let's Encrypt, host-based routing), images in ACR tagged by commit,
+deployment via an Ansible playbook that writes a stack file onto the Swarm
+master, and an optional gatekeeper OIDC proxy for internal apps. DevOps
+operates that machinery daily.
+
+Rather than choosing one target, the compose branch is kept as a snapshot
+(`WB-229-ai-studio-deployment`) and this branch adds the Swarm-aligned path
+on top of it.
+
+## Decision
+
+Add `tools/deployment/` mirroring the workflow-builder repo's structure —
+`scripts/build-docker.sh`, `scripts/deploy.sh`,
+`ansible/deploy-application/main.yml` — with the same conventions: ACR
+commit-tagged images, per-environment stack names (`ai-studio--dev`),
+Traefik labels copied from the existing stack, Bitbucket-style env variables
+honored with workstation fallbacks.
+
+**The images are shared, not duplicated.** Both paths build the same three
+targets from `deploy/ai-studio/Dockerfile`; the overlay only changes
+orchestration. Four deliberate deviations from the workflow-builder
+playbook, all forced by AI Studio being stateful where the editor demo was
+a static frontend:
+
+1. Database/Temporal services with named volumes pinned to a labeled node
+   (`node.labels.ai-studio-data==true`) — Swarm volumes are node-local.
+2. Migrations as a post-deploy one-shot `docker run` with retries — Swarm
+   ignores compose `depends_on` conditions, so the ordering that compose
+   expressed declaratively lives in the playbook.
+3. An `attachable` internal network plus short DNS aliases (`backend`,
+   `app-db`, `temporal`) so the unmodified web image's nginx upstream and
+   the compose env defaults resolve identically under Swarm.
+4. Gatekeeper made conditional (`AUTH_ENABLED`, default off) — the public
+   demo is login-free by design; internal stage/dev instances can keep SSO.
+
+## Alternative Options Considered
+
+- **Compose on a dedicated VM only** (the snapshot branch) — fully working
+  and remains the customer-facing artifact; rejected as the _only_ path
+  because it adds a second ops surface (new VM, separate TLS) when a
+  maintained cluster exists.
+- **Kubernetes/AKS manifests** — nothing in the org runs on k8s per the
+  available evidence; would be infrastructure invention, not alignment.
+- **Managed Azure Postgres instead of in-cluster databases** — cleaner
+  state story, but contradicts the near-zero-cost requirement for a demo
+  whose data is explicitly ephemeral; revisit for sustained load.
+- **Swarm secrets for the OpenRouter key** — stricter than env-in-stack-file,
+  but diverges from how the existing playbook handles `AUTH_SECRET`;
+  consistency won for now, flagged in the README.
+
+## Consequences
+
+- **Pros**
+  - DevOps sees the exact shape they already operate; review is a diff
+    against a known playbook, not a new system.
+  - TLS, registry auth, and routing are inherited from cluster-level
+    Traefik instead of being re-solved per deployment.
+  - Stack template render-verified in both auth modes (YAML parses; correct
+    public surface and Traefik port in each).
+- **Cons**
+  - Not exercised against a real cluster yet — inventory, ACR push rights,
+    the `ai-studio-data` node label, and the first stateful workload on the
+    cluster all need DevOps sign-off.
+  - The rate limiter's `X-Forwarded-For` trust now spans Traefik (and
+    optionally gatekeeper) before nginx; the first-hop assumption should be
+    verified on the real cluster.
+  - Secrets land in a stack file on the Swarm master's disk (inherited
+    trade-off from the existing flow).
+
+## Status
+
+Proposed — pending the DevOps conversation

From 3d10ff45865cf6d165eaf6419b26228d90b2aa69 Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Thu, 11 Jun 2026 11:25:08 +0200
Subject: [PATCH 06/11] feat(backend): apply drizzle migrations on boot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

drizzle-orm's programmatic migrator runs the SQL files from
apps/backend/drizzle/ before the server accepts traffic. A failure
(database still starting) exits the process; container restart policies
retry until it converges. drizzle-kit stays a devDependency — db:migrate
remains available for out-of-band use.
---
 apps/backend/src/db/migrate.ts | 23 +++++++++++++++++++++++
 apps/backend/src/server.ts     |  7 +++++++
 2 files changed, 30 insertions(+)
 create mode 100644 apps/backend/src/db/migrate.ts

diff --git a/apps/backend/src/db/migrate.ts b/apps/backend/src/db/migrate.ts
new file mode 100644
index 000000000..005b661d0
--- /dev/null
+++ b/apps/backend/src/db/migrate.ts
@@ -0,0 +1,23 @@
+import { drizzle } from 'drizzle-orm/postgres-js';
+import { migrate } from 'drizzle-orm/postgres-js/migrator';
+import { fileURLToPath } from 'node:url';
+import postgres from 'postgres';
+
+import { env } from '../env';
+
+// Programmatic equivalent of `pnpm db:migrate`, reading the same SQL files
+// from apps/backend/drizzle/. Runs at backend boot so deployments need no
+// separate migration step or image — and drizzle-kit can stay a
+// devDependency. Single-replica assumption (WB-229): concurrent backends
+// would race the migrator.
+export async function runMigrations(): Promise<void> {
+  const migrationsFolder = fileURLToPath(new URL('../../drizzle', import.meta.url));
+  // Dedicated throwaway connection — the app pool in client.ts outlives this,
+  // but the migrator's connection must not linger once it finishes.
+  const sql = postgres(env.DATABASE_URL, { max: 1 });
+  try {
+    await migrate(drizzle(sql), { migrationsFolder });
+  } finally {
+    await sql.end();
+  }
+}
diff --git a/apps/backend/src/server.ts b/apps/backend/src/server.ts
index 10e52c1d9..a2b81ea20 100644
--- a/apps/backend/src/server.ts
+++ b/apps/backend/src/server.ts
@@ -12,6 +12,7 @@ import {
   createAuthMiddleware,
   makeAssertAuthorized,
 } from './auth';
+import { runMigrations } from './db/migrate';
 import { env } from './env';
 import { logger } from './logger';
 import { createRateLimitMiddleware } from './middleware/rate-limit';
@@ -72,6 +73,12 @@ if (env.RATE_LIMIT_EXECUTE_PER_MINUTE > 0 || env.RATE_LIMIT_EXECUTE_PER_DAY > 0)
 app.route('/api/workflows', createWorkflowsRoutes(assertAuthorized));
 app.route('/api/executions', createExecutionsRoutes(assertAuthorized));
 
+// Apply pending migrations before accepting traffic. A failure (e.g. the
+// database still starting) exits the process — the container restart policy
+// retries until it converges, so deployments need no separate migration step.
+await runMigrations();
+logger.info('database migrations applied');
+
 serve({ fetch: app.fetch, port: env.PORT, hostname: env.HOST }, () => {
   logger.info('backend listening', { url: `http://${env.HOST}:${env.PORT}` });
 });

From 7e272213aa1041268ed2ca4071c6244e10409406 Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Thu, 11 Jun 2026 11:25:24 +0200
Subject: [PATCH 07/11] refactor(deploy): drop the migrate service and image

The backend migrates itself at boot, so the migrate Dockerfile target,
compose service, and the Swarm playbook's post-deploy migration task
(plus its attachable-network requirement) all go away. Two images
remain: runtime and web. Under compose, the worker now waits for the
backend healthcheck so it never touches a pre-migration schema (Swarm
ignores depends_on conditions).

Verified on a wiped stack: virgin database boots, backend logs
'database migrations applied' before listening, Sales Inquiry Pipeline
runs to execution_completed over live SSE, rate limiter returns 429
past the budget.
---
 CLAUDE.md                                     |  6 ++---
 deploy/ai-studio/Dockerfile                   | 12 +++------
 deploy/ai-studio/README.md                    | 15 ++++++-----
 .../ai-studio-deployment.decision-log.md      | 20 +++++++++-----
 deploy/ai-studio/docker-compose.yml           | 22 ++++------------
 tools/deployment/README.md                    | 17 ++++++------
 .../ansible/deploy-application/main.yml       | 26 +++----------------
 tools/deployment/scripts/build-docker.sh      |  6 ++---
 .../swarm-alignment.decision-log.md           | 18 ++++++++-----
 9 files changed, 61 insertions(+), 81 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 2a2e2bb31..d2932a89f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -11,7 +11,7 @@ Three onboarding paths (A, B local-run; C docs-only). README "Get started" is th
 | `pnpm preflight`             | both | Verify Node / pnpm / Docker / ports / `.env` files. Add `--json` for agents |
 | `pnpm dev` / `pnpm dev:demo` | A    | Demo (UI only, port 4200). No backend, no Docker                            |
 | `pnpm infra:up`              | B    | Start Postgres + Temporal in Docker. Required before backend/worker         |
-| `pnpm -F backend db:migrate` | B    | Apply Drizzle migrations. First run, or after schema changes                |
+| `pnpm -F backend db:migrate` | B    | Apply Drizzle migrations out-of-band (backend also auto-migrates on boot)   |
 | `pnpm dev:ai-studio`         | B    | Full stack: infra + backend (3001) + worker + AI Studio frontend (4201)     |
 | `pnpm dev:backend`           | B    | Backend only (debug). Needs infra up                                        |
 | `pnpm dev:worker`            | B    | Execution worker only (debug). Needs infra up                               |
@@ -22,7 +22,7 @@ Three onboarding paths (A, B local-run; C docs-only). README "Get started" is th
 | `pnpm test`                  | -    | Run tests in `packages/sdk` and `packages/execution-core`                   |
 | `pnpm check`                 | -    | Lint + typecheck + format + knip                                            |
 
-Path A is UI-only and does not need Docker. Path B requires `pnpm infra:up` before backend/worker can start, and `db:migrate` on the first run.
+Path A is UI-only and does not need Docker. Path B requires `pnpm infra:up` before backend/worker can start; the backend applies pending migrations automatically at boot.
 
 ### Agent signals
 
@@ -44,7 +44,7 @@ Long-running processes already emit stable log lines that scripts and agents can
 tools/              - Root dev scripts: preflight, setup:env, infra wait
   deployment/       - Swarm/Ansible deploy path mirroring the workflow-builder repo (ACR, Traefik)
 deploy/
-  ai-studio/        - Production deployment: Dockerfile (runtime/migrate/web), compose, nginx, README
+  ai-studio/        - Production deployment: Dockerfile (runtime/web), compose, nginx, README
 apps/
   demo/             - Reference app consuming the SDK (React + Vite, port 4200)
   ai-studio/        - Reference AI workflow product (React + Vite, port 4201)
diff --git a/deploy/ai-studio/Dockerfile b/deploy/ai-studio/Dockerfile
index a6fc07f67..5c9834cbd 100644
--- a/deploy/ai-studio/Dockerfile
+++ b/deploy/ai-studio/Dockerfile
@@ -3,9 +3,12 @@
 # AI Studio execution stack — single Dockerfile, multiple targets:
 #
 #   runtime  -> backend + execution-worker (command chosen per compose service)
-#   migrate  -> one-shot Drizzle migration runner (needs backend devDependencies)
 #   web      -> nginx serving the AI Studio SPA + reverse proxy to the backend
 #
+# Database migrations run inside the backend at boot (drizzle-orm's
+# programmatic migrator over apps/backend/drizzle/), so there is no separate
+# migration image or deploy step.
+#
 # Build context must be the repo root (workspace packages are linked via
 # pnpm `workspace:*`), e.g.:
 #
@@ -51,13 +54,6 @@ RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \
 #   backend: pnpm --filter backend start:prod
 #   worker:  pnpm --filter execution-worker start:prod
 
-# Migrations need drizzle-kit, a backend devDependency — hence a separate
-# target with a dev install. Runs as a one-shot service before the backend.
-FROM source AS migrate
-RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \
-    pnpm install --frozen-lockfile --prefer-offline --filter backend...
-CMD ["pnpm", "--filter", "backend", "db:migrate"]
-
 # The SPA build imports the SDK from source (vite alias), so this needs the
 # full frontend dependency tree. VITE_BACKEND_URL is baked in at build time;
 # the default (empty) makes the app call /api on its own origin, which the
diff --git a/deploy/ai-studio/README.md b/deploy/ai-studio/README.md
index cc149a182..d1f4c4959 100644
--- a/deploy/ai-studio/README.md
+++ b/deploy/ai-studio/README.md
@@ -14,15 +14,16 @@ any Docker host — an Azure VM, AWS, on-prem — with no cloud-specific glue.
 | `web`         | `ai-studio-web` (nginx)        | Serves the SPA, proxies `/api` to the backend   | `${WEB_PORT}` (only one) |
 | `backend`     | `ai-studio-runtime`            | Hono REST + SSE event stream                    | internal                 |
 | `worker`      | `ai-studio-runtime`            | Temporal worker, makes the OpenRouter LLM calls | internal                 |
-| `migrate`     | `ai-studio-migrate`            | One-shot Drizzle migrations, then exits         | internal                 |
 | `temporal`    | `temporalio/auto-setup` pinned | Workflow engine                                 | internal                 |
 | `app-db`      | `postgres:16`                  | Workflow snapshots + execution events           | internal                 |
 | `temporal-db` | `postgres:16`                  | Temporal's own state store                      | internal                 |
 | `temporal-ui` | `temporalio/ui` pinned         | Debug only (`--profile debug`)                  | `127.0.0.1:8233`         |
 
-All images build from one Dockerfile (`deploy/ai-studio/Dockerfile`) with the
+Both images build from one Dockerfile (`deploy/ai-studio/Dockerfile`) with the
 repo root as context. Backend and worker share a single image and differ only
-in the compose `command`.
+in the compose `command`. Database migrations are applied by the backend at
+boot (drizzle-orm's programmatic migrator) — there is no separate migration
+service or step.
 
 ## Quick start
 
@@ -32,9 +33,9 @@ cp .env.example .env        # set OPENROUTER_API_KEY
 docker compose up -d --build
 ```
 
-First boot: migrations run automatically (`migrate` exits 0, then the backend
-starts). The worker crash-loops for ~30s until Temporal finishes auto-setup —
-that's expected, `restart: unless-stopped` converges it.
+First boot: the backend applies migrations and only then starts serving (its
+healthcheck gates the worker). The worker crash-loops for ~30s until Temporal
+finishes auto-setup — that's expected, `restart: unless-stopped` converges it.
 
 Verify:
 
@@ -89,7 +90,7 @@ Swapping the LLM is a one-liner: change `AI_MODEL` to any
 ```bash
 docker compose logs -f backend worker        # tail the apps
 docker compose --profile debug up -d         # Temporal UI on 127.0.0.1:8233
-docker compose up -d --build                 # deploy a new version (re-runs migrations)
+docker compose up -d --build                 # deploy a new version (backend re-applies migrations at boot)
 docker compose down                          # stop (volumes survive)
 docker exec ai-studio-app-db-1 pg_dump -U wb workflow_builder > backup.sql
 ```
diff --git a/deploy/ai-studio/ai-studio-deployment.decision-log.md b/deploy/ai-studio/ai-studio-deployment.decision-log.md
index a56e411f3..40e1fc6ac 100644
--- a/deploy/ai-studio/ai-studio-deployment.decision-log.md
+++ b/deploy/ai-studio/ai-studio-deployment.decision-log.md
@@ -32,7 +32,7 @@ Everything lives in `deploy/ai-studio/`: one multi-target Dockerfile, a
 production `docker-compose.yml`, the nginx config, `.env.example`, and a
 DevOps-facing README.
 
-1. **One Dockerfile, three targets** (`runtime`, `migrate`, `web`), built
+1. **One Dockerfile, two targets** (`runtime`, `web`), built
    with the repo root as context (pnpm `workspace:*` links require it). A
    shared `source` stage does `pnpm fetch` against a BuildKit cache mount, so
    per-target installs are store-hits.
@@ -44,11 +44,13 @@ DevOps-facing README.
    entirely — there is no bundling step to get wrong.
 3. **One shared `runtime` image for backend and worker**; the compose
    `command` picks the entrypoint. One image to build, push, and version.
-4. **Migrations as a one-shot compose service** (`migrate` target, carries
-   drizzle-kit as a backend devDependency). `depends_on:
-service_completed_successfully` gates the backend, so `docker compose up`
-   is a complete first boot. Same answer works as a k8s Job / ACA job if a
-   customer reshapes the topology.
+4. **Migrations on backend boot** (revised 11.06.2026 — originally a
+   one-shot `migrate` compose service). The backend applies pending Drizzle
+   migrations via drizzle-orm's programmatic migrator before accepting
+   traffic; on failure it exits and the restart policy retries until
+   Postgres answers. One less image, no orchestrator-specific ordering —
+   the same behavior on compose, Swarm, or anything else. Single-replica
+   assumption: concurrent backends would race the migrator.
 5. **nginx is the only public surface.** It serves the SPA and proxies
    `/api` to the backend on the internal network; the SSE stream route gets
    `proxy_buffering off` + long read timeout. The backend container is
@@ -135,6 +137,12 @@ Found and fixed during end-to-end verification: the worker ignored
   - pnpm version is pinned in two places (root `packageManager` +
     Dockerfile).
 
+## Revisions
+
+- **11.06.2026** — `migrate` target and service removed; the backend now
+  migrates itself at boot (Jan's simplification request during WB-229
+  review). Dockerfile is down to two targets (`runtime`, `web`).
+
 ## Status
 
 Accepted
diff --git a/deploy/ai-studio/docker-compose.yml b/deploy/ai-studio/docker-compose.yml
index 52d7dff1a..c4f827984 100644
--- a/deploy/ai-studio/docker-compose.yml
+++ b/deploy/ai-studio/docker-compose.yml
@@ -73,19 +73,8 @@ services:
       - '127.0.0.1:8233:8080'
     restart: unless-stopped
 
-  migrate:
-    image: ai-studio-migrate
-    build:
-      context: ../..
-      dockerfile: deploy/ai-studio/Dockerfile
-      target: migrate
-    environment:
-      DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder
-    depends_on:
-      app-db:
-        condition: service_healthy
-    restart: 'no'
-
+  # Applies Drizzle migrations at boot, before accepting traffic. A failure
+  # (e.g. Postgres still starting) exits the process and `restart` retries.
   backend:
     image: ai-studio-runtime
     build: *runtime-build
@@ -106,8 +95,6 @@ services:
     depends_on:
       app-db:
         condition: service_healthy
-      migrate:
-        condition: service_completed_successfully
       temporal:
         condition: service_started
     healthcheck:
@@ -138,8 +125,9 @@ services:
     depends_on:
       app-db:
         condition: service_healthy
-      migrate:
-        condition: service_completed_successfully
+      # healthy = migrations applied — the worker writes to the same schema
+      backend:
+        condition: service_healthy
       temporal:
         condition: service_started
     restart: unless-stopped
diff --git a/tools/deployment/README.md b/tools/deployment/README.md
index 432e90faa..f8f99fd2c 100644
--- a/tools/deployment/README.md
+++ b/tools/deployment/README.md
@@ -5,7 +5,7 @@ the same layout, scripts, and Ansible flow as the `workflow-builder` repo's
 `tools/deployment/` — so DevOps operates one familiar shape.
 
 This is an **orchestration overlay, not a second deployment**: it consumes
-the exact same three images (`runtime`, `migrate`, `web`) built from
+the exact same two images (`runtime`, `web`) built from
 [`deploy/ai-studio/Dockerfile`](../../deploy/ai-studio/Dockerfile). The
 compose file in `deploy/ai-studio/` remains the portable, customer-facing
 artifact and the local full-stack runner; this directory adds the
@@ -17,7 +17,7 @@ tools/deployment/
-│   ├── build-docker.sh    # build all 3 targets, tag for ACR, push (CI-gated)
+│   ├── build-docker.sh    # build both targets, tag for ACR, push (CI-gated)
 │   └── deploy.sh          # run the Ansible playbook (CI image or workstation)
 └── ansible/deploy-application/
-    └── main.yml           # writes the Swarm stack file on the master + deploys + migrates
+    └── main.yml           # writes the Swarm stack file on the master + deploys
 ```
 
 ## Usage
@@ -52,13 +52,12 @@ make them runnable from a workstation or GitHub Actions.
 
 ## What differs from the workflow-builder playbook (and why)
 
-| Deviation                                                                                         | Reason                                                                                                                               |
-| ------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
-| Postgres ×2 + Temporal services with named volumes, pinned via `node.labels.ai-studio-data==true` | AI Studio is stateful; Swarm volumes are node-local. **One-time setup:** `docker node update --label-add ai-studio-data=true <node>` |
-| Migrations run post-deploy as a one-shot `docker run` on the stack network (with retries)         | Swarm ignores compose `depends_on` conditions                                                                                        |
-| `internal` network is `attachable: true`                                                          | Lets the migrate container join the overlay                                                                                          |
-| Services carry short DNS aliases (`backend`, `app-db`, `temporal`, …)                             | The web image's nginx proxies to `http://backend:3001`; aliases keep the images and env defaults identical between compose and Swarm |
-| Gatekeeper is conditional (`AUTH_ENABLED`)                                                        | The WB-229 public demo is deliberately login-free; internal instances can keep SSO                                                   |
+| Deviation                                                                                              | Reason                                                                                                                               |
+| ------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------ |
+| Postgres ×2 + Temporal services with named volumes, pinned via `node.labels.ai-studio-data==true`      | AI Studio is stateful; Swarm volumes are node-local. **One-time setup:** `docker node update --label-add ai-studio-data=true <node>` |
+| No migration step — the backend applies Drizzle migrations at boot and restarts until Postgres answers | Swarm ignores compose `depends_on` conditions, so ordering must not rely on them                                                     |
+| Services carry short DNS aliases (`backend`, `app-db`, `temporal`, …)                                  | The web image's nginx proxies to `http://backend:3001`; aliases keep the images and env defaults identical between compose and Swarm |
+| Gatekeeper is conditional (`AUTH_ENABLED`)                                                             | The WB-229 public demo is deliberately login-free; internal instances can keep SSO                                                   |
 
 SSE note: Traefik streams responses by default, so the live execution stream
 works without special ingress config; the 15 s backend heartbeat keeps the
diff --git a/tools/deployment/ansible/deploy-application/main.yml b/tools/deployment/ansible/deploy-application/main.yml
index 2d8a4fc2d..f83dc6ddb 100644
--- a/tools/deployment/ansible/deploy-application/main.yml
+++ b/tools/deployment/ansible/deploy-application/main.yml
@@ -1,16 +1,15 @@
 ---
 # Deploys the AI Studio execution stack to the Docker Swarm cluster,
 # following the workflow-builder repo's deploy-application playbook. The
-# images are the same three targets the local compose builds
-# (deploy/ai-studio/Dockerfile); only the orchestration differs.
+# images are the same two targets the local compose builds
+# (deploy/ai-studio/Dockerfile); only the orchestration differs. Database
+# migrations run inside the backend at boot, so there is no migration step
+# here — the backend restarts until Postgres answers, then migrates itself.
 #
 # Differences from the workflow-builder playbook, all forced by AI Studio
 # being stateful:
 #   - Postgres x2 + Temporal services with named volumes, pinned to the node
 #     labeled `ai-studio-data=true` (Swarm volumes are node-local).
-#   - Migrations run as a one-shot container after stack deploy — Swarm
-#     ignores compose depends_on conditions, so ordering lives here.
-#   - The `internal` network is attachable so the migrate container can join.
 #   - Services get short DNS aliases (backend, app-db, temporal, ...) so the
 #     same images and env defaults work under compose and Swarm.
 #   - Gatekeeper is optional (AUTH_ENABLED=true): the WB-229 public demo is
@@ -203,8 +202,6 @@
 
           networks:
             internal:
-              # attachable so the one-shot migrate container below can join
-              attachable: true
             traefik-host-external:
               external: true
 
@@ -224,18 +221,3 @@
         with_registry_auth: yes
         compose:
           - '/mnt/docker-swarm-storage/stacks/{{ stack_name }}/{{ app_name }}.stack.yml'
-
-    # Swarm has no depends_on / one-shot service semantics: run Drizzle
-    # migrations as a plain container on the stack's attachable overlay
-    # network. Retries cover app-db still starting up on first deploy.
-    - name: Run database migrations
-      command: >
-        docker run --rm
-        --network {{ stack_name }}_internal
-        -e DATABASE_URL={{ database_url }}
-        {{ registry }}/{{ app_name }}:migrate-{{ image_tag }}
-        pnpm --filter backend db:migrate
-      register: migrate_result
-      retries: 10
-      delay: 6
-      until: migrate_result.rc == 0
diff --git a/tools/deployment/scripts/build-docker.sh b/tools/deployment/scripts/build-docker.sh
index 8295b05d5..10e90db1e 100755
--- a/tools/deployment/scripts/build-docker.sh
+++ b/tools/deployment/scripts/build-docker.sh
@@ -1,6 +1,6 @@
 #!/bin/sh
 # Build + push the AI Studio images to ACR, mirroring the workflow-builder
-# repo's tools/deployment/scripts/build-docker.sh. All three images come from
+# repo's tools/deployment/scripts/build-docker.sh. Both images come from
 # the same multi-target Dockerfile in deploy/ai-studio/ — this script only
 # adds registry tagging; the images are identical to the local-compose ones.
 #
@@ -15,7 +15,7 @@ COMMIT="${BITBUCKET_COMMIT:-$(git rev-parse HEAD)}"
 ENVIRONMENT="${BITBUCKET_DEPLOYMENT_ENVIRONMENT:-${DEPLOY_ENV:-}}"
 export IMAGE_TAG="${TAG_PREFIX:-}$COMMIT"
 
-for TARGET in runtime migrate web; do
+for TARGET in runtime web; do
   TAG="$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG"
   docker build \
     -f ./deploy/ai-studio/Dockerfile \
@@ -30,7 +30,7 @@ if echo "$ALLOWED_ENVIRONMENTS" | grep -w "$ENVIRONMENT" > /dev/null; then
   # setup-az.sh exists in the deployment CI image; logging in by other means
   # (az acr login / docker login) is fine when running elsewhere
   [ -f /var/setup-az.sh ] && . /var/setup-az.sh
-  for TARGET in runtime migrate web; do
+  for TARGET in runtime web; do
     docker push "$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG"
   done
 else
diff --git a/tools/deployment/swarm-alignment.decision-log.md b/tools/deployment/swarm-alignment.decision-log.md
index 44049bbaa..83d58de78 100644
--- a/tools/deployment/swarm-alignment.decision-log.md
+++ b/tools/deployment/swarm-alignment.decision-log.md
@@ -36,12 +36,12 @@ a static frontend:
 
 1. Database/Temporal services with named volumes pinned to a labeled node
    (`node.labels.ai-studio-data==true`) — Swarm volumes are node-local.
-2. Migrations as a post-deploy one-shot `docker run` with retries — Swarm
-   ignores compose `depends_on` conditions, so the ordering that compose
-   expressed declaratively lives in the playbook.
-3. An `attachable` internal network plus short DNS aliases (`backend`,
-   `app-db`, `temporal`) so the unmodified web image's nginx upstream and
-   the compose env defaults resolve identically under Swarm.
+2. No migration step (revised 11.06.2026) — the backend applies Drizzle
+   migrations at boot and restarts until Postgres answers, which sidesteps
+   Swarm's lack of `depends_on` ordering entirely.
+3. Short DNS aliases (`backend`, `app-db`, `temporal`) so the unmodified
+   web image's nginx upstream and the compose env defaults resolve
+   identically under Swarm.
 4. Gatekeeper made conditional (`AUTH_ENABLED`, default off) — the public
    demo is login-free by design; internal stage/dev instances can keep SSO.
 
@@ -79,6 +79,12 @@ a static frontend:
   - Secrets land in a stack file on the Swarm master's disk (inherited
     trade-off from the existing flow).
 
+## Revisions
+
+- **11.06.2026** — playbook migration task and the `attachable` network
+  removed; the backend migrates itself at boot. Image set is down to
+  `runtime` + `web`.
+
 ## Status
 
 Proposed — pending the DevOps conversation

From 1e95bb0d731a3b73af72bb01bc72a1f3bc99dd24 Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Thu, 11 Jun 2026 14:15:21 +0200
Subject: [PATCH 08/11] style: trim comments to the non-obvious

Keep only what the code cannot say itself: traps (Worker.create's
implicit localhost, corepack/pnpm 10 failure, offline mode leaking into
lifecycle scripts), constraints (single-replica migrator, X-Forwarded-For
trust), and magic values. Drop the narration.
---
 .dockerignore                                 |  3 +-
 apps/backend/src/db/migrate.ts                |  9 +---
 apps/backend/src/env.ts                       |  3 +-
 .../backend/src/middleware/rate-limit.test.ts |  6 ---
 apps/backend/src/middleware/rate-limit.ts     | 23 +++------
 apps/backend/src/server.ts                    |  4 +-
 .../src/engines/temporal/worker.ts            |  3 +-
 .../src/executors/decision.test.ts            |  4 --
 .../src/executors/decision.ts                 |  5 +-
 deploy/ai-studio/Dockerfile                   | 51 +++++--------------
 deploy/ai-studio/docker-compose.yml           | 33 ++++--------
 deploy/ai-studio/nginx/default.conf           | 15 ++----
 .../ansible/deploy-application/main.yml       | 20 ++------
 tools/deployment/scripts/build-docker.sh      | 14 ++---
 tools/deployment/scripts/deploy.sh            |  6 +--
 15 files changed, 54 insertions(+), 145 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index 899987c08..e0fcd955c 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -24,8 +24,7 @@ tools/
 !tools/deployment/nginx
 .gitignore
 
-# env files hold secrets (e.g. OPENROUTER_API_KEY) and must never enter the
-# build context — runtime config is injected via docker-compose `environment`
+# env files hold secrets — never in a build context
 **/.env
 **/.env.*
 !**/.env.example
diff --git a/apps/backend/src/db/migrate.ts b/apps/backend/src/db/migrate.ts
index 005b661d0..6d7d4c68d 100644
--- a/apps/backend/src/db/migrate.ts
+++ b/apps/backend/src/db/migrate.ts
@@ -5,15 +5,10 @@ import postgres from 'postgres';
 
 import { env } from '../env';
 
-// Programmatic equivalent of `pnpm db:migrate`, reading the same SQL files
-// from apps/backend/drizzle/. Runs at backend boot so deployments need no
-// separate migration step or image — and drizzle-kit can stay a
-// devDependency. Single-replica assumption (WB-229): concurrent backends
-// would race the migrator.
+// Same SQL files as `pnpm db:migrate`. Concurrent backends would race the
+// migrator — single replica assumed.
 export async function runMigrations(): Promise<void> {
   const migrationsFolder = fileURLToPath(new URL('../../drizzle', import.meta.url));
-  // Dedicated throwaway connection — the app pool in client.ts outlives this,
-  // but the migrator's connection must not linger once it finishes.
   const sql = postgres(env.DATABASE_URL, { max: 1 });
   try {
     await migrate(drizzle(sql), { migrationsFolder });
diff --git a/apps/backend/src/env.ts b/apps/backend/src/env.ts
index 349b5602f..a813c14f6 100644
--- a/apps/backend/src/env.ts
+++ b/apps/backend/src/env.ts
@@ -12,8 +12,7 @@ export const env = {
   HOST: envOr('HOST', '127.0.0.1'),
   DATABASE_URL: envOr('DATABASE_URL', 'postgresql://wb:wb@127.0.0.1:5432/workflow_builder'),
   TEMPORAL_ADDRESS: envOr('TEMPORAL_ADDRESS', '127.0.0.1:7233'),
-  // Per-IP limits on the execute route. 0 = disabled (local dev default);
-  // the production compose in deploy/ai-studio sets both.
+  // 0 disables (dev default); the deploy compose sets both
   RATE_LIMIT_EXECUTE_PER_MINUTE: Number(envOr('RATE_LIMIT_EXECUTE_PER_MINUTE', '0')),
   RATE_LIMIT_EXECUTE_PER_DAY: Number(envOr('RATE_LIMIT_EXECUTE_PER_DAY', '0')),
   TRUST_PROXY: envOr('TRUST_PROXY', 'false') === 'true',
diff --git a/apps/backend/src/middleware/rate-limit.test.ts b/apps/backend/src/middleware/rate-limit.test.ts
index f3f78b52b..7b9196358 100644
--- a/apps/backend/src/middleware/rate-limit.test.ts
+++ b/apps/backend/src/middleware/rate-limit.test.ts
@@ -6,12 +6,6 @@ import { type RateLimitOptions, createRateLimitMiddleware } from './rate-limit';
 const MINUTE_MS = 60_000;
 const DAY_MS = 24 * 60 * 60 * 1000;
 
-/**
- * Build a Hono app mirroring the production wiring in `server.ts`: the
- * limiter guards a single execute-shaped route. Tests drive the clock through
- * the injectable `now` and identify callers via X-Forwarded-For (trustProxy),
- * since `app.request()` has no underlying socket.
- */
 function makeApp(overrides: Partial<RateLimitOptions> = {}) {
   let timestamp = 0;
   const app = new Hono();
diff --git a/apps/backend/src/middleware/rate-limit.ts b/apps/backend/src/middleware/rate-limit.ts
index 8496a94f1..2a5b4c70f 100644
--- a/apps/backend/src/middleware/rate-limit.ts
+++ b/apps/backend/src/middleware/rate-limit.ts
@@ -2,17 +2,12 @@ import { getConnInfo } from '@hono/node-server/conninfo';
 import type { Context, MiddlewareHandler } from 'hono';
 
 export type RateLimitOptions = {
-  /** Max requests per IP per minute. 0 disables the minute window. */
+  // 0 disables a window
   perMinute: number;
-  /** Max requests per IP per day. 0 disables the day window. */
   perDay: number;
-  /**
-   * Read the client IP from X-Forwarded-For. Only enable when the backend is
-   * reachable exclusively through a proxy that sets the header (the deploy
-   * nginx does) — a directly-reachable backend would let clients spoof it.
-   */
+  // only safe when the backend is reachable exclusively through a proxy that
+  // sets X-Forwarded-For — a directly reachable backend lets clients spoof it
   trustProxy: boolean;
-  /** Injectable clock for tests. */
   now?: () => number;
 };
 
@@ -41,7 +36,7 @@ function clientIp(c: Context, trustProxy: boolean): string {
   try {
     return getConnInfo(c).remote.address ?? 'unknown';
   } catch {
-    // No underlying socket (e.g. app.request() in tests)
+    // no underlying socket (app.request() in tests)
     return 'unknown';
   }
 }
@@ -60,14 +55,8 @@ function hitWindow(state: WindowState, limit: number, durationMs: number, now: n
   return null;
 }
 
-/**
- * Fixed-window, in-memory, per-IP rate limiter for the execute route.
- *
- * Deliberately process-local (WB-229 lean MVP runs a single backend
- * replica): counters reset on restart and are not shared across replicas.
- * The OpenRouter account Guardrail is the independent hard spend cap; this
- * gate only stops a single IP from burning the daily budget.
- */
+// In-memory fixed windows: counters reset on restart and are not shared
+// across replicas — fine for the single-replica demo deployment.
 export function createRateLimitMiddleware(options: RateLimitOptions): MiddlewareHandler {
   const { perMinute, perDay, trustProxy } = options;
   const now = options.now ?? Date.now;
diff --git a/apps/backend/src/server.ts b/apps/backend/src/server.ts
index a2b81ea20..7b33a7f96 100644
--- a/apps/backend/src/server.ts
+++ b/apps/backend/src/server.ts
@@ -73,9 +73,7 @@ if (env.RATE_LIMIT_EXECUTE_PER_MINUTE > 0 || env.RATE_LIMIT_EXECUTE_PER_DAY > 0)
 app.route('/api/workflows', createWorkflowsRoutes(assertAuthorized));
 app.route('/api/executions', createExecutionsRoutes(assertAuthorized));
 
-// Apply pending migrations before accepting traffic. A failure (e.g. the
-// database still starting) exits the process — the container restart policy
-// retries until it converges, so deployments need no separate migration step.
+// a failure (DB still starting) exits the process; the container restart policy retries
 await runMigrations();
 logger.info('database migrations applied');
 
diff --git a/apps/execution-worker/src/engines/temporal/worker.ts b/apps/execution-worker/src/engines/temporal/worker.ts
index 49ecd295e..401c60e93 100644
--- a/apps/execution-worker/src/engines/temporal/worker.ts
+++ b/apps/execution-worker/src/engines/temporal/worker.ts
@@ -42,8 +42,7 @@ const activities = {
   },
 };
 
-// Without an explicit connection the worker silently dials 127.0.0.1:7233,
-// ignoring TEMPORAL_ADDRESS — correct in local dev, wrong everywhere else.
+// without an explicit connection, Worker.create dials 127.0.0.1:7233 and ignores TEMPORAL_ADDRESS
 const connection = await NativeConnection.connect({ address: env.TEMPORAL_ADDRESS });
 
 const worker = await Worker.create({
diff --git a/apps/execution-worker/src/executors/decision.test.ts b/apps/execution-worker/src/executors/decision.test.ts
index 2d4b31242..2f11c3fbd 100644
--- a/apps/execution-worker/src/executors/decision.test.ts
+++ b/apps/execution-worker/src/executors/decision.test.ts
@@ -83,10 +83,6 @@ describe('executeDecision', () => {
   });
 
   it('treats a branch with no conditions as the catch-all', () => {
-    // The contract the no_branch_matched error instructs authors to use, and
-    // what the reference Sales Inquiry template relies on for its 'General'
-    // branch. First-match order applies: a catch-all placed after conditional
-    // branches only fires when none of them matched.
     const node = decisionNode([
       {
         sourceHandle: 'no',
diff --git a/apps/execution-worker/src/executors/decision.ts b/apps/execution-worker/src/executors/decision.ts
index 0fbccb140..d9e8eefe9 100644
--- a/apps/execution-worker/src/executors/decision.ts
+++ b/apps/execution-worker/src/executors/decision.ts
@@ -28,10 +28,7 @@ export function executeDecision(node: DecisionNode, context: ExecutionContext):
 }
 
 function branchMatches(conditions: DecisionBranchCondition[], context: ExecutionContext): boolean {
-  // A branch with no conditions is the explicit catch-all — the contract the
-  // error above instructs authors to use, and what the reference Sales
-  // Inquiry template ships ('General' branch). First-match order still
-  // applies, so a catch-all only fires when placed after conditional branches.
+  // no conditions = the explicit catch-all the no_branch_matched error instructs authors to add
   if (conditions.length === 0) return true;
 
   let result = evaluateCondition(conditions[0]!, context);
diff --git a/deploy/ai-studio/Dockerfile b/deploy/ai-studio/Dockerfile
index 5c9834cbd..664c6d6ea 100644
--- a/deploy/ai-studio/Dockerfile
+++ b/deploy/ai-studio/Dockerfile
@@ -1,63 +1,40 @@
 # syntax=docker/dockerfile:1
 
-# AI Studio execution stack — single Dockerfile, multiple targets:
+# Targets: runtime (backend + worker, command chosen per compose service),
+# web (nginx, SPA + /api proxy). Build context must be the repo root —
+# workspace packages are linked via pnpm `workspace:*`.
 #
-#   runtime  -> backend + execution-worker (command chosen per compose service)
-#   web      -> nginx serving the AI Studio SPA + reverse proxy to the backend
-#
-# Database migrations run inside the backend at boot (drizzle-orm's
-# programmatic migrator over apps/backend/drizzle/), so there is no separate
-# migration image or deploy step.
-#
-# Build context must be the repo root (workspace packages are linked via
-# pnpm `workspace:*`), e.g.:
-#
-#   docker build -f deploy/ai-studio/Dockerfile --target runtime .
-#
-# Node is pinned to the exact engines.node version because the workspace sets
-# engineStrict=true. pnpm is installed via npm, not corepack — the corepack
-# bundled with this Node release fails to load pnpm 10
-# (ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING) and ships stale signature keys.
-# Keep the version in sync with `packageManager` in the root package.json.
+# Exact Node pin: engineStrict rejects any other version. pnpm via npm, not
+# corepack — this Node's corepack cannot load pnpm 10
+# (ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING). Keep in sync with `packageManager`.
 FROM node:22.12.0-bookworm-slim AS base
 ENV PNPM_HOME=/pnpm \
     PATH="/pnpm:$PATH" \
-    # root `prepare` script runs husky, which needs the .git dir that is
-    # deliberately excluded from the build context
+    # husky needs the .git dir that the build context excludes
     HUSKY=0 \
     npm_config_store_dir=/pnpm/store \
     CI=true
 RUN npm install -g pnpm@10.17.0
 WORKDIR /app
 
-# Download every dependency from the lockfile alone, then bring in the
-# source. Any source change invalidates only the layers below the COPY —
-# the package store survives in the cache mount, so reinstalls are cheap.
 FROM base AS source
 COPY pnpm-lock.yaml pnpm-workspace.yaml ./
 RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store pnpm fetch
 COPY . .
 
-# Production deps for backend + worker and their workspace dependencies
-# (execution-core, types). Both apps run TS directly through tsx — the
-# Temporal worker additionally requires its workflow TS source on disk at
-# runtime (the workflow sandbox bundles it from source), so there is no
-# build step to get wrong.
+# tsx runs TS directly — required anyway for the worker, whose workflow
+# sandbox bundles from TS source on disk at runtime.
+# --prefer-offline (not --offline): offline mode leaks into lifecycle
+# scripts and breaks the icons build, which shells out to npx.
 FROM source AS runtime
-# root `prepare` runs husky, a devDependency that a --prod install doesn't
-# have — drop the script inside the image (root scripts are unused at runtime)
 RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \
+    # `prepare` runs husky, absent from a --prod install
     npm pkg delete scripts.prepare && \
     pnpm install --frozen-lockfile --prefer-offline --prod \
     --filter backend... --filter execution-worker...
-# command supplied by docker-compose:
-#   backend: pnpm --filter backend start:prod
-#   worker:  pnpm --filter execution-worker start:prod
 
-# The SPA build imports the SDK from source (vite alias), so this needs the
-# full frontend dependency tree. VITE_BACKEND_URL is baked in at build time;
-# the default (empty) makes the app call /api on its own origin, which the
-# web target's nginx proxies to the backend.
+# VITE_BACKEND_URL is baked at build time; empty = same-origin /api,
+# proxied by the web target's nginx.
 FROM source AS frontend-build
 ARG VITE_BACKEND_URL=
 RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \
diff --git a/deploy/ai-studio/docker-compose.yml b/deploy/ai-studio/docker-compose.yml
index c4f827984..b891cea66 100644
--- a/deploy/ai-studio/docker-compose.yml
+++ b/deploy/ai-studio/docker-compose.yml
@@ -1,11 +1,6 @@
-# AI Studio execution stack — production-shaped compose (WB-229 lean MVP).
-#
-#   cp .env.example .env   # set OPENROUTER_API_KEY
-#   docker compose up -d --build
-#
-# Only the `web` service publishes a port. Postgres, Temporal and the
-# backend stay on the internal network. Temporal UI is opt-in via the
-# `debug` profile and binds to loopback only.
+# AI Studio production stack (WB-229). Usage: cp .env.example .env, set
+# OPENROUTER_API_KEY, then `docker compose up -d --build`. Only `web`
+# publishes a port.
 
 name: ai-studio
 
@@ -45,10 +40,8 @@ services:
       retries: 12
     restart: unless-stopped
 
-  # auto-setup is Temporal's dev-grade single-binary image. Accepted for the
-  # demo (WB-229); a sustained-load deployment should move to Temporal Cloud
-  # or a properly operated self-hosted cluster — the apps only consume
-  # TEMPORAL_ADDRESS and don't care which.
+  # auto-setup is dev-grade; sustained load should move to Temporal Cloud
+  # or an operated cluster — the apps only consume TEMPORAL_ADDRESS
   temporal:
     image: temporalio/auto-setup:1.29.6.1
     depends_on:
@@ -73,8 +66,7 @@ services:
       - '127.0.0.1:8233:8080'
     restart: unless-stopped
 
-  # Applies Drizzle migrations at boot, before accepting traffic. A failure
-  # (e.g. Postgres still starting) exits the process and `restart` retries.
+  # applies migrations at boot; on failure exits and `restart` retries
   backend:
     image: ai-studio-runtime
     build: *runtime-build
@@ -84,11 +76,9 @@ services:
       PORT: 3001
       DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder
       TEMPORAL_ADDRESS: temporal:7233
-      # Reference deployment has no user accounts; the explicit opt-in keeps
-      # a forgotten env var from silently exposing an unauthenticated API.
+      # explicit opt-in — a forgotten env var fails loudly instead of exposing the API
       WB_AUTH_PORT: allow-all
-      # Backend is only reachable through the web service's nginx, which
-      # sets X-Forwarded-For — safe to trust for per-IP rate limiting.
+      # only nginx can reach the backend, so X-Forwarded-For is trustworthy
       TRUST_PROXY: 'true'
       RATE_LIMIT_EXECUTE_PER_MINUTE: ${RATE_LIMIT_EXECUTE_PER_MINUTE:-10}
       RATE_LIMIT_EXECUTE_PER_DAY: ${RATE_LIMIT_EXECUTE_PER_DAY:-50}
@@ -111,8 +101,7 @@ services:
       start_period: 15s
     restart: unless-stopped
 
-  # Crash-loops until Temporal answers on 7233 (auto-setup has no usable
-  # healthcheck); `restart: unless-stopped` converges it.
+  # crash-loops until Temporal answers (no usable healthcheck); restart converges it
   worker:
     image: ai-studio-runtime
     build: *runtime-build
@@ -125,7 +114,7 @@ services:
     depends_on:
       app-db:
         condition: service_healthy
-      # healthy = migrations applied — the worker writes to the same schema
+      # backend healthy = migrations applied
       backend:
         condition: service_healthy
       temporal:
@@ -139,7 +128,7 @@ services:
       dockerfile: deploy/ai-studio/Dockerfile
       target: web
       args:
-        # Empty -> SPA calls /api on its own origin, proxied by this nginx.
+        # empty -> SPA calls /api on its own origin via this nginx
         VITE_BACKEND_URL: ${VITE_BACKEND_URL:-}
     ports:
       - '${WEB_BIND:-0.0.0.0}:${WEB_PORT:-8080}:80'
diff --git a/deploy/ai-studio/nginx/default.conf b/deploy/ai-studio/nginx/default.conf
index 396c1dd6c..501c28d4f 100644
--- a/deploy/ai-studio/nginx/default.conf
+++ b/deploy/ai-studio/nginx/default.conf
@@ -1,8 +1,5 @@
-# AI Studio — SPA + API reverse proxy.
-#
-# This container is the only public surface of the stack. TLS is expected to
-# terminate in front of it (cloud ingress / load balancer / a host-level
-# certbot'd nginx) — see deploy/ai-studio/README.md for the options.
+# AI Studio — SPA + /api reverse proxy; the stack's only public surface.
+# TLS terminates in front (see README.md).
 
 server {
     listen 80;
@@ -21,12 +18,10 @@ server {
     gzip on;
     gzip_types text/css application/javascript application/json image/svg+xml;
 
-    # Backend caps request bodies at 1 MB and answers with a structured
-    # error; keep nginx's own limit above it so the backend owns that path.
+    # backend enforces 1 MB with a structured error — stay above it
     client_max_body_size 2m;
 
-    # Live execution streams over SSE: hold the connection open, never
-    # buffer, and outlast the backend's 15s heartbeat interval.
+    # SSE: never buffer, outlast the 15s heartbeat
     location ~ ^/api/executions/.+/stream$ {
         proxy_pass $backend_upstream;
         proxy_http_version 1.1;
@@ -50,7 +45,7 @@ server {
         proxy_set_header X-Forwarded-Proto $scheme;
     }
 
-    # Vite emits content-hashed filenames under /assets — cache forever.
+    # content-hashed filenames — cache forever
     location /assets/ {
         add_header Cache-Control "public, max-age=31536000, immutable";
         try_files $uri =404;
diff --git a/tools/deployment/ansible/deploy-application/main.yml b/tools/deployment/ansible/deploy-application/main.yml
index f83dc6ddb..0fd88edf3 100644
--- a/tools/deployment/ansible/deploy-application/main.yml
+++ b/tools/deployment/ansible/deploy-application/main.yml
@@ -1,19 +1,9 @@
 ---
-# Deploys the AI Studio execution stack to the Docker Swarm cluster,
-# following the workflow-builder repo's deploy-application playbook. The
-# images are the same two targets the local compose builds
-# (deploy/ai-studio/Dockerfile); only the orchestration differs. Database
-# migrations run inside the backend at boot, so there is no migration step
-# here — the backend restarts until Postgres answers, then migrates itself.
-#
-# Differences from the workflow-builder playbook, all forced by AI Studio
-# being stateful:
-#   - Postgres x2 + Temporal services with named volumes, pinned to the node
-#     labeled `ai-studio-data=true` (Swarm volumes are node-local).
-#   - Services get short DNS aliases (backend, app-db, temporal, ...) so the
-#     same images and env defaults work under compose and Swarm.
-#   - Gatekeeper is optional (AUTH_ENABLED=true): the WB-229 public demo is
-#     deliberately login-free; internal stage/dev instances can enable it.
+# AI Studio on the Swarm cluster, following the workflow-builder repo's
+# deploy-application playbook with the same images as deploy/ai-studio.
+# No migration step: the backend migrates itself at boot. DB/Temporal
+# volumes are pinned via node.labels.ai-studio-data (Swarm volumes are
+# node-local); gatekeeper is optional (AUTH_ENABLED).
 
 - hosts: master
 
diff --git a/tools/deployment/scripts/build-docker.sh b/tools/deployment/scripts/build-docker.sh
index 10e90db1e..2e97e77b2 100755
--- a/tools/deployment/scripts/build-docker.sh
+++ b/tools/deployment/scripts/build-docker.sh
@@ -1,12 +1,7 @@
 #!/bin/sh
-# Build + push the AI Studio images to ACR, mirroring the workflow-builder
-# repo's tools/deployment/scripts/build-docker.sh. Both images come from
-# the same multi-target Dockerfile in deploy/ai-studio/ — this script only
-# adds registry tagging; the images are identical to the local-compose ones.
-#
-# Bitbucket-style env vars are honored when present (TAG_PREFIX,
-# BITBUCKET_COMMIT, BITBUCKET_DEPLOYMENT_ENVIRONMENT) and fall back to git +
-# DEPLOY_ENV so the script also runs from a workstation or GitHub Actions.
+# Build + push the AI Studio images (deploy/ai-studio/Dockerfile) to ACR,
+# mirroring workflow-builder's build-docker.sh. Bitbucket CI vars win when
+# present; git/DEPLOY_ENV fallbacks keep it runnable from a workstation.
 set -eu
 
 APP_NAME="ai-studio"
@@ -27,8 +22,7 @@ done
 ALLOWED_ENVIRONMENTS="stage dev prod"
 
 if echo "$ALLOWED_ENVIRONMENTS" | grep -w "$ENVIRONMENT" > /dev/null; then
-  # setup-az.sh exists in the deployment CI image; logging in by other means
-  # (az acr login / docker login) is fine when running elsewhere
+  # setup-az.sh only exists in the deployment CI image
   [ -f /var/setup-az.sh ] && . /var/setup-az.sh
   for TARGET in runtime web; do
     docker push "$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG"
diff --git a/tools/deployment/scripts/deploy.sh b/tools/deployment/scripts/deploy.sh
index 1ac078e57..fac8f5be4 100755
--- a/tools/deployment/scripts/deploy.sh
+++ b/tools/deployment/scripts/deploy.sh
@@ -1,8 +1,6 @@
 #!/bin/sh
-# Deploy the AI Studio stack to the Docker Swarm cluster, mirroring the
-# workflow-builder repo's tools/deployment/scripts/deploy.sh. The setup
-# scripts are baked into the synergycodes deployment CI image; guards let the
-# playbook also run from a workstation with az + ansible already configured.
+# Deploy the AI Studio stack to Swarm, mirroring workflow-builder's
+# deploy.sh. The setup scripts exist only in the deployment CI image.
 set -eu
 
 [ -f /var/setup-az.sh ] && . /var/setup-az.sh

From 130fdc7958788829ce67768f789974c557a1f71d Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Thu, 11 Jun 2026 14:31:47 +0200
Subject: [PATCH 09/11] revert(execution-worker): move the decision catch-all
 fix to its own pr
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverts the empty-conditions-as-catch-all change (ccf7375) and its
decision log. It changes execution semantics and supersedes a clause of
decision-no-match.decision-log.md — that deserves a focused review, not
a ride-along in a deployment PR.
---
 .../decision-catch-all.decision-log.md        | 74 -------------------
 .../src/executors/decision.test.ts            | 15 ++--
 .../src/executors/decision.ts                 |  3 +-
 .../swarm-alignment.decision-log.md           |  2 +-
 4 files changed, 8 insertions(+), 86 deletions(-)
 delete mode 100644 apps/execution-worker/decision-catch-all.decision-log.md

diff --git a/apps/execution-worker/decision-catch-all.decision-log.md b/apps/execution-worker/decision-catch-all.decision-log.md
deleted file mode 100644
index fef839295..000000000
--- a/apps/execution-worker/decision-catch-all.decision-log.md
+++ /dev/null
@@ -1,74 +0,0 @@
-### Title: Decision branch with no conditions is the explicit catch-all
-
-### Proposed by: Jan Librowski
-
-### Date: 10.06.2026
-
-## Context
-
-End-to-end verification of the WB-229 demo deployment failed on the
-reference workload: the Sales Inquiry Pipeline's classifier returned
-`**Type:** general`, no conditional branch matched, and the run ended in
-`execution_failed` — despite the template shipping a 'General' branch with
-`conditions: []` as its designed fallback.
-
-The codebase contradicted itself on what a catch-all is:
-
-- `decision-no-match.decision-log.md` (execution-core, 29.04.2026) decided
-  **strict fail-fast on no match** — correct and kept — but its Cons section
-  declared an empty-conditions branch non-matching, requiring a
-  tautological condition (`x === x`) as the catch-all idiom. A unit test
-  pinned that.
-- The executor's own `no_branch_matched` error message instructed the
-  opposite: _"Add an explicit catch-all branch with no conditions."_
-- The reference template (`sales-inquiry-flow.ts`) followed the error
-  message, not the test — and was broken for any input classified outside
-  its keyword branches. Local demos always matched 'pricing'/'technical', so
-  this never surfaced until a different model classified an input as
-  'general'.
-
-Three artifacts said "empty = catch-all", one said the opposite; the
-user-facing ones (error message, reference template) all pointed one way.
-
-## Decision
-
-`branchMatches` in `apps/execution-worker/src/executors/decision.ts` now
-returns `true` for an empty `conditions[]`. First-match order is preserved,
-so a catch-all only fires when placed after the conditional branches. The
-strict throw from the original decision is untouched: a decision node whose
-branches all have conditions and none match still fails with
-`no_branch_matched`.
-
-This supersedes the "empty conditions are non-matching" bullet (and the
-test pinning it) from `decision-no-match.decision-log.md`. The fail-fast
-core of that decision stands.
-
-## Alternative Options Considered
-
-- **Keep the semantics, fix the template with a tautological condition** —
-  rejected: every UI author following the error message's instruction would
-  keep hitting the same failure, and `isEqual 'a' 'a'` as the blessed
-  catch-all idiom is noise a property panel can't explain.
-- **`isDefault: true` flag on a designated branch** — still the cleaner
-  long-term UX (already noted in the original log); still deferred for the
-  same reason: type + Zod schema + properties-panel changes, separate
-  ticket.
-
-## Consequences
-
-- **Pros**
-  - The shipped reference template and the executor's error message are now
-    both true.
-  - Catch-all is expressible in the UI as-is (an empty branch), no magic
-    conditions.
-- **Cons**
-  - Semantics change: a flow that contained an empty-conditions branch and
-    relied on the node failing now routes through that branch. No known
-    flow does this — the only shipped example wanted the opposite.
-  - A _misplaced_ empty branch (before conditional ones) silently wins due
-    to first-match order; the matched branch is visible in the
-    `matchedBranch` output and event log.
-
-## Status
-
-Accepted
diff --git a/apps/execution-worker/src/executors/decision.test.ts b/apps/execution-worker/src/executors/decision.test.ts
index 2f11c3fbd..ac1f98de2 100644
--- a/apps/execution-worker/src/executors/decision.test.ts
+++ b/apps/execution-worker/src/executors/decision.test.ts
@@ -82,20 +82,17 @@ describe('executeDecision', () => {
     }
   });
 
-  it('treats a branch with no conditions as the catch-all', () => {
+  it('treats a branch with no conditions as non-matching (so callers must throw or use explicit operators)', () => {
+    // Empty conditions array — branchMatches returns false, so this is NOT
+    // a default. If someone wants a default, they need a branch whose
+    // conditions evaluate to true (e.g. isEqual 'x' 'x').
     const node = decisionNode([
       {
-        sourceHandle: 'no',
-        conditions: [{ x: 'a', y: 'b', comparisonOperator: 'isEqual' }],
-      },
-      {
-        sourceHandle: 'fallback',
+        sourceHandle: 'empty',
         conditions: [],
       },
     ]);
 
-    const result = executeDecision(node, context());
-
-    expect(result.nextPort).toBe('fallback');
+    expect(() => executeDecision(node, context())).toThrowError(NodeExecutionError);
   });
 });
diff --git a/apps/execution-worker/src/executors/decision.ts b/apps/execution-worker/src/executors/decision.ts
index d9e8eefe9..847fa2db5 100644
--- a/apps/execution-worker/src/executors/decision.ts
+++ b/apps/execution-worker/src/executors/decision.ts
@@ -28,8 +28,7 @@ export function executeDecision(node: DecisionNode, context: ExecutionContext):
 }
 
 function branchMatches(conditions: DecisionBranchCondition[], context: ExecutionContext): boolean {
-  // no conditions = the explicit catch-all the no_branch_matched error instructs authors to add
-  if (conditions.length === 0) return true;
+  if (conditions.length === 0) return false;
 
   let result = evaluateCondition(conditions[0]!, context);
   for (let index = 1; index < conditions.length; index++) {
diff --git a/tools/deployment/swarm-alignment.decision-log.md b/tools/deployment/swarm-alignment.decision-log.md
index 83d58de78..c60d051e7 100644
--- a/tools/deployment/swarm-alignment.decision-log.md
+++ b/tools/deployment/swarm-alignment.decision-log.md
@@ -6,7 +6,7 @@
 
 ## Context
 
-The compose-based deployment in `deploy/ai-studio/` (see its decision log)
+The compose-based deployment in `deploy/ai-studio/`
 targets a single Docker host and ships TLS as a bring-your-own concern. The
 company's actual Azure footprint, found in the `workflow-builder` repo's
 `tools/deployment/`, is different: a self-managed Docker Swarm cluster with

From a03b9a2fc8e9a94f103fe1a96444b57e8a7984ff Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Thu, 11 Jun 2026 14:32:05 +0200
Subject: [PATCH 10/11] docs(deploy): drop the deployment decision log
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AI Studio is a POC — the README and the comments on the non-obvious
pieces carry what operators need; full architecture rationale is
premature at this stage.
---
 .../ai-studio-deployment.decision-log.md      | 148 ------------------
 1 file changed, 148 deletions(-)
 delete mode 100644 deploy/ai-studio/ai-studio-deployment.decision-log.md

diff --git a/deploy/ai-studio/ai-studio-deployment.decision-log.md b/deploy/ai-studio/ai-studio-deployment.decision-log.md
deleted file mode 100644
index 40e1fc6ac..000000000
--- a/deploy/ai-studio/ai-studio-deployment.decision-log.md
+++ /dev/null
@@ -1,148 +0,0 @@
-### Title: Containerized AI Studio deployment — portable compose stack
-
-### Proposed by: Jan Librowski
-
-### Date: 10.06.2026
-
-## Context
-
-WB-229 (lean public demo on an Azure VM) and its parent WB-155 (deployment
-preparations) needed a production deployment story for the AI Studio
-execution stack: backend (Hono), execution-worker (Temporal), two Postgres
-instances, a Temporal server, and the static SPA. Until now only `pnpm dev`
-plus an infra-only compose existed — no Dockerfiles for any app.
-
-Constraints that shaped the design:
-
-- **Portability over Azure ergonomics.** Workflow Builder is sold to external
-  customers; whatever ships here must run on AWS / GCP / on-prem / bare
-  Docker without re-architecting. DevOps asked for containerization
-  specifically for ease of portability and setup.
-- **Surprise bills must be impossible** (WB-229): a hard OpenRouter spend cap
-  (dashboard Guardrail) plus an in-app per-IP abuse gate.
-- **The local dev flow must survive** (`pnpm dev:ai-studio` + `pnpm
-infra:up`) — contributors rely on it; nothing in dev changes.
-- The repo pins Node 22.12.0 + pnpm 10.17.0 with `engineStrict`, and the
-  Temporal worker bundles its workflow entrypoint **from TS source at
-  runtime**, so the source tree must be present in the worker container.
-
-## Decision
-
-Everything lives in `deploy/ai-studio/`: one multi-target Dockerfile, a
-production `docker-compose.yml`, the nginx config, `.env.example`, and a
-DevOps-facing README.
-
-1. **One Dockerfile, two targets** (`runtime`, `web`), built
-   with the repo root as context (pnpm `workspace:*` links require it). A
-   shared `source` stage does `pnpm fetch` against a BuildKit cache mount, so
-   per-target installs are store-hits.
-2. **tsx in production, no build step.** Backend and worker run TS through
-   `tsx` exactly as in dev — `tsx` moved from a hoisted root devDependency to
-   a real dependency of both apps, plus `start:prod` scripts (the existing
-   `start` scripts hard-require a `.env` file; containers inject env
-   directly). This sidesteps the Temporal-sandbox-needs-source constraint
-   entirely — there is no bundling step to get wrong.
-3. **One shared `runtime` image for backend and worker**; the compose
-   `command` picks the entrypoint. One image to build, push, and version.
-4. **Migrations on backend boot** (revised 11.06.2026 — originally a
-   one-shot `migrate` compose service). The backend applies pending Drizzle
-   migrations via drizzle-orm's programmatic migrator before accepting
-   traffic; on failure it exits and the restart policy retries until
-   Postgres answers. One less image, no orchestrator-specific ordering —
-   the same behavior on compose, Swarm, or anything else. Single-replica
-   assumption: concurrent backends would race the migrator.
-5. **nginx is the only public surface.** It serves the SPA and proxies
-   `/api` to the backend on the internal network; the SSE stream route gets
-   `proxy_buffering off` + long read timeout. The backend container is
-   reached through Docker's embedded DNS **re-resolved per request**
-   (`resolver 127.0.0.11` + variable `proxy_pass`) — a statically resolved
-   upstream 502s after the backend container is recreated on redeploy.
-   Postgres ×2, Temporal, and the backend publish no host ports; Temporal UI
-   is opt-in behind a `debug` profile bound to loopback. TLS terminates in
-   front (existing ingress or host-level Caddy/certbot — documented in the
-   README, deliberately not baked into the stack).
-6. **Same-origin frontend.** `VITE_BACKEND_URL` is baked empty at build time;
-   the SPA calls `/api` on its own origin. No CORS, no second hostname, SSE
-   intact.
-7. **pnpm installed via `npm i -g pnpm@10.17.0` in images, not corepack.**
-   The corepack bundled with Node 22.12.0 cannot load pnpm 10
-   (`ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING`) and ships stale signature
-   keys. Version is duplicated in the Dockerfile — keep in sync with
-   `packageManager`.
-8. **Installs use `--prefer-offline`, not `--offline`**: pnpm propagates
-   offline mode to lifecycle scripts, and `apps/icons` `prepare` shells out
-   to `npx @svgr/cli`, which then refuses the network (`ENOTCACHED`).
-9. **Per-IP rate limit on the execute route** (`apps/backend`):
-   fixed-window, in-memory, env-gated (`RATE_LIMIT_EXECUTE_PER_MINUTE/DAY`,
-   default off so dev is untouched; compose sets 10/min, 50/day).
-   `TRUST_PROXY=true` makes it read the client from `X-Forwarded-For`, which
-   only our nginx can set. This is the abuse gate; the money cap is the
-   OpenRouter account Guardrail — two independent controls.
-10. **Model pinned per environment, not in code**: compose defaults
-    `AI_MODEL=mistralai/mistral-small-3.2-24b-instruct` (price re-verified
-    2026-06-10 against the OpenRouter API: $0.075/$0.20 per Mtok ≈ $0.0004
-    per 3-call template run). Swapping models is an env change.
-11. **Pinned images, no `:latest`**: `temporalio/auto-setup:1.29.6.1`,
-    `temporalio/ui:2.51.0`, `nginx:1.31-alpine`, `node:22.12.0-bookworm-slim`
-    (exact pin because `engineStrict` rejects any other 22.x).
-
-Found and fixed during end-to-end verification: the worker ignored
-`TEMPORAL_ADDRESS` (`Worker.create` without an explicit connection dials
-`127.0.0.1:7233` — invisible in local dev, fatal in containers).
-
-## Alternative Options Considered
-
-- **`pnpm deploy` to materialize standalone app bundles** — rejected: pnpm 10
-  requires `inject-workspace-packages` or a legacy-mode flag, adding workspace
-  config churn for no benefit over running from the installed workspace.
-- **Compile step (tsc/tsup/esbuild) + plain `node`** — rejected for the MVP:
-  the worker needs its TS source on disk for Temporal's runtime bundling
-  anyway, so compilation only helps the backend while doubling the ways the
-  artifact can diverge from dev. Revisit if image size or cold-start matters.
-- **Azure-specific artifacts (Container Apps / AKS manifests, Key Vault
-  wiring)** — deferred deliberately: WB-229 targets a single VM, and the
-  portability requirement says external customers must not inherit Azure
-  glue. The compose file is the customer-facing artifact; platform topology
-  can wrap it later.
-- **Separate Dockerfiles per app** — rejected: three near-identical
-  install stages to keep in sync; the multi-target file shares layers.
-- **Rate limiting in nginx (`limit_req`)** — rejected: the limit is
-  per-execute-route and needs structured JSON 429s consistent with the
-  backend's error contract; nginx zones would split the policy across two
-  layers. nginx stays dumb, policy lives where the route lives.
-- **Redis-backed rate limiter** — deferred to the scale-ready task (WB-229
-  explicitly accepts single-replica in-memory for the MVP).
-
-## Consequences
-
-- **Pros**
-  - `cp .env.example .env && docker compose up -d --build` is the whole
-    deployment; verified end-to-end (Sales Inquiry Pipeline to
-    `execution_completed` with live SSE through nginx, rate limiter returning
-    429s past the budget).
-  - The artifact is platform-neutral: any Docker host, no cloud SDK anywhere.
-  - Secrets only travel through compose `environment`; `.dockerignore` now
-    excludes `**/.env*` so keys cannot be baked into images (previously
-    `apps/*/.env` files would have been copied into the build context).
-  - Dev flow untouched; rate limiter is inert without its env vars.
-- **Cons**
-  - `runtime` image is ~1.9 GB (full source tree + pnpm store hardlinks +
-    Temporal native bridge). Acceptable for a demo VM; a compile step or
-    `pnpm deploy` bundle is the known optimization path.
-  - Any source change invalidates the `COPY . .` layer and reinstalls
-    (mitigated by the store cache mount; rebuilds are minutes, not tens of).
-  - `temporalio/auto-setup` is dev-grade by Temporal's own docs — accepted
-    for the demo, swap for Temporal Cloud / operated cluster under sustained
-    load (the apps only consume `TEMPORAL_ADDRESS`).
-  - pnpm version is pinned in two places (root `packageManager` +
-    Dockerfile).
-
-## Revisions
-
-- **11.06.2026** — `migrate` target and service removed; the backend now
-  migrates itself at boot (Jan's simplification request during WB-229
-  review). Dockerfile is down to two targets (`runtime`, `web`).
-
-## Status
-
-Accepted

From d8d80a06079396e62d7f850ed803363754469cb5 Mon Sep 17 00:00:00 2001
From: Jan Librowski <jan.librowski@synergycodes.com>
Date: Mon, 15 Jun 2026 15:08:15 +0200
Subject: [PATCH 11/11] fix(deploy): add healthcheck, restart policies, and
 overlay driver to swarm stack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Backend healthcheck (fetch /api/health) lets Swarm detect when
migrations are done — without it the worker can hit a pre-migration
schema. Explicit restart_policy on every service replaces the implicit
Swarm default; crash-looping services (backend, worker, temporal) get
max_attempts. Internal network gets driver: overlay for clarity.
---
 .../ansible/deploy-application/main.yml       | 26 ++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/tools/deployment/ansible/deploy-application/main.yml b/tools/deployment/ansible/deploy-application/main.yml
index 0fd88edf3..a5a41f350 100644
--- a/tools/deployment/ansible/deploy-application/main.yml
+++ b/tools/deployment/ansible/deploy-application/main.yml
@@ -114,11 +114,20 @@
                 TRUST_PROXY: 'true'
                 RATE_LIMIT_EXECUTE_PER_MINUTE: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_MINUTE') or '10' }}"
                 RATE_LIMIT_EXECUTE_PER_DAY: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_DAY') or '50' }}"
+              healthcheck:
+                test: ['CMD', 'node', '-e', "fetch('http://127.0.0.1:3001/api/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"]
+                interval: 10s
+                timeout: 5s
+                retries: 6
+                start_period: 15s
               networks:
                 internal:
-                  # the web image's nginx proxies to http://backend:3001
                   aliases: [backend]
               deploy:
+                restart_policy:
+                  condition: any
+                  delay: 5s
+                  max_attempts: 10
                 placement:
                   constraints:
                     - node.role==worker
@@ -134,6 +143,10 @@
               networks:
                 internal:
               deploy:
+                restart_policy:
+                  condition: any
+                  delay: 5s
+                  max_attempts: 20
                 placement:
                   constraints:
                     - node.role==worker
@@ -150,6 +163,9 @@
                 internal:
                   aliases: [app-db]
               deploy:
+                restart_policy:
+                  condition: any
+                  delay: 5s
                 placement:
                   constraints:
                     - node.labels.ai-studio-data==true
@@ -166,6 +182,9 @@
                 internal:
                   aliases: [temporal-db]
               deploy:
+                restart_policy:
+                  condition: any
+                  delay: 5s
                 placement:
                   constraints:
                     - node.labels.ai-studio-data==true
@@ -182,6 +201,10 @@
                 internal:
                   aliases: [temporal]
               deploy:
+                restart_policy:
+                  condition: any
+                  delay: 5s
+                  max_attempts: 20
                 placement:
                   constraints:
                     - node.role==worker
@@ -192,6 +215,7 @@
 
           networks:
             internal:
+              driver: overlay
             traefik-host-external:
               external: true