diff --git a/.dockerignore b/.dockerignore index 84a60d2c3..e0fcd955c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,18 +2,29 @@ .git .vscode .idea +.claude # external dependencies node_modules +**/node_modules # docker files docker-compose*.yml **/Dockerfile* +# build artifacts +dist/ +**/dist +coverage/ +**/coverage + # not needed files README.md tools/ !tools/deployment/nginx .gitignore -.env -coverage/ + +# env files hold secrets — never in a build context +**/.env +**/.env.* +!**/.env.example diff --git a/CLAUDE.md b/CLAUDE.md index 5dd077b75..d2932a89f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -11,7 +11,7 @@ Three onboarding paths (A, B local-run; C docs-only). README "Get started" is th | `pnpm preflight` | both | Verify Node / pnpm / Docker / ports / `.env` files. Add `--json` for agents | | `pnpm dev` / `pnpm dev:demo` | A | Demo (UI only, port 4200). No backend, no Docker | | `pnpm infra:up` | B | Start Postgres + Temporal in Docker. Required before backend/worker | -| `pnpm -F backend db:migrate` | B | Apply Drizzle migrations. First run, or after schema changes | +| `pnpm -F backend db:migrate` | B | Apply Drizzle migrations out-of-band (backend also auto-migrates on boot) | | `pnpm dev:ai-studio` | B | Full stack: infra + backend (3001) + worker + AI Studio frontend (4201) | | `pnpm dev:backend` | B | Backend only (debug). Needs infra up | | `pnpm dev:worker` | B | Execution worker only (debug). Needs infra up | @@ -22,7 +22,7 @@ Three onboarding paths (A, B local-run; C docs-only). README "Get started" is th | `pnpm test` | - | Run tests in `packages/sdk` and `packages/execution-core` | | `pnpm check` | - | Lint + typecheck + format + knip | -Path A is UI-only and does not need Docker. Path B requires `pnpm infra:up` before backend/worker can start, and `db:migrate` on the first run. +Path A is UI-only and does not need Docker. 
Path B requires `pnpm infra:up` before backend/worker can start; the backend applies pending migrations automatically at boot. ### Agent signals @@ -42,6 +42,9 @@ Long-running processes already emit stable log lines that scripts and agents can ``` tools/ - Root dev scripts: preflight, setup:env, infra wait + deployment/ - Swarm/Ansible deploy path mirroring the workflow-builder repo (ACR, Traefik) +deploy/ + ai-studio/ - Production deployment: Dockerfile (runtime/web), compose, nginx, README apps/ demo/ - Reference app consuming the SDK (React + Vite, port 4200) ai-studio/ - Reference AI workflow product (React + Vite, port 4201) diff --git a/apps/backend/package.json b/apps/backend/package.json index fec96ac60..3f1e1fb1a 100644 --- a/apps/backend/package.json +++ b/apps/backend/package.json @@ -6,6 +6,7 @@ "scripts": { "dev": "tsx watch --env-file=.env ./src/server.ts", "start": "tsx --env-file=.env ./src/server.ts", + "start:prod": "tsx ./src/server.ts", "typecheck": "tsc --noEmit", "lint": "eslint", "lint:fix": "eslint --fix", @@ -24,6 +25,7 @@ "drizzle-orm": "^0.44.0", "hono": "^4.7.0", "postgres": "^3.4.5", + "tsx": "^4.19.3", "zod": "^4.3.6" }, "devDependencies": { diff --git a/apps/backend/src/db/migrate.ts b/apps/backend/src/db/migrate.ts new file mode 100644 index 000000000..6d7d4c68d --- /dev/null +++ b/apps/backend/src/db/migrate.ts @@ -0,0 +1,18 @@ +import { drizzle } from 'drizzle-orm/postgres-js'; +import { migrate } from 'drizzle-orm/postgres-js/migrator'; +import { fileURLToPath } from 'node:url'; +import postgres from 'postgres'; + +import { env } from '../env'; + +// Same SQL files as `pnpm db:migrate`. Concurrent backends would race the +// migrator — single replica assumed. 
+export async function runMigrations(): Promise<void> { + const migrationsFolder = fileURLToPath(new URL('../../drizzle', import.meta.url)); + const sql = postgres(env.DATABASE_URL, { max: 1 }); + try { + await migrate(drizzle(sql), { migrationsFolder }); + } finally { + await sql.end(); + } +} diff --git a/apps/backend/src/env.ts b/apps/backend/src/env.ts index 12645d6d6..a813c14f6 100644 --- a/apps/backend/src/env.ts +++ b/apps/backend/src/env.ts @@ -12,4 +12,8 @@ export const env = { HOST: envOr('HOST', '127.0.0.1'), DATABASE_URL: envOr('DATABASE_URL', 'postgresql://wb:wb@127.0.0.1:5432/workflow_builder'), TEMPORAL_ADDRESS: envOr('TEMPORAL_ADDRESS', '127.0.0.1:7233'), + // 0 disables (dev default); the deploy compose sets both + RATE_LIMIT_EXECUTE_PER_MINUTE: Number(envOr('RATE_LIMIT_EXECUTE_PER_MINUTE', '0')), + RATE_LIMIT_EXECUTE_PER_DAY: Number(envOr('RATE_LIMIT_EXECUTE_PER_DAY', '0')), + TRUST_PROXY: envOr('TRUST_PROXY', 'false') === 'true', }; diff --git a/apps/backend/src/middleware/rate-limit.test.ts b/apps/backend/src/middleware/rate-limit.test.ts new file mode 100644 index 000000000..7b9196358 --- /dev/null +++ b/apps/backend/src/middleware/rate-limit.test.ts @@ -0,0 +1,128 @@ +import { Hono } from 'hono'; +import { describe, expect, it } from 'vitest'; + +import { type RateLimitOptions, createRateLimitMiddleware } from './rate-limit'; + +const MINUTE_MS = 60_000; +const DAY_MS = 24 * 60 * 60 * 1000; + +function makeApp(overrides: Partial<RateLimitOptions> = {}) { + let timestamp = 0; + const app = new Hono(); + app.use( + '/api/workflows/:id/execute', + createRateLimitMiddleware({ + perMinute: 2, + perDay: 5, + trustProxy: true, + now: () => timestamp, + ...overrides, + }), + ); + app.post('/api/workflows/:id/execute', (c) => c.json({ ok: true }, 202)); + + return { + app, + advance(ms: number) { + timestamp += ms; + }, + execute(ip = '203.0.113.7') { + return app.request('/api/workflows/wf-1/execute', { + method: 'POST', + headers: { 'x-forwarded-for': ip }, + }); + }, + }; 
+} + +describe('createRateLimitMiddleware', () => { + it('allows requests under the limit', async () => { + const { execute } = makeApp(); + + const first = await execute(); + const second = await execute(); + expect(first.status).toBe(202); + expect(second.status).toBe(202); + }); + + it('rejects with 429 and Retry-After once the minute limit is hit', async () => { + const { execute, advance } = makeApp(); + + await execute(); + await execute(); + advance(10_000); + + const response = await execute(); + expect(response.status).toBe(429); + expect(response.headers.get('Retry-After')).toBe('50'); + expect(await response.json()).toMatchObject({ code: 'rate_limited', retryAfterSeconds: 50 }); + }); + + it('tracks each IP independently', async () => { + const { execute } = makeApp(); + + await execute('203.0.113.7'); + await execute('203.0.113.7'); + const blocked = await execute('203.0.113.7'); + const otherIp = await execute('198.51.100.9'); + expect(blocked.status).toBe(429); + expect(otherIp.status).toBe(202); + }); + + it('resets the minute window after it elapses', async () => { + const { execute, advance } = makeApp(); + + await execute(); + await execute(); + const blocked = await execute(); + expect(blocked.status).toBe(429); + + advance(MINUTE_MS); + const allowedAgain = await execute(); + expect(allowedAgain.status).toBe(202); + }); + + it('enforces the day limit across minute windows', async () => { + const { execute, advance } = makeApp(); + + for (let index = 0; index < 5; index++) { + const allowed = await execute(); + expect(allowed.status).toBe(202); + advance(MINUTE_MS); + } + + const response = await execute(); + expect(response.status).toBe(429); + // 5 minutes into the day window -> retry once the remaining day elapses + expect(response.headers.get('Retry-After')).toBe(String((DAY_MS - 5 * MINUTE_MS) / 1000)); + }); + + it('resets the day window after it elapses', async () => { + const { execute, advance } = makeApp({ perMinute: 0 }); + + for (let 
index = 0; index < 5; index++) { + await execute(); + } + const blocked = await execute(); + expect(blocked.status).toBe(429); + + advance(DAY_MS); + const allowedAgain = await execute(); + expect(allowedAgain.status).toBe(202); + }); + + it('uses the first X-Forwarded-For hop as the client identity', async () => { + const { app } = makeApp(); + + const request = (chain: string) => + app.request('/api/workflows/wf-1/execute', { + method: 'POST', + headers: { 'x-forwarded-for': chain }, + }); + + await request('203.0.113.7, 10.0.0.1'); + await request('203.0.113.7, 10.0.0.2'); + const blocked = await request('203.0.113.7, 10.0.0.3'); + expect(blocked.status).toBe(429); + }); +}); diff --git a/apps/backend/src/middleware/rate-limit.ts b/apps/backend/src/middleware/rate-limit.ts new file mode 100644 index 000000000..2a5b4c70f --- /dev/null +++ b/apps/backend/src/middleware/rate-limit.ts @@ -0,0 +1,110 @@ +import { getConnInfo } from '@hono/node-server/conninfo'; +import type { Context, MiddlewareHandler } from 'hono'; + +export type RateLimitOptions = { + // 0 disables a window + perMinute: number; + perDay: number; + // only safe when the backend is reachable exclusively through a proxy that + // sets X-Forwarded-For — a directly reachable backend lets clients spoof it + trustProxy: boolean; + now?: () => number; +}; + +type WindowState = { + windowStart: number; + count: number; +}; + +type IpState = { + minute: WindowState; + day: WindowState; +}; + +const MINUTE_MS = 60_000; +const DAY_MS = 24 * 60 * 60 * 1000; +const SWEEP_INTERVAL_MS = 10 * MINUTE_MS; + +function clientIp(c: Context, trustProxy: boolean): string { + if (trustProxy) { + const forwardedFor = c.req.header('x-forwarded-for'); + const first = forwardedFor?.split(',')[0]?.trim(); + if (first) { + return first; + } + } + try { + return getConnInfo(c).remote.address ?? 
'unknown'; + } catch { + // no underlying socket (app.request() in tests) + return 'unknown'; + } +} + +function hitWindow(state: WindowState, limit: number, durationMs: number, now: number): number | null { + if (limit <= 0) { + return null; + } + if (now - state.windowStart >= durationMs) { + state.windowStart = now; + state.count = 0; + } + if (state.count >= limit) { + return state.windowStart + durationMs - now; + } + return null; +} + +// In-memory fixed windows: counters reset on restart and are not shared +// across replicas — fine for the single-replica demo deployment. +export function createRateLimitMiddleware(options: RateLimitOptions): MiddlewareHandler { + const { perMinute, perDay, trustProxy } = options; + const now = options.now ?? Date.now; + const states = new Map<string, IpState>(); + let lastSweep = now(); + + return async (c, next) => { + const timestamp = now(); + + if (timestamp - lastSweep >= SWEEP_INTERVAL_MS) { + lastSweep = timestamp; + for (const [ip, state] of states) { + if (timestamp - state.day.windowStart >= DAY_MS && timestamp - state.minute.windowStart >= MINUTE_MS) { + states.delete(ip); + } + } + } + + const ip = clientIp(c, trustProxy); + let state = states.get(ip); + if (!state) { + state = { + minute: { windowStart: timestamp, count: 0 }, + day: { windowStart: timestamp, count: 0 }, + }; + states.set(ip, state); + } + + const minuteRetry = hitWindow(state.minute, perMinute, MINUTE_MS, timestamp); + const dayRetry = hitWindow(state.day, perDay, DAY_MS, timestamp); + const retryAfterMs = Math.max(minuteRetry ?? 0, dayRetry ?? 
0); + + if (retryAfterMs > 0) { + const retryAfterSeconds = Math.ceil(retryAfterMs / 1000); + c.header('Retry-After', String(retryAfterSeconds)); + return c.json( + { + code: 'rate_limited', + message: 'Too many workflow executions from this address — try again later', + retryAfterSeconds, + }, + 429, + ); + } + + state.minute.count += 1; + state.day.count += 1; + + await next(); + }; +} diff --git a/apps/backend/src/server.ts b/apps/backend/src/server.ts index 61a36f952..7b33a7f96 100644 --- a/apps/backend/src/server.ts +++ b/apps/backend/src/server.ts @@ -12,8 +12,10 @@ import { createAuthMiddleware, makeAssertAuthorized, } from './auth'; +import { runMigrations } from './db/migrate'; import { env } from './env'; import { logger } from './logger'; +import { createRateLimitMiddleware } from './middleware/rate-limit'; import { createExecutionsRoutes } from './routes/executions'; import { createWorkflowsRoutes } from './routes/workflows'; @@ -52,9 +54,29 @@ app.get('/api/health', (c) => c.json({ status: 'ok' })); app.use('/api/*', createAuthMiddleware(authPort)); +if (env.RATE_LIMIT_EXECUTE_PER_MINUTE > 0 || env.RATE_LIMIT_EXECUTE_PER_DAY > 0) { + app.use( + '/api/workflows/:id/execute', + createRateLimitMiddleware({ + perMinute: env.RATE_LIMIT_EXECUTE_PER_MINUTE, + perDay: env.RATE_LIMIT_EXECUTE_PER_DAY, + trustProxy: env.TRUST_PROXY, + }), + ); + logger.info('execute rate limit enabled', { + perMinute: env.RATE_LIMIT_EXECUTE_PER_MINUTE, + perDay: env.RATE_LIMIT_EXECUTE_PER_DAY, + trustProxy: env.TRUST_PROXY, + }); +} + app.route('/api/workflows', createWorkflowsRoutes(assertAuthorized)); app.route('/api/executions', createExecutionsRoutes(assertAuthorized)); +// a failure (DB still starting) exits the process; the container restart policy retries +await runMigrations(); +logger.info('database migrations applied'); + serve({ fetch: app.fetch, port: env.PORT, hostname: env.HOST }, () => { logger.info('backend listening', { url: `http://${env.HOST}:${env.PORT}` }); 
}); diff --git a/apps/execution-worker/package.json b/apps/execution-worker/package.json index 2a3d9aa58..1d394656e 100644 --- a/apps/execution-worker/package.json +++ b/apps/execution-worker/package.json @@ -6,6 +6,7 @@ "scripts": { "dev": "tsx watch --env-file=.env ./src/engines/temporal/worker.ts", "start": "tsx --env-file=.env ./src/engines/temporal/worker.ts", + "start:prod": "tsx ./src/engines/temporal/worker.ts", "typecheck": "tsc --noEmit", "lint": "eslint", "lint:fix": "eslint --fix", @@ -19,7 +20,8 @@ "@workflow-builder/execution-core": "workspace:*", "ai": "^6.0.0", "dotenv": "^17.4.2", - "postgres": "^3.4.5" + "postgres": "^3.4.5", + "tsx": "^4.19.3" }, "devDependencies": { "@types/node": "^22.12.0", diff --git a/apps/execution-worker/src/engines/temporal/worker.ts b/apps/execution-worker/src/engines/temporal/worker.ts index 326ed7c5a..401c60e93 100644 --- a/apps/execution-worker/src/engines/temporal/worker.ts +++ b/apps/execution-worker/src/engines/temporal/worker.ts @@ -1,4 +1,4 @@ -import { Worker } from '@temporalio/worker'; +import { NativeConnection, Worker } from '@temporalio/worker'; import 'dotenv/config'; import { fileURLToPath } from 'node:url'; @@ -42,7 +42,11 @@ const activities = { }, }; +// without an explicit connection, Worker.create dials 127.0.0.1:7233 and ignores TEMPORAL_ADDRESS +const connection = await NativeConnection.connect({ address: env.TEMPORAL_ADDRESS }); + const worker = await Worker.create({ + connection, workflowsPath: fileURLToPath(new URL('workflows/run-workflow.ts', import.meta.url)), activities, taskQueue, diff --git a/deploy/ai-studio/.env.example b/deploy/ai-studio/.env.example new file mode 100644 index 000000000..b80d85932 --- /dev/null +++ b/deploy/ai-studio/.env.example @@ -0,0 +1,36 @@ +# Copy to .env next to docker-compose.yml and fill in. Everything except +# OPENROUTER_API_KEY has a working default. 
+ +# --- required --------------------------------------------------------------- + +# Server-side only; never reaches the browser. Pair it with an OpenRouter +# account Guardrail (hard $/day ceiling) — see README "Spend safety". +OPENROUTER_API_KEY= + +# --- LLM -------------------------------------------------------------------- + +# WB-229 demo model. Cheap, EU-hosted, solid tool calling. +# ~$0.075/M input + $0.20/M output => ~$0.0004 per 3-call template run. +AI_MODEL=mistralai/mistral-small-3.2-24b-instruct + +# --- abuse gate (per-IP, execute route) --------------------------------------- + +RATE_LIMIT_EXECUTE_PER_MINUTE=10 +RATE_LIMIT_EXECUTE_PER_DAY=50 + +# --- network ------------------------------------------------------------------ + +# Where the web container publishes. Put your TLS terminator in front of it +# (or bind 127.0.0.1 and proxy from a host nginx/caddy). +WEB_BIND=0.0.0.0 +WEB_PORT=8080 + +# Leave empty: the SPA then calls /api on its own origin and nginx proxies +# it to the backend (no CORS, SSE intact). Only set this if the frontend is +# served from a different host than the backend. +VITE_BACKEND_URL= + +# --- databases (internal network only, not published) ------------------------- + +APP_DB_PASSWORD=wb +TEMPORAL_DB_PASSWORD=temporal diff --git a/deploy/ai-studio/Dockerfile b/deploy/ai-studio/Dockerfile new file mode 100644 index 000000000..664c6d6ea --- /dev/null +++ b/deploy/ai-studio/Dockerfile @@ -0,0 +1,46 @@ +# syntax=docker/dockerfile:1 + +# Targets: runtime (backend + worker, command chosen per compose service), +# web (nginx, SPA + /api proxy). Build context must be the repo root — +# workspace packages are linked via pnpm `workspace:*`. +# +# Exact Node pin: engineStrict rejects any other version. pnpm via npm, not +# corepack — this Node's corepack cannot load pnpm 10 +# (ERR_VM_DYNAMIC_IMPORT_CALLBACK_MISSING). Keep in sync with `packageManager`. 
+FROM node:22.12.0-bookworm-slim AS base +ENV PNPM_HOME=/pnpm \ + PATH="/pnpm:$PATH" \ + # husky needs the .git dir that the build context excludes + HUSKY=0 \ + npm_config_store_dir=/pnpm/store \ + CI=true +RUN npm install -g pnpm@10.17.0 +WORKDIR /app + +FROM base AS source +COPY pnpm-lock.yaml pnpm-workspace.yaml ./ +RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store pnpm fetch +COPY . . + +# tsx runs TS directly — required anyway for the worker, whose workflow +# sandbox bundles from TS source on disk at runtime. +# --prefer-offline (not --offline): offline mode leaks into lifecycle +# scripts and breaks the icons build, which shells out to npx. +FROM source AS runtime +RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ + # `prepare` runs husky, absent from a --prod install + npm pkg delete scripts.prepare && \ + pnpm install --frozen-lockfile --prefer-offline --prod \ + --filter backend... --filter execution-worker... + +# VITE_BACKEND_URL is baked at build time; empty = same-origin /api, +# proxied by the web target's nginx. +FROM source AS frontend-build +ARG VITE_BACKEND_URL= +RUN --mount=type=cache,id=pnpm-store,target=/pnpm/store \ + pnpm install --frozen-lockfile --prefer-offline --filter @workflow-builder/ai-studio... +RUN VITE_BACKEND_URL=$VITE_BACKEND_URL pnpm build:ai-studio + +FROM nginx:1.31-alpine AS web +COPY deploy/ai-studio/nginx/default.conf /etc/nginx/conf.d/default.conf +COPY --from=frontend-build /app/dist/apps/ai-studio /usr/share/nginx/html diff --git a/deploy/ai-studio/README.md b/deploy/ai-studio/README.md new file mode 100644 index 000000000..d1f4c4959 --- /dev/null +++ b/deploy/ai-studio/README.md @@ -0,0 +1,111 @@ +# Deploying AI Studio + +Self-contained, portable deployment of the AI Studio stack (WB-229). Runs on +any Docker host — an Azure VM, AWS, on-prem — with no cloud-specific glue. + +> Deploying onto the company Swarm cluster instead? 
See +> [`tools/deployment/`](../../tools/deployment/README.md) — same images, +> Traefik/ACR/Ansible orchestration aligned with the workflow-builder repo. + +## What runs + +| Service | Image | Role | Exposed | +| ------------- | ------------------------------ | ----------------------------------------------- | ------------------------ | +| `web` | `ai-studio-web` (nginx) | Serves the SPA, proxies `/api` to the backend | `${WEB_PORT}` (only one) | +| `backend` | `ai-studio-runtime` | Hono REST + SSE event stream | internal | +| `worker` | `ai-studio-runtime` | Temporal worker, makes the OpenRouter LLM calls | internal | +| `temporal` | `temporalio/auto-setup` pinned | Workflow engine | internal | +| `app-db` | `postgres:16` | Workflow snapshots + execution events | internal | +| `temporal-db` | `postgres:16` | Temporal's own state store | internal | +| `temporal-ui` | `temporalio/ui` pinned | Debug only (`--profile debug`) | `127.0.0.1:8233` | + +Both images build from one Dockerfile (`deploy/ai-studio/Dockerfile`) with the +repo root as context. Backend and worker share a single image and differ only +in the compose `command`. Database migrations are applied by the backend at +boot (drizzle-orm's programmatic migrator) — there is no separate migration +service or step. + +## Quick start + +```bash +cd deploy/ai-studio +cp .env.example .env # set OPENROUTER_API_KEY +docker compose up -d --build +``` + +First boot: the backend applies migrations and only then starts serving (its +healthcheck gates the worker). The worker crash-loops for ~30s until Temporal +finishes auto-setup — that's expected, `restart: unless-stopped` converges it. + +Verify: + +```bash +curl -s http://localhost:8080/api/health # {"status":"ok"} +# open http://localhost:8080, run the "Sales Inquiry Pipeline" template +``` + +## Spend safety (do not skip) + +Two independent controls; both must be in place before the URL goes public: + +1. 
**OpenRouter Guardrail** (hard $/day ceiling, no code involved): + [openrouter.ai](https://openrouter.ai) → Settings → Guardrails → daily + spend limit, e.g. **$5/day** (resets 00:00 UTC). When hit, OpenRouter + rejects calls and the demo pauses — it cannot overspend. Keep the account + balance low (~$20) as the absolute ceiling. +2. **Per-IP rate limit** (already on in this compose): defaults to 10 + executions/min and 50/day per IP, tunable via + `RATE_LIMIT_EXECUTE_PER_MINUTE` / `RATE_LIMIT_EXECUTE_PER_DAY`. In-memory, + single-replica by design; counters reset on backend restart. + +At the defaults, a worst case full Guardrail day costs $5; a typical +3-LLM-call template run on Mistral Small 3.2 costs ~$0.0004. + +## TLS / going public + +The `web` container speaks plain HTTP on the internal port. Pick one: + +- **Existing ingress** (Azure Application Gateway / Front Door, an nginx that + already routes your other web apps, …): point it at `WEB_PORT`, set + `WEB_BIND=127.0.0.1` if the ingress runs on the same host. SSE caveat: the + ingress must not buffer `/api/executions/*/stream` responses and needs a + read timeout above 60s (the stream heartbeats every 15s). +- **Standalone VM**: run a host-level [Caddy](https://caddyserver.com) + (`reverse_proxy localhost:8080` — automatic Let's Encrypt, SSE-safe out of + the box) or certbot'd nginx in front, and firewall everything except + 80/443. + +Keep 8233 (Temporal UI) and the Postgres ports unreachable from outside — +this compose never publishes them; don't undo that. + +## Configuration + +See [.env.example](.env.example) — every variable is documented there. +Swapping the LLM is a one-liner: change `AI_MODEL` to any +[OpenRouter model id](https://openrouter.ai/models) and +`docker compose up -d worker`. 
+ +## Operations + +```bash +docker compose logs -f backend worker # tail the apps +docker compose --profile debug up -d # Temporal UI on 127.0.0.1:8233 +docker compose up -d --build # deploy a new version (backend re-applies migrations at boot) +docker compose down # stop (volumes survive) +docker exec ai-studio-app-db-1 pg_dump -U wb workflow_builder > backup.sql +``` + +Workflow data is treated as ephemeral for the public demo — losing the +volumes is acceptable; there is nothing precious in them. + +## Known limitations (accepted for the lean MVP) + +- **No login.** The API is open (`WB_AUTH_PORT=allow-all`); anyone with the + URL can create and run workflows within the rate limits. The SDK has an + `AuthPort` seam for wiring real auth later. +- **Single backend replica.** The rate limiter is process-local. Scaling out + needs a shared store (Redis) — deferred to the scale-ready task. +- **`temporalio/auto-setup` is dev-grade.** Fine for a demo; move to Temporal + Cloud or an operated cluster for sustained load. +- **Anyone-can-edit demo content.** Visitors share one workspace; data is + wiped whenever you decide to recreate the volumes. diff --git a/deploy/ai-studio/docker-compose.yml b/deploy/ai-studio/docker-compose.yml new file mode 100644 index 000000000..b891cea66 --- /dev/null +++ b/deploy/ai-studio/docker-compose.yml @@ -0,0 +1,141 @@ +# AI Studio production stack (WB-229). Usage: cp .env.example .env, set +# OPENROUTER_API_KEY, then `docker compose up -d --build`. Only `web` +# publishes a port. + +name: ai-studio + +x-runtime-build: &runtime-build + context: ../.. 
+ dockerfile: deploy/ai-studio/Dockerfile + target: runtime + +services: + app-db: + image: postgres:16 + environment: + POSTGRES_DB: workflow_builder + POSTGRES_USER: wb + POSTGRES_PASSWORD: ${APP_DB_PASSWORD:-wb} + volumes: + - app-db-data:/var/lib/postgresql/data + healthcheck: + test: ['CMD', 'pg_isready', '-U', 'wb', '-d', 'workflow_builder'] + interval: 5s + timeout: 3s + retries: 12 + restart: unless-stopped + + temporal-db: + image: postgres:16 + environment: + POSTGRES_DB: temporal + POSTGRES_USER: temporal + POSTGRES_PASSWORD: ${TEMPORAL_DB_PASSWORD:-temporal} + volumes: + - temporal-db-data:/var/lib/postgresql/data + healthcheck: + test: ['CMD', 'pg_isready', '-U', 'temporal', '-d', 'temporal'] + interval: 5s + timeout: 3s + retries: 12 + restart: unless-stopped + + # auto-setup is dev-grade; sustained load should move to Temporal Cloud + # or an operated cluster — the apps only consume TEMPORAL_ADDRESS + temporal: + image: temporalio/auto-setup:1.29.6.1 + depends_on: + temporal-db: + condition: service_healthy + environment: + DB: postgres12 + DB_PORT: 5432 + POSTGRES_USER: temporal + POSTGRES_PWD: ${TEMPORAL_DB_PASSWORD:-temporal} + POSTGRES_SEEDS: temporal-db + restart: unless-stopped + + temporal-ui: + image: temporalio/ui:2.51.0 + profiles: [debug] + depends_on: + - temporal + environment: + TEMPORAL_ADDRESS: temporal:7233 + ports: + - '127.0.0.1:8233:8080' + restart: unless-stopped + + # applies migrations at boot; on failure exits and `restart` retries + backend: + image: ai-studio-runtime + build: *runtime-build + command: ['pnpm', '--filter', 'backend', 'start:prod'] + environment: + HOST: 0.0.0.0 + PORT: 3001 + DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder + TEMPORAL_ADDRESS: temporal:7233 + # explicit opt-in — a forgotten env var fails loudly instead of exposing the API + WB_AUTH_PORT: allow-all + # only nginx can reach the backend, but nginx appends to any client-sent X-Forwarded-For — the first hop is trustworthy only if the TLS proxy in front overwrites that header + TRUST_PROXY: 'true' + 
RATE_LIMIT_EXECUTE_PER_MINUTE: ${RATE_LIMIT_EXECUTE_PER_MINUTE:-10} + RATE_LIMIT_EXECUTE_PER_DAY: ${RATE_LIMIT_EXECUTE_PER_DAY:-50} + depends_on: + app-db: + condition: service_healthy + temporal: + condition: service_started + healthcheck: + test: + [ + 'CMD', + 'node', + '-e', + "fetch('http://127.0.0.1:3001/api/health').then((r) => process.exit(r.ok ? 0 : 1)).catch(() => process.exit(1))", + ] + interval: 10s + timeout: 5s + retries: 6 + start_period: 15s + restart: unless-stopped + + # crash-loops until Temporal answers (no usable healthcheck); restart converges it + worker: + image: ai-studio-runtime + build: *runtime-build + command: ['pnpm', '--filter', 'execution-worker', 'start:prod'] + environment: + DATABASE_URL: postgresql://wb:${APP_DB_PASSWORD:-wb}@app-db:5432/workflow_builder + TEMPORAL_ADDRESS: temporal:7233 + OPENROUTER_API_KEY: ${OPENROUTER_API_KEY:?set OPENROUTER_API_KEY in deploy/ai-studio/.env} + AI_MODEL: ${AI_MODEL:-mistralai/mistral-small-3.2-24b-instruct} + depends_on: + app-db: + condition: service_healthy + # backend healthy = migrations applied + backend: + condition: service_healthy + temporal: + condition: service_started + restart: unless-stopped + + web: + image: ai-studio-web + build: + context: ../.. + dockerfile: deploy/ai-studio/Dockerfile + target: web + args: + # empty -> SPA calls /api on its own origin via this nginx + VITE_BACKEND_URL: ${VITE_BACKEND_URL:-} + ports: + - '${WEB_BIND:-0.0.0.0}:${WEB_PORT:-8080}:80' + depends_on: + - backend + restart: unless-stopped + +volumes: + app-db-data: + temporal-db-data: diff --git a/deploy/ai-studio/nginx/default.conf b/deploy/ai-studio/nginx/default.conf new file mode 100644 index 000000000..501c28d4f --- /dev/null +++ b/deploy/ai-studio/nginx/default.conf @@ -0,0 +1,58 @@ +# AI Studio — SPA + /api reverse proxy; the stack's only public surface. +# TLS terminates in front (see README.md). 
+ +server { + listen 80; + server_name _; + + root /usr/share/nginx/html; + index index.html; + + # Resolve the backend through Docker's embedded DNS on every request + # (via the variable indirection below) instead of once at startup — + # otherwise recreating the backend container leaves nginx proxying to a + # stale IP and every /api call 502s until this container restarts too. + resolver 127.0.0.11 valid=10s ipv6=off; + set $backend_upstream http://backend:3001; + + gzip on; + gzip_types text/css application/javascript application/json image/svg+xml; + + # backend enforces 1 MB with a structured error — stay above it + client_max_body_size 2m; + + # SSE: never buffer, outlast the 15s heartbeat + location ~ ^/api/executions/.+/stream$ { + proxy_pass $backend_upstream; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Connection ''; + proxy_buffering off; + proxy_cache off; + proxy_read_timeout 1h; + gzip off; + } + + location /api/ { + proxy_pass $backend_upstream; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + + # content-hashed filenames — cache forever + location /assets/ { + add_header Cache-Control "public, max-age=31536000, immutable"; + try_files $uri =404; + } + + # SPA fallback + location / { + try_files $uri /index.html; + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 247fdd644..8b5384f7c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -174,6 +174,9 @@ importers: postgres: specifier: ^3.4.5 version: 3.4.9 + tsx: + specifier: ^4.19.3 + version: 4.21.0 zod: specifier: ^4.3.6 version: 4.3.6 @@ -366,6 +369,9 @@ importers: postgres: specifier: ^3.4.5 version: 3.4.9 + tsx: + specifier: ^4.19.3 + version: 
4.21.0 devDependencies: '@types/node': specifier: ^22.12.0 diff --git a/tools/deployment/README.md b/tools/deployment/README.md new file mode 100644 index 000000000..f8f99fd2c --- /dev/null +++ b/tools/deployment/README.md @@ -0,0 +1,77 @@ +# Swarm deployment (workflow-builder-aligned) + +Deploys AI Studio onto the company Docker Swarm cluster on Azure, following +the same layout, scripts, and Ansible flow as the `workflow-builder` repo's +`tools/deployment/` — so DevOps operates one familiar shape. + +This is an **orchestration overlay, not a second deployment**: it consumes +the exact same two images (`runtime`, `web`) built from +[`deploy/ai-studio/Dockerfile`](../../deploy/ai-studio/Dockerfile). The +compose file in `deploy/ai-studio/` remains the portable, customer-facing +artifact and the local full-stack runner; this directory adds the +ACR + Traefik + Ansible path for our own infrastructure. + +``` +tools/deployment/ +├── scripts/ +│ ├── build-docker.sh # build all 3 targets, tag for ACR, push (CI-gated) +│ └── deploy.sh # run the Ansible playbook (CI image or workstation) +└── ansible/deploy-application/ + └── main.yml # writes the Swarm stack file on the master + deploys +``` + +## Usage + +```bash +# build + push images (from repo root) +DEPLOY_ENV=dev ./tools/deployment/scripts/build-docker.sh + +# deploy the stack (needs az login + ansible inventory with the `master` host) +DEPLOY_ENV=dev DEPLOYMENT_URL=ai-studio.example.com OPENROUTER_API_KEY=sk-... \ + ./tools/deployment/scripts/deploy.sh +``` + +Bitbucket-style variables (`BITBUCKET_COMMIT`, `BITBUCKET_DEPLOYMENT_ENVIRONMENT`, +`TAG_PREFIX`) take precedence when present, so the scripts drop into the +existing CI pattern unchanged; the fallbacks (`git rev-parse`, `DEPLOY_ENV`) +make them runnable from a workstation or GitHub Actions. 
+ +## Configuration + +| Variable | Required | Default | Purpose | +| ---------------------------------------------------------------------------------------- | --------- | ------------------------------------------ | ------------------------------------------------------- | +| `DEPLOYMENT_URL` | yes | — | Public hostname, drives Traefik routing + TLS | +| `OPENROUTER_API_KEY` | yes | — | Worker-side LLM key (pair with an OpenRouter Guardrail) | +| `DEPLOY_ENV` / `BITBUCKET_DEPLOYMENT_ENVIRONMENT` | no | `dev` | Stack/environment suffix | +| `AI_MODEL` | no | `mistralai/mistral-small-3.2-24b-instruct` | OpenRouter model id | +| `RATE_LIMIT_EXECUTE_PER_MINUTE` / `_DAY` | no | `10` / `50` | Per-IP abuse gate | +| `APP_DB_PASSWORD`, `TEMPORAL_DB_PASSWORD` | no | dev defaults | Internal-network Postgres credentials | +| `AUTH_ENABLED` | no | `false` | Put the gatekeeper OIDC proxy in front (internal envs) | +| `AUTH_DISCOVERY_URL`, `AUTH_CLIENT_ID`, `AUTH_SECRET`, `AUTH_COOKIE_SECRET`, `AUTH_ROLE` | when auth | — | Gatekeeper config, same names as workflow-builder | +| `REGISTRY` | no | `synergycodes.azurecr.io` | Image registry | + +## What differs from the workflow-builder playbook (and why) + +| Deviation | Reason | +| ------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------ | +| Postgres ×2 + Temporal services with named volumes, pinned via `node.labels.ai-studio-data==true` | AI Studio is stateful; Swarm volumes are node-local. 
**One-time setup:** `docker node update --label-add ai-studio-data=true <node-name>` | +| No migration step — the backend applies Drizzle migrations at boot and restarts until Postgres answers | Swarm ignores compose `depends_on` conditions, so ordering must not rely on them | +| Services carry short DNS aliases (`backend`, `app-db`, `temporal`, …) | The web image's nginx proxies to `http://backend:3001`; aliases keep the images and env defaults identical between compose and Swarm | +| Gatekeeper is conditional (`AUTH_ENABLED`) | The WB-229 public demo is deliberately login-free; internal instances can keep SSO | + +SSE note: Traefik streams responses by default, so the live execution stream +works without special ingress config; the 15 s backend heartbeat keeps the +connection alive. + +## Open items for DevOps + +- **Stateful workloads on the cluster** — this would be the first; the + alternative is a dedicated VM running `deploy/ai-studio/docker-compose.yml` + as-is, or managed Azure Postgres. +- **CI home** — this repo lives on GitHub; the existing deploy machinery + (deployment CI image, `setup-az.sh`, Ansible inventory) is Bitbucket-side. + First deploys can run from a workstation. +- **Secrets in the stack file** — the playbook writes env values (incl. the + OpenRouter key) into the stack yml on the Swarm master, same as the + existing workflow-builder flow. Docker Swarm secrets would be stricter; + kept aligned for now. diff --git a/tools/deployment/ansible/deploy-application/main.yml b/tools/deployment/ansible/deploy-application/main.yml new file mode 100644 index 000000000..a5a41f350 --- /dev/null +++ b/tools/deployment/ansible/deploy-application/main.yml @@ -0,0 +1,237 @@ +--- +# AI Studio on the Swarm cluster, following the workflow-builder repo's +# deploy-application playbook with the same images as deploy/ai-studio. +# No migration step: the backend migrates itself at boot. 
DB/Temporal +# volumes are pinned via node.labels.ai-studio-data (Swarm volumes are +# node-local); gatekeeper is optional (AUTH_ENABLED). + +- hosts: master + + vars: + deployment_environment: "{{ lookup('env', 'BITBUCKET_DEPLOYMENT_ENVIRONMENT') or lookup('env', 'DEPLOY_ENV') or 'dev' }}" + tag_prefix: "{{ lookup('env', 'TAG_PREFIX') }}" + bb_commit: "{{ lookup('env', 'BITBUCKET_COMMIT') or lookup('pipe', 'git rev-parse HEAD') }}" + app_name: ai-studio + registry: "{{ lookup('env', 'REGISTRY') or 'synergycodes.azurecr.io' }}" + deployment_url: "{{ lookup('env', 'DEPLOYMENT_URL') }}" + image_tag: '{{ tag_prefix }}{{ bb_commit }}' + stack_name: '{{ app_name }}--{{ deployment_environment }}' + auth_enabled: "{{ (lookup('env', 'AUTH_ENABLED') or 'false') | bool }}" + openrouter_api_key: "{{ lookup('env', 'OPENROUTER_API_KEY') }}" + ai_model: "{{ lookup('env', 'AI_MODEL') or 'mistralai/mistral-small-3.2-24b-instruct' }}" + app_db_password: "{{ lookup('env', 'APP_DB_PASSWORD') or 'wb' }}" + temporal_db_password: "{{ lookup('env', 'TEMPORAL_DB_PASSWORD') or 'temporal' }}" + database_url: 'postgresql://wb:{{ app_db_password }}@app-db:5432/workflow_builder' + + tasks: + - name: Check required configuration + assert: + that: + - deployment_url | length > 0 + - openrouter_api_key | length > 0 + fail_msg: 'DEPLOYMENT_URL and OPENROUTER_API_KEY must be set' + + - name: Create directory for service data + file: + path: '/mnt/docker-swarm-storage/stacks/{{ stack_name }}' + state: directory + + - name: Create stack definition + copy: + dest: '/mnt/docker-swarm-storage/stacks/{{ stack_name }}/{{ app_name }}.stack.yml' + content: | + services: + {% if auth_enabled %} + ai-studio-gatekeeper--{{ deployment_environment }}: + image: '{{ registry }}/gatekeeper:2.1.1' + environment: + PROXY_LISTEN: :4200 + PROXY_UPSTREAM_URL: http://web + PROXY_DISCOVERY_URL: "{{ lookup('env', 'AUTH_DISCOVERY_URL') }}" + PROXY_CLIENT_ID: "{{ lookup('env', 'AUTH_CLIENT_ID') }}" + PROXY_CLIENT_SECRET: "{{ 
lookup('env', 'AUTH_SECRET') }}" + PROXY_ENCRYPTION_KEY: "{{ lookup('env', 'AUTH_COOKIE_SECRET') }}" + PROXY_REDIRECTION_URL: 'https://{{ deployment_url }}' + command: + - '-enable-default-deny=false' + - "-resources=uri=/*|roles={{ lookup('env', 'AUTH_ROLE') }}" + networks: + traefik-host-external: + internal: + aliases: [gatekeeper] + deploy: + placement: + constraints: + - node.role==worker + labels: + - 'traefik.enable=true' + - 'traefik.docker.network=traefik-host-external' + - 'traefik.http.routers.{{ stack_name }}-http.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)' + - 'traefik.http.routers.{{ stack_name }}-http.entrypoints=http' + - 'traefik.http.routers.{{ stack_name }}-http.middlewares=https-redirect' + - 'traefik.http.routers.{{ stack_name }}-https.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)' + - 'traefik.http.routers.{{ stack_name }}-https.entrypoints=https' + - 'traefik.http.routers.{{ stack_name }}-https.tls=true' + - 'traefik.http.routers.{{ stack_name }}-https.tls.certresolver=le' + - 'traefik.http.services.{{ stack_name }}.loadbalancer.server.port=4200' + {% endif %} + + ai-studio-web--{{ deployment_environment }}: + image: '{{ registry }}/{{ app_name }}:web-{{ image_tag }}' + networks: + {% if not auth_enabled %} + traefik-host-external: + {% endif %} + internal: + aliases: [web] + deploy: + placement: + constraints: + - node.role==worker + {% if not auth_enabled %} + labels: + - 'traefik.enable=true' + - 'traefik.docker.network=traefik-host-external' + - 'traefik.http.routers.{{ stack_name }}-http.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)' + - 'traefik.http.routers.{{ stack_name }}-http.entrypoints=http' + - 'traefik.http.routers.{{ stack_name }}-http.middlewares=https-redirect' + - 'traefik.http.routers.{{ stack_name }}-https.rule=Host(`{{ deployment_url }}`) && PathPrefix(`/`)' + - 'traefik.http.routers.{{ stack_name }}-https.entrypoints=https' + - 'traefik.http.routers.{{ stack_name }}-https.tls=true' + - 
'traefik.http.routers.{{ stack_name }}-https.tls.certresolver=le' + - 'traefik.http.services.{{ stack_name }}.loadbalancer.server.port=80' + {% endif %} + + ai-studio-backend--{{ deployment_environment }}: + image: '{{ registry }}/{{ app_name }}:runtime-{{ image_tag }}' + command: ['pnpm', '--filter', 'backend', 'start:prod'] + environment: + HOST: 0.0.0.0 + PORT: 3001 + DATABASE_URL: '{{ database_url }}' + TEMPORAL_ADDRESS: temporal:7233 + WB_AUTH_PORT: allow-all + TRUST_PROXY: 'true' + RATE_LIMIT_EXECUTE_PER_MINUTE: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_MINUTE') or '10' }}" + RATE_LIMIT_EXECUTE_PER_DAY: "{{ lookup('env', 'RATE_LIMIT_EXECUTE_PER_DAY') or '50' }}" + healthcheck: + test: ['CMD', 'node', '-e', "fetch('http://127.0.0.1:3001/api/health').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))"] + interval: 10s + timeout: 5s + retries: 6 + start_period: 15s + networks: + internal: + aliases: [backend] + deploy: + restart_policy: + condition: any + delay: 5s + max_attempts: 10 + placement: + constraints: + - node.role==worker + + ai-studio-worker--{{ deployment_environment }}: + image: '{{ registry }}/{{ app_name }}:runtime-{{ image_tag }}' + command: ['pnpm', '--filter', 'execution-worker', 'start:prod'] + environment: + DATABASE_URL: '{{ database_url }}' + TEMPORAL_ADDRESS: temporal:7233 + OPENROUTER_API_KEY: '{{ openrouter_api_key }}' + AI_MODEL: '{{ ai_model }}' + networks: + internal: + deploy: + restart_policy: + condition: any + delay: 5s + max_attempts: 20 + placement: + constraints: + - node.role==worker + + ai-studio-app-db--{{ deployment_environment }}: + image: 'postgres:16' + environment: + POSTGRES_DB: workflow_builder + POSTGRES_USER: wb + POSTGRES_PASSWORD: '{{ app_db_password }}' + volumes: + - app-db-data:/var/lib/postgresql/data + networks: + internal: + aliases: [app-db] + deploy: + restart_policy: + condition: any + delay: 5s + placement: + constraints: + - node.labels.ai-studio-data==true + + ai-studio-temporal-db--{{ 
deployment_environment }}: + image: 'postgres:16' + environment: + POSTGRES_DB: temporal + POSTGRES_USER: temporal + POSTGRES_PASSWORD: '{{ temporal_db_password }}' + volumes: + - temporal-db-data:/var/lib/postgresql/data + networks: + internal: + aliases: [temporal-db] + deploy: + restart_policy: + condition: any + delay: 5s + placement: + constraints: + - node.labels.ai-studio-data==true + + ai-studio-temporal--{{ deployment_environment }}: + image: 'temporalio/auto-setup:1.29.6.1' + environment: + DB: postgres12 + DB_PORT: 5432 + POSTGRES_USER: temporal + POSTGRES_PWD: '{{ temporal_db_password }}' + POSTGRES_SEEDS: temporal-db + networks: + internal: + aliases: [temporal] + deploy: + restart_policy: + condition: any + delay: 5s + max_attempts: 20 + placement: + constraints: + - node.role==worker + + volumes: + app-db-data: + temporal-db-data: + + networks: + internal: + driver: overlay + traefik-host-external: + external: true + + - name: Ensure Azure CLI is setup + shell: /var/az-autologin.sh + + - name: Ensure jsondiff is installed (required by community.docker.docker_stack) + ansible.builtin.pip: + name: jsondiff + + - name: Deploy stack + community.docker.docker_stack: + state: present + name: '{{ stack_name }}' + resolve_image: 'always' + prune: true + with_registry_auth: yes + compose: + - '/mnt/docker-swarm-storage/stacks/{{ stack_name }}/{{ app_name }}.stack.yml' diff --git a/tools/deployment/scripts/build-docker.sh b/tools/deployment/scripts/build-docker.sh new file mode 100755 index 000000000..2e97e77b2 --- /dev/null +++ b/tools/deployment/scripts/build-docker.sh @@ -0,0 +1,32 @@ +#!/bin/sh +# Build + push the AI Studio images (deploy/ai-studio/Dockerfile) to ACR, +# mirroring workflow-builder's build-docker.sh. Bitbucket CI vars win when +# present; git/DEPLOY_ENV fallbacks keep it runnable from a workstation. 
+set -eu + +APP_NAME="ai-studio" +REGISTRY="${REGISTRY:-synergycodes.azurecr.io}" +COMMIT="${BITBUCKET_COMMIT:-$(git rev-parse HEAD)}" +ENVIRONMENT="${BITBUCKET_DEPLOYMENT_ENVIRONMENT:-${DEPLOY_ENV:-}}" +export IMAGE_TAG="${TAG_PREFIX:-}$COMMIT" + +for TARGET in runtime web; do + TAG="$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG" + docker build \ + -f ./deploy/ai-studio/Dockerfile \ + --target "$TARGET" \ + -t "$TAG" \ + . +done + +ALLOWED_ENVIRONMENTS="stage dev prod" + +if echo "$ALLOWED_ENVIRONMENTS" | grep -w "$ENVIRONMENT" > /dev/null; then + # setup-az.sh only exists in the deployment CI image + [ -f /var/setup-az.sh ] && . /var/setup-az.sh + for TARGET in runtime web; do + docker push "$REGISTRY/$APP_NAME:$TARGET-$IMAGE_TAG" + done +else + echo "Environment '$ENVIRONMENT' is not configured for image push. Skipping." +fi diff --git a/tools/deployment/scripts/deploy.sh b/tools/deployment/scripts/deploy.sh new file mode 100755 index 000000000..fac8f5be4 --- /dev/null +++ b/tools/deployment/scripts/deploy.sh @@ -0,0 +1,9 @@ +#!/bin/sh +# Deploy the AI Studio stack to Swarm, mirroring workflow-builder's +# deploy.sh. The setup scripts exist only in the deployment CI image. +set -eu + +[ -f /var/setup-az.sh ] && . /var/setup-az.sh +[ -f /var/setup-ansible.sh ] && . /var/setup-ansible.sh + +ansible-playbook ./tools/deployment/ansible/deploy-application/main.yml diff --git a/tools/deployment/swarm-alignment.decision-log.md b/tools/deployment/swarm-alignment.decision-log.md new file mode 100644 index 000000000..c60d051e7 --- /dev/null +++ b/tools/deployment/swarm-alignment.decision-log.md @@ -0,0 +1,90 @@ +### Title: Swarm overlay aligned with the workflow-builder deployment + +### Proposed by: Jan Librowski + +### Date: 10.06.2026 + +## Context + +The compose-based deployment in `deploy/ai-studio/` +targets a single Docker host and ships TLS as a bring-your-own concern. 
The +company's actual Azure footprint, found in the `workflow-builder` repo's +`tools/deployment/`, is different: a self-managed Docker Swarm cluster with +Traefik (Let's Encrypt, host-based routing), images in ACR tagged by commit, +deployment via an Ansible playbook that writes a stack file onto the Swarm +master, and an optional gatekeeper OIDC proxy for internal apps. DevOps +operates that machinery daily. + +Rather than choosing one target, the compose branch is kept as a snapshot +(`WB-229-ai-studio-deployment`) and this branch adds the Swarm-aligned path +on top of it. + +## Decision + +Add `tools/deployment/` mirroring the workflow-builder repo's structure — +`scripts/build-docker.sh`, `scripts/deploy.sh`, +`ansible/deploy-application/main.yml` — with the same conventions: ACR +commit-tagged images, per-environment stack names (`ai-studio--dev`), +Traefik labels copied from the existing stack, Bitbucket-style env variables +honored with workstation fallbacks. + +**The images are shared, not duplicated.** Both paths build the same two +targets from `deploy/ai-studio/Dockerfile`; the overlay only changes +orchestration. Four deliberate deviations from the workflow-builder +playbook, all forced by AI Studio being stateful where the editor demo was +a static frontend: + +1. Database/Temporal services with named volumes pinned to a labeled node + (`node.labels.ai-studio-data==true`) — Swarm volumes are node-local. +2. No migration step (revised 11.06.2026) — the backend applies Drizzle + migrations at boot and restarts until Postgres answers, which sidesteps + Swarm's lack of `depends_on` ordering entirely. +3. Short DNS aliases (`backend`, `app-db`, `temporal`) so the unmodified + web image's nginx upstream and the compose env defaults resolve + identically under Swarm. +4. Gatekeeper made conditional (`AUTH_ENABLED`, default off) — the public + demo is login-free by design; internal stage/dev instances can keep SSO. 
+ +## Alternative Options Considered + +- **Compose on a dedicated VM only** (the snapshot branch) — fully working + and remains the customer-facing artifact; rejected as the _only_ path + because it adds a second ops surface (new VM, separate TLS) when a + maintained cluster exists. +- **Kubernetes/AKS manifests** — nothing in the org runs on k8s per the + available evidence; would be infrastructure invention, not alignment. +- **Managed Azure Postgres instead of in-cluster databases** — cleaner + state story, but contradicts the near-zero-cost requirement for a demo + whose data is explicitly ephemeral; revisit for sustained load. +- **Swarm secrets for the OpenRouter key** — stricter than env-in-stack-file, + but diverges from how the existing playbook handles `AUTH_SECRET`; + consistency won for now, flagged in the README. + +## Consequences + +- **Pros** + - DevOps sees the exact shape they already operate; review is a diff + against a known playbook, not a new system. + - TLS, registry auth, and routing are inherited from cluster-level + Traefik instead of being re-solved per deployment. + - Stack template render-verified in both auth modes (YAML parses; correct + public surface and Traefik port in each). +- **Cons** + - Not exercised against a real cluster yet — inventory, ACR push rights, + the `ai-studio-data` node label, and the first stateful workload on the + cluster all need DevOps sign-off. + - The rate limiter's `X-Forwarded-For` trust now spans Traefik (and + optionally gatekeeper) before nginx; the first-hop assumption should be + verified on the real cluster. + - Secrets land in a stack file on the Swarm master's disk (inherited + trade-off from the existing flow). + +## Revisions + +- **11.06.2026** — playbook migration task and the `attachable` network + removed; the backend migrates itself at boot. Image set is down to + `runtime` + `web`. + +## Status + +Proposed — pending the DevOps conversation