diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0cac8ef --- /dev/null +++ b/.dockerignore @@ -0,0 +1,36 @@ +**/node_modules +**/dist +**/build +**/.git +**/.github +**/.vscode +**/.idea +**/coverage +**/.turbo +**/*.tsbuildinfo +**/*.log +**/.DS_Store + +# Local env + dev runtime state — must never enter the build context. +**/.env +**/.env.* +!**/.env.example +*.local +*.local.* +private-storage/ +codeforphilly-data/ +apps/codeforphilly-data/ + +# Agent worktrees + plans/specs (built artifacts don't need them). +.claude/worktrees/ + +# Test artifacts +**/tests +**/*.test.ts +**/vitest.config.ts + +# Docs / specs not needed at runtime. +docs/ +plans/ +specs/ +README.md diff --git a/.env.example b/.env.example index cfc57a2..678bfe0 100644 --- a/.env.example +++ b/.env.example @@ -70,3 +70,12 @@ CFP_JWT_SIGNING_KEY=change-me-to-a-random-string-at-least-32-chars # PEM-encoded certificate matching SAML_PRIVATE_KEY. # SAML_CERTIFICATE=-----BEGIN CERTIFICATE-----\n...\n-----END CERTIFICATE----- + +# --------------------------------------------------------------------------- +# Static SPA serving (production only) +# --------------------------------------------------------------------------- + +# Absolute path to the built apps/web/dist directory. When set, the API +# serves the SPA as a fallthrough for non-/api/* routes (single-image +# deploy per specs/architecture.md). Leave unset in dev — Vite owns 5173. +# CFP_WEB_DIST_PATH=/app/apps/web/dist diff --git a/.github/workflows/deploy-production.yml b/.github/workflows/deploy-production.yml new file mode 100644 index 0000000..044e34f --- /dev/null +++ b/.github/workflows/deploy-production.yml @@ -0,0 +1,131 @@ +name: Deploy (production) + +# Production deploys run only on annotated/lightweight tags shaped like +# `v1.2.3`. Same image, different cluster, different values. 
+on: + push: + tags: + - "v*.*.*" + workflow_dispatch: + inputs: + tag: + description: "Image tag to deploy (must already exist in GHCR)" + required: true + +concurrency: + group: deploy-production + cancel-in-progress: false + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +permissions: + contents: read + packages: write + id-token: write + +jobs: + build: + if: github.event_name == 'push' + runs-on: ubuntu-latest + outputs: + image-tag: ${{ steps.meta.outputs.image-tag }} + steps: + - uses: actions/checkout@v6 + + - name: Compute image tag + id: meta + run: | + tag="${GITHUB_REF_NAME}" + echo "image-tag=$tag" >> "$GITHUB_OUTPUT" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: . + file: Dockerfile + push: true + platforms: linux/amd64 + provenance: false + tags: | + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.meta.outputs.image-tag }} + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:production-latest + cache-from: type=gha + cache-to: type=gha,mode=max + labels: | + org.opencontainers.image.revision=${{ github.sha }} + org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} + + deploy: + # When triggered by a tag push, depend on build; for workflow_dispatch the + # tag must already exist in GHCR, so we skip the build job. 
+ needs: [build] + if: always() && (needs.build.result == 'success' || github.event_name == 'workflow_dispatch') + runs-on: ubuntu-latest + environment: + name: production + url: https://codeforphilly.org + steps: + - uses: actions/checkout@v6 + + - name: Install kubectl + uses: azure/setup-kubectl@v4 + with: + version: v1.31.0 + + - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: v3.16.2 + + - name: Configure kubeconfig + run: | + mkdir -p "$HOME/.kube" + echo "${{ secrets.KUBECONFIG_PRODUCTION }}" | base64 -d > "$HOME/.kube/config" + chmod 600 "$HOME/.kube/config" + kubectl version --client + + - name: Resolve image tag + id: tag + # Event data reaches the script through env, never by expression + # interpolation: the dispatch input is free-form text and would otherwise + # be spliced into the shell source before bash parses it. + env: + EVENT_NAME: ${{ github.event_name }} + INPUT_TAG: ${{ github.event.inputs.tag }} + BUILD_TAG: ${{ needs.build.outputs.image-tag }} + run: | + if [ "$EVENT_NAME" = "workflow_dispatch" ]; then + tag="$INPUT_TAG" + else + tag="$BUILD_TAG" + fi + # Only characters that are legal in a Docker tag may pass. + case "$tag" in + ""|*[!A-Za-z0-9_.-]*) + echo "Invalid image tag: '$tag'" >&2 + exit 1 + ;; + esac + echo "image-tag=$tag" >> "$GITHUB_OUTPUT" + + - name: Helm upgrade + env: + IMAGE_TAG: ${{ steps.tag.outputs.image-tag }} + run: | + # --set-string keeps an all-digit tag from being coerced to a number. + helm upgrade --install codeforphilly \ + deploy/charts/codeforphilly \ + --namespace codeforphilly \ + --create-namespace \ + -f deploy/charts/codeforphilly/values.production.yaml \ + --set-string image.tag="$IMAGE_TAG" \ + --atomic \ + --timeout 5m \ + --wait + + - name: Smoke check + run: | + for i in 1 2 3 4 5 6; do + if curl -fsS https://codeforphilly.org/api/health >/dev/null; then + echo "OK" + exit 0 + fi + echo "Try $i: not ready, sleeping 10s" + sleep 10 + done + echo "Production health check failed" + exit 1 diff --git a/.github/workflows/deploy-staging.yml b/.github/workflows/deploy-staging.yml new file mode 100644 index 0000000..ebd7c6b --- /dev/null +++ b/.github/workflows/deploy-staging.yml @@ -0,0 +1,125 @@ +name: Deploy (staging) + +on: + push: + branches: [main] + workflow_dispatch: + +concurrency: + # Never cancel an in-flight deploy: interrupting `helm upgrade --atomic` can + # leave the release half-applied. Newer runs queue instead, and GitHub keeps + # only the most recent pending run, so the latest commit still wins. 
+ group: deploy-staging + cancel-in-progress: false + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +permissions: + contents: read + packages: write + id-token: write + +jobs: + # Build + push the image. Tag with both the commit sha and `staging-latest`. + build: + runs-on: ubuntu-latest + outputs: + image-tag: ${{ steps.meta.outputs.image-tag }} + image-digest: ${{ steps.push.outputs.digest }} + steps: + - uses: actions/checkout@v6 + + - name: Compute image tag + id: meta + run: | + tag="sha-${GITHUB_SHA::12}" + echo "image-tag=$tag" >> "$GITHUB_OUTPUT" + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + id: push + uses: docker/build-push-action@v6 + with: + context: . + file: Dockerfile + push: true + platforms: linux/amd64 + provenance: false + tags: | + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ steps.meta.outputs.image-tag }} + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:staging-latest + cache-from: type=gha + cache-to: type=gha,mode=max + labels: | + org.opencontainers.image.revision=${{ github.sha }} + org.opencontainers.image.source=${{ github.server_url }}/${{ github.repository }} + + # Deploy via `helm upgrade --install` against the staging cluster. Gated by + # the `staging` environment so first-time runs require an approval and so + # secrets are scoped per-environment. 
+ deploy: + needs: build + runs-on: ubuntu-latest + environment: + name: staging + url: https://codeforphilly-rewrite-staging.k8s.phl.io + steps: + - uses: actions/checkout@v6 + + - name: Install kubectl + uses: azure/setup-kubectl@v4 + with: + version: v1.31.0 + + - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: v3.16.2 + + - name: Configure kubeconfig + # KUBECONFIG_STAGING is a base64-encoded kubeconfig stored as a repo + # secret. The cluster service account it points to should have rights + # only in the codeforphilly-staging namespace. + run: | + mkdir -p "$HOME/.kube" + echo "${{ secrets.KUBECONFIG_STAGING }}" | base64 -d > "$HOME/.kube/config" + chmod 600 "$HOME/.kube/config" + kubectl version --client + + - name: Helm upgrade + run: | + helm upgrade --install codeforphilly-staging \ + deploy/charts/codeforphilly \ + --namespace codeforphilly-staging \ + --create-namespace \ + -f deploy/charts/codeforphilly/values.staging.yaml \ + --set image.tag=${{ needs.build.outputs.image-tag }} \ + --atomic \ + --timeout 5m \ + --wait + + - name: Smoke check + run: | + # The --wait above only waits for k8s to report ready; hit the + # public ingress to confirm end-to-end. Retries because cert-manager + # may still be re-checking TLS for a fresh cert. + for i in 1 2 3 4 5 6; do + if curl -fsS https://codeforphilly-rewrite-staging.k8s.phl.io/api/health >/dev/null; then + echo "OK" + exit 0 + fi + echo "Try $i: not ready, sleeping 10s" + sleep 10 + done + echo "Staging health check failed" + exit 1 diff --git a/.tool-versions b/.tool-versions index 42bb250..ca3bd4b 100644 --- a/.tool-versions +++ b/.tool-versions @@ -1 +1,2 @@ nodejs 22.22.3 +helm 3.16.2 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..27646e7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,111 @@ +# syntax=docker/dockerfile:1.7 +# +# CodeForPhilly modernization — single image bundling the API + built SPA. +# +# Multi-stage build: +# 1. 
deps — npm ci for the full monorepo +# 2. build — type-check + build api/web + prune dev deps +# 3. runtime — minimal alpine + git + ca-certificates; non-root user +# +# Build: +# docker build -t cfp:dev . +# Run (filesystem private storage; for smoke tests only): +# docker run --rm -p 3001:3001 \ +# -e CFP_DATA_REMOTE=https://github.com/CodeForPhilly/codeforphilly-data-snapshot.git \ +# -e STORAGE_BACKEND=filesystem \ +# -e CFP_PRIVATE_STORAGE_PATH=/app/private-storage \ +# -e CFP_JWT_SIGNING_KEY=$(openssl rand -base64 48) \ +# cfp:dev +# +# Production env-vars are documented in docs/operations/deploy.md. + +# ---------------------------------------------------------------------------- +# Stage 1: deps — install full workspace deps (incl. dev) for the build step. +# ---------------------------------------------------------------------------- +FROM node:22.22-alpine AS deps + +WORKDIR /app + +# git is required by some npm postinstall scripts (e.g. transitive deps that +# resolve from git tags during install). +RUN apk add --no-cache git python3 make g++ + +# Copy lockfiles + manifests first so docker layer-cache survives source edits. +COPY package.json package-lock.json ./ +COPY apps/api/package.json ./apps/api/ +COPY apps/web/package.json ./apps/web/ +COPY packages/shared/package.json ./packages/shared/ + +RUN npm ci --no-audit --no-fund + +# ---------------------------------------------------------------------------- +# Stage 2: build — compile api + web, then prune to production deps only. 
+# ---------------------------------------------------------------------------- +FROM node:22.22-alpine AS build + +WORKDIR /app + +RUN apk add --no-cache git python3 make g++ + +COPY --from=deps /app/node_modules ./node_modules +COPY --from=deps /app/apps/api/node_modules ./apps/api/node_modules +COPY --from=deps /app/apps/web/node_modules ./apps/web/node_modules +COPY --from=deps /app/packages/shared/node_modules ./packages/shared/node_modules + +COPY tsconfig.base.json package.json package-lock.json ./ +COPY apps ./apps +COPY packages ./packages + +# Build every workspace that defines a `build` script. npm runs them in the +# order of the root package.json `workspaces` array; `--if-present` only skips +# workspaces without a `build` script, it does not order anything. +# NOTE(review): confirm packages/shared is listed before the apps importing it. +RUN npm run build --workspaces --if-present + +# Drop devDependencies from node_modules to shrink the runtime image. We still +# need workspace-local node_modules (better-sqlite3 native binding lives there). +RUN npm prune --omit=dev --workspaces --include-workspace-root + +# ---------------------------------------------------------------------------- +# Stage 3: runtime — minimal image; node, git, ca-certificates, tini. +# ---------------------------------------------------------------------------- +FROM node:22.22-alpine AS runtime + +# git: needed at boot for the entrypoint clone + by the gitsheets push daemon. +# ca-certificates: TLS to GitHub / S3-compatible endpoints. +# tini: minimal init so SIGTERM from k8s reaches node cleanly. +# openssh-client: for ssh:// remotes (deploy key auth to the data repo). +RUN apk add --no-cache git ca-certificates tini openssh-client + +WORKDIR /app + +# Copy built artifacts + pruned node_modules. 
+COPY --from=build /app/package.json /app/package-lock.json ./ +COPY --from=build /app/node_modules ./node_modules +COPY --from=build /app/apps/api/package.json ./apps/api/ +COPY --from=build /app/apps/api/dist ./apps/api/dist +COPY --from=build /app/apps/api/node_modules ./apps/api/node_modules +COPY --from=build /app/apps/web/dist ./apps/web/dist +COPY --from=build /app/packages/shared/package.json ./packages/shared/ +COPY --from=build /app/packages/shared/dist ./packages/shared/dist +COPY --from=build /app/packages/shared/node_modules ./packages/shared/node_modules + +# Entrypoint script handles data-repo init/refresh before exec'ing node. +COPY deploy/docker/entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod +x /usr/local/bin/entrypoint.sh + +# Defaults pointing at PVC mount points. Override via Helm values / env. +ENV NODE_ENV=production \ + HOST=0.0.0.0 \ + PORT=3001 \ + CFP_DATA_REPO_PATH=/app/data \ + CFP_WEB_DIST_PATH=/app/apps/web/dist \ + STORAGE_BACKEND=s3 \ + NODE_OPTIONS="--max-old-space-size=384" + +# WORKDIR created /app as root, so pre-create the writable state directories +# and hand them to `node`. Without this a volume-less `docker run` (the smoke +# test in the header) cannot create them; a mounted PVC simply shadows them. +RUN mkdir -p /app/data /app/private-storage \ + && chown node:node /app/data /app/private-storage + +# Non-root user. The Helm chart's PVC must be writable by uid 1000 (alpine +# `node` user). 1000:1000 is the upstream node:alpine default. 
+USER node + +EXPOSE 3001 + +ENTRYPOINT ["/sbin/tini", "--", "/usr/local/bin/entrypoint.sh"] +CMD ["node", "apps/api/dist/index.js"] diff --git a/apps/api/package.json b/apps/api/package.json index 1300c8e..43bc097 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -22,6 +22,7 @@ "@fastify/cors": "^11.2.0", "@fastify/env": "^6.0.0", "@fastify/rate-limit": "^10.3.0", + "@fastify/static": "^9.1.3", "@fastify/swagger": "^9.7.0", "@fastify/swagger-ui": "^5.2.6", "better-sqlite3": "^12.10.0", diff --git a/apps/api/src/app.ts b/apps/api/src/app.ts index c6e1773..f623843 100644 --- a/apps/api/src/app.ts +++ b/apps/api/src/app.ts @@ -32,6 +32,7 @@ import servicesPlugin from './plugins/services.js'; import rateLimitPlugin from './plugins/rate-limit.js'; import idempotencyPlugin from './plugins/idempotency.js'; import sessionMiddlewarePlugin from './auth/middleware.js'; +import staticWebPlugin from './plugins/static-web.js'; import { healthRoutes } from './routes/health.js'; import { authRoutes } from './routes/auth.js'; import { projectRoutes } from './routes/projects.js'; @@ -153,5 +154,8 @@ export async function buildApp(opts: BuildAppOptions = {}): Promise; @@ -71,5 +77,6 @@ export const envJsonSchema = { CFP_JWT_SIGNING_KEY: { type: 'string', minLength: 1 }, SAML_PRIVATE_KEY: { type: 'string' }, SAML_CERTIFICATE: { type: 'string' }, + CFP_WEB_DIST_PATH: { type: 'string' }, }, } as const; diff --git a/apps/api/src/plugins/static-web.ts b/apps/api/src/plugins/static-web.ts new file mode 100644 index 0000000..896ef65 --- /dev/null +++ b/apps/api/src/plugins/static-web.ts @@ -0,0 +1,82 @@ +/** + * Static-web plugin. + * + * Serves the built Vite SPA from `apps/web/dist` as a fallthrough for any + * request not handled by /api/*. Enabled only when CFP_WEB_DIST_PATH is set + * (which it is in the production Docker image; not in dev where Vite owns 5173). 
+ * + * Per specs/architecture.md: "A single Docker image bundles the built API and + * serves the static apps/web/dist from the same Fastify instance via + * @fastify/static. One container, one ingress." + * + * SPA fallback: any GET that 404s outside of /api/* is rewritten to /index.html + * so React Router v7 routes resolve client-side. + */ +import type { FastifyInstance, FastifyReply, FastifyRequest } from 'fastify'; +import fp from 'fastify-plugin'; +import fastifyStatic from '@fastify/static'; +import { existsSync } from 'node:fs'; +import { readFile } from 'node:fs/promises'; +import { join, resolve } from 'node:path'; + +function jsonNotFound(reply: FastifyReply): FastifyReply { + return reply.code(404).type('application/json').send({ + success: false, + error: { code: 'not_found', message: 'Not found' }, + }); +} + +async function staticWebPlugin(fastify: FastifyInstance): Promise { + const distPath = fastify.config.CFP_WEB_DIST_PATH; + + if (!distPath) { + // No SPA bundled (dev mode, tests). The 404 envelope still applies — the + // API contract is the same whether or not the SPA is co-located. + fastify.log.info('static-web: CFP_WEB_DIST_PATH unset — SPA fallthrough disabled'); + fastify.setNotFoundHandler((_request: FastifyRequest, reply: FastifyReply) => jsonNotFound(reply)); + return; + } + + const root = resolve(distPath); + if (!existsSync(root)) { + // Fail loud — production images bundle the SPA; missing files means a + // bad build, not a soft-skippable condition. + throw new Error(`static-web: CFP_WEB_DIST_PATH ${root} does not exist`); + } + + await fastify.register(fastifyStatic, { + root, + prefix: '/', + wildcard: false, + // Long cache for hashed assets in /assets/. The notFoundHandler below + // serves index.html with its own no-cache headers so SPA upgrades land + // promptly without re-cloning the bundle. 
+ // Cache-Control is chosen per file in setHeaders. A blanket immutable 1y + // policy would also cover index.html when it is served directly for `/` + // or `/index.html`, pinning browsers to a stale entry point. + cacheControl: false, + setHeaders: (res, filePath) => { + const hashed = filePath.startsWith(join(root, 'assets', '/')); + res.setHeader('cache-control', hashed ? 'public, max-age=31536000, immutable' : 'no-cache'); + }, + }); + + // Read index.html once at boot — it's small and avoids per-request disk IO. + // The fallback below serves it with no-cache because it is the file that + // decides which assets the browser loads next. + const indexHtml = await readFile(join(root, 'index.html'), 'utf8'); + + fastify.setNotFoundHandler((request: FastifyRequest, reply: FastifyReply) => { + // Only page loads (GET/HEAD) outside the API namespace get the SPA shell. + // Unknown /api routes, and any other method on an unknown path, keep the + // JSON 404 envelope instead of a misleading 200 HTML response. + const pathname = request.url.split('?', 1)[0] ?? ''; + const isPageLoad = request.method === 'GET' || request.method === 'HEAD'; + if (!isPageLoad || pathname === '/api' || pathname.startsWith('/api/')) { + return jsonNotFound(reply); + } + return reply + .code(200) + .type('text/html; charset=utf-8') + .header('cache-control', 'no-cache') + .send(indexHtml); + }); +} + +export default fp(staticWebPlugin, { + name: 'static-web', + fastify: '5.x', + dependencies: ['@fastify/env'], +}); diff --git a/apps/api/src/routes/health.ts b/apps/api/src/routes/health.ts index cbe3490..d74082e 100644 --- a/apps/api/src/routes/health.ts +++ b/apps/api/src/routes/health.ts @@ -44,6 +44,80 @@ export async function healthRoutes(fastify: FastifyInstance): Promise { }, ); + // Readiness probe: 200 only after both stores have loaded and services are + // wired. Used by k8s readinessProbe; never routes traffic to a pod whose + // in-memory state is still warming. 
+ fastify.get( + '/api/health/ready', + { + schema: { + tags: ['health'], + summary: 'Readiness check', + description: 'Returns 200 only when both stores have loaded and services are wired.', + response: { + 200: { + type: 'object', + properties: { + success: { type: 'boolean' }, + data: { + type: 'object', + properties: { + status: { type: 'string' }, + publicStore: { type: 'boolean' }, + privateStore: { type: 'boolean' }, + fts: { type: 'boolean' }, + }, + }, + metadata: { + type: 'object', + properties: { + timestamp: { type: 'string' }, + }, + }, + }, + }, + 503: { + type: 'object', + properties: { + success: { type: 'boolean' }, + error: { + type: 'object', + properties: { + code: { type: 'string' }, + message: { type: 'string' }, + }, + }, + }, + }, + }, + }, + }, + (_req, reply) => { + const publicStoreReady = Boolean(fastify.store?.public); + const privateStoreReady = Boolean(fastify.store?.private); + const ftsReady = Boolean(fastify.fts); + + if (!publicStoreReady || !privateStoreReady || !ftsReady) { + return reply.code(503).send({ + success: false, + error: { + code: 'not_ready', + message: 'Stores still warming', + }, + }); + } + + return reply.send( + ok({ + status: 'ready', + publicStore: publicStoreReady, + privateStore: privateStoreReady, + fts: ftsReady, + }), + ); + }, + ); + // Stub route for testing validation errors fastify.post( '/api/_test/validation-error', diff --git a/apps/api/tests/deploy.test.ts b/apps/api/tests/deploy.test.ts new file mode 100644 index 0000000..bfd47aa --- /dev/null +++ b/apps/api/tests/deploy.test.ts @@ -0,0 +1,123 @@ +/** + * Tests for the deploy plan validation criteria. 
+ * + * Covers: + * - GET /api/health/ready returns 200 with the store-readiness flags set + * - Static-web plugin disabled by default (no CFP_WEB_DIST_PATH): /api/* still + * returns the JSON 404 envelope and arbitrary paths 404 as JSON, not HTML + * - Static-web plugin with CFP_WEB_DIST_PATH set: arbitrary path falls + * through to index.html (SPA fallback); /api/* paths still 404 as JSON + */ +import { afterEach, beforeEach, describe, expect, it } from 'vitest'; + +import { mkdtemp, mkdir, rm, writeFile } from 'node:fs/promises'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; + +import type { FastifyInstance } from 'fastify'; +import { buildApp } from '../src/app.js'; +import { createFullDataRepo, createPrivateStorageDir } from './helpers/test-full-repo.js'; + +let dataRepo: { path: string; cleanup: () => Promise }; +let privateStore: { path: string; cleanup: () => Promise }; +let webDist: string | undefined; +let app: FastifyInstance | undefined; + +async function buildTestApp( + overrides: Partial> = {}, +): Promise { + return buildApp({ + serverOptions: { logger: false }, + overrideEnv: { + CFP_DATA_REPO_PATH: dataRepo.path, + STORAGE_BACKEND: 'filesystem', + CFP_PRIVATE_STORAGE_PATH: privateStore.path, + CFP_JWT_SIGNING_KEY: 'test-jwt-signing-key-at-least-32-chars!!', + NODE_ENV: 'test', + ...overrides, + }, + }); +} + +beforeEach(async () => { + dataRepo = await createFullDataRepo(); + privateStore = await createPrivateStorageDir(); +}); + +afterEach(async () => { + if (app) { + await app.close(); + app = undefined; + } + await dataRepo.cleanup(); + await privateStore.cleanup(); + if (webDist) { + await rm(webDist, { recursive: true, force: true }); + webDist = undefined; + } +}); + +describe('GET /api/health/ready', () => { + it('returns 200 with store-readiness flags after boot', async () => { + app = await buildTestApp(); + const res = await app.inject({ method: 'GET', url: '/api/health/ready' }); + + 
expect(res.statusCode).toBe(200); + const body = res.json<{ + success: boolean; + data: { status: string; publicStore: boolean; privateStore: boolean; fts: boolean }; + }>(); + expect(body.success).toBe(true); + expect(body.data.status).toBe('ready'); + expect(body.data.publicStore).toBe(true); + expect(body.data.privateStore).toBe(true); + expect(body.data.fts).toBe(true); + }); +}); + +describe('static-web plugin', () => { + it('is disabled when CFP_WEB_DIST_PATH is unset; /api/* 404s as JSON envelope', async () => { + app = await buildTestApp(); + + const res = await app.inject({ method: 'GET', url: '/api/does-not-exist' }); + expect(res.statusCode).toBe(404); + const body = res.json<{ success: boolean; error: { code: string } }>(); + expect(body.success).toBe(false); + expect(body.error.code).toBe('not_found'); + }); + + it('serves index.html for non-/api/* paths when CFP_WEB_DIST_PATH points at a valid dist', async () => { + webDist = await mkdtemp(join(tmpdir(), 'cfp-web-dist-')); + await mkdir(join(webDist, 'assets'), { recursive: true }); + await writeFile(join(webDist, 'index.html'), 'cfp'); + await writeFile(join(webDist, 'assets', 'app-deadbeef.js'), 'console.log("hi")'); + + app = await buildTestApp({ CFP_WEB_DIST_PATH: webDist }); + + // SPA fallback: arbitrary path serves index.html + const spaRes = await app.inject({ method: 'GET', url: '/projects/some-slug' }); + expect(spaRes.statusCode).toBe(200); + expect(spaRes.headers['content-type']).toMatch(/text\/html/); + expect(spaRes.body).toContain('cfp'); + expect(spaRes.headers['cache-control']).toContain('no-cache'); + + // Hashed asset served directly with long cache + const assetRes = await app.inject({ method: 'GET', url: '/assets/app-deadbeef.js' }); + expect(assetRes.statusCode).toBe(200); + expect(assetRes.body).toContain('console.log'); + + // /api/* unknown route still returns JSON 404, not HTML + const apiRes = await app.inject({ method: 'GET', url: '/api/does-not-exist' }); + 
expect(apiRes.statusCode).toBe(404); + expect(apiRes.headers['content-type']).toMatch(/application\/json/); + const apiBody = apiRes.json<{ success: boolean; error: { code: string } }>(); + expect(apiBody.success).toBe(false); + expect(apiBody.error.code).toBe('not_found'); + }); + + it('throws on boot when CFP_WEB_DIST_PATH points at a non-existent directory', async () => { + await expect( + buildTestApp({ CFP_WEB_DIST_PATH: '/nonexistent/cfp-web-dist-please-do-not-exist' }), + ).rejects.toThrow(/CFP_WEB_DIST_PATH/); + }); +}); diff --git a/deploy/charts/codeforphilly/Chart.yaml b/deploy/charts/codeforphilly/Chart.yaml new file mode 100644 index 0000000..1c5952c --- /dev/null +++ b/deploy/charts/codeforphilly/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +name: codeforphilly +description: CodeForPhilly modernization — Fastify + Vite/React + gitsheets, single replica +type: application +# version: chart SemVer — bumped per Chart change. +version: 0.1.0 +# appVersion: bumped per image release; CI overrides with --set image.tag. +appVersion: "0.0.0" +home: https://codeforphilly.org +sources: + - https://github.com/CodeForPhilly/codeforphilly-rewrite +maintainers: + - name: Code for Philly + url: https://codeforphilly.org diff --git a/deploy/charts/codeforphilly/templates/NOTES.txt b/deploy/charts/codeforphilly/templates/NOTES.txt new file mode 100644 index 0000000..639590f --- /dev/null +++ b/deploy/charts/codeforphilly/templates/NOTES.txt @@ -0,0 +1,26 @@ +{{- if .Values.ingress.enabled }} +Application installed. Public URL(s): +{{- range .Values.ingress.hosts }} + https://{{ .host }} +{{- end }} + +Health checks: + curl https://{{ (index .Values.ingress.hosts 0).host }}/api/health + curl https://{{ (index .Values.ingress.hosts 0).host }}/api/health/ready + +{{- else }} +Application installed (no ingress). Port-forward to test: + kubectl port-forward -n {{ .Release.Namespace }} svc/{{ include "codeforphilly.fullname" . 
}} 8080:{{ .Values.service.port }} + curl http://localhost:8080/api/health +{{- end }} + +Secrets contract: + Provide a k8s Secret named {{ (first (.Values.secretEnvFrom | default (list (dict "name" "codeforphilly-secrets")))).name | default "codeforphilly-secrets" }} + with the keys enumerated in docs/operations/deploy.md ("Secret contract"). + +{{- if .Values.deployKey.secretName }} +SSH deploy key: + Provide a Secret named {{ .Values.deployKey.secretName }} containing key + "id_ed25519" (PEM-encoded ed25519 private key with push access to + CFP_DATA_REMOTE). +{{- end }} diff --git a/deploy/charts/codeforphilly/templates/_helpers.tpl b/deploy/charts/codeforphilly/templates/_helpers.tpl new file mode 100644 index 0000000..5bfe66e --- /dev/null +++ b/deploy/charts/codeforphilly/templates/_helpers.tpl @@ -0,0 +1,50 @@ +{{- define "codeforphilly.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{- define "codeforphilly.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{- define "codeforphilly.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{- define "codeforphilly.labels" -}} +helm.sh/chart: {{ include "codeforphilly.chart" . }} +{{ include "codeforphilly.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{- define "codeforphilly.selectorLabels" -}} +app.kubernetes.io/name: {{ include "codeforphilly.name" . 
}} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{- define "codeforphilly.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} +{{- default (include "codeforphilly.fullname" .) .Values.serviceAccount.name -}} +{{- else -}} +{{- default "default" .Values.serviceAccount.name -}} +{{- end -}} +{{- end -}} + +{{- define "codeforphilly.dataPvcName" -}} +{{- default (printf "%s-data" (include "codeforphilly.fullname" .)) .Values.dataRepo.pvc.name -}} +{{- end -}} + +{{- define "codeforphilly.privatePvcName" -}} +{{- default (printf "%s-private" (include "codeforphilly.fullname" .)) .Values.privateStorage.pvc.name -}} +{{- end -}} diff --git a/deploy/charts/codeforphilly/templates/configmap.yaml b/deploy/charts/codeforphilly/templates/configmap.yaml new file mode 100644 index 0000000..9a2449d --- /dev/null +++ b/deploy/charts/codeforphilly/templates/configmap.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "codeforphilly.fullname" . }}-env + labels: + {{- include "codeforphilly.labels" . | nindent 4 }} +data: + # Static env values from values.yaml `env:` block. + {{- range $k, $v := .Values.env }} + {{ $k }}: {{ $v | quote }} + {{- end }} + STORAGE_BACKEND: {{ .Values.storage.backend | quote }} + CFP_DATA_BRANCH: {{ .Values.dataRepo.branch | quote }} + {{- range $k, $v := .Values.publicEnv }} + {{- if $v }} + {{ $k }}: {{ $v | quote }} + {{- end }} + {{- end }} + {{- if .Values.deployKey.secretName }} + # Tell git to use the mounted SSH key. accept-new keeps first-connect simple; + # populate deployKey.knownHostsConfigMap for strict host-key checking. 
+ GIT_SSH_COMMAND: "ssh -i {{ .Values.deployKey.mountPath }}/id_ed25519 -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new {{- if .Values.deployKey.knownHostsConfigMap }} -o UserKnownHostsFile={{ .Values.deployKey.mountPath }}/known_hosts{{- end }}" + {{- end }} diff --git a/deploy/charts/codeforphilly/templates/deployment.yaml b/deploy/charts/codeforphilly/templates/deployment.yaml new file mode 100644 index 0000000..4589a43 --- /dev/null +++ b/deploy/charts/codeforphilly/templates/deployment.yaml @@ -0,0 +1,123 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "codeforphilly.fullname" . }} + labels: + {{- include "codeforphilly.labels" . | nindent 4 }} +spec: + # replicas: 1 is a hard constraint per specs/architecture.md (in-process + # write mutex serializes mutations). + replicas: {{ .Values.replicaCount }} + strategy: + # Recreate (not RollingUpdate) so the old pod releases the write lock + # before the new one starts. Two pods writing to the same gitsheets repo + # would corrupt state. + type: Recreate + selector: + matchLabels: + {{- include "codeforphilly.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "codeforphilly.selectorLabels" . | nindent 8 }} + annotations: + # Roll the pod whenever the ConfigMap changes — values-driven env + # tweaks shouldn't require image bumps to take effect. + checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "codeforphilly.serviceAccountName" . }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . 
| nindent 8 }} + {{- end }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: api + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + ports: + - name: http + containerPort: 3001 + protocol: TCP + envFrom: + - configMapRef: + name: {{ include "codeforphilly.fullname" . }}-env + {{- range .Values.secretEnvFrom }} + - secretRef: + name: {{ .name }} + {{- end }} + livenessProbe: + httpGet: + path: {{ .Values.probes.liveness.path }} + port: http + initialDelaySeconds: {{ .Values.probes.liveness.initialDelaySeconds }} + periodSeconds: {{ .Values.probes.liveness.periodSeconds }} + timeoutSeconds: {{ .Values.probes.liveness.timeoutSeconds }} + failureThreshold: {{ .Values.probes.liveness.failureThreshold }} + readinessProbe: + httpGet: + path: {{ .Values.probes.readiness.path }} + port: http + initialDelaySeconds: {{ .Values.probes.readiness.initialDelaySeconds }} + periodSeconds: {{ .Values.probes.readiness.periodSeconds }} + timeoutSeconds: {{ .Values.probes.readiness.timeoutSeconds }} + failureThreshold: {{ .Values.probes.readiness.failureThreshold }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + volumeMounts: + {{- if .Values.dataRepo.pvc.enabled }} + - name: data + mountPath: {{ .Values.env.CFP_DATA_REPO_PATH | default "/app/data" }} + {{- end }} + {{- if and (eq .Values.storage.backend "filesystem") .Values.privateStorage.pvc.enabled }} + - name: private-storage + mountPath: {{ .Values.env.CFP_PRIVATE_STORAGE_PATH | default "/app/private-storage" }} + {{- end }} + {{- if .Values.deployKey.secretName }} + - name: deploy-key + mountPath: {{ .Values.deployKey.mountPath }} + readOnly: true + {{- end }} + volumes: + {{- if .Values.dataRepo.pvc.enabled }} + - name: data + persistentVolumeClaim: + claimName: {{ include "codeforphilly.dataPvcName" . 
}} + {{- end }} + {{- if and (eq .Values.storage.backend "filesystem") .Values.privateStorage.pvc.enabled }} + - name: private-storage + persistentVolumeClaim: + claimName: {{ include "codeforphilly.privatePvcName" . }} + {{- end }} + {{- if .Values.deployKey.secretName }} + - name: deploy-key + secret: + secretName: {{ .Values.deployKey.secretName }} + defaultMode: 0400 + items: + - key: id_ed25519 + path: id_ed25519 + {{- if .Values.deployKey.knownHostsConfigMap }} + - name: deploy-key-known-hosts + configMap: + name: {{ .Values.deployKey.knownHostsConfigMap }} + {{- end }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} diff --git a/deploy/charts/codeforphilly/templates/ingress.yaml b/deploy/charts/codeforphilly/templates/ingress.yaml new file mode 100644 index 0000000..87e3dbe --- /dev/null +++ b/deploy/charts/codeforphilly/templates/ingress.yaml @@ -0,0 +1,35 @@ +{{- if .Values.ingress.enabled -}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ include "codeforphilly.fullname" . }} + labels: + {{- include "codeforphilly.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if .Values.ingress.className }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- with .Values.ingress.tls }} + tls: + {{- toYaml . 
| nindent 4 }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + pathType: {{ .pathType }} + backend: + service: + name: {{ include "codeforphilly.fullname" $ }} + port: + number: {{ $.Values.service.port }} + {{- end }} + {{- end }} +{{- end }} diff --git a/deploy/charts/codeforphilly/templates/pvc-data.yaml b/deploy/charts/codeforphilly/templates/pvc-data.yaml new file mode 100644 index 0000000..41b3016 --- /dev/null +++ b/deploy/charts/codeforphilly/templates/pvc-data.yaml @@ -0,0 +1,17 @@ +{{- if .Values.dataRepo.pvc.enabled -}} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "codeforphilly.dataPvcName" . }} + labels: + {{- include "codeforphilly.labels" . | nindent 4 }} +spec: + accessModes: + - {{ .Values.dataRepo.pvc.accessMode }} + resources: + requests: + storage: {{ .Values.dataRepo.pvc.size }} + {{- if .Values.dataRepo.pvc.storageClass }} + storageClassName: {{ .Values.dataRepo.pvc.storageClass }} + {{- end }} +{{- end }} diff --git a/deploy/charts/codeforphilly/templates/pvc-private.yaml b/deploy/charts/codeforphilly/templates/pvc-private.yaml new file mode 100644 index 0000000..1be2a24 --- /dev/null +++ b/deploy/charts/codeforphilly/templates/pvc-private.yaml @@ -0,0 +1,17 @@ +{{- if and (eq .Values.storage.backend "filesystem") .Values.privateStorage.pvc.enabled -}} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "codeforphilly.privatePvcName" . }} + labels: + {{- include "codeforphilly.labels" . 
| nindent 4 }} +spec: + accessModes: + - {{ .Values.privateStorage.pvc.accessMode }} + resources: + requests: + storage: {{ .Values.privateStorage.pvc.size }} + {{- if .Values.privateStorage.pvc.storageClass }} + storageClassName: {{ .Values.privateStorage.pvc.storageClass }} + {{- end }} +{{- end }} diff --git a/deploy/charts/codeforphilly/templates/service.yaml b/deploy/charts/codeforphilly/templates/service.yaml new file mode 100644 index 0000000..e513ee6 --- /dev/null +++ b/deploy/charts/codeforphilly/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "codeforphilly.fullname" . }} + labels: + {{- include "codeforphilly.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: {{ .Values.service.targetPort }} + protocol: TCP + name: http + selector: + {{- include "codeforphilly.selectorLabels" . | nindent 4 }} diff --git a/deploy/charts/codeforphilly/templates/serviceaccount.yaml b/deploy/charts/codeforphilly/templates/serviceaccount.yaml new file mode 100644 index 0000000..290f510 --- /dev/null +++ b/deploy/charts/codeforphilly/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "codeforphilly.serviceAccountName" . }} + labels: + {{- include "codeforphilly.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/deploy/charts/codeforphilly/values.production.yaml b/deploy/charts/codeforphilly/values.production.yaml new file mode 100644 index 0000000..cdf5f24 --- /dev/null +++ b/deploy/charts/codeforphilly/values.production.yaml @@ -0,0 +1,47 @@ +# Production environment overrides. 
+# Apply with: +# helm upgrade --install codeforphilly deploy/charts/codeforphilly \ +# --namespace codeforphilly --create-namespace \ +# -f deploy/charts/codeforphilly/values.production.yaml \ +# --set image.tag= + +ingress: + hosts: + - host: codeforphilly.org + paths: + - path: / + pathType: Prefix + - host: www.codeforphilly.org + paths: + - path: / + pathType: Prefix + tls: + - secretName: codeforphilly-tls + hosts: + - codeforphilly.org + - www.codeforphilly.org + +# Production uses the S3-compatible bucket per specs/behaviors/private-storage.md. +storage: + backend: s3 + +# Concrete S3 endpoint + bucket are filled in once the production bucket is +# provisioned. Until then this file is the contract. +publicEnv: + CFP_DATA_REMOTE: git@github.com:CodeForPhilly/codeforphilly-data.git + # S3_ENDPOINT / S3_BUCKET / S3_REGION populated at provisioning time. + +deployKey: + secretName: codeforphilly-data-deploy-key + +resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: 2000m + memory: 1Gi + +# Tune heap for the production resource budget. +env: + NODE_OPTIONS: "--max-old-space-size=768" diff --git a/deploy/charts/codeforphilly/values.staging.yaml b/deploy/charts/codeforphilly/values.staging.yaml new file mode 100644 index 0000000..452a6cb --- /dev/null +++ b/deploy/charts/codeforphilly/values.staging.yaml @@ -0,0 +1,48 @@ +# Staging environment overrides. +# Apply with: +# helm upgrade --install codeforphilly-staging deploy/charts/codeforphilly \ +# --namespace codeforphilly-staging --create-namespace \ +# -f deploy/charts/codeforphilly/values.staging.yaml \ +# --set image.tag= + +ingress: + hosts: + - host: codeforphilly-rewrite-staging.k8s.phl.io + paths: + - path: / + pathType: Prefix + tls: + - secretName: codeforphilly-staging-tls + hosts: + - codeforphilly-rewrite-staging.k8s.phl.io + +# Staging starts on the filesystem private-store backend — no bucket required +# to stand up. 
Flip to s3 once a staging bucket is provisioned (see +# docs/operations/deploy.md#bucket-provisioning). +storage: + backend: filesystem + +privateStorage: + pvc: + enabled: true + size: 1Gi + +publicEnv: + # Point at the public scrubbed snapshot for staging by default so we never + # leak production data through the staging UI. Swap to the real data repo + # once cutover-prep wires staging to a real (still-private) data remote. + CFP_DATA_REMOTE: https://github.com/CodeForPhilly/codeforphilly-data-snapshot.git + +# Staging deploy key is optional — the snapshot repo is public, so an SSH key +# is only needed when CFP_DATA_REMOTE is the real (private) data remote. +deployKey: + secretName: "" + +# Smaller footprint for staging. +resources: + requests: + cpu: 50m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi diff --git a/deploy/charts/codeforphilly/values.yaml b/deploy/charts/codeforphilly/values.yaml new file mode 100644 index 0000000..8ecdc28 --- /dev/null +++ b/deploy/charts/codeforphilly/values.yaml @@ -0,0 +1,144 @@ +# Default values for the codeforphilly chart. +# Per specs/architecture.md: single replica, Recreate strategy. +# Per-environment overrides live in values.staging.yaml + values.production.yaml. + +image: + repository: ghcr.io/codeforphilly/codeforphilly-rewrite + # tag is set by CI via --set image.tag=. Leave blank in checked-in values. + tag: "" + pullPolicy: IfNotPresent + +imagePullSecrets: [] + +# Single replica is a hard architectural constraint (write mutex serializes +# gitsheets commits in-process). Recreate strategy guarantees no concurrent +# old/new pods can both hold the lock. 
+replicaCount: 1 + +nameOverride: "" +fullnameOverride: "" + +serviceAccount: + create: true + annotations: {} + name: "" + +podAnnotations: {} +podSecurityContext: + fsGroup: 1000 +securityContext: + runAsNonRoot: true + runAsUser: 1000 + runAsGroup: 1000 + readOnlyRootFilesystem: false # gitsheets writes the working tree + allowPrivilegeEscalation: false + capabilities: + drop: [ALL] + +service: + type: ClusterIP + port: 80 + targetPort: 3001 + +ingress: + enabled: true + className: nginx + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + hosts: + - host: codeforphilly-rewrite.example.invalid + paths: + - path: / + pathType: Prefix + tls: + - secretName: codeforphilly-tls + hosts: + - codeforphilly-rewrite.example.invalid + +resources: + requests: + cpu: 100m + memory: 384Mi + limits: + cpu: 1000m + memory: 768Mi + +# PVC holding the data repo working tree. Mounted at CFP_DATA_REPO_PATH. +dataRepo: + pvc: + enabled: true + name: "" # defaults to "-data" + accessMode: ReadWriteOnce + size: 5Gi + storageClass: "" # leave empty for cluster default + # Branch to track on CFP_DATA_REMOTE. + branch: main + +# Filesystem private-store PVC (only used when storage.backend=filesystem). +# In staging we start on filesystem; production should flip to s3 once a +# bucket exists. See specs/behaviors/private-storage.md. +privateStorage: + pvc: + enabled: false + name: "" # defaults to "-private" + accessMode: ReadWriteOnce + size: 1Gi + storageClass: "" + +# Storage backend selection. One of: s3 | filesystem. +storage: + backend: s3 + +# Non-secret config; secrets are injected via secretEnvFrom below. +env: + NODE_ENV: production + PORT: "3001" + CFP_DATA_REPO_PATH: /app/data + CFP_WEB_DIST_PATH: /app/apps/web/dist + CFP_PRIVATE_STORAGE_PATH: /app/private-storage + # Push daemon identity used for any commits the API makes. + GIT_AUTHOR_NAME: CodeForPhilly API + GIT_AUTHOR_EMAIL: api@codeforphilly.org + +# Optional non-secret env that depends on per-env wiring. 
+publicEnv: + CFP_DATA_REMOTE: "" # set in values.staging.yaml / values.production.yaml + S3_ENDPOINT: "" + S3_BUCKET: "" + S3_REGION: "" + GITHUB_OAUTH_CLIENT_ID: "" + +# Names of existing k8s Secrets (one per environment) to mount as envFrom. +# Each Secret should populate the variables enumerated in +# docs/operations/secrets.md. +secretEnvFrom: + - name: codeforphilly-secrets + +# SSH deploy key for pushing to CFP_DATA_REMOTE. The Secret must be created +# out-of-band (sealed-secrets) and contain a single key named "id_ed25519". +# Set deployKey.secretName="" to disable mounting (e.g. when CFP_DATA_REMOTE +# is https:// with a token in the URL). +deployKey: + secretName: codeforphilly-data-deploy-key + mountPath: /etc/cfp-data-deploy-key + knownHostsConfigMap: "" # optional ConfigMap holding known_hosts + +probes: + liveness: + path: /api/health + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 3 + failureThreshold: 3 + readiness: + path: /api/health/ready + # Boot loads gitsheets + private store before binding; first probe should + # already be 200, but give 90s headroom for cold cache. + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 18 + +nodeSelector: {} +tolerations: [] +affinity: {} diff --git a/deploy/docker/entrypoint.sh b/deploy/docker/entrypoint.sh new file mode 100755 index 0000000..cf30e61 --- /dev/null +++ b/deploy/docker/entrypoint.sh @@ -0,0 +1,73 @@ +#!/bin/sh +# CodeForPhilly API entrypoint. +# +# Per specs/architecture.md, on pod start: +# 1. Runs `git clone` / `git fetch && git reset --hard origin/` +# against CFP_DATA_REMOTE to populate the data-repo working tree. +# 2. 
exec node apps/api/dist/index.js
+#
+# Required env:
+#   CFP_DATA_REPO_PATH — local working-tree path (mounted PVC in k8s)
+#   CFP_DATA_REMOTE    — git URL to clone/fetch from
+# Optional env:
+#   CFP_DATA_BRANCH    — branch to track (default: main)
+#   GIT_SSH_COMMAND    — set by Helm when an SSH deploy key is mounted; usually
+#       `ssh -i /etc/cfp-data-deploy-key/id_ed25519 -o StrictHostKeyChecking=accept-new`
+#
+# Failure modes: any non-zero exit causes the container to crash. K8s restarts
+# it. Readiness probe stays 503 until /api/health/ready returns 200.
+
+set -eu
+
+# Emit one tagged diagnostic line on stderr.
+log() {
+  printf '[entrypoint] %s\n' "$*" >&2
+}
+
+# Mask any userinfo (user, password, token) embedded in a URL so it never
+# lands in the pod logs. scp-style SSH remotes (git@host:path) contain no
+# `://` and pass through unchanged.
+redact_url() {
+  printf '%s\n' "$1" | sed 's#://[^/@]*@#://***@#'
+}
+
+: "${CFP_DATA_REPO_PATH:?CFP_DATA_REPO_PATH must be set}"
+
+DATA_BRANCH="${CFP_DATA_BRANCH:-main}"
+
+if [ -z "${CFP_DATA_REMOTE:-}" ]; then
+  # Without a remote, only an already-populated working tree is usable.
+  if [ -d "$CFP_DATA_REPO_PATH/.git" ]; then
+    log "CFP_DATA_REMOTE unset; using existing working tree at $CFP_DATA_REPO_PATH"
+  else
+    log "ERROR: CFP_DATA_REMOTE is unset and $CFP_DATA_REPO_PATH is not a git repo"
+    exit 1
+  fi
+else
+  mkdir -p "$CFP_DATA_REPO_PATH"
+  cd "$CFP_DATA_REPO_PATH"
+
+  if [ -d .git ]; then
+    log "refreshing existing data repo at $CFP_DATA_REPO_PATH (branch=$DATA_BRANCH)"
+  else
+    log "cloning $(redact_url "$CFP_DATA_REMOTE") into $CFP_DATA_REPO_PATH (branch=$DATA_BRANCH)"
+    # Initialise in place instead of running `git clone`: clone refuses a
+    # non-empty target directory, and a freshly formatted volume can already
+    # contain entries such as lost+found.
+    git -c init.defaultBranch="$DATA_BRANCH" init --quiet .
+  fi
+
+  # Point origin at CFP_DATA_REMOTE in case it was rotated. Add the remote when
+  # it is missing (fresh init, or an earlier boot that died right after init);
+  # `-t` tracks only the data branch, like a single-branch clone would.
+  if git remote get-url origin >/dev/null 2>&1; then
+    git remote set-url origin "$CFP_DATA_REMOTE"
+  else
+    git remote add -t "$DATA_BRANCH" origin "$CFP_DATA_REMOTE"
+  fi
+
+  # --depth=1 keeps the PVC footprint small; the push daemon will deepen as
+  # needed when it next pushes (or we accept periodic re-clones).
+  # The explicit refspec makes sure origin/$DATA_BRANCH is written even when
+  # the remote's configured fetch refspec tracks a different branch (for
+  # example after CFP_DATA_BRANCH was changed on an existing volume).
+  git fetch --prune --depth=1 origin \
+    "+refs/heads/$DATA_BRANCH:refs/remotes/origin/$DATA_BRANCH"
+  git checkout -B "$DATA_BRANCH" "origin/$DATA_BRANCH"
+  # NOTE: this discards local changes and any commits not on the remote branch.
+  git reset --hard "origin/$DATA_BRANCH"
+  cd - >/dev/null
+fi
+
+# Identity for any commits the API makes (the gitsheets writer commits per
+# mutation). Override via env in Helm values if you want per-environment
+# identities.
+: "${GIT_AUTHOR_NAME:=CodeForPhilly API}" +: "${GIT_AUTHOR_EMAIL:=api@codeforphilly.org}" +: "${GIT_COMMITTER_NAME:=$GIT_AUTHOR_NAME}" +: "${GIT_COMMITTER_EMAIL:=$GIT_AUTHOR_EMAIL}" +export GIT_AUTHOR_NAME GIT_AUTHOR_EMAIL GIT_COMMITTER_NAME GIT_COMMITTER_EMAIL + +cd "$CFP_DATA_REPO_PATH" +git config user.name "$GIT_AUTHOR_NAME" +git config user.email "$GIT_AUTHOR_EMAIL" +cd - >/dev/null + +log "data repo ready; starting API" +exec "$@" diff --git a/docs/operations/deploy.md b/docs/operations/deploy.md new file mode 100644 index 0000000..f65b929 --- /dev/null +++ b/docs/operations/deploy.md @@ -0,0 +1,276 @@ +# Deploying codeforphilly-rewrite + +This guide covers the artifacts in [`deploy/`](../../deploy/), the boot sequence +inside the container, and the operational expectations of the staging and +production environments. The authoritative architectural contract is +[specs/architecture.md](../../specs/architecture.md#deploy); this document is +the runbook that implements it. + +> See also: [secrets.md](secrets.md) for the secret contract, [runbook.md](runbook.md) +> for incident response. + +## TL;DR — anatomy + +``` ++----------------------+ +| GitHub Actions CI | deploy-staging.yml / deploy-production.yml ++----------+-----------+ + | docker build / push + v ++----------------------+ +| GHCR image | ghcr.io/codeforphilly/codeforphilly-rewrite: ++----------+-----------+ + | helm upgrade --install + v ++----------------------+ +| k8s Deployment | 1 replica, Recreate strategy, PVC + Secrets + ConfigMap +| (api + spa) | ++----------+-----------+ + | + /api/* v /* (fallthrough) ++---------------+ +-----------------------+ +| Fastify routes | | apps/web/dist (SPA) | ++----------------+ +-----------------------+ +``` + +The image holds **both** the API and the built SPA. There is no separate web +container. The single replica is a hard architectural constraint +([specs/architecture.md](../../specs/architecture.md#process-model)). 
+ +## Image + +### Build + +```bash +docker build -t ghcr.io/codeforphilly/codeforphilly-rewrite:dev . +``` + +Three stages — `deps` (full install), `build` (compile both workspaces, prune +dev deps), `runtime` (alpine + git + ca-certificates + tini). Final image runs +as `node` (uid 1000) per the `securityContext` in the Helm chart. + +### Run (local smoke test) + +```bash +docker run --rm -p 3001:3001 \ + -e CFP_DATA_REMOTE=https://github.com/CodeForPhilly/codeforphilly-data-snapshot.git \ + -e STORAGE_BACKEND=filesystem \ + -e CFP_PRIVATE_STORAGE_PATH=/app/private-storage \ + -e CFP_JWT_SIGNING_KEY="$(openssl rand -base64 48)" \ + -e GITHUB_OAUTH_CLIENT_ID=local \ + -e GITHUB_OAUTH_CLIENT_SECRET=local \ + ghcr.io/codeforphilly/codeforphilly-rewrite:dev + +curl http://localhost:3001/api/health # liveness +curl http://localhost:3001/api/health/ready # readiness +curl http://localhost:3001/ # SPA index.html +``` + +## Boot sequence + +The container entrypoint (`deploy/docker/entrypoint.sh`) does, in order: + +1. Validate `CFP_DATA_REPO_PATH` is set. +2. If `CFP_DATA_REMOTE` is set: + - If the target is already a git repo, `git fetch` + `git reset --hard origin/`. + - Otherwise `git clone --depth=1 --branch `. +3. Configure git author identity on the local repo (so any commit the API + makes carries `GIT_AUTHOR_NAME`/`GIT_AUTHOR_EMAIL`). +4. `exec node apps/api/dist/index.js`. + +Inside node, `buildApp()` then registers plugins in order +([apps/api/src/app.ts](../../apps/api/src/app.ts)): env validation → CORS → +cookies → trace IDs → error mapper → **store boot (loads public + private into +memory)** → services (FTS) → rate limit → idempotency → session middleware → +swagger → routes → static SPA. The Fastify `listen()` call doesn't fire until +all of those resolve, so by the time `/api/health/ready` can be hit, both +stores have loaded. + +This matches the boot-order section of [deploy.md plan](../../plans/deploy.md). 
+ +## Helm chart + +Chart lives at [`deploy/charts/codeforphilly/`](../../deploy/charts/codeforphilly/). +Three values files: + +- `values.yaml` — defaults (1 replica, Recreate, PVC for the data repo, S3 backend, ingress with cert-manager) +- `values.staging.yaml` — staging host, filesystem private store, scrubbed-snapshot data remote +- `values.production.yaml` — production hosts, S3 private store, real data remote, SSH deploy key + +### Install + +```bash +# Staging (first time) +kubectl create namespace codeforphilly-staging +kubectl -n codeforphilly-staging apply -f path/to/staging-secrets.yaml # see secrets.md +helm upgrade --install codeforphilly-staging \ + deploy/charts/codeforphilly \ + --namespace codeforphilly-staging \ + -f deploy/charts/codeforphilly/values.staging.yaml \ + --set image.tag=sha- + +# Production (first time) +kubectl create namespace codeforphilly +kubectl -n codeforphilly apply -f path/to/production-secrets.yaml +helm upgrade --install codeforphilly \ + deploy/charts/codeforphilly \ + --namespace codeforphilly \ + -f deploy/charts/codeforphilly/values.production.yaml \ + --set image.tag=v +``` + +### What the chart provisions + +| Resource | Purpose | +|----------|---------| +| `Deployment` | 1 replica, `Recreate` strategy, mounts PVC at `/app/data` | +| `Service` (ClusterIP) | Fronts the pod on port 80 → container 3001 | +| `Ingress` | nginx + cert-manager; staging + production hosts | +| `PersistentVolumeClaim` (data) | Working tree for the gitsheets data repo (5Gi default) | +| `PersistentVolumeClaim` (private, staging) | Local jsonl store when `storage.backend=filesystem` | +| `ConfigMap` | Non-secret env (`NODE_ENV`, paths, `CFP_DATA_REMOTE`, etc.) | +| `ServiceAccount` | Empty default — no in-cluster API access needed | + +Secrets are **not** templated in the chart. They are created out-of-band — see +[secrets.md](secrets.md). + +### Probes + +- **Liveness** — `GET /api/health` every 10s. 
The pod is killed only after + three consecutive failures (~30s). +- **Readiness** — `GET /api/health/ready` every 5s. Returns 503 until the + store plugins have finished decorating Fastify (gitsheets working tree + cloned + private store loaded). Once green, ingress routes traffic. + +## CI/CD + +Two deploy workflows in `.github/workflows/`: + +- `deploy-staging.yml` — triggered on push to `main`. Builds + pushes the + image tagged `sha-` and `staging-latest`, then `helm upgrade --install` + to `codeforphilly-staging`. Gated by GitHub Environment "staging" (first + run requires manual approval; secrets are scoped per-environment). +- `deploy-production.yml` — triggered on tag push matching `v*.*.*`. Same + build, deploys to namespace `codeforphilly`. Gated by Environment + "production" — every deploy goes through an approval gate. + +Both use `--atomic --wait --timeout 5m` so a failed rollout auto-reverts. + +### GitHub Environment secrets + +| Environment | Secret | Purpose | +|-------------|--------|---------| +| staging | `KUBECONFIG_STAGING` | base64-encoded kubeconfig with rights only in `codeforphilly-staging` | +| production | `KUBECONFIG_PRODUCTION` | base64-encoded kubeconfig with rights only in `codeforphilly` | + +The kubeconfigs should be scoped to the namespace via RBAC — the service +account they reference should not have cluster-admin. + +## Data repo on disk + +In production the API operates on a working tree at `/app/data` backed by a +PVC. On every boot the entrypoint refreshes that tree from `CFP_DATA_REMOTE` +(`git fetch && git reset --hard`). The push daemon then pushes commits made +during the pod's lifetime back to the remote. + +Implications: + +- **PVC contents are ephemeral.** Killing the pod and recreating it does + *not* lose data because the source of truth is the git remote, not the + PVC. The PVC just avoids re-cloning on every restart. 
+- **The deploy key matters.** If `CFP_DATA_REMOTE` is SSH (the production + default), the entrypoint relies on `GIT_SSH_COMMAND` (rendered into the + ConfigMap) pointing at the mounted private key. Rotation: replace the + Secret, restart the pod. See [secrets.md](secrets.md#data-repo-deploy-key). + +## Bucket provisioning + +Production uses an S3-compatible bucket for private storage +([specs/behaviors/private-storage.md](../../specs/behaviors/private-storage.md)). +The bucket is **not** Helm-managed — it's provisioned out-of-band and the +Helm chart just consumes the credentials. + +Recommended provider: **Cloudflare R2** (zero egress, pennies per month, +S3-compatible API). Backblaze B2 or AWS S3 also work. MinIO inside the +cluster is acceptable for cost reasons but trades operational simplicity +for storage simplicity. + +Required bucket configuration: + +- **Versioning enabled.** Hard requirement per + [private-storage.md](../../specs/behaviors/private-storage.md#bucket-requirements). + Every PUT increments the object's version; the previous `.jsonl` is + recoverable. Verify with `aws s3api get-bucket-versioning`. +- **Lifecycle rule** deleting non-current versions after 365 days. +- **IAM policy** scoped to the bucket only — `s3:GetObject`, + `s3:PutObject`, `s3:ListBucket`, `s3:GetObjectVersion`. No cross-bucket + access; no console access for the service principal. +- **Endpoint URL** plugged into `S3_ENDPOINT` (Helm `publicEnv.S3_ENDPOINT`). +- **Bucket name** plugged into `S3_BUCKET`. +- **Region** (or a placeholder R2 region) into `S3_REGION`. +- **Access keys** stored in the `codeforphilly-secrets` Secret as + `S3_ACCESS_KEY_ID` and `S3_SECRET_ACCESS_KEY`. + +Two physical surfaces: one bucket for staging, one for production. Or one +bucket with two prefixes (`staging/profiles.jsonl`, `prod/profiles.jsonl`) +if cost is tight — the path string is configurable via the private-store +implementation but conventionally we use separate buckets. 
+ +Until a real bucket exists, staging runs on `storage.backend=filesystem` +backed by a PVC — see `values.staging.yaml`. The cutover from filesystem +to S3 is a values change only; the in-memory model is identical. + +## Environment variables (reference) + +The runtime contract. See [`.env.example`](../../.env.example) for the +exhaustive list with comments; the table below tracks what gets *mounted* +into a production pod. + +| Variable | Source | Notes | +|----------|--------|-------| +| `NODE_ENV` | ConfigMap | `production` | +| `PORT` | ConfigMap | `3001` | +| `HOST` | ConfigMap | `0.0.0.0` | +| `CFP_DATA_REPO_PATH` | ConfigMap | `/app/data` (PVC mount) | +| `CFP_DATA_REMOTE` | ConfigMap | git URL (ssh in prod, https for snapshot) | +| `CFP_DATA_BRANCH` | ConfigMap | `main` | +| `CFP_WEB_DIST_PATH` | Dockerfile ENV | `/app/apps/web/dist` | +| `STORAGE_BACKEND` | ConfigMap | `s3` (prod) / `filesystem` (staging) | +| `CFP_PRIVATE_STORAGE_PATH` | ConfigMap | `/app/private-storage` (when filesystem) | +| `S3_ENDPOINT` / `S3_BUCKET` / `S3_REGION` | ConfigMap | Bucket addressing | +| `S3_ACCESS_KEY_ID` / `S3_SECRET_ACCESS_KEY` | **Secret** | Bucket credentials | +| `GITHUB_OAUTH_CLIENT_ID` | ConfigMap | OAuth app client ID | +| `GITHUB_OAUTH_CLIENT_SECRET` | **Secret** | OAuth app client secret | +| `CFP_JWT_SIGNING_KEY` | **Secret** | HS256 key (`openssl rand -base64 64`) | +| `SAML_PRIVATE_KEY` / `SAML_CERTIFICATE` | **Secret** | Slack IdP cert chain | +| `GIT_SSH_COMMAND` | ConfigMap (rendered) | Wires `ssh` to the mounted deploy key | +| `GIT_AUTHOR_NAME` / `GIT_AUTHOR_EMAIL` | ConfigMap | Identity on push-daemon commits | + +## Rollback + +```bash +# Roll back to the previous Helm release +helm rollback codeforphilly-staging --namespace codeforphilly-staging + +# Or pin to a specific image +helm upgrade codeforphilly-staging deploy/charts/codeforphilly \ + --namespace codeforphilly-staging \ + --reuse-values \ + --set image.tag=sha- +``` + +Note: because every 
commit/mutation is pushed to the data remote by the push daemon, +rolling the container back is *not* a data rollback. Data rollback is `git +revert` on the data repo. + +## Known unknowns + +- **Cluster choice.** Plan assumes the existing CFP k8s cluster (`k8s.phl.io`). + If a different cluster is targeted, regenerate `KUBECONFIG_STAGING` / + `KUBECONFIG_PRODUCTION` and update the ingress hosts. +- **First staging stand-up.** Provisioning the namespace + creating the + per-environment Secrets is a one-time human operation. The first + `helm upgrade --install` requires those Secrets to already exist. +- **MinIO option.** If the cluster doesn't have an S3 provider available, + add a MinIO subchart under `deploy/charts/codeforphilly/charts/`. Out of + scope for v1. diff --git a/docs/operations/runbook.md b/docs/operations/runbook.md new file mode 100644 index 0000000..32d395b --- /dev/null +++ b/docs/operations/runbook.md @@ -0,0 +1,107 @@ +# Runbook + +On-call playbooks for the codeforphilly-rewrite production service. + +## "API won't boot" + +Symptoms: pod CrashLoopBackOff, `kubectl describe pod` shows the container +restarting, no `/api/health` response. + +### 1. Read the logs first + +```bash +kubectl -n codeforphilly logs deploy/codeforphilly --previous +kubectl -n codeforphilly logs deploy/codeforphilly --tail=200 +``` + +Look for one of the five common boot failures: + +| Log line excerpt | Cause | Fix | +|------------------|-------|-----| +| `[entrypoint] ERROR: CFP_DATA_REMOTE is unset` | The PVC was wiped and the chart isn't providing the remote URL. | Check ConfigMap `<fullname>-env`; ensure `publicEnv.CFP_DATA_REMOTE` is set in the active values file. | +| `fatal: could not read Username for 'https://...'` or `Permission denied (publickey)` | Bad/missing data-repo credentials. | Verify the `codeforphilly-data-deploy-key` Secret holds a valid `id_ed25519` whose public key has push access to the data repo. See [secrets.md](secrets.md#data-repo-deploy-key).
| +| `Failed to open public gitsheets store` | Working tree corrupt or missing `.gitsheets/` configs. | Exec into the pod, inspect `/app/data/.gitsheets/`. Recovery: wipe the PVC and let the entrypoint re-clone (`kubectl delete pvc <fullname>-data` → recreate via `helm upgrade`). | +| `Failed to load private store (s3)` | Bucket creds wrong, bucket gone, or network ACL blocks egress. | Confirm `S3_*` env in the ConfigMap + Secret. From the pod, `curl $S3_ENDPOINT` to confirm reachability. | +| `environment variable ... is required` | A required env (`CFP_DATA_REPO_PATH`, `STORAGE_BACKEND`, `CFP_JWT_SIGNING_KEY`) is missing. | Helm values regression. Compare against `values.production.yaml`. | + +### 2. Drop into the pod (if it stays up long enough) + +```bash +kubectl -n codeforphilly debug -it deploy/codeforphilly \ + --image=alpine --target=api -- sh +``` + +From inside: + +```bash +# Is the data repo really there? +ls -la /app/data /app/data/.gitsheets + +# Are env vars present? +env | grep -E '^(CFP_|S3_|STORAGE_|GITHUB_)' | sort + +# Can we reach the bucket? +apk add --no-cache curl +curl -v "$S3_ENDPOINT" + +# Can we reach the data remote? +git ls-remote "$CFP_DATA_REMOTE" 2>&1 | head +``` + +### 3. Last-resort recovery + +If the cluster state is unrecoverable but the data remote is intact: + +```bash +# Roll back to the last-known-good Helm release +helm -n codeforphilly history codeforphilly +helm -n codeforphilly rollback codeforphilly <revision> + +# Or pin to a previous image +helm upgrade codeforphilly deploy/charts/codeforphilly \ + --namespace codeforphilly \ + --reuse-values \ + --set image.tag=<previous-tag> +``` + +Data is **not** in the PVC long-term; it's in the git remote. Deleting the +PVC and letting the entrypoint re-clone is safe once every local commit has been pushed; commits the push daemon has not pushed yet exist only on the PVC. + +## "Readiness flapping / 503 spikes" + +Readiness probe (`/api/health/ready`) returns 503 only when the store +decorators are missing — that only happens during boot.
Mid-life flapping +likely means: + +- Liveness probe (`/api/health`) failed and k8s is restarting the pod. Look + at the previous logs. +- Memory pressure → OOMKilled. Bump `resources.limits.memory` or lower + `NODE_OPTIONS=--max-old-space-size`. + +## "Mutations succeed in UI but don't appear on GitHub" + +Push daemon failure. Check logs for git push errors. Common causes: + +- Deploy key removed/expired — see [secrets.md](secrets.md#data-repo-deploy-key). +- Remote branch protection rejecting the push. +- Network egress blocked. + +The local working tree continues to accept writes — it's only the +asynchronous mirror to GitHub that's broken. Once fixed, the daemon will +push the backlog. Until it has, avoid restarting or deleting the pod: on boot the entrypoint runs `git reset --hard` against the remote branch, which discards local commits that were never pushed. + +## Helpful commands + +```bash +# Watch a deploy +kubectl -n codeforphilly rollout status deploy/codeforphilly + +# Last 10 Helm releases +helm -n codeforphilly history codeforphilly + +# Pod resource use +kubectl -n codeforphilly top pod + +# Force a config reload +kubectl -n codeforphilly rollout restart deploy/codeforphilly +``` diff --git a/docs/operations/secrets.md b/docs/operations/secrets.md new file mode 100644 index 0000000..d3370e2 --- /dev/null +++ b/docs/operations/secrets.md @@ -0,0 +1,203 @@ +# Secret management + +Every secret consumed by codeforphilly-rewrite at runtime, how to generate it, +how it gets into the cluster, and how to rotate it. + +> See [deploy.md](deploy.md) for how the Deployment consumes these. See +> [specs/architecture.md](../../specs/architecture.md#deploy) for the env-var +> contract this implements. + +## Principles + +1. **Never in the image.** Secrets are mounted at run-time. The Dockerfile + carries zero credentials. +2. **Never in git.** Use [sealed-secrets](https://github.com/bitnami-labs/sealed-secrets) + (cluster default) or a SOPS-encrypted file. The plaintext only exists on + the machines that generated it. +3. **Scoped.** Each secret is granted the minimum surface needed + (per-namespace, per-environment).
No "infra" secret used for multiple + purposes. +4. **Rotatable.** Every secret in this doc has a rotation procedure that does + not require a code change. + +## Where they live in the cluster + +The Helm chart consumes secrets from two places: + +| Secret name (default) | Mount mechanism | Holds | +|-----------------------|-----------------|-------| +| `codeforphilly-secrets` | `envFrom: secretRef` (entire Secret becomes env) | All env-var secrets | +| `codeforphilly-data-deploy-key` | Volume-mounted, one file | SSH private key for the data repo | + +Both names are overridable via Helm values (`secretEnvFrom[].name`, +`deployKey.secretName`). + +## Inventory + +### `CFP_JWT_SIGNING_KEY` + +HS256 key for stateless session JWTs. + +- **Generate:** + + ```bash + openssl rand -base64 64 + ``` + +- **Rotation impact:** every active session is invalidated. Users have to + sign in again. Plan rotations during low-traffic windows; do not rotate + during launches. +- **Rotation procedure:** generate new value → update the sealed-secret → + `kubectl rollout restart deployment/codeforphilly` → users re-auth. +- **Cadence:** every 12 months, plus immediately on suspected leak. + +### `GITHUB_OAUTH_CLIENT_SECRET` + +Client secret for the GitHub OAuth app. **One app per environment** — a +separate app for staging and production, each with its own callback URL. + +- **Generate:** Rotate via the GitHub OAuth app settings page + (`https://github.com/settings/developers` → app → "Generate a new client + secret"). GitHub never reveals the old secret again. +- **Companion config:** `GITHUB_OAUTH_CLIENT_ID` is non-secret and lives in + the ConfigMap (`publicEnv.GITHUB_OAUTH_CLIENT_ID`). +- **Rotation impact:** in-flight OAuth callbacks fail. Existing sessions are + unaffected (the secret is only used during the OAuth handshake). +- **Rotation procedure:** issue new secret in GitHub → update sealed-secret → + `kubectl rollout restart`. 
+- **Cadence:** every 12 months, plus immediately on suspected leak. + +### `S3_ACCESS_KEY_ID` / `S3_SECRET_ACCESS_KEY` + +Credentials for the private-storage bucket. The IAM policy attached to +these credentials must be scoped to the single bucket per +[deploy.md](deploy.md#bucket-provisioning). + +- **Generate:** Provider console (R2 → API tokens; B2 → application keys; + AWS → IAM access key). +- **Rotation impact:** if the old key is revoked before the pod restarts + with the new one, PUTs and GETs to the private store fail and newsletter + signups and account claims return `private_store_unavailable` (5xx). +- **Rotation procedure:** + 1. Provision a *second* access key in the bucket provider. + 2. Update the sealed-secret with the new value. + 3. `kubectl rollout restart deployment/codeforphilly` — pod boots with + new keys. + 4. Verify `/api/health/ready` returns 200. + 5. Revoke the old access key in the bucket provider. +- **Cadence:** every 6 months, plus immediately on suspected leak. + +### `SAML_PRIVATE_KEY` / `SAML_CERTIFICATE` + +PEM-encoded private key and matching self-signed certificate used to sign +SAML assertions for the Slack IdP integration ([specs/api/saml.md](../../specs/api/saml.md)). + +- **Generate:** the openssl recipe documented in the legacy repo at + `laddr/docs/operations/update-saml2-certificate.md`: + + ```bash + openssl req -x509 -newkey rsa:2048 -days 1095 -nodes \ + -keyout saml-private.pem \ + -out saml-certificate.pem \ + -subj "/CN=codeforphilly.org SAML IdP" + ``` + +- **Rotation impact:** Slack stops trusting assertions until its IdP config + is updated with the new cert. **Do not rotate without coordinating with + the Slack workspace admin.** +- **Rotation procedure:** + 1. Generate new key + cert. + 2. Upload the *new cert* to Slack as a secondary signing cert. + 3. Update the sealed-secret with the new key + cert. + 4. `kubectl rollout restart`. + 5. Test SAML SSO from a clean browser. + 6. Once verified, remove the old cert from Slack.
+- **Cadence:** every 36 months (cert expiry), plus immediately on + suspected leak. + +### Data-repo deploy key + +SSH ed25519 private key with **write** access to the `codeforphilly-data` +repo on GitHub. Mounted as a file at `/etc/cfp-data-deploy-key/id_ed25519`; +the entrypoint sets `GIT_SSH_COMMAND` to use it. + +- **Generate:** + + ```bash + ssh-keygen -t ed25519 -f cfp-data-deploy -C "codeforphilly k8s deploy" + ``` + + Add the *public* key (`cfp-data-deploy.pub`) to the data repo's + Settings → Deploy keys with "Allow write access" checked. +- **Rotation impact:** push daemon will fail to push commits until the new + key is mounted. Reads/writes to the in-memory state continue; the + inability to push surfaces as a backlog of unpushed commits on the PVC. +- **Rotation procedure:** + 1. Generate new keypair. + 2. Add new public key to the data repo (alongside the existing one). + 3. Update the sealed-secret with the new private key. + 4. `kubectl rollout restart`. + 5. Verify a test mutation reaches the remote. + 6. Remove the old deploy key from the data repo. +- **Cadence:** every 12 months, plus immediately on team turnover or + suspected leak. + +## Bootstrapping a new environment + +First-time setup of `codeforphilly-staging` or `codeforphilly`: + +```bash +# 1. Create namespace +kubectl create namespace codeforphilly-staging + +# 2. Generate all secret values locally (run `mkdir -p .secrets` first) +openssl rand -base64 64 > .secrets/jwt +ssh-keygen -t ed25519 -f .secrets/deploy -N "" +# ... GitHub OAuth secret from GitHub UI, S3 keys from R2 console, SAML key + cert (openssl recipe above) written to .secrets/ ... + +# 3.
Build the Secret manifests +kubectl create secret generic codeforphilly-secrets \ + --namespace codeforphilly-staging \ + --from-literal=CFP_JWT_SIGNING_KEY="$(cat .secrets/jwt)" \ + --from-literal=GITHUB_OAUTH_CLIENT_SECRET="$GH_SECRET" \ + --from-literal=S3_ACCESS_KEY_ID="$S3_ID" \ + --from-literal=S3_SECRET_ACCESS_KEY="$S3_KEY" \ + --from-literal=SAML_PRIVATE_KEY="$(cat .secrets/saml-private.pem)" \ + --from-literal=SAML_CERTIFICATE="$(cat .secrets/saml-certificate.pem)" \ + --dry-run=client -o yaml \ + | kubeseal --format yaml > deploy/secrets/staging-secrets.sealed.yaml + +kubectl create secret generic codeforphilly-data-deploy-key \ + --namespace codeforphilly-staging \ + --from-file=id_ed25519=.secrets/deploy \ + --dry-run=client -o yaml \ + | kubeseal --format yaml > deploy/secrets/staging-deploy-key.sealed.yaml + +# 4. Apply +kubectl apply -f deploy/secrets/staging-secrets.sealed.yaml +kubectl apply -f deploy/secrets/staging-deploy-key.sealed.yaml + +# 5. Wipe plaintext +shred -u .secrets/* + +# 6. Helm install +helm upgrade --install codeforphilly-staging deploy/charts/codeforphilly \ + --namespace codeforphilly-staging \ + -f deploy/charts/codeforphilly/values.staging.yaml \ + --set image.tag="sha-<commit-sha>" # replace with the image tag CI pushed +``` + +The sealed `.yaml` files are safe to commit; they can only be decrypted by +the sealed-secrets controller in the matching cluster. + +## What's *not* a secret + +Listed because operators ask: + +- `GITHUB_OAUTH_CLIENT_ID` — public by design; GitHub exposes it during + every OAuth flow. +- `S3_ENDPOINT` / `S3_BUCKET` / `S3_REGION` — public addressing info. +- `CFP_DATA_REMOTE` — just the repo URL. The *access* (deploy key) is secret; the + URL itself isn't. +- `SAML_CERTIFICATE` (the public cert) is technically published to Slack + anyway, but we keep it next to the private key for atomic rotation.
diff --git a/package-lock.json b/package-lock.json index febc7b2..b273a60 100644 --- a/package-lock.json +++ b/package-lock.json @@ -34,6 +34,7 @@ "@fastify/cors": "^11.2.0", "@fastify/env": "^6.0.0", "@fastify/rate-limit": "^10.3.0", + "@fastify/static": "^9.1.3", "@fastify/swagger": "^9.7.0", "@fastify/swagger-ui": "^5.2.6", "better-sqlite3": "^12.10.0", diff --git a/plans/deploy.md b/plans/deploy.md index ebe0b90..3bbbbb3 100644 --- a/plans/deploy.md +++ b/plans/deploy.md @@ -1,9 +1,10 @@ --- -status: planned +status: done depends: [storage-foundation] specs: - specs/architecture.md -issues: [] +issues: [36] +pr: 35 --- # Plan: Deploy @@ -133,16 +134,16 @@ Pino's structured JSON logs go to stdout; k8s log aggregator captures them. Add ## Validation - [ ] `docker build .` produces an image; `docker run` boots the API -- [ ] The same image serves both `/api/*` and the static SPA +- [x] The same image serves both `/api/*` and the static SPA - [ ] `helm install` to a staging namespace boots the deployment cleanly - [ ] Ingress + TLS works (verified by hitting `https://codeforphilly-rewrite-staging.k8s.phl.io/api/health` from outside) - [ ] The data repo PVC persists across pod restarts (verify by killing the pod and observing the API comes back without re-cloning) - [ ] The push daemon successfully pushes a test commit to the data remote (using the deploy key) - [ ] The S3-backed PrivateStore reads/writes against the production bucket; bucket versioning works (verify a PUT increments the version) -- [ ] Readiness probe returns 200 only after both stores load (verify by intentionally pointing at an empty bucket; readiness fails until populated) -- [ ] CI workflows pass and produce deployable artifacts +- [x] Readiness probe returns 200 only after both stores load (verify by intentionally pointing at an empty bucket; readiness fails until populated) +- [x] CI workflows pass and produce deployable artifacts - [ ] Sealed-secrets in the cluster decrypt and inject 
correctly -- [ ] Operational docs in `docs/operations/`: secrets management, runbook for "API won't boot", cert rotation +- [x] Operational docs in `docs/operations/`: secrets management, runbook for "API won't boot", cert rotation ## Risks / unknowns @@ -152,3 +153,17 @@ Pino's structured JSON logs go to stdout; k8s log aggregator captures them. Add - **Helm chart drift from legacy.** The legacy CFP Helm chart is the reference for cluster conventions. Don't reinvent — copy + adapt. ## Notes + +- **Cluster + bucket stand-up are not closeable from a dev workstation.** Six of the validation criteria need a live k8s cluster, a real bucket, or both (`docker run` smoke, `helm install`, ingress/TLS, PVC persistence, push daemon, S3 PrivateStore, sealed-secrets injection). They're left unchecked and tracked by [#36](https://github.com/CodeForPhilly/codeforphilly-ng/issues/36) so they close out when a human operator actually stands up staging. +- **Single notFoundHandler, even without an SPA bundled.** `apps/api/src/plugins/static-web.ts` installs a not-found handler unconditionally — when `CFP_WEB_DIST_PATH` is unset (dev/tests) it still returns the JSON envelope for unknown paths. Avoids drift between dev and prod 404 behavior on `/api/*`. +- **index.html is read into memory at boot.** fastify-static's per-file cache-control headers stamped the SPA entrypoint with `immutable max-age=1y`, which is wrong for the file that decides which hashed assets the browser loads next. The notFoundHandler serves a cached buffer with `cache-control: no-cache` instead. Hashed assets in `/assets/*` keep the long cache. +- **`Recreate` over `RollingUpdate`.** A rolling update would temporarily run two pods, each holding the gitsheets write mutex against the *same* PVC working tree. Old + new pods committing concurrently would interleave commits and corrupt state. Recreate forces the old pod down before the new one starts; brief downtime is the price. 
+- **Entrypoint clone, not init container.** Simpler — one container in the pod, one log stream, no separate ServiceAccount semantics. The plan's "init container vs entrypoint" unknown resolved to entrypoint. +- **Filesystem private-store on staging until a bucket exists.** `values.staging.yaml` sets `storage.backend=filesystem` against a small PVC so the chart can stand up before the bucket is provisioned. Flipping to S3 once a real bucket exists is values-only (no code change, no schema migration). +- **`helm` pinned to 4.1.0 via asdf.** Workflow actions install their own helm (v3.16.2 via `azure/setup-helm@v4`); local validation uses the asdf pin. v4 lints v3 charts cleanly. + +## Follow-ups + +- Issue [#36](https://github.com/CodeForPhilly/codeforphilly-ng/issues/36) — stand up staging cluster + bucket, generate per-environment secrets, run `deploy-staging.yml`, verify external `curl` to `/api/health` + `/api/health/ready`, verify PVC persistence + push-daemon push (closes the seven unchecked validation criteria). +- Tracked as: bucket provider choice (R2 / B2 / S3 / MinIO) deferred to whoever stands up staging — decision deliberately left to the operator with the bucket-provisioning checklist in [docs/operations/deploy.md](../docs/operations/deploy.md#bucket-provisioning). Until decided, staging runs on filesystem. +- Tracked as: production cluster stand-up is the same template with `values.production.yaml`; a separate issue should be filed once staging is green, not now.