From 589f25fc0dac41d5890363e16151ce19946a4ccb Mon Sep 17 00:00:00 2001 From: Abhijai Srivastava Date: Thu, 2 Jul 2026 21:08:00 +0530 Subject: [PATCH] docs: restructure self-hosting sidebar; add Production subsection and Support Sidebar reshaped to the agreed tree: Configuration and Production as nested groups. Production split from the flat page into Checklist, Security & TLS, Backups & restore, Monitoring, and Upgrades & rollback. New Support page. Production overview slimmed to route into the subsection. --- src/lib/navigation.ts | 28 +++- src/pages/docs/self-hosting/production.mdx | 145 +++--------------- .../production/backups-restore.mdx | 68 ++++++++ .../self-hosting/production/checklist.mdx | 57 +++++++ .../self-hosting/production/monitoring.mdx | 46 ++++++ .../self-hosting/production/security-tls.mdx | 59 +++++++ .../production/upgrades-rollback.mdx | 59 +++++++ src/pages/docs/self-hosting/support.mdx | 43 ++++++ 8 files changed, 371 insertions(+), 134 deletions(-) create mode 100644 src/pages/docs/self-hosting/production/backups-restore.mdx create mode 100644 src/pages/docs/self-hosting/production/checklist.mdx create mode 100644 src/pages/docs/self-hosting/production/monitoring.mdx create mode 100644 src/pages/docs/self-hosting/production/security-tls.mdx create mode 100644 src/pages/docs/self-hosting/production/upgrades-rollback.mdx create mode 100644 src/pages/docs/self-hosting/support.mdx diff --git a/src/lib/navigation.ts b/src/lib/navigation.ts index 8f8646a3..d6c5cffd 100644 --- a/src/lib/navigation.ts +++ b/src/lib/navigation.ts @@ -60,13 +60,27 @@ export const tabNavigation: NavTab[] = [ title: 'Self-Hosting', items: [ { title: 'Overview', href: '/docs/self-hosting' }, - { title: 'System requirements', href: '/docs/self-hosting/requirements' }, - { title: 'Environment variables', href: '/docs/self-hosting/environment' }, - { title: 'Configuration', href: '/docs/self-hosting/configuration' }, - { title: 'Docker Compose', href: 
'/docs/self-hosting/docker-compose' }, - { title: 'Production', href: '/docs/self-hosting/production' }, - { title: 'User management', href: '/docs/self-hosting/user-management' }, - { title: 'Troubleshooting and FAQs', href: '/docs/self-hosting/troubleshooting' }, + { title: 'Requirements', href: '/docs/self-hosting/requirements' }, + { title: 'Install', href: '/docs/self-hosting/docker-compose' }, + { + title: 'Configuration', + items: [ + { title: 'Environment variables', href: '/docs/self-hosting/environment' }, + { title: 'System configuration', href: '/docs/self-hosting/configuration' }, + ] + }, + { + title: 'Production', + items: [ + { title: 'Checklist', href: '/docs/self-hosting/production/checklist' }, + { title: 'Security & TLS', href: '/docs/self-hosting/production/security-tls' }, + { title: 'Backups & restore', href: '/docs/self-hosting/production/backups-restore' }, + { title: 'Monitoring', href: '/docs/self-hosting/production/monitoring' }, + { title: 'Upgrades & rollback', href: '/docs/self-hosting/production/upgrades-rollback' }, + ] + }, + { title: 'Troubleshooting & FAQs', href: '/docs/self-hosting/troubleshooting' }, + { title: 'Support', href: '/docs/self-hosting/support' }, ] }, { diff --git a/src/pages/docs/self-hosting/production.mdx b/src/pages/docs/self-hosting/production.mdx index c5fbb682..5b0b2c5b 100644 --- a/src/pages/docs/self-hosting/production.mdx +++ b/src/pages/docs/self-hosting/production.mdx @@ -1,137 +1,28 @@ --- -title: "Production Hardening & Operations" -description: "Production readiness checklist — replace secrets, configure TLS, set up managed data stores, run Postgres/ClickHouse/MinIO backups, and follow the upgrade runbook." +title: "Production" +description: "What to harden before a self-hosted Future AGI instance goes live" --- -## About +Everything past a local trial happens here. The default Docker Compose stack boots with placeholder secrets, no TLS, and compose-managed data stores. That's fine on a laptop. 
Before real traffic reaches the instance, work through the flow below in order, then keep each page as a runbook. -## About - -Run through this before exposing the stack to real users. Covers secrets, TLS, swapping in managed data stores, backup commands for Postgres/ClickHouse/MinIO, Prometheus monitoring, and the upgrade and rollback runbook. +## In this section - -## Hardening checklist - -**Secrets** — replace all `CHANGEME` values before going live: - -```bash -openssl rand -hex 32 # SECRET_KEY, AGENTCC_INTERNAL_API_KEY -openssl rand -base64 24 # PG_PASSWORD, MINIO_ROOT_PASSWORD -``` - -**Runtime flags** in `.env`: -- `ENV_TYPE=prod` -- `FAST_STARTUP=false` -- `GRANIAN_WORKERS=` - -**TLS** — the frontend and backend don't terminate TLS. Put Caddy, nginx, or Traefik in front: - -``` -# Caddyfile (simplest — auto-issues Let's Encrypt certs) -app.yourcompany.com { reverse_proxy localhost:3000 } -api.yourcompany.com { reverse_proxy localhost:8000 } -``` - -After setting up TLS, set `VITE_HOST_API=https://api.yourcompany.com` in `.env` and rebuild: - -```bash -docker compose build frontend && docker compose up -d frontend -``` - -**Managed data stores** — for production, replace compose-managed services: - -| Replace | With | Change | -|---|---|---| -| `postgres` | RDS / Aurora / Cloud SQL | Set `PG_*` vars to managed endpoint | -| `clickhouse` | ClickHouse Cloud | Set `CH_HOST`, `CH_PORT`, etc. | -| `redis` | ElastiCache / Upstash | Set `REDIS_URL` | -| `minio` | AWS S3 | Set `S3_ENDPOINT_URL=https://s3.amazonaws.com` + AWS creds | - - -`code-executor` requires `privileged: true`. Run on EC2 / GCE instances — not Fargate or Cloud Run. - - -**Secrets manager** — use AWS Secrets Manager, HashiCorp Vault, or GCP Secret Manager instead of a plain `.env` file. 
- ---- - -## Backups - -### PostgreSQL - -```bash -# Backup -docker compose exec postgres \ - pg_dump -U futureagi -d futureagi --format=custom \ - > backup-$(date +%F).dump - -# Restore -docker compose exec -T postgres \ - pg_restore -U futureagi -d futureagi --clean --if-exists \ - < backup-2026-04-22.dump -``` - -Volumes: `future-agi_postgres-data` · `future-agi_clickhouse-data` · `future-agi_redis-data` · `future-agi_minio-data` · `future-agi_peerdb-catalog-data` · `future-agi_peerdb-minio-data` - -### ClickHouse - -```sql -BACKUP TABLE default.traces TO S3('s3://your-bucket/ch-backup/', 'KEY', 'SECRET'); -``` - -ClickHouse data can also be rebuilt from scratch by re-running PeerDB init since it replicates from Postgres. - -### MinIO - -```bash -mc alias set local http://localhost:9005 futureagi -mc alias set s3 https://s3.amazonaws.com -mc mirror local/ s3/your-bucket/ -``` - ---- - -## Monitoring - -Backend exposes Prometheus metrics at `http://localhost:8000/metrics`. Add a scraper: - -```yaml -# prometheus.yml -scrape_configs: - - job_name: futureagi - static_configs: - - targets: ['localhost:8000'] - metrics_path: /metrics -``` - -Key signals: backend error rate, Temporal workflow success/failure, Postgres WAL lag (PeerDB health), ClickHouse query latency, PeerDB mirror status at [localhost:3001](http://localhost:3001). - ---- - -## Upgrades - -```bash -git pull -docker compose build -docker compose up -d -``` - -Migrations run automatically. If a migration fails: `docker compose exec backend python manage.py migrate` - -If release notes mention PeerDB changes: `docker compose run --rm peerdb-init bash /setup.sh` - -**Rollback:** - -```bash -git log --oneline -5 -git checkout -docker compose build && docker compose up -d -``` - -## Next Steps +Production readiness for a self-hosted instance breaks into five steps. Do them in order the first time. - - Symptoms, causes, and fixes for common errors. 
+ + The go-live pass: secrets, prod runtime flags, and managed data stores + + + Terminate TLS in front of the stack and lock down secrets + + + Back up and restore Postgres, ClickHouse, and MinIO + + + Scrape Prometheus metrics and watch the signals that matter - - Tune the LLM gateway, PeerDB mirrors, and Temporal workers. + + Pull a release, run migrations, and roll back safely diff --git a/src/pages/docs/self-hosting/production/backups-restore.mdx b/src/pages/docs/self-hosting/production/backups-restore.mdx new file mode 100644 index 00000000..f04c193d --- /dev/null +++ b/src/pages/docs/self-hosting/production/backups-restore.mdx @@ -0,0 +1,68 @@ +--- +title: "Backups & restore" +description: "Back up and restore the data stores behind a self-hosted instance" +--- + +A self-hosted instance keeps state in four stores: Postgres for application data, ClickHouse for observability records, MinIO for object storage, and Redis for cache and queues. This page covers backing up and restoring the three that hold durable data. Redis is a cache and doesn't need a backup. + +## Postgres + +Postgres holds the application data, so back it up on a schedule. 
Use the custom format so restores can run selectively: + +```bash +# Backup +docker compose exec postgres \ + pg_dump -U futureagi -d futureagi --format=custom \ + > backup-$(date +%F).dump + +# Restore +docker compose exec -T postgres \ + pg_restore -U futureagi -d futureagi --clean --if-exists \ + < backup-2026-04-22.dump +``` + +The Docker volumes that hold state: + +| Volume | Holds | +|---|---| +| `future-agi_postgres-data` | Postgres application data | +| `future-agi_clickhouse-data` | ClickHouse records | +| `future-agi_redis-data` | Redis cache | +| `future-agi_minio-data` | MinIO objects | +| `future-agi_peerdb-catalog-data` | PeerDB replication catalog | +| `future-agi_peerdb-minio-data` | PeerDB staging objects | + +## ClickHouse + +ClickHouse can back up straight to S3: + +```sql +BACKUP TABLE default.traces TO S3('s3://your-bucket/ch-backup/', 'KEY', 'SECRET'); +``` + + +ClickHouse is a replica of Postgres data, streamed through PeerDB. If you lose it, rebuild it from scratch by re-running PeerDB init rather than restoring a backup. The steps are in [Upgrades & rollback](/docs/self-hosting/production/upgrades-rollback). + + +## MinIO + +Mirror the MinIO bucket to S3 with the MinIO client: + +```bash +mc alias set local http://localhost:9005 futureagi +mc alias set s3 https://s3.amazonaws.com +mc mirror local/ s3/your-bucket/ +``` + +If you've already moved to [managed data stores](/docs/self-hosting/production/checklist), your provider's own backup tooling replaces these commands. 
+ +## Dive deeper + + + + Watch store health and replication lag + + + Rebuild ClickHouse and roll back releases + + diff --git a/src/pages/docs/self-hosting/production/checklist.mdx b/src/pages/docs/self-hosting/production/checklist.mdx new file mode 100644 index 00000000..272d324e --- /dev/null +++ b/src/pages/docs/self-hosting/production/checklist.mdx @@ -0,0 +1,57 @@ +--- +title: "Checklist" +description: "The go-live pass before a self-hosted instance takes real traffic" +--- + +Run through this once before the stack is reachable by anyone else. It covers the three things that separate a laptop trial from a real deployment: replacing the shipped secrets, switching the backend into production mode, and swapping compose-managed data stores for managed ones. + +## Replace the shipped secrets + +The stack boots with `CHANGEME` placeholders. Replace every one before the instance is reachable, and generate each value rather than making one up: + +```bash +openssl rand -hex 32 # SECRET_KEY, AGENTCC_INTERNAL_API_KEY +openssl rand -base64 24 # PG_PASSWORD, MINIO_ROOT_PASSWORD +``` + + +`PG_PASSWORD` and `MINIO_ROOT_PASSWORD` are written to their volumes on first boot only. Set them before your first `docker compose up`. Changing them after the volume exists locks you out. The full field list is in [Environment variables](/docs/self-hosting/environment). + + +## Switch the backend to production mode + +Set these runtime flags before going live: + +| Variable | Go-live value | Why | +|---|---|---| +| `ENV_TYPE` | `prod` | Disables debug output and runs Django `check --deploy` | +| `FAST_STARTUP` | `false` | Always applies migrations on restart | +| `GRANIAN_WORKERS` | your CPU count | One worker per core, up from the default `1` | + +To tune the gateway, PeerDB, and Temporal workers, see [System configuration](/docs/self-hosting/configuration). + +## Move to managed data stores + +Compose-managed Postgres, ClickHouse, Redis, and MinIO are fine for a trial. 
For production, point the stack at managed services so data outlives the containers: + +| Replace | With | Set | +|---|---|---| +| `postgres` | RDS, Aurora, or Cloud SQL | `PG_*` to the managed endpoint | +| `clickhouse` | ClickHouse Cloud | `CH_HOST`, `CH_PORT`, and credentials | +| `redis` | ElastiCache or Upstash | `REDIS_URL` | +| `minio` | AWS S3 | `S3_ENDPOINT_URL` plus AWS credentials | + + +`code-executor` needs `privileged: true`, so it can't run on ECS Fargate or Cloud Run. Put it on an EC2 or GCE instance. The platform matrix is in [Requirements](/docs/self-hosting/requirements). + + +## Dive deeper + + + + Put TLS in front of the stack and move secrets into a manager + + + Set up backups before the instance holds real data + + diff --git a/src/pages/docs/self-hosting/production/monitoring.mdx b/src/pages/docs/self-hosting/production/monitoring.mdx new file mode 100644 index 00000000..cb23a7ce --- /dev/null +++ b/src/pages/docs/self-hosting/production/monitoring.mdx @@ -0,0 +1,46 @@ +--- +title: "Monitoring" +description: "Scrape a self-hosted instance and watch the signals that predict trouble" +--- + +The backend exposes Prometheus metrics, so any Prometheus-compatible stack can scrape it. This page covers wiring up the scrape and the handful of signals worth alerting on. + +## Scrape the backend + +The backend serves Prometheus metrics at `http://localhost:8000/metrics`. 
Add it as a scrape target: + +```yaml +# prometheus.yml +scrape_configs: + - job_name: futureagi + static_configs: + - targets: ['localhost:8000'] + metrics_path: /metrics +``` + +## Watch these signals + +| Signal | Why it matters | +|---|---| +| Backend error rate | The first sign a release or dependency broke | +| Temporal workflow success and failure | Failing workflows mean evals and background jobs are stuck | +| Postgres WAL lag | Rising lag means PeerDB replication is falling behind | +| ClickHouse query latency | Slow queries surface as a slow dashboard | +| PeerDB mirror status | The health of the Postgres-to-ClickHouse pipeline | + +PeerDB has its own console at [localhost:3001](http://localhost:3001) for mirror status and throughput. + + +Postgres WAL lag and PeerDB mirror status are the two to page on first. When ClickHouse drifts from Postgres, dashboards show stale data before anything visibly breaks. + + +## Dive deeper + + + + Pull a release, run migrations, and roll back safely + + + Symptoms, causes, and fixes for common errors + + diff --git a/src/pages/docs/self-hosting/production/security-tls.mdx b/src/pages/docs/self-hosting/production/security-tls.mdx new file mode 100644 index 00000000..aa6d228b --- /dev/null +++ b/src/pages/docs/self-hosting/production/security-tls.mdx @@ -0,0 +1,59 @@ +--- +title: "Security & TLS" +description: "Terminate TLS in front of a self-hosted instance and lock down its secrets" +--- + +Neither the frontend nor the backend terminates TLS. In production you put a reverse proxy in front of the stack to handle certificates, then point the frontend at the HTTPS endpoint. This page covers both, plus where production secrets should live. + +## Terminate TLS with a reverse proxy + +Run Caddy, nginx, or Traefik in front of the stack. 
Caddy is the shortest path because it issues and renews Let's Encrypt certificates on its own: + +``` +# Caddyfile +app.yourcompany.com { reverse_proxy localhost:3000 } +api.yourcompany.com { reverse_proxy localhost:8000 } +``` + + + + Point the proxy at the frontend on `localhost:3000` and the backend on `localhost:8000`. The full port list is in [Requirements](/docs/self-hosting/requirements#ports-reference). + + + Set `VITE_HOST_API=https://api.yourcompany.com` in `.env`. This is a build-time value, so the frontend has to be rebuilt for it to take effect. + + + ```bash + docker compose build frontend && docker compose up -d frontend + ``` + + + + +`VITE_HOST_API` is baked in at build time, not read at runtime. If the browser still calls the old host after you change it, the frontend wasn't rebuilt. + + +## Keep secrets out of the `.env` file + +For anything past a single trial host, store secrets in a dedicated manager instead of a plain `.env` file: + +- AWS Secrets Manager +- HashiCorp Vault +- GCP Secret Manager + +Replace the `CHANGEME` values from the [Checklist](/docs/self-hosting/production/checklist) first, then move them into the manager and inject them at deploy time. + +## Isolate the code executor + +`code-executor` runs with `privileged: true` so it can sandbox evaluation code. Keep it on a host you control, such as an EC2 or GCE instance, and never on a managed-container platform that can't grant that flag. 
+ +## Dive deeper + + + + Protect the data behind the proxy + + + Tune the gateway, PeerDB, and Temporal workers + + diff --git a/src/pages/docs/self-hosting/production/upgrades-rollback.mdx b/src/pages/docs/self-hosting/production/upgrades-rollback.mdx new file mode 100644 index 00000000..0e1b903d --- /dev/null +++ b/src/pages/docs/self-hosting/production/upgrades-rollback.mdx @@ -0,0 +1,59 @@ +--- +title: "Upgrades & rollback" +description: "Pull a new release, run migrations, and roll back when one goes wrong" +--- + +Upgrades are a git pull and a rebuild, and migrations run automatically on boot. This page covers the routine upgrade, the two cases that need a manual step, and how to roll back. + +## Upgrade to a new release + + + + ```bash + git pull + docker compose build + docker compose up -d + ``` + + + Database migrations run automatically on backend startup. If one fails, run it by hand: + ```bash + docker compose exec backend python manage.py migrate + ``` + + + When a release changes the replication setup, re-run init so ClickHouse stays in sync: + ```bash + docker compose run --rm peerdb-init bash /setup.sh + ``` + + + + +Because ClickHouse replicates from Postgres through PeerDB, re-running PeerDB init also rebuilds ClickHouse from scratch. That's the recovery path when the [ClickHouse store](/docs/self-hosting/production/backups-restore) is lost. + + +## Roll back a bad release + +Find the last known-good commit in the log, check it out, and rebuild: + +```bash +git log --oneline -5 +git checkout <previous-commit> +docker compose build && docker compose up -d +``` + + +Checking out older code does not undo a migration that already ran. If a release applied a migration you need to reverse, roll it back before you switch code, or restore Postgres from a backup. 
+ + +## Dive deeper + + + + Symptoms, causes, and fixes for common errors + + + Where to get help when you're stuck + + diff --git a/src/pages/docs/self-hosting/support.mdx b/src/pages/docs/self-hosting/support.mdx new file mode 100644 index 00000000..38d69bd0 --- /dev/null +++ b/src/pages/docs/self-hosting/support.mdx @@ -0,0 +1,43 @@ +--- +title: "Support" +description: "Where to get help running a self-hosted Future AGI instance" +--- + +Running the open-source stack and hit something these pages don't cover? Here's where to reach the team and the community, and what to include so you get a useful answer fast. + +## Where to get help + + + + Ask the community and the team in the Future AGI Discord + + + Report a bug or request a feature on the open-source repo + + + +## Before you post + +A self-hosting question is easier to answer with the basics attached: + +- What you ran and what happened, with the exact error +- Output of `docker compose ps` so the team can see which service is down +- Logs from the failing service: `docker compose logs --tail=100 <service>` +- Your platform (Linux host, EC2, GCE) and whether you're on managed data stores + +Most self-hosting questions are already answered in [Troubleshooting & FAQs](/docs/self-hosting/troubleshooting). Check there first. + +## Commercial support + +For managed hosting, an SLA, or help with a production rollout, reach out at [sales@futureagi.com](mailto:sales@futureagi.com). + +## Dive deeper + + + + Symptoms, causes, and fixes for common errors + + + Harden the instance before it goes live + +