From 2423e4c78774ea5490d5c47464220fcbdd0e2dce Mon Sep 17 00:00:00 2001 From: Manas Srivastava <[email protected]> Date: Thu, 11 Jun 2026 00:57:37 +0530 Subject: [PATCH 1/3] feat(dropguard): name-convention guard on every customer-data drop + migration-safety CI gate (truehomie D3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Layer 2 of the truehomie-db DROP hardening (layer 1 = the guardedDrop audit chokepoint, PR #50). The chokepoint made every sanctioned drop AUDITABLE; this makes a MIS-TARGETED drop UNEXECUTABLE: - internal/dropguard: charset+denylist validation of naming tokens and final db/user identifiers. Refuses empty/garbage tokens, SQL metacharacters, system databases (postgres, template0/1, instant_customers, instant_platform, mongo admin/local/config) and admin roles (instanode_admin, instant_cust, doadmin, postgres, redis "default"). Deliberately permissive on token SHAPE (uuid, dashless hex, pool-*, e2e cohorts all pass) so no legitimate legacy deprovision can wedge — this repo auto-deploys. - server.guardedDrop: validates the poolident-resolved naming token BEFORE dispatch; refusal = error + `provisioner.drop.refused` log event + instant_provisioner_drop_total{outcome="refused"}. - postgres local Deprovision + cleanupProvisionPartial (non-chokepoint rollback path), dedicated deprovisionLocal: validate the FINAL constructed dbName/username via validateDropTargets right before DROP. - mongo Deprovision: token guard before connect + per-candidate name guard (refused canonical = hard error; refused legacy candidate = skip). - redis local/dedicated DELUSER: skip any non-tenant-shaped username (ACL DELUSER "default" would brick the shared pod). - scripts/check-migration-safety.sh + ci.yml step: destructive DDL (DROP TABLE/DATABASE/SCHEMA/COLUMN/ROLE/USER, TRUNCATE, DELETE FROM without WHERE) in migrations/*.sql fails CI unless the file carries `-- destructive: acknowledged `. Self-test fixture suite included. No behavior change for legitimate deprovision flows (regression tests assert uuid/dashless/pool/e2e token shapes still reach the backend). New refusal branches are unit-tested in every package; dropguard itself is 100% covered. Co-Authored-By: Claude Fable 5 --- .github/workflows/ci.yml | 10 ++ internal/backend/mongo/dropguard_test.go | 69 ++++++++ internal/backend/mongo/mongo.go | 41 ++++- internal/backend/postgres/dedicated.go | 7 + internal/backend/postgres/dropcheck.go | 31 ++++ internal/backend/postgres/dropcheck_test.go | 87 +++++++++ internal/backend/postgres/local.go | 16 ++ internal/backend/redis/dedicated.go | 10 ++ internal/backend/redis/dropguard_test.go | 50 ++++++ internal/backend/redis/local.go | 9 + internal/dropguard/dropguard.go | 184 ++++++++++++++++++++ internal/dropguard/dropguard_test.go | 183 +++++++++++++++++++ internal/server/drop_chokepoint.go | 34 +++- internal/server/drop_chokepoint_test.go | 90 ++++++++++ scripts/check-migration-safety.sh | 173 ++++++++++++++++++ 15 files changed, 987 insertions(+), 7 deletions(-) create mode 100644 internal/backend/mongo/dropguard_test.go create mode 100644 internal/backend/postgres/dropcheck.go create mode 100644 internal/backend/postgres/dropcheck_test.go create mode 100644 internal/backend/redis/dropguard_test.go create mode 100644 internal/dropguard/dropguard.go create mode 100644 internal/dropguard/dropguard_test.go create mode 100755 scripts/check-migration-safety.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bfbef2a..15c9255 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -55,6 +55,16 @@ jobs: steps: - uses: actions/checkout@v6 + # Migration-safety gate (truehomie-db DROP hardening, task D3): any + # destructive DDL (DROP TABLE/DATABASE/SCHEMA/COLUMN/ROLE/USER, TRUNCATE, + # DELETE FROM without WHERE) in a migrations/*.sql file fails CI unless + # the file carries an explicit `-- destructive: acknowledged ` + # marker. Self-test runs first so a regression in the gate itself reds CI. + - name: Migration safety gate + run: | + bash scripts/check-migration-safety.sh --self-test + bash scripts/check-migration-safety.sh + - name: Checkout proto sibling (replace ../proto) uses: actions/checkout@v6 with: diff --git a/internal/backend/mongo/dropguard_test.go b/internal/backend/mongo/dropguard_test.go new file mode 100644 index 0000000..3f0c870 --- /dev/null +++ b/internal/backend/mongo/dropguard_test.go @@ -0,0 +1,69 @@ +package mongo + +// dropguard_test.go — name-convention guard tests for the Mongo Deprovision +// path (truehomie hardening, task D3). The invariant: a reserved/system naming +// token is refused BEFORE any connection, and a reserved derived candidate name +// is never passed to dropUser/dropDatabase. + +import ( + "context" + "errors" + "testing" + "time" + + "instant.dev/provisioner/internal/dropguard" +) + +// TestLocalDeprovision_RefusedToken_NeverConnects points the backend at an +// unreachable URI: a refused token must return dropguard.ErrRefused instantly, +// proving the guard runs before connect (a connect attempt would instead +// return a server-selection error after the timeout). +func TestLocalDeprovision_RefusedToken_NeverConnects(t *testing.T) { + b := newLocalBackend("mongodb://127.0.0.1:1", "127.0.0.1:1") + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + for _, tok := range []string{"postgres", "instant_customers", "", "admin", "a b"} { + err := b.Deprovision(ctx, tok, "") + if !errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("Deprovision(%q): expected dropguard.ErrRefused before connect, got %v", tok, err) + } + } +} + +// TestLocalDeprovision_ReservedLegacyCandidate_SkippedNotFatal uses a token +// whose 12-char legacy truncation collides with a reserved identifier +// ("instant_customersabc"[:12] == "instant_cust"). The legacy candidate must +// be SKIPPED (refused, logged) while the canonical candidate proceeds — the +// deprovision still succeeds. +func TestLocalDeprovision_ReservedLegacyCandidate_SkippedNotFatal(t *testing.T) { + uri, ok := liveMongoURI(t) + if !ok { + t.Skip("CUSTOMER_MONGO_URL unreachable; skipping") + } + b := newLocalBackend(uri, hostFromURI(uri)) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := b.Deprovision(ctx, "instant_customersabc", ""); err != nil { + t.Fatalf("Deprovision with reserved legacy candidate: want nil (skip), got %v", err) + } +} + +// TestLocalDeprovision_ReservedCanonicalDB_ReturnsError uses a token that +// passes the token guard ("ad-min") but whose canonical dash-stripped form is +// the reserved identifier "admin" — so the CANONICAL candidates (usr_admin, +// db_admin) embed a reserved tail. A refused canonical DB candidate is a hard +// error (unlike a legacy candidate, which is skipped): the deprovision must +// fail loudly rather than silently leak the real database. +func TestLocalDeprovision_ReservedCanonicalDB_ReturnsError(t *testing.T) { + uri, ok := liveMongoURI(t) + if !ok { + t.Skip("CUSTOMER_MONGO_URL unreachable; skipping") + } + b := newLocalBackend(uri, hostFromURI(uri)) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + err := b.Deprovision(ctx, "ad-min", "") + if !errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("Deprovision(%q): expected dropguard.ErrRefused for reserved canonical db candidate, got %v", "ad-min", err) + } +} diff --git a/internal/backend/mongo/mongo.go b/internal/backend/mongo/mongo.go index b519058..cd4f61d 100644 --- a/internal/backend/mongo/mongo.go +++ b/internal/backend/mongo/mongo.go @@ -20,6 +20,7 @@ import ( "go.mongodb.org/mongo-driver/mongo" "go.mongodb.org/mongo-driver/mongo/options" + "instant.dev/provisioner/internal/dropguard" "instant.dev/provisioner/internal/poolident" ) @@ -196,6 +197,25 @@ func (b *LocalBackend) StorageBytes(ctx context.Context, token, providerResource // Deprovision drops the user and database for the given token. // Drops user first, then drops the database. func (b *LocalBackend) Deprovision(ctx context.Context, token, providerResourceID string) error { + // P0-2: a pool-claimed database/user are named from the pool token, not + // the request token. Resolve the canonical naming token from + // provider_resource_id so dropUser/dropDatabase target the real infra + // instead of no-op'ing on db_, which would leak it forever. + namingToken := poolident.NamingToken(token, providerResourceID) + + // Name-convention guard (truehomie hardening, task D3): every candidate + // user/database name below derives from this token; refuse the whole + // deprovision — before even connecting — when the token is empty, + // malformed, or a reserved system identifier. Per-candidate names are + // additionally validated in the loops so a future naming.go refactor + // stays covered. + if guardErr := dropguard.CheckNamingToken(namingToken); guardErr != nil { + slog.Error("provisioner.drop.refused", + "event", "provisioner.drop.refused", "site", "nosql.Deprovision", + "token", token, "naming_token", namingToken, "error", guardErr) + return fmt.Errorf("nosql.Deprovision: %w", guardErr) + } + client, err := b.connect(ctx, options.Client().ApplyURI(b.adminURI). SetServerSelectionTimeout(connectTimeout)) if err != nil { @@ -207,18 +227,18 @@ func (b *LocalBackend) Deprovision(ctx context.Context, token, providerResourceI } }() - // P0-2: a pool-claimed database/user are named from the pool token, not - // the request token. Resolve the canonical naming token from - // provider_resource_id so dropUser/dropDatabase target the real infra - // instead of no-op'ing on db_, which would leak it forever. - namingToken := poolident.NamingToken(token, providerResourceID) - // Drop every candidate user. The resource was provisioned under exactly // one scheme, but we cannot know which without a stored name, so we drop // the canonical name and every legacy form. dropUser on a non-existent // user is harmless (logged, non-fatal). adminDB := client.Database("admin") for _, username := range legacyMongoUserNames(namingToken) { + if guardErr := dropguard.CheckUserName(username); guardErr != nil { + slog.Error("provisioner.drop.refused", + "event", "provisioner.drop.refused", "site", "nosql.Deprovision", + "token", token, "user", username, "error", guardErr) + continue + } if r := adminDB.RunCommand(ctx, bson.D{{Key: "dropUser", Value: username}}); r.Err() != nil { slog.Debug("nosql.Deprovision: dropUser miss (continuing)", "token", token, "user", username, "error", r.Err()) } @@ -229,6 +249,15 @@ func (b *LocalBackend) Deprovision(ctx context.Context, token, providerResourceI // the canonical name still propagates. canonicalDB := mongoDBName(namingToken) for _, dbName := range legacyMongoDBNames(namingToken) { + if guardErr := dropguard.CheckDatabaseName(dbName); guardErr != nil { + slog.Error("provisioner.drop.refused", + "event", "provisioner.drop.refused", "site", "nosql.Deprovision", + "token", token, "db", dbName, "error", guardErr) + if dbName == canonicalDB { + return fmt.Errorf("nosql.Deprovision: %w", guardErr) + } + continue + } if dropErr := client.Database(dbName).Drop(ctx); dropErr != nil { if dbName == canonicalDB { return fmt.Errorf("nosql.Deprovision: drop database %s: %w", dbName, dropErr) diff --git a/internal/backend/postgres/dedicated.go b/internal/backend/postgres/dedicated.go index 2b3564b..5bccf3a 100644 --- a/internal/backend/postgres/dedicated.go +++ b/internal/backend/postgres/dedicated.go @@ -330,6 +330,13 @@ func (p *DedicatedProvider) deprovisionLocal(ctx context.Context, token string) dbName := "dedicated_db_" + token username := "dedicated_usr_" + token + // Name-convention guard (truehomie hardening, task D3): validate the FINAL + // constructed identifiers before the destructive statements run. Refusal is + // an error — a wrong-name bug must never reach DROP. + if guardErr := validateDropTargets("db.dedicated.deprovisionLocal", token, dbName, username); guardErr != nil { + return fmt.Errorf("db.dedicated.deprovisionLocal: %w", guardErr) + } + adminDSN := p.localAdminDSN() conn, err := pgxConnect(ctx, adminDSN) if err != nil { diff --git a/internal/backend/postgres/dropcheck.go b/internal/backend/postgres/dropcheck.go new file mode 100644 index 0000000..44c28fa --- /dev/null +++ b/internal/backend/postgres/dropcheck.go @@ -0,0 +1,31 @@ +package postgres + +// dropcheck.go — the postgres-package seam for the dropguard name-convention +// guard (truehomie hardening, task D3). Every DROP DATABASE / DROP USER this +// package executes validates its FINAL constructed identifiers here first, so +// a bug constructing a wrong target (empty token, system database, admin role) +// is refused before the destructive statement runs. + +import ( + "log/slog" + + "instant.dev/provisioner/internal/dropguard" +) + +// validateDropTargets refuses a (dbName, username) pair that does not match the +// per-tenant naming convention. On refusal it emits the structured +// `provisioner.drop.refused` audit event (same event name as the server +// chokepoint, so one NR/grep surface covers every refusal site) and returns the +// dropguard error for the caller to wrap. site names the calling function. +func validateDropTargets(site, token, dbName, username string) error { + err := dropguard.CheckDatabaseName(dbName) + if err == nil { + err = dropguard.CheckUserName(username) + } + if err != nil { + slog.Error("provisioner.drop.refused", + "event", "provisioner.drop.refused", "site", site, + "token", token, "db", dbName, "user", username, "error", err) + } + return err +} diff --git a/internal/backend/postgres/dropcheck_test.go b/internal/backend/postgres/dropcheck_test.go new file mode 100644 index 0000000..84562a3 --- /dev/null +++ b/internal/backend/postgres/dropcheck_test.go @@ -0,0 +1,87 @@ +package postgres + +// dropcheck_test.go — unit tests for the name-convention guard on every DROP +// DATABASE / DROP USER site in this package (truehomie hardening, task D3). +// The invariant: a wrong-name bug must never reach a destructive statement, +// and legitimate per-tenant names must pass untouched. + +import ( + "context" + "errors" + "testing" + + "instant.dev/provisioner/internal/dropguard" +) + +func TestValidateDropTargets_OK(t *testing.T) { + if err := validateDropTargets("test.site", "tok", + "db_96edf9eed8ed42929036b63298ec5b2b", + "usr_96edf9eed8ed42929036b63298ec5b2b"); err != nil { + t.Fatalf("validateDropTargets: unexpected refusal: %v", err) + } +} + +func TestValidateDropTargets_RefusesBadDatabase(t *testing.T) { + err := validateDropTargets("test.site", "tok", "instant_customers", "usr_ok123") + if !errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("expected ErrRefused for system database, got %v", err) + } +} + +func TestValidateDropTargets_RefusesBadUser(t *testing.T) { + // Database valid, user invalid — exercises the second check independently. + err := validateDropTargets("test.site", "tok", "db_ok123", "instanode_admin") + if !errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("expected ErrRefused for admin role, got %v", err) + } +} + +// TestLocalDeprovision_RefusedToken_NeverConnects asserts the guard refuses a +// reserved/system naming token BEFORE any admin connection or DROP statement. +// The backend is constructed with a nil router: if the guard let the call +// through, the router deref would panic — passing proves the early return. +func TestLocalDeprovision_RefusedToken_NeverConnects(t *testing.T) { + b := &LocalBackend{} + for _, tok := range []string{"postgres", "instant_customers", "", "x y"} { + err := b.Deprovision(context.Background(), tok, "") + if !errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("Deprovision(%q): expected ErrRefused, got %v", tok, err) + } + } +} + +func TestCleanupProvisionPartial_RefusedNames_ExecutesNothing(t *testing.T) { + conn := &fakePGConn{} + cleanupProvisionPartial(conn, "instant_customers", "usr_ok123") + if len(conn.execCalls) != 0 { + t.Fatalf("rollback executed %d statements for a refused database name: %v", len(conn.execCalls), conn.execCalls) + } + cleanupProvisionPartial(conn, "db_ok123", "instanode_admin") + if len(conn.execCalls) != 0 { + t.Fatalf("rollback executed %d statements for a refused user name: %v", len(conn.execCalls), conn.execCalls) + } +} + +func TestCleanupProvisionPartial_ValidNames_StillExecutesDrops(t *testing.T) { + // Behaviour preservation: the legitimate rollback path still runs both + // IF EXISTS drops. + conn := &fakePGConn{} + cleanupProvisionPartial(conn, "db_ok123", "usr_ok123") + if len(conn.execCalls) != 2 { + t.Fatalf("rollback: want 2 statements (DROP DATABASE + DROP USER), got %d: %v", len(conn.execCalls), conn.execCalls) + } +} + +// TestDedicatedDeprovisionLocal_RefusedToken_NeverConnects mirrors the local +// test for the dedicated provider: a reserved token is refused before the +// admin DSN is ever dialed (empty adminDSN would otherwise attempt a real +// connection to the default customers URL). +func TestDedicatedDeprovisionLocal_RefusedToken_NeverConnects(t *testing.T) { + p := &DedicatedProvider{} + for _, tok := range []string{"postgres", "", "a b"} { + err := p.deprovisionLocal(context.Background(), tok) + if !errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("deprovisionLocal(%q): expected ErrRefused, got %v", tok, err) + } + } +} diff --git a/internal/backend/postgres/local.go b/internal/backend/postgres/local.go index a5072f3..4f74cf9 100644 --- a/internal/backend/postgres/local.go +++ b/internal/backend/postgres/local.go @@ -252,6 +252,14 @@ func (b *LocalBackend) Provision(ctx context.Context, token, tier string, connLi // even if the incoming request context is on the verge of its deadline. Both // DROPs use IF EXISTS so partial states (db created, user not) are safe. func cleanupProvisionPartial(conn pgConn, dbName, username string) { + // Name-convention guard (truehomie hardening, task D3): this rollback path + // does NOT pass through the server's guardedDrop chokepoint, so it + // validates its own targets. A wrong-name bug must SKIP the drops, never + // execute them — the cost of a refusal is a leaked just-created db, not a + // destroyed customer one. + if validateDropTargets("db.local.cleanupProvisionPartial", "", dbName, username) != nil { + return + } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() if _, err := conn.Exec(ctx, fmt.Sprintf("DROP DATABASE IF EXISTS %q WITH (FORCE)", dbName)); err != nil { @@ -307,6 +315,14 @@ func (b *LocalBackend) Deprovision(ctx context.Context, token, providerResourceI dbName := dbNamePrefix + namingToken username := userNamePrefix + namingToken + // Name-convention guard (truehomie hardening, task D3): validate the FINAL + // constructed identifiers right before the destructive statements run, so a + // future refactor that builds them differently is still covered. Refusal is + // an error (the caller's deprovision fails loudly) — never a wrong drop. + if guardErr := validateDropTargets("db.local.Deprovision", token, dbName, username); guardErr != nil { + return fmt.Errorf("db.local.Deprovision: %w", guardErr) + } + adminURL := b.router.AdminURLForResource(poolident.BasePRID(providerResourceID)) conn, err := pgxConnect(ctx, adminURL) if err != nil { diff --git a/internal/backend/redis/dedicated.go b/internal/backend/redis/dedicated.go index a4db662..8082eb9 100644 --- a/internal/backend/redis/dedicated.go +++ b/internal/backend/redis/dedicated.go @@ -18,6 +18,8 @@ import ( "log/slog" goredis "github.com/redis/go-redis/v9" + + "instant.dev/provisioner/internal/dropguard" ) // DedicatedProvider provisions a dedicated Redis instance. @@ -228,6 +230,14 @@ func (p *DedicatedProvider) deprovisionLocal(ctx context.Context, token, provide probes = append(probes, legacy) } for _, username := range probes { + // Name-convention guard (truehomie hardening, task D3): refuse a + // DELUSER of any non-tenant-shaped name (e.g. "default", admin users). + if guardErr := dropguard.CheckUserName(username); guardErr != nil { + slog.Error("provisioner.drop.refused", + "event", "provisioner.drop.refused", "site", "cache.dedicated.deprovisionLocal", + "token", token, "user", username, "error", guardErr) + continue + } if err := p.rdb.Do(ctx, "ACL", "DELUSER", username).Err(); err != nil { // Best-effort — user may not exist. slog.Warn("cache.dedicated.deprovisionLocal: ACL DELUSER (non-fatal)", diff --git a/internal/backend/redis/dropguard_test.go b/internal/backend/redis/dropguard_test.go new file mode 100644 index 0000000..1baa7f8 --- /dev/null +++ b/internal/backend/redis/dropguard_test.go @@ -0,0 +1,50 @@ +package redis + +// dropguard_test.go — name-convention guard tests for the Redis ACL DELUSER +// sites (truehomie hardening, task D3). The invariant: a non-tenant-shaped +// username (e.g. "default", an admin user) is never passed to ACL DELUSER — +// deleting "default" would brick every tenant on the shared pod. + +import ( + "context" + "testing" + "time" + + goredis "github.com/redis/go-redis/v9" +) + +// TestDedicatedDeprovisionLocal_RefusedProbe_SkippedNotExecuted stamps a +// providerResourceID that resolves to the Redis admin user "default" (the +// resolveDedicatedACLUsername fast-path returns it verbatim). The guard must +// SKIP the DELUSER: the provider is constructed with a nil client, so any +// executed Redis command would panic — returning nil proves the skip. +func TestDedicatedDeprovisionLocal_RefusedProbe_SkippedNotExecuted(t *testing.T) { + p := &DedicatedProvider{} + if err := p.deprovisionLocal(context.Background(), "x", "default"); err != nil { + t.Fatalf("deprovisionLocal with refused probe: want nil (skip), got %v", err) + } +} + +// TestLocalDeprovision_ReservedLegacyACLUser_Skipped uses a token whose 8-char +// legacy truncation is the reserved identifier "postgres" +// ("postgres1ab2"[:8] == "postgres"). The legacy DELUSER candidate must be +// refused/skipped while the canonical one proceeds. The client points at an +// unreachable address: the canonical DELUSER error is best-effort (ignored), +// and the subsequent SCAN fails — so the expected outcome is the SCAN error, +// never a dropguard refusal and never a DELUSER of usr_postgres. +func TestLocalDeprovision_ReservedLegacyACLUser_Skipped(t *testing.T) { + rdb := goredis.NewClient(&goredis.Options{ + Addr: "127.0.0.1:1", // nothing listens here + DialTimeout: 200 * time.Millisecond, + MaxRetries: -1, + }) + defer rdb.Close() + b := &LocalBackend{rdb: rdb} + + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + err := b.Deprovision(ctx, "postgres1ab2", "") + if err == nil { + t.Fatal("Deprovision against unreachable Redis: want SCAN error, got nil") + } +} diff --git a/internal/backend/redis/local.go b/internal/backend/redis/local.go index f26e87d..722d17d 100644 --- a/internal/backend/redis/local.go +++ b/internal/backend/redis/local.go @@ -15,6 +15,7 @@ import ( goredis "github.com/redis/go-redis/v9" + "instant.dev/provisioner/internal/dropguard" "instant.dev/provisioner/internal/poolident" ) @@ -280,6 +281,14 @@ func (b *LocalBackend) Deprovision(ctx context.Context, token, providerResourceI if username == "" { continue } + // Name-convention guard (truehomie hardening, task D3): never DELUSER a + // non-tenant-shaped name — DELUSER "default" would brick the shared pod. + if guardErr := dropguard.CheckUserName(username); guardErr != nil { + slog.Error("provisioner.drop.refused", + "event", "provisioner.drop.refused", "site", "cache.local.Deprovision", + "token", token, "user", username, "error", guardErr) + continue + } b.rdb.Do(ctx, "ACL", "DELUSER", username) } diff --git a/internal/dropguard/dropguard.go b/internal/dropguard/dropguard.go new file mode 100644 index 0000000..ae9ad8a --- /dev/null +++ b/internal/dropguard/dropguard.go @@ -0,0 +1,184 @@ +// Package dropguard validates the TARGET IDENTITY of a customer-data +// destruction before it executes. +// +// # Why this package exists (truehomie-db DROP incident, 2026-06-03) +// +// An active Pro customer's Postgres database + role were dropped on the shared +// postgres-customers cluster by an unidentified path. The guardedDrop +// chokepoint (internal/server/drop_chokepoint.go) made every sanctioned drop +// AUDITABLE; this package makes a *mis-targeted* drop UNEXECUTABLE: a bug that +// constructs the wrong identifier (an empty token, a system database such as +// "postgres"/"instant_customers", an admin role such as "instanode_admin", or +// an arbitrary non-tenant-shaped name) is refused BEFORE the destructive +// statement runs. +// +// # What the convention is +// +// Every per-tenant identifier the provisioner ever destroys is a fixed prefix +// plus a platform-issued naming token: +// +// postgres shared → db_ usr_ +// postgres dedicated → dedicated_db_ dedicated_usr_ +// mongo → db_ usr_ (canonical + legacy forms) +// redis ACL → usr_ usr_ (legacy) +// +// Tokens are UUIDs (with or without dashes), optionally "pool-"-prefixed for +// hot-pool items; e2e/internal cohorts may use other shapes. The guard is +// therefore deliberately CHARSET-AND-DENYLIST based, not format-strict: +// refusing a legitimate legacy form would wedge a deprovision in prod (this +// repo auto-deploys), while the catastrophic mis-target class — system +// databases, admin roles, empty/garbage names — is exactly what the charset + +// denylist refuse. Loosening NEVER happens implicitly: every accepted name +// still has to carry a per-tenant prefix. +// +// Refusals are surfaced by the callers as a structured +// `provisioner.drop.refused` log event; through the guardedDrop chokepoint +// they also increment instant_provisioner_drop_total{outcome="refused"}. +package dropguard + +import ( + "errors" + "fmt" + "strings" +) + +// ErrRefused is the sentinel wrapped by every refusal so callers and tests can +// errors.Is() a dropguard refusal regardless of the message text. +var ErrRefused = errors.New("dropguard: destructive target refused") + +// maxIdentifierLen bounds every validated identifier. Postgres truncates +// identifiers at 63 bytes; nothing legitimate is longer. +const maxIdentifierLen = 63 + +// reservedTokens are naming tokens that must never be destroyed even when the +// charset would otherwise allow them (e.g. a bug that flows a database or role +// name INTO the token field). Compared case-insensitively. +var reservedTokens = map[string]bool{ + "postgres": true, + "template0": true, + "template1": true, + "admin": true, + "default": true, + "root": true, + "instant_customers": true, + "instant_platform": true, + "instant_cust": true, + "instanode_admin": true, + "doadmin": true, +} + +// systemDatabases are database names that must never be dropped. The per-tenant +// prefix requirement already excludes them; this denylist is belt-and-suspenders +// for any future caller that validates a name without a prefix expectation. +var systemDatabases = map[string]bool{ + "postgres": true, + "template0": true, + "template1": true, + "instant_customers": true, + "instant_platform": true, + // Mongo system databases. + "admin": true, + "local": true, + "config": true, +} + +// systemRoles are role/user names that must never be dropped or DELUSER'd. +var systemRoles = map[string]bool{ + "postgres": true, + "instant_cust": true, + "instanode_admin": true, + "doadmin": true, + "admin": true, + "root": true, + "default": true, // the Redis ACL admin user — DELUSER default bricks the pod + "replication": true, +} + +// dbPrefixes / userPrefixes are the only prefixes a destroyable per-tenant +// identifier may carry. Longest-first so "dedicated_db_x" never strips as +// "db_" with tail "dedicated_..." (it can't — prefix match is from the start — +// but the order keeps the loop's intent obvious). +var ( + dbPrefixes = []string{"dedicated_db_", "db_"} + // "ded_" is the dedicated-Redis ACL user prefix (redis/dedident.go); + // "dedicated_usr_" the dedicated-Postgres role prefix; "usr_" everything else. + userPrefixes = []string{"dedicated_usr_", "usr_", "ded_"} +) + +// validTokenChars reports whether every byte of tok is in the conservative +// identifier charset [a-z0-9._-] (case-insensitive). UUIDs (dashed or dashless), +// "pool-" forms, and e2e cohort tokens all fit; SQL metacharacters, quotes, +// spaces, '%' and '*' wildcards do not. +func validTokenChars(tok string) bool { + for i := 0; i < len(tok); i++ { + c := tok[i] + switch { + case c >= 'a' && c <= 'z': + case c >= 'A' && c <= 'Z': + case c >= '0' && c <= '9': + case c == '-' || c == '_' || c == '.': + default: + return false + } + } + return true +} + +// CheckNamingToken validates the canonical naming token of a resource about to +// be destroyed (the resolved poolident.NamingToken — pool token or request +// token). It refuses empty, over-long, bad-charset, and reserved tokens. +func CheckNamingToken(token string) error { + switch { + case token == "": + return fmt.Errorf("%w: empty naming token", ErrRefused) + case len(token) > maxIdentifierLen: + return fmt.Errorf("%w: naming token %q exceeds %d bytes", ErrRefused, token, maxIdentifierLen) + case !validTokenChars(token): + return fmt.Errorf("%w: naming token %q contains characters outside [A-Za-z0-9._-]", ErrRefused, token) + case reservedTokens[strings.ToLower(token)]: + return fmt.Errorf("%w: naming token %q is a reserved system identifier", ErrRefused, token) + } + return nil +} + +// CheckDatabaseName validates a database name about to be passed to +// DROP DATABASE (or a Mongo dropDatabase). The name must carry a per-tenant +// db prefix, have a valid non-reserved tail, and never be a system database. +func CheckDatabaseName(name string) error { + return checkPrefixed("database", name, dbPrefixes, systemDatabases) +} + +// CheckUserName validates a role/user name about to be passed to DROP USER / +// DROP ROLE / Mongo dropUser / Redis ACL DELUSER. The name must carry a +// per-tenant usr prefix, have a valid non-reserved tail, and never be a +// system role. +func CheckUserName(name string) error { + return checkPrefixed("user", name, userPrefixes, systemRoles) +} + +// checkPrefixed is the shared prefix + tail + denylist validation. +func checkPrefixed(kind, name string, prefixes []string, denylist map[string]bool) error { + if denylist[strings.ToLower(name)] { + return fmt.Errorf("%w: %s %q is a protected system identifier", ErrRefused, kind, name) + } + if len(name) > maxIdentifierLen { + return fmt.Errorf("%w: %s %q exceeds %d bytes", ErrRefused, kind, name, maxIdentifierLen) + } + for _, p := range prefixes { + tail, ok := strings.CutPrefix(name, p) + if !ok { + continue + } + if tail == "" { + return fmt.Errorf("%w: %s %q has an empty token after prefix %q", ErrRefused, kind, name, p) + } + if !validTokenChars(tail) { + return fmt.Errorf("%w: %s %q has characters outside [A-Za-z0-9._-] after prefix %q", ErrRefused, kind, name, p) + } + if reservedTokens[strings.ToLower(tail)] { + return fmt.Errorf("%w: %s %q embeds a reserved system identifier", ErrRefused, kind, name) + } + return nil + } + return fmt.Errorf("%w: %s %q does not carry a per-tenant prefix (%s)", ErrRefused, kind, name, strings.Join(prefixes, ", ")) +} diff --git a/internal/dropguard/dropguard_test.go b/internal/dropguard/dropguard_test.go new file mode 100644 index 0000000..b04e336 --- /dev/null +++ b/internal/dropguard/dropguard_test.go @@ -0,0 +1,183 @@ +package dropguard + +import ( + "errors" + "strings" + "testing" +) + +func TestCheckNamingToken_AcceptsEveryLegitimateForm(t *testing.T) { + valid := []string{ + "96edf9eed8ed42929036b63298ec5b2b", // dashless 32-hex (observed prod form) + "96edf9ee-d8ed-4292-9036-b63298ec5b2b", // uuid.String() with dashes + "pool-96edf9ee-d8ed-4292-9036-b63298ec5b2b", // hot-pool synthetic token + "pool96edf9eed8ed42929036b63298ec5b2b", // mongo canonical (dash-stripped pool token) + "a1b2c3d4", // legacy short hex + "tok", // unit-test / internal short token + "e2e-tok", // e2e cohort shape + "E2E-Cohort.1", // mixed case + dot + } + for _, tok := range valid { + if err := CheckNamingToken(tok); err != nil { + t.Errorf("CheckNamingToken(%q): unexpected refusal: %v", tok, err) + } + } +} + +func TestCheckNamingToken_RefusesDangerousForms(t *testing.T) { + invalid := []string{ + "", // empty — would derive db_ / a *:* scan prefix + "postgres", // system database + "template0", // system database + "template1", // system database + "instant_customers", // the shared customer cluster's own database + "instant_platform", // the platform database + "INSTANODE_ADMIN", // admin role, case-insensitive + "instant_cust", // provisioner admin role + "doadmin", // DO managed-PG admin + "admin", // mongo system db / generic admin + "default", // redis ACL admin user + "root", // generic admin + "db_*", // wildcard + "tok; DROP DATABASE x", // SQL metacharacters + "a b", // whitespace + `a"b`, // quote + "%", // redis SCAN wildcard material + strings.Repeat("a", 64), // over-long + } + for _, tok := range invalid { + err := CheckNamingToken(tok) + if err == nil { + t.Errorf("CheckNamingToken(%q): expected refusal, got nil", tok) + continue + } + if !errors.Is(err, ErrRefused) { + t.Errorf("CheckNamingToken(%q): refusal not wrapped in ErrRefused: %v", tok, err) + } + } +} + +func TestCheckDatabaseName_AcceptsPerTenantForms(t *testing.T) { + valid := []string{ + "db_96edf9eed8ed42929036b63298ec5b2b", + "db_96edf9ee-d8ed-4292-9036-b63298ec5b2b", + "db_pool-96edf9ee-d8ed-4292-9036-b63298ec5b2b", + "db_pool96edf9eed8ed42929036b63298ec5b2b", // mongo canonical pool form + "db_96edf9eed8ed", // mongo legacy 12-char form + "dedicated_db_96edf9eed8ed42929036b63298ec5b2b", + "db_tok", // unit-test shape + } + for _, name := range valid { + if err := CheckDatabaseName(name); err != nil { + t.Errorf("CheckDatabaseName(%q): unexpected refusal: %v", name, err) + } + } +} + +func TestCheckDatabaseName_RefusesSystemAndMisshapenNames(t *testing.T) { + invalid := []string{ + "postgres", // THE truehomie nightmare targets + "template0", + "template1", + "instant_customers", + "Instant_Customers", // case-insensitive denylist + "instant_platform", + "admin", // mongo system dbs + "local", + "config", + "", // empty + "db_", // prefix with empty token + "db_postgres", // reserved token behind a valid prefix + "db_instant_customers", + "96edf9eed8ed42929036b63298ec5b2b", // missing prefix entirely + "customers", // arbitrary non-tenant name + `db_x"; DROP DATABASE "postgres`, // injection-shaped + "db_" + strings.Repeat("a", 64), // over-long + "dedicated_db_", // dedicated prefix, empty token + } + for _, name := range invalid { + err := CheckDatabaseName(name) + if err == nil { + t.Errorf("CheckDatabaseName(%q): expected refusal, got nil", name) + continue + } + if !errors.Is(err, ErrRefused) { + t.Errorf("CheckDatabaseName(%q): refusal not wrapped in ErrRefused: %v", name, err) + } + } +} + +func TestCheckUserName_AcceptsPerTenantForms(t *testing.T) { + valid := []string{ + "usr_96edf9eed8ed42929036b63298ec5b2b", + "usr_96edf9ee-d8ed-4292-9036-b63298ec5b2b", + "usr_pool-96edf9ee-d8ed-4292-9036-b63298ec5b2b", + "usr_96edf9ee", // redis legacy 8-char ACL form + "dedicated_usr_96edf9eed8ed42929036b63298ec5b2b", + "usr_tok", + } + for _, name := range valid { + if err := CheckUserName(name); err != nil { + t.Errorf("CheckUserName(%q): unexpected refusal: %v", name, err) + } + } +} + +func TestCheckUserName_RefusesSystemAndMisshapenNames(t *testing.T) { + invalid := []string{ + "postgres", + "instanode_admin", // the confirmed truehomie vector role + "instant_cust", + "doadmin", + "default", // redis ACL admin — DELUSER default bricks the shared pod + "admin", + "root", + "replication", + "", + "usr_", + "usr_postgres", + "usr_instanode_admin", + "96edf9eed8ed42929036b63298ec5b2b", // missing prefix + "usr_a b", + "usr_" + strings.Repeat("a", 64), + "dedicated_usr_", + } + for _, name := range invalid { + err := CheckUserName(name) + if err == nil { + t.Errorf("CheckUserName(%q): expected refusal, got nil", name) + continue + } + if !errors.Is(err, ErrRefused) { + t.Errorf("CheckUserName(%q): refusal not wrapped in ErrRefused: %v", name, err) + } + } +} + +// TestRegistryDenylistsAreSubsetsOfReservedTokens guards the rule-18 invariant: +// any identifier protected as a bare system database/role is ALSO refused when +// it appears as a naming-token (so a bug that flows the name into the token +// field is refused at the chokepoint too). Iterates the live registries, not a +// hand-typed list. +func TestRegistryDenylistsAreSubsetsOfReservedTokens(t *testing.T) { + for name := range systemDatabases { + // Mongo system dbs (admin/local/config) are protected as database + // names; admin is additionally a reserved token. local/config are + // harmless as tokens (db_local is tenant-shaped), so only assert the + // postgres-side names. + if name == "local" || name == "config" { + continue + } + if !reservedTokens[name] { + t.Errorf("systemDatabases[%q] is not in reservedTokens — a bug flowing it into the token field would not be refused at the chokepoint", name) + } + } + for name := range systemRoles { + if name == "replication" { + continue // not a plausible token; protected at the name layer + } + if !reservedTokens[name] { + t.Errorf("systemRoles[%q] is not in reservedTokens", name) + } + } +} diff --git a/internal/server/drop_chokepoint.go b/internal/server/drop_chokepoint.go index 981681b..5018713 100644 --- a/internal/server/drop_chokepoint.go +++ b/internal/server/drop_chokepoint.go @@ -47,11 +47,13 @@ import ( provisionerv1 "instant.dev/proto/provisioner/v1" "instant.dev/provisioner/internal/circuit" + "instant.dev/provisioner/internal/dropguard" + "instant.dev/provisioner/internal/poolident" ) // dropTotal counts every customer-data drop the provisioner performs through the // sanctioned chokepoint, labelled by resource_type, backend (shared|dedicated), -// and outcome (ok|error|breaker_open). A spike in this counter is the alertable +// and outcome (ok|error|breaker_open|refused). A spike in this counter is the alertable // signal that drops are happening at an abnormal rate (the truehomie incident // class). Eager-registered (NewCounterVec via promauto on the default registry) // so the series exists at /metrics before the first drop — but only label @@ -71,6 +73,12 @@ const ( dropBackendDedicated dropBackend = "dedicated" ) +// dropOutcomeRefused is the dropTotal outcome label for a drop the +// name-convention guard refused before any backend was invoked. A non-zero +// refused count is ALWAYS a bug or an attack: some caller asked the +// provisioner to destroy a target that no legitimate flow can name. +const dropOutcomeRefused = "refused" + // callerFromContext returns a best-effort identifier for the gRPC peer that // issued the request (e.g. "10.109.3.201:54422"), or "unknown" when the peer is // not available. This is the provisioner-side attribution that was MISSING in the @@ -100,6 +108,30 @@ func (s *Server) guardedDrop( resType := req.ResourceType.String() caller := callerFromContext(ctx) + // Name-convention guard (truehomie hardening, task D3): the canonical + // naming token derives EVERY identifier a backend destroys (db_, + // usr_, keyspace :*, namespace instant-customer-). + // Refuse the drop outright when the token is empty, malformed, or a + // reserved system identifier — a bug constructing a wrong target must die + // here, BEFORE any backend executes. Legitimate deprovision flows are + // unaffected: every platform-issued token form passes dropguard. + namingToken := poolident.NamingToken(req.Token, req.ProviderResourceId) + if guardErr := dropguard.CheckNamingToken(namingToken); guardErr != nil { + dropTotal.WithLabelValues(resType, string(backend), dropOutcomeRefused).Inc() + slog.Error("provisioner.drop.refused", + "event", "provisioner.drop.refused", + "token", req.Token, + "provider_resource_id", req.ProviderResourceId, + "naming_token", namingToken, + "resource_type", resType, + "backend", string(backend), + "request_id", req.RequestId, + "caller", caller, + "error", guardErr, + ) + return guardErr + } + // DDL-audit: the always-on, provisioner-side record of the drop. This is the // in-app equivalent of the cluster's log_statement='ddl' trap. slog.Info("provisioner.drop", diff --git a/internal/server/drop_chokepoint_test.go b/internal/server/drop_chokepoint_test.go index dfd8e3f..3147d81 100644 --- a/internal/server/drop_chokepoint_test.go +++ b/internal/server/drop_chokepoint_test.go @@ -19,6 +19,7 @@ import ( provisionerv1 "instant.dev/proto/provisioner/v1" "instant.dev/provisioner/internal/circuit" + "instant.dev/provisioner/internal/dropguard" ) // freshBreaker returns a closed breaker with a high threshold so a single @@ -102,6 +103,95 @@ func TestGuardedDrop_BreakerOpen_DoesNotInvokeBackend_AndRecordsBreakerOpen(t *t } } +func TestGuardedDrop_RefusedToken_DoesNotInvokeBackend_AndRecordsRefused(t *testing.T) { + s := &Server{breakers: circuit.NewBreakers()} + + cases := []struct { + name string + req *provisionerv1.DeprovisionRequest + }{ + {"reserved system token", dropReq("postgres")}, + {"reserved cluster db as token", dropReq("instant_customers")}, + {"admin role as token", dropReq("instanode_admin")}, + {"empty token, no pool marker", &provisionerv1.DeprovisionRequest{ + Token: "", + ProviderResourceId: "local:0", + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES, + RequestId: "req-test", + }}, + {"sql metacharacters", dropReq(`x"; DROP DATABASE "postgres`)}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + before := testutil.ToFloat64(dropTotal.WithLabelValues("RESOURCE_TYPE_POSTGRES", "shared", dropOutcomeRefused)) + called := false + err := s.guardedDrop(context.Background(), tc.req, dropBackendShared, freshBreaker(), func() error { + called = true + return nil + }) + if !errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("expected dropguard.ErrRefused, got %v", err) + } + if called { + t.Fatal("backend fn must NOT be invoked for a refused drop target") + } + after := testutil.ToFloat64(dropTotal.WithLabelValues("RESOURCE_TYPE_POSTGRES", "shared", dropOutcomeRefused)) + if after != before+1 { + t.Fatalf("refused counter: got %v want %v", after, before+1) + } + }) + } +} + +func TestGuardedDrop_PoolTokenFromPRID_IsValidated(t *testing.T) { + // A request whose provider_resource_id carries a pool-token marker resolves + // THAT token for naming — the guard must validate the resolved token, not + // the request token. A reserved pool-resolved token is refused even when + // the request token looks fine. + s := &Server{breakers: circuit.NewBreakers()} + req := &provisionerv1.DeprovisionRequest{ + Token: "96edf9eed8ed42929036b63298ec5b2b", + ProviderResourceId: "local:0;pooltok:instant_customers", + ResourceType: commonv1.ResourceType_RESOURCE_TYPE_POSTGRES, + RequestId: "req-test", + } + called := false + err := s.guardedDrop(context.Background(), req, dropBackendShared, freshBreaker(), func() error { + called = true + return nil + }) + if !errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("expected dropguard.ErrRefused for reserved pool-resolved token, got %v", err) + } + if called { + t.Fatal("backend fn must NOT be invoked for a refused pool-resolved token") + } +} + +func TestGuardedDrop_LegitimateForms_StillExecute(t *testing.T) { + // Regression guard for the auto-deploy invariant: every token shape a real + // flow produces (uuid, dashless hex, pool token, e2e cohort) must still + // reach the backend. + s := &Server{breakers: circuit.NewBreakers()} + for _, tok := range []string{ + "96edf9ee-d8ed-4292-9036-b63298ec5b2b", + "96edf9eed8ed42929036b63298ec5b2b", + "pool-96edf9ee-d8ed-4292-9036-b63298ec5b2b", + "e2e-tok", + } { + called := false + if err := s.guardedDrop(context.Background(), dropReq(tok), dropBackendShared, freshBreaker(), func() error { + called = true + return nil + }); err != nil { + t.Fatalf("guardedDrop(%q): unexpected error: %v", tok, err) + } + if !called { + t.Fatalf("guardedDrop(%q): backend fn was not invoked", tok) + } + } +} + func TestCallerFromContext_WithPeer_ReturnsAddr(t *testing.T) { ctx := peer.NewContext(context.Background(), &peer.Peer{ Addr: &net.TCPAddr{IP: net.IPv4(10, 109, 3, 201), Port: 54422}, diff --git a/scripts/check-migration-safety.sh b/scripts/check-migration-safety.sh new file mode 100755 index 0000000..ff31699 --- /dev/null +++ b/scripts/check-migration-safety.sh @@ -0,0 +1,173 @@ +#!/usr/bin/env bash +# check-migration-safety.sh — CI gate that flags DESTRUCTIVE DDL in migration +# files (truehomie-db DROP incident hardening, task D3, 2026-06-11). +# +# WHAT IT CATCHES +# DROP TABLE / DROP DATABASE / DROP SCHEMA / DROP COLUMN / DROP ROLE / +# DROP USER / TRUNCATE, and DELETE FROM statements with no WHERE clause — +# in any *.sql file under a migrations directory. +# +# HOW TO SHIP A LEGITIMATE DESTRUCTIVE MIGRATION +# Add an explicit acknowledgement marker comment ANYWHERE in the file: +# -- destructive: acknowledged +# e.g. -- destructive: acknowledged dropping legacy column replaced by mig 063 +# The marker forces the destruction to be a reviewed, deliberate act — the +# reviewer sees the reason next to the DDL — instead of a silent drift. +# +# PORTABILITY +# Self-contained bash + grep + awk; no repo-specific assumptions. To adopt in +# another repo, copy this file and add the two CI steps: +# bash scripts/check-migration-safety.sh --self-test +# bash scripts/check-migration-safety.sh +# Override the scan root with MIGRATION_SAFETY_ROOT (default: repo root). +# +# EXIT CODES: 0 = clean (or all destructive files acknowledged); 1 = violation; +# 2 = self-test failure. + +set -euo pipefail + +MARKER_REGEX='--[[:space:]]*destructive:[[:space:]]*acknowledged[[:space:]]+[^[:space:]]' + +# scan_file — prints a violation line per un-acknowledged destructive +# statement; returns 0 when the file is safe. +scan_file() { + local f="$1" + local violations="" + + # Acknowledged files pass wholesale: the marker is the reviewed waiver. + if grep -qiE -e "$MARKER_REGEX" "$f"; then + return 0 + fi + + # Plain destructive DDL patterns (case-insensitive, line-based; SQL comments + # starting with -- are stripped first so prose about DROP doesn't trip it). + local stripped + stripped="$(sed 's/--.*$//' "$f")" + + local pat + for pat in 'DROP[[:space:]]+TABLE' 'DROP[[:space:]]+DATABASE' 'DROP[[:space:]]+SCHEMA' \ + 'DROP[[:space:]]+COLUMN' 'DROP[[:space:]]+ROLE' 'DROP[[:space:]]+USER' \ + 'TRUNCATE'; do + if printf '%s' "$stripped" | grep -qiE "$pat"; then + violations="${violations} - $(printf '%s' "$pat" | sed 's/\[\[:space:\]\]+/ /g')\n" + fi + done + + # DELETE FROM without a WHERE in the same statement (statements split on ';'). + # awk joins the comment-stripped file into one line, splits on ';', and flags + # any statement that contains DELETE FROM but no WHERE. + if printf '%s' "$stripped" | awk ' + { gsub(/\r/, ""); buf = buf " " $0 } + END { + n = split(buf, stmts, ";") + for (i = 1; i <= n; i++) { + s = toupper(stmts[i]) + if (s ~ /DELETE[ \t]+FROM/ && s !~ /WHERE/) { exit 0 } + } + exit 1 + }'; then + violations="${violations} - DELETE FROM without WHERE\n" + fi + + if [ -n "$violations" ]; then + printf 'VIOLATION %s\n%b' "$f" "$violations" + return 1 + fi + return 0 +} + +run_scan() { + local root="${MIGRATION_SAFETY_ROOT:-.}" + local failed=0 scanned=0 f + + # Every *.sql under any directory whose path contains "migrations". + while IFS= read -r f; do + scanned=$((scanned + 1)) + if ! scan_file "$f"; then + failed=1 + fi + done < <(find "$root" -type f -name '*.sql' -path '*migrations*' \ + -not -path '*/vendor/*' -not -path '*/node_modules/*' -not -path '*/.git/*' | sort) + + if [ "$failed" -ne 0 ]; then + echo "" + echo "Migration-safety gate FAILED: destructive DDL found without an acknowledgement marker." + echo "If the destruction is intentional and reviewed, add to the migration file:" + echo " -- destructive: acknowledged " + echo "(truehomie-db DROP incident hardening — see provisioner/internal/dropguard)" + return 1 + fi + echo "migration-safety: OK ($scanned migration file(s) scanned, 0 unacknowledged destructive statements)" + return 0 +} + +self_test() { + local tmp + tmp="$(mktemp -d)" + trap 'rm -rf "$tmp"' RETURN + mkdir -p "$tmp/migrations" + + local rc=0 + + # 1. Destructive without marker → must FAIL. + cat > "$tmp/migrations/001_bad.sql" <<'SQL' +ALTER TABLE foo ADD COLUMN bar TEXT; +DROP TABLE legacy_widgets; +SQL + if MIGRATION_SAFETY_ROOT="$tmp" run_scan >/dev/null 2>&1; then + echo "self-test FAIL: unacknowledged DROP TABLE was not flagged"; rc=2 + fi + + # 2. Destructive WITH marker → must PASS. + cat > "$tmp/migrations/001_bad.sql" <<'SQL' +-- destructive: acknowledged legacy_widgets replaced by widgets_v2 in mig 000 +DROP TABLE legacy_widgets; +SQL + if ! MIGRATION_SAFETY_ROOT="$tmp" run_scan >/dev/null 2>&1; then + echo "self-test FAIL: acknowledged DROP TABLE was flagged"; rc=2 + fi + + # 3. DELETE FROM without WHERE → must FAIL. + cat > "$tmp/migrations/001_bad.sql" <<'SQL' +DELETE FROM sessions; +SQL + if MIGRATION_SAFETY_ROOT="$tmp" run_scan >/dev/null 2>&1; then + echo "self-test FAIL: DELETE FROM without WHERE was not flagged"; rc=2 + fi + + # 4. DELETE FROM with WHERE → must PASS. + cat > "$tmp/migrations/001_bad.sql" <<'SQL' +DELETE FROM sessions WHERE expires_at < now(); +SQL + if ! MIGRATION_SAFETY_ROOT="$tmp" run_scan >/dev/null 2>&1; then + echo "self-test FAIL: DELETE FROM with WHERE was flagged"; rc=2 + fi + + # 5. Non-destructive migration → must PASS (incl. prose comments about DROP). + cat > "$tmp/migrations/001_bad.sql" <<'SQL' +-- this migration does NOT drop table anything; see DROP DATABASE docs. +CREATE TABLE widgets (id INT); +ALTER TABLE widgets ADD COLUMN name TEXT; +SQL + if ! MIGRATION_SAFETY_ROOT="$tmp" run_scan >/dev/null 2>&1; then + echo "self-test FAIL: non-destructive migration was flagged"; rc=2 + fi + + # 6. TRUNCATE without marker → must FAIL. + cat > "$tmp/migrations/001_bad.sql" <<'SQL' +TRUNCATE sessions; +SQL + if MIGRATION_SAFETY_ROOT="$tmp" run_scan >/dev/null 2>&1; then + echo "self-test FAIL: TRUNCATE was not flagged"; rc=2 + fi + + if [ "$rc" -eq 0 ]; then + echo "migration-safety: self-test OK (6/6 fixtures behaved)" + fi + return "$rc" +} + +case "${1:-}" in + --self-test) self_test ;; + *) run_scan ;; +esac From 76b5ab4224ea92e33adab55e94393c07a1450e95 Mon Sep 17 00:00:00 2001 From: Manas Srivastava <[email protected]> Date: Thu, 11 Jun 2026 01:04:32 +0530 Subject: [PATCH 2/3] feat(pool): chokepoint contract on the pool reaper's deprovision dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pool.deprovisionBacking is the ONE customer-infra drop dispatch that does not pass through server.guardedDrop (no gRPC request). Give it the same contract: dropguard naming-token validation (refusal = provisioner.drop.refused) and the `provisioner.drop` audit event emitted BEFORE the backend executes, with caller="pool_reaper" and the proto-style resource_type strings so one NR query covers RPC drops and pool-reap drops alike. Without this, every pool reap of a failed postgres item would page the new customer-db-destructive-ddl NR alert as an unsanctioned DROP (and was itself an un-audited drop path). Test fixture fix: fakeRows.Scan filled every string dest with "postgres", which dropguard now (correctly) refuses as a pool naming token — fill positionally so pool_token carries a valid pool-token shape. Co-Authored-By: Claude Fable 5 --- internal/pool/dropguard_test.go | 59 ++++++++++++++++++++++++++++ internal/pool/manager.go | 44 +++++++++++++++++++++ internal/pool/manager_reaper_test.go | 21 ++++++++-- 3 files changed, 120 insertions(+), 4 deletions(-) create mode 100644 internal/pool/dropguard_test.go diff --git a/internal/pool/dropguard_test.go b/internal/pool/dropguard_test.go new file mode 100644 index 0000000..cf7c37d --- /dev/null +++ b/internal/pool/dropguard_test.go @@ -0,0 +1,59 @@ +package pool + +// dropguard_test.go — name-convention guard + DDL-audit tests for the pool +// reaper's deprovisionBacking dispatch (truehomie hardening, task D3). The +// pool reaper is the one customer-infra drop path that does not pass through +// server.guardedDrop, so it carries its own copy of the chokepoint contract. + +import ( + "context" + "errors" + "strings" + "testing" + + "instant.dev/provisioner/internal/dropguard" +) + +// TestDeprovisionBacking_RefusedToken_NeverDispatches asserts a reserved or +// malformed naming token is refused BEFORE any backend dispatch. The Manager +// has nil backends: if the guard let the call through, the dispatch would +// panic — a returned ErrRefused proves the early return. +func TestDeprovisionBacking_RefusedToken_NeverDispatches(t *testing.T) { + m := &Manager{} + for _, tok := range []string{"postgres", "instant_customers", "", "a b"} { + err := m.deprovisionBacking(context.Background(), "postgres", tok, "") + if !errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("deprovisionBacking(token=%q): expected dropguard.ErrRefused, got %v", tok, err) + } + } +} + +// TestDeprovisionBacking_ValidToken_EmitsAuditThenDispatches uses an unknown +// resource type so the dispatch lands on the error default (no backend +// needed): a valid token must pass the guard, emit the provisioner.drop audit +// event, and reach the dispatch switch. +func TestDeprovisionBacking_ValidToken_EmitsAuditThenDispatches(t *testing.T) { + m := &Manager{} + err := m.deprovisionBacking(context.Background(), "weird-type", "pool-96edf9ee-d8ed-4292-9036-b63298ec5b2b", "") + if err == nil || !strings.Contains(err.Error(), "unknown resource type") { + t.Fatalf("expected unknown-resource-type dispatch error after passing the guard, got %v", err) + } + if errors.Is(err, dropguard.ErrRefused) { + t.Fatalf("valid pool token must not be refused: %v", err) + } +} + +func TestPoolResourceTypeProto_MapsEveryPoolType(t *testing.T) { + want := map[string]string{ + "postgres": "RESOURCE_TYPE_POSTGRES", + "redis": "RESOURCE_TYPE_REDIS", + "mongodb": "RESOURCE_TYPE_MONGODB", + "queue": "RESOURCE_TYPE_QUEUE", + "somethingelse": "RESOURCE_TYPE_UNSPECIFIED", + } + for in, out := range want { + if got := poolResourceTypeProto(in); got != out { + t.Errorf("poolResourceTypeProto(%q) = %q, want %q", in, got, out) + } + } +} diff --git a/internal/pool/manager.go b/internal/pool/manager.go index ae3ddc3..ead91e0 100644 --- a/internal/pool/manager.go +++ b/internal/pool/manager.go @@ -25,6 +25,7 @@ import ( "instant.dev/provisioner/internal/backend/postgres" "instant.dev/provisioner/internal/backend/queue" "instant.dev/provisioner/internal/backend/redis" + "instant.dev/provisioner/internal/dropguard" ) // Item is a pre-provisioned resource claimed from the pool. @@ -416,6 +417,30 @@ func (m *Manager) reportStuckAssigned(ctx context.Context) { // backends derive the real db_/usr_/keyspace names from the naming token, so // passing the pool_token here targets exactly the infra the pool created. func (m *Manager) deprovisionBacking(ctx context.Context, resourceType, token, providerResourceID string) error { + // Name-convention guard + DDL-audit (truehomie hardening, task D3). The + // pool reaper is the ONE customer-infra drop dispatch that does not pass + // through server.guardedDrop (it has no gRPC request), so it carries its + // own copy of the chokepoint contract: validate the naming token, then + // emit the same `provisioner.drop` audit event BEFORE the backend executes + // — the NR DDL-trap alert correlates shared-cluster DROP statements + // against these events, and an un-logged drop path would page as + // unsanctioned. + if guardErr := dropguard.CheckNamingToken(token); guardErr != nil { + slog.Error("provisioner.drop.refused", + "event", "provisioner.drop.refused", "site", "pool.deprovisionBacking", + "token", token, "provider_resource_id", providerResourceID, + "resource_type", poolResourceTypeProto(resourceType), "error", guardErr) + return fmt.Errorf("pool.deprovisionBacking: %w", guardErr) + } + slog.Info("provisioner.drop", + "event", "provisioner.drop", + "token", token, + "provider_resource_id", providerResourceID, + "resource_type", poolResourceTypeProto(resourceType), + "backend", "shared", + "request_id", "", + "caller", "pool_reaper", + ) switch resourceType { case "postgres": return m.postgresB.Deprovision(ctx, token, providerResourceID) @@ -720,3 +745,22 @@ func (m *Manager) migrate(ctx context.Context) error { // Keep rand imported for the uuid package's internal use. var _ = rand.Reader + +// poolResourceTypeProto maps a pool_items.resource_type value to the proto +// enum string used by server.guardedDrop's audit events, so one NR query +// (resource_type = 'RESOURCE_TYPE_POSTGRES' …) covers RPC drops and pool-reap +// drops alike. +func poolResourceTypeProto(resourceType string) string { + switch resourceType { + case "postgres": + return "RESOURCE_TYPE_POSTGRES" + case "redis": + return "RESOURCE_TYPE_REDIS" + case "mongodb": + return "RESOURCE_TYPE_MONGODB" + case "queue": + return "RESOURCE_TYPE_QUEUE" + default: + return "RESOURCE_TYPE_UNSPECIFIED" + } +} diff --git a/internal/pool/manager_reaper_test.go b/internal/pool/manager_reaper_test.go index 91dddac..fe3429f 100644 --- a/internal/pool/manager_reaper_test.go +++ b/internal/pool/manager_reaper_test.go @@ -322,13 +322,26 @@ func (r *fakeRows) Scan(dest ...any) error { if r.scanErrOn != 0 && r.delivered == r.scanErrOn { return r.scanErr } - // Populate string destinations with a placeholder so a non-error Scan in - // reportStuckAssigned (resource_type, count) does not panic. count is an int - // destination there; tolerate both. + // Populate string destinations positionally. reapFailed scans + // (id, resource_type, provider_resource_id, pool_token): resource_type must + // stay "postgres" so the dispatch routes to the tracked backend, but the + // pool_token (4th string) must be a VALID pool-token shape — dropguard + // (truehomie hardening, task D3) refuses "postgres" as a naming token, which + // would short-circuit deprovisionBacking before the backend is invoked. + // reportStuckAssigned scans (resource_type, count); count is an int. + sIdx := 0 for _, d := range dest { switch v := d.(type) { case *string: - *v = "postgres" + switch sIdx { + case 2: + *v = "local:0" + case 3: + *v = "pool-96edf9ee-d8ed-4292-9036-b63298ec5b2b" + default: + *v = "postgres" + } + sIdx++ case *int: *v = 0 } From 61dc69ba24fddb5dea8271cc94d47d023ec990d9 Mon Sep 17 00:00:00 2001 From: Manas Srivastava <[email protected]> Date: Thu, 11 Jun 2026 01:13:52 +0530 Subject: [PATCH 3/3] fix(dropguard): identifier length bound is an absurdity bound (128), not postgres's 63 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The live-cluster CI suite provisions tokens like tok_ (>63 bytes with the usr_ prefix) — postgres truncates such identifiers CONSISTENTLY on CREATE and DROP, so they round-trip fine in reality, and the 63-byte refusal wedged their Deprovision (4 coverage-job test failures). Keep the cap purely as an absurdity bound at 128. Verified the four failing live tests pass against a real postgres 16. Co-Authored-By: Claude Fable 5 --- internal/dropguard/dropguard.go | 12 ++++++--- internal/dropguard/dropguard_test.go | 40 ++++++++++++++-------------- 2 files changed, 29 insertions(+), 23 deletions(-) diff --git a/internal/dropguard/dropguard.go b/internal/dropguard/dropguard.go index ae9ad8a..3683b8d 100644 --- a/internal/dropguard/dropguard.go +++ b/internal/dropguard/dropguard.go @@ -46,9 +46,15 @@ import ( // errors.Is() a dropguard refusal regardless of the message text. var ErrRefused = errors.New("dropguard: destructive target refused") -// maxIdentifierLen bounds every validated identifier. Postgres truncates -// identifiers at 63 bytes; nothing legitimate is longer. -const maxIdentifierLen = 63 +// maxIdentifierLen bounds every validated identifier. This is an ABSURDITY +// bound only, deliberately looser than Postgres's own 63-byte identifier +// truncation: Postgres truncates a long name CONSISTENTLY on CREATE and DROP +// (so long-token resources — e.g. the live-cluster CI suite's +// tok_ tokens — round-trip correctly), and Mongo allows +// 64-byte database names. Refusing at 63 would wedge those legitimate +// deprovisions; only a name far beyond anything any naming scheme produces is +// refused. +const maxIdentifierLen = 128 // reservedTokens are naming tokens that must never be destroyed even when the // charset would otherwise allow them (e.g. a bug that flows a database or role diff --git a/internal/dropguard/dropguard_test.go b/internal/dropguard/dropguard_test.go index b04e336..b8edb73 100644 --- a/internal/dropguard/dropguard_test.go +++ b/internal/dropguard/dropguard_test.go @@ -26,24 +26,24 @@ func TestCheckNamingToken_AcceptsEveryLegitimateForm(t *testing.T) { func TestCheckNamingToken_RefusesDangerousForms(t *testing.T) { invalid := []string{ - "", // empty — would derive db_ / a *:* scan prefix - "postgres", // system database - "template0", // system database - "template1", // system database - "instant_customers", // the shared customer cluster's own database - "instant_platform", // the platform database - "INSTANODE_ADMIN", // admin role, case-insensitive - "instant_cust", // provisioner admin role - "doadmin", // DO managed-PG admin - "admin", // mongo system db / generic admin - "default", // redis ACL admin user - "root", // generic admin - "db_*", // wildcard - "tok; DROP DATABASE x", // SQL metacharacters - "a b", // whitespace - `a"b`, // quote - "%", // redis SCAN wildcard material - strings.Repeat("a", 64), // over-long + "", // empty — would derive db_ / a *:* scan prefix + "postgres", // system database + "template0", // system database + "template1", // system database + "instant_customers", // the shared customer cluster's own database + "instant_platform", // the platform database + "INSTANODE_ADMIN", // admin role, case-insensitive + "instant_cust", // provisioner admin role + "doadmin", // DO managed-PG admin + "admin", // mongo system db / generic admin + "default", // redis ACL admin user + "root", // generic admin + "db_*", // wildcard + "tok; DROP DATABASE x", // SQL metacharacters + "a b", // whitespace + `a"b`, // quote + "%", // redis SCAN wildcard material + strings.Repeat("a", 129), // over-long (absurdity bound) } for _, tok := range invalid { err := CheckNamingToken(tok) @@ -92,7 +92,7 @@ func TestCheckDatabaseName_RefusesSystemAndMisshapenNames(t *testing.T) { "96edf9eed8ed42929036b63298ec5b2b", // missing prefix entirely "customers", // arbitrary non-tenant name `db_x"; DROP DATABASE "postgres`, // injection-shaped - "db_" + strings.Repeat("a", 64), // over-long + "db_" + strings.Repeat("a", 129), // over-long (absurdity bound) "dedicated_db_", // dedicated prefix, empty token } for _, name := range invalid { @@ -139,7 +139,7 @@ func TestCheckUserName_RefusesSystemAndMisshapenNames(t *testing.T) { "usr_instanode_admin", "96edf9eed8ed42929036b63298ec5b2b", // missing prefix "usr_a b", - "usr_" + strings.Repeat("a", 64), + "usr_" + strings.Repeat("a", 129), "dedicated_usr_", } for _, name := range invalid {