diff --git a/internal/jobs/customer_backup_reason_test.go b/internal/jobs/customer_backup_reason_test.go index 201702a..1566ea7 100644 --- a/internal/jobs/customer_backup_reason_test.go +++ b/internal/jobs/customer_backup_reason_test.go @@ -27,8 +27,19 @@ func TestBackupFailReason(t *testing.T) { {"no password supplied", errors.New("pg_dump: error: no password supplied"), "auth"}, {"role does not exist", errors.New(`FATAL: role "usr_abc" does not exist`), "auth"}, {"permission denied", errors.New("permission denied for table users"), "auth"}, + // R2 (2026-06-11) — the exact mongo + redis prod stderr from the P1 + // incident. Before the fix these classified "dump" (transient) and + // told the customer "briefly unreachable, we'll retry" for a + // non-self-healing credential failure, and never paged ops. + {"prod mongo SCRAM auth", errors.New(`mongodump: exit status 1 (stderr: Failed: can't create session: failed to connect to mongodb://usr_x:y@mongo.instanode.dev:27017/db_x?authSource=db_x: connection() error occurred during connection handshake: auth error: sasl conversation error: unable to authenticate using mechanism "SCRAM-SHA-256": connection(mongo.instanode.dev:27017[-3]) socket was unexpectedly closed: EOF)`), "auth"}, + {"mongo plain authentication failed", errors.New("mongodump: Failed: Authentication failed."), "auth"}, + {"mongo unable to authenticate", errors.New("unable to authenticate using mechanism"), "auth"}, + {"prod redis WRONGPASS", errors.New(`redis-cli --rdb: exit status 1 (stderr: AUTH failed: WRONGPASS invalid username-password pair or user is disabled.)`), "auth"}, + {"redis NOAUTH", errors.New("redis-cli --rdb: NOAUTH Authentication required."), "auth"}, {"server unavailable is transient", errors.New("pg_dump: server unavailable"), "dump"}, {"connection refused is transient", errors.New("connection refused"), "dump"}, + {"mongo connection refused is transient", errors.New("mongodump: Failed: can't create session: connection refused"), "dump"}, + {"redis timeout is transient", errors.New("redis-cli --rdb: Could not connect to Redis: Connection timed out"), "dump"}, {"nil err defaults to dump", nil, "dump"}, } for _, c := range cases { diff --git a/internal/jobs/customer_backup_runner.go b/internal/jobs/customer_backup_runner.go index c6a5ce6..7b7f53d 100644 --- a/internal/jobs/customer_backup_runner.go +++ b/internal/jobs/customer_backup_runner.go @@ -348,10 +348,19 @@ func (w *CustomerBackupRunnerWorker) Work(ctx context.Context, job *river.Job[Cu // failure here is logged and the sweep proceeds (the pending-row scan // still drains the normal queue). func (w *CustomerBackupRunnerWorker) recoverStuckRows(ctx context.Context) { + // We DELIBERATELY do not touch started_at here. The column is + // `TIMESTAMPTZ NOT NULL` (api migration 031_backups.sql), so an explicit + // `started_at = NULL` violates the constraint and makes the UPDATE fail + // on EVERY tick (the row stays orphaned at 'running', recovery never + // works, and the log floods with stuck_row_recovery_failed). Leaving the + // stale started_at in place is correct: the re-claim in processBackup + // runs `SET started_at = now()`, so the value is overwritten the instant + // the row is re-picked, and the WHERE-clause floor below (started_at < + // now() - timeout) only ever re-matches a row that has been 'running' + // again past the timeout — never a freshly-reset 'pending' row. res, err := w.db.ExecContext(ctx, ` UPDATE resource_backups SET status = 'pending', - started_at = NULL, error_summary = 'recovered: runner pod lost before finalize — re-queued' WHERE status = 'running' AND started_at IS NOT NULL @@ -592,23 +601,43 @@ func (w *CustomerBackupRunnerWorker) processBackup(parentCtx context.Context, p // the api's internal refund endpoint so the team's daily counter is // credited. Scheduled backups don't burn the manual-counter so no // refund is needed. -// backupFailReason classifies a pg_dump failure into "auth" (the credential -// was rejected — password auth failed, missing role, no password supplied: -// credential drift that will NOT self-heal and is SLA-relevant) vs "dump" (any -// other pg_dump failure — DB briefly unreachable, timeout: transient, retried -// next run). The match is on Postgres' own error text, lower-cased so it's -// resilient to surrounding formatting. +// backupFailReason classifies a dump failure into "auth" (the credential was +// rejected — credential drift that will NOT self-heal and is SLA-relevant, +// PAGES ops) vs "dump" (any other failure — DB briefly unreachable, timeout: +// transient, retried next run). The match is on the dump tool's own error +// text, lower-cased so it's resilient to surrounding formatting. +// +// R2 (2026-06-11) — the matcher now spans all THREE dump tools, not just +// pg_dump. The 2026-06-11 P1 incident showed mongodump auth errors +// ("auth error: ... SCRAM-SHA-256") and redis-cli auth errors ("WRONGPASS", +// "NOAUTH") were silently bucketed as transient "dump" — telling the customer +// "briefly unreachable, we'll retry" for a credential failure that retrying +// can NEVER fix, and NOT paging ops on a non-self-healing condition. Each +// backend speaks its own auth dialect, so we match all of them here. func backupFailReason(err error) string { if err == nil { return "dump" } s := strings.ToLower(err.Error()) switch { + // Postgres (pg_dump) auth dialect. case strings.Contains(s, "password authentication failed"), strings.Contains(s, "authentication failed"), strings.Contains(s, "no password supplied"), strings.Contains(s, "role") && strings.Contains(s, "does not exist"), - strings.Contains(s, "permission denied for"): + strings.Contains(s, "permission denied for"), + // MongoDB (mongodump) auth dialect: the driver reports an + // "auth error" wrapping a SASL/SCRAM failure, and "authentication + // failed" on an outright user/db mismatch. + strings.Contains(s, "auth error"), + strings.Contains(s, "unable to authenticate"), + strings.Contains(s, "sasl"), + // Redis (redis-cli) auth dialect: WRONGPASS (bad user/pass), + // NOAUTH (server wants a password we didn't send), and the + // human-readable "invalid username-password pair". + strings.Contains(s, "wrongpass"), + strings.Contains(s, "noauth"), + strings.Contains(s, "invalid username-password"): return "auth" default: return "dump" diff --git a/internal/jobs/customer_backup_runner_test.go b/internal/jobs/customer_backup_runner_test.go index 88a3731..6cc5e9e 100644 --- a/internal/jobs/customer_backup_runner_test.go +++ b/internal/jobs/customer_backup_runner_test.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" "runtime" + "strings" "sync" "testing" "time" @@ -425,6 +426,82 @@ func TestRunner_RecoversStuckRunningRows(t *testing.T) { } } +// TestRecoverStuckRows_NeverBindsNullStartedAt is the regression test for the +// 2026-06-11 P1: recoverStuckRows shipped with `SET ... started_at = NULL`, +// but resource_backups.started_at is `TIMESTAMPTZ NOT NULL` (api migration +// 031_backups.sql). In prod that UPDATE failed on EVERY 30-60s tick with +// +// pq: null value in column "started_at" of relation "resource_backups" +// violates not-null constraint +// +// so stuck-row recovery NEVER worked and the log flooded. sqlmock does NOT +// enforce NOT-NULL, which is exactly why the prior regex-matcher test stayed +// green through the bug. This test inspects the literal SQL the runner emits +// for the recovery UPDATE and FAILS if it ever sets started_at to NULL again. +// It reds on the old code (which contained `started_at = NULL`) and greens on +// the fix (which drops the clause, leaving started_at untouched on re-queue). +func TestRecoverStuckRows_NeverBindsNullStartedAt(t *testing.T) { + var capturedRecoverySQL string + matcher := sqlmock.QueryMatcherFunc(func(expectedSQL, actualSQL string) error { + // The recovery UPDATE is the only statement that resets a row to + // 'pending'. Capture its exact text so we can assert on it below. + if strings.Contains(actualSQL, "SET status = 'pending'") && + strings.Contains(actualSQL, "WHERE status = 'running'") { + capturedRecoverySQL = actualSQL + } + // Defer to the default regex semantics so the rest of the tick's + // expectations match exactly as in the other tests. + return sqlmock.QueryMatcherRegexp.Match(expectedSQL, actualSQL) + }) + + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(matcher)) + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + mock.ExpectExec(`UPDATE resource_backups\s+SET status = 'pending'`). + WillReturnResult(sqlmock.NewResult(0, 1)) + // Pending sweep finds nothing else this tick. + mock.ExpectQuery(`SELECT b.id::text`). + WithArgs(backupBatchSize). + WillReturnRows(sqlmock.NewRows([]string{ + "id", "resource_id", "tier_at_backup", "backup_kind", + "token", "connection_url", "resource_type", "team_id", + })) + // Retention sweep loops the five fallback tier names. + for i := 0; i < 5; i++ { + mock.ExpectQuery(`SELECT id::text, s3_key\s+FROM resource_backups`). + WillReturnRows(sqlmock.NewRows([]string{"id", "s3_key"})) + } + + w := &CustomerBackupRunnerWorker{ + db: db, + store: newFakeBackupStore(), + pgDump: &fakePgDump{payload: []byte("x")}, + bucket: "instant-shared", + prefix: "backups", + aesKey: testAESKeyHex, + now: time.Now, + timeout: time.Minute, + batchN: backupBatchSize, + } + if err := w.Work(context.Background(), fakeRunnerJob()); err != nil { + t.Fatalf("Work: %v", err) + } + + if capturedRecoverySQL == "" { + t.Fatal("recovery UPDATE was never emitted — test did not exercise recoverStuckRows") + } + // The crux: the recovery UPDATE must NOT set started_at to NULL. The + // column is NOT NULL, so any `started_at = NULL` makes the statement + // fail in prod on every tick. Case-insensitive, whitespace-tolerant. + normalized := strings.ToLower(strings.Join(strings.Fields(capturedRecoverySQL), " ")) + if strings.Contains(normalized, "started_at = null") { + t.Errorf("recovery UPDATE binds NULL to NOT-NULL started_at — P1 regression:\n%s", capturedRecoverySQL) + } +} + // TestBackupObjectKey — pure function contract. func TestBackupObjectKey(t *testing.T) { got := backupObjectKey("backups", "tok-abc", "bk-uuid")