InstaNode-dev · mastermanas805 · Jun 11, 2026 · Jun 11, 2026
diff --git a/internal/jobs/customer_backup_reason_test.go b/internal/jobs/customer_backup_reason_test.go
@@ -27,8 +27,19 @@ func TestBackupFailReason(t *testing.T) {
 		{"no password supplied", errors.New("pg_dump: error: no password supplied"), "auth"},
 		{"role does not exist", errors.New(`FATAL: role "usr_abc" does not exist`), "auth"},
 		{"permission denied", errors.New("permission denied for table users"), "auth"},
+		// R2 (2026-06-11) — the exact mongo + redis prod stderr from the P1
+		// incident. Before the fix these classified "dump" (transient) and
+		// told the customer "briefly unreachable, we'll retry" for a
+		// non-self-healing credential failure, and never paged ops.
+		{"prod mongo SCRAM auth", errors.New(`mongodump: exit status 1 (stderr: Failed: can't create session: failed to connect to mongodb://usr_x:y@mongo.instanode.dev:27017/db_x?authSource=db_x: connection() error occurred during connection handshake: auth error: sasl conversation error: unable to authenticate using mechanism "SCRAM-SHA-256":  connection(mongo.instanode.dev:27017[-3]) socket was unexpectedly closed: EOF)`), "auth"},
+		{"mongo plain authentication failed", errors.New("mongodump: Failed: Authentication failed."), "auth"},
+		{"mongo unable to authenticate", errors.New("unable to authenticate using mechanism"), "auth"},
+		{"prod redis WRONGPASS", errors.New(`redis-cli --rdb: exit status 1 (stderr: AUTH failed: WRONGPASS invalid username-password pair or user is disabled.)`), "auth"},
+		{"redis NOAUTH", errors.New("redis-cli --rdb: NOAUTH Authentication required."), "auth"},
 		{"server unavailable is transient", errors.New("pg_dump: server unavailable"), "dump"},
 		{"connection refused is transient", errors.New("connection refused"), "dump"},
+		{"mongo connection refused is transient", errors.New("mongodump: Failed: can't create session: connection refused"), "dump"},
+		{"redis timeout is transient", errors.New("redis-cli --rdb: Could not connect to Redis: Connection timed out"), "dump"},
 		{"nil err defaults to dump", nil, "dump"},
 	}
 	for _, c := range cases {

diff --git a/internal/jobs/customer_backup_runner.go b/internal/jobs/customer_backup_runner.go
@@ -348,10 +348,19 @@ func (w *CustomerBackupRunnerWorker) Work(ctx context.Context, job *river.Job[Cu
 // failure here is logged and the sweep proceeds (the pending-row scan
 // still drains the normal queue).
 func (w *CustomerBackupRunnerWorker) recoverStuckRows(ctx context.Context) {
+	// We DELIBERATELY do not touch started_at here. The column is
+	// `TIMESTAMPTZ NOT NULL` (api migration 031_backups.sql), so an explicit
+	// `started_at = NULL` violates the constraint and makes the UPDATE fail
+	// on EVERY tick (the row stays orphaned at 'running', recovery never
+	// works, and the log floods with stuck_row_recovery_failed). Leaving the
+	// stale started_at in place is correct: the re-claim in processBackup
+	// runs `SET started_at = now()`, so the value is overwritten the instant
+	// the row is re-picked, and the WHERE-clause floor below (started_at <
+	// now() - timeout) only ever re-matches a row that has been 'running'
+	// again past the timeout — never a freshly-reset 'pending' row.
 	res, err := w.db.ExecContext(ctx, `
 		UPDATE resource_backups
 		   SET status = 'pending',
-		       started_at = NULL,
 		       error_summary = 'recovered: runner pod lost before finalize — re-queued'
 		 WHERE status = 'running'
 		   AND started_at IS NOT NULL
@@ -592,23 +601,43 @@ func (w *CustomerBackupRunnerWorker) processBackup(parentCtx context.Context, p
 // the api's internal refund endpoint so the team's daily counter is
 // credited. Scheduled backups don't burn the manual-counter so no
 // refund is needed.
-// backupFailReason classifies a pg_dump failure into "auth" (the credential
-// was rejected — password auth failed, missing role, no password supplied:
-// credential drift that will NOT self-heal and is SLA-relevant) vs "dump" (any
-// other pg_dump failure — DB briefly unreachable, timeout: transient, retried
-// next run). The match is on Postgres' own error text, lower-cased so it's
-// resilient to surrounding formatting.
+// backupFailReason classifies a dump failure into "auth" (the credential was
+// rejected — credential drift that will NOT self-heal and is SLA-relevant,
+// PAGES ops) vs "dump" (any other failure — DB briefly unreachable, timeout:
+// transient, retried next run). The match is on the dump tool's own error
+// text, lower-cased so it's resilient to surrounding formatting.
+//
+// R2 (2026-06-11) — the matcher now spans all THREE dump tools, not just
+// pg_dump. The 2026-06-11 P1 incident showed mongodump auth errors
+// ("auth error: ... SCRAM-SHA-256") and redis-cli auth errors ("WRONGPASS",
+// "NOAUTH") were silently bucketed as transient "dump" — telling the customer
+// "briefly unreachable, we'll retry" for a credential failure that retrying
+// can NEVER fix, and NOT paging ops on a non-self-healing condition. Each
+// backend speaks its own auth dialect, so we match all of them here.
 func backupFailReason(err error) string {
 	if err == nil {
 		return "dump"
 	}
 	s := strings.ToLower(err.Error())
 	switch {
+	// Postgres (pg_dump) auth dialect.
 	case strings.Contains(s, "password authentication failed"),
 		strings.Contains(s, "authentication failed"),
 		strings.Contains(s, "no password supplied"),
 		strings.Contains(s, "role") && strings.Contains(s, "does not exist"),
-		strings.Contains(s, "permission denied for"):
+		strings.Contains(s, "permission denied for"),
+		// MongoDB (mongodump) auth dialect: the driver reports an
+		// "auth error" wrapping a SASL/SCRAM failure, and "authentication
+		// failed" on an outright user/db mismatch.
+		strings.Contains(s, "auth error"),
+		strings.Contains(s, "unable to authenticate"),
+		strings.Contains(s, "sasl"),
+		// Redis (redis-cli) auth dialect: WRONGPASS (bad user/pass),
+		// NOAUTH (server wants a password we didn't send), and the
+		// human-readable "invalid username-password pair".
+		strings.Contains(s, "wrongpass"),
+		strings.Contains(s, "noauth"),
+		strings.Contains(s, "invalid username-password"):
 		return "auth"
 	default:
 		return "dump"

diff --git a/internal/jobs/customer_backup_runner_test.go b/internal/jobs/customer_backup_runner_test.go
@@ -8,6 +8,7 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"strings"
 	"sync"
 	"testing"
 	"time"
@@ -425,6 +426,82 @@ func TestRunner_RecoversStuckRunningRows(t *testing.T) {
 	}
 }
 
+// TestRecoverStuckRows_NeverBindsNullStartedAt is the regression test for the
+// 2026-06-11 P1: recoverStuckRows shipped with `SET ... started_at = NULL`,
+// but resource_backups.started_at is `TIMESTAMPTZ NOT NULL` (api migration
+// 031_backups.sql). In prod that UPDATE failed on EVERY 30-60s tick with
+//
+//	pq: null value in column "started_at" of relation "resource_backups"
+//	violates not-null constraint
+//
+// so stuck-row recovery NEVER worked and the log flooded. sqlmock does NOT
+// enforce NOT-NULL, which is exactly why the prior regex-matcher test stayed
+// green through the bug. This test inspects the literal SQL the runner emits
+// for the recovery UPDATE and FAILS if it ever sets started_at to NULL again.
+// It reds on the old code (which contained `started_at = NULL`) and greens on
+// the fix (which drops the clause, leaving started_at untouched on re-queue).
+func TestRecoverStuckRows_NeverBindsNullStartedAt(t *testing.T) {
+	var capturedRecoverySQL string
+	matcher := sqlmock.QueryMatcherFunc(func(expectedSQL, actualSQL string) error {
+		// The recovery UPDATE is the only statement that resets a row to
+		// 'pending'. Capture its exact text so we can assert on it below.
+		if strings.Contains(actualSQL, "SET status = 'pending'") &&
+			strings.Contains(actualSQL, "WHERE status = 'running'") {
+			capturedRecoverySQL = actualSQL
+		}
+		// Defer to the default regex semantics so the rest of the tick's
+		// expectations match exactly as in the other tests.
+		return sqlmock.QueryMatcherRegexp.Match(expectedSQL, actualSQL)
+	})
+
+	db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(matcher))
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+
+	mock.ExpectExec(`UPDATE resource_backups\s+SET status = 'pending'`).
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	// Pending sweep finds nothing else this tick.
+	mock.ExpectQuery(`SELECT b.id::text`).
+		WithArgs(backupBatchSize).
+		WillReturnRows(sqlmock.NewRows([]string{
+			"id", "resource_id", "tier_at_backup", "backup_kind",
+			"token", "connection_url", "resource_type", "team_id",
+		}))
+	// Retention sweep loops the five fallback tier names.
+	for i := 0; i < 5; i++ {
+		mock.ExpectQuery(`SELECT id::text, s3_key\s+FROM resource_backups`).
+			WillReturnRows(sqlmock.NewRows([]string{"id", "s3_key"}))
+	}
+
+	w := &CustomerBackupRunnerWorker{
+		db:      db,
+		store:   newFakeBackupStore(),
+		pgDump:  &fakePgDump{payload: []byte("x")},
+		bucket:  "instant-shared",
+		prefix:  "backups",
+		aesKey:  testAESKeyHex,
+		now:     time.Now,
+		timeout: time.Minute,
+		batchN:  backupBatchSize,
+	}
+	if err := w.Work(context.Background(), fakeRunnerJob()); err != nil {
+		t.Fatalf("Work: %v", err)
+	}
+
+	if capturedRecoverySQL == "" {
+		t.Fatal("recovery UPDATE was never emitted — test did not exercise recoverStuckRows")
+	}
+	// The crux: the recovery UPDATE must NOT set started_at to NULL. The
+	// column is NOT NULL, so any `started_at = NULL` makes the statement
+	// fail in prod on every tick. Case-insensitive, whitespace-tolerant.
+	normalized := strings.ToLower(strings.Join(strings.Fields(capturedRecoverySQL), " "))
+	if strings.Contains(normalized, "started_at = null") {
+		t.Errorf("recovery UPDATE binds NULL to NOT-NULL started_at — P1 regression:\n%s", capturedRecoverySQL)
+	}
+}
+
 // TestBackupObjectKey — pure function contract.
 func TestBackupObjectKey(t *testing.T) {
 	got := backupObjectKey("backups", "tok-abc", "bk-uuid")