Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions internal/jobs/customer_backup_reason_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,19 @@ func TestBackupFailReason(t *testing.T) {
{"no password supplied", errors.New("pg_dump: error: no password supplied"), "auth"},
{"role does not exist", errors.New(`FATAL: role "usr_abc" does not exist`), "auth"},
{"permission denied", errors.New("permission denied for table users"), "auth"},
// R2 (2026-06-11) — the exact mongo + redis prod stderr from the P1
// incident. Before the fix these classified "dump" (transient) and
// told the customer "briefly unreachable, we'll retry" for a
// non-self-healing credential failure, and never paged ops.
{"prod mongo SCRAM auth", errors.New(`mongodump: exit status 1 (stderr: Failed: can't create session: failed to connect to mongodb://usr_x:y@mongo.instanode.dev:27017/db_x?authSource=db_x: connection() error occurred during connection handshake: auth error: sasl conversation error: unable to authenticate using mechanism "SCRAM-SHA-256": connection(mongo.instanode.dev:27017[-3]) socket was unexpectedly closed: EOF)`), "auth"},
{"mongo plain authentication failed", errors.New("mongodump: Failed: Authentication failed."), "auth"},
{"mongo unable to authenticate", errors.New("unable to authenticate using mechanism"), "auth"},
{"prod redis WRONGPASS", errors.New(`redis-cli --rdb: exit status 1 (stderr: AUTH failed: WRONGPASS invalid username-password pair or user is disabled.)`), "auth"},
{"redis NOAUTH", errors.New("redis-cli --rdb: NOAUTH Authentication required."), "auth"},
{"server unavailable is transient", errors.New("pg_dump: server unavailable"), "dump"},
{"connection refused is transient", errors.New("connection refused"), "dump"},
{"mongo connection refused is transient", errors.New("mongodump: Failed: can't create session: connection refused"), "dump"},
{"redis timeout is transient", errors.New("redis-cli --rdb: Could not connect to Redis: Connection timed out"), "dump"},
{"nil err defaults to dump", nil, "dump"},
}
for _, c := range cases {
Expand Down
45 changes: 37 additions & 8 deletions internal/jobs/customer_backup_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,10 +348,19 @@ func (w *CustomerBackupRunnerWorker) Work(ctx context.Context, job *river.Job[Cu
// failure here is logged and the sweep proceeds (the pending-row scan
// still drains the normal queue).
func (w *CustomerBackupRunnerWorker) recoverStuckRows(ctx context.Context) {
// We DELIBERATELY do not touch started_at here. The column is
// `TIMESTAMPTZ NOT NULL` (api migration 031_backups.sql), so an explicit
// `started_at = NULL` violates the constraint and makes the UPDATE fail
// on EVERY tick (the row stays orphaned at 'running', recovery never
// works, and the log floods with stuck_row_recovery_failed). Leaving the
// stale started_at in place is correct: the re-claim in processBackup
// runs `SET started_at = now()`, so the value is overwritten the instant
// the row is re-picked, and the WHERE-clause floor below (started_at <
// now() - timeout) only ever re-matches a row that has been 'running'
// again past the timeout — never a freshly-reset 'pending' row.
res, err := w.db.ExecContext(ctx, `
UPDATE resource_backups
SET status = 'pending',
started_at = NULL,
error_summary = 'recovered: runner pod lost before finalize — re-queued'
WHERE status = 'running'
AND started_at IS NOT NULL
Expand Down Expand Up @@ -592,23 +601,43 @@ func (w *CustomerBackupRunnerWorker) processBackup(parentCtx context.Context, p
// the api's internal refund endpoint so the team's daily counter is
// credited. Scheduled backups don't burn the manual-counter so no
// refund is needed.
// backupFailReason classifies a pg_dump failure into "auth" (the credential
// was rejected — password auth failed, missing role, no password supplied:
// credential drift that will NOT self-heal and is SLA-relevant) vs "dump" (any
// other pg_dump failure — DB briefly unreachable, timeout: transient, retried
// next run). The match is on Postgres' own error text, lower-cased so it's
// resilient to surrounding formatting.
// backupFailReason classifies a dump failure into "auth" (the credential was
// rejected — credential drift that will NOT self-heal and is SLA-relevant,
// PAGES ops) vs "dump" (any other failure — DB briefly unreachable, timeout:
// transient, retried next run). The match is on the dump tool's own error
// text, lower-cased so it's resilient to surrounding formatting.
//
// R2 (2026-06-11) — the matcher now spans all THREE dump tools, not just
// pg_dump. The 2026-06-11 P1 incident showed mongodump auth errors
// ("auth error: ... SCRAM-SHA-256") and redis-cli auth errors ("WRONGPASS",
// "NOAUTH") were silently bucketed as transient "dump" — telling the customer
// "briefly unreachable, we'll retry" for a credential failure that retrying
// can NEVER fix, and NOT paging ops on a non-self-healing condition. Each
// backend speaks its own auth dialect, so we match all of them here.
func backupFailReason(err error) string {
if err == nil {
return "dump"
}
s := strings.ToLower(err.Error())
switch {
// Postgres (pg_dump) auth dialect.
case strings.Contains(s, "password authentication failed"),
strings.Contains(s, "authentication failed"),
strings.Contains(s, "no password supplied"),
strings.Contains(s, "role") && strings.Contains(s, "does not exist"),
strings.Contains(s, "permission denied for"):
strings.Contains(s, "permission denied for"),
// MongoDB (mongodump) auth dialect: the driver reports an
// "auth error" wrapping a SASL/SCRAM failure, and "authentication
// failed" on an outright user/db mismatch.
strings.Contains(s, "auth error"),
strings.Contains(s, "unable to authenticate"),
strings.Contains(s, "sasl"),
// Redis (redis-cli) auth dialect: WRONGPASS (bad user/pass),
// NOAUTH (server wants a password we didn't send), and the
// human-readable "invalid username-password pair".
strings.Contains(s, "wrongpass"),
strings.Contains(s, "noauth"),
strings.Contains(s, "invalid username-password"):
return "auth"
default:
return "dump"
Expand Down
77 changes: 77 additions & 0 deletions internal/jobs/customer_backup_runner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"os"
"path/filepath"
"runtime"
"strings"
"sync"
"testing"
"time"
Expand Down Expand Up @@ -425,6 +426,82 @@ func TestRunner_RecoversStuckRunningRows(t *testing.T) {
}
}

// TestRecoverStuckRows_NeverBindsNullStartedAt is the regression test for the
// 2026-06-11 P1: recoverStuckRows shipped with `SET ... started_at = NULL`,
// but resource_backups.started_at is `TIMESTAMPTZ NOT NULL` (api migration
// 031_backups.sql). In prod that UPDATE failed on EVERY 30-60s tick with
//
// pq: null value in column "started_at" of relation "resource_backups"
// violates not-null constraint
//
// so stuck-row recovery NEVER worked and the log flooded. sqlmock does NOT
// enforce NOT-NULL, which is exactly why the prior regex-matcher test stayed
// green through the bug. This test inspects the literal SQL the runner emits
// for the recovery UPDATE and FAILS if it ever sets started_at to NULL again.
// It reds on the old code (which contained `started_at = NULL`) and greens on
// the fix (which drops the clause, leaving started_at untouched on re-queue).
func TestRecoverStuckRows_NeverBindsNullStartedAt(t *testing.T) {
var capturedRecoverySQL string
matcher := sqlmock.QueryMatcherFunc(func(expectedSQL, actualSQL string) error {
// The recovery UPDATE is the only statement that resets a row to
// 'pending'. Capture its exact text so we can assert on it below.
if strings.Contains(actualSQL, "SET status = 'pending'") &&
strings.Contains(actualSQL, "WHERE status = 'running'") {
capturedRecoverySQL = actualSQL
}
// Defer to the default regex semantics so the rest of the tick's
// expectations match exactly as in the other tests.
return sqlmock.QueryMatcherRegexp.Match(expectedSQL, actualSQL)
})

db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(matcher))
if err != nil {
t.Fatalf("sqlmock.New: %v", err)
}
defer db.Close()

mock.ExpectExec(`UPDATE resource_backups\s+SET status = 'pending'`).
WillReturnResult(sqlmock.NewResult(0, 1))
// Pending sweep finds nothing else this tick.
mock.ExpectQuery(`SELECT b.id::text`).
WithArgs(backupBatchSize).
WillReturnRows(sqlmock.NewRows([]string{
"id", "resource_id", "tier_at_backup", "backup_kind",
"token", "connection_url", "resource_type", "team_id",
}))
// Retention sweep loops the five fallback tier names.
for i := 0; i < 5; i++ {
mock.ExpectQuery(`SELECT id::text, s3_key\s+FROM resource_backups`).
WillReturnRows(sqlmock.NewRows([]string{"id", "s3_key"}))
}

w := &CustomerBackupRunnerWorker{
db: db,
store: newFakeBackupStore(),
pgDump: &fakePgDump{payload: []byte("x")},
bucket: "instant-shared",
prefix: "backups",
aesKey: testAESKeyHex,
now: time.Now,
timeout: time.Minute,
batchN: backupBatchSize,
}
if err := w.Work(context.Background(), fakeRunnerJob()); err != nil {
t.Fatalf("Work: %v", err)
}

if capturedRecoverySQL == "" {
t.Fatal("recovery UPDATE was never emitted — test did not exercise recoverStuckRows")
}
// The crux: the recovery UPDATE must NOT set started_at to NULL. The
// column is NOT NULL, so any `started_at = NULL` makes the statement
// fail in prod on every tick. Case-insensitive, whitespace-tolerant.
normalized := strings.ToLower(strings.Join(strings.Fields(capturedRecoverySQL), " "))
if strings.Contains(normalized, "started_at = null") {
t.Errorf("recovery UPDATE binds NULL to NOT-NULL started_at — P1 regression:\n%s", capturedRecoverySQL)
}
}

// TestBackupObjectKey — pure function contract.
func TestBackupObjectKey(t *testing.T) {
got := backupObjectKey("backups", "tok-abc", "bk-uuid")
Expand Down
Loading