From 8edd54bee6aea25e8832d1d93d7585ced3eda7c0 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Thu, 4 Jun 2026 01:41:52 +0530 Subject: [PATCH] =?UTF-8?q?fix(resource):=20atomic=20single-shot=20deprovi?= =?UTF-8?q?sion=20on=20DELETE=20=E2=80=94=20close=20double-deprovision=20T?= =?UTF-8?q?OCTOU?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Design-review follow-up to #229. The idempotent-DELETE fix used a `resource.Status == "deleted"` PRE-READ before SoftDeleteResource + deprovision — a check-then-act TOCTOU: two concurrent DELETEs both read status='active', both pass the pre-read, and BOTH fire the destructive provisioner Deprovision / storage Deprovision RPC against the same backend. That is the truehomie-db DROP incident class (an active customer DB dropped via a path that re-ran teardown). Fix: make the soft-delete the authoritative atomic guard. New models.SoftDeleteResourceIfActive does `UPDATE ... SET status='deleted' WHERE id=$1 AND status != 'deleted'` and returns RowsAffected>0. The handler deprovisions ONLY when this call won the transition (deleted==true); any racer or sequential retry gets deleted==false → idempotent {"ok":true,"already_deleted":true} with NO second deprovision. Removes the redundant pre-read (the rows-affected check subsumes it and is race-safe). SoftDeleteResource is kept unchanged for the ~18 provision-rollback callers that always operate on a just-created row. Tests: models.TestSoftDeleteResourceIfActive_Branches (sqlmock: won / no-op / error — 100% of the new fn). The existing TestResourceDelete_Idempotent_ DoubleDelete now exercises the rows-affected !deleted branch via a sequential retry (200 already_deleted, no re-deprovision). All changed handler lines covered. Verified vs real Postgres+Redis. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/handlers/resource.go | 25 ++++++++++------ internal/models/coverage_resource_test.go | 28 ++++++++++++++++++ internal/models/resource.go | 35 +++++++++++++++++++---- 3 files changed, 74 insertions(+), 14 deletions(-) diff --git a/internal/handlers/resource.go b/internal/handlers/resource.go index d91dc4ac..961860b9 100644 --- a/internal/handlers/resource.go +++ b/internal/handlers/resource.go @@ -216,15 +216,16 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error { return respondError(c, fiber.StatusNotFound, "not_found", "Resource not found") } - // Idempotent DELETE (bug-bash #4/#12): a repeated DELETE (retry, double-click, - // concurrent call) must NOT re-soft-delete and, worse, re-deprovision the - // backend a second time. The first DELETE already tore it down — report - // success and do nothing. - if resource.Status == "deleted" { - return c.JSON(fiber.Map{"ok": true, "already_deleted": true, "id": resource.ID.String()}) - } - - if err := models.SoftDeleteResource(c.Context(), h.db, resource.ID); err != nil { + // Idempotent DELETE (bug-bash #4/#12, hardened post-review): a repeated + // DELETE (retry, double-click, CONCURRENT call) must NOT re-fire the + // destructive deprovision against already-gone backend infra. The original + // fix used a `status == "deleted"` pre-read, but that is a check-then-act + // TOCTOU: two concurrent DELETEs both read status='active' and both + // deprovision. The authoritative guard is the atomic status-gated UPDATE — + // only the call that actually transitions active→deleted (RowsAffected==1) + // proceeds to deprovision; any racer/retry gets a no-op success. + deleted, err := models.SoftDeleteResourceIfActive(c.Context(), h.db, resource.ID) + if err != nil { slog.Error("resource.delete.failed", "error", err, "resource_id", resource.ID, @@ -232,6 +233,12 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error { ) return respondError(c, fiber.StatusServiceUnavailable, "delete_failed", "Failed to delete resource") } + if !deleted { + // Already deleted (sequential retry) or a concurrent DELETE won the + // race. Either way the backend is being / was torn down by the winner — + // report idempotent success WITHOUT re-firing deprovision. + return c.JSON(fiber.Map{"ok": true, "already_deleted": true, "id": resource.ID.String()}) + } // Deprovision the physical resource. // Fail open: a provisioner error is logged but does not affect the 200 response. diff --git a/internal/models/coverage_resource_test.go b/internal/models/coverage_resource_test.go index 4ba75dbd..f53c96c5 100644 --- a/internal/models/coverage_resource_test.go +++ b/internal/models/coverage_resource_test.go @@ -234,6 +234,34 @@ func TestSoftDeleteResource_Branches(t *testing.T) { require.ErrorContains(t, SoftDeleteResource(ctx, db2, uuid.New()), "boom") } +func TestSoftDeleteResourceIfActive_Branches(t *testing.T) { + ctx := context.Background() + + // Winner: the status-gated UPDATE flips a row → deleted==true. + db, mock := newMock(t) + mock.ExpectExec(`UPDATE resources SET status = 'deleted' WHERE id = \$1 AND status != 'deleted'`). + WillReturnResult(sqlmock.NewResult(0, 1)) + got, err := SoftDeleteResourceIfActive(ctx, db, uuid.New()) + require.NoError(t, err) + require.True(t, got, "a row was transitioned → deleted must be true") + + // Loser/retry: row already deleted, 0 rows affected → deleted==false + // (this is the guard that stops the second concurrent DELETE from + // re-firing the destructive deprovision). + db2, mock2 := newMock(t) + mock2.ExpectExec(`UPDATE resources SET status = 'deleted' WHERE id = \$1 AND status != 'deleted'`). + WillReturnResult(sqlmock.NewResult(0, 0)) + got2, err := SoftDeleteResourceIfActive(ctx, db2, uuid.New()) + require.NoError(t, err) + require.False(t, got2, "no row affected → deleted must be false (no deprovision)") + + // Error path. + db3, mock3 := newMock(t) + mock3.ExpectExec(`UPDATE resources SET status = 'deleted'`).WillReturnError(errors.New("boom")) + _, err = SoftDeleteResourceIfActive(ctx, db3, uuid.New()) + require.ErrorContains(t, err, "boom") +} + func TestPauseResource_Branches(t *testing.T) { ctx := context.Background() db, mock := newMock(t) diff --git a/internal/models/resource.go b/internal/models/resource.go index a816ebe2..632814b3 100644 --- a/internal/models/resource.go +++ b/internal/models/resource.go @@ -454,7 +454,9 @@ func SetWebhookHMACSecret(ctx context.Context, db *sql.DB, resourceID uuid.UUID, return nil } -// SoftDeleteResource marks a resource status as 'deleted'. +// SoftDeleteResource marks a resource status as 'deleted'. Used by the +// provision-rollback paths (db/cache/nosql/queue/storage/vector), which always +// operate on a row they just created — they don't need the concurrency guard. func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error { _, err := db.ExecContext(ctx, ` UPDATE resources SET status = 'deleted' WHERE id = $1 @@ -465,6 +467,28 @@ func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error { return nil } +// SoftDeleteResourceIfActive flips status to 'deleted' ONLY when the row is not +// already deleted, returning whether THIS call performed the transition. +// +// The `status != 'deleted'` guard makes a concurrent DELETE single-shot: when +// two requests race (both read the row before either commits), the UPDATE +// serializes at the row and exactly one reports deleted==true. The DELETE +// handler fires the DESTRUCTIVE deprovision RPC only on that winning call, so +// the physical backend is torn down once — never twice against already-gone +// infra (the truehomie-db DROP incident class). This is the authoritative +// idempotency guard for DELETE /resources/:id, replacing a status pre-read +// that had a check-then-act TOCTOU under concurrency. +func SoftDeleteResourceIfActive(ctx context.Context, db *sql.DB, id uuid.UUID) (bool, error) { + res, err := db.ExecContext(ctx, ` + UPDATE resources SET status = 'deleted' WHERE id = $1 AND status != 'deleted' + `, id) + if err != nil { + return false, fmt.Errorf("models.SoftDeleteResourceIfActive: %w", err) + } + n, _ := res.RowsAffected() + return n > 0, nil +} + // ErrResourceNotActive is returned by PauseResource when the row exists but // status != 'active'. The handler maps this to 409 conflict (already paused // or terminal). Distinct error type so the handler doesn't have to second-guess @@ -676,10 +700,11 @@ func UpdateProviderResourceID(ctx context.Context, db *sql.DB, resourceID uuid.U // resource to newTier and clears its TTL (expires_at = NULL). // // Called from the Razorpay subscription.charged webhook. Picks up two cases: -// 1) Resources that are already permanent (expires_at IS NULL) — a hobby -// user upgrading to pro: lift their existing resources to the new tier. -// 2) Resources still on anonymous TTL (expires_at > now()) — a freshly -// claimed user paying for the first time: clear the TTL + set tier. +// 1. Resources that are already permanent (expires_at IS NULL) — a hobby +// user upgrading to pro: lift their existing resources to the new tier. +// 2. Resources still on anonymous TTL (expires_at > now()) — a freshly +// claimed user paying for the first time: clear the TTL + set tier. +// // This is the second half of "pay from day one": claim transfers team // ownership but does NOT clear the TTL or change tier. Only payment does. //