diff --git a/internal/handlers/resource.go b/internal/handlers/resource.go index 521f819..2413286 100644 --- a/internal/handlers/resource.go +++ b/internal/handlers/resource.go @@ -216,15 +216,16 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error { return respondError(c, fiber.StatusNotFound, "not_found", "Resource not found") } - // Idempotent DELETE (bug-bash #4/#12): a repeated DELETE (retry, double-click, - // concurrent call) must NOT re-soft-delete and, worse, re-deprovision the - // backend a second time. The first DELETE already tore it down — report - // success and do nothing. - if resource.Status == "deleted" { - return c.JSON(fiber.Map{"ok": true, "already_deleted": true, "id": resource.ID.String()}) - } - - if err := models.SoftDeleteResource(c.Context(), h.db, resource.ID); err != nil { + // Idempotent DELETE (bug-bash #4/#12, hardened post-review): a repeated + // DELETE (retry, double-click, CONCURRENT call) must NOT re-fire the + // destructive deprovision against already-gone backend infra. The original + // fix used a `status == "deleted"` pre-read, but that is a check-then-act + // TOCTOU: two concurrent DELETEs both read status='active' and both + // deprovision. The authoritative guard is the atomic status-gated UPDATE — + // only the call that actually transitions active→deleted (RowsAffected==1) + // proceeds to deprovision; any racer/retry gets a no-op success. + deleted, err := models.SoftDeleteResourceIfActive(c.Context(), h.db, resource.ID) + if err != nil { slog.Error("resource.delete.failed", "error", err, "resource_id", resource.ID, @@ -232,6 +233,12 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error { ) return respondError(c, fiber.StatusServiceUnavailable, "delete_failed", "Failed to delete resource") } + if !deleted { + // Already deleted (sequential retry) or a concurrent DELETE won the + // race. Either way the backend is being / was torn down by the winner — + // report idempotent success WITHOUT re-firing deprovision. + return c.JSON(fiber.Map{"ok": true, "already_deleted": true, "id": resource.ID.String()}) + } // Deprovision the physical resource. // Fail open: a provisioner error is logged but does not affect the 200 response. diff --git a/internal/models/coverage_resource_test.go b/internal/models/coverage_resource_test.go index 4ba75db..f53c96c 100644 --- a/internal/models/coverage_resource_test.go +++ b/internal/models/coverage_resource_test.go @@ -234,6 +234,34 @@ func TestSoftDeleteResource_Branches(t *testing.T) { require.ErrorContains(t, SoftDeleteResource(ctx, db2, uuid.New()), "boom") } +func TestSoftDeleteResourceIfActive_Branches(t *testing.T) { + ctx := context.Background() + + // Winner: the status-gated UPDATE flips a row → deleted==true. + db, mock := newMock(t) + mock.ExpectExec(`UPDATE resources SET status = 'deleted' WHERE id = \$1 AND status != 'deleted'`). + WillReturnResult(sqlmock.NewResult(0, 1)) + got, err := SoftDeleteResourceIfActive(ctx, db, uuid.New()) + require.NoError(t, err) + require.True(t, got, "a row was transitioned → deleted must be true") + + // Loser/retry: row already deleted, 0 rows affected → deleted==false + // (this is the guard that stops the second concurrent DELETE from + // re-firing the destructive deprovision). + db2, mock2 := newMock(t) + mock2.ExpectExec(`UPDATE resources SET status = 'deleted' WHERE id = \$1 AND status != 'deleted'`). + WillReturnResult(sqlmock.NewResult(0, 0)) + got2, err := SoftDeleteResourceIfActive(ctx, db2, uuid.New()) + require.NoError(t, err) + require.False(t, got2, "no row affected → deleted must be false (no deprovision)") + + // Error path. + db3, mock3 := newMock(t) + mock3.ExpectExec(`UPDATE resources SET status = 'deleted'`).WillReturnError(errors.New("boom")) + _, err = SoftDeleteResourceIfActive(ctx, db3, uuid.New()) + require.ErrorContains(t, err, "boom") +} + func TestPauseResource_Branches(t *testing.T) { ctx := context.Background() db, mock := newMock(t) diff --git a/internal/models/resource.go b/internal/models/resource.go index a816ebe..632814b 100644 --- a/internal/models/resource.go +++ b/internal/models/resource.go @@ -454,7 +454,9 @@ func SetWebhookHMACSecret(ctx context.Context, db *sql.DB, resourceID uuid.UUID, return nil } -// SoftDeleteResource marks a resource status as 'deleted'. +// SoftDeleteResource marks a resource status as 'deleted'. Used by the +// provision-rollback paths (db/cache/nosql/queue/storage/vector), which always +// operate on a row they just created — they don't need the concurrency guard. func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error { _, err := db.ExecContext(ctx, ` UPDATE resources SET status = 'deleted' WHERE id = $1 @@ -465,6 +467,28 @@ func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error { return nil } +// SoftDeleteResourceIfActive flips status to 'deleted' ONLY when the row is not +// already deleted, returning whether THIS call performed the transition. +// +// The `status != 'deleted'` guard makes a concurrent DELETE single-shot: when +// two requests race (both read the row before either commits), the UPDATE +// serializes at the row and exactly one reports deleted==true. The DELETE +// handler fires the DESTRUCTIVE deprovision RPC only on that winning call, so +// the physical backend is torn down once — never twice against already-gone +// infra (the truehomie-db DROP incident class). This is the authoritative +// idempotency guard for DELETE /resources/:id, replacing a status pre-read +// that had a check-then-act TOCTOU under concurrency. +func SoftDeleteResourceIfActive(ctx context.Context, db *sql.DB, id uuid.UUID) (bool, error) { + res, err := db.ExecContext(ctx, ` + UPDATE resources SET status = 'deleted' WHERE id = $1 AND status != 'deleted' + `, id) + if err != nil { + return false, fmt.Errorf("models.SoftDeleteResourceIfActive: %w", err) + } + n, _ := res.RowsAffected() + return n > 0, nil +} + // ErrResourceNotActive is returned by PauseResource when the row exists but // status != 'active'. The handler maps this to 409 conflict (already paused // or terminal). Distinct error type so the handler doesn't have to second-guess @@ -676,10 +700,11 @@ func UpdateProviderResourceID(ctx context.Context, db *sql.DB, resourceID uuid.U // resource to newTier and clears its TTL (expires_at = NULL). // // Called from the Razorpay subscription.charged webhook. Picks up two cases: -// 1) Resources that are already permanent (expires_at IS NULL) — a hobby -// user upgrading to pro: lift their existing resources to the new tier. -// 2) Resources still on anonymous TTL (expires_at > now()) — a freshly -// claimed user paying for the first time: clear the TTL + set tier. +// 1. Resources that are already permanent (expires_at IS NULL) — a hobby +// user upgrading to pro: lift their existing resources to the new tier. +// 2. Resources still on anonymous TTL (expires_at > now()) — a freshly +// claimed user paying for the first time: clear the TTL + set tier. +// // This is the second half of "pay from day one": claim transfers team // ownership but does NOT clear the TTL or change tier. Only payment does. //