Skip to content
25 changes: 16 additions & 9 deletions internal/handlers/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,22 +216,29 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error {
return respondError(c, fiber.StatusNotFound, "not_found", "Resource not found")
}

// Idempotent DELETE (bug-bash #4/#12): a repeated DELETE (retry, double-click,
// concurrent call) must NOT re-soft-delete and, worse, re-deprovision the
// backend a second time. The first DELETE already tore it down — report
// success and do nothing.
if resource.Status == "deleted" {
return c.JSON(fiber.Map{"ok": true, "already_deleted": true, "id": resource.ID.String()})
}

if err := models.SoftDeleteResource(c.Context(), h.db, resource.ID); err != nil {
// Idempotent DELETE (bug-bash #4/#12, hardened post-review): a repeated
// DELETE (retry, double-click, CONCURRENT call) must NOT re-fire the
// destructive deprovision against already-gone backend infra. The original
// fix used a `status == "deleted"` pre-read, but that is a check-then-act
// TOCTOU: two concurrent DELETEs both read status='active' and both
// deprovision. The authoritative guard is the atomic status-gated UPDATE —
// only the call that actually transitions active→deleted (RowsAffected==1)
// proceeds to deprovision; any racer/retry gets a no-op success.
deleted, err := models.SoftDeleteResourceIfActive(c.Context(), h.db, resource.ID)
if err != nil {
slog.Error("resource.delete.failed",
"error", err,
"resource_id", resource.ID,
"request_id", requestID,
)
return respondError(c, fiber.StatusServiceUnavailable, "delete_failed", "Failed to delete resource")
}
if !deleted {
// Already deleted (sequential retry) or a concurrent DELETE won the
// race. Either way the backend is being / was torn down by the winner —
// report idempotent success WITHOUT re-firing deprovision.
return c.JSON(fiber.Map{"ok": true, "already_deleted": true, "id": resource.ID.String()})
}

// Deprovision the physical resource.
// Fail open: a provisioner error is logged but does not affect the 200 response.
Expand Down
28 changes: 28 additions & 0 deletions internal/models/coverage_resource_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,34 @@ func TestSoftDeleteResource_Branches(t *testing.T) {
require.ErrorContains(t, SoftDeleteResource(ctx, db2, uuid.New()), "boom")
}

func TestSoftDeleteResourceIfActive_Branches(t *testing.T) {
ctx := context.Background()

// Winner: the status-gated UPDATE flips a row → deleted==true.
db, mock := newMock(t)
mock.ExpectExec(`UPDATE resources SET status = 'deleted' WHERE id = \$1 AND status != 'deleted'`).
WillReturnResult(sqlmock.NewResult(0, 1))
got, err := SoftDeleteResourceIfActive(ctx, db, uuid.New())
require.NoError(t, err)
require.True(t, got, "a row was transitioned → deleted must be true")

// Loser/retry: row already deleted, 0 rows affected → deleted==false
// (this is the guard that stops the second concurrent DELETE from
// re-firing the destructive deprovision).
db2, mock2 := newMock(t)
mock2.ExpectExec(`UPDATE resources SET status = 'deleted' WHERE id = \$1 AND status != 'deleted'`).
WillReturnResult(sqlmock.NewResult(0, 0))
got2, err := SoftDeleteResourceIfActive(ctx, db2, uuid.New())
require.NoError(t, err)
require.False(t, got2, "no row affected → deleted must be false (no deprovision)")

// Error path.
db3, mock3 := newMock(t)
mock3.ExpectExec(`UPDATE resources SET status = 'deleted'`).WillReturnError(errors.New("boom"))
_, err = SoftDeleteResourceIfActive(ctx, db3, uuid.New())
require.ErrorContains(t, err, "boom")
}

func TestPauseResource_Branches(t *testing.T) {
ctx := context.Background()
db, mock := newMock(t)
Expand Down
35 changes: 30 additions & 5 deletions internal/models/resource.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,9 @@ func SetWebhookHMACSecret(ctx context.Context, db *sql.DB, resourceID uuid.UUID,
return nil
}

// SoftDeleteResource marks a resource status as 'deleted'.
// SoftDeleteResource marks a resource status as 'deleted'. Used by the
// provision-rollback paths (db/cache/nosql/queue/storage/vector), which always
// operate on a row they just created — they don't need the concurrency guard.
func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error {
_, err := db.ExecContext(ctx, `
UPDATE resources SET status = 'deleted' WHERE id = $1
Expand All @@ -465,6 +467,28 @@ func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error {
return nil
}

// SoftDeleteResourceIfActive flips status to 'deleted' ONLY when the row is not
// already deleted, returning whether THIS call performed the transition.
//
// The `status != 'deleted'` guard makes a concurrent DELETE single-shot: when
// two requests race (both read the row before either commits), the UPDATE
// serializes at the row and exactly one reports deleted==true. The DELETE
// handler fires the DESTRUCTIVE deprovision RPC only on that winning call, so
// the physical backend is torn down once — never twice against already-gone
// infra (the truehomie-db DROP incident class). This is the authoritative
// idempotency guard for DELETE /resources/:id, replacing a status pre-read
// that had a check-then-act TOCTOU under concurrency.
func SoftDeleteResourceIfActive(ctx context.Context, db *sql.DB, id uuid.UUID) (bool, error) {
res, err := db.ExecContext(ctx, `
UPDATE resources SET status = 'deleted' WHERE id = $1 AND status != 'deleted'
`, id)
if err != nil {
return false, fmt.Errorf("models.SoftDeleteResourceIfActive: %w", err)
}
n, _ := res.RowsAffected()
return n > 0, nil
}

// ErrResourceNotActive is returned by PauseResource when the row exists but
// status != 'active'. The handler maps this to 409 conflict (already paused
// or terminal). Distinct error type so the handler doesn't have to second-guess
Expand Down Expand Up @@ -676,10 +700,11 @@ func UpdateProviderResourceID(ctx context.Context, db *sql.DB, resourceID uuid.U
// resource to newTier and clears its TTL (expires_at = NULL).
//
// Called from the Razorpay subscription.charged webhook. Picks up two cases:
// 1) Resources that are already permanent (expires_at IS NULL) — a hobby
// user upgrading to pro: lift their existing resources to the new tier.
// 2) Resources still on anonymous TTL (expires_at > now()) — a freshly
// claimed user paying for the first time: clear the TTL + set tier.
// 1. Resources that are already permanent (expires_at IS NULL) — a hobby
// user upgrading to pro: lift their existing resources to the new tier.
// 2. Resources still on anonymous TTL (expires_at > now()) — a freshly
// claimed user paying for the first time: clear the TTL + set tier.
//
// This is the second half of "pay from day one": claim transfers team
// ownership but does NOT clear the TTL or change tier. Only payment does.
//
Expand Down
Loading