From 91a947b5c9ed27b9508202a2270d572e50534290 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Thu, 4 Jun 2026 21:09:52 +0530 Subject: [PATCH] fix(stacks): mark promote approval executed AFTER preflight, not before (#11) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Promote handler called MarkPromoteApprovalExecuted inside consumeApprovedPromote BEFORE the promote preflight (source-services fetch, image_ref check, target create/update, vault copy, env load) ran. Any preflight failure (412 missing_image_ref / no_services, 503 lookup/create/ env_load, 400 vault, 402 cap) therefore burned the single-use approval to 'executed' while the promote never launched — stranding the operator with a consumed, non-retryable approval and forcing a fresh email round-trip. Split consumeApprovedPromote into: - validateApprovedPromote — read-only checks (uuid, lookup, ownership, status, from/to/kind match, expiry), runs at the original position so an invalid approval still 4xxs early. Returns the row, no mutation. - markApprovedPromoteExecuted — the 'executed' CAS flip + audit, called ONLY after preflight fully succeeds, immediately before the runStackDeploy launch. A preflight failure now leaves the approval 'approved' and retryable. The single-use guarantee is preserved: MarkPromoteApprovalExecuted's CAS still returns 0 rows (409 approval_already_executed) on a concurrent double-consume. Twin promote (consumeApprovedTwin in twin.go) has its own consume path and is out of scope for this fix. Coverage: Symptom: preflight 412/503 strands a consumed (status='executed') approval Enumeration: rg MarkPromoteApprovalExecuted + consumeApprovedPromote call sites in stack.go Sites found: 1 (stack.go Promote; twin.go has a separate consumeApprovedTwin) Sites touched: 1 Coverage test: TestStackPromote_ApprovalID_PreflightFails_StaysApproved (missing_image_ref), TestStackPromote_ApprovalID_NoServices_StaysApproved, TestStackPromote_ApprovalID_HappyPath_ExecutesOnce (executed once + single-use), TestMarkApprovedPromoteExecuted_AlreadyExecuted_409 (CAS-miss arm), TestStackFinal_ConsumeApproved_ExecuteError_503 (updated for new call order) Live verified: awaiting post-merge auto-deploy (rule 14 build-SHA gate in CI) Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/handlers/export_final_test.go | 9 + internal/handlers/stack.go | 85 +++++-- internal/handlers/stack_final_test.go | 15 +- .../stack_promote_approval_preflight_test.go | 226 ++++++++++++++++++ 4 files changed, 314 insertions(+), 21 deletions(-) create mode 100644 internal/handlers/stack_promote_approval_preflight_test.go diff --git a/internal/handlers/export_final_test.go b/internal/handlers/export_final_test.go index 4d68f072..fab079f8 100644 --- a/internal/handlers/export_final_test.go +++ b/internal/handlers/export_final_test.go @@ -43,6 +43,15 @@ func (h *StackHandler) CheckStackDeployLimitForTest(ctx context.Context, fp stri return h.checkStackDeployLimit(ctx, fp) } +// MarkApprovedPromoteExecutedForTest re-exports the package-private +// markApprovedPromoteExecuted so the approval_already_executed CAS-miss arm +// (an approval flipped to 'executed' between validate and execute — only +// reachable under a concurrent double-consume in prod) can be driven +// deterministically by pre-seeding the row as already executed. +func (h *StackHandler) MarkApprovedPromoteExecutedForTest(c *fiber.Ctx, row *models.PromoteApproval, from, to string) error { + return h.markApprovedPromoteExecuted(c, row, from, to) +} + // ── agent_action.go empty-arg default-branch coverage ──────────────────────── // These re-exports drive the `if x == "" { x = "..." }` default branches that // the happy-path callers (always passing a non-empty value) leave open. diff --git a/internal/handlers/stack.go b/internal/handlers/stack.go index c4d1b1af..e5afc2a8 100644 --- a/internal/handlers/stack.go +++ b/internal/handlers/stack.go @@ -2010,10 +2010,19 @@ func (h *StackHandler) Promote(c *fiber.Ctx) error { // row for THIS team, with matching from/to/kind. The worker (when it // lands) will short-circuit this branch and run the promote on its // own poll cadence; until then this path is the manual trigger. + // + // #11 (sweep 2026-06-04): we only VALIDATE here. The single-use + // 'executed' flip is deferred to markApprovedPromoteExecuted, called + // just before runStackDeploy below — so a preflight failure (412/503/ + // 400/402 in Steps A–C) leaves the approval 'approved' and retryable + // instead of burning it on a promote that never ran. + var approvalRow *models.PromoteApproval if body.ApprovalID != "" { - if err := h.consumeApprovedPromote(c, team, body, from, to, models.PromoteApprovalKindStack); err != nil { - return err + row, vErr := h.validateApprovedPromote(c, team, body, from, to, models.PromoteApprovalKindStack) + if vErr != nil { + return vErr } + approvalRow = row } // Step A: Pull the source's services. If ANY service is missing @@ -2307,6 +2316,20 @@ func (h *StackHandler) Promote(c *fiber.Ctx) error { }) } + // #11 (sweep 2026-06-04): preflight (Steps A–C above) has fully + // succeeded — every failure path before this point returns early, so + // reaching here means the promote WILL launch. Burn the single-use + // approval to 'executed' now, immediately before the deploy launch. + // A failure here (503 execute_failed / 409 already_executed) returns + // before the launch; the target stack rows are already written but the + // approval state is the authoritative single-use gate, so re-calling + // with the same approval is the operator's retry path. + if approvalRow != nil { + if execErr := h.markApprovedPromoteExecuted(c, approvalRow, from, to); execErr != nil { + return execErr + } + } + // Step D: Hand off to the goroutine that calls the provider with // SkipBuild=true. The dashboard's EnvironmentsGrid polls /family so it // picks up the building → healthy transition automatically. @@ -2408,51 +2431,60 @@ func (h *StackHandler) beginPromoteApproval( return row, nil } -// consumeApprovedPromote verifies that an explicit approval_id supplied -// by the caller matches an APPROVED but NOT-YET-EXECUTED row for the -// same team / from / to / kind, and atomically flips the row to -// 'executed'. Used by the manual-trigger fallback path until the -// worker-side polling lands. +// validateApprovedPromote verifies that an explicit approval_id supplied by +// the caller matches an APPROVED, NOT-YET-EXECUTED, non-expired row for the +// same team / from / to / kind. It returns the approval row on success but +// does NOT mutate it — the actual 'executed' flip is deferred to +// markApprovedPromoteExecuted, which the handler calls only AFTER the promote +// preflight (source-services, image_ref, target create/update, vault, env +// load) has succeeded. +// +// #11 (sweep 2026-06-04): the flip used to happen here, BEFORE preflight. A +// preflight failure (412 missing_image_ref / no_services, 503 lookup, 400 +// vault, 402 cap) therefore burned the single-use approval to 'executed' +// while the promote never ran — leaving the operator with a non-retryable +// approval and forcing a fresh email round-trip. Splitting validate/execute +// keeps the approval 'approved' (retryable) on any preflight failure. // // Why we check from/to/kind in addition to the id: the approval row's // payload is what the worker would replay. If a caller passes an // approval_id for env=preprod but the request is to=production, we // refuse — the row's authority covers the env pair it was issued for, // not whatever the caller is asking for now. -func (h *StackHandler) consumeApprovedPromote( +func (h *StackHandler) validateApprovedPromote( c *fiber.Ctx, team *models.Team, body promoteBody, from, to, kind string, -) error { +) (*models.PromoteApproval, error) { id, err := uuid.Parse(body.ApprovalID) if err != nil { - return respondError(c, fiber.StatusBadRequest, "invalid_approval_id", + return nil, respondError(c, fiber.StatusBadRequest, "invalid_approval_id", "approval_id must be a valid UUID") } row, err := models.GetPromoteApprovalByID(c.Context(), h.db, id) if errors.Is(err, models.ErrPromoteApprovalNotFound) { - return respondError(c, fiber.StatusNotFound, "approval_not_found", + return nil, respondError(c, fiber.StatusNotFound, "approval_not_found", "approval_id does not match any approval row") } if err != nil { slog.Error("stack.promote.approval_lookup_failed", "error", err, "approval_id", id, "request_id", middleware.GetRequestID(c)) - return respondError(c, fiber.StatusServiceUnavailable, "lookup_failed", + return nil, respondError(c, fiber.StatusServiceUnavailable, "lookup_failed", "Failed to look up approval") } if row.TeamID != team.ID { // Cross-team — same posture as stack ownership: 404 not 403. - return respondError(c, fiber.StatusNotFound, "approval_not_found", + return nil, respondError(c, fiber.StatusNotFound, "approval_not_found", "approval_id does not match any approval row for this team") } if row.Status != models.PromoteApprovalStatusApproved { - return respondError(c, fiber.StatusConflict, "approval_not_approved", + return nil, respondError(c, fiber.StatusConflict, "approval_not_approved", "approval row is in status="+row.Status+" — must be 'approved' to consume") } if row.PromoteKind != kind || row.FromEnv != from || row.ToEnv != to { - return respondError(c, fiber.StatusBadRequest, "approval_mismatch", + return nil, respondError(c, fiber.StatusBadRequest, "approval_mismatch", "approval_id's recorded (kind,from,to) does not match this request") } if row.ExpiresAt.Before(time.Now().UTC()) { @@ -2460,13 +2492,30 @@ func (h *StackHandler) consumeApprovedPromote( // has fully passed since the original request we refuse to // execute. This is belt-and-suspenders defence; the worker // repo's polling job would refuse for the same reason. - return respondError(c, fiber.StatusGone, "approval_expired", + return nil, respondError(c, fiber.StatusGone, "approval_expired", "approval window has fully expired") } - ok, err := models.MarkPromoteApprovalExecuted(c.Context(), h.db, id) + return row, nil +} + +// markApprovedPromoteExecuted atomically flips a validated approval row to +// 'executed' and audits the transition. It is called by Promote ONLY after +// the entire promote preflight has succeeded and immediately before the +// runStackDeploy launch, so a preflight failure leaves the row 'approved' +// (retryable). See validateApprovedPromote for the #11 rationale. +// +// The CAS inside MarkPromoteApprovalExecuted still guards against a concurrent +// double-consume: if a second request raced through validate + preflight and +// flipped the row first, this returns 0 rows and we 409 approval_already_executed. +func (h *StackHandler) markApprovedPromoteExecuted( + c *fiber.Ctx, + row *models.PromoteApproval, + from, to string, +) error { + ok, err := models.MarkPromoteApprovalExecuted(c.Context(), h.db, row.ID) if err != nil { slog.Error("stack.promote.approval_execute_failed", - "error", err, "approval_id", id, + "error", err, "approval_id", row.ID, "request_id", middleware.GetRequestID(c)) return respondError(c, fiber.StatusServiceUnavailable, "execute_failed", "Failed to mark approval executed") diff --git a/internal/handlers/stack_final_test.go b/internal/handlers/stack_final_test.go index 6e4fedd5..b8ff1b74 100644 --- a/internal/handlers/stack_final_test.go +++ b/internal/handlers/stack_final_test.go @@ -136,8 +136,17 @@ func TestStackFinal_ConsumeApproved_LookupError_503(t *testing.T) { } // TestStackFinal_ConsumeApproved_ExecuteError_503 — MarkPromoteApprovalExecuted -// errors after a fully-valid approved row (stack.go:2425). team(1) + stack(2) + -// approval-read(3) succeed; the UPDATE(4) errors. failAfter=3. +// errors on the deferred 'executed' flip, after a fully-valid approved row AND +// after the entire promote preflight succeeds (markApprovedPromoteExecuted, +// stack.go ~2520). +// +// #11 (sweep 2026-06-04): the flip moved from BEFORE preflight to AFTER it, so +// the fault must now land on a LATER DB call. The fresh-target preflight runs +// ~10 reads/writes (source services, family lookup, CreateStackWithCap, vault +// copy, source+target env_vars, vault resolve) between the approval read and +// the MarkPromoteApprovalExecuted UPDATE — failAfter=13 lands the injected +// failure on that UPDATE (verified: 12 → env_load_failed, 13 → execute_failed, +// 14 → success/202). func TestStackFinal_ConsumeApproved_ExecuteError_503(t *testing.T) { seedDB, clean := testhelpers.SetupTestDB(t) defer clean() @@ -148,7 +157,7 @@ func TestStackFinal_ConsumeApproved_ExecuteError_503(t *testing.T) { slug, _ := seedPromoteSourceStack(t, seedDB, teamIDStr, "staging", "stkfinal-exec") id := mustSeedApprovedPromote(t, seedDB, teamID, "staging", "production") - faultDB := openFaultDB(t, 3) + faultDB := openFaultDB(t, 13) app := stackFaultPromoteApp(t, faultDB) resp := postPromote(t, app, jwt, slug, map[string]any{ "from": "staging", "to": "production", "approval_id": id, diff --git a/internal/handlers/stack_promote_approval_preflight_test.go b/internal/handlers/stack_promote_approval_preflight_test.go new file mode 100644 index 00000000..71b5027b --- /dev/null +++ b/internal/handlers/stack_promote_approval_preflight_test.go @@ -0,0 +1,226 @@ +package handlers_test + +// stack_promote_approval_preflight_test.go — regression coverage for sweep +// finding #11 (P2): a promote approval used to be burned to 'executed' BEFORE +// the promote preflight ran, so a preflight failure (412/503/400/402) left the +// single-use approval consumed and non-retryable, forcing a fresh email +// round-trip. +// +// The fix splits validateApprovedPromote (read-only checks, runs before +// preflight) from markApprovedPromoteExecuted (the 'executed' flip, runs only +// after preflight succeeds, immediately before runStackDeploy). These tests +// pin both halves: +// +// - preflight-fails → approval stays 'approved' (retryable) +// - happy path → approval is 'executed' exactly once +// +// Scope: stack.go ONLY. Skips cleanly when TEST_DATABASE_URL is unset. + +import ( + "context" + "database/sql" + "net/http" + "net/http/httptest" + "testing" + + "github.com/gofiber/fiber/v2" + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "instant.dev/internal/config" + "instant.dev/internal/handlers" + "instant.dev/internal/models" + "instant.dev/internal/plans" + "instant.dev/internal/testhelpers" +) + +// approvalStatus reads the current status column of a promote_approvals row. +func approvalStatus(t *testing.T, db *sql.DB, id string) string { + t.Helper() + var status string + require.NoError(t, db.QueryRowContext(context.Background(), + `SELECT status FROM promote_approvals WHERE id = $1`, id).Scan(&status)) + return status +} + +// TestStackPromote_ApprovalID_PreflightFails_StaysApproved is the #11 +// regression: a source stack whose service has NO image_ref makes the promote +// preflight fail with 412 missing_image_ref. The approval was validated (it is +// approved + matches from/to/kind), but because the flip is deferred to AFTER +// preflight, a preflight failure must leave the approval 'approved' — the +// operator can retry the same approval once the source has a real image_ref. +func TestStackPromote_ApprovalID_PreflightFails_StaysApproved(t *testing.T) { + requireTestDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + ensureStackTables(t, db) + + teamIDStr := testhelpers.MustCreateTeamDB(t, db, "pro") + teamID := uuid.MustParse(teamIDStr) + jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "preflightfail@example.com") + + // Source stack with a service that has NO image_ref → preflight 412. + slug, srcID := seedPromoteSourceStackNoImageRef(t, db, teamIDStr, "staging", "preflight-fail-src") + _, err := db.ExecContext(context.Background(), ` + INSERT INTO stack_services (stack_id, name, expose, port, image_ref, status) + VALUES ($1::uuid, 'api', true, 8080, '', 'building') + `, srcID) + require.NoError(t, err) + + app := newStackTestApp(t, db) + id := mustSeedApprovedPromote(t, db, teamID, "staging", "production") + + resp := postPromote(t, app, jwt, slug, map[string]any{ + "from": "staging", + "to": "production", + "approval_id": id, + }) + defer resp.Body.Close() + + // Preflight rejects the empty image_ref before the deploy launch. + require.Equal(t, http.StatusPreconditionFailed, resp.StatusCode, + "a service with no image_ref must fail promote preflight with 412") + assert.Equal(t, "missing_image_ref", decodeErrCode(t, resp)) + + // The crux of #11: the single-use approval must NOT have been burned. + assert.Equal(t, models.PromoteApprovalStatusApproved, approvalStatus(t, db, id), + "a preflight failure must leave the approval 'approved' and retryable, not 'executed'") +} + +// TestStackPromote_ApprovalID_NoServices_StaysApproved is the sibling #11 +// regression for the OTHER early preflight exit: a source stack with zero +// service rows fails with 412 no_services. The approval must survive that +// failure 'approved' too. +func TestStackPromote_ApprovalID_NoServices_StaysApproved(t *testing.T) { + requireTestDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + ensureStackTables(t, db) + + teamIDStr := testhelpers.MustCreateTeamDB(t, db, "pro") + teamID := uuid.MustParse(teamIDStr) + jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "noservices@example.com") + + // Source stack with NO service rows at all → preflight 412 no_services. + slug, _ := seedPromoteSourceStackNoImageRef(t, db, teamIDStr, "staging", "no-services-src") + + app := newStackTestApp(t, db) + id := mustSeedApprovedPromote(t, db, teamID, "staging", "production") + + resp := postPromote(t, app, jwt, slug, map[string]any{ + "from": "staging", + "to": "production", + "approval_id": id, + }) + defer resp.Body.Close() + + require.Equal(t, http.StatusPreconditionFailed, resp.StatusCode, + "a source with no services must fail promote preflight with 412") + assert.Equal(t, "no_services", decodeErrCode(t, resp)) + + assert.Equal(t, models.PromoteApprovalStatusApproved, approvalStatus(t, db, id), + "a no_services preflight failure must leave the approval 'approved' and retryable") +} + +// TestMarkApprovedPromoteExecuted_AlreadyExecuted_409 drives the CAS-miss arm +// of markApprovedPromoteExecuted directly: an approval row that was flipped to +// 'executed' between validate and the deferred execute (a concurrent +// double-consume race in prod) makes MarkPromoteApprovalExecuted return 0 rows +// (ok=false). The handler must 409 approval_already_executed. Exercised via a +// white-box seam + pre-seeded executed row because the serial HTTP path can't +// interpose a second consumer between validate and execute. +func TestMarkApprovedPromoteExecuted_AlreadyExecuted_409(t *testing.T) { + requireTestDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + ensureStackTables(t, db) + + teamIDStr := testhelpers.MustCreateTeamDB(t, db, "pro") + teamID := uuid.MustParse(teamIDStr) + + // Seed approved, then flip to 'executed' (simulating the racing consumer + // that won between our validate and execute). + idStr := mustSeedApprovedPromote(t, db, teamID, "staging", "production") + id := uuid.MustParse(idStr) + row, err := models.GetPromoteApprovalByID(context.Background(), db, id) + require.NoError(t, err) + ok, err := models.MarkPromoteApprovalExecuted(context.Background(), db, id) + require.NoError(t, err) + require.True(t, ok, "first flip must succeed (approved -> executed)") + + cfg := &config.Config{JWTSecret: testhelpers.TestJWTSecret, AESKey: testhelpers.TestAESKeyHex, ComputeProvider: "noop"} + h := handlers.NewStackHandler(db, nil, cfg, plans.Default()) + + app := fiber.New(fiber.Config{ + ErrorHandler: func(c *fiber.Ctx, e error) error { + if e == handlers.ErrResponseWritten { + return nil + } + code := fiber.StatusInternalServerError + if fe, ok := e.(*fiber.Error); ok { + code = fe.Code + } + return c.Status(code).JSON(fiber.Map{"ok": false, "error": e.Error()}) + }, + }) + app.Get("/t", func(c *fiber.Ctx) error { + // row is the stale 'approved' snapshot we captured BEFORE the flip — + // exactly what validateApprovedPromote would hand to markApprovedPromoteExecuted. + return h.MarkApprovedPromoteExecutedForTest(c, row, "staging", "production") + }) + + req := httptest.NewRequest(http.MethodGet, "/t", nil) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusConflict, resp.StatusCode) + assert.Equal(t, "approval_already_executed", decodeErrCode(t, resp)) +} + +// TestStackPromote_ApprovalID_HappyPath_ExecutesOnce pins the other half of the +// split: a promote whose preflight fully succeeds must flip the approval to +// 'executed' exactly once. This proves markApprovedPromoteExecuted still runs +// on the success path (the fix didn't accidentally drop the flip). +func TestStackPromote_ApprovalID_HappyPath_ExecutesOnce(t *testing.T) { + requireTestDB(t) + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + ensureStackTables(t, db) + + teamIDStr := testhelpers.MustCreateTeamDB(t, db, "pro") + teamID := uuid.MustParse(teamIDStr) + jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "happyexec@example.com") + + // Source WITH an image_ref so preflight passes end-to-end. + slug, _ := seedPromoteSourceStack(t, db, teamIDStr, "staging", "happy-exec-src") + + app := newStackTestApp(t, db) + id := mustSeedApprovedPromote(t, db, teamID, "staging", "production") + + resp := postPromote(t, app, jwt, slug, map[string]any{ + "from": "staging", + "to": "production", + "approval_id": id, + }) + defer resp.Body.Close() + + require.Contains(t, []int{http.StatusOK, http.StatusAccepted}, resp.StatusCode, + "a fully-valid promote must proceed past the approval gate; got %d", resp.StatusCode) + + // The approval is now consumed — flipped to 'executed' exactly once. + assert.Equal(t, models.PromoteApprovalStatusExecuted, approvalStatus(t, db, id), + "a successful promote must flip the approval to 'executed'") + + // Retry the SAME approval — now 'executed', so the status gate rejects it + // (proving single-use semantics survive the split). + resp2 := postPromote(t, app, jwt, slug, map[string]any{ + "from": "staging", + "to": "production", + "approval_id": id, + }) + defer resp2.Body.Close() + assert.Equal(t, http.StatusConflict, resp2.StatusCode, + "re-using an executed approval must 409") + assert.Equal(t, "approval_not_approved", decodeErrCode(t, resp2)) +}