diff --git a/internal/handlers/stack.go b/internal/handlers/stack.go index 5aff5be..c4d1b1a 100644 --- a/internal/handlers/stack.go +++ b/internal/handlers/stack.go @@ -1187,13 +1187,10 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error { return respondError(c, fiber.StatusNotFound, "not_found", "Stack not found") } - // A stack mid-teardown cannot accept an env change — the teardown - // worker will delete the row. 409 so the caller knows the request was - // valid but lost the race, not malformed. - if stack.Status == stackStatusDeleting { - return respondError(c, fiber.StatusConflict, "stack_deleting", - "This stack is being deleted and can no longer be modified.") - } + // Note: the "stack is mid-teardown" guard is NOT a pre-read here — it lives + // inside MergeStackEnvVars under the FOR UPDATE lock (returns ErrStackDeleting, + // mapped to 409 below). A pre-read would be a TOCTOU: the teardown worker can + // flip status between this handler's GetStackBySlug and the merge. var body updateStackEnvBody if err := c.BodyParser(&body); err != nil { @@ -1228,6 +1225,12 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error { return respondError(c, fiber.StatusRequestEntityTooLarge, "env_too_large", "Total env_vars payload exceeds 64KiB. Trim values or split across services.") } + if errors.Is(err, models.ErrStackDeleting) { + // Authoritative teardown check (under the FOR UPDATE lock): the stack + // is being deleted and can no longer be modified. + return respondError(c, fiber.StatusConflict, "stack_deleting", + "This stack is being deleted and can no longer be modified.") + } var notFound *models.ErrStackNotFound if errors.As(err, ¬Found) { // Row vanished between GetStackBySlug and the merge tx. Treat as 404. @@ -1239,9 +1242,20 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error { "Failed to persist env vars") } + // keys_set counts only actual upserts (non-empty values). The old + // `len(body.Env) - deletes` over-counted when a PATCH sent an empty value + // for a key that wasn't present (a no-op delete: not counted in `deletes`, + // yet not a "set" either) — making the rule-12 audit surface lie. + keysSet := 0 + for _, v := range body.Env { + if v != "" { + keysSet++ + } + } + // Best-effort audit emit — never block the response on this. auditMeta, _ := json.Marshal(map[string]any{ - "keys_set": len(body.Env) - deletes, + "keys_set": keysSet, "keys_deleted": deletes, "total_after": len(merged), }) @@ -1267,7 +1281,7 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error { slog.Info("stack.env.updated", "slug", slug, "team_id", team.ID, "stack_id", stack.ID, - "keys_set", len(body.Env)-deletes, "keys_deleted", deletes, "total_after", len(merged)) + "keys_set", keysSet, "keys_deleted", deletes, "total_after", len(merged)) return c.JSON(fiber.Map{ "ok": true, diff --git a/internal/handlers/stack_final2_test.go b/internal/handlers/stack_final2_test.go index e6d9d16..f483ba0 100644 --- a/internal/handlers/stack_final2_test.go +++ b/internal/handlers/stack_final2_test.go @@ -113,8 +113,10 @@ func TestStackFinal2_UpdateEnv_MergeSelectFailed(t *testing.T) { _, slug := seedStack(t, seedDB, &teamID, "healthy") jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com") - // team(1)+GetStackBySlug(2) ok, merge tx SELECT ... FOR UPDATE(3) errors. - app := stackFaultApp(t, openFaultDB(t, 2)) + // team(1)+GetStackBySlug(2) ok, merge tx SET LOCAL lock_timeout(3) ok, then the + // SELECT ... FOR UPDATE(4) errors → persist_failed. (failAfter bumped 2→3 when + // the in-tx SET LOCAL lock_timeout exec was added — panel #7 hardening.) + app := stackFaultApp(t, openFaultDB(t, 3)) status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`) assert.Equal(t, http.StatusServiceUnavailable, status) assert.Contains(t, body, "persist_failed") @@ -132,8 +134,9 @@ func TestStackFinal2_UpdateEnv_PersistFailed(t *testing.T) { _, slug := seedStack(t, seedDB, &teamID, "healthy") jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com") - // team(1)+slug(2)+merge SELECT FOR UPDATE(3) ok, merge UPDATE(4) errors. - app := stackFaultApp(t, openFaultDB(t, 3)) + // team(1)+slug(2)+SET LOCAL(3)+SELECT FOR UPDATE(4) ok, merge UPDATE(5) errors. + // (failAfter bumped 3→4 for the in-tx SET LOCAL lock_timeout exec — panel #7.) + app := stackFaultApp(t, openFaultDB(t, 4)) status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`) assert.Equal(t, http.StatusServiceUnavailable, status) assert.Contains(t, body, "persist_failed") diff --git a/internal/models/coverage_stack_test.go b/internal/models/coverage_stack_test.go index 9eb5647..7be41d4 100644 --- a/internal/models/coverage_stack_test.go +++ b/internal/models/coverage_stack_test.go @@ -337,75 +337,100 @@ func TestMergeStackEnvVars_Branches(t *testing.T) { ctx := context.Background() id := uuid.New() patch := map[string]string{"A": "1"} - - // 1) BeginTx error. + // selRe matches the status+env_vars FOR UPDATE select (post-hardening). + selRe := `SELECT status, COALESCE\(env_vars.*FOR UPDATE` + lockRe := `SET LOCAL lock_timeout` + okRows := func(status, env string) *sqlmock.Rows { + return sqlmock.NewRows([]string{"status", "env_vars"}).AddRow(status, []byte(env)) + } + + // 1) BeginTx error (before SET LOCAL). dbB, mockB := newMock(t) mockB.ExpectBegin().WillReturnError(errors.New("begin-boom")) _, _, err := MergeStackEnvVars(ctx, dbB, id, patch) require.ErrorContains(t, err, "begin-boom") - // 2) SELECT ... FOR UPDATE → no rows → ErrStackNotFound (rolls back). + // 2) SET LOCAL lock_timeout error (rolls back). + dbLT, mockLT := newMock(t) + mockLT.ExpectBegin() + mockLT.ExpectExec(lockRe).WillReturnError(errors.New("lock-boom")) + mockLT.ExpectRollback() + _, _, err = MergeStackEnvVars(ctx, dbLT, id, patch) + require.ErrorContains(t, err, "lock_timeout") + + // 3) SELECT ... FOR UPDATE → no rows → ErrStackNotFound (rolls back). dbNF, mockNF := newMock(t) mockNF.ExpectBegin() - mockNF.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errNoRows()) + mockNF.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0)) + mockNF.ExpectQuery(selRe).WillReturnError(errNoRows()) mockNF.ExpectRollback() var nf *ErrStackNotFound _, _, err = MergeStackEnvVars(ctx, dbNF, id, patch) require.ErrorAs(t, err, &nf) - // 3) SELECT error (non-NoRows). + // 4) SELECT error (non-NoRows). dbSE, mockSE := newMock(t) mockSE.ExpectBegin() - mockSE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errors.New("sel-boom")) + mockSE.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0)) + mockSE.ExpectQuery(selRe).WillReturnError(errors.New("sel-boom")) mockSE.ExpectRollback() _, _, err = MergeStackEnvVars(ctx, dbSE, id, patch) require.ErrorContains(t, err, "sel-boom") - // 4) unmarshal error (malformed jsonb). + // 5) status='deleting' → ErrStackDeleting (in-tx teardown guard, rolls back). + dbDel, mockDel := newMock(t) + mockDel.ExpectBegin() + mockDel.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0)) + mockDel.ExpectQuery(selRe).WillReturnRows(okRows(StackStatusDeleting, `{}`)) + mockDel.ExpectRollback() + _, _, err = MergeStackEnvVars(ctx, dbDel, id, patch) + require.ErrorIs(t, err, ErrStackDeleting) + + // 6) unmarshal error (malformed jsonb). dbUM, mockUM := newMock(t) mockUM.ExpectBegin() - mockUM.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). - WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`not json`))) + mockUM.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0)) + mockUM.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `not json`)) mockUM.ExpectRollback() _, _, err = MergeStackEnvVars(ctx, dbUM, id, patch) require.ErrorContains(t, err, "unmarshal") - // 5) over-cap → ErrStackEnvVarsTooLarge, checked BEFORE the UPDATE (rolls back). + // 7) over-cap → ErrStackEnvVarsTooLarge, checked BEFORE the UPDATE (rolls back). dbTL, mockTL := newMock(t) mockTL.ExpectBegin() - mockTL.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). - WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`))) + mockTL.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0)) + mockTL.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{}`)) mockTL.ExpectRollback() big := map[string]string{"K": strings.Repeat("x", maxStackEnvVarsBytes+1)} _, _, err = MergeStackEnvVars(ctx, dbTL, id, big) require.ErrorIs(t, err, ErrStackEnvVarsTooLarge) - // 6) UPDATE error. + // 8) UPDATE error. dbUE, mockUE := newMock(t) mockUE.ExpectBegin() - mockUE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). - WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`))) + mockUE.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0)) + mockUE.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{}`)) mockUE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnError(errors.New("upd-boom")) mockUE.ExpectRollback() _, _, err = MergeStackEnvVars(ctx, dbUE, id, patch) require.ErrorContains(t, err, "upd-boom") - // 7) Commit error. + // 9) Commit error. dbCE, mockCE := newMock(t) mockCE.ExpectBegin() - mockCE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). - WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`))) + mockCE.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0)) + mockCE.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{}`)) mockCE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1)) mockCE.ExpectCommit().WillReturnError(errors.New("commit-boom")) _, _, err = MergeStackEnvVars(ctx, dbCE, id, patch) require.ErrorContains(t, err, "commit-boom") - // 8) Happy path: existing {A:old, B:keep}; patch upserts A, adds C, deletes B - // (present → counted) and deletes MISSING (absent → NOT counted). + // 10) Happy path: existing {A:old, B:keep}; patch upserts A, adds C, deletes B + // (present → counted) and deletes MISSING (absent → NOT counted). dbOK, mockOK := newMock(t) mockOK.ExpectBegin() - mockOK.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). - WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{"A":"old","B":"keep"}`))) + mockOK.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0)) + mockOK.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{"A":"old","B":"keep"}`)) mockOK.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1)) mockOK.ExpectCommit() merged, deletes, err := MergeStackEnvVars(ctx, dbOK, id, diff --git a/internal/models/stack.go b/internal/models/stack.go index 124636f..d760722 100644 --- a/internal/models/stack.go +++ b/internal/models/stack.go @@ -636,6 +636,19 @@ func GetStackEnvVars(ctx context.Context, db *sql.DB, stackID uuid.UUID) (map[st // pairs at ~64 bytes each). var ErrStackEnvVarsTooLarge = errors.New("stack env_vars payload exceeds 64KiB") +// StackStatusDeleting is the status of a stack mid-teardown. Lifecycle +// mutations (env merge) must be rejected for it. Single source of truth shared +// by the model guard and the handler. +const StackStatusDeleting = "deleting" + +// ErrStackDeleting is returned by MergeStackEnvVars when the row is mid-teardown +// (status='deleting') at the moment the FOR UPDATE lock is taken. Checking this +// INSIDE the locked transaction — rather than via a pre-read in the handler — +// closes the TOCTOU where the teardown worker flips the row between the +// handler's GetStackBySlug and the merge, which would otherwise commit an env +// change onto a stack that's being deleted. +var ErrStackDeleting = errors.New("stack is being deleted") + // maxStackEnvVarsBytes is the serialized-JSON byte cap on env_vars. // 64*1024 = 65536. See ErrStackEnvVarsTooLarge for rationale. const maxStackEnvVarsBytes = 64 * 1024 @@ -699,19 +712,34 @@ func MergeStackEnvVars(ctx context.Context, db *sql.DB, stackID uuid.UUID, patch return nil, 0, fmt.Errorf("models.MergeStackEnvVars: begin: %w", err) } // No-op after a successful Commit; rolls back on any early return (over-cap, - // marshal error, scan error) so the row is never left half-updated. + // marshal error, scan error, deleting) so the row is never left half-updated. defer func() { _ = tx.Rollback() }() + // Bound the time we'll wait for the row lock. Without this a PATCH that + // races a long-held lock (another in-flight merge, or a teardown holding the + // row) would block the request goroutine indefinitely; 3s fails fast to a + // 503 the caller can retry. SET LOCAL is scoped to this transaction. + if _, err := tx.ExecContext(ctx, `SET LOCAL lock_timeout = '3s'`); err != nil { + return nil, 0, fmt.Errorf("models.MergeStackEnvVars: set lock_timeout: %w", err) + } + + // Fetch status under the same FOR UPDATE lock so the teardown check is + // race-free: a stack flipped to 'deleting' between the handler's + // GetStackBySlug and here is caught authoritatively, not via a stale pre-read. var raw []byte + var status string err = tx.QueryRowContext(ctx, ` - SELECT COALESCE(env_vars, '{}'::jsonb) FROM stacks WHERE id = $1 FOR UPDATE - `, stackID).Scan(&raw) + SELECT status, COALESCE(env_vars, '{}'::jsonb) FROM stacks WHERE id = $1 FOR UPDATE + `, stackID).Scan(&status, &raw) if err == sql.ErrNoRows { return nil, 0, &ErrStackNotFound{Slug: stackID.String()} } if err != nil { return nil, 0, fmt.Errorf("models.MergeStackEnvVars: select: %w", err) } + if status == StackStatusDeleting { + return nil, 0, ErrStackDeleting + } merged := map[string]string{} if len(raw) > 0 {