Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 23 additions & 9 deletions internal/handlers/stack.go
Original file line number Diff line number Diff line change
Expand Up @@ -1187,13 +1187,10 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error {
return respondError(c, fiber.StatusNotFound, "not_found", "Stack not found")
}

// A stack mid-teardown cannot accept an env change — the teardown
// worker will delete the row. 409 so the caller knows the request was
// valid but lost the race, not malformed.
if stack.Status == stackStatusDeleting {
return respondError(c, fiber.StatusConflict, "stack_deleting",
"This stack is being deleted and can no longer be modified.")
}
// Note: the "stack is mid-teardown" guard is NOT a pre-read here — it lives
// inside MergeStackEnvVars under the FOR UPDATE lock (returns ErrStackDeleting,
// mapped to 409 below). A pre-read would be a TOCTOU: the teardown worker can
// flip status between this handler's GetStackBySlug and the merge.

var body updateStackEnvBody
if err := c.BodyParser(&body); err != nil {
Expand Down Expand Up @@ -1228,6 +1225,12 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error {
return respondError(c, fiber.StatusRequestEntityTooLarge, "env_too_large",
"Total env_vars payload exceeds 64KiB. Trim values or split across services.")
}
if errors.Is(err, models.ErrStackDeleting) {
// Authoritative teardown check (under the FOR UPDATE lock): the stack
// is being deleted and can no longer be modified.
return respondError(c, fiber.StatusConflict, "stack_deleting",
"This stack is being deleted and can no longer be modified.")
}
var notFound *models.ErrStackNotFound
if errors.As(err, &notFound) {
// Row vanished between GetStackBySlug and the merge tx. Treat as 404.
Expand All @@ -1239,9 +1242,20 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error {
"Failed to persist env vars")
}

// keys_set counts only actual upserts (non-empty values). The old
// `len(body.Env) - deletes` over-counted when a PATCH sent an empty value
// for a key that wasn't present (a no-op delete: not counted in `deletes`,
// yet not a "set" either) — making the rule-12 audit surface lie.
keysSet := 0
for _, v := range body.Env {
if v != "" {
keysSet++
}
}

// Best-effort audit emit — never block the response on this.
auditMeta, _ := json.Marshal(map[string]any{
"keys_set": len(body.Env) - deletes,
"keys_set": keysSet,
"keys_deleted": deletes,
"total_after": len(merged),
})
Expand All @@ -1267,7 +1281,7 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error {

slog.Info("stack.env.updated",
"slug", slug, "team_id", team.ID, "stack_id", stack.ID,
"keys_set", len(body.Env)-deletes, "keys_deleted", deletes, "total_after", len(merged))
"keys_set", keysSet, "keys_deleted", deletes, "total_after", len(merged))

return c.JSON(fiber.Map{
"ok": true,
Expand Down
11 changes: 7 additions & 4 deletions internal/handlers/stack_final2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,10 @@ func TestStackFinal2_UpdateEnv_MergeSelectFailed(t *testing.T) {
_, slug := seedStack(t, seedDB, &teamID, "healthy")
jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com")

// team(1)+GetStackBySlug(2) ok, merge tx SELECT ... FOR UPDATE(3) errors.
app := stackFaultApp(t, openFaultDB(t, 2))
// team(1)+GetStackBySlug(2) ok, merge tx SET LOCAL lock_timeout(3) ok, then the
// SELECT ... FOR UPDATE(4) errors → persist_failed. (failAfter bumped 2→3 when
// the in-tx SET LOCAL lock_timeout exec was added — panel #7 hardening.)
app := stackFaultApp(t, openFaultDB(t, 3))
status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`)
assert.Equal(t, http.StatusServiceUnavailable, status)
assert.Contains(t, body, "persist_failed")
Expand All @@ -132,8 +134,9 @@ func TestStackFinal2_UpdateEnv_PersistFailed(t *testing.T) {
_, slug := seedStack(t, seedDB, &teamID, "healthy")
jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com")

// team(1)+slug(2)+merge SELECT FOR UPDATE(3) ok, merge UPDATE(4) errors.
app := stackFaultApp(t, openFaultDB(t, 3))
// team(1)+slug(2)+SET LOCAL(3)+SELECT FOR UPDATE(4) ok, merge UPDATE(5) errors.
// (failAfter bumped 3→4 for the in-tx SET LOCAL lock_timeout exec — panel #7.)
app := stackFaultApp(t, openFaultDB(t, 4))
status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`)
assert.Equal(t, http.StatusServiceUnavailable, status)
assert.Contains(t, body, "persist_failed")
Expand Down
69 changes: 47 additions & 22 deletions internal/models/coverage_stack_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,75 +337,100 @@
ctx := context.Background()
id := uuid.New()
patch := map[string]string{"A": "1"}

// 1) BeginTx error.
// selRe matches the status+env_vars FOR UPDATE select (post-hardening).
selRe := `SELECT status, COALESCE\(env_vars.*FOR UPDATE`
lockRe := `SET LOCAL lock_timeout`
okRows := func(status, env string) *sqlmock.Rows {
return sqlmock.NewRows([]string{"status", "env_vars"}).AddRow(status, []byte(env))
}

// 1) BeginTx error (before SET LOCAL).
dbB, mockB := newMock(t)
mockB.ExpectBegin().WillReturnError(errors.New("begin-boom"))
_, _, err := MergeStackEnvVars(ctx, dbB, id, patch)
require.ErrorContains(t, err, "begin-boom")

// 2) SELECT ... FOR UPDATE → no rows → ErrStackNotFound (rolls back).
// 2) SET LOCAL lock_timeout error (rolls back).
dbLT, mockLT := newMock(t)
mockLT.ExpectBegin()
mockLT.ExpectExec(lockRe).WillReturnError(errors.New("lock-boom"))
mockLT.ExpectRollback()
_, _, err = MergeStackEnvVars(ctx, dbLT, id, patch)
require.ErrorContains(t, err, "lock_timeout")

// 3) SELECT ... FOR UPDATE → no rows → ErrStackNotFound (rolls back).
dbNF, mockNF := newMock(t)
mockNF.ExpectBegin()
mockNF.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errNoRows())
mockNF.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
mockNF.ExpectQuery(selRe).WillReturnError(errNoRows())
mockNF.ExpectRollback()
var nf *ErrStackNotFound
_, _, err = MergeStackEnvVars(ctx, dbNF, id, patch)
require.ErrorAs(t, err, &nf)

// 3) SELECT error (non-NoRows).
// 4) SELECT error (non-NoRows).
dbSE, mockSE := newMock(t)
mockSE.ExpectBegin()
mockSE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errors.New("sel-boom"))
mockSE.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
mockSE.ExpectQuery(selRe).WillReturnError(errors.New("sel-boom"))
mockSE.ExpectRollback()
_, _, err = MergeStackEnvVars(ctx, dbSE, id, patch)
require.ErrorContains(t, err, "sel-boom")

// 4) unmarshal error (malformed jsonb).
// 5) status='deleting' → ErrStackDeleting (in-tx teardown guard, rolls back).
dbDel, mockDel := newMock(t)
mockDel.ExpectBegin()
mockDel.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
mockDel.ExpectQuery(selRe).WillReturnRows(okRows(StackStatusDeleting, `{}`))
mockDel.ExpectRollback()
_, _, err = MergeStackEnvVars(ctx, dbDel, id, patch)
require.ErrorIs(t, err, ErrStackDeleting)

// 6) unmarshal error (malformed jsonb).
dbUM, mockUM := newMock(t)
mockUM.ExpectBegin()
mockUM.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`not json`)))
mockUM.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
mockUM.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `not json`))
mockUM.ExpectRollback()
_, _, err = MergeStackEnvVars(ctx, dbUM, id, patch)
require.ErrorContains(t, err, "unmarshal")

// 5) over-cap → ErrStackEnvVarsTooLarge, checked BEFORE the UPDATE (rolls back).
// 7) over-cap → ErrStackEnvVarsTooLarge, checked BEFORE the UPDATE (rolls back).
dbTL, mockTL := newMock(t)
mockTL.ExpectBegin()
mockTL.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`)))
mockTL.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
mockTL.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{}`))
mockTL.ExpectRollback()
big := map[string]string{"K": strings.Repeat("x", maxStackEnvVarsBytes+1)}
_, _, err = MergeStackEnvVars(ctx, dbTL, id, big)
require.ErrorIs(t, err, ErrStackEnvVarsTooLarge)

// 6) UPDATE error.
// 8) UPDATE error.
dbUE, mockUE := newMock(t)

Check warning on line 409 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".

Check warning on line 409 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
mockUE.ExpectBegin()

Check warning on line 410 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
mockUE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`)))
mockUE.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))

Check warning on line 411 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
mockUE.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{}`))

Check warning on line 412 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
mockUE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnError(errors.New("upd-boom"))

Check warning on line 413 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
mockUE.ExpectRollback()

Check warning on line 414 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
_, _, err = MergeStackEnvVars(ctx, dbUE, id, patch)

Check warning on line 415 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
require.ErrorContains(t, err, "upd-boom")

// 7) Commit error.
// 9) Commit error.
dbCE, mockCE := newMock(t)
mockCE.ExpectBegin()
mockCE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`)))
mockCE.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
mockCE.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{}`))
mockCE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1))
mockCE.ExpectCommit().WillReturnError(errors.New("commit-boom"))
_, _, err = MergeStackEnvVars(ctx, dbCE, id, patch)
require.ErrorContains(t, err, "commit-boom")

// 8) Happy path: existing {A:old, B:keep}; patch upserts A, adds C, deletes B
// (present → counted) and deletes MISSING (absent → NOT counted).
// 10) Happy path: existing {A:old, B:keep}; patch upserts A, adds C, deletes B
// (present → counted) and deletes MISSING (absent → NOT counted).
dbOK, mockOK := newMock(t)
mockOK.ExpectBegin()
mockOK.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{"A":"old","B":"keep"}`)))
mockOK.ExpectExec(lockRe).WillReturnResult(sqlmock.NewResult(0, 0))
mockOK.ExpectQuery(selRe).WillReturnRows(okRows("healthy", `{"A":"old","B":"keep"}`))
mockOK.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1))
mockOK.ExpectCommit()
merged, deletes, err := MergeStackEnvVars(ctx, dbOK, id,
Expand Down
34 changes: 31 additions & 3 deletions internal/models/stack.go
Original file line number Diff line number Diff line change
Expand Up @@ -636,6 +636,19 @@ func GetStackEnvVars(ctx context.Context, db *sql.DB, stackID uuid.UUID) (map[st
// pairs at ~64 bytes each).
var ErrStackEnvVarsTooLarge = errors.New("stack env_vars payload exceeds 64KiB")

// StackStatusDeleting is the status of a stack mid-teardown. Lifecycle
// mutations (env merge) must be rejected for it. Single source of truth shared
// by the model guard and the handler.
const StackStatusDeleting = "deleting"

// ErrStackDeleting is returned by MergeStackEnvVars when the row is mid-teardown
// (status='deleting') at the moment the FOR UPDATE lock is taken. Checking this
// INSIDE the locked transaction — rather than via a pre-read in the handler —
// closes the TOCTOU where the teardown worker flips the row between the
// handler's GetStackBySlug and the merge, which would otherwise commit an env
// change onto a stack that's being deleted.
var ErrStackDeleting = errors.New("stack is being deleted")

// maxStackEnvVarsBytes is the serialized-JSON byte cap on env_vars.
// 64*1024 = 65536. See ErrStackEnvVarsTooLarge for rationale.
const maxStackEnvVarsBytes = 64 * 1024
Expand Down Expand Up @@ -699,19 +712,34 @@ func MergeStackEnvVars(ctx context.Context, db *sql.DB, stackID uuid.UUID, patch
return nil, 0, fmt.Errorf("models.MergeStackEnvVars: begin: %w", err)
}
// No-op after a successful Commit; rolls back on any early return (over-cap,
// marshal error, scan error) so the row is never left half-updated.
// marshal error, scan error, deleting) so the row is never left half-updated.
defer func() { _ = tx.Rollback() }()

// Bound the time we'll wait for the row lock. Without this a PATCH that
// races a long-held lock (another in-flight merge, or a teardown holding the
// row) would block the request goroutine indefinitely; 3s fails fast to a
// 503 the caller can retry. SET LOCAL is scoped to this transaction.
if _, err := tx.ExecContext(ctx, `SET LOCAL lock_timeout = '3s'`); err != nil {
return nil, 0, fmt.Errorf("models.MergeStackEnvVars: set lock_timeout: %w", err)
}

// Fetch status under the same FOR UPDATE lock so the teardown check is
// race-free: a stack flipped to 'deleting' between the handler's
// GetStackBySlug and here is caught authoritatively, not via a stale pre-read.
var raw []byte
var status string
err = tx.QueryRowContext(ctx, `
SELECT COALESCE(env_vars, '{}'::jsonb) FROM stacks WHERE id = $1 FOR UPDATE
`, stackID).Scan(&raw)
SELECT status, COALESCE(env_vars, '{}'::jsonb) FROM stacks WHERE id = $1 FOR UPDATE
`, stackID).Scan(&status, &raw)
if err == sql.ErrNoRows {
return nil, 0, &ErrStackNotFound{Slug: stackID.String()}
}
if err != nil {
return nil, 0, fmt.Errorf("models.MergeStackEnvVars: select: %w", err)
}
if status == StackStatusDeleting {
return nil, 0, ErrStackDeleting
}

merged := map[string]string{}
if len(raw) > 0 {
Expand Down
Loading