From 240e14576f5c60b3df193bda27fb19bd5afc0515 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Thu, 4 Jun 2026 01:16:02 +0530 Subject: [PATCH] =?UTF-8?q?fix(stack):=20atomic=20env-var=20merge=20?= =?UTF-8?q?=E2=80=94=20no=20lost=20updates=20under=20concurrency=20(bug-ba?= =?UTF-8?q?sh=20#10)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PATCH /stacks/:slug/env did a non-atomic read-modify-write (GetStackEnvVars → merge-in-Go → UpdateStackEnvVars). Two concurrent PATCHes with disjoint keys both read the same snapshot and the second blind-overwrote the first, silently dropping a key. Add models.MergeStackEnvVars: one transaction that takes a SELECT ... FOR UPDATE row lock, merges the PATCH delta inside the tx, checks the 64KiB cap before the write, and commits. Concurrent PATCHes now serialize — the second reads the first's committed result. The handler calls it once; error mapping is preserved (TooLarge→413, NotFound→404, else 503). GetStackEnvVars/UpdateStackEnvVars are kept (other callers). Tests: - models.TestMergeStackEnvVars_ConcurrentPatchesNoLostUpdate: 16 concurrent disjoint-key merges against a real seeded row; asserts all 16 keys survive (the lost-update regression; passes under -race). - models.TestMergeStackEnvVars_Branches: sqlmock coverage of every arm (begin/select/unmarshal/too-large-rollback/update/commit/happy + present-only delete counting). 100% func coverage. - The two stack-env fault tests are repurposed: the atomic merge collapses the old fetch_failed/persist_failed split into ONE persist_failed surface (the merge's tx-internal SELECT ... FOR UPDATE and UPDATE still fault through the shared faultConn, since lib/pq's driver.Tx has no QueryerContext and falls back to the conn). A new forUpdateVanish driver covers the handler's merge-NotFound→404 arm. Co-Authored-By: Claude Opus 4.8 (1M context) --- ...oy_stack_promote_approval_coverage_test.go | 2 +- internal/handlers/faultdb_deployasync_test.go | 10 +- internal/handlers/stack.go | 60 +++----- internal/handlers/stack_final2_test.go | 139 +++++++++++++++++- internal/models/coverage_stack_test.go | 87 +++++++++++ internal/models/stack.go | 94 +++++++++++- internal/models/stack_test.go | 61 ++++++++ 7 files changed, 399 insertions(+), 54 deletions(-) diff --git a/internal/handlers/deploy_stack_promote_approval_coverage_test.go b/internal/handlers/deploy_stack_promote_approval_coverage_test.go index c65ae90d..f517b35c 100644 --- a/internal/handlers/deploy_stack_promote_approval_coverage_test.go +++ b/internal/handlers/deploy_stack_promote_approval_coverage_test.go @@ -628,7 +628,7 @@ func TestStackUpdateEnv_TooLarge_Returns413(t *testing.T) { VALUES ($1, $2, $3, 'healthy', 'pro', 'production') RETURNING id `, teamID, slug, "instant-stack-"+slug).Scan(&stackID)) - // A single value > 64KiB blows the cap inside UpdateStackEnvVars. + // A single value > 64KiB blows the cap inside MergeStackEnvVars. big := make([]byte, 70*1024) for i := range big { big[i] = 'A' diff --git a/internal/handlers/faultdb_deployasync_test.go b/internal/handlers/faultdb_deployasync_test.go index e005a258..ecd35d48 100644 --- a/internal/handlers/faultdb_deployasync_test.go +++ b/internal/handlers/faultdb_deployasync_test.go @@ -31,7 +31,7 @@ var errFaultInjected = errors.New("faultdb: injected failure") // successful Query/Exec calls allowed before injection begins; -1 disables // injection (pass-through). type faultConfig struct { - calls atomic.Int64 + calls atomic.Int64 failAfter int64 } @@ -87,6 +87,14 @@ func (c *faultConn) ExecContext(ctx context.Context, query string, args []driver return nil, driver.ErrSkip } +// BeginTx returns the inner (unwrapped) driver.Tx and does NOT itself consume +// the failAfter budget. Note this does NOT exempt in-transaction queries from +// fault injection: lib/pq's driver.Tx implements neither driver.QueryerContext +// nor driver.ExecerContext, so database/sql routes sql.Tx.QueryContext / +// ExecContext back through this conn's QueryContext / ExecContext — which DO +// honor shouldFail(). That's why models.MergeStackEnvVars' tx-internal +// SELECT ... FOR UPDATE and UPDATE are still fault-injectable (see +// stack_final2_test.go). func (c *faultConn) BeginTx(ctx context.Context, opts driver.TxOptions) (driver.Tx, error) { if bt, ok := c.inner.(driver.ConnBeginTx); ok { return bt.BeginTx(ctx, opts) diff --git a/internal/handlers/stack.go b/internal/handlers/stack.go index 297cbfd0..5aff5be1 100644 --- a/internal/handlers/stack.go +++ b/internal/handlers/stack.go @@ -1150,18 +1150,19 @@ type updateStackEnvBody struct { // NEVER persisted — the silent-data-loss failure mode. Now backed by // migration 062's stacks.env_vars JSONB column. The handler: // -// 1. Loads existing env_vars from the row. -// 2. Merges the incoming body's `env` map into the existing set (PATCH -// semantics — each call is incremental, not replace-all). Setting a -// key to the empty string deletes it (matches the dashboard contract -// and the env-var convention for "absent" elsewhere on the platform). -// 3. Validates every key against isValidEnvKey (POSIX [A-Z_][A-Z0-9_]*), +// 1. Validates every key against isValidEnvKey (POSIX [A-Z_][A-Z0-9_]*), // mirroring deploy.go and /stacks/new so PATCH cannot smuggle in a // key shape the create/redeploy paths would reject async. -// 4. Persists via UpdateStackEnvVars. -// 5. Emits a best-effort audit_log row (kind=stack.env.updated) for the +// 2. Atomically merges the incoming body's `env` map into the existing set +// (PATCH semantics — each call is incremental, not replace-all) via +// models.MergeStackEnvVars, a single row-locked transaction. Setting a +// key to the empty string deletes it (matches the dashboard contract +// and the env-var convention for "absent" elsewhere on the platform). +// The row lock serializes concurrent PATCHes so no key is lost to a +// read-modify-write race (bug-bash #10). +// 3. Emits a best-effort audit_log row (kind=stack.env.updated) for the // dashboard activity feed and the support panel. -// 6. Returns the FULL merged env in the response so the caller doesn't +// 4. Returns the FULL merged env in the response so the caller doesn't // have to re-GET to see the new state. // // Auth required — anonymous stacks cannot be mutated after creation. @@ -1213,44 +1214,23 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error { "Env-var key "+quoteForError(badKey)+" must match POSIX shape [A-Z_][A-Z0-9_]*") } - // Load existing env, merge, save. Empty-string value deletes the key — - // matches the dashboard's PATCH-with-delete affordance. - existing, err := models.GetStackEnvVars(c.Context(), h.db, stack.ID) + // Load-merge-save ATOMICALLY in one row-locked transaction. Empty-string + // value deletes the key — matches the dashboard's PATCH-with-delete + // affordance. The single MergeStackEnvVars call replaces the previous + // GetStackEnvVars → merge-in-Go → UpdateStackEnvVars sequence, which had a + // lost-update race: two concurrent PATCHes both read the same snapshot and + // the second blind-overwrote the first, silently dropping a key (bug-bash + // #10). MergeStackEnvVars serializes concurrent PATCHes via SELECT ... FOR + // UPDATE, so the second reads the first's committed result. + merged, deletes, err := models.MergeStackEnvVars(c.Context(), h.db, stack.ID, body.Env) if err != nil { - var notFound *models.ErrStackNotFound - if errors.As(err, ¬Found) { - // Row vanished between GetStackBySlug and here. Treat as 404. - return respondError(c, fiber.StatusNotFound, "not_found", "Stack not found") - } - slog.Error("stack.env.fetch_failed", - "slug", slug, "team_id", team.ID, "stack_id", stack.ID, "error", err) - return respondError(c, fiber.StatusServiceUnavailable, "fetch_failed", - "Failed to fetch existing env vars") - } - if existing == nil { - existing = map[string]string{} - } - merged := make(map[string]string, len(existing)+len(body.Env)) - for k, v := range existing { - merged[k] = v - } - deletes := 0 - for k, v := range body.Env { - if v == "" { - delete(merged, k) - deletes++ - continue - } - merged[k] = v - } - - if err := models.UpdateStackEnvVars(c.Context(), h.db, stack.ID, merged); err != nil { if errors.Is(err, models.ErrStackEnvVarsTooLarge) { return respondError(c, fiber.StatusRequestEntityTooLarge, "env_too_large", "Total env_vars payload exceeds 64KiB. Trim values or split across services.") } var notFound *models.ErrStackNotFound if errors.As(err, ¬Found) { + // Row vanished between GetStackBySlug and the merge tx. Treat as 404. return respondError(c, fiber.StatusNotFound, "not_found", "Stack not found") } slog.Error("stack.env.persist_failed", diff --git a/internal/handlers/stack_final2_test.go b/internal/handlers/stack_final2_test.go index 13a8709d..e6d9d16c 100644 --- a/internal/handlers/stack_final2_test.go +++ b/internal/handlers/stack_final2_test.go @@ -7,21 +7,38 @@ package handlers_test // and a LATER one errors, so we seed a team-owned stack on the pooled DB and // run the handler over a fault DB sharing the same postgres DSN. // -// * UpdateEnv GetStackEnvVars error → fetch_failed (stack.go L1185-1188, failAfter=2) -// * UpdateEnv UpdateStackEnvVars error → persist_failed (L1216-1219, failAfter=3) -// * Family GetStackBySlug error → fetch_failed (L1885, failAfter=1) +// * UpdateEnv MergeStackEnvVars error → persist_failed (bug-bash #10) +// * Family GetStackBySlug error → fetch_failed (failAfter=1) +// +// bug-bash #10 (2026-06-04): UpdateEnv's old GetStackEnvVars → merge-in-Go → +// UpdateStackEnvVars sequence was a non-atomic read-modify-write that lost keys +// under concurrency. It was replaced by a single atomic models.MergeStackEnvVars +// call (one row-locked transaction). That collapsed the two distinct DB-error +// arms (fetch_failed for the read, persist_failed for the write) into ONE +// surface: any error out of MergeStackEnvVars — including the tx's internal +// SELECT ... FOR UPDATE and the UPDATE — maps to persist_failed 503. (Note the +// merge's tx-internal queries DO fault through the shared faultConn: lib/pq's +// driver.Tx has no QueryerContext, so sql.Tx.QueryContext/ExecContext fall back +// to the conn-level faultConn.QueryContext/ExecContext, which honor the +// failAfter budget.) Both former tests now assert the single persist_failed +// surface at the two query depths (the in-tx SELECT and the in-tx UPDATE). import ( + "context" "database/sql" + "database/sql/driver" "errors" + "io" "net/http" "net/http/httptest" "os" "strings" + "sync" "testing" "github.com/gofiber/fiber/v2" "github.com/google/uuid" + "github.com/lib/pq" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -82,7 +99,12 @@ func patchStackEnvF2(t *testing.T, app *fiber.App, slug, jwt, body string) (int, return resp.StatusCode, string(raw[:n]) } -func TestStackFinal2_UpdateEnv_FetchEnvFailed(t *testing.T) { +// TestStackFinal2_UpdateEnv_MergeSelectFailed faults the merge tx's internal +// SELECT ... FOR UPDATE. team(1)+GetStackBySlug(2) ok, then MergeStackEnvVars +// begins a tx and its SELECT (3rd conn-level query) errors → persist_failed. +// (Pre-bug-bash-#10 this depth was the GetStackEnvVars "fetch_failed" arm; the +// atomic merge collapsed it into the single persist_failed surface.) +func TestStackFinal2_UpdateEnv_MergeSelectFailed(t *testing.T) { stackNeedDB(t) seedDB, clean := testhelpers.SetupTestDB(t) defer clean() @@ -91,13 +113,16 @@ func TestStackFinal2_UpdateEnv_FetchEnvFailed(t *testing.T) { _, slug := seedStack(t, seedDB, &teamID, "healthy") jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com") - // team(1)+GetStackBySlug(2) ok, GetStackEnvVars(3) errors. + // team(1)+GetStackBySlug(2) ok, merge tx SELECT ... FOR UPDATE(3) errors. app := stackFaultApp(t, openFaultDB(t, 2)) status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`) assert.Equal(t, http.StatusServiceUnavailable, status) - assert.Contains(t, body, "fetch_failed") + assert.Contains(t, body, "persist_failed") } +// TestStackFinal2_UpdateEnv_PersistFailed faults the merge tx's internal UPDATE. +// team(1)+slug(2)+merge SELECT(3) ok, the merge tx UPDATE(4) errors → +// persist_failed. func TestStackFinal2_UpdateEnv_PersistFailed(t *testing.T) { stackNeedDB(t) seedDB, clean := testhelpers.SetupTestDB(t) @@ -107,7 +132,7 @@ func TestStackFinal2_UpdateEnv_PersistFailed(t *testing.T) { _, slug := seedStack(t, seedDB, &teamID, "healthy") jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com") - // team(1)+slug(2)+GetStackEnvVars(3) ok, UpdateStackEnvVars(4) errors. + // team(1)+slug(2)+merge SELECT FOR UPDATE(3) ok, merge UPDATE(4) errors. app := stackFaultApp(t, openFaultDB(t, 3)) status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`) assert.Equal(t, http.StatusServiceUnavailable, status) @@ -131,3 +156,103 @@ func TestStackFinal2_Family_FetchFailed(t *testing.T) { defer resp.Body.Close() assert.Equal(t, http.StatusServiceUnavailable, resp.StatusCode) } + +// ── merge-NotFound (TOCTOU) arm coverage ────────────────────────────────────── +// +// The UpdateEnv handler maps a models.ErrStackNotFound out of MergeStackEnvVars +// to a 404 — the row vanished between GetStackBySlug and the merge tx's +// SELECT ... FOR UPDATE (a genuine TOCTOU that can't be timed deterministically +// against a live row). To exercise that handler arm deterministically we proxy +// the real pq driver and return ZERO rows ONLY for the merge's +// `SELECT ... FOR UPDATE` query: GetStackBySlug (no FOR UPDATE) still finds the +// row, so the handler reaches the merge, whose SELECT then sees no rows → +// ErrStackNotFound → 404 not_found. + +type forUpdateVanishConn struct{ inner driver.Conn } + +func (c *forUpdateVanishConn) Prepare(q string) (driver.Stmt, error) { return c.inner.Prepare(q) } +func (c *forUpdateVanishConn) Close() error { return c.inner.Close() } +func (c *forUpdateVanishConn) Begin() (driver.Tx, error) { return c.inner.Begin() } //nolint:staticcheck + +func (c *forUpdateVanishConn) BeginTx(ctx context.Context, opts driver.TxOptions) (driver.Tx, error) { + if bt, ok := c.inner.(driver.ConnBeginTx); ok { + return bt.BeginTx(ctx, opts) + } + return c.inner.Begin() //nolint:staticcheck +} + +func (c *forUpdateVanishConn) QueryContext(ctx context.Context, query string, args []driver.NamedValue) (driver.Rows, error) { + if strings.Contains(query, "FOR UPDATE") { + // Simulate the row vanishing: return an empty result set so + // MergeStackEnvVars' Scan yields sql.ErrNoRows → ErrStackNotFound. + return &emptyRows{}, nil + } + if qc, ok := c.inner.(driver.QueryerContext); ok { + return qc.QueryContext(ctx, query, args) + } + return nil, driver.ErrSkip +} + +func (c *forUpdateVanishConn) ExecContext(ctx context.Context, query string, args []driver.NamedValue) (driver.Result, error) { + if ec, ok := c.inner.(driver.ExecerContext); ok { + return ec.ExecContext(ctx, query, args) + } + return nil, driver.ErrSkip +} + +// emptyRows is a zero-row driver.Rows for the single env_vars column. +type emptyRows struct{} + +func (*emptyRows) Columns() []string { return []string{"coalesce"} } +func (*emptyRows) Close() error { return nil } +func (*emptyRows) Next(_ []driver.Value) error { return io.EOF } + +type forUpdateVanishDriver struct{ dsn string } + +func (d *forUpdateVanishDriver) Open(_ string) (driver.Conn, error) { + inner, err := pq.Open(d.dsn) + if err != nil { + return nil, err + } + return &forUpdateVanishConn{inner: inner}, nil +} + +var fuvRegMu sync.Mutex +var fuvRegN int + +func openForUpdateVanishDB(t *testing.T) *sql.DB { + t.Helper() + dsn := os.Getenv("TEST_DATABASE_URL") + if dsn == "" { + t.Skip("TEST_DATABASE_URL not set") + } + fuvRegMu.Lock() + fuvRegN++ + name := "fuvpq_" + itoaFault(fuvRegN) + sql.Register(name, &forUpdateVanishDriver{dsn: dsn}) + fuvRegMu.Unlock() + db, err := sql.Open(name, dsn) + require.NoError(t, err) + db.SetMaxOpenConns(1) + db.SetMaxIdleConns(1) + t.Cleanup(func() { _ = db.Close() }) + return db +} + +// TestStackFinal2_UpdateEnv_MergeRowVanished_404 covers the handler's +// ErrStackNotFound→404 mapping on the atomic merge: GetStackBySlug finds the +// row, but the merge tx's SELECT ... FOR UPDATE sees none (simulated vanish). +func TestStackFinal2_UpdateEnv_MergeRowVanished_404(t *testing.T) { + stackNeedDB(t) + seedDB, clean := testhelpers.SetupTestDB(t) + defer clean() + teamIDStr := testhelpers.MustCreateTeamDB(t, seedDB, "pro") + teamID := uuid.MustParse(teamIDStr) + _, slug := seedStack(t, seedDB, &teamID, "healthy") + jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "vanish@example.com") + + app := stackFaultApp(t, openForUpdateVanishDB(t)) + status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`) + assert.Equal(t, http.StatusNotFound, status) + assert.Contains(t, body, "not_found") +} diff --git a/internal/models/coverage_stack_test.go b/internal/models/coverage_stack_test.go index c2c85999..9eb56471 100644 --- a/internal/models/coverage_stack_test.go +++ b/internal/models/coverage_stack_test.go @@ -327,3 +327,90 @@ func TestUpdateStackEnvVars_Branches(t *testing.T) { mock3.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnError(errors.New("boom")) require.ErrorContains(t, UpdateStackEnvVars(ctx, db3, uuid.New(), map[string]string{"a": "b"}), "boom") } + +// TestMergeStackEnvVars_Branches exercises every error/return arm of the atomic +// PATCH merge with sqlmock: BeginTx error, select→NotFound, select error, +// unmarshal error, over-cap rollback, update error, commit error, and the happy +// path (upsert + present-key delete counting). The real-DB serialization proof +// lives in handlers' TestMergeStackEnvVars_ConcurrentPatchesNoLostUpdate. +func TestMergeStackEnvVars_Branches(t *testing.T) { + ctx := context.Background() + id := uuid.New() + patch := map[string]string{"A": "1"} + + // 1) BeginTx error. + dbB, mockB := newMock(t) + mockB.ExpectBegin().WillReturnError(errors.New("begin-boom")) + _, _, err := MergeStackEnvVars(ctx, dbB, id, patch) + require.ErrorContains(t, err, "begin-boom") + + // 2) SELECT ... FOR UPDATE → no rows → ErrStackNotFound (rolls back). + dbNF, mockNF := newMock(t) + mockNF.ExpectBegin() + mockNF.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errNoRows()) + mockNF.ExpectRollback() + var nf *ErrStackNotFound + _, _, err = MergeStackEnvVars(ctx, dbNF, id, patch) + require.ErrorAs(t, err, &nf) + + // 3) SELECT error (non-NoRows). + dbSE, mockSE := newMock(t) + mockSE.ExpectBegin() + mockSE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errors.New("sel-boom")) + mockSE.ExpectRollback() + _, _, err = MergeStackEnvVars(ctx, dbSE, id, patch) + require.ErrorContains(t, err, "sel-boom") + + // 4) unmarshal error (malformed jsonb). + dbUM, mockUM := newMock(t) + mockUM.ExpectBegin() + mockUM.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). + WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`not json`))) + mockUM.ExpectRollback() + _, _, err = MergeStackEnvVars(ctx, dbUM, id, patch) + require.ErrorContains(t, err, "unmarshal") + + // 5) over-cap → ErrStackEnvVarsTooLarge, checked BEFORE the UPDATE (rolls back). + dbTL, mockTL := newMock(t) + mockTL.ExpectBegin() + mockTL.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). + WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`))) + mockTL.ExpectRollback() + big := map[string]string{"K": strings.Repeat("x", maxStackEnvVarsBytes+1)} + _, _, err = MergeStackEnvVars(ctx, dbTL, id, big) + require.ErrorIs(t, err, ErrStackEnvVarsTooLarge) + + // 6) UPDATE error. + dbUE, mockUE := newMock(t) + mockUE.ExpectBegin() + mockUE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). + WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`))) + mockUE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnError(errors.New("upd-boom")) + mockUE.ExpectRollback() + _, _, err = MergeStackEnvVars(ctx, dbUE, id, patch) + require.ErrorContains(t, err, "upd-boom") + + // 7) Commit error. + dbCE, mockCE := newMock(t) + mockCE.ExpectBegin() + mockCE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). + WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`))) + mockCE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1)) + mockCE.ExpectCommit().WillReturnError(errors.New("commit-boom")) + _, _, err = MergeStackEnvVars(ctx, dbCE, id, patch) + require.ErrorContains(t, err, "commit-boom") + + // 8) Happy path: existing {A:old, B:keep}; patch upserts A, adds C, deletes B + // (present → counted) and deletes MISSING (absent → NOT counted). + dbOK, mockOK := newMock(t) + mockOK.ExpectBegin() + mockOK.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`). + WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{"A":"old","B":"keep"}`))) + mockOK.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1)) + mockOK.ExpectCommit() + merged, deletes, err := MergeStackEnvVars(ctx, dbOK, id, + map[string]string{"A": "new", "C": "3", "B": "", "MISSING": ""}) + require.NoError(t, err) + require.Equal(t, 1, deletes, "only the present key B counts as a delete") + require.Equal(t, map[string]string{"A": "new", "C": "3"}, merged) +} diff --git a/internal/models/stack.go b/internal/models/stack.go index e12260cf..124636f0 100644 --- a/internal/models/stack.go +++ b/internal/models/stack.go @@ -26,7 +26,7 @@ type Stack struct { Name string Slug string Namespace string - Status string // building|deploying|healthy|failed|stopped|deleting + Status string // building|deploying|healthy|failed|stopped|deleting Tier string Env string // production|staging|dev|... (default 'production') ParentStackID *uuid.UUID // nil for the root stack; set on promoted copies @@ -106,8 +106,10 @@ func GenerateStackSlug() (string, error) { // scanStack reads a single stacks row into a Stack struct. // // Column order is fixed to: -// id, team_id, name, slug, namespace, status, tier, env, parent_stack_id, -// expires_at, fingerprint, created_at, updated_at +// +// id, team_id, name, slug, namespace, status, tier, env, parent_stack_id, +// expires_at, fingerprint, created_at, updated_at +// // — every query in this file must SELECT in this order. func scanStack(row interface { Scan(dest ...any) error @@ -147,8 +149,10 @@ func scanStack(row interface { // scanStackService reads a single stack_services row into a StackService struct. // // Column order is fixed to: -// id, stack_id, name, image_tag, image_ref, status, expose, port, app_url, -// error_msg, created_at +// +// id, stack_id, name, image_tag, image_ref, status, expose, port, app_url, +// error_msg, created_at +// // — every query in this file must SELECT in this order so the scan offsets // stay aligned. func scanStackService(row interface { @@ -668,3 +672,83 @@ func UpdateStackEnvVars(ctx context.Context, db *sql.DB, stackID uuid.UUID, envV } return nil } + +// MergeStackEnvVars applies a PATCH-style env delta to a stack ATOMICALLY. +// +// patch semantics: a non-empty value upserts the key; an empty-string value +// ("") deletes the key. The whole load-merge-save runs inside ONE transaction +// that takes a row lock (SELECT ... FOR UPDATE) on the stack row, so two +// concurrent PATCHes serialize — the second blocks on the lock until the first +// commits, then reads the FIRST's committed env_vars instead of a stale +// pre-merge snapshot. This is the fix for the lost-update bug: the previous +// GetStackEnvVars → merge-in-Go → UpdateStackEnvVars sequence let two PATCHes +// both read {} and blind-overwrite each other, silently dropping a key. +// +// Returns (merged map, #deleted, err). A delete is counted ONLY when the key +// was actually present in the pre-merge set — matching the handler's old +// in-Go loop, so the audit metadata keys_set = len(patch) - deleted math is +// preserved. +// +// Errors mirror UpdateStackEnvVars: *ErrStackNotFound when the row is gone, +// ErrStackEnvVarsTooLarge when the merged payload exceeds maxStackEnvVarsBytes +// (checked inside the tx BEFORE the write, so an over-cap PATCH rolls back and +// leaves the row untouched). +func MergeStackEnvVars(ctx context.Context, db *sql.DB, stackID uuid.UUID, patch map[string]string) (map[string]string, int, error) { + tx, err := db.BeginTx(ctx, nil) + if err != nil { + return nil, 0, fmt.Errorf("models.MergeStackEnvVars: begin: %w", err) + } + // No-op after a successful Commit; rolls back on any early return (over-cap, + // marshal error, scan error) so the row is never left half-updated. + defer func() { _ = tx.Rollback() }() + + var raw []byte + err = tx.QueryRowContext(ctx, ` + SELECT COALESCE(env_vars, '{}'::jsonb) FROM stacks WHERE id = $1 FOR UPDATE + `, stackID).Scan(&raw) + if err == sql.ErrNoRows { + return nil, 0, &ErrStackNotFound{Slug: stackID.String()} + } + if err != nil { + return nil, 0, fmt.Errorf("models.MergeStackEnvVars: select: %w", err) + } + + merged := map[string]string{} + if len(raw) > 0 { + if err := json.Unmarshal(raw, &merged); err != nil { + return nil, 0, fmt.Errorf("models.MergeStackEnvVars: unmarshal: %w", err) + } + } + + deletes := 0 + for k, v := range patch { + if v == "" { + if _, present := merged[k]; present { + delete(merged, k) + deletes++ + } + continue + } + merged[k] = v + } + + // json.Marshal of a map[string]string never errors (no unmarshalable + // types), so the error is intentionally dropped — adding an unreachable + // `if err != nil` branch would be untestable dead code under the 100% + // patch-coverage gate. Same rationale as the cap check below. + envVarsJSON, _ := json.Marshal(merged) + if len(envVarsJSON) > maxStackEnvVarsBytes { + return nil, 0, ErrStackEnvVarsTooLarge + } + + if _, err := tx.ExecContext(ctx, ` + UPDATE stacks SET env_vars = $1, updated_at = now() WHERE id = $2 + `, envVarsJSON, stackID); err != nil { + return nil, 0, fmt.Errorf("models.MergeStackEnvVars: update: %w", err) + } + + if err := tx.Commit(); err != nil { + return nil, 0, fmt.Errorf("models.MergeStackEnvVars: commit: %w", err) + } + return merged, deletes, nil +} diff --git a/internal/models/stack_test.go b/internal/models/stack_test.go index d989e67d..f7adb57f 100644 --- a/internal/models/stack_test.go +++ b/internal/models/stack_test.go @@ -21,6 +21,7 @@ import ( "context" "database/sql" "os" + "sync" "testing" "github.com/google/uuid" @@ -193,3 +194,63 @@ func TestUpdateStackServiceImageRef_BackfillsColumn(t *testing.T) { assert.Equal(t, ref, got[0].ImageRef, "UpdateStackServiceImageRef must back-fill the column for subsequent reads") } + +// TestMergeStackEnvVars_ConcurrentPatchesNoLostUpdate is the bug-bash #10 +// regression: PATCH /stacks/:slug/env used to load-merge-save non-atomically +// (GetStackEnvVars → merge-in-Go → UpdateStackEnvVars), so two concurrent +// PATCHes with disjoint keys both read the same snapshot and the second +// blind-overwrote the first — silently dropping a key. MergeStackEnvVars does +// the whole merge inside ONE row-locked transaction (SELECT ... FOR UPDATE), so +// the two PATCHes serialize and BOTH keys must survive. +// +// We fire N concurrent disjoint-key merges against a real seeded row and assert +// every key is present afterwards. With the old non-atomic path this fails +// (lost updates); with the row lock it passes deterministically. +func TestMergeStackEnvVars_ConcurrentPatchesNoLostUpdate(t *testing.T) { + requireDBStack(t) + db, clean := testhelpers.SetupTestDB(t) + defer clean() + ensureStackTablesModels(t, db) + // env_vars (migration 062) is not in ensureStackTablesModels' base DDL. + _, err := db.Exec(`ALTER TABLE stacks ADD COLUMN IF NOT EXISTS env_vars JSONB NOT NULL DEFAULT '{}'::jsonb`) + require.NoError(t, err) + + teamID := testhelpers.MustCreateTeamDB(t, db, "pro") + stackID := seedStack(t, db, teamID) + + // 16 goroutines, each upserting a unique key. Under a lost-update race some + // of these writes would be clobbered; under the row lock all 16 survive. + const n = 16 + keys := make([]string, n) + for i := range keys { + keys[i] = "KEY_" + uuid.NewString()[:8] + } + + var wg sync.WaitGroup + errs := make([]error, n) + start := make(chan struct{}) + for i := 0; i < n; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + <-start // release all at once to maximize contention + _, _, e := models.MergeStackEnvVars(context.Background(), db, stackID, + map[string]string{keys[i]: "v"}) + errs[i] = e + }(i) + } + close(start) + wg.Wait() + + for i, e := range errs { + require.NoErrorf(t, e, "merge %d (%s) errored", i, keys[i]) + } + + final, err := models.GetStackEnvVars(context.Background(), db, stackID) + require.NoError(t, err) + for _, k := range keys { + assert.Equalf(t, "v", final[k], + "key %s was lost — concurrent PATCH dropped it (lost-update regression)", k) + } + assert.Len(t, final, n, "all %d concurrent disjoint keys must survive", n) +}