Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ func TestStackUpdateEnv_TooLarge_Returns413(t *testing.T) {
VALUES ($1, $2, $3, 'healthy', 'pro', 'production') RETURNING id
`, teamID, slug, "instant-stack-"+slug).Scan(&stackID))

// A single value > 64KiB blows the cap inside UpdateStackEnvVars.
// A single value > 64KiB blows the cap inside MergeStackEnvVars.
big := make([]byte, 70*1024)
for i := range big {
big[i] = 'A'
Expand Down
10 changes: 9 additions & 1 deletion internal/handlers/faultdb_deployasync_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ var errFaultInjected = errors.New("faultdb: injected failure")
// successful Query/Exec calls allowed before injection begins; -1 disables
// injection (pass-through).
type faultConfig struct {
calls atomic.Int64
calls atomic.Int64
failAfter int64
}

Expand Down Expand Up @@ -87,6 +87,14 @@ func (c *faultConn) ExecContext(ctx context.Context, query string, args []driver
return nil, driver.ErrSkip
}

// BeginTx returns the inner (unwrapped) driver.Tx and does NOT itself consume
// the failAfter budget. Note this does NOT exempt in-transaction queries from
// fault injection: lib/pq's driver.Tx implements neither driver.QueryerContext
// nor driver.ExecerContext, so database/sql routes sql.Tx.QueryContext /
// ExecContext back through this conn's QueryContext / ExecContext — which DO
// honor shouldFail(). That's why models.MergeStackEnvVars' tx-internal
// SELECT ... FOR UPDATE and UPDATE are still fault-injectable (see
// stack_final2_test.go).
func (c *faultConn) BeginTx(ctx context.Context, opts driver.TxOptions) (driver.Tx, error) {
if bt, ok := c.inner.(driver.ConnBeginTx); ok {
return bt.BeginTx(ctx, opts)
Expand Down
60 changes: 20 additions & 40 deletions internal/handlers/stack.go
Original file line number Diff line number Diff line change
Expand Up @@ -1150,18 +1150,19 @@ type updateStackEnvBody struct {
// NEVER persisted — the silent-data-loss failure mode. Now backed by
// migration 062's stacks.env_vars JSONB column. The handler:
//
// 1. Loads existing env_vars from the row.
// 2. Merges the incoming body's `env` map into the existing set (PATCH
// semantics — each call is incremental, not replace-all). Setting a
// key to the empty string deletes it (matches the dashboard contract
// and the env-var convention for "absent" elsewhere on the platform).
// 3. Validates every key against isValidEnvKey (POSIX [A-Z_][A-Z0-9_]*),
// 1. Validates every key against isValidEnvKey (POSIX [A-Z_][A-Z0-9_]*),
// mirroring deploy.go and /stacks/new so PATCH cannot smuggle in a
// key shape the create/redeploy paths would reject async.
// 4. Persists via UpdateStackEnvVars.
// 5. Emits a best-effort audit_log row (kind=stack.env.updated) for the
// 2. Atomically merges the incoming body's `env` map into the existing set
// (PATCH semantics — each call is incremental, not replace-all) via
// models.MergeStackEnvVars, a single row-locked transaction. Setting a
// key to the empty string deletes it (matches the dashboard contract
// and the env-var convention for "absent" elsewhere on the platform).
// The row lock serializes concurrent PATCHes so no key is lost to a
// read-modify-write race (bug-bash #10).
// 3. Emits a best-effort audit_log row (kind=stack.env.updated) for the
// dashboard activity feed and the support panel.
// 6. Returns the FULL merged env in the response so the caller doesn't
// 4. Returns the FULL merged env in the response so the caller doesn't
// have to re-GET to see the new state.
//
// Auth required — anonymous stacks cannot be mutated after creation.
Expand Down Expand Up @@ -1213,44 +1214,23 @@ func (h *StackHandler) UpdateEnv(c *fiber.Ctx) error {
"Env-var key "+quoteForError(badKey)+" must match POSIX shape [A-Z_][A-Z0-9_]*")
}

// Load existing env, merge, save. Empty-string value deletes the key —
// matches the dashboard's PATCH-with-delete affordance.
existing, err := models.GetStackEnvVars(c.Context(), h.db, stack.ID)
// Load-merge-save ATOMICALLY in one row-locked transaction. Empty-string
// value deletes the key — matches the dashboard's PATCH-with-delete
// affordance. The single MergeStackEnvVars call replaces the previous
// GetStackEnvVars → merge-in-Go → UpdateStackEnvVars sequence, which had a
// lost-update race: two concurrent PATCHes both read the same snapshot and
// the second blind-overwrote the first, silently dropping a key (bug-bash
// #10). MergeStackEnvVars serializes concurrent PATCHes via SELECT ... FOR
// UPDATE, so the second reads the first's committed result.
merged, deletes, err := models.MergeStackEnvVars(c.Context(), h.db, stack.ID, body.Env)
if err != nil {
var notFound *models.ErrStackNotFound
if errors.As(err, &notFound) {
// Row vanished between GetStackBySlug and here. Treat as 404.
return respondError(c, fiber.StatusNotFound, "not_found", "Stack not found")
}
slog.Error("stack.env.fetch_failed",
"slug", slug, "team_id", team.ID, "stack_id", stack.ID, "error", err)
return respondError(c, fiber.StatusServiceUnavailable, "fetch_failed",
"Failed to fetch existing env vars")
}
if existing == nil {
existing = map[string]string{}
}
merged := make(map[string]string, len(existing)+len(body.Env))
for k, v := range existing {
merged[k] = v
}
deletes := 0
for k, v := range body.Env {
if v == "" {
delete(merged, k)
deletes++
continue
}
merged[k] = v
}

if err := models.UpdateStackEnvVars(c.Context(), h.db, stack.ID, merged); err != nil {
if errors.Is(err, models.ErrStackEnvVarsTooLarge) {
return respondError(c, fiber.StatusRequestEntityTooLarge, "env_too_large",
"Total env_vars payload exceeds 64KiB. Trim values or split across services.")
}
var notFound *models.ErrStackNotFound
if errors.As(err, &notFound) {
// Row vanished between GetStackBySlug and the merge tx. Treat as 404.
return respondError(c, fiber.StatusNotFound, "not_found", "Stack not found")
}
slog.Error("stack.env.persist_failed",
Expand Down
139 changes: 132 additions & 7 deletions internal/handlers/stack_final2_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,38 @@ package handlers_test
// and a LATER one errors, so we seed a team-owned stack on the pooled DB and
// run the handler over a fault DB sharing the same postgres DSN.
//
// * UpdateEnv GetStackEnvVars error → fetch_failed (stack.go L1185-1188, failAfter=2)
// * UpdateEnv UpdateStackEnvVars error → persist_failed (L1216-1219, failAfter=3)
// * Family GetStackBySlug error → fetch_failed (L1885, failAfter=1)
// * UpdateEnv MergeStackEnvVars error → persist_failed (bug-bash #10)
// * Family GetStackBySlug error → fetch_failed (failAfter=1)
//
// bug-bash #10 (2026-06-04): UpdateEnv's old GetStackEnvVars → merge-in-Go →
// UpdateStackEnvVars sequence was a non-atomic read-modify-write that lost keys
// under concurrency. It was replaced by a single atomic models.MergeStackEnvVars
// call (one row-locked transaction). That collapsed the two distinct DB-error
// arms (fetch_failed for the read, persist_failed for the write) into ONE
// surface: any error out of MergeStackEnvVars — including the tx's internal
// SELECT ... FOR UPDATE and the UPDATE — maps to persist_failed 503. (Note the
// merge's tx-internal queries DO fault through the shared faultConn: lib/pq's
// driver.Tx has no QueryerContext, so sql.Tx.QueryContext/ExecContext fall back
// to the conn-level faultConn.QueryContext/ExecContext, which honor the
// failAfter budget.) Both former tests now assert the single persist_failed
// surface at the two query depths (the in-tx SELECT and the in-tx UPDATE).

import (
"context"
"database/sql"
"database/sql/driver"
"errors"
"io"
"net/http"
"net/http/httptest"
"os"
"strings"
"sync"
"testing"

"github.com/gofiber/fiber/v2"
"github.com/google/uuid"
"github.com/lib/pq"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

Expand Down Expand Up @@ -82,7 +99,12 @@ func patchStackEnvF2(t *testing.T, app *fiber.App, slug, jwt, body string) (int,
return resp.StatusCode, string(raw[:n])
}

func TestStackFinal2_UpdateEnv_FetchEnvFailed(t *testing.T) {
// TestStackFinal2_UpdateEnv_MergeSelectFailed faults the merge tx's internal
// SELECT ... FOR UPDATE. team(1)+GetStackBySlug(2) ok, then MergeStackEnvVars
// begins a tx and its SELECT (3rd conn-level query) errors → persist_failed.
// (Pre-bug-bash-#10 this depth was the GetStackEnvVars "fetch_failed" arm; the
// atomic merge collapsed it into the single persist_failed surface.)
func TestStackFinal2_UpdateEnv_MergeSelectFailed(t *testing.T) {
stackNeedDB(t)
seedDB, clean := testhelpers.SetupTestDB(t)
defer clean()
Expand All @@ -91,13 +113,16 @@ func TestStackFinal2_UpdateEnv_FetchEnvFailed(t *testing.T) {
_, slug := seedStack(t, seedDB, &teamID, "healthy")
jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com")

// team(1)+GetStackBySlug(2) ok, GetStackEnvVars(3) errors.
// team(1)+GetStackBySlug(2) ok, merge tx SELECT ... FOR UPDATE(3) errors.
app := stackFaultApp(t, openFaultDB(t, 2))
status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`)
assert.Equal(t, http.StatusServiceUnavailable, status)
assert.Contains(t, body, "fetch_failed")
assert.Contains(t, body, "persist_failed")
}

// TestStackFinal2_UpdateEnv_PersistFailed faults the merge tx's internal UPDATE.
// team(1)+slug(2)+merge SELECT(3) ok, the merge tx UPDATE(4) errors →
// persist_failed.
func TestStackFinal2_UpdateEnv_PersistFailed(t *testing.T) {
stackNeedDB(t)
seedDB, clean := testhelpers.SetupTestDB(t)
Expand All @@ -107,7 +132,7 @@ func TestStackFinal2_UpdateEnv_PersistFailed(t *testing.T) {
_, slug := seedStack(t, seedDB, &teamID, "healthy")
jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "stkf2@example.com")

// team(1)+slug(2)+GetStackEnvVars(3) ok, UpdateStackEnvVars(4) errors.
// team(1)+slug(2)+merge SELECT FOR UPDATE(3) ok, merge UPDATE(4) errors.
app := stackFaultApp(t, openFaultDB(t, 3))
status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`)
assert.Equal(t, http.StatusServiceUnavailable, status)
Expand All @@ -131,3 +156,103 @@ func TestStackFinal2_Family_FetchFailed(t *testing.T) {
defer resp.Body.Close()
assert.Equal(t, http.StatusServiceUnavailable, resp.StatusCode)
}

// ── merge-NotFound (TOCTOU) arm coverage ──────────────────────────────────────
//
// The UpdateEnv handler maps a models.ErrStackNotFound out of MergeStackEnvVars
// to a 404 — the row vanished between GetStackBySlug and the merge tx's
// SELECT ... FOR UPDATE (a genuine TOCTOU that can't be timed deterministically
// against a live row). To exercise that handler arm deterministically we proxy
// the real pq driver and return ZERO rows ONLY for the merge's
// `SELECT ... FOR UPDATE` query: GetStackBySlug (no FOR UPDATE) still finds the
// row, so the handler reaches the merge, whose SELECT then sees no rows →
// ErrStackNotFound → 404 not_found.

type forUpdateVanishConn struct{ inner driver.Conn }

func (c *forUpdateVanishConn) Prepare(q string) (driver.Stmt, error) { return c.inner.Prepare(q) }
func (c *forUpdateVanishConn) Close() error { return c.inner.Close() }
func (c *forUpdateVanishConn) Begin() (driver.Tx, error) { return c.inner.Begin() } //nolint:staticcheck

func (c *forUpdateVanishConn) BeginTx(ctx context.Context, opts driver.TxOptions) (driver.Tx, error) {
if bt, ok := c.inner.(driver.ConnBeginTx); ok {
return bt.BeginTx(ctx, opts)
}
return c.inner.Begin() //nolint:staticcheck
}

func (c *forUpdateVanishConn) QueryContext(ctx context.Context, query string, args []driver.NamedValue) (driver.Rows, error) {
if strings.Contains(query, "FOR UPDATE") {
// Simulate the row vanishing: return an empty result set so
// MergeStackEnvVars' Scan yields sql.ErrNoRows → ErrStackNotFound.
return &emptyRows{}, nil
}
if qc, ok := c.inner.(driver.QueryerContext); ok {
return qc.QueryContext(ctx, query, args)
}
return nil, driver.ErrSkip
}

func (c *forUpdateVanishConn) ExecContext(ctx context.Context, query string, args []driver.NamedValue) (driver.Result, error) {
if ec, ok := c.inner.(driver.ExecerContext); ok {
return ec.ExecContext(ctx, query, args)
}
return nil, driver.ErrSkip
}

// emptyRows is a zero-row driver.Rows for the single env_vars column.
type emptyRows struct{}

func (*emptyRows) Columns() []string { return []string{"coalesce"} }
func (*emptyRows) Close() error { return nil }
func (*emptyRows) Next(_ []driver.Value) error { return io.EOF }

type forUpdateVanishDriver struct{ dsn string }

func (d *forUpdateVanishDriver) Open(_ string) (driver.Conn, error) {
inner, err := pq.Open(d.dsn)
if err != nil {
return nil, err
}
return &forUpdateVanishConn{inner: inner}, nil
}

var fuvRegMu sync.Mutex
var fuvRegN int

func openForUpdateVanishDB(t *testing.T) *sql.DB {
t.Helper()
dsn := os.Getenv("TEST_DATABASE_URL")
if dsn == "" {
t.Skip("TEST_DATABASE_URL not set")
}
fuvRegMu.Lock()
fuvRegN++
name := "fuvpq_" + itoaFault(fuvRegN)
sql.Register(name, &forUpdateVanishDriver{dsn: dsn})
fuvRegMu.Unlock()
db, err := sql.Open(name, dsn)
require.NoError(t, err)
db.SetMaxOpenConns(1)
db.SetMaxIdleConns(1)
t.Cleanup(func() { _ = db.Close() })
return db
}

// TestStackFinal2_UpdateEnv_MergeRowVanished_404 covers the handler's
// ErrStackNotFound→404 mapping on the atomic merge: GetStackBySlug finds the
// row, but the merge tx's SELECT ... FOR UPDATE sees none (simulated vanish).
func TestStackFinal2_UpdateEnv_MergeRowVanished_404(t *testing.T) {
stackNeedDB(t)
seedDB, clean := testhelpers.SetupTestDB(t)
defer clean()
teamIDStr := testhelpers.MustCreateTeamDB(t, seedDB, "pro")
teamID := uuid.MustParse(teamIDStr)
_, slug := seedStack(t, seedDB, &teamID, "healthy")
jwt := testhelpers.MustSignSessionJWT(t, uuid.NewString(), teamIDStr, "vanish@example.com")

app := stackFaultApp(t, openForUpdateVanishDB(t))
status, body := patchStackEnvF2(t, app, slug, jwt, `{"env":{"FOO":"bar"}}`)
assert.Equal(t, http.StatusNotFound, status)
assert.Contains(t, body, "not_found")
}
87 changes: 87 additions & 0 deletions internal/models/coverage_stack_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -327,3 +327,90 @@
mock3.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnError(errors.New("boom"))
require.ErrorContains(t, UpdateStackEnvVars(ctx, db3, uuid.New(), map[string]string{"a": "b"}), "boom")
}

// TestMergeStackEnvVars_Branches exercises every error/return arm of the atomic
// PATCH merge with sqlmock: BeginTx error, select→NotFound, select error,
// unmarshal error, over-cap rollback, update error, commit error, and the happy
// path (upsert + present-key delete counting). The real-DB serialization proof
// lives in handlers' TestMergeStackEnvVars_ConcurrentPatchesNoLostUpdate.
func TestMergeStackEnvVars_Branches(t *testing.T) {
ctx := context.Background()
id := uuid.New()
patch := map[string]string{"A": "1"}

// 1) BeginTx error.
dbB, mockB := newMock(t)
mockB.ExpectBegin().WillReturnError(errors.New("begin-boom"))
_, _, err := MergeStackEnvVars(ctx, dbB, id, patch)
require.ErrorContains(t, err, "begin-boom")

// 2) SELECT ... FOR UPDATE → no rows → ErrStackNotFound (rolls back).
dbNF, mockNF := newMock(t)
mockNF.ExpectBegin()
mockNF.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errNoRows())
mockNF.ExpectRollback()
var nf *ErrStackNotFound
_, _, err = MergeStackEnvVars(ctx, dbNF, id, patch)
require.ErrorAs(t, err, &nf)

// 3) SELECT error (non-NoRows).
dbSE, mockSE := newMock(t)
mockSE.ExpectBegin()
mockSE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).WillReturnError(errors.New("sel-boom"))
mockSE.ExpectRollback()
_, _, err = MergeStackEnvVars(ctx, dbSE, id, patch)
require.ErrorContains(t, err, "sel-boom")

// 4) unmarshal error (malformed jsonb).
dbUM, mockUM := newMock(t)
mockUM.ExpectBegin()
mockUM.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`not json`)))
mockUM.ExpectRollback()
_, _, err = MergeStackEnvVars(ctx, dbUM, id, patch)
require.ErrorContains(t, err, "unmarshal")

// 5) over-cap → ErrStackEnvVarsTooLarge, checked BEFORE the UPDATE (rolls back).
dbTL, mockTL := newMock(t)
mockTL.ExpectBegin()
mockTL.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`)))
mockTL.ExpectRollback()
big := map[string]string{"K": strings.Repeat("x", maxStackEnvVarsBytes+1)}
_, _, err = MergeStackEnvVars(ctx, dbTL, id, big)
require.ErrorIs(t, err, ErrStackEnvVarsTooLarge)

// 6) UPDATE error.
dbUE, mockUE := newMock(t)

Check warning on line 384 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".

Check warning on line 384 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
mockUE.ExpectBegin()

Check warning on line 385 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
mockUE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).

Check warning on line 386 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`)))
mockUE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnError(errors.New("upd-boom"))

Check warning on line 388 in internal/models/coverage_stack_test.go

View workflow job for this annotation

GitHub Actions / typos

"UE" should be "USE" or "DUE".
mockUE.ExpectRollback()
_, _, err = MergeStackEnvVars(ctx, dbUE, id, patch)
require.ErrorContains(t, err, "upd-boom")

// 7) Commit error.
dbCE, mockCE := newMock(t)
mockCE.ExpectBegin()
mockCE.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{}`)))
mockCE.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1))
mockCE.ExpectCommit().WillReturnError(errors.New("commit-boom"))
_, _, err = MergeStackEnvVars(ctx, dbCE, id, patch)
require.ErrorContains(t, err, "commit-boom")

// 8) Happy path: existing {A:old, B:keep}; patch upserts A, adds C, deletes B
// (present → counted) and deletes MISSING (absent → NOT counted).
dbOK, mockOK := newMock(t)
mockOK.ExpectBegin()
mockOK.ExpectQuery(`SELECT COALESCE\(env_vars.*FOR UPDATE`).
WillReturnRows(sqlmock.NewRows([]string{"env_vars"}).AddRow([]byte(`{"A":"old","B":"keep"}`)))
mockOK.ExpectExec(`UPDATE stacks SET env_vars`).WillReturnResult(sqlmock.NewResult(0, 1))
mockOK.ExpectCommit()
merged, deletes, err := MergeStackEnvVars(ctx, dbOK, id,
map[string]string{"A": "new", "C": "3", "B": "", "MISSING": ""})
require.NoError(t, err)
require.Equal(t, 1, deletes, "only the present key B counts as a delete")
require.Equal(t, map[string]string{"A": "new", "C": "3"}, merged)
}
Loading
Loading