diff --git a/internal/handlers/deploy.go b/internal/handlers/deploy.go index cf44bd03..71ac311a 100644 --- a/internal/handlers/deploy.go +++ b/internal/handlers/deploy.go @@ -1087,10 +1087,20 @@ func (h *DeployHandler) New(c *fiber.Ctx) error { "Existing deployment has no provider ID yet — initial build may still be running. Try again in a few seconds.") } - // Flip the row to 'building' (mirrors POST /deploy/:id/redeploy). - if err := models.UpdateDeploymentStatus(c.Context(), h.db, existing.ID, "building", ""); err != nil { + // Flip the row to 'building' as a guarded CAS (mirrors POST + // /deploy/:id/redeploy). FindActiveDeploymentByTeamEnvName already + // filters to redeployable statuses, but the reaper can flip this row + // to expired/deleted in the TOCTOU window between that lookup and + // here — the CAS reports 0 rows in that case and we 409 rather than + // resurrecting a dead workload. A driver error is non-determinate; + // log and continue (runRedeployAsync reconciles the status later). + if n, err := models.MarkDeploymentBuilding(c.Context(), h.db, existing.ID); err != nil { slog.Warn("deploy.new.redeploy_status_update_failed", "app_id", existing.AppID, "error", err) + } else if n == 0 { + metrics.DeployRedeployInPlaceTotal.WithLabelValues("not_redeployable").Inc() + return respondError(c, fiber.StatusConflict, errCodeDeploymentNotRedeployable, + "This deployment is no longer in a redeployable state (it was reaped concurrently). Create a new deployment instead.") } // Audit BEFORE the async build (source="deploy_new_in_place" so the @@ -1905,9 +1915,19 @@ func (h *DeployHandler) Redeploy(c *fiber.Ctx) error { "Failed to read tarball bytes") } - // Update status to "building" while the redeploy runs. - if err := models.UpdateDeploymentStatus(c.Context(), h.db, d.ID, "building", ""); err != nil { + // Flip the row to "building" as a guarded CAS. The IsDeploymentTerminal + // gate above is a check-then-act with a TOCTOU window: the reaper can + // flip this row to expired/deleted between that read and here. The CAS + // only matches a redeployable status, so a row reaped in that window + // reports 0 rows and we 409 rather than resurrecting a dead workload. + // A driver error is non-determinate (we can't tell whether the flip + // landed) — log and continue, since runRedeployAsync will reconcile the + // status; only the explicit 0-row CAS miss means "reaped, do not re-arm". + if n, err := models.MarkDeploymentBuilding(c.Context(), h.db, d.ID); err != nil { slog.Warn("deploy.redeploy.status_update_failed", "app_id", appID, "error", err) + } else if n == 0 { + return respondError(c, fiber.StatusConflict, errCodeDeploymentNotRedeployable, + "This deployment is no longer in a redeployable state (it was reaped concurrently). Create a new deployment instead.") } // Emit audit trail BEFORE the async build runs — same shape as diff --git a/internal/handlers/deploy_redeploy_inplace_mock_test.go b/internal/handlers/deploy_redeploy_inplace_mock_test.go index dcd86482..c7e1c20c 100644 --- a/internal/handlers/deploy_redeploy_inplace_mock_test.go +++ b/internal/handlers/deploy_redeploy_inplace_mock_test.go @@ -330,11 +330,13 @@ func TestDeployNew_Redeploy_UpdateStatusError_StillAccepts(t *testing.T) { "", "", "", // git_url, git_ref, git_token_enc (mig 065) )) - // UPDATE deployments SET status = $1 ... → driver error. The handler - // must slog.Warn and CONTINUE (NOT return 5xx) — the redeploy itself - // is still useful because runRedeployAsync will flip the row later. - mock.ExpectExec(`UPDATE deployments\s+SET status = \$1, error_message = \$2, updated_at = now\(\)`). - WithArgs("building", nil, rowID). + // MarkDeploymentBuilding (guarded CAS) → driver error. The handler must + // slog.Warn and CONTINUE (NOT return 5xx) — a driver error is + // non-determinate (we can't tell whether the flip landed), and + // runRedeployAsync will reconcile the row later. Only an explicit 0-row + // CAS miss means "reaped concurrently, return 409". + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + WithArgs(rowID). WillReturnError(errMockRedeployDriver) // MatchExpectationsInOrder=false because the audit goroutine @@ -448,6 +450,282 @@ func TestDeployNew_Redeploy_EmptyProviderID_Returns409(t *testing.T) { require.NoError(t, mock.ExpectationsWereMet()) } +// TestDeployNew_Redeploy_CASMiss_Returns409 pins the #14 TOCTOU hardening on +// the in-place path. FindActiveDeploymentByTeamEnvName returns a redeployable +// row (status=healthy, provider_id set), but between that lookup and the +// 'building' flip the reaper advanced the row to a terminal status. The +// guarded CAS therefore matches 0 rows. The handler MUST 409 +// deployment_not_redeployable (NOT re-arm the dead workload as 'building'). +func TestDeployNew_Redeploy_CASMiss_Returns409(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + app, teamID, _ := redeployMockApp(t, db) + rowID := uuid.New() + + expectTeamLookupOK(mock, teamID, "pro") + + envVarsJSON, _ := json.Marshal(map[string]string{"_name": "raced-reaper"}) + mock.ExpectQuery(`FROM deployments\s+WHERE team_id = \$1\s+AND env = \$2\s+AND env_vars->>'_name' = \$3`). + WithArgs(teamID, "development", "raced-reaper"). + WillReturnRows(sqlmock.NewRows(deploymentColumnsList).AddRow( + rowID, + teamID, + uuid.NullUUID{}, + "racedrpr", + "app-racedrpr", // non-empty provider_id → passes the not_ready guard + "healthy", + "https://racedrpr.deploy.", + envVarsJSON, + 8080, "pro", "development", + false, "", + sql.NullString{}, + time.Now(), time.Now(), + sql.NullString{}, sql.NullString{}, "unset", 0, + sql.NullTime{}, "permanent", 0, sql.NullTime{}, + "tarball", "", "", // source, image_ref, registry_creds_enc (mig 064) + "", "", "", // git_url, git_ref, git_token_enc (mig 065) + )) + + // Guarded CAS matches 0 rows — the reaper won the race. Handler 409s. + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + WithArgs(rowID). + WillReturnResult(sqlmock.NewResult(0, 0)) + + body, ct := multipartRedeployMockBody(t, map[string]string{ + "name": "raced-reaper", + "redeploy": "true", + "port": "8080", + "env": "development", + }) + req := httptest.NewRequest(http.MethodPost, "/deploy/new", body) + req.Header.Set("Content-Type", ct) + + resp, err := app.Test(req, 5000) + require.NoError(t, err) + defer resp.Body.Close() + respBody, _ := io.ReadAll(resp.Body) + require.Equal(t, http.StatusConflict, resp.StatusCode, + "a CAS miss (row reaped concurrently) must 409, not resurrect; body: %s", string(respBody)) + + var errBody struct { + OK bool `json:"ok"` + Error string `json:"error"` + } + require.NoError(t, json.Unmarshal(respBody, &errBody)) + assert.False(t, errBody.OK) + assert.Equal(t, errCodeDeploymentNotRedeployable, errBody.Error, + "CAS-miss response code must be deployment_not_redeployable") + + require.NoError(t, mock.ExpectationsWereMet()) +} + +// redeployByIDMockApp wires a Fiber app exposing POST /deploy/:id/redeploy +// against a sqlmock-backed DB, with the same Locals-fake auth as +// redeployMockApp. Used to exercise Redeploy's guarded-CAS branch +// deterministically (the real-DB test in deploy_async_deployasync_test.go +// can't interpose a reaper between GetDeploymentByAppID and the CAS flip). +func redeployByIDMockApp(t *testing.T, db *sql.DB) (*fiber.App, uuid.UUID) { + t.Helper() + teamID := uuid.New() + + cfg := &config.Config{ + JWTSecret: "test-secret-that-is-at-least-32-bytes-long!!", + AESKey: "0102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f20", + EnabledServices: "deploy", + Environment: "test", + } + h := &DeployHandler{db: db, cfg: cfg, compute: noop.New(), planRegistry: plans.Default()} + + app := fiber.New(fiber.Config{ + BodyLimit: 50 * 1024 * 1024, + ErrorHandler: func(c *fiber.Ctx, err error) error { + if errors.Is(err, ErrResponseWritten) { + return nil + } + code := fiber.StatusInternalServerError + if e, ok := err.(*fiber.Error); ok { + code = e.Code + } + return c.Status(code).JSON(fiber.Map{"ok": false, "error": "internal_error", "message": err.Error()}) + }, + }) + app.Use(func(c *fiber.Ctx) error { + c.Locals(middleware.LocalKeyTeamID, teamID.String()) + c.Locals(middleware.LocalKeyUserID, uuid.NewString()) + return c.Next() + }) + app.Post("/deploy/:id/redeploy", h.Redeploy) + return app, teamID +} + +// TestDeployRedeploy_ByID_CASMiss_Returns409 pins the #14 TOCTOU hardening on +// the POST /deploy/:id/redeploy path. GetDeploymentByAppID returns a +// non-terminal row (status=healthy, provider_id set) so the IsDeploymentTerminal +// gate passes, but the reaper advances the row to a terminal status before the +// 'building' flip. The guarded CAS matches 0 rows, and the handler must 409 +// deployment_not_redeployable rather than resurrecting the reaped workload. +func TestDeployRedeploy_ByID_CASMiss_Returns409(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + app, teamID := redeployByIDMockApp(t, db) + rowID := uuid.New() + appID := "racedbyid" + + expectTeamLookupOK(mock, teamID, "pro") + + envVarsJSON, _ := json.Marshal(map[string]string{"_name": "raced-by-id"}) + mock.ExpectQuery(`SELECT .* FROM deployments WHERE app_id = \$1`). + WithArgs(appID). + WillReturnRows(sqlmock.NewRows(deploymentColumnsList).AddRow( + rowID, + teamID, + uuid.NullUUID{}, + appID, + "app-racedbyid", // non-empty provider_id → passes the not_ready guard + "healthy", // non-terminal → passes IsDeploymentTerminal gate + "https://racedbyid.deploy.", + envVarsJSON, + 8080, "pro", "development", + false, "", + sql.NullString{}, + time.Now(), time.Now(), + sql.NullString{}, sql.NullString{}, "unset", 0, + sql.NullTime{}, "permanent", 0, sql.NullTime{}, + "tarball", "", "", + "", "", "", + )) + + // Guarded CAS matches 0 rows — the reaper won the race after the read. + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + WithArgs(rowID). + WillReturnResult(sqlmock.NewResult(0, 0)) + + body, ct := multipartRedeployMockBody(t, map[string]string{"port": "8080"}) + req := httptest.NewRequest(http.MethodPost, "/deploy/"+appID+"/redeploy", body) + req.Header.Set("Content-Type", ct) + + resp, err := app.Test(req, 5000) + require.NoError(t, err) + defer resp.Body.Close() + respBody, _ := io.ReadAll(resp.Body) + require.Equal(t, http.StatusConflict, resp.StatusCode, + "a CAS miss on the :id redeploy path must 409, not resurrect; body: %s", string(respBody)) + + var errBody struct { + OK bool `json:"ok"` + Error string `json:"error"` + } + require.NoError(t, json.Unmarshal(respBody, &errBody)) + assert.False(t, errBody.OK) + assert.Equal(t, errCodeDeploymentNotRedeployable, errBody.Error) + + require.NoError(t, mock.ExpectationsWereMet()) +} + +// TestDeployRedeploy_ByID_CASSuccess_Returns202 is the happy-path partner: the +// guarded CAS matches 1 row (the deploy was still redeployable), so the +// handler proceeds to 202 and launches the async build. Pins that the CAS +// guard does NOT break the normal redeploy. +func TestDeployRedeploy_ByID_CASSuccess_Returns202(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + app, teamID := redeployByIDMockApp(t, db) + rowID := uuid.New() + appID := "happybyid" + + expectTeamLookupOK(mock, teamID, "pro") + + envVarsJSON, _ := json.Marshal(map[string]string{"_name": "happy-by-id"}) + mock.ExpectQuery(`SELECT .* FROM deployments WHERE app_id = \$1`). + WithArgs(appID). + WillReturnRows(sqlmock.NewRows(deploymentColumnsList).AddRow( + rowID, teamID, uuid.NullUUID{}, appID, "app-happybyid", "healthy", + "https://happybyid.deploy.", envVarsJSON, + 8080, "pro", "development", false, "", sql.NullString{}, + time.Now(), time.Now(), + sql.NullString{}, sql.NullString{}, "unset", 0, + sql.NullTime{}, "permanent", 0, sql.NullTime{}, + "tarball", "", "", "", "", "", + )) + + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + WithArgs(rowID). + WillReturnResult(sqlmock.NewResult(0, 1)) + + // emitDeployAudit + runRedeployAsync race the response; don't pin further. + mock.MatchExpectationsInOrder(false) + + body, ct := multipartRedeployMockBody(t, map[string]string{"port": "8080"}) + req := httptest.NewRequest(http.MethodPost, "/deploy/"+appID+"/redeploy", body) + req.Header.Set("Content-Type", ct) + + resp, err := app.Test(req, 5000) + require.NoError(t, err) + defer resp.Body.Close() + respBody, _ := io.ReadAll(resp.Body) + require.Equal(t, http.StatusAccepted, resp.StatusCode, + "a successful CAS (1 row) must 202 and launch the async build; body: %s", string(respBody)) + + time.Sleep(50 * time.Millisecond) // let the async goroutines drain +} + +// TestDeployRedeploy_ByID_CASDriverError_StillAccepts pins the :id redeploy +// driver-error arm of the guarded CAS. A driver error on MarkDeploymentBuilding +// is non-determinate (we can't tell whether the flip landed), so the handler +// must slog.Warn and CONTINUE to 202 — runRedeployAsync reconciles the status +// later. Only an explicit 0-row CAS miss means "reaped, 409". +func TestDeployRedeploy_ByID_CASDriverError_StillAccepts(t *testing.T) { + db, mock, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + + app, teamID := redeployByIDMockApp(t, db) + rowID := uuid.New() + appID := "drvbyid" + + expectTeamLookupOK(mock, teamID, "pro") + + envVarsJSON, _ := json.Marshal(map[string]string{"_name": "drv-by-id"}) + mock.ExpectQuery(`SELECT .* FROM deployments WHERE app_id = \$1`). + WithArgs(appID). + WillReturnRows(sqlmock.NewRows(deploymentColumnsList).AddRow( + rowID, teamID, uuid.NullUUID{}, appID, "app-drvbyid", "healthy", + "https://drvbyid.deploy.", envVarsJSON, + 8080, "pro", "development", false, "", sql.NullString{}, + time.Now(), time.Now(), + sql.NullString{}, sql.NullString{}, "unset", 0, + sql.NullTime{}, "permanent", 0, sql.NullTime{}, + "tarball", "", "", "", "", "", + )) + + // Guarded CAS → driver error (non-determinate). Handler logs + continues. + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + WithArgs(rowID). + WillReturnError(errMockRedeployDriver) + + // emitDeployAudit + runRedeployAsync race the response; don't pin further. + mock.MatchExpectationsInOrder(false) + + body, ct := multipartRedeployMockBody(t, map[string]string{"port": "8080"}) + req := httptest.NewRequest(http.MethodPost, "/deploy/"+appID+"/redeploy", body) + req.Header.Set("Content-Type", ct) + + resp, err := app.Test(req, 5000) + require.NoError(t, err) + defer resp.Body.Close() + respBody, _ := io.ReadAll(resp.Body) + require.Equal(t, http.StatusAccepted, resp.StatusCode, + "a CAS driver error must NOT block the 202 accept; body: %s", string(respBody)) + + time.Sleep(50 * time.Millisecond) // let the async goroutines drain +} + // TestDeployNew_Redeploy_MissingName_AfterValidation pins deploy.go:655-661. // The branch fires when shouldRedeployInPlace is true AND name == "". In // practice requireName at line 604 fires first on empty/whitespace input, diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 0e352837..5ade4f81 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -244,10 +244,13 @@ var ( // DeployRedeployInPlaceTotal counts the POST /deploy/new in-place // redeploy outcomes (redeploy=true form field). Labels: // - // outcome = "success" — match found, redeploy compute path invoked - // "not_found" — no live deployment for (team, env, name) - // "wrong_team" — name exists on a different team (404 is - // still returned — we never confirm existence) + // outcome = "success" — match found, redeploy compute path invoked + // "not_found" — no live deployment for (team, env, name) + // "wrong_team" — name exists on a different team (404 is + // still returned — we never confirm existence) + // "not_redeployable" — row was reaped (expired/deleted) in the + // TOCTOU window between lookup and the + // guarded 'building' CAS → 409 (#14) // // Closes the agent-UX gap surfaced 2026-05-30 (duplicate-URL incident): // agents previously called /deploy/new repeatedly, minting a fresh diff --git a/internal/models/coverage_deployment_test.go b/internal/models/coverage_deployment_test.go index 68bf5f72..0bcfb1e6 100644 --- a/internal/models/coverage_deployment_test.go +++ b/internal/models/coverage_deployment_test.go @@ -340,6 +340,38 @@ func TestMarkDeploymentTornDown_Branches(t *testing.T) { require.ErrorContains(t, err, "boom") } +// TestMarkDeploymentBuilding_Branches pins the guarded-CAS behaviour of the +// redeploy 'building' flip (#14, sweep 2026-06-04): +// - 1 row affected → the row was redeployable, swap landed (caller proceeds) +// - 0 rows affected → the row was reaped to a terminal status in the TOCTOU +// window; the swap was a no-op (caller must 409) +// - driver error → wrapped, non-determinate (caller logs + continues) +func TestMarkDeploymentBuilding_Branches(t *testing.T) { + ctx := context.Background() + + // 1 row matched — redeployable status flipped to building. + db, mock := newMock(t) + mock.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + WillReturnResult(sqlmock.NewResult(0, 1)) + n, err := MarkDeploymentBuilding(ctx, db, uuid.New()) + require.NoError(t, err) + require.Equal(t, int64(1), n) + + // 0 rows matched — the row was reaped to a terminal status; CAS no-op. + db2, mock2 := newMock(t) + mock2.ExpectExec(`UPDATE deployments\s+SET status = 'building', error_message = NULL, updated_at = now\(\)\s+WHERE id = \$1 AND status IN`). + WillReturnResult(sqlmock.NewResult(0, 0)) + n, err = MarkDeploymentBuilding(ctx, db2, uuid.New()) + require.NoError(t, err) + require.Equal(t, int64(0), n) + + // driver error — wrapped. + db3, mock3 := newMock(t) + mock3.ExpectExec(`UPDATE deployments\s+SET status = 'building'`).WillReturnError(errors.New("boom")) + _, err = MarkDeploymentBuilding(ctx, db3, uuid.New()) + require.ErrorContains(t, err, "boom") +} + func TestCountDeploymentsByTeam_Branches(t *testing.T) { ctx := context.Background() diff --git a/internal/models/deployment.go b/internal/models/deployment.go index 081e13a2..843ec50a 100644 --- a/internal/models/deployment.go +++ b/internal/models/deployment.go @@ -503,6 +503,43 @@ func UpdateDeploymentStatus(ctx context.Context, db *sql.DB, id uuid.UUID, statu return nil } +// redeployableStatusesSQL is the SQL IN-list of statuses from which a +// deployment may be flipped back to 'building' by a redeploy. It is the exact +// inverse of IsDeploymentTerminal (expired/deleted/stopped are excluded) plus +// 'failed' (a failed build is retryable). Kept in lockstep with +// FindActiveDeploymentByTeamEnvName's WHERE clause and IsDeploymentTerminal — +// the redeploy-guard test pins the classification. +const redeployableStatusesSQL = `('building', 'deploying', 'healthy', 'failed')` + +// MarkDeploymentBuilding flips a deployment back to 'building' as a guarded +// compare-and-swap: the UPDATE only matches a row whose current status is +// redeployable (building/deploying/healthy/failed). RowsAffected reports +// whether the swap happened. +// +// TOCTOU hardening (#14, sweep 2026-06-04): both redeploy entry points +// (POST /deploy/:id/redeploy and POST /deploy/new redeploy=true) read the row, +// assert it is non-terminal, then flip it to 'building'. Between the read and +// the flip the DeploymentExpirer / teardown reconciler can reap the row to +// 'expired'/'deleted'. An unconditional UPDATE would resurrect that reaped +// workload back to 'building'. The guarded WHERE makes a reaped row report +// 0 rows so the handler can return 409 instead of re-arming it — mirroring +// the CAS guards already on MarkDeploymentTornDown and SetDeploymentTTL. +// +// updated_at is set to now() by the database. error_message is cleared (a +// fresh build supersedes any prior failure message). +func MarkDeploymentBuilding(ctx context.Context, db *sql.DB, id uuid.UUID) (int64, error) { + res, err := db.ExecContext(ctx, ` + UPDATE deployments + SET status = 'building', error_message = NULL, updated_at = now() + WHERE id = $1 AND status IN `+redeployableStatusesSQL+` + `, id) + if err != nil { + return 0, fmt.Errorf("models.MarkDeploymentBuilding: %w", err) + } + n, _ := res.RowsAffected() + return n, nil +} + // UpdateDeploymentProviderID records the k8s Deployment name and the resolved app URL // after the k8s Deployment object has been successfully created. // updated_at is set to now() by the database.