From 139b47a0698ec486bcf8474c4fd02f52131140bc Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Sat, 6 Jun 2026 10:14:15 +0530 Subject: [PATCH] test(ci): deploy-failure auto-debug path + anon-stack gap + with_failed_deploy factory (#70) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Failure-diagnosis CI integration tests for the deploy-failure AUTO-DEBUG PATH (docs/ci/02-FAILURE-DIAGNOSIS-AND-AUTODEBUG.md §5), api side. 1. Auto-debug PATH integration test (deploy_autodebug_path_test.go): seeds a status=failed deployment + an older lifecycle row + a failure_autopsy deployment_events row (reason/exit_code/last_lines/hint) against a real test DB, then asserts the full agent debug loop as ONE coherent contract: - GET /api/v1/deployments/:id → status=failed + non-empty error_message (the one-line cause) - GET /api/v1/deployments/:id/events → autopsy with reason + non-empty last_lines + hint, newest-first, count correct - auth-negative: no/invalid bearer → 401 - cross-team: another team's token → 404 (no existence leak) 2. Anonymous-stack failure-diagnosis contract (stack_anon_failure_diag_test.go): drives an anon stack (NULL team_id) to failed, asserts GET /stacks/:slug (slug-bearer, no auth) returns status=failed and the raw build error is persisted on the service row, then PINS the documented gap by enumerating the LIVE router (router.New + GetRoutes) and asserting NO /stacks/:slug/events route exists. Adding a stack-autopsy endpoint later reds this test deliberately. Anon failure-diagnosis is status + logs only (no classified autopsy). 3. with_failed_deploy factory flag (internal_e2e_account.go): cohort-only, inert-by-default pre-seed of ONE failed deployment + ONE failure_autopsy event via the production deploy models (CreateDeployment → UpdateDeploymentStatus → UpsertDeploymentAutopsy), surfaced as failed_deploy_id, reaped with the team.
Lets the web wave load /app/deployments/:id and render the FailureAutopsyPanel against a real backend. Tests: seeds exactly one failed deploy + one autopsy with the factory payload; omitting seeds none; seam-driven seed_failed 503; whitebox sqlmock coverage of all three seed error branches. The producer↔consumer schema parity (worker autopsy write ↔ api /events read) is asserted in the worker PR's deploy_failure_autopsy_schema_parity_test.go (cross-referenced). make gate: green except pre-existing local-only flakes outside this diff (internal/models/TestLinkGitHubID DB-pollution, handlers TestQueue_CredIssueError NATS flake). CI (fresh DB, Go 1.25) is authoritative. New tests + donebar/manner-matrix/error-envelope guards all pass. Co-Authored-By: Claude Opus 4.8 --- .../handlers/deploy_autodebug_path_test.go | 243 ++++++++++++++++++ internal/handlers/internal_e2e_account.go | 148 ++++++++++- .../internal_e2e_account_export_test.go | 23 ++ ...internal_e2e_account_failed_deploy_test.go | 155 +++++++++++ ...internal_e2e_account_seed_whitebox_test.go | 93 +++++++ .../handlers/internal_e2e_account_test.go | 19 +- .../handlers/stack_anon_failure_diag_test.go | 195 ++++++++++++++ 7 files changed, 859 insertions(+), 17 deletions(-) create mode 100644 internal/handlers/deploy_autodebug_path_test.go create mode 100644 internal/handlers/internal_e2e_account_failed_deploy_test.go create mode 100644 internal/handlers/stack_anon_failure_diag_test.go diff --git a/internal/handlers/deploy_autodebug_path_test.go b/internal/handlers/deploy_autodebug_path_test.go new file mode 100644 index 0000000..da7b3b3 --- /dev/null +++ b/internal/handlers/deploy_autodebug_path_test.go @@ -0,0 +1,243 @@ +package handlers_test + +// deploy_autodebug_path_test.go — the end-to-end AUTO-DEBUG PATH integration +// test for a FAILED deployment (task #70, docs/ci/02-FAILURE-DIAGNOSIS-AND- +// AUTODEBUG.md §5.1).
+// +// The pieces of the failure-diagnosis surface are each unit/integration-tested +// elsewhere (deploy_events_endpoint_test.go: ordering/empty/clamp/cross-team; +// deploy_buildfailed_autopsy_test.go: the autopsy "failure" field on GET +// /deploy/:id). This file asserts them as ONE coherent contract — the exact +// loop an MCP agent or the dashboard FailureAutopsyPanel runs to diagnose a +// failed deploy WITHOUT cluster access: +// +// 1. GET /api/v1/deployments/:id → status="failed" + non-empty +// error_message (the one-line cause +// the worker autopsy stamped). +// 2. GET /api/v1/deployments/:id/events → events[] carrying the +// failure_autopsy with reason + +// non-empty last_lines + hint, newest +// first, count correct. +// 3. auth-negative: no / invalid bearer → 401 (the surface is gated). +// 4. cross-team: another team's token → 404 (you can NOT read another +// team's failure — never 403, no +// existence leak). +// +// This mirrors the seeding pattern in deploy_events_endpoint_test.go and +// deploy_lifecycle_block_integration_test.go (real Postgres test DB via +// testhelpers.SetupTestDB, the production RequireAuth chain via +// NewTestAppWithServices), so the HTTP envelope, route resolution, JWT +// middleware, and model SQL path are exercised end-to-end against the same SQL +// the production handler issues. The producer side (worker autopsy) and this +// consumer side (the /events + /:id read) are proven schema-compatible by the +// worker's deploy_failure_autopsy_schema_parity_test.go. + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "instant.dev/internal/testhelpers" +) + +// adbDeploymentEnvelope is the GET /api/v1/deployments/:id response shape (the +// item.error one-liner is the agent's first debug read). 
+type adbDeploymentEnvelope struct { + OK bool `json:"ok"` + Item struct { + AppID string `json:"app_id"` + Status string `json:"status"` + Error string `json:"error"` + } `json:"item"` +} + +// adbEventsEnvelope is the GET /api/v1/deployments/:id/events response shape. +type adbEventsEnvelope struct { + OK bool `json:"ok"` + DeploymentID string `json:"deployment_id"` + Events []struct { + Kind string `json:"kind"` + Reason string `json:"reason"` + ExitCode *int `json:"exit_code"` + Event string `json:"event"` + LastLines []string `json:"last_lines"` + Hint string `json:"hint"` + CreatedAt string `json:"created_at"` + } `json:"events"` + Count int `json:"count"` +} + +// TestDeployAutodebugPath_FailedDeploy_FullAgentLoop is the §5.1 contract: +// status+error_message AND the events autopsy AND auth-negative AND cross-team, +// asserted as one coherent debug-path test against a real test DB. +func TestDeployAutodebugPath_FailedDeploy_FullAgentLoop(t *testing.T) { + db, cleanDB := testhelpers.SetupTestDB(t) + defer cleanDB() + rdb, cleanRedis := testhelpers.SetupTestRedis(t) + defer cleanRedis() + + teamID := testhelpers.MustCreateTeamDB(t, db, "pro") + otherTeamID := testhelpers.MustCreateTeamDB(t, db, "pro") + ownerJWT := testhelpers.MustSignSessionJWT(t, + "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa", teamID, "adb-owner@example.com") + otherJWT := testhelpers.MustSignSessionJWT(t, + "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb", otherTeamID, "adb-other@example.com") + + // Seed a FAILED deployment with the one-line error_message the worker + // autopsy stamps ("<reason>: <hint>"). + depID := uuid.New() + appID := "adb" + uuid.NewString()[:8] + const wantErrorMessage = "OOMKilled: Your app exceeded its memory limit and was killed by the kernel."
+ _, err := db.Exec(` + INSERT INTO deployments (id, team_id, app_id, port, tier, status, error_message) + VALUES ($1, $2, $3, 8080, 'pro', 'failed', $4) + `, depID, teamID, appID, wantErrorMessage) + require.NoError(t, err) + + // Older lifecycle row + newer failure_autopsy row (the real autopsy shape). + _, err = db.Exec(` + INSERT INTO deployment_events + (deployment_id, kind, reason, exit_code, event, last_lines, hint, created_at) + VALUES ($1, 'lifecycle', 'image_pull_failed', NULL, 'ErrImagePull', + '["pulling image","ErrImagePull"]', 'check the image reference', + now() - interval '10 minutes') + `, depID) + require.NoError(t, err) + + autopsyLastLines := []string{ + "npm ERR! code ELIFECYCLE", + "FATAL: out of memory: Killed process 1 (node)", + } + _, err = db.Exec(` + INSERT INTO deployment_events + (deployment_id, kind, reason, exit_code, event, last_lines, hint, created_at) + VALUES ($1, 'failure_autopsy', 'OOMKilled', 137, 'OOMKilling: Memory cgroup out of memory', + '["npm ERR! 
code ELIFECYCLE","FATAL: out of memory: Killed process 1 (node)"]', + 'Your app exceeded its memory limit and was killed by the kernel.', + now() - interval '1 minute') + `, depID) + require.NoError(t, err) + + app, cleanApp := testhelpers.NewTestAppWithServices(t, db, rdb, + "postgres,redis,mongodb,queue,webhook,storage,deploy") + defer cleanApp() + + // ── Step 1: GET /api/v1/deployments/:id → status=failed + error_message ── + t.Run("status_and_error_message", func(t *testing.T) { + req := httptest.NewRequest(http.MethodGet, "/api/v1/deployments/"+appID, nil) + req.Header.Set("Authorization", "Bearer "+ownerJWT) + req.Header.Set("X-Forwarded-For", "10.70.0.1") + resp, err := app.Test(req, 5000) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) + + var env adbDeploymentEnvelope + require.NoError(t, json.NewDecoder(resp.Body).Decode(&env)) + assert.True(t, env.OK) + assert.Equal(t, appID, env.Item.AppID) + assert.Equal(t, "failed", env.Item.Status, + "the agent's first read must show the deploy is failed") + assert.NotEmpty(t, env.Item.Error, + "error_message must be non-empty — it is the one-line cause the agent acts on") + assert.Equal(t, wantErrorMessage, env.Item.Error) + }) + + // ── Step 2: GET /api/v1/deployments/:id/events → autopsy timeline ──────── + t.Run("events_autopsy_timeline", func(t *testing.T) { + req := httptest.NewRequest(http.MethodGet, "/api/v1/deployments/"+appID+"/events", nil) + req.Header.Set("Authorization", "Bearer "+ownerJWT) + req.Header.Set("X-Forwarded-For", "10.70.0.2") + resp, err := app.Test(req, 5000) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) + + var env adbEventsEnvelope + require.NoError(t, json.NewDecoder(resp.Body).Decode(&env)) + assert.True(t, env.OK) + assert.Equal(t, depID.String(), env.DeploymentID, + "deployment_id must echo the canonical UUID the agent can re-query") + assert.Equal(t, 2, env.Count) + 
require.Len(t, env.Events, 2) + + // Newest first (DESC by created_at): the autopsy row leads. + autopsy := env.Events[0] + assert.Equal(t, "failure_autopsy", autopsy.Kind, + "the dedicated classified row is kind=failure_autopsy") + assert.Equal(t, "OOMKilled", autopsy.Reason, + "reason is the machine-readable classification the agent branches on") + require.NotNil(t, autopsy.ExitCode) + assert.Equal(t, 137, *autopsy.ExitCode) + assert.NotEmpty(t, autopsy.LastLines, + "last_lines (the real build/pod error tail) MUST be non-empty — "+ + "it is the surface the agent reads to fix the Dockerfile/config") + assert.Equal(t, autopsyLastLines, autopsy.LastLines) + assert.NotEmpty(t, autopsy.Hint, + "hint is the plain-language remedy the agent acts on") + assert.Contains(t, autopsy.Hint, "memory") + assert.NotEmpty(t, autopsy.CreatedAt) + + // Older row trails. + assert.Equal(t, "image_pull_failed", env.Events[1].Reason, "older row trails (DESC)") + assert.Equal(t, "lifecycle", env.Events[1].Kind) + }) + + // ── Step 3: auth-negative — the debug surface is gated ─────────────────── + t.Run("auth_negative_401", func(t *testing.T) { + // No bearer. + reqNoAuth := httptest.NewRequest(http.MethodGet, "/api/v1/deployments/"+appID+"/events", nil) + reqNoAuth.Header.Set("X-Forwarded-For", "10.70.0.3") + respNoAuth, err := app.Test(reqNoAuth, 5000) + require.NoError(t, err) + defer respNoAuth.Body.Close() + assert.Equal(t, http.StatusUnauthorized, respNoAuth.StatusCode, + "no bearer → 401 (events surface is RequireAuth)") + + // Garbage bearer. 
+ reqBad := httptest.NewRequest(http.MethodGet, "/api/v1/deployments/"+appID+"/events", nil) + reqBad.Header.Set("Authorization", "Bearer not-a-valid-jwt") + reqBad.Header.Set("X-Forwarded-For", "10.70.0.4") + respBad, err := app.Test(reqBad, 5000) + require.NoError(t, err) + defer respBad.Body.Close() + assert.Equal(t, http.StatusUnauthorized, respBad.StatusCode, + "invalid bearer → 401") + }) + + // ── Step 4: cross-team — you can NOT read another team's failure ───────── + t.Run("cross_team_404", func(t *testing.T) { + // /:id (status) read. + reqGet := httptest.NewRequest(http.MethodGet, "/api/v1/deployments/"+appID, nil) + reqGet.Header.Set("Authorization", "Bearer "+otherJWT) + reqGet.Header.Set("X-Forwarded-For", "10.70.0.5") + respGet, err := app.Test(reqGet, 5000) + require.NoError(t, err) + defer respGet.Body.Close() + require.Equal(t, http.StatusNotFound, respGet.StatusCode, + "cross-team GET /:id must be 404, never 403 (no existence leak)") + + // /events read. + reqEv := httptest.NewRequest(http.MethodGet, "/api/v1/deployments/"+appID+"/events", nil) + reqEv.Header.Set("Authorization", "Bearer "+otherJWT) + reqEv.Header.Set("X-Forwarded-For", "10.70.0.6") + respEv, err := app.Test(reqEv, 5000) + require.NoError(t, err) + defer respEv.Body.Close() + require.Equal(t, http.StatusNotFound, respEv.StatusCode, + "cross-team /events must be 404, never 403 (no existence leak)") + + var envelope struct { + OK bool `json:"ok"` + Error string `json:"error"` + } + require.NoError(t, json.NewDecoder(respEv.Body).Decode(&envelope)) + assert.False(t, envelope.OK) + assert.Equal(t, "not_found", envelope.Error) + }) +} diff --git a/internal/handlers/internal_e2e_account.go b/internal/handlers/internal_e2e_account.go index bb1e340..cd9afcc 100644 --- a/internal/handlers/internal_e2e_account.go +++ b/internal/handlers/internal_e2e_account.go @@ -153,6 +153,25 @@ type e2eCreateRequest struct { // snapshot, exactly like a real provision under that tier, and are reaped // 
with the team (team_id→NULL + marked-for-reaper) by ReapAccount. WithResources bool `json:"with_resources"` + + // WithFailedDeploy, when true, pre-seeds ONE deployment row in + // status='failed' (with a one-line error_message) plus ONE failure_autopsy + // deployment_events row (reason/exit_code/last_lines/hint) on the minted + // team — the EXACT shape the worker autopsy writes for a real build + // failure. This lets the web wave load /app/deployments/:id and render the + // FailureAutopsyPanel against a REAL backend (not a mock), and lets an + // agent journey exercise GET /api/v1/deployments/:id/events without first + // having to drive (and fail) a real Kaniko build. + // + // Cohort-only + inert by default (omitted → no deploy seeded). The seeded + // deployment is owned by the minted is_test_cohort team, so it is reaped + // with the team (DeleteTeamHard cascades deployments + deployment_events) + // by ReapAccount. No backend RPC, no k8s namespace — pure DB rows via the + // SAME models the production deploy path uses, so the seed is synchronous + + // sub-millisecond, safe inside the mint request. The seeded deployment's + // app_id is surfaced in the response as failed_deploy_id so the caller can + // navigate straight to /app/deployments/<failed_deploy_id>. + WithFailedDeploy bool `json:"with_failed_deploy"` } // e2eSeedResourceTypes is the closed set of FAST, row-only resource types the @@ -162,6 +181,44 @@ type e2eCreateRequest struct { // automatically expands what the seed creates AND what the seed test asserts. var e2eSeedResourceTypes = []string{"webhook", "cache"} +// e2eFailedDeploy* constants describe the single seeded failed deployment + +// its failure_autopsy event. Named (not inline literals) so the seed payload +// is a single source of truth the seed test asserts against — adding a field +// here is a one-place change.
The shape mirrors what the worker autopsy writes +// for a real OOMKilled build failure so the web FailureAutopsyPanel renders the +// same content it would for a genuine failure. +const ( + // e2eFailedDeployErrorMessage is the one-line cause stamped on + // deployments.error_message (the "<reason>: <hint>" the worker + // autopsy writes; surfaced by GET /api/v1/deployments/:id as item.error). + e2eFailedDeployErrorMessage = "OOMKilled: Your app exceeded its memory limit and was killed by the kernel." + + // e2eFailedDeployReason is the classified failure reason on the autopsy row + // (matches models.FailureReasonOOMKilled — a string literal here keeps this + // file free of an api-internal import the worker-mirror constants avoid). + e2eFailedDeployReason = "OOMKilled" + + // e2eFailedDeployEvent is the k8s event text on the autopsy row. + e2eFailedDeployEvent = "OOMKilling: Memory cgroup out of memory: Killed process 1 (node)" + + // e2eFailedDeployHint is the plain-language remedy on the autopsy row. + e2eFailedDeployHint = "Your app exceeded its memory limit and was killed by the kernel. " + + "Reduce memory usage or upgrade to a tier with a higher memory cap." + + // e2eFailedDeployExitCode is the container exit code on the autopsy row. + e2eFailedDeployExitCode = 137 +) + +// e2eFailedDeployLastLines is the build-pod log tail on the autopsy row — the +// real error output the FailureAutopsyPanel renders and an agent reads to fix +// the failure. Non-empty so the panel's "diagnostics pending" empty-state is +// NOT what the web test sees. +var e2eFailedDeployLastLines = []string{ + "npm ERR! code ELIFECYCLE", + "<--- Last few GCs --->", + "FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory", +} + // authorize runs the X-E2E-Token guard. It returns true iff the token is // configured AND the header matches in constant time.
On any failure it has // ALREADY written the 404 response and bumped the unauthorized metric — the @@ -301,6 +358,23 @@ func (h *E2EAccountHandler) CreateAccount(c *fiber.Ctx) error { seededTokens = toks } + // 3c. Optionally pre-seed ONE failed deployment + its failure_autopsy event + // so the web wave can render the FailureAutopsyPanel against a real + // backend and an agent journey can exercise GET /deployments/:id/events. + // Synchronous (pure DB rows via the production deploy models, no backend + // RPC / no k8s namespace). A seed failure is a hard error for the same + // reason as with_resources: a partial account makes the journey flaky. + var failedDeployID string + if req.WithFailedDeploy { + appID, derr := e2eSeedFailedDeploy(h, ctx, team.ID, tier, env) + if derr != nil { + metrics.E2EAccountTotal.WithLabelValues(e2eMetricOpCreate, e2eResultError).Inc() + slog.Error("internal.e2e.create.failed_deploy_seed_failed", "error", derr, "team_id", team.ID.String()) + return respondError(c, fiber.StatusServiceUnavailable, "seed_failed", "failed to seed failed deployment") + } + failedDeployID = appID + } + // 4. Mint the session JWT with the SAME signer + claim shape the customer // auth path uses, so it authenticates through ordinary RequireAuth. expiresAt := time.Now().UTC().Add(e2eSessionTTL) @@ -338,15 +412,20 @@ func (h *E2EAccountHandler) CreateAccount(c *fiber.Ctx) error { if seededTokens == nil { seededTokens = []string{} } + // failed_deploy_id is the app_id of the seeded failed deployment when + // with_failed_deploy was set, "" otherwise. The caller navigates to + // /app/deployments/<failed_deploy_id> (or GETs /api/v1/deployments/<failed_deploy_id>) + // to drive the FailureAutopsyPanel / events surface against a real backend.
return c.JSON(fiber.Map{ - "team_id": team.ID.String(), - "user_id": user.ID.String(), - "email": email, - "tier": tier, - "session_jwt": sessionJWT, - "expires_at": expiresAt.Format(time.RFC3339), - "seeded_tokens": seededTokens, - "seeded_count": len(seededTokens), + "team_id": team.ID.String(), + "user_id": user.ID.String(), + "email": email, + "tier": tier, + "session_jwt": sessionJWT, + "expires_at": expiresAt.Format(time.RFC3339), + "seeded_tokens": seededTokens, + "seeded_count": len(seededTokens), + "failed_deploy_id": failedDeployID, }) } @@ -388,6 +467,59 @@ func (h *E2EAccountHandler) seedFastResources(ctx context.Context, teamID uuid.U return tokens, nil } +// e2eSeedFailedDeploy pre-seeds ONE failed deployment + its failure_autopsy +// event on teamID, returning the deployment's app_id. It uses the SAME +// production models the real deploy path uses — CreateDeployment (status +// 'building') → UpdateDeploymentStatus(...,"failed", error_message) → +// UpsertDeploymentAutopsy — so the seeded rows are indistinguishable from a +// genuine OOMKilled build failure for every read path (GET /deployments/:id +// item.error, GET /deployments/:id/events autopsy row, the web +// FailureAutopsyPanel). No backend RPC, no k8s namespace: pure DB rows, so the +// seed is synchronous + sub-millisecond. Any error aborts (returns it) — the +// caller turns it into a 503 so CI never receives a half-seeded account. +// +// The deployment is owned by the cohort team and carries the team's tier +// snapshot (TTLPolicy=permanent so the deployment_expirer never sweeps it +// mid-journey); DeleteTeamHard cascades both the deployment and its +// deployment_events on reap. +// +// A package-var seam (not a direct method call) so a test can force the +// caller's seed_failed (503) arm deterministically. 
+var e2eSeedFailedDeploy = (*E2EAccountHandler).seedFailedDeploy + +func (h *E2EAccountHandler) seedFailedDeploy(ctx context.Context, teamID uuid.UUID, tier, env string) (string, error) { + appID := "e2e-fail-" + uuid.NewString()[:10] + + d, err := models.CreateDeployment(ctx, h.db, models.CreateDeploymentParams{ + TeamID: teamID, + AppID: appID, + Port: 8080, + Tier: tier, + Env: env, + TTLPolicy: models.DeployTTLPolicyPermanent, + }) + if err != nil { + return "", fmt.Errorf("seed failed deploy: create: %w", err) + } + + if err := models.UpdateDeploymentStatus(ctx, h.db, d.ID, "failed", e2eFailedDeployErrorMessage); err != nil { + return "", fmt.Errorf("seed failed deploy: set failed: %w", err) + } + + if err := models.UpsertDeploymentAutopsy(ctx, h.db, models.UpsertAutopsyParams{ + DeploymentID: d.ID, + Reason: e2eFailedDeployReason, + ExitCode: sql.NullInt32{Int32: e2eFailedDeployExitCode, Valid: true}, + Event: e2eFailedDeployEvent, + LastLines: e2eFailedDeployLastLines, + Hint: e2eFailedDeployHint, + }); err != nil { + return "", fmt.Errorf("seed failed deploy: autopsy: %w", err) + } + + return appID, nil +} + // ReapAccount handles DELETE /internal/e2e/account/:team_id. func (h *E2EAccountHandler) ReapAccount(c *fiber.Ctx) error { if !h.authorize(c, e2eMetricOpReap) { diff --git a/internal/handlers/internal_e2e_account_export_test.go b/internal/handlers/internal_e2e_account_export_test.go index 1869005..020a3e4 100644 --- a/internal/handlers/internal_e2e_account_export_test.go +++ b/internal/handlers/internal_e2e_account_export_test.go @@ -62,3 +62,26 @@ func SetE2ESignSessionJWTForTest(fn func(jwtSecret string, userID, teamID uuid.U e2eSignSessionJWT = fn return func() { e2eSignSessionJWT = prev } } + +// SetE2ESeedFailedDeployForTest overrides the e2eSeedFailedDeploy seam so a +// test can force CreateAccount's with_failed_deploy seed_failed (503) arm +// deterministically, without making the real deployments table reject an +// insert mid-request. 
Returns a restore func. +func SetE2ESeedFailedDeployForTest(err error) (restore func()) { + prev := e2eSeedFailedDeploy + e2eSeedFailedDeploy = func(_ *E2EAccountHandler, _ context.Context, _ uuid.UUID, _, _ string) (string, error) { + return "", err + } + return func() { e2eSeedFailedDeploy = prev } +} + +// E2EFailedDeploySeedForTest exposes the seeded failed-deploy autopsy payload +// so the seed test asserts the API serves exactly what the factory wrote +// (reason/exit_code/last_lines/hint) — single source of truth, not a re-typed +// expectation. Returns copies so a test cannot mutate the handler's constants. +func E2EFailedDeploySeedForTest() (errorMessage, reason, event, hint string, exitCode int, lastLines []string) { + ll := make([]string, len(e2eFailedDeployLastLines)) + copy(ll, e2eFailedDeployLastLines) + return e2eFailedDeployErrorMessage, e2eFailedDeployReason, e2eFailedDeployEvent, + e2eFailedDeployHint, e2eFailedDeployExitCode, ll +} diff --git a/internal/handlers/internal_e2e_account_failed_deploy_test.go b/internal/handlers/internal_e2e_account_failed_deploy_test.go new file mode 100644 index 0000000..9b7083a --- /dev/null +++ b/internal/handlers/internal_e2e_account_failed_deploy_test.go @@ -0,0 +1,155 @@ +package handlers_test + +// internal_e2e_account_failed_deploy_test.go — coverage for the +// with_failed_deploy factory pre-seed (task #70, +// docs/ci/02-FAILURE-DIAGNOSIS-AND-AUTODEBUG.md §5.4 enabler). +// +// The with_failed_deploy flag lets the web wave load /app/deployments/:id and +// render the FailureAutopsyPanel against a REAL backend. Contract pinned here: +// +// - with_failed_deploy=true → exactly ONE failed deployment + ONE +// failure_autopsy deployment_events row, owned by the minted cohort team, +// carrying the factory's reason/exit_code/last_lines/hint payload; the +// response surfaces the deployment's app_id as failed_deploy_id. +// - with_failed_deploy omitted → ZERO deployments seeded (inert by default). 
+// - non-cohort / token-unset paths unaffected (the seed runs only on a +// successful authorized mint, which is already cohort-scoped). +// - a seed FAILURE surfaces as a 503 seed_failed (never a half-seeded 200). +// +// Seeds are asserted from the DB directly (mirrors the with_resources seed +// test) AND the autopsy payload is compared against the handler's exported +// single-source-of-truth constants (E2EFailedDeploySeedForTest) so a future +// payload edit auto-updates the assertion rather than drifting. + +import ( + "context" + "errors" + "net/http" + "testing" + + "github.com/stretchr/testify/require" + + "instant.dev/internal/handlers" + "instant.dev/internal/testhelpers" +) + +// TestE2EAccount_Create_WithFailedDeploy_SeedsOneFailedDeployAndAutopsy asserts +// the seed writes exactly one failed deployment + one autopsy event with the +// factory's payload, owned by the minted team, and surfaces failed_deploy_id. +func TestE2EAccount_Create_WithFailedDeploy_SeedsOneFailedDeployAndAutopsy(t *testing.T) { + skipUnlessE2EDB(t) + db, cleanup := testhelpers.SetupTestDB(t) + defer cleanup() + app := newE2ETestApp(t, db, nil, testE2EToken) + + resp := postE2ECreate(t, app, testE2EToken, `{"tier":"pro","with_failed_deploy":true}`) + require.Equal(t, http.StatusOK, resp.StatusCode) + out := decodeE2ECreate(t, resp) + + require.NotEmpty(t, out.FailedDeployID, + "failed_deploy_id must be surfaced so the web wave can navigate to it") + + ctx := context.Background() + + // Exactly ONE deployment, in status=failed, owned by the minted team, with + // the factory's one-line error_message. 
+ wantErrMsg, wantReason, wantEvent, wantHint, wantExit, wantLines := + handlers.E2EFailedDeploySeedForTest() + + var ( + depCount int + status string + appID string + teamID string + errorMessage string + ) + require.NoError(t, db.QueryRowContext(ctx, + `SELECT count(*) FROM deployments WHERE team_id = $1`, out.TeamID).Scan(&depCount)) + require.Equal(t, 1, depCount, "exactly one deployment must be seeded") + + require.NoError(t, db.QueryRowContext(ctx, ` + SELECT app_id, status, team_id::text, error_message + FROM deployments WHERE team_id = $1 + `, out.TeamID).Scan(&appID, &status, &teamID, &errorMessage)) + require.Equal(t, out.FailedDeployID, appID, "failed_deploy_id must echo the seeded deployment's app_id") + require.Equal(t, "failed", status) + require.Equal(t, out.TeamID, teamID, "seeded deployment must be owned by the minted team") + require.Equal(t, wantErrMsg, errorMessage, "error_message must be the factory's one-liner") + + // Exactly ONE failure_autopsy deployment_events row with the factory payload. 
+ var ( + depID string + eventCount int + autReason string + autExitCode int + autEvent string + autHint string + autLastLines []byte + ) + require.NoError(t, db.QueryRowContext(ctx, + `SELECT id::text FROM deployments WHERE team_id = $1`, out.TeamID).Scan(&depID)) + + require.NoError(t, db.QueryRowContext(ctx, ` + SELECT count(*) FROM deployment_events + WHERE deployment_id = $1 AND kind = 'failure_autopsy' + `, depID).Scan(&eventCount)) + require.Equal(t, 1, eventCount, "exactly one failure_autopsy event must be seeded") + + require.NoError(t, db.QueryRowContext(ctx, ` + SELECT reason, exit_code, event, hint, last_lines + FROM deployment_events + WHERE deployment_id = $1 AND kind = 'failure_autopsy' + `, depID).Scan(&autReason, &autExitCode, &autEvent, &autHint, &autLastLines)) + + require.Equal(t, wantReason, autReason) + require.Equal(t, wantExit, autExitCode) + require.Equal(t, wantEvent, autEvent) + require.Equal(t, wantHint, autHint) + // last_lines is JSONB — assert it carries the factory's (non-empty) tail. + require.NotEmpty(t, wantLines, "factory last_lines must be non-empty by design") + for _, line := range wantLines { + require.Contains(t, string(autLastLines), line, + "seeded last_lines must carry the factory's build-error tail") + } +} + +// TestE2EAccount_Create_WithoutFailedDeploy_SeedsNothing pins inert-by-default: +// omitting with_failed_deploy seeds ZERO deployments and surfaces an empty +// failed_deploy_id. 
+func TestE2EAccount_Create_WithoutFailedDeploy_SeedsNothing(t *testing.T) { + skipUnlessE2EDB(t) + db, cleanup := testhelpers.SetupTestDB(t) + defer cleanup() + app := newE2ETestApp(t, db, nil, testE2EToken) + + resp := postE2ECreate(t, app, testE2EToken, `{"tier":"free"}`) + require.Equal(t, http.StatusOK, resp.StatusCode) + out := decodeE2ECreate(t, resp) + require.Empty(t, out.FailedDeployID, + "failed_deploy_id must be empty when with_failed_deploy is omitted") + + var n int + require.NoError(t, db.QueryRowContext(context.Background(), + `SELECT count(*) FROM deployments WHERE team_id = $1`, out.TeamID).Scan(&n)) + require.Equal(t, 0, n, "no deployment must be seeded when with_failed_deploy is omitted") +} + +// TestE2EAccount_Create_WithFailedDeploy_SeedFailure_Returns503 forces the +// failed-deploy seed to fail (via the e2eSeedFailedDeploy seam) and asserts +// CreateAccount surfaces a 503 seed_failed — CI must never receive a +// half-seeded account. +func TestE2EAccount_Create_WithFailedDeploy_SeedFailure_Returns503(t *testing.T) { + skipUnlessE2EDB(t) + db, cleanup := testhelpers.SetupTestDB(t) + defer cleanup() + app := newE2ETestApp(t, db, nil, testE2EToken) + + restore := handlers.SetE2ESeedFailedDeployForTest(errors.New("deploy seed exploded")) + defer restore() + + resp := postE2ECreate(t, app, testE2EToken, `{"tier":"pro","with_failed_deploy":true}`) + require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode, + "a failed-deploy seed failure must surface as 503, never a half-seeded 200") + out := decodeE2ECreate(t, resp) + require.Equal(t, "seed_failed", out.Error) +} diff --git a/internal/handlers/internal_e2e_account_seed_whitebox_test.go b/internal/handlers/internal_e2e_account_seed_whitebox_test.go index 6223958..2b58f1f 100644 --- a/internal/handlers/internal_e2e_account_seed_whitebox_test.go +++ b/internal/handlers/internal_e2e_account_seed_whitebox_test.go @@ -11,6 +11,7 @@ package handlers import ( "context" + "database/sql" "errors" 
"testing" "time" @@ -75,3 +76,95 @@ func TestSeedFastResources_MarkResourceActiveError(t *testing.T) { require.Nil(t, toks) require.NoError(t, mock.ExpectationsWereMet()) } + +// ── seedFailedDeploy error arms ────────────────────────────────────────────── +// +// The with_failed_deploy seed has three error branches; the happy path is +// covered end-to-end by the external suite against a real test DB. These drive +// each failure branch deterministically with sqlmock so the 100%-patch gate is +// satisfied without a flaky "make the real DB fail" dance: +// +// - CreateDeployment error → "seed failed deploy: create: ..." +// - UpdateDeploymentStatus err → "seed failed deploy: set failed: ..." +// - UpsertDeploymentAutopsy err → "seed failed deploy: autopsy: ..." + +// failedDeployReturningRow builds a single deployments row in the column order +// scanDeployment expects, so a mocked CreateDeployment INSERT … RETURNING parses +// cleanly and the test can advance to the UPDATE / autopsy steps. Mirrors the +// AddRow shape in deploy_redeploy_inplace_mock_test.go (deploymentColumnsList). 
+func failedDeployReturningRow() *sqlmock.Rows { + envVarsJSON := []byte("{}") + return sqlmock.NewRows(deploymentColumnsList).AddRow( + uuid.New(), // id + uuid.New(), // team_id + uuid.NullUUID{}, // resource_id + "e2e-fail-x", // app_id + "app-e2e-fail-x", // provider_id + "building", // status + "", // app_url + envVarsJSON, // env_vars + 8080, // port + "pro", // tier + "development", // env + false, // private + "", // allowed_ips + sql.NullString{}, // error_message + time.Now(), time.Now(), // created_at, updated_at + sql.NullString{}, sql.NullString{}, "unset", 0, // notify_* + sql.NullTime{}, "permanent", 0, sql.NullTime{}, // ttl_* + "tarball", "", "", // source, image_ref, registry_creds_enc + "", "", "", // git_url, git_ref, git_token_enc + sql.NullTime{}, false, false, // last_activity_at, scaled_to_zero, always_on + ) +} + +func TestSeedFailedDeploy_CreateDeploymentError(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp)) + require.NoError(t, err) + defer db.Close() + + mock.ExpectQuery(`INSERT INTO deployments`).WillReturnError(errors.New("insert boom")) + + h := &E2EAccountHandler{db: db, cfg: &config.Config{}} + appID, serr := h.seedFailedDeploy(context.Background(), uuid.New(), "pro", "") + require.Error(t, serr) + require.Contains(t, serr.Error(), "seed failed deploy: create") + require.Contains(t, serr.Error(), "insert boom") + require.Empty(t, appID) + require.NoError(t, mock.ExpectationsWereMet()) +} + +func TestSeedFailedDeploy_UpdateStatusError(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp)) + require.NoError(t, err) + defer db.Close() + + mock.ExpectQuery(`INSERT INTO deployments`).WillReturnRows(failedDeployReturningRow()) + mock.ExpectExec(`UPDATE deployments`).WillReturnError(errors.New("update boom")) + + h := &E2EAccountHandler{db: db, cfg: &config.Config{}} + appID, serr := h.seedFailedDeploy(context.Background(), uuid.New(), 
"pro", "") + require.Error(t, serr) + require.Contains(t, serr.Error(), "seed failed deploy: set failed") + require.Contains(t, serr.Error(), "update boom") + require.Empty(t, appID) + require.NoError(t, mock.ExpectationsWereMet()) +} + +func TestSeedFailedDeploy_AutopsyError(t *testing.T) { + db, mock, err := sqlmock.New(sqlmock.QueryMatcherOption(sqlmock.QueryMatcherRegexp)) + require.NoError(t, err) + defer db.Close() + + mock.ExpectQuery(`INSERT INTO deployments`).WillReturnRows(failedDeployReturningRow()) + mock.ExpectExec(`UPDATE deployments`).WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectExec(`INSERT INTO deployment_events`).WillReturnError(errors.New("autopsy boom")) + + h := &E2EAccountHandler{db: db, cfg: &config.Config{}} + appID, serr := h.seedFailedDeploy(context.Background(), uuid.New(), "pro", "") + require.Error(t, serr) + require.Contains(t, serr.Error(), "seed failed deploy: autopsy") + require.Contains(t, serr.Error(), "autopsy boom") + require.Empty(t, appID) + require.NoError(t, mock.ExpectationsWereMet()) +} diff --git a/internal/handlers/internal_e2e_account_test.go b/internal/handlers/internal_e2e_account_test.go index ccae448..de3cfb7 100644 --- a/internal/handlers/internal_e2e_account_test.go +++ b/internal/handlers/internal_e2e_account_test.go @@ -92,15 +92,16 @@ func newE2ETestApp(t *testing.T, db *sql.DB, rdb *redis.Client, token string) *f // e2eCreateResp is the create-endpoint response shape we assert on. 
type e2eCreateResp struct { - TeamID string `json:"team_id"` - UserID string `json:"user_id"` - Email string `json:"email"` - Tier string `json:"tier"` - SessionJWT string `json:"session_jwt"` - ExpiresAt string `json:"expires_at"` - SeededTokens []string `json:"seeded_tokens"` - SeededCount int `json:"seeded_count"` - Error string `json:"error"` + TeamID string `json:"team_id"` + UserID string `json:"user_id"` + Email string `json:"email"` + Tier string `json:"tier"` + SessionJWT string `json:"session_jwt"` + ExpiresAt string `json:"expires_at"` + SeededTokens []string `json:"seeded_tokens"` + SeededCount int `json:"seeded_count"` + FailedDeployID string `json:"failed_deploy_id"` + Error string `json:"error"` } func postE2ECreate(t *testing.T, app *fiber.App, token, body string) *http.Response { diff --git a/internal/handlers/stack_anon_failure_diag_test.go b/internal/handlers/stack_anon_failure_diag_test.go new file mode 100644 index 0000000..c75c56e --- /dev/null +++ b/internal/handlers/stack_anon_failure_diag_test.go @@ -0,0 +1,195 @@ +package handlers_test + +// stack_anon_failure_diag_test.go — the ANONYMOUS-stack failure-diagnosis +// contract test (task #70, docs/ci/02-FAILURE-DIAGNOSIS-AND-AUTODEBUG.md §3 + +// §5.2). +// +// Anonymous users cannot use /deploy/new (RequireAuth; deployments.team_id is +// NOT NULL — memory project_anonymous_deploy_via_stacks_not_deploy_new). They +// deploy via POST /stacks/new (OptionalAuth; anon stacks carry NULL team_id). +// +// ANON FAILURE-DIAGNOSIS IS STATUS + LOGS ONLY (the documented gap): +// +// - GET /stacks/:slug (slug-bearer, NO auth) returns status="failed" — the +// stack-level failure is visible to the anonymous owner. +// - the raw err.Error() string the deploy goroutine hit is persisted at the +// SERVICE level (stack_services.error_msg via UpdateStackServiceStatus; +// UpdateStackStatus's errMsg arg is intentionally NOT persisted — the +// stacks table has no error column). 
So the failure string lives on the +// service row, and the per-service build logs are read via +// GET /stacks/:slug/logs/:svc. +// - there is NO classified autopsy: NO /stacks/:slug/events route, NO +// reason/last_lines/hint. That is the diagnosis-quality gap vs the +// authenticated /api/v1/deployments/:id/events surface. +// +// This test PINS that contract so that: +// (a) anon users provably get status=failed (regression guard on the thin +// surface they DO have), and +// (b) adding a stack-autopsy endpoint later is a DELIBERATE, test-updating +// change — the route-absence assertion below REDS the moment someone adds +// GET /stacks/:slug/events, forcing them to update this contract. +// +// In short: anon failure-diagnosis is status + logs only (gap: no classified +// autopsy). + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "instant.dev/internal/config" + "instant.dev/internal/email" + "instant.dev/internal/plans" + "instant.dev/internal/router" + "instant.dev/internal/testhelpers" +) + +// anonStackEventsRoutePath is the route that would expose a classified autopsy +// to anonymous stack owners. It does NOT exist today — the assertion below pins +// its absence. Named so a future PR adding the route greps to exactly here. +const anonStackEventsRoutePath = "/stacks/:slug/events" + +// TestStackAnonFailureDiag_StatusAndLogsOnly drives an anonymous stack to +// status=failed and asserts the thin diagnosis surface: GET /stacks/:slug +// (slug-bearer, no auth) returns status=failed, and the failure string is +// stored on the service row. Anon failure-diagnosis is status + logs only +// (gap: no classified autopsy). 
+func TestStackAnonFailureDiag_StatusAndLogsOnly(t *testing.T) {
+	requireCoverageDB(t)
+	db, cleanDB := testhelpers.SetupTestDB(t)
+	defer cleanDB()
+	ensureStackTables2(t, db)
+
+	// Seed the anonymous shape: a nil teamID gives the stack a NULL team_id
+	// (the mig-005 anon-stack shape), already in status 'failed'.
+	stackID, slug := seedStack(t, db, nil, "failed")
+
+	// seedStack leaves the 'web' service 'healthy'. Mark it failed and store
+	// the raw build error on the SERVICE row — the place where
+	// UpdateStackServiceStatus(...,"failed", errMsg) puts the deploy
+	// goroutine's err.Error() — so the real failure-string surface is tested.
+	const rawBuildErr = "kaniko build failed: COPY failed: no source files were specified"
+	_, execErr := db.Exec(`
+		UPDATE stack_services SET status = 'failed', error_msg = $2
+		WHERE stack_id = $1
+	`, stackID, rawBuildErr)
+	require.NoError(t, execErr)
+
+	app, _ := newCoverageStackApp(t, db)
+
+	// No Authorization header on purpose: for an anonymous stack the slug is
+	// the bearer, so the owner reads GET /stacks/:slug unauthenticated.
+	resp, reqErr := app.Test(httptest.NewRequest(http.MethodGet, "/stacks/"+slug, nil), 5000)
+	require.NoError(t, reqErr)
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusOK, resp.StatusCode,
+		"anon owner reads their own stack by slug with NO auth")
+
+	// Decode only the fields this contract pins; the nested service entries get
+	// a small local type so the failed-status scan below stays readable.
+	type serviceView struct {
+		Name   string `json:"name"`
+		Status string `json:"status"`
+	}
+	var got struct {
+		OK       bool          `json:"ok"`
+		StackID  string        `json:"stack_id"`
+		Status   string        `json:"status"`
+		Services []serviceView `json:"services"`
+	}
+	require.NoError(t, json.NewDecoder(resp.Body).Decode(&got))
+	assert.True(t, got.OK)
+	assert.Equal(t, slug, got.StackID)
+	assert.Equal(t, "failed", got.Status,
+		"anon stack failure IS visible at the stack level (status=failed)")
+
+	// serializeServices deliberately leaves the raw error_msg out of the status
+	// JSON (that detail is read via the logs surface), so only the service's
+	// failed status is checked here; the stored string is checked further down.
+	require.NotEmpty(t, got.Services)
+	failedSeen := false
+	for i := range got.Services {
+		failedSeen = failedSeen || got.Services[i].Status == "failed"
+	}
+	assert.True(t, failedSeen, "the failed service is surfaced in the status JSON")
+
+	// Truth surface: stack_services.error_msg holds the raw err.Error() string,
+	// and it is what the logs/diagnostics path reads. The stacks table has NO
+	// error column (UpdateStackStatus's errMsg arg is discarded by design), so
+	// the service row is the only place the failure string lives.
+	var persistedErr string
+	failedRow := db.QueryRow(
+		`SELECT error_msg FROM stack_services WHERE stack_id = $1 AND status = 'failed'`,
+		stackID)
+	require.NoError(t, failedRow.Scan(&persistedErr))
+	assert.Equal(t, rawBuildErr, persistedErr,
+		"the raw build error is persisted on the service row (the anon truth surface)")
+}
+
+// TestStackAnonFailureDiag_NoClassifiedAutopsyEndpoint pins the documented gap:
+// there is NO /stacks/:slug/events route. An anonymous stack owner gets status
+// + logs but NO classified reason/last_lines/hint (unlike the authenticated
+// /api/v1/deployments/:id/events surface).
+//
+// This walks the LIVE production router (router.New + GetRoutes) — the same
+// authoritative route table the done-bar guard uses — so the assertion can't
+// drift from what's actually mounted. The moment someone adds a stack-autopsy
+// route, this test REDS, forcing the §3 contract + the anon-gap docs to be
+// updated deliberately rather than the gap silently closing untested.
+func TestStackAnonFailureDiag_NoClassifiedAutopsyEndpoint(t *testing.T) {
+	rdb := redis.NewClient(&redis.Options{Addr: "127.0.0.1:6379"})
+	defer func() { _ = rdb.Close() }()
+
+	app := router.New(anonStackRouterConfig(), nil, rdb, nil, email.NewNoop(), plans.Default(), nil, nil)
+
+	// Under ANY method, no mounted route may carry the /stacks/:slug/events path.
+	for _, route := range app.GetRoutes(true) {
+		assert.NotEqual(t, anonStackEventsRoutePath, route.Path,
+			"a %s %s route now EXISTS — anonymous stacks gained a classified-autopsy "+
+				"endpoint. This closes the documented §3 gap; UPDATE this test + "+
+				"docs/ci/02-FAILURE-DIAGNOSIS-AND-AUTODEBUG.md §3 to assert the new "+
+				"reason/last_lines/hint contract instead of the absence.",
+			route.Method, route.Path)
+	}
+
+	// Guard against a vacuous pass (stacks routes not registered at all): the
+	// surfaces anon DOES have — status + per-service logs — must be mounted.
+	var statusMounted, logsMounted bool
+	for _, route := range app.GetRoutes(true) {
+		if route.Method != http.MethodGet {
+			continue
+		}
+		switch route.Path {
+		case "/stacks/:slug":
+			statusMounted = true
+		case "/stacks/:slug/logs/:svc":
+			logsMounted = true
+		}
+	}
+	assert.True(t, statusMounted, "GET /stacks/:slug (status surface) must be mounted")
+	assert.True(t, logsMounted, "GET /stacks/:slug/logs/:svc (logs surface) must be mounted")
+}
+
+// anonStackRouterConfig is a minimal config sufficient for router.New to mount
+// the full route table for the route-presence enumeration above. No DB call is
+// made (the test only inspects GetRoutes, never serves a request), so the nil
+// db passed to router.New is safe here.
+func anonStackRouterConfig() *config.Config {
+	// AdminPathPrefix is left empty → admin subtree skipped; irrelevant to stacks.
+	cfg := new(config.Config)
+	cfg.Port = "8080"
+	cfg.JWTSecret = testhelpers.TestJWTSecret
+	cfg.AESKey = testhelpers.TestAESKeyHex
+	cfg.EnabledServices = "postgres,redis,mongodb,queue,webhook,storage,deploy"
+	cfg.Environment = "development"
+	cfg.PostgresProvisionBackend = "local"
+	cfg.ComputeProvider = "noop"
+	cfg.QueueBackend = "legacy_open"
+	cfg.ObjectStoreBucket = "instant-shared"
+	return cfg
+}