diff --git a/infra/newrelic/alerts/webhook-auth-failures.json b/infra/newrelic/alerts/webhook-auth-failures.json new file mode 100644 index 00000000..2134465d --- /dev/null +++ b/infra/newrelic/alerts/webhook-auth-failures.json @@ -0,0 +1,31 @@ +{ + "name": "instant-api — webhook auth failures (Brevo/SES — secret unset or signature mismatch)", + "policyName": "instant-api alerts", + "description": "P1 operator signal. Fires when inbound email-provider webhooks (Brevo HMAC, SES/SNS RSA) authenticate-fail at the api boundary. Two distinct reasons are emitted: (a) reason='secret_unset' — the operator has not deployed BREVO_WEBHOOK_SECRET / SES_SNS_SUBSCRIPTION_ARN to the api Deployment. This means EVERY inbound provider event is rejected and forwarder_sent.classification is never updated. Rule 12 (email truth surface) is broken until the operator deploys the secret. CRITICAL within 5 min — fix is one kubectl set env. (b) reason='signature_mismatch' — secret IS configured but the inbound signature does not verify. Real causes: (1) provider rotated their signing key, (2) drive-by probe traffic, (3) man-in-the-middle on the webhook URL. WARN at >10/h — operator should rotate the secret in the dashboard if the provider confirms a key rotation. Source: api/internal/handlers/email_webhooks.go webhookAuthFailure() helper; counter instant_webhook_auth_failures_total. API-19/96/97/98 (QA 2026-05-29).", + "enabled": true, + "nrql": { + "query": "SELECT count(*) FROM Metric WHERE metricName = 'instant_webhook_auth_failures_total' AND reason = 'secret_unset' FACET webhook" + }, + "terms": [ + { + "priority": "CRITICAL", + "operator": "ABOVE", + "threshold": 0, + "thresholdDuration": 300, + "thresholdOccurrences": "ALL" + } + ], + "signal": { + "aggregationWindow": 60, + "aggregationMethod": "EVENT_FLOW", + "aggregationDelay": 120, + "fillOption": "STATIC", + "fillValue": 0 + }, + "expiration": { + "expirationDuration": 3600, + "openViolationOnExpiration": false, + "closeViolationsOnExpiration": true + }, + "violationTimeLimitSeconds": 86400 +} diff --git a/internal/handlers/auth_first_ordering_test.go b/internal/handlers/auth_first_ordering_test.go new file mode 100644 index 00000000..fb563e0c --- /dev/null +++ b/internal/handlers/auth_first_ordering_test.go @@ -0,0 +1,583 @@ +package handlers_test + +// auth_first_ordering_test.go — API-19/26/27/28/77/78/96/97/98 (QA 2026-05-29). +// +// Asserts that the webhook + /internal/* auth check runs BEFORE any body +// or path-param validation. The pre-fix posture inverted fail-closed: a +// probe could distinguish "secret unset" / "secret wrong" from "path +// malformed" / "body malformed" by the 401 vs 400 envelope. These tests +// lock in the auth-first ordering and the new explicit error codes: +// +// webhook_secret_mismatch — provider secret env var unset +// webhook_signature_mismatch — secret IS configured, body sig bad +// webhook_method_not_allowed — GET on a POST-only webhook URL +// internal_token_required — /internal/* unauth +// +// Hermetic: no DB needed. We construct each handler with nil or sqlmock +// DB and exercise it through a minimal Fiber app. + +import ( + "bytes" + "encoding/json" + "errors" + "io" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gofiber/fiber/v2" + "github.com/golang-jwt/jwt/v4" + "github.com/stretchr/testify/require" + + "instant.dev/internal/config" + "instant.dev/internal/handlers" +) + +// testTerminateSecret is the per-test worker secret used by the preVerify +// arms below. 32 bytes so HS256 is happy with arbitrary keys. +const testTerminateSecret = "auth-first-secret-32-bytes-yyyyy" + +// mintTestJWT builds an HS256 JWT with the supplied purpose, claim-name+id +// pair, and an `iat` clock offset (seconds; 0 = now). Used to drive the +// preVerify* arms that depend on purpose / iat freshness. +func mintTestJWT(t *testing.T, secret, purpose, idClaim, idValue string, iatOffsetSec int64) string { + t.Helper() + claims := jwt.MapClaims{ + "purpose": purpose, + idClaim: idValue, + "iat": time.Now().Add(time.Duration(iatOffsetSec) * time.Second).Unix(), + } + tok := jwt.NewWithClaims(jwt.SigningMethodHS256, claims) + signed, err := tok.SignedString([]byte(secret)) + require.NoError(t, err) + return signed +} + +// mintTestJWTNoIat builds an HS256 JWT WITHOUT an iat claim — drives the +// "missing iat" arm of preVerify*. +func mintTestJWTNoIat(t *testing.T, secret, purpose, idClaim, idValue string) string { + t.Helper() + claims := jwt.MapClaims{ + "purpose": purpose, + idClaim: idValue, + } + tok := jwt.NewWithClaims(jwt.SigningMethodHS256, claims) + signed, err := tok.SignedString([]byte(secret)) + require.NoError(t, err) + return signed +} + +// fiberAppWithErrorHandler builds a minimal Fiber app whose ErrorHandler +// translates handlers.ErrResponseWritten back to "no-op" — every test in +// this file relies on respondError having already written the envelope. +func fiberAppWithErrorHandler() *fiber.App { + return fiber.New(fiber.Config{ + ErrorHandler: func(c *fiber.Ctx, err error) error { + if errors.Is(err, handlers.ErrResponseWritten) { + return nil + } + code := fiber.StatusInternalServerError + if e, ok := err.(*fiber.Error); ok { + code = e.Code + } + return c.Status(code).JSON(fiber.Map{"ok": false, "error": "internal_error", "message": err.Error()}) + }, + }) +} + +// decodeEnvelope reads the response body into a generic map for code +// assertions. Returns a non-nil map even on parse error so callers can +// keep their assertions terse. +func decodeEnvelope(t *testing.T, resp *http.Response) map[string]any { + t.Helper() + defer resp.Body.Close() + raw, err := io.ReadAll(resp.Body) + require.NoError(t, err) + out := map[string]any{} + _ = json.Unmarshal(raw, &out) + return out +} + +// ── Brevo HMAC webhook ─────────────────────────────────────────────────────── + +// TestBrevoWebhook_SecretUnset_Returns401_WithWebhookSecretMismatch covers +// API-19/96. Pre-fix the route returned 401 invalid_signature; post-fix +// the SECRET-unset branch must return 401 webhook_secret_mismatch so an +// operator alert can target the deploy-the-secret incident specifically. +func TestBrevoWebhook_SecretUnset_Returns401_WithWebhookSecretMismatch(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + cfg := &config.Config{BrevoWebhookSecret: ""} + h := handlers.NewEmailWebhookHandler(db, cfg) + + app := fiberAppWithErrorHandler() + app.Post("/api/v1/email/webhook/brevo", h.Brevo) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/email/webhook/brevo", + bytes.NewReader([]byte(`{"event":"hard_bounce"}`))) + req.Header.Set("Content-Type", "application/json") + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, "webhook_secret_mismatch", env["error"]) +} + +// TestBrevoWebhook_SignatureMismatch_Returns401_WithWebhookSignatureMismatch +// covers the secret-set-but-sig-bad branch. Distinct error code so observability +// can split "secret unset" from "signature mismatch" (provider rotated key / +// drive-by attacker). +func TestBrevoWebhook_SignatureMismatch_Returns401_WithWebhookSignatureMismatch(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + cfg := &config.Config{BrevoWebhookSecret: "test-secret-32-bytes-padded-xxxx"} + h := handlers.NewEmailWebhookHandler(db, cfg) + + app := fiberAppWithErrorHandler() + app.Post("/api/v1/email/webhook/brevo", h.Brevo) + + req := httptest.NewRequest(http.MethodPost, "/api/v1/email/webhook/brevo", + bytes.NewReader([]byte(`{"event":"hard_bounce","email":"x@y.test"}`))) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("X-Sib-Signature", "deadbeef-not-a-real-sig") + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, "webhook_signature_mismatch", env["error"]) +} + +// TestBrevoWebhook_GetReturns405_WithWebhookMethodNotAllowed covers API-98: +// dashboard pre-flight GET on the webhook URL must see 405 + Allow: POST +// rather than the catch-all 401, so the dashboard interprets the URL as +// valid-but-method-wrong instead of unauthenticated-abandon. +func TestBrevoWebhook_GetReturns405_WithWebhookMethodNotAllowed(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + cfg := &config.Config{} + h := handlers.NewEmailWebhookHandler(db, cfg) + + app := fiberAppWithErrorHandler() + app.Get("/api/v1/email/webhook/brevo", h.BrevoMethodNotAllowed) + + req := httptest.NewRequest(http.MethodGet, "/api/v1/email/webhook/brevo", nil) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusMethodNotAllowed, resp.StatusCode) + require.Equal(t, "POST", resp.Header.Get("Allow")) + env := decodeEnvelope(t, resp) + require.Equal(t, "webhook_method_not_allowed", env["error"]) +} + +// ── SES/SNS webhook ────────────────────────────────────────────────────────── + +// TestSESWebhook_SecretUnset_Returns401_BeforeBodyParse covers API-97: +// the secret-unset (SES_SNS_SUBSCRIPTION_ARN empty) check must fire before +// envelope parsing. A junk body MUST return 401 webhook_secret_mismatch +// (not 400 invalid_payload) so a probe can't distinguish the two by +// manipulating the payload. +func TestSESWebhook_SecretUnset_Returns401_BeforeBodyParse(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + cfg := &config.Config{SESSNSTopicARN: ""} + h := handlers.NewEmailWebhookHandler(db, cfg) + + app := fiberAppWithErrorHandler() + app.Post("/api/v1/email/webhook/ses", h.SES) + + // Junk body — would 400 invalid_payload pre-fix because envelope parse + // ran before the secret check. Post-fix the secret-unset branch wins. + req := httptest.NewRequest(http.MethodPost, "/api/v1/email/webhook/ses", + bytes.NewReader([]byte(`THIS IS NOT JSON AT ALL`))) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, "webhook_secret_mismatch", env["error"]) +} + +// TestSESWebhook_BadSignature_Returns401_WithWebhookSignatureMismatch: +// secret IS configured, but the inbound envelope's TopicArn does not match. +// Distinct from the SECRET-unset path above. +func TestSESWebhook_BadSignature_Returns401_WithWebhookSignatureMismatch(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + cfg := &config.Config{SESSNSTopicARN: "arn:aws:sns:us-east-1:123456789012:bounces"} + h := handlers.NewEmailWebhookHandler(db, cfg) + h.DisableSNSVerifierForTest() + + app := fiberAppWithErrorHandler() + app.Post("/api/v1/email/webhook/ses", h.SES) + + body := []byte(`{"Type":"Notification","TopicArn":"arn:aws:sns:us-east-1:999:wrong","Message":"{}"}`) + req := httptest.NewRequest(http.MethodPost, "/api/v1/email/webhook/ses", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, "webhook_signature_mismatch", env["error"]) +} + +// TestSESWebhook_GoodTopicArn_Returns200_NotSuppressionWorthy: the +// auth-passed path still works. A SubscriptionConfirmation envelope with +// the right TopicArn returns 200 (subscription_pending=true) — verifies we +// didn't accidentally 401 a real provider payload. +func TestSESWebhook_GoodTopicArn_Returns200_NotSuppressionWorthy(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + arn := "arn:aws:sns:us-east-1:123456789012:bounces" + cfg := &config.Config{SESSNSTopicARN: arn} + h := handlers.NewEmailWebhookHandler(db, cfg) + h.DisableSNSVerifierForTest() + + app := fiberAppWithErrorHandler() + app.Post("/api/v1/email/webhook/ses", h.SES) + + body := []byte(`{"Type":"SubscriptionConfirmation","TopicArn":"` + arn + `","SubscribeURL":"https://sns.us-east-1.amazonaws.com/?Action=ConfirmSubscription","Message":"{}"}`) + req := httptest.NewRequest(http.MethodPost, "/api/v1/email/webhook/ses", bytes.NewReader(body)) + req.Header.Set("Content-Type", "application/json") + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusOK, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, true, env["ok"]) +} + +// ── Internal endpoints — auth-first ordering ───────────────────────────────── + +// newInternalTerminateApp builds a minimal app wired only to the +// terminate handler with the supplied secret. cancelFn is nil so the +// handler short-circuits the Razorpay step. +func newInternalTerminateApp(secret string) *fiber.App { + db, _, _ := sqlmock.New() + cfg := &config.Config{WorkerInternalJWTSecret: secret} + h := handlers.NewInternalTerminateHandler(db, cfg, nil) + app := fiberAppWithErrorHandler() + app.Post("/internal/teams/:id/terminate", h.Terminate) + return app +} + +// TestInternalTerminate_AuthCheckBeforePathParse covers API-26/77: +// posting a bogus token to a path with a malformed :id must return 401 +// internal_token_required (not 400 invalid_team_id). Pre-fix the path +// parse ran first and surfaced 400 — leaking the route's shape. +func TestInternalTerminate_AuthCheckBeforePathParse(t *testing.T) { + app := newInternalTerminateApp("worker-internal-secret-32-bytes!") + + // Malformed path :id (not a UUID) + bogus bearer. + req := httptest.NewRequest(http.MethodPost, "/internal/teams/NOT-A-UUID/terminate", nil) + req.Header.Set("Authorization", "Bearer junk-token-not-a-real-jwt") + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode, "auth must fire BEFORE path parse") + env := decodeEnvelope(t, resp) + require.Equal(t, "internal_token_required", env["error"]) +} + +// TestInternalTerminate_SecretUnset_RejectsAllRegardlessOfPath: when the +// worker secret env var is unset, EVERY call 401s — even with a malformed +// path. The pre-fix path-parse-first ordering would have surfaced 400 on +// a junk path, telling the probe "this would have been a real endpoint if +// only you sent a valid path". +func TestInternalTerminate_SecretUnset_RejectsAllRegardlessOfPath(t *testing.T) { + app := newInternalTerminateApp("") + + req := httptest.NewRequest(http.MethodPost, "/internal/teams/NOT-A-UUID/terminate", nil) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, "internal_token_required", env["error"]) +} + +// newInternalResendApp wires the resend handler with the supplied secret. +// The mailer is nil — auth-fail paths return before any send is attempted. +func newInternalResendApp(secret string) *fiber.App { + db, _, _ := sqlmock.New() + cfg := &config.Config{WorkerInternalJWTSecret: secret} + // nil mailer is fine — the auth-fail path returns before the mailer + // is touched. + h := handlers.NewInternalResendMagicLinkHandler(db, cfg, nil) + app := fiberAppWithErrorHandler() + app.Post("/internal/email/resend-magic-link", h.Resend) + return app +} + +// TestInternalResendMagicLink_AuthCheckBeforeBodyParse covers API-27/78: +// a malformed body with a bogus bearer must return 401 internal_token_required +// (not 400 invalid_body). Pre-fix body parse ran first. +func TestInternalResendMagicLink_AuthCheckBeforeBodyParse(t *testing.T) { + app := newInternalResendApp("worker-internal-secret-32-bytes!") + + // Malformed body + bogus bearer. + req := httptest.NewRequest(http.MethodPost, "/internal/email/resend-magic-link", + bytes.NewReader([]byte(`THIS IS NOT JSON`))) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer junk-token-not-a-real-jwt") + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode, "auth must fire BEFORE body parse") + env := decodeEnvelope(t, resp) + require.Equal(t, "internal_token_required", env["error"]) +} + +// TestInternalResendMagicLink_SecretUnset_RejectsRegardlessOfBody: secret +// unset → 401 internal_token_required even with junk body. +func TestInternalResendMagicLink_SecretUnset_RejectsRegardlessOfBody(t *testing.T) { + app := newInternalResendApp("") + + req := httptest.NewRequest(http.MethodPost, "/internal/email/resend-magic-link", + bytes.NewReader([]byte(`JUNK`))) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, "internal_token_required", env["error"]) +} + +// newInternalRefundApp wires the refund handler with the supplied secret. +// rdb is nil — the auth-fail path returns before Redis is touched. +func newInternalRefundApp(secret string) *fiber.App { + db, _, _ := sqlmock.New() + cfg := &config.Config{WorkerInternalJWTSecret: secret} + h := handlers.NewInternalBackupRefundHandler(db, nil, cfg) + app := fiberAppWithErrorHandler() + app.Post("/internal/teams/:id/backup-quota/refund", h.Refund) + return app +} + +// TestInternalRefund_AuthCheckBeforePathParse covers API-28: a malformed +// :id with a bogus bearer must return 401 internal_token_required +// (not 400 invalid_team_id). +func TestInternalRefund_AuthCheckBeforePathParse(t *testing.T) { + app := newInternalRefundApp("worker-internal-secret-32-bytes!") + + req := httptest.NewRequest(http.MethodPost, "/internal/teams/NOT-A-UUID/backup-quota/refund", + bytes.NewReader([]byte(`{"backup_id":"junk"}`))) + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer junk-token-not-a-real-jwt") + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode, "auth must fire BEFORE path parse") + env := decodeEnvelope(t, resp) + require.Equal(t, "internal_token_required", env["error"]) +} + +// TestInternalRefund_SecretUnset_RejectsRegardlessOfPath: secret unset → +// 401 internal_token_required even with junk path. +func TestInternalRefund_SecretUnset_RejectsRegardlessOfPath(t *testing.T) { + app := newInternalRefundApp("") + + req := httptest.NewRequest(http.MethodPost, "/internal/teams/NOT-A-UUID/backup-quota/refund", + bytes.NewReader([]byte(`{}`))) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, "internal_token_required", env["error"]) +} + +// ── preVerify* arms — empty bearer / alg pin / token-invalid / iat-skew ───── +// +// 100% patch coverage requires hitting every arm of preVerifyInternalTerminate, +// preVerifyInternalResendMagicLink, preVerifyInternalBackupRefund. The tests +// above cover {missing_bearer, parse_failed (bogus token)}; these cover the +// remaining {empty bearer, wrong-purpose, stale iat} arms via direct POSTs. + +// TestInternalTerminate_PreVerify_WrongPurpose mints a JWT with wrong purpose +// → 401. +func TestInternalTerminate_PreVerify_WrongPurpose(t *testing.T) { + app := newInternalTerminateApp(testTerminateSecret) + teamID := "00000000-0000-0000-0000-000000000001" + jwt := mintTestJWT(t, testTerminateSecret, "WRONG_PURPOSE", "team_id", teamID, 0) + req := httptest.NewRequest(http.MethodPost, "/internal/teams/"+teamID+"/terminate", nil) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) +} + +// TestInternalTerminate_PreVerify_StaleIat mints a JWT with iat 5min ago +// → 401. +func TestInternalTerminate_PreVerify_StaleIat(t *testing.T) { + app := newInternalTerminateApp(testTerminateSecret) + teamID := "00000000-0000-0000-0000-000000000001" + jwt := mintTestJWT(t, testTerminateSecret, "internal_terminate", "team_id", teamID, -5*60) + req := httptest.NewRequest(http.MethodPost, "/internal/teams/"+teamID+"/terminate", nil) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) +} + +// TestInternalTerminate_PreVerify_MissingIat mints a JWT with no iat claim +// → 401. +func TestInternalTerminate_PreVerify_MissingIat(t *testing.T) { + app := newInternalTerminateApp(testTerminateSecret) + teamID := "00000000-0000-0000-0000-000000000001" + // "0" iatOffset with a special sentinel — we just mint without iat. + jwt := mintTestJWTNoIat(t, testTerminateSecret, "internal_terminate", "team_id", teamID) + req := httptest.NewRequest(http.MethodPost, "/internal/teams/"+teamID+"/terminate", nil) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) +} + +// Mirror the above three tests for the resend handler. + +func TestInternalResendMagicLink_PreVerify_WrongPurpose(t *testing.T) { + app := newInternalResendApp(testTerminateSecret) + linkID := "00000000-0000-0000-0000-000000000002" + jwt := mintTestJWT(t, testTerminateSecret, "WRONG", "link_id", linkID, 0) + req := httptest.NewRequest(http.MethodPost, "/internal/email/resend-magic-link", + bytes.NewReader([]byte(`{"link_id":"`+linkID+`"}`))) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) +} + +func TestInternalResendMagicLink_PreVerify_StaleIat(t *testing.T) { + app := newInternalResendApp(testTerminateSecret) + linkID := "00000000-0000-0000-0000-000000000002" + jwt := mintTestJWT(t, testTerminateSecret, "resend_magic_link", "link_id", linkID, -5*60) + req := httptest.NewRequest(http.MethodPost, "/internal/email/resend-magic-link", + bytes.NewReader([]byte(`{"link_id":"`+linkID+`"}`))) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) +} + +func TestInternalResendMagicLink_PreVerify_MissingIat(t *testing.T) { + app := newInternalResendApp(testTerminateSecret) + linkID := "00000000-0000-0000-0000-000000000002" + jwt := mintTestJWTNoIat(t, testTerminateSecret, "resend_magic_link", "link_id", linkID) + req := httptest.NewRequest(http.MethodPost, "/internal/email/resend-magic-link", + bytes.NewReader([]byte(`{"link_id":"`+linkID+`"}`))) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) +} + +// Mirror for refund handler. + +func TestInternalRefund_PreVerify_WrongPurpose(t *testing.T) { + app := newInternalRefundApp(testTerminateSecret) + teamID := "00000000-0000-0000-0000-000000000003" + jwt := mintTestJWT(t, testTerminateSecret, "WRONG", "team_id", teamID, 0) + req := httptest.NewRequest(http.MethodPost, "/internal/teams/"+teamID+"/backup-quota/refund", + bytes.NewReader([]byte(`{}`))) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) +} + +func TestInternalRefund_PreVerify_StaleIat(t *testing.T) { + app := newInternalRefundApp(testTerminateSecret) + teamID := "00000000-0000-0000-0000-000000000003" + jwt := mintTestJWT(t, testTerminateSecret, "internal_backup_refund", "team_id", teamID, -5*60) + req := httptest.NewRequest(http.MethodPost, "/internal/teams/"+teamID+"/backup-quota/refund", + bytes.NewReader([]byte(`{}`))) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) +} + +func TestInternalRefund_PreVerify_MissingIat(t *testing.T) { + app := newInternalRefundApp(testTerminateSecret) + teamID := "00000000-0000-0000-0000-000000000003" + jwt := mintTestJWTNoIat(t, testTerminateSecret, "internal_backup_refund", "team_id", teamID) + req := httptest.NewRequest(http.MethodPost, "/internal/teams/"+teamID+"/backup-quota/refund", + bytes.NewReader([]byte(`{}`))) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusUnauthorized, resp.StatusCode) +} + +// ── More webhook coverage: GET on SES URL, body-parse arms, missing-id arm ── + +// TestSESWebhook_GetReturns405_WithWebhookMethodNotAllowed covers the +// SES counterpart of the Brevo GET handler. +func TestSESWebhook_GetReturns405_WithWebhookMethodNotAllowed(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + cfg := &config.Config{} + h := handlers.NewEmailWebhookHandler(db, cfg) + + app := fiberAppWithErrorHandler() + app.Get("/api/v1/email/webhook/ses", h.SESMethodNotAllowed) + + req := httptest.NewRequest(http.MethodGet, "/api/v1/email/webhook/ses", nil) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusMethodNotAllowed, resp.StatusCode) + require.Equal(t, "POST", resp.Header.Get("Allow")) + env := decodeEnvelope(t, resp) + require.Equal(t, "webhook_method_not_allowed", env["error"]) +} + +// TestSESWebhook_GoodTopicArnButBadEnvelope_Returns400_InvalidPayload covers +// the SECRET-configured-but-junk-body branch — a 400 invalid_payload not a +// 401, since the auth-first gate already accepted the secret as configured. +func TestSESWebhook_GoodTopicArnButBadEnvelope_Returns400_InvalidPayload(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + cfg := &config.Config{SESSNSTopicARN: "arn:aws:sns:us-east-1:123456789012:bounces"} + h := handlers.NewEmailWebhookHandler(db, cfg) + h.DisableSNSVerifierForTest() + + app := fiberAppWithErrorHandler() + app.Post("/api/v1/email/webhook/ses", h.SES) + + // Secret IS configured; body is junk. Auth-first gate accepts the + // secret; envelope parse 400s. + req := httptest.NewRequest(http.MethodPost, "/api/v1/email/webhook/ses", + bytes.NewReader([]byte(`THIS IS NOT JSON`))) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusBadRequest, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, "invalid_payload", env["error"]) +} + +// TestSESWebhook_NotificationWithBadInnerMessage_Returns400_InvalidMessage +// exercises the inner-Message parse arm. +func TestSESWebhook_NotificationWithBadInnerMessage_Returns400_InvalidMessage(t *testing.T) { + db, _, err := sqlmock.New() + require.NoError(t, err) + defer db.Close() + arn := "arn:aws:sns:us-east-1:123456789012:bounces" + cfg := &config.Config{SESSNSTopicARN: arn} + h := handlers.NewEmailWebhookHandler(db, cfg) + h.DisableSNSVerifierForTest() + + app := fiberAppWithErrorHandler() + app.Post("/api/v1/email/webhook/ses", h.SES) + + // Valid envelope (TopicArn matches, Type=Notification) but the inner + // Message field is not JSON. + body := []byte(`{"Type":"Notification","TopicArn":"` + arn + `","Message":"NOT JSON"}`) + req := httptest.NewRequest(http.MethodPost, "/api/v1/email/webhook/ses", bytes.NewReader(body)) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + require.Equal(t, http.StatusBadRequest, resp.StatusCode) + env := decodeEnvelope(t, resp) + require.Equal(t, "invalid_message", env["error"]) +} diff --git a/internal/handlers/email_webhooks.go b/internal/handlers/email_webhooks.go index f56ac075..da2aeca8 100644 --- a/internal/handlers/email_webhooks.go +++ b/internal/handlers/email_webhooks.go @@ -53,9 +53,20 @@ import ( "go.opentelemetry.io/otel/attribute" "instant.dev/internal/config" + "instant.dev/internal/metrics" "instant.dev/internal/models" ) +// webhookAuthFailure increments the inbound-webhook auth-failures counter. +// Centralised so the brevo + ses handlers share one observability surface +// and a single registry-iterating test guards the label set. webhook is +// one of {"brevo_hmac","ses_sns","brevo_url_secret"}; reason is one of +// {"secret_unset","signature_mismatch","missing_signature"}. API-19/96/97/98 +// (QA 2026-05-29). +func webhookAuthFailure(webhook, reason string) { + metrics.WebhookAuthFailuresTotal.WithLabelValues(webhook, reason).Inc() +} + // EmailWebhookHandler holds the deps for both provider endpoints. db is // the platform Postgres; cfg surfaces BrevoWebhookSecret + SESSNSTopicARN; // snsVerifier handles AWS SNS RSA signature verification (cert fetch + @@ -158,11 +169,43 @@ var brevoEventTypeMap = map[string]string{ "blocked": models.EmailEventTypeBounce, // blocked = permanent in practice } +// BrevoMethodNotAllowed handles non-POST verbs on /api/v1/email/webhook/brevo. +// +// API-98 (QA 2026-05-29): Brevo's dashboard sometimes issues a GET against the +// configured webhook URL to confirm "the endpoint exists" before saving. The +// pre-fix path returned a generic 401 (Fiber's app.Use chain hits before any +// per-handler logic on the non-registered GET verb), and dashboards configured +// to abandon-on-401 dropped the URL silently. 405 with the explicit code lets +// the dashboard see "URL exists, only POST accepted" and proceed. +func (h *EmailWebhookHandler) BrevoMethodNotAllowed(c *fiber.Ctx) error { + c.Set("Allow", "POST") + return respondError(c, fiber.StatusMethodNotAllowed, "webhook_method_not_allowed", + "This webhook URL only accepts POST.") +} + +// SESMethodNotAllowed mirrors BrevoMethodNotAllowed for the SES/SNS endpoint. +// SNS itself only POSTs, but operators sometimes curl the URL to confirm the +// configuration. Same 405 + explicit code lets them proceed without thinking +// they hit an unauthorised route. +func (h *EmailWebhookHandler) SESMethodNotAllowed(c *fiber.Ctx) error { + c.Set("Allow", "POST") + return respondError(c, fiber.StatusMethodNotAllowed, "webhook_method_not_allowed", + "This webhook URL only accepts POST.") +} + // Brevo handles POST /api/v1/email/webhook/brevo. // // Returns 401 on bad signature, 400 on unparseable body, 200 on every // other case (including unknown event types — Brevo fires opens/clicks // that we silently drop). +// +// API-19/96 (QA 2026-05-29): auth runs BEFORE any body inspection so an +// unauth POST surfaces the actual fault (webhook_signature_mismatch / +// webhook_secret_mismatch) and triggers the operator-facing agent_action +// instead of the generic user-auth "log in for a new INSTANODE_TOKEN" +// envelope. The split between SECRET unset and SIGNATURE mismatch lets +// observability distinguish "the operator hasn't deployed the secret +// yet" from "the provider sent a payload with a bad signature". func (h *EmailWebhookHandler) Brevo(c *fiber.Ctx) error { ctx, span := otel.Tracer("instant.dev/handlers").Start(c.UserContext(), "email.webhook.brevo") defer span.End() @@ -173,24 +216,34 @@ func (h *EmailWebhookHandler) Brevo(c *fiber.Ctx) error { sig = c.Get(brevoHeaderSignatureLegacy) } + // Fail-closed split: SECRET unset vs SIGNATURE bad. Both still 401, + // but the error CODE + observability label differ. Operators can + // alert on `instant_webhook_auth_failures_total{reason="secret_unset"}` + // to detect "we forgot to deploy the secret"; the `signature_mismatch` + // label detects "the provider rotated their signing key on us". + if h.cfg == nil || h.cfg.BrevoWebhookSecret == "" { + slog.Warn("email.webhook.brevo.secret_unset", + "have_signature", sig != "", + ) + webhookAuthFailure("brevo_hmac", "secret_unset") + return respondError(c, fiber.StatusUnauthorized, "webhook_secret_mismatch", + "Brevo webhook secret is not configured on this api Deployment.") + } if !verifyBrevoSignature(body, sig, h.cfg.BrevoWebhookSecret) { slog.Warn("email.webhook.brevo.signature_failed", - "have_secret", h.cfg.BrevoWebhookSecret != "", + "have_secret", true, "have_signature", sig != "", ) - return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{ - "ok": false, - "error": "invalid_signature", - }) + webhookAuthFailure("brevo_hmac", "signature_mismatch") + return respondError(c, fiber.StatusUnauthorized, "webhook_signature_mismatch", + "Brevo webhook signature did not verify against the body.") } var evt brevoEventPayload if err := json.Unmarshal(body, &evt); err != nil { slog.Warn("email.webhook.brevo.parse_failed", "error", err) - return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{ - "ok": false, - "error": "invalid_payload", - }) + return respondError(c, fiber.StatusBadRequest, "invalid_payload", + "Brevo webhook body could not be parsed as JSON.") } normalized, ok := brevoEventTypeMap[strings.ToLower(evt.Event)] @@ -294,29 +347,45 @@ type sesMessage struct { // must match. Full SNS signature verification (RSA + cert download) is // reserved for a follow-up; the ARN check rejects drive-by traffic but // not a determined attacker who knows the ARN. +// +// API-19/97 (QA 2026-05-29): the SECRET_UNSET branch fires BEFORE envelope +// parsing so an unauth POST with a junk body returns 401 +// webhook_secret_mismatch (operator config gap) rather than 400 +// invalid_payload (which falsely signals "fix the body"). The envelope +// parse + TopicArn / RSA checks then surface webhook_signature_mismatch +// on a bad inbound payload, distinct from the secret-unset path. func (h *EmailWebhookHandler) SES(c *fiber.Ctx) error { ctx, span := otel.Tracer("instant.dev/handlers").Start(c.UserContext(), "email.webhook.ses") defer span.End() + // Fail-closed FIRST: if the operator hasn't deployed SES_SNS_SUBSCRIPTION_ARN + // the route can't possibly authenticate anyone, so we reject before + // touching the body. This stops a probe from distinguishing + // "secret-unset 401" from "bad-body 400" by manipulating the payload. + if h.cfg == nil || h.cfg.SESSNSTopicARN == "" { + slog.Warn("email.webhook.ses.secret_unset") + webhookAuthFailure("ses_sns", "secret_unset") + return respondError(c, fiber.StatusUnauthorized, "webhook_secret_mismatch", + "SES/SNS subscription ARN is not configured on this api Deployment.") + } + body := c.Body() var env snsEnvelope if err := json.Unmarshal(body, &env); err != nil { slog.Warn("email.webhook.ses.parse_envelope_failed", "error", err) - return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{ - "ok": false, - "error": "invalid_payload", - }) + // Body shape gate: with the secret configured, an unparseable body + // is genuinely a 400. Mirror it through the canonical envelope. + return respondError(c, fiber.StatusBadRequest, "invalid_payload", + "SES/SNS envelope could not be parsed as JSON.") } - if h.cfg.SESSNSTopicARN == "" || env.TopicArn == "" || subtle.ConstantTimeCompare([]byte(h.cfg.SESSNSTopicARN), []byte(env.TopicArn)) != 1 { + if env.TopicArn == "" || subtle.ConstantTimeCompare([]byte(h.cfg.SESSNSTopicARN), []byte(env.TopicArn)) != 1 { slog.Warn("email.webhook.ses.topic_arn_mismatch", - "have_configured_arn", h.cfg.SESSNSTopicARN != "", "have_envelope_arn", env.TopicArn != "", ) - return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{ - "ok": false, - "error": "invalid_signature", - }) + webhookAuthFailure("ses_sns", "signature_mismatch") + return respondError(c, fiber.StatusUnauthorized, "webhook_signature_mismatch", + "SES/SNS envelope TopicArn did not match the configured subscription.") } // Full SNS RSA signature verification. The TopicArn check above is @@ -331,10 +400,9 @@ func (h *EmailWebhookHandler) SES(c *fiber.Ctx) error { "signing_cert_url", env.SigningCertURL, "signature_version", env.SignatureVersion, ) - return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{ - "ok": false, - "error": "invalid_signature", - }) + webhookAuthFailure("ses_sns", "signature_mismatch") + return respondError(c, fiber.StatusUnauthorized, "webhook_signature_mismatch", + "SES/SNS RSA signature did not verify against the canonical message.") } } @@ -357,10 +425,8 @@ func (h *EmailWebhookHandler) SES(c *fiber.Ctx) error { var msg sesMessage if err := json.Unmarshal([]byte(env.Message), &msg); err != nil { slog.Warn("email.webhook.ses.parse_message_failed", "error", err) - return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{ - "ok": false, - "error": "invalid_message", - }) + return respondError(c, fiber.StatusBadRequest, "invalid_message", + "SES/SNS inner Message field could not be parsed as JSON.") } // Map SES notificationType → our normalized event_type. Multiple diff --git a/internal/handlers/helpers.go b/internal/handlers/helpers.go index 9ab5874e..9a6d7487 100644 --- a/internal/handlers/helpers.go +++ b/internal/handlers/helpers.go @@ -155,6 +155,57 @@ var codeToAgentAction = map[string]errorCodeMeta{ "brevo_secret_mismatch": { AgentAction: "Tell the user this is a Brevo-webhook config mismatch, not their auth. Operators must verify the Brevo dashboard webhook URL matches the configured BREVO_WEBHOOK_SECRET — see https://instanode.dev/docs/email.", }, + // webhook_secret_mismatch is the generic per-provider webhook URL-path-token + // or shared-secret mismatch surface. API-19/96/97/98 (QA 2026-05-29): the + // pre-fix path returned generic 401 envelopes for /api/v1/email/webhook/brevo + // and /api/v1/email/webhook/ses unauth POSTs, which sent the canonical + // "log in for new INSTANODE_TOKEN" agent_action to operators chasing a + // webhook-config incident. Same shape as brevo_secret_mismatch but + // distinguishes the secret-not-configured branch from the signature-mismatch + // branch below. Operator must wire the corresponding env var + // (BREVO_WEBHOOK_SECRET / SES_SNS_SUBSCRIPTION_ARN) before the route accepts. + "webhook_secret_mismatch": { + AgentAction: "Tell the user this is an email-webhook secret-config mismatch, not their auth. Operators must set the corresponding webhook secret env var in the api Deployment — see https://instanode.dev/docs/email.", + }, + // webhook_signature_mismatch is the per-provider signature-verification + // failure surface — the secret IS configured, the inbound payload's HMAC / + // SNS signature did NOT verify against the body. Distinct from + // webhook_secret_mismatch so observability can split "we haven't deployed + // the secret yet" from "someone is sending bad signatures (or the provider + // rotated keys)" without an operator hand-grepping log lines. Used by + // /api/v1/email/webhook/brevo + /api/v1/email/webhook/ses. + "webhook_signature_mismatch": { + AgentAction: "Tell the user the inbound email-webhook signature did not verify. Operators must confirm the dashboard webhook secret matches the api Deployment's env var and that the provider hasn't rotated signing keys — see https://instanode.dev/docs/email.", + }, + // webhook_method_not_allowed surfaces the GET-on-a-POST-only webhook URL + // path (BUG-API-098). Brevo's dashboard sometimes sends a GET pre-flight to + // the configured webhook URL; the pre-fix path returned generic 401 which + // could make the dashboard abandon the config. 405 with this code surfaces + // the actual situation (the URL exists, but only accepts POST). + "webhook_method_not_allowed": { + AgentAction: "Tell the user this webhook URL only accepts POST. Provider dashboards confirming a webhook URL via GET should treat 405 as 'URL exists' — see https://instanode.dev/docs/email.", + }, + // internal_token_required is the worker-to-api auth-failure surface for + // the /internal/* routes (terminate, resend-magic-link, backup-quota refund). + // API-26/27/28/77/78 (QA 2026-05-29): pre-fix these handlers parsed the + // path :id / request body BEFORE checking the secret, so a bogus token + // with a malformed path returned 400 invalid_team_id / 400 invalid_body + // instead of 401 — inverting the fail-closed posture (a probe could + // distinguish "secret unset" from "secret wrong" by the 400/401 envelope). + // Post-fix: the auth check runs first; any missing / malformed worker + // JWT returns 401 internal_token_required, surfacing the actual fault to + // operators without leaking shape information about the path or body to + // unauthenticated probes. + "internal_token_required": { + AgentAction: "Tell the user this is an internal worker-to-api route. The caller must present a valid worker JWT signed with WORKER_INTERNAL_JWT_SECRET — see https://instanode.dev/docs/internal.", + }, + // invalid_message is the SES/SNS-inner-Message-not-JSON arm. Distinct + // from invalid_payload (the envelope parse) so a debugging operator can + // tell "AWS gave us a malformed envelope" from "AWS gave us a malformed + // inner Message" without re-reading the response message string. + "invalid_message": { + AgentAction: "Tell the user the inner Message field of the SES/SNS envelope could not be parsed as JSON. Provider-side bug; operators must inspect the raw payload in audit — see https://instanode.dev/docs/email.", + }, "auth_required": { AgentAction: "Tell the user this action requires an authenticated session. Have them log in or sign up at https://instanode.dev/login — both flows mint a token.", }, diff --git a/internal/handlers/internal_backup_refund.go b/internal/handlers/internal_backup_refund.go index d3f3641c..2077be09 100644 --- a/internal/handlers/internal_backup_refund.go +++ b/internal/handlers/internal_backup_refund.go @@ -75,22 +75,36 @@ type internalBackupRefundClaims struct { // refunded=false means a prior call already credited the counter for // this backup_id (idempotent no-op). func (h *InternalBackupRefundHandler) Refund(c *fiber.Ctx) error { + // API-28 (QA 2026-05-29): auth-first. Mirror the terminate + resend + // handlers — the secret-unset / preauth gate runs BEFORE the path :id + // parse so an unauth POST with a junk path returns 401 + // internal_token_required (the actual fault) instead of 400 + // invalid_team_id. Pre-fix order let a probe distinguish "path bad" + // from "auth bad" by the envelope code — fail-closed inversion. + if h.cfg == nil || strings.TrimSpace(h.cfg.WorkerInternalJWTSecret) == "" { + slog.Warn("internal.backup_refund.secret_unset", + "reason", "WORKER_INTERNAL_JWT_SECRET is empty; rejecting all calls", + ) + return respondError(c, fiber.StatusUnauthorized, "internal_token_required", + "Worker internal auth is not configured on this api Deployment.") + } + if err := preVerifyInternalBackupRefundJWT(c, h.cfg.WorkerInternalJWTSecret); err != nil { + return respondError(c, fiber.StatusUnauthorized, "internal_token_required", + "Worker internal token is missing or invalid.") + } + + // Auth-accepted — now parse path :id. Malformed :id from authenticated + // caller is 400 (worker emitted a bad URL). pathID := strings.TrimSpace(c.Params("id")) teamID, err := uuid.Parse(pathID) if err != nil { return respondError(c, fiber.StatusBadRequest, "invalid_team_id", "team_id must be a UUID") } - // Auth: fail-closed when the worker secret is unset. - if h.cfg == nil || strings.TrimSpace(h.cfg.WorkerInternalJWTSecret) == "" { - slog.Warn("internal.backup_refund.secret_unset", - "path_team_id", pathID, - "reason", "WORKER_INTERNAL_JWT_SECRET is empty; rejecting all calls", - ) - return respondError(c, fiber.StatusUnauthorized, "unauthorized", "worker internal auth not configured") - } + // Second-phase verify: bind the token's team_id claim to the path. if err := verifyInternalBackupRefundJWT(c, h.cfg.WorkerInternalJWTSecret, teamID); err != nil { - return respondError(c, fiber.StatusUnauthorized, "unauthorized", "invalid worker token") + return respondError(c, fiber.StatusUnauthorized, "internal_token_required", + "Worker internal token is missing or invalid.") } var body struct { @@ -179,6 +193,43 @@ func (h *InternalBackupRefundHandler) Refund(c *fiber.Ctx) error { }) } +// preVerifyInternalBackupRefundJWT is the auth-first gate that runs BEFORE +// the path :id parse. Mirrors preVerifyInternalTerminateJWT — every JWT +// structural check that does NOT depend on the path :id runs here; the +// team_id-claim-binds-to-path check is deferred to +// verifyInternalBackupRefundJWT after the path parses cleanly. +// API-28 (QA 2026-05-29). +func preVerifyInternalBackupRefundJWT(c *fiber.Ctx, secret string) error { + authHeader := strings.TrimSpace(c.Get(fiber.HeaderAuthorization)) + if !strings.HasPrefix(strings.ToLower(authHeader), "bearer ") { + slog.Warn("internal.backup_refund.preauth.missing_bearer") + return errors.New("missing bearer token") + } + tokenStr := strings.TrimSpace(authHeader[len("Bearer "):]) + claims := &internalBackupRefundClaims{} + // WithValidMethods([HS256]) pins alg; non-HS256 short-circuits. + _, err := jwt.ParseWithClaims(tokenStr, claims, func(_ *jwt.Token) (interface{}, error) { + return []byte(secret), nil + }, jwt.WithValidMethods([]string{"HS256"})) + if err != nil { + slog.Warn("internal.backup_refund.preauth.parse_failed", "error", err) + return err + } + if claims.Purpose != internalBackupRefundPurpose { + slog.Warn("internal.backup_refund.preauth.bad_purpose", "purpose", claims.Purpose) + return errors.New("purpose claim mismatch") + } + if claims.IssuedAt == nil { + return errors.New("missing iat claim") + } + now := time.Now() + if claims.IssuedAt.Before(now.Add(-internalBackupRefundMaxClockSkew)) || + claims.IssuedAt.After(now.Add(internalBackupRefundMaxClockSkew)) { + return errors.New("iat outside clock skew window") + } + return nil +} + func verifyInternalBackupRefundJWT(c *fiber.Ctx, secret string, pathTeamID uuid.UUID) error { authHeader := strings.TrimSpace(c.Get(fiber.HeaderAuthorization)) if !strings.HasPrefix(strings.ToLower(authHeader), "bearer ") { diff --git a/internal/handlers/internal_backup_refund_coverage_test.go b/internal/handlers/internal_backup_refund_coverage_test.go index 91f11417..9263fb72 100644 --- a/internal/handlers/internal_backup_refund_coverage_test.go +++ b/internal/handlers/internal_backup_refund_coverage_test.go @@ -96,7 +96,13 @@ func TestBackupRefund_AuthArms(t *testing.T) { backupID := uuid.NewString() t.Run("invalid_team_id", func(t *testing.T) { - resp := backupRefundPost(t, app, "", "not-a-uuid", `{"backup_id":"`+backupID+`"}`) + // API-28 (QA 2026-05-29): auth-first ordering means an unauth + // caller with a malformed :id 401s on the auth check before the + // path parse. The invalid_team_id arm is only reachable by an + // authenticated caller (worker emitted a bad URL) — we exercise + // it with a valid JWT here so the path-parse arm stays covered. + validJWT := mintBackupRefundJWT(t, testBackupRefundSecret, "internal_backup_refund", uuid.NewString(), 0) + resp := backupRefundPost(t, app, validJWT, "not-a-uuid", `{"backup_id":"`+backupID+`"}`) assert.Equal(t, http.StatusBadRequest, resp.StatusCode) resp.Body.Close() }) diff --git a/internal/handlers/internal_resend_magic_link.go b/internal/handlers/internal_resend_magic_link.go index 92b63be5..c7dbf982 100644 --- a/internal/handlers/internal_resend_magic_link.go +++ b/internal/handlers/internal_resend_magic_link.go @@ -93,9 +93,35 @@ type internalResendMagicLinkRequest struct { } // Resend is the fiber.Handler for POST /internal/email/resend-magic-link. +// +// API-27/78 (QA 2026-05-29): the auth check runs FIRST — before any body +// parsing — so a bogus token paired with a malformed body returns 401 +// internal_token_required (the actual fault) instead of 400 invalid_body. +// The pre-fix order let an unauthenticated probe distinguish "no body" +// (400) from "no auth" (401), inverting the fail-closed posture documented +// for the /internal/* routes. func (h *InternalResendMagicLinkHandler) Resend(c *fiber.Ctx) error { requestID := middleware.GetRequestID(c) + // Auth-first. The secret-unset branch must fire before we even look + // at the body so a probe can't tell "operator hasn't deployed the + // secret" from "operator deployed the secret but caller has wrong + // token" by the 401/400 envelope shape. + if h.cfg == nil || strings.TrimSpace(h.cfg.WorkerInternalJWTSecret) == "" { + slog.Warn("internal.resend_magic_link.secret_unset", + "request_id", requestID, + "reason", "WORKER_INTERNAL_JWT_SECRET is empty; rejecting all calls", + ) + return respondError(c, fiber.StatusUnauthorized, "internal_token_required", + "Worker internal auth is not configured on this api Deployment.") + } + if err := preVerifyInternalResendMagicLinkJWT(c, h.cfg.WorkerInternalJWTSecret); err != nil { + return respondError(c, fiber.StatusUnauthorized, "internal_token_required", + "Worker internal token is missing or invalid.") + } + + // Auth-accepted. Parse the body. A malformed body from an authenticated + // caller is genuinely a 400 (worker emitted a bad payload). var body internalResendMagicLinkRequest if err := c.BodyParser(&body); err != nil { return respondError(c, fiber.StatusBadRequest, "invalid_body", "Request body must be valid JSON") @@ -105,17 +131,11 @@ func (h *InternalResendMagicLinkHandler) Resend(c *fiber.Ctx) error { return respondError(c, fiber.StatusBadRequest, "invalid_link_id", "link_id must be a UUID") } - // Auth: fail-closed when the shared secret is unset. - if h.cfg == nil || strings.TrimSpace(h.cfg.WorkerInternalJWTSecret) == "" { - slog.Warn("internal.resend_magic_link.secret_unset", - "link_id", linkID.String(), - "request_id", requestID, - "reason", "WORKER_INTERNAL_JWT_SECRET is empty; rejecting all calls", - ) - return respondError(c, fiber.StatusUnauthorized, "unauthorized", "worker internal auth not configured") - } + // Second-phase verify: bind the token's link_id claim to the body's + // link_id. Cross-link replay defence. if err := verifyInternalResendMagicLinkJWT(c, h.cfg.WorkerInternalJWTSecret, linkID); err != nil { - return respondError(c, fiber.StatusUnauthorized, "unauthorized", "invalid worker token") + return respondError(c, fiber.StatusUnauthorized, "internal_token_required", + "Worker internal token is missing or invalid.") } ctx := c.Context() @@ -275,6 +295,47 @@ func readMagicLinkAttempts(ctx context.Context, db *sql.DB, id uuid.UUID) (int, return n, nil } +// preVerifyInternalResendMagicLinkJWT is the auth-first gate that runs +// BEFORE body parsing. Mirrors preVerifyInternalTerminateJWT — every JWT +// structural check that does NOT depend on body fields runs here; the +// link_id-claim-binds-to-body check is deferred to +// verifyInternalResendMagicLinkJWT after the body parses cleanly. +// API-27/78 (QA 2026-05-29). +func preVerifyInternalResendMagicLinkJWT(c *fiber.Ctx, secret string) error { + authHeader := strings.TrimSpace(c.Get(fiber.HeaderAuthorization)) + if !strings.HasPrefix(strings.ToLower(authHeader), "bearer ") { + slog.Warn("internal.resend_magic_link.preauth.missing_bearer") + return errors.New("missing bearer token") + } + tokenStr := strings.TrimSpace(authHeader[len("Bearer "):]) + claims := &internalResendMagicLinkClaims{} + // WithValidMethods([HS256]) pins alg; non-HS256 short-circuits. + _, err := jwt.ParseWithClaims(tokenStr, claims, func(_ *jwt.Token) (interface{}, error) { + return []byte(secret), nil + }, jwt.WithValidMethods([]string{"HS256"})) + if err != nil { + slog.Warn("internal.resend_magic_link.preauth.parse_failed", "error", err) + return err + } + if claims.Purpose != internalResendMagicLinkPurpose { + slog.Warn("internal.resend_magic_link.preauth.bad_purpose", + "got", claims.Purpose, + "want", internalResendMagicLinkPurpose, + ) + return errors.New("wrong purpose claim") + } + if claims.IssuedAt == nil { + slog.Warn("internal.resend_magic_link.preauth.missing_iat") + return errors.New("missing iat claim") + } + skew := time.Since(claims.IssuedAt.Time) + if skew < -60*time.Second || skew > 60*time.Second { + slog.Warn("internal.resend_magic_link.preauth.iat_skew", "skew_seconds", skew.Seconds()) + return errors.New("iat outside skew window") + } + return nil +} + // verifyInternalResendMagicLinkJWT parses + validates the bearer token // against the four required checks: // diff --git a/internal/handlers/internal_resend_magic_link_coverage_test.go b/internal/handlers/internal_resend_magic_link_coverage_test.go index 46acc484..11989fe3 100644 --- a/internal/handlers/internal_resend_magic_link_coverage_test.go +++ b/internal/handlers/internal_resend_magic_link_coverage_test.go @@ -116,8 +116,15 @@ func TestResendMagicLink_AuthArms(t *testing.T) { linkID := seedMagicLink(t, db, time.Now().Add(10*time.Minute), 0) t.Run("invalid_body", func(t *testing.T) { + // API-27/78 (QA 2026-05-29): auth-first ordering means a junk body + // from an AUTHENTICATED caller still 400s (worker emitted a bad + // payload); unauthenticated callers 401 on the auth check before + // the body parse. We test the authenticated-bad-body path here so + // the body-parse arm stays covered. + validJWT := mintResendMLJWT(t, testResendMagicLinkSecret, "resend_magic_link", linkID, 0) req := httptest.NewRequest(http.MethodPost, "/internal/email/resend-magic-link", strings.NewReader(`{bad`)) req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", "Bearer "+validJWT) resp, err := app.Test(req, 5000) require.NoError(t, err) assert.Equal(t, http.StatusBadRequest, resp.StatusCode) @@ -125,7 +132,14 @@ func TestResendMagicLink_AuthArms(t *testing.T) { }) t.Run("invalid_link_id", func(t *testing.T) { - resp := resendMLPost(t, app, "", "not-a-uuid") + // API-27/78: same auth-first reasoning as invalid_body above — + // the body-link_id parse only fires for authenticated callers. + // We mint a fresh JWT whose link_id claim matches the bogus body + // link_id so the structural verify accepts the token; the body + // parse then surfaces 400 invalid_link_id (since "not-a-uuid" + // fails uuid.Parse). + validJWT := mintResendMLJWT(t, testResendMagicLinkSecret, "resend_magic_link", "not-a-uuid", 0) + resp := resendMLPost(t, app, validJWT, "not-a-uuid") assert.Equal(t, http.StatusBadRequest, resp.StatusCode) resp.Body.Close() }) diff --git a/internal/handlers/internal_terminate.go b/internal/handlers/internal_terminate.go index f98ba918..8b76ec39 100644 --- a/internal/handlers/internal_terminate.go +++ b/internal/handlers/internal_terminate.go @@ -98,28 +98,49 @@ func NewInternalTerminateHandler(db *sql.DB, cfg *config.Config, cancelFn func(s // 8. Emit one `payment.grace_terminated` audit row. // 9. Return 200 JSON. func (h *InternalTerminateHandler) Terminate(c *fiber.Ctx) error { + // API-26/77 (QA 2026-05-29): the auth check runs FIRST — before any + // path-param / body parsing — so a bogus token paired with a malformed + // path returns 401 internal_token_required (the actual fault) instead + // of 400 invalid_team_id. The pre-fix order let an unauthenticated + // probe distinguish "path malformed" (400) from "auth missing" (401), + // which itself leaked the route's shape (the existence of a :id path + // param) to anyone who could send a request. Auth-first closes that + // channel and matches the canonical fail-closed posture documented on + // the /internal/* routes in router.go. + // + // We accept the bearer's structural shape + signature + purpose claim + // here without binding to the path :id; the team_id-claim-binds-to-path + // check happens AFTER we parse the path, in the second-phase verify. + if h.cfg == nil || strings.TrimSpace(h.cfg.WorkerInternalJWTSecret) == "" { + slog.Warn("internal.terminate.secret_unset", + "reason", "WORKER_INTERNAL_JWT_SECRET is empty; rejecting all calls", + ) + return respondError(c, fiber.StatusUnauthorized, "internal_token_required", + "Worker internal auth is not configured on this api Deployment.") + } + if err := preVerifyInternalTerminateJWT(c, h.cfg.WorkerInternalJWTSecret); err != nil { + return respondError(c, fiber.StatusUnauthorized, "internal_token_required", + "Worker internal token is missing or invalid.") + } + + // Auth-accepted — now parse the path :id. A malformed :id from an + // authenticated caller is genuinely a 400 (the worker emitted a bad + // URL; operator-debuggable from the worker side). pathID := strings.TrimSpace(c.Params("id")) teamID, err := uuid.Parse(pathID) if err != nil { return respondError(c, fiber.StatusBadRequest, "invalid_team_id", "team_id must be a UUID") } - // Auth: HS256 JWT bound to the configured worker secret. When the - // secret is unset, EVERY call 401s — operators must wire - // WORKER_INTERNAL_JWT_SECRET into the api's k8s Secret to enable - // the route. This is the fail-closed default. - if h.cfg == nil || strings.TrimSpace(h.cfg.WorkerInternalJWTSecret) == "" { - slog.Warn("internal.terminate.secret_unset", - "path_team_id", pathID, - "reason", "WORKER_INTERNAL_JWT_SECRET is empty; rejecting all calls", - ) - return respondError(c, fiber.StatusUnauthorized, "unauthorized", "worker internal auth not configured") - } + // Bind the token's team_id claim to the path :id. Cross-team-rewrite + // defence: a worker that minted a token for team A and POSTed to + // /teams/B/terminate must 401. if err := verifyInternalTerminateJWT(c, h.cfg.WorkerInternalJWTSecret, teamID); err != nil { // verifyInternalTerminateJWT logs the structured reason; the // caller only ever sees a generic 401 so this route emits no // signal a probe could use to refine an attack. - return respondError(c, fiber.StatusUnauthorized, "unauthorized", "invalid worker token") + return respondError(c, fiber.StatusUnauthorized, "internal_token_required", + "Worker internal token is missing or invalid.") } ctx := c.Context() @@ -272,6 +293,54 @@ type internalTerminateAuditMetadata struct { RazorpayError string `json:"razorpay_error,omitempty"` } +// preVerifyInternalTerminateJWT is the auth-first gate that runs BEFORE the +// path-param parse. It enforces every JWT structural check that does NOT +// depend on the path :id — signature, alg pin, purpose claim, iat freshness. +// The team_id-claim-binds-to-path check is deferred to verifyInternalTerminateJWT +// after the path parses cleanly. +// +// API-26/77 (QA 2026-05-29): split into two passes so the 401 fires on +// every unauth POST regardless of path shape. A pre-fix probe could +// distinguish "no path-id" (400 invalid_team_id) from "no auth" (401), a +// fail-closed inversion. Post-fix the 401 fires first for ALL unauth calls. +func preVerifyInternalTerminateJWT(c *fiber.Ctx, secret string) error { + authHeader := strings.TrimSpace(c.Get(fiber.HeaderAuthorization)) + if !strings.HasPrefix(strings.ToLower(authHeader), "bearer ") { + slog.Warn("internal.terminate.preauth.missing_bearer") + return errors.New("missing bearer token") + } + tokenStr := strings.TrimSpace(authHeader[len("Bearer "):]) + claims := &internalTerminateClaims{} + // WithValidMethods([HS256]) is the single source of alg-pin + // enforcement — any non-HS256 token short-circuits before the + // keyfunc runs. + _, err := jwt.ParseWithClaims(tokenStr, claims, func(_ *jwt.Token) (interface{}, error) { + return []byte(secret), nil + }, jwt.WithValidMethods([]string{"HS256"})) + if err != nil { + slog.Warn("internal.terminate.preauth.parse_failed", "error", err) + return err + } + if claims.Purpose != internalTerminatePurpose { + slog.Warn("internal.terminate.preauth.bad_purpose", + "purpose", claims.Purpose, + "expected", internalTerminatePurpose, + ) + return errors.New("purpose claim mismatch") + } + if claims.IssuedAt == nil { + slog.Warn("internal.terminate.preauth.missing_iat") + return errors.New("missing iat claim") + } + iat := claims.IssuedAt.Time + now := time.Now() + if iat.Before(now.Add(-internalTerminateMaxClockSkew)) || iat.After(now.Add(internalTerminateMaxClockSkew)) { + slog.Warn("internal.terminate.preauth.iat_skew", "iat", iat, "now", now) + return errors.New("iat outside clock skew window") + } + return nil +} + // verifyInternalTerminateJWT parses + validates the bearer token // against the four checks the brief enforces: // diff --git a/internal/handlers/openapi_test.go b/internal/handlers/openapi_test.go index 07f9413e..f09fd28e 100644 --- a/internal/handlers/openapi_test.go +++ b/internal/handlers/openapi_test.go @@ -618,8 +618,14 @@ func TestOpenAPI_CoversAllRegisteredRoutes(t *testing.T) { // (A/B click sink). Agents shouldn't drive these and adding them // to the agent-facing spec would muddy the contract. intentionallyHidden := map[string]bool{ - "POST /api/v1/email/webhook/brevo": true, - "POST /api/v1/email/webhook/ses": true, + "POST /api/v1/email/webhook/brevo": true, + "POST /api/v1/email/webhook/ses": true, + // API-98 (QA 2026-05-29): provider-dashboard pre-flight GET on the + // webhook URL — returns 405 + Allow: POST so a dashboard sees + // "URL exists, method wrong" rather than silently abandoning a + // 401. Not an agent-facing surface; stays out of OpenAPI. + "GET /api/v1/email/webhook/brevo": true, + "GET /api/v1/email/webhook/ses": true, "POST /internal/teams/{id}/terminate": true, // Worker-only resend driver. Auth is the shared // WORKER_INTERNAL_JWT_SECRET HS256 token; agents must never call diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 84056866..f4ad0f7a 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -236,6 +236,34 @@ var ( Help: "Inbound Brevo transactional-webhook events by normalized class (delivered/bounced_hard/bounced_soft/rejected/complaint/deferred/unsubscribed/error/unhandled/missing_message_id/unauthorized/invalid_payload/oversized)", }, []string{"event"}) + // WebhookAuthFailuresTotal counts inbound webhook auth failures across + // every email-provider webhook surface (Brevo HMAC + SES/SNS RSA). + // Distinguishes: + // webhook = "brevo_hmac" | "ses_sns" | "brevo_url_secret" + // reason = "secret_unset" — operator hasn't deployed the + // corresponding secret env var + // "signature_mismatch" — secret IS configured, inbound + // payload's HMAC / RSA / TopicArn + // did not match + // "missing_signature" — inbound payload carries no + // signature header at all + // + // API-19/96/97/98 (QA 2026-05-29): pre-fix every 401 from these routes + // rolled into a single generic "invalid_signature" code, so operators + // could not distinguish "we forgot to deploy the secret" from "the + // provider rotated their key" from "drive-by traffic" without + // hand-grepping log lines. NR alert on + // rate(instant_webhook_auth_failures_total{reason="secret_unset"}[5m]) > 0 + // fires within 5 min of a deploy that drops the env var; signature_mismatch + // fires on real attacks or key rotations. + // + // Cardinality bound: {brevo_hmac, ses_sns, brevo_url_secret} x + // {secret_unset, signature_mismatch, missing_signature} = 9 series. + WebhookAuthFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "instant_webhook_auth_failures_total", + Help: "Inbound webhook auth failures by webhook (brevo_hmac/ses_sns/brevo_url_secret) and reason (secret_unset/signature_mismatch/missing_signature). API-19/96/97/98 (QA 2026-05-29).", + }, []string{"webhook", "reason"}) + // MagicLinkEmailRateLimited counts POST /auth/email/start requests // silently absorbed by the per-email rate limiter. B4-F1 (BugBash // 2026-05-20): the per-email limit responds 202 (identical to the diff --git a/internal/router/router.go b/internal/router/router.go index 9b1b4c93..e8498511 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -703,6 +703,14 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid emailWebhookH := handlers.NewEmailWebhookHandler(db, cfg) app.Post("/api/v1/email/webhook/brevo", emailWebhookH.Brevo) app.Post("/api/v1/email/webhook/ses", emailWebhookH.SES) + // API-98 (QA 2026-05-29): explicit 405 + Allow: POST on the GET verb so + // a provider dashboard pre-flight that GETs the configured webhook URL + // sees "endpoint exists, method wrong" rather than the catch-all 404 / + // 401 envelope (which some dashboards interpret as "URL invalid" and + // silently drop). Registered BEFORE the /api/v1 RequireAuth group so the + // group middleware doesn't catch the GET and 401 it. + app.Get("/api/v1/email/webhook/brevo", emailWebhookH.BrevoMethodNotAllowed) + app.Get("/api/v1/email/webhook/ses", emailWebhookH.SESMethodNotAllowed) // Brevo transactional-delivery receiver — closes the "201 ≠ delivered" // gap. Brevo's transactional API returns 201 on accept but actual