Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions infra/newrelic/alerts/webhook-auth-failures.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"name": "instant-api — webhook auth failures (Brevo/SES — secret unset or signature mismatch)",
"policyName": "instant-api alerts",
"description": "P1 operator signal. Fires when inbound email-provider webhooks (Brevo HMAC, SES/SNS RSA) authenticate-fail at the api boundary. Two distinct reasons are emitted: (a) reason='secret_unset' — the operator has not deployed BREVO_WEBHOOK_SECRET / SES_SNS_SUBSCRIPTION_ARN to the api Deployment. This means EVERY inbound provider event is rejected and forwarder_sent.classification is never updated. Rule 12 (email truth surface) is broken until the operator deploys the secret. CRITICAL within 5 min — fix is one kubectl set env. (b) reason='signature_mismatch' — secret IS configured but the inbound signature does not verify. Real causes: (1) provider rotated their signing key, (2) drive-by probe traffic, (3) man-in-the-middle on the webhook URL. WARN at >10/h — operator should rotate the secret in the dashboard if the provider confirms a key rotation. Source: api/internal/handlers/email_webhooks.go webhookAuthFailure() helper; counter instant_webhook_auth_failures_total. API-19/96/97/98 (QA 2026-05-29).",
"enabled": true,
"nrql": {
"query": "SELECT count(*) FROM Metric WHERE metricName = 'instant_webhook_auth_failures_total' AND reason = 'secret_unset' FACET webhook"
},
"terms": [
{
"priority": "CRITICAL",
"operator": "ABOVE",
"threshold": 0,
"thresholdDuration": 300,
"thresholdOccurrences": "ALL"
}
],
"signal": {
"aggregationWindow": 60,
"aggregationMethod": "EVENT_FLOW",
"aggregationDelay": 120,
"fillOption": "STATIC",
"fillValue": 0
},
"expiration": {
"expirationDuration": 3600,
"openViolationOnExpiration": false,
"closeViolationsOnExpiration": true
},
"violationTimeLimitSeconds": 86400
}
583 changes: 583 additions & 0 deletions internal/handlers/auth_first_ordering_test.go

Large diffs are not rendered by default.

120 changes: 93 additions & 27 deletions internal/handlers/email_webhooks.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,20 @@ import (
"go.opentelemetry.io/otel/attribute"

"instant.dev/internal/config"
"instant.dev/internal/metrics"
"instant.dev/internal/models"
)

// webhookAuthFailure records one inbound-webhook authentication failure on
// the shared auth-failures counter. The brevo and ses handlers both report
// through this helper so they share one observability surface and a single
// registry-iterating test can guard the label set.
//
// Label values:
//   - webhook: "brevo_hmac", "ses_sns", or "brevo_url_secret"
//   - reason: "secret_unset", "signature_mismatch", or "missing_signature"
//
// API-19/96/97/98 (QA 2026-05-29).
func webhookAuthFailure(webhook, reason string) {
	failures := metrics.WebhookAuthFailuresTotal.WithLabelValues(webhook, reason)
	failures.Inc()
}

// EmailWebhookHandler holds the deps for both provider endpoints. db is
// the platform Postgres; cfg surfaces BrevoWebhookSecret + SESSNSTopicARN;
// snsVerifier handles AWS SNS RSA signature verification (cert fetch +
Expand Down Expand Up @@ -158,11 +169,43 @@ var brevoEventTypeMap = map[string]string{
"blocked": models.EmailEventTypeBounce, // blocked = permanent in practice
}

// BrevoMethodNotAllowed answers every non-POST verb sent to
// /api/v1/email/webhook/brevo with 405 and an "Allow: POST" header.
//
// API-98 (QA 2026-05-29): before saving a webhook, Brevo's dashboard
// sometimes issues a GET against the configured URL to confirm that the
// endpoint exists. The pre-fix path answered with a generic 401 (Fiber's
// app.Use chain hits before any per-handler logic on the non-registered GET
// verb), and dashboards configured to abandon-on-401 dropped the URL
// silently. Responding 405 with the explicit webhook_method_not_allowed code
// lets the dashboard see "URL exists, only POST accepted" and proceed.
func (h *EmailWebhookHandler) BrevoMethodNotAllowed(c *fiber.Ctx) error {
	const (
		errCode = "webhook_method_not_allowed"
		errText = "This webhook URL only accepts POST."
	)

	// Advertise the single accepted verb alongside the 405.
	c.Set("Allow", "POST")
	return respondError(c, fiber.StatusMethodNotAllowed, errCode, errText)
}

// SESMethodNotAllowed is the SES/SNS counterpart of BrevoMethodNotAllowed:
// it answers with 405 and an "Allow: POST" header. SNS itself only POSTs,
// but operators sometimes curl the URL to confirm the configuration; the
// 405 plus the explicit webhook_method_not_allowed code lets them proceed
// without thinking they hit an unauthorised route.
func (h *EmailWebhookHandler) SESMethodNotAllowed(c *fiber.Ctx) error {
	const (
		errCode = "webhook_method_not_allowed"
		errText = "This webhook URL only accepts POST."
	)

	// Advertise the single accepted verb alongside the 405.
	c.Set("Allow", "POST")
	return respondError(c, fiber.StatusMethodNotAllowed, errCode, errText)
}

// Brevo handles POST /api/v1/email/webhook/brevo.
//
// Returns 401 on bad signature, 400 on unparseable body, 200 on every
// other case (including unknown event types — Brevo fires opens/clicks
// that we silently drop).
//
// API-19/96 (QA 2026-05-29): auth runs BEFORE any body inspection so an
// unauth POST surfaces the actual fault (webhook_signature_mismatch /
// webhook_secret_mismatch) and triggers the operator-facing agent_action
// instead of the generic user-auth "log in for a new INSTANODE_TOKEN"
// envelope. The split between SECRET unset and SIGNATURE mismatch lets
// observability distinguish "the operator hasn't deployed the secret
// yet" from "the provider sent a payload with a bad signature".
func (h *EmailWebhookHandler) Brevo(c *fiber.Ctx) error {
ctx, span := otel.Tracer("instant.dev/handlers").Start(c.UserContext(), "email.webhook.brevo")
defer span.End()
Expand All @@ -173,24 +216,34 @@ func (h *EmailWebhookHandler) Brevo(c *fiber.Ctx) error {
sig = c.Get(brevoHeaderSignatureLegacy)
}

// Fail-closed split: SECRET unset vs SIGNATURE bad. Both still 401,
// but the error CODE + observability label differ. Operators can
// alert on `instant_webhook_auth_failures_total{reason="secret_unset"}`
// to detect "we forgot to deploy the secret"; the `signature_mismatch`
// label detects "the provider rotated their signing key on us".
if h.cfg == nil || h.cfg.BrevoWebhookSecret == "" {
slog.Warn("email.webhook.brevo.secret_unset",
"have_signature", sig != "",
)
webhookAuthFailure("brevo_hmac", "secret_unset")
return respondError(c, fiber.StatusUnauthorized, "webhook_secret_mismatch",
"Brevo webhook secret is not configured on this api Deployment.")
}
if !verifyBrevoSignature(body, sig, h.cfg.BrevoWebhookSecret) {
slog.Warn("email.webhook.brevo.signature_failed",
"have_secret", h.cfg.BrevoWebhookSecret != "",
"have_secret", true,
"have_signature", sig != "",
)
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{
"ok": false,
"error": "invalid_signature",
})
webhookAuthFailure("brevo_hmac", "signature_mismatch")
return respondError(c, fiber.StatusUnauthorized, "webhook_signature_mismatch",
"Brevo webhook signature did not verify against the body.")
}

var evt brevoEventPayload
if err := json.Unmarshal(body, &evt); err != nil {
slog.Warn("email.webhook.brevo.parse_failed", "error", err)
return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{
"ok": false,
"error": "invalid_payload",
})
return respondError(c, fiber.StatusBadRequest, "invalid_payload",
"Brevo webhook body could not be parsed as JSON.")
}

normalized, ok := brevoEventTypeMap[strings.ToLower(evt.Event)]
Expand Down Expand Up @@ -294,29 +347,45 @@ type sesMessage struct {
// must match. Full SNS signature verification (RSA + cert download) is
// reserved for a follow-up; the ARN check rejects drive-by traffic but
// not a determined attacker who knows the ARN.
//
// API-19/97 (QA 2026-05-29): the SECRET_UNSET branch fires BEFORE envelope
// parsing so an unauth POST with a junk body returns 401
// webhook_secret_mismatch (operator config gap) rather than 400
// invalid_payload (which falsely signals "fix the body"). The envelope
// parse + TopicArn / RSA checks then surface webhook_signature_mismatch
// on a bad inbound payload, distinct from the secret-unset path.
func (h *EmailWebhookHandler) SES(c *fiber.Ctx) error {
ctx, span := otel.Tracer("instant.dev/handlers").Start(c.UserContext(), "email.webhook.ses")
defer span.End()

// Fail-closed FIRST: if the operator hasn't deployed SES_SNS_SUBSCRIPTION_ARN
// the route can't possibly authenticate anyone, so we reject before
// touching the body. This stops a probe from distinguishing
// "secret-unset 401" from "bad-body 400" by manipulating the payload.
if h.cfg == nil || h.cfg.SESSNSTopicARN == "" {
slog.Warn("email.webhook.ses.secret_unset")
webhookAuthFailure("ses_sns", "secret_unset")
return respondError(c, fiber.StatusUnauthorized, "webhook_secret_mismatch",
"SES/SNS subscription ARN is not configured on this api Deployment.")
}

body := c.Body()
var env snsEnvelope
if err := json.Unmarshal(body, &env); err != nil {
slog.Warn("email.webhook.ses.parse_envelope_failed", "error", err)
return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{
"ok": false,
"error": "invalid_payload",
})
// Body shape gate: with the secret configured, an unparseable body
// is genuinely a 400. Mirror it through the canonical envelope.
return respondError(c, fiber.StatusBadRequest, "invalid_payload",
"SES/SNS envelope could not be parsed as JSON.")
}

if h.cfg.SESSNSTopicARN == "" || env.TopicArn == "" || subtle.ConstantTimeCompare([]byte(h.cfg.SESSNSTopicARN), []byte(env.TopicArn)) != 1 {
if env.TopicArn == "" || subtle.ConstantTimeCompare([]byte(h.cfg.SESSNSTopicARN), []byte(env.TopicArn)) != 1 {
slog.Warn("email.webhook.ses.topic_arn_mismatch",
"have_configured_arn", h.cfg.SESSNSTopicARN != "",
"have_envelope_arn", env.TopicArn != "",
)
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{
"ok": false,
"error": "invalid_signature",
})
webhookAuthFailure("ses_sns", "signature_mismatch")
return respondError(c, fiber.StatusUnauthorized, "webhook_signature_mismatch",
"SES/SNS envelope TopicArn did not match the configured subscription.")
}

// Full SNS RSA signature verification. The TopicArn check above is
Expand All @@ -331,10 +400,9 @@ func (h *EmailWebhookHandler) SES(c *fiber.Ctx) error {
"signing_cert_url", env.SigningCertURL,
"signature_version", env.SignatureVersion,
)
return c.Status(fiber.StatusUnauthorized).JSON(fiber.Map{
"ok": false,
"error": "invalid_signature",
})
webhookAuthFailure("ses_sns", "signature_mismatch")
return respondError(c, fiber.StatusUnauthorized, "webhook_signature_mismatch",
"SES/SNS RSA signature did not verify against the canonical message.")
}
}

Expand All @@ -357,10 +425,8 @@ func (h *EmailWebhookHandler) SES(c *fiber.Ctx) error {
var msg sesMessage
if err := json.Unmarshal([]byte(env.Message), &msg); err != nil {
slog.Warn("email.webhook.ses.parse_message_failed", "error", err)
return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{
"ok": false,
"error": "invalid_message",
})
return respondError(c, fiber.StatusBadRequest, "invalid_message",
"SES/SNS inner Message field could not be parsed as JSON.")
}

// Map SES notificationType → our normalized event_type. Multiple
Expand Down
51 changes: 51 additions & 0 deletions internal/handlers/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,57 @@ var codeToAgentAction = map[string]errorCodeMeta{
"brevo_secret_mismatch": {
AgentAction: "Tell the user this is a Brevo-webhook config mismatch, not their auth. Operators must verify the Brevo dashboard webhook URL matches the configured BREVO_WEBHOOK_SECRET — see https://instanode.dev/docs/email.",
},
// webhook_secret_mismatch is the generic per-provider webhook URL-path-token
// or shared-secret mismatch surface. API-19/96/97/98 (QA 2026-05-29): the
// pre-fix path returned generic 401 envelopes for /api/v1/email/webhook/brevo
// and /api/v1/email/webhook/ses unauth POSTs, which sent the canonical
// "log in for new INSTANODE_TOKEN" agent_action to operators chasing a
// webhook-config incident. Same shape as brevo_secret_mismatch but
// distinguishes the secret-not-configured branch from the signature-mismatch
// branch below. Operator must wire the corresponding env var
// (BREVO_WEBHOOK_SECRET / SES_SNS_SUBSCRIPTION_ARN) before the route accepts.
"webhook_secret_mismatch": {
AgentAction: "Tell the user this is an email-webhook secret-config mismatch, not their auth. Operators must set the corresponding webhook secret env var in the api Deployment — see https://instanode.dev/docs/email.",
},
// webhook_signature_mismatch is the per-provider signature-verification
// failure surface — the secret IS configured, the inbound payload's HMAC /
// SNS signature did NOT verify against the body. Distinct from
// webhook_secret_mismatch so observability can split "we haven't deployed
// the secret yet" from "someone is sending bad signatures (or the provider
// rotated keys)" without an operator hand-grepping log lines. Used by
// /api/v1/email/webhook/brevo + /api/v1/email/webhook/ses.
"webhook_signature_mismatch": {
AgentAction: "Tell the user the inbound email-webhook signature did not verify. Operators must confirm the dashboard webhook secret matches the api Deployment's env var and that the provider hasn't rotated signing keys — see https://instanode.dev/docs/email.",
},
// webhook_method_not_allowed surfaces the GET-on-a-POST-only webhook URL
// path (BUG-API-098). Brevo's dashboard sometimes sends a GET pre-flight to
// the configured webhook URL; the pre-fix path returned generic 401 which
// could make the dashboard abandon the config. 405 with this code surfaces
// the actual situation (the URL exists, but only accepts POST).
"webhook_method_not_allowed": {
AgentAction: "Tell the user this webhook URL only accepts POST. Provider dashboards confirming a webhook URL via GET should treat 405 as 'URL exists' — see https://instanode.dev/docs/email.",
},
// internal_token_required is the worker-to-api auth-failure surface for
// the /internal/* routes (terminate, resend-magic-link, backup-quota refund).
// API-26/27/28/77/78 (QA 2026-05-29): pre-fix these handlers parsed the
// path :id / request body BEFORE checking the secret, so a bogus token
// with a malformed path returned 400 invalid_team_id / 400 invalid_body
// instead of 401 — inverting the fail-closed posture (a probe could
// distinguish "secret unset" from "secret wrong" by the 400/401 envelope).
// Post-fix: the auth check runs first; any missing / malformed worker
// JWT returns 401 internal_token_required, surfacing the actual fault to
// operators without leaking shape information about the path or body to
// unauthenticated probes.
"internal_token_required": {
AgentAction: "Tell the user this is an internal worker-to-api route. The caller must present a valid worker JWT signed with WORKER_INTERNAL_JWT_SECRET — see https://instanode.dev/docs/internal.",
},
// invalid_message is the SES/SNS-inner-Message-not-JSON arm. Distinct
// from invalid_payload (the envelope parse) so a debugging operator can
// tell "AWS gave us a malformed envelope" from "AWS gave us a malformed
// inner Message" without re-reading the response message string.
"invalid_message": {
AgentAction: "Tell the user the inner Message field of the SES/SNS envelope could not be parsed as JSON. Provider-side bug; operators must inspect the raw payload in audit — see https://instanode.dev/docs/email.",
},
"auth_required": {
AgentAction: "Tell the user this action requires an authenticated session. Have them log in or sign up at https://instanode.dev/login — both flows mint a token.",
},
Expand Down
69 changes: 60 additions & 9 deletions internal/handlers/internal_backup_refund.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,22 +75,36 @@ type internalBackupRefundClaims struct {
// refunded=false means a prior call already credited the counter for
// this backup_id (idempotent no-op).
func (h *InternalBackupRefundHandler) Refund(c *fiber.Ctx) error {
// API-28 (QA 2026-05-29): auth-first. Mirror the terminate + resend
// handlers — the secret-unset / preauth gate runs BEFORE the path :id
// parse so an unauth POST with a junk path returns 401
// internal_token_required (the actual fault) instead of 400
// invalid_team_id. Pre-fix order let a probe distinguish "path bad"
// from "auth bad" by the envelope code — fail-closed inversion.
if h.cfg == nil || strings.TrimSpace(h.cfg.WorkerInternalJWTSecret) == "" {
slog.Warn("internal.backup_refund.secret_unset",
"reason", "WORKER_INTERNAL_JWT_SECRET is empty; rejecting all calls",
)
return respondError(c, fiber.StatusUnauthorized, "internal_token_required",
"Worker internal auth is not configured on this api Deployment.")
}
if err := preVerifyInternalBackupRefundJWT(c, h.cfg.WorkerInternalJWTSecret); err != nil {
return respondError(c, fiber.StatusUnauthorized, "internal_token_required",
"Worker internal token is missing or invalid.")
}

// Auth-accepted — now parse path :id. Malformed :id from authenticated
// caller is 400 (worker emitted a bad URL).
pathID := strings.TrimSpace(c.Params("id"))
teamID, err := uuid.Parse(pathID)
if err != nil {
return respondError(c, fiber.StatusBadRequest, "invalid_team_id", "team_id must be a UUID")
}

// Auth: fail-closed when the worker secret is unset.
if h.cfg == nil || strings.TrimSpace(h.cfg.WorkerInternalJWTSecret) == "" {
slog.Warn("internal.backup_refund.secret_unset",
"path_team_id", pathID,
"reason", "WORKER_INTERNAL_JWT_SECRET is empty; rejecting all calls",
)
return respondError(c, fiber.StatusUnauthorized, "unauthorized", "worker internal auth not configured")
}
// Second-phase verify: bind the token's team_id claim to the path.
if err := verifyInternalBackupRefundJWT(c, h.cfg.WorkerInternalJWTSecret, teamID); err != nil {
return respondError(c, fiber.StatusUnauthorized, "unauthorized", "invalid worker token")
return respondError(c, fiber.StatusUnauthorized, "internal_token_required",
"Worker internal token is missing or invalid.")
}

var body struct {
Expand Down Expand Up @@ -179,6 +193,43 @@ func (h *InternalBackupRefundHandler) Refund(c *fiber.Ctx) error {
})
}

// preVerifyInternalBackupRefundJWT is the auth-first gate for the backup
// refund route: it runs BEFORE the path :id is parsed and performs every JWT
// structural check that does NOT depend on that :id. It mirrors
// preVerifyInternalTerminateJWT. The team_id-claim-binds-to-path check is
// deferred to verifyInternalBackupRefundJWT, after the path parses cleanly.
// API-28 (QA 2026-05-29).
//
// Checks, in order:
//  1. the Authorization header starts with a case-insensitive "Bearer ";
//  2. jwt.ParseWithClaims accepts the token using secret as the key, with
//     the algorithm pinned to HS256;
//  3. the purpose claim equals internalBackupRefundPurpose;
//  4. the iat claim is present and lies within
//     internalBackupRefundMaxClockSkew of the current time, either side.
//
// It returns nil when every check passes, otherwise an error for the first
// check that failed.
func preVerifyInternalBackupRefundJWT(c *fiber.Ctx, secret string) error {
	header := strings.TrimSpace(c.Get(fiber.HeaderAuthorization))
	if hasBearer := strings.HasPrefix(strings.ToLower(header), "bearer "); !hasBearer {
		slog.Warn("internal.backup_refund.preauth.missing_bearer")
		return errors.New("missing bearer token")
	}
	rawToken := strings.TrimSpace(header[len("Bearer "):])

	// The same shared secret is handed back as the key for every token.
	keyFor := func(_ *jwt.Token) (any, error) {
		return []byte(secret), nil
	}

	parsed := &internalBackupRefundClaims{}
	// WithValidMethods([HS256]) pins alg; non-HS256 short-circuits.
	if _, err := jwt.ParseWithClaims(rawToken, parsed, keyFor, jwt.WithValidMethods([]string{"HS256"})); err != nil {
		slog.Warn("internal.backup_refund.preauth.parse_failed", "error", err)
		return err
	}

	switch {
	case parsed.Purpose != internalBackupRefundPurpose:
		slog.Warn("internal.backup_refund.preauth.bad_purpose", "purpose", parsed.Purpose)
		return errors.New("purpose claim mismatch")
	case parsed.IssuedAt == nil:
		return errors.New("missing iat claim")
	}

	// Accept iat only inside [now - skew, now + skew].
	now := time.Now()
	earliest := now.Add(-internalBackupRefundMaxClockSkew)
	latest := now.Add(internalBackupRefundMaxClockSkew)
	if parsed.IssuedAt.Before(earliest) || parsed.IssuedAt.After(latest) {
		return errors.New("iat outside clock skew window")
	}
	return nil
}

func verifyInternalBackupRefundJWT(c *fiber.Ctx, secret string, pathTeamID uuid.UUID) error {
authHeader := strings.TrimSpace(c.Get(fiber.HeaderAuthorization))
if !strings.HasPrefix(strings.ToLower(authHeader), "bearer ") {
Expand Down
Loading
Loading