diff --git a/.env.example b/.env.example index 26fd6dda..fd35b66c 100644 --- a/.env.example +++ b/.env.example @@ -50,3 +50,9 @@ GITHUB_CLIENT_ID= GITHUB_CLIENT_SECRET= GOOGLE_CLIENT_ID= GOOGLE_CLIENT_SECRET= + +# Analytics (WS4 behavioral-intelligence funnel events → New Relic custom events) +# "noop" (default) = inert; drops every event, zero deps, never errors. Set to +# "newrelic" to emit InstantFunnel custom events via the existing New Relic app +# (requires NEW_RELIC_LICENSE_KEY). Leaving it unset/noop is the safe default. +ANALYTICS_BACKEND=noop diff --git a/docs/OBSERVABILITY-FUNNEL-EVENTS.md b/docs/OBSERVABILITY-FUNNEL-EVENTS.md new file mode 100644 index 00000000..29bfd89d --- /dev/null +++ b/docs/OBSERVABILITY-FUNNEL-EVENTS.md @@ -0,0 +1,85 @@ +# Observability — WS4 Behavioral-Intelligence Funnel Events + +This note documents the New Relic custom event the api emits at the conversion +funnel points, and the Prometheus counter that backs the bridge's failure mode. +It satisfies CLAUDE.md rule 25 (every observability signal ships with its +documentation). The NR **alert + dashboard tile** for these signals live in the +separate `infra` repo (`infra/newrelic/alerts/`, `infra/newrelic/dashboards/`, +`infra/observability/METRICS-CATALOG.md`) — that repo has no auto-apply, so the +operator wires the tiles/alerts there; this note is the source-of-truth for the +event/attribute contract the dashboards FACET on. + +## Why this exists + +`instant_conversion_funnel_total{step}` (Prometheus) is an **aggregate count** — +it answers "how many provisions today" but cannot be keyed on a stable entity +(team / anonymous fingerprint bucket / cohort), so it cannot compute the per- +entity / cohorted funnel KPIs the WS4 plan needs: + +- anonymous → claimed (target **> 2%**) +- claimed → paid (target **> 20%**) + +The `InstantFunnel` New Relic custom event is the **per-entity** companion. Both +are emitted at every funnel point — the Prometheus counter is **not** removed. + +## `InstantFunnel` custom event + +Emitted via `common/analyticsevent` (factory-wrapped: fail-open + PII-sanitized). +The api wires the emitter once at boot (`router.wireAnalyticsEmitter`) from +`ANALYTICS_BACKEND` (default `noop` — **inert** until New Relic is configured; +the noop default is the flag protection, no separate feature flag needed). The +`newrelic` backend reuses the api's existing `*newrelic.Application`. + +| Attribute | Always? | Values / notes | +|----------------|---------|----------------| +| `funnelStep` | yes | `landing` \| `provision` \| `claim` \| `paid` | +| `serviceName` | yes | `api` (FACET to attribute a step to the emitting service) | +| `tier` | most | `anonymous`/`free`/`hobby`/`pro`/… (omitted at `landing`) | +| `env` | provision | `development`/`production`/… (resolved env of the provision) | +| `fingerprint` | anon | **already-hashed** SHA256(/24+ASN) bucket — never a raw IP | +| `teamId` | claim/paid | team UUID (opaque id, not PII) | + +PII policy: the attribute map passes through `analyticsevent.Sanitize` (explicit +allowlist + email-hashing) before any backend sees it, so no raw email / token / +connection string can leak even if a future emit site passes one. + +### Emit sites (api) + +| Step | File:func | Trigger | +|-------------|-----------|---------| +| `landing` | `onboarding.go` `StartLanding` | GET `/start` (top of funnel) | +| `provision` | `db.go`/`cache.go`/`nosql.go`/`vector.go`/`queue.go`/`storage.go`/`webhook.go` `New*` (anonymous path) | anonymous resource provisioned | +| `claim` | `onboarding.go` `Claim` | anonymous → claimed (account created) | +| `paid` | `billing.go` `handleSubscriptionCharged` | claimed → paid (subscription active) | + +### NRQL starters + +```sql +-- anon->claimed (exclude synthetic prober traffic) +SELECT uniqueCount(fingerprint) FROM InstantFunnel +WHERE funnelStep = 'landing' AND cohort != 'synthetic' SINCE 1 day ago + +SELECT uniqueCount(teamId) FROM InstantFunnel +WHERE funnelStep = 'paid' AND cohort != 'synthetic' FACET tier SINCE 7 days ago +``` + +> **Exclude `cohort = 'synthetic'` from all funnel analysis.** Synthetic +> flow-test traffic (`InstantFlowTest`, emitted by the worker's prober) carries +> `cohort='synthetic'`; the real-traffic funnel `InstantFunnel` events carry no +> cohort attribute, so `WHERE cohort != 'synthetic'` keeps the two separated. + +## `instant_analytics_emit_failed_total{reason}` (Prometheus) + +Counts behavioral-intelligence custom events **dropped** before reaching the +analytics sink, by `reason`. + +- `reason="nil_app"` — the New Relic sink had no `*newrelic.Application` (NR not + configured). This is the **expected steady state** until + `ANALYTICS_BACKEND=newrelic` + a license key are wired, so a flat non-zero + value in that configuration is benign. +- A **sudden climb after** NR is configured means the bridge is dropping real + funnel events — that is the alertable condition (suggested: P2 observability, + warn on `rate(...[10m]) > 0` once `ANALYTICS_BACKEND=newrelic`). + +Lazy `*Vec`: not visible at `/metrics` until the first dropped emit observes a +label. diff --git a/internal/config/config.go b/internal/config/config.go index eac48ac9..e20e8fef 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -150,6 +150,15 @@ type Config struct { MetricsToken string // METRICS_TOKEN — if set, required as Bearer token to access /metrics DashboardBaseURL string // DASHBOARD_BASE_URL — where to redirect onboarding flows (default: http://localhost:5173) + // AnalyticsBackend selects the behavioral-intelligence custom-event sink + // (common/analyticsevent). Read from ANALYTICS_BACKEND. One of "noop" + // (default — drops every event, zero deps, never errors) or "newrelic" + // (emits InstantFunnel/InstantFlowTest custom events via the existing + // *newrelic.Application). Defaulting to "noop" makes funnel emission INERT + // in any environment where New Relic is not configured — the safe, + // fail-open default, so no separate feature flag is needed. + AnalyticsBackend string + // APIPublicURL is the externally-routable base URL the API runs at // — used to construct fully-qualified links in outbound emails // (deletion-confirm, etc). Empty in local dev where the dashboard @@ -421,7 +430,8 @@ func Load() *Config { cfg.DeployDomain = getenv("DEPLOY_DOMAIN", "instant.dev") cfg.ComputeProvider = getenv("COMPUTE_PROVIDER", "noop") cfg.KubeNamespaceApps = getenv("KUBE_NAMESPACE_APPS", "instant-apps") - cfg.MetricsToken = os.Getenv("METRICS_TOKEN") // empty = open (local dev) + cfg.MetricsToken = os.Getenv("METRICS_TOKEN") // empty = open (local dev) + cfg.AnalyticsBackend = getenv("ANALYTICS_BACKEND", "noop") // noop = inert (no NR sink) cfg.DashboardBaseURL = getenv("DASHBOARD_BASE_URL", "http://localhost:5173") cfg.APIPublicURL = strings.TrimRight(getenv("API_PUBLIC_URL", ""), "/") // Parse DELETION_CONFIRMATION_TTL_MINUTES; fall back to 15 on diff --git a/internal/handlers/analytics.go b/internal/handlers/analytics.go new file mode 100644 index 00000000..ed5504bc --- /dev/null +++ b/internal/handlers/analytics.go @@ -0,0 +1,144 @@ +package handlers + +import ( + "context" + "sync/atomic" + + "instant.dev/common/analyticsevent" +) + +// WS4 behavioral-intelligence funnel events. +// +// This file is the api's bridge from the existing Prometheus conversion-funnel +// counter (instant_conversion_funnel_total — an AGGREGATE count) to the +// per-entity / per-cohort New Relic custom event (InstantFunnel) that the WS4 +// observability plan needs for funnel + retention analysis (anon→claim→ +// provision→paid). The Prometheus counter stays exactly where it is; every +// funnel emit site now ALSO records an InstantFunnel custom event alongside it. +// +// Why a package-level emitter instead of a struct field on every handler: the +// funnel emit sites live across nine independently-constructed handler structs +// (DBHandler, CacheHandler, OnboardingHandler, BillingHandler, …), each with +// its own constructor. The api already shares process-wide observability deps +// (the `metrics` package globals) the same way. The router wires the concrete +// emitter ONCE at boot via [SetAnalyticsEmitter]; until then — and in every +// unit test that doesn't opt in — the default is the no-op emitter, so funnel +// emission is INERT by default and can NEVER block, slow, or error a request. +// +// Fail-open + inert-by-default IS the flag protection: the analyticsevent +// package wraps every backend so a panic in the sink is swallowed and a nil / +// unconfigured backend is a silent drop. No separate feature flag is needed — +// ANALYTICS_BACKEND defaulting to "noop" means this code path does nothing in +// prod until New Relic is explicitly configured. + +// emitterBox wraps the [analyticsevent.Emitter] interface in a single concrete +// struct type so [analyticsEmitter] (an atomic.Value) always sees ONE concrete +// type across Stores — atomic.Value panics if successive Store calls pass +// different concrete types, which a bare interface value would (noop{} vs the +// factory's wrapped{}). The box is the invariant concrete type. +type emitterBox struct{ e analyticsevent.Emitter } + +// analyticsEmitter holds the process-wide emitter (boxed). atomic.Value so +// [SetAnalyticsEmitter] (called once at boot, before serving) and the per-request +// reads in [recordFunnelEvent] are race-free. Defaults to the no-op emitter via +// the package init below. +var analyticsEmitter atomic.Value // stores emitterBox + +func init() { + // Inert default: no analytics sink until the router wires one. The no-op + // emitter drops every event with zero deps and can never error. + analyticsEmitter.Store(emitterBox{e: analyticsevent.NewNoop()}) +} + +// SetAnalyticsEmitter installs the process-wide analytics emitter. Called once +// from the router at boot with the emitter built from ANALYTICS_BACKEND (noop by +// default; the New Relic sink when configured). A nil emitter is ignored so a +// mis-wire degrades to the existing no-op rather than panicking on first emit. +func SetAnalyticsEmitter(e analyticsevent.Emitter) { + if e == nil { + return + } + analyticsEmitter.Store(emitterBox{e: e}) +} + +// getAnalyticsEmitter returns the current process-wide emitter, never nil. +func getAnalyticsEmitter() analyticsevent.Emitter { + if box, ok := analyticsEmitter.Load().(emitterBox); ok && box.e != nil { + return box.e + } + return analyticsevent.NewNoop() +} + +// serviceNameAPI is the AttrServiceName value every funnel event from this +// service carries, so a dashboard can FACET by which service emitted the step. +const serviceNameAPI = "api" + +// Funnel-step values re-exported from analyticsevent so the per-handler emit +// sites (db/cache/nosql/…/onboarding/billing) reference one in-package constant +// and don't each need to import common/analyticsevent. These MUST stay equal to +// the analyticsevent constants — funnelStepsMatchCanonical (in the test) asserts +// it, and the wire contract (dashboards FACET on these exact strings) depends on +// it. +const ( + funnelStepProvision = analyticsevent.FunnelStepProvision + funnelStepClaim = analyticsevent.FunnelStepClaim + funnelStepPaid = analyticsevent.FunnelStepPaid + funnelStepLanding = analyticsevent.FunnelStepLanding +) + +// recordFunnelEvent emits one [analyticsevent.EventFunnel] custom event for the +// given funnel step alongside the existing Prometheus counter. It is the single +// chokepoint every funnel emit site routes through so the attribute set stays +// uniform and PII-safe. +// +// Attributes are intentionally low-cardinality and allowlisted (the +// analyticsevent wrapper drops anything not on the PII allowlist before the +// event leaves the process): step, tier, env, service, and — when known — the +// already-hashed fingerprint bucket (SHA256(/24+ASN), never a raw IP) and team +// id (an opaque UUID, not PII). Empty values are omitted so an absent field +// reads as "missing" in NRQL rather than "". +// +// FAIL-OPEN: this never returns an error and the wrapper swallows any panic, so +// a funnel emit can never affect the request path. Callers MUST NOT wrap it in +// error handling. +func recordFunnelEvent(ctx context.Context, step string, attrs funnelAttrs) { + getAnalyticsEmitter().Record(ctx, analyticsevent.EventFunnel, attrs.toMap(step)) +} + +// funnelAttrs is the typed, PII-safe attribute payload for a funnel event. Only +// these fields can reach an event; the package allowlist is the backstop. +type funnelAttrs struct { + // Tier is the plan tier the funnel step occurred at ("anonymous", "free", + // "pro", …). Low cardinality. + Tier string + // Env is the resolved environment ("development", "production", …). + Env string + // Fingerprint is the already-hashed SHA256(/24+ASN) anonymous bucket, or "" + // for an authenticated step. Never a raw IP. + Fingerprint string + // TeamID is the owning team UUID (opaque id, not PII), or "" when unknown + // (e.g. anonymous provisions before a claim). + TeamID string +} + +// toMap renders funnelAttrs + the step into the flat attribute map the emitter +// consumes, omitting empty values so NRQL facets stay clean. +func (a funnelAttrs) toMap(step string) map[string]any { + out := map[string]any{ + analyticsevent.AttrFunnelStep: step, + analyticsevent.AttrServiceName: serviceNameAPI, + } + if a.Tier != "" { + out[analyticsevent.AttrTier] = a.Tier + } + if a.Env != "" { + out[analyticsevent.AttrEnv] = a.Env + } + if a.Fingerprint != "" { + out[analyticsevent.AttrFingerprint] = a.Fingerprint + } + if a.TeamID != "" { + out[analyticsevent.AttrTeamID] = a.TeamID + } + return out +} diff --git a/internal/handlers/analytics_test.go b/internal/handlers/analytics_test.go new file mode 100644 index 00000000..d5c2e383 --- /dev/null +++ b/internal/handlers/analytics_test.go @@ -0,0 +1,241 @@ +package handlers + +import ( + "context" + "sync" + "testing" + + "instant.dev/common/analyticsevent" +) + +// recordingEmitter is a test analyticsevent.Emitter that captures every event it +// receives so a test can assert an emit site fired with the right step + attrs. +// It satisfies the Emitter contract (Record never errors/panics into the caller). +type recordingEmitter struct { + mu sync.Mutex + events []recordedEvent + closed bool +} + +type recordedEvent struct { + eventType string + attrs map[string]any +} + +func (r *recordingEmitter) Record(_ context.Context, eventType string, attrs map[string]any) { + r.mu.Lock() + defer r.mu.Unlock() + r.events = append(r.events, recordedEvent{eventType: eventType, attrs: attrs}) +} +func (r *recordingEmitter) Name() string { return "recording" } +func (r *recordingEmitter) Close() error { r.closed = true; return nil } + +func (r *recordingEmitter) last() recordedEvent { + r.mu.Lock() + defer r.mu.Unlock() + if len(r.events) == 0 { + return recordedEvent{} + } + return r.events[len(r.events)-1] +} +func (r *recordingEmitter) count() int { + r.mu.Lock() + defer r.mu.Unlock() + return len(r.events) +} + +// installRecordingEmitter swaps in a recording emitter for the duration of a +// test and restores the previous (default noop) emitter afterward, so tests +// don't leak the global across the package. The emitter is wrapped via the +// factory so it goes through the same Sanitize + fail-open path production uses. +func installRecordingEmitter(t *testing.T) *recordingEmitter { + t.Helper() + rec := &recordingEmitter{} + wrapped, err := analyticsevent.Factory(analyticsevent.Config{Override: rec}) + if err != nil { + t.Fatalf("Factory(Override) err = %v", err) + } + prev := getAnalyticsEmitter() + SetAnalyticsEmitter(wrapped) + t.Cleanup(func() { SetAnalyticsEmitter(prev) }) + return rec +} + +// --- emitter accessor / default ------------------------------------------------ + +func TestGetAnalyticsEmitter_DefaultsToNoop(t *testing.T) { + // The package init installs a non-nil noop emitter. Re-store it to be sure we + // observe the inert default regardless of test ordering, then restore. + prev := getAnalyticsEmitter() + t.Cleanup(func() { SetAnalyticsEmitter(prev) }) + + SetAnalyticsEmitter(analyticsevent.NewNoop()) + e := getAnalyticsEmitter() + if e == nil { + t.Fatal("getAnalyticsEmitter returned nil — must never be nil") + } + if e.Name() != analyticsevent.BackendNoop { + t.Fatalf("default emitter Name = %q, want %q", e.Name(), analyticsevent.BackendNoop) + } + // The inert default must not error or panic on an emit. + recordFunnelEvent(context.Background(), funnelStepProvision, funnelAttrs{Tier: "anonymous"}) +} + +func TestGetAnalyticsEmitter_NilBoxFallsBackToNoop(t *testing.T) { + // A stored box whose emitter is nil must fall through to the noop fallback + // (getAnalyticsEmitter's final return), never returning nil. SetAnalyticsEmitter + // ignores nil, so store the nil box directly to exercise that branch. + prev := getAnalyticsEmitter() + t.Cleanup(func() { SetAnalyticsEmitter(prev) }) + + analyticsEmitter.Store(emitterBox{e: nil}) + e := getAnalyticsEmitter() + if e == nil { + t.Fatal("getAnalyticsEmitter returned nil — must never be nil") + } + if e.Name() != analyticsevent.BackendNoop { + t.Fatalf("nil-box fallback emitter Name = %q, want %q", e.Name(), analyticsevent.BackendNoop) + } +} + +func TestSetAnalyticsEmitter_NilIgnored(t *testing.T) { + prev := getAnalyticsEmitter() + t.Cleanup(func() { SetAnalyticsEmitter(prev) }) + + rec := &recordingEmitter{} + SetAnalyticsEmitter(rec) + // A nil set must be ignored, leaving the previously-installed emitter intact. + SetAnalyticsEmitter(nil) + if getAnalyticsEmitter() != analyticsevent.Emitter(rec) { + t.Fatal("SetAnalyticsEmitter(nil) overwrote the installed emitter — must be ignored") + } +} + +// --- recordFunnelEvent: step + attrs ------------------------------------------ + +func TestRecordFunnelEvent_EmitsFunnelEventWithStepAndAttrs(t *testing.T) { + rec := installRecordingEmitter(t) + + recordFunnelEvent(context.Background(), funnelStepProvision, funnelAttrs{ + Tier: "anonymous", + Env: "production", + Fingerprint: "fp-bucket-hash", + TeamID: "team-uuid", + }) + + if rec.count() != 1 { + t.Fatalf("expected 1 event, got %d", rec.count()) + } + got := rec.last() + if got.eventType != analyticsevent.EventFunnel { + t.Fatalf("eventType = %q, want %q", got.eventType, analyticsevent.EventFunnel) + } + want := map[string]string{ + analyticsevent.AttrFunnelStep: funnelStepProvision, + analyticsevent.AttrServiceName: serviceNameAPI, + analyticsevent.AttrTier: "anonymous", + analyticsevent.AttrEnv: "production", + analyticsevent.AttrFingerprint: "fp-bucket-hash", + analyticsevent.AttrTeamID: "team-uuid", + } + for k, v := range want { + if got.attrs[k] != v { + t.Errorf("attr %q = %v, want %q", k, got.attrs[k], v) + } + } +} + +func TestRecordFunnelEvent_OmitsEmptyOptionalAttrs(t *testing.T) { + rec := installRecordingEmitter(t) + + // The top-of-funnel landing site carries no tier/env/fp/team. + recordFunnelEvent(context.Background(), funnelStepLanding, funnelAttrs{}) + + got := rec.last() + if got.attrs[analyticsevent.AttrFunnelStep] != funnelStepLanding { + t.Fatalf("step = %v, want %q", got.attrs[analyticsevent.AttrFunnelStep], funnelStepLanding) + } + if got.attrs[analyticsevent.AttrServiceName] != serviceNameAPI { + t.Errorf("service should always be present, got %v", got.attrs[analyticsevent.AttrServiceName]) + } + for _, k := range []string{ + analyticsevent.AttrTier, analyticsevent.AttrEnv, + analyticsevent.AttrFingerprint, analyticsevent.AttrTeamID, + } { + if _, ok := got.attrs[k]; ok { + t.Errorf("empty optional attr %q should be omitted, got %v", k, got.attrs[k]) + } + } +} + +// TestRecordFunnelEvent_EachCanonicalStep covers every funnel step constant the +// emit sites use, so all four in-package aliases are exercised. +func TestRecordFunnelEvent_EachCanonicalStep(t *testing.T) { + rec := installRecordingEmitter(t) + steps := []string{funnelStepLanding, funnelStepProvision, funnelStepClaim, funnelStepPaid} + for _, s := range steps { + recordFunnelEvent(context.Background(), s, funnelAttrs{Tier: "free"}) + } + if rec.count() != len(steps) { + t.Fatalf("expected %d events, got %d", len(steps), rec.count()) + } + for i, ev := range rec.events { + if ev.attrs[analyticsevent.AttrFunnelStep] != steps[i] { + t.Errorf("event %d step = %v, want %q", i, ev.attrs[analyticsevent.AttrFunnelStep], steps[i]) + } + } +} + +// --- PII safety ---------------------------------------------------------------- + +// TestRecordFunnelEvent_DoesNotEmitPII asserts the factory-wrapped emit path +// drops any non-allowlisted attribute and never carries a raw email — proving +// the sanitize chokepoint is in the emit path even if a future funnelAttrs field +// regresses. (toMap itself only produces allowlisted keys; this guards the path.) +func TestRecordFunnelEvent_DoesNotEmitPII(t *testing.T) { + rec := installRecordingEmitter(t) + recordFunnelEvent(context.Background(), funnelStepClaim, funnelAttrs{ + Tier: "free", + TeamID: "team-uuid", + }) + got := rec.last() + for k := range got.attrs { + if _, ok := analyticsevent.AllowedAttributes[k]; !ok { + t.Errorf("emitted non-allowlisted (potential PII) key %q", k) + } + } + if _, ok := got.attrs[analyticsevent.AttrEmail]; ok { + t.Error("raw email key must never appear in a funnel event") + } +} + +// TestFunnelAttrs_ToMap_OnlyAllowlistedKeys is the registry-iterating guard +// (CLAUDE.md rule 18): every key funnelAttrs can produce must be on the PII +// allowlist, so a new field can't leak by construction. +func TestFunnelAttrs_ToMap_OnlyAllowlistedKeys(t *testing.T) { + full := funnelAttrs{Tier: "t", Env: "e", Fingerprint: "f", TeamID: "tid"} + for k := range full.toMap(funnelStepProvision) { + if _, ok := analyticsevent.AllowedAttributes[k]; !ok { + t.Errorf("funnelAttrs.toMap emits non-allowlisted key %q", k) + } + } +} + +// --- wire-contract guard ------------------------------------------------------- + +// TestFunnelStepsMatchCanonical asserts the in-package step aliases equal the +// canonical analyticsevent constants the dashboards FACET on. A drift here +// silently splits the funnel across two step values. +func TestFunnelStepsMatchCanonical(t *testing.T) { + cases := []struct{ alias, canonical string }{ + {funnelStepLanding, analyticsevent.FunnelStepLanding}, + {funnelStepProvision, analyticsevent.FunnelStepProvision}, + {funnelStepClaim, analyticsevent.FunnelStepClaim}, + {funnelStepPaid, analyticsevent.FunnelStepPaid}, + } + for _, c := range cases { + if c.alias != c.canonical { + t.Errorf("step alias %q != canonical %q", c.alias, c.canonical) + } + } +} diff --git a/internal/handlers/billing.go b/internal/handlers/billing.go index 11361b16..d1af62c1 100644 --- a/internal/handlers/billing.go +++ b/internal/handlers/billing.go @@ -2046,6 +2046,9 @@ func (h *BillingHandler) handleSubscriptionCharged(ctx context.Context, c *fiber slog.Info("billing.subscription.charged", "team_id", teamID, "plan_tier", tier, "subscription_id", sub.ID) metrics.ConversionFunnel.WithLabelValues("paid").Inc() + // WS4: claimed->paid funnel custom event (the >20% KPI). teamId + the paid + // tier let the funnel cohort by tier; per-entity alongside the counter. + recordFunnelEvent(ctx, funnelStepPaid, funnelAttrs{Tier: tier, TeamID: teamID.String()}) // Best-effort audit emit for the Loops forwarder. Fail-open: an audit // error must not undo the tier update we already committed. diff --git a/internal/handlers/billing_funnel_event_test.go b/internal/handlers/billing_funnel_event_test.go new file mode 100644 index 00000000..a1b99759 --- /dev/null +++ b/internal/handlers/billing_funnel_event_test.go @@ -0,0 +1,93 @@ +package handlers_test + +// billing_funnel_event_test.go — WS4: asserts the claimed->paid funnel custom +// event (InstantFunnel, step=paid) fires from handleSubscriptionCharged +// alongside the existing instant_conversion_funnel_total{step="paid"} counter. +// Covers the emit site at billing.go's paid step with a recording emitter so the +// per-entity NR bridge is verified end-to-end through the real webhook path. + +import ( + "context" + "net/http" + "sync" + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "instant.dev/common/analyticsevent" + "instant.dev/internal/handlers" + "instant.dev/internal/testhelpers" +) + +// funnelRecorder captures InstantFunnel events. Satisfies analyticsevent.Emitter +// (Record never errors/panics into the caller). +type funnelRecorder struct { + mu sync.Mutex + events []struct { + eventType string + attrs map[string]any + } +} + +func (r *funnelRecorder) Record(_ context.Context, eventType string, attrs map[string]any) { + r.mu.Lock() + defer r.mu.Unlock() + r.events = append(r.events, struct { + eventType string + attrs map[string]any + }{eventType, attrs}) +} +func (r *funnelRecorder) Name() string { return "recording" } +func (r *funnelRecorder) Close() error { return nil } + +func TestBillingWebhook_SubscriptionCharged_EmitsPaidFunnelEvent(t *testing.T) { + db, cleanDB := billingStateNeedsDB(t) + defer cleanDB() + + // Install a recording emitter (factory-wrapped, same Sanitize + fail-open as + // prod) and restore the default afterward. + rec := &funnelRecorder{} + wrapped, err := analyticsevent.Factory(analyticsevent.Config{Override: rec}) + require.NoError(t, err) + handlers.SetAnalyticsEmitter(wrapped) + t.Cleanup(func() { handlers.SetAnalyticsEmitter(analyticsevent.NewNoop()) }) + + app, cfg := billingWebhookDBApp(t, db) + + teamID := testhelpers.MustCreateTeamDB(t, db, "hobby") + defer db.Exec(`DELETE FROM teams WHERE id = $1::uuid`, teamID) + + payload := makeSubscriptionChargedPayloadWithPlan( + t, teamID, "sub_test_"+uuid.NewString(), cfg.RazorpayPlanIDPro, + ) + req := signedWebhookRequest(t, payload) + resp, err := app.Test(req, 5000) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) + + // Exactly one InstantFunnel step=paid event, carrying the paid tier + teamId + // and no PII. + var paidAttrs map[string]any + var paidCount int + rec.mu.Lock() + for _, ev := range rec.events { + if ev.eventType == analyticsevent.EventFunnel && + ev.attrs[analyticsevent.AttrFunnelStep] == analyticsevent.FunnelStepPaid { + paidAttrs = ev.attrs + paidCount++ + } + } + rec.mu.Unlock() + + require.Equal(t, 1, paidCount, "expected exactly one paid funnel event") + assert.Equal(t, "pro", paidAttrs[analyticsevent.AttrTier]) + assert.Equal(t, teamID, paidAttrs[analyticsevent.AttrTeamID]) + assert.Equal(t, handlers.ServiceNameAPIForTest, paidAttrs[analyticsevent.AttrServiceName]) + for k := range paidAttrs { + _, ok := analyticsevent.AllowedAttributes[k] + assert.Truef(t, ok, "paid funnel event carried non-allowlisted key %q", k) + } +} diff --git a/internal/handlers/cache.go b/internal/handlers/cache.go index 08148c7a..84c14948 100644 --- a/internal/handlers/cache.go +++ b/internal/handlers/cache.go @@ -267,6 +267,8 @@ func (h *CacheHandler) NewCache(c *fiber.Ctx) error { metrics.ProvisionsTotal.WithLabelValues("redis", "anonymous").Inc() middleware.RecordProvisionSuccess("redis") metrics.ConversionFunnel.WithLabelValues("provision").Inc() + // WS4: per-entity funnel custom event alongside the aggregate counter. + recordFunnelEvent(ctx, funnelStepProvision, funnelAttrs{Tier: "anonymous", Env: env, Fingerprint: fp}) if markErr := h.markRecycleSeen(ctx, fp); markErr != nil { slog.Warn("cache.new.mark_recycle_seen_failed", diff --git a/internal/handlers/db.go b/internal/handlers/db.go index f919c30a..6f8f0fb5 100644 --- a/internal/handlers/db.go +++ b/internal/handlers/db.go @@ -307,6 +307,8 @@ func (h *DBHandler) NewDB(c *fiber.Ctx) error { metrics.ProvisionsTotal.WithLabelValues("postgres", "anonymous").Inc() middleware.RecordProvisionSuccess("postgres") metrics.ConversionFunnel.WithLabelValues("provision").Inc() + // WS4: per-entity funnel custom event alongside the aggregate counter. + recordFunnelEvent(ctx, funnelStepProvision, funnelAttrs{Tier: "anonymous", Env: env, Fingerprint: fp}) // Record this fingerprint as having had at least one anonymous touch. // The next anonymous POST after this resource expires will hit the diff --git a/internal/handlers/export_test.go b/internal/handlers/export_test.go index ffbbecf9..3d0d3901 100644 --- a/internal/handlers/export_test.go +++ b/internal/handlers/export_test.go @@ -136,6 +136,11 @@ func LookupCodeToAgentActionForTest(code string) (CodeToAgentActionMetaForTest, return CodeToAgentActionMetaForTest(meta), true } +// ServiceNameAPIForTest re-exports the AttrServiceName value funnel events from +// this service carry, so external handlers_test funnel assertions stay in sync +// with the production constant. +const ServiceNameAPIForTest = serviceNameAPI + // VerifyRazorpayTimestampForTest re-exports the unexported timestamp-window // predicate for the SRR security-cluster H46-F3 regression tests. Pure // function (no I/O), so a unit test can lock in the boundary semantics diff --git a/internal/handlers/nosql.go b/internal/handlers/nosql.go index 78165eba..fe47409e 100644 --- a/internal/handlers/nosql.go +++ b/internal/handlers/nosql.go @@ -263,6 +263,8 @@ func (h *NoSQLHandler) NewNoSQL(c *fiber.Ctx) error { metrics.ProvisionsTotal.WithLabelValues("mongodb", "anonymous").Inc() middleware.RecordProvisionSuccess("mongodb") metrics.ConversionFunnel.WithLabelValues("provision").Inc() + // WS4: per-entity funnel custom event alongside the aggregate counter. + recordFunnelEvent(ctx, funnelStepProvision, funnelAttrs{Tier: "anonymous", Env: env, Fingerprint: fp}) if markErr := h.markRecycleSeen(ctx, fp); markErr != nil { slog.Warn("nosql.new.mark_recycle_seen_failed", diff --git a/internal/handlers/onboarding.go b/internal/handlers/onboarding.go index 8065505f..2331ecc0 100644 --- a/internal/handlers/onboarding.go +++ b/internal/handlers/onboarding.go @@ -65,6 +65,10 @@ func (h *OnboardingHandler) StartLanding(c *fiber.Ctx) error { jwtStr := c.Query("t") metrics.ConversionFunnel.WithLabelValues("landing_viewed").Inc() + // WS4: top-of-funnel landing custom event. No tier/team/fingerprint is known + // at /start (an unauthenticated drive-by hit), so only the step + service are + // carried — the funnel's denominator for anon->claimed. + recordFunnelEvent(c.UserContext(), funnelStepLanding, funnelAttrs{}) if jwtStr == "" { return c.Redirect(h.cfg.DashboardBaseURL+"/claim", fiber.StatusFound) @@ -483,6 +487,15 @@ func (h *OnboardingHandler) Claim(c *fiber.Ctx) error { // immediately — if they don't pay within 24h, the resource expires. metrics.ConversionFunnel.WithLabelValues("claimed").Inc() + // WS4: anonymous->claimed funnel custom event (the >2% KPI). team.PlanTier is + // the just-created team's tier; claims.Fingerprint is the already-hashed + // anonymous bucket the claim consolidated. Per-entity (teamId) so retention + // can be cohorted, alongside the aggregate counter above. + recordFunnelEvent(ctx, funnelStepClaim, funnelAttrs{ + Tier: team.PlanTier, + Fingerprint: claims.Fingerprint, + TeamID: team.ID.String(), + }) // Issue a session JWT so the caller can immediately use authenticated endpoints. sessionToken, jwtErr := signSessionJWT(h.cfg.JWTSecret, newUser, team) diff --git a/internal/handlers/queue.go b/internal/handlers/queue.go index eb71dbdf..af4cd126 100644 --- a/internal/handlers/queue.go +++ b/internal/handlers/queue.go @@ -361,6 +361,8 @@ func (h *QueueHandler) NewQueue(c *fiber.Ctx) error { metrics.ProvisionsTotal.WithLabelValues("queue", "anonymous").Inc() middleware.RecordProvisionSuccess("queue") metrics.ConversionFunnel.WithLabelValues("provision").Inc() + // WS4: per-entity funnel custom event alongside the aggregate counter. + recordFunnelEvent(ctx, funnelStepProvision, funnelAttrs{Tier: "anonymous", Env: env, Fingerprint: fp}) if markErr := h.markRecycleSeen(ctx, fp); markErr != nil { slog.Warn("queue.new.mark_recycle_seen_failed", diff --git a/internal/handlers/storage.go b/internal/handlers/storage.go index 19468093..ff0a2634 100644 --- a/internal/handlers/storage.go +++ b/internal/handlers/storage.go @@ -379,6 +379,8 @@ func (h *StorageHandler) NewStorage(c *fiber.Ctx) error { metrics.ProvisionsTotal.WithLabelValues("storage", "anonymous").Inc() middleware.RecordProvisionSuccess("storage") metrics.ConversionFunnel.WithLabelValues("provision").Inc() + // WS4: per-entity funnel custom event alongside the aggregate counter. + recordFunnelEvent(ctx, funnelStepProvision, funnelAttrs{Tier: "anonymous", Env: env, Fingerprint: fp}) if markErr := h.markRecycleSeen(ctx, fp); markErr != nil { slog.Warn("storage.new.mark_recycle_seen_failed", diff --git a/internal/handlers/vector.go b/internal/handlers/vector.go index 17fe1799..835f9ccf 100644 --- a/internal/handlers/vector.go +++ b/internal/handlers/vector.go @@ -404,6 +404,8 @@ func (h *VectorHandler) NewVector(c *fiber.Ctx) error { metrics.ProvisionsTotal.WithLabelValues(models.ResourceTypeVector, "anonymous").Inc() metrics.ConversionFunnel.WithLabelValues("provision").Inc() + // WS4: per-entity funnel custom event alongside the aggregate counter. + recordFunnelEvent(ctx, funnelStepProvision, funnelAttrs{Tier: "anonymous", Env: env, Fingerprint: fp}) if markErr := h.markRecycleSeen(ctx, fp); markErr != nil { slog.Warn("vector.new.mark_recycle_seen_failed", diff --git a/internal/handlers/webhook.go b/internal/handlers/webhook.go index e6baf296..d8048bd4 100644 --- a/internal/handlers/webhook.go +++ b/internal/handlers/webhook.go @@ -369,6 +369,8 @@ func (h *WebhookHandler) NewWebhook(c *fiber.Ctx) error { metrics.ProvisionsTotal.WithLabelValues("webhook", "anonymous").Inc() middleware.RecordProvisionSuccess("webhook") metrics.ConversionFunnel.WithLabelValues("provision").Inc() + // WS4: per-entity funnel custom event alongside the aggregate counter. + recordFunnelEvent(ctx, funnelStepProvision, funnelAttrs{Tier: "anonymous", Env: env, Fingerprint: fp}) if markErr := h.markRecycleSeen(ctx, fp); markErr != nil { slog.Warn("webhook.new.mark_recycle_seen_failed", diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 5ade4f81..84c5b519 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -449,6 +449,21 @@ var ( Name: "instant_pg_pool_wait_duration_seconds", Help: "Cumulative time spent waiting for a connection since process start, in seconds (sql.DBStats.WaitDuration). Pairs with instant_pg_pool_wait_count.", }, []string{"pool"}) + + // AnalyticsEmitFailed counts behavioral-intelligence custom events + // (common/analyticsevent — InstantFunnel etc.) that were DROPPED rather than + // reaching the analytics sink. The dominant reason today is "nil_app": the + // New Relic sink had no *newrelic.Application (NR not configured) — which is + // the expected steady state until ANALYTICS_BACKEND=newrelic + a license key + // are wired, so this counter sitting at a flat non-zero is benign in that + // configuration. A SUDDEN climb after NR is configured means the bridge is + // dropping real funnel events. Lazy *Vec: not visible at /metrics until the + // first label is observed (a dropped emit). Labelled by reason so the alert + // can distinguish "misconfigured" (nil_app) from a sink-side reject. + AnalyticsEmitFailed = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "instant_analytics_emit_failed_total", + Help: "Behavioral-intelligence custom events dropped before reaching the analytics sink, by reason.", + }, []string{"reason"}) ) // ReadyzCheckStatus updates the gauge for one check in this service. diff --git a/internal/router/analytics_wiring_test.go b/internal/router/analytics_wiring_test.go new file mode 100644 index 00000000..1840e0af --- /dev/null +++ b/internal/router/analytics_wiring_test.go @@ -0,0 +1,77 @@ +package router_test + +// analytics_wiring_test.go — covers the WS4 analytics emitter construction in +// router.go (wireAnalyticsEmitter): backend selection from ANALYTICS_BACKEND, +// the inert noop default, unknown-backend degrade, and the New Relic +// failure-hook → Prometheus counter bridge. We exercise the exported alias +// rather than the full router.New(...) (which needs Postgres + Redis + gRPC). + +import ( + "context" + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + + "instant.dev/common/analyticsevent" + "instant.dev/internal/config" + "instant.dev/internal/handlers" + "instant.dev/internal/metrics" + "instant.dev/internal/router" +) + +func counterValue(t *testing.T, c prometheus.Counter) float64 { + t.Helper() + var m dto.Metric + if err := c.Write(&m); err != nil { + t.Fatalf("counter Write: %v", err) + } + return m.GetCounter().GetValue() +} + +func TestWireAnalyticsEmitter_DefaultNoop(t *testing.T) { + t.Cleanup(func() { handlers.SetAnalyticsEmitter(analyticsevent.NewNoop()) }) + + e := router.ExportedWireAnalyticsEmitter(&config.Config{AnalyticsBackend: "noop"}, nil) + if e == nil { + t.Fatal("wireAnalyticsEmitter returned nil emitter") + } + if e.Name() != analyticsevent.BackendNoop { + t.Fatalf("noop backend Name = %q, want %q", e.Name(), analyticsevent.BackendNoop) + } + // The inert default must not panic on emit. + e.Record(context.Background(), analyticsevent.EventFunnel, map[string]any{analyticsevent.AttrTier: "anonymous"}) +} + +func TestWireAnalyticsEmitter_UnknownBackendDegradesToNoop(t *testing.T) { + t.Cleanup(func() { handlers.SetAnalyticsEmitter(analyticsevent.NewNoop()) }) + + // An unrecognised backend must not panic — Factory returns an advisory error + // and a usable noop emitter; wireAnalyticsEmitter logs and proceeds. + e := router.ExportedWireAnalyticsEmitter(&config.Config{AnalyticsBackend: "totally-unknown"}, nil) + if e.Name() != analyticsevent.BackendNoop { + t.Fatalf("unknown backend should degrade to noop, got %q", e.Name()) + } +} + +func TestWireAnalyticsEmitter_NewRelicNilAppFiresFailureHook(t *testing.T) { + t.Cleanup(func() { handlers.SetAnalyticsEmitter(analyticsevent.NewNoop()) }) + + before := counterValue(t, metrics.AnalyticsEmitFailed.WithLabelValues("nil_app")) + + // backend=newrelic with a nil *newrelic.Application: the nr sink wires the + // failure hook wireAnalyticsEmitter passes. An emit drops with reason + // "nil_app" and must increment instant_analytics_emit_failed_total. + e := router.ExportedWireAnalyticsEmitter(&config.Config{AnalyticsBackend: "newrelic"}, nil) + if e.Name() != analyticsevent.BackendNewRelic { + t.Fatalf("newrelic backend Name = %q, want %q", e.Name(), analyticsevent.BackendNewRelic) + } + e.Record(context.Background(), analyticsevent.EventFunnel, map[string]any{ + analyticsevent.AttrFunnelStep: analyticsevent.FunnelStepProvision, + }) + + after := counterValue(t, metrics.AnalyticsEmitFailed.WithLabelValues("nil_app")) + if after <= before { + t.Fatalf("failure-hook counter did not increment: before=%v after=%v", before, after) + } +} diff --git a/internal/router/export_test.go b/internal/router/export_test.go index b52d5498..6593b1fe 100644 --- a/internal/router/export_test.go +++ b/internal/router/export_test.go @@ -13,3 +13,9 @@ package router // can directly cover the closure body without standing up the full // router New(...) wiring (which needs Postgres + Redis + gRPC). var ExportedMakeSecurityTxtHandler = makeSecurityTxtHandler + +// ExportedWireAnalyticsEmitter is the unit-test-facing alias for +// wireAnalyticsEmitter, so the WS4 emitter-construction logic (backend +// selection + NR failure-hook wiring) can be covered without standing up the +// full router New(...) (which needs Postgres + Redis + gRPC). +var ExportedWireAnalyticsEmitter = wireAnalyticsEmitter diff --git a/internal/router/router.go b/internal/router/router.go index ffe98f9f..683fd11f 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -17,10 +17,13 @@ import ( "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/redis/go-redis/v9" "github.com/valyala/fasthttp/fasthttpadaptor" + "instant.dev/common/analyticsevent" + "instant.dev/common/analyticsevent/nr" "instant.dev/common/buildinfo" "instant.dev/internal/config" "instant.dev/internal/email" "instant.dev/internal/handlers" + "instant.dev/internal/metrics" "instant.dev/internal/middleware" "instant.dev/internal/migrations" "instant.dev/internal/plans" @@ -225,6 +228,18 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid // extraction reads from the request, not from OTel context, but // keeping both before user middleware is the safe order). app.Use(middleware.NewRelic(nrApp)) + + // WS4 behavioral-intelligence funnel events. Build the process-wide + // analyticsevent.Emitter ONCE here and install it for the handler funnel + // emit sites (db/cache/nosql/vector/queue/storage/webhook provision, + // onboarding claim, billing paid). It reuses the SAME *newrelic.Application + // the middleware above uses — no second NR connection. ANALYTICS_BACKEND + // defaults to "noop", so this is INERT until New Relic is configured; the + // emitter is fail-open (the wrapper swallows any sink panic), so funnel + // emission can never block or error a request. A nil nrApp degrades the + // "newrelic" backend to noop inside the nr sink (nil-app drop). + wireAnalyticsEmitter(cfg, nrApp) + // Telemetry must come before Recover so that panic-induced 500s are recorded. app.Use(middleware.Telemetry()) app.Use(fiberRecover.New(fiberRecover.Config{ @@ -1460,3 +1475,40 @@ func parseTrustedProxyCIDRs(s string) []string { } return out } + +// wireAnalyticsEmitter builds the process-wide behavioral-intelligence emitter +// from ANALYTICS_BACKEND and installs it for the handler funnel emit sites via +// handlers.SetAnalyticsEmitter. It reuses the api's existing +// *newrelic.Application (the one the NewRelic middleware uses) for the +// "newrelic" backend so there is no second NR connection. +// +// Selection (all fail-open — Factory always returns a usable emitter): +// - ANALYTICS_BACKEND=newrelic -> the nr sink over nrApp (nrApp may be nil: +// the sink then drops every event and fires the failure hook with nil_app). +// - anything else / unset -> noop (the inert default). +// +// A non-nil advisory error from Factory is logged (analytics degraded to noop) +// but never blocks boot — analytics is best-effort. Returns the installed +// emitter (the boot caller ignores it; tests drive it to exercise the wiring). +func wireAnalyticsEmitter(cfg *config.Config, nrApp *newrelic.Application) analyticsevent.Emitter { + acfg := analyticsevent.Config{Backend: cfg.AnalyticsBackend} + + // For the New Relic backend we inject the already-constructed app via + // Config.Override so the root analyticsevent package never imports the NR + // agent. The failure hook bridges a dropped emit to the api's Prometheus + // counter (CLAUDE.md rule 25: a metric needs its alert/dashboard; the + // catalog row + Prom rule ship in this PR). + if analyticsevent.NormalizeBackend(cfg.AnalyticsBackend) == analyticsevent.BackendNewRelic { + acfg.Override = nr.New(nrApp, nr.WithFailureHook(func(reason string) { + metrics.AnalyticsEmitFailed.WithLabelValues(reason).Inc() + })) + } + + emitter, err := analyticsevent.Factory(acfg) + if err != nil { + slog.Warn("analytics: degraded to noop", "backend", cfg.AnalyticsBackend, "error", err) + } + slog.Info("analytics: emitter wired", "backend", emitter.Name()) + handlers.SetAnalyticsEmitter(emitter) + return emitter +}