diff --git a/k8s/prometheus-rules.yaml b/k8s/prometheus-rules.yaml index 6fb500a..4021752 100644 --- a/k8s/prometheus-rules.yaml +++ b/k8s/prometheus-rules.yaml @@ -345,6 +345,39 @@ spec: are fine; a sustained rate means the reap path itself is broken. Check instant-worker pod logs for `jobs.orphan_sweep.*_delete_failed` lines. + # instant-worker — audit-only orphan-DB / redis-namespace sweep (2026-06-09). + # DISTINCT from the namespace orphan_sweep above: orphan_db_sweep (worker + # internal/jobs/orphan_db_sweep.go) is a DETECTION/DRY-RUN surface for the + # ~25-orphan customer-DB drain backlog. Flag-gated OFF (ORPHAN_DB_SWEEP_ENABLED); + # when on it only LOGS + counts candidates (no drop — the destructive arm is a + # SECOND flag that routes through the audited provisioner chokepoint, never a raw + # DROP; truehomie safety). The gauge stays 0 while the flag is off. These alerts + # surface the backlog so an operator reviews the candidate list before any + # destructive enablement. Metrics: instant_orphan_db_sweep_candidates_{total,current}{kind}. + - name: instant-worker-orphan-db-sweep + rules: + - alert: OrphanDBSweepBacklog + # Sustained backlog → orphaned customer DBs / instant-customer-* redis + # namespaces (no active resource row) accumulating faster than they drain. + # P2 capacity hygiene, not an outage. Only fires once the sweep flag is on + # (gauge is 0 when off). kind ∈ {customer_namespace, redis_namespace}. + expr: | + max(instant_orphan_db_sweep_candidates_current) by (kind) > 25 + for: 1h + labels: + severity: warning + service: worker + annotations: + summary: "orphan-DB sweep backlog > 25 (kind={{ $labels.kind }}) — review candidates before destructive enablement" + description: | + instant_orphan_db_sweep_candidates_current{kind="{{ $labels.kind }}"} > 25 for >1h. 
+ The audit-only orphan_db_sweep is detecting orphaned customer DBs / instant-customer-* + redis namespaces faster than they drain — the drain backlog the sweep was built to + surface. Review the candidate list in NR Logs (jobs.orphan_db_sweep.candidate); once + verified, an operator may enable the AUDITED destructive arm + (ORPHAN_DB_SWEEP_DESTRUCTIVE_ENABLED), which reclaims via the provisioner chokepoint. + NEVER drop a customer DB manually (truehomie incident, 2026-06-03). + # instant-* — code-defect signals (BugBash 2026-05-20). # Both counters are incremented by the safego.Go wrapper's deferred # recover() when a panic would otherwise crash a background goroutine. diff --git a/newrelic/alerts/orphan-db-sweep-backlog.json b/newrelic/alerts/orphan-db-sweep-backlog.json new file mode 100644 index 0000000..8a5591d --- /dev/null +++ b/newrelic/alerts/orphan-db-sweep-backlog.json @@ -0,0 +1,31 @@ +{ + "name": "instant-worker — orphan-DB sweep backlog > 25 (1h) [drain-backlog hygiene]", + "type": "NRQL", + "description": "Fires when the audit-only orphan_db_sweep (worker/internal/jobs/orphan_db_sweep.go) reports a sustained backlog of orphaned customer DBs / instant-customer-* redis namespaces (no active resource row). This is a DETECTION/DRY-RUN surface for the ~25-orphan drain backlog — flag-gated OFF (ORPHAN_DB_SWEEP_ENABLED), so the gauge stays 0 until an operator enables it; once on, this alert surfaces the backlog so the candidate list can be reviewed BEFORE any destructive enablement. The destructive arm is a separate flag (ORPHAN_DB_SWEEP_DESTRUCTIVE_ENABLED) that reclaims ONLY via the audited provisioner DeprovisionResource chokepoint — never a raw DROP (truehomie incident, 2026-06-03). P2: capacity hygiene, not an outage. kind in {customer_namespace, redis_namespace}. 
Source counter/gauge: OrphanDBSweepCandidatesTotal / OrphanDBSweepCandidatesCurrent in worker/internal/metrics/metrics.go.", + "enabled": true, + "nrql": { + "query": "SELECT max(instant_orphan_db_sweep_candidates_current) FROM Metric WHERE service = 'worker' FACET kind" + }, + "terms": [ + { + "priority": "WARNING", + "operator": "ABOVE", + "threshold": 25, + "thresholdDuration": 3600, + "thresholdOccurrences": "ALL" + } + ], + "signal": { + "aggregationWindow": 300, + "aggregationMethod": "EVENT_FLOW", + "aggregationDelay": 180, + "fillOption": "STATIC", + "fillValue": 0 + }, + "expiration": { + "expirationDuration": 3600, + "openViolationOnExpiration": false, + "closeViolationsOnExpiration": true + }, + "violationTimeLimitSeconds": 86400 +} diff --git a/newrelic/dashboards/instanode-reliability.json b/newrelic/dashboards/instanode-reliability.json index 5a6276e..8752ac0 100644 --- a/newrelic/dashboards/instanode-reliability.json +++ b/newrelic/dashboards/instanode-reliability.json @@ -981,6 +981,31 @@ } } }, + { + "title": "Orphan-DB sweep — current candidate backlog by kind (0 until enabled)", + "layout": { + "column": 1, + "row": 81, + "width": 4, + "height": 3 + }, + "visualization": { + "id": "viz.line" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [ + 0 + ], + "query": "SELECT latest(instant_orphan_db_sweep_candidates_current) FROM Metric WHERE service = 'worker' FACET kind TIMESERIES SINCE 6 hours ago" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } + }, { "title": "Billing reconciler gap detected by direction (6h)", "layout": { diff --git a/observability/METRICS-CATALOG.md b/observability/METRICS-CATALOG.md index 8db9574..39b6326 100644 --- a/observability/METRICS-CATALOG.md +++ b/observability/METRICS-CATALOG.md @@ -30,6 +30,8 @@ fires. 
Operators need this so they don't panic when a fresh deploy looks | `instant_propagation_unexpected_skip_total` | worker | `kind,resource_type,skip_reason` | lazy (post-CHAOS-F1 sentinel — only ticks on the schema/state drift class) | `propagation-unexpected-skip.json` | `PropagationUnexpectedSkip` | "Propagation queue depth + dead-lettered rate" | | `instant_orphan_sweep_reaped_total` | worker | `reason` | lazy (CounterVec — only when an orphan namespace is actually reaped) | `orphan-sweep-no-db-row.json`, `orphan-sweep-stuck-build-spike.json` | `OrphanSweepNoDBRowReap`, `OrphanSweepStuckBuildSpike` | "Orphan sweep — reaped by reason (24h)" | | `instant_orphan_sweep_reap_failed_total` | worker | `reason` | lazy | `orphan-sweep-reap-failed.json` | `OrphanSweepReapFailureRate` | "Orphan sweep — reap failures by reason (24h)" | +| `instant_orphan_db_sweep_candidates_total` | worker | `kind` | lazy (CounterVec — audit-only orphan-DB/redis-namespace sweep; both `kind` label values {customer_namespace, redis_namespace} primed in metrics_test so the series register at boot. Flag-gated OFF via ORPHAN_DB_SWEEP_ENABLED — stays 0 until enabled. Detection/dry-run only; the destructive arm routes through the audited provisioner chokepoint, never a raw DROP — truehomie incident, 2026-06-03) | `orphan-db-sweep-backlog.json` | `OrphanDBSweepBacklog` (instant-worker-orphan-db-sweep group) | "Orphan-DB sweep — current candidate backlog by kind (0 until enabled)" | +| `instant_orphan_db_sweep_candidates_current` | worker | `kind` | lazy (GaugeVec — current orphan backlog, falls to 0 when drained; same flag/safety posture as `_total`. 
Drives the OrphanDBSweepBacklog alert) | `orphan-db-sweep-backlog.json` | `OrphanDBSweepBacklog` (instant-worker-orphan-db-sweep group) | "Orphan-DB sweep — current candidate backlog by kind (0 until enabled)" | | `instant_magic_link_email_rate_limited_total` | api | (none) | **eager** (Counter, registered at boot — visible as 0 immediately) | `magic-link-email-rate-limited.json` | `MagicLinkEmailRateLimited` | "Magic-link rate-limited / hour" | | `brevo_send_errors_total` | worker | `classification,status_code` | lazy (CounterVec — first failure creates label series; `permanent`/`transient` only after the first 401/5xx) | `brevo-send-errors-spike.json` | `BrevoSendErrorsSpike`, `BrevoSendErrorsWarning` | "Brevo send errors by classification (1h)" | | `brevo_webhook_events_total` | api | `event` | lazy (CounterVec — populates as Brevo posts each event class; `delivered` appears on first successful send, `bounced_hard` only if a bounce happens) | `email-delivery-ratio-low.json` | `BrevoDeliveryRatioLow`, `BrevoDeliveryRatioWarn` | "Brevo delivery ratio (1h sliding)", "Brevo webhook events funnel (24h)" |