From 01e2e866feac2f527b4b830cb6d01b7c3b773792 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Fri, 5 Jun 2026 23:58:51 +0530 Subject: [PATCH] =?UTF-8?q?feat(observability):=20resource-count=20cap=20m?= =?UTF-8?q?etric=20=E2=80=94=20alert=20+=20Prom=20rule=20+=20tile=20+=20ca?= =?UTF-8?q?talog=20(Task=20#55)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires monitoring for instant_resource_count_limit_blocked_total{service,team_tier} (api), the metric emitted when the per-service resource-COUNT cap rejects a provision with 402. Closes the rule-25 gap for Task #55's metric: - newrelic/alerts/resource-count-limit-blocked.json — P2 (abuse/observability), WARN on > 20 blocks/h per service+tier (derivative over 1h). - k8s/prometheus-rules.yaml — ResourceCountCapBlocked rule (instant-api group). - newrelic/dashboards/instanode-reliability.json — stacked-bar tile by service+tier. - observability/METRICS-CATALOG.md — catalog row (lazy CounterVec; INERT until RESOURCE_COUNT_CAPS_ENABLED). All artifacts are inert until the operator enables the api flag — the counter has zero series until the first over-cap rejection. Co-Authored-By: Claude Opus 4.8 (1M context) --- k8s/prometheus-rules.yaml | 18 +++++++++++ .../alerts/resource-count-limit-blocked.json | 31 +++++++++++++++++++ .../dashboards/instanode-reliability.json | 25 +++++++++++++++ observability/METRICS-CATALOG.md | 1 + 4 files changed, 75 insertions(+) create mode 100644 newrelic/alerts/resource-count-limit-blocked.json diff --git a/k8s/prometheus-rules.yaml b/k8s/prometheus-rules.yaml index 98a1a51..7f12904 100644 --- a/k8s/prometheus-rules.yaml +++ b/k8s/prometheus-rules.yaml @@ -46,6 +46,24 @@ spec: annotations: summary: "P99 provision latency > 5s (instant_http_request_duration_seconds)" + - alert: ResourceCountCapBlocked + # Task #55: per-service resource-COUNT cap rejections. 
INERT until an + # operator sets RESOURCE_COUNT_CAPS_ENABLED (default off), so this rule + # stays quiet until enforcement is enabled. P2 (abuse/observability): + # a sustained block rate after enable is either a tenant hammering a + # cap (upsell/abuse signal) or a too-low cap (revisit plans.yaml). + # Lazy CounterVec — the {service,team_tier} series only appears after + # the first block. + expr: | + sum by (service, team_tier) ( + rate(instant_resource_count_limit_blocked_total[1h]) + ) * 3600 > 20 + for: 1h + labels: + severity: warning + annotations: + summary: "Resource-count cap blocking > 20 provisions/h for a tier+service (instant_resource_count_limit_blocked_total)" + - alert: APIDown expr: up{job="instant-api"} == 0 for: 1m diff --git a/newrelic/alerts/resource-count-limit-blocked.json b/newrelic/alerts/resource-count-limit-blocked.json new file mode 100644 index 0000000..08aa3b1 --- /dev/null +++ b/newrelic/alerts/resource-count-limit-blocked.json @@ -0,0 +1,31 @@ +{ + "name": "instant-api — resource-count cap blocks (1h) [Task #55]", + "type": "NRQL", + "description": "P2 (abuse/observability). Fires when the per-service resource-COUNT cap rejects provisions. instant_resource_count_limit_blocked_total{service,team_tier} is emitted by api/internal/handlers/resource_count_cap.go when a team at its per-tier count cap (postgres/vector/redis/mongodb/storage) attempts another provision and gets 402. The cap closes the strict-≥80%-margin hole where only queue_count was capped — a tenant could otherwise create MANY resources each at the per-resource size cap and blow the saturated-COGS bound (Redis the binding constraint at $6.50/GB). The whole feature is INERT until an operator sets RESOURCE_COUNT_CAPS_ENABLED (default off), so this alert sits quiet until enforcement is enabled. 
A non-trivial, sustained rate after enable means either (a) a tenant is hammering against a cap (upsell/abuse signal — point sales/support at the team) or (b) the cap is set too low for legitimate use (revisit plans.yaml). P2 because it is not data-loss and not a user-blocking outage — the 402 is the intended, recoverable behaviour with an agent_action telling the user to upgrade. Query is derivative(...,1 hour) per service+tier (NR ingests the counter as a cumulative monotonic OTLP sum). Source: api/internal/handlers/resource_count_cap.go; counter ResourceCountLimitBlocked in api/internal/metrics/metrics.go.", + "enabled": true, + "nrql": { + "query": "SELECT derivative(instant_resource_count_limit_blocked_total, 1 hour) FROM Metric FACET service, team_tier" + }, + "terms": [ + { + "priority": "WARNING", + "operator": "ABOVE", + "threshold": 20, + "thresholdDuration": 3600, + "thresholdOccurrences": "ALL" + } + ], + "signal": { + "aggregationWindow": 300, + "aggregationMethod": "EVENT_FLOW", + "aggregationDelay": 180, + "fillOption": "STATIC", + "fillValue": 0 + }, + "expiration": { + "expirationDuration": 3600, + "openViolationOnExpiration": false, + "closeViolationsOnExpiration": true + }, + "violationTimeLimitSeconds": 86400 +} diff --git a/newrelic/dashboards/instanode-reliability.json b/newrelic/dashboards/instanode-reliability.json index a4d0475..fd95323 100644 --- a/newrelic/dashboards/instanode-reliability.json +++ b/newrelic/dashboards/instanode-reliability.json @@ -1336,6 +1336,31 @@ "ignoreTimeRange": false } } + }, + { + "title": "Resource-count cap blocks by service+tier (6h; Task #55, inert until RESOURCE_COUNT_CAPS_ENABLED)", + "layout": { + "column": 1, + "row": 66, + "width": 6, + "height": 3 + }, + "visualization": { + "id": "viz.stacked-bar" + }, + "rawConfiguration": { + "nrqlQueries": [ + { + "accountIds": [ + 0 + ], + "query": "SELECT rate(sum(instant_resource_count_limit_blocked_total), 1 minute) FROM Metric 
FACET service, team_tier TIMESERIES SINCE 6 hours ago" + } + ], + "platformOptions": { + "ignoreTimeRange": false + } + } } ] } diff --git a/observability/METRICS-CATALOG.md b/observability/METRICS-CATALOG.md index 15e536c..17262ab 100644 --- a/observability/METRICS-CATALOG.md +++ b/observability/METRICS-CATALOG.md @@ -65,6 +65,7 @@ fires. Operators need this so they don't panic when a fresh deploy looks | `instant_flow_test_total` | worker | `flow,actor,tier,layer,result` | lazy (CounterVec — INERT until `FLOW_SYNTHETIC_ENABLED=true`; once on, `pass`/`degraded` materialise on the first happy tick and `fail` only on a real regression. Continuous-monitoring synthetic flow runner (`flow_synthetic.go`): every 5 min runs the P0 flow matrix (healthz / auth_me / provision→reap) against prod. The matrix dashboard FACETs this into the green/red grid, one cell per flow×actor) | `flow-test-p0-fail.json`, `flow-test-silent-death.json` | `FlowTestP0Fail`, `FlowTestSilentDeath` (instant-worker-flow-synthetic group) | "Flow matrix — latest result per flow×actor (grid)", "Flow matrix — fails by flow (1h, must be 0)" | | `instant_flow_test_latency_seconds` | worker | `flow,actor,tier,layer` | lazy (HistogramVec — observation only on a real HTTP response; DNS/TCP errors omit it so the histogram isn't polluted with 0s timeouts. INERT until `FLOW_SYNTHETIC_ENABLED=true`) | `flow-test-latency-regression.json` | `FlowTestLatencyRegression` (instant-worker-flow-synthetic group) | "Flow matrix — P95 latency per flow (6h)" | | `instant_flow_synthetic_reaped_total` | worker | `flow,outcome` | lazy (CounterVec — rule-24 cleanup ledger; `reaped` materialises on the first provision→reap tick, `leaked` ONLY on a failed reap (a real DO/k8s resource leak — must stay 0), `skip` when a flow created nothing. 
INERT until `FLOW_SYNTHETIC_ENABLED=true`) | `flow-synthetic-leak.json` | `FlowSyntheticLeak` (instant-worker-flow-synthetic group) | "Flow synthetic — leaked reaps (1h, must be 0)" | +| `instant_resource_count_limit_blocked_total` | api | `service,team_tier` | lazy (CounterVec — Task #55. INERT until `RESOURCE_COUNT_CAPS_ENABLED=true`; once on, a `{service,team_tier}` series materialises the first time a team at its per-tier count cap (postgres/vector/redis/mongodb/storage) is rejected with 402. Closes the strict-≥80%-margin hole where only queue_count was capped — Redis the binding constraint at $6.50/GB. A sustained rate after enable = tenant hammering a cap (upsell/abuse) or a too-low cap. P2.) | `resource-count-limit-blocked.json` | `ResourceCountCapBlocked` (instant-api group) | "Resource-count cap blocks by service+tier (6h; Task #55, inert until RESOURCE_COUNT_CAPS_ENABLED)" | ## Lazy-emit gotcha — what operators should expect