From 888ca721e3d3460a115bf758145035143566193c Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm <nikolaiemildamm@icloud.com>
Date: Fri, 29 May 2026 00:13:15 +0200
Subject: [PATCH] ci: discount Unhealthy probe warnings on deleted pods in the
 deploy gate
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

> 🤖 Generated by the Daily AI Assistant

The merge-queue "Deploy to Prod" gate (check-event-warnings) fails on any
Warning event firing in its 90s settle window. During a rollout, kubelet
fires one last liveness/readiness probe ~1s after the CNI (Cilium) tears
down a terminating pod's route, emitting:

  Unhealthy: …/healthz: connect: no route to host

against a pod that is already deleted. That is a teardown artifact, not a
steady-state fault — but the gate counted it and failed the deploy. It hit
OpenCost in PR #1637 and is a recurring source of flaky merge-queue failures
for any PR that rolls a pod.

Fix: snapshot live pods alongside the events, and split off (do not count)
"Unhealthy" warnings whose involved Pod no longer exists. Precise and safe:

  - Only reason == "Unhealthy" on a Pod that is absent from the live snapshot
    is discounted. Real crash loops and persistent probe failures occur on
    pods that STILL EXIST, so they stay counted and still fail the gate.
  - Other warning reasons (BackOff, FailedMount, …) are never discounted,
    even on deleted pods.
  - Fail-safe: if the pod snapshot can't be fetched, the filter keeps every
    warning (never silently hides warnings because the snapshot broke).
  - Transparency: discounted warnings are printed in a report-only group so a
    masked issue always leaves a trace in the log.
  - Both event shapes handled (core/v1 .involvedObject, events.k8s.io/v1
    .regarding).

Validated: jq partitioning unit-tested across all cases (deleted-pod
Unhealthy → dropped; live-pod Unhealthy, non-Unhealthy on deleted pods,
non-Pod warnings → kept; pre-marker events → excluded from both groups;
null snapshot → keep all). shellcheck is clean on the run script,
actionlint is clean, and yq parses the action file.
---
 .../actions/check-event-warnings/action.yaml  | 57 ++++++++++++++++---
 1 file changed, 49 insertions(+), 8 deletions(-)

diff --git a/.github/actions/check-event-warnings/action.yaml b/.github/actions/check-event-warnings/action.yaml
index 51800d689..b12b162c1 100644
--- a/.github/actions/check-event-warnings/action.yaml
+++ b/.github/actions/check-event-warnings/action.yaml
@@ -5,8 +5,12 @@ description: >
   inspects Warning events whose most-recent occurrence is at/after the marker —
   i.e. warnings still firing at steady state (crash loops, repeated probe
   failures, image back-off, etc.). Transient one-shot warnings emitted during
-  bootstrap are ignored because they fired before the marker. The full Warning
-  history is always printed for context.
+  bootstrap are ignored because they fired before the marker. "Unhealthy"
+  (probe-failure) warnings on pods that no longer exist are discounted as
+  rollout teardown artifacts (a final probe after the CNI removed a terminating
+  pod's route); real crash loops and persistent probe failures occur on pods
+  that still exist and remain counted. The full Warning history — and anything
+  discounted — is always printed for context.
 
 inputs:
   context:
@@ -48,23 +52,52 @@ runs:
 
         events_json=$("${kc[@]}" get events -A -o json)
 
-        # Normalise both event shapes (core/v1 Event and events.k8s.io/v1) and
-        # keep only Warnings whose most-recent occurrence is at/after the marker.
-        new_warnings=$(printf '%s' "${events_json}" | jq -c --arg marker "${marker}" '
+        # Snapshot live pods as a JSON array of "namespace/name" strings, taken
+        # right after the events so the two views are as close in time as
+        # possible. Used to discount "Unhealthy" warnings on pods that no longer
+        # exist (teardown artifacts — see the jq below). If the pipeline fails or
+        # prints nothing, fall back to "null" so the filter keeps every warning.
+        live_pods_json=$("${kc[@]}" get pods -A -o json 2>/dev/null \
+          | jq -c '[.items[] | "\(.metadata.namespace)/\(.metadata.name)"]' 2>/dev/null) || live_pods_json=""
+        [ -n "${live_pods_json}" ] || live_pods_json="null"
+
+        # Normalise both event shapes (core/v1 Event and events.k8s.io/v1), keep
+        # only Warnings whose most-recent occurrence is at/after the marker, sort
+        # them by that timestamp, and split them into `kept` and `dropped`. A
+        # warning is `dropped` only when its reason is "Unhealthy", its involved
+        # object is a Pod, and that pod's "namespace/name" is absent from the live
+        # snapshot; the message text is not inspected, and a null snapshot drops
+        # nothing. Why: during a rollout kubelet fires one last probe ~1s after
+        # the CNI tears down a terminating pod's route ("no route to host" /
+        # "connection refused") against a pod that is already gone. Crash loops
+        # and persistent probe failures hit pods that STILL EXIST, so stay `kept`.
+        partitioned=$(printf '%s' "${events_json}" | jq -c --arg marker "${marker}" --argjson livePods "${live_pods_json}" '
           [ .items[]
             | select(.type == "Warning")
             | (.series.lastObservedTime // .lastTimestamp // .deprecatedLastTimestamp // .eventTime // .metadata.creationTimestamp) as $ts
             | select($ts != null and ($ts[0:19]) >= ($marker[0:19]))
-            | { ns:     (.metadata.namespace // .involvedObject.namespace // .regarding.namespace // "-"),
+            | (.metadata.namespace // .involvedObject.namespace // .regarding.namespace // "-") as $ns
+            | (.involvedObject.kind // .regarding.kind // "?") as $kind
+            | (.involvedObject.name // .regarding.name // "?") as $name
+            | { ns:     $ns,
                 reason: (.reason // "-"),
-                obj:    "\(.involvedObject.kind // .regarding.kind // "?")/\(.involvedObject.name // .regarding.name // "?")",
+                obj:    "\($kind)/\($name)",
                 msg:    ((.message // .note // "") | gsub("\\s+"; " ") | .[0:300]),
                 count:  (.count // .series.count // .deprecatedCount // 1),
-                ts:     $ts }
+                ts:     $ts,
+                teardown: ( $livePods != null
+                            and .reason == "Unhealthy"
+                            and $kind == "Pod"
+                            and ( ("\($ns)/\($name)") | IN($livePods[]) | not ) ) }
           ]
           | sort_by(.ts)
+          | { kept:    [ .[] | select(.teardown | not) | del(.teardown) ],
+              dropped: [ .[] | select(.teardown)       | del(.teardown) ] }
         ')
+        new_warnings=$(printf '%s' "${partitioned}" | jq -c '.kept')
+        discounted=$(printf '%s' "${partitioned}" | jq -c '.dropped')
         count=$(printf '%s' "${new_warnings}" | jq 'length')
+        dropped_count=$(printf '%s' "${discounted}" | jq 'length')
 
         echo "::group::New Warning events since ${marker} (${count})"
         if [ "${count}" -eq 0 ]; then
@@ -74,6 +107,14 @@ runs:
         fi
         echo "::endgroup::"
 
+        # Report-only: print, one line per event, the probe warnings discounted
+        # because their pod is already gone, so masking always leaves a trace.
+        if [ "${dropped_count}" -gt 0 ]; then
+          echo "::group::Discounted ${dropped_count} Unhealthy warning(s) on already-deleted pods (rollout teardown artifacts, report-only)"
+          printf '%s' "${discounted}" | jq -r '.[] | "\(.ts)  [\(.ns)] \(.obj)  \(.reason) (x\(.count)): \(.msg)"'
+          echo "::endgroup::"
+        fi
+
         # Always print the full Warning history (any time) for context — report-only.
         echo "::group::All Warning events (history, report-only)"
         "${kc[@]}" get events -A --field-selector type=Warning --sort-by=.lastTimestamp 2>/dev/null | tail -200 || echo "(none / unavailable)"