From 5adeb901ac40c26a3281c6b260473ba8d2bc1fdc Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Thu, 28 May 2026 18:08:50 +0200 Subject: [PATCH 1/2] fix(apps): add startupProbe to homepage, headlamp, actual-budget MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each of these charts hardcodes liveness/readiness with Kubernetes defaults (timeoutSeconds: 1, periodSeconds: 10, failureThreshold: 3) and does not template a startupProbe. On a cold start the container takes ~10–13s to begin serving HTTP, so each pod creation logs 1–3 'Unhealthy' Warning events and leaves only ~17s of headroom before the liveness restart fires. Add a startupProbe via a JSON 6902 (op: add) patch in the existing postRenderer block (60s startup window, 2s period/timeout) so liveness/readiness are gated until the container is actually serving. No change to liveness/readiness — once startup succeeds the tight defaults are fine on a warm pod. Observed (prod, 14:59–15:44 UTC on 2026-05-28): - homepage: 3 Unhealthy events per rollout pod (5 pods affected) - actual-budget: 2 events at cold start - headlamp: 1 event per KEDA scale-from-zero Co-Authored-By: Claude Opus 4.7 (1M context) --- k8s/bases/apps/actual-budget/helm-release.yaml | 12 ++++++++++++ k8s/bases/apps/headlamp/helm-release.yaml | 15 +++++++++++++++ k8s/bases/apps/homepage/helm-release.yaml | 15 +++++++++++++++ 3 files changed, 42 insertions(+) diff --git a/k8s/bases/apps/actual-budget/helm-release.yaml b/k8s/bases/apps/actual-budget/helm-release.yaml index c6582230e..89cdc3775 100644 --- a/k8s/bases/apps/actual-budget/helm-release.yaml +++ b/k8s/bases/apps/actual-budget/helm-release.yaml @@ -49,6 +49,18 @@ spec: matchLabels: app.kubernetes.io/name: actualbudget app.kubernetes.io/instance: actual-budget + # Chart hardcodes startupProbe absence; values override the + # liveness/readiness blocks below but not startupProbe. Gate + # liveness/readiness on the container actually serving HTTP. 
+ - op: add + path: /spec/template/spec/containers/0/startupProbe + value: + httpGet: + path: / + port: http + periodSeconds: 2 + timeoutSeconds: 2 + failureThreshold: 30 # 60s max startup window # https://github.com/community-charts/helm-charts/blob/main/charts/actualbudget/values.yaml values: replicaCount: ${actual_budget_replicas:=1} diff --git a/k8s/bases/apps/headlamp/helm-release.yaml b/k8s/bases/apps/headlamp/helm-release.yaml index 0dc9c7d9f..111bacd81 100644 --- a/k8s/bases/apps/headlamp/helm-release.yaml +++ b/k8s/bases/apps/headlamp/helm-release.yaml @@ -71,6 +71,21 @@ spec: value: name: tmp-dir mountPath: /tmp + # Chart hardcodes liveness/readiness with K8s defaults + # (timeoutSeconds: 1, failureThreshold: 3, periodSeconds: 10). + # Headlamp is KEDA-scaled to 0 in prod; every cold start logs + # 1-3 Unhealthy probe warnings while the Go binary initialises. + # Add a startupProbe so liveness/readiness are gated until the + # main container is actually serving. + - op: add + path: /spec/template/spec/containers/0/startupProbe + value: + httpGet: + path: / + port: http + periodSeconds: 2 + timeoutSeconds: 2 + failureThreshold: 30 # 60s max startup window - target: kind: Deployment name: headlamp diff --git a/k8s/bases/apps/homepage/helm-release.yaml b/k8s/bases/apps/homepage/helm-release.yaml index abbb078cf..b8ab7bdad 100644 --- a/k8s/bases/apps/homepage/helm-release.yaml +++ b/k8s/bases/apps/homepage/helm-release.yaml @@ -52,6 +52,21 @@ spec: matchLabels: app.kubernetes.io/name: homepage app.kubernetes.io/instance: homepage + # Chart hardcodes liveness/readiness with K8s defaults + # (timeoutSeconds: 1, failureThreshold: 3, periodSeconds: 10). + # Homepage takes ~13s to start serving on a fresh pod, so each + # rollout produces 3 Unhealthy probe warnings per pod and leaves + # only ~17s of headroom before the liveness restart fires. Add + # a startupProbe to gate liveness/readiness during initial boot. 
+ - op: add + path: /spec/template/spec/containers/0/startupProbe + value: + httpGet: + path: / + port: http + periodSeconds: 2 + timeoutSeconds: 2 + failureThreshold: 30 # 60s max startup window # ICONS: # https://github.com/walkxcode/dashboard-icons # https://simpleicons.org From 19fb2b32b6af1100608c2cebc3932944d7b5eee6 Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Thu, 28 May 2026 23:14:01 +0200 Subject: [PATCH 2/2] fix(apps): startupProbe initialDelaySeconds=20 to clear settle window MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The original probe (periodSeconds: 2, failureThreshold: 30, no initialDelay) silenced cold-start liveness/readiness *restarts* but not the underlying "Unhealthy" Warning events — kubelet emits the same event for startup, liveness, and readiness probe failures, and the 2s period generates 5-7 failures during the ~13s cold start instead of the chart-default 1-3 (periodSeconds: 10). Merge-queue deploy of #1636 failed the check-event-warnings action, which records a marker post-reconcile and fails if any Warning event has lastTimestamp within a 90s settle window. The rollout these patches force created new pods during that window; their startup probes fired every 2s during cold start; their events landed past the marker. Set initialDelaySeconds: 20 (past the observed ~13s cold start) and periodSeconds: 5 so the first probe lands on a serving container; timeoutSeconds is also raised from 2 to 3. Zero failure events on a normal rollout; failureThreshold: 12 leaves 60s of grace beyond the initial delay if a container is unusually slow. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- k8s/bases/apps/actual-budget/helm-release.yaml | 11 ++++++++--- k8s/bases/apps/headlamp/helm-release.yaml | 11 ++++++++--- k8s/bases/apps/homepage/helm-release.yaml | 13 ++++++++++--- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/k8s/bases/apps/actual-budget/helm-release.yaml b/k8s/bases/apps/actual-budget/helm-release.yaml index 89cdc3775..6dcc3ebcf 100644 --- a/k8s/bases/apps/actual-budget/helm-release.yaml +++ b/k8s/bases/apps/actual-budget/helm-release.yaml @@ -52,15 +52,20 @@ spec: # Chart hardcodes startupProbe absence; values override the # liveness/readiness blocks below but not startupProbe. Gate # liveness/readiness on the container actually serving HTTP. + # initialDelaySeconds skips past the ~10s cold-start window so + # the first probe lands on a serving container — zero failure + # events during the merge-queue's 90s steady-state Warning + # check. - op: add path: /spec/template/spec/containers/0/startupProbe value: httpGet: path: / port: http - periodSeconds: 2 - timeoutSeconds: 2 - failureThreshold: 30 # 60s max startup window + initialDelaySeconds: 20 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 12 # 60s grace beyond initial delay # https://github.com/community-charts/helm-charts/blob/main/charts/actualbudget/values.yaml values: replicaCount: ${actual_budget_replicas:=1} diff --git a/k8s/bases/apps/headlamp/helm-release.yaml b/k8s/bases/apps/headlamp/helm-release.yaml index 111bacd81..0ec11876d 100644 --- a/k8s/bases/apps/headlamp/helm-release.yaml +++ b/k8s/bases/apps/headlamp/helm-release.yaml @@ -77,15 +77,20 @@ spec: # 1-3 Unhealthy probe warnings while the Go binary initialises. # Add a startupProbe so liveness/readiness are gated until the # main container is actually serving. 
+ # initialDelaySeconds skips past the cold-start window so the + # first probe lands on a serving container — zero failure + # events during the merge-queue's 90s steady-state Warning + # check. - op: add path: /spec/template/spec/containers/0/startupProbe value: httpGet: path: / port: http - periodSeconds: 2 - timeoutSeconds: 2 - failureThreshold: 30 # 60s max startup window + initialDelaySeconds: 20 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 12 # 60s grace beyond initial delay - target: kind: Deployment name: headlamp diff --git a/k8s/bases/apps/homepage/helm-release.yaml b/k8s/bases/apps/homepage/helm-release.yaml index b8ab7bdad..5e8fe4800 100644 --- a/k8s/bases/apps/homepage/helm-release.yaml +++ b/k8s/bases/apps/homepage/helm-release.yaml @@ -58,15 +58,22 @@ spec: # rollout produces 3 Unhealthy probe warnings per pod and leaves # only ~17s of headroom before the liveness restart fires. Add # a startupProbe to gate liveness/readiness during initial boot. + # + # initialDelaySeconds is past the observed ~13s cold start so + # the first probe lands on a serving container — zero failure + # events during a normal rollout (the merge-queue's 90s + # steady-state Warning check would otherwise count probe + # failures fired in that window). - op: add path: /spec/template/spec/containers/0/startupProbe value: httpGet: path: / port: http - periodSeconds: 2 - timeoutSeconds: 2 - failureThreshold: 30 # 60s max startup window + initialDelaySeconds: 20 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 12 # 60s grace beyond initial delay # ICONS: # https://github.com/walkxcode/dashboard-icons # https://simpleicons.org