From bb64531065ae7ab225bb0f0de4085f8beaa58a82 Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Fri, 29 May 2026 20:42:33 +0200 Subject: [PATCH] fix(longhorn): reduce guaranteed-instance-manager-cpu 12% -> 6% Live measurement of prod (2026-05-29): 6 Longhorn instance-managers each reserve ~474m (12% of a cx33) while using 8-30m -- ~2.7 cores locked, and instance-managers are not VPA-managed. Dropping the guarantee to 6% (~240m) keeps 8-30x headroom over observed usage and frees ~1.4 cores, while leaving margin for replica-rebuild CPU spikes. HOLD: applying this restarts the Longhorn data plane (instance-managers). Do not merge until the 4th worker is restored and Longhorn has replica-rebuild headroom -- the cluster is currently at 3 workers with replica count 3 (no N+1) following the worker-2 incident. Co-Authored-By: Claude Opus 4.8 --- .../controllers/longhorn/helm-release.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/k8s/providers/hetzner/infrastructure/controllers/longhorn/helm-release.yaml b/k8s/providers/hetzner/infrastructure/controllers/longhorn/helm-release.yaml index 8037d0585..744246e8c 100644 --- a/k8s/providers/hetzner/infrastructure/controllers/longhorn/helm-release.yaml +++ b/k8s/providers/hetzner/infrastructure/controllers/longhorn/helm-release.yaml @@ -42,6 +42,15 @@ spec: # Restrict Longhorn system-managed pods (instance-managers, etc.) # to storage nodes. Prevents ~100Mi overhead per non-storage node. systemManagedComponentsNodeSelector: "node.longhorn.io/create-default-disk:true" + # Reduce the CPU guaranteed per instance-manager from Longhorn's 12% + # default. Live data (2026-05-29): 6 instance-managers each reserved + # ~474m (12% of a cx33) while using 8-30m — ~2.7 cores locked, and these + # pods are NOT VPA-managed. 6% (~240m on cx33) keeps 8-30x headroom over + # observed usage, frees ~1.4 cores, and still leaves margin for the CPU + # spikes during Longhorn replica rebuilds. 
+ guaranteedInstanceManagerCPU: + v1: "6" + v2: "6" persistence: # Longhorn is the default StorageClass (replaces hcloud for new PVCs).