From f5c2c27ed152dffa74209ed5d01b10642bc75bd5 Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Fri, 29 May 2026 15:39:35 +0200 Subject: [PATCH] fix(longhorn): run CSI attacher/provisioner/resizer with 2 replicas for HA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Longhorn CSI control-plane sidecars (csi-attacher, csi-provisioner, csi-resizer) default to a single replica in the base HelmRelease, and the prod variables ConfigMap never overrode them — so each ran as a single point of failure. On 2026-05-28 the sole csi-attacher/provisioner happened to sit on prod-worker-2, whose Cilium ClusterIP datapath had degraded after an OOMKill. With the only replica unreachable, volume attach/detach orchestration failed cluster-wide (FailedAttachVolume storms, pods stuck in ContainerCreating, and the CD deploy health gate tripping on "FailedAttachVolume ×107"). These sidecars are leader-elected, so a second replica is an idle warm standby on another node — cheap insurance that keeps CSI functioning through a single-node outage. Matches the existing "2 replicas for HA" pattern already applied to cert-manager, metrics-server, KEDA and external-secrets. Co-Authored-By: Claude Opus 4.8 --- .../prod/variables/variables-cluster-config-map.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/k8s/clusters/prod/variables/variables-cluster-config-map.yaml b/k8s/clusters/prod/variables/variables-cluster-config-map.yaml index 1e820672a..61b4b3643 100644 --- a/k8s/clusters/prod/variables/variables-cluster-config-map.yaml +++ b/k8s/clusters/prod/variables/variables-cluster-config-map.yaml @@ -86,6 +86,17 @@ data: # CSI snapshotter crash-loops trying to list them. Disable until CRDs # are added. longhorn_csi_snapshotter_replicas: "0" + # CSI control-plane sidecars (attacher/provisioner/resizer) default to a + # single replica in the base HelmRelease. 
They are leader-elected, so a + # second replica is a cheap warm standby that costs ~nothing when idle. + # Run 2 for HA: on 2026-05-28 the sole csi-attacher/provisioner happened to + # sit on a worker whose Cilium ClusterIP datapath had degraded, which took + # out volume attach/detach orchestration cluster-wide (FailedAttachVolume + # storms, pods stuck in ContainerCreating). A standby on another node keeps CSI + # functioning through a single-node outage. + longhorn_csi_attacher_replicas: "2" + longhorn_csi_provisioner_replicas: "2" + longhorn_csi_resizer_replicas: "2" # --- Non-essential services scaled to 0 --- # Origin CA issuer requests 1 CPU (50% of worker allocatable) for a simple # cert controller. Disabled until we actually need Cloudflare Origin certs.