diff --git a/k8s/bases/infrastructure/controllers/cilium/helm-release.yaml b/k8s/bases/infrastructure/controllers/cilium/helm-release.yaml index e8184ccf3..3c57fc71d 100644 --- a/k8s/bases/infrastructure/controllers/cilium/helm-release.yaml +++ b/k8s/bases/infrastructure/controllers/cilium/helm-release.yaml @@ -79,6 +79,10 @@ spec: enabled: false operator: replicas: ${cilium_replicas:=2} + resources: + requests: + cpu: 100m + memory: 256Mi podDisruptionBudget: enabled: true minAvailable: 1 @@ -94,6 +98,24 @@ spec: ipam: mode: kubernetes kubeProxyReplacement: true + # ------------------------------------------------------------------ + # Resource requests for the agent DaemonSet and the standalone + # cilium-envoy DaemonSet. These promote the pods out of BestEffort + # QoS to better withstand node memory pressure; an OOMKilled cilium-agent + # leaves BPF state degraded and the node loses ClusterIP routing + # (observed cascading into ~13 workload crash-loops on prod-worker-2, + # 2026-05-28). Limits intentionally unset — Cilium recommends against + # capping the agent (https://docs.cilium.io/en/stable/operations/performance/). + # ------------------------------------------------------------------ + resources: + requests: + cpu: 200m + memory: 512Mi + envoy: + resources: + requests: + cpu: 50m + memory: 128Mi # Transparent WireGuard encryption for all pod-to-pod and node-to-node # traffic. KubeSpan (Talos-layer WireGuard between nodes) is not # enabled in this cluster, so without this setting inter-node pod @@ -118,8 +140,23 @@ spec: install: namespace: kube-system existingNamespace: true + # Resource requests promote spire-server and spire-agent pods + # out of BestEffort QoS. cilium-agent's SPIRE Delegate API + # client relies on the per-node spire-agent admin socket — if + # spire-agent is evicted/OOMKilled, the cilium-agent on that + # node stays stuck retrying "SPIRE admin socket does not + # exist" and ClusterIP routing degrades alongside it. 
+ agent: + resources: + requests: + cpu: 50m + memory: 128Mi # TODO: Remove workaround when SPIRE no longer fails to start (https://github.com/cilium/cilium/issues/40533) server: + resources: + requests: + cpu: 50m + memory: 128Mi initContainers: - command: - /bin/sh