From c6a948194d3223c924618f1b32d5695216861060 Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Sat, 30 May 2026 09:10:41 +0200 Subject: [PATCH 1/3] feat(node): handle memory pressure gracefully and protect critical kube-system pods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two complementary changes so the prod (Hetzner) cluster degrades gracefully under node memory pressure instead of letting the kernel OOM-killer reap a node-critical daemon. 1. talos/cluster/kubelet.yaml — kubelet memory eviction config: - systemReserved/kubeReserved (256Mi each) carve headroom out of node-allocatable so pods can never starve the OS or the kubelet. - evictionSoft (memory.available<500Mi, 90s grace) sheds the lowest-priority pods *before* the hard floor, with evictionMaxPodGracePeriod=60 bounding the drain and evictionMinimumReclaim=200Mi avoiding immediate re-trigger. - evictionHard (memory.available<100Mi) stays as the last-resort floor. Node-pressure eviction is priority-aware and skips critical pods, so workload pods are always shed before the kube-system control/storage plane. 2. hcloud-csi HelmRelease — assign priority classes (chart default leaves both unset → priority 0, making them first eviction candidates): - controller -> system-cluster-critical (provisioning/attach control plane) - node (DaemonSet) -> system-node-critical (per-node volume mount/unmount) Critical pods are exempt from node-pressure eviction, so storage keeps working under memory pressure. (hcloud-ccm already defaults to system-cluster-critical, so no change there.) Scope: prod only. The local/CI Docker cluster shares host memory across node containers, where absolute eviction thresholds could evict pods during the CI system-test; left out deliberately and can be added if desired. Validation: - talosctl gen + machineconfig patch + validate -m cloud --strict on both controlplane and worker: valid; extraConfig merges as expected. 
- ksail --config ksail.prod.yaml workload validate: 256 files validated. - ksail workload validate (local): 256 files validated (shared build intact). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../controllers/hcloud-csi/helm-release.yaml | 10 +++++ talos/cluster/kubelet.yaml | 42 +++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 talos/cluster/kubelet.yaml diff --git a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml index a29940455..20dda7eab 100644 --- a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml +++ b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml @@ -33,6 +33,11 @@ spec: # it avoids "topology not in requisite" provisioning failures. enableProvidedByTopology: false controller: + # Provisioning/attach control plane. The chart leaves priorityClassName + # unset (priority 0), which makes it a first candidate for node-pressure + # eviction. system-cluster-critical exempts it so volume operations keep + # working when a node is under memory pressure. + priorityClassName: system-cluster-critical replicaCount: ${hcloud_csi_controller_replicas:=2} hcloudVolumeDefaultLocation: fsn1 podDisruptionBudget: @@ -50,6 +55,11 @@ spec: app.kubernetes.io/name: hcloud-csi app.kubernetes.io/instance: hcloud-csi node: + # Per-node volume mount/unmount plumbing — node-critical. The chart + # leaves priorityClassName unset (priority 0); system-node-critical + # exempts the DaemonSet from node-pressure eviction so stateful pods can + # always mount their storage. 
+ priorityClassName: system-node-critical affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/talos/cluster/kubelet.yaml b/talos/cluster/kubelet.yaml new file mode 100644 index 000000000..e2605e27f --- /dev/null +++ b/talos/cluster/kubelet.yaml @@ -0,0 +1,49 @@ +# Kubelet memory-pressure handling — graceful node OOM. +# +# Out of the box the kubelet only steps in at its tiny default hard floor +# (memory.available<100Mi), by which point the Linux kernel OOM-killer may +# already have reaped a node-critical daemon. This patch makes a node shed +# load *gracefully* well before that point and guarantees the OS + kubelet +# always keep headroom: +# +# - systemReserved / kubeReserved carve memory out of node-allocatable so +# pods can never starve the OS or the kubelet itself — the usual trigger +# for an ungraceful kernel OOM-kill of a system daemon. +# - evictionSoft begins reclaiming ~90s before the hard floor by evicting +# the lowest-priority / over-request pods first, giving them time to +# terminate cleanly (evictionMaxPodGracePeriod bounds that wait). +# - evictionHard stays as the last-resort floor before the kernel acts. +# +# Node-pressure eviction is priority-aware and skips *critical* pods +# (system-node-critical / system-cluster-critical), so workload pods are +# always shed before the critical kube-system control/storage plane. +# +# Reference: https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/ +machine: + kubelet: + extraConfig: + # Headroom reserved from allocatable for the OS and kubelet/runtime. + systemReserved: + memory: 256Mi + kubeReserved: + memory: 256Mi + # Graceful first line: evict low-priority pods before memory is critical. + evictionSoft: + memory.available: 500Mi + evictionSoftGracePeriod: + memory.available: 1m30s + # Reclaim a little past the threshold so we don't immediately re-trigger.
+ evictionMinimumReclaim: + memory.available: 200Mi + # Bound how long a soft-evicted pod may take to terminate. + evictionMaxPodGracePeriod: 60 + # Hard floor: last resort before the kernel OOM-killer. + evictionHard: + memory.available: 100Mi + # Setting evictionHard replaces the kubelet's built-in default map rather + # than merging into it, so restate the default disk thresholds to keep + # hard disk-pressure eviction enabled. + nodefs.available: "10%" + nodefs.inodesFree: "5%" + imagefs.available: "15%" + imagefs.inodesFree: "5%" From 302d2cf0e46899a2cee8307a3707dc1304b303dc Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Sat, 30 May 2026 10:37:01 +0200 Subject: [PATCH 2/3] feat(qos): protect monitoring stack and CDI from node-pressure eviction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thorough audit of every cluster-critical component's priority/QoS (CNI, CSI, CDI, monitoring, control plane, CCM, DNS). Most were already covered by chart or platform defaults: - Cilium agent -> system-node-critical, operator -> system-cluster-critical (chart helper fallback) - Longhorn -> longhorn-critical (chart default on all components) - hcloud-ccm + metrics-server -> system-cluster-critical (chart default) - hcloud-csi -> set in the prior commit - CoreDNS -> system-cluster-critical (docker: explicit; prod: Talos-managed) - control plane (apiserver/etcd/scheduler/controller-manager) -> Talos static pods, inherently exempt from kubelet node-pressure eviction Gaps closed here: - Monitoring stack (kube-prometheus-stack) ran at priority 0. New `platform-critical` PriorityClass (value 1000000000 — above workloads, below the system-* classes so true infra still wins; NOT eviction-exempt, so a runaway Prometheus is still reclaimable before kernel OOM) applied to Prometheus, Alertmanager, the operator, node-exporter, and kube-state-metrics. - CDI control plane ran at priority 0. Set the CDI CR `spec.priorityClass` to the existing `kubevirt-cluster-critical` (CDI is a KubeVirt subproject; that class is already created by the KubeVirt operator). Validation: - ksail workload validate (local + prod): 257 files each.
- kubectl kustomize hetzner controllers build: PriorityClass renders; all 5 monitoring priorityClassName refs + CDI priorityClass present. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../controllers/cdi/cdi-cr.yaml | 7 ++++- .../kube-prometheus-stack/helm-release.yaml | 12 +++++++++ .../controllers/kustomization.yaml | 1 + .../priority-classes/kustomization.yaml | 5 ++++ .../priority-classes/platform-critical.yaml | 26 +++++++++++++++++++ 5 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml create mode 100644 k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml diff --git a/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml b/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml index 4b2a567d9..533bcc4cc 100644 --- a/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml +++ b/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml @@ -4,4 +4,9 @@ apiVersion: cdi.kubevirt.io/v1beta1 kind: CDI metadata: name: cdi -spec: {} +spec: + # Run the CDI control plane (cdi-apiserver / cdi-controller / cdi-uploadproxy) + # at the KubeVirt-ecosystem critical tier so it survives node memory pressure. + # CDI is a KubeVirt subproject and kubevirt-cluster-critical (value + # 1000000000) is created by the KubeVirt operator already deployed here. + priorityClass: kubevirt-cluster-critical diff --git a/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml b/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml index d00dec1dd..526eee29c 100644 --- a/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml +++ b/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml @@ -44,6 +44,10 @@ spec: prometheus: prometheusSpec: replicas: 1 + # Monitoring is critical platform infrastructure — keep it alive under + # node memory pressure (evicted only after normal workloads). 
See the + # platform-critical PriorityClass. + priorityClassName: platform-critical retention: 14d retentionSize: "5GiB" resources: @@ -72,6 +76,9 @@ spec: alertmanager: alertmanagerSpec: replicas: ${alertmanager_replicas:=2} + # Alerting must survive memory pressure (it is how a degrading node is + # noticed in the first place). + priorityClassName: platform-critical podDisruptionBudget: enabled: true minAvailable: 1 @@ -138,8 +145,12 @@ spec: nodeExporter: enabled: true + # node-exporter subchart passthrough (alias `prometheus-node-exporter`). + prometheus-node-exporter: + priorityClassName: platform-critical kube-state-metrics: enabled: true + priorityClassName: platform-critical # Disable the bundled "default" Prometheus rules -- the chart ships # ~200 alerts that mostly aren't relevant for a homelab. We add our @@ -148,6 +159,7 @@ spec: create: false prometheusOperator: + priorityClassName: platform-critical resources: requests: cpu: 50m diff --git a/k8s/bases/infrastructure/controllers/kustomization.yaml b/k8s/bases/infrastructure/controllers/kustomization.yaml index 6ee6f468e..0a45a72b2 100644 --- a/k8s/bases/infrastructure/controllers/kustomization.yaml +++ b/k8s/bases/infrastructure/controllers/kustomization.yaml @@ -20,6 +20,7 @@ resources: - oauth2-proxy/ - openbao/ - opencost/ + - priority-classes/ - reloader/ - trust-manager/ - velero/ diff --git a/k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml b/k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml new file mode 100644 index 000000000..9f05c265d --- /dev/null +++ b/k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - platform-critical.yaml diff --git a/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml new file mode 100644 index 
000000000..19af7600b --- /dev/null +++ b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml @@ -0,0 +1,26 @@ +# Priority tier for important platform add-ons that are NOT core Kubernetes / +# node infrastructure but must still survive node memory pressure — currently +# the monitoring & alerting stack (kube-prometheus-stack). +# +# Sits ABOVE normal workloads (priority 0) so the kubelet evicts workload pods +# first, but BELOW the built-in system-* classes (2000000000) so true cluster +# and node infrastructure — CNI (Cilium), CSI (hcloud-csi/Longhorn), DNS +# (CoreDNS), cloud-controller-manager, and the control plane — always outrank +# it. Value matches kubevirt-cluster-critical (1000000000): both are the +# "important platform add-on" tier. +# +# Deliberately a custom class rather than system-cluster-critical: system-* +# pods are *exempt* from node-pressure eviction entirely, which would let a +# runaway Prometheus drive the node into the kernel OOM-killer instead of being +# reclaimed. platform-critical is "evicted last" without being un-evictable. +--- +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: platform-critical +value: 1000000000 +globalDefault: false +description: >- + Important platform add-ons (monitoring/alerting) that must survive node + memory pressure. Ranked far above normal workloads but below the system-* + classes reserved for core cluster/node infrastructure. Not eviction-exempt. From a282f22c8624bc648c0dc02a20d2b9b3309ba102 Mon Sep 17 00:00:00 2001 From: Nikolai Emil Damm Date: Sat, 30 May 2026 11:06:20 +0200 Subject: [PATCH 3/3] docs(qos): correct eviction wording and order PriorityClass before its consumers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Copilot review feedback on #1667: - Reword eviction comments: node-pressure eviction ranks pods by Priority (critical pods are the *last* eviction candidates), not a strict exemption. 
Fix the evictionSoft description — it triggers when memory.available stays below the soft threshold (500Mi) for the grace period (1m30s), not "~90s before the hard floor". - Move priority-classes/ to the top of the controllers kustomization so the platform-critical PriorityClass is applied before the HelmReleases (e.g. kube-prometheus-stack) that reference it, avoiding a fresh-reconcile race. No behavioural change to the manifests themselves (comments + resource order). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../infrastructure/controllers/kustomization.yaml | 4 +++- .../priority-classes/platform-critical.yaml | 10 ++++++---- .../controllers/hcloud-csi/helm-release.yaml | 12 ++++++------ talos/cluster/kubelet.yaml | 15 +++++++++------ 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/k8s/bases/infrastructure/controllers/kustomization.yaml b/k8s/bases/infrastructure/controllers/kustomization.yaml index 0a45a72b2..6fe53aa5d 100644 --- a/k8s/bases/infrastructure/controllers/kustomization.yaml +++ b/k8s/bases/infrastructure/controllers/kustomization.yaml @@ -2,6 +2,9 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: + # PriorityClasses first — HelmReleases below (e.g. kube-prometheus-stack) + # reference platform-critical, so it must be applied before they install. + - priority-classes/ - auth-proxy/ - cdi/ - cert-manager/ @@ -20,7 +23,6 @@ resources: - oauth2-proxy/ - openbao/ - opencost/ - - priority-classes/ - reloader/ - trust-manager/ - velero/ diff --git a/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml index 19af7600b..8d35c337d 100644 --- a/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml +++ b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml @@ -9,10 +9,12 @@ # it. 
Value matches kubevirt-cluster-critical (1000000000): both are the # "important platform add-on" tier. # -# Deliberately a custom class rather than system-cluster-critical: system-* -# pods are *exempt* from node-pressure eviction entirely, which would let a -# runaway Prometheus drive the node into the kernel OOM-killer instead of being -# reclaimed. platform-critical is "evicted last" without being un-evictable. +# Deliberately a custom class rather than system-cluster-critical: putting the +# monitoring stack at the very top would rank it alongside core infra, so a +# memory-hungry Prometheus would be reclaimed only after everything else — +# pushing the node toward the kernel OOM-killer. platform-critical keeps it +# "evicted after normal workloads" while still leaving it reclaimable before +# true cluster/node infrastructure. --- apiVersion: scheduling.k8s.io/v1 kind: PriorityClass diff --git a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml index 20dda7eab..0acb95bdb 100644 --- a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml +++ b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml @@ -34,9 +34,9 @@ spec: enableProvidedByTopology: false controller: # Provisioning/attach control plane. The chart leaves priorityClassName - # unset (priority 0), which makes it a first candidate for node-pressure - # eviction. system-cluster-critical exempts it so volume operations keep - # working when a node is under memory pressure. + # unset by default (priority 0), which makes it a first candidate for + # node-pressure eviction. system-cluster-critical ranks it among the last + # eviction candidates so volume operations keep working under pressure. 
priorityClassName: system-cluster-critical replicaCount: ${hcloud_csi_controller_replicas:=2} hcloudVolumeDefaultLocation: fsn1 @@ -56,9 +56,9 @@ spec: app.kubernetes.io/instance: hcloud-csi node: # Per-node volume mount/unmount plumbing — node-critical. The chart - # leaves priorityClassName unset (priority 0); system-node-critical - # exempts the DaemonSet from node-pressure eviction so stateful pods can - # always mount their storage. + # leaves priorityClassName unset by default (priority 0); system-node-critical + # ranks the DaemonSet among the last eviction candidates so stateful pods + # can keep mounting their storage under node memory pressure. priorityClassName: system-node-critical affinity: nodeAffinity: diff --git a/talos/cluster/kubelet.yaml b/talos/cluster/kubelet.yaml index e2605e27f..52821ba8b 100644 --- a/talos/cluster/kubelet.yaml +++ b/talos/cluster/kubelet.yaml @@ -9,14 +9,17 @@ # - systemReserved / kubeReserved carve memory out of node-allocatable so # pods can never starve the OS or the kubelet itself — the usual trigger # for an ungraceful kernel OOM-kill of a system daemon. -# - evictionSoft begins reclaiming ~90s before the hard floor by evicting -# the lowest-priority / over-request pods first, giving them time to -# terminate cleanly (evictionMaxPodGracePeriod bounds that wait). +# - evictionSoft starts reclaiming once memory.available stays below the +# soft threshold (500Mi) for the grace period (1m30s), evicting the +# lowest-priority / over-request pods first and letting them terminate +# cleanly (evictionMaxPodGracePeriod bounds that drain) — well before the +# much lower hard floor. # - evictionHard stays as the last-resort floor before the kernel acts. # -# Node-pressure eviction is priority-aware and skips *critical* pods -# (system-node-critical / system-cluster-critical), so workload pods are -# always shed before the critical kube-system control/storage plane. 
+# Node-pressure eviction ranks pods by Priority, so high-priority *critical* +# pods (system-node-critical / system-cluster-critical) are the last eviction +# candidates: workload pods are shed before the critical kube-system +# control/storage plane. (Priority ordering, not a strict exemption.) # # Reference: https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/ machine: