From c6a948194d3223c924618f1b32d5695216861060 Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm <nikolaiemildamm@icloud.com>
Date: Sat, 30 May 2026 09:10:41 +0200
Subject: [PATCH 1/3] feat(node): handle memory pressure gracefully and protect
 critical kube-system pods
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two complementary changes so the prod (Hetzner) cluster degrades gracefully
under node memory pressure instead of letting the kernel OOM-killer reap a
node-critical daemon.

1. talos/cluster/kubelet.yaml — kubelet memory eviction config:
   - systemReserved/kubeReserved (256Mi each) carve headroom out of
     node-allocatable so pods can never starve the OS or the kubelet.
   - evictionSoft (memory.available<500Mi, 90s grace) sheds the lowest-priority
     pods *before* the hard floor, with evictionMaxPodGracePeriod=60 bounding
     the drain and evictionMinimumReclaim=200Mi avoiding immediate re-trigger.
   - evictionHard (memory.available<100Mi) stays as the last-resort floor.
   Node-pressure eviction is priority-aware and skips critical pods, so
   workload pods are always shed before the kube-system control/storage plane.

2. hcloud-csi HelmRelease — assign priority classes (chart default leaves both
   unset → priority 0, making them first eviction candidates):
   - controller -> system-cluster-critical (provisioning/attach control plane)
   - node (DaemonSet) -> system-node-critical (per-node volume mount/unmount)
   Critical pods are exempt from node-pressure eviction, so storage keeps
   working under memory pressure. (hcloud-ccm already defaults to
   system-cluster-critical, so no change there.)

Scope: prod only. The local/CI Docker cluster shares host memory across its
node containers, so absolute eviction thresholds could evict pods during the
CI system test. It is deliberately left out here and can be added later.

Validation:
- talosctl gen + machineconfig patch + validate -m cloud --strict on both
  controlplane and worker: valid; extraConfig merges as expected.
- ksail --config ksail.prod.yaml workload validate: 256 files validated.
- ksail workload validate (local): 256 files validated (shared build intact).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../controllers/hcloud-csi/helm-release.yaml  | 10 +++++
 talos/cluster/kubelet.yaml                    | 42 +++++++++++++++++++
 2 files changed, 52 insertions(+)
 create mode 100644 talos/cluster/kubelet.yaml

diff --git a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml
index a29940455..20dda7eab 100644
--- a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml
+++ b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml
@@ -33,6 +33,11 @@ spec:
       # it avoids "topology not in requisite" provisioning failures.
       enableProvidedByTopology: false
     controller:
+      # Provisioning/attach control plane. The chart leaves priorityClassName
+      # unset (priority 0), which makes it a first candidate for node-pressure
+      # eviction. system-cluster-critical exempts it so volume operations keep
+      # working when a node is under memory pressure.
+      priorityClassName: system-cluster-critical
       replicaCount: ${hcloud_csi_controller_replicas:=2}
       hcloudVolumeDefaultLocation: fsn1
       podDisruptionBudget:
@@ -50,6 +55,11 @@ spec:
                     app.kubernetes.io/name: hcloud-csi
                     app.kubernetes.io/instance: hcloud-csi
     node:
+      # Per-node volume mount/unmount plumbing — node-critical. The chart
+      # leaves priorityClassName unset (priority 0); system-node-critical
+      # exempts the DaemonSet from node-pressure eviction so stateful pods can
+      # always mount their storage.
+      priorityClassName: system-node-critical
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
diff --git a/talos/cluster/kubelet.yaml b/talos/cluster/kubelet.yaml
new file mode 100644
index 000000000..e2605e27f
--- /dev/null
+++ b/talos/cluster/kubelet.yaml
@@ -0,0 +1,42 @@
+# Kubelet memory-pressure handling — graceful node OOM.
+#
+# Out of the box the kubelet only steps in at its tiny default hard floor
+# (memory.available<100Mi), by which point the Linux kernel OOM-killer may
+# already have reaped a node-critical daemon. This patch makes a node shed
+# load *gracefully* well before that point and guarantees the OS + kubelet
+# always keep headroom:
+#
+#   - systemReserved / kubeReserved carve memory out of node-allocatable so
+#     pods can never starve the OS or the kubelet itself — the usual trigger
+#     for an ungraceful kernel OOM-kill of a system daemon.
+#   - evictionSoft begins reclaiming ~90s before the hard floor by evicting
+#     the lowest-priority / over-request pods first, giving them time to
+#     terminate cleanly (evictionMaxPodGracePeriod bounds that wait).
+#   - evictionHard stays as the last-resort floor before the kernel acts.
+#
+# Node-pressure eviction is priority-aware and skips *critical* pods
+# (system-node-critical / system-cluster-critical), so workload pods are
+# always shed before the critical kube-system control/storage plane.
+#
+# Reference: https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/
+machine:
+  kubelet:
+    extraConfig:
+      # Headroom reserved from allocatable for the OS and kubelet/runtime.
+      systemReserved:
+        memory: 256Mi
+      kubeReserved:
+        memory: 256Mi
+      # Graceful first line: evict low-priority pods before memory is critical.
+      evictionSoft:
+        memory.available: 500Mi
+      evictionSoftGracePeriod:
+        memory.available: 1m30s
+      # Reclaim a little past the threshold so we don't immediately re-trigger.
+      evictionMinimumReclaim:
+        memory.available: 200Mi
+      # Bound how long a soft-evicted pod may take to terminate.
+      evictionMaxPodGracePeriod: 60
+      # Hard floor: last resort before the kernel OOM-killer.
+      evictionHard:
+        memory.available: 100Mi

From 302d2cf0e46899a2cee8307a3707dc1304b303dc Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm <nikolaiemildamm@icloud.com>
Date: Sat, 30 May 2026 10:37:01 +0200
Subject: [PATCH 2/3] feat(qos): protect monitoring stack and CDI from
 node-pressure eviction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Audited the priority/QoS of every cluster-critical component (CNI, CSI, CDI,
monitoring, control plane, CCM, DNS). Most were already covered by chart or
platform defaults:
  - Cilium agent -> system-node-critical, operator -> system-cluster-critical
    (chart helper fallback)
  - Longhorn -> longhorn-critical (chart default on all components)
  - hcloud-ccm + metrics-server -> system-cluster-critical (chart default)
  - hcloud-csi -> set in the prior commit
  - CoreDNS -> system-cluster-critical (docker: explicit; prod: Talos-managed)
  - control plane (apiserver/etcd/scheduler/controller-manager) -> Talos static
    pods, inherently exempt from kubelet node-pressure eviction

Gaps closed here:
  - Monitoring stack (kube-prometheus-stack) ran at priority 0. New
    `platform-critical` PriorityClass (value 1000000000 — above workloads,
    below the system-* classes so true infra still wins; NOT eviction-exempt,
    so a runaway Prometheus is still reclaimable before kernel OOM) applied to
    Prometheus, Alertmanager, the operator, node-exporter, and kube-state-metrics.
  - CDI control plane ran at priority 0. Set the CDI CR `spec.priorityClass` to
    the existing `kubevirt-cluster-critical` (CDI is a KubeVirt subproject; that
    class is already created by the KubeVirt operator).

Validation:
- ksail workload validate (local + prod): 257 files each.
- kubectl kustomize hetzner controllers build: PriorityClass renders; all 5
  monitoring priorityClassName refs + CDI priorityClass present.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../controllers/cdi/cdi-cr.yaml               |  7 ++++-
 .../kube-prometheus-stack/helm-release.yaml   | 12 +++++++++
 .../controllers/kustomization.yaml            |  1 +
 .../priority-classes/kustomization.yaml       |  5 ++++
 .../priority-classes/platform-critical.yaml   | 26 +++++++++++++++++++
 5 files changed, 50 insertions(+), 1 deletion(-)
 create mode 100644 k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml
 create mode 100644 k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml

diff --git a/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml b/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml
index 4b2a567d9..533bcc4cc 100644
--- a/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml
+++ b/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml
@@ -4,4 +4,9 @@ apiVersion: cdi.kubevirt.io/v1beta1
 kind: CDI
 metadata:
   name: cdi
-spec: {}
+spec:
+  # Run the CDI control plane (cdi-apiserver / cdi-controller / cdi-uploadproxy)
+  # at the KubeVirt-ecosystem critical tier so it survives node memory pressure.
+  # CDI is a KubeVirt subproject and kubevirt-cluster-critical (value
+  # 1000000000) is created by the KubeVirt operator already deployed here.
+  priorityClass: kubevirt-cluster-critical
diff --git a/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml b/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml
index d00dec1dd..526eee29c 100644
--- a/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml
+++ b/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml
@@ -44,6 +44,10 @@ spec:
     prometheus:
       prometheusSpec:
         replicas: 1
+        # Monitoring is critical platform infrastructure — keep it alive under
+        # node memory pressure (evicted only after normal workloads). See the
+        # platform-critical PriorityClass.
+        priorityClassName: platform-critical
         retention: 14d
         retentionSize: "5GiB"
         resources:
@@ -72,6 +76,9 @@ spec:
     alertmanager:
       alertmanagerSpec:
         replicas: ${alertmanager_replicas:=2}
+        # Alerting must survive memory pressure (it is how a degrading node is
+        # noticed in the first place).
+        priorityClassName: platform-critical
         podDisruptionBudget:
           enabled: true
           minAvailable: 1
@@ -138,8 +145,12 @@ spec:
 
     nodeExporter:
       enabled: true
+    # node-exporter subchart passthrough (alias `prometheus-node-exporter`).
+    prometheus-node-exporter:
+      priorityClassName: platform-critical
     kube-state-metrics:
       enabled: true
+      priorityClassName: platform-critical
 
     # Disable the bundled "default" Prometheus rules -- the chart ships
     # ~200 alerts that mostly aren't relevant for a homelab. We add our
@@ -148,6 +159,7 @@ spec:
       create: false
 
     prometheusOperator:
+      priorityClassName: platform-critical
       resources:
         requests:
           cpu: 50m
diff --git a/k8s/bases/infrastructure/controllers/kustomization.yaml b/k8s/bases/infrastructure/controllers/kustomization.yaml
index 6ee6f468e..0a45a72b2 100644
--- a/k8s/bases/infrastructure/controllers/kustomization.yaml
+++ b/k8s/bases/infrastructure/controllers/kustomization.yaml
@@ -20,6 +20,7 @@ resources:
   - oauth2-proxy/
   - openbao/
   - opencost/
+  - priority-classes/
   - reloader/
   - trust-manager/
   - velero/
diff --git a/k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml b/k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml
new file mode 100644
index 000000000..9f05c265d
--- /dev/null
+++ b/k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml
@@ -0,0 +1,5 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - platform-critical.yaml
diff --git a/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml
new file mode 100644
index 000000000..19af7600b
--- /dev/null
+++ b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml
@@ -0,0 +1,26 @@
+# Priority tier for important platform add-ons that are NOT core Kubernetes /
+# node infrastructure but must still survive node memory pressure — currently
+# the monitoring & alerting stack (kube-prometheus-stack).
+#
+# Sits ABOVE normal workloads (priority 0) so the kubelet evicts workload pods
+# first, but BELOW the built-in system-* classes (2000000000) so true cluster
+# and node infrastructure — CNI (Cilium), CSI (hcloud-csi/Longhorn), DNS
+# (CoreDNS), cloud-controller-manager, and the control plane — always outrank
+# it. Value matches kubevirt-cluster-critical (1000000000): both are the
+# "important platform add-on" tier.
+#
+# Deliberately a custom class rather than system-cluster-critical: system-*
+# pods are *exempt* from node-pressure eviction entirely, which would let a
+# runaway Prometheus drive the node into the kernel OOM-killer instead of being
+# reclaimed. platform-critical is "evicted last" without being un-evictable.
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: platform-critical
+value: 1000000000
+globalDefault: false
+description: >-
+  Important platform add-ons (monitoring/alerting) that must survive node
+  memory pressure. Ranked far above normal workloads but below the system-*
+  classes reserved for core cluster/node infrastructure. Not eviction-exempt.

From a282f22c8624bc648c0dc02a20d2b9b3309ba102 Mon Sep 17 00:00:00 2001
From: Nikolai Emil Damm <nikolaiemildamm@icloud.com>
Date: Sat, 30 May 2026 11:06:20 +0200
Subject: [PATCH 3/3] docs(qos): correct eviction wording and order
 PriorityClass before its consumers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address Copilot review feedback on #1667:
- Reword eviction comments: node-pressure eviction ranks pods by Priority
  (critical pods are the *last* eviction candidates), not a strict exemption.
  Fix the evictionSoft description — it triggers when memory.available stays
  below the soft threshold (500Mi) for the grace period (1m30s), not "~90s
  before the hard floor".
- Move priority-classes/ to the top of the controllers kustomization so the
  platform-critical PriorityClass is applied before the HelmReleases (e.g.
  kube-prometheus-stack) that reference it, avoiding a fresh-reconcile race.

No manifest fields change: this commit only touches comments and the order of
the entries in the controllers kustomization `resources` list.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../infrastructure/controllers/kustomization.yaml |  4 +++-
 .../priority-classes/platform-critical.yaml       | 10 ++++++----
 .../controllers/hcloud-csi/helm-release.yaml      | 12 ++++++------
 talos/cluster/kubelet.yaml                        | 15 +++++++++------
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/k8s/bases/infrastructure/controllers/kustomization.yaml b/k8s/bases/infrastructure/controllers/kustomization.yaml
index 0a45a72b2..6fe53aa5d 100644
--- a/k8s/bases/infrastructure/controllers/kustomization.yaml
+++ b/k8s/bases/infrastructure/controllers/kustomization.yaml
@@ -2,6 +2,9 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  # PriorityClasses first — HelmReleases below (e.g. kube-prometheus-stack)
+  # reference platform-critical, so it must be applied before they install.
+  - priority-classes/
   - auth-proxy/
   - cdi/
   - cert-manager/
@@ -20,7 +23,6 @@ resources:
   - oauth2-proxy/
   - openbao/
   - opencost/
-  - priority-classes/
   - reloader/
   - trust-manager/
   - velero/
diff --git a/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml
index 19af7600b..8d35c337d 100644
--- a/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml
+++ b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml
@@ -9,10 +9,12 @@
 # it. Value matches kubevirt-cluster-critical (1000000000): both are the
 # "important platform add-on" tier.
 #
-# Deliberately a custom class rather than system-cluster-critical: system-*
-# pods are *exempt* from node-pressure eviction entirely, which would let a
-# runaway Prometheus drive the node into the kernel OOM-killer instead of being
-# reclaimed. platform-critical is "evicted last" without being un-evictable.
+# Deliberately a custom class rather than system-cluster-critical: putting the
+# monitoring stack at the very top would rank it alongside core infra, so a
+# memory-hungry Prometheus would be reclaimed only after everything else —
+# pushing the node toward the kernel OOM-killer. platform-critical keeps it
+# "evicted after normal workloads" while still leaving it reclaimable before
+# true cluster/node infrastructure.
 ---
 apiVersion: scheduling.k8s.io/v1
 kind: PriorityClass
diff --git a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml
index 20dda7eab..0acb95bdb 100644
--- a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml
+++ b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml
@@ -34,9 +34,9 @@ spec:
       enableProvidedByTopology: false
     controller:
       # Provisioning/attach control plane. The chart leaves priorityClassName
-      # unset (priority 0), which makes it a first candidate for node-pressure
-      # eviction. system-cluster-critical exempts it so volume operations keep
-      # working when a node is under memory pressure.
+      # unset by default (priority 0), which makes it a first candidate for
+      # node-pressure eviction. system-cluster-critical ranks it among the last
+      # eviction candidates so volume operations keep working under pressure.
       priorityClassName: system-cluster-critical
       replicaCount: ${hcloud_csi_controller_replicas:=2}
       hcloudVolumeDefaultLocation: fsn1
@@ -56,9 +56,9 @@ spec:
                     app.kubernetes.io/instance: hcloud-csi
     node:
       # Per-node volume mount/unmount plumbing — node-critical. The chart
-      # leaves priorityClassName unset (priority 0); system-node-critical
-      # exempts the DaemonSet from node-pressure eviction so stateful pods can
-      # always mount their storage.
+      # leaves priorityClassName unset by default (priority 0); system-node-critical
+      # ranks the DaemonSet among the last eviction candidates so stateful pods
+      # can keep mounting their storage under node memory pressure.
       priorityClassName: system-node-critical
       affinity:
         nodeAffinity:
diff --git a/talos/cluster/kubelet.yaml b/talos/cluster/kubelet.yaml
index e2605e27f..52821ba8b 100644
--- a/talos/cluster/kubelet.yaml
+++ b/talos/cluster/kubelet.yaml
@@ -9,14 +9,17 @@
 #   - systemReserved / kubeReserved carve memory out of node-allocatable so
 #     pods can never starve the OS or the kubelet itself — the usual trigger
 #     for an ungraceful kernel OOM-kill of a system daemon.
-#   - evictionSoft begins reclaiming ~90s before the hard floor by evicting
-#     the lowest-priority / over-request pods first, giving them time to
-#     terminate cleanly (evictionMaxPodGracePeriod bounds that wait).
+#   - evictionSoft starts reclaiming once memory.available stays below the
+#     soft threshold (500Mi) for the grace period (1m30s), evicting the
+#     lowest-priority / over-request pods first and letting them terminate
+#     cleanly (evictionMaxPodGracePeriod bounds that drain) — well before the
+#     much lower hard floor.
 #   - evictionHard stays as the last-resort floor before the kernel acts.
 #
-# Node-pressure eviction is priority-aware and skips *critical* pods
-# (system-node-critical / system-cluster-critical), so workload pods are
-# always shed before the critical kube-system control/storage plane.
+# Node-pressure eviction ranks pods by Priority, so high-priority *critical*
+# pods (system-node-critical / system-cluster-critical) are the last eviction
+# candidates: workload pods are shed before the critical kube-system
+# control/storage plane. (Priority ordering, not a strict exemption.)
 #
 # Reference: https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/
 machine: