devantler-tech · devantler · May 30, 2026 · May 30, 2026 · May 30, 2026
@@ -4,4 +4,9 @@ apiVersion: cdi.kubevirt.io/v1beta1
 kind: CDI
 metadata:
   name: cdi
-spec: {}
+spec:
+  # Run the CDI control plane (cdi-apiserver / cdi-controller / cdi-uploadproxy)
+  # at the KubeVirt-ecosystem critical tier so it survives node memory pressure.
+  # CDI is a KubeVirt subproject and kubevirt-cluster-critical (value
+  # 1000000000) is created by the KubeVirt operator already deployed here.
+  priorityClass: kubevirt-cluster-critical
@@ -44,6 +44,10 @@ spec:
     prometheus:
       prometheusSpec:
         replicas: 1
+        # Monitoring is critical platform infrastructure — keep it alive under
+        # node memory pressure (evicted after normal workloads, provided usage
+        # stays within requests). See the platform-critical PriorityClass.
+        priorityClassName: platform-critical
         retention: 14d
         retentionSize: "5GiB"
         resources:
@@ -72,6 +76,9 @@ spec:
     alertmanager:
       alertmanagerSpec:
         replicas: ${alertmanager_replicas:=2}
+        # Alerting must survive memory pressure (it is how a degrading node is
+        # noticed in the first place).
+        priorityClassName: platform-critical
         podDisruptionBudget:
           enabled: true
           minAvailable: 1
@@ -138,8 +145,12 @@ spec:
 
     nodeExporter:
       enabled: true
+    # Values for the node-exporter subchart, keyed by its chart name.
+    prometheus-node-exporter:
+      priorityClassName: platform-critical
     kube-state-metrics:
       enabled: true
+      priorityClassName: platform-critical
 
     # Disable the bundled "default" Prometheus rules -- the chart ships
     # ~200 alerts that mostly aren't relevant for a homelab. We add our
@@ -148,6 +159,7 @@ spec:
       create: false
 
     prometheusOperator:
+      priorityClassName: platform-critical
       resources:
         requests:
           cpu: 50m

@@ -2,6 +2,9 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  # PriorityClasses first — HelmReleases below (e.g. kube-prometheus-stack)
+  # reference platform-critical (list order is for readers, not apply order).
+  - priority-classes/
   - auth-proxy/
   - cdi/
   - cert-manager/

@@ -0,0 +1,5 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - platform-critical.yaml  # PriorityClass for the monitoring/alerting stack
@@ -0,0 +1,28 @@
+# Priority tier for important platform add-ons that are NOT core Kubernetes /
+# node infrastructure but must still survive node memory pressure — currently
+# the monitoring & alerting stack (kube-prometheus-stack).
+#
+# Sits ABOVE normal workloads (priority 0) so the kubelet evicts workload pods
+# first, but BELOW the built-in system-* classes (>= 2000000000) so true cluster
+# and node infrastructure — CNI (Cilium), CSI (hcloud-csi/Longhorn), DNS
+# (CoreDNS), cloud-controller-manager, and the control plane — always outrank
+# it. Value matches kubevirt-cluster-critical (1000000000): both are the
+# "important platform add-on" tier.
+#
+# Deliberately a custom class rather than system-cluster-critical: putting the
+# monitoring stack at the very top would rank it alongside core infra, so a
+# memory-hungry Prometheus would be reclaimed only after everything else —
+# pushing the node toward the kernel OOM-killer. platform-critical keeps it
+# "evicted after normal workloads" while still leaving it reclaimable before
+# true cluster/node infrastructure.
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: platform-critical
+value: 1000000000  # same tier as kubevirt-cluster-critical; below system-*
+globalDefault: false  # opt-in: only pods that name this class get its priority
+description: >-
+  Important platform add-ons (monitoring/alerting) that must survive node
+  memory pressure. Ranked far above normal workloads but below the system-*
+  classes reserved for core cluster/node infrastructure. Not eviction-exempt.
@@ -33,6 +33,11 @@ spec:
       # it avoids "topology not in requisite" provisioning failures.
       enableProvidedByTopology: false
     controller:
+      # Provisioning/attach control plane. The chart leaves priorityClassName
+      # unset by default (priority 0), which makes it a first candidate for
+      # node-pressure eviction. system-cluster-critical ranks it among the last
+      # eviction candidates so volume operations keep working under pressure.
+      priorityClassName: system-cluster-critical
       replicaCount: ${hcloud_csi_controller_replicas:=2}
       hcloudVolumeDefaultLocation: fsn1
       podDisruptionBudget:
@@ -50,6 +55,11 @@ spec:
                     app.kubernetes.io/name: hcloud-csi
                     app.kubernetes.io/instance: hcloud-csi
     node:
+      # Per-node volume mount/unmount plumbing — node-critical. The chart
+      # leaves priorityClassName unset by default (priority 0); system-node-critical
+      # ranks the DaemonSet among the last eviction candidates so stateful pods
+      # can keep mounting their storage under node memory pressure.
+      priorityClassName: system-node-critical
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:

@@ -0,0 +1,55 @@
+# Kubelet memory-pressure handling — graceful node OOM.
+#
+# Out of the box the kubelet only steps in at its tiny default hard floor
+# (memory.available<100Mi), by which point the Linux kernel OOM-killer may
+# already have reaped a node-critical daemon. This patch makes a node shed
+# load *gracefully* well before that point and guarantees the OS + kubelet
+# always keep headroom:
+#
+#   - systemReserved / kubeReserved carve memory out of node-allocatable so
+#     pods can never starve the OS or the kubelet itself — the usual trigger
+#     for an ungraceful kernel OOM-kill of a system daemon.
+#   - evictionSoft starts reclaiming once memory.available stays below the
+#     soft threshold (500Mi) for the grace period (1m30s), evicting the
+#     lowest-priority / over-request pods first and letting them terminate
+#     cleanly (evictionMaxPodGracePeriod bounds that drain) — well before the
+#     much lower hard floor.
+#   - evictionHard stays as the last-resort floor before the kernel acts. All
+#     hard signals are spelled out: once evictionHard is set, the kubelet no
+#     longer applies its built-in defaults to signals left out of the map, so
+#     a memory-only map would switch off hard disk-pressure eviction.
+#
+# Node-pressure eviction ranks pods first by whether usage exceeds requests
+# and then by Priority, so *critical* pods (system-node-critical /
+# system-cluster-critical) that stay within their requests are the last
+# eviction candidates: workload pods are shed before the critical kube-system
+# control/storage plane. (An ordering, not a strict exemption.)
+#
+# Reference: https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/
+machine:
+  kubelet:
+    extraConfig:
+      # Headroom reserved from allocatable for the OS and kubelet/runtime.
+      systemReserved:
+        memory: 256Mi
+      kubeReserved:
+        memory: 256Mi
+      # Graceful first line: evict low-priority pods before memory is critical.
+      evictionSoft:
+        memory.available: 500Mi
+      evictionSoftGracePeriod:
+        memory.available: 1m30s
+      # Reclaim a little past the threshold so we don't immediately re-trigger.
+      evictionMinimumReclaim:
+        memory.available: 200Mi
+      # Bound how long a soft-evicted pod may take to terminate.
+      evictionMaxPodGracePeriod: 60
+      # Hard floor: last resort before the kernel OOM-killer. The disk signals
+      # mirror the kubelet's documented Linux defaults; omitting them here
+      # would leave them unset (no hard disk-pressure eviction), not defaulted.
+      evictionHard:
+        memory.available: 100Mi
+        nodefs.available: "10%"
+        nodefs.inodesFree: "5%"
+        imagefs.available: "15%"
+        imagefs.inodesFree: "5%"