diff --git a/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml b/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml
index 4b2a567d9..533bcc4cc 100644
--- a/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml
+++ b/k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml
@@ -4,4 +4,9 @@ apiVersion: cdi.kubevirt.io/v1beta1
 kind: CDI
 metadata:
   name: cdi
-spec: {}
+spec:
+  # Run the CDI control plane (cdi-apiserver / cdi-controller / cdi-uploadproxy)
+  # at the KubeVirt-ecosystem critical tier so it survives node memory pressure.
+  # CDI is a KubeVirt subproject and kubevirt-cluster-critical (value
+  # 1000000000) is created by the KubeVirt operator already deployed here.
+  priorityClass: kubevirt-cluster-critical
diff --git a/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml b/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml
index d00dec1dd..526eee29c 100644
--- a/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml
+++ b/k8s/bases/infrastructure/controllers/kube-prometheus-stack/helm-release.yaml
@@ -44,6 +44,10 @@ spec:
     prometheus:
       prometheusSpec:
         replicas: 1
+        # Monitoring is critical platform infrastructure — keep it alive under
+        # node memory pressure (evicted only after normal workloads). See the
+        # platform-critical PriorityClass.
+        priorityClassName: platform-critical
         retention: 14d
         retentionSize: "5GiB"
         resources:
@@ -72,6 +76,9 @@ spec:
     alertmanager:
       alertmanagerSpec:
         replicas: ${alertmanager_replicas:=2}
+        # Alerting must survive memory pressure (it is how a degrading node is
+        # noticed in the first place).
+        priorityClassName: platform-critical
       podDisruptionBudget:
         enabled: true
         minAvailable: 1
@@ -138,8 +145,12 @@ spec:

     nodeExporter:
       enabled: true
+    # node-exporter subchart passthrough (alias `prometheus-node-exporter`).
+    prometheus-node-exporter:
+      priorityClassName: platform-critical
     kube-state-metrics:
       enabled: true
+      priorityClassName: platform-critical

     # Disable the bundled "default" Prometheus rules -- the chart ships
     # ~200 alerts that mostly aren't relevant for a homelab. We add our
@@ -148,6 +159,7 @@ spec:
       create: false

     prometheusOperator:
+      priorityClassName: platform-critical
       resources:
         requests:
           cpu: 50m
diff --git a/k8s/bases/infrastructure/controllers/kustomization.yaml b/k8s/bases/infrastructure/controllers/kustomization.yaml
index 6ee6f468e..6fe53aa5d 100644
--- a/k8s/bases/infrastructure/controllers/kustomization.yaml
+++ b/k8s/bases/infrastructure/controllers/kustomization.yaml
@@ -2,6 +2,9 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
+  # PriorityClasses first — HelmReleases below (e.g. kube-prometheus-stack)
+  # reference platform-critical, so it must be applied before they install.
+  - priority-classes/
   - auth-proxy/
   - cdi/
   - cert-manager/
diff --git a/k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml b/k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml
new file mode 100644
index 000000000..9f05c265d
--- /dev/null
+++ b/k8s/bases/infrastructure/controllers/priority-classes/kustomization.yaml
@@ -0,0 +1,5 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - platform-critical.yaml
diff --git a/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml
new file mode 100644
index 000000000..8d35c337d
--- /dev/null
+++ b/k8s/bases/infrastructure/controllers/priority-classes/platform-critical.yaml
@@ -0,0 +1,28 @@
+# Priority tier for important platform add-ons that are NOT core Kubernetes /
+# node infrastructure but must still survive node memory pressure — currently
+# the monitoring & alerting stack (kube-prometheus-stack).
+#
+# Sits ABOVE normal workloads (priority 0) so the kubelet evicts workload pods
+# first, but BELOW the built-in system-* classes (2000000000) so true cluster
+# and node infrastructure — CNI (Cilium), CSI (hcloud-csi/Longhorn), DNS
+# (CoreDNS), cloud-controller-manager, and the control plane — always outrank
+# it. Value matches kubevirt-cluster-critical (1000000000): both are the
+# "important platform add-on" tier.
+#
+# Deliberately a custom class rather than system-cluster-critical: putting the
+# monitoring stack at the very top would rank it alongside core infra, so a
+# memory-hungry Prometheus would be reclaimed only after everything else —
+# pushing the node toward the kernel OOM-killer. platform-critical keeps it
+# "evicted after normal workloads" while still leaving it reclaimable before
+# true cluster/node infrastructure.
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: platform-critical
+value: 1000000000
+globalDefault: false
+description: >-
+  Important platform add-ons (monitoring/alerting) that must survive node
+  memory pressure. Ranked far above normal workloads but below the system-*
+  classes reserved for core cluster/node infrastructure. Not eviction-exempt.
diff --git a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml
index a29940455..0acb95bdb 100644
--- a/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml
+++ b/k8s/providers/hetzner/infrastructure/controllers/hcloud-csi/helm-release.yaml
@@ -33,6 +33,11 @@ spec:
       # it avoids "topology not in requisite" provisioning failures.
       enableProvidedByTopology: false
     controller:
+      # Provisioning/attach control plane. The chart leaves priorityClassName
+      # unset by default (priority 0), which makes it a first candidate for
+      # node-pressure eviction. system-cluster-critical ranks it among the last
+      # eviction candidates so volume operations keep working under pressure.
+      priorityClassName: system-cluster-critical
       replicaCount: ${hcloud_csi_controller_replicas:=2}
       hcloudVolumeDefaultLocation: fsn1
       podDisruptionBudget:
@@ -50,6 +55,11 @@ spec:
               app.kubernetes.io/name: hcloud-csi
               app.kubernetes.io/instance: hcloud-csi
     node:
+      # Per-node volume mount/unmount plumbing — node-critical. The chart
+      # leaves priorityClassName unset by default (priority 0); system-node-critical
+      # ranks the DaemonSet among the last eviction candidates so stateful pods
+      # can keep mounting their storage under node memory pressure.
+      priorityClassName: system-node-critical
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
diff --git a/talos/cluster/kubelet.yaml b/talos/cluster/kubelet.yaml
new file mode 100644
index 000000000..52821ba8b
--- /dev/null
+++ b/talos/cluster/kubelet.yaml
@@ -0,0 +1,54 @@
+# Kubelet memory-pressure handling — graceful node OOM.
+#
+# Out of the box the kubelet only steps in at its tiny default hard floor
+# (memory.available<100Mi), by which point the Linux kernel OOM-killer may
+# already have reaped a node-critical daemon. This patch makes a node shed
+# load *gracefully* well before that point and guarantees the OS + kubelet
+# always keep headroom:
+#
+# - systemReserved / kubeReserved carve memory out of node-allocatable so
+#   pods can never starve the OS or the kubelet itself — the usual trigger
+#   for an ungraceful kernel OOM-kill of a system daemon.
+# - evictionSoft starts reclaiming once memory.available stays below the
+#   soft threshold (500Mi) for the grace period (1m30s), evicting the
+#   lowest-priority / over-request pods first and letting them terminate
+#   cleanly (evictionMaxPodGracePeriod bounds that drain) — well before the
+#   much lower hard floor.
+# - evictionHard stays as the last-resort floor before the kernel acts.
+#
+# Node-pressure eviction ranks pods by Priority, so high-priority *critical*
+# pods (system-node-critical / system-cluster-critical) are the last eviction
+# candidates: workload pods are shed before the critical kube-system
+# control/storage plane. (Priority ordering, not a strict exemption.)
+#
+# Reference: https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/
+machine:
+  kubelet:
+    extraConfig:
+      # Headroom reserved from allocatable for the OS and kubelet/runtime.
+      # NOTE(review): Talos ships its own systemReserved defaults — confirm
+      # 256Mi does not lower the memory reservation it would otherwise apply.
+      systemReserved:
+        memory: 256Mi
+      kubeReserved:
+        memory: 256Mi
+      # Graceful first line: evict low-priority pods before memory is critical.
+      evictionSoft:
+        memory.available: 500Mi
+      evictionSoftGracePeriod:
+        memory.available: 1m30s
+      # Reclaim a little past the threshold so we don't immediately re-trigger.
+      evictionMinimumReclaim:
+        memory.available: 200Mi
+      # Bound how long a soft-evicted pod may take to terminate.
+      evictionMaxPodGracePeriod: 60
+      # Hard floor: last resort before the kernel OOM-killer.
+      # Setting any evictionHard signal stops the kubelet from inheriting its
+      # built-in defaults for the signals not listed, so the default disk
+      # thresholds are restated here to keep disk-pressure eviction active.
+      evictionHard:
+        memory.available: 100Mi
+        nodefs.available: "10%"
+        nodefs.inodesFree: "5%"
+        imagefs.available: "15%"
+        imagefs.inodesFree: "5%"