Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion k8s/bases/infrastructure/controllers/cdi/cdi-cr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,9 @@ apiVersion: cdi.kubevirt.io/v1beta1
kind: CDI
metadata:
name: cdi
spec: {}
spec:
# Run the CDI control plane (cdi-apiserver / cdi-controller / cdi-uploadproxy)
# at the KubeVirt-ecosystem critical tier so it survives node memory pressure.
# CDI is a KubeVirt subproject and kubevirt-cluster-critical (value
# 1000000000) is created by the KubeVirt operator already deployed here.
priorityClass: kubevirt-cluster-critical
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ spec:
prometheus:
prometheusSpec:
replicas: 1
# Monitoring is critical platform infrastructure — keep it alive under
# node memory pressure (evicted only after normal workloads). See the
# platform-critical PriorityClass.
priorityClassName: platform-critical
retention: 14d
retentionSize: "5GiB"
resources:
Expand Down Expand Up @@ -72,6 +76,9 @@ spec:
alertmanager:
alertmanagerSpec:
replicas: ${alertmanager_replicas:=2}
# Alerting must survive memory pressure (it is how a degrading node is
# noticed in the first place).
priorityClassName: platform-critical
podDisruptionBudget:
enabled: true
minAvailable: 1
Expand Down Expand Up @@ -138,8 +145,12 @@ spec:

nodeExporter:
enabled: true
# node-exporter subchart passthrough (alias `prometheus-node-exporter`).
prometheus-node-exporter:
priorityClassName: platform-critical
kube-state-metrics:
enabled: true
priorityClassName: platform-critical

# Disable the bundled "default" Prometheus rules -- the chart ships
# ~200 alerts that mostly aren't relevant for a homelab. We add our
Expand All @@ -148,6 +159,7 @@ spec:
create: false

prometheusOperator:
priorityClassName: platform-critical
resources:
requests:
cpu: 50m
Expand Down
3 changes: 3 additions & 0 deletions k8s/bases/infrastructure/controllers/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
# PriorityClasses first — HelmReleases below (e.g. kube-prometheus-stack)
# reference platform-critical, so it must be applied before they install.
- priority-classes/
- auth-proxy/
- cdi/
- cert-manager/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
# Kustomization whose only resource is the platform-critical PriorityClass
# manifest.
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
resources:
  - platform-critical.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Priority tier for important platform add-ons that are NOT core Kubernetes /
# node infrastructure but must still survive node memory pressure — currently
# the monitoring & alerting stack (kube-prometheus-stack).
#
# Sits ABOVE normal workloads (priority 0) so the kubelet evicts workload pods
# first, but BELOW the built-in system-* classes (2000000000) so true cluster
# and node infrastructure — CNI (Cilium), CSI (hcloud-csi/Longhorn), DNS
# (CoreDNS), cloud-controller-manager, and the control plane — always outrank
# it. Value matches kubevirt-cluster-critical (1000000000): both are the
# "important platform add-on" tier.
#
# Deliberately a custom class rather than system-cluster-critical: putting the
# monitoring stack at the very top would rank it alongside core infra, so a
# memory-hungry Prometheus would be reclaimed only after everything else —
# pushing the node toward the kernel OOM-killer. platform-critical keeps it
# "evicted after normal workloads" while still leaving it reclaimable before
# true cluster/node infrastructure.
---
# PriorityClass for platform add-ons: 1000000000 sits above the default pod
# priority (0) and below the built-in system-* classes (2000000000).
kind: PriorityClass
apiVersion: scheduling.k8s.io/v1
metadata:
  name: platform-critical
# Opt-in only: pods get this priority solely by naming the class explicitly.
globalDefault: false
value: 1000000000
description: >-
  Important platform add-ons (monitoring/alerting) that must survive
  node memory pressure. Ranked far above normal workloads but below
  the system-* classes reserved for core cluster/node infrastructure.
  Not eviction-exempt.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ spec:
# it avoids "topology not in requisite" provisioning failures.
enableProvidedByTopology: false
controller:
# Provisioning/attach control plane. The chart leaves priorityClassName
# unset by default (priority 0), which makes it a first candidate for
# node-pressure eviction. system-cluster-critical ranks it among the last
# eviction candidates so volume operations keep working under pressure.
priorityClassName: system-cluster-critical
replicaCount: ${hcloud_csi_controller_replicas:=2}
hcloudVolumeDefaultLocation: fsn1
podDisruptionBudget:
Expand All @@ -50,6 +55,11 @@ spec:
app.kubernetes.io/name: hcloud-csi
app.kubernetes.io/instance: hcloud-csi
node:
# Per-node volume mount/unmount plumbing — node-critical. The chart
# leaves priorityClassName unset by default (priority 0); system-node-critical
# ranks the DaemonSet among the last eviction candidates so stateful pods
# can keep mounting their storage under node memory pressure.
priorityClassName: system-node-critical
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
Expand Down
45 changes: 45 additions & 0 deletions talos/cluster/kubelet.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Kubelet memory-pressure handling — graceful node OOM.
#
# Out of the box the kubelet only steps in at its tiny default hard floor
# (memory.available<100Mi), by which point the Linux kernel OOM-killer may
# already have reaped a node-critical daemon. This patch makes a node shed
# load *gracefully* well before that point and guarantees the OS + kubelet
# always keep headroom:
#
# - systemReserved / kubeReserved carve memory out of node-allocatable so
# pods can never starve the OS or the kubelet itself — the usual trigger
# for an ungraceful kernel OOM-kill of a system daemon.
# - evictionSoft starts reclaiming once memory.available stays below the
# soft threshold (500Mi) for the grace period (1m30s), evicting the
# lowest-priority / over-request pods first and letting them terminate
# cleanly (evictionMaxPodGracePeriod bounds that drain) — well before the
# much lower hard floor.
# - evictionHard stays as the last-resort floor before the kernel acts.
#
# Node-pressure eviction ranks pods by Priority, so high-priority *critical*
# pods (system-node-critical / system-cluster-critical) are the last eviction
# candidates: workload pods are shed before the critical kube-system
# control/storage plane. (Priority ordering, not a strict exemption.)
#
# Reference: https://kubernetes.io/docs/concepts/scheduling-eviction/node-pressure-eviction/
machine:
  kubelet:
    # Merged into the generated KubeletConfiguration for the node.
    extraConfig:
      # Headroom reserved from allocatable for the OS and kubelet/runtime.
      # NOTE(review): Talos ships its own systemReserved defaults; confirm
      # 256Mi is not lower than what the Talos version in use already
      # reserves, otherwise this override shrinks the OS headroom.
      systemReserved:
        memory: 256Mi
      kubeReserved:
        memory: 256Mi
      # Graceful first line: evict low-priority pods before memory is critical.
      evictionSoft:
        memory.available: 500Mi
      # How long memory.available must stay below the soft threshold before
      # the kubelet starts evicting.
      evictionSoftGracePeriod:
        memory.available: 1m30s
      # Reclaim a little past the threshold so we don't immediately re-trigger.
      evictionMinimumReclaim:
        memory.available: 200Mi
      # Bound (in seconds) how long a soft-evicted pod may take to terminate.
      evictionMaxPodGracePeriod: 60
      # Hard floor: last resort before the kernel OOM-killer.
      #
      # The kubelet only applies its built-in hard thresholds when
      # evictionHard is left entirely unset; as soon as one signal is listed,
      # every omitted signal is dropped (treated as zero) instead of
      # inheriting its default. Listing memory.available alone would
      # therefore silently disable disk-pressure eviction, so the upstream
      # Linux defaults for the disk signals are spelled out explicitly.
      evictionHard:
        memory.available: 100Mi
        nodefs.available: "10%"
        nodefs.inodesFree: "5%"
        imagefs.available: "15%"
        imagefs.inodesFree: "5%"
Loading