From 19c498c7614856c024667860d3481c216f924b0c Mon Sep 17 00:00:00 2001 From: varashi Date: Wed, 3 Jun 2026 13:57:26 +0200 Subject: [PATCH 1/4] feat: optional crash-fence controller (out-of-service taint on host crash) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds fence.py — a second, optional Deployment (fence.enabled, off by default) sharing this image and reusing the vCenter client + node↔VM map. Automates non-graceful node shutdown for passthrough-GPU workers that vSphere HA can't restart elsewhere during a host crash: applies the node.kubernetes.io/out-of-service taint to a node confirmed dead by BOTH k8s (NotReady) and vCenter (VM connectionState disconnected/inaccessible/ orphaned), sustained graceSeconds, so RWO volumes force-detach and pods reschedule; removes it on recovery (connected + Ready). Disjoint from the maintenance controller (clean power-off stays 'connected', only a crash goes 'disconnected') — no coordination needed. Taint/un-taint only; power-on stays with vSphere HA. Own SA + least-privilege ClusterRole (nodes only) + kill switch + dryRun. test_fence.py covers the two-gate guard, grace hold, no-double-apply, un-fence on recovery, and the partition/notfound guards (8/8). 
Co-Authored-By: Claude Opus 4.8 --- .gitignore | 1 + CHANGELOG.md | 23 +++++ Dockerfile | 4 +- README.md | 43 ++++++++++ chart/Chart.yaml | 8 +- chart/templates/fence.yaml | 154 ++++++++++++++++++++++++++++++++++ chart/values.yaml | 24 ++++++ controller.py | 61 ++++++++++++++ fence.py | 115 +++++++++++++++++++++++++ test_fence.py | 166 +++++++++++++++++++++++++++++++++++++ 10 files changed, 594 insertions(+), 5 deletions(-) create mode 100644 .gitignore create mode 100644 chart/templates/fence.yaml create mode 100644 fence.py create mode 100644 test_fence.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/CHANGELOG.md b/CHANGELOG.md index cd0cfbd..ecb165e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,29 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] +## [0.5.0] — 2026-06-03 + +### Added +- **Crash-fence controller** (`fence.py`) — a second, optional Deployment + (`fence.enabled`, **off by default**) that shares this image and reuses the + vCenter client + node↔VM mapping. It automates non-graceful node shutdown for + passthrough-GPU workers that vSphere HA can't restart elsewhere during a host + crash: it applies the `node.kubernetes.io/out-of-service` taint to a node + confirmed dead by **both** gates — k8s `NotReady` **and** vCenter VM + `runtime.connectionState` in `{disconnected, inaccessible, orphaned}` — + sustained for `fence.graceSeconds`, so RWO volumes force-detach and stateful + pods reschedule. The taint is removed on recovery (VM `connected` + node + `Ready`). + - **Disjoint from the maintenance controller**: a clean (maintenance) + power-off leaves the VM `connected`; only a real host loss makes it + `disconnected`. The two controllers trigger on different vCenter facts and + never collide — no coordination contract needed. 
+ - **Taint/un-taint only.** Power-on is owned by vSphere HA (it restarts + passthrough VMs on the original host once it returns); eviction is handled + by `tolerationSeconds` + the taint. + - Own ServiceAccount + least-privilege ClusterRole (`nodes` get/list/watch/ + patch only) + kill switch (`fence.enabled`) + independent `fence.dryRun`. + ## [0.4.4] — 2026-05-01 ### Fixed diff --git a/Dockerfile b/Dockerfile index dfaa2c2..6bd5352 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,6 +11,8 @@ WORKDIR /app RUN pip install --no-cache-dir --disable-pip-version-check \ pyVmomi==8.0.3.0.1 kubernetes==31.0.0 -COPY controller.py . +COPY controller.py fence.py . +# Default entrypoint = maintenance controller. The fence controller (fence.py) +# is the same image with the command overridden to `python -u fence.py`. CMD ["python", "-u", "controller.py"] diff --git a/README.md b/README.md index d83bd1e..c3f1879 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,49 @@ Recovery: if a `powered-off` VM ends up on a different host (DRS race, operator intervention), the controller notices on the next poll and transitions it to `migrated`. +## Crash-fence controller (optional, off by default) + +The maintenance controller above handles *planned* host maintenance. A separate, +optional controller (`fence.py`) handles the *unplanned* case — a host **crash**. + +A passthrough-GPU VM can't be vSphere-HA-restarted on another host (the device +pins it to the original host), so when its host crashes the node stays down. Its +**RWO volume stays attached to the dead node** and Kubernetes won't auto-detach +it — it can't distinguish a crash from a network partition, where force-detaching +a still-live node's volume would corrupt it. So a rescheduled stateful pod hangs +indefinitely on `Multi-Attach`. 
The fix is the `node.kubernetes.io/out-of-service` +taint (non-graceful node shutdown), which force-detaches volumes and force-deletes +pods — but it must only ever be applied to a node you've *confirmed* is dead. + +The fence controller provides that confirmation by requiring **two gates**, +sustained for `fence.graceSeconds`: + +1. **k8s** — node `NotReady`, and +2. **vCenter** — that node's VM `runtime.connectionState` is `disconnected` + (or `inaccessible`/`orphaned`). A host crash makes its VMs `disconnected`; a + clean (maintenance) power-off keeps them `connected` — so this signal is + **disjoint from the maintenance controller** and the two never collide. + +On recovery (VM `connected` + node `Ready`) the taint is removed. The controller +does **taint/un-taint only** — power-on is left to vSphere HA (which restarts a +passthrough VM on the original host once it reconnects), and eviction is handled +by `tolerationSeconds` + the taint. + +Enable it (and start with `dryRun` to watch its decisions): + +```yaml +fence: + enabled: true + dryRun: true # logs "would fence" without tainting; flip to false when confident + graceSeconds: 60 # both gates must hold this long before fencing + pollSeconds: 20 +``` + +It runs as its own Deployment with its own ServiceAccount, a least-privilege +ClusterRole (`nodes` get/list/watch/patch only — no pods/eviction), and an +independent kill switch (`fence.enabled`). It's **off by default** because a +mis-fire is destructive; turn it on once you trust the signal in your environment. + ## Requirements - Kubernetes 1.26+ (eviction API, server-side apply) diff --git a/chart/Chart.yaml b/chart/Chart.yaml index ac8c21a..ffb534f 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -6,8 +6,8 @@ description: | on ESXi hosts, drains the matching Kubernetes nodes, powers off the VMs, migrates them to a free GPU host if possible, and returns them to service. 
type: application -version: 0.4.4 -appVersion: "0.4.4" +version: 0.5.0 +appVersion: "0.5.0" kubeVersion: ">=1.26.0-0" keywords: - vsphere @@ -25,5 +25,5 @@ maintainers: annotations: artifacthub.io/license: MIT artifacthub.io/changes: | - - kind: fixed - description: Tolerate vCLS VMs vanishing mid-enumeration during host maintenance entry. + - kind: added + description: Optional crash-fence controller (separate Deployment, off by default) — applies the out-of-service taint to a GPU node confirmed dead by both k8s (NotReady) and vCenter (VM disconnected) so RWO volumes force-detach and stateful pods reschedule. diff --git a/chart/templates/fence.yaml b/chart/templates/fence.yaml new file mode 100644 index 0000000..05db042 --- /dev/null +++ b/chart/templates/fence.yaml @@ -0,0 +1,154 @@ +{{- if .Values.fence.enabled }} +{{- $fullname := include "gpu-node-vsphere-maintenance-controller.fullname" . -}} +{{- $fenceName := printf "%s-fence" $fullname -}} +{{- $fenceSA := ternary $fenceName (default "default" .Values.serviceAccount.name) .Values.serviceAccount.create -}} +{{- if and .Values.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ $fenceName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + app.kubernetes.io/component: fence +--- +{{- end }} +{{- if .Values.rbac.create }} +# Least-privilege: the fence controller only reads node state and patches the +# out-of-service taint. No pods/eviction (the taint + tolerationSeconds handle +# eviction once it lands; power-on is left to vSphere HA). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ $fenceName }} + labels: + {{- include "gpu-node-vsphere-maintenance-controller.labels" . 
| nindent 4 }} + app.kubernetes.io/component: fence +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch", "patch", "update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ $fenceName }} + labels: + {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + app.kubernetes.io/component: fence +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ $fenceName }} +subjects: + - kind: ServiceAccount + name: {{ $fenceSA }} + namespace: {{ .Release.Namespace }} +--- +{{- end }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $fenceName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + app.kubernetes.io/component: fence +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: fence + template: + metadata: + labels: + {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: fence + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ $fenceSA }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . 
| nindent 8 }} + {{- end }} + containers: + - name: fence + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["python", "-u", "fence.py"] + envFrom: + - secretRef: + name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }} + env: + - name: FENCE_POLL_SECONDS + value: {{ .Values.fence.pollSeconds | quote }} + - name: FENCE_GRACE_SECONDS + value: {{ .Values.fence.graceSeconds | quote }} + - name: DRY_RUN + value: {{ .Values.fence.dryRun | quote }} + - name: GPU_NODE_LABEL + value: {{ .Values.config.gpuNodeLabel | quote }} + - name: VCENTER_TLS_VERIFY + value: {{ .Values.vcenter.tlsVerify | quote }} + {{- if .Values.vcenter.caBundle.configMapName }} + - name: VCENTER_CA_BUNDLE + value: {{ printf "%s/%s" .Values.vcenter.caBundle.mountPath .Values.vcenter.caBundle.key | quote }} + {{- end }} + {{- with .Values.extraEnv }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- if or .Values.vcenter.caBundle.configMapName .Values.extraVolumeMounts }} + volumeMounts: + {{- if .Values.vcenter.caBundle.configMapName }} + - name: vcenter-ca + mountPath: {{ .Values.vcenter.caBundle.mountPath }} + readOnly: true + {{- end }} + {{- with .Values.extraVolumeMounts }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- end }} + resources: + {{- toYaml .Values.fence.resources | nindent 12 }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + {{- if or .Values.vcenter.caBundle.configMapName .Values.extraVolumes }} + volumes: + {{- if .Values.vcenter.caBundle.configMapName }} + - name: vcenter-ca + configMap: + name: {{ .Values.vcenter.caBundle.configMapName }} + {{- end }} + {{- with .Values.extraVolumes }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/chart/values.yaml b/chart/values.yaml index 9478122..a8ffa3c 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -53,6 +53,30 @@ config: gpuNodeLabel: "intel.feature.node.kubernetes.io/gpu=true" dryRun: false +# Crash-fence controller — a SEPARATE Deployment (own RBAC + kill switch) that +# shares this image. Applies the `node.kubernetes.io/out-of-service` taint to a +# GPU node confirmed dead by BOTH k8s (NotReady) and vCenter (VM +# connectionState disconnected/inaccessible/orphaned), sustained graceSeconds, +# so RWO volumes force-detach and stateful pods reschedule; removes it on +# recovery (VM connected + node Ready). Disjoint from maintenance (which keys +# off maintenance-mode tasks and leaves a clean power-off 'connected'), so the +# two never collide. Power-on stays with vSphere HA. +# +# OFF by default — fencing is the most destructive action in the cluster (a +# mis-fire force-detaches a live/partitioned node's volume → corruption). +# Enable explicitly once you trust the signal. +fence: + enabled: false + pollSeconds: 20 + graceSeconds: 60 + dryRun: false + resources: + requests: + cpu: 25m + memory: 64Mi + limits: + memory: 128Mi + serviceAccount: create: true name: "" diff --git a/controller.py b/controller.py index 01b5edc..c77083d 100644 --- a/controller.py +++ b/controller.py @@ -73,6 +73,13 @@ def _is_transient_k8s_error(exc: BaseException) -> bool: STATE_POWERED_OFF = "powered-off" STATE_MIGRATED = "migrated" +# Non-graceful node shutdown (used by the separate fence controller, fence.py). +OUT_OF_SERVICE_TAINT_KEY = "node.kubernetes.io/out-of-service" +OUT_OF_SERVICE_TAINT_VALUE = "nodeshutdown" +# vm.runtime.connectionState values meaning the host managing the VM is gone +# (a crash). A clean maintenance power-off leaves the VM 'connected'. 
+VM_DEAD_CONNECTION_STATES = {"disconnected", "inaccessible", "orphaned"} + # ── Logging ─────────────────────────────────────────────────────────────────── logging.basicConfig( @@ -270,6 +277,17 @@ def get_vm_host(self, vm_name: str): return vm.runtime.host.name return None + def get_vm_connection_state(self, vm_name: str) -> str: + """vm.runtime.connectionState — 'connected' | 'disconnected' | + 'inaccessible' | 'orphaned' | 'invalid'. Returns 'notfound' if the VM + is not in inventory. A crashed host makes its VMs 'disconnected'; a + clean (maintenance) power-off keeps them 'connected'.""" + self._ensure_connected() + vm = self._find_vm(vm_name) + if vm is None: + return "notfound" + return str(vm.runtime.connectionState) + def relocate_vm(self, vm_name: str, target_host_name: str): """Cold migrate a powered-off VM to the target ESXi host.""" self._ensure_connected() @@ -463,6 +481,49 @@ def uncordon(self, node_name): log.info(f"Uncordoning {node_name}") self.core.patch_node(node_name, {"spec": {"unschedulable": False}}) + def has_out_of_service_taint(self, node_name) -> bool: + node = self.get_node(node_name) + return any( + t.key == OUT_OF_SERVICE_TAINT_KEY for t in (node.spec.taints or []) + ) + + def _patch_taints(self, node_name, taints): + # sanitize_for_serialization turns V1Taint objects into proper camelCase + # API dicts (incl. timeAdded), so existing taints are preserved verbatim. 
+ body = {"spec": {"taints": self.core.api_client.sanitize_for_serialization(taints)}} + self.core.patch_node(node_name, body) + + def apply_out_of_service_taint(self, node_name): + """Force-detach volumes + force-delete pods on a confirmed-dead node.""" + node = self.get_node(node_name) + taints = list(node.spec.taints or []) + if any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints): + return # already fenced + if DRY_RUN: + log.warning(f"[DRY RUN] Would FENCE {node_name} (out-of-service taint)") + return + log.warning(f"FENCING {node_name}: applying {OUT_OF_SERVICE_TAINT_KEY} taint") + taints.append( + k8s_client.V1Taint( + key=OUT_OF_SERVICE_TAINT_KEY, + value=OUT_OF_SERVICE_TAINT_VALUE, + effect="NoExecute", + ) + ) + self._patch_taints(node_name, taints) + + def remove_out_of_service_taint(self, node_name): + node = self.get_node(node_name) + taints = list(node.spec.taints or []) + if not any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints): + return + if DRY_RUN: + log.info(f"[DRY RUN] Would un-fence {node_name} (remove out-of-service taint)") + return + log.info(f"Un-fencing {node_name}: removing {OUT_OF_SERVICE_TAINT_KEY} taint") + kept = [t for t in taints if t.key != OUT_OF_SERVICE_TAINT_KEY] + self._patch_taints(node_name, kept) + def is_ready(self, node_name): node = self.get_node(node_name) for condition in node.status.conditions: diff --git a/fence.py b/fence.py new file mode 100644 index 0000000..b5df2f7 --- /dev/null +++ b/fence.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +""" +GPU-node crash fence controller (sibling of the maintenance controller). + +Automates non-graceful node shutdown for passthrough-GPU workers that can't be +vSphere-HA-restarted during a host crash. When a host crashes, the node's RWO +volume stays attached to the dead node and k8s won't auto-detach it (it can't +tell a crash from a network partition) — so a rescheduled stateful pod hangs on +`Multi-Attach`. 
The fix is the `node.kubernetes.io/out-of-service` taint, which +force-detaches volumes + force-deletes pods. This controller applies it only on +a node confirmed dead by BOTH k8s and vCenter, and removes it on recovery. + +Two-gate fence condition (both required, sustained for FENCE_GRACE_SECONDS): + 1. k8s: node NotReady + 2. vCenter: that node's VM runtime.connectionState is disconnected/inaccessible/orphaned + (a crash — a clean maintenance power-off keeps it 'connected', so + this never collides with the maintenance controller) + +Un-fence when the node recovers: VM 'connected' AND node Ready. + +Deliberately scoped to taint/un-taint only: + * Power-on is owned by vSphere HA (it restarts passthrough VMs on the original + host once it returns). + * Graceful maintenance drains are owned by the maintenance controller + (controller.py), which keys off vCenter maintenance-mode tasks — a disjoint + signal, so the two never clash. + +Runs as its own Deployment with its own RBAC + kill switch (Values.fence.*). +""" + +import os +import time + +from kubernetes.client.rest import ApiException + +from controller import ( + VM_DEAD_CONNECTION_STATES, + DRY_RUN, + K8sClient, + VSphereClient, + _is_transient_k8s_error, + log, +) + +FENCE_POLL_SECONDS = int(os.environ.get("FENCE_POLL_SECONDS", "20")) +# How long both gates must hold before we fence — guards against transient +# blips (kubelet restart, brief vCenter/host comms loss) and lets vCenter's +# host-down detection settle (it lags node-NotReady by tens of seconds). 
+FENCE_GRACE_SECONDS = int(os.environ.get("FENCE_GRACE_SECONDS", "60")) + + +class FenceController: + def __init__(self): + self.vsphere = VSphereClient() + self.k8s = K8sClient() + # node -> monotonic time the two-gate condition first became true + self.gate_since: dict[str, float] = {} + + def reconcile(self): + for node in self.k8s.get_gpu_nodes(): + name = node.metadata.name + ready = self.k8s.is_ready(name) + vm_state = self.vsphere.get_vm_connection_state(name) + tainted = self.k8s.has_out_of_service_taint(name) + dead_vm = vm_state in VM_DEAD_CONNECTION_STATES + + # ── FENCE gate: NotReady AND vCenter says the VM is gone ── + if not ready and dead_vm: + first = self.gate_since.setdefault(name, time.monotonic()) + elapsed = time.monotonic() - first + if tainted: + continue # already fenced + if elapsed >= FENCE_GRACE_SECONDS: + log.warning( + f"{name}: NotReady + vm={vm_state} for {elapsed:.0f}s " + f"(>= {FENCE_GRACE_SECONDS}s grace) — fencing" + ) + self.k8s.apply_out_of_service_taint(name) + else: + log.info( + f"{name}: fence-gate pending {elapsed:.0f}/" + f"{FENCE_GRACE_SECONDS}s (NotReady + vm={vm_state})" + ) + continue + + # ── Gates not both true: reset timer, and un-fence on recovery ── + self.gate_since.pop(name, None) + if tainted and ready and vm_state == "connected": + log.info(f"{name}: recovered (Ready + vm connected) — un-fencing") + self.k8s.remove_out_of_service_taint(name) + + def run(self): + log.info( + f"Fence controller started — poll={FENCE_POLL_SECONDS}s, " + f"grace={FENCE_GRACE_SECONDS}s, dry_run={DRY_RUN}" + ) + while True: + try: + self.reconcile() + except Exception as e: + if _is_transient_k8s_error(e): + reason = getattr(e, "reason", type(e).__name__) + status = getattr(e, "status", None) + detail = f" status={status}" if status else "" + log.warning( + f"Transient k8s/transport error in fence loop: " + f"{reason}{detail} — retrying next poll" + ) + else: + log.exception("Unhandled error in fence loop") + 
time.sleep(FENCE_POLL_SECONDS) + + +if __name__ == "__main__": + FenceController().run() diff --git a/test_fence.py b/test_fence.py new file mode 100644 index 0000000..2ec85f3 --- /dev/null +++ b/test_fence.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +"""Unit test for the fence controller's two-gate logic — no cluster/vCenter +needed. Stubs the heavy imports so `import controller`/`fence` works, then +drives FenceController.reconcile() through the scenarios that matter for the +out-of-service taint (the most destructive action in the cluster). + +Run: python3 test_fence.py +""" +import os +import sys +import types + +# ── stub heavy deps so `import controller` succeeds ────────────────────────── +os.environ.setdefault("VCENTER_HOST", "x") +os.environ.setdefault("VCENTER_USER", "x") +os.environ.setdefault("VCENTER_PASSWORD", "x") + + +def _mod(name): + m = types.ModuleType(name) + sys.modules[name] = m + return m + + +class _ApiException(Exception): + def __init__(self, status=None): + self.status = status + + +_u = _mod("urllib3") +_ue = _mod("urllib3.exceptions") +_u.exceptions = _ue +for _n in ("ProtocolError", "ReadTimeoutError", "MaxRetryError", "ConnectionError"): + setattr(_ue, _n, type(_n, (Exception,), {})) + +_k = _mod("kubernetes") +_kc = _mod("kubernetes.client") +_kcr = _mod("kubernetes.client.rest") +_kcfg = _mod("kubernetes.config") +_k.client = _kc +_k.config = _kcfg +_kc.rest = _kcr +_kcr.ApiException = _ApiException +_kc.V1Taint = lambda **kw: types.SimpleNamespace(**kw) +_kc.CoreV1Api = object +_kc.V1Eviction = object +_kc.V1ObjectMeta = object + + +class _ConfigException(Exception): + pass + + +_kcfg.ConfigException = _ConfigException +_kcfg.load_incluster_config = lambda: None +_kcfg.load_kube_config = lambda **kw: None + +_pv = _mod("pyVim") +_pvc = _mod("pyVim.connect") +_pv.connect = _pvc +_pvc.SmartConnect = lambda **kw: None +_pvm = _mod("pyVmomi") +_pvm.vim = types.SimpleNamespace() +_pvm.vmodl = types.SimpleNamespace() + +import fence # 
noqa: E402 + + +# ── fakes ──────────────────────────────────────────────────────────────────── +class FakeK8s: + def __init__(self, nodes): + # nodes: name -> {"ready": bool, "tainted": bool} + self.nodes = nodes + self.applied = [] + self.removed = [] + + def get_gpu_nodes(self): + return [types.SimpleNamespace(metadata=types.SimpleNamespace(name=n)) + for n in self.nodes] + + def is_ready(self, name): + return self.nodes[name]["ready"] + + def has_out_of_service_taint(self, name): + return self.nodes[name]["tainted"] + + def apply_out_of_service_taint(self, name): + self.nodes[name]["tainted"] = True + self.applied.append(name) + + def remove_out_of_service_taint(self, name): + self.nodes[name]["tainted"] = False + self.removed.append(name) + + +class FakeVS: + def __init__(self, states): + self.states = states # name -> connectionState + + def get_vm_connection_state(self, name): + return self.states[name] + + +def make(nodes, states, grace=0): + fc = fence.FenceController.__new__(fence.FenceController) + fc.k8s = FakeK8s(nodes) + fc.vsphere = FakeVS(states) + fc.gate_since = {} + fence.FENCE_GRACE_SECONDS = grace + return fc + + +results = [] + + +def check(desc, cond): + results.append((desc, cond)) + print(("PASS " if cond else "FAIL ") + desc) + + +# 1. healthy node — never fenced +fc = make({"n": {"ready": True, "tainted": False}}, {"n": "connected"}) +fc.reconcile() +check("healthy (Ready+connected) -> no fence", not fc.k8s.applied) + +# 2. NotReady but VM still connected (vCenter lag) — must NOT fence (one gate) +fc = make({"n": {"ready": False, "tainted": False}}, {"n": "connected"}) +fc.reconcile() +check("NotReady + vm connected -> no fence (one gate only)", not fc.k8s.applied) + +# 3. both gates but within grace — no fence yet +fc = make({"n": {"ready": False, "tainted": False}}, {"n": "disconnected"}, grace=9999) +fc.reconcile() +check("NotReady + disconnected, within grace -> no fence yet", not fc.k8s.applied) + +# 4. 
both gates, grace elapsed -> fence +fc = make({"n": {"ready": False, "tainted": False}}, {"n": "disconnected"}, grace=0) +fc.reconcile() +check("NotReady + disconnected, grace met -> FENCE", fc.k8s.applied == ["n"]) + +# 5. already tainted -> no double apply +fc = make({"n": {"ready": False, "tainted": True}}, {"n": "disconnected"}, grace=0) +fc.reconcile() +check("already tainted -> no re-apply", not fc.k8s.applied) + +# 6. recovery: tainted + Ready + connected -> un-fence +fc = make({"n": {"ready": True, "tainted": True}}, {"n": "connected"}) +fc.reconcile() +check("recovered (Ready+connected) while tainted -> un-fence", fc.k8s.removed == ["n"]) + +# 7. tainted, node back Ready but vm still disconnected -> do NOT un-fence yet +fc = make({"n": {"ready": True, "tainted": True}}, {"n": "disconnected"}) +fc.reconcile() +check("tainted, Ready but vm disconnected -> stay fenced", not fc.k8s.removed) + +# 8. 'notfound' VM is not a dead-state -> no fence +fc = make({"n": {"ready": False, "tainted": False}}, {"n": "notfound"}, grace=0) +fc.reconcile() +check("NotReady + vm notfound -> no fence", not fc.k8s.applied) + +print() +failed = [d for d, c in results if not c] +if failed: + print(f"{len(failed)} FAILED") + sys.exit(1) +print(f"all {len(results)} passed") From b5e35029020c40b6548222d864f5c43c1fda550f Mon Sep 17 00:00:00 2001 From: varashi Date: Wed, 3 Jun 2026 18:01:47 +0200 Subject: [PATCH 2/4] refactor: rename to vsphere-passthrough-node-controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The product now covers two passthrough-node host-lifecycle concerns — planned maintenance and (new) crash fencing — so "maintenance-controller" no longer fits. Renames repo/image/chart/templates/helpers/docs to vsphere-passthrough-node-controller (image + chart at ghcr.io/varashi/[charts/]vsphere-passthrough-node-controller). GitHub repo renamed (redirects cover old git/web refs). 
The internal maintenance-state annotation domain (vsphere-maintenance.boeye.net/*) is left unchanged — it's controller state, not user-facing, and no nodes carry it currently. Co-Authored-By: Claude Opus 4.8 --- .github/workflows/ci.yaml | 2 +- .github/workflows/release.yaml | 6 +- CHANGELOG.md | 26 ++++---- Dockerfile | 6 +- README.md | 80 ++++++++++++------------- chart/Chart.yaml | 6 +- chart/templates/NOTES.txt | 4 +- chart/templates/_helpers.tpl | 24 ++++---- chart/templates/clusterrole.yaml | 4 +- chart/templates/clusterrolebinding.yaml | 8 +-- chart/templates/configmap.yaml | 4 +- chart/templates/deployment.yaml | 14 ++--- chart/templates/fence.yaml | 16 ++--- chart/templates/secret.yaml | 4 +- chart/templates/serviceaccount.yaml | 4 +- chart/values.yaml | 4 +- 16 files changed, 106 insertions(+), 106 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 966ecd8..dd22033 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -75,4 +75,4 @@ jobs: context: . platforms: linux/amd64,linux/arm64 push: false - tags: ci/gpu-node-vsphere-maintenance-controller:ci + tags: ci/vsphere-passthrough-node-controller:ci diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f0fafc9..6365d8f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -148,7 +148,7 @@ jobs: run: | version="${{ steps.ver.outputs.version }}" # helm push prints "Digest: sha256:..." to stderr; tee to capture. 
- helm push "gpu-node-vsphere-maintenance-controller-${version}.tgz" \ + helm push "vsphere-passthrough-node-controller-${version}.tgz" \ "oci://${{ env.CHART_REPO }}" 2>&1 | tee push.log digest=$(awk '/^Digest: /{print $2}' push.log) if [ -z "$digest" ]; then @@ -160,7 +160,7 @@ jobs: - name: Cosign keyless sign (chart) env: DIGEST: ${{ steps.chart_push.outputs.digest }} - CHART_REF: ${{ env.CHART_REPO }}/gpu-node-vsphere-maintenance-controller + CHART_REF: ${{ env.CHART_REPO }}/vsphere-passthrough-node-controller run: cosign sign --yes "${CHART_REF}@${DIGEST}" - name: Create GitHub Release @@ -173,4 +173,4 @@ jobs: prerelease: false files: | sbom.spdx.json - gpu-node-vsphere-maintenance-controller-${{ steps.ver.outputs.version }}.tgz + vsphere-passthrough-node-controller-${{ steps.ver.outputs.version }}.tgz diff --git a/CHANGELOG.md b/CHANGELOG.md index ecb165e..4dcd9b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -107,7 +107,7 @@ No controller code change. Supply-chain and CI polish only. now consults the map instead of making a per-node `get_vm_host` round-trip to vCenter on every poll. - Minimal Helm chart under `chart/`, published as OCI to - `ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller`. + `ghcr.io/varashi/charts/vsphere-passthrough-node-controller`. - GitHub Actions: `ci.yaml` (ruff, hadolint, helm lint, buildx smoke build) on pull requests; `release.yaml` on `v*.*.*` tag push builds multi-arch images (amd64, arm64), cosign-signs keyless via OIDC, attaches SBOM and @@ -186,15 +186,15 @@ No controller code change. Supply-chain and CI polish only. - Initial release: drain → power-off → wait-for-exit → power-on → uncordon, driven by edge-triggered `HostSystem.recentTask` polling. 
-[Unreleased]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.3...HEAD -[0.4.3]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.2...v0.4.3 -[0.4.2]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.1...v0.4.2 -[0.4.1]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.0...v0.4.1 -[0.4.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.3.0...v0.4.0 -[0.3.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.3...v0.3.0 -[0.2.3]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.2...v0.2.3 -[0.2.2]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.1...v0.2.2 -[0.2.1]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.0...v0.2.1 -[0.2.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.1.1...v0.2.0 -[0.1.1]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.1.0...v0.1.1 -[0.1.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/releases/tag/v0.1.0 +[Unreleased]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.3...HEAD +[0.4.3]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.2...v0.4.3 +[0.4.2]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.1...v0.4.2 +[0.4.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.0...v0.4.1 +[0.4.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.3.0...v0.4.0 +[0.3.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.3...v0.3.0 +[0.2.3]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.2...v0.2.3 +[0.2.2]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.1...v0.2.2 +[0.2.1]: 
https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.0...v0.2.1 +[0.2.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.1.1...v0.2.0 +[0.1.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.1.0...v0.1.1 +[0.1.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/releases/tag/v0.1.0 diff --git a/Dockerfile b/Dockerfile index 6bd5352..882b389 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,9 @@ FROM python:3.13-slim -LABEL org.opencontainers.image.title="gpu-node-vsphere-maintenance-controller" +LABEL org.opencontainers.image.title="vsphere-passthrough-node-controller" LABEL org.opencontainers.image.description="Kubernetes controller that automates ESXi maintenance mode for worker nodes with PCI passthrough (GPU or otherwise)." -LABEL org.opencontainers.image.source="https://github.com/Varashi/gpu-node-vsphere-maintenance-controller" -LABEL org.opencontainers.image.documentation="https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/blob/main/README.md" +LABEL org.opencontainers.image.source="https://github.com/Varashi/vsphere-passthrough-node-controller" +LABEL org.opencontainers.image.documentation="https://github.com/Varashi/vsphere-passthrough-node-controller/blob/main/README.md" LABEL org.opencontainers.image.licenses="MIT" WORKDIR /app diff --git a/README.md b/README.md index c3f1879..70970bf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# gpu-node-vsphere-maintenance-controller +# vsphere-passthrough-node-controller A Kubernetes controller that safely handles ESXi maintenance mode transitions for worker nodes that use **PCI passthrough** (Intel ARC / NVIDIA / any @@ -12,7 +12,7 @@ possible — migrates it (cold) to another GPU-capable host and brings it back online. When the original host exits maintenance, a powered-off node is returned to service automatically. -Image: `ghcr.io/varashi/gpu-node-vsphere-maintenance-controller` (public). 
+Image: `ghcr.io/varashi/vsphere-passthrough-node-controller` (public). ## Why this exists @@ -149,10 +149,10 @@ mis-fire is destructive; turn it on once you trust the signal in your environmen The chart is published as an OCI artifact alongside the image: ```bash -helm upgrade --install gpu-node-vsphere-maintenance \ - oci://ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller \ +helm upgrade --install vsphere-passthrough-node \ + oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller \ --version 0.4.3 \ - --namespace gpu-node-vsphere-maintenance --create-namespace \ + --namespace vsphere-passthrough-node --create-namespace \ --set vcenter.host=vcenter.example.com \ --set vcenter.user=maintenance-controller@vsphere.local \ --set vcenter.password='replace-me' @@ -170,24 +170,24 @@ A Flux `HelmRelease` example: apiVersion: source.toolkit.fluxcd.io/v1 kind: OCIRepository metadata: - name: gpu-node-vsphere-maintenance-controller - namespace: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node-controller + namespace: vsphere-passthrough-node spec: interval: 1h - url: oci://ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller + url: oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller ref: tag: 0.4.3 --- apiVersion: helm.toolkit.fluxcd.io/v2 kind: HelmRelease metadata: - name: gpu-node-vsphere-maintenance-controller - namespace: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node-controller + namespace: vsphere-passthrough-node spec: interval: 1h chartRef: kind: OCIRepository - name: gpu-node-vsphere-maintenance-controller + name: vsphere-passthrough-node-controller values: vcenter: existingSecret: vsphere-credentials @@ -206,18 +206,18 @@ and credentials source as needed): apiVersion: v1 kind: Namespace metadata: - name: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node --- apiVersion: v1 kind: ServiceAccount metadata: - name: gpu-node-vsphere-maintenance - namespace: gpu-node-vsphere-maintenance 
+ name: vsphere-passthrough-node + namespace: vsphere-passthrough-node --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node rules: - apiGroups: [""] resources: ["nodes"] @@ -232,21 +232,21 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node subjects: - kind: ServiceAccount - name: gpu-node-vsphere-maintenance - namespace: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node + namespace: vsphere-passthrough-node --- apiVersion: v1 kind: Secret metadata: name: vsphere-credentials - namespace: gpu-node-vsphere-maintenance + namespace: vsphere-passthrough-node type: Opaque stringData: VCENTER_HOST: vcenter.example.com @@ -257,7 +257,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: controller-config - namespace: gpu-node-vsphere-maintenance + namespace: vsphere-passthrough-node data: POLL_INTERVAL_SECONDS: "30" DRAIN_TIMEOUT_SECONDS: "600" @@ -270,24 +270,24 @@ data: apiVersion: apps/v1 kind: Deployment metadata: - name: gpu-node-vsphere-maintenance - namespace: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node + namespace: vsphere-passthrough-node spec: replicas: 1 strategy: type: Recreate selector: matchLabels: - app: gpu-node-vsphere-maintenance + app: vsphere-passthrough-node template: metadata: labels: - app: gpu-node-vsphere-maintenance + app: vsphere-passthrough-node spec: - serviceAccountName: gpu-node-vsphere-maintenance + serviceAccountName: vsphere-passthrough-node containers: - name: controller - image: ghcr.io/varashi/gpu-node-vsphere-maintenance-controller:v0.3.0 + image: ghcr.io/varashi/vsphere-passthrough-node-controller:v0.3.0 envFrom: - secretRef: name: vsphere-credentials @@ -330,7 +330,7 @@ apiVersion: 
external-secrets.io/v1 kind: ExternalSecret metadata: name: vsphere-credentials - namespace: gpu-node-vsphere-maintenance + namespace: vsphere-passthrough-node spec: refreshInterval: 1h secretStoreRef: @@ -393,8 +393,8 @@ are set. ## Building from source ```bash -docker build -t ghcr.io/you/gpu-node-vsphere-maintenance-controller:dev . -docker push ghcr.io/you/gpu-node-vsphere-maintenance-controller:dev +docker build -t ghcr.io/you/vsphere-passthrough-node-controller:dev . +docker push ghcr.io/you/vsphere-passthrough-node-controller:dev ``` Source layout is deliberately tiny — a single `controller.py` plus a @@ -444,12 +444,12 @@ git push origin v0.3.1 The `release.yaml` GitHub Actions workflow then: 1. Builds and pushes the controller image to - `ghcr.io/varashi/gpu-node-vsphere-maintenance-controller`, multi-arch + `ghcr.io/varashi/vsphere-passthrough-node-controller`, multi-arch (`linux/amd64`, `linux/arm64`), with cosign keyless signatures (GitHub OIDC), an SPDX SBOM, and a build-provenance attestation. 2. Packages the Helm chart in `chart/` with `version` and `appVersion` matching the tag and pushes it to - `oci://ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller`. + `oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller`. 3. Creates a GitHub Release whose body is extracted from the matching section of [`CHANGELOG.md`](./CHANGELOG.md) and attaches the SBOM and the packaged chart `.tgz`. @@ -463,19 +463,19 @@ attached as a cosign attestation. Verify any of these before deploying: ```bash # 1. Image signature. 
cosign verify \ - --certificate-identity-regexp 'https://github\.com/Varashi/gpu-node-vsphere-maintenance-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ + --certificate-identity-regexp 'https://github\.com/Varashi/vsphere-passthrough-node-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ - ghcr.io/varashi/gpu-node-vsphere-maintenance-controller: + ghcr.io/varashi/vsphere-passthrough-node-controller: # 2. SBOM attestation (SPDX). cosign verify-attestation --type spdxjson \ - --certificate-identity-regexp 'https://github\.com/Varashi/gpu-node-vsphere-maintenance-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ + --certificate-identity-regexp 'https://github\.com/Varashi/vsphere-passthrough-node-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ - ghcr.io/varashi/gpu-node-vsphere-maintenance-controller: + ghcr.io/varashi/vsphere-passthrough-node-controller: # 3. SLSA build provenance (GitHub Attestations). gh attestation verify \ - oci://ghcr.io/varashi/gpu-node-vsphere-maintenance-controller: \ + oci://ghcr.io/varashi/vsphere-passthrough-node-controller: \ --owner Varashi ``` @@ -484,9 +484,9 @@ chart digests too: ```bash cosign verify \ - --certificate-identity-regexp 'https://github\.com/Varashi/gpu-node-vsphere-maintenance-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ + --certificate-identity-regexp 'https://github\.com/Varashi/vsphere-passthrough-node-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ - ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller: + ghcr.io/varashi/charts/vsphere-passthrough-node-controller: ``` `helm pull --verify` is *not* supported against this chart: `--verify` @@ -497,7 +497,7 @@ above instead. 
## Version history See [`CHANGELOG.md`](./CHANGELOG.md) for the full history. Released tags -are also listed on the [GitHub Releases](https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/releases) +are also listed on the [GitHub Releases](https://github.com/Varashi/vsphere-passthrough-node-controller/releases) page with signed assets and SBOMs. ## License diff --git a/chart/Chart.yaml b/chart/Chart.yaml index ffb534f..b5d2b1a 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -name: gpu-node-vsphere-maintenance-controller +name: vsphere-passthrough-node-controller description: | Kubernetes controller that automates ESXi maintenance mode for worker nodes with PCI passthrough (GPU or otherwise). Detects EnterMaintenanceMode @@ -16,9 +16,9 @@ keywords: - gpu - pci-passthrough - kubernetes -home: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller +home: https://github.com/Varashi/vsphere-passthrough-node-controller sources: - - https://github.com/Varashi/gpu-node-vsphere-maintenance-controller + - https://github.com/Varashi/vsphere-passthrough-node-controller maintainers: - name: Varashi url: https://github.com/Varashi diff --git a/chart/templates/NOTES.txt b/chart/templates/NOTES.txt index da1ad58..58e54ee 100644 --- a/chart/templates/NOTES.txt +++ b/chart/templates/NOTES.txt @@ -20,8 +20,8 @@ Note: vCenter TLS verification is DISABLED. To enable, pick one: Verify the controller is running: - kubectl -n {{ .Release.Namespace }} rollout status deploy/{{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} - kubectl -n {{ .Release.Namespace }} logs deploy/{{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} -f + kubectl -n {{ .Release.Namespace }} rollout status deploy/{{ include "vsphere-passthrough-node-controller.fullname" . }} + kubectl -n {{ .Release.Namespace }} logs deploy/{{ include "vsphere-passthrough-node-controller.fullname" . 
}} -f GPU-worker Node label (`{{ .Values.config.gpuNodeLabel }}`) identifies the nodes this controller will drain when their ESXi host enters maintenance. diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl index 122c2a5..8984d62 100644 --- a/chart/templates/_helpers.tpl +++ b/chart/templates/_helpers.tpl @@ -1,14 +1,14 @@ {{/* Expand the name of the chart. */}} -{{- define "gpu-node-vsphere-maintenance-controller.name" -}} +{{- define "vsphere-passthrough-node-controller.name" -}} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Create a default fully qualified app name. */}} -{{- define "gpu-node-vsphere-maintenance-controller.fullname" -}} +{{- define "vsphere-passthrough-node-controller.fullname" -}} {{- if .Values.fullnameOverride }} {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} {{- else }} @@ -24,16 +24,16 @@ Create a default fully qualified app name. {{/* Chart label. */}} -{{- define "gpu-node-vsphere-maintenance-controller.chart" -}} +{{- define "vsphere-passthrough-node-controller.chart" -}} {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Common labels. */}} -{{- define "gpu-node-vsphere-maintenance-controller.labels" -}} -helm.sh/chart: {{ include "gpu-node-vsphere-maintenance-controller.chart" . }} -{{ include "gpu-node-vsphere-maintenance-controller.selectorLabels" . }} +{{- define "vsphere-passthrough-node-controller.labels" -}} +helm.sh/chart: {{ include "vsphere-passthrough-node-controller.chart" . }} +{{ include "vsphere-passthrough-node-controller.selectorLabels" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} @@ -43,17 +43,17 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{/* Selector labels. 
*/}} -{{- define "gpu-node-vsphere-maintenance-controller.selectorLabels" -}} -app.kubernetes.io/name: {{ include "gpu-node-vsphere-maintenance-controller.name" . }} +{{- define "vsphere-passthrough-node-controller.selectorLabels" -}} +app.kubernetes.io/name: {{ include "vsphere-passthrough-node-controller.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} {{/* ServiceAccount name. */}} -{{- define "gpu-node-vsphere-maintenance-controller.serviceAccountName" -}} +{{- define "vsphere-passthrough-node-controller.serviceAccountName" -}} {{- if .Values.serviceAccount.create }} -{{- default (include "gpu-node-vsphere-maintenance-controller.fullname" .) .Values.serviceAccount.name }} +{{- default (include "vsphere-passthrough-node-controller.fullname" .) .Values.serviceAccount.name }} {{- else }} {{- default "default" .Values.serviceAccount.name }} {{- end }} @@ -62,10 +62,10 @@ ServiceAccount name. {{/* Name of the Secret holding vCenter credentials (existing or rendered). */}} -{{- define "gpu-node-vsphere-maintenance-controller.vcenterSecretName" -}} +{{- define "vsphere-passthrough-node-controller.vcenterSecretName" -}} {{- if .Values.vcenter.existingSecret -}} {{- .Values.vcenter.existingSecret -}} {{- else -}} -{{- printf "%s-vcenter" (include "gpu-node-vsphere-maintenance-controller.fullname" .) -}} +{{- printf "%s-vcenter" (include "vsphere-passthrough-node-controller.fullname" .) -}} {{- end -}} {{- end }} diff --git a/chart/templates/clusterrole.yaml b/chart/templates/clusterrole.yaml index cb06b2f..fd2a04a 100644 --- a/chart/templates/clusterrole.yaml +++ b/chart/templates/clusterrole.yaml @@ -2,9 +2,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . 
| nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} rules: - apiGroups: [""] resources: ["nodes"] diff --git a/chart/templates/clusterrolebinding.yaml b/chart/templates/clusterrolebinding.yaml index 6f15360..29fdb07 100644 --- a/chart/templates/clusterrolebinding.yaml +++ b/chart/templates/clusterrolebinding.yaml @@ -2,15 +2,15 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} subjects: - kind: ServiceAccount - name: {{ include "gpu-node-vsphere-maintenance-controller.serviceAccountName" . }} + name: {{ include "vsphere-passthrough-node-controller.serviceAccountName" . }} namespace: {{ .Release.Namespace }} {{- end }} diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index ebb8663..1ebb783 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -1,10 +1,10 @@ apiVersion: v1 kind: ConfigMap metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . 
| nindent 4 }} data: POLL_INTERVAL_SECONDS: {{ .Values.config.pollIntervalSeconds | quote }} DRAIN_TIMEOUT_SECONDS: {{ .Values.config.drainTimeoutSeconds | quote }} diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml index 4d9be0e..edf2377 100644 --- a/chart/templates/deployment.yaml +++ b/chart/templates/deployment.yaml @@ -1,21 +1,21 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} spec: replicas: {{ .Values.replicaCount }} strategy: {{- toYaml .Values.strategy | nindent 4 }} selector: matchLabels: - {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 6 }} + {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 6 }} template: metadata: labels: - {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 8 }} + {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 8 }} {{- with .Values.podLabels }} {{- toYaml . | nindent 8 }} {{- end }} @@ -25,7 +25,7 @@ spec: {{- toYaml . | nindent 8 }} {{- end }} spec: - serviceAccountName: {{ include "gpu-node-vsphere-maintenance-controller.serviceAccountName" . }} + serviceAccountName: {{ include "vsphere-passthrough-node-controller.serviceAccountName" . }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} @@ -43,9 +43,9 @@ spec: imagePullPolicy: {{ .Values.image.pullPolicy }} envFrom: - secretRef: - name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }} + name: {{ include "vsphere-passthrough-node-controller.vcenterSecretName" . 
}} - configMapRef: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} {{- with .Values.extraEnv }} env: {{- toYaml . | nindent 12 }} diff --git a/chart/templates/fence.yaml b/chart/templates/fence.yaml index 05db042..77f9a94 100644 --- a/chart/templates/fence.yaml +++ b/chart/templates/fence.yaml @@ -1,5 +1,5 @@ {{- if .Values.fence.enabled }} -{{- $fullname := include "gpu-node-vsphere-maintenance-controller.fullname" . -}} +{{- $fullname := include "vsphere-passthrough-node-controller.fullname" . -}} {{- $fenceName := printf "%s-fence" $fullname -}} {{- $fenceSA := ternary $fenceName (default "default" .Values.serviceAccount.name) .Values.serviceAccount.create -}} {{- if and .Values.serviceAccount.create }} @@ -9,7 +9,7 @@ metadata: name: {{ $fenceName }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} app.kubernetes.io/component: fence --- {{- end }} @@ -22,7 +22,7 @@ kind: ClusterRole metadata: name: {{ $fenceName }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} app.kubernetes.io/component: fence rules: - apiGroups: [""] @@ -34,7 +34,7 @@ kind: ClusterRoleBinding metadata: name: {{ $fenceName }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} app.kubernetes.io/component: fence roleRef: apiGroup: rbac.authorization.k8s.io @@ -52,7 +52,7 @@ metadata: name: {{ $fenceName }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . 
| nindent 4 }} app.kubernetes.io/component: fence spec: replicas: 1 @@ -60,12 +60,12 @@ spec: type: Recreate selector: matchLabels: - {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 6 }} + {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 6 }} app.kubernetes.io/component: fence template: metadata: labels: - {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 8 }} + {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 8 }} app.kubernetes.io/component: fence {{- with .Values.podLabels }} {{- toYaml . | nindent 8 }} @@ -94,7 +94,7 @@ spec: command: ["python", "-u", "fence.py"] envFrom: - secretRef: - name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }} + name: {{ include "vsphere-passthrough-node-controller.vcenterSecretName" . }} env: - name: FENCE_POLL_SECONDS value: {{ .Values.fence.pollSeconds | quote }} diff --git a/chart/templates/secret.yaml b/chart/templates/secret.yaml index d74d3cb..cb38a96 100644 --- a/chart/templates/secret.yaml +++ b/chart/templates/secret.yaml @@ -2,10 +2,10 @@ apiVersion: v1 kind: Secret metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }} + name: {{ include "vsphere-passthrough-node-controller.vcenterSecretName" . }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} type: Opaque stringData: VCENTER_HOST: {{ .Values.vcenter.host | quote }} diff --git a/chart/templates/serviceaccount.yaml b/chart/templates/serviceaccount.yaml index ccdb08d..d6dbc4d 100644 --- a/chart/templates/serviceaccount.yaml +++ b/chart/templates/serviceaccount.yaml @@ -2,10 +2,10 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.serviceAccountName" . 
}} + name: {{ include "vsphere-passthrough-node-controller.serviceAccountName" . }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} {{- with .Values.serviceAccount.annotations }} annotations: {{- toYaml . | nindent 4 }} diff --git a/chart/values.yaml b/chart/values.yaml index a8ffa3c..49b246f 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -1,7 +1,7 @@ -# Default values for gpu-node-vsphere-maintenance-controller. +# Default values for vsphere-passthrough-node-controller. image: - repository: ghcr.io/varashi/gpu-node-vsphere-maintenance-controller + repository: ghcr.io/varashi/vsphere-passthrough-node-controller # -- Overrides the image tag (defaults to Chart.appVersion) tag: "" pullPolicy: IfNotPresent From 587b32b9106da84025449ddb83cb8994ffa89ef5 Mon Sep 17 00:00:00 2001 From: varashi Date: Fri, 5 Jun 2026 08:22:14 +0200 Subject: [PATCH 3/4] ci: fix ruff format + hadolint DL3021 ruff format controller.py (line-length wrapping); Dockerfile COPY multi-arg destination must end with / (DL3021). Co-Authored-By: Claude Opus 4.8 --- Dockerfile | 2 +- controller.py | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 882b389..b05c339 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ WORKDIR /app RUN pip install --no-cache-dir --disable-pip-version-check \ pyVmomi==8.0.3.0.1 kubernetes==31.0.0 -COPY controller.py fence.py . +COPY controller.py fence.py ./ # Default entrypoint = maintenance controller. The fence controller (fence.py) # is the same image with the command overridden to `python -u fence.py`. 
diff --git a/controller.py b/controller.py index c77083d..2c0766e 100644 --- a/controller.py +++ b/controller.py @@ -483,14 +483,14 @@ def uncordon(self, node_name): def has_out_of_service_taint(self, node_name) -> bool: node = self.get_node(node_name) - return any( - t.key == OUT_OF_SERVICE_TAINT_KEY for t in (node.spec.taints or []) - ) + return any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in (node.spec.taints or [])) def _patch_taints(self, node_name, taints): # sanitize_for_serialization turns V1Taint objects into proper camelCase # API dicts (incl. timeAdded), so existing taints are preserved verbatim. - body = {"spec": {"taints": self.core.api_client.sanitize_for_serialization(taints)}} + body = { + "spec": {"taints": self.core.api_client.sanitize_for_serialization(taints)} + } self.core.patch_node(node_name, body) def apply_out_of_service_taint(self, node_name): @@ -518,7 +518,9 @@ def remove_out_of_service_taint(self, node_name): if not any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints): return if DRY_RUN: - log.info(f"[DRY RUN] Would un-fence {node_name} (remove out-of-service taint)") + log.info( + f"[DRY RUN] Would un-fence {node_name} (remove out-of-service taint)" + ) return log.info(f"Un-fencing {node_name}: removing {OUT_OF_SERVICE_TAINT_KEY} taint") kept = [t for t in taints if t.key != OUT_OF_SERVICE_TAINT_KEY] From f2adb23ec58fa6692a0240a1389e82c0824fb75b Mon Sep 17 00:00:00 2001 From: varashi Date: Fri, 5 Jun 2026 08:28:59 +0200 Subject: [PATCH 4/4] fix: address CodeRabbit review on #7 - controller.py: match out-of-service taint by full (key,value,effect) identity, not key-only, so a same-key taint with a different value/effect isn't mistaken for ours on fence/un-fence - chart fence.yaml: drop redundant `and` in SA conditional; remove `update` verb from fence ClusterRole (patch suffices for taint ops) - CHANGELOG: add 0.5.0/0.4.4 link defs, advance Unreleased baseline - README: bump chart examples 0.4.3 -> 0.5.0; note fence.py in layout 
Co-Authored-By: Claude Opus 4.8 --- CHANGELOG.md | 4 +++- README.md | 12 +++++++----- chart/templates/fence.yaml | 4 ++-- controller.py | 22 +++++++++++++++++----- 4 files changed, 29 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4dcd9b7..b2a97bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -186,7 +186,9 @@ No controller code change. Supply-chain and CI polish only. - Initial release: drain → power-off → wait-for-exit → power-on → uncordon, driven by edge-triggered `HostSystem.recentTask` polling. -[Unreleased]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.3...HEAD +[Unreleased]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.5.0...HEAD +[0.5.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.4...v0.5.0 +[0.4.4]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.3...v0.4.4 [0.4.3]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.2...v0.4.3 [0.4.2]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.1...v0.4.2 [0.4.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.0...v0.4.1 diff --git a/README.md b/README.md index 70970bf..fa7c9cf 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ The chart is published as an OCI artifact alongside the image: ```bash helm upgrade --install vsphere-passthrough-node \ oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller \ - --version 0.4.3 \ + --version 0.5.0 \ --namespace vsphere-passthrough-node --create-namespace \ --set vcenter.host=vcenter.example.com \ --set vcenter.user=maintenance-controller@vsphere.local \ @@ -176,7 +176,7 @@ spec: interval: 1h url: oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller ref: - tag: 0.4.3 + tag: 0.5.0 --- apiVersion: helm.toolkit.fluxcd.io/v2 kind: HelmRelease @@ -397,9 +397,11 @@ docker build -t 
ghcr.io/you/vsphere-passthrough-node-controller:dev . docker push ghcr.io/you/vsphere-passthrough-node-controller:dev ``` -Source layout is deliberately tiny — a single `controller.py` plus a -minimal Python 3.13 Dockerfile. Dependencies: `pyVmomi` and the official -Kubernetes Python client. +Source layout is deliberately tiny — `controller.py` (maintenance-mode +controller) plus the optional `fence.py` (crash-fence controller, which +reuses `controller.py`'s vCenter client and node↔VM mapping), on a minimal +Python 3.13 Dockerfile. Dependencies: `pyVmomi` and the official Kubernetes +Python client. ## Race conditions handled diff --git a/chart/templates/fence.yaml b/chart/templates/fence.yaml index 77f9a94..51d8424 100644 --- a/chart/templates/fence.yaml +++ b/chart/templates/fence.yaml @@ -2,7 +2,7 @@ {{- $fullname := include "vsphere-passthrough-node-controller.fullname" . -}} {{- $fenceName := printf "%s-fence" $fullname -}} {{- $fenceSA := ternary $fenceName (default "default" .Values.serviceAccount.name) .Values.serviceAccount.create -}} -{{- if and .Values.serviceAccount.create }} +{{- if .Values.serviceAccount.create }} apiVersion: v1 kind: ServiceAccount metadata: @@ -27,7 +27,7 @@ metadata: rules: - apiGroups: [""] resources: ["nodes"] - verbs: ["get", "list", "watch", "patch", "update"] + verbs: ["get", "list", "watch", "patch"] --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding diff --git a/controller.py b/controller.py index 2c0766e..af023d0 100644 --- a/controller.py +++ b/controller.py @@ -76,6 +76,7 @@ def _is_transient_k8s_error(exc: BaseException) -> bool: # Non-graceful node shutdown (used by the separate fence controller, fence.py). OUT_OF_SERVICE_TAINT_KEY = "node.kubernetes.io/out-of-service" OUT_OF_SERVICE_TAINT_VALUE = "nodeshutdown" +OUT_OF_SERVICE_TAINT_EFFECT = "NoExecute" # vm.runtime.connectionState values meaning the host managing the VM is gone # (a crash). A clean maintenance power-off leaves the VM 'connected'. 
VM_DEAD_CONNECTION_STATES = {"disconnected", "inaccessible", "orphaned"} @@ -481,9 +482,20 @@ def uncordon(self, node_name): log.info(f"Uncordoning {node_name}") self.core.patch_node(node_name, {"spec": {"unschedulable": False}}) + @staticmethod + def _is_out_of_service_taint(t) -> bool: + # Match the full (key, value, effect) identity of the taint THIS + # controller applies. A same-key taint with a different value/effect is + # not ours: don't treat the node as fenced, and never strip it on un-fence. + return ( + t.key == OUT_OF_SERVICE_TAINT_KEY + and t.value == OUT_OF_SERVICE_TAINT_VALUE + and t.effect == OUT_OF_SERVICE_TAINT_EFFECT + ) + def has_out_of_service_taint(self, node_name) -> bool: node = self.get_node(node_name) - return any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in (node.spec.taints or [])) + return any(self._is_out_of_service_taint(t) for t in (node.spec.taints or [])) def _patch_taints(self, node_name, taints): # sanitize_for_serialization turns V1Taint objects into proper camelCase @@ -497,7 +509,7 @@ def apply_out_of_service_taint(self, node_name): """Force-detach volumes + force-delete pods on a confirmed-dead node.""" node = self.get_node(node_name) taints = list(node.spec.taints or []) - if any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints): + if any(self._is_out_of_service_taint(t) for t in taints): return # already fenced if DRY_RUN: log.warning(f"[DRY RUN] Would FENCE {node_name} (out-of-service taint)") @@ -507,7 +519,7 @@ def apply_out_of_service_taint(self, node_name): k8s_client.V1Taint( key=OUT_OF_SERVICE_TAINT_KEY, value=OUT_OF_SERVICE_TAINT_VALUE, - effect="NoExecute", + effect=OUT_OF_SERVICE_TAINT_EFFECT, ) ) self._patch_taints(node_name, taints) @@ -515,7 +527,7 @@ def apply_out_of_service_taint(self, node_name): def remove_out_of_service_taint(self, node_name): node = self.get_node(node_name) taints = list(node.spec.taints or []) - if not any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints): + if not 
any(self._is_out_of_service_taint(t) for t in taints): return if DRY_RUN: log.info( @@ -523,7 +535,7 @@ def remove_out_of_service_taint(self, node_name): ) return log.info(f"Un-fencing {node_name}: removing {OUT_OF_SERVICE_TAINT_KEY} taint") - kept = [t for t in taints if t.key != OUT_OF_SERVICE_TAINT_KEY] + kept = [t for t in taints if not self._is_out_of_service_taint(t)] self._patch_taints(node_name, kept) def is_ready(self, node_name):