From 19c498c7614856c024667860d3481c216f924b0c Mon Sep 17 00:00:00 2001
From: varashi <frank@boeye.net>
Date: Wed, 3 Jun 2026 13:57:26 +0200
Subject: [PATCH 1/4] feat: optional crash-fence controller (out-of-service
 taint on host crash)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds fence.py — a second, optional Deployment (fence.enabled, off by
default) sharing this image and reusing the vCenter client + node↔VM map.
Automates non-graceful node shutdown for passthrough-GPU workers that
vSphere HA can't restart elsewhere during a host crash: applies the
node.kubernetes.io/out-of-service taint to a node confirmed dead by BOTH
k8s (NotReady) and vCenter (VM connectionState disconnected/inaccessible/
orphaned), sustained graceSeconds, so RWO volumes force-detach and pods
reschedule; removes it on recovery (connected + Ready).

Disjoint from the maintenance controller (clean power-off stays
'connected', only a crash goes 'disconnected') — no coordination needed.
Taint/un-taint only; power-on stays with vSphere HA. Own SA +
least-privilege ClusterRole (nodes only) + kill switch + dryRun.

test_fence.py covers the two-gate guard, grace hold, no-double-apply,
un-fence on recovery, and the partition/notfound guards (8/8).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .gitignore                 |   1 +
 CHANGELOG.md               |  23 +++++
 Dockerfile                 |   4 +-
 README.md                  |  43 ++++++++++
 chart/Chart.yaml           |   8 +-
 chart/templates/fence.yaml | 154 ++++++++++++++++++++++++++++++++++
 chart/values.yaml          |  24 ++++++
 controller.py              |  61 ++++++++++++++
 fence.py                   | 115 +++++++++++++++++++++++++
 test_fence.py              | 166 +++++++++++++++++++++++++++++++++++++
 10 files changed, 594 insertions(+), 5 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 chart/templates/fence.yaml
 create mode 100644 fence.py
 create mode 100644 test_fence.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c18dd8d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index cd0cfbd..ecb165e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,29 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 ## [Unreleased]
 
+## [0.5.0] — 2026-06-03
+
+### Added
+- **Crash-fence controller** (`fence.py`) — a second, optional Deployment
+  (`fence.enabled`, **off by default**) that shares this image and reuses the
+  vCenter client + node↔VM mapping. It automates non-graceful node shutdown for
+  passthrough-GPU workers that vSphere HA can't restart elsewhere during a host
+  crash: it applies the `node.kubernetes.io/out-of-service` taint to a node
+  confirmed dead by **both** gates — k8s `NotReady` **and** vCenter VM
+  `runtime.connectionState` in `{disconnected, inaccessible, orphaned}` —
+  sustained for `fence.graceSeconds`, so RWO volumes force-detach and stateful
+  pods reschedule. The taint is removed on recovery (VM `connected` + node
+  `Ready`).
+  - **Disjoint from the maintenance controller**: a clean (maintenance)
+    power-off leaves the VM `connected`; only a real host loss makes it
+    `disconnected`. The two controllers trigger on different vCenter facts and
+    never collide — no coordination contract needed.
+  - **Taint/un-taint only.** Power-on is owned by vSphere HA (it restarts
+    passthrough VMs on the original host once it returns); eviction is handled
+    by `tolerationSeconds` + the taint.
+  - Own ServiceAccount + least-privilege ClusterRole (`nodes` get/list/watch/
+    patch only) + kill switch (`fence.enabled`) + independent `fence.dryRun`.
+
 ## [0.4.4] — 2026-05-01
 
 ### Fixed
diff --git a/Dockerfile b/Dockerfile
index dfaa2c2..6bd5352 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,6 +11,8 @@ WORKDIR /app
 RUN pip install --no-cache-dir --disable-pip-version-check \
       pyVmomi==8.0.3.0.1 kubernetes==31.0.0
 
-COPY controller.py .
+COPY controller.py fence.py ./
 
+# Default entrypoint = maintenance controller. The fence controller (fence.py)
+# is the same image with the command overridden to `python -u fence.py`.
 CMD ["python", "-u", "controller.py"]
diff --git a/README.md b/README.md
index d83bd1e..c3f1879 100644
--- a/README.md
+++ b/README.md
@@ -90,6 +90,49 @@ Recovery: if a `powered-off` VM ends up on a different host (DRS race,
 operator intervention), the controller notices on the next poll and
 transitions it to `migrated`.
 
+## Crash-fence controller (optional, off by default)
+
+The maintenance controller above handles *planned* host maintenance. A separate,
+optional controller (`fence.py`) handles the *unplanned* case — a host **crash**.
+
+A passthrough-GPU VM can't be vSphere-HA-restarted on another host (the device
+pins it to the original host), so when its host crashes the node stays down. Its
+**RWO volume stays attached to the dead node** and Kubernetes won't auto-detach
+it — it can't distinguish a crash from a network partition, where force-detaching
+a still-live node's volume would corrupt it. So a rescheduled stateful pod hangs
+indefinitely on `Multi-Attach`. The fix is the `node.kubernetes.io/out-of-service`
+taint (non-graceful node shutdown), which force-detaches volumes and force-deletes
+pods — but it must only ever be applied to a node you've *confirmed* is dead.
+
+The fence controller provides that confirmation by requiring **two gates**,
+sustained for `fence.graceSeconds`:
+
+1. **k8s** — node `NotReady`, and
+2. **vCenter** — that node's VM `runtime.connectionState` is `disconnected`
+   (or `inaccessible`/`orphaned`). A host crash makes its VMs `disconnected`; a
+   clean (maintenance) power-off keeps them `connected` — so this signal is
+   **disjoint from the maintenance controller** and the two never collide.
+
+On recovery (VM `connected` + node `Ready`) the taint is removed. The controller
+does **taint/un-taint only** — power-on is left to vSphere HA (which restarts a
+passthrough VM on the original host once it reconnects), and eviction is handled
+by `tolerationSeconds` + the taint.
+
+Enable it (and start with `dryRun` to watch its decisions):
+
+```yaml
+fence:
+  enabled: true
+  dryRun: true        # logs "would fence" without tainting; flip to false when confident
+  graceSeconds: 60    # both gates must hold this long before fencing
+  pollSeconds: 20
+```
+
+It runs as its own Deployment with its own ServiceAccount, a least-privilege
+ClusterRole (`nodes` get/list/watch/patch only — no pods/eviction), and an
+independent kill switch (`fence.enabled`). It's **off by default** because a
+mis-fire is destructive; turn it on once you trust the signal in your environment.
+
 ## Requirements
 
 - Kubernetes 1.26+ (eviction API, server-side apply)
diff --git a/chart/Chart.yaml b/chart/Chart.yaml
index ac8c21a..ffb534f 100644
--- a/chart/Chart.yaml
+++ b/chart/Chart.yaml
@@ -6,8 +6,8 @@ description: |
   on ESXi hosts, drains the matching Kubernetes nodes, powers off the VMs,
   migrates them to a free GPU host if possible, and returns them to service.
 type: application
-version: 0.4.4
-appVersion: "0.4.4"
+version: 0.5.0
+appVersion: "0.5.0"
 kubeVersion: ">=1.26.0-0"
 keywords:
   - vsphere
@@ -25,5 +25,5 @@ maintainers:
 annotations:
   artifacthub.io/license: MIT
   artifacthub.io/changes: |
-    - kind: fixed
-      description: Tolerate vCLS VMs vanishing mid-enumeration during host maintenance entry.
+    - kind: added
+      description: Optional crash-fence controller (separate Deployment, off by default) — applies the out-of-service taint to a GPU node confirmed dead by both k8s (NotReady) and vCenter (VM disconnected) so RWO volumes force-detach and stateful pods reschedule.
diff --git a/chart/templates/fence.yaml b/chart/templates/fence.yaml
new file mode 100644
index 0000000..05db042
--- /dev/null
+++ b/chart/templates/fence.yaml
@@ -0,0 +1,154 @@
+{{- if .Values.fence.enabled }}
+{{- $fullname := include "gpu-node-vsphere-maintenance-controller.fullname" . -}}
+{{- $fenceName := printf "%s-fence" $fullname -}}
+{{- $fenceSA := ternary $fenceName (default "default" .Values.serviceAccount.name) .Values.serviceAccount.create -}}
+{{- if and .Values.serviceAccount.create }}
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: {{ $fenceName }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    app.kubernetes.io/component: fence
+---
+{{- end }}
+{{- if .Values.rbac.create }}
+# Least-privilege: the fence controller only reads node state and patches the
+# out-of-service taint. No pods/eviction — eviction is handled by
+# tolerationSeconds + the taint; power-on stays with vSphere HA.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ $fenceName }}
+  labels:
+    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    app.kubernetes.io/component: fence
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch", "patch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ $fenceName }}
+  labels:
+    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    app.kubernetes.io/component: fence
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ $fenceName }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ $fenceSA }}
+    namespace: {{ .Release.Namespace }}
+---
+{{- end }}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ $fenceName }}
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    app.kubernetes.io/component: fence
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 6 }}
+      app.kubernetes.io/component: fence
+  template:
+    metadata:
+      labels:
+        {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 8 }}
+        app.kubernetes.io/component: fence
+        {{- with .Values.podLabels }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- with .Values.podAnnotations }}
+      annotations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+    spec:
+      serviceAccountName: {{ $fenceSA }}
+      {{- with .Values.imagePullSecrets }}
+      imagePullSecrets:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.priorityClassName }}
+      priorityClassName: {{ . }}
+      {{- end }}
+      {{- with .Values.podSecurityContext }}
+      securityContext:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      containers:
+        - name: fence
+          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
+          imagePullPolicy: {{ .Values.image.pullPolicy }}
+          command: ["python", "-u", "fence.py"]
+          envFrom:
+            - secretRef:
+                name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }}
+          env:
+            - name: FENCE_POLL_SECONDS
+              value: {{ .Values.fence.pollSeconds | quote }}
+            - name: FENCE_GRACE_SECONDS
+              value: {{ .Values.fence.graceSeconds | quote }}
+            - name: DRY_RUN
+              value: {{ .Values.fence.dryRun | quote }}
+            - name: GPU_NODE_LABEL
+              value: {{ .Values.config.gpuNodeLabel | quote }}
+            - name: VCENTER_TLS_VERIFY
+              value: {{ .Values.vcenter.tlsVerify | quote }}
+            {{- if .Values.vcenter.caBundle.configMapName }}
+            - name: VCENTER_CA_BUNDLE
+              value: {{ printf "%s/%s" .Values.vcenter.caBundle.mountPath .Values.vcenter.caBundle.key | quote }}
+            {{- end }}
+            {{- with .Values.extraEnv }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          {{- if or .Values.vcenter.caBundle.configMapName .Values.extraVolumeMounts }}
+          volumeMounts:
+            {{- if .Values.vcenter.caBundle.configMapName }}
+            - name: vcenter-ca
+              mountPath: {{ .Values.vcenter.caBundle.mountPath }}
+              readOnly: true
+            {{- end }}
+            {{- with .Values.extraVolumeMounts }}
+            {{- toYaml . | nindent 12 }}
+            {{- end }}
+          {{- end }}
+          resources:
+            {{- toYaml .Values.fence.resources | nindent 12 }}
+          securityContext:
+            {{- toYaml .Values.securityContext | nindent 12 }}
+      {{- if or .Values.vcenter.caBundle.configMapName .Values.extraVolumes }}
+      volumes:
+        {{- if .Values.vcenter.caBundle.configMapName }}
+        - name: vcenter-ca
+          configMap:
+            name: {{ .Values.vcenter.caBundle.configMapName }}
+        {{- end }}
+        {{- with .Values.extraVolumes }}
+        {{- toYaml . | nindent 8 }}
+        {{- end }}
+      {{- end }}
+      {{- with .Values.nodeSelector }}
+      nodeSelector:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.affinity }}
+      affinity:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+      {{- with .Values.tolerations }}
+      tolerations:
+        {{- toYaml . | nindent 8 }}
+      {{- end }}
+{{- end }}
diff --git a/chart/values.yaml b/chart/values.yaml
index 9478122..a8ffa3c 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -53,6 +53,30 @@ config:
   gpuNodeLabel: "intel.feature.node.kubernetes.io/gpu=true"
   dryRun: false
 
+# Crash-fence controller — a SEPARATE Deployment (own RBAC + kill switch) that
+# shares this image. Applies the `node.kubernetes.io/out-of-service` taint to a
+# GPU node confirmed dead by BOTH k8s (NotReady) and vCenter (VM
+# connectionState disconnected/inaccessible/orphaned), sustained graceSeconds,
+# so RWO volumes force-detach and stateful pods reschedule; removes it on
+# recovery (VM connected + node Ready). Disjoint from maintenance (which keys
+# off maintenance-mode tasks and leaves a clean power-off 'connected'), so the
+# two never collide. Power-on stays with vSphere HA.
+#
+# OFF by default — fencing is the most destructive action in the cluster (a
+# mis-fire force-detaches a live/partitioned node's volume → corruption).
+# Enable explicitly once you trust the signal.
+fence:
+  enabled: false
+  pollSeconds: 20
+  graceSeconds: 60
+  dryRun: false
+  resources:
+    requests:
+      cpu: 25m
+      memory: 64Mi
+    limits:
+      memory: 128Mi
+
 serviceAccount:
   create: true
   name: ""
diff --git a/controller.py b/controller.py
index 01b5edc..c77083d 100644
--- a/controller.py
+++ b/controller.py
@@ -73,6 +73,13 @@ def _is_transient_k8s_error(exc: BaseException) -> bool:
 STATE_POWERED_OFF = "powered-off"
 STATE_MIGRATED = "migrated"
 
+# Non-graceful node shutdown (used by the separate fence controller, fence.py).
+OUT_OF_SERVICE_TAINT_KEY = "node.kubernetes.io/out-of-service"
+OUT_OF_SERVICE_TAINT_VALUE = "nodeshutdown"
+# vm.runtime.connectionState values meaning the host managing the VM is gone
+# (a crash). A clean maintenance power-off leaves the VM 'connected'.
+VM_DEAD_CONNECTION_STATES = {"disconnected", "inaccessible", "orphaned"}
+
 # ── Logging ───────────────────────────────────────────────────────────────────
 
 logging.basicConfig(
@@ -270,6 +277,17 @@ def get_vm_host(self, vm_name: str):
             return vm.runtime.host.name
         return None
 
+    def get_vm_connection_state(self, vm_name: str) -> str:
+        """vm.runtime.connectionState — 'connected' | 'disconnected' |
+        'inaccessible' | 'orphaned' | 'invalid'. Returns 'notfound' if the VM
+        is not in inventory. A crashed host makes its VMs 'disconnected'; a
+        clean (maintenance) power-off keeps them 'connected'."""
+        self._ensure_connected()
+        vm = self._find_vm(vm_name)
+        if vm is None:
+            return "notfound"
+        return str(vm.runtime.connectionState)
+
     def relocate_vm(self, vm_name: str, target_host_name: str):
         """Cold migrate a powered-off VM to the target ESXi host."""
         self._ensure_connected()
@@ -463,6 +481,49 @@ def uncordon(self, node_name):
         log.info(f"Uncordoning {node_name}")
         self.core.patch_node(node_name, {"spec": {"unschedulable": False}})
 
+    def has_out_of_service_taint(self, node_name) -> bool:
+        node = self.get_node(node_name)
+        return any(
+            t.key == OUT_OF_SERVICE_TAINT_KEY for t in (node.spec.taints or [])
+        )
+
+    def _patch_taints(self, node_name, taints):
+        # sanitize_for_serialization turns V1Taint objects into proper camelCase
+        # API dicts (incl. timeAdded), so existing taints are preserved verbatim.
+        body = {"spec": {"taints": self.core.api_client.sanitize_for_serialization(taints)}}
+        self.core.patch_node(node_name, body)
+
+    def apply_out_of_service_taint(self, node_name):
+        """Force-detach volumes + force-delete pods on a confirmed-dead node."""
+        node = self.get_node(node_name)
+        taints = list(node.spec.taints or [])
+        if any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints):
+            return  # already fenced
+        if DRY_RUN:
+            log.warning(f"[DRY RUN] Would FENCE {node_name} (out-of-service taint)")
+            return
+        log.warning(f"FENCING {node_name}: applying {OUT_OF_SERVICE_TAINT_KEY} taint")
+        taints.append(
+            k8s_client.V1Taint(
+                key=OUT_OF_SERVICE_TAINT_KEY,
+                value=OUT_OF_SERVICE_TAINT_VALUE,
+                effect="NoExecute",
+            )
+        )
+        self._patch_taints(node_name, taints)
+
+    def remove_out_of_service_taint(self, node_name):
+        node = self.get_node(node_name)
+        taints = list(node.spec.taints or [])
+        if not any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints):
+            return
+        if DRY_RUN:
+            log.info(f"[DRY RUN] Would un-fence {node_name} (remove out-of-service taint)")
+            return
+        log.info(f"Un-fencing {node_name}: removing {OUT_OF_SERVICE_TAINT_KEY} taint")
+        kept = [t for t in taints if t.key != OUT_OF_SERVICE_TAINT_KEY]
+        self._patch_taints(node_name, kept)
+
     def is_ready(self, node_name):
         node = self.get_node(node_name)
         for condition in node.status.conditions:
diff --git a/fence.py b/fence.py
new file mode 100644
index 0000000..b5df2f7
--- /dev/null
+++ b/fence.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""
+GPU-node crash fence controller (sibling of the maintenance controller).
+
+Automates non-graceful node shutdown for passthrough-GPU workers that can't be
+vSphere-HA-restarted during a host crash. When a host crashes, the node's RWO
+volume stays attached to the dead node and k8s won't auto-detach it (it can't
+tell a crash from a network partition) — so a rescheduled stateful pod hangs on
+`Multi-Attach`. The fix is the `node.kubernetes.io/out-of-service` taint, which
+force-detaches volumes + force-deletes pods. This controller applies it only on
+a node confirmed dead by BOTH k8s and vCenter, and removes it on recovery.
+
+Two-gate fence condition (both required, sustained for FENCE_GRACE_SECONDS):
+  1. k8s:     node NotReady
+  2. vCenter: that node's VM runtime.connectionState is disconnected/inaccessible/
+              orphaned (a crash — a clean maintenance power-off keeps it 'connected',
+              so this never collides with the maintenance controller)
+
+Un-fence when the node recovers: VM 'connected' AND node Ready.
+
+Deliberately scoped to taint/un-taint only:
+  * Power-on is owned by vSphere HA (it restarts passthrough VMs on the original
+    host once it returns).
+  * Graceful maintenance drains are owned by the maintenance controller
+    (controller.py), which keys off vCenter maintenance-mode tasks — a disjoint
+    signal, so the two never clash.
+
+Runs as its own Deployment with its own RBAC + kill switch (Values.fence.*).
+"""
+
+import os
+import time
+
+from kubernetes.client.rest import ApiException
+
+from controller import (
+    VM_DEAD_CONNECTION_STATES,
+    DRY_RUN,
+    K8sClient,
+    VSphereClient,
+    _is_transient_k8s_error,
+    log,
+)
+
+FENCE_POLL_SECONDS = int(os.environ.get("FENCE_POLL_SECONDS", "20"))
+# How long both gates must hold before we fence — guards against transient
+# blips (kubelet restart, brief vCenter/host comms loss) and lets vCenter's
+# host-down detection settle (it lags node-NotReady by tens of seconds).
+FENCE_GRACE_SECONDS = int(os.environ.get("FENCE_GRACE_SECONDS", "60"))
+
+
+class FenceController:
+    def __init__(self):
+        self.vsphere = VSphereClient()
+        self.k8s = K8sClient()
+        # node -> monotonic time the two-gate condition first became true
+        self.gate_since: dict[str, float] = {}
+
+    def reconcile(self):
+        for node in self.k8s.get_gpu_nodes():
+            name = node.metadata.name
+            ready = self.k8s.is_ready(name)
+            vm_state = self.vsphere.get_vm_connection_state(name)
+            tainted = self.k8s.has_out_of_service_taint(name)
+            dead_vm = vm_state in VM_DEAD_CONNECTION_STATES
+
+            # ── FENCE gate: NotReady AND vCenter says the VM is gone ──
+            if not ready and dead_vm:
+                first = self.gate_since.setdefault(name, time.monotonic())
+                elapsed = time.monotonic() - first
+                if tainted:
+                    continue  # already fenced
+                if elapsed >= FENCE_GRACE_SECONDS:
+                    log.warning(
+                        f"{name}: NotReady + vm={vm_state} for {elapsed:.0f}s "
+                        f"(>= {FENCE_GRACE_SECONDS}s grace) — fencing"
+                    )
+                    self.k8s.apply_out_of_service_taint(name)
+                else:
+                    log.info(
+                        f"{name}: fence-gate pending {elapsed:.0f}/"
+                        f"{FENCE_GRACE_SECONDS}s (NotReady + vm={vm_state})"
+                    )
+                continue
+
+            # ── Gates not both true: reset timer, and un-fence on recovery ──
+            self.gate_since.pop(name, None)
+            if tainted and ready and vm_state == "connected":
+                log.info(f"{name}: recovered (Ready + vm connected) — un-fencing")
+                self.k8s.remove_out_of_service_taint(name)
+
+    def run(self):
+        log.info(
+            f"Fence controller started — poll={FENCE_POLL_SECONDS}s, "
+            f"grace={FENCE_GRACE_SECONDS}s, dry_run={DRY_RUN}"
+        )
+        while True:
+            try:
+                self.reconcile()
+            except Exception as e:
+                if _is_transient_k8s_error(e):
+                    reason = getattr(e, "reason", type(e).__name__)
+                    status = getattr(e, "status", None)
+                    detail = f" status={status}" if status else ""
+                    log.warning(
+                        f"Transient k8s/transport error in fence loop: "
+                        f"{reason}{detail} — retrying next poll"
+                    )
+                else:
+                    log.exception("Unhandled error in fence loop")
+            time.sleep(FENCE_POLL_SECONDS)
+
+
+if __name__ == "__main__":
+    FenceController().run()
diff --git a/test_fence.py b/test_fence.py
new file mode 100644
index 0000000..2ec85f3
--- /dev/null
+++ b/test_fence.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+"""Unit test for the fence controller's two-gate logic — no cluster/vCenter
+needed. Stubs the heavy imports so `import controller`/`fence` works, then
+drives FenceController.reconcile() through the scenarios that matter for the
+out-of-service taint (the most destructive action in the cluster).
+
+Run: python3 test_fence.py
+"""
+import os
+import sys
+import types
+
+# ── stub heavy deps so `import controller` succeeds ──────────────────────────
+os.environ.setdefault("VCENTER_HOST", "x")
+os.environ.setdefault("VCENTER_USER", "x")
+os.environ.setdefault("VCENTER_PASSWORD", "x")
+
+
+def _mod(name):
+    m = types.ModuleType(name)
+    sys.modules[name] = m
+    return m
+
+
+class _ApiException(Exception):
+    def __init__(self, status=None):
+        self.status = status
+
+
+_u = _mod("urllib3")
+_ue = _mod("urllib3.exceptions")
+_u.exceptions = _ue
+for _n in ("ProtocolError", "ReadTimeoutError", "MaxRetryError", "ConnectionError"):
+    setattr(_ue, _n, type(_n, (Exception,), {}))
+
+_k = _mod("kubernetes")
+_kc = _mod("kubernetes.client")
+_kcr = _mod("kubernetes.client.rest")
+_kcfg = _mod("kubernetes.config")
+_k.client = _kc
+_k.config = _kcfg
+_kc.rest = _kcr
+_kcr.ApiException = _ApiException
+_kc.V1Taint = lambda **kw: types.SimpleNamespace(**kw)
+_kc.CoreV1Api = object
+_kc.V1Eviction = object
+_kc.V1ObjectMeta = object
+
+
+class _ConfigException(Exception):
+    pass
+
+
+_kcfg.ConfigException = _ConfigException
+_kcfg.load_incluster_config = lambda: None
+_kcfg.load_kube_config = lambda **kw: None
+
+_pv = _mod("pyVim")
+_pvc = _mod("pyVim.connect")
+_pv.connect = _pvc
+_pvc.SmartConnect = lambda **kw: None
+_pvm = _mod("pyVmomi")
+_pvm.vim = types.SimpleNamespace()
+_pvm.vmodl = types.SimpleNamespace()
+
+import fence  # noqa: E402
+
+
+# ── fakes ────────────────────────────────────────────────────────────────────
+class FakeK8s:
+    def __init__(self, nodes):
+        # nodes: name -> {"ready": bool, "tainted": bool}
+        self.nodes = nodes
+        self.applied = []
+        self.removed = []
+
+    def get_gpu_nodes(self):
+        return [types.SimpleNamespace(metadata=types.SimpleNamespace(name=n))
+                for n in self.nodes]
+
+    def is_ready(self, name):
+        return self.nodes[name]["ready"]
+
+    def has_out_of_service_taint(self, name):
+        return self.nodes[name]["tainted"]
+
+    def apply_out_of_service_taint(self, name):
+        self.nodes[name]["tainted"] = True
+        self.applied.append(name)
+
+    def remove_out_of_service_taint(self, name):
+        self.nodes[name]["tainted"] = False
+        self.removed.append(name)
+
+
+class FakeVS:
+    def __init__(self, states):
+        self.states = states  # name -> connectionState
+
+    def get_vm_connection_state(self, name):
+        return self.states[name]
+
+
+def make(nodes, states, grace=0):
+    fc = fence.FenceController.__new__(fence.FenceController)
+    fc.k8s = FakeK8s(nodes)
+    fc.vsphere = FakeVS(states)
+    fc.gate_since = {}
+    fence.FENCE_GRACE_SECONDS = grace
+    return fc
+
+
+results = []
+
+
+def check(desc, cond):
+    results.append((desc, cond))
+    print(("PASS " if cond else "FAIL ") + desc)
+
+
+# 1. healthy node — never fenced
+fc = make({"n": {"ready": True, "tainted": False}}, {"n": "connected"})
+fc.reconcile()
+check("healthy (Ready+connected) -> no fence", not fc.k8s.applied)
+
+# 2. NotReady but VM still connected (vCenter lag) — must NOT fence (one gate)
+fc = make({"n": {"ready": False, "tainted": False}}, {"n": "connected"})
+fc.reconcile()
+check("NotReady + vm connected -> no fence (one gate only)", not fc.k8s.applied)
+
+# 3. both gates but within grace — no fence yet
+fc = make({"n": {"ready": False, "tainted": False}}, {"n": "disconnected"}, grace=9999)
+fc.reconcile()
+check("NotReady + disconnected, within grace -> no fence yet", not fc.k8s.applied)
+
+# 4. both gates, grace elapsed -> fence
+fc = make({"n": {"ready": False, "tainted": False}}, {"n": "disconnected"}, grace=0)
+fc.reconcile()
+check("NotReady + disconnected, grace met -> FENCE", fc.k8s.applied == ["n"])
+
+# 5. already tainted -> no double apply
+fc = make({"n": {"ready": False, "tainted": True}}, {"n": "disconnected"}, grace=0)
+fc.reconcile()
+check("already tainted -> no re-apply", not fc.k8s.applied)
+
+# 6. recovery: tainted + Ready + connected -> un-fence
+fc = make({"n": {"ready": True, "tainted": True}}, {"n": "connected"})
+fc.reconcile()
+check("recovered (Ready+connected) while tainted -> un-fence", fc.k8s.removed == ["n"])
+
+# 7. tainted, node back Ready but vm still disconnected -> do NOT un-fence yet
+fc = make({"n": {"ready": True, "tainted": True}}, {"n": "disconnected"})
+fc.reconcile()
+check("tainted, Ready but vm disconnected -> stay fenced", not fc.k8s.removed)
+
+# 8. 'notfound' VM is not a dead-state -> no fence
+fc = make({"n": {"ready": False, "tainted": False}}, {"n": "notfound"}, grace=0)
+fc.reconcile()
+check("NotReady + vm notfound -> no fence", not fc.k8s.applied)
+
+print()
+failed = [d for d, c in results if not c]
+if failed:
+    print(f"{len(failed)} FAILED")
+    sys.exit(1)
+print(f"all {len(results)} passed")

From b5e35029020c40b6548222d864f5c43c1fda550f Mon Sep 17 00:00:00 2001
From: varashi <frank@boeye.net>
Date: Wed, 3 Jun 2026 18:01:47 +0200
Subject: [PATCH 2/4] refactor: rename to vsphere-passthrough-node-controller
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The product now covers two passthrough-node host-lifecycle concerns —
planned maintenance and (new) crash fencing — so "maintenance-controller"
no longer fits. Renames repo/image/chart/templates/helpers/docs to
vsphere-passthrough-node-controller (image + chart at
ghcr.io/varashi/[charts/]vsphere-passthrough-node-controller).

GitHub repo renamed (redirects cover old git/web refs). The internal
maintenance-state annotation domain (vsphere-maintenance.boeye.net/*) is
left unchanged — it's controller state, not user-facing, and no nodes
carry it currently.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .github/workflows/ci.yaml               |  2 +-
 .github/workflows/release.yaml          |  6 +-
 CHANGELOG.md                            | 26 ++++----
 Dockerfile                              |  6 +-
 README.md                               | 80 ++++++++++++-------------
 chart/Chart.yaml                        |  6 +-
 chart/templates/NOTES.txt               |  4 +-
 chart/templates/_helpers.tpl            | 24 ++++----
 chart/templates/clusterrole.yaml        |  4 +-
 chart/templates/clusterrolebinding.yaml |  8 +--
 chart/templates/configmap.yaml          |  4 +-
 chart/templates/deployment.yaml         | 14 ++---
 chart/templates/fence.yaml              | 16 ++---
 chart/templates/secret.yaml             |  4 +-
 chart/templates/serviceaccount.yaml     |  4 +-
 chart/values.yaml                       |  4 +-
 16 files changed, 106 insertions(+), 106 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 966ecd8..dd22033 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -75,4 +75,4 @@ jobs:
           context: .
           platforms: linux/amd64,linux/arm64
           push: false
-          tags: ci/gpu-node-vsphere-maintenance-controller:ci
+          tags: ci/vsphere-passthrough-node-controller:ci
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index f0fafc9..6365d8f 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -148,7 +148,7 @@ jobs:
         run: |
           version="${{ steps.ver.outputs.version }}"
           # helm push prints "Digest: sha256:..." to stderr; tee to capture.
-          helm push "gpu-node-vsphere-maintenance-controller-${version}.tgz" \
+          helm push "vsphere-passthrough-node-controller-${version}.tgz" \
             "oci://${{ env.CHART_REPO }}" 2>&1 | tee push.log
           digest=$(awk '/^Digest: /{print $2}' push.log)
           if [ -z "$digest" ]; then
@@ -160,7 +160,7 @@ jobs:
       - name: Cosign keyless sign (chart)
         env:
           DIGEST: ${{ steps.chart_push.outputs.digest }}
-          CHART_REF: ${{ env.CHART_REPO }}/gpu-node-vsphere-maintenance-controller
+          CHART_REF: ${{ env.CHART_REPO }}/vsphere-passthrough-node-controller
         run: cosign sign --yes "${CHART_REF}@${DIGEST}"
 
       - name: Create GitHub Release
@@ -173,4 +173,4 @@ jobs:
           prerelease: false
           files: |
             sbom.spdx.json
-            gpu-node-vsphere-maintenance-controller-${{ steps.ver.outputs.version }}.tgz
+            vsphere-passthrough-node-controller-${{ steps.ver.outputs.version }}.tgz
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ecb165e..4dcd9b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -107,7 +107,7 @@ No controller code change. Supply-chain and CI polish only.
   now consults the map instead of making a per-node `get_vm_host` round-trip
   to vCenter on every poll.
 - Minimal Helm chart under `chart/`, published as OCI to
-  `ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller`.
+  `ghcr.io/varashi/charts/vsphere-passthrough-node-controller`.
 - GitHub Actions: `ci.yaml` (ruff, hadolint, helm lint, buildx smoke build)
   on pull requests; `release.yaml` on `v*.*.*` tag push builds multi-arch
   images (amd64, arm64), cosign-signs keyless via OIDC, attaches SBOM and
@@ -186,15 +186,15 @@ No controller code change. Supply-chain and CI polish only.
 - Initial release: drain → power-off → wait-for-exit → power-on →
   uncordon, driven by edge-triggered `HostSystem.recentTask` polling.
 
-[Unreleased]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.3...HEAD
-[0.4.3]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.2...v0.4.3
-[0.4.2]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.1...v0.4.2
-[0.4.1]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.0...v0.4.1
-[0.4.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.3.0...v0.4.0
-[0.3.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.3...v0.3.0
-[0.2.3]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.2...v0.2.3
-[0.2.2]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.1...v0.2.2
-[0.2.1]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.0...v0.2.1
-[0.2.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.1.1...v0.2.0
-[0.1.1]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.1.0...v0.1.1
-[0.1.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/releases/tag/v0.1.0
+[Unreleased]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.3...HEAD
+[0.4.3]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.2...v0.4.3
+[0.4.2]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.1...v0.4.2
+[0.4.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.0...v0.4.1
+[0.4.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.3.0...v0.4.0
+[0.3.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.3...v0.3.0
+[0.2.3]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.2...v0.2.3
+[0.2.2]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.1...v0.2.2
+[0.2.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.0...v0.2.1
+[0.2.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.1.1...v0.2.0
+[0.1.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.1.0...v0.1.1
+[0.1.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/releases/tag/v0.1.0
diff --git a/Dockerfile b/Dockerfile
index 6bd5352..882b389 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,9 @@
 FROM python:3.13-slim
 
-LABEL org.opencontainers.image.title="gpu-node-vsphere-maintenance-controller"
+LABEL org.opencontainers.image.title="vsphere-passthrough-node-controller"
 LABEL org.opencontainers.image.description="Kubernetes controller that automates ESXi maintenance mode for worker nodes with PCI passthrough (GPU or otherwise)."
-LABEL org.opencontainers.image.source="https://github.com/Varashi/gpu-node-vsphere-maintenance-controller"
-LABEL org.opencontainers.image.documentation="https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/blob/main/README.md"
+LABEL org.opencontainers.image.source="https://github.com/Varashi/vsphere-passthrough-node-controller"
+LABEL org.opencontainers.image.documentation="https://github.com/Varashi/vsphere-passthrough-node-controller/blob/main/README.md"
 LABEL org.opencontainers.image.licenses="MIT"
 
 WORKDIR /app
diff --git a/README.md b/README.md
index c3f1879..70970bf 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# gpu-node-vsphere-maintenance-controller
+# vsphere-passthrough-node-controller
 
 A Kubernetes controller that safely handles ESXi maintenance mode transitions
 for worker nodes that use **PCI passthrough** (Intel ARC / NVIDIA / any
@@ -12,7 +12,7 @@ possible — migrates it (cold) to another GPU-capable host and brings it back
 online. When the original host exits maintenance, a powered-off node is
 returned to service automatically.
 
-Image: `ghcr.io/varashi/gpu-node-vsphere-maintenance-controller` (public).
+Image: `ghcr.io/varashi/vsphere-passthrough-node-controller` (public).
 
 ## Why this exists
 
@@ -149,10 +149,10 @@ mis-fire is destructive; turn it on once you trust the signal in your environmen
 The chart is published as an OCI artifact alongside the image:
 
 ```bash
-helm upgrade --install gpu-node-vsphere-maintenance \
-  oci://ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller \
+helm upgrade --install vsphere-passthrough-node \
+  oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller \
   --version 0.4.3 \
-  --namespace gpu-node-vsphere-maintenance --create-namespace \
+  --namespace vsphere-passthrough-node --create-namespace \
   --set vcenter.host=vcenter.example.com \
   --set vcenter.user=maintenance-controller@vsphere.local \
   --set vcenter.password='replace-me'
@@ -170,24 +170,24 @@ A Flux `HelmRelease` example:
 apiVersion: source.toolkit.fluxcd.io/v1
 kind: OCIRepository
 metadata:
-  name: gpu-node-vsphere-maintenance-controller
-  namespace: gpu-node-vsphere-maintenance
+  name: vsphere-passthrough-node-controller
+  namespace: vsphere-passthrough-node
 spec:
   interval: 1h
-  url: oci://ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller
+  url: oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller
   ref:
     tag: 0.4.3
 ---
 apiVersion: helm.toolkit.fluxcd.io/v2
 kind: HelmRelease
 metadata:
-  name: gpu-node-vsphere-maintenance-controller
-  namespace: gpu-node-vsphere-maintenance
+  name: vsphere-passthrough-node-controller
+  namespace: vsphere-passthrough-node
 spec:
   interval: 1h
   chartRef:
     kind: OCIRepository
-    name: gpu-node-vsphere-maintenance-controller
+    name: vsphere-passthrough-node-controller
   values:
     vcenter:
       existingSecret: vsphere-credentials
@@ -206,18 +206,18 @@ and credentials source as needed):
 apiVersion: v1
 kind: Namespace
 metadata:
-  name: gpu-node-vsphere-maintenance
+  name: vsphere-passthrough-node
 ---
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: gpu-node-vsphere-maintenance
-  namespace: gpu-node-vsphere-maintenance
+  name: vsphere-passthrough-node
+  namespace: vsphere-passthrough-node
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
-  name: gpu-node-vsphere-maintenance
+  name: vsphere-passthrough-node
 rules:
   - apiGroups: [""]
     resources: ["nodes"]
@@ -232,21 +232,21 @@ rules:
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
-  name: gpu-node-vsphere-maintenance
+  name: vsphere-passthrough-node
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
-  name: gpu-node-vsphere-maintenance
+  name: vsphere-passthrough-node
 subjects:
   - kind: ServiceAccount
-    name: gpu-node-vsphere-maintenance
-    namespace: gpu-node-vsphere-maintenance
+    name: vsphere-passthrough-node
+    namespace: vsphere-passthrough-node
 ---
 apiVersion: v1
 kind: Secret
 metadata:
   name: vsphere-credentials
-  namespace: gpu-node-vsphere-maintenance
+  namespace: vsphere-passthrough-node
 type: Opaque
 stringData:
   VCENTER_HOST: vcenter.example.com
@@ -257,7 +257,7 @@ apiVersion: v1
 kind: ConfigMap
 metadata:
   name: controller-config
-  namespace: gpu-node-vsphere-maintenance
+  namespace: vsphere-passthrough-node
 data:
   POLL_INTERVAL_SECONDS: "30"
   DRAIN_TIMEOUT_SECONDS: "600"
@@ -270,24 +270,24 @@ data:
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: gpu-node-vsphere-maintenance
-  namespace: gpu-node-vsphere-maintenance
+  name: vsphere-passthrough-node
+  namespace: vsphere-passthrough-node
 spec:
   replicas: 1
   strategy:
     type: Recreate
   selector:
     matchLabels:
-      app: gpu-node-vsphere-maintenance
+      app: vsphere-passthrough-node
   template:
     metadata:
       labels:
-        app: gpu-node-vsphere-maintenance
+        app: vsphere-passthrough-node
     spec:
-      serviceAccountName: gpu-node-vsphere-maintenance
+      serviceAccountName: vsphere-passthrough-node
       containers:
         - name: controller
-          image: ghcr.io/varashi/gpu-node-vsphere-maintenance-controller:v0.3.0
+          image: ghcr.io/varashi/vsphere-passthrough-node-controller:v0.3.0
           envFrom:
             - secretRef:
                 name: vsphere-credentials
@@ -330,7 +330,7 @@ apiVersion: external-secrets.io/v1
 kind: ExternalSecret
 metadata:
   name: vsphere-credentials
-  namespace: gpu-node-vsphere-maintenance
+  namespace: vsphere-passthrough-node
 spec:
   refreshInterval: 1h
   secretStoreRef:
@@ -393,8 +393,8 @@ are set.
 ## Building from source
 
 ```bash
-docker build -t ghcr.io/you/gpu-node-vsphere-maintenance-controller:dev .
-docker push  ghcr.io/you/gpu-node-vsphere-maintenance-controller:dev
+docker build -t ghcr.io/you/vsphere-passthrough-node-controller:dev .
+docker push  ghcr.io/you/vsphere-passthrough-node-controller:dev
 ```
 
 Source layout is deliberately tiny — a single `controller.py` plus a
@@ -444,12 +444,12 @@ git push origin v0.3.1
 The `release.yaml` GitHub Actions workflow then:
 
 1. Builds and pushes the controller image to
-   `ghcr.io/varashi/gpu-node-vsphere-maintenance-controller`, multi-arch
+   `ghcr.io/varashi/vsphere-passthrough-node-controller`, multi-arch
    (`linux/amd64`, `linux/arm64`), with cosign keyless signatures (GitHub
    OIDC), an SPDX SBOM, and a build-provenance attestation.
 2. Packages the Helm chart in `chart/` with `version` and `appVersion`
    matching the tag and pushes it to
-   `oci://ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller`.
+   `oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller`.
 3. Creates a GitHub Release whose body is extracted from the matching
    section of [`CHANGELOG.md`](./CHANGELOG.md) and attaches the SBOM and
    the packaged chart `.tgz`.
@@ -463,19 +463,19 @@ attached as a cosign attestation. Verify any of these before deploying:
 ```bash
 # 1. Image signature.
 cosign verify \
-  --certificate-identity-regexp 'https://github\.com/Varashi/gpu-node-vsphere-maintenance-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \
+  --certificate-identity-regexp 'https://github\.com/Varashi/vsphere-passthrough-node-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \
   --certificate-oidc-issuer https://token.actions.githubusercontent.com \
-  ghcr.io/varashi/gpu-node-vsphere-maintenance-controller:<tag>
+  ghcr.io/varashi/vsphere-passthrough-node-controller:<tag>
 
 # 2. SBOM attestation (SPDX).
 cosign verify-attestation --type spdxjson \
-  --certificate-identity-regexp 'https://github\.com/Varashi/gpu-node-vsphere-maintenance-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \
+  --certificate-identity-regexp 'https://github\.com/Varashi/vsphere-passthrough-node-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \
   --certificate-oidc-issuer https://token.actions.githubusercontent.com \
-  ghcr.io/varashi/gpu-node-vsphere-maintenance-controller:<tag>
+  ghcr.io/varashi/vsphere-passthrough-node-controller:<tag>
 
 # 3. SLSA build provenance (GitHub Attestations).
 gh attestation verify \
-  oci://ghcr.io/varashi/gpu-node-vsphere-maintenance-controller:<tag> \
+  oci://ghcr.io/varashi/vsphere-passthrough-node-controller:<tag> \
   --owner Varashi
 ```
 
@@ -484,9 +484,9 @@ chart digests too:
 
 ```bash
 cosign verify \
-  --certificate-identity-regexp 'https://github\.com/Varashi/gpu-node-vsphere-maintenance-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \
+  --certificate-identity-regexp 'https://github\.com/Varashi/vsphere-passthrough-node-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \
   --certificate-oidc-issuer https://token.actions.githubusercontent.com \
-  ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller:<tag>
+  ghcr.io/varashi/charts/vsphere-passthrough-node-controller:<tag>
 ```
 
 `helm pull --verify` is *not* supported against this chart: `--verify`
@@ -497,7 +497,7 @@ above instead.
 ## Version history
 
 See [`CHANGELOG.md`](./CHANGELOG.md) for the full history. Released tags
-are also listed on the [GitHub Releases](https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/releases)
+are also listed on the [GitHub Releases](https://github.com/Varashi/vsphere-passthrough-node-controller/releases)
 page with signed assets and SBOMs.
 
 ## License
diff --git a/chart/Chart.yaml b/chart/Chart.yaml
index ffb534f..b5d2b1a 100644
--- a/chart/Chart.yaml
+++ b/chart/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
-name: gpu-node-vsphere-maintenance-controller
+name: vsphere-passthrough-node-controller
 description: |
   Kubernetes controller that automates ESXi maintenance mode for worker
   nodes with PCI passthrough (GPU or otherwise). Detects EnterMaintenanceMode
@@ -16,9 +16,9 @@ keywords:
   - gpu
   - pci-passthrough
   - kubernetes
-home: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller
+home: https://github.com/Varashi/vsphere-passthrough-node-controller
 sources:
-  - https://github.com/Varashi/gpu-node-vsphere-maintenance-controller
+  - https://github.com/Varashi/vsphere-passthrough-node-controller
 maintainers:
   - name: Varashi
     url: https://github.com/Varashi
diff --git a/chart/templates/NOTES.txt b/chart/templates/NOTES.txt
index da1ad58..58e54ee 100644
--- a/chart/templates/NOTES.txt
+++ b/chart/templates/NOTES.txt
@@ -20,8 +20,8 @@ Note: vCenter TLS verification is DISABLED. To enable, pick one:
 
 Verify the controller is running:
 
-  kubectl -n {{ .Release.Namespace }} rollout status deploy/{{ include "gpu-node-vsphere-maintenance-controller.fullname" . }}
-  kubectl -n {{ .Release.Namespace }} logs deploy/{{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} -f
+  kubectl -n {{ .Release.Namespace }} rollout status deploy/{{ include "vsphere-passthrough-node-controller.fullname" . }}
+  kubectl -n {{ .Release.Namespace }} logs deploy/{{ include "vsphere-passthrough-node-controller.fullname" . }} -f
 
 GPU-worker Node label (`{{ .Values.config.gpuNodeLabel }}`) identifies the
 nodes this controller will drain when their ESXi host enters maintenance.
diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl
index 122c2a5..8984d62 100644
--- a/chart/templates/_helpers.tpl
+++ b/chart/templates/_helpers.tpl
@@ -1,14 +1,14 @@
 {{/*
 Expand the name of the chart.
 */}}
-{{- define "gpu-node-vsphere-maintenance-controller.name" -}}
+{{- define "vsphere-passthrough-node-controller.name" -}}
 {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
 {{- end }}
 
 {{/*
 Create a default fully qualified app name.
 */}}
-{{- define "gpu-node-vsphere-maintenance-controller.fullname" -}}
+{{- define "vsphere-passthrough-node-controller.fullname" -}}
 {{- if .Values.fullnameOverride }}
 {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
 {{- else }}
@@ -24,16 +24,16 @@ Create a default fully qualified app name.
 {{/*
 Chart label.
 */}}
-{{- define "gpu-node-vsphere-maintenance-controller.chart" -}}
+{{- define "vsphere-passthrough-node-controller.chart" -}}
 {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
 {{- end }}
 
 {{/*
 Common labels.
 */}}
-{{- define "gpu-node-vsphere-maintenance-controller.labels" -}}
-helm.sh/chart: {{ include "gpu-node-vsphere-maintenance-controller.chart" . }}
-{{ include "gpu-node-vsphere-maintenance-controller.selectorLabels" . }}
+{{- define "vsphere-passthrough-node-controller.labels" -}}
+helm.sh/chart: {{ include "vsphere-passthrough-node-controller.chart" . }}
+{{ include "vsphere-passthrough-node-controller.selectorLabels" . }}
 {{- if .Chart.AppVersion }}
 app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
 {{- end }}
@@ -43,17 +43,17 @@ app.kubernetes.io/managed-by: {{ .Release.Service }}
 {{/*
 Selector labels.
 */}}
-{{- define "gpu-node-vsphere-maintenance-controller.selectorLabels" -}}
-app.kubernetes.io/name: {{ include "gpu-node-vsphere-maintenance-controller.name" . }}
+{{- define "vsphere-passthrough-node-controller.selectorLabels" -}}
+app.kubernetes.io/name: {{ include "vsphere-passthrough-node-controller.name" . }}
 app.kubernetes.io/instance: {{ .Release.Name }}
 {{- end }}
 
 {{/*
 ServiceAccount name.
 */}}
-{{- define "gpu-node-vsphere-maintenance-controller.serviceAccountName" -}}
+{{- define "vsphere-passthrough-node-controller.serviceAccountName" -}}
 {{- if .Values.serviceAccount.create }}
-{{- default (include "gpu-node-vsphere-maintenance-controller.fullname" .) .Values.serviceAccount.name }}
+{{- default (include "vsphere-passthrough-node-controller.fullname" .) .Values.serviceAccount.name }}
 {{- else }}
 {{- default "default" .Values.serviceAccount.name }}
 {{- end }}
@@ -62,10 +62,10 @@ ServiceAccount name.
 {{/*
 Name of the Secret holding vCenter credentials (existing or rendered).
 */}}
-{{- define "gpu-node-vsphere-maintenance-controller.vcenterSecretName" -}}
+{{- define "vsphere-passthrough-node-controller.vcenterSecretName" -}}
 {{- if .Values.vcenter.existingSecret -}}
 {{- .Values.vcenter.existingSecret -}}
 {{- else -}}
-{{- printf "%s-vcenter" (include "gpu-node-vsphere-maintenance-controller.fullname" .) -}}
+{{- printf "%s-vcenter" (include "vsphere-passthrough-node-controller.fullname" .) -}}
 {{- end -}}
 {{- end }}
diff --git a/chart/templates/clusterrole.yaml b/chart/templates/clusterrole.yaml
index cb06b2f..fd2a04a 100644
--- a/chart/templates/clusterrole.yaml
+++ b/chart/templates/clusterrole.yaml
@@ -2,9 +2,9 @@
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
-  name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }}
+  name: {{ include "vsphere-passthrough-node-controller.fullname" . }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
 rules:
   - apiGroups: [""]
     resources: ["nodes"]
diff --git a/chart/templates/clusterrolebinding.yaml b/chart/templates/clusterrolebinding.yaml
index 6f15360..29fdb07 100644
--- a/chart/templates/clusterrolebinding.yaml
+++ b/chart/templates/clusterrolebinding.yaml
@@ -2,15 +2,15 @@
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
-  name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }}
+  name: {{ include "vsphere-passthrough-node-controller.fullname" . }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
-  name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }}
+  name: {{ include "vsphere-passthrough-node-controller.fullname" . }}
 subjects:
   - kind: ServiceAccount
-    name: {{ include "gpu-node-vsphere-maintenance-controller.serviceAccountName" . }}
+    name: {{ include "vsphere-passthrough-node-controller.serviceAccountName" . }}
     namespace: {{ .Release.Namespace }}
 {{- end }}
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index ebb8663..1ebb783 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -1,10 +1,10 @@
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }}
+  name: {{ include "vsphere-passthrough-node-controller.fullname" . }}
   namespace: {{ .Release.Namespace }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
 data:
   POLL_INTERVAL_SECONDS: {{ .Values.config.pollIntervalSeconds | quote }}
   DRAIN_TIMEOUT_SECONDS: {{ .Values.config.drainTimeoutSeconds | quote }}
diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml
index 4d9be0e..edf2377 100644
--- a/chart/templates/deployment.yaml
+++ b/chart/templates/deployment.yaml
@@ -1,21 +1,21 @@
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }}
+  name: {{ include "vsphere-passthrough-node-controller.fullname" . }}
   namespace: {{ .Release.Namespace }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
 spec:
   replicas: {{ .Values.replicaCount }}
   strategy:
     {{- toYaml .Values.strategy | nindent 4 }}
   selector:
     matchLabels:
-      {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 6 }}
+      {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 6 }}
   template:
     metadata:
       labels:
-        {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 8 }}
+        {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 8 }}
         {{- with .Values.podLabels }}
         {{- toYaml . | nindent 8 }}
         {{- end }}
@@ -25,7 +25,7 @@ spec:
         {{- toYaml . | nindent 8 }}
         {{- end }}
     spec:
-      serviceAccountName: {{ include "gpu-node-vsphere-maintenance-controller.serviceAccountName" . }}
+      serviceAccountName: {{ include "vsphere-passthrough-node-controller.serviceAccountName" . }}
       {{- with .Values.imagePullSecrets }}
       imagePullSecrets:
         {{- toYaml . | nindent 8 }}
@@ -43,9 +43,9 @@ spec:
           imagePullPolicy: {{ .Values.image.pullPolicy }}
           envFrom:
             - secretRef:
-                name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }}
+                name: {{ include "vsphere-passthrough-node-controller.vcenterSecretName" . }}
             - configMapRef:
-                name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }}
+                name: {{ include "vsphere-passthrough-node-controller.fullname" . }}
           {{- with .Values.extraEnv }}
           env:
             {{- toYaml . | nindent 12 }}
diff --git a/chart/templates/fence.yaml b/chart/templates/fence.yaml
index 05db042..77f9a94 100644
--- a/chart/templates/fence.yaml
+++ b/chart/templates/fence.yaml
@@ -1,5 +1,5 @@
 {{- if .Values.fence.enabled }}
-{{- $fullname := include "gpu-node-vsphere-maintenance-controller.fullname" . -}}
+{{- $fullname := include "vsphere-passthrough-node-controller.fullname" . -}}
 {{- $fenceName := printf "%s-fence" $fullname -}}
 {{- $fenceSA := ternary $fenceName (default "default" .Values.serviceAccount.name) .Values.serviceAccount.create -}}
 {{- if and .Values.serviceAccount.create }}
@@ -9,7 +9,7 @@ metadata:
   name: {{ $fenceName }}
   namespace: {{ .Release.Namespace }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
     app.kubernetes.io/component: fence
 ---
 {{- end }}
@@ -22,7 +22,7 @@ kind: ClusterRole
 metadata:
   name: {{ $fenceName }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
     app.kubernetes.io/component: fence
 rules:
   - apiGroups: [""]
@@ -34,7 +34,7 @@ kind: ClusterRoleBinding
 metadata:
   name: {{ $fenceName }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
     app.kubernetes.io/component: fence
 roleRef:
   apiGroup: rbac.authorization.k8s.io
@@ -52,7 +52,7 @@ metadata:
   name: {{ $fenceName }}
   namespace: {{ .Release.Namespace }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
     app.kubernetes.io/component: fence
 spec:
   replicas: 1
@@ -60,12 +60,12 @@ spec:
     type: Recreate
   selector:
     matchLabels:
-      {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 6 }}
+      {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 6 }}
       app.kubernetes.io/component: fence
   template:
     metadata:
       labels:
-        {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 8 }}
+        {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 8 }}
         app.kubernetes.io/component: fence
         {{- with .Values.podLabels }}
         {{- toYaml . | nindent 8 }}
@@ -94,7 +94,7 @@ spec:
           command: ["python", "-u", "fence.py"]
           envFrom:
             - secretRef:
-                name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }}
+                name: {{ include "vsphere-passthrough-node-controller.vcenterSecretName" . }}
           env:
             - name: FENCE_POLL_SECONDS
               value: {{ .Values.fence.pollSeconds | quote }}
diff --git a/chart/templates/secret.yaml b/chart/templates/secret.yaml
index d74d3cb..cb38a96 100644
--- a/chart/templates/secret.yaml
+++ b/chart/templates/secret.yaml
@@ -2,10 +2,10 @@
 apiVersion: v1
 kind: Secret
 metadata:
-  name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }}
+  name: {{ include "vsphere-passthrough-node-controller.vcenterSecretName" . }}
   namespace: {{ .Release.Namespace }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
 type: Opaque
 stringData:
   VCENTER_HOST: {{ .Values.vcenter.host | quote }}
diff --git a/chart/templates/serviceaccount.yaml b/chart/templates/serviceaccount.yaml
index ccdb08d..d6dbc4d 100644
--- a/chart/templates/serviceaccount.yaml
+++ b/chart/templates/serviceaccount.yaml
@@ -2,10 +2,10 @@
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: {{ include "gpu-node-vsphere-maintenance-controller.serviceAccountName" . }}
+  name: {{ include "vsphere-passthrough-node-controller.serviceAccountName" . }}
   namespace: {{ .Release.Namespace }}
   labels:
-    {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }}
+    {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }}
   {{- with .Values.serviceAccount.annotations }}
   annotations:
     {{- toYaml . | nindent 4 }}
diff --git a/chart/values.yaml b/chart/values.yaml
index a8ffa3c..49b246f 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -1,7 +1,7 @@
-# Default values for gpu-node-vsphere-maintenance-controller.
+# Default values for vsphere-passthrough-node-controller.
 
 image:
-  repository: ghcr.io/varashi/gpu-node-vsphere-maintenance-controller
+  repository: ghcr.io/varashi/vsphere-passthrough-node-controller
   # -- Overrides the image tag (defaults to Chart.appVersion)
   tag: ""
   pullPolicy: IfNotPresent

From 587b32b9106da84025449ddb83cb8994ffa89ef5 Mon Sep 17 00:00:00 2001
From: varashi <frank@boeye.net>
Date: Fri, 5 Jun 2026 08:22:14 +0200
Subject: [PATCH 3/4] ci: fix ruff format + hadolint DL3021

Apply ruff format to controller.py (line-length wrapping); a Dockerfile COPY
with more than one source must have a destination ending in / (hadolint DL3021).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 Dockerfile    |  2 +-
 controller.py | 12 +++++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 882b389..b05c339 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -11,7 +11,7 @@ WORKDIR /app
 RUN pip install --no-cache-dir --disable-pip-version-check \
       pyVmomi==8.0.3.0.1 kubernetes==31.0.0
 
-COPY controller.py fence.py .
+COPY controller.py fence.py ./
 
 # Default entrypoint = maintenance controller. The fence controller (fence.py)
 # is the same image with the command overridden to `python -u fence.py`.
diff --git a/controller.py b/controller.py
index c77083d..2c0766e 100644
--- a/controller.py
+++ b/controller.py
@@ -483,14 +483,14 @@ def uncordon(self, node_name):
 
     def has_out_of_service_taint(self, node_name) -> bool:
         node = self.get_node(node_name)
-        return any(
-            t.key == OUT_OF_SERVICE_TAINT_KEY for t in (node.spec.taints or [])
-        )
+        return any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in (node.spec.taints or []))
 
     def _patch_taints(self, node_name, taints):
         # sanitize_for_serialization turns V1Taint objects into proper camelCase
         # API dicts (incl. timeAdded), so existing taints are preserved verbatim.
-        body = {"spec": {"taints": self.core.api_client.sanitize_for_serialization(taints)}}
+        body = {
+            "spec": {"taints": self.core.api_client.sanitize_for_serialization(taints)}
+        }
         self.core.patch_node(node_name, body)
 
     def apply_out_of_service_taint(self, node_name):
@@ -518,7 +518,9 @@ def remove_out_of_service_taint(self, node_name):
         if not any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints):
             return
         if DRY_RUN:
-            log.info(f"[DRY RUN] Would un-fence {node_name} (remove out-of-service taint)")
+            log.info(
+                f"[DRY RUN] Would un-fence {node_name} (remove out-of-service taint)"
+            )
             return
         log.info(f"Un-fencing {node_name}: removing {OUT_OF_SERVICE_TAINT_KEY} taint")
         kept = [t for t in taints if t.key != OUT_OF_SERVICE_TAINT_KEY]

From f2adb23ec58fa6692a0240a1389e82c0824fb75b Mon Sep 17 00:00:00 2001
From: varashi <frank@boeye.net>
Date: Fri, 5 Jun 2026 08:28:59 +0200
Subject: [PATCH 4/4] fix: address CodeRabbit review on #7

- controller.py: match out-of-service taint by full (key,value,effect)
  identity, not key-only, so a same-key taint with a different
  value/effect isn't mistaken for ours on fence/un-fence
- chart fence.yaml: drop redundant `and` in SA conditional; remove
  `update` verb from fence ClusterRole (patch suffices for taint ops)
- CHANGELOG: add 0.5.0/0.4.4 link defs, advance Unreleased baseline
- README: bump chart examples 0.4.3 -> 0.5.0; note fence.py in layout

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 CHANGELOG.md               |  4 +++-
 README.md                  | 12 +++++++-----
 chart/templates/fence.yaml |  4 ++--
 controller.py              | 22 +++++++++++++++++-----
 4 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4dcd9b7..b2a97bc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -186,7 +186,9 @@ No controller code change. Supply-chain and CI polish only.
 - Initial release: drain → power-off → wait-for-exit → power-on →
   uncordon, driven by edge-triggered `HostSystem.recentTask` polling.
 
-[Unreleased]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.3...HEAD
+[Unreleased]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.5.0...HEAD
+[0.5.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.4...v0.5.0
+[0.4.4]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.3...v0.4.4
 [0.4.3]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.2...v0.4.3
 [0.4.2]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.1...v0.4.2
 [0.4.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.0...v0.4.1
diff --git a/README.md b/README.md
index 70970bf..fa7c9cf 100644
--- a/README.md
+++ b/README.md
@@ -151,7 +151,7 @@ The chart is published as an OCI artifact alongside the image:
 ```bash
 helm upgrade --install vsphere-passthrough-node \
   oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller \
-  --version 0.4.3 \
+  --version 0.5.0 \
   --namespace vsphere-passthrough-node --create-namespace \
   --set vcenter.host=vcenter.example.com \
   --set vcenter.user=maintenance-controller@vsphere.local \
@@ -176,7 +176,7 @@ spec:
   interval: 1h
   url: oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller
   ref:
-    tag: 0.4.3
+    tag: 0.5.0
 ---
 apiVersion: helm.toolkit.fluxcd.io/v2
 kind: HelmRelease
@@ -397,9 +397,11 @@ docker build -t ghcr.io/you/vsphere-passthrough-node-controller:dev .
 docker push  ghcr.io/you/vsphere-passthrough-node-controller:dev
 ```
 
-Source layout is deliberately tiny — a single `controller.py` plus a
-minimal Python 3.13 Dockerfile. Dependencies: `pyVmomi` and the official
-Kubernetes Python client.
+Source layout is deliberately tiny — `controller.py` (maintenance-mode
+controller) plus the optional `fence.py` (crash-fence controller, which
+reuses `controller.py`'s vCenter client and node↔VM mapping), on a minimal
+Python 3.13 Dockerfile. Dependencies: `pyVmomi` and the official Kubernetes
+Python client.
 
 ## Race conditions handled
 
diff --git a/chart/templates/fence.yaml b/chart/templates/fence.yaml
index 77f9a94..51d8424 100644
--- a/chart/templates/fence.yaml
+++ b/chart/templates/fence.yaml
@@ -2,7 +2,7 @@
 {{- $fullname := include "vsphere-passthrough-node-controller.fullname" . -}}
 {{- $fenceName := printf "%s-fence" $fullname -}}
 {{- $fenceSA := ternary $fenceName (default "default" .Values.serviceAccount.name) .Values.serviceAccount.create -}}
-{{- if and .Values.serviceAccount.create }}
+{{- if .Values.serviceAccount.create }}
 apiVersion: v1
 kind: ServiceAccount
 metadata:
@@ -27,7 +27,7 @@ metadata:
 rules:
   - apiGroups: [""]
     resources: ["nodes"]
-    verbs: ["get", "list", "watch", "patch", "update"]
+    verbs: ["get", "list", "watch", "patch"]
 ---
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
diff --git a/controller.py b/controller.py
index 2c0766e..af023d0 100644
--- a/controller.py
+++ b/controller.py
@@ -76,6 +76,7 @@ def _is_transient_k8s_error(exc: BaseException) -> bool:
 # Non-graceful node shutdown (used by the separate fence controller, fence.py).
 OUT_OF_SERVICE_TAINT_KEY = "node.kubernetes.io/out-of-service"
 OUT_OF_SERVICE_TAINT_VALUE = "nodeshutdown"
+OUT_OF_SERVICE_TAINT_EFFECT = "NoExecute"
 # vm.runtime.connectionState values meaning the host managing the VM is gone
 # (a crash). A clean maintenance power-off leaves the VM 'connected'.
 VM_DEAD_CONNECTION_STATES = {"disconnected", "inaccessible", "orphaned"}
@@ -481,9 +482,20 @@ def uncordon(self, node_name):
         log.info(f"Uncordoning {node_name}")
         self.core.patch_node(node_name, {"spec": {"unschedulable": False}})
 
+    @staticmethod
+    def _is_out_of_service_taint(t) -> bool:
+        # Match the full (key, value, effect) identity of the taint THIS
+        # controller applies. A same-key taint with a different value/effect is
+        # not ours: don't treat the node as fenced, and never strip it on un-fence.
+        return (
+            t.key == OUT_OF_SERVICE_TAINT_KEY
+            and t.value == OUT_OF_SERVICE_TAINT_VALUE
+            and t.effect == OUT_OF_SERVICE_TAINT_EFFECT
+        )
+
     def has_out_of_service_taint(self, node_name) -> bool:
         node = self.get_node(node_name)
-        return any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in (node.spec.taints or []))
+        return any(self._is_out_of_service_taint(t) for t in (node.spec.taints or []))
 
     def _patch_taints(self, node_name, taints):
         # sanitize_for_serialization turns V1Taint objects into proper camelCase
@@ -497,7 +509,7 @@ def apply_out_of_service_taint(self, node_name):
         """Force-detach volumes + force-delete pods on a confirmed-dead node."""
         node = self.get_node(node_name)
         taints = list(node.spec.taints or [])
-        if any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints):
+        if any(self._is_out_of_service_taint(t) for t in taints):
             return  # already fenced
         if DRY_RUN:
             log.warning(f"[DRY RUN] Would FENCE {node_name} (out-of-service taint)")
@@ -507,7 +519,7 @@ def apply_out_of_service_taint(self, node_name):
             k8s_client.V1Taint(
                 key=OUT_OF_SERVICE_TAINT_KEY,
                 value=OUT_OF_SERVICE_TAINT_VALUE,
-                effect="NoExecute",
+                effect=OUT_OF_SERVICE_TAINT_EFFECT,
             )
         )
         self._patch_taints(node_name, taints)
@@ -515,7 +527,7 @@ def apply_out_of_service_taint(self, node_name):
     def remove_out_of_service_taint(self, node_name):
         node = self.get_node(node_name)
         taints = list(node.spec.taints or [])
-        if not any(t.key == OUT_OF_SERVICE_TAINT_KEY for t in taints):
+        if not any(self._is_out_of_service_taint(t) for t in taints):
             return
         if DRY_RUN:
             log.info(
@@ -523,7 +535,7 @@ def remove_out_of_service_taint(self, node_name):
             )
             return
         log.info(f"Un-fencing {node_name}: removing {OUT_OF_SERVICE_TAINT_KEY} taint")
-        kept = [t for t in taints if t.key != OUT_OF_SERVICE_TAINT_KEY]
+        kept = [t for t in taints if not self._is_out_of_service_taint(t)]
         self._patch_taints(node_name, kept)
 
     def is_ready(self, node_name):