diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 966ecd8..dd22033 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -75,4 +75,4 @@ jobs: context: . platforms: linux/amd64,linux/arm64 push: false - tags: ci/gpu-node-vsphere-maintenance-controller:ci + tags: ci/vsphere-passthrough-node-controller:ci diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f0fafc9..6365d8f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -148,7 +148,7 @@ jobs: run: | version="${{ steps.ver.outputs.version }}" # helm push prints "Digest: sha256:..." to stderr; tee to capture. - helm push "gpu-node-vsphere-maintenance-controller-${version}.tgz" \ + helm push "vsphere-passthrough-node-controller-${version}.tgz" \ "oci://${{ env.CHART_REPO }}" 2>&1 | tee push.log digest=$(awk '/^Digest: /{print $2}' push.log) if [ -z "$digest" ]; then @@ -160,7 +160,7 @@ jobs: - name: Cosign keyless sign (chart) env: DIGEST: ${{ steps.chart_push.outputs.digest }} - CHART_REF: ${{ env.CHART_REPO }}/gpu-node-vsphere-maintenance-controller + CHART_REF: ${{ env.CHART_REPO }}/vsphere-passthrough-node-controller run: cosign sign --yes "${CHART_REF}@${DIGEST}" - name: Create GitHub Release @@ -173,4 +173,4 @@ jobs: prerelease: false files: | sbom.spdx.json - gpu-node-vsphere-maintenance-controller-${{ steps.ver.outputs.version }}.tgz + vsphere-passthrough-node-controller-${{ steps.ver.outputs.version }}.tgz diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c18dd8d --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/ diff --git a/CHANGELOG.md b/CHANGELOG.md index cd0cfbd..b2a97bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,29 @@ this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] +## [0.5.0] — 2026-06-03 + +### Added +- **Crash-fence controller** (`fence.py`) — a second, optional Deployment + (`fence.enabled`, **off by 
default**) that shares this image and reuses the + vCenter client + node↔VM mapping. It automates non-graceful node shutdown for + passthrough-GPU workers that vSphere HA can't restart elsewhere during a host + crash: it applies the `node.kubernetes.io/out-of-service` taint to a node + confirmed dead by **both** gates — k8s `NotReady` **and** vCenter VM + `runtime.connectionState` in `{disconnected, inaccessible, orphaned}` — + sustained for `fence.graceSeconds`, so RWO volumes force-detach and stateful + pods reschedule. The taint is removed on recovery (VM `connected` + node + `Ready`). + - **Disjoint from the maintenance controller**: a clean (maintenance) + power-off leaves the VM `connected`; only a real host loss makes it + `disconnected`. The two controllers trigger on different vCenter facts and + never collide — no coordination contract needed. + - **Taint/un-taint only.** Power-on is owned by vSphere HA (it restarts + passthrough VMs on the original host once it returns); eviction is handled + by `tolerationSeconds` + the taint. + - Own ServiceAccount + least-privilege ClusterRole (`nodes` get/list/watch/ + patch only) + kill switch (`fence.enabled`) + independent `fence.dryRun`. + ## [0.4.4] — 2026-05-01 ### Fixed @@ -84,7 +107,7 @@ No controller code change. Supply-chain and CI polish only. now consults the map instead of making a per-node `get_vm_host` round-trip to vCenter on every poll. - Minimal Helm chart under `chart/`, published as OCI to - `ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller`. + `ghcr.io/varashi/charts/vsphere-passthrough-node-controller`. - GitHub Actions: `ci.yaml` (ruff, hadolint, helm lint, buildx smoke build) on pull requests; `release.yaml` on `v*.*.*` tag push builds multi-arch images (amd64, arm64), cosign-signs keyless via OIDC, attaches SBOM and @@ -163,15 +186,17 @@ No controller code change. Supply-chain and CI polish only. 
- Initial release: drain → power-off → wait-for-exit → power-on → uncordon, driven by edge-triggered `HostSystem.recentTask` polling. -[Unreleased]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.3...HEAD -[0.4.3]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.2...v0.4.3 -[0.4.2]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.1...v0.4.2 -[0.4.1]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.4.0...v0.4.1 -[0.4.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.3.0...v0.4.0 -[0.3.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.3...v0.3.0 -[0.2.3]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.2...v0.2.3 -[0.2.2]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.1...v0.2.2 -[0.2.1]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.2.0...v0.2.1 -[0.2.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.1.1...v0.2.0 -[0.1.1]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/compare/v0.1.0...v0.1.1 -[0.1.0]: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/releases/tag/v0.1.0 +[Unreleased]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.5.0...HEAD +[0.5.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.4...v0.5.0 +[0.4.4]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.3...v0.4.4 +[0.4.3]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.2...v0.4.3 +[0.4.2]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.1...v0.4.2 +[0.4.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.4.0...v0.4.1 +[0.4.0]: 
https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.3.0...v0.4.0 +[0.3.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.3...v0.3.0 +[0.2.3]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.2...v0.2.3 +[0.2.2]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.1...v0.2.2 +[0.2.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.2.0...v0.2.1 +[0.2.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.1.1...v0.2.0 +[0.1.1]: https://github.com/Varashi/vsphere-passthrough-node-controller/compare/v0.1.0...v0.1.1 +[0.1.0]: https://github.com/Varashi/vsphere-passthrough-node-controller/releases/tag/v0.1.0 diff --git a/Dockerfile b/Dockerfile index dfaa2c2..b05c339 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,9 @@ FROM python:3.13-slim -LABEL org.opencontainers.image.title="gpu-node-vsphere-maintenance-controller" +LABEL org.opencontainers.image.title="vsphere-passthrough-node-controller" LABEL org.opencontainers.image.description="Kubernetes controller that automates ESXi maintenance mode for worker nodes with PCI passthrough (GPU or otherwise)." -LABEL org.opencontainers.image.source="https://github.com/Varashi/gpu-node-vsphere-maintenance-controller" -LABEL org.opencontainers.image.documentation="https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/blob/main/README.md" +LABEL org.opencontainers.image.source="https://github.com/Varashi/vsphere-passthrough-node-controller" +LABEL org.opencontainers.image.documentation="https://github.com/Varashi/vsphere-passthrough-node-controller/blob/main/README.md" LABEL org.opencontainers.image.licenses="MIT" WORKDIR /app @@ -11,6 +11,8 @@ WORKDIR /app RUN pip install --no-cache-dir --disable-pip-version-check \ pyVmomi==8.0.3.0.1 kubernetes==31.0.0 -COPY controller.py . +COPY controller.py fence.py ./ +# Default entrypoint = maintenance controller. 
The fence controller (fence.py) +# is the same image with the command overridden to `python -u fence.py`. CMD ["python", "-u", "controller.py"] diff --git a/README.md b/README.md index d83bd1e..fa7c9cf 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# gpu-node-vsphere-maintenance-controller +# vsphere-passthrough-node-controller A Kubernetes controller that safely handles ESXi maintenance mode transitions for worker nodes that use **PCI passthrough** (Intel ARC / NVIDIA / any @@ -12,7 +12,7 @@ possible — migrates it (cold) to another GPU-capable host and brings it back online. When the original host exits maintenance, a powered-off node is returned to service automatically. -Image: `ghcr.io/varashi/gpu-node-vsphere-maintenance-controller` (public). +Image: `ghcr.io/varashi/vsphere-passthrough-node-controller` (public). ## Why this exists @@ -90,6 +90,49 @@ Recovery: if a `powered-off` VM ends up on a different host (DRS race, operator intervention), the controller notices on the next poll and transitions it to `migrated`. +## Crash-fence controller (optional, off by default) + +The maintenance controller above handles *planned* host maintenance. A separate, +optional controller (`fence.py`) handles the *unplanned* case — a host **crash**. + +A passthrough-GPU VM can't be vSphere-HA-restarted on another host (the device +pins it to the original host), so when its host crashes the node stays down. Its +**RWO volume stays attached to the dead node** and Kubernetes won't auto-detach +it — it can't distinguish a crash from a network partition, where force-detaching +a still-live node's volume would corrupt it. So a rescheduled stateful pod hangs +indefinitely on `Multi-Attach`. The fix is the `node.kubernetes.io/out-of-service` +taint (non-graceful node shutdown), which force-detaches volumes and force-deletes +pods — but it must only ever be applied to a node you've *confirmed* is dead. 
+ +The fence controller provides that confirmation by requiring **two gates**, +sustained for `fence.graceSeconds`: + +1. **k8s** — node `NotReady`, and +2. **vCenter** — that node's VM `runtime.connectionState` is `disconnected` + (or `inaccessible`/`orphaned`). A host crash makes its VMs `disconnected`; a + clean (maintenance) power-off keeps them `connected` — so this signal is + **disjoint from the maintenance controller** and the two never collide. + +On recovery (VM `connected` + node `Ready`) the taint is removed. The controller +does **taint/un-taint only** — power-on is left to vSphere HA (which restarts a +passthrough VM on the original host once it reconnects), and eviction is handled +by `tolerationSeconds` + the taint. + +Enable it (and start with `dryRun` to watch its decisions): + +```yaml +fence: + enabled: true + dryRun: true # logs "would fence" without tainting; flip to false when confident + graceSeconds: 60 # both gates must hold this long before fencing + pollSeconds: 20 +``` + +It runs as its own Deployment with its own ServiceAccount, a least-privilege +ClusterRole (`nodes` get/list/watch/patch only — no pods/eviction), and an +independent kill switch (`fence.enabled`). It's **off by default** because a +mis-fire is destructive; turn it on once you trust the signal in your environment. + ## Requirements - Kubernetes 1.26+ (eviction API, server-side apply) @@ -106,10 +149,10 @@ transitions it to `migrated`. 
The chart is published as an OCI artifact alongside the image: ```bash -helm upgrade --install gpu-node-vsphere-maintenance \ - oci://ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller \ - --version 0.4.3 \ - --namespace gpu-node-vsphere-maintenance --create-namespace \ +helm upgrade --install vsphere-passthrough-node \ + oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller \ + --version 0.5.0 \ + --namespace vsphere-passthrough-node --create-namespace \ --set vcenter.host=vcenter.example.com \ --set vcenter.user=maintenance-controller@vsphere.local \ --set vcenter.password='replace-me' @@ -127,24 +170,24 @@ A Flux `HelmRelease` example: apiVersion: source.toolkit.fluxcd.io/v1 kind: OCIRepository metadata: - name: gpu-node-vsphere-maintenance-controller - namespace: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node-controller + namespace: vsphere-passthrough-node spec: interval: 1h - url: oci://ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller + url: oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller ref: - tag: 0.4.3 + tag: 0.5.0 --- apiVersion: helm.toolkit.fluxcd.io/v2 kind: HelmRelease metadata: - name: gpu-node-vsphere-maintenance-controller - namespace: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node-controller + namespace: vsphere-passthrough-node spec: interval: 1h chartRef: kind: OCIRepository - name: gpu-node-vsphere-maintenance-controller + name: vsphere-passthrough-node-controller values: vcenter: existingSecret: vsphere-credentials @@ -163,18 +206,18 @@ and credentials source as needed): apiVersion: v1 kind: Namespace metadata: - name: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node --- apiVersion: v1 kind: ServiceAccount metadata: - name: gpu-node-vsphere-maintenance - namespace: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node + namespace: vsphere-passthrough-node --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: 
gpu-node-vsphere-maintenance + name: vsphere-passthrough-node rules: - apiGroups: [""] resources: ["nodes"] @@ -189,21 +232,21 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node subjects: - kind: ServiceAccount - name: gpu-node-vsphere-maintenance - namespace: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node + namespace: vsphere-passthrough-node --- apiVersion: v1 kind: Secret metadata: name: vsphere-credentials - namespace: gpu-node-vsphere-maintenance + namespace: vsphere-passthrough-node type: Opaque stringData: VCENTER_HOST: vcenter.example.com @@ -214,7 +257,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: controller-config - namespace: gpu-node-vsphere-maintenance + namespace: vsphere-passthrough-node data: POLL_INTERVAL_SECONDS: "30" DRAIN_TIMEOUT_SECONDS: "600" @@ -227,24 +270,24 @@ data: apiVersion: apps/v1 kind: Deployment metadata: - name: gpu-node-vsphere-maintenance - namespace: gpu-node-vsphere-maintenance + name: vsphere-passthrough-node + namespace: vsphere-passthrough-node spec: replicas: 1 strategy: type: Recreate selector: matchLabels: - app: gpu-node-vsphere-maintenance + app: vsphere-passthrough-node template: metadata: labels: - app: gpu-node-vsphere-maintenance + app: vsphere-passthrough-node spec: - serviceAccountName: gpu-node-vsphere-maintenance + serviceAccountName: vsphere-passthrough-node containers: - name: controller - image: ghcr.io/varashi/gpu-node-vsphere-maintenance-controller:v0.3.0 + image: ghcr.io/varashi/vsphere-passthrough-node-controller:v0.3.0 envFrom: - secretRef: name: vsphere-credentials @@ -287,7 +330,7 @@ apiVersion: external-secrets.io/v1 kind: ExternalSecret metadata: name: vsphere-credentials - namespace: gpu-node-vsphere-maintenance + namespace: 
vsphere-passthrough-node spec: refreshInterval: 1h secretStoreRef: @@ -350,13 +393,15 @@ are set. ## Building from source ```bash -docker build -t ghcr.io/you/gpu-node-vsphere-maintenance-controller:dev . -docker push ghcr.io/you/gpu-node-vsphere-maintenance-controller:dev +docker build -t ghcr.io/you/vsphere-passthrough-node-controller:dev . +docker push ghcr.io/you/vsphere-passthrough-node-controller:dev ``` -Source layout is deliberately tiny — a single `controller.py` plus a -minimal Python 3.13 Dockerfile. Dependencies: `pyVmomi` and the official -Kubernetes Python client. +Source layout is deliberately tiny — `controller.py` (maintenance-mode +controller) plus the optional `fence.py` (crash-fence controller, which +reuses `controller.py`'s vCenter client and node↔VM mapping), on a minimal +Python 3.13 Dockerfile. Dependencies: `pyVmomi` and the official Kubernetes +Python client. ## Race conditions handled @@ -401,12 +446,12 @@ git push origin v0.3.1 The `release.yaml` GitHub Actions workflow then: 1. Builds and pushes the controller image to - `ghcr.io/varashi/gpu-node-vsphere-maintenance-controller`, multi-arch + `ghcr.io/varashi/vsphere-passthrough-node-controller`, multi-arch (`linux/amd64`, `linux/arm64`), with cosign keyless signatures (GitHub OIDC), an SPDX SBOM, and a build-provenance attestation. 2. Packages the Helm chart in `chart/` with `version` and `appVersion` matching the tag and pushes it to - `oci://ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller`. + `oci://ghcr.io/varashi/charts/vsphere-passthrough-node-controller`. 3. Creates a GitHub Release whose body is extracted from the matching section of [`CHANGELOG.md`](./CHANGELOG.md) and attaches the SBOM and the packaged chart `.tgz`. @@ -420,19 +465,19 @@ attached as a cosign attestation. Verify any of these before deploying: ```bash # 1. Image signature. 
cosign verify \ - --certificate-identity-regexp 'https://github\.com/Varashi/gpu-node-vsphere-maintenance-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ + --certificate-identity-regexp 'https://github\.com/Varashi/vsphere-passthrough-node-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ - ghcr.io/varashi/gpu-node-vsphere-maintenance-controller: + ghcr.io/varashi/vsphere-passthrough-node-controller: # 2. SBOM attestation (SPDX). cosign verify-attestation --type spdxjson \ - --certificate-identity-regexp 'https://github\.com/Varashi/gpu-node-vsphere-maintenance-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ + --certificate-identity-regexp 'https://github\.com/Varashi/vsphere-passthrough-node-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ - ghcr.io/varashi/gpu-node-vsphere-maintenance-controller: + ghcr.io/varashi/vsphere-passthrough-node-controller: # 3. SLSA build provenance (GitHub Attestations). gh attestation verify \ - oci://ghcr.io/varashi/gpu-node-vsphere-maintenance-controller: \ + oci://ghcr.io/varashi/vsphere-passthrough-node-controller: \ --owner Varashi ``` @@ -441,9 +486,9 @@ chart digests too: ```bash cosign verify \ - --certificate-identity-regexp 'https://github\.com/Varashi/gpu-node-vsphere-maintenance-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ + --certificate-identity-regexp 'https://github\.com/Varashi/vsphere-passthrough-node-controller/\.github/workflows/release\.yaml@refs/tags/v.*' \ --certificate-oidc-issuer https://token.actions.githubusercontent.com \ - ghcr.io/varashi/charts/gpu-node-vsphere-maintenance-controller: + ghcr.io/varashi/charts/vsphere-passthrough-node-controller: ``` `helm pull --verify` is *not* supported against this chart: `--verify` @@ -454,7 +499,7 @@ above instead. 
## Version history See [`CHANGELOG.md`](./CHANGELOG.md) for the full history. Released tags -are also listed on the [GitHub Releases](https://github.com/Varashi/gpu-node-vsphere-maintenance-controller/releases) +are also listed on the [GitHub Releases](https://github.com/Varashi/vsphere-passthrough-node-controller/releases) page with signed assets and SBOMs. ## License diff --git a/chart/Chart.yaml b/chart/Chart.yaml index ac8c21a..b5d2b1a 100644 --- a/chart/Chart.yaml +++ b/chart/Chart.yaml @@ -1,13 +1,13 @@ apiVersion: v2 -name: gpu-node-vsphere-maintenance-controller +name: vsphere-passthrough-node-controller description: | Kubernetes controller that automates ESXi maintenance mode for worker nodes with PCI passthrough (GPU or otherwise). Detects EnterMaintenanceMode on ESXi hosts, drains the matching Kubernetes nodes, powers off the VMs, migrates them to a free GPU host if possible, and returns them to service. type: application -version: 0.4.4 -appVersion: "0.4.4" +version: 0.5.0 +appVersion: "0.5.0" kubeVersion: ">=1.26.0-0" keywords: - vsphere @@ -16,14 +16,14 @@ keywords: - gpu - pci-passthrough - kubernetes -home: https://github.com/Varashi/gpu-node-vsphere-maintenance-controller +home: https://github.com/Varashi/vsphere-passthrough-node-controller sources: - - https://github.com/Varashi/gpu-node-vsphere-maintenance-controller + - https://github.com/Varashi/vsphere-passthrough-node-controller maintainers: - name: Varashi url: https://github.com/Varashi annotations: artifacthub.io/license: MIT artifacthub.io/changes: | - - kind: fixed - description: Tolerate vCLS VMs vanishing mid-enumeration during host maintenance entry. + - kind: added + description: Optional crash-fence controller (separate Deployment, off by default) — applies the out-of-service taint to a GPU node confirmed dead by both k8s (NotReady) and vCenter (VM disconnected) so RWO volumes force-detach and stateful pods reschedule. 
diff --git a/chart/templates/NOTES.txt b/chart/templates/NOTES.txt index da1ad58..58e54ee 100644 --- a/chart/templates/NOTES.txt +++ b/chart/templates/NOTES.txt @@ -20,8 +20,8 @@ Note: vCenter TLS verification is DISABLED. To enable, pick one: Verify the controller is running: - kubectl -n {{ .Release.Namespace }} rollout status deploy/{{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} - kubectl -n {{ .Release.Namespace }} logs deploy/{{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} -f + kubectl -n {{ .Release.Namespace }} rollout status deploy/{{ include "vsphere-passthrough-node-controller.fullname" . }} + kubectl -n {{ .Release.Namespace }} logs deploy/{{ include "vsphere-passthrough-node-controller.fullname" . }} -f GPU-worker Node label (`{{ .Values.config.gpuNodeLabel }}`) identifies the nodes this controller will drain when their ESXi host enters maintenance. diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl index 122c2a5..8984d62 100644 --- a/chart/templates/_helpers.tpl +++ b/chart/templates/_helpers.tpl @@ -1,14 +1,14 @@ {{/* Expand the name of the chart. */}} -{{- define "gpu-node-vsphere-maintenance-controller.name" -}} +{{- define "vsphere-passthrough-node-controller.name" -}} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Create a default fully qualified app name. */}} -{{- define "gpu-node-vsphere-maintenance-controller.fullname" -}} +{{- define "vsphere-passthrough-node-controller.fullname" -}} {{- if .Values.fullnameOverride }} {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} {{- else }} @@ -24,16 +24,16 @@ Create a default fully qualified app name. {{/* Chart label. */}} -{{- define "gpu-node-vsphere-maintenance-controller.chart" -}} +{{- define "vsphere-passthrough-node-controller.chart" -}} {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} {{- end }} {{/* Common labels. 
*/}} -{{- define "gpu-node-vsphere-maintenance-controller.labels" -}} -helm.sh/chart: {{ include "gpu-node-vsphere-maintenance-controller.chart" . }} -{{ include "gpu-node-vsphere-maintenance-controller.selectorLabels" . }} +{{- define "vsphere-passthrough-node-controller.labels" -}} +helm.sh/chart: {{ include "vsphere-passthrough-node-controller.chart" . }} +{{ include "vsphere-passthrough-node-controller.selectorLabels" . }} {{- if .Chart.AppVersion }} app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} {{- end }} @@ -43,17 +43,17 @@ app.kubernetes.io/managed-by: {{ .Release.Service }} {{/* Selector labels. */}} -{{- define "gpu-node-vsphere-maintenance-controller.selectorLabels" -}} -app.kubernetes.io/name: {{ include "gpu-node-vsphere-maintenance-controller.name" . }} +{{- define "vsphere-passthrough-node-controller.selectorLabels" -}} +app.kubernetes.io/name: {{ include "vsphere-passthrough-node-controller.name" . }} app.kubernetes.io/instance: {{ .Release.Name }} {{- end }} {{/* ServiceAccount name. */}} -{{- define "gpu-node-vsphere-maintenance-controller.serviceAccountName" -}} +{{- define "vsphere-passthrough-node-controller.serviceAccountName" -}} {{- if .Values.serviceAccount.create }} -{{- default (include "gpu-node-vsphere-maintenance-controller.fullname" .) .Values.serviceAccount.name }} +{{- default (include "vsphere-passthrough-node-controller.fullname" .) .Values.serviceAccount.name }} {{- else }} {{- default "default" .Values.serviceAccount.name }} {{- end }} @@ -62,10 +62,10 @@ ServiceAccount name. {{/* Name of the Secret holding vCenter credentials (existing or rendered). */}} -{{- define "gpu-node-vsphere-maintenance-controller.vcenterSecretName" -}} +{{- define "vsphere-passthrough-node-controller.vcenterSecretName" -}} {{- if .Values.vcenter.existingSecret -}} {{- .Values.vcenter.existingSecret -}} {{- else -}} -{{- printf "%s-vcenter" (include "gpu-node-vsphere-maintenance-controller.fullname" .) 
-}} +{{- printf "%s-vcenter" (include "vsphere-passthrough-node-controller.fullname" .) -}} {{- end -}} {{- end }} diff --git a/chart/templates/clusterrole.yaml b/chart/templates/clusterrole.yaml index cb06b2f..fd2a04a 100644 --- a/chart/templates/clusterrole.yaml +++ b/chart/templates/clusterrole.yaml @@ -2,9 +2,9 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} rules: - apiGroups: [""] resources: ["nodes"] diff --git a/chart/templates/clusterrolebinding.yaml b/chart/templates/clusterrolebinding.yaml index 6f15360..29fdb07 100644 --- a/chart/templates/clusterrolebinding.yaml +++ b/chart/templates/clusterrolebinding.yaml @@ -2,15 +2,15 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} subjects: - kind: ServiceAccount - name: {{ include "gpu-node-vsphere-maintenance-controller.serviceAccountName" . }} + name: {{ include "vsphere-passthrough-node-controller.serviceAccountName" . 
}} namespace: {{ .Release.Namespace }} {{- end }} diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index ebb8663..1ebb783 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -1,10 +1,10 @@ apiVersion: v1 kind: ConfigMap metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} data: POLL_INTERVAL_SECONDS: {{ .Values.config.pollIntervalSeconds | quote }} DRAIN_TIMEOUT_SECONDS: {{ .Values.config.drainTimeoutSeconds | quote }} diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml index 4d9be0e..edf2377 100644 --- a/chart/templates/deployment.yaml +++ b/chart/templates/deployment.yaml @@ -1,21 +1,21 @@ apiVersion: apps/v1 kind: Deployment metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} spec: replicas: {{ .Values.replicaCount }} strategy: {{- toYaml .Values.strategy | nindent 4 }} selector: matchLabels: - {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 6 }} + {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 6 }} template: metadata: labels: - {{- include "gpu-node-vsphere-maintenance-controller.selectorLabels" . | nindent 8 }} + {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 8 }} {{- with .Values.podLabels }} {{- toYaml . | nindent 8 }} {{- end }} @@ -25,7 +25,7 @@ spec: {{- toYaml . 
| nindent 8 }} {{- end }} spec: - serviceAccountName: {{ include "gpu-node-vsphere-maintenance-controller.serviceAccountName" . }} + serviceAccountName: {{ include "vsphere-passthrough-node-controller.serviceAccountName" . }} {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} @@ -43,9 +43,9 @@ spec: imagePullPolicy: {{ .Values.image.pullPolicy }} envFrom: - secretRef: - name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }} + name: {{ include "vsphere-passthrough-node-controller.vcenterSecretName" . }} - configMapRef: - name: {{ include "gpu-node-vsphere-maintenance-controller.fullname" . }} + name: {{ include "vsphere-passthrough-node-controller.fullname" . }} {{- with .Values.extraEnv }} env: {{- toYaml . | nindent 12 }} diff --git a/chart/templates/fence.yaml b/chart/templates/fence.yaml new file mode 100644 index 0000000..51d8424 --- /dev/null +++ b/chart/templates/fence.yaml @@ -0,0 +1,154 @@ +{{- if .Values.fence.enabled }} +{{- $fullname := include "vsphere-passthrough-node-controller.fullname" . -}} +{{- $fenceName := printf "%s-fence" $fullname -}} +{{- $fenceSA := ternary $fenceName (default "default" .Values.serviceAccount.name) .Values.serviceAccount.create -}} +{{- if .Values.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ $fenceName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} + app.kubernetes.io/component: fence +--- +{{- end }} +{{- if .Values.rbac.create }} +# Least-privilege: the fence controller only reads node state and patches the +# out-of-service taint. No pods/eviction (the taint + tolerationSeconds handle +# eviction once it lands; vSphere HA only owns VM power-on). +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ $fenceName }} + labels: + {{- include "vsphere-passthrough-node-controller.labels" . 
| nindent 4 }} + app.kubernetes.io/component: fence +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ $fenceName }} + labels: + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} + app.kubernetes.io/component: fence +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ $fenceName }} +subjects: + - kind: ServiceAccount + name: {{ $fenceSA }} + namespace: {{ .Release.Namespace }} +--- +{{- end }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $fenceName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} + app.kubernetes.io/component: fence +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: fence + template: + metadata: + labels: + {{- include "vsphere-passthrough-node-controller.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: fence + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ $fenceSA }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.priorityClassName }} + priorityClassName: {{ . }} + {{- end }} + {{- with .Values.podSecurityContext }} + securityContext: + {{- toYaml . | nindent 8 }} + {{- end }} + containers: + - name: fence + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["python", "-u", "fence.py"] + envFrom: + - secretRef: + name: {{ include "vsphere-passthrough-node-controller.vcenterSecretName" . 
}} + env: + - name: FENCE_POLL_SECONDS + value: {{ .Values.fence.pollSeconds | quote }} + - name: FENCE_GRACE_SECONDS + value: {{ .Values.fence.graceSeconds | quote }} + - name: DRY_RUN + value: {{ .Values.fence.dryRun | quote }} + - name: GPU_NODE_LABEL + value: {{ .Values.config.gpuNodeLabel | quote }} + - name: VCENTER_TLS_VERIFY + value: {{ .Values.vcenter.tlsVerify | quote }} + {{- if .Values.vcenter.caBundle.configMapName }} + - name: VCENTER_CA_BUNDLE + value: {{ printf "%s/%s" .Values.vcenter.caBundle.mountPath .Values.vcenter.caBundle.key | quote }} + {{- end }} + {{- with .Values.extraEnv }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- if or .Values.vcenter.caBundle.configMapName .Values.extraVolumeMounts }} + volumeMounts: + {{- if .Values.vcenter.caBundle.configMapName }} + - name: vcenter-ca + mountPath: {{ .Values.vcenter.caBundle.mountPath }} + readOnly: true + {{- end }} + {{- with .Values.extraVolumeMounts }} + {{- toYaml . | nindent 12 }} + {{- end }} + {{- end }} + resources: + {{- toYaml .Values.fence.resources | nindent 12 }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + {{- if or .Values.vcenter.caBundle.configMapName .Values.extraVolumes }} + volumes: + {{- if .Values.vcenter.caBundle.configMapName }} + - name: vcenter-ca + configMap: + name: {{ .Values.vcenter.caBundle.configMapName }} + {{- end }} + {{- with .Values.extraVolumes }} + {{- toYaml . | nindent 8 }} + {{- end }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} diff --git a/chart/templates/secret.yaml b/chart/templates/secret.yaml index d74d3cb..cb38a96 100644 --- a/chart/templates/secret.yaml +++ b/chart/templates/secret.yaml @@ -2,10 +2,10 @@ apiVersion: v1 kind: Secret metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.vcenterSecretName" . }} + name: {{ include "vsphere-passthrough-node-controller.vcenterSecretName" . }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} type: Opaque stringData: VCENTER_HOST: {{ .Values.vcenter.host | quote }} diff --git a/chart/templates/serviceaccount.yaml b/chart/templates/serviceaccount.yaml index ccdb08d..d6dbc4d 100644 --- a/chart/templates/serviceaccount.yaml +++ b/chart/templates/serviceaccount.yaml @@ -2,10 +2,10 @@ apiVersion: v1 kind: ServiceAccount metadata: - name: {{ include "gpu-node-vsphere-maintenance-controller.serviceAccountName" . }} + name: {{ include "vsphere-passthrough-node-controller.serviceAccountName" . }} namespace: {{ .Release.Namespace }} labels: - {{- include "gpu-node-vsphere-maintenance-controller.labels" . | nindent 4 }} + {{- include "vsphere-passthrough-node-controller.labels" . | nindent 4 }} {{- with .Values.serviceAccount.annotations }} annotations: {{- toYaml . | nindent 4 }} diff --git a/chart/values.yaml b/chart/values.yaml index 9478122..49b246f 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -1,7 +1,7 @@ -# Default values for gpu-node-vsphere-maintenance-controller. +# Default values for vsphere-passthrough-node-controller. 
image: - repository: ghcr.io/varashi/gpu-node-vsphere-maintenance-controller + repository: ghcr.io/varashi/vsphere-passthrough-node-controller # -- Overrides the image tag (defaults to Chart.appVersion) tag: "" pullPolicy: IfNotPresent @@ -53,6 +53,30 @@ config: gpuNodeLabel: "intel.feature.node.kubernetes.io/gpu=true" dryRun: false +# Crash-fence controller — a SEPARATE Deployment (own RBAC + kill switch) that +# shares this image. Applies the `node.kubernetes.io/out-of-service` taint to a +# GPU node confirmed dead by BOTH k8s (NotReady) and vCenter (VM +# connectionState disconnected/inaccessible/orphaned), sustained graceSeconds, +# so RWO volumes force-detach and stateful pods reschedule; removes it on +# recovery (VM connected + node Ready). Disjoint from maintenance (which keys +# off maintenance-mode tasks and leaves a clean power-off 'connected'), so the +# two never collide. Power-on stays with vSphere HA. +# +# OFF by default — fencing is the most destructive action in the cluster (a +# mis-fire force-detaches a live/partitioned node's volume → corruption). +# Enable explicitly once you trust the signal. +fence: + enabled: false + pollSeconds: 20 + graceSeconds: 60 + dryRun: false + resources: + requests: + cpu: 25m + memory: 64Mi + limits: + memory: 128Mi + serviceAccount: create: true name: "" diff --git a/controller.py b/controller.py index 01b5edc..af023d0 100644 --- a/controller.py +++ b/controller.py @@ -73,6 +73,14 @@ def _is_transient_k8s_error(exc: BaseException) -> bool: STATE_POWERED_OFF = "powered-off" STATE_MIGRATED = "migrated" +# Non-graceful node shutdown (used by the separate fence controller, fence.py). +OUT_OF_SERVICE_TAINT_KEY = "node.kubernetes.io/out-of-service" +OUT_OF_SERVICE_TAINT_VALUE = "nodeshutdown" +OUT_OF_SERVICE_TAINT_EFFECT = "NoExecute" +# vm.runtime.connectionState values meaning the host managing the VM is gone +# (a crash). A clean maintenance power-off leaves the VM 'connected'. 
+VM_DEAD_CONNECTION_STATES = {"disconnected", "inaccessible", "orphaned"} + # ── Logging ─────────────────────────────────────────────────────────────────── logging.basicConfig( @@ -270,6 +278,17 @@ def get_vm_host(self, vm_name: str): return vm.runtime.host.name return None + def get_vm_connection_state(self, vm_name: str) -> str: + """vm.runtime.connectionState — 'connected' | 'disconnected' | + 'inaccessible' | 'orphaned' | 'invalid'. Returns 'notfound' if the VM + is not in inventory. A crashed host makes its VMs 'disconnected'; a + clean (maintenance) power-off keeps them 'connected'.""" + self._ensure_connected() + vm = self._find_vm(vm_name) + if vm is None: + return "notfound" + return str(vm.runtime.connectionState) + def relocate_vm(self, vm_name: str, target_host_name: str): """Cold migrate a powered-off VM to the target ESXi host.""" self._ensure_connected() @@ -463,6 +482,62 @@ def uncordon(self, node_name): log.info(f"Uncordoning {node_name}") self.core.patch_node(node_name, {"spec": {"unschedulable": False}}) + @staticmethod + def _is_out_of_service_taint(t) -> bool: + # Match the full (key, value, effect) identity of the taint THIS + # controller applies. A same-key taint with a different value/effect is + # not ours: don't treat the node as fenced, and never strip it on un-fence. + return ( + t.key == OUT_OF_SERVICE_TAINT_KEY + and t.value == OUT_OF_SERVICE_TAINT_VALUE + and t.effect == OUT_OF_SERVICE_TAINT_EFFECT + ) + + def has_out_of_service_taint(self, node_name) -> bool: + node = self.get_node(node_name) + return any(self._is_out_of_service_taint(t) for t in (node.spec.taints or [])) + + def _patch_taints(self, node_name, taints): + # sanitize_for_serialization turns V1Taint objects into proper camelCase + # API dicts (incl. timeAdded), so existing taints are preserved verbatim. 
+ body = { + "spec": {"taints": self.core.api_client.sanitize_for_serialization(taints)} + } + self.core.patch_node(node_name, body) + + def apply_out_of_service_taint(self, node_name): + """Force-detach volumes + force-delete pods on a confirmed-dead node.""" + node = self.get_node(node_name) + taints = list(node.spec.taints or []) + if any(self._is_out_of_service_taint(t) for t in taints): + return # already fenced + if DRY_RUN: + log.warning(f"[DRY RUN] Would FENCE {node_name} (out-of-service taint)") + return + log.warning(f"FENCING {node_name}: applying {OUT_OF_SERVICE_TAINT_KEY} taint") + taints.append( + k8s_client.V1Taint( + key=OUT_OF_SERVICE_TAINT_KEY, + value=OUT_OF_SERVICE_TAINT_VALUE, + effect=OUT_OF_SERVICE_TAINT_EFFECT, + ) + ) + self._patch_taints(node_name, taints) + + def remove_out_of_service_taint(self, node_name): + node = self.get_node(node_name) + taints = list(node.spec.taints or []) + if not any(self._is_out_of_service_taint(t) for t in taints): + return + if DRY_RUN: + log.info( + f"[DRY RUN] Would un-fence {node_name} (remove out-of-service taint)" + ) + return + log.info(f"Un-fencing {node_name}: removing {OUT_OF_SERVICE_TAINT_KEY} taint") + kept = [t for t in taints if not self._is_out_of_service_taint(t)] + self._patch_taints(node_name, kept) + def is_ready(self, node_name): node = self.get_node(node_name) for condition in node.status.conditions: diff --git a/fence.py b/fence.py new file mode 100644 index 0000000..b5df2f7 --- /dev/null +++ b/fence.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +""" +GPU-node crash fence controller (sibling of the maintenance controller). + +Automates non-graceful node shutdown for passthrough-GPU workers that can't be +vSphere-HA-restarted during a host crash. When a host crashes, the node's RWO +volume stays attached to the dead node and k8s won't auto-detach it (it can't +tell a crash from a network partition) — so a rescheduled stateful pod hangs on +`Multi-Attach`. 
The fix is the `node.kubernetes.io/out-of-service` taint, which +force-detaches volumes + force-deletes pods. This controller applies it only on +a node confirmed dead by BOTH k8s and vCenter, and removes it on recovery. + +Two-gate fence condition (both required, sustained for FENCE_GRACE_SECONDS): + 1. k8s: node NotReady + 2. vCenter: that node's VM runtime.connectionState is disconnected/inaccessible/orphaned + (a crash — a clean maintenance power-off keeps it 'connected', so + this never collides with the maintenance controller) + +Un-fence when the node recovers: VM 'connected' AND node Ready. + +Deliberately scoped to taint/un-taint only: + * Power-on is owned by vSphere HA (it restarts passthrough VMs on the original + host once it returns). + * Graceful maintenance drains are owned by the maintenance controller + (controller.py), which keys off vCenter maintenance-mode tasks — a disjoint + signal, so the two never clash. + +Runs as its own Deployment with its own RBAC + kill switch (Values.fence.*). +""" + +import os +import time + +from kubernetes.client.rest import ApiException + +from controller import ( + VM_DEAD_CONNECTION_STATES, + DRY_RUN, + K8sClient, + VSphereClient, + _is_transient_k8s_error, + log, +) + +FENCE_POLL_SECONDS = int(os.environ.get("FENCE_POLL_SECONDS", "20")) +# How long both gates must hold before we fence — guards against transient +# blips (kubelet restart, brief vCenter/host comms loss) and lets vCenter's +# host-down detection settle (it lags node-NotReady by tens of seconds).
+FENCE_GRACE_SECONDS = int(os.environ.get("FENCE_GRACE_SECONDS", "60")) + + +class FenceController: + def __init__(self): + self.vsphere = VSphereClient() + self.k8s = K8sClient() + # node -> monotonic time the two-gate condition first became true + self.gate_since: dict[str, float] = {} + + def reconcile(self): + for node in self.k8s.get_gpu_nodes(): + name = node.metadata.name + ready = self.k8s.is_ready(name) + vm_state = self.vsphere.get_vm_connection_state(name) + tainted = self.k8s.has_out_of_service_taint(name) + dead_vm = vm_state in VM_DEAD_CONNECTION_STATES + + # ── FENCE gate: NotReady AND vCenter says the VM is gone ── + if not ready and dead_vm: + first = self.gate_since.setdefault(name, time.monotonic()) + elapsed = time.monotonic() - first + if tainted: + continue # already fenced + if elapsed >= FENCE_GRACE_SECONDS: + log.warning( + f"{name}: NotReady + vm={vm_state} for {elapsed:.0f}s " + f"(>= {FENCE_GRACE_SECONDS}s grace) — fencing" + ) + self.k8s.apply_out_of_service_taint(name) + else: + log.info( + f"{name}: fence-gate pending {elapsed:.0f}/" + f"{FENCE_GRACE_SECONDS}s (NotReady + vm={vm_state})" + ) + continue + + # ── Gates not both true: reset timer, and un-fence on recovery ── + self.gate_since.pop(name, None) + if tainted and ready and vm_state == "connected": + log.info(f"{name}: recovered (Ready + vm connected) — un-fencing") + self.k8s.remove_out_of_service_taint(name) + + def run(self): + log.info( + f"Fence controller started — poll={FENCE_POLL_SECONDS}s, " + f"grace={FENCE_GRACE_SECONDS}s, dry_run={DRY_RUN}" + ) + while True: + try: + self.reconcile() + except Exception as e: + if _is_transient_k8s_error(e): + reason = getattr(e, "reason", type(e).__name__) + status = getattr(e, "status", None) + detail = f" status={status}" if status else "" + log.warning( + f"Transient k8s/transport error in fence loop: " + f"{reason}{detail} — retrying next poll" + ) + else: + log.exception("Unhandled error in fence loop") + 
time.sleep(FENCE_POLL_SECONDS) + + +if __name__ == "__main__": + FenceController().run() diff --git a/test_fence.py b/test_fence.py new file mode 100644 index 0000000..2ec85f3 --- /dev/null +++ b/test_fence.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +"""Unit test for the fence controller's two-gate logic — no cluster/vCenter +needed. Stubs the heavy imports so `import controller`/`fence` works, then +drives FenceController.reconcile() through the scenarios that matter for the +out-of-service taint (the most destructive action in the cluster). + +Run: python3 test_fence.py +""" +import os +import sys +import types + +# ── stub heavy deps so `import controller` succeeds ────────────────────────── +os.environ.setdefault("VCENTER_HOST", "x") +os.environ.setdefault("VCENTER_USER", "x") +os.environ.setdefault("VCENTER_PASSWORD", "x") + + +def _mod(name): + m = types.ModuleType(name) + sys.modules[name] = m + return m + + +class _ApiException(Exception): + def __init__(self, status=None): + self.status = status + + +_u = _mod("urllib3") +_ue = _mod("urllib3.exceptions") +_u.exceptions = _ue +for _n in ("ProtocolError", "ReadTimeoutError", "MaxRetryError", "ConnectionError"): + setattr(_ue, _n, type(_n, (Exception,), {})) + +_k = _mod("kubernetes") +_kc = _mod("kubernetes.client") +_kcr = _mod("kubernetes.client.rest") +_kcfg = _mod("kubernetes.config") +_k.client = _kc +_k.config = _kcfg +_kc.rest = _kcr +_kcr.ApiException = _ApiException +_kc.V1Taint = lambda **kw: types.SimpleNamespace(**kw) +_kc.CoreV1Api = object +_kc.V1Eviction = object +_kc.V1ObjectMeta = object + + +class _ConfigException(Exception): + pass + + +_kcfg.ConfigException = _ConfigException +_kcfg.load_incluster_config = lambda: None +_kcfg.load_kube_config = lambda **kw: None + +_pv = _mod("pyVim") +_pvc = _mod("pyVim.connect") +_pv.connect = _pvc +_pvc.SmartConnect = lambda **kw: None +_pvm = _mod("pyVmomi") +_pvm.vim = types.SimpleNamespace() +_pvm.vmodl = types.SimpleNamespace() + +import fence # 
noqa: E402 + + +# ── fakes ──────────────────────────────────────────────────────────────────── +class FakeK8s: + def __init__(self, nodes): + # nodes: name -> {"ready": bool, "tainted": bool} + self.nodes = nodes + self.applied = [] + self.removed = [] + + def get_gpu_nodes(self): + return [types.SimpleNamespace(metadata=types.SimpleNamespace(name=n)) + for n in self.nodes] + + def is_ready(self, name): + return self.nodes[name]["ready"] + + def has_out_of_service_taint(self, name): + return self.nodes[name]["tainted"] + + def apply_out_of_service_taint(self, name): + self.nodes[name]["tainted"] = True + self.applied.append(name) + + def remove_out_of_service_taint(self, name): + self.nodes[name]["tainted"] = False + self.removed.append(name) + + +class FakeVS: + def __init__(self, states): + self.states = states # name -> connectionState + + def get_vm_connection_state(self, name): + return self.states[name] + + +def make(nodes, states, grace=0): + fc = fence.FenceController.__new__(fence.FenceController) + fc.k8s = FakeK8s(nodes) + fc.vsphere = FakeVS(states) + fc.gate_since = {} + fence.FENCE_GRACE_SECONDS = grace + return fc + + +results = [] + + +def check(desc, cond): + results.append((desc, cond)) + print(("PASS " if cond else "FAIL ") + desc) + + +# 1. healthy node — never fenced +fc = make({"n": {"ready": True, "tainted": False}}, {"n": "connected"}) +fc.reconcile() +check("healthy (Ready+connected) -> no fence", not fc.k8s.applied) + +# 2. NotReady but VM still connected (vCenter lag) — must NOT fence (one gate) +fc = make({"n": {"ready": False, "tainted": False}}, {"n": "connected"}) +fc.reconcile() +check("NotReady + vm connected -> no fence (one gate only)", not fc.k8s.applied) + +# 3. both gates but within grace — no fence yet +fc = make({"n": {"ready": False, "tainted": False}}, {"n": "disconnected"}, grace=9999) +fc.reconcile() +check("NotReady + disconnected, within grace -> no fence yet", not fc.k8s.applied) + +# 4. 
both gates, grace elapsed -> fence +fc = make({"n": {"ready": False, "tainted": False}}, {"n": "disconnected"}, grace=0) +fc.reconcile() +check("NotReady + disconnected, grace met -> FENCE", fc.k8s.applied == ["n"]) + +# 5. already tainted -> no double apply +fc = make({"n": {"ready": False, "tainted": True}}, {"n": "disconnected"}, grace=0) +fc.reconcile() +check("already tainted -> no re-apply", not fc.k8s.applied) + +# 6. recovery: tainted + Ready + connected -> un-fence +fc = make({"n": {"ready": True, "tainted": True}}, {"n": "connected"}) +fc.reconcile() +check("recovered (Ready+connected) while tainted -> un-fence", fc.k8s.removed == ["n"]) + +# 7. tainted, node back Ready but vm still disconnected -> do NOT un-fence yet +fc = make({"n": {"ready": True, "tainted": True}}, {"n": "disconnected"}) +fc.reconcile() +check("tainted, Ready but vm disconnected -> stay fenced", not fc.k8s.removed) + +# 8. 'notfound' VM is not a dead-state -> no fence +fc = make({"n": {"ready": False, "tainted": False}}, {"n": "notfound"}, grace=0) +fc.reconcile() +check("NotReady + vm notfound -> no fence", not fc.k8s.applied) + +print() +failed = [d for d, c in results if not c] +if failed: + print(f"{len(failed)} FAILED") + sys.exit(1) +print(f"all {len(results)} passed")