diff --git a/k8s/bases/infrastructure/controllers/openbao/helm-release.yaml b/k8s/bases/infrastructure/controllers/openbao/helm-release.yaml index b9658d150..76934e72c 100644 --- a/k8s/bases/infrastructure/controllers/openbao/helm-release.yaml +++ b/k8s/bases/infrastructure/controllers/openbao/helm-release.yaml @@ -27,6 +27,26 @@ spec: server: enabled: true replicas: ${openbao_replicas:=1} + # The chart's default readinessProbe is `exec: bao status`, which exits + # with code 2 on a sealed server, so the Pod stays NotReady until something + # unseals it. On a fresh cluster that "something" is the vault-config Job + # in the downstream `infrastructure` Flux layer, which cannot run until + # this HelmRelease (in `infrastructure-controllers`) is Ready. The install + # therefore deadlocks: Helm waits up to `timeout`, then + # `install.remediation.retries: -1` uninstalls and reinstalls, and the + # cycle repeats. Cold-cluster bootstrap historically took 20-40 min waiting + # for that race to resolve (see the vault-config Job comment and the + # PR #1636 system-test failure). + # + # The HTTP health probe with `sealedcode=204` and `uninitcode=204` makes a + # sealed-but-running server Ready immediately, so Ready no longer implies + # unsealed. The chart renders the httpGet branch as soon as `path` is set + # (server-statefulset.yaml: `{{- if .Values.server.readinessProbe.path }}`) + # and `openbao.scheme` returns "http" when `global.tlsDisable: true` (chart + # default; matches the `tls_disable = 1` listener below). The HashiCorp + # Vault chart uses the same pattern for the same reason. + readinessProbe: + path: "/v1/sys/health?standbyok=true&sealedcode=204&uninitcode=204" # Mount the unseal key Secret (created by the vault-config Job on first # init, restored by Velero on cluster rebuild) so the postStart hook # can auto-unseal the server after every pod restart.