From bad5aaaf055190db4a115721bc88d33c45f905ec Mon Sep 17 00:00:00 2001 From: bdchatham Date: Thu, 21 May 2026 15:23:39 -0700 Subject: [PATCH] fix(noderesource): clear Status.StatefulSet when SyncStatefulSet deletes an impostor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cross-reconcile churn loop, caught by Cursor review of #345. When SyncStatefulSet detects a UID mismatch between the tracked Status.StatefulSet and the live object, it Deletes the live impostor and returns (nil, nil) so the next reconcile re-Applies fresh. The controller wrapper sees nil and returns without touching Status. The bug: reconcile N continues past the impostor delete into ExecutePlan, where the apply-statefulset task calls SyncStatefulSet again. Status.StatefulSet still holds the stale UID, but the live object is gone — so the call falls through the impostor check (NotFound case) and Applies fresh with a new UID. Reconcile N ends with the new live StatefulSet but Status.StatefulSet still pointing at the original stale UID. Reconcile N+1 sees the mismatch, deletes the fresh StatefulSet as another impostor, the task creates another one, loop. Brief pod outage on every cycle. Fix: when SyncStatefulSet deletes the impostor, also clear node.Status.StatefulSet. Any subsequent SyncStatefulSet — in the same reconcile or the next — routes through the adopt-on-observe path instead of mis-classifying the fresh StatefulSet. Coverage: - TestSyncStatefulSet_ImpostorThenApply extended: asserts Status.StatefulSet is nil after reconcile 1's impostor delete. - TestSyncStatefulSet_ImpostorClearPreventsChurn (new): directly exercises the cross-reconcile scenario — impostor delete in reconcile N, fresh Apply still in reconcile N, then a third Sync call adopts the fresh STS rather than re-deleting it. Co-Authored-By: Claude Opus 4.7 --- internal/noderesource/sync.go | 7 ++++ internal/noderesource/sync_test.go | 61 +++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/internal/noderesource/sync.go b/internal/noderesource/sync.go index f0a75bb..b223903 100644 --- a/internal/noderesource/sync.go +++ b/internal/noderesource/sync.go @@ -93,6 +93,13 @@ func SyncStatefulSet( if err := c.Delete(ctx, existing); err != nil && !apierrors.IsNotFound(err) { return nil, fmt.Errorf("deleting impostor statefulset: %w", err) } + // Forget the stale identity so any subsequent + // SyncStatefulSet call this reconcile (or the next) + // routes through the adopt-on-observe path. Leaving + // the stale UID would cause a freshly Applied STS to + // be misclassified as another impostor on the very + // next reconcile, producing a churn loop. + node.Status.StatefulSet = nil return nil, nil } case apierrors.IsNotFound(err): diff --git a/internal/noderesource/sync_test.go b/internal/noderesource/sync_test.go index d13b643..5eddfe5 100644 --- a/internal/noderesource/sync_test.go +++ b/internal/noderesource/sync_test.go @@ -137,12 +137,16 @@ func TestSyncStatefulSet_ImpostorThenApply(t *testing.T) { WithStatusSubresource(&seiv1alpha1.SeiNode{}). Build() - // Reconcile 1: detect mismatch, Delete, return nil. + // Reconcile 1: detect mismatch, Delete, clear stale tracking, + // return nil. sts, err := SyncStatefulSet(context.Background(), c, s, node, platformtest.Config()) g.Expect(err).NotTo(HaveOccurred()) g.Expect(sts).To(BeNil()) + g.Expect(node.Status.StatefulSet).To(BeNil(), + "impostor delete must clear Status.StatefulSet so a subsequent Apply is not misclassified") - // Reconcile 2: Get sees NotFound, Apply creates fresh. + // Reconcile 2: Status was cleared, falls into adopt-on-observe. + // Get sees NotFound (impostor deleted), Apply creates fresh. sts, err = SyncStatefulSet(context.Background(), c, s, node, platformtest.Config()) g.Expect(err).NotTo(HaveOccurred()) g.Expect(sts).NotTo(BeNil(), "second reconcile must Apply fresh") @@ -150,6 +154,59 @@ func TestSyncStatefulSet_ImpostorThenApply(t *testing.T) { g.Expect(sts.UID).NotTo(Equal(types.UID(impostorTestUID))) } +// TestSyncStatefulSet_ImpostorClearPreventsChurn pins the cross- +// reconcile invariant: when the controller deletes an impostor and +// the plan executor Applies fresh in the same pass, the next reconcile +// must adopt that fresh StatefulSet rather than delete it as another +// impostor. Clearing Status.StatefulSet on impostor delete is what +// routes the next reconcile through adopt-on-observe. +func TestSyncStatefulSet_ImpostorClearPreventsChurn(t *testing.T) { + g := NewWithT(t) + s := newSyncTestScheme(t) + + node := newGenesisNode(impostorName, testNamespace) + node.Status.StatefulSet = &seiv1alpha1.StatefulSetRef{ + Name: impostorName, + UID: originalTestUID, + } + impostor := &appsv1.StatefulSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: impostorName, + Namespace: testNamespace, + UID: impostorTestUID, + }, + } + c := fake.NewClientBuilder(). + WithScheme(s). + WithObjects(node, impostor). + WithStatusSubresource(&seiv1alpha1.SeiNode{}). + Build() + + // Reconcile N (controller wrapper): impostor detected, Deleted, + // Status.StatefulSet cleared. + _, err := SyncStatefulSet(context.Background(), c, s, node, platformtest.Config()) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(node.Status.StatefulSet).To(BeNil()) + + // Still inside reconcile N (plan task): SyncStatefulSet again. + // Status is nil, impostor check skipped, Get NotFound, Apply lands. + taskSTS, err := SyncStatefulSet(context.Background(), c, s, node, platformtest.Config()) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(taskSTS).NotTo(BeNil(), "task call after impostor clear must Apply a fresh StatefulSet") + newUID := taskSTS.UID + + // Reconcile N+1 (controller wrapper): would-be-bug scenario. + // Status.StatefulSet still nil (controller wrapper does not stamp + // when SyncStatefulSet returned nil in reconcile N). With the + // clear, this call adopts the freshly Applied STS rather than + // deleting it as another impostor. + adopted, err := SyncStatefulSet(context.Background(), c, s, node, platformtest.Config()) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(adopted).NotTo(BeNil()) + g.Expect(adopted.UID).To(Equal(newUID), + "reconcile N+1 must adopt the task-Applied StatefulSet, not delete it as another impostor") +} + // TestSyncStatefulSet_MatchingUID is the steady-state path. A live // StatefulSet exists whose UID matches Status.StatefulSet. Sync must // not delete; it Applies (idempotent) and returns the live object.