Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 28 additions & 3 deletions api/v1alpha1/seinode_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package v1alpha1
import (
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
)

// SeiNodeSpec defines the desired state of a standalone Sei node.
Expand Down Expand Up @@ -68,9 +69,12 @@ type SeiNodeSpec struct {
Validator *ValidatorSpec `json:"validator,omitempty"`

// Paused freezes reconciliation. While true, the controller does not
// advance the lifecycle, start plans, or mutate derived resources.
// In-flight tasks on the cluster run to completion but their results
// are not polled until the field is cleared.
// advance the lifecycle, start plans, or mutate derived resources
// except the owned StatefulSet — which scales to Replicas=0 so pods
// terminate. In-flight tasks on the cluster run to completion but
// their results are not polled until the field is cleared.
// Has no effect on nodes in PhaseFailed; delete and recreate to
// recover from a failed node.
// +optional
Paused bool `json:"paused,omitempty"`
}
Expand Down Expand Up @@ -343,12 +347,33 @@ type SeiNodeStatus struct {
// config so the node advertises a reachable address for gossip discovery.
// +optional
ExternalAddress string `json:"externalAddress,omitempty"`

// StatefulSet references the StatefulSet the controller created for
// this SeiNode. UID is the identity check: an STS with the expected
// name but a different UID is not the one this controller created
// (e.g., manual recreation out-of-band) and triggers replacement.
// +optional
StatefulSet *StatefulSetRef `json:"statefulSet,omitempty"`
}

// StatefulSetRef identifies a StatefulSet owned and managed by a
// SeiNode. Stored on Status so the controller can fetch and mutate the
// owned object directly rather than blindly server-side-applying.
//
// Both fields are serialized without omitempty, and the CRD schema in
// config/crd/sei.io_seinodes.yaml lists name and uid as required, so a
// reference is either absent from Status or fully populated.
//
// The field comments below are mirrored as the property descriptions
// in that CRD schema; keep the YAML in sync when editing them.
type StatefulSetRef struct {
// Name of the StatefulSet (always equals the SeiNode name).
Name string `json:"name"`

// UID of the StatefulSet. Used to detect out-of-band recreation:
// if a new StatefulSet appears with the same name but a different
// UID, the controller knows it is not the one it created.
UID types.UID `json:"uid"`
}

// +kubebuilder:object:root=true
// +kubebuilder:subresource:status
// +kubebuilder:resource:shortName=snode
// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="StatefulSet",type=string,JSONPath=`.status.statefulSet.name`,priority=1
// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`

// SeiNode is the Schema for the seinodes API.
Expand Down
20 changes: 20 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 6 additions & 3 deletions config/crd/sei.io_seinodedeployments.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -349,9 +349,12 @@ spec:
paused:
description: |-
Paused freezes reconciliation. While true, the controller does not
advance the lifecycle, start plans, or mutate derived resources.
In-flight tasks on the cluster run to completion but their results
are not polled until the field is cleared.
advance the lifecycle, start plans, or mutate derived resources
except the owned StatefulSet — which scales to Replicas=0 so pods
terminate. In-flight tasks on the cluster run to completion but
their results are not polled until the field is cleared.
Has no effect on nodes in PhaseFailed; delete and recreate to
recover from a failed node.
type: boolean
peers:
description: |-
Expand Down
34 changes: 31 additions & 3 deletions config/crd/sei.io_seinodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ spec:
- jsonPath: .status.phase
name: Phase
type: string
- jsonPath: .status.statefulSet.name
name: StatefulSet
priority: 1
type: string
- jsonPath: .metadata.creationTimestamp
name: Age
type: date
Expand Down Expand Up @@ -218,9 +222,12 @@ spec:
paused:
description: |-
Paused freezes reconciliation. While true, the controller does not
advance the lifecycle, start plans, or mutate derived resources.
In-flight tasks on the cluster run to completion but their results
are not polled until the field is cleared.
advance the lifecycle, start plans, or mutate derived resources
except the owned StatefulSet — which scales to Replicas=0 so pods
terminate. In-flight tasks on the cluster run to completion but
their results are not polled until the field is cleared.
Has no effect on nodes in PhaseFailed; delete and recreate to
recover from a failed node.
type: boolean
peers:
description: |-
Expand Down Expand Up @@ -950,6 +957,27 @@ spec:
items:
type: string
type: array
statefulSet:
description: |-
StatefulSet references the StatefulSet the controller created for
this SeiNode. UID is the identity check: an STS with the expected
name but a different UID is not the one this controller created
(e.g., manual recreation out-of-band) and triggers replacement.
properties:
name:
description: Name of the StatefulSet (always equals the SeiNode
name).
type: string
uid:
description: |-
UID of the StatefulSet. Used to detect out-of-band recreation:
if a new StatefulSet appears with the same name but a different
UID, the controller knows it is not the one it created.
type: string
required:
- name
- uid
type: object
type: object
type: object
served: true
Expand Down
4 changes: 4 additions & 0 deletions internal/controller/node/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ func (r *SeiNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
return ctrl.Result{}, nil
}

if err := r.reconcileStatefulSet(ctx, node); err != nil {
return ctrl.Result{}, fmt.Errorf("reconciling statefulset: %w", err)
}

if node.Spec.Paused {
if err := flushStatus(); err != nil {
return ctrl.Result{}, fmt.Errorf("flushing paused status: %w", err)
Expand Down
40 changes: 40 additions & 0 deletions internal/controller/node/statefulset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package node

import (
"context"
"fmt"

seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1"
"github.com/sei-protocol/sei-k8s-controller/internal/noderesource"
)

// reconcileStatefulSet syncs the owned StatefulSet via the typed
// Get+Create/Update helper and records the resulting object on
// Status.StatefulSet so subsequent reconciles fetch the tracked
// identity directly.
//
// A nil StatefulSet with no error means the impostor branch fired:
// SyncStatefulSet detected a UID mismatch, issued Delete, and deferred
// the Apply to the next reconcile. Leave Status.StatefulSet untouched —
// the stale UID still points at a now-deleted object, and the next
// reconcile (triggered by the StatefulSet delete watch event) observes
// NotFound and Applies a fresh STS whose new UID gets stamped onto
// Status.StatefulSet.
//
// This function only mutates node.Status in memory; it does not itself
// write the status to the API server.
//
// Returns a non-nil error (wrapped with "syncing statefulset: ") only
// when SyncStatefulSet fails; the impostor branch returns nil.
//
// NOTE(review): because the impostor branch leaves the stale UID in
// place, any other SyncStatefulSet caller that runs later in the same
// reconcile (a reviewer named the apply-statefulset task) could create
// a fresh StatefulSet that the next reconcile compares against the
// stale UID and deletes again. Confirm against SyncStatefulSet and the
// plan executor whether this path is reachable.
func (r *SeiNodeReconciler) reconcileStatefulSet(ctx context.Context, node *seiv1alpha1.SeiNode) error {
sts, err := noderesource.SyncStatefulSet(ctx, r.Client, r.Scheme, node, r.Platform)
if err != nil {
return fmt.Errorf("syncing statefulset: %w", err)
}
// No object came back, so there is no name/UID to record on this pass.
if sts == nil {
return nil
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Impostor delete races with task creating orphaned StatefulSet

Medium Severity

When reconcileStatefulSet detects an impostor (UID mismatch), it deletes it and returns nil without updating Status.StatefulSet. The controller then continues into the plan executor. If the active task is apply-statefulset, it calls SyncStatefulSet again — this time the Get returns NotFound (impostor just deleted), so it falls through and creates a fresh StatefulSet. However, Status.StatefulSet still holds the original stale UID (only the controller's reconcileStatefulSet stamps it, and it skipped that on nil). On the next reconcile, the controller sees the task-created StatefulSet as another impostor and deletes it, causing unnecessary churn and a brief pod outage.

Additional Locations (2)
Fix in Cursor Fix in Web

Reviewed by Cursor Bugbot for commit ae5aecc. Configure here.

}
// Replace the tracked reference only when it is missing or its name or
// UID differs from the synced object; an unchanged reference is kept.
if node.Status.StatefulSet == nil ||
node.Status.StatefulSet.UID != sts.UID ||
node.Status.StatefulSet.Name != sts.Name {
node.Status.StatefulSet = &seiv1alpha1.StatefulSetRef{
Name: sts.Name,
UID: sts.UID,
}
}
return nil
}
Loading
Loading