From 48394baa78c75d147154ce669712009a45b8baa2 Mon Sep 17 00:00:00 2001 From: Rahul Sharma Date: Tue, 10 Feb 2026 21:13:31 -0800 Subject: [PATCH] Adds support for migrating from clusterpolicy to nvidiadriver and vice-versa This change supports: 1. Migrating cluster from clusterpolicy to nvidiadriver CR 2. Migrating cluster from nvidiadriver CR to clusterpolicy Signed-off-by: Rahul Sharma --- controllers/clusterpolicy_controller.go | 41 +++++ controllers/object_controls.go | 192 +++++++++++++++++++++++- controllers/state_manager.go | 29 +++- controllers/upgrade_controller.go | 21 ++- 4 files changed, 268 insertions(+), 15 deletions(-) diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index d16d2d445..ea66c17cf 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -42,6 +42,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/source" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" "github.com/NVIDIA/gpu-operator/internal/conditions" ) @@ -170,6 +171,19 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques } } + // Check for orphaned driver pods and label nodes AFTER step() completes + // This ensures labeling happens after any daemonset cleanup that might have orphaned pods + // + // Label nodes containing orphaned driver pods based on transition state: + // - When useNvidiaDriverCRD=false: Always label (NVIDIADriver → ClusterPolicy transition) + // - When useNvidiaDriverCRD=true: Only label if NVIDIADriver CRs exist (ClusterPolicy → NVIDIADriver transition) + if !clusterPolicyCtrl.singleton.Spec.Driver.UseNvidiaDriverCRDType() || clusterPolicyCtrl.hasNVIDIADriverCRs(ctx) { + if err := clusterPolicyCtrl.labelNodesWithOrphanedDriverPods(ctx); err != nil { + // Log error but don't fail reconciliation + r.Log.Error(err, "failed to label nodes with orphaned driver pods") + } + } + // if any state is not ready, requeue for reconcile after 5 seconds if overallStatus != gpuv1.Ready { clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady) @@ -371,6 +385,33 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl return err } + // Watch for changes to NVIDIADriver CRs to trigger labeling of orphaned pods + // This is needed when NVIDIADriver CRs are created/updated/deleted during transitions + err = c.Watch( + source.Kind(mgr.GetCache(), + &nvidiav1alpha1.NVIDIADriver{}, + handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, obj *nvidiav1alpha1.NVIDIADriver) []reconcile.Request { + // Trigger reconciliation of all ClusterPolicy instances + list := &gpuv1.ClusterPolicyList{} + if err := mgr.GetClient().List(ctx, list); err != nil { + return []reconcile.Request{} + } + requests := make([]reconcile.Request, len(list.Items)) + for i, item := range list.Items { + requests[i] = reconcile.Request{ + NamespacedName: types.NamespacedName{ + Name: item.GetName(), + }, + } + } + return requests + }), + ), + ) + if err != nil { + return err + } + // TODO(user): Modify this to be the types you create that are owned by the primary resource // Watch for changes to secondary resource Daemonsets and requeue the owner ClusterPolicy err = c.Watch( diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 63e9f03c9..f3561e2a7 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -50,6 +50,7 @@ import ( "sigs.k8s.io/yaml" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" "github.com/NVIDIA/gpu-operator/internal/consts" "github.com/NVIDIA/gpu-operator/internal/utils" ) @@ -4068,7 +4069,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) { return true, nil } -func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error { +func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context, orphanPods bool) error { // Get all DaemonSets owned by ClusterPolicy // // (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector @@ -4084,8 +4085,17 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) ds := ds // filter out DaemonSets which are not the NVIDIA driver/vgpu-manager if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) { - n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name) - err = n.client.Delete(ctx, &ds) + if orphanPods { + n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy with cascade=orphan", "Name", ds.Name) + // Delete with cascade=orphan to keep the driver pods running + propagationPolicy := metav1.DeletePropagationOrphan + err = n.client.Delete(ctx, &ds, &client.DeleteOptions{ + PropagationPolicy: &propagationPolicy, + }) + } else { + n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name) + err = n.client.Delete(ctx, &ds) + } if err != nil { return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err) } @@ -4095,6 +4105,182 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) return nil } +// cleanupNVIDIADriverOwnedDaemonSets deletes all driver daemonsets owned by any NVIDIADriver CR +// This is used when ClusterPolicy has useNvidiaDriverCRD=false to clean up stale daemonsets +func (n ClusterPolicyController) cleanupNVIDIADriverOwnedDaemonSets(ctx context.Context, orphanPods bool) error { + // List all DaemonSets in the namespace using component label + // This includes both NVIDIADriver-owned and orphaned driver daemonsets + list := &appsv1.DaemonSetList{} + err := n.client.List(ctx, list, + client.InNamespace(n.operatorNamespace), + client.MatchingLabels{"app.kubernetes.io/component": "nvidia-driver"}, + ) + if err != nil { + return fmt.Errorf("failed to list driver daemonsets: %w", err) + } + + for _, ds := range list.Items { + ds := ds + + // Skip ClusterPolicy-owned daemonsets - only process NVIDIADriver or orphaned ones + isOwnedByClusterPolicy := false + for _, ownerRef := range ds.OwnerReferences { + if ownerRef.Kind == "ClusterPolicy" { + isOwnedByClusterPolicy = true + break + } + } + if isOwnedByClusterPolicy { + continue + } + + // At this point, the daemonset is either: + // 1. Owned by NVIDIADriver (has NVIDIADriver owner reference) + // 2. Orphaned (no owner references) + // We clean up both cases when useNvidiaDriverCRD=false + n.logger.Info("Found NVIDIADriver-managed or orphaned driver daemonset", "Name", ds.Name, "OwnerReferences", len(ds.OwnerReferences)) + + if orphanPods { + if len(ds.OwnerReferences) > 0 { + n.logger.Info("Removing owner references and deleting NVIDIA driver daemonset with cascade=orphan", "Name", ds.Name) + // Remove owner references to prevent garbage collection from deleting with default cascade + // when the NVIDIADriver CR is deleted later + ds.OwnerReferences = nil + if err := n.client.Update(ctx, &ds); err != nil { + // Continue anyway and try to delete + n.logger.Error(err, "failed to remove owner references from daemonset", "Name", ds.Name) + } + } else { + n.logger.Info("Deleting orphaned NVIDIA driver daemonset with cascade=orphan", "Name", ds.Name) + } + + // Delete with cascade=orphan to keep the driver pods running + propagationPolicy := metav1.DeletePropagationOrphan + err = n.client.Delete(ctx, &ds, &client.DeleteOptions{ + PropagationPolicy: &propagationPolicy, + }) + } else { + n.logger.Info("Deleting NVIDIADriver-managed or orphaned daemonset", "Name", ds.Name) + err = n.client.Delete(ctx, &ds) + } + if err != nil && !apierrors.IsNotFound(err) { + return fmt.Errorf("error deleting NVIDIADriver-managed daemonset %s: %w", ds.Name, err) + } + } + + return nil +} + +// labelNodesWithOrphanedDriverPods detects orphaned driver pods (pods without owner references) +// and labels their nodes with nvidia.com/gpu-driver-upgrade-state=upgrade-required to trigger upgrade +func (n ClusterPolicyController) labelNodesWithOrphanedDriverPods(ctx context.Context) error { + useNvidiaDriverCRD := n.singleton.Spec.Driver.UseNvidiaDriverCRDType() + return labelNodesWithOrphanedDriverPods(ctx, n.client, n.operatorNamespace, useNvidiaDriverCRD) +} + +// labelNodesWithOrphanedDriverPods is a standalone function that can be used by both +// ClusterPolicy and NVIDIADriver controllers to label nodes with orphaned driver pods +// When useNvidiaDriverCRD=true: Only labels nodes if matching NVIDIADriver CR exists (ClusterPolicy→NVIDIADriver transition) +// When useNvidiaDriverCRD=false: Labels all nodes with orphaned pods (NVIDIADriver→ClusterPolicy transition) +func labelNodesWithOrphanedDriverPods(ctx context.Context, c client.Client, namespace string, useNvidiaDriverCRD bool) error { + // List all NVIDIADriver CRs to check nodeSelectors (only needed when useNvidiaDriverCRD=true) + var nvidiaDriverList *nvidiav1alpha1.NVIDIADriverList + if useNvidiaDriverCRD { + nvidiaDriverList = &nvidiav1alpha1.NVIDIADriverList{} + if err := c.List(ctx, nvidiaDriverList); err != nil { + return fmt.Errorf("failed to list NVIDIADriver CRs: %w", err) + } + } + + // List all driver pods in the namespace + // Use app.kubernetes.io/component label which works for both ClusterPolicy and NVIDIADriver managed pods + podList := &corev1.PodList{} + listOpts := []client.ListOption{ + client.InNamespace(namespace), + client.MatchingLabels{"app.kubernetes.io/component": "nvidia-driver"}, + } + if err := c.List(ctx, podList, listOpts...); err != nil { + return fmt.Errorf("failed to list driver pods: %w", err) + } + + // Find orphaned pods (pods without owner references) + for i := range podList.Items { + pod := &podList.Items[i] + + // Skip if pod has owner references (not orphaned) + if len(pod.OwnerReferences) > 0 { + continue + } + + // Skip if pod is not running + if pod.Status.Phase != corev1.PodRunning { + continue + } + + nodeName := pod.Spec.NodeName + if nodeName == "" { + continue + } + + // Get the node + node := &corev1.Node{} + if err := c.Get(ctx, types.NamespacedName{Name: nodeName}, node); err != nil { + continue + } + + // When useNvidiaDriverCRD=true: Check if at least one NVIDIADriver CR's nodeSelector matches this node + // When useNvidiaDriverCRD=false: Label all orphaned pods since ClusterPolicy will take over + if useNvidiaDriverCRD { + hasMatchingDriver := false + for _, nvDriver := range nvidiaDriverList.Items { + if nodeMatchesSelector(node, nvDriver.Spec.NodeSelector) { + hasMatchingDriver = true + break + } + } + + // Only label if there's a matching NVIDIADriver that will create replacement pods + if !hasMatchingDriver { + continue + } + } + + // Check if the node already has the upgrade label + upgradeLabel := "nvidia.com/gpu-driver-upgrade-state" + if node.Labels != nil && node.Labels[upgradeLabel] == "upgrade-required" { + continue + } + + // Label the node + if node.Labels == nil { + node.Labels = make(map[string]string) + } + node.Labels[upgradeLabel] = "upgrade-required" + if err := c.Update(ctx, node); err != nil { + continue + } + } + + return nil +} + +// nodeMatchesSelector checks if a node's labels match the given selector +func nodeMatchesSelector(node *corev1.Node, selector map[string]string) bool { + if len(selector) == 0 { + // Empty selector matches all nodes + return true + } + + for key, value := range selector { + // Check if the key exists in node labels + nodeValue, exists := node.Labels[key] + if !exists || nodeValue != value { + return false + } + } + return true +} + // cleanupStalePrecompiledDaemonsets deletes stale driver daemonsets which can happen // 1. If all nodes upgraded to the latest kernel // 2. no GPU nodes are present diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 4567a7c8c..de3d95458 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -35,6 +35,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/config" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" ) const ( @@ -951,13 +952,28 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) { n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy") n.idx++ // Cleanup all driver daemonsets owned by ClusterPolicy. - err := n.cleanupAllDriverDaemonSets(n.ctx) + // Use orphanPods=true to keep driver pods running during the transition. + err := n.cleanupAllDriverDaemonSets(n.ctx, true) if err != nil { return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err) } return gpuv1.Disabled, nil } + // If NVIDIADriver CRD is NOT enabled, clean up any stale driver daemonsets owned by NVIDIADriver CRs + // This handles the case where NVIDIADriver was previously used but now ClusterPolicy manages drivers + if (n.stateNames[n.idx] == "state-driver" || n.stateNames[n.idx] == "state-vgpu-manager") && + !n.singleton.Spec.Driver.UseNvidiaDriverCRDType() { + n.logger.Info("NVIDIADriver CRD is disabled, cleaning up any stale NVIDIA driver daemonsets owned by NVIDIADriver CRs") + // Use orphanPods=true to keep driver pods running during the transition + err := n.cleanupNVIDIADriverOwnedDaemonSets(n.ctx, true) + if err != nil { + // Log error but don't fail - continue with ClusterPolicy-managed driver deployment + n.logger.Error(err, "failed to cleanup stale NVIDIA driver daemonsets owned by NVIDIADriver CRs") + } + // Note: Labeling nodes with orphaned driver pods is handled at the top-level reconciliation loop + } + for _, fs := range n.controls[n.idx] { stat, err := fs(*n) if err != nil { @@ -985,6 +1001,17 @@ func (n ClusterPolicyController) last() bool { return n.idx == len(n.controls) } +// hasNVIDIADriverCRs checks if there are any NVIDIADriver CRs in the cluster +func (n ClusterPolicyController) hasNVIDIADriverCRs(ctx context.Context) bool { + list := &nvidiav1alpha1.NVIDIADriverList{} + err := n.client.List(ctx, list) + if err != nil { + n.logger.Error(err, "failed to list NVIDIADriver CRs") + return false + } + return len(list.Items) > 0 +} + func (n ClusterPolicyController) isStateEnabled(stateName string) bool { clusterPolicySpec := &n.singleton.Spec diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go index 7481239d1..4275cf13f 100644 --- a/controllers/upgrade_controller.go +++ b/controllers/upgrade_controller.go @@ -127,17 +127,16 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct var driverLabel map[string]string - // initialize with common app=nvidia-driver-daemonset label - driverLabelKey := DriverLabelKey - driverLabelValue := DriverLabelValue - - if clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() { - // app component label is added for all new driver daemonsets deployed by NVIDIADriver controller - driverLabelKey = AppComponentLabelKey - driverLabelValue = AppComponentLabelValue - } else if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled { - // For OCP, when DTK is enabled app=nvidia-driver-daemonset label is not constant and changes - // based on rhcos version. Hence use DTK label instead + // Use common app.kubernetes.io/component=nvidia-driver label that works for both + // ClusterPolicy and NVIDIADriver managed daemonsets + driverLabelKey := AppComponentLabelKey + driverLabelValue := AppComponentLabelValue + + // Special case for OCP with DTK: the component label may not be stable across RHCOS versions + if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled && + !clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() { + // For OCP, when DTK is enabled and using ClusterPolicy mode, app=nvidia-driver-daemonset + // label is not constant and changes based on rhcos version. Hence use DTK label instead driverLabelKey = ocpDriverToolkitIdentificationLabel driverLabelValue = ocpDriverToolkitIdentificationValue }