From 48394baa78c75d147154ce669712009a45b8baa2 Mon Sep 17 00:00:00 2001
From: Rahul Sharma <rahulsharm@nvidia.com>
Date: Tue, 10 Feb 2026 21:13:31 -0800
Subject: [PATCH] Adds support for migrating from clusterpolicy to nvidiadriver
 and vice-versa

This change supports:
1. Migrating cluster from clusterpolicy to nvidiadriver CR
2. Migrating cluster from nvidiadriver CR to clusterpolicy

Signed-off-by: Rahul Sharma <rahulsharm@nvidia.com>
---
 controllers/clusterpolicy_controller.go |  41 +++++
 controllers/object_controls.go          | 192 +++++++++++++++++++++++-
 controllers/state_manager.go            |  29 +++-
 controllers/upgrade_controller.go       |  21 ++-
 4 files changed, 268 insertions(+), 15 deletions(-)

diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go
index d16d2d445..ea66c17cf 100644
--- a/controllers/clusterpolicy_controller.go
+++ b/controllers/clusterpolicy_controller.go
@@ -42,6 +42,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/source"
 
 	gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
+	nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
 	"github.com/NVIDIA/gpu-operator/internal/conditions"
 )
 
@@ -170,6 +171,19 @@ func (r *ClusterPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reques
 		}
 	}
 
+	// Check for orphaned driver pods and label nodes AFTER step() completes
+	// This ensures labeling happens after any daemonset cleanup that might have orphaned pods
+	//
+	// Label nodes containing orphaned driver pods based on transition state:
+	// - When useNvidiaDriverCRD=false: Always label (NVIDIADriver → ClusterPolicy transition)
+	// - When useNvidiaDriverCRD=true: Only label if NVIDIADriver CRs exist (ClusterPolicy → NVIDIADriver transition)
+	if !clusterPolicyCtrl.singleton.Spec.Driver.UseNvidiaDriverCRDType() || clusterPolicyCtrl.hasNVIDIADriverCRs(ctx) {
+		if err := clusterPolicyCtrl.labelNodesWithOrphanedDriverPods(ctx); err != nil {
+			// Log error but don't fail reconciliation
+			r.Log.Error(err, "failed to label nodes with orphaned driver pods")
+		}
+	}
+
 	// if any state is not ready, requeue for reconcile after 5 seconds
 	if overallStatus != gpuv1.Ready {
 		clusterPolicyCtrl.operatorMetrics.reconciliationStatus.Set(reconciliationStatusNotReady)
@@ -371,6 +385,33 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl
 		return err
 	}
 
+	// Watch for changes to NVIDIADriver CRs to trigger labeling of orphaned pods
+	// This is needed when NVIDIADriver CRs are created/updated/deleted during transitions
+	err = c.Watch(
+		source.Kind(mgr.GetCache(),
+			&nvidiav1alpha1.NVIDIADriver{},
+			handler.TypedEnqueueRequestsFromMapFunc(func(ctx context.Context, obj *nvidiav1alpha1.NVIDIADriver) []reconcile.Request {
+				// Trigger reconciliation of all ClusterPolicy instances
+				list := &gpuv1.ClusterPolicyList{}
+				if err := mgr.GetClient().List(ctx, list); err != nil {
+					return []reconcile.Request{}
+				}
+				requests := make([]reconcile.Request, len(list.Items))
+				for i, item := range list.Items {
+					requests[i] = reconcile.Request{
+						NamespacedName: types.NamespacedName{
+							Name: item.GetName(),
+						},
+					}
+				}
+				return requests
+			}),
+		),
+	)
+	if err != nil {
+		return err
+	}
+
 	// TODO(user): Modify this to be the types you create that are owned by the primary resource
 	// Watch for changes to secondary resource Daemonsets and requeue the owner ClusterPolicy
 	err = c.Watch(
diff --git a/controllers/object_controls.go b/controllers/object_controls.go
index 63e9f03c9..f3561e2a7 100644
--- a/controllers/object_controls.go
+++ b/controllers/object_controls.go
@@ -50,6 +50,7 @@ import (
 	"sigs.k8s.io/yaml"
 
 	gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
+	nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
 	"github.com/NVIDIA/gpu-operator/internal/consts"
 	"github.com/NVIDIA/gpu-operator/internal/utils"
 )
@@ -4068,7 +4069,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) {
 	return true, nil
 }
 
-func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context) error {
+func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context, orphanPods bool) error {
 	// Get all DaemonSets owned by ClusterPolicy
 	//
 	// (cdesiniotis) There is a limitation with the controller-runtime client where only a single field selector
@@ -4084,8 +4085,17 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context)
 		ds := ds
 		// filter out DaemonSets which are not the NVIDIA driver/vgpu-manager
 		if strings.HasPrefix(ds.Name, commonDriverDaemonsetName) || strings.HasPrefix(ds.Name, commonVGPUManagerDaemonsetName) {
-			n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name)
-			err = n.client.Delete(ctx, &ds)
+			if orphanPods {
+				n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy with cascade=orphan", "Name", ds.Name)
+				// Delete with cascade=orphan to keep the driver pods running
+				propagationPolicy := metav1.DeletePropagationOrphan
+				err = n.client.Delete(ctx, &ds, &client.DeleteOptions{
+					PropagationPolicy: &propagationPolicy,
+				})
+			} else {
+				n.logger.Info("Deleting NVIDIA driver daemonset owned by ClusterPolicy", "Name", ds.Name)
+				err = n.client.Delete(ctx, &ds)
+			}
 			if err != nil {
 				return fmt.Errorf("error deleting NVIDIA driver daemonset: %w", err)
 			}
@@ -4095,6 +4105,182 @@ func (n ClusterPolicyController) cleanupAllDriverDaemonSets(ctx context.Context)
 	return nil
 }
 
+// cleanupNVIDIADriverOwnedDaemonSets deletes all driver daemonsets owned by any NVIDIADriver CR
+// This is used when ClusterPolicy has useNvidiaDriverCRD=false to clean up stale daemonsets
+func (n ClusterPolicyController) cleanupNVIDIADriverOwnedDaemonSets(ctx context.Context, orphanPods bool) error {
+	// List all DaemonSets in the namespace using component label
+	// This includes both NVIDIADriver-owned and orphaned driver daemonsets
+	list := &appsv1.DaemonSetList{}
+	err := n.client.List(ctx, list,
+		client.InNamespace(n.operatorNamespace),
+		client.MatchingLabels{"app.kubernetes.io/component": "nvidia-driver"},
+	)
+	if err != nil {
+		return fmt.Errorf("failed to list driver daemonsets: %w", err)
+	}
+
+	for _, ds := range list.Items {
+		ds := ds
+
+		// Skip ClusterPolicy-owned daemonsets - only process NVIDIADriver or orphaned ones
+		isOwnedByClusterPolicy := false
+		for _, ownerRef := range ds.OwnerReferences {
+			if ownerRef.Kind == "ClusterPolicy" {
+				isOwnedByClusterPolicy = true
+				break
+			}
+		}
+		if isOwnedByClusterPolicy {
+			continue
+		}
+
+		// At this point, the daemonset is either:
+		// 1. Owned by NVIDIADriver (has NVIDIADriver owner reference)
+		// 2. Orphaned (no owner references)
+		// We clean up both cases when useNvidiaDriverCRD=false
+		n.logger.Info("Found NVIDIADriver-managed or orphaned driver daemonset", "Name", ds.Name, "OwnerReferences", len(ds.OwnerReferences))
+
+		if orphanPods {
+			if len(ds.OwnerReferences) > 0 {
+				n.logger.Info("Removing owner references and deleting NVIDIA driver daemonset with cascade=orphan", "Name", ds.Name)
+				// Remove owner references to prevent garbage collection from deleting with default cascade
+				// when the NVIDIADriver CR is deleted later
+				ds.OwnerReferences = nil
+				if err := n.client.Update(ctx, &ds); err != nil {
+					// Continue anyway and try to delete
+					n.logger.Error(err, "failed to remove owner references from daemonset", "Name", ds.Name)
+				}
+			} else {
+				n.logger.Info("Deleting orphaned NVIDIA driver daemonset with cascade=orphan", "Name", ds.Name)
+			}
+
+			// Delete with cascade=orphan to keep the driver pods running
+			propagationPolicy := metav1.DeletePropagationOrphan
+			err = n.client.Delete(ctx, &ds, &client.DeleteOptions{
+				PropagationPolicy: &propagationPolicy,
+			})
+		} else {
+			n.logger.Info("Deleting NVIDIADriver-managed or orphaned daemonset", "Name", ds.Name)
+			err = n.client.Delete(ctx, &ds)
+		}
+		if err != nil && !apierrors.IsNotFound(err) {
+			return fmt.Errorf("error deleting NVIDIADriver-managed daemonset %s: %w", ds.Name, err)
+		}
+	}
+
+	return nil
+}
+
+// labelNodesWithOrphanedDriverPods detects orphaned driver pods (pods without owner references)
+// and labels their nodes with nvidia.com/gpu-driver-upgrade-state=upgrade-required to trigger upgrade
+func (n ClusterPolicyController) labelNodesWithOrphanedDriverPods(ctx context.Context) error {
+	useNvidiaDriverCRD := n.singleton.Spec.Driver.UseNvidiaDriverCRDType()
+	return labelNodesWithOrphanedDriverPods(ctx, n.client, n.operatorNamespace, useNvidiaDriverCRD)
+}
+
+// labelNodesWithOrphanedDriverPods is a standalone function that can be used by both
+// ClusterPolicy and NVIDIADriver controllers to label nodes with orphaned driver pods
+// When useNvidiaDriverCRD=true: Only labels nodes if matching NVIDIADriver CR exists (ClusterPolicy→NVIDIADriver transition)
+// When useNvidiaDriverCRD=false: Labels all nodes with orphaned pods (NVIDIADriver→ClusterPolicy transition)
+func labelNodesWithOrphanedDriverPods(ctx context.Context, c client.Client, namespace string, useNvidiaDriverCRD bool) error {
+	// List all NVIDIADriver CRs to check nodeSelectors (only needed when useNvidiaDriverCRD=true)
+	var nvidiaDriverList *nvidiav1alpha1.NVIDIADriverList
+	if useNvidiaDriverCRD {
+		nvidiaDriverList = &nvidiav1alpha1.NVIDIADriverList{}
+		if err := c.List(ctx, nvidiaDriverList); err != nil {
+			return fmt.Errorf("failed to list NVIDIADriver CRs: %w", err)
+		}
+	}
+
+	// List all driver pods in the namespace
+	// Use app.kubernetes.io/component label which works for both ClusterPolicy and NVIDIADriver managed pods
+	podList := &corev1.PodList{}
+	listOpts := []client.ListOption{
+		client.InNamespace(namespace),
+		client.MatchingLabels{"app.kubernetes.io/component": "nvidia-driver"},
+	}
+	if err := c.List(ctx, podList, listOpts...); err != nil {
+		return fmt.Errorf("failed to list driver pods: %w", err)
+	}
+
+	// Find orphaned pods (pods without owner references)
+	for i := range podList.Items {
+		pod := &podList.Items[i]
+
+		// Skip if pod has owner references (not orphaned)
+		if len(pod.OwnerReferences) > 0 {
+			continue
+		}
+
+		// Skip if pod is not running
+		if pod.Status.Phase != corev1.PodRunning {
+			continue
+		}
+
+		nodeName := pod.Spec.NodeName
+		if nodeName == "" {
+			continue
+		}
+
+		// Get the node
+		node := &corev1.Node{}
+		if err := c.Get(ctx, types.NamespacedName{Name: nodeName}, node); err != nil {
+			continue
+		}
+
+		// When useNvidiaDriverCRD=true: Check if at least one NVIDIADriver CR's nodeSelector matches this node
+		// When useNvidiaDriverCRD=false: Label all orphaned pods since ClusterPolicy will take over
+		if useNvidiaDriverCRD {
+			hasMatchingDriver := false
+			for _, nvDriver := range nvidiaDriverList.Items {
+				if nodeMatchesSelector(node, nvDriver.Spec.NodeSelector) {
+					hasMatchingDriver = true
+					break
+				}
+			}
+
+			// Only label if there's a matching NVIDIADriver that will create replacement pods
+			if !hasMatchingDriver {
+				continue
+			}
+		}
+
+		// Check if the node already has the upgrade label
+		upgradeLabel := "nvidia.com/gpu-driver-upgrade-state"
+		if node.Labels != nil && node.Labels[upgradeLabel] == "upgrade-required" {
+			continue
+		}
+
+		// Label the node
+		if node.Labels == nil {
+			node.Labels = make(map[string]string)
+		}
+		node.Labels[upgradeLabel] = "upgrade-required"
+		if err := c.Update(ctx, node); err != nil {
+			continue
+		}
+	}
+
+	return nil
+}
+
+// nodeMatchesSelector checks if a node's labels match the given selector
+func nodeMatchesSelector(node *corev1.Node, selector map[string]string) bool {
+	if len(selector) == 0 {
+		// Empty selector matches all nodes
+		return true
+	}
+
+	for key, value := range selector {
+		// Check if the key exists in node labels
+		nodeValue, exists := node.Labels[key]
+		if !exists || nodeValue != value {
+			return false
+		}
+	}
+	return true
+}
+
 // cleanupStalePrecompiledDaemonsets deletes stale driver daemonsets which can happen
 // 1. If all nodes upgraded to the latest kernel
 // 2. no GPU nodes are present
diff --git a/controllers/state_manager.go b/controllers/state_manager.go
index 4567a7c8c..de3d95458 100644
--- a/controllers/state_manager.go
+++ b/controllers/state_manager.go
@@ -35,6 +35,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client/config"
 
 	gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
+	nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
 )
 
 const (
@@ -951,13 +952,28 @@ func (n *ClusterPolicyController) step() (gpuv1.State, error) {
 		n.logger.Info("NVIDIADriver CRD is enabled, cleaning up all NVIDIA driver daemonsets owned by ClusterPolicy")
 		n.idx++
 		// Cleanup all driver daemonsets owned by ClusterPolicy.
-		err := n.cleanupAllDriverDaemonSets(n.ctx)
+		// Use orphanPods=true to keep driver pods running during the transition.
+		err := n.cleanupAllDriverDaemonSets(n.ctx, true)
 		if err != nil {
 			return gpuv1.NotReady, fmt.Errorf("failed to cleanup all NVIDIA driver daemonsets owned by ClusterPolicy: %w", err)
 		}
 		return gpuv1.Disabled, nil
 	}
 
+	// If NVIDIADriver CRD is NOT enabled, clean up any stale driver daemonsets owned by NVIDIADriver CRs
+	// This handles the case where NVIDIADriver was previously used but now ClusterPolicy manages drivers
+	if (n.stateNames[n.idx] == "state-driver" || n.stateNames[n.idx] == "state-vgpu-manager") &&
+		!n.singleton.Spec.Driver.UseNvidiaDriverCRDType() {
+		n.logger.Info("NVIDIADriver CRD is disabled, cleaning up any stale NVIDIA driver daemonsets owned by NVIDIADriver CRs")
+		// Use orphanPods=true to keep driver pods running during the transition
+		err := n.cleanupNVIDIADriverOwnedDaemonSets(n.ctx, true)
+		if err != nil {
+			// Log error but don't fail - continue with ClusterPolicy-managed driver deployment
+			n.logger.Error(err, "failed to cleanup stale NVIDIA driver daemonsets owned by NVIDIADriver CRs")
+		}
+		// Note: Labeling nodes with orphaned driver pods is handled at the top-level reconciliation loop
+	}
+
 	for _, fs := range n.controls[n.idx] {
 		stat, err := fs(*n)
 		if err != nil {
@@ -985,6 +1001,17 @@ func (n ClusterPolicyController) last() bool {
 	return n.idx == len(n.controls)
 }
 
+// hasNVIDIADriverCRs checks if there are any NVIDIADriver CRs in the cluster
+func (n ClusterPolicyController) hasNVIDIADriverCRs(ctx context.Context) bool {
+	list := &nvidiav1alpha1.NVIDIADriverList{}
+	err := n.client.List(ctx, list)
+	if err != nil {
+		n.logger.Error(err, "failed to list NVIDIADriver CRs")
+		return false
+	}
+	return len(list.Items) > 0
+}
+
 func (n ClusterPolicyController) isStateEnabled(stateName string) bool {
 	clusterPolicySpec := &n.singleton.Spec
 
diff --git a/controllers/upgrade_controller.go b/controllers/upgrade_controller.go
index 7481239d1..4275cf13f 100644
--- a/controllers/upgrade_controller.go
+++ b/controllers/upgrade_controller.go
@@ -127,17 +127,16 @@ func (r *UpgradeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
 
 	var driverLabel map[string]string
 
-	// initialize with common app=nvidia-driver-daemonset label
-	driverLabelKey := DriverLabelKey
-	driverLabelValue := DriverLabelValue
-
-	if clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() {
-		// app component label is added for all new driver daemonsets deployed by NVIDIADriver controller
-		driverLabelKey = AppComponentLabelKey
-		driverLabelValue = AppComponentLabelValue
-	} else if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled {
-		// For OCP, when DTK is enabled app=nvidia-driver-daemonset label is not constant and changes
-		// based on rhcos version. Hence use DTK label instead
+	// Use common app.kubernetes.io/component=nvidia-driver label that works for both
+	// ClusterPolicy and NVIDIADriver managed daemonsets
+	driverLabelKey := AppComponentLabelKey
+	driverLabelValue := AppComponentLabelValue
+
+	// Special case for OCP with DTK: the component label may not be stable across RHCOS versions
+	if clusterPolicyCtrl.openshift != "" && clusterPolicyCtrl.ocpDriverToolkit.enabled &&
+		!clusterPolicy.Spec.Driver.UseNvidiaDriverCRDType() {
+		// For OCP, when DTK is enabled and using ClusterPolicy mode, app=nvidia-driver-daemonset
+		// label is not constant and changes based on rhcos version. Hence use DTK label instead
 		driverLabelKey = ocpDriverToolkitIdentificationLabel
 		driverLabelValue = ocpDriverToolkitIdentificationValue
 	}