diff --git a/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery.go b/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery.go index de4882f3b4..c9a83dde0f 100644 --- a/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery.go +++ b/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery.go @@ -19,7 +19,6 @@ package internal import ( "context" "fmt" - "slices" "sort" "strings" @@ -80,38 +79,33 @@ func (h *DiscoveryHandler) Handle(ctx context.Context, s state.VirtualMachineCla availableNodeNames[i] = n.GetName() } - var ( - featuresEnabled []string - featuresNotEnabled []string - ) + var featuresEnabled []string switch cpuType { case v1alpha2.CPUTypeDiscovery: - if fs := current.Status.CpuFeatures.Enabled; len(fs) > 0 { - featuresEnabled = fs - break - } - featuresEnabled = h.discoveryCommonFeatures(nodes) + // Always recompute features from availableNodes so the result reflects + // the current set of nodes filtered by spec.nodeSelector. Caching + // Status.CpuFeatures.Enabled would lock the class to whatever node + // composition existed at first reconcile and break when nodes are + // added/removed later. + featuresEnabled = h.discoveryCommonFeatures(availableNodes) case v1alpha2.CPUTypeFeatures: featuresEnabled = current.Spec.CPU.Features } - if cpuType == v1alpha2.CPUTypeDiscovery || cpuType == v1alpha2.CPUTypeFeatures { - commonFeatures := h.discoveryCommonFeatures(availableNodes) - for _, cf := range commonFeatures { - if !slices.Contains(featuresEnabled, cf) { - featuresNotEnabled = append(featuresNotEnabled, cf) - } - } - } - cb := conditions.NewConditionBuilder(vmclasscondition.TypeDiscovered).Generation(current.GetGeneration()) switch cpuType { case v1alpha2.CPUTypeDiscovery: + if len(availableNodes) == 0 { + cb.Message("No nodes match the class nodeSelector; skipping feature discovery."). + Reason(vmclasscondition.ReasonDiscoveryFailed). + Status(metav1.ConditionFalse) + break + } if len(featuresEnabled) > 0 { cb.Message("").Reason(vmclasscondition.ReasonDiscoverySucceeded).Status(metav1.ConditionTrue) break } - cb.Message("No common features are discovered on nodes."). + cb.Message("No common CPU features are discovered across available nodes."). Reason(vmclasscondition.ReasonDiscoveryFailed). Status(metav1.ConditionFalse) default: @@ -123,7 +117,6 @@ func (h *DiscoveryHandler) Handle(ctx context.Context, s state.VirtualMachineCla sort.Strings(availableNodeNames) sort.Strings(featuresEnabled) - sort.Strings(featuresNotEnabled) addedNodes, removedNodes := NodeNamesDiff(current.Status.AvailableNodes, availableNodeNames) if len(addedNodes) > 0 || len(removedNodes) > 0 { @@ -150,8 +143,7 @@ func (h *DiscoveryHandler) Handle(ctx context.Context, s state.VirtualMachineCla changed.Status.AvailableNodes = availableNodeNames changed.Status.MaxAllocatableResources = h.maxAllocatableResources(availableNodes) changed.Status.CpuFeatures = v1alpha2.CpuFeatures{ - Enabled: featuresEnabled, - NotEnabledCommon: featuresNotEnabled, + Enabled: featuresEnabled, } return reconcile.Result{}, nil diff --git a/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery_test.go b/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery_test.go index 0fa47ed3aa..3998111337 100644 --- a/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery_test.go +++ b/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery_test.go @@ -519,5 +519,102 @@ var _ = Describe("DiscoveryHandler", func() { Expect(changed.Status.CpuFeatures.Enabled).To(ConsistOf("vmx", "svm")) Expect(changed.Status.CpuFeatures.Enabled).NotTo(ContainElement("lm")) }) + + It("should recompute features when available nodes change, not reuse cached status", func() { + // Regression test: previously the handler cached + // Status.CpuFeatures.Enabled forever, so when a node with a rare + // CPU feature disappeared from availableNodes, the class kept + // advertising that feature and VMs refused to schedule. + nodeOld := newNodeWithLabels("node-old", map[string]string{ + virtv1.CPUFeatureLabel + "vmx": "true", + virtv1.CPUFeatureLabel + "avx": "true", + virtv1.CPUFeatureLabel + "hle": "true", + }) + nodeNew := newNodeWithLabels("node-new", map[string]string{ + virtv1.CPUFeatureLabel + "vmx": "true", + virtv1.CPUFeatureLabel + "avx": "true", + virtv1.CPUFeatureLabel + "fma": "true", + }) + handlerOld := newVirtHandlerPod("node-old") + handlerNew := newVirtHandlerPod("node-new") + + vmc := newVMClass("test-cache-staleness", v1alpha2.CPUTypeDiscovery, nil, nil) + vmcState, resource := setupDiscoveryEnvironment(vmc, + nodeOld, nodeNew, + handlerOld, handlerNew) + + ctx := context.Background() + mockRecorder := &eventrecord.EventRecorderLoggerMock{ + EventfFunc: func(involved client.Object, eventtype, reason, messageFmt string, args ...any) {}, + } + handler := NewDiscoveryHandler(mockRecorder) + + // Two passes are required: first sets the Discovered condition to + // Unknown (addAllUnknown requeue), second performs the actual + // discovery and writes Status.CpuFeatures. + _, err := handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + _, err = handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + + changed := resource.Changed() + Expect(changed.Status.AvailableNodes).To(ConsistOf("node-old", "node-new")) + Expect(changed.Status.CpuFeatures.Enabled).To(ConsistOf("vmx", "avx")) + + // Simulate node-old disappearing (e.g. drained, deleted) by + // re-running discovery with only node-new in availableNodes. The + // next reconcile must drop hle/rtm from Enabled without manual + // intervention on the class status. + vmcState, resource = setupDiscoveryEnvironment(vmc, + nodeNew, + handlerNew) + // Replay the same reconcile pattern again on the fresh state. + _, err = handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + _, err = handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + + changed = resource.Changed() + Expect(changed.Status.AvailableNodes).To(ConsistOf("node-new")) + Expect(changed.Status.CpuFeatures.Enabled).To(ConsistOf("vmx", "avx", "fma")) + }) + + It("should mark Discovered=False when available nodes have no common CPU features", func() { + // Pick two labels that exist on opposite nodes so there is no + // common CPU feature between availableNodes. + node1 := newNodeWithLabels("node1", map[string]string{ + virtv1.CPUFeatureLabel + "hle": "true", + }) + node2 := newNodeWithLabels("node2", map[string]string{ + virtv1.CPUFeatureLabel + "rtm": "true", + }) + handler1 := newVirtHandlerPod("node1") + handler2 := newVirtHandlerPod("node2") + + vmc := newVMClass("test-no-common", v1alpha2.CPUTypeDiscovery, nil, nil) + vmcState, resource := setupDiscoveryEnvironment(vmc, + node1, node2, + handler1, handler2) + + ctx := context.Background() + mockRecorder := &eventrecord.EventRecorderLoggerMock{ + EventfFunc: func(involved client.Object, eventtype, reason, messageFmt string, args ...any) {}, + } + handler := NewDiscoveryHandler(mockRecorder) + + _, err := handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + _, err = handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + + changed := resource.Changed() + Expect(changed.Status.AvailableNodes).To(ConsistOf("node1", "node2")) + Expect(changed.Status.CpuFeatures.Enabled).To(BeEmpty()) + + cond := conditions.FindStatusCondition(changed.Status.Conditions, vmclasscondition.TypeDiscovered.String()) + Expect(cond).NotTo(BeNil()) + Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + Expect(cond.Reason).To(Equal(vmclasscondition.ReasonDiscoveryFailed.String())) + }) }) })