From bb87e014b4ffd14253eb55fba6ea4587fe976ce1 Mon Sep 17 00:00:00 2001 From: Pavel Tishkov Date: Wed, 17 Jun 2026 12:43:19 +0300 Subject: [PATCH] fix(vmclass): recompute cpu features so vms can schedule on listed available nodes VirtualMachineClass with cpu.type=Discovery kept CPU features from the very first reconcile in Status.CpuFeatures.Enabled. When nodes were added or drained later, the cached feature list was never updated, so the class kept advertising features that no longer existed on availableNodes. This caused VMs stuck in Pending because the virt-launcher pod required labels that no node provided. Compute features from availableNodes on every reconcile so the list always matches the current node set filtered by spec.nodeSelector. Signed-off-by: Pavel Tishkov --- .../controller/vmclass/internal/discovery.go | 38 +++----- .../vmclass/internal/discovery_test.go | 97 +++++++++++++++++++ 2 files changed, 112 insertions(+), 23 deletions(-) diff --git a/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery.go b/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery.go index de4882f3b4..c9a83dde0f 100644 --- a/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery.go +++ b/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery.go @@ -19,7 +19,6 @@ package internal import ( "context" "fmt" - "slices" "sort" "strings" @@ -80,38 +79,33 @@ func (h *DiscoveryHandler) Handle(ctx context.Context, s state.VirtualMachineCla availableNodeNames[i] = n.GetName() } - var ( - featuresEnabled []string - featuresNotEnabled []string - ) + var featuresEnabled []string switch cpuType { case v1alpha2.CPUTypeDiscovery: - if fs := current.Status.CpuFeatures.Enabled; len(fs) > 0 { - featuresEnabled = fs - break - } - featuresEnabled = h.discoveryCommonFeatures(nodes) + // Always recompute features from availableNodes so the result reflects + // the current set of nodes filtered by spec.nodeSelector. Caching + // Status.CpuFeatures.Enabled would lock the class to whatever node + // composition existed at first reconcile and break when nodes are + // added/removed later. + featuresEnabled = h.discoveryCommonFeatures(availableNodes) case v1alpha2.CPUTypeFeatures: featuresEnabled = current.Spec.CPU.Features } - if cpuType == v1alpha2.CPUTypeDiscovery || cpuType == v1alpha2.CPUTypeFeatures { - commonFeatures := h.discoveryCommonFeatures(availableNodes) - for _, cf := range commonFeatures { - if !slices.Contains(featuresEnabled, cf) { - featuresNotEnabled = append(featuresNotEnabled, cf) - } - } - } - cb := conditions.NewConditionBuilder(vmclasscondition.TypeDiscovered).Generation(current.GetGeneration()) switch cpuType { case v1alpha2.CPUTypeDiscovery: + if len(availableNodes) == 0 { + cb.Message("No nodes match the class nodeSelector; skipping feature discovery."). + Reason(vmclasscondition.ReasonDiscoveryFailed). + Status(metav1.ConditionFalse) + break + } if len(featuresEnabled) > 0 { cb.Message("").Reason(vmclasscondition.ReasonDiscoverySucceeded).Status(metav1.ConditionTrue) break } - cb.Message("No common features are discovered on nodes."). + cb.Message("No common CPU features are discovered across available nodes."). Reason(vmclasscondition.ReasonDiscoveryFailed). Status(metav1.ConditionFalse) default: @@ -123,7 +117,6 @@ func (h *DiscoveryHandler) Handle(ctx context.Context, s state.VirtualMachineCla sort.Strings(availableNodeNames) sort.Strings(featuresEnabled) - sort.Strings(featuresNotEnabled) addedNodes, removedNodes := NodeNamesDiff(current.Status.AvailableNodes, availableNodeNames) if len(addedNodes) > 0 || len(removedNodes) > 0 { @@ -150,8 +143,7 @@ func (h *DiscoveryHandler) Handle(ctx context.Context, s state.VirtualMachineCla changed.Status.AvailableNodes = availableNodeNames changed.Status.MaxAllocatableResources = h.maxAllocatableResources(availableNodes) changed.Status.CpuFeatures = v1alpha2.CpuFeatures{ - Enabled: featuresEnabled, - NotEnabledCommon: featuresNotEnabled, + Enabled: featuresEnabled, } return reconcile.Result{}, nil diff --git a/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery_test.go b/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery_test.go index 0fa47ed3aa..3998111337 100644 --- a/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery_test.go +++ b/images/virtualization-artifact/pkg/controller/vmclass/internal/discovery_test.go @@ -519,5 +519,102 @@ var _ = Describe("DiscoveryHandler", func() { Expect(changed.Status.CpuFeatures.Enabled).To(ConsistOf("vmx", "svm")) Expect(changed.Status.CpuFeatures.Enabled).NotTo(ContainElement("lm")) }) + + It("should recompute features when available nodes change, not reuse cached status", func() { + // Regression test: previously the handler cached + // Status.CpuFeatures.Enabled forever, so when a node with a rare + // CPU feature disappeared from availableNodes, the class kept + // advertising that feature and VMs refused to schedule. + nodeOld := newNodeWithLabels("node-old", map[string]string{ + virtv1.CPUFeatureLabel + "vmx": "true", + virtv1.CPUFeatureLabel + "avx": "true", + virtv1.CPUFeatureLabel + "hle": "true", + }) + nodeNew := newNodeWithLabels("node-new", map[string]string{ + virtv1.CPUFeatureLabel + "vmx": "true", + virtv1.CPUFeatureLabel + "avx": "true", + virtv1.CPUFeatureLabel + "fma": "true", + }) + handlerOld := newVirtHandlerPod("node-old") + handlerNew := newVirtHandlerPod("node-new") + + vmc := newVMClass("test-cache-staleness", v1alpha2.CPUTypeDiscovery, nil, nil) + vmcState, resource := setupDiscoveryEnvironment(vmc, + nodeOld, nodeNew, + handlerOld, handlerNew) + + ctx := context.Background() + mockRecorder := &eventrecord.EventRecorderLoggerMock{ + EventfFunc: func(involved client.Object, eventtype, reason, messageFmt string, args ...any) {}, + } + handler := NewDiscoveryHandler(mockRecorder) + + // Two passes are required: first sets the Discovered condition to + // Unknown (addAllUnknown requeue), second performs the actual + // discovery and writes Status.CpuFeatures. + _, err := handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + _, err = handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + + changed := resource.Changed() + Expect(changed.Status.AvailableNodes).To(ConsistOf("node-old", "node-new")) + Expect(changed.Status.CpuFeatures.Enabled).To(ConsistOf("vmx", "avx")) + + // Simulate node-old disappearing (e.g. drained, deleted) by + // re-running discovery with only node-new in availableNodes. The + // next reconcile must drop hle/rtm from Enabled without manual + // intervention on the class status. + vmcState, resource = setupDiscoveryEnvironment(vmc, + nodeNew, + handlerNew) + // Replay the same reconcile pattern again on the fresh state. + _, err = handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + _, err = handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + + changed = resource.Changed() + Expect(changed.Status.AvailableNodes).To(ConsistOf("node-new")) + Expect(changed.Status.CpuFeatures.Enabled).To(ConsistOf("vmx", "avx", "fma")) + }) + + It("should mark Discovered=False when available nodes have no common CPU features", func() { + // Pick two labels that exist on opposite nodes so there is no + // common CPU feature between availableNodes. + node1 := newNodeWithLabels("node1", map[string]string{ + virtv1.CPUFeatureLabel + "hle": "true", + }) + node2 := newNodeWithLabels("node2", map[string]string{ + virtv1.CPUFeatureLabel + "rtm": "true", + }) + handler1 := newVirtHandlerPod("node1") + handler2 := newVirtHandlerPod("node2") + + vmc := newVMClass("test-no-common", v1alpha2.CPUTypeDiscovery, nil, nil) + vmcState, resource := setupDiscoveryEnvironment(vmc, + node1, node2, + handler1, handler2) + + ctx := context.Background() + mockRecorder := &eventrecord.EventRecorderLoggerMock{ + EventfFunc: func(involved client.Object, eventtype, reason, messageFmt string, args ...any) {}, + } + handler := NewDiscoveryHandler(mockRecorder) + + _, err := handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + _, err = handler.Handle(ctx, vmcState) + Expect(err).NotTo(HaveOccurred()) + + changed := resource.Changed() + Expect(changed.Status.AvailableNodes).To(ConsistOf("node1", "node2")) + Expect(changed.Status.CpuFeatures.Enabled).To(BeEmpty()) + + cond := conditions.FindStatusCondition(changed.Status.Conditions, vmclasscondition.TypeDiscovered.String()) + Expect(cond).NotTo(BeNil()) + Expect(cond.Status).To(Equal(metav1.ConditionFalse)) + Expect(cond.Reason).To(Equal(vmclasscondition.ReasonDiscoveryFailed.String())) + }) }) })