Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package internal
import (
"context"
"fmt"
"slices"
"sort"
"strings"

Expand Down Expand Up @@ -80,38 +79,33 @@ func (h *DiscoveryHandler) Handle(ctx context.Context, s state.VirtualMachineCla
availableNodeNames[i] = n.GetName()
}

var (
featuresEnabled []string
featuresNotEnabled []string
)
var featuresEnabled []string
switch cpuType {
case v1alpha2.CPUTypeDiscovery:
if fs := current.Status.CpuFeatures.Enabled; len(fs) > 0 {
featuresEnabled = fs
break
}
featuresEnabled = h.discoveryCommonFeatures(nodes)
// Always recompute features from availableNodes so the result reflects
// the current set of nodes filtered by spec.nodeSelector. Caching
// Status.CpuFeatures.Enabled would lock the class to whatever node
// composition existed at first reconcile and break when nodes are
// added/removed later.
featuresEnabled = h.discoveryCommonFeatures(availableNodes)
case v1alpha2.CPUTypeFeatures:
featuresEnabled = current.Spec.CPU.Features
}

if cpuType == v1alpha2.CPUTypeDiscovery || cpuType == v1alpha2.CPUTypeFeatures {
commonFeatures := h.discoveryCommonFeatures(availableNodes)
for _, cf := range commonFeatures {
if !slices.Contains(featuresEnabled, cf) {
featuresNotEnabled = append(featuresNotEnabled, cf)
}
}
}

cb := conditions.NewConditionBuilder(vmclasscondition.TypeDiscovered).Generation(current.GetGeneration())
switch cpuType {
case v1alpha2.CPUTypeDiscovery:
if len(availableNodes) == 0 {
cb.Message("No nodes match the class nodeSelector; skipping feature discovery.").
Reason(vmclasscondition.ReasonDiscoveryFailed).
Status(metav1.ConditionFalse)
break
}
if len(featuresEnabled) > 0 {
cb.Message("").Reason(vmclasscondition.ReasonDiscoverySucceeded).Status(metav1.ConditionTrue)
break
}
cb.Message("No common features are discovered on nodes.").
cb.Message("No common CPU features are discovered across available nodes.").
Reason(vmclasscondition.ReasonDiscoveryFailed).
Status(metav1.ConditionFalse)
default:
Expand All @@ -123,7 +117,6 @@ func (h *DiscoveryHandler) Handle(ctx context.Context, s state.VirtualMachineCla

sort.Strings(availableNodeNames)
sort.Strings(featuresEnabled)
sort.Strings(featuresNotEnabled)

addedNodes, removedNodes := NodeNamesDiff(current.Status.AvailableNodes, availableNodeNames)
if len(addedNodes) > 0 || len(removedNodes) > 0 {
Expand All @@ -150,8 +143,7 @@ func (h *DiscoveryHandler) Handle(ctx context.Context, s state.VirtualMachineCla
changed.Status.AvailableNodes = availableNodeNames
changed.Status.MaxAllocatableResources = h.maxAllocatableResources(availableNodes)
changed.Status.CpuFeatures = v1alpha2.CpuFeatures{
Enabled: featuresEnabled,
NotEnabledCommon: featuresNotEnabled,
Enabled: featuresEnabled,
}

return reconcile.Result{}, nil
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -519,5 +519,102 @@ var _ = Describe("DiscoveryHandler", func() {
Expect(changed.Status.CpuFeatures.Enabled).To(ConsistOf("vmx", "svm"))
Expect(changed.Status.CpuFeatures.Enabled).NotTo(ContainElement("lm"))
})

It("should recompute features when available nodes change, not reuse cached status", func() {
// Regression test: previously the handler cached
// Status.CpuFeatures.Enabled forever, so when a node with a rare
// CPU feature disappeared from availableNodes, the class kept
// advertising that feature and VMs refused to schedule.
nodeOld := newNodeWithLabels("node-old", map[string]string{
virtv1.CPUFeatureLabel + "vmx": "true",
virtv1.CPUFeatureLabel + "avx": "true",
virtv1.CPUFeatureLabel + "hle": "true",
})
nodeNew := newNodeWithLabels("node-new", map[string]string{
virtv1.CPUFeatureLabel + "vmx": "true",
virtv1.CPUFeatureLabel + "avx": "true",
virtv1.CPUFeatureLabel + "fma": "true",
})
handlerOld := newVirtHandlerPod("node-old")
handlerNew := newVirtHandlerPod("node-new")

vmc := newVMClass("test-cache-staleness", v1alpha2.CPUTypeDiscovery, nil, nil)
vmcState, resource := setupDiscoveryEnvironment(vmc,
nodeOld, nodeNew,
handlerOld, handlerNew)

ctx := context.Background()
mockRecorder := &eventrecord.EventRecorderLoggerMock{
EventfFunc: func(involved client.Object, eventtype, reason, messageFmt string, args ...any) {},
}
handler := NewDiscoveryHandler(mockRecorder)

// Two passes are required: first sets the Discovered condition to
// Unknown (addAllUnknown requeue), second performs the actual
// discovery and writes Status.CpuFeatures.
_, err := handler.Handle(ctx, vmcState)
Expect(err).NotTo(HaveOccurred())
_, err = handler.Handle(ctx, vmcState)
Expect(err).NotTo(HaveOccurred())

changed := resource.Changed()
Expect(changed.Status.AvailableNodes).To(ConsistOf("node-old", "node-new"))
Expect(changed.Status.CpuFeatures.Enabled).To(ConsistOf("vmx", "avx"))

// Simulate node-old disappearing (e.g. drained, deleted) by
// re-running discovery with only node-new in availableNodes. The
// next reconcile must drop hle/rtm from Enabled without manual
// intervention on the class status.
vmcState, resource = setupDiscoveryEnvironment(vmc,
nodeNew,
handlerNew)
// Replay the same reconcile pattern again on the fresh state.
_, err = handler.Handle(ctx, vmcState)
Expect(err).NotTo(HaveOccurred())
_, err = handler.Handle(ctx, vmcState)
Expect(err).NotTo(HaveOccurred())

changed = resource.Changed()
Expect(changed.Status.AvailableNodes).To(ConsistOf("node-new"))
Expect(changed.Status.CpuFeatures.Enabled).To(ConsistOf("vmx", "avx", "fma"))
})

It("should mark Discovered=False when available nodes have no common CPU features", func() {
// Pick two labels that exist on opposite nodes so there is no
// common CPU feature between availableNodes.
node1 := newNodeWithLabels("node1", map[string]string{
virtv1.CPUFeatureLabel + "hle": "true",
})
node2 := newNodeWithLabels("node2", map[string]string{
virtv1.CPUFeatureLabel + "rtm": "true",
})
handler1 := newVirtHandlerPod("node1")
handler2 := newVirtHandlerPod("node2")

vmc := newVMClass("test-no-common", v1alpha2.CPUTypeDiscovery, nil, nil)
vmcState, resource := setupDiscoveryEnvironment(vmc,
node1, node2,
handler1, handler2)

ctx := context.Background()
mockRecorder := &eventrecord.EventRecorderLoggerMock{
EventfFunc: func(involved client.Object, eventtype, reason, messageFmt string, args ...any) {},
}
handler := NewDiscoveryHandler(mockRecorder)

_, err := handler.Handle(ctx, vmcState)
Expect(err).NotTo(HaveOccurred())
_, err = handler.Handle(ctx, vmcState)
Expect(err).NotTo(HaveOccurred())

changed := resource.Changed()
Expect(changed.Status.AvailableNodes).To(ConsistOf("node1", "node2"))
Expect(changed.Status.CpuFeatures.Enabled).To(BeEmpty())

cond := conditions.FindStatusCondition(changed.Status.Conditions, vmclasscondition.TypeDiscovered.String())
Expect(cond).NotTo(BeNil())
Expect(cond.Status).To(Equal(metav1.ConditionFalse))
Expect(cond.Reason).To(Equal(vmclasscondition.ReasonDiscoveryFailed.String()))
})
})
})
Loading