From 14a6a0250294fea5d879275b379842f13d3d0fd5 Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Wed, 18 Feb 2026 20:38:22 -0800 Subject: [PATCH] remove dependency on /etc/os-release host mount Signed-off-by: Tariq Ibrahim --- ...rator-certified.clusterserviceversion.yaml | 8 - config/manager/manager.yaml | 8 - controllers/object_controls.go | 74 +++------ controllers/object_controls_test.go | 71 --------- controllers/transforms_test.go | 144 +++++++++++++++--- .../gpu-operator/templates/operator.yaml | 8 - 6 files changed, 140 insertions(+), 173 deletions(-) diff --git a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml index c79bbb09a..e2b03100b 100644 --- a/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml +++ b/bundle/manifests/gpu-operator-certified.clusterserviceversion.yaml @@ -904,10 +904,6 @@ spec: memory: 200Mi securityContext: allowPrivilegeEscalation: false - volumeMounts: - - mountPath: /host-etc/os-release - name: host-os-release - readOnly: true env: - name: OPERATOR_NAMESPACE valueFrom: @@ -945,10 +941,6 @@ spec: - name: "GDRCOPY_IMAGE" value: "nvcr.io/nvidia/cloud-native/gdrdrv@sha256:5c4e61f7ba83d7a64ff2523d447c209ce5bde1ddc79acaf1f32f19620b4912d6" terminationGracePeriodSeconds: 10 - volumes: - - hostPath: - path: /etc/os-release - name: host-os-release serviceAccountName: gpu-operator strategy: deployment installModes: diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index c6c05b64c..b8247fde4 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -56,10 +56,6 @@ spec: memory: 50Mi securityContext: allowPrivilegeEscalation: false - volumeMounts: - - mountPath: /host-etc/os-release - name: host-os-release - readOnly: true env: - name: OPERATOR_NAMESPACE valueFrom: @@ -72,7 +68,3 @@ spec: - name: metrics containerPort: 8080 terminationGracePeriodSeconds: 10 - volumes: - - hostPath: - path: /etc/os-release - name: host-os-release diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 05c809625..390d01935 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -17,11 +17,9 @@ package controllers import ( - "bufio" "context" "errors" "fmt" - "os" "path" "path/filepath" "regexp" @@ -1003,36 +1001,6 @@ func setNRIPluginAnnotation(o *metav1.ObjectMeta, cdiConfig *gpuv1.CDIConfigSpec o.Annotations = annotations } -// parseOSRelease can be overridden in tests for mocking filesystem access. -// In production, it reads and parses /host-etc/os-release. -var parseOSRelease = parseOSReleaseFromFile - -// osReleaseFilePath is the path to the os-release file, configurable for testing. -var osReleaseFilePath = "/host-etc/os-release" - -// parseOSReleaseFromFile reads and parses the os-release file from the host filesystem. -func parseOSReleaseFromFile() (map[string]string, error) { - release := map[string]string{} - - f, err := os.Open(osReleaseFilePath) - if err != nil { - return nil, err - } - defer f.Close() - - re := regexp.MustCompile(`^(?P\w+)=(?P.+)`) - - // Read line-by-line - s := bufio.NewScanner(f) - for s.Scan() { - line := s.Text() - if m := re.FindStringSubmatch(line); m != nil { - release[m[1]] = strings.Trim(m[2], `"`) - } - } - return release, nil -} - func TransformDCGMExporterService(obj *corev1.Service, config *gpuv1.ClusterPolicySpec) error { serviceConfig := config.DCGMExporter.ServiceSpec if serviceConfig != nil { @@ -3299,9 +3267,9 @@ func resolveDriverTag(n ClusterPolicyController, driverSpec interface{}) (string return image, nil } -// gpuNodeOSID returns the base OS identifier (e.g. "rhel", "ubuntu", "rocky") for GPU +// getGPUNodeOSID returns the base OS identifier (e.g. "rhel", "ubuntu", "rocky") for GPU // worker nodes by extracting the version suffix from the osTag obtained via NFD labels. -func (n ClusterPolicyController) gpuNodeOSID() (string, string, error) { +func (n ClusterPolicyController) getGPUNodeOSID() (string, string, error) { _, osTag, _ := kernelFullVersion(n) if osTag == "" { return "", "", fmt.Errorf("unable to determine GPU node OS from NFD labels, is NFD installed?") @@ -3314,7 +3282,7 @@ func (n ClusterPolicyController) gpuNodeOSID() (string, string, error) { // getRepoConfigPath returns the standard OS specific path for repository configuration files. func (n ClusterPolicyController) getRepoConfigPath() (string, error) { - osID, osTag, err := n.gpuNodeOSID() + osID, osTag, err := n.getGPUNodeOSID() if err != nil { return "", err } @@ -3326,7 +3294,7 @@ func (n ClusterPolicyController) getRepoConfigPath() (string, error) { // getCertConfigPath returns the standard OS specific path for ssl keys/certificates. func (n ClusterPolicyController) getCertConfigPath() (string, error) { - osID, osTag, err := n.gpuNodeOSID() + osID, osTag, err := n.getGPUNodeOSID() if err != nil { return "", err } @@ -3338,17 +3306,15 @@ func (n ClusterPolicyController) getCertConfigPath() (string, error) { // getSubscriptionPathsToVolumeSources returns the MountPathToVolumeSource map containing all // OS-specific subscription/entitlement paths that need to be mounted in the container. -func getSubscriptionPathsToVolumeSources() (MountPathToVolumeSource, error) { - release, err := parseOSRelease() +func (n ClusterPolicyController) getSubscriptionPathsToVolumeSources() (MountPathToVolumeSource, error) { + osID, osTag, err := n.getGPUNodeOSID() if err != nil { return nil, err } - - os := release["ID"] - if pathToVolumeSource, ok := SubscriptionPathMap[os]; ok { + if pathToVolumeSource, ok := SubscriptionPathMap[osID]; ok { return pathToVolumeSource, nil } - return nil, fmt.Errorf("distribution not supported") + return nil, fmt.Errorf("subscription paths not found for distribution %s", osTag) } // createConfigMapVolumeMounts creates a VolumeMount for each key @@ -3612,15 +3578,14 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy } } - release, err := parseOSRelease() + osID, _, err := n.getGPUNodeOSID() if err != nil { - return fmt.Errorf("ERROR: failed to get os-release: %s", err) + return fmt.Errorf("ERROR: failed to retrieve OS name of GPU Node: %w", err) } - // set up subscription entitlements for RHEL(using K8s with a non-CRIO runtime) and SLES - if (release["ID"] == "rhel" && n.openshift == "" && n.runtime != gpuv1.CRIO) || release["ID"] == "sles" || release["ID"] == "sl-micro" { - n.logger.Info("Mounting subscriptions into the driver container", "OS", release["ID"]) - pathToVolumeSource, err := getSubscriptionPathsToVolumeSources() + if (osID == "rhel" && n.openshift == "" && n.runtime != gpuv1.CRIO) || osID == "sles" || osID == "sl-micro" { + n.logger.Info("Mounting subscriptions into the driver container", "OS", osID) + pathToVolumeSource, err := n.getSubscriptionPathsToVolumeSources() if err != nil { return fmt.Errorf("ERROR: failed to get path items for subscription entitlements: %v", err) } @@ -3648,8 +3613,8 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy } // apply proxy and env settings if this is an OpenShift cluster - if _, ok := release["OPENSHIFT_VERSION"]; ok { - setContainerEnv(driverContainer, "OPENSHIFT_VERSION", release["OPENSHIFT_VERSION"]) + if len(n.openshift) > 0 { + setContainerEnv(driverContainer, "OPENSHIFT_VERSION", n.openshift) // Automatically apply proxy settings for OCP and inject custom CA if configured by user // https://docs.openshift.com/container-platform/4.6/networking/configuring-a-custom-pki.html @@ -3720,14 +3685,9 @@ func transformVGPUManagerContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterP container.Args = config.VGPUManager.Args } - release, err := parseOSRelease() - if err != nil { - return fmt.Errorf("ERROR: failed to get os-release: %s", err) - } - // add env for OCP - if _, ok := release["OPENSHIFT_VERSION"]; ok { - setContainerEnv(container, "OPENSHIFT_VERSION", release["OPENSHIFT_VERSION"]) + if len(n.openshift) > 0 { + setContainerEnv(container, "OPENSHIFT_VERSION", n.openshift) } if len(config.VGPUManager.Env) > 0 { diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index 311566873..41e934d3c 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -149,18 +149,6 @@ func getModuleRoot(dir string) (string, error) { return dir, nil } -// mockOSRelease returns a mock parseOSRelease function for testing. -// It allows tests to simulate different operating systems without filesystem access. -func mockOSRelease(osID, version string) func() (map[string]string, error) { - return func() (map[string]string, error) { - return map[string]string{ - "ID": osID, - "VERSION_ID": version, - "NAME": osID, - }, nil - } -} - // setup creates a mock kubernetes cluster and client. Nodes are labeled with the minimum // required NFD labels to be detected as GPU nodes by the GPU Operator. A sample // ClusterPolicy resource is applied to the cluster. The ClusterPolicyController @@ -173,9 +161,6 @@ func setup() error { boolTrue = new(bool) *boolTrue = true - // Mock parseOSRelease to avoid filesystem dependency in tests - parseOSRelease = mockOSRelease("ubuntu", "20.04") - s := scheme.Scheme if err := gpuv1.AddToScheme(s); err != nil { return fmt.Errorf("unable to add ClusterPolicy v1 schema: %v", err) @@ -1452,62 +1437,6 @@ func TestService(t *testing.T) { } } -func TestParseOSReleaseFromFile(t *testing.T) { - tests := []struct { - description string - content string - expected map[string]string - }{ - { - description: "quoted values", - content: `NAME="Ubuntu"` + "\n" + `VERSION_ID="20.04"`, - expected: map[string]string{"NAME": "Ubuntu", "VERSION_ID": "20.04"}, - }, - { - description: "unquoted values", - content: `NAME=Ubuntu` + "\n" + `ID=ubuntu`, - expected: map[string]string{"NAME": "Ubuntu", "ID": "ubuntu"}, - }, - { - description: "mixed quoted and unquoted", - content: `ID="rhel"` + "\n" + `VERSION_ID=8.5`, - expected: map[string]string{"ID": "rhel", "VERSION_ID": "8.5"}, - }, - { - description: "empty lines and comments", - content: `NAME="Ubuntu"` + "\n\n# comment\n" + `ID=ubuntu`, - expected: map[string]string{"NAME": "Ubuntu", "ID": "ubuntu"}, - }, - } - - tempDir := t.TempDir() - - // Save original value and restore after tests for future subsequent tests (if needed) - originalPath := osReleaseFilePath - defer func() { osReleaseFilePath = originalPath }() - - for i, test := range tests { - t.Run(test.description, func(t *testing.T) { - testFile := filepath.Join(tempDir, fmt.Sprintf("os-release-%d", i)) - err := os.WriteFile(testFile, []byte(test.content), 0600) - require.NoError(t, err) - - // Override the path for this test - osReleaseFilePath = testFile - result, err := parseOSReleaseFromFile() - require.NoError(t, err) - require.Equal(t, test.expected, result) - }) - } - - t.Run("file not found", func(t *testing.T) { - osReleaseFilePath = "/nonexistent/path" - _, err := parseOSReleaseFromFile() - require.Error(t, err) - require.True(t, os.IsNotExist(err)) - }) -} - func TestCertConfigPathMap(t *testing.T) { expectedPaths := map[string]string{ "centos": "/etc/pki/ca-trust/extracted/pem", diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go index 26cdd4ccc..3d75efeee 100644 --- a/controllers/transforms_test.go +++ b/controllers/transforms_test.go @@ -50,8 +50,10 @@ func initMockK8sClients() { ObjectMeta: metav1.ObjectMeta{ Name: "test-node", Labels: map[string]string{ - nfdKernelLabelKey: "6.8.0-60-generic", - commonGPULabelKey: "true", + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "20.04", + nfdKernelLabelKey: "6.8.0-60-generic", + commonGPULabelKey: "true", }, }, } @@ -2866,7 +2868,7 @@ func TestTransformDriver(t *testing.T) { client: mockClientMap["secret-env-client"], expectedDs: NewDaemonset().WithContainer(corev1.Container{ Name: "nvidia-driver-ctr", - Image: "nvcr.io/nvidia/driver:570.172.08-", + Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04", ImagePullPolicy: corev1.PullIfNotPresent, EnvFrom: []corev1.EnvFromSource{{ SecretRef: &corev1.SecretEnvSource{ @@ -2887,7 +2889,7 @@ func TestTransformDriver(t *testing.T) { }, }).WithContainer(corev1.Container{ Name: "nvidia-fs", - Image: "nvcr.io/nvidia/cloud-native/nvidia-fs:2.20.5-", + Image: "nvcr.io/nvidia/cloud-native/nvidia-fs:2.20.5-ubuntu20.04", EnvFrom: []corev1.EnvFromSource{{ SecretRef: &corev1.SecretEnvSource{ LocalObjectReference: corev1.LocalObjectReference{ @@ -2897,7 +2899,7 @@ func TestTransformDriver(t *testing.T) { }}, }).WithContainer(corev1.Container{ Name: "nvidia-gdrcopy", - Image: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5-", + Image: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5-ubuntu20.04", EnvFrom: []corev1.EnvFromSource{{ SecretRef: &corev1.SecretEnvSource{ LocalObjectReference: corev1.LocalObjectReference{ @@ -3176,8 +3178,10 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "test-node", Labels: map[string]string{ - nfdKernelLabelKey: "6.8.0-60-generic", - commonGPULabelKey: "true", + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "20.04", + nfdKernelLabelKey: "6.8.0-60-generic", + commonGPULabelKey: "true", }, }, } @@ -3215,7 +3219,7 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) { client: mockClient, expectedDs: NewDaemonset().WithContainer(corev1.Container{ Name: "nvidia-driver-ctr", - Image: "nvcr.io/nvidia/driver:570.172.08-", + Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04", ImagePullPolicy: corev1.PullIfNotPresent, VolumeMounts: []corev1.VolumeMount{ { @@ -3269,7 +3273,7 @@ func TestTransformDriverWithLicensingConfig(t *testing.T) { client: mockClient, expectedDs: NewDaemonset().WithContainer(corev1.Container{ Name: "nvidia-driver-ctr", - Image: "nvcr.io/nvidia/driver:570.172.08-", + Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04", ImagePullPolicy: corev1.PullIfNotPresent, VolumeMounts: []corev1.VolumeMount{ { @@ -3326,8 +3330,10 @@ func TestTransformDriverWithResources(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "test-node", Labels: map[string]string{ - nfdKernelLabelKey: "6.8.0-60-generic", - commonGPULabelKey: "true", + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "20.04", + nfdKernelLabelKey: "6.8.0-60-generic", + commonGPULabelKey: "true", }, }, } @@ -3387,7 +3393,7 @@ func TestTransformDriverWithResources(t *testing.T) { client: mockClient, expectedDs: NewDaemonset().WithContainer(corev1.Container{ Name: "nvidia-driver-ctr", - Image: "nvcr.io/nvidia/driver:570.172.08-", + Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04", ImagePullPolicy: corev1.PullIfNotPresent, Resources: corev1.ResourceRequirements{ Requests: resources.Requests, @@ -3405,14 +3411,14 @@ func TestTransformDriverWithResources(t *testing.T) { }, }).WithContainer(corev1.Container{ Name: "nvidia-fs", - Image: "nvcr.io/nvidia/cloud-native/nvidia-fs:2.20.5-", + Image: "nvcr.io/nvidia/cloud-native/nvidia-fs:2.20.5-ubuntu20.04", Resources: corev1.ResourceRequirements{ Requests: resources.Requests, Limits: resources.Limits, }, }).WithContainer(corev1.Container{ Name: "nvidia-gdrcopy", - Image: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5-", + Image: "nvcr.io/nvidia/cloud-native/gdrdrv:v2.5-ubuntu20.04", Resources: corev1.ResourceRequirements{ Requests: resources.Requests, Limits: resources.Limits, @@ -3448,8 +3454,10 @@ func TestTransformDriverRDMA(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "test-node", Labels: map[string]string{ - nfdKernelLabelKey: "6.8.0-60-generic", - commonGPULabelKey: "true", + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "20.04", + nfdKernelLabelKey: "6.8.0-60-generic", + commonGPULabelKey: "true", }, }, } @@ -3478,7 +3486,7 @@ func TestTransformDriverRDMA(t *testing.T) { expectedDs := NewDaemonset().WithContainer(corev1.Container{ Name: "nvidia-driver-ctr", - Image: "nvcr.io/nvidia/driver:570.172.08-", + Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04", ImagePullPolicy: corev1.PullIfNotPresent, Env: []corev1.EnvVar{ { @@ -3505,7 +3513,7 @@ func TestTransformDriverRDMA(t *testing.T) { }, }).WithContainer(corev1.Container{ Name: "nvidia-peermem", - Image: "nvcr.io/nvidia/driver:570.172.08-", + Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04", Env: []corev1.EnvVar{ { Name: "USE_HOST_MOFED", @@ -3529,8 +3537,10 @@ func TestTransformDriverVGPUTopologyConfig(t *testing.T) { ObjectMeta: metav1.ObjectMeta{ Name: "test-node", Labels: map[string]string{ - nfdKernelLabelKey: "6.8.0-60-generic", - commonGPULabelKey: "true", + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "20.04", + nfdKernelLabelKey: "6.8.0-60-generic", + commonGPULabelKey: "true", }, }, } @@ -3555,7 +3565,7 @@ func TestTransformDriverVGPUTopologyConfig(t *testing.T) { expectedDs := NewDaemonset().WithContainer(corev1.Container{ Name: "nvidia-driver-ctr", - Image: "nvcr.io/nvidia/driver:570.172.08-", + Image: "nvcr.io/nvidia/driver:570.172.08-ubuntu20.04", ImagePullPolicy: corev1.PullIfNotPresent, VolumeMounts: []corev1.VolumeMount{ { @@ -3816,3 +3826,95 @@ func TestTransformVGPUManager(t *testing.T) { }) } } + +func TestTransformDriverWithAdditionalConfig(t *testing.T) { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-node", + Labels: map[string]string{ + nfdOSReleaseIDLabelKey: "ubuntu", + nfdOSVersionIDLabelKey: "24.04", + nfdKernelLabelKey: "6.8.0-60-generic", + commonGPULabelKey: "true", + }, + }, + } + + testCertConfigMap := &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cert", + Namespace: "test-ns", + }, + } + + mockClient := fake.NewFakeClient(node, testCertConfigMap) + + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + client client.Client + expectedDs Daemonset + errorExpected bool + }{ + { + description: "transform driver with cert config", + ds: NewDaemonset().WithContainer(corev1.Container{Name: "nvidia-driver-ctr"}). + WithInitContainer(corev1.Container{Name: "k8s-driver-manager"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Driver: gpuv1.DriverSpec{ + Repository: "nvcr.io/nvidia", + Image: "driver", + ImagePullPolicy: "IfNotPresent", + Version: "580.126.16", + Manager: gpuv1.DriverManagerSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "k8s-driver-manager", + ImagePullPolicy: "IfNotPresent", + Version: "v0.8.0", + }, + CertConfig: &gpuv1.DriverCertConfigSpec{ + Name: "test-cert", + }, + }, + }, + client: mockClient, + expectedDs: NewDaemonset().WithContainer(corev1.Container{ + Name: "nvidia-driver-ctr", + Image: "nvcr.io/nvidia/driver:580.126.16-ubuntu24.04", + ImagePullPolicy: corev1.PullIfNotPresent, + }).WithInitContainer(corev1.Container{ + Name: "k8s-driver-manager", + Image: "nvcr.io/nvidia/cloud-native/k8s-driver-manager:v0.8.0", + ImagePullPolicy: corev1.PullIfNotPresent, + }).WithVolume(corev1.Volume{ + Name: "test-cert", + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: "test-cert", + }, + }, + }, + }), + errorExpected: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + err := TransformDriver(tc.ds.DaemonSet, tc.cpSpec, + ClusterPolicyController{client: tc.client, runtime: gpuv1.Containerd, + operatorNamespace: "test-ns", logger: ctrl.Log.WithName("test")}) + if tc.errorExpected { + require.Error(t, err) + return + } + require.NoError(t, err) + + // Remove dynamically generated digest before comparison + removeDigestFromDaemonSet(tc.ds.DaemonSet) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} diff --git a/deployments/gpu-operator/templates/operator.yaml b/deployments/gpu-operator/templates/operator.yaml index b5dc545ad..c6ec3bf4d 100644 --- a/deployments/gpu-operator/templates/operator.yaml +++ b/deployments/gpu-operator/templates/operator.yaml @@ -59,10 +59,6 @@ spec: fieldPath: metadata.namespace - name: "DRIVER_MANAGER_IMAGE" value: "{{ include "driver-manager.fullimage" . }}" - volumeMounts: - - name: host-os-release - mountPath: "/host-etc/os-release" - readOnly: true livenessProbe: httpGet: path: /healthz @@ -82,10 +78,6 @@ spec: ports: - name: metrics containerPort: 8080 - volumes: - - name: host-os-release - hostPath: - path: "/etc/os-release" {{- with .Values.operator.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }}