From 324639158688439f09d1f05623afe3489af52604 Mon Sep 17 00:00:00 2001 From: Miyoung Choi Date: Mon, 18 May 2026 20:27:13 -0700 Subject: [PATCH] docs(skills): initial conversion --- .claude/skills | 1 + .../gpu-operator-container-device/SKILL.md | 189 ++ .../gpu-operator-custom-driver/SKILL.md | 104 + .../gpu-operator-driver-upgrades/SKILL.md | 313 ++ .../gpu-operator-gpudirect-rdma/SKILL.md | 463 +++ .../SKILL.md | 359 +++ .../SKILL.md | 163 ++ .../gpu-operator-install-http-proxy/SKILL.md | 99 + .../gpu-operator-install-ing-nvidia/SKILL.md | 493 ++++ .../SKILL.md | 133 + .../gpu-operator-install-nvidia-vgpu/SKILL.md | 204 ++ .../SKILL.md | 98 + .../SKILL.md | 45 + .../gpu-operator-kata-containers/SKILL.md | 565 ++++ .../skills/gpu-operator-kubevirt/SKILL.md | 493 ++++ .../gpu-operator-multiinstance/SKILL.md | 519 ++++ .../gpu-operator-nvidia-amazon/SKILL.md | 167 ++ .../skills/gpu-operator-nvidia-azure/SKILL.md | 117 + .../skills/gpu-operator-nvidia-dra/SKILL.md | 308 ++ .../gpu-operator-nvidia-driver/SKILL.md | 296 ++ .../gpu-operator-nvidia-google/SKILL.md | 211 ++ .../gpu-operator-precompiled-drivers/SKILL.md | 281 ++ .../skills/gpu-operator-references/SKILL.md | 19 + .../confidential-containers-deploy.md | 17 + .../references/life-cycle-policy.md | 64 + .../references/overview.md | 64 + .../references/platform-support.md | 398 +++ .../references/release-notes.md | 2533 +++++++++++++++++ .../references/security.md | 39 + .../references/troubleshooting.md | 554 ++++ .../gpu-operator-timeslicing-gpus/SKILL.md | 390 +++ .../gpu-operator-uninstalling-nvidia/SKILL.md | 85 + .../gpu-operator-upgrading-nvidia/SKILL.md | 179 ++ gpu-operator/amazon-eks.rst | 9 + gpu-operator/cdi.rst | 9 + .../confidential-containers-deploy.rst | 8 + gpu-operator/custom-driver-params.rst | 11 + gpu-operator/deploy-kata-containers.rst | 9 + gpu-operator/dra-intro-install.rst | 9 + gpu-operator/getting-started.rst | 9 + gpu-operator/google-gke.rst | 9 + 
gpu-operator/gpu-driver-configuration.rst | 9 + gpu-operator/gpu-driver-upgrades.rst | 9 + gpu-operator/gpu-operator-kubevirt.rst | 9 + gpu-operator/gpu-operator-mig.rst | 9 + gpu-operator/gpu-operator-rdma.rst | 9 + gpu-operator/gpu-sharing.rst | 9 + gpu-operator/index.rst | 8 + .../install-gpu-operator-air-gapped.rst | 9 + .../install-gpu-operator-gov-ready.rst | 9 + gpu-operator/install-gpu-operator-nvaie.rst | 9 + .../install-gpu-operator-outdated-kernels.rst | 11 + gpu-operator/install-gpu-operator-proxy.rst | 9 + .../install-gpu-operator-service-mesh.rst | 9 + gpu-operator/install-gpu-operator-vgpu.rst | 11 + gpu-operator/life-cycle-policy.rst | 8 + gpu-operator/microsoft-aks.rst | 9 + gpu-operator/overview.rst | 8 + gpu-operator/platform-support.rst | 8 + gpu-operator/precompiled-drivers.rst | 9 + gpu-operator/release-notes.rst | 10 + gpu-operator/security.rst | 9 + gpu-operator/troubleshooting.rst | 8 + gpu-operator/uninstall.rst | 9 + gpu-operator/upgrade.rst | 9 + 65 files changed, 10252 insertions(+) create mode 120000 .claude/skills create mode 100644 gpu-operator/.agents/skills/gpu-operator-container-device/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-custom-driver/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-driver-upgrades/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-gpudirect-rdma/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-install-airgapped-environments/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-install-governmentready-environments/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-install-http-proxy/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-install-ing-nvidia/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-install-nvidia-enterprise/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-install-nvidia-vgpu/SKILL.md create mode 100644 
gpu-operator/.agents/skills/gpu-operator-install-outdated-kernels/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-install-service-mesh/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-kata-containers/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-kubevirt/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-multiinstance/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-nvidia-amazon/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-nvidia-azure/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-nvidia-dra/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-nvidia-driver/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-nvidia-google/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-precompiled-drivers/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-references/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-references/references/confidential-containers-deploy.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-references/references/life-cycle-policy.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-references/references/overview.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-references/references/platform-support.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-references/references/release-notes.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-references/references/security.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-references/references/troubleshooting.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-timeslicing-gpus/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-uninstalling-nvidia/SKILL.md create mode 100644 gpu-operator/.agents/skills/gpu-operator-upgrading-nvidia/SKILL.md diff --git a/.claude/skills 
b/.claude/skills new file mode 120000 index 000000000..5024ca287 --- /dev/null +++ b/.claude/skills @@ -0,0 +1 @@ +../gpu-operator/.agents/skills \ No newline at end of file diff --git a/gpu-operator/.agents/skills/gpu-operator-container-device/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-container-device/SKILL.md new file mode 100644 index 000000000..868b434c2 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-container-device/SKILL.md @@ -0,0 +1,189 @@ +--- +name: "gpu-operator-container-device" +description: "Explains how to configure CDI and NRI support for GPU workloads. Use when enabling CDI, configuring containerd, or troubleshooting CDI-based GPU injection. Trigger keywords - NVIDIA GPU Operator, CDI, NRI, containerd, Kubernetes." +--- + + + + +# Container Device Interface (CDI) and Node Resource Interface (NRI) Plugin Support + +This page gives an overview of CDI and NRI Plugin support in the GPU Operator. + +## About Container Device Interface (CDI) + +The [Container Device Interface (CDI)](https://github.com/cncf-tags/container-device-interface/blob/main/SPEC.md) +is an open specification for container runtimes that abstracts what access to a device, such as an NVIDIA GPU, means, +and standardizes access across container runtimes. Popular container runtimes can read and process the specification to +ensure that a device is available in a container. CDI simplifies adding support for devices such as NVIDIA GPUs because +the specification is applicable to all container runtimes that support CDI. + +Starting with GPU Operator v25.10.0, CDI is used by default for enabling GPU support in containers running on Kubernetes. +Specifically, CDI support in container runtimes, like containerd and cri-o, is used to inject GPU(s) into workload +containers. This differs from prior GPU Operator releases where CDI was used via a CDI-enabled `nvidia` runtime class. 
+ +If you are upgrading from a version of the GPU Operator prior to v25.10.0, where CDI was disabled by default, and you are upgrading to v25.10.0 or later, where CDI is enabled by default, no configuration changes are required for standard workloads using GPU allocation through the Device Plugin. +For workloads that already have `runtimeClassName: nvidia` set in their pod spec YAML, no change is necessary. + +Use of CDI is transparent to cluster administrators and application developers. +The benefits of CDI are largely to reduce development and support for runtime-specific +plugins. + +### CDI and GPU Management Containers + +When CDI is enabled in GPU Operator versions v25.10.0 and later, GPU Management Containers that use the `NVIDIA_VISIBLE_DEVICES` environment variable to get GPU access, bypassing GPU allocation via the Device Plugin or DRA Driver for GPUs, must set `runtimeClassName: nvidia` in the pod specification. +A GPU Management Container is a container that requires access to all GPUs without them being allocated by Kubernetes. +Examples of GPU Management Containers include monitoring agents and device plugins. + +It is recommended that `NVIDIA_VISIBLE_DEVICES` only be used by GPU Management Containers. + +**Note:** + +Setting `runtimeClassName: nvidia` in the pod specification is not required when the NRI Plugin is enabled in GPU Operator. +Refer to About the Node Resource Interface (NRI) Plugin. + +## Step 1: Enabling CDI + +CDI is enabled by default during installation in GPU Operator v25.10.0 and later. +Follow the instructions for installing the Operator with Helm on the getting-started page. + +CDI is also enabled by default during a Helm upgrade to GPU Operator v25.10.0 and later. + +### Enabling CDI After Installation + +CDI is enabled by default in GPU Operator v25.10.0 and later. +Use the following procedure to enable CDI if you disabled CDI during installation. + +### Procedure +1. 
Enable CDI by modifying the cluster policy: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ + -p='[{"op": "replace", "path": "/spec/cdi/enabled", "value":true}]' + ``` + + *Example Output* + + ```output + clusterpolicy.nvidia.com/cluster-policy patched + ``` + +1. (Optional) Confirm that the container toolkit and device plugin pods restart: + + ```console + $ kubectl get pods -n gpu-operator + ``` + + *Example Output* + +## Step 2: Disabling CDI + +While CDI is the default and recommended mechanism for injecting GPU support into containers, you can +disable CDI and use the legacy NVIDIA Container Toolkit stack instead with the following procedure: + +1. If your nodes use the CRI-O container runtime, then temporarily disable the + GPU Operator validator: + + ```console + $ kubectl label nodes \ + nvidia.com/gpu.deploy.operator-validator=false \ + -l nvidia.com/gpu.present=true \ + --overwrite + ``` + + **Tip:** + + You can run `kubectl get nodes -o wide` and view the `CONTAINER-RUNTIME` + column to determine if your nodes use CRI-O. +1. Disable CDI by modifying the cluster policy: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ + -p='[{"op": "replace", "path": "/spec/cdi/enabled", "value":false}]' + ``` + + *Example Output* + + ```output + clusterpolicy.nvidia.com/cluster-policy patched + ``` + +1. If you temporarily disabled the GPU Operator validator, re-enable the validator: + + ```console + $ kubectl label nodes \ + nvidia.com/gpu.deploy.operator-validator=true \ + -l nvidia.com/gpu.present=true \ + --overwrite + ``` + +## About the Node Resource Interface (NRI) Plugin + +Node Resource Interface (NRI) is a standardized interface for plugging in extensions, called NRI Plugins, to OCI-compatible container runtimes like containerd. 
+NRI Plugins serve as hooks which intercept pod and container lifecycle events and perform functions including injecting devices to a container, topology aware placement strategies, and more. For more details on NRI, refer to the [NRI overview](https://github.com/containerd/nri/tree/main?tab=readme-ov-file#background) in the containerd repository. + +When enabled in the GPU Operator, the NVIDIA Container Toolkit daemonset will run an NRI Plugin on every GPU node. +The purpose of the NRI Plugin is to inject GPUs into GPU management containers that use the `NVIDIA_VISIBLE_DEVICES` environment variable to get GPU access, bypassing GPU allocation via the Device Plugin or DRA Driver for GPUs. + +In previous GPU Operator versions, device injection was handled by the `nvidia` container runtime. With CDI and the NRI Plugin enabled, the `nvidia` runtime class is no longer needed. When enabling the NRI plugin during install, the `nvidia` runtime class will not be created. If you enable the NRI Plugin after install, the `nvidia` runtime class will be deleted. + +Additionally, with the NRI Plugin enabled, modifications to the container runtime configuration are no longer needed. For example, no modifications are made to containerd’s config.toml file. +This means that on platforms that configure containerd in a non-standard way, like k3s, k0s, and Rancher Kubernetes Engine 2, users no longer need to configure environment variables like `CONTAINERD_CONFIG`, `CONTAINERD_SOCKET`, or `RUNTIME_CONFIG_SOURCE`. + +## Step 3: Enabling the NRI Plugin + +The NRI Plugin requires the following: + +- CDI to be enabled in the GPU Operator. + +- containerd v1.7.30, v2.1.x, or v2.2.x. + If you are not using the latest containerd version, check that both CDI and NRI are enabled in the containerd configuration file before deploying GPU Operator. + + **Note:** + + Enabling the NRI plugin is not supported with cri-o. 
+To enable the NRI Plugin during installation, follow the instructions for installing the Operator with Helm on the getting-started page and include the `--set cdi.nriPluginEnabled=true` argument in your Helm command. + +### Enabling the NRI Plugin After Installation + +1. Enable NRI Plugin by modifying the cluster policy: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ + -p='[{"op": "replace", "path": "/spec/cdi/nriPluginEnabled", "value":true}]' + ``` + + *Example Output* + + ```output + clusterpolicy.nvidia.com/cluster-policy patched + ``` + + After enabling the NRI Plugin, the `nvidia` runtime class will be deleted. + +1. (Optional) Confirm that the container toolkit and device plugin pods restart: + + ```console + $ kubectl get pods -n gpu-operator + ``` + + *Example Output* + +## Step 4: Disabling the NRI Plugin + +Disable the NRI Plugin and use the `nvidia` runtime class instead with the following procedure: + +Disable the NRI Plugin by modifying the cluster policy: + +```console +$ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ + -p='[{"op": "replace", "path": "/spec/cdi/nriPluginEnabled", "value":false}]' +``` + +*Example Output* + +```output +clusterpolicy.nvidia.com/cluster-policy patched +``` + +After disabling the NRI Plugin, the `nvidia` runtime class will be created. diff --git a/gpu-operator/.agents/skills/gpu-operator-custom-driver/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-custom-driver/SKILL.md new file mode 100644 index 000000000..f51403551 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-custom-driver/SKILL.md @@ -0,0 +1,104 @@ +--- +name: "gpu-operator-custom-driver" +description: "Shows how to provide custom NVIDIA driver parameters to GPU Operator driver containers. Use when changing driver module options or customizing driver container behavior. Trigger keywords - NVIDIA GPU Operator, driver parameters, NVIDIA driver, configuration." 
+--- + + + + +# Customizing NVIDIA GPU Driver Parameters during Installation + +The NVIDIA Driver kernel modules accept a number of parameters which can be used to customize the behavior of the driver. +By default, the GPU Operator loads the kernel modules with default values. +On a machine with the driver already installed, you can list the parameter names and values with the `cat /proc/driver/nvidia/params` command. +You can pass custom parameters to the kernel modules that get loaded as part of the +NVIDIA Driver installation (`nvidia`, `nvidia-modeset`, `nvidia-uvm`, and `nvidia-peermem`). + +## Step 1: Configure Custom Driver Parameters + +To pass custom parameters, execute the following steps. + +1. Create a configuration file named `<module-name>.conf`, where `<module-name>` is the name of the kernel module the parameters are for. + The file should contain parameters as key-value pairs -- one parameter per line. + + The following example shows the GPU firmware logging parameter being passed to the `nvidia` module. + + ```console + $ cat nvidia.conf + NVreg_EnableGpuFirmwareLogs=2 + ``` + +1. Create a `ConfigMap` for the configuration file. + If multiple modules are being configured, pass multiple files when creating the `ConfigMap`. + + ```console + $ kubectl create configmap kernel-module-params -n gpu-operator --from-file=nvidia.conf=./nvidia.conf + ``` + +1. Install the GPU Operator and set `driver.kernelModuleConfig.name` to the name of the `ConfigMap` + containing the kernel module parameters. + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set driver.kernelModuleConfig.name="kernel-module-params" + ``` + +### Example using `nvidia-uvm` module + +This example shows the Heterogeneous Memory Management (HMM) being disabled in the `nvidia-uvm` module. 
+Refer to [Simplifying GPU Application Development with Heterogeneous Memory Management](https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/) for more information about HMM. + +1. Create a configuration file named `nvidia-uvm.conf`: + + ```console + $ cat nvidia-uvm.conf + uvm_disable_hmm=1 + ``` + +1. Create a `ConfigMap` for the configuration file. + If multiple modules are being configured, pass multiple files when creating the `ConfigMap`. + + ```console + $ kubectl create configmap kernel-module-params -n gpu-operator --from-file=nvidia-uvm.conf=./nvidia-uvm.conf + ``` + +1. Install the GPU Operator and set `driver.kernelModuleConfig.name` to the name of the `ConfigMap` + containing the kernel module parameters. + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set driver.kernelModuleConfig.name="kernel-module-params" + ``` + +1. Verify the parameter has been correctly applied, go to `/sys/module/nvidia_uvm/parameters/` on the node: + + ```console + $ ls /sys/module/nvidia_uvm/parameters/ + ``` + + *Example Output* + + ```output + ... + uvm_disable_hmm uvm_perf_access_counter_migration_enable uvm_perf_prefetch_min_faults + uvm_downgrade_force_membar_sys uvm_perf_access_counter_threshold uvm_perf_prefetch_threshold + ... + ``` + + Then check the value of the parameter: + + ```console + $ cat /sys/module/nvidia_uvm/parameters/uvm_disable_hmm + ``` + + *Example Output* + + ```output + Y + ``` diff --git a/gpu-operator/.agents/skills/gpu-operator-driver-upgrades/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-driver-upgrades/SKILL.md new file mode 100644 index 000000000..1bc83791b --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-driver-upgrades/SKILL.md @@ -0,0 +1,313 @@ +--- +name: "gpu-operator-driver-upgrades" +description: "Explains GPU driver upgrade behavior and configuration. 
Use when planning driver upgrades or troubleshooting driver upgrade workflows managed by the GPU Operator. Trigger keywords - NVIDIA GPU Operator, GPU driver, driver upgrades, Kubernetes." +--- + + + + +# GPU Driver Upgrades + +## About Upgrading the GPU Driver + +The NVIDIA driver daemon set requires special consideration for upgrades because the driver kernel modules must be unloaded and loaded again on each driver container restart. +Consequently, the following steps must occur across a driver upgrade: + +1. Disable all clients to the GPU driver. +1. Unload the current GPU driver kernel modules. +1. Start the updated GPU driver pod. +1. Install the updated GPU driver and load the updated kernel modules. +1. Enable the clients of the GPU driver. + +The GPU Operator supports several methods for managing and automating this driver upgrade process. + +**Note:** + +The GPU Operator only manages the lifecycle of containerized drivers. +Drivers which are pre-installed on the host are not managed by the GPU Operator. + +## Step 1: Upgrades with the Upgrade Controller + +NVIDIA recommends upgrading by using the upgrade controller and the controller is enabled by default in the GPU Operator. +The controller automates the upgrade process and generates metrics and events so that you can monitor the upgrade process. + +### Procedure +1. Upgrade the driver by changing the `driver.version` value in the cluster policy: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy \ + --type='json' \ + -p='[{"op": "replace", "path": "/spec/driver/version", "value":"580.95.05"}]' + ``` + + If you are using OpenShift, you must update the `driver.version`, `driver.repository` and `driver.image` values in the cluster policy. 
+ + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy \ + --type='json' \ + -p='[{"op": "replace", "path": "/spec/driver/version", "value":"580.95.05"},{"op": "replace", "path": "/spec/driver/repository", "value":"nvcr.io/nvidia"},{"op": "replace", "path": "/spec/driver/image", "value":"driver"}]' + ``` + +2. (Optional) For each node, monitor the upgrade status: + + ```console + $ kubectl get node -l nvidia.com/gpu.present \ + -ojsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}{"\n"}{end}' + ``` + + *Example Output* + + ```output + k8s-node-1 upgrade-required + k8s-node-2 upgrade-required + k8s-node-3 upgrade-required + ``` + + You can periodically poll the upgrade status by running the preceding command. + The GPU driver upgrade is complete when the output shows `upgrade-done`: + + ```output + k8s-node-1 upgrade-done + k8s-node-2 upgrade-done + k8s-node-3 upgrade-done + ``` + +### Configuration Options + +You can set the following fields in the cluster policy to configure the upgrade controller: + +```yaml +driver: + + upgradePolicy: + # autoUpgrade (default=true): Switch which enables / disables the driver upgrade controller. + # If set to false all other options are ignored. + autoUpgrade: true + # maxParallelUpgrades (default=1): Number of nodes that can be upgraded in parallel. 0 means infinite. + maxParallelUpgrades: 1 + # maximum number of nodes with the driver installed, that can be unavailable during + # the upgrade. Value can be an absolute number (ex: 5) or + # a percentage of total nodes at the start of upgrade (ex: + # 10%). Absolute number is calculated from percentage by rounding + # up. By default, a fixed value of 25% is used.' + maxUnavailable: 25% + # waitForCompletion: Options for the 'wait-for-completion' state, which will wait for a user-defined group of pods + # to complete before upgrading the driver on a node. 
+ waitForCompletion: + # timeoutSeconds (default=0): The length of time to wait before giving up. 0 means infinite. + timeoutSeconds: 0 + # podSelector (default=""): The label selector defining the group of pods to wait for completion of. "" means to wait on none. + podSelector: "" + + # gpuPodDeletion: Options for the 'pod-deletion' state, which will evict all pods on the node allocated a GPU. + gpuPodDeletion: + # force (default=false): Delete pods even if they are not managed by a controller (for example ReplicationController, ReplicaSet, + # Job, DaemonSet or StatefulSet). + force: false + # timeoutSeconds (default=300): The length of time to wait before giving up. 0 means infinite. When the timeout is met, + # the GPU pod(s) will be forcefully deleted. + timeoutSeconds: 300 + # deleteEmptyDir (default=false): Delete pods even if they are using emptyDir volumes (local data will be deleted). + deleteEmptyDir: false + + # drain: Options for the 'drain' state, which invokes 'kubectl drain' on the node. + # Unlike 'gpuPodDeletion', which targets only GPU-allocated pods, drain evicts all pods on the node. + # This should only be enabled as a fallback when 'gpuPodDeletion' cannot remove all GPU-using pods on its own. + drain: + # enable (default=false): Set to true to allow node drain as a fallback when + # 'gpuPodDeletion' cannot evict all GPU pods. By default, drain evicts all pods + # on the node. Use podSelector to limit which pods are evicted. + enable: false + # force (default=false): Delete pods even if they are not managed by a controller + # (for example, ReplicationController, ReplicaSet, Job, DaemonSet, or StatefulSet). + # Applies to all pods on the node, not just GPU pods. + force: false + # podSelector (default=""): Label selector to restrict which pods are evicted + # during drain. An empty string means all pods on the node are evicted. + podSelector: "" + # timeoutSeconds (default=300): The length of time to wait before giving up. 
+ # 0 means infinite. When the timeout is reached, the drain attempt is abandoned. + timeoutSeconds: 300 + # deleteEmptyDir (default=false): Allow eviction of pods that use emptyDir volumes. + # Enabling this results in permanent loss of any data stored in those volumes. + deleteEmptyDir: false +``` + +**Warning:** + +`driver.upgradePolicy.drain.enable` is a cluster-wide policy setting. +When set to `true`, the upgrade controller drains each node before upgrading the driver on that node. +Draining a node evicts all pods from that node, including workloads unrelated to the GPU driver. +This is a disruptive operation that interrupts running GPU and non-GPU workloads on every node the upgrade controller processes. + +Enable `drain` only when `gpuPodDeletion` is insufficient to remove all GPU-using pods on its own. +Adjust the `gpuPodDeletion` settings first and use `drain` only if those settings do not work. +If you must enable `drain`, use `podSelector` to limit which pods are evicted. +If you specify a value for `maxUnavailable` and also specify `maxParallelUpgrades`, +the `maxUnavailable` value applies an additional constraint on the value of +`maxParallelUpgrades` to ensure that the number of parallel upgrades does not +cause more than the intended number of nodes to become unavailable during the upgrade. +For example, if you specify `maxUnavailable=100%` and `maxParallelUpgrades=1`, +one node is upgraded at a time. + +The `maxUnavailable` value also applies to the currently unavailable nodes in the cluster. +If you cordoned nodes in the cluster and the `maxUnavailable` value is already met by the number of cordoned nodes, +then the upgrade does not progress. + +### Upgrade State Machine + +The upgrade controller manages driver upgrades through a well-defined state machine. +The node label, `nvidia.com/gpu-driver-upgrade-state`, indicates the state a node is currently in. 
+The possible states are: + +* Unknown (empty): The upgrade controller is disabled or the node has not been processed yet. +* `upgrade-required`: NVIDIA driver pod is not up-to-date and requires an upgrade. No actions are performed at this stage. +* `cordon-required`: Node will be marked Unschedulable in preparation for the driver upgrade. +* `wait-for-jobs-required`: Node will wait on the completion of a group of pods/jobs before proceeding. +* `pod-deletion-required`: Pods allocated with GPUs are deleted from the node. If pod deletion fails, the node state is set to `drain-required` + if drain is enabled in ClusterPolicy. +* `drain-required`: Node is drained using `kubectl drain`, which evicts all pods on the + node. + This state is only reached if `gpuPodDeletion` fails to remove all + GPU-using pods and `drain.enable` is set to `true` in the cluster policy. + This state is skipped if all GPU pods are successfully deleted from the node. +* `pod-restart-required`: The NVIDIA driver pod running on the node will be restarted and upgraded to the new version. +* `validation-required`: Validation of the new driver deployed on the node is required before proceeding. The GPU Operator + performs validations in the pod named `operator-validator`. +* `uncordon-required`: Node will be marked Schedulable to complete the upgrade process. +* `upgrade-done`: NVIDIA driver pod is up-to-date and running on the node. +* `upgrade-failed`: A failure occurred during the driver upgrade. + +The complete state machine is depicted in the diagram below. + +![](graphics/upgrade-controller-state-machine.png) +### Pausing Driver Upgrades + +To pause the automatic driver upgrade process in the cluster, set the `driver.upgradePolicy.autoUpgrade` flag to `false` +in the cluster policy. +The entire state machine pauses and effectively disables any pending nodes from being upgraded. +You can toggle the flag to `true` again to re-enable the upgrade controller and resume any pending upgrades. 
+ +### Skipping Driver Upgrades + +To skip driver upgrades on a certain node, label the node with `nvidia.com/gpu-driver-upgrade.skip=true`. + +### Metrics and Events + +The GPU Operator generates the following metrics during the upgrade process which can be scraped by Prometheus. + +* `gpu_operator_auto_upgrade_enabled`: 1 if driver auto upgrade is enabled; 0 if not. +* `gpu_operator_nodes_upgrades_in_progress`: Total number of nodes in which a driver pod is being upgraded on. +* `gpu_operator_nodes_upgrades_done`: Total number of nodes in which a driver pod has been successfully upgraded. +* `gpu_operator_nodes_upgrades_failed`: Total number of nodes in which a driver pod upgrade has failed. +* `gpu_operator_nodes_upgrades_available`: Total number of nodes in which a driver pod upgrade can start on. +* `gpu_operator_nodes_upgrades_pending`: Total number of nodes in which driver pod upgrades are pending. + +The GPU Operator generates events during the upgrade process. +The most common events are for state transitions or failures at a particular state. +Below is an example set of events generated for the upgrade of one node. 
+ +```console +$ kubectl get events -n default --sort-by='.lastTimestamp' | grep GPUDriverUpgrade +``` + +*Example Output* + +```output +10m Normal GPUDriverUpgrade node/localhost.localdomain Successfully updated node state label to [upgrade-required] +10m Normal GPUDriverUpgrade node/localhost.localdomain Successfully updated node state label to [cordon-required] +10m Normal GPUDriverUpgrade node/localhost.localdomain Successfully updated node state label to [wait-for-jobs-required] +10m Normal GPUDriverUpgrade node/localhost.localdomain Successfully updated node state label to [pod-deletion-required] +10m Normal GPUDriverUpgrade node/localhost.localdomain Successfully updated node state label to [pod-restart-required] +7m Normal GPUDriverUpgrade node/localhost.localdomain Successfully updated node state label to [validation-required] +6m Normal GPUDriverUpgrade node/localhost.localdomain Successfully updated node state label to [uncordon-required] +6m Normal GPUDriverUpgrade node/localhost.localdomain Successfully updated node state label to [upgrade-done] +``` + +### Troubleshooting + +If the upgrade fails for a particular node, the node is labelled with the `upgrade-failed` state. + +1. View the upgrade state labels: + + ```console + $ kubectl get node -l nvidia.com/gpu.present \ + -ojsonpath='{range .items[*]}{.metadata.name}{"\t"}{.metadata.labels.nvidia\.com/gpu-driver-upgrade-state}{"\n"}{end}' + ``` + + *Example Output* + + ```output + k8s-node-1 upgrade-done + k8s-node-2 upgrade-done + k8s-node-3 upgrade-failed + ``` + +1. Check the events to determine the stage that the upgrade failed: + + $ kubectl get events -n default --sort-by='.lastTimestamp' | grep GPUDriverUpgrade +1. (Optional) Check the logs from the upgrade controller in the gpu-operator container: + + $ kubectl logs -n gpu-operator gpu-operator-xxxxx | grep controllers.Upgrade +1. 
After resolving the upgrade failures for a particular node, you can restart the upgrade process on the node by placing it in the `upgrade-required` state: + + $ kubectl label node <node-name> nvidia.com/gpu-driver-upgrade-state=upgrade-required --overwrite + +## Step 2: Upgrades without the Upgrade Controller + +If the upgrade controller is disabled or not supported for your GPU Operator version, a component called `k8s-driver-manager` is responsible +for executing the driver upgrade process. +The `k8s-driver-manager` is an `initContainer` within the driver Daemonset, which ensures all existing GPU driver clients are disabled before +unloading the current driver modules and continuing with the new driver installation. +This method still automates the core driver upgrade process, but lacks the observability that the upgrade controller provides as well as additional +controls such as pausing/skipping upgrades. +In addition, no new features will be added to the `k8s-driver-manager` moving forward in favor of the upgrade controller. + +### Procedure +1. Upgrade the driver by changing the `driver.version` value in ClusterPolicy: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' -p='[{"op": "replace", "path": "/spec/driver/version", "value":"580.95.05"},{"op": "replace", "path": "/spec/driver/repository", "value":"nvcr.io/nvidia"},{"op": "replace", "path": "/spec/driver/image", "value":"driver"}]' + ``` + +2. (Optional) To monitor the status of the upgrade, watch the deployment of the new driver pod on GPU worker nodes: + + ```console + $ kubectl get pods -n gpu-operator -lapp=nvidia-driver-daemonset -w + ``` + +### Configuration Options + +The following configuration options are available for `k8s-driver-manager`. The options allow users to control the +GPU pod eviction and node drain behavior. 
+ +```yaml +driver: + manager: + env: + - name: ENABLE_GPU_POD_EVICTION + value: "true" + - name: ENABLE_AUTO_DRAIN + value: "true" + - name: DRAIN_USE_FORCE + value: "false" + - name: DRAIN_POD_SELECTOR_LABEL + value: "" + - name: DRAIN_TIMEOUT_SECONDS + value: "0s" + - name: DRAIN_DELETE_EMPTYDIR_DATA + value: "false" +``` + +* The `ENABLE_GPU_POD_EVICTION` environment variable enables `k8s-driver-manager` to attempt evicting only GPU pods from the node before attempting a node drain. Only if this fails and + `ENABLE_AUTO_DRAIN` is enabled will the node ever be drained. +* The `DRAIN_USE_FORCE` environment variable must be enabled to evict GPU pods that are not managed by any of the replication controllers such as deployment, daemon set, stateful set, and replica set. +* The `DRAIN_DELETE_EMPTYDIR_DATA` environment variable must be enabled to delete GPU pods that use the `emptyDir` type volume. + +**Note:** + +Since GPU pods get evicted whenever the NVIDIA Driver daemon set specification is updated, it might not always be desirable to allow this to happen automatically. +To prevent this, the `daemonsets.updateStrategy` parameter in the `ClusterPolicy` can be set to [OnDelete](https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/#daemonset-update-strategy). +With the `OnDelete` update strategy, a new driver pod with the updated spec will only get deployed on a node once the old driver pod is manually deleted. +Thus, admins can control when to roll out spec updates to driver pods on any given node. +For more information on DaemonSet update strategies, refer to the [Kubernetes documentation](https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/#daemonset-update-strategy).
diff --git a/gpu-operator/.agents/skills/gpu-operator-gpudirect-rdma/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-gpudirect-rdma/SKILL.md new file mode 100644 index 000000000..578edc132 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-gpudirect-rdma/SKILL.md @@ -0,0 +1,463 @@ +--- +name: "gpu-operator-gpudirect-rdma" +description: "Guides users through GPUDirect RDMA and GPUDirect Storage configuration. Use when enabling high-performance networking or storage access for GPU workloads. Trigger keywords - NVIDIA GPU Operator, GPUDirect RDMA, GPUDirect Storage, networking." +--- + + + + +# GPUDirect RDMA and GPUDirect Storage + +## About GPUDirect RDMA and GPUDirect Storage + +[GPUDirect RDMA](https://docs.nvidia.com/cuda/gpudirect-rdma/index.html) is a technology in NVIDIA GPUs that enables direct +data exchange between GPUs and a third-party peer device using PCI Express. The third-party devices could be network interfaces +such as NVIDIA ConnectX SmartNICs or BlueField DPUs, or video acquisition adapters. + +[GPUDirect Storage](https://docs.nvidia.com/gpudirect-storage/overview-guide/index.html) (GDS) enables a direct data path between local or remote storage, such as NFS servers or NVMe/NVMe over Fabric (NVMe-oF), and GPU memory. +GDS performs direct memory access (DMA) transfers between GPU memory and storage. +DMA avoids a bounce buffer through the CPU. +This direct path increases system bandwidth and decreases the latency and utilization load on the CPU. + +To support GPUDirect RDMA, userspace CUDA APIs are required. +The kernel mode support is provided by one of two approaches: DMA-BUF from the Linux kernel or the legacy `nvidia-peermem` kernel module. +NVIDIA recommends using the DMA-BUF rather than using the `nvidia-peermem` kernel module from the GPU Driver. + +The Operator uses GDS driver version 2.17.5 or newer. +This version and higher is only supported with the NVIDIA Open GPU Kernel module driver. 
+In GPU Operator v25.3.0 and later, the `driver.kernelModuleType` default is `auto`, for the supported driver versions. +This configuration allows the GPU Operator to choose the recommended driver kernel module type depending on the driver branch and the GPU devices available. +Newer driver versions will use the open kernel module by default, however to make sure you are using the open kernel module, include `--set driver.kernelModuleType=open` command-line argument in your helm Operator install command. + +In conjunction with the Network Operator, the GPU Operator can be used to +set up the networking related components such as network device kernel drivers and Kubernetes device plugins to enable +workloads to take advantage of GPUDirect RDMA and GPUDirect Storage. +Refer to the Network Operator [documentation](https://docs.nvidia.com/networking/software/cloud-orchestration/index.html) for installation information. + +## Step 1: Common Prerequisites + +The prerequisites for configuring GPUDirect RDMA or GPUDirect Storage depend on whether you use DMA-BUF from the Linux kernel or the legacy `nvidia-peermem` kernel module. + +| Technology | DMA-BUF | Legacy NVIDIA-peermem | +| --- | --- | --- | +| GPU Driver | An Open Kernel module driver is required. | Any supported driver. | +| CUDA | CUDA 11.7 or higher. The CUDA runtime is provided by the driver. | No minimum version. The CUDA runtime is provided by the driver. | +| GPU | Turing architecture data center, Quadro RTX, and RTX GPU or higher. | All data center, Quadro RTX, and RTX GPU or higher. | +| Network Device Drivers | MLNX_OFED or DOCA-OFED are optional. You can use the Linux driver packages from the package manager. | MLNX_OFED or DOCA-OFED are required. | +| Linux Kernel | 5.12 or higher. | No minimum version. | +* Make sure the network device drivers are installed. 
+ + You can use the [Network Operator](https://docs.nvidia.com/networking/software/cloud-orchestration/index.html) + to manage the driver lifecycle for MLNX_OFED and DOCA-OFED drivers. + + You can install the drivers on each host. + Refer to [Adapter Software](https://docs.nvidia.com/networking/software/adapter-software/index.html) + in the networking documentation for information about the MLNX_OFED, DOCA-OFED, and Linux inbox drivers. + +* For installations on VMware vSphere, refer to the following additional prerequisites: + + * Make sure the network interface controller and the NVIDIA GPU are in the same PCIe IO root complex. + * Enable the following PCI options: + + * `pciPassthru.allowP2P = true` + * `pciPassthru.RelaxACSforP2P = true` + * `pciPassthru.use64bitMMIO = true` + * `pciPassthru.64bitMMIOSizeGB = 128` + + For information about configuring the settings, refer to the + [Deploy an AI-Ready Enterprise Platform on vSphere 7](https://www.vmware.com/docs/deploy-an-ai-ready-enterprise-platform-on-vsphere-7-update-2#vm-settings-A) + document from VMWare. + +## Step 2: Configuring GPUDirect RDMA + +### Platform Support + +The following platforms are supported for GPUDirect with RDMA: + +* Kubernetes on bare metal and on vSphere VMs with GPU passthrough and vGPU. +* VMware vSphere with Tanzu. +* For Red Hat OpenShift Container Platform on bare metal and on vSphere VMs with GPU passthrough and vGPU configurations, + refer to NVIDIA AI Enterprise with OpenShift. + +For information about the supported versions, refer to Support for GPUDirect RDMA on the platform support page. 
+ +### Installing the GPU Operator and Enabling GPUDirect RDMA + +To use DMA-BUF and network device drivers that are installed by the Network Operator: + +```console +$ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} +``` + +To use DMA-BUF and network device drivers that are installed on the host: + +```console +$ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set driver.rdma.useHostMofed=true +``` + +To use the legacy `nvidia-peermem` kernel module instead of DMA-BUF, add `--set driver.rdma.enabled=true` to either of the preceding commands. +Add `--set driver.kernelModuleType=open` if you are using a driver version from a branch earlier than R570. + +### Verifying the Installation of GPUDirect with RDMA + +During the installation, the NVIDIA driver daemon set runs an `init container` to wait on the network device kernel drivers to be ready. +This init container checks for Mellanox NICs on the node and ensures that the necessary kernel symbols are exported by the kernel drivers. + +If you were required to use the `driver.rdma.enabled=true` argument when you installed the Operator, the nvidia-peermem-ctr container is started inside each driver pod after the verification. + +1. Confirm that the pod template for the driver daemon set includes the mofed-validation init container and + the nvidia-driver-ctr containers: + + ```console + $ kubectl describe ds -n gpu-operator nvidia-driver-daemonset + ``` + + *Example Output* + + The following partial output omits the init containers and containers that are common to all installations. + + ```output + ...
+ Init Containers: + mofed-validation: + Container ID: containerd://5a36c66b43f676df616e25ba7ae0c81aeaa517308f28ec44e474b2f699218de3 + Image: nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.8.1 + Image ID: nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:7a70e95fd19c3425cd4394f4b47bbf2119a70bd22d67d72e485b4d730853262c + ... + Containers: + nvidia-driver-ctr: + Container ID: containerd://199a760946c55c3d7254fa0ebe6a6557dd231179057d4909e26c0e6aec49ab0f + Image: nvcr.io/nvaie/vgpu-guest-driver:470.63.01-ubuntu20.04 + Image ID: nvcr.io/nvaie/vgpu-guest-driver@sha256:a1b7d2c8e1bad9bb72d257ddfc5cec341e790901e7574ba2c32acaddaaa94625 + ... + nvidia-peermem-ctr: + Container ID: containerd://0742d86f6017bf0c304b549ebd8caad58084a4185a1225b2c9a7f5c4a171054d + Image: nvcr.io/nvaie/vgpu-guest-driver:470.63.01-ubuntu20.04 + Image ID: nvcr.io/nvaie/vgpu-guest-driver@sha256:a1b7d2c8e1bad9bb72d257ddfc5cec341e790901e7574ba2c32acaddaaa94625 + ... + ``` + + The nvidia-peermem-ctr container is present only if you were required to specify the `driver.rdma.enabled=true` argument when you installed the Operator. + +1. Legacy only: Confirm that the nvidia-peermem-ctr container successfully loaded the nvidia-peermem kernel module: + + ```console + $ kubectl logs -n gpu-operator ds/nvidia-driver-daemonset -c nvidia-peermem-ctr + ``` + + Alternatively, run `kubectl logs -n gpu-operator nvidia-driver-daemonset-xxxxx -c nvidia-peermem-ctr` for each pod in the daemonset. + + *Example Output* + + ```output + waiting for mellanox ofed and nvidia drivers to be installed + waiting for mellanox ofed and nvidia drivers to be installed + successfully loaded nvidia-peermem module + ``` + +### Verifying the Installation by Performing a Data Transfer + +You can perform the following steps to verify that GPUDirect with RDMA is configured +correctly and that pods can perform RDMA data transfers. + +1. 
Get the network interface name of the InfiniBand device on the host: + + ```console + $ kubectl exec -it -n network-operator mofed-ubuntu22.04-ds-xxxxx -- ibdev2netdev + ``` + + *Example Output* + + ```output + mlx5_0 port 1 ==> ens64np1 (Up) + ``` + +1. Configure a secondary network on the device using a macvlan network attachment: + + - Create a file, such as `demo-macvlannetwork.yaml`, with contents like the following example: + + ```yaml + apiVersion: mellanox.com/v1alpha1 + kind: MacvlanNetwork + metadata: + name: demo-macvlannetwork + spec: + networkNamespace: "default" + master: "ens64np1" + mode: "bridge" + mtu: 1500 + ipam: | + { + "type": "whereabouts", + "range": "192.168.2.225/28", + "exclude": [ + "192.168.2.229/30", + "192.168.2.236/32" + ] + } + ``` + + Replace `ens64np1` with the network interface name reported by the `ibdev2netdev` command + from the preceding step. + + - Apply the manifest: + + ```console + $ kubectl apply -f demo-macvlannetwork.yaml + ``` + + - Confirm that the additional network is ready: + + ```console + $ kubectl get macvlannetworks demo-macvlannetwork + ``` + + *Example Output* + + ```output + NAME STATUS AGE + demo-macvlannetwork ready 2023-03-10T18:22:28Z + ``` + +1. Start two pods that run the `mellanox/cuda-perftest` container on two different nodes in the cluster. + + ### demo-pod-1 + + - Create a file, such as `demo-pod-1.yaml`, for the first pod with contents like the following: + + - Apply the manifest: + + ```console + $ kubectl apply -f demo-pod-1.yaml + ``` + + ### demo-pod-2 + + - Create a file, such as `demo-pod-2.yaml`, for the second pod with contents like the following: + + - Apply the manifest: + + ```console + $ kubectl apply -f demo-pod-2.yaml + ``` + +1.
Get the IP addresses of the pods: + + ```console + $ kubectl get pods -o wide + ``` + + *Example Output* + + ```output + NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES + demo-pod-1 1/1 Running 0 3d4h 192.168.38.90 nvnode1 + demo-pod-2 1/1 Running 0 3d4h 192.168.47.89 nvnode2 + ``` + +1. From one terminal, open a shell in the container on the first pod and start the performance test server: + + ```console + $ kubectl exec -it demo-pod-1 -- ib_write_bw --use_cuda=0 --use_cuda_dmabuf \ + -d mlx5_0 -a -F --report_gbits -q 1 + ``` + + *Example Output* + + ```output + ************************************ + * Waiting for client to connect... * + ************************************ + ``` + +1. From another terminal, open a shell in the container on the second pod and run the performance client: + + ```console + $ kubectl exec -it demo-pod-2 -- ib_write_bw -n 5000 --use_cuda=0 --use_cuda_dmabuf \ + -d mlx5_0 -a -F --report_gbits -q 1 192.168.38.90 + ``` + + *Example Output* + + ```output + --------------------------------------------------------------------------------------- + RDMA_Write BW Test + Dual-port : OFF Device : mlx5_0 + Number of qps : 1 Transport type : IB + Connection type : RC Using SRQ : OFF + PCIe relax order: ON + ibv_wr* API : ON + TX depth : 128 + CQ Moderation : 100 + Mtu : 1024[B] + Link type : Ethernet + GID index : 5 + Max inline data : 0[B] + rdma_cm QPs : OFF + Data ex. 
method : Ethernet + --------------------------------------------------------------------------------------- + local address: LID 0000 QPN 0x01ac PSN 0xc76db1 RKey 0x23beb2 VAddr 0x007f26a2c8b000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:02:226 + remote address: LID 0000 QPN 0x01a9 PSN 0x2f722 RKey 0x23beaf VAddr 0x007f820b24f000 + GID: 00:00:00:00:00:00:00:00:00:00:255:255:192:168:02:225 + --------------------------------------------------------------------------------------- + #bytes #iterations BW peak[Gb/sec] BW average[Gb/sec] MsgRate[Mpps] + 2 5000 0.11 0.11 6.897101 + 4 5000 0.22 0.22 6.995646 + 8 5000 0.45 0.45 7.014752 + 16 5000 0.90 0.90 7.017509 + 32 5000 1.80 1.80 7.020162 + 64 5000 3.59 3.59 7.007110 + 128 5000 7.19 7.18 7.009540 + 256 5000 15.06 14.98 7.313517 + 512 5000 30.04 29.73 7.259329 + 1024 5000 59.65 58.81 7.178529 + 2048 5000 91.53 91.47 5.582931 + 4096 5000 92.13 92.06 2.809574 + 8192 5000 92.35 92.31 1.408535 + 16384 5000 92.46 92.46 0.705381 + 32768 5000 92.36 92.35 0.352302 + 65536 5000 92.39 92.38 0.176196 + 131072 5000 92.42 92.41 0.088131 + 262144 5000 92.45 92.44 0.044080 + 524288 5000 92.42 92.42 0.022034 + 1048576 5000 92.40 92.40 0.011015 + 2097152 5000 92.40 92.39 0.005507 + 4194304 5000 92.40 92.39 0.002753 + 8388608 5000 92.39 92.39 0.001377 + --------------------------------------------------------------------------------------- + ``` + + The command output indicates that the data transfer rate was approximately 92 Gbps. + +1. Delete the pods: + + ```console + $ kubectl delete -f demo-pod-1.yaml -f demo-pod-2.yaml + ``` + +1. Delete the secondary network: + + ```console + $ kubectl delete -f demo-macvlannetwork.yaml + ``` + +## Step 3: Using GPUDirect Storage + +### Platform Support + +See Support for GPUDirect Storage on the platform support page.
+ +### Installing the GPU Operator and Enabling GPUDirect Storage + +The following section is applicable to the following configurations and describes how to deploy the GPU Operator using the Helm Chart: + +* Kubernetes on bare metal and on vSphere VMs with GPU passthrough and vGPU. + +Starting with v22.9.1, the GPU Operator provides an option to load the `nvidia-fs` kernel module during the bootstrap of the NVIDIA driver daemon set. +Starting with v23.9.1, the GPU Operator deploys a version of GDS that requires using the NVIDIA Open Kernel module driver. + +The following sample command applies to clusters that use the Network Operator to install the network device kernel drivers. + +```console +$ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set gds.enabled=true +``` + +Add `--set driver.rdma.enabled=true` to the command to use the legacy `nvidia-peermem` kernel module. + +Add `--set driver.kernelModuleType=open` if you are using a driver version from a branch earlier than R570. + +### Verification + +During the installation, an init container is used with the driver daemon set to wait on the network device kernel drivers to be ready. +This init container checks for Mellanox NICs on the node and ensures that the necessary kernel symbols are exported by the kernel drivers. +After the verification completes, the nvidia-fs-ctr container starts inside the driver pods. + +If you were required to use the `driver.rdma.enabled=true` argument when you installed the Operator, the nvidia-peermem-ctr container is started inside each driver pod after the verification.
+ +```console +$ kubectl get pod -n gpu-operator +``` + +*Example Output* + +```output +gpu-operator gpu-feature-discovery-pktzg 1/1 Running 0 11m +gpu-operator gpu-operator-1672257888-node-feature-discovery-master-7ccb7txmc 1/1 Running 0 12m +gpu-operator gpu-operator-1672257888-node-feature-discovery-worker-bqhrl 1/1 Running 0 11m +gpu-operator gpu-operator-6f64c86bc-zjqdh 1/1 Running 0 12m +gpu-operator nvidia-container-toolkit-daemonset-rgwqg 1/1 Running 0 11m +gpu-operator nvidia-cuda-validator-8whvt 0/1 Completed 0 8m50s +gpu-operator nvidia-dcgm-exporter-pt9q9 1/1 Running 0 11m +gpu-operator nvidia-device-plugin-daemonset-472fc 1/1 Running 0 11m +gpu-operator nvidia-device-plugin-validator-29nhc 0/1 Completed 0 8m34s +gpu-operator nvidia-driver-daemonset-j9vw6 3/3 Running 0 12m +gpu-operator nvidia-mig-manager-mtjcw 1/1 Running 0 7m35s +gpu-operator nvidia-operator-validator-b8nz2 1/1 Running 0 11m +``` + +```console +$ kubectl describe pod -n gpu-operator nvidia-driver-daemonset-xxxx + + Init Containers: + mofed-validation: + Container ID: containerd://a31a8c16ce7596073fef7cb106da94c452fdff111879e7fc3ec58b9cef83856a + Image: nvcr.io/nvidia/cloud-native/gpu-operator-validator:v22.9.1 + Image ID: nvcr.io/nvidia/cloud-native/gpu-operator-validator@sha256:18c9ea88ae06d479e6657b8a4126a8ee3f4300a40c16ddc29fb7ab3763d46005 + + + Containers: + nvidia-driver-ctr: + Container ID: containerd://7cf162e4ee4af865c0be2023d61fbbf68c828d396207e7eab2506f9c2a5238a4 + Image: nvcr.io/nvidia/driver:525.60.13-ubuntu20.04 + Image ID: nvcr.io/nvidia/driver@sha256:0ee0c585fa720f177734b3295a073f402d75986c1fe018ae68bd73fe9c21b8d8 + + + nvidia-peermem-ctr: + Container ID: containerd://5c71c9f8ccb719728a0503500abecfb5423e8088f474d686ee34b5fe3746c28e + Image: nvcr.io/nvidia/driver:525.60.13-ubuntu20.04 + Image ID: nvcr.io/nvidia/driver@sha256:0ee0c585fa720f177734b3295a073f402d75986c1fe018ae68bd73fe9c21b8d8 + + + nvidia-fs-ctr: + Container ID: 
containerd://f5c597d59e1cf8747aa20b8c229a6f6edd3ed588b9d24860209ba0cc009c0850 + Image: nvcr.io/nvidia/cloud-native/nvidia-fs:2.14.13-ubuntu20.04 + Image ID: nvcr.io/nvidia/cloud-native/nvidia-fs@sha256:109485365f68caeaee1edee0f3f4d722fe5b5d7071811fc81c630c8a840b847b + + +``` + +Lastly, verify that NVIDIA kernel modules are loaded on the worker node: + +```console +$ lsmod | grep nvidia + +nvidia_fs 245760 0 +nvidia_peermem 16384 0 +nvidia_modeset 1159168 0 +nvidia_uvm 1048576 0 +nvidia 39059456 115 nvidia_uvm,nvidia_modeset +ib_core 319488 9 rdma_cm,ib_ipoib,iw_cm,ib_umad,rdma_ucm,ib_uverbs,mlx5_ib,ib_cm +drm 491520 6 drm_kms_helper,drm_vram_helper,nvidia,mgag200,ttm +``` + +## Step 4: Related Information + +Refer to the following resources for more information: + + * GPUDirect RDMA: https://docs.nvidia.com/cuda/gpudirect-rdma/index.html + + * NVIDIA Network Operator: https://github.com/Mellanox/network-operator + + * Blog post on deploying the Network Operator: https://developer.nvidia.com/blog/deploying-gpudirect-rdma-on-egx-stack-with-the-network-operator/ diff --git a/gpu-operator/.agents/skills/gpu-operator-install-airgapped-environments/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-install-airgapped-environments/SKILL.md new file mode 100644 index 000000000..a7da6e3c6 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-install-airgapped-environments/SKILL.md @@ -0,0 +1,359 @@ +--- +name: "gpu-operator-install-airgapped-environments" +description: "Guides users through installing the GPU Operator in air-gapped or restricted network environments. Use when users need mirrored images, private registries, or offline installation steps. Trigger keywords - NVIDIA GPU Operator, air-gapped, restricted network, installation." +--- + + + + +# Install NVIDIA GPU Operator in Air-Gapped Environments + +## About Air-Gapped Installations + +This page describes how to successfully deploy the GPU Operator in clusters with restricted internet access. 
+By default, The GPU Operator requires internet access for the following reasons: + + 1) Container images need to be pulled during GPU Operator installation. + 2) The `driver` container needs to download several OS packages prior to driver installation. + + **Tip:** + + Using precompiled-drivers removes the need for the `driver` containers to + download operating system packages and removes the need to create a local package repository. +To address these requirements, it may be necessary to create a local image registry and/or a local package repository +so that the necessary images and packages are available for your cluster. In subsequent sections, we detail how to +configure the GPU Operator to use local image registries and local package repositories. If your cluster is behind +a proxy, also follow the steps from install-gpu-operator-proxy. + +Different steps are required for different environments with varying levels of internet connectivity. +The supported use cases/environments are listed in the below table: + ++--------------------------+-----------------------------------------+ + Network Flow | ++--------------------------+--------------------+--------------------+ + Use Case Pulling Images Pulling Packages ++========+=================+====================+====================+ + **1** HTTP Proxy with K8s node --> HTTP Driver container | + full Internet Proxy --> Internet --> HTTP Proxy --> | + access Image Registry Internet Package | + Repository | ++--------+-----------------+--------------------+--------------------+ + **2** HTTP Proxy with K8s node --> HTTP Driver container | + limited Internet Proxy --> Internet --> HTTP Proxy --> | + access Image Registry Local Package | + Repository | ++--------+-----------------+--------------------+--------------------+ + **3a** Full Air-Gapped K8s node --> Local Driver container | + (w/ HTTP Proxy) Image Registry --> HTTP Proxy --> | + Local Package | + Repository | 
++--------+-----------------+--------------------+--------------------+ + **3b** Full Air-Gapped K8s node --> Local Driver container-->| + (w/o HTTP Proxy) Image Registry Local Package | + Repository | ++--------+-----------------+--------------------+--------------------+ + +**Note:** + +For Red Hat OpenShift deployments in air-gapped environments (use cases 2, 3a and 3b), +refer to the disconnected-environment mirroring instructions (`mirror-gpu-ocp-disconnected`) in the NVIDIA GPU Operator on Red Hat OpenShift documentation. +**Note:** + +Ensure that Kubernetes nodes can successfully reach the local DNS server(s). +Public name resolution for the image registry and package repositories is +mandatory for use cases 1 and 2. +Before proceeding to the next sections, get the `values.yaml` file used for GPU Operator configuration. + +```console +$ curl -sO https://raw.githubusercontent.com/NVIDIA/gpu-operator/v1.7.0/deployments/gpu-operator/values.yaml +``` + +**Note:** + +Replace `v1.7.0` in the above command with the version you want to use. + +## Step 1: Local Image Registry + +Without internet access, the GPU Operator requires all images to be hosted in a local image registry that is accessible +to all nodes in the cluster. To allow the GPU Operator to work with a local registry, users can specify local +repository, image, tag along with pull secrets in `values.yaml`. + +To pull the correct images from the NVIDIA registry, you can leverage the fields `repository`, `image` and `version` +specified in the file `values.yaml`. + +The general syntax for the container image is `<repository>/<image>:<version>`. + +If the version is not specified, you can retrieve the information from the NVIDIA NGC catalog at https://catalog.ngc.nvidia.com/containers. +Search for an image, such as `gpu-operator`, and then check the available tags for the image.
+ +An example is shown below with the Operator container image: + +```yaml +operator: + repository: nvcr.io/nvidia + image: gpu-operator + version: "${version}" +``` + +For instance, to pull the gpu-operator image version ${version}, use the following instruction: + +```console +$ docker pull nvcr.io/nvidia/gpu-operator:${version} +``` + +There is one caveat with regards to the driver image. The version field must be appended by the OS name running on the worker node. + +```yaml +driver: + repository: nvcr.io/nvidia + image: driver + version: "${recommended}" +``` + +To pull the driver image for Ubuntu 20.04: + +```console +$ docker pull nvcr.io/nvidia/driver:${recommended}-ubuntu20.04 +``` + +To push the images to the local registry, simply tag the pulled images by prefixing the image with the image registry information. + +Using the above examples, this will result in: + +```console +$ docker tag nvcr.io/nvidia/gpu-operator:${version} <local-registry>/<local-path>/gpu-operator:${version} +$ docker tag nvcr.io/nvidia/driver:${recommended}-ubuntu20.04 <local-registry>/<local-path>/driver:${recommended}-ubuntu20.04 +``` + +Finally, push the images to the local registry: + +```console +$ docker push <local-registry>/<local-path>/gpu-operator:${version} +$ docker push <local-registry>/<local-path>/driver:${recommended}-ubuntu20.04 +``` + +Update `values.yaml` with local registry information in the repository field. + +**Note:** + +In the sample below, set each `repository` field to your local image registry URL and port, for example `<repo.example.com:port>`.
+Sample of `values.yaml` for GPU Operator v1.9.0: + +```yaml +operator: + repository: + image: gpu-operator + version: 1.9.0 + imagePullSecrets: [] + initContainer: + image: cuda + repository: + version: 11.4.2-base-ubi8 + + validator: + image: gpu-operator-validator + repository: + version: 1.9.0 + imagePullSecrets: [] + + driver: + repository: + image: driver + version: "470.82.01" + imagePullSecrets: [] + manager: + image: k8s-driver-manager + repository: + version: v0.2.0 + + toolkit: + repository: + image: container-toolkit + version: 1.7.2-ubuntu18.04 + imagePullSecrets: [] + + devicePlugin: + repository: + image: k8s-device-plugin + version: v0.10.0-ubi8 + imagePullSecrets: [] + + dcgmExporter: + repository: + image: dcgm-exporter + version: 2.3.1-2.6.0-ubuntu20.04 + imagePullSecrets: [] + + gfd: + repository: + image: gpu-feature-discovery + version: v0.4.1 + imagePullSecrets: [] + + nodeStatusExporter: + enabled: false + repository: + image: gpu-operator-validator + version: "1.9.0" + + migManager: + enabled: true + repository: + image: k8s-mig-manager + version: v0.2.0-ubuntu20.04 + + node-feature-discovery: + image: + repository: + pullPolicy: IfNotPresent + # tag, if defined will use the given image tag, else Chart.AppVersion will be used + # tag: + imagePullSecrets: [] +``` + +## Step 2: Local Package Repository + +The `driver` container deployed as part of the GPU Operator requires certain packages to be available as part of the +driver installation. In restricted internet access or air-gapped installations, users are required to create a +local mirror repository for their OS distribution and make the following packages available: + +**Note:** + +KERNEL_VERSION is the underlying running kernel version on the GPU node +GCC_VERSION is the gcc version matching the one used for building underlying kernel + +Configuring a local package repository is not necessary for clusters that +can run precompiled-drivers. 
+### Required Packages + +```yaml +ubuntu: + linux-headers-${KERNEL_VERSION} + linux-image-${KERNEL_VERSION} + linux-modules-${KERNEL_VERSION} + +centos: + elfutils-libelf.x86_64 + elfutils-libelf-devel.x86_64 + kernel-headers-${KERNEL_VERSION} + kernel-devel-${KERNEL_VERSION} + kernel-core-${KERNEL_VERSION} + gcc-${GCC_VERSION} + +rhel/rhcos: + kernel-headers-${KERNEL_VERSION} + kernel-devel-${KERNEL_VERSION} + kernel-core-${KERNEL_VERSION} + gcc-${GCC_VERSION} +``` + +For example, for Ubuntu, these packages can be found at `archive.ubuntu.com`. +This is the mirror to be replicated locally for your cluster. +You can use `apt-mirror` to mirror these packages to your local package repository server. + +For CentOS, `reposync` can be used to create the local mirror. + +After all the required packages are mirrored to the local repository, repo lists need to be created following +distribution specific documentation. A `ConfigMap` containing the repo list file needs to be created in +the namespace where the GPU Operator gets deployed.
+ +An example of repo list is shown below for Ubuntu 22.04 (access to local package repository via HTTP): + +`custom-repo.list`: + +```text +deb [arch=amd64] http://<local-pkg-repository>/ubuntu/mirror/archive.ubuntu.com/ubuntu jammy main universe +deb [arch=amd64] http://<local-pkg-repository>/ubuntu/mirror/archive.ubuntu.com/ubuntu jammy-updates main universe +deb [arch=amd64] http://<local-pkg-repository>/ubuntu/mirror/archive.ubuntu.com/ubuntu jammy-security main universe +``` + +An example of repo list is shown below for Ubuntu 20.04 (access to local package repository via HTTP): + +`custom-repo.list`: + +```text +deb [arch=amd64] http://<local-pkg-repository>/ubuntu/mirror/archive.ubuntu.com/ubuntu focal main universe +deb [arch=amd64] http://<local-pkg-repository>/ubuntu/mirror/archive.ubuntu.com/ubuntu focal-updates main universe +deb [arch=amd64] http://<local-pkg-repository>/ubuntu/mirror/archive.ubuntu.com/ubuntu focal-security main universe +``` + +An example of repo list is shown below for CentOS 8 (access to local package repository via HTTP): + +`custom-repo.repo`: + +```text +[baseos] +name=CentOS Linux $releasever - BaseOS +baseurl=http://<local-pkg-repository>/repos/centos/$releasever/$basearch/os/baseos/ +gpgcheck=0 +enabled=1 + +[appstream] +name=CentOS Linux $releasever - AppStream +baseurl=http://<local-pkg-repository>/repos/centos/$releasever/$basearch/os/appstream/ +gpgcheck=0 +enabled=1 + +[extras] +name=CentOS Linux $releasever - Extras +baseurl=http://<local-pkg-repository>/repos/centos/$releasever/$basearch/os/extras/ +gpgcheck=0 +enabled=1 +``` + +Create a `ConfigMap` object from the file: + +```console +$ kubectl create configmap repo-config -n gpu-operator --from-file=<path-to-repo-list-file> +``` + +Update the repo list file (`custom-repo.list` or `custom-repo.repo`) and config map as appropriate if the containerization software platform, such as Tanzu, upgrades the Kubernetes cluster nodes to a newer operating system version. + +After the config map is created, update `values.yaml` with this information to let the GPU Operator mount the repo configuration +within the `driver` container to pull required packages.
Based on the OS distribution the GPU Operator automatically mounts this config map into the appropriate directory. + +```yaml +driver: + repoConfig: + configMapName: repo-config +``` + +If self-signed certificates are used for an HTTPS based internal repository then you must add a config map for those certificates. +You then specify the config map during the GPU Operator install. +Based on the OS distribution the GPU Operator automatically mounts this config map into the appropriate directory. +Similarly, the certificate file format and suffix, such as `.crt` or `.pem`, also depends on the OS distribution. + +```console +$ kubectl create configmap cert-config -n gpu-operator --from-file=<path-to-cert-file-1> --from-file=<path-to-cert-file-2> +``` + +```yaml +driver: + certConfig: + name: cert-config +``` + +## Step 3: Deploy GPU Operator + +Download and deploy GPU Operator Helm Chart with the updated `values.yaml`. + +Fetch the chart from the NGC repository: + +```console +$ helm fetch https://helm.ngc.nvidia.com/nvidia/charts/gpu-operator-${version}.tgz +``` + +Install the GPU Operator with the customized `values.yaml`: + +```console +$ helm install --wait gpu-operator \ + -n gpu-operator --create-namespace \ + gpu-operator-${version}.tgz \ + -f values.yaml +``` + +Check the status of the pods to ensure all the containers are running: + +```console +$ kubectl get pods -n gpu-operator +``` diff --git a/gpu-operator/.agents/skills/gpu-operator-install-governmentready-environments/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-install-governmentready-environments/SKILL.md new file mode 100644 index 000000000..d10011099 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-install-governmentready-environments/SKILL.md @@ -0,0 +1,163 @@ +--- +name: "gpu-operator-install-governmentready-environments" +description: "Guides users through government-ready GPU Operator installation considerations. Use when deploying in hardened or regulated Kubernetes environments.
Trigger keywords - NVIDIA GPU Operator, government-ready, installation, Kubernetes." +--- + + + + +# NVIDIA GPU Operator Government Ready + +The NVIDIA GPU Operator now offers government-ready components for NVIDIA AI Enterprise customers. +Government ready is NVIDIA's designation for software that meets applicable security requirements for deployment in your FedRAMP High or equivalent sovereign use case. +For more information on NVIDIA's government-ready support, refer to the white paper [AI Software for Regulated Environments](https://docs.nvidia.com/ai-enterprise/planning-resource/ai-software-regulated-environments-white-paper/latest/index.html). + +## Step 1: Supported GPU Operator Components + +Refer to the operator-component-matrix for a full list of supported government-ready GPU Operator components. + +Artifacts for these components are available from the [NVIDIA NGC Catalog](https://registry.ngc.nvidia.com/orgs/nvstaging/teams/cloud-native/containers/gpu-driver-stig-fips). + +**Note:** + +Not all GPU Operator components and features are available as government-ready containers in this release. +For example, NVIDIA GDS Driver, NVIDIA Confidential Computing Manager, and NVIDIA GDRCopy Driver are not yet supported. + +## Step 2: Validated Kubernetes Distributions + +The government-ready NVIDIA GPU Operator has been validated on the following Kubernetes distributions: + +- Canonical Kubernetes 1.34 with Ubuntu Pro 24.04 and FIPS-compliant kernel +- Red Hat OpenShift 4.19 in FIPS mode +- Rancher Kubernetes Engine 2 with Ubuntu 24.04 +- VMware VKS with Ubuntu 24.04 + +## Step 3: Install Government-Ready NVIDIA GPU Operator + +Once you have your gov-ready-prerequisites configured, use the following steps to install the NVIDIA GPU Operator on Canonical Kubernetes distributions: + +1. install-nfd +1. create-ngc-api-pull-secret +1. create-ubuntu-pro-token-secret +1. 
deploy-nvidia-gpu-operator-gov-ready + +**Note:** + +For deployment on OpenShift, refer to the :external+ocpinstall-gpu-operator-gov-ready-openshift page. +### Prerequisites + +- An active NVIDIA AI Enterprise subscription and NGC API token to access GPU Operator government-ready containers. + Refer to [Generating Your NGC API Key](https://docs.nvidia.com/ngc/gpu-cloud/ngc-user-guide/index.html#generating-api-key) in the NVIDIA NGC User Guide for more information on NGC API tokens. + +- An Ubuntu Pro token for Canonical Kubernetes deployments. + This token is required for the driver container to download kernel headers and other necessary packages from the Canonical repository when using the FIPS-enabled kernel on Ubuntu 24.04. + Refer to the [Ubuntu Pro documentation](https://documentation.ubuntu.com/pro-client/en/v30/howtoguides/get_token_and_attach/) for more information on accessing Ubuntu Pro tokens. + +- The `helm` CLI installed on a client machine. + + You can run the following commands to install the Helm CLI: + + ```console + $ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 \ + && chmod 700 get_helm.sh \ + && ./get_helm.sh + ``` + +- A namespace to deploy the NVIDIA GPU Operator. + The example install commands below use `gpu-operator` as the namespace. + +- Optionally, Service Mesh for intra-cluster traffic encryption. + By default, the NVIDIA GPU Operator does not encrypt traffic between its controller (and operands) and the Kubernetes API server. + If you wish to encrypt this communication, you should deploy and maintain a service mesh application within the Kubernetes cluster to enable secure traffic. + +### Install Node Feature Discovery (NFD) + +NFD is an open-source project that is a dependency for the Operator on each node in your cluster. +It must be deployed before installing the NVIDIA GPU Operator. 
+ +GPU Operator does not maintain a government-ready version of NFD. It is recommended that you install the upstream NFD version that aligns with the operator-component-matrix. +The NFD container is built on top of a scratch image, providing a highly secure container environment. +For information on NFD CVEs and security updates, refer to the [NFD GitHub repository](https://github.com/kubernetes-sigs/node-feature-discovery/security). + +Refer to the NFD documentation for [installation instructions](https://kubernetes-sigs.github.io/node-feature-discovery/stable/get-started/index.html). + +### Create NGC API Pull Secret + +Add a Docker registry secret for downloading the GPU Operator artifacts from NVIDIA NGC in the same namespace where you are planning to deploy the NVIDIA GPU Operator. +Replace `<ngc-api-key>` in the command below with your NGC API key. + +```console +$ kubectl create secret -n gpu-operator docker-registry ngc-secret \ + --docker-server=nvcr.io \ + --docker-username='$oauthtoken' \ + --docker-password=<ngc-api-key> +``` + +### Create Ubuntu Pro Token Secret + +Create a Kubernetes secret to hold the value of your Ubuntu Pro token. +This secret will be used in the install command in the next step. + +The Ubuntu Pro Token is required for the driver container to download kernel headers and other necessary packages from the Canonical repository when using the FIPS-enabled kernel on Ubuntu 24.04. + +1. Save the Ubuntu Pro token to an environment file: + + ```console + $ echo UBUNTU_PRO_TOKEN=<ubuntu-pro-token> > ubuntu-fips.env + ``` + + Replace `<ubuntu-pro-token>` with your actual Ubuntu Pro token. + +2. Create the Ubuntu Pro token secret: + + ```console + $ kubectl create secret generic ubuntu-fips-secret \ + --from-env-file=./ubuntu-fips.env --namespace gpu-operator + ``` + + Note that the namespace in the above command is `gpu-operator`. + Update this to the namespace you are planning to use for the NVIDIA GPU Operator. + +### Install NVIDIA GPU Operator Government-Ready Components + +1. 
Label your `gpu-operator` namespace so that the pod security enforcement policy for the Operator is set to `privileged`. + + ```console + $ kubectl label --overwrite ns gpu-operator pod-security.kubernetes.io/enforce=privileged + ``` + +1. Add the NVIDIA Helm repository: + + ```console + $ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \ + && helm repo update + ``` + +1. Install the NVIDIA GPU Operator. + + ```console + $ helm install gpu-operator nvidia/gpu-operator \ + --namespace gpu-operator \ + --set driver.secretEnv=ubuntu-fips-secret \ + --set driver.repository=nvcr.io/nvidia \ + --set driver.version=580.95.05-stig-fips \ + --set driver.image=gpu-driver-stig-fips \ + --set driver.imagePullSecrets={ngc-secret} \ + --set nfd.enabled=false + ``` + +Refer to [Common Chart Customization Options](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#common-chart-customization-options) for more information about installation options. + +## Step 4: Update Ubuntu Pro Token in ClusterPolicy + +You can update your Ubuntu Pro Token after installation by editing your Ubuntu Pro Token secret. +The name of this secret is the value of `driver.secretEnv` in the GPU Operator ClusterPolicy (`ubuntu-fips-secret` in the preceding install command). + +Edit your Ubuntu Pro Token secret. + +```console +$ kubectl edit secret ubuntu-fips-secret -n gpu-operator +``` + +Then update the secret with your new Ubuntu Pro Token. +This token is required for the driver container to download kernel headers and other necessary packages from the Canonical repository when using the FIPS-enabled kernel on Ubuntu 24.04. diff --git a/gpu-operator/.agents/skills/gpu-operator-install-http-proxy/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-install-http-proxy/SKILL.md new file mode 100644 index 000000000..0dd005cd7 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-install-http-proxy/SKILL.md @@ -0,0 +1,99 @@ +--- +name: "gpu-operator-install-http-proxy" +description: "Guides users through installing the GPU Operator with HTTP proxy settings. 
Use when clusters require proxy configuration for image pulls or network access. Trigger keywords - NVIDIA GPU Operator, HTTP proxy, installation, Kubernetes." +--- + + + + +# Prerequisites + +* Kubernetes cluster is configured with HTTP proxy settings (container runtime should be enabled with HTTP proxy) + +# Install GPU Operator in Proxy Environments + +## Introduction + +This page describes how to successfully deploy the GPU Operator in clusters behind an HTTP proxy. +By default, the GPU Operator requires internet access for the following reasons: + + 1) Container images need to be pulled during GPU Operator installation. + 2) The `driver` container needs to download several OS packages prior to driver installation. + + **Tip:** + + Using precompiled-drivers removes the need for the `driver` containers to + download operating system packages. +To address these requirements, all Kubernetes nodes as well as the `driver` container need proper configuration +in order to direct traffic through the proxy. + +This document demonstrates how to configure the GPU Operator so that the `driver` container can successfully +download packages behind an HTTP proxy. Since configuring Kubernetes/container runtime components to use +a proxy is not specific to the GPU Operator, we do not include those instructions here. + +The instructions for OpenShift are different, so skip the section titled "HTTP Proxy Configuration for OpenShift" if you are not running OpenShift. + +## Step 1: HTTP Proxy Configuration for OpenShift + +For OpenShift, it is recommended to use the cluster-wide Proxy object to provide proxy information for the cluster. +Follow the procedure described in [Configuring the cluster-wide proxy](https://docs.openshift.com/container-platform/latest/networking/enable-cluster-wide-proxy.html) +from Red Hat OpenShift public documentation. The GPU Operator will automatically inject proxy-related environment variables into the `driver` container +based on information present in the cluster-wide Proxy object. 
+ +## Step 2: HTTP Proxy Configuration + +First, get the `values.yaml` file used for GPU Operator configuration: + +```console +$ curl -sO https://raw.githubusercontent.com/NVIDIA/gpu-operator/${version}/deployments/gpu-operator/values.yaml +``` + +Specify `driver.env` in `values.yaml` with appropriate HTTP_PROXY, HTTPS_PROXY, and NO_PROXY environment variables +(in both uppercase and lowercase). + +```yaml +driver: + env: + - name: HTTPS_PROXY + value: http://<example.proxy.com:port> + - name: HTTP_PROXY + value: http://<example.proxy.com:port> + - name: NO_PROXY + value: <example.com> + - name: https_proxy + value: http://<example.proxy.com:port> + - name: http_proxy + value: http://<example.proxy.com:port> + - name: no_proxy + value: <example.com> +``` + +**Note:** + +* Proxy-related environment variables are automatically injected by the GPU Operator into the `driver` container to indicate the proxy information used when downloading necessary packages. +* If an HTTPS proxy server is set up, change the values of HTTPS_PROXY and https_proxy to use `https` instead. + +## Step 3: Deploy GPU Operator + +Download and deploy GPU Operator Helm Chart with the updated `values.yaml`. + +Fetch the chart from the NGC repository: + +```console +$ helm fetch https://helm.ngc.nvidia.com/nvidia/charts/gpu-operator-${version}.tgz +``` + +Install the GPU Operator with updated `values.yaml`: + +```console +$ helm install --wait gpu-operator \ + -n gpu-operator --create-namespace \ + gpu-operator-${version}.tgz \ + -f values.yaml +``` + +Check the status of the pods to ensure all the containers are running: + +```console +$ kubectl get pods -n gpu-operator +``` diff --git a/gpu-operator/.agents/skills/gpu-operator-install-ing-nvidia/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-install-ing-nvidia/SKILL.md new file mode 100644 index 000000000..34f24953f --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-install-ing-nvidia/SKILL.md @@ -0,0 +1,493 @@ +--- +name: "gpu-operator-install-ing-nvidia" +description: "Installs the NVIDIA GPU Operator in a Kubernetes cluster with Helm. 
Use when users are getting started, installing the Operator for the first time, or checking installation prerequisites. Trigger keywords - NVIDIA GPU Operator, installation, Helm, Kubernetes, getting started." +--- + + + + +# Prerequisites + +1. You have the `kubectl` and `helm` CLIs available on a client machine. + +# Installing the NVIDIA GPU Operator + +**Version:** + +The current patch release of this version of the NVIDIA GPU Operator is `${version}`. +**Red Hat OpenShift Container Platform Install:** + +For installation on Red Hat OpenShift Container Platform, refer to :external+ocpsteps-overview. + +## Step 1: Procedure + +**Tip:** + +For installation on Red Hat OpenShift Container Platform, +refer to :external+ocpsteps-overview. +1. Add the NVIDIA Helm repository: + + ```console + $ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \ + && helm repo update + ``` + +1. Install the GPU Operator. + + - Install the Operator with the default configuration: + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} + ``` + + - Install the Operator and specify configuration options: + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set = + ``` + + Refer to the gpu-operator-helm-chart-options + and common deployment scenarios for more information. + +## Step 2: Common Chart Customization Options + +The following options are available when using the Helm chart. +These options can be used with `--set` when installing with Helm. + +The following table identifies the most frequently used options. +To view all the options, run `helm show values nvidia/gpu-operator`. + +| Parameter | Description | Default | +| --- | --- | --- | +| `ccManager.enabled` | When set to `true`, the Operator deploys NVIDIA Confidential Computing Manager for Kubernetes. 
| `false` | +| `cdi.enabled` | When set to `true` (default), the Container Device Interface (CDI) will be used for injecting GPUs into workload containers. The Operator will no longer configure the `nvidia` runtime class as the default runtime handler. Instead, native-CDI support in container runtimes like containerd or cri-o will be leveraged for injecting GPUs into workload containers. Refer to the cdi page for more information. | `true` | +| `cdi.nriPluginEnabled` | When set to `true`, the Node Resource Interface (NRI) Plugin will be used for injecting GPUs into workload containers. In NRI Plugin mode, the NVIDIA Container Toolkit will no longer modify the runtime config. This feature requires containerd v1.7.30, v2.1.x, or v2.2.x. Refer to the cdi page for more information. | `false` | +| `cdi.default` Deprecated. | This field is deprecated as of v25.10.0 and will be ignored. The `cdi.enabled` field is set to `true` by default in versions 25.10.0 and later. When set to `true`, the container runtime uses CDI to perform device injection by default. | `false` | +| `daemonsets.annotations` | Map of custom annotations to add to all GPU Operator managed pods. | `{}` | +| `daemonsets.labels` | Map of custom labels to add to all GPU Operator managed pods. | `{}` | +| `dcgmExporter.enabled` | By default, the Operator gathers GPU telemetry in Kubernetes using [DCGM Exporter](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/latest/dcgm-exporter.html). Set this value to `false` to disable it. Available values are `true` (default) or `false`. | `true` | +| `dcgmExporter.service.internalTrafficPolicy` | Specifies the [internalTrafficPolicy](https://kubernetes.io/docs/concepts/services-networking/service/#traffic-policies) for the DCGM Exporter service. Available values are `Cluster` (default) or `Local`. | `Cluster` | +| `dcgmExporter.hostNetwork` | When set to `true`, the DCGM Exporter will expose a metric port on the host's network namespace. 
| `false` | +| `devicePlugin.config` | Specifies the configuration for the NVIDIA Device Plugin as a config map. In most cases, this field is configured after installing the Operator, such as to configure gpu-sharing. | `{}` | +| `driver.enabled` | By default, the Operator deploys NVIDIA drivers as a container on the system. Set this value to `false` when using the Operator on systems with pre-installed drivers. | `true` | +| `driver.image` | Name of the NVIDIA Driver Container image to use. | `driver` | +| `driver.imagePullSecrets` | List of the image pull secret used for pulling the driver container image from the registry. | None | +| `driver.kernelModuleType` | Specifies the type of the NVIDIA GPU Kernel modules to use. Valid values are `auto` (default), `proprietary`, and `open`. `Auto` means that the recommended kernel module type (open or proprietary) is chosen based on the GPU devices on the host and the driver branch used. The `auto` option is only supported with the 570.86.15 and 570.124.06 or later driver containers. 550 and 535 branch drivers do not yet support this mode. `Open` means the open kernel module is used. `Proprietary` means the proprietary module is used. | `auto` | +| `driver.nvidiaDriverCRD.enabled` | When set to `true`, the Operator deploys NVIDIA GPU Driver Custom Resource Definition. Refer to the NVIDIA GPU Driver Custom Resource Definition (use the `gpu-operator-nvidia-driver` skill) page for more information. | `false` | +| `driver.repository` | The images are downloaded from NGC. Specify another image repository when using custom driver images. | `nvcr.io/nvidia` | +| `driver.rdma.enabled` | Controls whether the driver daemon set builds and loads the legacy `nvidia-peermem` kernel module. You might be able to use GPUDirect RDMA without enabling this option. Refer to gpu-operator-rdma for information about whether you can use DMA-BUF or you need to use legacy `nvidia-peermem`. 
| `false` | +| `driver.rdma.useHostMofed` | Indicate if MLNX_OFED (MOFED) drivers are pre-installed on the host. | `false` | +| `driver.secretEnv` | The name of the secret to the driver container. A common use case is to use this field to pass your Ubuntu Pro token secret if you are deploying the GPU Operator with government-ready components. Refer to install-gpu-operator-gov-ready for more information. | None | +| `driver.startupProbe` | By default, the driver container has an initial delay of `60s` before starting liveness probes. The probe runs the `nvidia-smi` command with a timeout duration of `60s`. You can increase the `timeoutSeconds` duration if the `nvidia-smi` command runs slowly in your cluster. | `60s` | +| `driver.useOpenKernelModules` Deprecated. | This field is deprecated as of v25.3.0 and will be ignored. Use `kernelModuleType` instead. When set to `true`, the driver containers install the NVIDIA Open GPU Kernel module driver. | `false` | +| `driver.usePrecompiled` | When set to `true`, the Operator attempts to deploy driver containers that have precompiled kernel drivers. Refer to the precompiled driver containers (use the `gpu-operator-precompiled-drivers` skill) page for the supported operating systems. | `false` | +| `driver.version` | Version of the NVIDIA datacenter driver supported by the Operator. If you set `driver.usePrecompiled` to `true`, then set this field to a driver branch, such as `525`. | Depends on the version of the Operator. Refer to the GPU Operator Component Matrix for more information on supported drivers. | +| `gdrcopy.enabled` | Enables support for GDRCopy. When set to `true`, the GDRCopy Driver runs as a sidecar container in the GPU driver pod. For information about GDRCopy, refer to the [gdrcopy](https://developer.nvidia.com/gdrcopy) page. You can enable GDRCopy if you use the gpu-driver-configuration. | `false` | +| `mig.strategy` | Controls the strategy to be used with MIG on supported NVIDIA GPUs. 
Options are either `mixed` or `single`. | `single` | +| `migManager.enabled` | The MIG manager watches for changes to the MIG geometry and applies reconfiguration as needed. By default, the MIG manager only runs on nodes with GPUs that support MIG (such as the A100). | `true` | +| `nfd.enabled` | Deploys Node Feature Discovery plugin as a daemonset. Set this variable to `false` if NFD is already running in the cluster. | `true` | +| `nfd.nodefeaturerules` | Installs node feature rules that are related to confidential computing. NFD uses the rules to detect security features in CPUs and NVIDIA GPUs. Set this variable to `true` when you configure the Operator for Confidential Containers. | `false` | +| `operator.labels` | Map of custom labels that will be added to all GPU Operator managed pods. | `{}` | +| `psp.enabled` | The GPU Operator deploys `PodSecurityPolicies` if enabled. | `false` | +| `sandboxWorkloads.enabled` | Specifies if sandbox containers are enabled. | `false` | +| `sandboxWorkloads.defaultWorkload` | Specifies the default type of workload for the cluster, one of `container`, `vm-passthrough`, or `vm-vgpu`. Setting `vm-passthrough` or `vm-vgpu` can be helpful if you plan to run all or mostly virtual machines in your cluster. Refer to KubeVirt (use the `gpu-operator-kubevirt` skill), Kata Containers (use the `gpu-operator-kata-containers` skill) for more details on deploying different workload containers. | `container` | +| `sandboxWorkloads.mode` | Specifies the sandbox mode to use when deploying sandbox workloads. Accepted values are `kubevirt` (default) and `kata`. Refer to the KubeVirt (use the `gpu-operator-kubevirt` skill) or the Kata Containers (use the `gpu-operator-kata-containers` skill) pages for more information on using KubeVirt or Kata based workloads. | `kubevirt` | +| `toolkit.enabled` | By default, the Operator deploys the NVIDIA Container Toolkit (`nvidia-docker2` stack) as a container on the system. 
Set this value to `false` when using the Operator on systems with pre-installed NVIDIA runtimes. | `true` | + +## Step 3: Common Deployment Scenarios + +The following common deployment scenarios and sample commands apply best to +bare metal hosts or virtual machines with GPU passthrough. + +### Specifying the Operator Namespace + +Both the Operator and operands are installed in the same namespace. +The namespace is configurable and is specified during installation. +For example, to install the GPU Operator in the `nvidia-gpu-operator` namespace: + +```console +$ helm install --wait --generate-name \ + -n nvidia-gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} +``` + +If you do not specify a namespace during installation, all GPU Operator components are installed in the `default` namespace. + +### Preventing Installation of Operands on Some Nodes + +By default, the GPU Operator operands are deployed on all GPU worker nodes in the cluster. +GPU worker nodes are identified by the presence of the label `feature.node.kubernetes.io/pci-10de.present=true`. +The value `0x10de` is the PCI vendor ID that is assigned to NVIDIA. + +To prevent operands from being deployed on a GPU worker node, label the node with `nvidia.com/gpu.deploy.operands=false`. + +```console +$ kubectl label nodes $NODE nvidia.com/gpu.deploy.operands=false +``` + +### Preventing Installation of NVIDIA GPU Driver on Some Nodes + +By default, the GPU Operator deploys the driver on all GPU worker nodes in the cluster. +To prevent installing the driver on a GPU worker node, label the node as shown in the following sample command. 
+ +```console +$ kubectl label nodes $NODE nvidia.com/gpu.deploy.driver=false +``` + +### Installation on Red Hat Enterprise Linux + +In this scenario, use the NVIDIA Container Toolkit image that is built on UBI 8: + +```console +$ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set toolkit.version=v1.16.1-ubi8 +``` + +Replace the `v1.16.1` value in the preceding command with the version that is supported +with the NVIDIA GPU Operator. +Refer to the GPU Operator Component Matrix on the platform support page. + +When using RHEL8 with Kubernetes, SELinux must be enabled either in permissive or enforcing mode for use with the GPU Operator. +Additionally, when using RHEL8 with containerd as the runtime and SELinux is enabled (either in permissive or enforcing mode) at the host level, containerd must also be configured for SELinux, by setting the `enable_selinux=true` configuration option. +Network restricted environments are not supported. + +### Pre-Installed NVIDIA GPU Drivers + +In this scenario, the NVIDIA GPU driver is already installed on the worker nodes that have GPUs: + +```console +$ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set driver.enabled=false +``` + +The preceding command prevents the Operator from installing the GPU driver on any nodes in the cluster. + +If you do not specify the `driver.enabled=false` argument and nodes in the cluster have a pre-installed GPU driver, the init container in the driver pod detects that the driver is preinstalled and labels the node so that the driver pod is terminated and does not get re-scheduled on to the node. +The Operator proceeds to start other pods, such as the container toolkit pod. 
+ +### Pre-Installed NVIDIA GPU Drivers and NVIDIA Container Toolkit + +In this scenario, the NVIDIA GPU driver and the NVIDIA Container Toolkit are already installed on +the worker nodes that have GPUs. + +**Tip:** + +This scenario applies to NVIDIA DGX Systems that run NVIDIA Base OS. +Before installing the Operator, ensure that the default runtime is set to `nvidia`. +Refer to :external+ctkconfiguration in the NVIDIA Container Toolkit documentation for more information. + +Install the Operator with the following options: + +```console +$ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set driver.enabled=false \ + --set toolkit.enabled=false +``` + +### Pre-Installed NVIDIA Container Toolkit (but no drivers) + +In this scenario, the NVIDIA Container Toolkit is already installed on the worker nodes that have GPUs. + +1. Configure toolkit to use the `root` directory of the driver installation as `/run/nvidia/driver`, because this is the path mounted by driver container. + + ```console + $ sudo sed -i 's/^#root/root/' /etc/nvidia-container-runtime/config.toml + ``` + +1. Install the Operator with the following options (which will provision a driver): + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set toolkit.enabled=false + ``` + +### Running a Custom Driver Image + +If you want to use custom driver container images, such as version 465.27, then +you can build a custom driver container image. Follow these steps: + +- Rebuild the driver container by specifying the `$DRIVER_VERSION` argument when building the Docker image. For + reference, the driver container Dockerfiles are available on the Git repository at https://github.com/NVIDIA/gpu-driver-container/. +- Build the container using the appropriate Dockerfile. 
For example: + + ```console + $ docker build --pull \ + --build-arg DRIVER_VERSION=465.27 \ + -t nvidia/driver:465.27-ubuntu20.04 \ + --file Dockerfile . + ``` + + Ensure that the driver container is tagged as shown in the example by using the `driver:<version>-<os>` schema. +- Specify the new driver image and repository by overriding the defaults in + the Helm install command. For example: + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set driver.repository=docker.io/nvidia \ + --set driver.version="465.27" + ``` + +These instructions are provided for reference and evaluation purposes. +Not using the standard releases of the GPU Operator from NVIDIA would mean limited +support for such custom configurations. + +## Step 4: Specifying Configuration Options for containerd + +**Note:** + +It's recommended that you enable the NRI Plugin to configure the container runtime by setting `cdi.nriPluginEnabled=true`. +When enabled, you do not need to specify the `toolkit.env` options and injecting GPUs into workload containers is handled by the NRI Plugin. +Refer to the NRI Plugin documentation for more information. 
+When you use containerd as the container runtime, the following configuration +options are used with the container-toolkit deployed with GPU Operator: + +```yaml +toolkit: + env: + - name: CONTAINERD_CONFIG + value: /etc/containerd/config.toml + - name: CONTAINERD_SOCKET + value: /run/containerd/containerd.sock + - name: RUNTIME_CONFIG_SOURCE + value: "command,file" +``` + +If you need to specify custom values, refer to the following sample command for the syntax: + +```console +helm install gpu-operator -n gpu-operator --create-namespace \ + nvidia/gpu-operator $HELM_OPTIONS \ + --version=${version} \ + --set toolkit.env[0].name=CONTAINERD_CONFIG \ + --set toolkit.env[0].value=/etc/containerd/containerd.toml \ + --set toolkit.env[1].name=CONTAINERD_SOCKET \ + --set toolkit.env[1].value=/run/containerd/containerd.sock \ + --set toolkit.env[2].name=RUNTIME_CONFIG_SOURCE \ + --set toolkit.env[2].value="command\,file" +``` + +These options are defined as follows: + +CONTAINERD_CONFIG + The path on the host to the top-level `containerd` config file. + By default this will point to `/etc/containerd/config.toml` + (the default location for `containerd`). It should be customized if your `containerd` + installation is not in the default location. + +CONTAINERD_SOCKET + The path on the host to the socket file used to + communicate with `containerd`. The operator will use this to send a + `SIGHUP` signal to the `containerd` daemon to reload its config. By + default this will point to `/run/containerd/containerd.sock` + (the default location for `containerd`). It should be customized if + your `containerd` installation is not in the default location. + +RUNTIME_CONFIG_SOURCE + The config source(s) that the container-toolkit uses when fetching + the current containerd configuration. A valid value for this setting is any + combination of [command | file]. 
By default this will be configured as + "command,file" which means the container-toolkit will attempt to fetch + the configuration using the containerd CLI before falling back to reading the + config from the top-level `containerd` config file (configured using + CONTAINERD_CONFIG). When `file` is specified, the absolute path to the file + to be used as a config source can be specified as `file=/path/to/source/config.toml` + +RUNTIME_DROP_IN_CONFIG + The path on the host where the NVIDIA-specific drop-in config file + will be created. By default this will point to `/etc/containerd/conf.d/99-nvidia.toml`. + +### Rancher Kubernetes Engine 2 + +For Rancher Kubernetes Engine 2 (RKE2), refer to +[Deploy NVIDIA Operator](https://docs.rke2.io/add-ons/gpu_operators#deploy-nvidia-operator) +in the RKE2 documentation. + +It's recommended that you enable CDI (default) and the NRI Plugin on RKE. +With both features enabled, you do not need to set `runtimeClassName: nvidia` in your pod spec. + +Refer to the v24.9.0-known-limitations. + +### MicroK8s + +For MicroK8s, set the following in the `ClusterPolicy`. + +```yaml +toolkit: + env: + - name: CONTAINERD_CONFIG + value: /var/snap/microk8s/current/args/containerd-template.toml + - name: CONTAINERD_SOCKET + value: /var/snap/microk8s/common/run/containerd.sock + - name: RUNTIME_CONFIG_SOURCE + value: "file=/var/snap/microk8s/current/args/containerd.toml" +``` + +These options can be passed to GPU Operator during install time as below. 
+ +```console +helm install gpu-operator -n gpu-operator --create-namespace \ + nvidia/gpu-operator $HELM_OPTIONS \ + --version=${version} \ + --set toolkit.env[0].name=CONTAINERD_CONFIG \ + --set toolkit.env[0].value=/var/snap/microk8s/current/args/containerd-template.toml \ + --set toolkit.env[1].name=CONTAINERD_SOCKET \ + --set toolkit.env[1].value=/var/snap/microk8s/common/run/containerd.sock \ + --set toolkit.env[2].name=RUNTIME_CONFIG_SOURCE \ + --set-string toolkit.env[2].value=file=/var/snap/microk8s/current/args/containerd.toml +``` + +## Step 5: Verification: Running Sample GPU Applications + +### CUDA VectorAdd + +In the first example, let's run a simple CUDA sample, which adds two vectors together: + +1. Create a file, such as `cuda-vectoradd.yaml`, with contents like the following: + + ```yaml + apiVersion: v1 + kind: Pod + metadata: + name: cuda-vectoradd + spec: + restartPolicy: OnFailure + containers: + - name: cuda-vectoradd + image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04" + resources: + limits: + nvidia.com/gpu: 1 + ``` + +1. Run the pod: + + ```console + $ kubectl apply -f cuda-vectoradd.yaml + ``` + + The pod starts, runs the `vectorAdd` command, and then exits. + +1. View the logs from the container: + + ```console + $ kubectl logs pod/cuda-vectoradd + ``` + + *Example Output* + + ```output + [Vector addition of 50000 elements] + Copy input data from the host memory to the CUDA device + CUDA kernel launch with 196 blocks of 256 threads + Copy output data from the CUDA device to the host memory + Test PASSED + Done + ``` + +1. Remove the stopped pod: + + ```console + $ kubectl delete -f cuda-vectoradd.yaml + ``` + + *Example Output* + + ```output + pod "cuda-vectoradd" deleted + ``` + +### Jupyter Notebook + +You can perform the following steps to deploy Jupyter Notebook in your cluster: + +1. Create a file, such as `tf-notebook.yaml`, with contents like the following example: + +1. 
Apply the manifest to deploy the pod and start the service: + + ```console + $ kubectl apply -f tf-notebook.yaml + ``` + +1. Check the pod status: + + ```console + $ kubectl get pod tf-notebook + ``` + + *Example Output* + + ```output + NAMESPACE NAME READY STATUS RESTARTS AGE + default tf-notebook 1/1 Running 0 3m45s + ``` + +1. Because the manifest includes a service, get the external port for the notebook: + + ```console + $ kubectl get svc tf-notebook + ``` + + *Example Output* + + ```output + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + tf-notebook NodePort 10.106.229.20 80:30001/TCP 4m41s + ``` + +1. Get the token for the Jupyter notebook: + + ```console + $ kubectl logs tf-notebook + ``` + + *Example Output* + + ```output + [I 21:50:23.188 NotebookApp] Writing notebook server cookie secret to /root/.local/share/jupyter/runtime/notebook_cookie_secret + [I 21:50:23.390 NotebookApp] Serving notebooks from local directory: /tf + [I 21:50:23.391 NotebookApp] The Jupyter Notebook is running at: + [I 21:50:23.391 NotebookApp] http://tf-notebook:8888/?token=3660c9ee9b225458faaf853200bc512ff2206f635ab2b1d9 + [I 21:50:23.391 NotebookApp] or http://127.0.0.1:8888/?token=3660c9ee9b225458faaf853200bc512ff2206f635ab2b1d9 + [I 21:50:23.391 NotebookApp] Use Control-C to stop this server and shut down all kernels (twice to skip confirmation). + [C 21:50:23.394 NotebookApp] + + To access the notebook, open this file in a browser: + file:///root/.local/share/jupyter/runtime/nbserver-1-open.html + Or copy and paste one of these URLs: + http://tf-notebook:8888/?token=3660c9ee9b225458faaf853200bc512ff2206f635ab2b1d9 + or http://127.0.0.1:8888/?token=3660c9ee9b225458faaf853200bc512ff2206f635ab2b1d9 + ``` + +The notebook should now be accessible from your browser at this URL: +[http://your-machine-ip:30001/?token=3660c9ee9b225458faaf853200bc512ff2206f635ab2b1d9](http://your-machine-ip:30001/?token=3660c9ee9b225458faaf853200bc512ff2206f635ab2b1d9). 
+ +## Step 6: Installation on Commercially Supported Kubernetes Platforms + +| Product | Documentation | +| --- | --- | +| Red Hat OpenShift 4 using RHCOS worker nodes | :external+ocpindex | +| VMware vSphere Kubernetes Service and NVIDIA AI Enterprise | nvaie-tanzu_ | +| Google Cloud Anthos | :external+edgeanthos-guide | diff --git a/gpu-operator/.agents/skills/gpu-operator-install-nvidia-enterprise/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-install-nvidia-enterprise/SKILL.md new file mode 100644 index 000000000..d06e723cb --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-install-nvidia-enterprise/SKILL.md @@ -0,0 +1,133 @@ +--- +name: "gpu-operator-install-nvidia-enterprise" +description: "Guides users through installing the GPU Operator with NVIDIA AI Enterprise. Use when deploying licensed NVIDIA AI Enterprise GPU software on Kubernetes. Trigger keywords - NVIDIA GPU Operator, NVIDIA AI Enterprise, installation, Kubernetes." +--- + + + + +# NVIDIA AI Enterprise + +## About NVIDIA AI Enterprise and Supported Platforms + +NVIDIA AI Enterprise is an end-to-end, cloud-native suite of AI and data analytics software, optimized, certified, and supported by NVIDIA with NVIDIA-Certified Systems. + +Deploying the GPU Operator with NVIDIA AI Enterprise offers two installation options. + +| vGPU Guest Driver | Data Center Driver | +| --- | --- | +| Uses a prebuilt vGPU driver image that is only available to NVIDIA AI Enterprise customers. It is configured to use the [NVIDIA License System (NLS)](https://docs.nvidia.com/license-system/latest/). Installations on virtualization platforms must use the vGPU driver installation. Installation is performed by downloading a Bash script from NVIDIA NGC and running the script. | Uses the GPU Operator Helm chart that is publicly available and GPU driver containers that are publicly available. You must determine the supported driver branch, such as 550, for your NVIDIA AI Enterprise release.
Installation is performed by running the `helm` command. | +For information about supported platforms, hypervisors, and operating systems, refer to the +[Product Support Matrix](https://docs.nvidia.com/ai-enterprise/latest/product-support-matrix/index.html) +in the NVIDIA AI Enterprise documentation. + +For information about using vGPU with Red Hat OpenShift, refer to :external+ocpnvaie-with-ocp. + +## Step 1: Installing GPU Operator Using the vGPU Driver + +### Prerequisites + +- A client configuration token has been generated for the client on which the script will install the vGPU guest driver. + Refer to [Generating a Client Configuration Token](https://docs.nvidia.com/license-system/latest/nvidia-license-system-user-guide/index.html#generating-client-configuration-token) + in the *NVIDIA License System User Guide* for more information. +- An NGC CLI API key that is used to create an image pull secret. + The secret is used to pull the prebuilt vGPU driver image from NVIDIA NGC. + Refer to [Generating Your NGC API Key](https://docs.nvidia.com/ngc/latest/ngc-private-registry-user-guide.html#prug-generating-personal-api-key) + in the *NVIDIA NGC Private Registry User Guide* for more information. + +### Procedure + +1. Export the NGC CLI API key and your email address as environment variables: + + ```console + $ export NGC_API_KEY="M2Vub3QxYmgyZ..." + $ export NGC_USER_EMAIL="user@example.com" + ``` + +1. Go to the + [NVIDIA GPU Operator - Deploy Installer Script](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/vgpu/resources/gpu-operator-installer-5) + web page on NVIDIA NGC. + + Click the **File Browser** tab, identify your NVIDIA AI Enterprise release, click ellipses-img, and select **Download File**. + + Copy the downloaded script to the same directory as the client configuration token. + +1. Rename the client configuration token that you downloaded to `client_configuration_token.tok`. 
+ Originally, the client configuration token is named to match the pattern: `client_configuration_token_mm-dd-yyyy-hh-mm-ss.tok`. + +1. From the directory that contains the downloaded script and the client configuration token, run the script: + + ```console + $ bash gpu-operator-nvaie.sh install + ``` + +## Step 2: Updating NLS Client License Token + +In case the NLS client license token needs to be updated, use the following procedure: + +Create an empty vGPU license configuration file: + +```console +$ sudo touch gridd.conf +``` + +Generate and download a new NLS client license token. Refer to Section 4.6 of the [NLS User Guide](https://docs.nvidia.com/license-system/latest/pdf/nvidia-license-system-user-guide.pdf) for instructions. + +Rename the NLS client license token that you downloaded to `client_configuration_token.tok`. + +**Warning:** + +The `configMap(configMapName)` is **deprecated** and will be removed in a future release. +Use `secrets(secretName)` instead. +Create a new `licensing-config-new` Secret object in the `gpu-operator` namespace (make sure the name of the secret is not already used in the kubernetes cluster). Both the vGPU license configuration file and the NLS client license token will be added to this Secret: + +```console +$ kubectl create secret generic licensing-config-new \ + -n gpu-operator --from-file=gridd.conf --from-file=<path>/client_configuration_token.tok +``` + +Edit the clusterpolicies by using the command: + +```console +$ kubectl edit clusterpolicies.nvidia.com +``` + +Go to the driver section and replace the following argument: + +```console +licensingConfig: + secretName: licensing-config +``` + +with + +```console +licensingConfig: + secretName: licensing-config-new +``` + +Save your changes and exit the `kubectl edit` session (for example, by using `:wq` if the editor is vi). + +GPU Operator sequentially redeploys all the driver pods with this new licensing information.
+ +## Step 3: Installing GPU Operator Using the Data Center Driver + +This installation method is available for bare metal clusters or any cluster that does not use virtualization. + +You must install the driver that matches the supported driver branch for your NVIDIA AI Enterprise release. + +To identify the correct driver branch: + +1. Refer to the [NVIDIA AI Enterprise Infra Release Branches](https://docs.nvidia.com/ai-enterprise/index.html#nvidiatab-infrastructure-software---infra-release-branches) + table to determine the driver branch for your release. + + For example, NVIDIA AI Enterprise Infra 7.x uses the R580 driver branch. + +1. Refer to operator-component-matrix to identify the recommended GPU Operator version and driver version that uses the same driver branch. + +After identifying the correct driver version, refer to install-gpu-operator for installation instructions. +Use the `--version=` argument when installing with Helm. + +## Step 4: Related Information + +- [NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise-suite/) web page. diff --git a/gpu-operator/.agents/skills/gpu-operator-install-nvidia-vgpu/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-install-nvidia-vgpu/SKILL.md new file mode 100644 index 000000000..111e8df78 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-install-nvidia-vgpu/SKILL.md @@ -0,0 +1,204 @@ +--- +name: "gpu-operator-install-nvidia-vgpu" +description: "Guides users through installing the GPU Operator with NVIDIA vGPU. Use when deploying virtual GPU software or configuring vGPU licensing with Kubernetes. Trigger keywords - NVIDIA GPU Operator, NVIDIA vGPU, installation, Kubernetes." 
+--- + + + + +# Prerequisites + +Before installing the GPU Operator on NVIDIA vGPU, ensure the following: + +# Using NVIDIA vGPU + +## About Installing the Operator and NVIDIA vGPU + +NVIDIA Virtual GPU (vGPU) enables multiple virtual machines (VMs) to have simultaneous, +direct access to a single physical GPU, using the same NVIDIA graphics drivers that are deployed on non-virtualized operating systems. + +The installation steps assume `gpu-operator` as the default namespace for installing the NVIDIA GPU Operator. +In case of Red Hat OpenShift Container Platform, the default namespace is `nvidia-gpu-operator`. +Change the namespace shown in the commands accordingly based on your cluster configuration. +Also replace `kubectl` in the following commands with `oc` when running on Red Hat OpenShift. + +NVIDIA vGPU is only supported with the NVIDIA License System. + +## Step 1: Platform Support + +For information about the supported platforms, refer to Supported Deployment Options, Hypervisors, and NVIDIA vGPU Based Products. + +For Red Hat OpenShift Virtualization, refer to NVIDIA GPU Operator with OpenShift Virtualization. + +## Step 2: Download vGPU Software + +Perform the following steps to download the vGPU software and the latest NVIDIA vGPU driver catalog file from the NVIDIA Licensing Portal. + +1. Log in to the NVIDIA Enterprise Application Hub at https://nvid.nvidia.com/dashboard and then click **NVIDIA LICENSING PORTAL**. +1. In the left navigation pane of the NVIDIA Licensing Portal, click **SOFTWARE DOWNLOADS**. +1. Locate **vGPU Driver Catalog** in the table of driver downloads and click **Download**. +1. Click the **PRODUCT FAMILY** menu and select **vGPU** to filter the downloads to vGPU only. +1. Locate the vGPU software for your platform in the table of software downloads and click **Download**. + +The vGPU software is packaged as a ZIP file. +Unzip the file to obtain the NVIDIA vGPU Linux guest driver. 
+The guest driver file name follows the pattern `NVIDIA-Linux-x86_64-<version>-grid.run`. + +## Step 3: Build the Driver Container + +Perform the following steps to build and push a container image that includes the vGPU Linux guest driver. + +1. Clone the driver container repository and change directory into the repository: + + ```console + $ git clone https://github.com/NVIDIA/gpu-driver-container.git + ``` + + ```console + $ cd gpu-driver-container + ``` + +1. Copy the NVIDIA vGPU guest driver from your extracted ZIP file and the NVIDIA vGPU driver catalog file to the operating system version you want to build the driver container for: + + Copy `<local-driver-download-directory>/*-grid.run` and `vgpuDriverCatalog.yaml` to `ubuntu22.04/drivers/`. + + ```console + $ cp <local-driver-download-directory>/*-grid.run ubuntu22.04/drivers/ + ``` + + ```console + $ cp vgpuDriverCatalog.yaml ubuntu22.04/drivers/ + ``` + + For Red Hat OpenShift Container Platform, use a directory that includes `rhel` in the directory name. + +1. Set environment variables for building the driver container image. + + - Specify your private registry URL: + + ```console + $ export PRIVATE_REGISTRY=<private-registry-url> + ``` + + - Specify the `OS_TAG` environment variable to identify the guest operating system name and version: + + ```console + $ export OS_TAG=ubuntu22.04 + ``` + + The value must match the guest operating system version. + For Red Hat OpenShift Container Platform, specify `rhcos4.<x>` where `x` is the supported minor OCP version. + Refer to Supported Operating Systems and Kubernetes Platforms for the list of supported OS distributions. + + - Specify the Linux guest vGPU driver version that you downloaded from the NVIDIA Licensing Portal: + + ```console + $ export VGPU_DRIVER_VERSION=580.95.05 + ``` + + The Operator automatically selects the compatible guest driver version from the drivers bundled with the `driver` image.
+ If you disable the version check by specifying `--build-arg DISABLE_VGPU_VERSION_CHECK=true` when you build the driver image, + then the `VGPU_DRIVER_VERSION` value is used as default. + +1. Build the driver container image. + + **Note:** + + Docker is the only supported container tool for building the driver container image. + Multi-architecture builds additionally require [buildx](https://github.com/docker/buildx). + + ```console + $ VGPU_GUEST_DRIVER_VERSION=${VGPU_DRIVER_VERSION} IMAGE_NAME=${PRIVATE_REGISTRY}/driver make build-vgpuguest-${OS_TAG} + ``` + +1. Push the driver container image to your private registry. + + 1. Log in to your private registry: + + ```console + $ sudo docker login ${PRIVATE_REGISTRY} --username=<username> + ``` + + Enter your password when prompted. + + 1. Push the driver container image to your private registry: + + ```console + $ VGPU_GUEST_DRIVER_VERSION=${VGPU_DRIVER_VERSION} IMAGE_NAME=${PRIVATE_REGISTRY}/driver make push-vgpuguest-${OS_TAG} + ``` + +## Step 4: Configure the Cluster with the vGPU License Information and the Driver Container Image + +1. Create an NVIDIA vGPU license file named `gridd.conf` with contents like the following example: + + ```text + # Description: Set Feature to be enabled + # Data type: integer + # Possible values: + # 0 => for unlicensed state + # 1 => for NVIDIA vGPU + # 2 => for NVIDIA RTX Virtual Workstation + # 4 => for NVIDIA Virtual Compute Server + FeatureType=1 + ``` + +1. Rename the client configuration token file that you downloaded to `client_configuration_token.tok` using a command like the following example: + + ```console + $ cp ~/Downloads/client_configuration_token_03-28-2023-16-16-36.tok client_configuration_token.tok + ``` + + The file must be named `client_configuration_token.tok`. + +1. Create the `gpu-operator` namespace: + + ```console + $ kubectl create namespace gpu-operator + ``` + +1.
Create a secret that is named `licensing-config` using the `gridd.conf` and `client_configuration_token.tok` files: + + ```console + $ kubectl create secret generic licensing-config \ + -n gpu-operator --from-file=gridd.conf --from-file=client_configuration_token.tok + ``` + +1. Create an image pull secret in the `gpu-operator` namespace with the registry secret and private registry. + + 1. Set an environment variable with the name of the secret: + + ```console + $ export REGISTRY_SECRET_NAME=registry-secret + ``` + + 1. Create the secret: + + ```console + $ kubectl create secret docker-registry ${REGISTRY_SECRET_NAME} \ + --docker-server=${PRIVATE_REGISTRY} --docker-username=<username> \ + --docker-password=<password> \ + --docker-email=<email-id> -n gpu-operator + ``` + + You need to specify the secret name `REGISTRY_SECRET_NAME` when you install the GPU Operator with Helm. + +## Step 5: Install the Operator + +- Install the Operator: + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --set driver.repository=${PRIVATE_REGISTRY} \ + --set driver.version=${VGPU_DRIVER_VERSION} \ + --set driver.imagePullSecrets={$REGISTRY_SECRET_NAME} \ + --set driver.licensingConfig.secretName=licensing-config + ``` + +The preceding command installs the Operator with the default configuration. +Refer to gpu-operator-helm-chart-options for information about configuration options. + +## Related Skills + +- verify gpu operator install diff --git a/gpu-operator/.agents/skills/gpu-operator-install-outdated-kernels/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-install-outdated-kernels/SKILL.md new file mode 100644 index 000000000..5d4e6d78f --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-install-outdated-kernels/SKILL.md @@ -0,0 +1,98 @@ +--- +name: "gpu-operator-install-outdated-kernels" +description: "Explains how to install the GPU Operator when nodes run outdated kernels.
Use when driver containers fail because kernel versions are older than supported defaults. Trigger keywords - NVIDIA GPU Operator, outdated kernels, driver containers, installation." +--- + + + + +# Considerations when Installing with Outdated Kernels in Cluster + +The `driver` container deployed as part of the GPU Operator requires certain packages to be available as part of the driver installation. +On GPU nodes where the running kernel is not the latest, the `driver` container may fail to find the right version of these packages +(e.g. kernel-headers, kernel-devel) that correspond to the running kernel version. In the `driver` container logs, you will most likely +see the following error message: `Could not resolve Linux kernel version`. + +In general, upgrading your system to the latest kernel should fix this issue. But if this is not an option, the following is a +workaround to successfully deploy the GPU Operator when GPU nodes in your cluster may not be running the latest kernel. + +## Step 1: Add Archived Package Repositories + +The workaround is to find the package archive containing packages for your outdated kernel and to add this repository to the package +manager running inside the `driver` container. To achieve this, we can simply mount a repository list file into the `driver` container using a `ConfigMap`. +The `ConfigMap` containing the repository list file needs to be created in the `gpu-operator` namespace. + +Let us demonstrate this workaround via an example. The system used in this example is running CentOS 7 with an outdated kernel: + +```console +$ uname -r +3.10.0-1062.12.1.el7.x86_64 +``` + +The official archive for older CentOS packages is https://vault.centos.org/. Typically, most archived CentOS repositories +are found in `/etc/yum.repos.d/CentOS-Vault.repo` but they are disabled by default. 
If the appropriate archive repository +were enabled, then the `driver` container would resolve the kernel version and be able to install the correct versions +of the prerequisite packages. + +We can simply drop in a replacement of `/etc/yum.repos.d/CentOS-Vault.repo` to ensure the appropriate CentOS archive is enabled. +For the kernel running in this example, the `CentOS-7.7.1908` archive contains the kernel-headers version we are looking for. +Here is our example drop-in replacement file: + +```text +[C7.7.1908-base] +name=CentOS-7.7.1908 - Base +baseurl=http://vault.centos.org/7.7.1908/os/$basearch/ +gpgcheck=1 +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +enabled=1 + +[C7.7.1908-updates] +name=CentOS-7.7.1908 - Updates +baseurl=http://vault.centos.org/7.7.1908/updates/$basearch/ +gpgcheck=1 +gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-CentOS-7 +enabled=1 +``` + +Once the repo list file is created, we can create a `ConfigMap` for it: + +```console +$ kubectl create configmap repo-config -n gpu-operator --from-file=<path-to-repo-list-file> +``` + +Once the `ConfigMap` is created using the above command, update `values.yaml` with this information, to let the GPU Operator mount the repo configuration +within the `driver` container to pull required packages.
+ +For Ubuntu: + +```yaml +driver: + repoConfig: + configMapName: repo-config + destinationDir: /etc/apt/sources.list.d +``` + +For RHEL/Centos/RHCOS: + +```yaml +driver: + repoConfig: + configMapName: repo-config + destinationDir: /etc/yum.repos.d +``` + +Deploy GPU Operator with updated `values.yaml`: + +```console +$ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + -f values.yaml +``` + +Check the status of the pods to ensure all the containers are running: + +```console +$ kubectl get pods -n gpu-operator +``` diff --git a/gpu-operator/.agents/skills/gpu-operator-install-service-mesh/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-install-service-mesh/SKILL.md new file mode 100644 index 000000000..8ca89b4d0 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-install-service-mesh/SKILL.md @@ -0,0 +1,45 @@ +--- +name: "gpu-operator-install-service-mesh" +description: "Guides users through GPU Operator service mesh considerations. Use when deploying with Istio or troubleshooting sidecar injection and service mesh interactions. Trigger keywords - NVIDIA GPU Operator, service mesh, Istio, Kubernetes." +--- + + + + +# Install GPU Operator with Service Mesh + +## Step 1: Special Considerations for Service Meshes + +You can use NVIDIA GPU Operator in a cluster that uses a service mesh provided by Istio CNI or Linkerd CNI. + +The typical consideration for using the Operator with a service mesh is that the `k8s-driver-manager` init container +for the `driver` container needs network access to the Kubernetes API server of the cluster. + +The data plane---implemented by Istio CNI or Linkerd CNI as proxies running as sidecar containers---must be running for any pod networking to work. +The proxy sidecar containers start only after the init phase of the pod, so init containers are not able to communicate with the API server. 
+ +To address the connectivity challenge, NVIDIA recommends disabling injection for the GPU Operator namespace. +Refer to the following documentation for more information: + +- [Controlling the injection policy](https://istio.io/latest/docs/setup/additional-setup/sidecar-injection/#controlling-the-injection-policy) + in the Istio documentation. +- [Overriding injection](https://linkerd.io/2.14/features/proxy-injection/#overriding-injection) + in the Linkerd documentation. + +## Step 2: Label the Namespace to Disable Injection + +- Label the Operator namespace to prevent automatic injection: + + ```console + $ kubectl label namespace gpu-operator istio-injection=disabled + ``` + + Or, for Linkerd: + + ```console + $ kubectl label namespace gpu-operator linkerd.io/inject=disabled + ``` + +If the GPU Operator is not already installed, refer to +getting-started +for information about custom options and common installation scenarios. diff --git a/gpu-operator/.agents/skills/gpu-operator-kata-containers/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-kata-containers/SKILL.md new file mode 100644 index 000000000..e28bfa3a0 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-kata-containers/SKILL.md @@ -0,0 +1,565 @@ +--- +name: "gpu-operator-kata-containers" +description: "Guides users through configuring Kata Containers for GPU workloads with the GPU Operator. Use when deploying sandboxed GPU workloads with Kata Containers. Trigger keywords - NVIDIA GPU Operator, Kata Containers, sandboxed workloads, Kubernetes." +--- + + + + +# Deploy with Kata Containers + +## About the Operator with Kata Containers + +[Kata Containers](https://katacontainers.io/) is an open source project that creates lightweight Virtual Machines (VMs) that feel and perform like traditional containers such as a Docker container. 
+A traditional container packages software for user-space isolation from the host, +but the container runs on the host and shares the operating system kernel with the host. +Sharing the operating system kernel is a potential vulnerability. + +A Kata container runs in a virtual machine on the host. +The virtual machine has a separate operating system and operating system kernel. +Hardware virtualization and a separate kernel provide improved workload isolation +in comparison with traditional containers. + +The NVIDIA GPU Operator works with the Kata container runtime. +Kata uses a hypervisor, such as QEMU, to provide a lightweight virtual machine with a single purpose: to run a Kubernetes pod. + +The following diagram shows the software components that Kubernetes uses to run a Kata container. + +```mermaid +flowchart LR + a[Kubelet] --> b[CRI] --> c[Kata\nRuntime] --> d[Lightweight\nQEMU VM] --> e[Lightweight\nGuest OS] --> f[Pod] --> g[Container] +``` + +**Tip:** + +This page describes deploying with Kata containers only. +Refer to the Confidential Containers documentation if you are interested in deploying Confidential Containers with Kata Containers and the GPU Operator. + +## Step 1: Benefits of Using Kata Containers + +The primary benefits of Kata Containers are as follows: + +* Running untrusted workloads in a container. + The virtual machine provides a layer of defense against the untrusted code. + +* Limiting access to hardware devices such as NVIDIA GPUs. + The virtual machine is provided access to specific devices. + This approach ensures that the workload cannot access additional devices. + +* Transparent deployment of unmodified containers. + +## Step 2: Limitations and Restrictions + +* For GPU passthrough workloads, all GPUs must be assigned to one Kata Container virtual machine. + Configuring only some GPUs on a node for Kata Containers is not supported. + vGPU is not supported. 
+ +* Support for Kata Containers is limited to the implementation described on this page. + The Operator offers Technology Preview support for Red Hat OpenShift Sandboxed Containers v1.12. + +* NVIDIA supports the Operator and Kata Containers with the containerd runtime only. + +## Step 3: Cluster Topology Considerations + +You can configure all the worker nodes in your cluster for Kata Containers or you can configure some nodes for Kata Containers and others for traditional containers. +Consider the following example where node A is configured to run traditional containers and node B is configured to run Kata Containers. + +| Node A - Traditional Container nodes receive the following software components | Node B - Kata Container nodes receive the following software components | +| --- | --- | +| * `NVIDIA Driver Manager for Kubernetes` -- to install the data-center driver. * `NVIDIA Container Toolkit` -- to ensure that containers can access GPUs. * `NVIDIA Device Plugin for Kubernetes` -- to discover and advertise GPU resources to kubelet. * `NVIDIA DCGM and DCGM Exporter` -- to monitor GPUs. * `NVIDIA MIG Manager for Kubernetes` -- to manage MIG-capable GPUs. * `Node Feature Discovery` -- to detect CPU, kernel, and host features and label worker nodes. * `NVIDIA GPU Feature Discovery` -- to detect NVIDIA GPUs and label worker nodes. | * `NVIDIA Confidential Computing Manager for Kubernetes` -- to set the confidential computing (CC) mode on the NVIDIA GPUs. This component is deployed to all nodes configured for Kata Containers, even if you are not planning to run Confidential Containers. Refer to the Confidential Containers documentation for more details. * `NVIDIA Sandbox Device Plugin` -- to discover and advertise the passthrough GPUs to kubelet. * `NVIDIA VFIO Manager` -- to bind NVIDIA GPUs and NVIDIA NVSwitches to the vfio-pci driver for VFIO passthrough. * `Node Feature Discovery` -- to detect CPU security features, NVIDIA GPUs, and label worker nodes. 
| +This configuration can be controlled through node labelling, as described in the Label Nodes section. +You can also set `sandboxWorkloads.defaultWorkload=vm-passthrough` when you install the GPU Operator to configure all nodes to run Kata Containers by default. + +## Step 4: Configure the GPU Operator for Kata Containers + +To enable Kata Containers for GPUs on your cluster, you do the following: + +1. Make sure your cluster meets the prerequisites. +1. Label the nodes you want to use for Kata Containers. +1. Install the upstream `kata-deploy` Helm chart, which deploys all Kata runtime classes, including NVIDIA-specific runtime classes. + The `kata-qemu-nvidia-gpu` runtime class is used with Kata Containers. +1. Install the NVIDIA GPU Operator with Kata sandbox mode enabled. + +After installation, you can run a sample workload that uses the Kata runtime class. + +### Prerequisites + +#### Hardware and BIOS + +* Ensure hosts are configured to enable hardware virtualization and Access Control Services (ACS). + With some AMD CPUs and BIOSes, ACS might be grouped under Advanced Error Reporting (AER). + Enabling these features is typically performed by configuring the host BIOS. + +* Configure hosts to support IOMMU. + You can check if your host is configured for IOMMU by running the following command: + + ```console + $ ls /sys/kernel/iommu_groups + ``` + + If the output of this command includes 0, 1, and so on, then your host is configured for IOMMU. + + If the host is not configured or if you are unsure, add the `intel_iommu=on` (or `amd_iommu=on` for AMD CPUs) Linux kernel command-line argument. + For most Linux distributions, add the argument to the `/etc/default/grub` file: + + ```text + ... + GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_iommu=on modprobe.blacklist=nouveau" + ... + ``` + + On Ubuntu systems, run `sudo update-grub` after making the change to configure the bootloader. + On other systems, you might need to run `sudo dracut` after making the change. 
+ Refer to the documentation for your operating system. + Reboot the host after configuring the bootloader. + + **Note:** + + After configuring IOMMU, you might see QEMU warnings about PCI P2P DMA when running GPU workloads. + These are expected and can be safely ignored. +* Ensure that no NVIDIA GPU drivers are installed on the host. + Kata Containers uses VFIO to pass GPUs directly to the VM, and host-level GPU drivers interfere with VFIO device binding. + + To check if NVIDIA GPU drivers are installed, run the following command: + + ```console + $ lsmod | grep nvidia + ``` + + If the output is empty, no NVIDIA GPU drivers are loaded. + If modules such as `nvidia`, `nvidia_uvm`, or `nvidia_modeset` are listed, NVIDIA GPU drivers are present and must be removed before proceeding. + Refer to [Removing the Driver](https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/removing-the-driver.html) in the NVIDIA Driver Installation Guide. + +#### Kubernetes Cluster + +* A Kubernetes cluster with cluster administrator privileges. + +* Helm installed on your cluster. + Use the command below to install Helm or refer to the [Helm documentation](https://helm.sh/docs/intro/install/) for installation instructions. + + ```console + $ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 \ + && chmod 700 get_helm.sh \ + && ./get_helm.sh + ``` + +* Enable the `KubeletPodResourcesGet` Kubelet feature gate on your cluster. + The Kata runtime uses this feature gate to query the Kubelet Pod Resources API and discover allocated GPU devices during sandbox creation. + + * For Kubernetes v1.34 and later, the `KubeletPodResourcesGet` feature gate is enabled by default. + + * For Kubernetes versions older than v1.34, you must explicitly enable the `KubeletPodResourcesGet` feature gate. 
+ Add the feature gate to your Kubelet configuration (typically `/var/lib/kubelet/config.yaml`): + + ```yaml + apiVersion: kubelet.config.k8s.io/v1beta1 + kind: KubeletConfiguration + featureGates: + KubeletPodResourcesGet: true + ``` + + If your `config.yaml` already has a `featureGates` section, add the gate to the existing section rather than creating a duplicate. + + Restart the Kubelet service to apply the changes: + + ```console + $ sudo systemctl restart kubelet + ``` + + Refer to the [Kata Containers documentation](https://github.com/kata-containers/kata-containers/blob/main/docs/use-cases/NVIDIA-GPU-passthrough-and-Kata-QEMU.md#kata-runtime) for more details on the Kata runtime and VFIO cold-plug. + +### Label Nodes to use Kata Containers + +1. Get a list of the nodes in your cluster: + + ```console + $ kubectl get nodes + ``` + + *Example Output:* + + ```output + NAME STATUS ROLES AGE VERSION + node-01 Ready <none> 10d v1.34.0 + node-02 Ready <none> 10d v1.34.0 + ``` + +1. Label the nodes you want to use for Kata Containers: + + ```console + $ kubectl label node <node-name> nvidia.com/gpu.workload.config=vm-passthrough + ``` + + The GPU Operator uses this label to determine what software components to deploy to a node. + The `nvidia.com/gpu.workload.config=vm-passthrough` label specifies that the node should receive the software components to run Kata Containers. + A node can only run one container runtime at a time, so a labeled node runs only Kata container workloads and cannot run traditional GPU container workloads. + The labeling approach is useful if you want to run Kata container workloads on some nodes and traditional GPU container workloads on other nodes in your cluster. + Refer to the GPU Operator Cluster Topology Considerations section for more details on what gets deployed to a Kata Container node. + + **Tip:** + + Skip this section if you plan to set `sandboxWorkloads.defaultWorkload=vm-passthrough` when you install the GPU Operator. +1.
Verify the node label was added: + + ```console + $ kubectl describe node | grep nvidia.com/gpu.workload.config + ``` + + *Example Output:* + + ```output + nvidia.com/gpu.workload.config: vm-passthrough + ``` + +After labeling the nodes, you can continue to the next steps to install Kata Containers and the NVIDIA GPU Operator. + +### Install the Kata Containers Helm Chart + +Install Kata Containers using the `kata-deploy` Helm chart. +The `kata-deploy` chart installs all required components from the Kata Containers project including the Kata Containers runtime binary, runtime configuration, UVM kernel, and images that NVIDIA uses for Kata Containers. + +The minimum required version is 3.29.0. + +1. Set the chart version and registry path: + + ```console + $ export VERSION="3.29.0" + $ export CHART="oci://ghcr.io/kata-containers/kata-deploy-charts/kata-deploy" + ``` + +1. Install the kata-deploy Helm chart: + + ```console + $ helm install kata-deploy "${CHART}" \ + --namespace kata-system --create-namespace \ + --set nfd.enabled=false \ + --wait --timeout 10m \ + --version "${VERSION}" + ``` + + *Example Output:* + + ```output + LAST DEPLOYED: Wed Apr 1 17:03:00 2026 + NAMESPACE: kata-system + STATUS: deployed + REVISION: 1 + DESCRIPTION: Install complete + TEST SUITE: None + ``` + + **Note:** + + The `--wait` flag in the install command instructs Helm to wait until the release is deployed before returning. + It can take a few minutes to return output. + + There is a [known Helm issue](https://github.com/helm/helm/issues/8660) on single node clusters, that may result in the Helm command finishing before all deployed pods are finished initializing. + If you are deploying to a single node cluster, you may need to wait for an additional few minutes after the Helm command completes for the `kata-deploy` pod to be in the Running state. + **Note:** + + Both `kata-deploy` and the GPU Operator deploy Node Feature Discovery (NFD) by default. 
+ The install command includes `--set nfd.enabled=false` to prevent `kata-deploy` from deploying NFD. + The GPU Operator will deploy and manage NFD in the next step. +1. Optional: Verify that the `kata-deploy` pod is running: + + ```console + $ kubectl get pods -n kata-system | grep kata-deploy + ``` + + *Example Output:* + + ```output + NAME READY STATUS RESTARTS AGE + kata-deploy-b2lzs 1/1 Running 0 6m37s + ``` + +1. Optional: Verify that the `kata-qemu-nvidia-gpu` runtime class is available: + + ```console + $ kubectl get runtimeclass | grep kata-qemu-nvidia-gpu + ``` + + *Example Output:* + + ```output + NAME HANDLER AGE + kata-qemu-nvidia-gpu kata-qemu-nvidia-gpu 40s + kata-qemu-nvidia-gpu-snp kata-qemu-nvidia-gpu-snp 40s + kata-qemu-nvidia-gpu-tdx kata-qemu-nvidia-gpu-tdx 40s + ``` + + Several runtime classes are installed by the `kata-deploy` chart. + The `kata-qemu-nvidia-gpu` runtime class is used with Kata Containers. + The `kata-qemu-nvidia-gpu-snp` and `kata-qemu-nvidia-gpu-tdx` runtime classes are used to deploy Confidential Containers. + + **Note:** + + To manage the lifecycle of Kata Containers, including upgrades and day-two operations, + install the [Kata Lifecycle Manager](https://github.com/kata-containers/lifecycle-manager). + This Argo Workflows-based tool is the recommended way to manage Kata Containers deployments. +1. Optional: If you have an issue deploying the `kata-deploy` pod or are not seeing the expected runtime classes, get the pod name and view the logs: + + ```console + $ kubectl get pods -n kata-system | grep kata-deploy + $ kubectl logs -n kata-system <kata-deploy-pod-name> + ``` + + Replace `<kata-deploy-pod-name>` with the name of the `kata-deploy` pod from the first command's output. + +### Install the NVIDIA GPU Operator + +Install the NVIDIA GPU Operator and configure it to deploy Kata Container components. + +1.
Add and update the NVIDIA Helm repository: + + ```console + $ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \ + && helm repo update + ``` + + *Example Output:* + + ```output + "nvidia" has been added to your repositories + Hang tight while we grab the latest from your chart repositories... + ...Successfully got an update from the "nvidia" chart repository + Update Complete. ⎈Happy Helming!⎈ + ``` + +1. Install the GPU Operator. + The following configures the GPU Operator to deploy the operands that are required for Kata Containers. + Refer to Common Chart Customization Options for more details on the additional configuration options you can specify when installing the GPU Operator. + + ```console + $ helm install --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set sandboxWorkloads.enabled=true \ + --set sandboxWorkloads.mode=kata \ + --set nfd.enabled=true \ + --set nfd.nodefeaturerules=true + ``` + + *Example Output:* + + ```output + NAME: gpu-operator + LAST DEPLOYED: Wed Mar 25 17:21:34 2026 + NAMESPACE: gpu-operator + STATUS: deployed + REVISION: 1 + DESCRIPTION: Install complete + TEST SUITE: None + ``` + + **Tip:** + + Add `--set sandboxWorkloads.defaultWorkload=vm-passthrough` if every worker node should use Kata by default. +1. 
Optional: Verify that all GPU Operator pods, especially the Sandbox Device Plugin and VFIO Manager operands, are running: + + ```console + $ kubectl get pods -n gpu-operator + ``` + + *Example Output:* + + ```output + NAME READY STATUS RESTARTS AGE + gpu-operator-1766001809-node-feature-discovery-gc-75776475sxzkp 1/1 Running 0 86s + gpu-operator-1766001809-node-feature-discovery-master-6869lxq2g 1/1 Running 0 86s + gpu-operator-1766001809-node-feature-discovery-worker-mh4cv 1/1 Running 0 86s + gpu-operator-f48fd66b-vtfrl 1/1 Running 0 86s + nvidia-cc-manager-7z74t 1/1 Running 0 61s + nvidia-kata-sandbox-device-plugin-daemonset-d5rvg 1/1 Running 0 30s + nvidia-sandbox-validator-6xnzc 1/1 Running 0 30s + nvidia-vfio-manager-h229x 1/1 Running 0 62s + ``` + + **Note:** + + It can take several minutes for all GPU Operator pods to be in the Running state. + If you are not seeing the expected output, you can view the logs for the GPU Operator pods: + + ```console + $ kubectl logs -n gpu-operator <pod-name> + ``` + + Replace `<pod-name>` with the name of the GPU Operator pod from `kubectl get pods -n gpu-operator`. + **Note:** + + The NVIDIA Confidential Computing (CC) Manager for Kubernetes (`nvidia-cc-manager`) is deployed to all nodes configured to run Kata containers, even if you are not planning to run Confidential Containers. + This manager sets the confidential computing mode on the NVIDIA GPUs, if your GPU is capable of Confidential Computing, but will not be used if you are deploying in Kata Containers only. + Refer to Confidential Containers for more details. +1. Optional: If you have host access to the worker node, you can perform the following validation step: + + a.
Confirm that the host uses the `vfio-pci` device driver for GPUs: + + ```console + $ lspci -nnk -d 10de: + ``` + + *Example Output:* + + ```output + 65:00.0 3D controller [0302]: NVIDIA Corporation xxxxxxx [xxx] [10de:xxxx] (rev xx) + Subsystem: NVIDIA Corporation xxxxxxx [xxx] [10de:xxxx] + Kernel driver in use: vfio-pci + Kernel modules: nvidiafb, nouveau + ``` + +### Optional: Configuring GPU or NVSwitch Resource Types Name + +By default, the NVIDIA GPU Operator creates a resource type for GPUs and NVSwitches, `nvidia.com/pgpu` and `nvidia.com/nvswitch`. +You can reference these names in your manifests to request GPU or NVSwitch resources for your workload. +If you want to use a different name, you can set the `P_GPU_ALIAS` or `NVSWITCH_ALIAS` environment variables in the Kata device plugin to your preferred name. +In clusters where all GPUs are the same model, a single resource type is typically sufficient. + +In heterogeneous clusters, where you have different GPU types on your nodes, you might want to use specific GPU types for your workload. +To do this, specify an empty `P_GPU_ALIAS` environment variable in the Kata device plugin by adding the following to your GPU Operator installation: +`--set kataSandboxDevicePlugin.env[0].name=P_GPU_ALIAS` and +`--set kataSandboxDevicePlugin.env[0].value=""`. + +When this variable is set to `""`, the Kata device plugin creates GPU model-specific resource types, for example `nvidia.com/GH100_H100L_94GB`, instead of the default `nvidia.com/pgpu` type. +Use the exposed device resource types in pod specs by specifying respective resource limits. + +Similarly, you can set `NVSWITCH_ALIAS` to `""` to advertise model-specific NVSwitch resource types. 
+ +The following example installs the GPU Operator with both `P_GPU_ALIAS` and `NVSWITCH_ALIAS` configured: + +```console +$ helm install --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set sandboxWorkloads.enabled=true \ + --set sandboxWorkloads.mode=kata \ + --set nfd.enabled=true \ + --set nfd.nodefeaturerules=true \ + --set kataSandboxDevicePlugin.env[0].name=P_GPU_ALIAS \ + --set kataSandboxDevicePlugin.env[0].value="" \ + --set kataSandboxDevicePlugin.env[1].name=NVSWITCH_ALIAS \ + --set kataSandboxDevicePlugin.env[1].value="" +``` + +After installing the GPU Operator, you can view the GPU or NVSwitch resource types available on a node by running the following command: + +```console +$ kubectl get node -o json | grep nvidia.com +``` + +*Example Output:* + +```output +"nvidia.com/GH100_H100L_94GB": "1" +``` + +## Step 5: Run a Sample Workload + +A pod specification for a Kata container requires the following: + +* Specify a Kata runtime class. + +* Specify a passthrough GPU resource. + +1. Create a file, such as `cuda-vectoradd-kata.yaml`, with the following content: + + ```yaml + apiVersion: v1 + kind: Pod + metadata: + name: cuda-vectoradd-kata + namespace: default + spec: + runtimeClassName: kata-qemu-nvidia-gpu + restartPolicy: OnFailure + containers: + - name: cuda-vectoradd + image: "nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0-ubuntu22.04" + resources: + limits: + nvidia.com/pgpu: "1" + memory: 16Gi + ``` + +1. Create the pod: + + ```console + $ kubectl apply -f cuda-vectoradd-kata.yaml + ``` + + *Example Output:* + + ```output + pod/cuda-vectoradd-kata created + ``` + +1. Optional: Verify the pod is running: + + ```console + $ kubectl get pod cuda-vectoradd-kata + ``` + + *Example Output:* + + ```output + NAME READY STATUS RESTARTS AGE + cuda-vectoradd-kata 1/1 Running 0 10s + ``` + +1. 
View the pod logs: + + ```console + $ kubectl logs -n default cuda-vectoradd-kata + ``` + + *Example Output:* + + ```output + [Vector addition of 50000 elements] + Copy input data from the host memory to the CUDA device + CUDA kernel launch with 196 blocks of 256 threads + Copy output data from the CUDA device to the host memory + Test PASSED + Done + ``` + +1. Delete the pod: + + ```console + $ kubectl delete -f cuda-vectoradd-kata.yaml + ``` + +### Troubleshooting Workloads + +If the sample workload does not run, confirm that you labeled nodes to run virtual machines in containers: + +```console +$ kubectl get nodes -l nvidia.com/gpu.workload.config=vm-passthrough +``` + +*Example Output:* + +```output +NAME STATUS ROLES AGE VERSION +kata-worker-1 Ready 10d v1.35.3 +kata-worker-2 Ready 10d v1.35.3 +kata-worker-3 Ready 10d v1.35.3 +``` + +You might have configured `vm-passthrough` as the default sandbox workload in the ClusterPolicy resource. +That setting applies the default sandbox workload cluster-wide, including for Kata when `mode` is `kata`. +Also confirm in the ClusterPolicy that `sandboxWorkloads` is configured for Kata as shown in the following example. + +```console +$ kubectl describe clusterpolicy | grep sandboxWorkloads +``` + +*Example Output:* + +```output +sandboxWorkloads: + enabled: true + defaultWorkload: vm-passthrough + mode: kata +``` diff --git a/gpu-operator/.agents/skills/gpu-operator-kubevirt/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-kubevirt/SKILL.md new file mode 100644 index 000000000..e57e4ae7b --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-kubevirt/SKILL.md @@ -0,0 +1,493 @@ +--- +name: "gpu-operator-kubevirt" +description: "Guides users through configuring the GPU Operator for KubeVirt virtual machine workloads. Use when deploying GPU-enabled VMs or troubleshooting KubeVirt GPU passthrough. Trigger keywords - NVIDIA GPU Operator, KubeVirt, virtual machines, Kubernetes." 
+--- + + + + +# Prerequisites + +Before using KubeVirt with the GPU Operator, ensure the following prerequisites are configured on your cluster and nodes: + +# GPU Operator with KubeVirt + +## About the Operator with KubeVirt + +[KubeVirt](https://kubevirt.io/) is a virtual machine management add-on to Kubernetes that allows you to run and manage virtual machines in a Kubernetes cluster. +It eliminates the need to manage separate clusters for virtual machine and container workloads because both can now coexist in a single Kubernetes cluster. + +In addition to the GPU Operator being able to provision worker nodes for running GPU-accelerated containers, the GPU Operator can also be used to provision worker nodes for running GPU-accelerated virtual machines with KubeVirt. + +There are some different prerequisites required when running virtual machines with GPUs compared to running containers with GPUs. +The primary difference is the drivers required. +For example, the datacenter driver is needed for containers, the vfio-pci driver is needed for GPU passthrough, and the [NVIDIA vGPU Manager](https://docs.nvidia.com/grid/latest/grid-vgpu-user-guide/index.html#installing-configuring-grid-vgpu) is needed for creating vGPU devices. + +### Configure Worker Nodes for GPU Operator components + +The GPU Operator can now be configured to deploy different software components on worker nodes depending on what GPU workload is configured to run on those nodes. +This is configured by adding a `nvidia.com/gpu.workload.config` label to the worker node with the value of `container`, `vm-passthrough`, or `vm-vgpu` depending on if you are planning to use vGPU or not. +The GPU Operator will use the label to determine which software components to deploy on the worker nodes. + +Given the following node configuration: + +* Node A is configured with the label `nvidia.com/gpu.workload.config=container` and configured to run containers. 
+* Node B is configured with the label `nvidia.com/gpu.workload.config=vm-passthrough` and configured to run virtual machines with Passthrough GPU. +* Node C is configured with the label `nvidia.com/gpu.workload.config=vm-vgpu` and configured to run virtual machines with vGPU. + +The GPU Operator will deploy the following software components on each node: + +* Node A receives the following software components: + * `NVIDIA Datacenter Driver` - to install the driver + * `NVIDIA Container Toolkit` - to ensure containers can properly access GPUs + * `NVIDIA Kubernetes Device Plugin` - to discover and advertise GPU resources to kubelet + * `NVIDIA DCGM and DCGM Exporter` - to monitor the GPU(s) + +* Node B receives the following software components: + * `VFIO Manager` - to load `vfio-pci` and bind it to all GPUs on the node + * `Sandbox Device Plugin` - to discover and advertise the passthrough GPUs to kubelet + +* Node C receives the following software components: + * `NVIDIA vGPU Manager` - to install the driver + * `NVIDIA vGPU Device Manager` - to create vGPU devices on the node + * `Sandbox Device Plugin` - to discover and advertise the vGPU devices to kubelet + +If the node label `nvidia.com/gpu.workload.config` does not exist on the node, the GPU Operator will assume the default GPU workload configuration, `container`, and will deploy the software components needed to support this workload type. +To override the default GPU workload configuration, set the following value in `ClusterPolicy`: `sandboxWorkloads.defaultWorkload=<config>`, where `<config>` is `container`, `vm-passthrough`, or `vm-vgpu`. + +### Assumptions, constraints, and dependencies + +* A GPU worker node can run GPU workloads of a particular type, such as containers, virtual machines with GPU Passthrough, or virtual machines with vGPU, but not a combination of any of them. + +* The cluster admin or developer has knowledge about their cluster ahead of time and can properly label nodes to indicate what types of GPU workloads they will run.
+ +* Worker nodes running GPU accelerated virtual machines (with GPU passthrough or vGPU) are assumed to be bare metal. + +* The GPU Operator will not automate the installation of NVIDIA drivers inside KubeVirt virtual machines with GPUs/vGPUs attached. + +* Users must manually add all passthrough GPU and vGPU resources to the `permittedHostDevices` list in the KubeVirt CR before assigning them to KubeVirt virtual machines. Refer to the [KubeVirt documentation](https://kubevirt.io/user-guide/compute/host-devices/#listing-permitted-devices) for more information. + +## Step 1: Configure KubeVirt with the GPU Operator + +After configuring the prerequisites, the high-level workflow for using the GPU Operator with KubeVirt is as follows: + +* Label worker nodes based on the GPU workloads they will run. +* Install the GPU Operator and set `sandboxWorkloads.enabled=true` + +If you are planning to deploy VMs with vGPU, the workflow is as follows: + +* Build the NVIDIA vGPU Manager image +* Label the node for the vGPU configuration +* Add vGPU resources to KubeVirt CR +* Create a virtual machine with vGPU + +If you are planning to deploy VMs with GPU passthrough, the workflow is as follows: + +* Add GPU passthrough resources to KubeVirt CR +* Create a virtual machine with GPU passthrough + +### Label worker nodes + +The GPU Operator uses the value of the `nvidia.com/gpu.workload.config` label to determine which operands to deploy on your worker node. + +1. Add a `nvidia.com/gpu.workload.config` label to a worker node: + + ```console + $ kubectl label node <node-name> --overwrite nvidia.com/gpu.workload.config=vm-vgpu + ``` + + You can assign the following values to the label: + + * `container` + * `vm-passthrough` + * `vm-vgpu` + + Refer to the Configure Worker Nodes for GPU Operator components section for more information on the different configuration options.
+ +### Install the GPU Operator + +Follow one of the below subsections for installing the GPU Operator, depending on whether you plan to use NVIDIA vGPU or not. + +**Note:** + +The following commands set the `sandboxWorkloads.enabled` flag. +This `ClusterPolicy` flag controls whether the GPU Operator can provision GPU worker nodes for virtual machine workloads, in addition to container workloads. +This flag is disabled by default, meaning all nodes get provisioned with the same software to enable container workloads, and the `nvidia.com/gpu.workload.config` node label is not used. + +The term *sandboxing* refers to running software in a separate isolated environment, typically for added security (that is, a virtual machine). +We use the term `sandbox workloads` to signify workloads that run in a virtual machine, irrespective of the virtualization technology used. +#### Install the GPU Operator without NVIDIA vGPU + +Install the GPU Operator, enabling `sandboxWorkloads`: + +```console +$ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set sandboxWorkloads.enabled=true +``` + +#### Install the GPU Operator with NVIDIA vGPU + +Before installing the GPU Operator with NVIDIA vGPU, you must build a private NVIDIA vGPU Manager container image and push it to a private registry. +Follow the steps provided in this section. + +1. Create a namespace for GPU Operator: + + ```console + $ kubectl create namespace gpu-operator + ``` + +1. Create an ImagePullSecret for accessing the NVIDIA vGPU Manager image: + + ```console + $ kubectl create secret docker-registry ${REGISTRY_SECRET_NAME} \ + --docker-server=${PRIVATE_REGISTRY} --docker-username=<username> \ + --docker-password=<password> \ + --docker-email=<email-id> -n gpu-operator + ``` + +1.
Install the GPU Operator with `sandboxWorkloads` and `vgpuManager` enabled and specify the NVIDIA vGPU Manager image built previously: + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set sandboxWorkloads.enabled=true \ + --set vgpuManager.enabled=true \ + --set vgpuManager.repository=<path-to-private-repository> \ + --set vgpuManager.image=vgpu-manager \ + --set vgpuManager.version=<driver-version> \ + --set vgpuManager.imagePullSecrets={${REGISTRY_SECRET_NAME}} + ``` + +The vGPU Device Manager, deployed by the GPU Operator, automatically creates vGPU devices that can be assigned to KubeVirt virtual machines. +Without additional configuration, the GPU Operator creates a default set of devices on all GPUs. +To learn more about the vGPU Device Manager and configure which types of vGPU devices get created in your cluster, refer to vGPU Device Configuration. + +### Add GPU resources to KubeVirt CR +Follow one of the below subsections for adding GPU resources to the KubeVirt CR, depending on whether you plan to use NVIDIA vGPU or not. + +#### Add vGPU resources to KubeVirt CR + +Update the KubeVirt custom resource so that all vGPU devices in your cluster are permitted and can be assigned to virtual machines. + +The following example shows how to permit the A10-12Q vGPU device; the device names for the GPUs on your cluster will likely be different. + +1. Determine the resource names for the GPU devices: + + ```console + $ kubectl get node cnt-server-2 -o json | jq '.status.allocatable | with_entries(select(.key | startswith("nvidia.com/"))) | with_entries(select(.value != "0"))' + ``` + + *Example Output* + + ```output + { + "nvidia.com/NVIDIA_A10-12Q": "4" + } + ``` + +1. Determine the PCI device IDs for the GPUs. + + * You can search by device name in the [PCI IDs database](https://pci-ids.ucw.cz/v2.2/pci.ids).
+ + * If you have host access to the node, you can list the NVIDIA GPU devices with a command like the following example: + + ```console + $ lspci -nnk -d 10de: + ``` + + *Example Output* + + ```output + 65:00.0 3D controller [0302]: NVIDIA Corporation GA102GL [A10] [10de:2236] (rev a1) + Subsystem: NVIDIA Corporation GA102GL [A10] [10de:1482] + Kernel modules: nvidiafb, nouveau + ``` + +1. Modify the `KubeVirt` custom resource like the following partial example. + + ```yaml + ... + spec: + configuration: + developerConfiguration: + featureGates: + - GPU + - DisableMDEVConfiguration + permittedHostDevices: # Defines VM devices to import. + mediatedDevices: # Include for vGPU + - externalResourceProvider: true + mdevNameSelector: NVIDIA A10-12Q + resourceName: nvidia.com/NVIDIA_A10-12Q + ... + ``` + + Replace the values in the YAML as follows: + + * `mdevNameSelector` and `resourceName` under `mediatedDevices` to correspond to your vGPU type. + + * Set `externalResourceProvider=true` to indicate that this resource is provided by an external device plugin, in this case the `sandbox-device-plugin` that is deployed by the GPU Operator. + +Refer to the [KubeVirt user guide](https://kubevirt.io/user-guide/virtual_machines/host-devices/#listing-permitted-devices) for more information on the configuration options. + +#### Add GPU passthrough resources to KubeVirt CR + +Update the KubeVirt custom resource so that all GPU passthrough devices in your cluster are permitted and can be assigned to virtual machines. + +The following example shows how to permit the A10 GPU device, the device names for the GPUs on your cluster will likely be different. + +1. Determine the resource names for the GPU devices: + + ```console + $ kubectl get node cnt-server-2 -o json | jq '.status.allocatable | with_entries(select(.key | startswith("nvidia.com/"))) | with_entries(select(.value != "0"))' + ``` + + *Example Output* + + ```output + { + "nvidia.com/GA102GL_A10": "1" + } + ``` + +1. 
Determine the PCI device IDs for the GPUs. + + * You can search by device name in the [PCI IDs database](https://pci-ids.ucw.cz/v2.2/pci.ids). + + * If you have host access to the node, you can list the NVIDIA GPU devices with a command like the following example: + + ```console + $ lspci -nnk -d 10de: + ``` + + *Example Output* + + ```output + 65:00.0 3D controller [0302]: NVIDIA Corporation GA102GL [A10] [10de:2236] (rev a1) + Subsystem: NVIDIA Corporation GA102GL [A10] [10de:1482] + Kernel modules: nvidiafb, nouveau + ``` + +1. Modify the `KubeVirt` custom resource like the following partial example. + + ```yaml + ... + spec: + configuration: + developerConfiguration: + featureGates: + - GPU + - DisableMDEVConfiguration + permittedHostDevices: # Defines VM devices to import. + pciHostDevices: # Include for GPU passthrough + - externalResourceProvider: true + pciVendorSelector: 10DE:2236 + resourceName: nvidia.com/GA102GL_A10 + ... + ``` + + Replace the values in the YAML as follows: + + * `pciVendorSelector` and `resourceName` under `pciHostDevices` to correspond to your GPU model. + + * Set `externalResourceProvider=true` to indicate that this resource is provided by an external device plugin, in this case the `sandbox-device-plugin` that is deployed by the GPU Operator. + +Refer to the [KubeVirt user guide](https://kubevirt.io/user-guide/virtual_machines/host-devices/#listing-permitted-devices) for more information on the configuration options. + +### Create a virtual machine with GPU + +After the `sandbox-device-plugin` pod is running on your worker nodes and the GPU resources have been added to the +KubeVirt allowlist, you can assign a GPU to a virtual machine by editing the `spec.domain.devices.gpus` field +in the `VirtualMachineInstance` manifest. + +Example for GPU passthrough: + +```yaml +apiVersion: kubevirt.io/v1alpha3 +kind: VirtualMachineInstance +... +spec: + domain: + devices: + gpus: + - deviceName: nvidia.com/GA102GL_A10 + name: gpu1 +... 
+``` + +Example for vGPU: + +```yaml +apiVersion: kubevirt.io/v1alpha3 +kind: VirtualMachineInstance +... +spec: + domain: + devices: + gpus: + - deviceName: nvidia.com/NVIDIA_A10-12Q + name: gpu1 +... +``` + +* `deviceName` is the resource name representing the device. + +* `name` is a name to identify the device in the virtual machine. + +## Step 2: vGPU Device Configuration + +The vGPU Device Manager assists in creating vGPU devices on GPU worker nodes. +The vGPU Device Manager allows administrators to declaratively define a set of possible vGPU device configurations they would like applied to GPUs on a node. +At runtime, administrators then point the vGPU Device Manager at one of these configurations, and vGPU Device Manager takes care of applying it. + +The configuration file is created as a ConfigMap, and is shared across all worker nodes. +At runtime, a node label, `nvidia.com/vgpu.config`, can be used to decide which of these configurations to actually apply to a node at any given time. +If the node is not labeled, then the `default` configuration will be used. +For more information on this component and how it is configured, refer to the [NVIDIA vGPU Device Manager README](https://github.com/NVIDIA/vgpu-device-manager). + +By default, the GPU Operator deploys a ConfigMap for the vGPU Device Manager, containing named configurations for all [vGPU types supported by NVIDIA vGPU](https://docs.nvidia.com/grid/latest/grid-vgpu-user-guide/index.html#supported-gpus-grid-vgpu). +Users can select a specific configuration for a worker node by applying the `nvidia.com/vgpu.config` node label. +For example, labeling a node with `nvidia.com/vgpu.config=A10-8Q` would create three vGPU devices of type **A10-8Q** on all **A10** GPUs on the node. Note that three is the maximum number of **A10-8Q** devices that can be created per GPU. +If the node is not labeled, the `default` configuration will be applied.
+The `default` configuration will create Q-series vGPU devices on all GPUs, where the amount of framebuffer memory per vGPU device is half the total GPU memory. +For example, the `default` configuration will create two **A10-12Q** devices on all **A10** GPUs. + +You can also create different vGPU Q profiles on the same GPU using vGPU Device Manager configuration. +For example, you can create a **A10-4Q** and a **A10-6Q** device on same GPU by creating a vGPU Device Manager configuration with the following content: + +```yaml +version: v1 +vgpu-configs: + custom-A10-config: + - devices: all + vgpu-devices: + "A10-4Q": 3 + "A10-6Q": 2 +``` + +If custom vGPU device configuration is desired, more than the default config map provides, you can create your own config map: + +```console +$ kubectl create configmap custom-vgpu-config -n gpu-operator --from-file=config.yaml=/path/to/file +``` + +And then configure the GPU Operator to use it by setting `vgpuDeviceManager.config.name=custom-vgpu-config`. + +### Apply a New vGPU Device Configuration + +You can apply a specific vGPU device configuration on a per-node basis by setting the `nvidia.com/vgpu.config` node label. +It is recommended to set this node label prior to installing the GPU Operator if you do not want the default configuration applied. + +Switching vGPU device configuration after one has been successfully applied assumes that no virtual machines with vGPU are currently running on the node. +Any existing virtual machines should be shutdown/migrated before you apply the new configuration. + +To apply a new configuration after GPU Operator install, update the `nvidia.com/vgpu.config` node label. + +**Note:** + +On GPUs that support MIG, you have the option to select MIG-backed vGPU instances instead of time-sliced vGPU instances. +To select a MIG-backed vGPU profile, label the node with the name of the MIG-backed vGPU profile. 
+The following example shows how to apply a new configuration on a system with two **A10** GPUs. + +```console +$ nvidia-smi -L +GPU 0: NVIDIA A10 (UUID: GPU-ebd34bdf-1083-eaac-2aff-4b71a022f9bd) +GPU 1: NVIDIA A10 (UUID: GPU-1795e88b-3395-b27b-dad8-0488474eec0c) +``` + +In this example, the GPU Operator has been installed and the `nvidia.com/vgpu.config` label was not added to worker nodes, meaning the `default` vGPU config got applied. +This resulted in the creation of four **A10-12Q** devices (two per GPU): + +```console +$ kubectl get node cnt-server-2 -o json | jq '.status.allocatable | with_entries(select(.key | startswith("nvidia.com/"))) | with_entries(select(.value != "0"))' +{ + "nvidia.com/NVIDIA_A10-12Q": "4" +} +``` + +Now if you wanted to create **A10-4Q** devices, add the `nvidia.com/vgpu.config` label to the node: + +```console +$ kubectl label node <node-name> --overwrite nvidia.com/vgpu.config=A10-4Q +``` + +After the vGPU Device Manager finishes applying the new configuration, all GPU Operator pods should return to the Running state. + +```console +$ kubectl get pods -n gpu-operator +NAME READY STATUS RESTARTS AGE +... +nvidia-sandbox-device-plugin-daemonset-brtb6 1/1 Running 0 10s +nvidia-sandbox-validator-ljnwg 1/1 Running 0 10s +nvidia-vgpu-device-manager-8mgg8 1/1 Running 0 30m +nvidia-vgpu-manager-daemonset-fpplc 1/1 Running 0 31m +``` + +You can now see 12 **A10-4Q** devices on the node, as six **A10-4Q** devices can be created per **A10** GPU. + +```console +$ kubectl get node cnt-server-2 -o json | jq '.status.allocatable | with_entries(select(.key | startswith("nvidia.com/"))) | with_entries(select(.value != "0"))' +{ + "nvidia.com/NVIDIA_A10-4Q": "12" +} +``` + +## Step 3: Building the NVIDIA vGPU Manager image + +**Note:** + +Building the NVIDIA vGPU Manager image is only required if you are planning to use NVIDIA vGPU. +If only planning to use PCI passthrough, skip this section.
+This section covers building the NVIDIA vGPU Manager container image and pushing it to a private registry. + +Download the vGPU Software from the [NVIDIA Licensing Portal](https://stg.ui.licensing.nvidia.com/). + +* Log in to the NVIDIA Licensing Portal and navigate to the **Software Downloads** section. +* The NVIDIA vGPU Software is located in the **Software Downloads** section of the NVIDIA Licensing Portal. +* The vGPU Software bundle is packaged as a zip file. Download and unzip the bundle to obtain the NVIDIA vGPU Manager for Linux file, `NVIDIA-Linux-x86_64-<version>-vgpu-kvm.run`. + +Next, clone the driver container repository and build the driver image with the following steps. + +Open a terminal and clone the driver container image repository. + +```console +$ git clone https://github.com/NVIDIA/gpu-driver-container.git +$ cd gpu-driver-container +``` + +1. Copy the NVIDIA vGPU manager from your extracted ZIP file to the operating system version you want to build the image for: + * We use Ubuntu 22.04 as an example. + + Copy `<local-driver-download-directory>/*-vgpu-kvm.run` to `vgpu-manager/ubuntu22.04/`. + + ```console + $ cp <local-driver-download-directory>/*-vgpu-kvm.run vgpu-manager/ubuntu22.04/ + ``` + +**Note:** + +For Red Hat OpenShift, use a directory that includes `rhel` in the directory name. For example, `vgpu-manager/rhel8`. +| Set the following environment variables: +| `PRIVATE_REGISTRY` - name of private registry used to store driver image +| `VGPU_HOST_DRIVER_VERSION` - NVIDIA vGPU Manager version downloaded from NVIDIA Software Portal +| `OS_TAG` - this must match the Guest OS version. In the following example `ubuntu22.04` is used. For Red Hat OpenShift this should be set to `rhcos4.x` where x is the supported minor OCP version. + +```console +$ export PRIVATE_REGISTRY=my/private/registry VGPU_HOST_DRIVER_VERSION=580.82.07 OS_TAG=ubuntu22.04 +``` + +Build the NVIDIA vGPU Manager image.
+ +```console +$ VGPU_HOST_DRIVER_VERSION=${VGPU_HOST_DRIVER_VERSION} IMAGE_NAME=${PRIVATE_REGISTRY}/vgpu-manager make build-vgpuhost-${OS_TAG} +``` + +Push NVIDIA vGPU Manager image to your private registry. + +```console +$ VGPU_HOST_DRIVER_VERSION=${VGPU_HOST_DRIVER_VERSION} IMAGE_NAME=${PRIVATE_REGISTRY}/vgpu-manager make push-vgpuhost-${OS_TAG} +``` diff --git a/gpu-operator/.agents/skills/gpu-operator-multiinstance/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-multiinstance/SKILL.md new file mode 100644 index 000000000..dc27809af --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-multiinstance/SKILL.md @@ -0,0 +1,519 @@ +--- +name: "gpu-operator-multiinstance" +description: "Explains MIG strategies, labels, and configuration with the GPU Operator. Use when partitioning GPUs, enabling MIG, or troubleshooting MIG resource exposure. Trigger keywords - NVIDIA GPU Operator, MIG, Multi-Instance GPU, GPU partitioning." +--- + + + + +# GPU Operator with MIG + +## About Multi-Instance GPU + +Multi-Instance GPU (MIG) enables GPUs based on the NVIDIA Ampere and later architectures, such as NVIDIA A100, to be partitioned into separate and secure GPU instances for CUDA applications. +Refer to the [MIG User Guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html) for more information about MIG. + +GPU Operator deploys MIG Manager to manage MIG configuration on nodes in your Kubernetes cluster. +You must enable MIG during installation by choosing a MIG strategy before you can configure MIG. + +Refer to the architecture section for more information about how MIG is implemented in the GPU Operator. + +## Step 1: Enabling MIG During Installation + +Use the following steps to enable MIG and deploy MIG Manager. + +1. 
Install the Operator: + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set mig.strategy=single + ``` + + This example sets `single` as the MIG strategy. + Available MIG strategy options: + + * `single`: MIG mode is enabled on all GPUs on a node. + * `mixed`: MIG mode is not enabled on all GPUs on a node. + + In a cloud service provider (CSP) environment such as Google Cloud, also specify + `--set migManager.env[0].name=WITH_REBOOT --set-string migManager.env[0].value=true` + to ensure that the node reboots and can apply the MIG configuration. + + MIG Manager supports preinstalled drivers, meaning drivers that are not managed by the GPU Operator and you installed directly on the host. + If drivers are preinstalled, also specify `--set driver.enabled=false`. + Refer to the "MIG Manager with Preinstalled Drivers" section for more details. + + After several minutes, all GPU Operator pods, including `nvidia-mig-manager`, are deployed on nodes that have MIG-capable GPUs. + + **Note:** + + MIG Manager requires that no user workloads are running on the GPUs being configured. + In some cases, such as in a CSP environment, the node might need to be rebooted, so the node might need to be cordoned + before changing the MIG mode or the MIG geometry on the GPUs. +1. Optional: Display the pods in the Operator namespace: + + ```console + $ kubectl get pods -n gpu-operator + ``` + + *Example Output* + +1. Optional: Display the labels applied to the node: + + ```console + $ kubectl get node -o json | jq '.items[].metadata.labels' + ``` + + *Partial Output* + +## Step 2: Configuring MIG Profiles + +When MIG is enabled, nodes are labeled with `nvidia.com/mig.config: all-disabled` by default. +To use a profile on a node, update the label value with the desired profile, for example, `nvidia.com/mig.config=all-1g.10gb`.
+ +Starting with GPU Operator v26.3.0, MIG Manager generates the MIG configuration for a node at runtime from the available hardware. +The configuration is generated on startup, discovering MIG profiles for each MIG-capable GPU on a node using [NVIDIA Management Library (NVML)](https://developer.nvidia.com/management-library-nvml), then writing it to a ConfigMap for each MIG-capable node in your cluster. +The ConfigMap is named `<node-name>-mig-config`, where `<node-name>` is the name of the MIG-capable node. +Each ConfigMap contains a complete mig-parted config, including `all-disabled`, `all-enabled`, per-profile configs such as `all-1g.10gb`, and `all-balanced` with device-filter support for mixed GPU types. +When a new MIG-capable GPU is added to a node, the new GPU is automatically added to the ConfigMap. + +If you need custom profiles, you can use a custom MIG configuration instead of the generated one. +You can use the Helm chart to create a ConfigMap from values at install time, or create and reference your own ConfigMap. +For an example, refer to the "Example: Custom MIG Configuration During Installation" section. + +**Note:** + +Generated MIG configuration might not be available on older drivers, such as 535 branch GPU drivers, as they do not support querying MIG profiles when MIG mode is disabled. In those cases, the GPU Operator will use a [static ConfigMap](https://github.com/NVIDIA/gpu-operator/blob/main/assets/state-mig-manager/0400_configmap.yaml), `default-mig-parted-config`, for MIG profiles. +### Example: Single MIG Strategy + +The following steps show how to use the single MIG strategy and configure the `1g.10gb` profile on one node. + +1. Configure the MIG strategy to `single` if you are unsure of the current strategy: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy \ + --type='json' \ + -p='[{"op":"replace", "path":"/spec/mig/strategy", "value":"single"}]' + ``` + +1.
Label the nodes with the profile to configure: + + ```console + $ kubectl label nodes <node-name> nvidia.com/mig.config=all-1g.10gb --overwrite + ``` + + MIG Manager proceeds to apply a `mig.config.state` label to the node and terminates all + the GPU pods in preparation to enable MIG mode and configure the GPU into the desired MIG geometry. + +1. Optional: Display the node labels: + + ```console + $ kubectl get node <node-name> -o=jsonpath='{.metadata.labels}' | jq . + ``` + + *Partial Output* + + ```json + "nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3", + "nvidia.com/gpu.replicas": "1", + "nvidia.com/gpu.sharing-strategy": "none", + "nvidia.com/mig.capable": "true", + "nvidia.com/mig.config": "all-1g.10gb", + "nvidia.com/mig.config.state": "pending", + "nvidia.com/mig.strategy": "single" + } + ``` + + When the `WITH_REBOOT` option is set, MIG Manager sets the label to `nvidia.com/mig.config.state: rebooting`. + +1. Confirm that MIG Manager completed the configuration by checking the node labels: + + ```console + $ kubectl get node <node-name> -o=jsonpath='{.metadata.labels}' | jq . + ``` + + Check for the following labels: + + * `nvidia.com/gpu.count: 7` (the value differs according to the GPU model) + * `nvidia.com/gpu.slices.ci: 1` + * `nvidia.com/gpu.slices.gi: 1` + * `nvidia.com/mig.config.state: success` + + *Partial Output* + + ```json + "nvidia.com/gpu.count": "7", + "nvidia.com/gpu.present": "true", + "nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3-MIG-1g.10gb", + "nvidia.com/gpu.slices.ci": "1", + "nvidia.com/gpu.slices.gi": "1", + "nvidia.com/mig.capable": "true", + "nvidia.com/mig.config": "all-1g.10gb", + "nvidia.com/mig.config.state": "success", + "nvidia.com/mig.strategy": "single" + ``` + +1. Optional: Run the `nvidia-smi` command in the driver container to verify that the MIG configuration has been applied.
+ + ```console + $ kubectl exec -it -n gpu-operator ds/nvidia-driver-daemonset -- nvidia-smi -L + ``` + + *Example Output* + +### Example: Mixed MIG Strategy + +The following steps show how to use the `mixed` MIG strategy and configure the `all-balanced` profile on one node. + +1. Configure the MIG strategy to `mixed` if you are unsure of the current strategy: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy \ + --type='json' \ + -p='[{"op":"replace", "path":"/spec/mig/strategy", "value":"mixed"}]' + ``` + +1. Label the nodes with the profile to configure: + + ```console + $ kubectl label nodes nvidia.com/mig.config=all-balanced --overwrite + ``` + + MIG Manager proceeds to apply a `mig.config.state` label to the node and terminates all + the GPU pods in preparation to enable MIG mode and configure the GPU into the desired MIG geometry. + +1. Confirm that MIG Manager completed the configuration by checking the node labels: + + ```console + $ kubectl get node -o=jsonpath='{.metadata.labels}' | jq . + ``` + + Check for labels like the following. + The profiles and GPU counts differ according to the GPU model. + + * `nvidia.com/mig-1g.10gb.count: 2` + * `nvidia.com/mig-2g.20gb.count: 1` + * `nvidia.com/mig-3g.40gb.count: 1` + * `nvidia.com/mig.config.state: success` + + *Partial Output* + +1. Optional: Run the `nvidia-smi` command in the driver container to verify that the GPU has been configured. + + ```console + $ kubectl exec -it -n gpu-operator ds/nvidia-driver-daemonset -- nvidia-smi -L + ``` + + *Example Output* + +### Example: Reconfiguring MIG Profiles + +MIG Manager supports dynamic reconfiguration of the MIG geometry. +The following steps show how to update a GPU on a node to the `3g.40gb` profile with the single MIG strategy. + +1. Label the node with the profile: + + ```console + $ kubectl label nodes nvidia.com/mig.config=all-3g.40gb --overwrite + ``` + +1. 
Optional: Monitor the MIG Manager logs to confirm the new MIG geometry is applied: + + ```console + $ kubectl logs -n gpu-operator -l app=nvidia-mig-manager -c nvidia-mig-manager + ``` + + *Example Output* + + ```console + Applying the selected MIG config to the node + time="2024-05-14T18:31:26Z" level=debug msg="Parsing config file..." + time="2024-05-14T18:31:26Z" level=debug msg="Selecting specific MIG config..." + time="2024-05-14T18:31:26Z" level=debug msg="Running apply-start hook" + time="2024-05-14T18:31:26Z" level=debug msg="Checking current MIG mode..." + time="2024-05-14T18:31:26Z" level=debug msg="Walking MigConfig for (devices=all)" + time="2024-05-14T18:31:26Z" level=debug msg=" GPU 0: 0x233010DE" + time="2024-05-14T18:31:26Z" level=debug msg=" Asserting MIG mode: Enabled" + time="2024-05-14T18:31:26Z" level=debug msg=" MIG capable: true\n" + time="2024-05-14T18:31:26Z" level=debug msg=" Current MIG mode: Enabled" + time="2024-05-14T18:31:26Z" level=debug msg="Checking current MIG device configuration..." + time="2024-05-14T18:31:26Z" level=debug msg="Walking MigConfig for (devices=all)" + time="2024-05-14T18:31:26Z" level=debug msg=" GPU 0: 0x233010DE" + time="2024-05-14T18:31:26Z" level=debug msg=" Asserting MIG config: map[3g.40gb:2]" + time="2024-05-14T18:31:26Z" level=debug msg="Running pre-apply-config hook" + time="2024-05-14T18:31:26Z" level=debug msg="Applying MIG device configuration..." 
+ time="2024-05-14T18:31:26Z" level=debug msg="Walking MigConfig for (devices=all)" + time="2024-05-14T18:31:26Z" level=debug msg=" GPU 0: 0x233010DE" + time="2024-05-14T18:31:26Z" level=debug msg=" MIG capable: true\n" + time="2024-05-14T18:31:26Z" level=debug msg=" Updating MIG config: map[3g.40gb:2]" + MIG configuration applied successfully + time="2024-05-14T18:31:27Z" level=debug msg="Running apply-exit hook" + Restarting validator pod to re-run all validations + pod "nvidia-operator-validator-kmncw" deleted + Restarting all GPU clients previously shutdown in Kubernetes by reenabling their component-specific nodeSelector labels + node/node-name labeled + Changing the 'nvidia.com/mig.config.state' node label to 'success' + ``` + +1. Optional: Display the node labels to confirm the GPU count (`2`), slices (`3`), and profile are set: + + ```console + $ kubectl get node -o=jsonpath='{.metadata.labels}' | jq . + ``` + + *Partial Output* + + ```json + "nvidia.com/gpu.count": "2", + "nvidia.com/gpu.present": "true", + "nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3-MIG-3g.40gb", + "nvidia.com/gpu.replicas": "1", + "nvidia.com/gpu.sharing-strategy": "none", + "nvidia.com/gpu.slices.ci": "3", + "nvidia.com/gpu.slices.gi": "3", + "nvidia.com/mig.capable": "true", + "nvidia.com/mig.config": "all-3g.40gb", + "nvidia.com/mig.config.state": "success", + "nvidia.com/mig.strategy": "single", + "nvidia.com/mps.capable": "false" + } + ``` + +### Example: Custom MIG Configuration During Installation + +If you need to use custom profiles, you can create a custom ConfigMap during installation by passing in a name and data for the ConfigMap with the Helm command. + +The MIG Manager daemonset is configured to use this ConfigMap instead of the auto-generated one. + +In your values.yaml file, set `migManager.config.create` to `true`, set `migManager.config.name`, and add the ConfigMap data under `migManager.config.data`, for example: + +1. 
In your `values.yaml` file, add the data for the ConfigMap, like the following example: + +**Note:** + +Custom ConfigMaps must contain a key named "config.yaml" +1. Install or upgrade the GPU Operator with this values file so the chart creates the ConfigMap: + + ```console + $ helm upgrade --install gpu-operator -n gpu-operator --create-namespace \ + nvidia/gpu-operator --version=${version} \ + -f values.yaml + ``` + +1. If the custom configuration specifies more than one instance profile, set the strategy to `mixed`: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy \ + --type='json' \ + -p='[{"op":"replace", "path":"/spec/mig/strategy", "value":"mixed"}]' + ``` + +1. Label the nodes with the profile to configure: + + ```console + $ kubectl label nodes nvidia.com/mig.config=custom-mig --overwrite + ``` + +1. Optional: Monitor the MIG Manager logs to confirm the new MIG geometry is applied: + + ```console + $ kubectl logs -n gpu-operator -l app=nvidia-mig-manager -c nvidia-mig-manager + ``` + + *Example Output* + + ```console + Applying the selected MIG config to the node + time="2024-05-15T13:40:08Z" level=debug msg="Parsing config file..." + time="2024-05-15T13:40:08Z" level=debug msg="Selecting specific MIG config..." + time="2024-05-15T13:40:08Z" level=debug msg="Running apply-start hook" + time="2024-05-15T13:40:08Z" level=debug msg="Checking current MIG mode..." + time="2024-05-15T13:40:08Z" level=debug msg="Walking MigConfig for (devices=all)" + time="2024-05-15T13:40:08Z" level=debug msg=" GPU 0: 0x233010DE" + time="2024-05-15T13:40:08Z" level=debug msg=" Asserting MIG mode: Enabled" + time="2024-05-15T13:40:08Z" level=debug msg=" MIG capable: true\n" + time="2024-05-15T13:40:08Z" level=debug msg=" Current MIG mode: Enabled" + time="2024-05-15T13:40:08Z" level=debug msg="Checking current MIG device configuration..." 
+ time="2024-05-15T13:40:08Z" level=debug msg="Walking MigConfig for (devices=all)" + time="2024-05-15T13:40:08Z" level=debug msg=" GPU 0: 0x233010DE" + time="2024-05-15T13:40:08Z" level=debug msg=" Asserting MIG config: map[1g.10gb:5 2g.20gb:1]" + time="2024-05-15T13:40:08Z" level=debug msg="Running pre-apply-config hook" + time="2024-05-15T13:40:08Z" level=debug msg="Applying MIG device configuration..." + time="2024-05-15T13:40:08Z" level=debug msg="Walking MigConfig for (devices=all)" + time="2024-05-15T13:40:08Z" level=debug msg=" GPU 0: 0x233010DE" + time="2024-05-15T13:40:08Z" level=debug msg=" MIG capable: true\n" + time="2024-05-15T13:40:08Z" level=debug msg=" Updating MIG config: map[1g.10gb:5 2g.20gb:1]" + time="2024-05-15T13:40:09Z" level=debug msg="Running apply-exit hook" + MIG configuration applied successfully + ``` + +### Example: Custom MIG Configuration + +You can create and apply a ConfigMap yourself if the default profiles do not meet your needs. + +1. Create a file, such as `custom-mig-config.yaml`, with contents like the following example: + +**Note:** + +Custom ConfigMaps must contain a key named "config.yaml" +1. Apply the manifest: + + ```console + $ kubectl apply -n gpu-operator -f custom-mig-config.yaml + ``` + +1. If the custom configuration specifies more than one instance profile, set the strategy to `mixed`: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy \ + --type='json' \ + -p='[{"op":"replace", "path":"/spec/mig/strategy", "value":"mixed"}]' + ``` + +1. Patch the cluster policy so MIG Manager uses the custom ConfigMap: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy \ + --type='json' \ + -p='[{"op":"replace", "path":"/spec/migManager/config/name", "value":"custom-mig-config"}]' + ``` + +1. 
Label the nodes with the profile to configure: + + ```console + $ kubectl label nodes <node-name> nvidia.com/mig.config=five-1g-one-2g --overwrite + ``` + +## Step 3: Verification: Running Sample CUDA Workloads + +## Step 4: Disabling MIG + +You can disable MIG on a node by setting the `nvidia.com/mig.config` label to `all-disabled`: + +```console +$ kubectl label nodes <node-name> nvidia.com/mig.config=all-disabled --overwrite +``` + +## Step 5: MIG Manager with Preinstalled Drivers + +MIG Manager supports preinstalled drivers. +Information in the preceding sections still applies; however, there are a few additional details to consider. + +### Install + +During GPU Operator installation, `driver.enabled=false` must be set. The following options +can be used to install the GPU Operator: + +```console +$ helm install gpu-operator \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set driver.enabled=false +``` + +### Managing Host GPU Clients + +MIG Manager stops all operator-managed pods that have access to GPUs when applying a MIG reconfiguration. +When drivers are preinstalled, there can be GPU clients on the host that also need to be stopped. + +When drivers are preinstalled, MIG Manager attempts to stop and restart a list of systemd services on the host across a MIG reconfiguration. +The list of services is specified in the `default-gpu-clients` ConfigMap. + +The following sample GPU clients file, `clients.yaml`, is used to create the `default-gpu-clients` ConfigMap: + +```yaml +version: v1 +systemd-services: + - nvsm.service + - nvsm-mqtt.service + - nvsm-core.service + - nvsm-api-gateway.service + - nvsm-notifier.service + - nv_peer_mem.service + - nvidia-dcgm.service + - dcgm.service + - dcgm-exporter.service +``` + +You can modify the list by editing the ConfigMap after installation. +Alternatively, you can create a custom ConfigMap for use by MIG Manager by performing the following steps: + +1.
Create the `gpu-operator` namespace: + + ```console + $ kubectl create namespace gpu-operator + ``` + +1. Create a `ConfigMap` containing the custom `clients.yaml` file with a list of GPU clients: + + ```console + $ kubectl create configmap -n gpu-operator gpu-clients --from-file=clients.yaml + ``` + +1. Install the GPU Operator: + + ```console + $ helm install gpu-operator \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set migManager.gpuClientsConfig.name=gpu-clients \ + --set driver.enabled=false + ``` + +## Step 6: Architecture + +MIG Manager is designed as a controller within Kubernetes. It watches for changes to the +`nvidia.com/mig.config` label on the node and then applies the user-requested MIG configuration. +When the label changes, MIG Manager first stops all GPU pods, including device plugin, GPU feature discovery, +and DCGM exporter. +MIG Manager then stops all host GPU clients listed in the `clients.yaml` ConfigMap if drivers are preinstalled. +Finally, it applies the MIG reconfiguration and restarts the GPU pods and, if applicable, the host GPU clients. +The MIG reconfiguration can also involve rebooting a node if a reboot is required to enable MIG mode. + +The default MIG profiles are specified in the `<node-name>-mig-config` ConfigMap. +This ConfigMap is auto-generated by the MIG Manager for each MIG-capable node and contains the standard MIG profiles for the available GPUs on the node. +You can also configure the Operator to use a custom ConfigMap instead of the auto-generated one. + +You can set the `nvidia.com/mig.config` label to one of these profiles to trigger a reconfiguration of the MIG geometry. + +MIG Manager uses the [mig-parted](https://github.com/NVIDIA/mig-parted) tool to apply the configuration +changes to the GPU, including enabling MIG mode, with a node reboot as required by some scenarios.
+ +```mermaid +flowchart + +subgraph mig[MIG Manager] + direction TB + A[Controller] <--> B[MIG-Parted] +end + +A -- on change --> C + +subgraph recon[Reconfiguration] + C["Config is Pending + or Rebooting"] + --> + D["Stop Operator Pods"] + --> + E["Enable MIG Mode and + Reboot if Required"] + --> + F["Use mig-parted to + Configure MIG Geometry"] + --> + G["Restart Operator Pods"] +end + +H["Set mig.config label + to Success"] +I["Set mig.config label + to Failed"] + +G --> H +G -- on failure --> I +``` diff --git a/gpu-operator/.agents/skills/gpu-operator-nvidia-amazon/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-nvidia-amazon/SKILL.md new file mode 100644 index 000000000..b1688ab5f --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-nvidia-amazon/SKILL.md @@ -0,0 +1,167 @@ +--- +name: "gpu-operator-nvidia-amazon" +description: "Guides users through installing and configuring the NVIDIA GPU Operator on Amazon EKS. Use when deploying GPU workloads on AWS or troubleshooting EKS-specific GPU Operator setup. Trigger keywords - NVIDIA GPU Operator, Amazon EKS, AWS, Kubernetes, installation." +--- + + + + +# NVIDIA GPU Operator with Amazon EKS + +## Step 1: Approaches for Working with Amazon EKS + +You can approach running workloads in Amazon EKS with NVIDIA GPUs in at least two ways. + +### Default EKS configuration without the GPU Operator + +By default, you can run Amazon EKS optimized Amazon Linux AMIs on instance types +that support NVIDIA GPUs. + +Using the default configuration has the following limitations: + +* The pre-installed NVIDIA GPU driver version and NVIDIA container runtime version + lags the release schedule from NVIDIA. +* You must deploy the NVIDIA device plugin and you assume responsibility for + upgrading the plugin. 
+ +If these limitations are acceptable to you, refer to +[Amazon EKS optimized Amazon Linux AMIs](https://docs.aws.amazon.com/eks/latest/userguide/eks-optimized-ami.html) +in the Amazon EKS documentation for information about configuring your cluster. +You do not need to install the NVIDIA GPU Operator. + +### EKS Node Group with the GPU Operator + +To overcome the limitations with the first approach, you can create a node group for your cluster. +Configure the node group with instance types that have +NVIDIA GPUs and use an AMI with an operating system that the GPU Operator supports. +The Operator does not support a mix of some nodes running Amazon Linux 2 and others +running a supported operating system in the same cluster. + +In this case, the Operator manages the lifecycle of all the operands, including +the NVIDIA GPU driver containers. +This approach enables you to run the most recent NVIDIA GPU drivers and use the +Operator to manage upgrades of the driver and other software components such as +the NVIDIA device plugin, NVIDIA Container Toolkit, and NVIDIA MIG Manager. + +This approach provides the most up-to-date software and the Operator reduces +the administrative overhead. + +### EKS Node Groups in Brief and Client Applications + +When you configure an Amazon EKS node group, you can configure +[self-managed nodes](https://docs.aws.amazon.com/eks/latest/userguide/worker.html) +or [managed node groups](https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html). + +Amazon EKS supports many clients for creating a node group. + +For self-managed nodes, you can use the `eksctl` CLI or Amazon Management Console. +Refer to the preceding URL for concepts and procedures. + +For managed node groups, you can use the Amazon Management Console. +The Amazon EKS documentation describes how to use the `eksctl` CLI, +but the CLI does not support operating systems other than Amazon Linux 2 and +the Operator does not support that operating system.
+Refer to the preceding URL for concepts and procedures. + +Terraform supports creating self-managed and managed node groups. +Refer to +[AWS EKS Terraform module](https://registry.terraform.io/modules/terraform-aws-modules/eks/aws/latest) +in the Terraform Registry for more information. + +## About Using the Operator with Amazon EKS + +To use the NVIDIA GPU Operator with Amazon Elastic Kubernetes Service (EKS) +without any limitations, you perform the following high-level actions: + +* Create a self-managed or managed node group with instance types that have NVIDIA GPUs. + + Refer to the following resources in the Amazon EC2 documentation to help you choose + the instance type to meet your needs: + + * Table of accelerated computing + [instance types](https://aws.amazon.com/ec2/instance-types/accelerated-computing/) + for information about GPU model and count, RAM, and storage. + + * [Maximum IP addresses per network interface](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AvailableIpPerENI.html) + for accelerated computing instance types. + Make sure the instance type supports enough IP addresses for your workload. + For example, the `g4dn.xlarge` instance type supports `29` IP addresses for pods on the node. + +* Use an Amazon EKS optimized Amazon Machine Image (AMI) with a supported operating system (use the `gpu-operator-references` skill) on the nodes in the node group. + + AMI support is specific to an AWS region and Kubernetes version. + See https://cloud-images.ubuntu.com/aws-eks/ for the AMI values such as `ami-00687acd80b7a620a`. + +* Use your preferred client application to create the node group. + +## Step 2: Example: Create a Self-Managed Node Group with eksctl + +### Prerequisites + +* You have access to the Amazon Management Console or you installed and configured the AWS CLI.
+ Refer to + [Installing or updating to the latest version of the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) + and [Configuring the AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) + in the AWS CLI documentation. +* You installed the `eksctl` CLI if you prefer it as your client application. + The CLI is available from https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html#eksctl-install-update. +* You have the AMI value from https://cloud-images.ubuntu.com/aws-eks/. +* You have the EC2 instance type to use for your nodes. + +### Procedure + +The following steps show how to create an Amazon EKS cluster with the `eksctl` CLI. +The steps create a self-managed node group that uses an Amazon EKS optimized AMI. + +1. Create a file, such as `cluster-config.yaml`, with contents like the following example: + + Replace the values for the cluster name, Kubernetes version, and so on. + To resolve the environment variables in the override bootstrap command, you must source the bootstrap helper script. + + **Tip:** + + The default volume size for each node is 20 GB. + In many cases, containers with frameworks for AI/ML workloads are often very large. + The sample YAML file specifies a 100 GB volume to ensure enough local disk space for containers. +1. Create the Amazon EKS cluster with the node group: + + ```console + $ eksctl create cluster -f cluster-config.yaml + ``` + + Creating the cluster requires several minutes. + + *Example Output* + + ```output + 2022-08-19 17:51:04 [i] eksctl version 0.105.0 + 2022-08-19 17:51:04 [i] using region us-west-2 + 2022-08-19 17:51:04 [i] setting availability zones to [us-west-2d us-west-2c us-west-2a] + 2022-08-19 17:51:04 [i] subnets for us-west-2d - public:192.168.0.0/19 private:192.168.96.0/19 + ... + [✓] EKS cluster "demo-cluster" in "us-west-2" region is ready + ``` + +1. 
Optional: View the cluster name: + + ```console + $ eksctl get cluster + ``` + + *Example Output* + + ```output + NAME REGION EKSCTL CREATED + demo-cluster us-west-2 True + ``` + +## Step 3: Related Information + +* The preceding procedure is derived from + [Getting started with Amazon EKS - eksctl](https://docs.aws.amazon.com/eks/latest/userguide/getting-started-eksctl.html) + in the Amazon EKS documentation. +* If you have an existing Amazon EKS cluster, you can refer to + [Launching self-managed Amazon Linux nodes](https://docs.aws.amazon.com/eks/latest/userguide/launch-workers.html) + in the Amazon EKS documentation to add a self-managed node group to your cluster. + However, all nodes in the cluster must run Ubuntu 20.04 or 22.04. + This documentation includes steps for using the AWS Management Console. diff --git a/gpu-operator/.agents/skills/gpu-operator-nvidia-azure/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-nvidia-azure/SKILL.md new file mode 100644 index 000000000..08d7a39d7 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-nvidia-azure/SKILL.md @@ -0,0 +1,117 @@ +--- +name: "gpu-operator-nvidia-azure" +description: "Guides users through installing and configuring the NVIDIA GPU Operator on Azure AKS. Use when deploying GPU workloads on Azure or troubleshooting AKS-specific GPU Operator setup. Trigger keywords - NVIDIA GPU Operator, Azure AKS, Microsoft Azure, Kubernetes." +--- + + + + +# NVIDIA GPU Operator with Azure Kubernetes Service + +## Step 1: Approaches for Working with Azure AKS + +### Create AKS Cluster with a Node Pool to Skip GPU Driver installation + +Azure Kubernetes Service has a preview feature that enables a `--skip-gpu-driver-install` +command-line argument to the `az aks nodepool add` command. +This argument prevents installing +the NVIDIA GPU Driver in the stock Ubuntu operating system. 
+ +This approach enables you to take advantage of the lifecycle management +that the NVIDIA GPU Operator provides for managing your cluster. + +```console +$ az aks nodepool add --resource-group <resource-group-name> --name gpunodes --cluster-name <cluster-name> \ + --node-count <node-count> \ + --skip-gpu-driver-install \ + ... +``` + +When you follow this approach, you can install the Operator without any special +considerations or arguments. +Refer to the NVIDIA GPU Operator installation instructions. + +For more information about this feature, see +[Skip GPU driver installation](https://learn.microsoft.com/en-us/azure/aks/use-nvidia-gpu?source=recommendations&tabs=add-ubuntu-gpu-node-pool#skip-gpu-driver-installation) +in the Azure Kubernetes Service documentation. + +### Default AKS configuration without the GPU Operator + +By default, you can run Azure AKS images on GPU-enabled virtual machines with NVIDIA GPUs, +and not use the NVIDIA GPU Operator. + +AKS images include a preinstalled NVIDIA GPU Driver and preinstalled NVIDIA Container Toolkit. + +Using the default configuration, without the Operator, has the following limitations: + +* Metrics are not collected or reported with NVIDIA DCGM Exporter. +* Validating the container runtime is manual rather than automatic with the Operator. +* Multi-Instance GPU (MIG) profiles must be set when you create the node pool and you + cannot change the profile at run time. + +If these limitations are acceptable to you, refer to +[Use GPUs for compute-intensive workloads on Azure Kubernetes Services](https://learn.microsoft.com/en-us/azure/aks/gpu-cluster) +in the Microsoft Azure product documentation for information about configuring your cluster. + +### GPU Operator with Preinstalled Driver and Container Toolkit + +The images that are available in AKS always include a preinstalled NVIDIA GPU driver +and a preinstalled NVIDIA Container Toolkit. +These images reduce the primary benefit of installing the Operator so that it can +manage the lifecycle of these software components and others.
+ +However, using the Operator can overcome the limitations identified in the preceding section. + +## Step 2: Installing the Operator for Preinstalled Driver and Toolkit + +After you start your Azure AKS cluster with an image that includes a preinstalled NVIDIA GPU Driver +and NVIDIA Container Toolkit, you are ready to install the NVIDIA GPU Operator. + +When you install the Operator, you must prevent the Operator from automatically +deploying NVIDIA Driver Containers and the NVIDIA Container Toolkit. + +1. Add the NVIDIA Helm repository: + + ```console + $ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \ + && helm repo update + ``` + +1. Install the Operator without the driver containers and toolkit: + + ```console + $ helm install gpu-operator nvidia/gpu-operator \ + -n gpu-operator --create-namespace \ + --version=${version} \ + --set driver.enabled=false \ + --set toolkit.enabled=false \ + --set operator.runtimeClass=nvidia-container-runtime + ``` + + Refer to Common Chart Customization Options for more information about installation options. + + *Example Output* + + ```output + NAME: gpu-operator + LAST DEPLOYED: Fri May 5 15:30:05 2023 + NAMESPACE: gpu-operator + STATUS: deployed + REVISION: 1 + TEST SUITE: None + ``` + + The Operator requires several minutes to install. + +1. 
Confirm that the Operator is installed and ran the CUDA validation container to completion: + + ```console + $ kubectl get pods -n gpu-operator -l app=nvidia-cuda-validator + ``` + + *Example Output* + + ```output + NAME READY STATUS RESTARTS AGE + nvidia-cuda-validator-bpvkt 0/1 Completed 0 3m56s + ``` diff --git a/gpu-operator/.agents/skills/gpu-operator-nvidia-dra/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-nvidia-dra/SKILL.md new file mode 100644 index 000000000..7ec843abf --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-nvidia-dra/SKILL.md @@ -0,0 +1,308 @@ +--- +name: "gpu-operator-nvidia-dra" +description: "Explains how to install and use the NVIDIA DRA Driver for GPUs. Use when users ask about Dynamic Resource Allocation, DRA installation, or GPU resource claims. Trigger keywords - NVIDIA GPU Operator, DRA, Dynamic Resource Allocation, Kubernetes, installation." +--- + + + + +# Prerequisites + +**Tip:** + +# NVIDIA DRA Driver for GPUs + +Dynamic Resource Allocation (DRA) is a Kubernetes concept for flexibly requesting, configuring, and sharing specialized devices like GPUs. +DRA puts device configuration and scheduling into the hands of device vendors through drivers such as the DRA Driver for GPUs. +This page outlines how to install the NVIDIA DRA Driver for GPUs v25.12.0 and later with the NVIDIA GPU Operator. + +Before using the DRA Driver for GPUs, it is recommended that you are familiar with the following concepts: + +* [Upstream Kubernetes DRA documentation](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/). +* [DRA Driver repository documentation](https://github.com/NVIDIA/k8s-dra-driver-gpu) + +## Overview + +With NVIDIA's DRA Driver for GPUs, your Kubernetes workload can allocate and consume the following two types of resources: + +* GPU allocation: for controlled sharing and dynamic reconfiguration of GPUs. 
This functionality is a replacement for the traditional GPU allocation method used by the NVIDIA Kubernetes Device Plugin. +* ComputeDomains: An abstraction for robust and secure [Multi-Node NVLink (MNNVL)](https://docs.nvidia.com/multi-node-nvlink-systems/index.html) for NVIDIA GB200 and similar systems. + +You can use the NVIDIA DRA Driver for GPUs with the NVIDIA GPU Operator to deploy and manage your GPUs and ComputeDomains. + +### Known Issues + +* There is a known issue where the NVIDIA Driver Manager is not aware of the DRA driver kubelet plugin, and will not correctly evict it on pod restarts. + You must label the nodes you plan to use with DRA GPU allocation and pass the node label in the GPU Operator Helm command in the `driver.manager.env` flag. + This enables the NVIDIA Driver Manager to evict the GPU kubelet plugin correctly on driver container upgrades. +* For A100 GPUs, the MIG manager does not automatically evict the DRA kubelet plugin during MIG configuration changes. + If the DRA kubelet plugin is deployed before a MIG change, then you must manually restart the DRA kubelet plugin. + +## Step 1: Install the NVIDIA GPU Operator + +### GPU Allocation + +1. Create a node selector label on all the nodes in your cluster that support GPU allocation through DRA: + + ```console + kubectl label node $HOSTNAME nvidia.com/dra-kubelet-plugin=true + ``` + +2. Add the Helm repo: + + ```console + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \ + && helm repo update + ``` + +3. 
Install the GPU Operator with the NVIDIA Kubernetes Device Plugin disabled: + + ```console + helm upgrade --install gpu-operator nvidia/gpu-operator \ + --version=${version} \ + --create-namespace \ + --namespace gpu-operator \ + --set devicePlugin.enabled=false \ + --set driver.manager.env[0].name=NODE_LABEL_FOR_GPU_POD_EVICTION \ + --set driver.manager.env[0].value="nvidia.com/dra-kubelet-plugin" + ``` + + Make sure that the `NODE_LABEL_FOR_GPU_POD_EVICTION` value in `driver.manager.env` matches the node selector label that you use when you install the DRA driver Helm chart in Step 2. +### ComputeDomain + +1. Add the Helm repo: + +```console +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \ +&& helm repo update +``` + +2. Install the GPU Operator: + +```console +helm upgrade --install gpu-operator nvidia/gpu-operator \ + --version=${version} \ + --create-namespace \ + --namespace gpu-operator +``` + +Refer to the [GPU Operator installation guide](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-install.html) for additional configuration options when installing the GPU Operator. + +If you are planning to use MIG devices, refer to the [NVIDIA GPU Operator MIG documentation](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html) to configure your cluster for MIG support. + +## Step 2: Install DRA Driver for GPUs + +**Note:** + +The `gpuResourcesEnabledOverride=true` is an additional flag that is required to fully enable GPU allocation support. +Include it in the Helm command if you want to enable GPU allocation support. + +If you want to disable either functionality: + +* To disable GPU allocation support, include `--set resources.gpus.enabled=false` in the Helm command. +* To disable ComputeDomain support, include `--set resources.computeDomains.enabled=false` in the Helm command. +**Note:** + +The `nvidiaDriverRoot` flag sets the root directory for the NVIDIA GPU driver.
+The default value is `/`, which is the typical value for drivers installed directly on the host. +If you are using GPU Operator managed drivers (default), the drivers are installed to `/run/nvidia/driver` by default. +If you are using [pre-installed drivers](https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/getting-started.html#pre-installed-nvidia-gpu-drivers), you can remove the `nvidiaDriverRoot` flag or set it to `/` in the command above. +### GPU Allocation + +1. Create a custom `values.yaml` file for installing the DRA driver helm chart. + + ### values.yaml file + + Specifies the node selector label for nodes that will support GPU allocation through the DRA Driver. + + ```yaml + image: + pullPolicy: IfNotPresent + kubeletPlugin: + nodeSelector: + nvidia.com/dra-kubelet-plugin: "true" + ``` + + ### GKE values.yaml file + + Google Kubernetes Engine requires some specific values to be set in the `values.yaml` file, including the driver root on the host in `nvidiaDriverRoot` as well as the node selector label for nodes that will support GPU allocation through the DRA Driver. + + ```yaml + # Specify the driver root on the host in nvidiaDriverRoot. + # "/home/kubernetes/bin/nvidia" is the default driver root on GKE. + nvidiaDriverRoot: "/home/kubernetes/bin/nvidia" + + controller: + priorityClassName: "" + affinity: null + image: + pullPolicy: IfNotPresent + kubeletPlugin: + priorityClassName: "" + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists + nodeSelector: + nvidia.com/dra-kubelet-plugin: "true" + ``` + +2. Add the Helm repo: + + ```console + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \ + && helm repo update + ``` + +3. 
Install the DRA driver: + + ### install command + + ```console + helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \ + --version="${dra_version}" \ + --namespace nvidia-dra-driver-gpu \ + --create-namespace \ + --set nvidiaDriverRoot=/run/nvidia/driver \ + --set gpuResourcesEnabledOverride=true \ + -f values.yaml + ``` + + ### GKE install command + + ```console + helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \ + --version="${dra_version}" \ + --namespace nvidia-dra-driver-gpu \ + --create-namespace \ + --set gpuResourcesEnabledOverride=true \ + -f values.yaml + ``` + +### ComputeDomain + +1. Add the NVIDIA NGC Catalog's Helm chart repository: + + ```console + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update + ``` + +2. Install the DRA driver. + + Example for Operator-provided GPU driver: + + ```console + helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \ + --version="${dra_version}" \ + --create-namespace \ + --namespace nvidia-dra-driver-gpu \ + --set resources.gpus.enabled=false \ + --set nvidiaDriverRoot=/run/nvidia/driver + ``` + + Example for host-provided GPU driver: + + ```console + helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \ + --version="${dra_version}" \ + --create-namespace \ + --namespace nvidia-dra-driver-gpu \ + --set resources.gpus.enabled=false + ``` + +## Step 3: Validate Installation + +1. Confirm that the DRA driver components are running: + + ```console + kubectl get pods -n nvidia-dra-driver-gpu + ``` + + *Example Output* + + ```output + NAME READY STATUS RESTARTS AGE + nvidia-dra-driver-gpu-controller-67cb99d84b-5q7kj 1/1 Running 0 7m26s + nvidia-dra-driver-gpu-kubelet-plugin-h5xsn 1/1 Running 0 7m27s + ``` + +2. 
Verify that GPU DeviceClasses are available: + + ```console + kubectl get deviceclass + ``` + + *Example Output* + + ```output + NAME AGE + compute-domain-daemon.nvidia.com 55s + compute-domain-default-channel.nvidia.com 55s + gpu.nvidia.com 55s + mig.nvidia.com 55s + ``` + +The `compute-domain-daemon.nvidia.com` and `compute-domain-default-channel.nvidia.com` DeviceClasses are installed when ComputeDomain support is enabled. +The `gpu.nvidia.com` and `mig.nvidia.com` DeviceClasses are installed when GPU allocation support is enabled. + +Additional validation steps are available in the DRA Driver repository documentation: + +* [Validate setup for ComputeDomain allocation](https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Validate-setup-for-ComputeDomain-allocation) +* [Validate setup for GPU allocation](https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Validate-setup-for-GPU-allocation) + +## Step 4: Enable Health Checks + +The NVIDIA DRA driver supports GPU health monitoring using the [NVIDIA Management Library (NVML)](https://developer.nvidia.com/management-library-nvml). +This feature uses NVML to check for [GPU XID errors](https://docs.nvidia.com/deploy/xid-errors/introduction.html) and determines if a GPU or MIG device is functioning properly. + +Health checking is managed by the `NVMLDeviceHealthCheck` feature gate. +This is currently an alpha feature and is disabled by default. + +When enabled, the DRA Driver for GPUs continuously monitors GPUs for XID errors and assigns health statuses: +* Healthy - GPU is functioning normally. The GPU may have a non-critical XID error but is still available for workloads. +* Unhealthy - GPU has a critical XID error and is not suitable for workloads. 
+ +To enable GPU health monitoring, deploy the DRA driver with the NVMLDeviceHealthCheck feature gate: + +```console +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update +helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \ + --namespace nvidia-dra-driver-gpu \ + --set gpuResourcesEnabledOverride=true \ + --set featureGates.NVMLDeviceHealthCheck=true +``` + +**Note:** + +Unhealthy GPUs will not appear in the ResourceSlice list. After the device recovers and is marked healthy again, you must restart the DRA Driver for the device to be added back into the available resources pool. +After enabling health checks, you can monitor health status in the kubelet plugin logs. + +1. Check kubelet plugin logs. + Health status changes are logged in the kubelet plugin container. Run `kubectl get pods -n nvidia-dra-driver-gpu` and find the `nvidia-dra-driver-gpu-kubelet-plugin-<pod-suffix>` pod name. Replace `<pod-suffix>` with the suffix from your actual pod name. + + ```console + kubectl logs nvidia-dra-driver-gpu-kubelet-plugin-<pod-suffix> \ + -n nvidia-dra-driver-gpu \ + -c gpus + ``` + +2. List all ResourceSlices. + View all ResourceSlices in the cluster to see which devices are available: + + ```console + kubectl get resourceslice + ``` + +3. Inspect a specific ResourceSlice. + View detailed information about a specific resource slice.
Healthy devices are listed in the resource slice, while unhealthy devices are not listed: + + ```console + kubectl get resourceslice -o yaml + ``` + +## Step 5: Additional Documentation + +Refer to the [DRA Driver for GPUs repository](https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki) for additional documentation, including + +* [Upgrade Guide](https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Installation#upgrading) +* [Troubleshooting Guide](https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Troubleshooting) diff --git a/gpu-operator/.agents/skills/gpu-operator-nvidia-driver/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-nvidia-driver/SKILL.md new file mode 100644 index 000000000..e71d298b2 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-nvidia-driver/SKILL.md @@ -0,0 +1,296 @@ +--- +name: "gpu-operator-nvidia-driver" +description: "Explains how to configure NVIDIA GPU Driver custom resources for driver lifecycle management. Use when users need custom driver configuration or mixed operating system support. Trigger keywords - NVIDIA GPU Operator, GPU driver, custom resource, driver configuration." +--- + + + + +# NVIDIA GPU Driver Custom Resource Definition + +## Overview of the GPU Driver Custom Resource Definition + +You can create one or more instances of an NVIDIA driver (`NVIDIADriver`) custom resource +to specify the NVIDIA GPU driver type and driver version to configure on specific nodes. +You can specify labels in the node selector field to control which NVIDIA driver configuration is applied to specific nodes. + +### Limitations + +* This feature is recommended for new cluster installations only. + Upgrades from ClusterPolicy managed drivers to NVIDIA driver custom resource managed drivers are not supported. + Switching from ClusterPolicy to the NVIDIA driver custom resource will cause all existing driver pods to be terminated immediately and redeployed using the new NVIDIADriver configuration. 
+* You must either use the default NVIDIA driver custom resource that the Helm chart creates or create and manage your own custom NVIDIA driver custom resource. +* You can't use ClusterPolicy and the NVIDIA driver custom resource at the same time. You can only use one or the other in a cluster. + +### Comparison: Managing the Driver with CRD versus the Cluster Policy + +Before the introduction of the NVIDIA GPU Driver custom resource definition, you managed the driver by modifying +the driver field and subfields of the cluster policy custom resource definition. + +The key differences between the two approaches are summarized in the following table. + +| Cluster Policy CRD | NVIDIA Driver CRD | +| --- | --- | +| Supports a single driver type and version on all nodes. | Supports multiple driver types and versions on different nodes. | +| Does not support multiple operating system versions. This limitation complicates performing an operating system upgrade on your nodes. | Supports multiple operating system versions on nodes. | + +### Driver Daemon Sets + +The NVIDIA GPU Operator starts a driver daemon set for each NVIDIA driver custom resource and each operating system version. + +For example, if your cluster has one NVIDIA driver custom resource that specifies a 580 branch GPU driver and some +worker nodes run Ubuntu 20.04 and other worker nodes run Ubuntu 22.04, the Operator starts two driver daemon sets. +One daemon set configures the GPU driver on the Ubuntu 20.04 nodes and the other configures the driver on the Ubuntu 22.04 nodes. +All the nodes run the same 580 branch GPU driver. + +![](graphics/nvd-basics.svg) +If you choose to use precompiled driver containers, the Operator starts a driver daemon set for each Linux kernel version. + +For example, if some nodes run Ubuntu 22.04 and the 5.15.0-84-generic kernel, and other nodes run the 5.15.0-78-generic kernel, +then the Operator starts two daemon sets.
+ +### About the Default NVIDIA Driver Custom Resource + +By default, the Helm chart configures a default NVIDIA driver custom resource during installation. +This custom resource does not include a node selector and as a result, the custom resource applies to every node in your cluster +that has an NVIDIA GPU. +The Operator starts a driver daemon set and pods for each operating system version in your cluster. + +If you plan to configure your own driver custom resources to specify driver versions, types, and so on, then +you might prefer to avoid installing the default custom resource. +By preventing the installation, you can avoid node selector conflicts due to the default custom resource +matching all nodes and your custom resources matching some of the same nodes. + +To prevent configuring the default custom resource, specify the `--set driver.nvidiaDriverCRD.deployDefaultCR=false` +argument when you install the Operator with Helm. + +If the Operator is already installed with the default custom resource and you want to create your own +driver custom resources and apply them to specific nodes, delete the default custom resource. + +**Note:** + +After you delete the default custom resource, your custom resources might not reconcile +automatically due to a known issue. Refer to the v26.3.0 known issues +for the workaround. +### Feature Compatibility + +Driver type + Each NVIDIA driver custom resource specifies the driver type and is one of `gpu`, `vgpu`, or `vgpu-host-manager`. + You can run the data-center driver (`gpu`) on some nodes and the vGPU driver on other nodes. + +GPUDirect RDMA and GPUDirect Storage + Each NVIDIA driver custom resource can specify how to configure GPUDirect RDMA and GPUDirect Storage (GDS). + Refer to GPUDirect RDMA and GPUDirect Storage for the platform support and prerequisites. + +GDRCopy + Each NVIDIA driver custom resource can enable the GDRCopy sidecar container in the driver pod. 
+ +Precompiled and signed drivers + You can run the default driver type that is compiled when the driver pod starts on some nodes + and precompiled driver containers on other nodes. + The limitations and restrictions for precompiled driver containers apply (use the `gpu-operator-precompiled-drivers` skill). + +Preinstalled drivers on nodes + If a node has an NVIDIA GPU driver installed in the operating system, then no driver container runs on the node. + +Support for X86_64 and ARM64 + Each daemon set can run pods and driver containers for the X86_64 and ARM64 architectures. + Refer to the [NVIDIA GPU Driver tags](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/driver/tags) + web page to determine which driver version and operating system combinations support both architectures. + +Custom Driver Parameters + Each NVIDIA driver custom resource can specify custom kernel module parameters by using a ConfigMap. + For more information, refer to Customizing NVIDIA GPU Driver Parameters during Installation (use the `gpu-operator-custom-driver` skill). + +## About the NVIDIA Driver Custom Resource + +An instance of the NVIDIA driver custom resource represents a specific NVIDIA GPU driver type and driver version to install and manage +on nodes. + +The following table describes some of the fields in the custom resource. + +| Field | Description | Default Value | | | | +| --- | --- | --- | --- | --- | --- | +| `metadata.name` | Specifies the name of the NVIDIA driver custom resource. | None | | | | +| `annotations` | Specifies a map of key and value pairs to add as custom annotations to the driver pod. | None | | | | +| `driverType` | Specifies one of the following: `gpu` to use the NVIDIA data-center GPU driver, `vgpu` to use the NVIDIA vGPU guest driver, or `vgpu-host-manager` to use the NVIDIA vGPU Manager. | `gpu` | | | | +| `env` | Specifies environment variables to pass to the driver container. | None | | | | +| `gdrcopy.enabled` | Specifies whether to deploy the GDRCopy Driver.
When set to `true` the GDRCopy Driver image runs as a sidecar container. | `false` | | | | +| `gds.enabled` | Specifies whether to enable GPUDirect Storage. | `false` | | | | +| `image` | Specifies the driver container image name. | `driver` | | | | +| `imagePullPolicy` | Specifies the policy for kubelet to download the container image. Refer to the Kubernetes documentation for [image pull policy](https://kubernetes.io/docs/concepts/containers/images/#image-pull-policy). | Refer to the Kubernetes documentation. | | | | +| `imagePullSecrets` | Specifies the credentials to provide to the registry if the registry is secured. | None | | | | +| `kernelModuleType` | Specifies the type of the NVIDIA GPU Kernel modules to use. Valid values are `auto` (default), `proprietary`, and `open`. `Auto` means that the recommended kernel module type is chosen based on the GPU devices on the host and the driver branch used. | `auto` | | | | +| `labels` | Specifies a map of key and value pairs to add as custom labels to the driver pod. | None | | | | +| `nodeSelector` | Specifies one or more node labels to match. The driver container is scheduled to nodes that match all the labels. | None. When you do not specify this field, the driver custom resource selects all nodes. | | | | +| `priorityClassName` | Specifies the priority class for the driver pod. | `system-node-critical` | | | | +| `rdma.enabled` | Specifies whether to enable GPUDirect RDMA. | `false` | | | | +| `repository` | Specifies the container registry that contains the driver container. | `nvcr.io/nvidia` | | | | +| `useOpenKernelModules` Deprecated. | This field is deprecated as of v25.3.0 and will be ignored. Use `kernelModuleType` instead. Specifies to use the NVIDIA Open GPU Kernel modules. | `false` | | | | +| `tolerations` | Specifies a set of tolerations to apply to the driver pod. | None | | | | +| `usePrecompiled` | When set to `true`, the Operator deploys a driver container image with a precompiled driver. 
| `false` | | | | +| `version` | Specifies the GPU driver version to install. For a data-center driver, specify a value like `580.126.20`. If you set `usePrecompiled` to `true`, specify the driver branch, such as `580`. | Refer to the operator-component-matrix. | | | | + +## Step 1: Installing the NVIDIA GPU Operator + +Perform the following steps to install the GPU Operator and use the NVIDIA driver custom resources. + +1. Optional: If you want to run more than one driver type or version in the cluster, + label the worker nodes to identify the driver type and version to install on each node: + + *Example* + + ```console + $ kubectl label node --overwrite driver.version=580.126.20 + ``` + + - To use a mix of driver types, such as vGPU, label nodes for the driver type. + - To use a mix of driver versions, label the nodes for the different versions. + - To use a mix of conventional drivers and precompiled driver containers, label the nodes for the different types. + +1. Install the Operator. + + - Add the NVIDIA Helm repository: + + ```console + $ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \ + && helm repo update + ``` + + - Install the Operator and specify at least the `--set driver.nvidiaDriverCRD.enabled=true` argument: + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set driver.nvidiaDriverCRD.enabled=true + ``` + + By default, Helm configures a `default` NVIDIA driver custom resource during installation. + To prevent configuring the default custom resource, also specify `--set driver.nvidiaDriverCRD.deployDefaultCR=false`. + +1. Apply NVIDIA driver custom resources manifests to install the NVIDIA GPU driver version, type, and so on for your nodes. + Refer to the sample manifests. + +## Step 2: Sample NVIDIA Driver Manifests + +### One Driver Type and Version on All Nodes + +1. Optional: Remove previously applied node labels. + +1. 
Create a file, such as `nvd-all.yaml`, with contents like the following: + +1. Apply the manifest: + + ```console + $ kubectl apply -n gpu-operator -f nvd-all.yaml + ``` + +1. Optional: Monitor the progress: + + ```console + $ kubectl get events -n gpu-operator --sort-by='.lastTimestamp' + ``` + +### Multiple Driver Versions + +1. Label the nodes. + + - On some nodes, apply a label like the following: + + ```console + $ kubectl label node <node-name> --overwrite driver.config="gold" + ``` + + - On other nodes, apply a label like the following: + + ```console + $ kubectl label node <node-name> --overwrite driver.config="silver" + ``` + +1. Create a file, such as `nvd-driver-multiple.yaml`, with contents like the following: + +1. Apply the manifest: + + ```console + $ kubectl apply -n gpu-operator -f nvd-driver-multiple.yaml + ``` + +1. Optional: Monitor the progress: + + ```console + $ kubectl get events -n gpu-operator --sort-by='.lastTimestamp' + ``` + +### One Precompiled Driver Container on All Nodes + +1. Optional: Remove previously applied node labels. + +1. Create a file, such as `nvd-precompiled-all.yaml`, with contents like the following: + + **Tip:** + + Because the manifest does not include a `nodeSelector` field, the driver custom + resource selects all nodes in the cluster that have an NVIDIA GPU. +1. Apply the manifest: + + ```console + $ kubectl apply -n gpu-operator -f nvd-precompiled-all.yaml + ``` + +1. Optional: Monitor the progress: + + ```console + $ kubectl get events -n gpu-operator --sort-by='.lastTimestamp' + ``` + +### Precompiled Driver Container on Some Nodes + +1. Label the nodes like the following sample: + + ```console + $ kubectl label node <node-name> --overwrite driver.precompiled="true" + $ kubectl label node <node-name> --overwrite driver.version="580" + ``` + +1. Create a file, such as `nvd-precompiled-some.yaml`, with contents like the following: + +1. Apply the manifest: + + ```console + $ kubectl apply -n gpu-operator -f nvd-precompiled-some.yaml + ``` + +1.
Optional: Monitor the progress: + + ```console + $ kubectl get events -n gpu-operator --sort-by='.lastTimestamp' + ``` + +## Step 3: Upgrading the NVIDIA GPU Driver + +You can upgrade the driver version by editing or patching the NVIDIA driver custom resource. + +When you update the custom resource, the Operator performs a rolling update of the pods in the affected daemon set. + +1. Update the `version` field in the driver custom resource: + + ```console + $ kubectl patch nvidiadriver/demo-silver --type='json' \ + -p='[{"op": "replace", "path": "/spec/version", "value": "525.125.06"}]' + ``` + +1. Optional: Monitor the progress: + + ```console + $ kubectl get pods -n gpu-operator -l app.kubernetes.io/component=nvidia-driver + ``` + + *Example Output* + + ```output + NAME READY STATUS RESTARTS AGE + nvidia-gpu-driver-ubuntu20.04-788484b9bb-6zhd9 1/1 Running 0 5m1s + nvidia-gpu-driver-ubuntu22.04-8896c4bf7-7s68q 1/1 Terminating 0 37m + nvidia-gpu-driver-ubuntu22.04-8896c4bf7-jm74l 1/1 Running 0 37m + ``` + +Eventually, the Operator replaces the pods that used the previous driver version with pods that use the updated driver version. diff --git a/gpu-operator/.agents/skills/gpu-operator-nvidia-google/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-nvidia-google/SKILL.md new file mode 100644 index 000000000..3f7b89d4c --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-nvidia-google/SKILL.md @@ -0,0 +1,211 @@ +--- +name: "gpu-operator-nvidia-google" +description: "Guides users through installing and configuring the NVIDIA GPU Operator on Google GKE. Use when deploying GPU workloads on GKE or troubleshooting GKE-specific GPU Operator setup. Trigger keywords - NVIDIA GPU Operator, Google GKE, Kubernetes, installation." +--- + + + + +# Prerequisites + +* You installed and initialized the Google Cloud CLI.
+ +- name: RUNTIME_CONFIG_SOURCE + +# NVIDIA GPU Operator with Google GKE + +## About Using the Operator with Google GKE + +There are two ways to use NVIDIA GPU Operator with Google Kubernetes Engine (GKE). +You can use Google driver installer to install and manage NVIDIA GPU Driver on the nodes +or you can use the Operator and driver manager to manage the driver and other NVIDIA software components. + +The choice depends on the operating system and whether you prefer to have the Operator manage all the software components. + +| Approach | Operating System | Description | +| --- | --- | --- | +| Google Driver Installer | Container-Optimized OS or Ubuntu with containerd | The Google driver installer manages the NVIDIA GPU Driver. NVIDIA GPU Operator manages other software components. | +| NVIDIA Driver Manager | Ubuntu with containerd | NVIDIA GPU Operator manages the lifecycle and upgrades of the driver and other NVIDIA software. | + +The preceding information relates to using GKE Standard node pools. +For Autopilot Pods, using the GPU Operator is not supported, and you can refer to +[Deploy GPU workloads in Autopilot](https://cloud.google.com/kubernetes-engine/docs/how-to/autopilot-gpus). + +## Step 1: Using the Google Driver Installer + +Perform the following steps to create a GKE cluster with the `gcloud` CLI and use Google driver installer to manage the GPU driver. +You can create a node pool that uses a Container-Optimized OS node image or a Ubuntu node image. + +1. Create the node pool. + Refer to [Running GPUs in GKE Standard clusters](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#create) + in the GKE documentation. + + When you create the node pool, specify the following additional `gcloud` command-line options to disable GKE features that are not supported with the Operator: + + - `--node-labels="gke-no-default-nvidia-gpu-device-plugin=true"` + + The node label disables the GKE GPU device plugin daemon set on GPU nodes.
+ + - `--accelerator type=...,gpu-driver-version=disabled` + + This argument disables automatically installing the GPU driver on GPU nodes. + +1. Get the authentication credentials for the cluster: + + ```console + $ gcloud container clusters get-credentials demo-cluster --location us-west1 + ``` + +1. Optional: Verify that you can connect to the cluster: + + ```console + $ kubectl get nodes -o wide + ``` + +1. Create the namespace for the NVIDIA GPU Operator: + + ```console + $ kubectl create ns gpu-operator + ``` + +1. Create a file, such as `gpu-operator-quota.yaml`, with contents like the following example: + +1. Apply the resource quota: + + ```console + $ kubectl apply -n gpu-operator -f gpu-operator-quota.yaml + ``` + +1. Optional: View the resource quota: + + ```console + $ kubectl get -n gpu-operator resourcequota + ``` + + *Example Output* + + ```output + NAME AGE REQUEST + gpu-operator-quota 38s pods: 0/100 + ``` + +1. Install the Google driver installer daemon set. + + For Container-Optimized OS: + + ```console + $ kubectl apply -f https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/nvidia-driver-installer/cos/daemonset-preloaded.yaml + ``` + + For Ubuntu, the manifest to apply depends on GPU model and node version. + Refer to the **Ubuntu** tab at + [Manually install NVIDIA GPU drivers](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers) + in the GKE documentation. + +1. Install the Operator using Helm: + + ```console + $ helm install --wait --generate-name \ + -n gpu-operator \ + nvidia/gpu-operator \ + --version=${version} \ + --set hostPaths.driverInstallDir=/home/kubernetes/bin/nvidia \ + --set toolkit.installDir=/home/kubernetes/bin/nvidia \ + --set cdi.enabled=true \ + --set cdi.default=true \ + --set driver.enabled=false + ``` + + Set the NVIDIA Container Toolkit and driver installation path to `/home/kubernetes/bin/nvidia`. 
+ On GKE node images, this directory is writable and is a stateful location for storing the NVIDIA runtime binaries. + + To configure MIG with NVIDIA MIG Manager, specify the following additional Helm command arguments: + + ```console + --set migManager.env[0].name=WITH_REBOOT \ + --set-string migManager.env[0].value=true + ``` + +## Step 2: Using NVIDIA Driver Manager + +Perform the following steps to create a GKE cluster with the `gcloud` CLI and use the Operator and NVIDIA Driver Manager to manage the GPU driver. +The steps create the cluster with a node pool that uses an Ubuntu and containerd node image. + +1. Create the cluster by running a command that is similar to the following example: + + ```console + $ gcloud beta container clusters create demo-cluster \ + --project <project-id> \ + --location us-west1 \ + --release-channel "regular" \ + --machine-type "n1-standard-4" \ + --accelerator "type=nvidia-tesla-t4,count=1" \ + --image-type "UBUNTU_CONTAINERD" \ + --node-labels="gke-no-default-nvidia-gpu-device-plugin=true" \ + --disk-type "pd-standard" \ + --disk-size "1000" \ + --no-enable-intra-node-visibility \ + --metadata disable-legacy-endpoints=true \ + --max-pods-per-node "110" \ + --num-nodes "1" \ + --logging=SYSTEM,WORKLOAD \ + --monitoring=SYSTEM \ + --enable-ip-alias \ + --default-max-pods-per-node "110" \ + --no-enable-master-authorized-networks \ + --tags=nvidia-ingress-all + ``` + + Creating the cluster requires several minutes. + +1. Get the authentication credentials for the cluster: + + ```console + $ USE_GKE_GCLOUD_AUTH_PLUGIN=True \ + gcloud container clusters get-credentials demo-cluster --location us-west1 + ``` + +1. Optional: Verify that you can connect to the cluster: + + ```console + $ kubectl get nodes -o wide + ``` + +1. Create the namespace for the NVIDIA GPU Operator: + + ```console + $ kubectl create ns gpu-operator + ``` + +1. Create a file, such as `gpu-operator-quota.yaml`, with contents like the following example: + +1.
Apply the resource quota: + + ```console + $ kubectl apply -n gpu-operator -f gpu-operator-quota.yaml + ``` + +1. Optional: View the resource quota: + + ```console + $ kubectl get -n gpu-operator resourcequota + ``` + + *Example Output* + + ```output + NAME AGE REQUEST + gke-resource-quotas 6m56s count/ingresses.extensions: 0/100, count/ingresses.networking.k8s.io: 0/100, count/jobs.batch: 0/5k, pods: 2/1500, services: 1/500 + gpu-operator-quota 38s pods: 0/100 + ``` + +1. Install the Operator. + Refer to the instructions for installing the NVIDIA GPU Operator. + +## Step 3: Related Information + +* If you have an existing GKE cluster, refer to + [Add and manage node pools](https://cloud.google.com/kubernetes-engine/docs/how-to/node-pools) + in the GKE documentation. +* When you create new node pools, specify the + `--node-labels="gke-no-default-nvidia-gpu-device-plugin=true"` and + `--accelerator type=...,gpu-driver-version=disabled` CLI arguments + to disable the GKE GPU device plugin daemon set and automatic driver installation on GPU nodes. diff --git a/gpu-operator/.agents/skills/gpu-operator-precompiled-drivers/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-precompiled-drivers/SKILL.md new file mode 100644 index 000000000..b0ad4008c --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-precompiled-drivers/SKILL.md @@ -0,0 +1,281 @@ +--- +name: "gpu-operator-precompiled-drivers" +description: "Explains how to use precompiled NVIDIA driver containers with the GPU Operator. Use when reducing driver build time or selecting precompiled driver images. Trigger keywords - NVIDIA GPU Operator, precompiled drivers, driver containers, Kubernetes." +--- + + + + +# Precompiled Driver Containers + +## About Precompiled Driver Containers + +Containers with precompiled drivers do not require internet access to download Linux kernel +header files, GCC compiler tooling, or operating system packages.
+ +Using precompiled drivers also avoids the burst of compute demand that is required +to compile the kernel drivers with the conventional driver containers. + +These two benefits are valuable to most sites, but are especially beneficial to sites +with restricted internet access or sites with resource-constrained hardware. + +### Limitations and Restrictions + +* Support for deploying the driver containers with precompiled drivers is limited to + hosts with the x86_64 architecture and operating system versions listed in the supported-precompiled-drivers table. + + For information about using precompiled drivers with OpenShift Container Platform, + refer to the `gpu-operator-with-precompiled-drivers` page in the GPU Operator documentation for OpenShift. + +* NVIDIA supports precompiled driver containers for the most recently released long-term + servicing branch (LTSB) driver branch. + +* NVIDIA builds images for the `aws`, `azure`, `generic`, `nvidia`, and `oracle` kernel variants. + If your hosts run a different kernel variant, you can build a precompiled driver image + and use your own container registry. + +* Precompiled driver containers do not support NVIDIA vGPU or GPUDirect Storage (GDS). + +## Step 1: Determining if a Precompiled Driver Container is Available + +The precompiled driver containers are named according to the following pattern: + + `<driver-branch>-<linux-kernel-version>-<os-tag>` + +For example, `525-5.15.0-69-generic-ubuntu22.04`. + +Use one of the following ways to check if a driver container is available for your Linux kernel and driver branch: + +* Use a web browser to access the NVIDIA GPU Driver page of the NVIDIA GPU Cloud registry at + https://catalog.ngc.nvidia.com/orgs/nvidia/containers/driver/tags. + Use the search field to filter the tags by your operating system version.
+ +* Use the [NGC CLI](https://ngc.nvidia.com/setup/installers/cli) tool to list the tags for the driver container: + + ```console + $ ngc registry image info nvidia/driver + ``` + + *Example Output* + + ```output + Image Repository Information + Name: driver + Display Name: NVIDIA GPU Driver + Short Description: Provision NVIDIA GPU Driver as a Container. + Built By: NVIDIA + Publisher: NVIDIA + Multinode Support: False + Multi-Arch Support: True + Logo: https://assets.nvidiagrid.net/ngc/logos/Infrastructure.png + Labels: Multi-Arch, NVIDIA AI Enterprise Supported, Infrastructure Software, Kubernetes Infrastructure + Public: Yes + Last Updated: Apr 20, 2023 + Latest Image Size: 688.87 MB + Latest Tag: 525-5.15.0-69-generic-ubuntu22.04 + Tags: + 525-5.15.0-69-generic-ubuntu22.04 + 525-5.15.0-70-generic-ubuntu22.04 + ... + ``` + +## Step 2: Enabling Precompiled Driver Container Support During Installation + +Refer to the common instructions for installing the Operator with Helm at install-gpu-operator. +Specify the `--set driver.usePrecompiled=true` and `--set driver.version=<driver-branch>` arguments like the following example command: + +```console +$ helm install --wait gpu-operator \ + -n gpu-operator --create-namespace \ + nvidia/gpu-operator \ + --version=${version} \ + --set driver.usePrecompiled=true \ + --set driver.version="<driver-branch>" +``` + +Specify a value like `525` for `<driver-branch>`. +Refer to Common Chart Customization Options for information about other installation options. + +## Step 3: Enabling Support After Installation + +Perform the following steps to enable support for precompiled driver containers: + +1. 
Enable support by modifying the cluster policy: + + ```shell + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ + -p='[ + {"op":"replace", "path":"/spec/driver/usePrecompiled", "value":true}, + {"op":"replace", "path":"/spec/driver/version", "value":"<driver-branch>"} + ]' + ``` + + Specify a value like `525` for `<driver-branch>`.
+ + *Example Output* + + ```output + clusterpolicy.nvidia.com/cluster-policy patched + ``` + +1. Optional: Confirm that the driver daemon set pods terminate: + + ```console + $ kubectl get pods -n gpu-operator + ``` + + *Example Output* + +1. Confirm that the driver container pods are running: + + ```console + $ kubectl get pods -l app=nvidia-driver-daemonset -n gpu-operator + ``` + + *Example Output* + + Ensure that the pod names include a Linux kernel semantic version number like `5.15.0-69-generic`. + +## Step 4: Disabling Support for Precompiled Driver Containers + +Perform the following steps to disable support for precompiled driver containers: + +1. Disable support by modifying the cluster policy: + + ```shell + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ + -p='[ + {"op": "replace", "path": "/spec/driver/usePrecompiled", "value":false}, + {"op": "replace", "path": "/spec/driver/version", "value":"550.90.07"} + ]' + ``` + + *Example Output* + + ```output + clusterpolicy.nvidia.com/cluster-policy patched + ``` + +1. Confirm that the conventional driver container pods are running: + + ```console + $ kubectl get pods -l app=nvidia-driver-daemonset -n gpu-operator + ``` + + *Example Output* + + Ensure that the pod names do not include a Linux kernel semantic version number. + +## Step 5: Building a Custom Driver Container Image + +If a precompiled driver container for your Linux kernel variant is not available, +you can perform the following steps to build and run a container image. + +**Note:** + +NVIDIA provides limited support for custom driver container images. +### Prerequisites +* You have access to a private container registry, such as NVIDIA NGC Private Registry, and can push container images to the registry. +* Your build machine has access to the internet to download operating system packages. +* You know a CUDA version, such as `12.1.0`, that you want to use.
+ The CUDA version only specifies which base image is used to build the driver container. + The version does not have any correlation to the version of CUDA that is associated with or supported by the resulting driver container. + + One way to find a supported CUDA version for your operating system is to access the NVIDIA GPU Cloud registry + at https://catalog.ngc.nvidia.com/orgs/nvidia/containers/cuda/tags and view the tags. + Use the search field to filter the tags, such as `base-ubuntu22.04`. + The filtered results show the CUDA versions, such as `12.1.0`, `12.0.1`, `12.0.0`, and so on. +* You know the GPU driver branch, such as `525`, that you want to use. + +### Procedure +1. Clone the driver container repository and change directory into the repository: + + ```console + $ git clone https://github.com/NVIDIA/gpu-driver-container.git + ``` + + ```console + $ cd gpu-driver-container + ``` + +1. Change directory to the operating system name and version under the driver directory: + + ```console + $ cd ubuntu22.04/precompiled + ``` + +1. Set environment variables for building the driver container image. + + - Specify your private registry URL: + + ```console + $ export PRIVATE_REGISTRY= + ``` + + - Specify the `KERNEL_VERSION` environment variable that matches your kernel variant, such as `5.15.0-1033-aws`: + + ```console + $ export KERNEL_VERSION=5.15.0-1033-aws + ``` + + - Specify the version of the CUDA base image to use when building the driver container: + + ```console + $ export CUDA_VERSION=12.1.0 + ``` + + - Specify the driver branch, such as `525`: + + ```console + $ export DRIVER_BRANCH=525 + ``` + + - Specify the `OS_TAG` environment variable to identify the guest operating system name and version: + + ```console + $ export OS_TAG=ubuntu22.04 + ``` + + The value must match the guest operating system version. + +1. 
Build the driver container image: + + ```console + $ sudo docker build \ + --build-arg KERNEL_VERSION=$KERNEL_VERSION \ + --build-arg CUDA_VERSION=$CUDA_VERSION \ + --build-arg DRIVER_BRANCH=$DRIVER_BRANCH \ + -t ${PRIVATE_REGISTRY}/driver:${DRIVER_BRANCH}-${KERNEL_VERSION}-${OS_TAG} . + ``` + +1. Push the driver container image to your private registry. + + - Log in to your private registry: + + ```console + $ sudo docker login ${PRIVATE_REGISTRY} --username=<username> + ``` + + Enter your password when prompted. + + - Push the driver container image to your private registry: + + ```console + $ sudo docker push ${PRIVATE_REGISTRY}/driver:${DRIVER_BRANCH}-${KERNEL_VERSION}-${OS_TAG} + ``` + +### Next Steps +* To use the custom driver container image, follow the steps for enabling support during or after installation. + + If you have not already installed the GPU Operator, in addition to the `--set driver.usePrecompiled=true` + and `--set driver.version=${DRIVER_BRANCH}` arguments for Helm, also specify the `--set driver.repository="$PRIVATE_REGISTRY"` argument. + + If the container registry is not public, you need to create an image pull secret in the GPU Operator namespace + and specify it in the `--set driver.imagePullSecrets=` argument. + + If you already installed the GPU Operator, specify the private registry for the driver in the cluster policy: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy --type='json' \ + -p='[{"op": "replace", "path": "/spec/driver/repository", "value":"'"$PRIVATE_REGISTRY"'"}]' + ``` diff --git a/gpu-operator/.agents/skills/gpu-operator-references/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-references/SKILL.md new file mode 100644 index 000000000..fddafd1a3 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-references/SKILL.md @@ -0,0 +1,19 @@ +--- +name: "gpu-operator-references" +description: "Points users to the Confidential Containers reference architecture and deployment documentation.
Use when users ask about confidential GPU workloads or Confidential Containers with the GPU Operator. Trigger keywords - NVIDIA GPU Operator, Confidential Containers, sandboxed workloads, Kubernetes, life cycle policy, support, releases, overview, GPU workloads, platform support, operating systems, release notes, component versions, changelog, security, deployment, troubleshooting, diagnostics." +--- + + + + +# GPU Operator References + +## References + +- **Load [references/confidential-containers-deploy.md](references/confidential-containers-deploy.md)** when users ask about confidential GPU workloads or Confidential Containers with the GPU Operator. Points users to the Confidential Containers reference architecture and deployment documentation. +- **Load [references/overview.md](references/overview.md)** when users ask for a GPU Operator overview or documentation orientation. Explains what the NVIDIA GPU Operator is, which components it manages, and how it automates GPU node provisioning. +- **Load [references/security.md](references/security.md)** when reviewing security posture, vulnerability exposure, or operator namespace access. Explains GPU Operator security considerations, elevated privileges, and known CVEs. +- **Load [references/life-cycle-policy.md](references/life-cycle-policy.md)** when users ask about release support windows, maintenance, or version lifecycle. Explains the GPU Operator life cycle and support policy. +- **Load [references/platform-support.md](references/platform-support.md)** when checking compatibility before installation or upgrade. Lists supported Kubernetes platforms, operating systems, container runtimes, and GPU Operator configurations. +- **Load [references/release-notes.md](references/release-notes.md)** when users ask what changed, which component versions are included, or whether a release contains a fix. Includes release notes and component version information for the NVIDIA GPU Operator. 
+- **Load [references/troubleshooting.md](references/troubleshooting.md)** when diagnosing failed pods, driver problems, validator failures, or GPU workload issues. Provides troubleshooting steps for GPU Operator installation and runtime issues. diff --git a/gpu-operator/.agents/skills/gpu-operator-references/references/confidential-containers-deploy.md b/gpu-operator/.agents/skills/gpu-operator-references/references/confidential-containers-deploy.md new file mode 100644 index 000000000..68378b8f0 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-references/references/confidential-containers-deploy.md @@ -0,0 +1,17 @@ + + +# Confidential Containers + +The NVIDIA GPU Operator supports deploying Confidential Containers using Kata Containers and NVIDIA Reference Architecture for Confidential Containers. + +Documentation for configuring the GPU Operator for Confidential Containers is available in the [NVIDIA Reference Architecture for Confidential Containers documentation](https://docs.nvidia.com/datacenter/cloud-native/confidential-containers/latest/overview.html). + +## Reference Architecture + +Overview, reference architecture, and software components for Confidential Containers. +### Deploy + +Deploy Confidential Containers with the NVIDIA GPU Operator on Kubernetes. +### Supported Platforms + +Hardware, host, and component versions validated for general availability (GA). diff --git a/gpu-operator/.agents/skills/gpu-operator-references/references/life-cycle-policy.md b/gpu-operator/.agents/skills/gpu-operator-references/references/life-cycle-policy.md new file mode 100644 index 000000000..386ca6346 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-references/references/life-cycle-policy.md @@ -0,0 +1,64 @@ + + +# NVIDIA GPU Operator Versioning + +NVIDIA GPU Operator is versioned following the calendar versioning convention. + +The version follows the pattern `YY.MM.PP`, such as 23.6.0, 23.6.1, and 23.9.0. 
+The first two fields, `YY.MM`, identify a major version and indicate when the major version was initially released. +The third field, `PP`, identifies the patch version of the major version. +Patch releases typically include critical bug and CVE fixes, but can include minor features. + +## NVIDIA GPU Operator Life Cycle + +When a new major version of NVIDIA GPU Operator is released, the previous major version enters deprecated support and only receives patch release updates for critical bug and CVE fixes. +All prior major versions enter end of support and are no longer supported and do not receive patch release updates. + +The product life cycle and versioning are subject to change in the future. + +**Note:** + +Upgrades are only supported within a major release or to the next major release. +| GPU Operator Version | Status | +| --- | --- | +| 26.3.x | Supported | +| 25.10.x | Deprecated | +| 25.3.x and lower | End of Support | +# GPU Operator Component Matrix + +The following table shows the operands and default operand versions that correspond to a GPU Operator version. + +When post-release testing confirms support for newer versions of operands, these updates are identified as *recommended updates* to a GPU Operator version. +Refer to Upgrading the NVIDIA GPU Operator for more information. + +**Note:** + +All the following components are supported as government-ready in the NVIDIA GPU Operator v26.3, except for NVIDIA GDS Driver, NVIDIA Confidential Computing Manager, and NVIDIA GDRCopy Driver.
+**D** = Default driver, **R** = Recommended driver + +| 1 Component | 1 GPU Operator Version | | +| --- | --- | --- | +| v26.3.0 | v26.3.1 | | +| NVIDIA GPU Driver ki_ | [595.71.05](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-595-71-05/index.html) [595.58.03](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-595-58-03/index.html) [590.48.01](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-590-48-01/index.html) [580.159.03](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-580-159-03/index.html) (**R**) [580.126.20](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-580-126-20/index.html) (**D**) [570.211.01](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-570-211-01/index.html) [535.309.01](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-309-01/index.html) [535.288.01](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-288-01/index.html) | [595.71.05](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-595-71-05/index.html) [595.58.03](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-595-58-03/index.html) [590.48.01](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-590-48-01/index.html) [580.159.03](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-580-159-03/index.html) (**R**) [580.126.20](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-580-126-20/index.html) (**D**) [570.211.01](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-570-211-01/index.html) [535.309.01](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-309-01/index.html) [535.288.01](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-535-288-01/index.html) | +| NVIDIA Driver Manager for Kubernetes | [v0.10.0](https://ngc.nvidia.com/catalog/containers/nvidia:cloud-native:k8s-driver-manager) | | +| NVIDIA Container Toolkit | [1.19.0](https://github.com/NVIDIA/nvidia-container-toolkit/releases) | | +| NVIDIA 
Kubernetes Device Plugin | [0.19.0](https://github.com/NVIDIA/k8s-device-plugin/releases) | | +| DCGM Exporter | [v4.5.1-4.8.0](https://github.com/NVIDIA/dcgm-exporter/releases) | | +| Node Feature Discovery | [v0.18.3](https://github.com/kubernetes-sigs/node-feature-discovery/releases/) | | +| NVIDIA GPU Feature Discovery for Kubernetes | [0.19.0](https://github.com/NVIDIA/k8s-device-plugin/releases) | | +| NVIDIA MIG Manager for Kubernetes | [0.14.0](https://github.com/NVIDIA/mig-parted/blob/main/CHANGELOG.md) | | +| DCGM | [4.5.2-1](https://docs.nvidia.com/datacenter/dcgm/latest/release-notes/changelog.html) | | +| Validator for NVIDIA GPU Operator | v26.3.0 | v26.3.1 | +| NVIDIA KubeVirt GPU Device Plugin | [v1.5.0](https://github.com/NVIDIA/kubevirt-gpu-device-plugin) | | +| NVIDIA vGPU Device Manager | [v0.4.2](https://github.com/NVIDIA/vgpu-device-manager) | | +| NVIDIA GDS Driver gds_ | [2.27.3](https://github.com/NVIDIA/gds-nvidia-fs/releases) | | +| NVIDIA Confidential Computing Manager for Kubernetes | [v0.3.0](https://github.com/NVIDIA/k8s-cc-manager/releases) | [v0.4.0](https://github.com/NVIDIA/k8s-cc-manager/releases) | +| NVIDIA GDRCopy Driver | [v2.5.1](https://github.com/NVIDIA/gdrcopy/releases) | [v2.5.2](https://github.com/NVIDIA/gdrcopy/releases) | +| NVIDIA Kata Sandbox Device Plugin | [v0.0.2](https://github.com/NVIDIA/sandbox-device-plugin/releases) | [v0.0.3](https://github.com/NVIDIA/sandbox-device-plugin/releases) | +**Note:** + +- Driver version could be different with NVIDIA vGPU, as it depends on the driver + version downloaded from the [NVIDIA Licensing Portal](https://ui.licensing.nvidia.com). +- The GPU Operator is supported on all active NVIDIA data center production drivers. + Refer to [Supported Drivers and CUDA Toolkit Versions](https://docs.nvidia.com/datacenter/tesla/drivers/index.html#supported-drivers-and-cuda-toolkit-versions) + for more information. 
diff --git a/gpu-operator/.agents/skills/gpu-operator-references/references/overview.md b/gpu-operator/.agents/skills/gpu-operator-references/references/overview.md new file mode 100644 index 000000000..0c05983a4 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-references/references/overview.md @@ -0,0 +1,64 @@ + + +# About the NVIDIA GPU Operator + +![](graphics/nvidia-gpu-operator-image.jpg) +Kubernetes provides access to special hardware resources such as NVIDIA GPUs, NICs, Infiniband adapters and other devices +through the [device plugin framework](https://kubernetes.io/docs/concepts/extend-kubernetes/compute-storage-net/device-plugins/). +However, configuring and managing nodes with these hardware resources requires +configuration of multiple software components such as drivers, container runtimes or other libraries which are difficult +and prone to errors. The NVIDIA GPU Operator uses the [operator framework](https://coreos.com/blog/introducing-operator-framework) +within Kubernetes to automate the management of all NVIDIA software components needed to provision GPU. These components include the NVIDIA drivers (to enable CUDA), +Kubernetes device plugin for GPUs, the [NVIDIA Container Toolkit](https://github.com/NVIDIA/nvidia-container-toolkit), +automatic node labeling using [GFD](https://github.com/NVIDIA/gpu-feature-discovery), [DCGM](https://developer.nvidia.com/dcgm) based monitoring and others. + +## About This Documentation + +Browse through the following documents for getting started, platform support and release notes for the NVIDIA GPU Operator. + +**Red Hat OpenShift Container Platform:** + +Refer to :external+ocpindex for information about installing, managing, and upgrading the Operator on Red Hat OpenShift Container Platform. +### Getting Started + +The operator-install-guide guide includes information on installing the GPU Operator in a Kubernetes cluster. 
+ +### Release Notes + +Refer to operator-release-notes for information about releases. + +### Platform Support + +The operator-platform-support describes the supported platform configurations. + +## Licenses and Contributing + +The NVIDIA GPU Operator source code is licensed under [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) and +contributions are accepted with a DCO. Refer to the [contributing](https://github.com/NVIDIA/gpu-operator/blob/master/CONTRIBUTING.md) document for +more information on how to contribute and the release artifacts. + +The base images used by the software might include software that is licensed under open-source licenses such as GPL. +The source code for these components is archived on the CUDA opensource [index](https://developer.download.nvidia.com/compute/cuda/opensource/). + +The following table identifies the licenses for the Operator and software components. +By installing and using the GPU Operator, you accept the terms and conditions of these licenses.
+ +| Component | Artifact Type | Artifact Licenses | +| --- | --- | --- | +| NVIDIA GPU Operator | Helm Chart | [Apache 2.0](https://www.apache.org/licenses/LICENSE-2.0) | +| NVIDIA GPU Operator | Image | pstai_ | +| NVIDIA GPU Feature Discovery | Image | pstai_ | +| NVIDIA GPU Driver | Image | [License for Customer Use of NVIDIA Software](http://www.nvidia.com/content/DriverDownload-March2009/licence.php?lang=us) pstai_ | +| NVIDIA Container Toolkit | Image | pstai_ | +| NVIDIA Kubernetes Device Plugin | Image | pstai_ | +| NVIDIA MIG Manager for Kubernetes | Image | pstai_ | +| Validator for NVIDIA GPU Operator | Image | pstai_ | +| NVIDIA DCGM | Image | pstai_ | +| NVIDIA DCGM Exporter | Image | pstai_ | +| NVIDIA Driver Manager for Kubernetes | Image | pstai_ | +| NVIDIA KubeVirt GPU Device Plugin | Image | pstai_ | +| NVIDIA vGPU Device Manager | Image | pstai_ | +| NVIDIA GDS Driver | Image | [License for Customer Use of NVIDIA Software](http://www.nvidia.com/content/DriverDownload-March2009/licence.php?lang=us) pstai_ | +| NVIDIA Confidential Computing Manager for Kubernetes | Image | pstai_ | +| NVIDIA Kata Manager for Kubernetes | Image | pstai_ | +| NVIDIA GDRCopy Driver | Image | pstai_ | diff --git a/gpu-operator/.agents/skills/gpu-operator-references/references/platform-support.md b/gpu-operator/.agents/skills/gpu-operator-references/references/platform-support.md new file mode 100644 index 000000000..7b96f6b8c --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-references/references/platform-support.md @@ -0,0 +1,398 @@ + + +# Platform Support + +## Supported NVIDIA Data Center GPUs and Systems + +The following NVIDIA data center GPUs are supported on x86 based platforms: + +### GH-series Products + +| Product | Architecture | Notes | +| --- | --- | --- | +| NVIDIA GH200 open-kern-module_ | NVIDIA Grace Hopper - | | +1 +NVIDIA GH200 systems require the NVIDIA Open GPU Kernel module driver. 
+You can install the open kernel modules by specifying the `driver.useOpenKernelModules=true` +argument to the `helm` command. +Refer to Common Chart Customization Options for more information. +### A, H and L-series Products + ++-------------------------+---------------------------+-------+ + Product Architecture Notes ++=========================+===========================+=======+ + NVIDIA H800 NVIDIA Hopper ++-------------------------+---------------------------+-------+ + NVIDIA H200, NVIDIA Hopper | + NVIDIA H200 NVL | ++-------------------------+---------------------------+-------+ + NVIDIA DGX H100 NVIDIA Hopper and + NVSwitch ++-------------------------+---------------------------+-------+ + NVIDIA DGX H200 NVIDIA Hopper and + NVSwitch ++-------------------------+---------------------------+-------+ + NVIDIA HGX H100 NVIDIA Hopper and + NVSwitch ++-------------------------+---------------------------+-------+ + NVIDIA HGX H200 NVIDIA Hopper and + NVSwitch ++-------------------------+---------------------------+-------+ + NVIDIA H100, NVIDIA Hopper | + NVIDIA H100 NVL | ++-------------------------+---------------------------+-------+ + NVIDIA H20 NVIDIA Hopper ++-------------------------+---------------------------+-------+ + NVIDIA L20 NVIDIA Ada ++-------------------------+---------------------------+-------+ + NVIDIA L40, NVIDIA Ada | + NVIDIA L40S | ++-------------------------+---------------------------+-------+ + NVIDIA L4 NVIDIA Ada ++-------------------------+---------------------------+-------+ + NVIDIA DGX A100 A100 and NVSwitch ++-------------------------+---------------------------+-------+ + NVIDIA HGX A100 A100 and NVSwitch ++-------------------------+---------------------------+-------+ + NVIDIA A800 NVIDIA Ampere ++-------------------------+---------------------------+-------+ + NVIDIA A100 NVIDIA Ampere ++-------------------------+---------------------------+-------+ + NVIDIA A100X NVIDIA Ampere 
++-------------------------+---------------------------+-------+ + NVIDIA A40 NVIDIA Ampere ++-------------------------+---------------------------+-------+ + NVIDIA A30 NVIDIA Ampere ++-------------------------+---------------------------+-------+ + NVIDIA A30X NVIDIA Ampere ++-------------------------+---------------------------+-------+ + NVIDIA A16 NVIDIA Ampere ++-------------------------+---------------------------+-------+ + NVIDIA A10 NVIDIA Ampere ++-------------------------+---------------------------+-------+ + NVIDIA A2 NVIDIA Ampere ++-------------------------+---------------------------+-------+ + +**Note:** + +* The GPU Operator supports DGX A100 with DGX OS 5.1+ and Red Hat OpenShift using Red Hat Core OS. + For installation instructions, see preinstalled-drivers-and-toolkit for DGX OS 5.1+ and openshift-introduction for Red Hat OpenShift. +### D,T and V-series Products + ++-----------------------+------------------------+-------+ + Product Architecture Notes ++=======================+========================+=======+ + NVIDIA T4 Turing ++-----------------------+------------------------+-------+ + NVIDIA V100 Volta ++-----------------------+------------------------+-------+ + NVIDIA P100 Pascal ++-----------------------+------------------------+-------+ + NVIDIA P40 Pascal ++-----------------------+------------------------+-------+ + NVIDIA P4 Pascal ++-----------------------+------------------------+-------+ +### RTX / T-series Products + ++-------------------------+------------------------+-------+ + Product Architecture Notes ++=========================+========================+=======+ + NVIDIA RTX PRO 6000 NVIDIA Blackwell + Blackwell Server Edition ++-------------------------+------------------------+-------+ + NVIDIA RTX PRO 6000D NVIDIA Blackwell ++-------------------------+------------------------+-------+ + NVIDIA RTX Pro 4500 NVIDIA Blackwell + Blackwell Server Edition ++-------------------------+------------------------+-------+ + NVIDIA 
RTX A6000 NVIDIA Ampere /Ada ++-------------------------+------------------------+-------+ + NVIDIA RTX A5000 NVIDIA Ampere ++-------------------------+------------------------+-------+ + NVIDIA RTX A4500 NVIDIA Ampere ++-------------------------+------------------------+-------+ + NVIDIA RTX A4000 NVIDIA Ampere ++-------------------------+------------------------+-------+ + NVIDIA Quadro RTX 8000 Turing ++-------------------------+------------------------+-------+ + NVIDIA Quadro RTX 6000 Turing ++-------------------------+------------------------+-------+ + NVIDIA Quadro RTX 5000 Turing ++-------------------------+------------------------+-------+ + NVIDIA Quadro RTX 4000 Turing ++-------------------------+------------------------+-------+ + NVIDIA T1000 Turing ++-------------------------+------------------------+-------+ + NVIDIA T600 Turing ++-------------------------+------------------------+-------+ + NVIDIA T400 Turing ++-------------------------+------------------------+-------+ + +**Note:** + +NVIDIA RTX PRO 6000 Blackwell Server Edition notes: + +* Driver versions 575.57.08 or later is required. +* MIG is not supported on the 575.57.08 driver release. +* In cases where CUDA init fails, you may need to disable Heterogeneous Memory Management (HMM) in UVM by Customizing NVIDIA GPU Driver Parameters during Installation. 
+### B-series Products + ++-------------------------+------------------------+-------+ + Product Architecture Notes ++=========================+========================+=======+ + NVIDIA DGX B300 NVIDIA Blackwell ++-------------------------+------------------------+-------+ + NVIDIA DGX B200 NVIDIA Blackwell ++-------------------------+------------------------+-------+ + NVIDIA DGX Spark NVIDIA Blackwell ++-------------------------+------------------------+-------+ + NVIDIA HGX B200 NVIDIA Blackwell ++-------------------------+------------------------+-------+ + NVIDIA HGX B300 NVIDIA Blackwell ++-------------------------+------------------------+-------+ + NVIDIA HGX GB200 NVL72 NVIDIA Blackwell ++-------------------------+------------------------+-------+ + NVIDIA HGX GB200 NVL4 NVIDIA Blackwell ++-------------------------+------------------------+-------+ + NVIDIA HGX GB300 NVL72 NVIDIA Blackwell ++-------------------------+------------------------+-------+ + NVIDIA DGX Station NVIDIA Blackwell ++-------------------------+------------------------+-------+ + +**Note:** + +* HGX B200 requires a driver container version of 570.133.20 or later. 
+## Supported ARM Based Platforms + +The following NVIDIA data center GPUs are supported: + ++-------------------------+---------------------------+-------+ + Product Architecture Notes ++=========================+===========================+=======+ + NVIDIA A100X Ampere ++-------------------------+---------------------------+-------+ + NVIDIA A30X Ampere ++-------------------------+---------------------------+-------+ + NVIDIA IGX Orin Ampere ++-------------------------+---------------------------+-------+ + AWS EC2 G5g instances Turing ++-------------------------+---------------------------+-------+ + NVIDIA DGX Spark Blackwell ++-------------------------+---------------------------+-------+ + NVIDIA HGX GB200 NVL72 Blackwell ++-------------------------+---------------------------+-------+ + NVIDIA HGX GB300 NVL72 Blackwell ++-------------------------+---------------------------+-------+ + +In addition to the products specified in the preceding table, any ARM based +system that meets the following requirements is supported: + +- NVIDIA GPUs connected to the PCI bus. +- A supported operating system + such as Ubuntu or Red Hat Enterprise Linux. + +**Note:** + +The GPU Operator only supports platforms using discrete GPUs. +NVIDIA Jetson, or other embedded products with integrated GPUs, are not supported. + +NVIDIA IGX Orin, a platform with an integrated GPU, is supported as long as the discrete GPU is the device being used. 
+## Supported Deployment Options + +The GPU Operator has been validated in the following scenarios: + ++-----------------------------------------------------+ + Deployment Options ++=====================================================+ + Bare Metal ++-----------------------------------------------------+ + Virtual machines with GPU Passthrough ++-----------------------------------------------------+ + Virtual machines with NVIDIA vGPU based products ++-----------------------------------------------------+ + +**Note:** + +GPU Operator is supported with NVIDIA vGPU 12.0+. +## Supported Operating Systems and Kubernetes Platforms + +The GPU Operator has been validated in the following scenarios: + +### Bare Metal / Virtual Machines with GPU Passthrough and NVIDIA vGPU + +| Operating System | Kubernetes fn1_ | Red Hat OpenShift | VMware vSphere Kubernetes Service (VKS) | Rancher Kubernetes Engine 2 | K3s | Mirantis k0s | Canonical MicroK8s | Nutanix NKP | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Ubuntu 20.04 LTS fn2_ | 1.32---1.35 - | 1.32---1.35 | 1.32---1.35 - - - - | | | | | | +| Ubuntu 22.04 LTS fn2_ | 1.32---1.35 - | 1.32---1.35 | 1.32---1.35 | 1.32---1.35 | 1.32---1.35 | 1.33---1.35 | 2.15 2.16 2.17 | | +| Ubuntu 24.04 LTS | 1.32---1.35 - - | 1.32---1.35 | 1.32---1.35 | 1.32---1.35 | 1.33---1.35 | 2.17 | | | +| Red Hat Core OS - | 4.17---4.21 - - - - - - | | | | | | | | +| Red Hat Enterprise Linux 10.0, 10.1 | 1.32---1.35 - - | 1.32---1.35 - - - - | | | | | | | +| Red Hat Enterprise Linux 9.2, 9.4, 9.6, 9.7 fn3|_ | 1.32---1.35 - - | 1.32---1.35 - - - | 2.17 | | | | | | +| Red Hat Enterprise Linux 8.8, 8.10 | 1.32---1.35 - - | 1.32---1.35 - - - | 2.15, 2.16, 2.17 | | | | | | +| Rocky Linux 9.7 | 1.32---1.35 - - - - - - - | | | | | | | | +1 +The Kubernetes community only supports the last three minor [releases](https://kubernetes.io/releases/). 
+Older releases may be supported through enterprise distributions of Kubernetes such as Red Hat OpenShift. + +2 +For Ubuntu 22.04 LTS, kernel versions 6.8 (non-precompiled driver containers only), 6.5, and 5.15 are LTS ESM kernels. +For Ubuntu 20.04 LTS, kernel versions 5.4 and 5.15 are LTS ESM kernels. +The GPU Driver containers support these Linux kernels. +Refer to the Kernel release schedule on Canonical's +[Ubuntu kernel lifecycle and enablement stack](https://ubuntu.com/kernel/lifecycle) page for more information. +NVIDIA recommends disabling automatic updates for the Linux kernel that are performed +by the `unattended-upgrades` package to prevent an upgrade to an unsupported kernel version. + +3 +Non-precompiled driver containers for Red Hat Enterprise Linux 9.2, 9.4, 9.6, and 9.7 versions are available for x86 based platforms only. +They are not available for ARM based systems. + +**Note:** + +ocp_csp_support +### Cloud Service Providers + +| Operating System | Amazon EKS Kubernetes | Google GKE Kubernetes | +| --- | --- | --- | +| Ubuntu 20.04 LTS | 1.32---1.35 | 1.32---1.35 | +| Ubuntu 22.04 LTS | 1.32---1.35 | 1.32---1.35 | +| Ubuntu 24.04 LTS | 1.32---1.35 | 1.32---1.35 | +## Supported Precompiled Drivers + +The GPU Operator has been validated with the following precompiled drivers. +See the precompiled-drivers page for more information about using precompiled drivers. 
+ ++----------------------------+------------------------+----------------+---------------------+ + Operating System Kernel Flavor Kernel Version CUDA Driver Branch | ++============================+========================+================+=====================+ + Ubuntu 22.04 Generic, NVIDIA, Azure 5.15 R535, R570, R580 | + AWS, Oracle | ++----------------------------+------------------------+----------------+---------------------+ + Ubuntu 22.04 Generic, NVIDIA, Azure 6.8 R535, R570, R580 | + AWS, Oracle | ++----------------------------+------------------------+----------------+---------------------+ + Ubuntu 24.04 Generic, NVIDIA, Azure 6.8 R570, R580 | + AWS, Oracle | ++----------------------------+------------------------+----------------+---------------------+ + +## Supported Container Runtimes + +The GPU Operator has been validated for the following container runtimes: + ++----------------------------+------------------------+----------------+ + Operating System Containerd 1.7 - 2.2 CRI-O ++============================+========================+================+ + Ubuntu 20.04 LTS Yes Yes ++----------------------------+------------------------+----------------+ + Ubuntu 22.04 LTS Yes Yes ++----------------------------+------------------------+----------------+ + Ubuntu 24.04 LTS Yes Yes ++----------------------------+------------------------+----------------+ + Red Hat Core OS (RHCOS) No Yes ++----------------------------+------------------------+----------------+ + Red Hat Enterprise Linux 8 Yes Yes ++----------------------------+------------------------+----------------+ + Red Hat Enterprise Linux 9 Yes Yes ++----------------------------+------------------------+----------------+ + +**Note:** + +If you are planning to use the NRI Plugin, you must use containerd version v1.7.30+, v2.1.x, or v2.2.x. +The NRI Plugin is not supported with CRI-O. +## Support for KubeVirt and OpenShift Virtualization + +Red Hat OpenShift Virtualization is based on KubeVirt. 
+ +================ =========== ============= ========= ============= =========== +Operating System Kubernetes KubeVirt OpenShift Virtualization +---------------- ----------- ------------------------- ---------------------------- +\ \ GPU vGPU GPU vGPU + Passthrough Passthrough +================ =========== ============= ========= ============= =========== +Ubuntu 24.04 LTS 1.32---1.35 0.36+ +Ubuntu 22.04 LTS 1.32---1.35 0.36+ 0.59.1+ +Ubuntu 20.04 LTS 1.32---1.35 0.36+ 0.59.1+ +Red Hat Core OS 4.17---4.21 4.17---4.21 +================ =========== ============= ========= ============= =========== + +You can run GPU passthrough and NVIDIA vGPU in the same cluster as long as you use +a software version that meets both requirements. + +NVIDIA vGPU is incompatible with KubeVirt v0.58.0, v0.58.1, and v0.59.0, as well +as OpenShift Virtualization 4.12.0---4.12.2. +Starting with KubeVirt v0.58.2 and v0.59.1, and OpenShift Virtualization 4.12.3 and 4.13, +you must set the `DisableMDEVConfiguration` feature gate. +Refer to GPU Operator with KubeVirt or NVIDIA GPU Operator with OpenShift Virtualization. + +KubeVirt and OpenShift Virtualization with NVIDIA vGPU is supported on the following devices: + +- RTX Pro 6000 Blackwell Server Edition + +- H200NVL + +- H100 + +- GA10x: A100, A40, RTX A6000, RTX A5500, RTX A5000, A30, A16, A10, A2. + + The A10G and A10M GPUs are excluded. + +- AD10x: L40, RTX 6000 Ada, L4. + + The L40G GPU is excluded. + +- NVIDIA HGX GB200 NVL72, GB300 NVL72 on Ubuntu 24.04 LTS. + +**Note:** + +KubeVirt with NVIDIA vGPU is supported on `nodes` with Linux kernel < 6.0, such as Ubuntu 22.04 `LTS`. +## Support for GPUDirect RDMA + +Supported operating systems and NVIDIA GPU Drivers with GPUDirect RDMA. 
+ +- Ubuntu 24.04 and 22.04 LTS with Network Operator 26.1.0 +- Red Hat Enterprise Linux 10.0 and 8.10 with Network Operator 26.1.0 +- Red Hat Enterprise Linux 8.10 with Network Operator 25.10.0 +- Ubuntu 22.04 LTS with Network Operator 25.10.0 +- RHEL 8 with Network Operator 25.7.0. +- Ubuntu 24.04 LTS with Network Operator 25.7.0. +- Ubuntu 20.04 and 22.04 LTS with Network Operator 25.7.0. +- Red Hat Enterprise Linux 9.2, 9.4, and 9.6 with Network Operator 25.7.0. +- Red Hat OpenShift 4.17 and higher with Network Operator 25.7.0. +- Ubuntu 24.04 LTS with Network Operator 25.10.0 + +For information about configuring GPUDirect RDMA, refer to gpu-operator-rdma. + +## Support for GPUDirect Storage + +Supported operating systems and NVIDIA GPU Drivers with GPUDirect Storage. + +- Ubuntu 24.04 LTS with Network Operator 25.7.0. +- Ubuntu 20.04 and 22.04 LTS with Network Operator 25.7.0. +- Red Hat OpenShift Container Platform 4.17 and higher. + +**Note:** + +Version v2.17.5 and higher of the NVIDIA GPUDirect Storage kernel driver, `nvidia-fs`, +requires the NVIDIA Open GPU Kernel module driver. +You can install the open kernel modules by specifying the `driver.kernelModuleType=auto` argument to the `helm` command +if you are using driver container version 570.86.15, 570.124.06, or later. +If you are using a different driver version or branch, specify the `driver.kernelModuleType=open` argument instead. +Refer to Common Chart Customization Options for more information. + +Not supported with secure boot. +Supported storage types are local NVMe and remote NFS. 
+## Additional Supported Tools and Integrations + +Container management tools: + +* [Helm v3](https://helm.sh/) +* [Red Hat Operator Lifecycle Manager (OLM)](https://docs.redhat.com/en/documentation/openshift_container_platform/latest/html/operators/understanding-operators#operator-lifecycle-manager-olm) + +Orchestration & resource scheduling: + +* [NVIDIA Run:ai](https://run-ai-docs.nvidia.com/) + +**Note:** + +Run:ai requires the GPU Operator as a prerequisite and works with default GPU Operator settings. +Running the GPU Operator with Container Device Interface (CDI) enabled (default in v25.10.0 and later) requires Run:ai v2.24.38 and later, or v2.23.35 and later. Refer to the Run:ai [cluster requirements documentation](https://run-ai-docs.nvidia.com/self-hosted/getting-started/installation/install-using-helm/system-requirements#nvidia-gpu-operator) for more information. diff --git a/gpu-operator/.agents/skills/gpu-operator-references/references/release-notes.md b/gpu-operator/.agents/skills/gpu-operator-references/references/release-notes.md new file mode 100644 index 000000000..428057245 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-references/references/release-notes.md @@ -0,0 +1,2533 @@ + + +# Release Notes + +This document describes the new features, improvements, fixed issues, and known issues for the NVIDIA GPU Operator. + +Refer to the GPU Operator Component Matrix for a list of software components and versions included in each release. + +**Note:** + +GPU Operator beta releases are documented on [GitHub](https://github.com/NVIDIA/gpu-operator/releases). NVIDIA AI Enterprise builds are not posted on GitHub. 
+---- + +## 26.3.1 + +### New Features + +* Updated software component versions: + + - NVIDIA GDRCopy Driver v2.5.2 + - NVIDIA Kata Sandbox Device Plugin v0.0.3 + - NVIDIA Confidential Computing Manager for Kubernetes v0.4.0 + +* The ClusterPolicy and NVIDIADriver custom resources now support `hostNetwork` for all GPU Operator operands. + Previously, only DCGM Exporter supported the `hostNetwork` field. + Setting `hostNetwork: true` for a component causes its pods to share the host's network namespace, + binding directly to the host's network interfaces and IP address rather than using the cluster's virtual network. + This is useful in environments where GPU Operator component pods need to expose ports directly on the host network, + such as when a Prometheus instance scrapes metrics from the host network namespace, + or in bare-metal and HPC environments where cluster network overhead or non-standard network configuration + makes host networking preferable. ([PR #2246](https://github.com/NVIDIA/gpu-operator/pull/2246)) + +* Added support for mounting `/lib/modules` from the host when using precompiled drivers. + This is required for precompiled driver containers on SUSE Linux Enterprise Server (SLES) 15 SP7 and SLES 16, + which use host kernel modules without requiring the full kernel to be bundled in the driver container. ([PR #2252](https://github.com/NVIDIA/gpu-operator/pull/2252)) + +### Fixed Issues + +* Fixed an issue in the OLM bundle where the NVIDIA KubeVirt GPU Device Plugin referenced an amd64-only image digest instead of a multi-arch digest. + On ARM servers, this caused pods to fail with an `Exec format error`. ([PR #2265](https://github.com/NVIDIA/gpu-operator/pull/2265)) + +* Fixed an issue where the operating system release name was recalculated from the node label tag rather than being stored when it was first retrieved. + This could cause errors when the tag format was not recognized. 
([PR #2244](https://github.com/NVIDIA/gpu-operator/pull/2244)) + +## 26.3.0 + +### New Features + +* Updated software component versions: + + - NVIDIA Driver Manager for Kubernetes v0.10.0 + - NVIDIA Container Toolkit v1.19.0 + - NVIDIA DCGM v4.5.2-1 + - NVIDIA DCGM Exporter v4.5.1-4.8.0 + - NVIDIA GDS Driver v2.27.3 + - NVIDIA Kubernetes Device Plugin v0.19.0 + - NVIDIA MIG Manager for Kubernetes v0.14.0 + - NVIDIA GPU Feature Discovery for Kubernetes v0.19.0 + - NVIDIA vGPU Device Manager v0.4.2 + - NVIDIA KubeVirt GPU Device Plugin v1.5.0 + - NVIDIA Kata Sandbox Device Plugin v0.0.2 + - NVIDIA Confidential Computing Manager for Kubernetes v0.3.0 + +* Added support for these NVIDIA Data Center GPU Driver versions: + + - 580.126.20 (default) + +* Added support for Node Resource Interface (NRI) Plugin. + The NRI Plugin offers a new way of injecting GPUs into GPU management containers, without needing the `nvidia` runtime class. + Enable by setting the `cdi.nriPluginEnabled` field to true in the ClusterPolicy custom resource or by setting the `cdi.nriPluginEnabled` flag in the Helm chart. + + When the NRI Plugin is enabled, no `nvidia` runtime class gets created and no modifications are made to the container runtime configuration, e.g. no modifications are made to containerd’s config.toml file. + This is particularly advantageous for platforms like k3s, k0s, and Rancher Kubernetes Engine 2 that configure containerd in a non-standard way. + On such platforms, when the NRI plugin is enabled users no longer need to configure environment variables like `CONTAINERD_CONFIG`, `CONTAINERD_SOCKET`, or `RUNTIME_CONFIG_SOURCE`. + + This feature requires containerd v1.7.30+, v2.1.x, or v2.2.x. + + To learn more, refer to Container Device Interface (CDI) and Node Resource Interface (NRI) Plugin Support (use the `gpu-operator-container-device` skill). + + **Note:** + + Enabling the NRI plugin is not supported with cri-o. +* Added support for dynamic MIG config generation. 
+ By default, the MIG Manager will automatically generate a per-node ConfigMap with the default MIG profiles for the available GPUs on the node. + This replaces the previous static ConfigMap. + You are still able to use a custom MIG configuration if you have specific requirements. + Refer to the MIG Manager documentation (use the `gpu-operator-multiinstance` skill) for more information. + + There is a known issue with MIG configurations on RHEL 8 with pre-installed NVIDIA drivers, refer to the Known Issues section for more information. + +* Added support for the NVIDIA Driver Custom Resource Definition (CRD). + Use this feature on new cluster installations to configure multiple driver types and versions on different nodes or multiple operating system versions on nodes. + Refer to the NVIDIA Driver Custom Resource Definition documentation (use the `gpu-operator-nvidia-driver` skill) for more information. + + **Note:** + + This feature does not support an upgrade from an earlier version of the NVIDIA GPU Operator or switching from ClusterPolicy to the NVIDIA Driver CRD. + It is recommended that you only use this feature from new installations. +* Added support for KubeVirt with GPU passthrough on Ubuntu 24.04 LTS + +* Added support for K3s. + +* Added support for containerd 2.2. + +* Added support for new operating systems: + + - Rocky Linux 9.7 + - Red Hat Enterprise Linux 10.0, 10.1 + - Red Hat Enterprise Linux 9.7 + +* Added support for NVIDIA GB200 NVL4 + +* Added support for NVIDIA RTX Pro 4500 Blackwell Server Edition. + +* Added support for NVIDIA Network Operator v26.1.0. + +* Added support for including extra manifests with the Helm chart in the `extraObjects` field. + +* Added support for the DCGM Exporter to expose a metric port on the host network namespace. + Enabled by setting `hostNetwork: true` in the ClusterPolicy custom resource, or passing `--set dcgmExporter.hostNetwork=true` to the Helm chart. 
([PR #1962](https://github.com/NVIDIA/gpu-operator/pull/1962)) + +* Added liveness and readiness probes for the DCGM and DCGM Exporter pods. + The probes ensure that pods are not marked as ready until DCGM is ready to serve traffic. ([PR #2175](https://github.com/NVIDIA/gpu-operator/pull/2175)) + +* Added PodSecurityContext support for DaemonSets ([PR #2120](https://github.com/NVIDIA/gpu-operator/pull/2120)). + In ClusterPolicy, set `spec.daemonsets.podSecurityContext`; in NVIDIADriver, set `spec.podSecurityContext`. + +* Validated Operator government-ready component support with Rancher Kubernetes Engine 2 using Ubuntu 24.04. + +* The following components are now available as government ready components: NVIDIA sandbox device plugin, Kubevirt Device Plugin, and vGPU Device Manager. + +### Improvements + +* Improved NVIDIA Driver resiliency when the driver container is removed. + In previous versions, the NVIDIA Driver would unload the kernel modules and perform the driver compilation process, which could take several minutes to complete, delaying the driver container from restarting. + In v26.3.0, if there is no change to the CUDA driver version (or other driver configuration) in the ClusterPolicy, the NVIDIA Driver will reuse the kernel modules that are available on the node. + This reduces the time to recover from the driver container removal from minutes to seconds. + +* Reduced unnecessary API calls and decreased reconciliation time on large GPU clusters by improving node label logic ([PR #2113](https://github.com/NVIDIA/gpu-operator/pull/2113)). + +* Improved the GPU Operator to now use operating system version labels from GPU worker nodes (added by NFD) when determining OS-specific paths for repository configuration files. 
([Issue #562](https://github.com/NVIDIA/gpu-operator/issues/562), [PR #2138](https://github.com/NVIDIA/gpu-operator/pull/2138)) + +* Driver validation now waits for all enabled additional drivers (such as GDS and GDRCopy) to be installed before proceeding, and each node records a node-local view of enabled features when using multiple NVIDIADriver CRs or optional components. ([PR #2014](https://github.com/NVIDIA/gpu-operator/pull/2014)) + +* Improved support for Kata Containers. + Changes in this release include: + + * Deprecating the NVIDIA Kata Manager. + You now use `kata-deploy` to install the Kata Container and the Kata runtime class. + * Adding support for the NVIDIA Kata Sandbox Device Plugin. + * Configure `sandboxWorkload.mode=kata` during installation or in the ClusterPolicy to enable Kata Containers. + + Refer to the Kata Containers documentation (use the `gpu-operator-kata-containers` skill) for full details on configuring the GPU Operator to use Kata Containers. + +* Improved support for Confidential Containers. + The GPU Operator now supports deploying Confidential Containers using Kata Containers and NVIDIA Reference Architecture for Confidential Containers. + Refer to the Confidential Containers documentation for full details on the Confidential Containers reference architecture and on configuring the GPU Operator to use Confidential Containers. + +### Fixed Issues + +* Fixed an issue where driver installations can fail because cached packages were incorrectly referenced. ([PR #592](https://github.com/NVIDIA/gpu-driver-container/pull/592)) + +* Fixed a shared state issue that caused incorrect driver images in multi-node-pool clusters. ([Issue #1952](https://github.com/NVIDIA/gpu-operator/issues/1952)) + +* Fixed an issue where the GPU Operator was applying driver upgrade annotations when the driver is disabled. 
([PR #1968](https://github.com/NVIDIA/gpu-operator/pull/1981)) + +* Fixed an issue where an empty value in the Helm chart for `device.plugin` was incorrectly causing an error. ([PR #1999](https://github.com/NVIDIA/gpu-operator/pull/1999)) + +* Fixed an issue on OpenShift clusters where the `dcgm-exporter` pod gets bound to another Security Context Constraint (SCC) object instead of the `nvidia-dcgm-exporter` SCC that the GPU Operator creates. ([PR #2122](https://github.com/NVIDIA/gpu-operator/pull/2122)) + +* Fixed an issue where the GPU Operator was not correctly cleaning up DaemonSets ([PR #2081](https://github.com/NVIDIA/gpu-operator/pull/2081)). + +* Fixed an issue where the GPU Operator was not adding a namespace to ServiceAccount objects. ([PR #2039](https://github.com/NVIDIA/gpu-operator/pull/2039)) + +### Known Issues + +* When GPUDirect RDMA is enabled, the `nvidia-peermem` container may fail to restart after the driver pod restarts without a node reboot and without any driver configuration changes. + In this scenario, the driver uses a fast-path optimization that skips recompilation, but the `nvidia-peermem` sidecar does not detect that its module is already loaded and fails to start. + This occurs because the kernel state is not cleared when the driver pod restarts. + + To work around this issue, set the `FORCE_REINSTALL=true` environment variable in the ClusterPolicy. + + ```console + $ kubectl patch clusterpolicy cluster-policy --type=json \ + -p='[{"op": "add", "path": "/spec/driver/manager/env/-", "value": {"name": "FORCE_REINSTALL", "value": "true"}}]' + ``` + + Setting `FORCE_REINSTALL=true` forces full driver recompilation, node drain, and GPU workload disruption on every restart. + Alternatively, rebooting the node clears the kernel state and allows the `nvidia-peermem` module to load successfully, though this may disrupt running workloads. 
+ +* On RHEL 8 nodes with pre-installed NVIDIA drivers (`driver.enabled=false`), MIG configuration can fail when using NVIDIA MIG Manager v0.13.1 or later. + NVIDIA MIG Manager copies the `nvidia-mig-parted` binary to the host and runs it in the host userspace by using `chroot`. + Recent versions of the binary were compiled against a UBI9 base image and require GLIBC 2.32 and GLIBC 2.34 which are not available on RHEL 8, causing the following errors in the MIG Manager pod logs: + + ```console + /usr/local/nvidia/mig-manager/nvidia-mig-parted: /lib64/libc.so.6: version `GLIBC_2.32' not found + /usr/local/nvidia/mig-manager/nvidia-mig-parted: /lib64/libc.so.6: version `GLIBC_2.34' not found + ``` + + To work around this issue, downgrade the NVIDIA MIG Manager component to v0.12.3. + After downgrading, automatically generated per-node MIG configuration ConfigMaps will not be available. + MIG configuration information will be available in the `default-mig-parted-config` ConfigMap instead. + Refer to the MIG Manager documentation (use the `gpu-operator-multiinstance` skill) for more information on MIG configuration. + + Refer to the MIG Controller issue [#329](https://github.com/NVIDIA/mig-parted/issues/329) for more information. + +* After you delete the default NVIDIADriver custom resource, any custom NVIDIADriver + custom resources that you created might not become active automatically. + The custom resources remain in a pending state because the Operator controller + does not re-evaluate them after the conflicting default custom resource is removed. + + To work around this issue, restart the GPU Operator controller by deleting + the controller pod: + + ```console + $ kubectl delete pod -n gpu-operator -l app=gpu-operator + ``` + + Restarting the controller pod does not disrupt running GPU workloads or + driver pods on nodes. + + Refer to issue [#2259](https://github.com/NVIDIA/gpu-operator/issues/2259) + for more information. 
+ +### Removals and Deprecations + +* Marked unused field `defaultRuntime` as optional in the ClusterPolicy. ([PR #2000](https://github.com/NVIDIA/gpu-operator/pull/2000)) +* NVIDIA Kata Manager is now deprecated. + Refer to the Kata Containers documentation (use the `gpu-operator-kata-containers` skill) for more information on using Kata Containers without this component. + +## 25.10.1 + +### New Features + +* Updated software component versions: + + - NVIDIA Container Toolkit v1.18.1 + - NVIDIA DCGM v4.4.2-1 + - NVIDIA DCGM Exporter v4.4.2-4.7.0 + - NVIDIA Kubernetes Device Plugin v0.18.1 + - NVIDIA GPU Feature Discovery v0.18.1 + - NVIDIA MIG Manager for Kubernetes 0.13.1 + - NVIDIA Driver Manager for Kubernetes v0.9.1 + +* Added support for this NVIDIA Data Center GPU Driver version: + + - 580.105.08 (default) + +* Add HPC job mapping support to DCGM Exporter to collect metrics for HPC jobs running on the cluster. + + Configure the HPC job mapping by setting the `dcgmExporter.hpcJobMapping.enabled` field to `true` in the ClusterPolicy custom resource. + Set `dcgmExporter.hpcJobMapping.directory` with the directory path where HPC job mapping files are created by the workload manager. + The default directory is `/var/lib/dcgm-exporter/job-mapping`. + +* Improved the cluster policy reconciler to be more resilient to race conditions during node updates. + +### Fixed Issues + +* Fixed the following known issue introduced in GPU Operator v25.10.0: + + * When using cri-o as the container runtime, several GPU Operator pods can be stuck in the `Init:RunContainerError` or `Init:CreateContainerError` state during GPU Operator installation or upgrade, or during GPU driver daemonset upgrade. + * NVIDIA Container Toolkit 1.18.0 overwrites the imports field in the top-level containerd configuration file, so any previously imported paths are lost. + This was fixed in NVIDIA Container Toolkit v1.18.1. 
+ +* Fixed a race condition where user-supplied NVIDIA kernel module parameters were sometimes not being applied by the driver daemonset. + For more information, refer to [PR #1939](https://github.com/NVIDIA/gpu-operator/pull/1939). + +* Fixed a bug where driver images were being incorrectly assigned in multi-nodepool clusters. + For more information, refer to [Issue #1622](https://github.com/NVIDIA/gpu-operator/issues/1622). +* Fixed a bug where the GPU Operator Helm chart template was not assigning the correct namespace to resources it created. +* Fixed a bug where the k8s-driver-manager would wait indefinitely when MOFED is enabled and `USE_HOST_MOFED` is set to true despite the MOFED being pre-installed on the host. + +### Known Issues + +* When deploying the GPU Operator on systems with SELinux in enforcing mode, the MIG Manager does not get scheduled on GPU nodes. + This happens because the GPU Feature Discovery pod has insufficient permissions on Node Feature Discovery's feature-file drop-in directory, so it cannot add the label that indicates a MIG-capable GPU is present. + To work around this issue, configure NVIDIA GPU Feature Discovery to use the Node Feature API instead of feature files in ClusterPolicy: + + ```yaml + gfd: + env: + - name: USE_NODE_FEATURE_API + value: "true" + ``` + +## 25.10.0 + +### New Features + +* The NVIDIA GPU Operator now supports government ready components for NVIDIA AI Enterprise customers. + Government ready is NVIDIA's designation for software that meets applicable security requirements for deployment in your FedRAMP High or equivalent sovereign use case. For more information on NVIDIA's government ready support, refer to the install-gpu-operator-gov-ready deployment guide or the [AI Software for Regulated Environments White Paper](https://docs.nvidia.com/ai-enterprise/planning-resource/ai-software-regulated-environments-white-paper/latest/index.html). 
+ +* Updated software component versions: + + - NVIDIA Driver Manager for Kubernetes v0.9.0 + - NVIDIA Container Toolkit v1.18.0 + - NVIDIA DCGM v4.4.1 + - NVIDIA DCGM Exporter v4.4.1-4.6.0 + - Node Feature Discovery v0.18.2 + - NVIDIA GDS Driver v2.26.6 + - NVIDIA Kubernetes Device Plugin v0.18.0 + - NVIDIA MIG Manager for Kubernetes v0.13.0 + - NVIDIA vGPU Device Manager v0.4.1 + +* Added support for these NVIDIA Data Center GPU Driver versions: + + - 580.95.05 (default) + - 570.195.03 + - 535.274.02 + +* Container Device Interface (CDI) is now enabled by default when installing or upgrading (via helm) the GPU Operator to 25.10.0. + The `cdi.enabled` field in the ClusterPolicy is now set to `true` by default. + The `cdi.default` field is now deprecated and will be ignored. + + - When `cdi.enabled` is `true` the GPU Operator now leverages CDI support in container + runtimes, such as containerd and cri-o, for injecting GPU support into workload containers. + This differs from prior releases where CDI support in container runtimes was not used, and + instead, an `nvidia` runtime class configured in CDI mode was used. + - When CDI is enabled, no configuration changes are required for standard workloads using GPU allocation through the Device Plugin. Setting `runtimeClassName` is not required for standard workloads. For workloads that already have `runtimeClassName: nvidia` set in their pod spec YAML, no change is necessary. + - GPU Management Containers that use the `NVIDIA_VISIBLE_DEVICES` environment variable to get GPU access, bypassing GPU allocation via the Device Plugin or DRA Driver for GPUs, must set `runtimeClassName: nvidia` in the pod specification. It's recommended that `NVIDIA_VISIBLE_DEVICES` only be used by GPU Management Containers. A GPU Management Container is a container that requires access to all GPUs without them being allocated by Kubernetes. Examples include monitoring agents and device plugins. 
+ - For OpenShift users upgrading to v25.10.0, we recommend updating the `cdi.enabled` + field in ClusterPolicy to `true` post-upgrade. This field will not automatically be + updated to `true` since the Operator Lifecycle Manager (OLM) does not mutate custom + resources on operator upgrades. + +* When using NVIDIA vGPU with KubeVirt / OpenShift Virtualization, on GPUs that support MIG, you now have the option to select MIG-backed vGPU instances instead of time-sliced vGPU instances. + To select a MIG-backed vGPU profile, label the node with the name of the MIG-backed vGPU profile. + +* Added support for NVIDIA Network Operator 25.7.0 integration. + Refer to Support for GPUDirect RDMA and Support for GPUDirect Storage. + +* Added support for Mirantis k0s. + +* Added support for Red Hat OpenShift GPU dashboard integration. + +* Added support for Red Hat OpenShift Container Platform 4.20. + +* Added support for Red Hat OpenShift with HGX GB200 NVL72. + +* Added support for Kubernetes v1.34. + +* Added support for NVIDIA HGX B300 and NVIDIA HGX GB300 NVL72. + +* Added support for new MIG profiles with NVIDIA HGX B300. + + * Supports these profiles: + + * `1g.34gb` + * `1g.34gb+me` + * `1g.67gb` + * `2g.67gb` + * `3g.135gb` + * `4g.135gb` + * `7g.269gb` + + * Added an `all-balanced` profile that creates the following GPU instances: + + * `1g.34gb` × 2 + * `2g.67gb` × 1 + * `3g.135gb` × 1 + +* Added support for new MIG profiles with NVIDIA HGX GB300 NVL72. + + * Supports these profiles: + + * `1g.35gb` + * `1g.35gb+me` + * `1g.70gb` + * `2g.70gb` + * `3g.139gb` + * `4g.139gb` + * `7g.278gb` + + * Added an `all-balanced` profile that creates the following GPU instances: + + * `1g.35gb` × 2 + * `2g.70gb` × 1 + * `3g.139gb` × 1 + +### Improvements + +* The GPU Operator now configures containerd and cri-o to use drop-in files for container runtime config overrides by default. 
+ As a consequence of this change, some of the install procedures for Kubernetes distributions + that use custom containerd installations have changed. + + - The install procedure for MicroK8s has changed. Refer to the latest MicroK8s install procedure. + +* Hardened the GPU Operator container image by using a distroless image as a base image. + +* Validator for NVIDIA GPU Operator is now included as part of the GPU Operator container image. + It is no longer a separate image. + +* The GPU Operator now supports passing the vGPU licensing token as a secret. + It is recommended that you migrate to using secrets instead of a configMap for improved security. + +* Enhanced the driver pod to allow resource requests and limits to be configurable for all containers in the driver pod. + +* Added support for specifying hostPID via the GPU Operator Helm charts. + +### Fixed Issues + +* Fixed an issue where the vGPU Manager pod was terminated before it finished disabling VFs on all GPUs. + The terminationGracePeriodSeconds is now set to 120 seconds to ensure the vGPU Manager has enough time to finish its cleanup logic when the pod is terminated. + +* Added GDRCopy validation to the validator daemonset. When GDRCopy is enabled, this ensures that the GDRCopy driver is loaded before the k8s-device-plugin starts up. + +* Added required permissions when GPU Feature Discovery is configured to use the Node Feature API instead of feature files. + +### Known Issues + +* When using cri-o as the container runtime, several of the GPU Operator pods may be stuck in the `Init:RunContainerError` or `Init:CreateContainerError` state during installation of GPU Operator, upgrade of GPU Operator, or upgrade of the GPU driver daemonset. + The pods may be in this state for several minutes and restart several times. + The pods will recover from this state as soon as the container toolkit pod starts running.
+ +* NVIDIA Container Toolkit 1.18.0 will overwrite the `imports` field in the top-level containerd configuration file, so any previously imported paths will be lost. + +* When using MIG-backed vGPU on the RTX Pro 6000 Blackwell Server Edition, the vgpu-device-manager will fail to configure nodes with the default vgpu-device-manager configuration. + To work around this, create a custom ConfigMap that adds the GFX suffix to the vGPU profile name. + All of the MIG-backed vGPU profiles are only supported on MIG instances created with the `+gfx` attribute. + Refer to the following example: + + ```yaml + version: v1 + vgpu-configs: + DC-1-2Q: + - devices: all + vgpu-devices: + DC-1-2QGFX: 48 + ``` + + Create the ConfigMap, then update the ClusterPolicy with the name of the configMap in the `vgpuDeviceManager.config.name`, and restart the vgpu-device-manager pod. + +- When using GKE 1.33+, there is a known issue where NVIDIA Container Toolkit will misconfigure the containerd `config.toml` file and prevent GPU Operator containers from starting up correctly. + To resolve this issue, set the `RUNTIME_CONFIG_SOURCE=file` environment variable in the toolkit container. 
+ You can set this environment variable by setting the below in the ClusterPolicy CR: + + ```yaml + toolkit: + env: + - name: RUNTIME_CONFIG_SOURCE + value: "file" + ``` + +## 25.3.4 + +### New Features + +* Supports these NVIDIA Data Center GPU Driver versions: + + - 580.82.07 (default) + +* Added support for additional features: + + - RTX Pro 6000 Blackwell Server Edition + + - MIG profiles support + - KubeVirt and OpenShift Virtualization: VM with GPU passthrough (Ubuntu 22.04 only) + - KubeVirt and OpenShift Virtualization: VM with time-slice vGPU (Ubuntu 22.04 only) + + - RTX Pro 6000D + + - KubeVirt and OpenShift Virtualization: VM with GPU passthrough (Ubuntu 22.04 only) + +### Fixed Issues + +* Fixed an issue where user-supplied environment variables configured in ClusterPolicy were not getting set in the rendered DaemonSet. + User-supplied environment variables now take precedence over environment variables set by the ClusterPolicy controller. + +## 25.3.3 + +### Fixed Issues + +* Fixed an issue where the GPU Operator failed to render the nvidia-container-toolkit DaemonSet correctly when a custom value for `CONTAINERD_SOCKET` was provided as input. + Specifically, the hostPath volumes were not included in the DaemonSet. + Refer to GitHub issue https://github.com/NVIDIA/gpu-operator/issues/1694 for more details. + +## 25.3.2 + +### New Features + +* Updated software component versions: + + - NVIDIA Kubernetes Device Plugin/NVIDIA GPU Feature Discovery v0.17.3 + - NVIDIA MIG Manager for Kubernetes v0.12.2 + +* Supports these NVIDIA Data Center GPU Driver versions: + + - 580.65.06 (recommended) + - 570.172.08 (default) + - 535.261.03 + +### Known Issues + +* Starting with version **580.65.06**, the driver container has **Coherent Driver Memory Management (CDMM)** enabled by default to support **GB200** on Kubernetes. 
+ For more information about CDMM, refer to the [release notes](https://docs.nvidia.com/datacenter/tesla/tesla-release-notes-580-65-06/index.html#hardware-software-support). + + **Note:** + + Currently, CDMM is not compatible with the **Multi-Instance GPUs (MIG)** sharing. + CDMM is also not compatible with **GPU Direct Storage**. + CDMM support for these features is planned for future driver updates. + However, these limitations will remain in place until a future driver update removes them. + CDMM enablement applies only to **Grace-based systems** such as **GH200** and **GB200** and is ignored on other GPU platforms. + NVIDIA strongly recommends keeping CDMM enabled with Kubernetes on supported systems to prevent memory over-reporting and uncontrolled GPU memory access. + +* For drivers 570.124.06, 570.133.20, 570.148.08, and 570.158.01, + GPU workloads cannot be scheduled on nodes that have a mix of MIG slices and full GPUs. + This manifests as GPU pods getting stuck indefinitely in the `Pending` state. + NVIDIA recommends that you upgrade the driver to version 570.172.08 to avoid this issue. + For more detailed information, see GitHub issue https://github.com/NVIDIA/gpu-operator/issues/1361. + +* Configuring the Operator to enable CDI is not supported on Rancher Kubernetes Engine 2 (RKE2). + +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. + +### Fixed Issues + +* Fixed security vulnerabilities in NVIDIA Container Toolkit and related components. + This release addresses CVE-2025-23266 (Critical) and CVE-2025-23267 (High) that could allow + arbitrary code execution and link following attacks in container environments. + For complete details, refer to the [NVIDIA Security Bulletin](https://nvidia.custhelp.com/app/answers/detail/a_id/5659). 
+ +## 25.3.1 + +### New Features + +* Includes these software component versions: + + - NVIDIA Container Toolkit version v1.17.8 + - NVIDIA DCGM v4.2.3 + - NVIDIA DCGM Exporter v4.2.3-4.1.3 + - NVIDIA Kubernetes Device Plugin v0.17.2 + - Node Feature Discovery v0.17.3 + - NVIDIA GDRCopy Driver v2.5.0 + +* Supports these NVIDIA Data Center GPU Driver versions: + + - 580.65.06 (recommended) + - 570.172.08 (default) + - 570.148.08 + - 570.133.20 + - 550.163.01 + - 535.261.03 + - 535.247.01 + +* Added support for Red Hat Enterprise Linux 9. + Non-precompiled driver containers for Red Hat Enterprise Linux 9.2, 9.4, 9.5, and 9.6 versions are available for x86 based platforms only. + They are not available for ARM based systems. + +* Added support for Kubernetes v1.33. + +* Added support for setting the internalTrafficPolicy for the DCGM Exporter service. + You can configure this in the Helm chart value by setting `dcgmexporter.service.internalTrafficPolicy` to `Local` or `Cluster` (default). + Choose Local if you want to route internal traffic within the node only. + +### Known Issues + +* For drivers 570.124.06, 570.133.20, 570.148.08, and 570.158.01, + GPU workloads cannot be scheduled on nodes that have a mix of MIG slices and full GPUs. + This manifests as GPU pods getting stuck indefinitely in the `Pending` state. + NVIDIA recommends that you upgrade the driver to version 570.172.08 to avoid this issue. + For more detailed information, see GitHub issue https://github.com/NVIDIA/gpu-operator/issues/1361. + +* GPU Operator in CDI mode is not operational with RKE2. + +### Fixed Issues + +* Fixed an issue where the NVIDIADriver controller may enter an endless loop of creating and deleting a DaemonSet. + This could occur when the NVIDIADriver DaemonSet does not tolerate a taint present on all nodes matching its configured nodeSelector, or when none of the DaemonSet pods have been scheduled yet. 
+ Refer to GitHub [pull request #1416](https://github.com/NVIDIA/gpu-operator/pull/1416) for more details. + +## 25.3.0 + +### New Features + +* Includes these software component versions: + + - NVIDIA Container Toolkit version v1.17.5 + - NVIDIA Driver Manager for Kubernetes v0.8.0 + - NVIDIA Kubernetes Device Plugin v0.17.1 + - NVIDIA DCGM Exporter v4.1.1-4.0.4 + - NVIDIA DCGM v4.1.1-2 + - Node Feature Discovery v0.17.2 + - NVIDIA GPU Feature Discovery for Kubernetes v0.17.1 + - NVIDIA MIG Manager for Kubernetes v0.12.1 + - NVIDIA KubeVirt GPU Device Plugin v1.3.1 + - NVIDIA vGPU Device Manager v0.3.0 + - NVIDIA Kata Manager for Kubernetes v0.2.3 + - NVIDIA GDRCopy Driver v2.4.4 + +* Added support for the NVIDIA GPU DRA Driver v25.3.0 component (coming soon) which enables Multi-Node NVLink through Kubernetes Dynamic Resource Allocation (DRA) and IMEX support. + + This component can be installed alongside the GPU Operator. + It is supported on Kubernetes v1.32 clusters, running on NVIDIA HGX GB200 NVL, and with CDI enabled on your GPU Operator. + +* Transitioned to installing the open kernel modules by default starting with R570 driver containers. + +* Added a new parameter, `kernelModuleType`, to the ClusterPolicy and NVIDIADriver APIs which specifies how the GPU Operator and driver containers will choose kernel modules to use. + + Valid values include: + + * `auto`: Default and recommended option. `auto` means that the recommended kernel module type (open or proprietary) is chosen based on the GPU devices on the host and the driver branch used. + * `open`: Use the NVIDIA Open GPU kernel module driver. + * `proprietary`: Use the NVIDIA Proprietary GPU kernel module driver. + + Currently, `auto` is only supported with the 570.86.15 and 570.124.06 or later driver containers. + 550 and 535 branch drivers do not yet support this mode. 
+ + In previous versions, the `useOpenKernelModules` field specified the driver containers to install the NVIDIA Open GPU kernel module driver. + This field is now deprecated and will be removed in a future release. + If you were using the `useOpenKernelModules` field, NVIDIA recommends that you update your configuration to use the `kernelModuleType` field instead. + +* Added support for Ubuntu 24.04 LTS. + +* Added support for NVIDIA HGX GB200 NVL and NVIDIA HGX B200. + Note that HGX B200 requires a driver container version of 570.133.20 or later. + +* Added support for the NVIDIA Data Center GPU Driver version 570.124.06. + +* Added support for KubeVirt and OpenShift Virtualization with vGPU v18 on H200NVL. + +* Added support for NVIDIA Network Operator v25.1.0. + Refer to Support for GPUDirect RDMA and Support for GPUDirect Storage. + +* Added support for OpenShift v4.18. + +* Added support for Containerd v2.0. + +* Added support for Kubernetes v1.32. + Note that the minimum supported Kubernetes version has been updated to v1.29. + +* Added support for new MIG profiles with HGX B200. + + * Supports these profiles: + + * `1g.23gb` + * `1g.23gb+me` + * `1g.45gb` + * `2g.45gb` + * `3g.90gb` + * `4g.90gb` + * `7g.180gb` + + * Added an `all-balanced` profile that creates the following GPU instances: + + * `1g.23gb` × 2 + * `2g.45gb` × 1 + * `3g.90gb` × 1 + +* Added support for new MIG profiles with HGX GB200. + + * Supports these profiles: + + * `1g.24gb` + * `1g.24gb+me` + * `1g.47gb` + * `2g.47gb` + * `3g.95gb` + * `4g.95gb` + * `7g.189gb` + + * Added an `all-balanced` profile that creates the following GPU instances: + + * `1g.24gb` × 2 + * `2g.47gb` × 1 + * `3g.95gb` × 1 + +### Improvements + +* Improved security by removing unnecessary permissions in the GPU Operator ClusterRole. + +* Improved GPU Operator metrics to include an `operatorMetricsNamespace` field that sets the metrics namespace to `gpu_operator`.
+ +* Improved error handling in Driver Manager for Kubernetes by adding pod watch permissions. + +### Fixed Issues + +* Removed the default liveness probe from the `nvidia-fs-ctr` and `nvidia-gdrcopy-ctr` containers of the GPU driver daemonset. + Long response times of the `lsmod` commands were causing timeout errors in the probe and unnecessary restarts of the container, resulting in the DaemonSet being in a bad state. + +* Fixed an issue where the GPU Operator failed to create a valid DaemonSet name on OpenShift Container Platform when using a 64K kernel page size. + Refer to GitHub [issue #1207](https://github.com/NVIDIA/gpu-operator/issues/1207) for more details. + +* Removed the deprecated `operator.defaultRuntime` parameter. + +## 24.9.2 + +### New Features + +* Added support for the NVIDIA Data Center GPU Driver version 570.86.15. +* The default driver in this version is now 550.144.03. + Refer to the GPU Operator Component Matrix + on the platform support page for more details on supported drivers. + +* Added support for NVIDIA Container Toolkit 1.17.4. This version includes updates for [NVIDIA CVE-2025-23359](https://nvidia.custhelp.com/app/answers/detail/a_id/5616). + + To view any published security bulletins for NVIDIA products, refer to the NVIDIA product security page at https://www.nvidia.com/en-us/security/. + + For more information regarding NVIDIA security vulnerability remediation policies, refer to https://www.nvidia.com/en-us/security/psirt-policies/. + +## 24.9.1 + +### New Features + +* Added support for the NVIDIA Data Center GPU Driver versions 550.127.08 and 535.216.03. + Refer to the GPU Operator Component Matrix + on the platform support page. + +* Added support for NVIDIA Container Toolkit 1.17.3.
+ This version includes updates for: + + * [NVIDIA CVE-2024-0135](https://nvidia.custhelp.com/app/answers/detail/a_id/5599) + * [NVIDIA CVE-2024-0136](https://nvidia.custhelp.com/app/answers/detail/a_id/5599) + * [NVIDIA CVE-2024-0137](https://nvidia.custhelp.com/app/answers/detail/a_id/5599) + + To view any published security bulletins for NVIDIA products, refer to the NVIDIA product security page at https://www.nvidia.com/en-us/security/. + + For more information regarding NVIDIA security vulnerability remediation policies, refer to https://www.nvidia.com/en-us/security/psirt-policies/. + +* Includes these software component versions: + + - NVIDIA Container Toolkit v1.17.3 + - NVIDIA DCGM v3.3.9-1 + - NVIDIA DCGM Exporter v3.3.9-3.6.1 + +* Added support for NVIDIA Network Operator v24.10.0. + Refer to Support for GPUDirect RDMA and Support for GPUDirect Storage. + +* Added an `all-balanced` MIG profile for H200 NVL which creates the following GPU instances: + + - `1g.18gb` × 2 + - `2g.35gb` × 1 + - `3g.71gb` × 1 + +### Fixed Issues + +* Fixed an issue where NVIDIA Container Toolkit would fail to start on Rancher RKE2, K3s, and Canonical MicroK8s. + Refer to GitHub [issue #1109](https://github.com/NVIDIA/gpu-operator/issues/1109) for more details. + +* Fixed an issue where events were not being generated by the NVIDIA driver upgrade controller. + Refer to GitHub [issue #1101](https://github.com/NVIDIA/gpu-operator/issues/1101) for more details. + +## 24.9.0 + +### New Features + +* Added support for NVIDIA Container Toolkit 1.17.0. + This version includes updates for the following CVEs: + + * [NVIDIA CVE-2024-0134](https://nvidia.custhelp.com/app/answers/detail/a_id/5585) + + To view any published security bulletins for NVIDIA products, refer to the NVIDIA product security page at https://www.nvidia.com/en-us/security/.
+ + For more information regarding NVIDIA security vulnerability remediation policies, refer to https://www.nvidia.com/en-us/security/psirt-policies/. + + For Rancher RKE2 and K3s, refer to the v24.9.0-known-limitations. + +* Added support for the NVIDIA Data Center GPU Driver version 550.127.05. + Refer to the GPU Operator Component Matrix + on the platform support page. + +* Includes these software component versions: + + - NVIDIA Container Toolkit v1.17.0 + - NVIDIA Driver Manager for Kubernetes v0.7.0 + - NVIDIA Kubernetes Device Plugin v0.17.0 + - NVIDIA DCGM Exporter v3.3.8-3.6.0 + - NVIDIA DCGM v3.3.8-1 + - Node Feature Discovery v0.16.6 + - NVIDIA GPU Feature Discovery for Kubernetes v0.17.0 + - NVIDIA MIG Manager for Kubernetes v0.10.0 + - NVIDIA KubeVirt GPU Device Plugin v1.2.10 + - NVIDIA vGPU Device Manager v0.2.8 + - NVIDIA GDS Driver v2.20.5 + - NVIDIA Kata Manager for Kubernetes v0.2.2 + +* Added support for NVIDIA Network Operator v24.7.0. + Refer to Support for GPUDirect RDMA and Support for GPUDirect Storage. + +* Added generally available (GA) support for precompiled driver containers. + This feature was previously a technical preview feature. + For more information, refer to precompiled-drivers. + +* Enabled automatic upgrade of Operator and Node Feature Discovery CRDs by default. + In previous releases, the `operator.upgradeCRD` field was `false`. + This release sets the default value to `true` and automatically runs a Helm hook when you upgrade the Operator. + For more information, refer to Option 2: Automatically Upgrading CRDs Using a Helm Hook. + +* Added support for new MIG profiles with GH200 NVL2 144GB HBM3e. 
+ + * Supports these profiles: + + * `1g.18gb` + * `1g.18gb+me` + * `1g.36gb` + * `2g.36gb` + * `3g.72gb` + * `4g.72gb` + * `7g.144gb` + + * Added an `all-balanced` profile that creates the following GPU instances: + + * `1g.18gb` × 2 + * `2g.36gb` × 1 + * `3g.72gb` × 1 + +* Added support for KubeVirt and OpenShift Virtualization with vGPU v17.4 for A30, A100, and H100 GPUs. + These GPUs are supported with an NVIDIA AI Enterprise subscription only and require building the NVIDIA vGPU Manager container image with the `aie` .run file. + +* Revised roles and role-based access controls for the Operator. + The Operator is revised to use Kubernetes controller-runtime caching that is limited to the Operator namespace and the OpenShift namespace, `openshift`. + The OpenShift namespace is required for the Operator to monitor for changes to image stream objects. + Limiting caching to specific namespaces enables the Operator to use the namespace-scoped role, `gpu-operator`, instead of a cluster role for monitoring changes to resources in the Operator namespace. + This change follows the principle of least privilege and improves the security posture of the Operator. + +* Enhanced the GPU Driver Container to set the `NODE_NAME` environment variable from the node host name and the `NODE_IP` environment variable from the node host IP address. + +### Fixed Issues + +* Fixed an issue with the cleanup CRD and upgrade CRD jobs that are triggered by Helm hooks. + On clusters that have nodes with taints, even when `operator.tolerations` includes tolerations, the jobs were not scheduled. + In this release, the tolerations that you specify for the Operator are applied to the jobs. + For more information about the hooks, refer to Option 2: Automatically Upgrading CRDs Using a Helm Hook. + +* Fixed an issue with configuring NVIDIA Container Toolkit to use CDI on nodes that use CRI-O.
+ Previously, the toolkit could configure the `runc` handler with the `nvidia` runtime handler even if `runc` was not the default runtime and cause CRI-O to crash. + In this release, the toolkit determines the default runtime by running `crio status config` and configures that runtime with the `nvidia` runtime handler. + +### Known Limitations + +* On Rancher RKE2 and K3s, NVIDIA Container Toolkit v1.17.0 fails to start. + The toolkit attempts to run `containerd config dump` to determine the container runtime configuration on the host. + On these platforms, the `containerd` executable is not on the PATH and results in an error. + + NVIDIA recommends installing v1.17.1 of the toolkit when you install or upgrade the Operator. + You can specify the `--set toolkit.version=v1.17.1-ubuntu20.04` or `v1.17.1-ubi8` argument to Helm. + +## 24.6.2 + +### New Features + +**This release provides critical security updates and is recommended for all users.** + +Added support for NVIDIA Container Toolkit 1.16.2. +This version includes updates for the following CVEs: + +* [NVIDIA CVE-2024-0132](https://nvidia.custhelp.com/app/answers/detail/a_id/5582) +* [NVIDIA CVE-2024-0133](https://nvidia.custhelp.com/app/answers/detail/a_id/5582) + +To view any published security bulletins for NVIDIA products, refer to the NVIDIA product security page at https://www.nvidia.com/en-us/security/. + +For more information regarding NVIDIA security vulnerability remediation policies, refer to https://www.nvidia.com/en-us/security/psirt-policies/. + +## 24.6.1 + +### New Features + +* Includes these software component versions: + + - NVIDIA Kubernetes Device Plugin v0.16.2 + - NVIDIA GPU Feature Discovery for Kubernetes v0.16.2 + + Refer to the GPU Operator Component Matrix + on the platform support page. + +### Fixed Issues + +* Fixed an issue with role-based access controls that prevented a service account from accessing config maps. 
+ Refer to GitHub [issue #883](https://github.com/NVIDIA/gpu-operator/issues/883) for more details. +* Fixed an issue with role-based access controls in the GPU Operator validator that prevented retrieving NVIDIA Driver daemon set information. + On OpenShift Container Platform, this issue triggered `GPUOperatorNodeDeploymentDriverFailed` alerts. + Refer to GitHub [issue #892](https://github.com/NVIDIA/gpu-operator/issues/892) for more details. + +## 24.6.0 + +### New Features + +* Added support for the NVIDIA Data Center GPU Driver version 550.90.07. + Refer to the GPU Operator Component Matrix + on the platform support page. + +* Includes these software component versions: + + - NVIDIA Container Toolkit v1.16.1 + - NVIDIA Driver Manager for Kubernetes v0.6.10 + - NVIDIA Kubernetes Device Plugin v0.16.1 + - NVIDIA DCGM Exporter v3.3.7-3.5.0 + - NVIDIA DCGM v3.3.7-1 + - Node Feature Discovery v0.16.3 + - NVIDIA GPU Feature Discovery for Kubernetes v0.16.1 + - NVIDIA MIG Manager for Kubernetes v0.8.0 + - NVIDIA KubeVirt GPU Device Plugin v1.2.9 + - NVIDIA vGPU Device Manager v0.2.7 + - NVIDIA GDS Driver v2.17.5 + - NVIDIA Kata Manager for Kubernetes v0.2.1 + - NVIDIA GDRCopy Driver v2.4.1-1 + +* Added support for NVIDIA Network Operator v24.4.0. + Refer to Support for GPUDirect RDMA and Support for GPUDirect Storage. + +* Added support for using the Operator with Container-Optimized OS on Google Kubernetes Engine (GKE). + The process uses the Google driver installer to manage the NVIDIA GPU Driver. + For Ubuntu on GKE, you can use the Google driver installer or continue to use the NVIDIA Driver Manager as with previous releases. + Refer to google-gke for more information. + +* Added support for precompiled driver containers with Open Kernel module drivers. + Specify `--set driver.useOpenKernelModules=true --set driver.usePrecompiled=true --set driver.version=<driver-branch>` + when you install or upgrade the Operator. + Support remains limited to Ubuntu 22.04.
+ Refer to precompiled-drivers for more information. + + NVIDIA began publishing driver containers with this support on July 15, 2024. + The tags for the first containers with this support are as follows: + + * `<driver-branch>-5.15.0-116-generic-ubuntu22.04` + * `<driver-branch>-5.15.0-1060-nvidia-ubuntu22.04` + * `<driver-branch>-5.15.0-1063-oracle-ubuntu22.04` + * `<driver-branch>-5.15.0-1068-azure-ubuntu22.04` + * `<driver-branch>-5.15.0-1065-aws-ubuntu22.04` + + Precompiled driver containers built after July 15 include support for the Open Kernel module drivers. + +* Added support for new MIG profiles. + + * For H200 devices: + + * `1g.18gb` + * `1g.18gb+me` + * `1g.35gb` + * `2g.35gb` + * `3g.71gb` + * `4g.71gb` + * `7g.141gb` + + * Added an `all-balanced` profile for H200 devices that creates the following GPU instances: + + * `1g.12gb` × 2 + * `2g.24gb` × 1 + * `3g.48gb` × 1 + +* Added support for creating a config map with custom MIG profiles during installation or upgrade with Helm. + Refer to Example: Custom MIG Configuration During Installation for more information. + +### Fixed Issues + +* Role-based access controls for the following components were reviewed and revised to use least-required privileges: + + * GPU Operator + * Operator Validator + * MIG Manager + * GPU Driver Manager + * GPU Feature Discovery + * Kubernetes Device Plugin + * KubeVirt Device Plugin + * vGPU Host Manager + + In previous releases, the permissions were more permissive than necessary. + +* Fixed an issue with Node Feature Discovery (NFD). + When an NFD pod was deleted or restarted, all NFD node labels were removed from the node and GPU Operator operands were restarted. + The v0.16.2 release of NFD fixes the issue. + Refer to GitHub [issue #782](https://github.com/NVIDIA/gpu-operator/issues/782) for more details. + +* Fixed an issue with NVIDIA vGPU Manager not working correctly on nodes with GPUs that require Open Kernel module drivers and GPU System Processor (GSP) firmware.
+ Refer to GitHub [issue #761](https://github.com/NVIDIA/gpu-operator/issues/761) for more details. + +* DCGM is revised to use a cluster IP and a service with the internal traffic policy set to `Local`. + In previous releases, DCGM was a host-networked pod. + The `dcgm.hostPort` field of the NVIDIA cluster policy resource is now deprecated. + +* Fixed an issue that prevented enabling GDRCopy and additional volume mounts with the NVIDIA Driver custom resource. + Previously, the driver daemon set did not update with the change and the Operator logs included an error message. + Refer to GitHub [issue #713](https://github.com/NVIDIA/gpu-operator/issues/713) for more details. + +* Fixed an issue with deleting GPU Driver daemon sets due to having misscheduled pods rather than zero pods. + Previously, if a node had an untolerated taint such as `node.kubernetes.io/unreachable:NoSchedule`, + the Operator could repeatedly delete and recreate the driver daemon sets. + Refer to GitHub [issue #715](https://github.com/NVIDIA/gpu-operator/issues/715) for more details. + +* Fixed an issue with reporting the correct GPU capacity and allocatable resources from the KubeVirt GPU Device Plugin. + Previously, if a GPU became unavailable, the reported GPU capacity and allocatable resources remained unchanged. + Refer to GitHub [issue #97](https://github.com/NVIDIA/kubevirt-gpu-device-plugin/issues/97) for more details. + +### Known Limitations + +* The `1g.12gb` MIG profile does not operate as expected on the NVIDIA GH200 GPU when the MIG configuration is set to `all-balanced`. +* The GPU Driver container does not run on hosts that have a custom kernel with the SEV-SNP CPU feature + because of the missing `kernel-headers` package within the container. + With a custom kernel, NVIDIA recommends pre-installing the NVIDIA drivers on the host if you want to + run traditional container workloads with NVIDIA GPUs.
+* If you cordon a node while the GPU driver upgrade process is already in progress, + the Operator uncordons the node and upgrades the driver on the node. + You can determine if an upgrade is in progress by checking the node label + `nvidia.com/gpu-driver-upgrade-state != upgrade-done`. +* NVIDIA vGPU is incompatible with KubeVirt v0.58.0, v0.58.1, and v0.59.0, as well + as OpenShift Virtualization 4.12.0 through 4.12.2. +* Using NVIDIA vGPU on bare metal nodes and NVSwitch is not supported. +* All worker nodes in the Kubernetes cluster must run the same operating system version to use the NVIDIA GPU Driver container. + Alternatively, if you pre-install the NVIDIA GPU Driver on the nodes, then you can run different operating systems. + The technical preview feature that provides gpu-driver-configuration is also an alternative. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. + The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. + The technical preview feature that provides gpu-driver-configuration is an alternative. +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. +* When using RHEL 8 with containerd as the runtime and SELinux is enabled (either in permissive or enforcing mode) + at the host level, containerd must also be configured for SELinux, such as setting the `enable_selinux=true` + configuration option. + Additionally, network-restricted environments are not supported. + +## 24.3.0 + +### New Features + +* Added support to enable NVIDIA GDRCopy v2.4.1.
+ + When you enable support for GDRCopy, the Operator configures the GDRCopy Driver container image + as a sidecar container in the GPU driver pod. + The sidecar container compiles and installs the gdrdrv Linux kernel module. + This feature is supported on Ubuntu 22.04 and RHCOS operating systems and on X86_64 and ARM64 architectures. + + Refer to Chart Customization Options for more information about the `driver.gdrcopy` field. + +* Added support for the NVIDIA Data Center GPU Driver version 550.54.15. + Refer to the GPU Operator Component Matrix + on the platform support page. + +* Includes these software component versions: + + - NVIDIA Container Toolkit version v1.15.0 + - NVIDIA MIG Manager version v0.7.0 + - NVIDIA Driver Manager for K8s v0.6.8 + - NVIDIA Kubernetes Device Plugin v0.15.0 + - DCGM 3.3.5-1 + - DCGM Exporter 3.3.5-3.4.1 + - Node Feature Discovery v0.15.4 + - NVIDIA GPU Feature Discovery for Kubernetes v0.15.0 + - NVIDIA KubeVirt GPU Device Plugin v1.2.7 + - NVIDIA vGPU Device Manager v0.2.6 + - NVIDIA Kata Manager for Kubernetes v0.2.0 + +* Added support for Kubernetes v1.29 and v1.30. + Refer to Supported Operating Systems and Kubernetes Platforms. + +* Added support for NVIDIA GH200 Grace Hopper Superchip as a generally available feature. + Refer to supported nvidia gpus and systems. + + The following prerequisites are required for using the Operator with GH200: + + - Run Ubuntu 22.04, the 550.54.15 GPU driver, and an NVIDIA Linux kernel, such as one provided with a `linux-nvidia-` package. + - Add `init_on_alloc=0` and `memhp_default_state=online_movable` as Linux kernel boot parameters. + - Run the NVIDIA Open GPU Kernel module driver. + +* Added support for NVIDIA Network Operator v24.1.1. + Refer to Support for GPUDirect RDMA and Support for GPUDirect Storage. + +* Added support for the NVIDIA IGX Orin platform when configured to use the discrete GPU. + Refer to gpu-operator-arm-platforms. 
+ +* Removed support for Kubernetes pod security policy (PSP). + PSP was deprecated in the Kubernetes v1.21 release and removed in v1.25. + +### Fixed Issues + +* Installation on Red Hat OpenShift Container Platform 4.15 no longer requires a workaround related to + secrets and storage for the integrated image registry. +* Previously, the vGPU Device Manager would panic if no NVIDIA devices were found in `/sys/class/mdev_bus`. +* Previously, the MOFED validation init container would run for the GPU driver pod. + In this release, the init container no longer runs because the MOFED installation check is performed by the Kubernetes Driver Manager init container. +* Previously, for Red Hat OpenShift Container Platform, the GPU driver installation would fail when the Linux kernel version did not match the `/etc/os-release` file. + In this release, the kernel version is determined from the running kernel to prevent the issue. + Refer to GitHub [issue #617](https://github.com/NVIDIA/gpu-operator/issues/617) for more details. +* Previously, if the metrics for DCGM Exporter were configured in a config map and the cluster policy + specified the name of the config map as `<namespace>:<config-map>` in the `DCGM_EXPORTER_CONFIGMAP_DATA` environment variable, the exporter + pods could not read the configuration from the config map. + In this release, the role used by the exporter is granted access to read from config maps. +* Previously, under load, the Operator could fail with the message `fatal error: concurrent map read and map write`. + In this release, the Operator controller is refactored to prevent the race condition. + Refer to GitHub [issue #689](https://github.com/NVIDIA/gpu-operator/issues/689) for more details. +* Previously, if any node in the cluster was in the `NotReady` state, the GPU driver upgrade controller failed to make progress. + In this release, the upgrade library is updated and skips unhealthy nodes.
+ Refer to GitHub [issue #688](https://github.com/NVIDIA/gpu-operator/issues/688) for more details. + +### Known Limitations + +* NVIDIA vGPU Manager does not work correctly on nodes with GPUs that require Open Kernel module drivers and GPU System Processor (GSP) firmware. + The logs for vGPU Device Manager pods include lines like the following example: + + ```output + time="2024-07-23T08:50:11Z" level=fatal msg="error setting VGPU config: no parent devices found for GPU at index '1'" + time="2024-07-23T08:50:11Z" level=error msg="Failed to apply vGPU config: unable to apply config 'default': exit status 1" + time="2024-07-23T08:50:11Z" level=info msg="Setting node label: nvidia.com/vgpu.config.state=failed" + time="2024-07-23T08:50:11Z" level=info msg="Waiting for change to 'nvidia.com/vgpu.config' label" + ``` + + The output of the `kubectl exec -it nvidia-vgpu-manager-daemonset-xxxxx -n gpu-operator -- bash -c 'dmesg | grep -i nvrm'` command + resembles the following example: + + ```output + kernel: NVRM: loading NVIDIA UNIX Open Kernel Module for x86_64 550.90.05 Release Build (dvs-builder@U16-I1-N08-05-1) + kernel: NVRM: RmFetchGspRmImages: No firmware image found + kernel: NVRM: GPU 0000:ae:00.0: RmInitAdapter failed! (0x61:0x56:1697) + kernel: NVRM: GPU 0000:ae:00.0: rm_init_adapter failed, device minor number 0 + ``` + + The vGPU Manager pods do not mount the `/sys/module/firmware_class/parameters/path` and `/lib/firmware` + paths on the host and the pods fail to copy the GSP firmware files on the host. + + As a workaround, you can add the following volume mounts to the vGPU Manager daemon set, for the `nvidia-vgpu-manager-ctr` container: + + ```yaml + - name: firmware-search-path + mountPath: /sys/module/firmware_class/parameters/path + - name: nv-firmware + mountPath: /lib/firmware + ``` + + This issue is fixed in the next release of the GPU Operator. 
+* The `1g.12gb` MIG profile does not operate as expected on the NVIDIA GH200 GPU when the MIG configuration is set to `all-balanced`. +* The GPU Driver container does not run on hosts that have a custom kernel with the SEV-SNP CPU feature + because of the missing `kernel-headers` package within the container. + With a custom kernel, NVIDIA recommends pre-installing the NVIDIA drivers on the host if you want to + run traditional container workloads with NVIDIA GPUs. +* If you cordon a node while the GPU driver upgrade process is already in progress, + the Operator uncordons the node and upgrades the driver on the node. + You can determine if an upgrade is in progress by checking the node label + `nvidia.com/gpu-driver-upgrade-state != upgrade-done`. +* NVIDIA vGPU is incompatible with KubeVirt v0.58.0, v0.58.1, and v0.59.0, as well + as OpenShift Virtualization 4.12.0---4.12.2. +* Using NVIDIA vGPU on bare metal nodes and NVSwitch is not supported. +* When installing the Operator on Amazon EKS and using Kubernetes versions lower than + `1.25`, specify the `--set psp.enabled=true` Helm argument because EKS enables + pod security policy (PSP). + If you use Kubernetes version `1.25` or higher, do not specify the `psp.enabled` + argument so that the default value, `false`, is used. +* All worker nodes in the Kubernetes cluster must run the same operating system version to use the NVIDIA GPU Driver container. + Alternatively, if you pre-install the NVIDIA GPU Driver on the nodes, then you can run different operating systems. + The technical preview feature that provides gpu-driver-configuration is also an alternative. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. +* Driver Toolkit images are broken with Red Hat OpenShift version `4.11.12` and require cluster-level entitlements to be enabled + in this case for the driver installation to succeed. 
+* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. + The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. + The technical preview feature that provides gpu-driver-configuration is an alternative. +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. +* When using RHEL 8 with containerd as the runtime and SELinux is enabled (either in permissive or enforcing mode) + at the host level, containerd must also be configured for SELinux, such as setting the `enable_selinux=true` + configuration option. + Additionally, network-restricted environments are not supported. + +## 23.9.2 + +### New Features + +* Added support for the NVIDIA Data Center GPU Driver version 550.54.14. + Refer to the GPU Operator Component Matrix + on the platform support page. + +* Added support for Kubernetes v1.29. + Refer to Supported Operating Systems and Kubernetes Platforms + on the platform support page. + +* Added support for Red Hat OpenShift Container Platform 4.15. + Refer to Supported Operating Systems and Kubernetes Platforms + on the platform support page. + +* Includes these software component versions: + + - NVIDIA Data Center GPU Driver version 550.54.14 + - NVIDIA Container Toolkit version v1.14.6 + - NVIDIA Kubernetes Device Plugin version v1.14.5 + - NVIDIA MIG Manager version v0.6.0 + +* Added support for NVIDIA AI Enterprise release 5.0. + Refer to install-gpu-operator-nvaie for information about installing the Operator with a Bash script. + +### Fixed issues + +* Previously, duplicate image pull secrets were added to some daemon sets and caused an error + like the following when a node was deleted and the controller manager deleted the pods.
+ + ```output + I1031 00:09:44.553742 1 gc_controller.go:329] "PodGC is force deleting Pod" pod="gpu-operator/nvidia-driver-daemonset-k69f2" + E1031 00:09:44.556500 1 gc_controller.go:255] failed to create manager for existing fields: failed to convert new object (gpu-operator/nvidia-driver-daemonset-k69f2; /v1, Kind=Pod) to smd typed: .spec.imagePullSecrets: duplicate entries for key [name="ngc-secret"] + ``` + +* Previously, common daemon set labels, annotations, and tolerations configured in ClusterPolicy were not + also applied to the default NVIDIADriver CR instance. + Refer to GitHub [issue #665](https://github.com/NVIDIA/gpu-operator/issues/665) for more details. + +* Previously, the technical preview NVIDIA driver custom resource was failing to render the `licensing-config` + volume mount that is required for licensing a vGPU guest driver. + Refer to GitHub [issue #672](https://github.com/NVIDIA/gpu-operator/issues/672) for more details. + +* Previously, the technical preview NVIDIA driver custom resource was broken when GDS was enabled. + An OS suffix was not appended to the image path of the GDS driver container image. + Refer to GitHub [issue #608](https://github.com/NVIDIA/gpu-operator/issues/608) for more details. + +* Previously, the technical preview NVIDIA driver custom resource failed to render daemon sets + when `additionalConfig` volumes were configured that were host path volumes. This issue + prevented users from mounting entitlements on RHEL systems. + +* Previously, it was not possible to disable the CUDA workload validation pod that the `operator-validator` pod + deploys. 
You can now disable this pod by setting the following environment variable in ClusterPolicy: + + ```yaml + validator: + cuda: + env: + - name: "WITH_WORKLOAD" + value: "false" + ``` + +### Known Limitations + +* When installing on Red Hat OpenShift Container Platform 4.15 clusters that disable the integrated image registry, + secrets are no longer automatically generated and this change causes installation of the Operator to stall. + Refer to special considerations for openshift 4.15 for more information. + +* The `1g.12gb` MIG profile does not operate as expected on the NVIDIA GH200 GPU when the MIG configuration is set to `all-balanced`. +* The GPU Driver container does not run on hosts that have a custom kernel with the SEV-SNP CPU feature + because of the missing `kernel-headers` package within the container. + With a custom kernel, NVIDIA recommends pre-installing the NVIDIA drivers on the host if you want to + run traditional container workloads with NVIDIA GPUs. +* If you cordon a node while the GPU driver upgrade process is already in progress, + the Operator uncordons the node and upgrades the driver on the node. + You can determine if an upgrade is in progress by checking the node label + `nvidia.com/gpu-driver-upgrade-state != upgrade-done`. +* NVIDIA vGPU is incompatible with KubeVirt v0.58.0, v0.58.1, and v0.59.0, as well + as OpenShift Virtualization 4.12.0---4.12.2. +* Using NVIDIA vGPU on bare metal nodes and NVSwitch is not supported. +* When installing the Operator on Amazon EKS and using Kubernetes versions lower than + `1.25`, specify the `--set psp.enabled=true` Helm argument because EKS enables + pod security policy (PSP). + If you use Kubernetes version `1.25` or higher, do not specify the `psp.enabled` + argument so that the default value, `false`, is used. +* All worker nodes in the Kubernetes cluster must run the same operating system version to use the NVIDIA GPU Driver container. 
+ Alternatively, if you pre-install the NVIDIA GPU Driver on the nodes, then you can run different operating systems. + The technical preview feature that provides gpu-driver-configuration is also an alternative. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. +* Driver Toolkit images are broken with Red Hat OpenShift version `4.11.12` and require cluster-level entitlements to be enabled + in this case for the driver installation to succeed. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. + The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. + The technical preview feature that provides gpu-driver-configuration is an alternative. +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. +* When using RHEL 8 with containerd as the runtime and SELinux is enabled (either in permissive or enforcing mode) + at the host level, containerd must also be configured for SELinux, such as setting the `enable_selinux=true` + configuration option. + Additionally, network-restricted environments are not supported. + +## 23.9.1 + +### New Features + +* Added support for NVIDIA GH200 Grace Hopper Superchip as a technology preview feature. + Refer to supported nvidia gpus and systems. + + The following prerequisites are required for using the Operator with GH200: + + - Run Ubuntu 22.04 and an NVIDIA Linux kernel, such as one provided with a `linux-nvidia-` package. + - Add `init_on_alloc=0` and `memhp_default_state=online_movable` as Linux kernel boot parameters. + - Run the NVIDIA Open GPU Kernel module driver. + +* Added support for configuring the driver container to use the NVIDIA Open GPU Kernel module driver. 
+ Support is limited to installation using the runfile installer. + Support for precompiled driver containers with open kernel modules is not available. + + Clusters that use GPUDirect Storage (GDS), beginning with CUDA toolkit 12.2.2 and + the NVIDIA GPUDirect Storage kernel driver version v2.17.5, are only supported + with the open kernel modules. + + NVIDIA GH200 Grace Hopper Superchip systems are only supported with the open kernel modules. + + - Refer to gpu-operator-helm-chart-options for information about setting + `useOpenKernelModules` if you manage the driver containers with the NVIDIA cluster policy custom resource definition. + - Refer to gpu-driver-configuration for information about setting `spec.useOpenKernelModules` + if you manage the driver containers with the technology preview NVIDIA driver custom resource. + +* Includes these software component versions: + + - NVIDIA Data Center GPU Driver version 535.129.03 + - NVIDIA Driver Manager for Kubernetes v0.6.5 + - NVIDIA Kubernetes Device Plugin v1.14.3 + - NVIDIA DCGM Exporter 3.3.0-3.2.0 + - NVIDIA Data Center GPU Manager (DCGM) v3.3.0-1 + - NVIDIA KubeVirt GPU Device Plugin v1.2.4 + - NVIDIA GPUDirect Storage (GDS) Driver v2.17.5 + + This version, and newer versions of the NVIDIA GDS kernel driver, require that you use the NVIDIA open kernel modules. + Refer to the GPU Operator Component Matrix + on the platform support page. + +* Added support for NVIDIA Network Operator v23.10.0. + +### Improvements + +* The `must-gather.sh` script that is used for support is enhanced to collect logs + from NVIDIA vGPU Manager pods. + +### Fixed issues + +* Previously, the technical preview NVIDIA driver custom resource did not support adding + custom labels, annotations, or tolerations to the pods that run as part of the driver daemon set. + This limitation prevented scheduling the driver daemon set in some environments.
+ Refer to GitHub [issue #602](https://github.com/NVIDIA/gpu-operator/issues/602) for more details. + +* Previously, when you specified the `operator.upgradeCRD=true` argument to the `helm upgrade` + command, the pre-upgrade hook ran with the `gpu-operator` service account + that is added by running `helm install`. + This dependency is a known issue for Argo CD users. + Argo CD treats pre-install and pre-upgrade hooks the same as pre-sync hooks and leads to failures + because the hook depends on the `gpu-operator` service account that does not exist on an initial installation. + + Now, the Operator is enhanced to run the hook with a new service account, `gpu-operator-upgrade-crd-hook-sa`. + This fix creates the new service account, a new cluster role, and a new cluster role binding. + The update prevents failures with Argo CD. + +* Previously, when you added an NVIDIA driver custom resource with a node selector that conflicted with another + driver custom resource, the controller failed to set the error condition in the custom resource status. + The issue produced an error message like the following example: + + ```output + {"level":"error","ts":1698702848.8472972,"msg":"NVIDIADriver.nvidia.com \"<conflicting-cr-name>\" is invalid: state: Unsupported value: \"\": supported values: \"ignored\", \"ready\", \"notReady\"","controller":"nvidia-driver-\ + controller","object":{"name":"<conflicting-cr-name>"},"namespace":"","name":"<conflicting-cr-name>","reconcileID":"78d58d7b-cd94-4849-a292-391da9a0b049"} + ``` + +* Previously, the NVIDIA KubeVirt GPU Device Plugin could have a GLIBC mismatch error and produce a log + message like the following example: + + ```output + nvidia-kubevirt-gpu-device-plugin: /lib64/libc.so.6: version `GLIBC_2.32` not found (required by nvidia-kubevirt-gpu-device-plugin) + ``` + + This issue is fixed by including v1.2.4 of the plugin in this release.
+ +* Previously, on some machines and Linux kernel versions, GPU Feature Discovery was unable to determine + the machine type because the `/sys/class/dmi/id/product_name` file did not exist on the host. + Now, accessing the file is performed by mounting `/sys` instead of the fully-qualified path and + if the file does not exist, GPU Feature Discovery is able to label the node with `nvidia.com/gpu.machine=unknown`. + +* Previously, enabling GPUDirect RDMA on Red Hat OpenShift Container Platform clusters could + experience an error with the nvidia-peermem container. + The error was related to the `RHEL_VERSION` variable being unbound. + +### Known Limitations + +* The `1g.12gb` MIG profile does not operate as expected on the NVIDIA GH200 GPU when the MIG configuration is set to `all-balanced`. +* The GPU Driver container does not run on hosts that have a custom kernel with the SEV-SNP CPU feature + because of the missing `kernel-headers` package within the container. + With a custom kernel, NVIDIA recommends pre-installing the NVIDIA drivers on the host if you want to + run traditional container workloads with NVIDIA GPUs. +* If you cordon a node while the GPU driver upgrade process is already in progress, + the Operator uncordons the node and upgrades the driver on the node. + You can determine if an upgrade is in progress by checking the node label + `nvidia.com/gpu-driver-upgrade-state != upgrade-done`. +* NVIDIA vGPU is incompatible with KubeVirt v0.58.0, v0.58.1, and v0.59.0, as well + as OpenShift Virtualization 4.12.0---4.12.2. +* Using NVIDIA vGPU on bare metal nodes and NVSwitch is not supported. +* When installing the Operator on Amazon EKS and using Kubernetes versions lower than + `1.25`, specify the `--set psp.enabled=true` Helm argument because EKS enables + pod security policy (PSP). + If you use Kubernetes version `1.25` or higher, do not specify the `psp.enabled` + argument so that the default value, `false`, is used. 
+* All worker nodes in the Kubernetes cluster must run the same operating system version to use the NVIDIA GPU Driver container. + Alternatively, if you pre-install the NVIDIA GPU Driver on the nodes, then you can run different operating systems. + The technical preview feature that provides gpu-driver-configuration is also an alternative. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. +* Driver Toolkit images are broken with Red Hat OpenShift version `4.11.12` and require cluster-level entitlements to be enabled + in this case for the driver installation to succeed. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. + The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. + The technical preview feature that provides gpu-driver-configuration is an alternative. +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. +* When using RHEL 8 with containerd as the runtime and SELinux is enabled (either in permissive or enforcing mode) + at the host level, containerd must also be configured for SELinux, such as setting the `enable_selinux=true` + configuration option. + Additionally, network-restricted environments are not supported. + +## 23.9.0 + +### New Features + +* Added support for an NVIDIA driver custom resource definition that enables + running multiple GPU driver types and versions on the same cluster and adds + support for multiple operating system versions. + This feature is a technology preview. + Refer to gpu-driver-configuration for more information. + +* Added support for additional Linux kernel variants for precompiled driver containers. 
+ + - driver:535-5.15.0-xxxx-nvidia-ubuntu22.04 + - driver:535-5.15.0-xxxx-azure-ubuntu22.04 + - driver:535-5.15.0-xxxx-aws-ubuntu22.04 + + Refer to the **Tags** tab of the [NVIDIA GPU Driver](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/driver) + page in the NGC catalog to determine if a container for your kernel is built. + Refer to precompiled-drivers for information about using precompiled driver containers + and steps to build your own driver container. + +* The API for the NVIDIA cluster policy custom resource definition is enhanced to include + the current state of the cluster policy. + When you view the cluster policy with a command like `kubectl get cluster-policy`, the response + now includes a `Status.Conditions` field. + +* Includes these software component versions: + + - NVIDIA Data Center GPU Driver version 535.104.12. + - NVIDIA Driver Manager for Kubernetes v0.6.4 + - NVIDIA Container Toolkit v1.14.3 + - NVIDIA Kubernetes Device Plugin v1.14.2 + - NVIDIA DCGM Exporter 3.2.6-3.1.9 + - NVIDIA GPU Feature Discovery for Kubernetes v0.8.2 + - NVIDIA MIG Manager for Kubernetes v0.5.5 + - NVIDIA Data Center GPU Manager (DCGM) v3.2.6-1 + - NVIDIA KubeVirt GPU Device Plugin v1.2.3 + - NVIDIA vGPU Device Manager v0.2.4 + - NVIDIA Kata Manager for Kubernetes v0.1.2 + - NVIDIA Confidential Computing Manager for Kubernetes v0.1.1 + - Node Feature Discovery v0.14.2 + + Refer to the GPU Operator Component Matrix + on the platform support page. + +### Fixed issues + +* Previously, if the `RHEL_VERSION` environment variable was set for the Operator, the variable was + propagated to the driver container and used in the `--releasever` argument to the `dnf` command. + With this release, you can specify the `DNF_RELEASEVER` environment variable for the driver container + to override the value of the `--releasever` argument. 
+ +* Previously, stale node feature and node feature topology objects could remain in the Kubernetes API + server after a node is deleted from the cluster. + The upgrade to Node Feature Discovery v0.14.2 includes an enhancement to garbage collection that + ensures the objects are removed after a node is deleted. + +### Known Limitations + +* The GPU Driver container does not run on hosts that have a custom kernel with the SEV-SNP CPU feature + because of the missing `kernel-headers` package within the container. + With a custom kernel, NVIDIA recommends pre-installing the NVIDIA drivers on the host if you want to + run traditional container workloads with NVIDIA GPUs. +* If you cordon a node while the GPU driver upgrade process is already in progress, + the Operator uncordons the node and upgrades the driver on the node. + You can determine if an upgrade is in progress by checking the node label + `nvidia.com/gpu-driver-upgrade-state != upgrade-done`. +* NVIDIA vGPU is incompatible with KubeVirt v0.58.0, v0.58.1, and v0.59.0, as well + as OpenShift Virtualization 4.12.0---4.12.2. +* Using NVIDIA vGPU on bare metal nodes and NVSwitch is not supported. +* When installing the Operator on Amazon EKS and using Kubernetes versions lower than + `1.25`, specify the `--set psp.enabled=true` Helm argument because EKS enables + pod security policy (PSP). + If you use Kubernetes version `1.25` or higher, do not specify the `psp.enabled` + argument so that the default value, `false`, is used. +* All worker nodes in the Kubernetes cluster must run the same operating system version to use the NVIDIA GPU Driver container. + Alternatively, if you pre-install the NVIDIA GPU Driver on the nodes, then you can run different operating systems. + The technical preview feature that provides gpu-driver-configuration is also an alternative. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. 
+* Driver Toolkit images are broken with Red Hat OpenShift version `4.11.12` and require cluster-level entitlements to be enabled + in this case for the driver installation to succeed. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. + The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. + The technical preview feature that provides gpu-driver-configuration is an alternative. +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. +* When using RHEL 8 with containerd as the runtime and SELinux is enabled (either in permissive or enforcing mode) + at the host level, containerd must also be configured for SELinux, such as setting the `enable_selinux=true` + configuration option. + Additionally, network-restricted environments are not supported. + +## 23.6.2 + +This patch release back ports a fix that was introduced in the v23.9.1 release. + +### Fixed Issues + +* Previously, when you specified the `operator.upgradeCRD=true` argument to the `helm upgrade` + command, the pre-upgrade hook ran with the `gpu-operator` service account + that is added by running `helm install`. + This dependency is a known issue for Argo CD users. + Argo CD treats pre-install and pre-upgrade hooks the same as pre-sync hooks and leads to failures + because the hook depends on the `gpu-operator` service account that does not exist on an initial installation. + + Now, the Operator is enhanced to run the hook with a new service account, `gpu-operator-upgrade-crd-hook-sa`. + This fix creates the new service account, a new cluster role, and a new cluster role binding. + The update prevents failures with Argo CD. + +## 23.6.1 + +### New Features + +* Added support for NVIDIA L40S GPUs. 
+ +* Added support for the NVIDIA Data Center GPU Driver version 535.104.05. + Refer to the GPU Operator Component Matrix + on the platform support page. + +### Fixed issues + +* Previously, the NVIDIA Container Toolkit daemon set could fail when running on + nodes with certain types of GPUs. + The driver-validation init container would fail when iterating over NVIDIA PCI devices + if the device PCI ID was not in the PCI database. + The error message is similar to the following example: + + ```output + Error: error validating driver installation: error creating symlinks: + failed to get device nodes: failed to get GPU information: error getting + all NVIDIA devices: error constructing NVIDIA PCI device 0000:21:00.0: + unable to get device name: failed to find device with id '26b9'\n\n + Failed to create symlinks under /dev/char that point to all possible NVIDIA + character devices.\nThe existence of these symlinks is required to address + the following bug:\n\n https://github.com/NVIDIA/gpu-operator/issues/430\n\n + This bug impacts container runtimes configured with systemd cgroup management + enabled.\nTo disable the symlink creation, set the following envvar in ClusterPolicy:\n\n + validator:\n driver:\n env:\n - name: DISABLE_DEV_CHAR_SYMLINK_CREATION\n value: \"true\"" + ``` + +## 23.6.0 + +### New Features + +* Added support for configuring Kata Containers for GPU workloads as a technology preview feature. + This feature introduces NVIDIA Kata Manager for Kubernetes as an operand of GPU Operator. + +* Added support for configuring Confidential Containers for GPU workloads as a technology preview feature. + This feature builds on the work for configuring Kata Containers and + introduces NVIDIA Confidential Computing Manager for Kubernetes as an operand of GPU Operator. + Refer to gpu-operator-confidential-containers for more information. + +* Added support for the NVIDIA Data Center GPU Driver version 535.86.10. 
+ Refer to the GPU Operator Component Matrix + on the platform support page. + +* Added support for NVIDIA vGPU 16.0. + +* Added support for NVIDIA Network Operator 23.7.0. + +* Added support for new MIG profiles with the 535 driver. + + * For H100 NVL and H800 NVL devices: + + * `1g.12gb.me` + * `1g.24gb` + * `2g.24gb` + * `3g.47gb` + * `4g.47gb` + * `7g.94gb` + +### Improvements + +* The Operator is updated to use the `node-role.kubernetes.io/control-plane` label + that is the default label for Kubernetes version 1.27. + As a fallback for older Kubernetes versions, the Operator runs on nodes with the + `master` label if the `control-plane` label is not available. + +* Added support for setting Pod Security Admission for the GPU Operator namespace. + Pod Security Admission applies to Kubernetes versions 1.25 and higher. + You can specify `--set psa.enabled=true` when you install or upgrade the Operator, + or you can patch the `cluster-policy` instance of the `ClusterPolicy` object. + The Operator sets the following standards: + + ```yaml + pod-security.kubernetes.io/audit=privileged + pod-security.kubernetes.io/enforce=privileged + pod-security.kubernetes.io/warn=privileged + ``` + +* The Operator performs plugin validation when the Operator is installed or upgraded. + Previously, the plugin validation ran a workload pod that requires access to a GPU. + On a busy node with the GPUs consumed by other workloads, the validation can falsely + report failure because it was not scheduled. + The plugin validation still confirms that GPUs are advertised to kubelet, but it no longer + runs a workload. + To override the new behavior and run a plugin validation workload, specify + `--set validator.plugin.env.WITH_WORKLOAD=true` when you install or upgrade the Operator. 
+ +### Fixed issues + +* In clusters that use a network proxy and configure GPU Direct Storage, the `nvidia-fs-ctr` + container can use the network proxy and any other environment variable that you specify + with the `--set gds.env=key1=val1,key2=val2` option when you install or upgrade the Operator. + +* In previous releases, when you performed a GPU driver upgrade with the `OnDelete` strategy, + the status reported in the `cluster-policy` instance of the `ClusterPolicy` object could indicate + `Ready` even though the driver daemon set has not completed the upgrade of pods on all nodes. + In this release, the status is reported as `notReady` until the upgrade is complete. + +### Known Limitations + +* The GPU Driver container does not run on hosts that have a custom kernel with the SEV-SNP CPU feature + because of the missing `kernel-headers` package within the container. + With a custom kernel, NVIDIA recommends pre-installing the NVIDIA drivers on the host if you want to + run traditional container workloads with NVIDIA GPUs. +* If you cordon a node while the GPU driver upgrade process is already in progress, + the Operator uncordons the node and upgrades the driver on the node. + You can determine if an upgrade is in progress by checking the node label + `nvidia.com/gpu-driver-upgrade-state != upgrade-done`. +* NVIDIA vGPU is incompatible with KubeVirt v0.58.0, v0.58.1, and v0.59.0, as well + as OpenShift Virtualization 4.12.0---4.12.2. +* Using NVIDIA vGPU on bare metal nodes and NVSwitch is not supported. +* When installing the Operator on Amazon EKS and using Kubernetes versions lower than + `1.25`, specify the `--set psp.enabled=true` Helm argument because EKS enables + pod security policy (PSP). + If you use Kubernetes version `1.25` or higher, do not specify the `psp.enabled` + argument so that the default value, `false`, is used. 
+* All worker nodes in the Kubernetes cluster must run the same operating system version to use the NVIDIA GPU Driver container. + Alternatively, if you pre-install the NVIDIA GPU Driver on the nodes, then you can run different operating systems. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. +* Driver Toolkit images are broken with Red Hat OpenShift version `4.11.12` and require cluster-level entitlements to be enabled + in this case for the driver installation to succeed. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. +* When using RHEL 8 with Kubernetes, SELinux must be enabled (either in permissive or enforcing mode) for use with the GPU Operator. + Additionally, network-restricted environments are not supported. + +## 23.3.2 + +### New Features + +* Added support for Kubernetes v1.27. + Refer to Supported Operating Systems and Kubernetes Platforms + on the platform support page. + +* Added support for Red Hat OpenShift Container Platform 4.13. + Refer to Supported Operating Systems and Kubernetes Platforms + on the platform support page. + +* Added support for KubeVirt v0.59 and Red Hat OpenShift Virtualization 4.13. + Starting with KubeVirt versions v0.58.2 and v0.59.1 and OpenShift Virtualization 4.12.3 and 4.13.0, + you must set the `DisableMDEVConfiguration` feature gate to use NVIDIA vGPU. + Refer to GPU Operator with KubeVirt or + NVIDIA GPU Operator with OpenShift Virtualization. + +* Added support for running the Operator with Microsoft Azure Kubernetes Service (AKS). 
+ You must use an AKS image with a preinstalled NVIDIA GPU driver and a preinstalled + NVIDIA Container Toolkit. + Refer to microsoft-aks for more information. + +* Added support for VMware vSphere 8.0 U1 with Tanzu. + +* Added support for CRI-O v1.26 with Red Hat Enterprise Linux 8.7 + and support for CRI-O v1.27 with Ubuntu 20.04. + +### Improvements + +* Increased the default timeout for the `nvidia-smi` command that is used by the + NVIDIA Driver Container startup probe and made the timeout configurable. + Previously, the timeout duration for the startup probe was `30s`. + In this release, the default duration is `60s`. + This change reduces the frequency of container restarts when `nvidia-smi` + runs slowly. + Refer to Chart Customization Options for more information. + +### Fixed issues + +* Fixed an issue with NVIDIA GPU Direct Storage (GDS) and Ubuntu 22.04. + The Operator was not able to deploy GDS and other daemon sets. + + Previously, the Operator produced the following error log: + + ```output + {"level":"error","ts":1681889507.829097,"msg":"Reconciler error","controller":"clusterpolicy-controller","object":{"name":"cluster-policy"},"namespace":"","name":"cluster-policy","reconcileID":"c5d55183-3ce9-4376-9d20-e3d53dc441cb","error":"ERROR: failed to transform the Driver Toolkit Container: could not find the 'openshift-driver-toolkit-ctr' container"} + ``` + +### Known Limitations + +* If you cordon a node while the GPU driver upgrade process is already in progress, + the Operator uncordons the node and upgrades the driver on the node. + You can determine if an upgrade is in progress by checking the node label + `nvidia.com/gpu-driver-upgrade-state != upgrade-done`. +* NVIDIA vGPU is incompatible with KubeVirt v0.58.0, v0.58.1, and v0.59.0, as well + as OpenShift Virtualization 4.12.0---4.12.2. +* Using NVIDIA vGPU on bare metal nodes and NVSwitch is not supported.
+* When installing the Operator on Amazon EKS and using Kubernetes versions lower than + `1.25`, specify the `--set psp.enabled=true` Helm argument because EKS enables + pod security policy (PSP). + If you use Kubernetes version `1.25` or higher, do not specify the `psp.enabled` + argument so that the default value, `false`, is used. +* Ubuntu 18.04 is scheduled to reach end of standard support in May of 2023. + When Ubuntu transitions it to end of life (EOL), the NVIDIA GPU Operator and + related projects plan to cease building containers for 18.04 and to + cease providing support. +* All worker nodes within the Kubernetes cluster must use the same operating system version. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. +* Driver Toolkit images are broken with Red Hat OpenShift version `4.11.12` and require cluster-level entitlements to be enabled + in this case for the driver installation to succeed. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. +* When using RHEL 8 with Kubernetes, SELinux must be enabled (either in permissive or enforcing mode) for use with the GPU Operator. + Additionally, network-restricted environments are not supported. + +## 23.3.1 + +This release provides a packaging-only update to the 23.3.0 release to fix installation on Red Hat OpenShift Container Platform. Refer to GitHub [issue #513](https://github.com/NVIDIA/gpu-operator/issues/513). + +## 23.3.0 + +### New Features + +* Added support for the NVIDIA Data Center GPU Driver version 525.105.17. 
+ Refer to the GPU Operator Component Matrix + on the platform support page. + +* Added support for GPUDirect Storage with Red Hat OpenShift Container Platform 4.11. + Refer to Support for GPUDirect Storage on the platform support page. + +* Added support for Canonical MicroK8s v1.26. + Refer to Supported Operating Systems and Kubernetes Platforms + on the platform support page. + +* Added support for containerd v1.7. + Refer to Supported Container Runtimes + on the platform support page. + +* Added support for Node Feature Discovery v0.12.1. + Added support for using the NodeFeature API CRD for labeling nodes + instead of labeling nodes over gRPC. + The documentation for upgrading the Operator manually + is updated to include applying the custom resource definitions for Node Feature Discovery. + +* Added support for running the NVIDIA GPU Operator in Amazon EKS (use the `gpu-operator-nvidia-amazon` skill) + and Google GKE (use the `gpu-operator-nvidia-google` skill). + You must configure the cluster with custom nodes that run a supported operating + system, such as Ubuntu 22.04. + +* Added support for the Container Device Interface (CDI) that is implemented by the + NVIDIA Container Toolkit v1.13.0. + Refer to gpu-operator-helm-chart-options for information about the `cdi.enable` and + `cdi.default` options to enable CDI during installation + or cdi for post-installation configuration information. + +* [Technology Preview] Added support for precompiled driver containers for select operating systems. + This feature removes the dynamic dependencies to build the driver during installation in the + cluster such as downloading kernel header packages and GCC tooling. + Sites with isolated networks that cannot access the internet can benefit. + Sites with machines that are resource constrained can also benefit by removing the computational demand + to compile the driver. + For more information, see precompiled-drivers. 
+ +* Added support for the NVIDIA H800 GPU in the Supported NVIDIA GPUs and Systems table on the Platform Support page. + +### Improvements + +* The upgrade process for the GPU driver is enhanced. + This release introduces a `maxUnavailable` field that you can use to specify + the number of nodes that can be unavailable during an upgrade. + The value can be an integer or a string that specifies a percentage. + If you specify a percentage, the number of nodes is calculated by rounding up. + The default value is `25%`. + + If you specify a value for `maxUnavailable` and also specify `maxParallelUpgrades`, + the `maxUnavailable` value applies an additional constraint on the value of + `maxParallelUpgrades` to ensure that the number of parallel upgrades does not + cause more than the intended number of nodes to become unavailable during the upgrade. + For example, if you specify `maxUnavailable=100%` and `maxParallelUpgrades=1`, + one node at a time is upgraded. + +* In previous releases, when you upgrade the GPU driver, the Operator validator + pod could fail to complete all the validation checks. + As a result, the node could remain in the validation required state indefinitely + and prevent performing the driver upgrade on the other nodes in the cluster. + This release adds a `600` second timeout for the validation process. + If the validation does not complete successfully within the duration, the node is + labelled `upgrade-failed` and the upgrade process proceeds on other nodes. + +* The Multi-Instance GPU (MIG) manager is enhanced to support setting an initial + value for the `nvidia.com/mig.config` node annotation. + On nodes with MIG-capable GPUs that do not already have the annotation set, the + value is set to `all-disabled` and the MIG manager does not create MIG devices. + The value is overwritten when you label the node with a MIG profile. + For configuration information, see gpu-operator-mig. 
+ +### Fixed issues + +* Fixed an issue that prevented building the GPU driver container when a Local Package Repository + is used. + Previously, if you needed to provide CA certificates, the certificates were not installed correctly. + The certificates are now installed in the correct directories. + Refer to GitHub [issue #299](https://github.com/NVIDIA/gpu-operator/issues/299) for more details. + +* Fixed an issue that created audit log records related to deprecated API requests for pod security policy + on Red Hat OpenShift Container Platform. + Refer to GitHub [issue #451](https://github.com/NVIDIA/gpu-operator/issues/451) + and [issue #490](https://github.com/NVIDIA/gpu-operator/issues/490) for more details. + +* Fixed an issue that caused the Operator to attempt to add a pod security policy on pre-release versions + of Kubernetes v1.25. + Refer to GitHub [issue #484](https://github.com/NVIDIA/gpu-operator/issues/484) for more details. + +* Fixed a race condition that is related to preinstalled GPU drivers, validator pods, and the device plugin pods. + The race condition can cause the device plugin pods to set the wrong path to the GPU driver. + Refer to GitHub [issue #508](https://github.com/NVIDIA/gpu-operator/issues/508) for more details. + +* Fixed an issue with the driver manager that prevented the manager from accurately detecting whether a + node has preinstalled GPU drivers. + This issue can appear if preinstalled GPU drivers were initially installed and later removed. + The resolution is for the manager to check that the `nvidia-smi` file exists on the host + and to check the output from executing the file. + +* Fixed an issue that prevented adding custom annotations to daemon sets that the Operator starts. + Refer to GitHub [issue #499](https://github.com/NVIDIA/gpu-operator/issues/499) for more details.
+ +* Fixed an issue that is related to not starting the GPU Feature Discovery (GFD) pods when the DCGM Exporter + service monitor is enabled, but a service monitor custom resource definition does not exist. + Previously, there was no log record to describe why the GFD pods were not started. + In this release, the Operator logs the error `Couldn't find ServiceMonitor CRD` and the + message `Install Prometheus and necessary CRDs for gathering GPU metrics` to indicate + the reason. + +* Fixed a race condition that prevented the GPU driver containers from loading the nvidia-peermem Linux kernel module + and caused the driver daemon set pods to crash loop back off. + The condition could occur when both GPUDirect RDMA and GPUDirect Storage are enabled. + In this release, the start script for the driver containers confirms that the Operator validator + indicates the driver container is ready before attempting to load the kernel module. + +* Fixed an issue that caused upgrade of the GPU driver to fail when GPUDirect Storage is enabled. + In this release, the driver manager unloads the nvidia-fs Linux kernel module before + performing the upgrade. + +* Added support for new MIG profiles with the 525 driver. + + * For A100-40GB devices: + + * `1g.5gb.me` + * `1g.10gb` + * `4g.20gb` + + * For H100-80GB and A100-80GB devices: + + * `1g.10gb` + * `1g.10gb.me` + * `1g.20gb` + * `4g.40gb` + + * For A30-24GB devices: + + * `1g.6gb.me` + * `2g.12gb.me` + +### Common Vulnerabilities and Exposures (CVEs) + +The `gpu-operator:v23.3.0` and `gpu-operator-validator:v23.3.0` images have the following known high-vulnerability CVEs.
+These CVEs are from the base images and are not in libraries that are used by the GPU Operator: + +* `openssl-libs` - [CVE-2023-0286](https://access.redhat.com/security/cve/CVE-2023-0286) +* `platform-python` and `python3-libs` - [CVE-2023-24329](https://access.redhat.com/security/cve/CVE-2023-24329) + +### Known Limitations + +* Using NVIDIA vGPU on bare metal nodes and NVSwitch is not supported. +* When installing the Operator on Amazon EKS and using Kubernetes versions lower than + `1.25`, specify the `--set psp.enabled=true` Helm argument because EKS enables + pod security policy (PSP). + If you use Kubernetes version `1.25` or higher, do not specify the `psp.enabled` + argument so that the default value, `false`, is used. +* Ubuntu 18.04 is scheduled to reach end of standard support in May of 2023. + When Ubuntu transitions it to end of life (EOL), the NVIDIA GPU Operator and + related projects plan to cease building containers for 18.04 and to + cease providing support. +* All worker nodes within the Kubernetes cluster must use the same operating system version. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. +* Driver Toolkit images are broken with Red Hat OpenShift version `4.11.12` and require cluster-level entitlements to be enabled + in this case for the driver installation to succeed. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. +* When using RHEL 8 with Kubernetes, SELinux must be enabled (either in permissive or enforcing mode) for use with the GPU Operator. + Additionally, network-restricted environments are not supported. 
+ +---- + +## 22.9.2 + +### New Features + +* Added support for Kubernetes v1.26 and Red Hat OpenShift 4.12. + Refer to platform-support for more details. +* Added a new controller that is responsible for managing NVIDIA driver upgrades. + Refer to gpu-driver-upgrades for more details. +* Added the ability to apply custom labels and annotations for all of the GPU Operator pods. + Refer to gpu-operator-helm-chart-options for how to configure custom labels and annotations. +* Added support for NVIDIA vGPU 15.1. + Refer to the [NVIDIA Virtual GPU Software Documentation](https://docs.nvidia.com/grid/15.0/index.html). +* Added support for the NVIDIA HGX H100 System in the Supported NVIDIA GPUs and Systems table on the Platform Support page. +* Added 525.85.12 as the recommended driver version and 3.1.6 as the recommended DCGM version in the GPU Operator Component Matrix. + These updates enable support for the NVIDIA HGX H100 System. + +### Improvements + +* Enhanced the driver validation logic to make sure that the current instance of the driver container has successfully finished installing drivers. + This enhancement prevents other operands from incorrectly starting with previously loaded drivers. +* Increased overall driver startup probe timeout from 10 to 20 minutes. + The increased timeout improves the installation experience for clusters with slow networks by avoiding unnecessary driver container restarts. + +### Fixed issues + +* Fixed an issue where containers allocated GPU lose access to them when systemd is triggered to run some reevaluation of the cgroups it manages. + The issue affects systems using runc configured with systemd cgroups. + Refer to GitHub [issue #430](https://github.com/NVIDIA/gpu-operator/issues/430) for more details. +* Fixed an issue that prevented the GPU Operator from applying PSA labels on the namespace when no prior labels existed. 
+ +### Common Vulnerabilities and Exposures (CVEs) + +The `gpu-operator:v22.9.2` and `gpu-operator:v22.9.2-ubi8` images have the following known high-vulnerability CVEs. +These CVEs are from the base images and are not in libraries that are used by the GPU Operator: + + * `libksba` - [CVE-2022-47629](https://access.redhat.com/security/cve/CVE-2022-47629) + +### Known Limitations + +* All worker nodes within the Kubernetes cluster must use the same operating system version. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. +* Driver Toolkit images are broken with Red Hat OpenShift version `4.11.12` and require cluster-level entitlements to be enabled + in this case for the driver installation to succeed. +* No support for newer MIG profiles `1g.10gb`, `1g.20gb`, `2g.12gb+me` with R525 drivers. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. +* The `nouveau` driver must be blacklisted when using NVIDIA vGPU. + Otherwise the driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. + Additionally, all GPU Operator pods become stuck in the `Init` state. +* When using RHEL 8 with Kubernetes, SELinux must be enabled (either in permissive or enforcing mode) for use with the GPU Operator. + Additionally, network-restricted environments are not supported. + +---- + +## 22.9.1 + +### New Features + +* Support for CUDA 12.0 / R525 Data Center drivers on x86 / ARM servers. +* Support for RHEL 8.7 with Kubernetes and Containerd or CRI-O. +* Support for Ubuntu 20.04 and 22.04 with Kubernetes and CRI-O. +* Support for NVIDIA GPUDirect Storage using Ubuntu 20.04 and Ubuntu 22.04 with Kubernetes. +* Support for RTX 6000 ADA GPU +* Support for A800 GPU +* Support for vSphere 8.0 with Tanzu +* Support for vGPU 15.0 +* Support for HPE Ezmeral Runtime Enterprise.
Version 5.5 - with RHEL 8.4 and 8.5 + +### Improvements + +* Added helm parameters to control operator logging levels and time encoding. +* When using CRI-O runtime with Kubernetes, it is no longer required to update the CRI-O config file to include `/run/containers/oci/hooks.d` as an additional path for OCI hooks. By default, the NVIDIA OCI runtime hook gets installed at `/usr/share/containers/oci/hooks.d` which is the default path configured with CRI-O. +* Allow per node configurations for NVIDIA Device Plugin using a custom ConfigMap and node label `nvidia.com/device-plugin.config=`. +* Support for [OnDelete](https://kubernetes.io/docs/tasks/manage-daemon/update-daemon-set/#daemonset-update-strategy) upgrade strategy for all Daemonsets deployed by the GPU Operator. + This can be configured using `daemonsets.upgradeStrategy` parameter in the `ClusterPolicy`. This prevents pods managed by the GPU Operator from being restarted automatically on spec updates. +* Enable eviction of GPU Pods only during driver container upgrades with `ENABLE_GPU_POD_EVICTION` env (default: "true") set under `driver.manager.env` in the `ClusterPolicy`. + This eliminates the requirement to drain the entire node currently. + +### Fixed issues + +* Fix repeated restarts of container-toolkit when used with containerd versions `v1.6.9` and above. Refer to GitHub [issue #432](https://github.com/NVIDIA/gpu-operator/issues/432) for more details. +* Disable creation of PodSecurityPolicies (PSP) with K8s versions `1.25` and above as it is removed. + +### Common Vulnerabilities and Exposures (CVEs) +* Fixed - Updated driver images for `515.86.01`, `510.108.03`, `470.161.03`, `450.216.04` to address CVEs noted [here](https://nvidia.custhelp.com/app/answers/detail/a_id/5415). +* The `gpu-operator:v22.9.1` and `gpu-operator:v22.9.1-ubi8` images have been released with the following known HIGH Vulnerability CVEs. 
+ These are from the base images and are not in libraries used by GPU Operator: + + * `krb5-libs` - [CVE-2022-42898](https://nvd.nist.gov/vuln/detail/CVE-2022-42898) + +### Known Limitations + +* All worker nodes within the Kubernetes cluster must use the same operating system version. +* NVIDIA GPUDirect Storage (GDS) is not supported with secure boot enabled systems. +* Driver Toolkit images are broken with Red Hat OpenShift version `4.11.12` and require cluster level entitlements to be enabled + in this case for the driver installation to succeed. +* No support for newer MIG profiles `1g.10gb`, `1g.20gb`, `2g.12gb+me` with R525 drivers. It will be added in the following release. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. +* `nouveau` driver has to be blacklisted when using NVIDIA vGPU. Otherwise the driver will fail to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs and all GPU Operator pods will be stuck in `Init` state. +* When using RHEL8 with Kubernetes, SELinux has to be enabled (either in permissive or enforcing mode) for use with the GPU Operator. Additionally, network restricted environments are not supported. + +## 22.9.0 + +### New Features + +* Support for Hopper (H100) GPU with CUDA 11.8 / R520 Data Center drivers on x86 servers. +* Support for RHEL 8 with Kubernetes and Containerd or CRI-O. +* Support for Kubernetes 1.25. +* Support for RKE2 (Rancher Kubernetes Engine 2) with Ubuntu 20.04 and RHEL8. +* Support for GPUDirect RDMA with NVIDIA Network Operator 1.3. +* Support for Red Hat OpenShift with Cloud Service Providers (CSPs) Amazon AWS, Google GKE and Microsoft Azure. +* [General Availability] - Support for KubeVirt and Red Hat OpenShift Virtualization with GPU Passthrough and NVIDIA vGPU based products.
+* [General Availability] - OCP and Upstream Kubernetes on ARM with supported platforms. +* Support for [Pod Security Admission (PSA)](https://kubernetes.io/docs/concepts/security/pod-security-admission/) through the `psp.enabled` flag. If enabled, the namespace where the operator is installed in will be labeled with the `privileged` pod security level. + +### Improvements + +* Support automatic upgrade and cleanup of `clusterpolicies.nvidia.com` CRD using Helm hooks. Refer to Operator upgrades for more info. +* Support for dynamically enabling/disabling GFD, MIG Manager, DCGM and DCGM-Exporter. +* Switched to calendar versioning starting from this release for better life cycle management and support. Refer to NVIDIA GPU Operator Versioning for more info. + +### Fixed issues + +* Remove CUDA compat libs from the operator and all operand images to avoid mismatch with installed CUDA driver version. More info [here](https://github.com/NVIDIA/gpu-operator/issues/391) and [here](https://github.com/NVIDIA/gpu-operator/issues/389). +* Migrate to `node.k8s.io/v1` API for creation of `RuntimeClass` objects. More info [here](https://github.com/NVIDIA/gpu-operator/issues/409). +* Remove PodSecurityPolicy (PSP) starting with Kubernetes v1.25. Setting `psp.enabled` will now enable Pod Security Admission (PSA) instead. + +### Known Limitations + +* All worker nodes within the Kubernetes cluster must use the same operating system version. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. +* `nouveau` driver has to be blacklisted when using NVIDIA vGPU. Otherwise the driver will fail to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs and all GPU Operator pods will be stuck in `Init` state. 
+* When using `CRI-O` runtime with Kubernetes, the config file `/etc/crio/crio.conf` has to include `/run/containers/oci/hooks.d` as path for `hooks_dir`. Refer to custom-runtime-options for steps to configure this. +* When using RHEL8 with Kubernetes, SELinux has to be enabled (either in permissive or enforcing mode) for use with the GPU Operator. Additionally, network restricted environments are not supported. +* The `gpu-operator:v22.9.0` and `gpu-operator:v22.9.0-ubi8` images have been released with the following known HIGH Vulnerability CVEs. + These are from the base images and are not in libraries used by GPU Operator: + + * `expat` - [CVE-2022-40674](https://access.redhat.com/security/cve/CVE-2022-40674) + * `systemd-pam` - [CVE-2022-2526](https://access.redhat.com/security/cve/CVE-2022-2526) + * `systemd` - [CVE-2022-2526](https://access.redhat.com/security/cve/CVE-2022-2526) + * `systemd-libs` - [CVE-2022-2526](https://access.redhat.com/security/cve/CVE-2022-2526) + +---- + +## 1.11.1 + +### Improvements + +* Added `startupProbe` to NVIDIA driver container to allow RollingUpgrades to progress to other nodes only after driver modules are successfully loaded on current one. +* Added support for `driver.rollingUpdate.maxUnavailable` parameter to specify maximum nodes for simultaneous driver upgrades. Default is 1. +* NVIDIA driver container will auto-disable itself on the node with pre-installed drivers by applying label `nvidia.com/gpu.deploy.driver=pre-installed`. This is useful for heterogeneous clusters where only some GPU nodes have pre-installed drivers(e.g. DGX OS). + +### Fixed issues + +* Apply tolerations to `cuda-validator` and `device-plugin-validator` Pods based on `daemonsets.tolerations` in `ClusterPolicy`. For more info refer [here](https://github.com/NVIDIA/gpu-operator/issues/360). +* Fixed an issue causing `cuda-validator` Pod to fail when `accept-nvidia-visible-devices-envvar-when-unprivileged = false` is set with NVIDIA Container Toolkit.
For more info refer [here](https://github.com/NVIDIA/gpu-operator/issues/365). +* Fixed an issue which caused recursive mounts under `/run/nvidia/driver` when both `driver.rdma.enabled` and `driver.rdma.useHostMofed` are set to `true`. This caused other GPU Pods to fail to start. + +---- + +## 1.11.0 + +### New Features + +* Support for NVIDIA Data Center GPU Driver version `515.48.07`. +* Support for NVIDIA AI Enterprise 2.1. +* Support for NVIDIA Virtual Compute Server 14.1 (vGPU). +* Support for Ubuntu 22.04 LTS. +* Support for secure boot with GPU Driver version 515 and Ubuntu Server 20.04 LTS and 22.04 LTS. +* Support for Kubernetes 1.24. +* Support for Time-Slicing GPUs in Kubernetes. +* Support for Red Hat OpenShift on AWS, Azure and GCP instances. Refer to the Platform Support Matrix for the supported instances. +* Support for Red Hat OpenShift 4.10 on AWS EC2 G5g instances (ARM). +* Support for Kubernetes 1.24 on AWS EC2 G5g instances (ARM). +* Support for use with the NVIDIA Network Operator 1.2. +* [Technical Preview] - Support for KubeVirt and Red Hat OpenShift Virtualization with GPU Passthrough and NVIDIA vGPU based products. +* [Technical Preview] - Kubernetes on ARM with Server Base System Architecture (SBSA). + +### Improvements + +* GPUDirect RDMA is now supported with CentOS using MOFED installed on the node. +* The NVIDIA vGPU Manager can now be upgraded to a newer branch while using an older, compatible guest driver. +* DGX A100 and non-DGX servers can now be used within the same cluster. +* Improved user interface while deploying a ClusterPolicy instance(CR) for the GPU Operator through Red Hat OpenShift Console. +* Improved the container-toolkit to handle v1 containerd configurations. + +### Fixed issues + +* Fix for incorrect reporting of `DCGM_FI_DEV_FB_USED` where reserved memory is reported as used memory. For more details refer to [GitHub issue](https://github.com/NVIDIA/gpu-operator/issues/348).
+* Fixed nvidia-peermem sidecar container to correctly load the `nvidia-peermem` module when MOFED is directly installed on the node. +* Fixed duplicate mounts of `/run/mellanox/drivers` within the driver container which caused driver cleanup or re-install to fail. +* Fixed uncordoning of the node with k8s-driver-manager whenever ENABLE_AUTO_DRAIN env is disabled. +* Fixed readiness check for MOFED driver installation by the NVIDIA Network Operator. This will avoid the GPU driver containers to be in `CrashLoopBackOff` while waiting for MOFED drivers to be ready. + +### Known Limitations + +* All worker nodes within the Kubernetes cluster must use the same operating system version. +* The NVIDIA GPU Operator can only be used to deploy a single NVIDIA GPU Driver type and version. The NVIDIA vGPU and Data Center GPU Driver cannot be used within the same cluster. +* See the limitations sections for the [Technical Preview] of GPU Operator support for KubeVirt. +* The `clusterpolicies.nvidia.com` CRD has to be manually deleted after the GPU Operator is uninstalled using Helm. +* `nouveau` driver has to be blacklisted when using the NVIDIA vGPU. Otherwise the driver will fail to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs and all GPU Operator pods will be stuck in `init` state. +* The `gpu-operator:v1.11.0` and `gpu-operator:v1.11.0-ubi8` images have been released with the following known HIGH Vulnerability CVEs. + These are from the base images and are not in libraries used by GPU Operator: + + * `xz-libs` - [CVE-2022-1271](https://access.redhat.com/security/cve/CVE-2022-1271) + +---- + +## 1.10.1 + +### Improvements +* Validated secure boot with signed NVIDIA Data Center Driver R510. +* Validated cgroup v2 with Ubuntu Server 20.04 LTS. + +### Fixed issues +* Fixed an issue when GPU Operator was installed and MIG was already enabled on a GPU. 
The GPU Operator will now install successfully and MIG can either be disabled via the label `nvidia.com/mig.config=all-disabled` or configured with the required MIG profiles. + +### Known Limitations + +* The `gpu-operator:v1.10.1` and `gpu-operator:v1.10.1-ubi8` images have been released with the following known HIGH Vulnerability CVEs. + These are from the base images and are not in libraries used by GPU Operator: + + * `openssl-libs` - [CVE-2022-0778](https://access.redhat.com/security/cve/CVE-2022-0778) + * `zlib` - [CVE-2018-25032](https://access.redhat.com/security/cve/CVE-2018-25032) + * `gzip` - [CVE-2022-1271](https://access.redhat.com/security/cve/CVE-2022-1271) + +---- + +## 1.10.0 + +### New Features +* Support for NVIDIA Data Center GPU Driver version `510.47.03`. +* Support NVIDIA A2, A100X and A30X +* Support for A100X and A30X on the DPU’s Arm processor. +* Support for secure boot with Ubuntu Server 20.04 and NVIDIA Data Center GPU Driver version R470. +* Support for Red Hat OpenShift 4.10. +* Support for GPUDirect RDMA with Red Hat OpenShift. +* Support for NVIDIA AI Enterprise 2.0. +* Support for NVIDIA Virtual Compute Server 14 (vGPU). + +### Improvements +* Enabling/Disabling of GPU System Processor (GSP) Mode through NVIDIA driver module parameters. +* Ability to avoid deploying GPU Operator Operands on certain worker nodes through labels. Useful for running VMs with GPUs using KubeVirt. + +### Fixed issues +* Increased lease duration of GPU Operator to 60s to avoid restarts during etcd defrag. More details [here](https://github.com/NVIDIA/gpu-operator/issues/326). +* Avoid spurious alerts generated of type `GPUOperatorOpenshiftDriverToolkitEnabledNfdTooOld` on RedHat OpenShift when there are no GPU nodes in the cluster. +* Avoid uncordoning nodes during driver pod startup when `ENABLE_AUTO_DRAIN` is set to `false`. +* Collection of GPU metrics in MIG mode is now supported with 470+ drivers. 
+* Fabric Manager (required for NVSwitch based systems) with CentOS 7 is now supported. + +### Known Limitations +* Upgrading to a new NVIDIA AI Enterprise major branch: + + Upgrading the vGPU host driver to a newer major branch than the vGPU guest driver will result in GPU driver pod transitioning to a failed state. This happens for instance when the Host is upgraded to vGPU version 14.x while the Kubernetes nodes are still running with vGPU version 13.x. + + To overcome this situation, before upgrading the host driver to the new vGPU branch, apply the following steps: + + 1. kubectl edit clusterpolicy + 1. modify the policy and set the environment variable DISABLE_VGPU_VERSION_CHECK to true as shown below: + + ```yaml + driver: + env: + - name: DISABLE_VGPU_VERSION_CHECK + value: "true" + ``` + + 1. write and quit the clusterpolicy edit + +* The `gpu-operator:v1.10.0` and `gpu-operator:v1.10.0-ubi8` images have been released with the following known HIGH Vulnerability CVEs. + These are from the base images and are not in libraries used by GPU Operator: + + * `openssl-libs` - [CVE-2022-0778](https://access.redhat.com/security/cve/CVE-2022-0778) + +---- + +## 1.9.1 + +### Improvements +* Improved logic in the driver container for waiting on MOFED driver readiness. This ensures that `nvidia-peermem` is built and installed correctly. + +### Fixed issues +* Allow `driver` container to fallback to using cluster entitlements on Red Hat OpenShift on build failures. This issue exposed itself when using GPU Operator with some Red Hat OpenShift 4.8.z versions and Red Hat OpenShift 4.9.8. GPU Operator 1.9+ with Red Hat OpenShift 4.9.9+ doesn't require entitlements. +* Fixed an issue when DCGM-Exporter didn't work correctly with using the separate DCGM host engine that is part of the standalone DCGM pod. Fixed the issue and changed the default behavior to use the DCGM Host engine that is embedded in DCGM-Exporter. 
The standalone DCGM pod will not be launched by default but can be enabled for use with DGX A100. +* Update to latest Go vendor packages to avoid any CVE's. +* Fixed an issue to allow GPU Operator to work with `CRI-O` runtime on Kubernetes. +* Mount correct source path for Mellanox OFED 5.x drivers for enabling GPUDirect RDMA. + +---- + +## 1.9.0 + +### New Features +* Support for NVIDIA Data Center GPU Driver version `470.82.01`. +* Support for DGX A100 with DGX OS 5.1+. +* Support for preinstalled GPU Driver with MIG Manager. +* Removed dependency to maintain active Red Hat OpenShift entitlements to build the GPU Driver. Introduce entitlement free driver builds starting with Red Hat OpenShift 4.9.9. +* Support for GPUDirect RDMA with preinstalled Mellanox OFED drivers. +* Support for GPU Operator and operands upgrades using Red Hat OpenShift Lifecycle Manager (OLM). +* Support for NVIDIA Virtual Compute Server 13.1 (vGPU). + +### Improvements +* Automatic detection of default runtime used in the cluster. Deprecate the operator.defaultRuntime parameter. +* GPU Operator and its operands are installed into a single user specified namespace. +* A loaded Nouveau driver is automatically detected and unloaded as part of the GPU Operator install. +* Added an option to mount a ConfigMap of self-signed certificates into the driver container. Enables SSL connections to private package repositories. + +### Fixed issues +* Fixed an issue when DCGM Exporter was in CrashLoopBackOff as it could not connect to the DCGM port on the same node. + +### Known Limitations +* GPUDirect RDMA is only supported with R470 drivers on Ubuntu 20.04 LTS and is not supported on other distributions (e.g. CoreOS, CentOS etc.) +* The GPU Operator supports GPUDirect RDMA only in conjunction with the Network Operator. The Mellanox OFED drivers can be installed by the Network Operator or pre-installed on the host. 
+* Upgrades from v1.8.x to v1.9.x are not supported due to GPU Operator 1.9 installing the GPU Operator and its operands into a single namespace. Previous GPU Operator versions installed them into different namespaces. Upgrading to GPU Operator 1.9 requires uninstalling pre 1.9 GPU Operator versions prior to installing GPU Operator 1.9 +* Collection of GPU metrics in MIG mode is not supported with 470+ drivers. +* The GPU Operator requires all MIG related configurations to be executed by MIG Manager. Enabling/Disabling MIG and other MIG related configurations directly on the host is discouraged. +* Fabric Manager (required for NVSwitch based systems) with CentOS 7 is not supported. + +---- + +## 1.8.2 + +### Fixed issues +* Fixed an issue where Driver Daemonset was spuriously updated on RedHat OpenShift causing repeated restarts in Proxy environments. +* The MIG Manager version was bumped to `v0.1.3` to fix an issue when checking whether a GPU was in MIG mode or not. + Previously, it would always check for MIG mode directly over the PCIe bus instead of using NVML. Now it checks with NVML when it can, only falling back to the PCIe bus when NVML is not available. + Please refer to the [Release notes](https://github.com/NVIDIA/mig-parted/releases/tag/v0.1.3) for a complete list of fixed issues. +* Container Toolkit bumped to version `v1.7.1` to fix an issue when using A100 80GB. + +### Improvements +* Added support for user-defined MIG partition configuration via a `ConfigMap`. + +---- + +## 1.8.1 + +### Fixed issues +* Fixed an issue with using the [NVIDIA License System](https://docs.nvidia.com/license-system/latest/) in NVIDIA AI Enterprise deployments. + +---- + +## 1.8.0 + +### New Features +* Support for NVIDIA Data Center GPU Driver version `470.57.02`. +* Added support for NVSwitch systems such as HGX A100. 
The driver container detects the presence of NVSwitches + in the system and automatically deploys the [Fabric Manager](https://docs.nvidia.com/datacenter/tesla/pdf/fabric-manager-user-guide.pdf) + for setting up the NVSwitch fabric. +* The driver container now builds and loads the `nvidia-peermem` kernel module when GPUDirect RDMA is enabled and Mellanox devices are present in the system. + This allows the GPU Operator to complement the [NVIDIA Network Operator](https://github.com/Mellanox/network-operator) to enable GPUDirect RDMA in the + Kubernetes cluster. Refer to the RDMA documentation on getting started. + + **Note:** + + This feature is available only when used with R470 drivers on Ubuntu 20.04 LTS. +* Added support for upgrades of the GPU Operator components. A new `k8s-driver-manager` component handles upgrades + of the NVIDIA drivers on nodes in the cluster. +* NVIDIA DCGM is now deployed as a component of the GPU Operator. The standalone DCGM container allows multiple clients such as + [DCGM-Exporter](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/dcgm-exporter.html) and [NVSM](https://docs.nvidia.com/nvidia-system-management-nvsm/) + to be deployed and connect to the existing DCGM container. +* Added a `nodeStatusExporter` component that exports operator and node metrics in a Prometheus format. The component provides + information on the status of the operator (e.g. reconciliation status, number of GPU enabled nodes). + +### Improvements +* Reduced the size of the ClusterPolicy CRD by removing duplicates and redundant fields. +* The GPU Operator now supports detection of the virtual PCIe topology of the system and makes the topology available to + vGPU drivers via a configuration file. The driver container starts the `nvidia-topologyd` daemon in vGPU configurations. +* Added support for specifying the `RuntimeClass` variable via Helm. +* Added `nvidia-container-toolkit` images to support CentOS 7 and CentOS 8. 
+* `nvidia-container-toolkit` now supports configuring `containerd` correctly for RKE2. +* Added new debug options (logging, verbosity levels) for `nvidia-container-toolkit` + +### Fixed issues +* The driver container now loads `ipmi_devintf` by default. This allows tools such as `ipmitool` that rely on `ipmi` char devices + to be created and available. + +### Known Limitations +* GPUDirect RDMA is only supported with R470 drivers on Ubuntu 20.04 LTS and is not supported on other distributions (e.g. CoreOS, CentOS etc.) +* The operator supports building and loading of `nvidia-peermem` only in conjunction with the Network Operator. Use with pre-installed MOFED drivers + on the host is not supported. This capability will be added in a future release. +* Support for DGX A100 with GPU Operator 1.8 will be available in an upcoming patch release. +* This version of GPU Operator does not work well on RedHat OpenShift when a cluster-wide proxy is configured and causes constant restarts of driver container. + This will be fixed in an upcoming patch release `v1.8.2`. + +---- + +## 1.7.1 + +### Fixed issues +* NFD version bumped to `v0.8.2` to support correct kernel version labeling on Anthos nodes. See [NFD issue](https://github.com/kubernetes-sigs/node-feature-discovery/pull/402) for more details. + +---- + +## 1.7.0 + +### New Features +* Support for NVIDIA Data Center GPU Driver version `460.73.01`. +* Added support for automatic configuration of MIG geometry on NVIDIA Ampere products (e.g. A100) using the `k8s-mig-manager`. +* GPU Operator can now be deployed on systems with pre-installed NVIDIA drivers and the NVIDIA Container Toolkit. +* DCGM-Exporter now supports telemetry for MIG devices on supported Ampere products (e.g. A100). +* Added support for a new `nvidia` `RuntimeClass` with `containerd`. +* The Operator now supports `PodSecurityPolicies` when enabled in the cluster. 
+ +### Improvements +* Changed the label selector used by the DaemonSets of the different states of the GPU Operator. Instead of having a global + label `nvidia.com/gpu.present=true`, each DaemonSet now has its own label, `nvidia.com/gpu.deploy.<state>=true`. This + new behavior allows a finer grain of control over the components deployed on each of the GPU nodes. +* Migrated to using the latest operator-sdk for building the GPU Operator. +* The operator components are deployed with `node-critical` `PriorityClass` to minimize the possibility of eviction. +* Added a spec for the `initContainer` image, to allow flexibility to change the base images as required. +* Added the ability to configure the MIG strategy to be applied by the Operator. +* The driver container now auto-detects OpenShift/RHEL versions to better handle node/cluster upgrades. +* Validations of the container-toolkit and device-plugin installations are done on all GPU nodes in the cluster. +* Added an option to skip plugin validation workload pod during the Operator deployment. + +### Fixed issues +* The `gpu-operator-resources` namespace is now created by the Operator so that it can be used by both Helm + and OpenShift installations. + +### Known Limitations +* DCGM does not support profiling metrics on RTX 6000 and RTX 8000. Support will be added in a future release of DCGM Exporter. +* After uninstall of GPU Operator, NVIDIA driver modules might still be loaded. Either reboot the node or forcefully remove them using + `sudo rmmod nvidia nvidia_modeset nvidia_uvm` command before re-installing GPU Operator. +* When MIG strategy of `mixed` is configured, device-plugin-validation may stay in `Pending` state due to incorrect GPU resource request type. User would need to + modify the pod spec to apply correct resource type to match the MIG devices configured in the cluster. 
+ +---- + +## 1.6.2 + +### Fixed issues +* Fixed an issue with NVIDIA Container Toolkit 1.4.6 which causes an error with containerd as `Error while dialing dial unix /run/containerd/containerd.sock: connect: connection refused`. NVIDIA Container Toolkit 1.4.7 now sets `version` as an integer to fix this error. +* Fixed an issue with NVIDIA Container Toolkit which causes nvidia-container-runtime settings to be persistent across node reboot and causes driver pod to fail. Now nvidia-container-runtime will fallback to using `runc` when driver modules are not yet loaded during node reboot. +* GPU Operator now mounts runtime hook configuration for CRIO under `/run/containers/oci/hooks.d`. + +---- + +## 1.6.1 + +### Fixed issues +* Fixed an issue with NVIDIA Container Toolkit 1.4.5 when used with containerd and an empty containerd configuration file, which causes the error `Error while dialing dial unix /run/containerd/containerd.sock: connect: connection refused`. NVIDIA Container Toolkit 1.4.6 now explicitly sets the `version=2` along with other changes when the default containerd configuration file is empty. + +---- + +## 1.6.0 + +### New Features +* Support for Red Hat OpenShift 4.7. +* Support for NVIDIA Data Center GPU Driver version `460.32.03`. +* Automatic injection of Proxy settings and custom CA certificates into driver container for Red Hat OpenShift. + +DCGM-Exporter support includes the following: + +* Updated DCGM to v2.1.4 +* Increased reporting interval to 30s instead of 2s to reduce overhead +* Report NVIDIA vGPU licensing status and row-remapping metrics for Ampere GPUs + +### Improvements +* NVIDIA vGPU licensing configuration (gridd.conf) can be provided as a ConfigMap +* ClusterPolicy CRD has been updated from v1beta1 to v1. As a result minimum supported Kubernetes version is 1.16 from GPU Operator 1.6.0 onwards. + +### Fixed issues +* Fixes for DCGM Exporter to work with CPU Manager. +* nvidia-gridd daemon logs are now collected on host by rsyslog. 
+ +### Known Limitations +* DCGM does not support profiling metrics on RTX 6000 and RTX 8000. Support will be added in a future release of DCGM Exporter. +* After uninstall of GPU Operator, NVIDIA driver modules might still be loaded. Either reboot the node or forcefully remove them using + `sudo rmmod nvidia nvidia_modeset nvidia_uvm` command before re-installing GPU Operator. +* When MIG strategy of `mixed` is configured, device-plugin-validation may stay in `Pending` state due to incorrect GPU resource request type. User would need to + modify the pod spec to apply correct resource type to match the MIG devices configured in the cluster. +* `gpu-operator-resources` project in Red Hat OpenShift requires label `openshift.io/cluster-monitoring=true` for Prometheus to collect DCGM metrics. User will need to add this + label manually when project is created. + +---- + +## 1.5.2 + +### Improvements +* Allow `mig.strategy=single` on nodes with non-MIG GPUs. +* Pre-create MIG related `nvcaps` at startup. +* Updated device-plugin and toolkit validation to work with CPU Manager. + +### Fixed issues +* Fixed an issue which caused GFD pods to fail with the error `Failed to load NVML` even after the driver is loaded. + +---- + +## 1.5.1 + +### Improvements +* Kubelet's cgroup driver as `systemd` is now supported. + +### Fixed issues +* Device-Plugin stuck in `init` phase on node reboot or when new node is added to the cluster. + +---- + +## 1.5.0 + +### New Features +* Added support for NVIDIA vGPU + +### Improvements +* Driver Validation container is run as an initContainer within device-plugin Daemonset pods. Thus driver installation on each NVIDIA GPU/vGPU node will be validated. +* GFD will label vGPU nodes with driver version and branch name of NVIDIA vGPU installed on Hypervisor. +* Driver container will perform automatic compatibility check of NVIDIA vGPU driver with the version installed on the underlying Hypervisor. 
+ +### Fixed issues +* GPU Operator will no longer crash when no GPU nodes are found. +* Container Toolkit pods wait for drivers to be loaded on the system before setting the default container runtime as `nvidia`. +* On host reboot, ordering of pods is maintained to ensure that drivers are always loaded first. +* Fixed device-plugin issue causing `symbol lookup error: nvidia-device-plugin: undefined symbol: nvmlEventSetWait_v2` error. + +### Known Limitations +* The GPU Operator v1.5.x does not support mixed types of GPUs in the same cluster. All GPUs within a cluster need to be either NVIDIA vGPUs, GPU Passthrough GPUs or Bare Metal GPUs. +* GPU Operator v1.5.x with NVIDIA vGPUs support Turing and newer GPU architectures. +* DCGM does not support profiling metrics on RTX 6000 and RTX 8000. Support will be added in a future release of DCGM Exporter. +* After uninstall of GPU Operator, NVIDIA driver modules might still be loaded. Either reboot the node or forcefully remove them using + `sudo rmmod nvidia nvidia_modeset nvidia_uvm` command before re-installing GPU Operator. +* When MIG strategy of `mixed` is configured, device-plugin-validation may stay in `Pending` state due to incorrect GPU resource request type. User would need to + modify the pod spec to apply correct resource type to match the MIG devices configured in the cluster. +* `gpu-operator-resources` project in Red Hat OpenShift requires label `openshift.io/cluster-monitoring=true` for Prometheus to collect DCGM metrics. User will need to add this + label manually when project is created. + +---- + +## 1.4.0 + +### New Features +* Added support for CentOS 7 and 8. + + **Note:** + + Due to a known limitation with the GPU Operator's default values on CentOS, install the operator on CentOS 7/8 + using the following Helm command: + + ```console + $ helm install --wait --generate-name \ + nvidia/gpu-operator \ + --set toolkit.version=1.4.0-ubi8 + ``` + + This issue will be fixed in the next release. 
+* Added support for airgapped enterprise environments. +* Added support for `containerd` as a container runtime under Kubernetes. + +### Improvements +* Updated DCGM-Exporter to `2.1.2`, which uses DCGM 2.0.13. +* Added the ability to pass arguments to the NVIDIA device plugin to enable `migStrategy` and `deviceListStrategy` flags + that allow additional configuration of the plugin. +* Added more resiliency to `dcgm-exporter`- `dcgm-exporter` would not check whether GPUs support profiling metrics and would result in a `CrashLoopBackOff` + state at launch in these configurations. + +### Fixed issues +* Fixed the issue where the removal of the GPU Operator from the cluster required a restart of the Docker daemon (since the Operator + sets the `nvidia` as the default runtime). +* Fixed volume mounts for `dcgm-exporter` under the GPU Operator to allow pod<->device metrics attribution. +* Fixed an issue where the GFD and `dcgm-exporter` container images were artificially limited to R450+ (CUDA 11.0+) drivers. + +### Known Limitations +* After uninstall of GPU Operator, NVIDIA driver modules might still be loaded. Either reboot the node or forcefully remove them using + `sudo rmmod nvidia nvidia_modeset nvidia_uvm` command before re-installing GPU Operator. + +---- + +## 1.3.0 + +### New Features +* Integrated [GPU Feature Discovery](https://github.com/NVIDIA/gpu-feature-discovery) to automatically generate labels for GPUs leveraging NFD. +* Added support for Red Hat OpenShift 4.4+ (i.e. 4.4.29+, 4.5 and 4.6). The GPU Operator can be deployed from OpenShift OperatorHub. See the catalog + [listing](https://catalog.redhat.com/software/operators/nvidia/gpu-operator/5ea882962937381642a232cd) for more information. + +### Improvements +* Updated DCGM-Exporter to `2.1.0` and added profiling metrics by default. +* Added further capabilities to configure tolerations, node affinity, node selectors, pod security context, resource requirements through the `ClusterPolicy`. 
+* Optimized the footprint of the validation containers images - the image sizes are now down to ~200MB. +* Validation images are now configurable for air-gapped installations. + +### Fixed issues +* Fixed the ordering of the state machine to ensure that the driver daemonset is deployed before the other components. This fix addresses the issue + where the NVIDIA container toolkit would be set up as the default runtime, causing the driver container initialization to fail. + +### Known Limitations +* After uninstall of GPU Operator, NVIDIA driver modules might still be loaded. Either reboot the node or forcefully remove them using + `sudo rmmod nvidia nvidia_modeset nvidia_uvm` command before re-installing GPU Operator. + +---- + +## 1.2.0 + +### New Features +* Added support for Ubuntu 20.04.z LTS. +* Added support for the NVIDIA A100 GPU (and appropriate updates to the underlying components of the operator). + +### Improvements +* Updated Node Feature Discovery (NFD) to 0.6.0. +* Container images are now hosted (and mirrored) on both [DockerHub](https://hub.docker.com/u/nvidiadocker.io) and [NGC](https://ngc.nvidia.com/catalog/containers/nvidia:gpu-operator). + +### Fixed issues +* Fixed an issue where the GPU Operator would not correctly detect GPU nodes due to inconsistent PCIe node labels. +* Fixed a race condition where some of the NVIDIA pods would start out of order resulting in some pods in `RunContainerError` state. +* Fixed an issue in the driver container where the container would fail to install on systems with the `linux-gke` kernel due to not finding the kernel headers. + +### Known Limitations +* After uninstall of GPU Operator, NVIDIA driver modules might still be loaded. Either reboot the node or forcefully remove them using + `sudo rmmod nvidia nvidia_modeset nvidia_uvm` command before re-installing GPU Operator. + +---- + +## 1.1.0 + +### New features +* DCGM is now deployed as part of the GPU Operator on OpenShift 4.3. 
+ +### Improvements +* The operator CRD has been renamed to `ClusterPolicy`. +* The operator image is now based on UBI8. +* Helm chart has been refactored to fix issues and follow some best practices. + +### Fixed issues +* Fixed an issue with the toolkit container which would set up the NVIDIA runtime under `/run/nvidia` with a symlink to `/usr/local/nvidia`. + If a node was rebooted, this would prevent any containers from being run with Docker as the container runtime configured in `/etc/docker/daemon.json` + would not be available after reboot. +* Fixed a race condition with the creation of the CRD and registration. + +---- + +## 1.0.0 + +### New Features +* Added support for Helm v3. Note that installing the GPU Operator using Helm v2 is no longer supported. +* Added support for Red Hat OpenShift 4 (4.1, 4.2 and 4.3) using Red Hat Enterprise Linux Core OS (RHCOS) and CRI-O runtime on GPU worker nodes. +* GPU Operator now deploys NVIDIA DCGM for GPU telemetry on Ubuntu 18.04 LTS + +### Fixed Issues +* The driver container now sets up the required dependencies on `i2c` and `ipmi_msghandler` modules. +* Fixed an issue with the validation steps (for the driver and device plugin) taking considerable time. Node provisioning times are now improved by 5x. +* The SRO custom resource definition is set up as part of the operator. +* Fixed an issue with the cleanup of driver mount files when deleting the operator from the cluster. This issue previously required a reboot of the node. + +### Known Limitations + +* After uninstall of GPU Operator, NVIDIA driver modules might still be loaded. Either reboot the node or forcefully remove them using + `sudo rmmod nvidia nvidia_modeset nvidia_uvm` command before re-installing GPU Operator. 
diff --git a/gpu-operator/.agents/skills/gpu-operator-references/references/security.md b/gpu-operator/.agents/skills/gpu-operator-references/references/security.md new file mode 100644 index 000000000..f621786fe --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-references/references/security.md @@ -0,0 +1,39 @@ + + +# Security Considerations + +## Pod Security Context of the Operator and Operands + +Several of the NVIDIA GPU Operator operands, such as the driver containers and container toolkit, +require the following elevated privileges: + +- `privileged: true` +- `hostPID: true` +- `hostIPC: true` + +The elevated privileges are required for the following reasons: + +- Access to the host file system and hardware devices, such as NVIDIA GPUs. +- Restart system services such as containerd. +- Loading and unloading kernel modules. + +Only the Kubernetes cluster administrator needs to access or manage the Operator namespace. +As a best practice, establish proper security policies and prevent any other users from accessing the Operator namespace. + +## CVEs + +The following is a list of known CVEs in the GPU Operator or its operands. +To view any published security bulletins for NVIDIA products, refer to the NVIDIA product security page at https://www.nvidia.com/en-us/security/. 
+ +| CVE ID | Affected Components | Fixed Version | +| --- | --- | --- | +| [NVIDIA CVE-2025-23359](https://nvidia.custhelp.com/app/answers/detail/a_id/5616) | NVIDIA Container Toolkit, all versions up to and including 1.17.3 NVIDIA GPU Operator, all versions up to and including 24.9.1 | NVIDIA Container Toolkit 1.17.4 NVIDIA GPU Operator 24.9.2 | +| [NVIDIA CVE-2024-0135](https://nvidia.custhelp.com/app/answers/detail/a_id/5599) | NVIDIA Container Toolkit, all versions up to and including 1.17.2 NVIDIA GPU Operator, all versions up to and including 24.9.0 | NVIDIA Container Toolkit 1.17.3 NVIDIA GPU Operator 24.9.1 | +| [NVIDIA CVE-2024-0136](https://nvidia.custhelp.com/app/answers/detail/a_id/5599) | NVIDIA Container Toolkit, all versions up to and including 1.17.2 NVIDIA GPU Operator, all versions up to and including 24.9.0 | NVIDIA Container Toolkit 1.17.3 NVIDIA GPU Operator 24.9.1 | +| [NVIDIA CVE-2024-0137](https://nvidia.custhelp.com/app/answers/detail/a_id/5599) | NVIDIA Container Toolkit, all versions up to and including 1.17.2 NVIDIA GPU Operator, all versions up to and including 24.9.0 | NVIDIA Container Toolkit 1.17.3 NVIDIA GPU Operator 24.9.1 | +| [NVIDIA CVE-2024-0134](https://nvidia.custhelp.com/app/answers/detail/a_id/5585) | NVIDIA Container Toolkit, all versions up to and including 1.16.2 NVIDIA GPU Operator, all versions up to and including 24.6.2 | NVIDIA Container Toolkit 1.17.0 NVIDIA GPU Operator 24.9.0 | +| [NVIDIA CVE-2024-0132](https://nvidia.custhelp.com/app/answers/detail/a_id/5582) | NVIDIA Container Toolkit, all versions up to and including 1.16.1 NVIDIA GPU Operator, all versions up to and including 24.6.1 | NVIDIA Container Toolkit 1.16.2 NVIDIA GPU Operator 24.6.2 | +| [NVIDIA CVE-2024-0133](https://nvidia.custhelp.com/app/answers/detail/a_id/5582) | NVIDIA Container Toolkit, all versions up to and including 1.16.1 NVIDIA GPU Operator, all versions up to and including 24.6.1 | NVIDIA Container Toolkit 1.16.2 NVIDIA GPU Operator 
24.6.2 | +### Report a Vulnerability + +For details on reporting a suspected vulnerability, refer to the [GPU Operator Security policies](https://github.com/NVIDIA/gpu-operator/blob/main/SECURITY.md/) page. diff --git a/gpu-operator/.agents/skills/gpu-operator-references/references/troubleshooting.md b/gpu-operator/.agents/skills/gpu-operator-references/references/troubleshooting.md new file mode 100644 index 000000000..3c4c879de --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-references/references/troubleshooting.md @@ -0,0 +1,554 @@ + + +# Troubleshooting the NVIDIA GPU Operator + +This page outlines common issues and troubleshooting steps for the NVIDIA GPU Operator. + +If you are facing a gpu-operator and/or operand(s) issue that is not documented in this guide, it is recommended that you run the `must-gather` utility, prepare a bug report, then file an issue in the [NVIDIA GPU Operator GitHub repository](https://github.com/NVIDIA/gpu-operator/issues). + +```console +curl -o must-gather.sh -L https://raw.githubusercontent.com/NVIDIA/gpu-operator/main/hack/must-gather.sh +chmod +x must-gather.sh +./must-gather.sh +``` + +This utility is used to collect relevant information from your cluster that is needed for diagnosing and debugging issues. +The final output is an archive file which contains the manifests and logs of all the components managed by gpu-operator. + +## The `nouveau` driver fails to initialize the GPU + +### Observation +- The GPU driver fails to initialize the GPU with the error `Failed to enable MSI-X` in the system journal logs. +- All GPU Operator pods become stuck in the `init` state. + +### Root Cause +- The `nouveau` Linux kernel module is loaded. + +### Action +The `nouveau` driver must be denylisted when using NVIDIA vGPU. 
+ +Follow the instructions in the [NVIDIA AI Enterprise: VMware Deployment Guide](https://docs.nvidia.com/ai-enterprise/deployment/vmware/latest/nouveau.html#disable-nouveau) +to disable `nouveau` on your OS/distro to resolve this issue. + +## GPU Operator pods are stuck in Init + +### Observation +The output from `kubectl get pods -n gpu-operator`, shows something like: + +```console +gpu-feature-discovery-tmblp 0/1 Init:0/1 0 11m +nvidia-container-toolkit-daemonset-mqzwq 0/1 Init:0/1 0 2m +nvidia-dcgm-exporter-qpxxl 0/1 Init:0/1 0 8m32s +nvidia-device-plugin-daemonset-tl9k7 0/1 Init:0/1 0 11m +nvidia-operator-validator-th4w7 0/1 Init:0/4 0 10m +nvidia-driver-daemonset-4rtiu 0/2 Running 3 12m +``` + +### Root Cause +This most likely refers to an issue with the nvidia-driver-daemonset. +Note that the operand pods will only come up when the driver daemonset and toolkit pods come up successfully. + +1. **Check the driver daemonset pod logs:** + + - To retrieve the main driver container logs: + + ```console + kubectl logs -n gpu-operator nvidia-driver-daemonset-p97x5 -c nvidia-driver-ctr + ``` + + - If you see `Init:Error` in the kubectl output, then retrieve the k8s-driver-manager logs + + ```console + kubectl logs -n gpu-operator nvidia-driver-daemonset-p97x5 -c k8s-driver-manager + ``` + +2. **Check the dmesg logs** + + - `dmesg` displays the messages generated by the Linux Kernel. `dmesg` helps us detect any issues loading the GPU driver modules especially when the driver daemonset logs do not provide a lot of information + - You can retrieve `dmesg` using either: kubectl exec or execute `dmesg` in your host terminal. + + kubectl exec + + ```console + kubectl exec -n gpu-operator -it nvidia-driver-daemonset-p97x5 -c nvidia-driver-ctr -- dmesg + ``` + + Execute `dmesg` in your host terminal + + ```console + sudo dmesg + ``` + + **TIP**: You can also grep for NVRM or Xid to view logs emitted by the driver's kernel module. 
+ + ```console + sudo dmesg | grep -i NVRM + ``` + + OR + + ```console + sudo dmesg | grep -i Xid + ``` + +3. **Ensure that your driver daemonset has internet access to download deb/rpm packages during runtime:** + + - Check your Kubernetes cluster's VPC, Security groups and DNS settings + - Consider executing into a container shell and testing internet connectivity with a simple `ping` command + +## No runtime for "nvidia" is configured + +### Observation +When you run `kubectl describe` for one of the gpu-operator pods, you see an error like: + +```console +Warning FailedCreatePodSandBox 2m37s (x94 over 22m) kubelet Failed to create pod sandbox: rpc error: code = Unknown desc = failed to get sandbox runtime: no runtime for "nvidia" is configured +``` + +### Root Cause +This means that the `RuntimeClass` is unable to find the runtime handler named "nvidia" in your container runtime's configuration. +The runtime handler is added by the nvidia-container-toolkit, so this error message is likely related to startup issues with nvidia-container-toolkit. + +### Action +1. **Check the nvidia-container-toolkit logs** + + - To retrieve the toolkit pod logs: + + ```console + kubectl logs -n gpu-operator nvidia-container-toolkit-daemonset-2rhwg -c nvidia-container-toolkit-ctr + ``` + +2. **Check the driver daemonset logs** + + - Ensure the driver daemonset is up and running. Refer to the "GPU Operator pods are stuck in Init" section of this page. + +3. **Review the container runtime configuration TOML** + + - CRI-O and Containerd are the two main container runtimes supported by the toolkit. 
You can view the runtime configuration file and verify that the "nvidia" container runtime handler exists + - Here are some ways to retrieve the container runtime config: + + - If using "containerd", run the `containerd config` command to retrieve the active containerd configuration + - If using "cri-o", run the `crio status config` command to retrieve the active cri-o configuration + +## Operator validator pods crashing with "error code system not yet initialized" + +When the operator validator pods are crashing with this error, this most likely points to a GPU node that is NVSwitch-based and requires the nvidia-fabricmanager to be installed. +NVSwitch-based systems, like NVIDIA DGX and NVIDIA HGX server systems, require the memory fabric to be set up after the GPU driver is installed. +Learn more about the Fabric Manager from the [Fabric Manager user guide](https://docs.nvidia.com/datacenter/tesla/fabric-manager-user-guide/index.html) + +### Action +1. **nvidia-smi -q** + + - Execute into the driver container and run `nvidia-smi -q` if you are using gpu driver daemonset. + + ```console + kubectl exec -n gpu-operator -it nvidia-driver-daemonset-p97x5 -c nvidia-driver-ctr -- nvidia-smi -q + ``` + + - The `nvidia-smi -q` displays a verbose output with all the attributes of a GPU + - If you see the following in the `nvidia-smi -q` command output, then the nvidia-fabricmanager needs to be installed + + ```console + Fabric + State : In Progress + Status : N/A + CliqueId : N/A + ClusterUUID : N/A + ``` + + Note: If your driver is pre-installed on your host system, run `nvidia-smi -q` in your host's shell terminal + +2. **Refer to the nvidia-driver-daemonset logs** + + - The driver daemonset has the logic to detect NVSwitches and install the `nvidia-fabricmanager` if they are found + - Check the driver daemonset logs to confirm if the NVSwitch devices were detected and/or if the `nvidia-fabricmanager` was installed successfully + +3. 
**Check the Fabric Manager logs** + + - If the operator validator pods are still crashing despite fabric manager being installed, you may need to look up the fabric manager logs + - Execute into the driver container and run `cat /var/log/fabricmanager.log` if the gpu driver daemonset is deployed + + ```console + kubectl exec -n gpu-operator -it nvidia-driver-daemonset-p97x5 -c nvidia-driver-ctr -- cat /var/log/fabricmanager.log + ``` + + - If you are using a host-installed driver, SSH into the host and run `cat /var/log/fabricmanager.log` + +## GPU Feature Discovery crashing with CreateContainerError/CrashLoopBackoff + +When the GPU Feature Discovery pods start crashing and you see the error below in the `kubectl describe` output, the root cause is likely a driver/hardware issue. + +```console +.... +.... + Containers: + gpu-feature-discovery: + Container ID: containerd://947879d0f2a3e3a11187c3435c2e13f1d8962540b8853cebb409eaa47f661c34 Image: nvcr.io/nvidia/gpu-feature-discovery:v0.8.0-ubi8 + Image ID: nvcr.io/nvidia/gpu-feature-discovery@sha256:84ce86490d0d313ed6517f2ac3a271e1179d7478d86c772da3846727d7feddc3 Port: + Host Port: State: Waiting + Reason: CrashLoopBackOff Last State: Terminated + Reason: StartError Message: failed to create containerd task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: error running + hook #0: error running hook: exit status 1, stdout: , stderr: nvidia-container-cli.real: initialization error: driver rpc error: timed out: unknown +``` + +### Action +1. **Check dmesg logs** + + - `dmesg` can be used to retrieve any issues stemming from gpu driver/hardware. 
+ - You can fine-tune your search by grepping for `NVRM` or `Xid` in your dmesg command output + - Your command would look like - `sudo dmesg | grep -i NVRM` or `sudo dmesg | grep -i Xid` + - If the output from the previous command has something like the snippet below, then it is likely a GPU driver/hardware issue. + + ```console + # dmesg |grep -i xid + NVRM: Xid (PCI:0000:ca:00): 79, pid='', name=, GPU has fallen off the bus. + ``` + + This error message indicates an Xid error with the code 79. For more information on Xid errors and their various error codes, refer to this [page](https://docs.nvidia.com/deploy/xid-errors). + +2. **Check nvidia-device-plugin-daemonset logs** + + - The `nvidia-device-plugin` has a health checker module which periodically monitors the NVML event stream for any Xid errors and marks a GPU as unhealthy if an Xid error is reported against it. + - Retrieve the `nvidia-device-plugin-daemonset` pod logs: + + ```console + kubectl logs -n gpu-operator nvidia-device-plugin-daemonset-9bmvc -c nvidia-device-plugin + ``` + + - If there are Xid errors, the device plugin logs should look something like the following: + + ```console + XidCriticalError: Xid=48 on Device=GPU-e3dbf294-2783-f38b-4274-5bc836df5be1; marking device as unhealthy. + + 'nvidia.com/gpu' device marked unhealthy: GPU-e3dbf294-2783-f38b-4274-5bc836df5be1 + ``` + +## GPU Node does not have the expected number of GPUs + +When inspecting your GPU node, you may not see the expected number of "Allocatable" GPUs advertised in the node. + +For example, given a GPU node with eight GPUs, the kubectl describe output might look like the following snippet: + +```console +Name: gpu-node-1 +Roles: worker +...... +......
+Addresses: + InternalIP: 10.158.144.58 + Hostname: gpu-node-1 +Capacity: + cpu: 96 + ephemeral-storage: 106935552Ki + hugepages-1Gi: 0 + hugepages-2Mi: 0 + memory: 527422416Ki + nvidia.com/gpu: 7 + pods: 110 +Allocatable: + cpu: 96 + ephemeral-storage: 98551804561 + hugepages-1Gi: 0 + hugepages-2Mi: 0 + memory: 527320016Ki + nvidia.com/gpu: 7 + pods: 110 +.... +.... +``` + +The above node only advertises seven GPU devices as allocatable when we expect it to display eight instead + +### Action +1. Check for any Xid errors in the `nvidia-device-plugin-daemonset` pod logs. If an Xid error is raised for a GPU, + the device plugin will automatically mark the GPU as unhealthy and take it off the list of "Allocatable" GPUs. + Here are some example device-plugin logs in the event of an Xid error: + + ```console + I0624 22:58:05.486593 1 health.go:159] Processing event {Device:{Handle:0x7f7597647848} EventType:8 EventData:109 GpuInstanceId:4294967295 ComputeInstanceId:4294967295} + I0624 22:58:05.486697 1 health.go:185] XidCriticalError: Xid=79 on Device=GPU-adb24b25-1db1-436e-d958-ddee5da83d07; marking device as unhealthy. + I0624 22:58:05.486727 1 server.go:276] 'nvidia.com/gpu' device marked unhealthy: GPU-adb24b25-1db1-436e-d958-ddee5da83d07 + ``` + +2. You can also check for Xid errors in GPU node's `dmesg` logs. + + ```console + sudo dmesg | grep -i xid + ``` + +3. For more information on Xid error codes and how to resolve them, you can refer to [Xid Errors](https://docs.nvidia.com/deploy/xid-errors/index.html) page. + +## DCGM Exporter pods go into CrashLoopBackoff + +By default, the GPU Operator only deploys the `dcgm-exporter` while disabling the standalone `dcgm`. In this setup, the `dcgm-exporter` spawns a dcgm process locally. If, however, `dcgm` is enabled and deployed as a separate pod/container, then the `dcgm-exporter` will attempt to connect to the `dcgm` pod through a Kubernetes service. 
If the cluster networking settings are not applied correctly, you would likely see the following error messages in the `dcgm-exporter` logs: + +```console +time="2025-06-25T20:09:25Z" level=info msg="Attempting to connect to remote hostengine at nvidia-dcgm:5555" +time="2025-06-25T20:09:30Z" level=error msg="Encountered a failure." stacktrace="goroutine 1 [running]:\nruntime/debug.Stack() +/usr/local/go/src/runtime/debug/stack.go:24 +0x5e\ngithub.com/NVIDIA/dcgm-exporter/pkg/cmd.action.func1.1() +/go/src/github.com/NVIDIA/dcgm-exporter/pkg/cmd/app.go:283 +0x3d\npanic({0x18b42c0?, 0x2a8d3e0?}) +/usr/local/go/src/runtime/panic.go:770 +``` + +### Action +1. If you have `NetworkPolicies` set up, ensure that they are configured to allow the dcgm-exporter pod to communicate with the dcgm pod. +2. Ensure that you don't have security groups or network firewall settings preventing pod-to-pod traffic, whether intranode or internode. + +## GPU driver upgrades are not progressing + +Despite initiating a cluster-wide driver upgrade, not every driver daemonset pod gets updated to the desired version, and this state may persist for a long period of time. + +```console +$ kubectl get daemonsets -n gpu-operator nvidia-driver-daemonset +NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE +nvidia-driver-daemonset 4 4 4 3 4 nvidia.com/gpu.deploy.driver=true 14d +``` + +### Action +1. Check for any nodes that have the `upgrade-failed` label. + + ```console + kubectl get nodes -l nvidia.com/gpu-driver-upgrade-state=upgrade-failed + ``` + +2. Check the driver daemonset pod logs on these nodes. +3. If the driver daemonset pod logs are not informative, check the node's `dmesg`. +4. Once the issue is resolved, you can re-label the node with the command below. The `--overwrite` flag is required because the node already carries the label with the value `upgrade-failed`: + + ```console + kubectl label node <node-name> "nvidia.com/gpu-driver-upgrade-state=upgrade-required" --overwrite + ``` + +5. If the driver upgrade is still stuck, delete the driver pod on the node.
+ +## Pods stuck in Pending state in mixed MIG + full GPU environments + +### Issue +For drivers 570.124.06, 570.133.20, 570.148.08, and 570.158.01, +GPU workloads cannot be scheduled on nodes that have a mix of MIG slices and full GPUs. +For more detailed information, refer to GitHub issue https://github.com/NVIDIA/gpu-operator/issues/1361. + +### Observation +When a GPU pod is created on a node that has a mix of MIG slices and full GPUs, +the GPU pod gets stuck indefinitely in the `Pending` state. + +### Root Cause +This is due to a regression in NVML introduced in the R570 drivers starting from 570.124.06. + +### Action +NVIDIA recommends that you downgrade to driver version 570.86.15 to work around this issue. + +## GPU Operator Validator: Failed to Create Pod Sandbox + +### Issue +On some occasions, the driver container is unable to unload the `nouveau` Linux kernel module. + +### Observation +- Running `kubectl describe pod -n gpu-operator -l app=nvidia-operator-validator` includes the following event: + + ```console + Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Warning FailedCreatePodSandBox 8s (x21 over 9m2s) kubelet Failed to create pod sandbox: rpc error: code = Unknown desc = failed to get sandbox runtime: no runtime for "nvidia" is configured + ``` + +- Running one of the following commands on the node indicates that the `nouveau` Linux kernel module is loaded: + + ```console + $ lsmod | grep -i nouveau + $ dmesg | grep -i nouveau + $ journalctl -xb | grep -i nouveau + ``` + +### Root Cause +The `nouveau` Linux kernel module is loaded and the driver container is unable to unload the module. +Because the `nouveau` module is loaded, the driver container cannot load the `nvidia` module. 
+ +### Action +On each node, run the following commands to prevent loading the `nouveau` Linux kernel module on boot: + +```console +$ sudo tee /etc/modules-load.d/ipmi.conf <<< "ipmi_msghandler" \ + && sudo tee /etc/modprobe.d/blacklist-nouveau.conf <<< "blacklist nouveau" \ + && sudo tee -a /etc/modprobe.d/blacklist-nouveau.conf <<< "options nouveau modeset=0" + +$ sudo update-initramfs -u + +$ sudo init 6 +``` + +## No GPU Driver or Operand Pods Running + +### Issue +On some clusters, taints are applied to nodes with a taint effect of `NoSchedule`. + +### Observation +- Running `kubectl get ds -n gpu-operator` shows `0` for `DESIRED`, `CURRENT`, `READY` and so on. + + ```console + NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE + gpu-feature-discovery 0 0 0 0 0 nvidia.com/gpu.deploy.gpu-feature-discovery=true 11m + ... + ``` + +### Root Cause +The `NoSchedule` taint prevents the Operator from deploying the GPU Driver and other Operand pods. + +### Action +Describe each node, identify the taints, and either remove the taints from the nodes or add the taints as tolerations to the daemon sets. + +## GPU Operator Pods Stuck in Crash Loop + +### Issue +On large clusters, such as 300 or more nodes, the GPU Operator pods +can get stuck in a crash loop. 
+ +### Observation +- The GPU Operator pod is not running: + + ```console + $ kubectl get pod -n gpu-operator -l app=gpu-operator + ``` + + *Example Output* + + ```output + NAME READY STATUS RESTARTS AGE + gpu-operator-568c7ff7f6-chg5b 0/1 CrashLoopBackOff 4 (85s ago) 4m42s + ``` + +- The node that is running the GPU Operator pod has sufficient resources and the node is `Ready`: + + ```console + $ kubectl describe node + ``` + + *Example Output* + + ```output + Conditions: + Type Status LastHeartbeatTime LastTransitionTime Reason Message + ---- ------ ----------------- ------------------ ------ ------- + MemoryPressure False Tue, 26 Dec 2023 14:01:31 +0000 Tue, 12 Dec 2023 19:47:47 +0000 KubeletHasSufficientMemory kubelet has sufficient memory available + DiskPressure False Tue, 26 Dec 2023 14:01:31 +0000 Thu, 14 Dec 2023 19:15:03 +0000 KubeletHasNoDiskPressure kubelet has no disk pressure + PIDPressure False Tue, 26 Dec 2023 14:01:31 +0000 Tue, 12 Dec 2023 19:47:47 +0000 KubeletHasSufficientPID kubelet has sufficient PID available + Ready True Tue, 26 Dec 2023 14:01:31 +0000 Thu, 14 Dec 2023 19:15:13 +0000 KubeletReady kubelet is posting ready status + ``` + +### Root Cause +The memory resource limit for the GPU Operator is too low for the cluster size. + +### Action +Increase the memory request and limit for the GPU Operator pod: + +- Set the memory request to a value that matches the average memory consumption over a large time window. +- Set the memory limit to match the spikes in memory consumption that occur occasionally. + +1. Increase the memory resource limit for the GPU Operator pod: + + ```console + $ kubectl patch deployment gpu-operator -n gpu-operator --type='json' \ + -p='[{"op":"replace", "path":"/spec/template/spec/containers/0/resources/limits/memory", "value":"1400Mi"}]' + ``` + +1. 
Optional: Increase the memory resource request for the pod: + + ```console + $ kubectl patch deployment gpu-operator -n gpu-operator --type='json' \ + -p='[{"op":"replace", "path":"/spec/template/spec/containers/0/resources/requests/memory", "value":"600Mi"}]' + ``` + +Monitor the GPU Operator pod. +Increase the memory request and limit again if the pod remains stuck in a crash loop. + +## infoROM is corrupted (nvidia-smi return code 14) + +### Issue +The nvidia-operator-validator pod fails and the nvidia-driver-daemonset pod fails as well. + +### Observation +The output from the driver validation container indicates that the infoROM is corrupt: + +```console +$ kubectl logs -n gpu-operator nvidia-operator-validator-xxxxx -c driver-validation +``` + +*Example Output* + +```output +| NVIDIA-SMI 470.82.01 Driver Version: 470.82.01 CUDA Version: 11.4 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|===============================+======================+======================| +| 0 Tesla P100-PCIE... On | 00000000:0B:00.0 Off | 0 | +| N/A 42C P0 29W / 250W | 0MiB / 16280MiB | 0% Default | +| | | N/A | ++-------------------------------+----------------------+----------------------+ + ++-----------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=============================================================================| +| No running processes found | ++-----------------------------------------------------------------------------+ +WARNING: infoROM is corrupted at gpu 0000:0B:00.0 +14 +``` + +The GPU emits some warning messages related to infoROM. +The return values for the `nvidia-smi` command are listed below.
+ +```console +RETURN VALUE + +Return code reflects whether the operation succeeded or failed and what +was the reason of failure. + +· Return code 0 - Success +· Return code 2 - A supplied argument or flag is invalid +· Return code 3 - The requested operation is not available on target device +· Return code 4 - The current user does not have permission to access this device or perform this operation +· Return code 6 - A query to find an object was unsuccessful +· Return code 8 - A device's external power cables are not properly attached +· Return code 9 - NVIDIA driver is not loaded +· Return code 10 - NVIDIA Kernel detected an interrupt issue with a GPU +· Return code 12 - NVML Shared Library couldn't be found or loaded +· Return code 13 - Local version of NVML doesn't implement this function +· Return code 14 - infoROM is corrupted +· Return code 15 - The GPU has fallen off the bus or has otherwise become inaccessible +· Return code 255 - Other error or internal driver error occurred +``` + +### Root Cause +The `nvidia-smi` command must return a success code (return code 0) for the driver-validation container to pass and for the GPU Operator to successfully deploy the driver pod on the node. In this case, `nvidia-smi` returns code 14 because the infoROM is corrupted. + +### Action +Replace the faulty GPU. + +## EFI + Secure Boot + +### Issue +GPU Driver pod fails to deploy. + +### Root Cause +EFI Secure Boot is currently not supported with the GPU Operator. + +### Action +Disable EFI Secure Boot on the server. + +## GPU Operator pods in `Init:RunContainerError` or `Init:CreateContainerError` state + +### Issue +If you are installing the GPU Operator, upgrading the GPU Operator, or upgrading the GPU driver daemonset to v25.10.0 with CRI-O as the container runtime, you may notice that several of the GPU Operator pods are stuck in the `Init:RunContainerError` or `Init:CreateContainerError` state. + +### Root Cause +Refer to this [GitHub issue](https://github.com/cri-o/cri-o/issues/9521) for details on the root cause and proposed solution to this known CRI-O limitation.
+ +### Action +The errors will eventually resolve on their own after the driver daemonset is installed or the upgrade is complete. + +This issue was fixed in GPU Operator v25.10.1 and later. diff --git a/gpu-operator/.agents/skills/gpu-operator-timeslicing-gpus/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-timeslicing-gpus/SKILL.md new file mode 100644 index 000000000..bb8074d68 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-timeslicing-gpus/SKILL.md @@ -0,0 +1,390 @@ +--- +name: "gpu-operator-timeslicing-gpus" +description: "Explains GPU sharing and time-slicing configuration. Use when users need multiple workloads to share GPUs or need to configure time-sliced GPU resources. Trigger keywords - NVIDIA GPU Operator, GPU sharing, time-slicing, Kubernetes." +--- + + + + +# Time-Slicing GPUs in Kubernetes + +## Step 1: Understanding Time-Slicing GPUs + +The NVIDIA GPU Operator enables oversubscription of GPUs through a set +of extended options for the [NVIDIA Kubernetes Device Plugin](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/k8s-device-plugin). +GPU time-slicing enables workloads that are scheduled on oversubscribed GPUs to +interleave with one another. + +This mechanism for enabling *time-slicing* of +GPUs in Kubernetes enables a system administrator to define a set of +*replicas* for a GPU, each of which can be handed out independently to a +pod to run workloads on. Unlike Multi-Instance GPU (MIG), there is no memory or +fault-isolation between replicas, but for some workloads this is better +than not being able to share at all. Internally, GPU +time-slicing is used to multiplex workloads from +replicas of the same underlying GPU. + +**Note:** + +A typical resource request provides exclusive access to GPUs. +A request for a time-sliced GPU provides shared access. +A request for more than one time-sliced GPU does not guarantee that the pod +receives access to a proportional amount of GPU compute power. 
+ +A request for more than one time-sliced GPU only specifies that the pod +receives access to a GPU that is shared by other pods. +Each pod can run as many processes as it needs on the underlying GPU, without a limit. +The GPU simply provides an equal share of time to all GPU processes, across +all of the pods. +You can apply a cluster-wide default time-slicing configuration. +You can also apply node-specific configurations. +For example, you can apply a time-slicing configuration to nodes with Tesla-T4 GPUs only +and not modify nodes with other GPU models. + +You can combine the two approaches by applying a cluster-wide default configuration +and then labeling nodes so that those nodes receive a node-specific configuration. + +### Comparison: Time-Slicing and Multi-Instance GPU + +The latest generations of NVIDIA GPUs provide an operation mode called +Multi-Instance GPU (MIG). MIG allows you to partition a GPU +into several smaller, predefined instances, each of which looks like a +mini-GPU that provides memory and fault isolation at the hardware layer. +You can share access to a GPU by running workloads on one of +these predefined instances instead of the full native GPU. + +MIG support was added to Kubernetes in 2020. Refer to [Supporting MIG in Kubernetes](https://docs.google.com/document/d/1mdgMQ8g7WmaI_XVVRrCvHPFPOMCm5LQD5JefgAh6N8g/edit) +for details on how this works. + +Time-slicing trades the memory and fault-isolation that is provided by MIG +for the ability to share a GPU among a larger number of users. +Time-slicing also makes it possible to share access to +older generation GPUs that do not support MIG. +However, you can combine MIG and time-slicing to provide shared access to +MIG instances.
+ +### Supported Platforms and Resource Types + +GPU time-slicing can be used with bare-metal applications, virtual machines +with GPU passthrough, and virtual machines with NVIDIA vGPU. + +Currently, the only supported resource types are `nvidia.com/gpu` +and any of the resource types that emerge from configuring a node with +the mixed MIG strategy. + +### Limitations + +- DCGM-Exporter does not support associating metrics to containers when GPU time-slicing is enabled with the NVIDIA Kubernetes Device Plugin. +- The Operator does not monitor changes to a time-slicing config map. + Refer to the "Updating a Time-Slicing Config Map" section. + +### Changes to Node Labels + +In addition to the standard node labels that GPU Feature Discovery (GFD) +applies to nodes, the following label is also applied after you configure +GPU time-slicing for a node: + +```yaml +nvidia.com/<resource-name>.replicas = <replicas-count> +``` + +Where `<replicas-count>` is the factor by which each resource of `<resource-name>` is oversubscribed. + +Additionally, by default, the `nvidia.com/<resource-name>.product` label is modified: + +```yaml +nvidia.com/<resource-name>.product = <product-name>-SHARED +``` + +For example, on an NVIDIA DGX A100 machine, depending on the time-slicing configuration, +the labels can be similar to the following example: + +```yaml +nvidia.com/gpu.replicas = 8 +nvidia.com/gpu.product = A100-SXM4-40GB-SHARED +``` + +Using these labels, you can request time-sliced access to a GPU or exclusive access to a GPU +in the same way that you traditionally specify a node selector to request one GPU model over another. +That is, the `-SHARED` product name suffix ensures that you can specify a +node selector to assign pods to nodes with time-sliced GPUs. + +The `migStrategy` configuration option has an effect on the node label for the product name.
+When `renameByDefault=false`, the default value, and `migStrategy=single`, both the MIG profile name +and the `-SHARED` suffix are appended to the product name, such as the following example: + +```yaml +nvidia.com/gpu.product = A100-SXM4-40GB-MIG-1g.5gb-SHARED +``` + +If you set `renameByDefault=true`, then the value of the `nvidia.com/gpu.product` node +label is not modified. + +## Step 2: Configuration + +### About Configuring GPU Time-Slicing + +You configure GPU time-slicing by performing the following high-level steps: + +* Add a config map to the namespace that is used by the GPU operator. +* Configure the cluster policy so that the device plugin uses the config map. +* Apply a label to the nodes that you want to configure for GPU time-slicing. + +On a machine with one GPU, a config map that specifies four replicas of the `nvidia.com/gpu` resource configures Kubernetes so that +the node advertises four GPU resources. +A machine with two GPUs advertises eight GPUs, and so on. + +### Sample Config Map + +The following table describes the key fields in the config map. + +| Field | Type | Description | +| --- | --- | --- | +| `data.<key>` | string | Specifies the time-slicing configuration name. You can specify multiple configurations if you want to assign node-specific configurations. In the cluster-wide configuration described later in this document, the value for `<key>` is `any`. | +| `flags.migStrategy` | string | Specifies how to label MIG devices for the nodes that receive the time-slicing configuration. Specify one of `none`, `single`, or `mixed`. The default value is `none`. | +| `renameByDefault` | boolean | When set to `true`, each resource is advertised under the name `<resource-name>.shared` instead of `<resource-name>`. For example, if this field is set to `true` and the resource is typically `nvidia.com/gpu`, the nodes that are configured for time-sliced GPU access then advertise the resource as `nvidia.com/gpu.shared`. Setting this field to true can be helpful if you want to schedule pods on GPUs with shared access by specifying `<resource-name>.shared` in the resource request.
When this field is set to `false`, the advertised resource name, such as `nvidia.com/gpu`, is not modified. However, label for the product name is suffixed with `-SHARED`. For example, if the output of `kubectl describe node` shows the node label `nvidia.com/gpu.product=Tesla-T4`, then after the node is configured for time-sliced GPU access, the label becomes `nvidia.com/gpu.product=Tesla-T4-SHARED`. In this case, you can specify a node selector that includes the `-SHARED` suffix to schedule pods on GPUs with shared access. The default value is `false`. | +| `failRequestsGreaterThanOne` | boolean | The purpose of this field is to enforce awareness that requesting more than one GPU replica does not result in receiving more proportional access to the GPU. For example, if `4` GPU replicas are available and two pods request `1` GPU each and a third pod requests `2` GPUs, the applications in the three pods have an equal share of GPU compute time. Specifically, the pod that requests `2` GPUs does not receive twice as much compute time as the pods that request `1` GPU. When set to `true`, a resource request for more than one GPU fails with an `UnexpectedAdmissionError`. In this case, you must manually delete the pod, update the resource request, and redeploy. | +| `resources.name` | string | Specifies the resource type to make available with time-sliced access, such as `nvidia.com/gpu`, `nvidia.com/mig-1g.5gb`, and so on. | +| `resources.replicas` | integer | Specifies the number of time-sliced GPU replicas to make available for shared access to GPUs of the specified resource type. | +### Applying One Cluster-Wide Configuration + +Perform the following steps to configure GPU time-slicing if you already installed the GPU operator +and want to apply the same time-slicing configuration on all nodes in the cluster. + +1. Create a file, such as `time-slicing-config-all.yaml`, with contents like the following example: + +1. 
Add the config map to the same namespace as the GPU operator: + + ```console + $ kubectl create -n gpu-operator -f time-slicing-config-all.yaml + ``` + +1. Configure the device plugin with the config map and set the default time-slicing configuration: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy \ + -n gpu-operator --type merge \ + -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config-all", "default": "any"}}}}' + ``` + +1. Optional: Confirm that the `gpu-feature-discovery` and + `nvidia-device-plugin-daemonset` pods restart. + + ```console + $ kubectl get events -n gpu-operator --sort-by='.lastTimestamp' + ``` + + *Example Output* + +Refer to time-slicing-verify. + +### Applying Multiple Node-Specific Configurations + +An alternative to applying one cluster-wide configuration is to specify multiple +time-slicing configurations in the config map and to apply labels node-by-node to +control which configuration is applied to which nodes. + +1. Create a file, such as `time-slicing-config-fine.yaml`, with contents like the following example: + +1. Add the config map to the same namespace as the GPU operator: + + ```console + $ kubectl create -n gpu-operator -f time-slicing-config-fine.yaml + ``` + +1. Configure the device plugin with the config map and set the default time-slicing configuration: + + ```console + $ kubectl patch clusterpolicies.nvidia.com/cluster-policy \ + -n gpu-operator --type merge \ + -p '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config-fine"}}}}' + ``` + + Because the specification does not include the `devicePlugin.config.default` field, + when the device plugin pods redeploy, they do not automatically apply the time-slicing + configuration to all nodes. + +1. Optional: Confirm that the `gpu-feature-discovery` and + `nvidia-device-plugin-daemonset` pods restart. + + ```console + $ kubectl get events -n gpu-operator --sort-by='.lastTimestamp' + ``` + + *Example Output* + +1. 
Apply a label to the nodes by running one or more of the following commands: + + * Apply a label to nodes one-by-one by specifying the node name: + + ```console + $ kubectl label node <node-name> nvidia.com/device-plugin.config=tesla-t4 + ``` + + * Apply a label to several nodes at one time by specifying a label selector: + + ```console + $ kubectl label node \ + --selector=nvidia.com/gpu.product=Tesla-T4 \ + nvidia.com/device-plugin.config=tesla-t4 + ``` + +Refer to the "Verifying the GPU Time-Slicing Configuration" section. + +### Configuring Time-Slicing Before Installing the NVIDIA GPU Operator + +You can enable time-slicing with the NVIDIA GPU Operator by passing the +`devicePlugin.config.name=<config-map-name>` parameter during installation. + +Perform the following steps to configure time-slicing before installing the operator: + +1. Create the namespace for the operator: + + ```console + $ kubectl create namespace gpu-operator + ``` + +1. Create a file, such as `time-slicing-config.yaml`, with the config map contents. + + Refer to the "Applying One Cluster-Wide Configuration" or + "Applying Multiple Node-Specific Configurations" sections. + +1. Add the config map to the same namespace as the GPU operator: + + ```console + $ kubectl create -n gpu-operator -f time-slicing-config.yaml + ``` + +1. Install the operator with Helm: + + ```console + $ helm install gpu-operator nvidia/gpu-operator \ + -n gpu-operator \ + --version=${version} \ + --set devicePlugin.config.name=time-slicing-config + ``` + +1. Refer to either "Applying One Cluster-Wide Configuration" or + "Applying Multiple Node-Specific Configurations" and perform the following tasks: + + * Configure the device plugin by running the `kubectl patch` command. + * Apply labels to nodes if you added a config map with node-specific configurations. + +After installation, refer to the "Verifying the GPU Time-Slicing Configuration" section. + +### Updating a Time-Slicing Config Map + +The Operator does not monitor the time-slicing config maps. +As a result, if you modify a config map, the device plugin pods do not restart and do not apply the modified configuration.
+ +To apply the modified config map, manually restart the device plugin pods: + +```console +$ kubectl rollout restart -n gpu-operator daemonset/nvidia-device-plugin-daemonset +``` + +Currently running workloads are not affected and continue to run, though NVIDIA recommends performing the restart during a maintenance period. + +## Step 3: Verifying the GPU Time-Slicing Configuration + +Perform the following steps to verify that the time-slicing configuration is applied successfully: + +1. Confirm that the node advertises additional GPU resources: + + ```console + $ kubectl describe node + ``` + + *Example Output* + + The example output varies according to the GPU in your node and the configuration + that you apply. + + The following output applies when `renameByDefault` is set to `false`, + the default value. + The key considerations are as follows: + + * The `nvidia.com/gpu.count` label reports the number of physical GPUs in the machine. + * The `nvidia.com/gpu.product` label includes a `-SHARED` suffix to the product name. + * The `nvidia.com/gpu.replicas` label matches the reported capacity. + + ```output + ... + Labels: + nvidia.com/gpu.count=4 + nvidia.com/gpu.product=Tesla-T4-SHARED + nvidia.com/gpu.replicas=4 + Capacity: + nvidia.com/gpu: 16 + ... + Allocatable: + nvidia.com/gpu: 16 + ... + ``` + + The following output applies when `renameByDefault` is set to `true`. + The key considerations are as follows: + + * The `nvidia.com/gpu.count` label reports the number of physical GPUs in the machine. + * The `nvidia.com/gpu` capacity reports `0`. + * The `nvidia.com/gpu.shared` capacity equals the number of physical GPUs multiplied by the + specified number of GPU replicas to create. + + ```output + ... + Labels: + nvidia.com/gpu.count=4 + nvidia.com/gpu.product=Tesla-T4 + nvidia.com/gpu.replicas=4 + Capacity: + nvidia.com/gpu: 0 + nvidia.com/gpu.shared: 16 + ... + Allocatable: + nvidia.com/gpu: 0 + nvidia.com/gpu.shared: 16 + ... + ``` + +1. 
Optional: Deploy a workload to validate GPU time-slicing: + + * Create a file, such as `time-slicing-verification.yaml`, with contents like the following: + + * Create the deployment with multiple replicas: + + ```console + $ kubectl apply -f time-slicing-verification.yaml + ``` + + * Verify that all five replicas are running: + + ```console + $ kubectl get pods + ``` + + *Example Output* + + * View the logs from one of the pods: + + ```console + $ kubectl logs deploy/time-slicing-verification + ``` + + *Example Output* + + * Stop the deployment: + + ```console + $ kubectl delete -f time-slicing-verification.yaml + ``` + + *Example Output* + + ```output + deployment.apps "time-slicing-verification" deleted + ``` + +## Step 4: References + +- [Blog post on GPU sharing in Kubernetes](https://developer.nvidia.com/blog/improving-gpu-utilization-in-kubernetes). +- [NVIDIA Kubernetes Device Plugin](https://github.com/NVIDIA/k8s-device-plugin) repository on GitHub. diff --git a/gpu-operator/.agents/skills/gpu-operator-uninstalling-nvidia/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-uninstalling-nvidia/SKILL.md new file mode 100644 index 000000000..114d95203 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-uninstalling-nvidia/SKILL.md @@ -0,0 +1,85 @@ +--- +name: "gpu-operator-uninstalling-nvidia" +description: "Guides users through uninstalling the NVIDIA GPU Operator and cleaning up related resources. Use when removing the Operator from a Kubernetes cluster. Trigger keywords - NVIDIA GPU Operator, uninstall, removal, Kubernetes." +--- + + + + +# Uninstalling the GPU Operator + +Perform the following steps to uninstall the Operator. + +1. Optional: List and delete NVIDIA driver custom resources. 
+ + ```console + $ kubectl get nvidiadrivers + ``` + + *Example Output* + + ```output + NAME STATUS AGE + demo-gold ready 2023-10-16T17:57:12Z + demo-silver ready 2023-10-16T17:57:12Z + ``` + + ```console + $ kubectl delete nvidiadriver demo-gold + $ kubectl delete nvidiadriver demo-silver + ``` + + ```console + $ kubectl delete crd nvidiadrivers.nvidia.com + ``` + +1. Delete the Operator: + + ```console + $ helm delete -n gpu-operator $(helm list -n gpu-operator | grep gpu-operator | awk '{print $1}') + ``` + +1. Optional: List the pods in the Operator namespace to confirm the pods are deleted or in the process of deleting: + + ```console + $ kubectl get pods -n gpu-operator + ``` + + *Example Output* + + ```output + No resources found. + ``` + +By default, Helm does not [support deleting existing CRDs](https://helm.sh/docs/chart_best_practices/custom_resource_definitions/#some-caveats-and-explanations) +when you delete the chart. +As a result, the `clusterpolicy` CRD and `nvidiadrivers` CRD will still remain, by default. + +```console +$ kubectl get crd clusterpolicies.nvidia.com +``` + +To overcome this, the Operator uses a [post-delete hook](https://helm.sh/docs/topics/charts_hooks/#the-available-hooks) +to perform the CRD cleanup. +The `operator.cleanupCRD` chart parameter is added to enable this hook. +This parameter is disabled by default. +You can enable the hook by specifying `--set operator.cleanupCRD=true` during install or upgrade to perform automatic CRD cleanup on chart deletion. + +Alternatively, you can delete the custom resource definition: + +```console +$ kubectl delete crd clusterpolicies.nvidia.com +``` + +**Note:** + +* After uninstalling the Operator, the NVIDIA driver modules might still be loaded. + Either reboot the node or unload them using the following command: + + ```console + $ sudo rmmod nvidia_modeset nvidia_uvm nvidia + ``` + +* Helm hooks used with the GPU Operator use the Operator image itself. 
+ If the Operator image cannot be pulled successfully (either due to network error or an invalid NGC registry secret in case of NVAIE), hooks will fail. + In this case, delete the chart and specify the `--no-hooks` argument to avoid hanging on hook failures. diff --git a/gpu-operator/.agents/skills/gpu-operator-upgrading-nvidia/SKILL.md b/gpu-operator/.agents/skills/gpu-operator-upgrading-nvidia/SKILL.md new file mode 100644 index 000000000..243652de1 --- /dev/null +++ b/gpu-operator/.agents/skills/gpu-operator-upgrading-nvidia/SKILL.md @@ -0,0 +1,179 @@ +--- +name: "gpu-operator-upgrading-nvidia" +description: "Guides users through upgrading the NVIDIA GPU Operator with Helm and handling CRD updates. Use when planning or performing a GPU Operator upgrade. Trigger keywords - NVIDIA GPU Operator, upgrade, Helm, Kubernetes." +--- + + + + +# Prerequisites + +- If your cluster uses Pod Security Admission (PSA) to restrict the behavior of pods, label the namespace for the Operator to set the enforcement policy to privileged, for example `kubectl label --overwrite ns gpu-operator pod-security.kubernetes.io/enforce=privileged`. + +# Upgrading the NVIDIA GPU Operator + +## Step 1: Using Helm + +The GPU Operator supports dynamic updates to existing resources. +This ability enables the GPU Operator to ensure settings from the cluster policy specification are always applied and current. + +Because Helm [does not support](https://helm.sh/docs/chart_best_practices/custom_resource_definitions/#some-caveats-and-explanations) automatic upgrade of existing CRDs, +you can upgrade the GPU Operator chart manually or by enabling a Helm hook. + +### Option 1: Manually Upgrading CRDs + + ```mermaid + flowchart LR + + A["Update CRD from + the latest chart"] + --> + B["Upgrade by + using Helm"] + ``` + +With this procedure, all existing GPU Operator resources are updated inline and the cluster policy resource is patched with updates from `values.yaml`. + +1. Specify the Operator release tag in an environment variable, replacing `${version}` with the release tag that you are upgrading to, such as `v24.9.0`: + + ```console + $ export RELEASE_TAG=${version} + ``` + +1.
Apply the custom resource definitions for the cluster policy and NVIDIA driver: + + ```console + $ kubectl apply -f \ + https://raw.githubusercontent.com/NVIDIA/gpu-operator/refs/tags/$RELEASE_TAG/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml + + $ kubectl apply -f \ + https://raw.githubusercontent.com/NVIDIA/gpu-operator/refs/tags/$RELEASE_TAG/deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml + ``` + + *Example Output* + + ```output + customresourcedefinition.apiextensions.k8s.io/clusterpolicies.nvidia.com configured + customresourcedefinition.apiextensions.k8s.io/nvidiadrivers.nvidia.com created + ``` + +1. Apply the custom resource definition for Node Feature Discovery: + + ```console + $ kubectl apply -f \ + https://raw.githubusercontent.com/NVIDIA/gpu-operator/refs/tags/$RELEASE_TAG/deployments/gpu-operator/charts/node-feature-discovery/crds/nfd-api-crds.yaml + ``` + + *Example Output* + + ```output + customresourcedefinition.apiextensions.k8s.io/nodefeaturerules.nfd.k8s-sigs.io configured + ``` + +1. Update the information about the Operator chart: + + ```console + $ helm repo update nvidia + ``` + + *Example Output* + + ```output + Hang tight while we grab the latest from your chart repositories... + ...Successfully got an update from the "nvidia" chart repository + Update Complete. ⎈Happy Helming!⎈ + ``` + +1. Fetch the values from the chart: + + ```console + $ helm show values nvidia/gpu-operator --version=$RELEASE_TAG > values-$RELEASE_TAG.yaml + ``` + +1. Update the values file as needed. + +1. Upgrade the Operator: + + ```console + $ helm upgrade gpu-operator nvidia/gpu-operator -n gpu-operator -f values-$RELEASE_TAG.yaml --version $RELEASE_TAG + ``` + + *Example Output* + + ```output + Release "gpu-operator" has been upgraded. Happy Helming! 
+ NAME: gpu-operator + LAST DEPLOYED: Thu Apr 20 15:05:52 2023 + NAMESPACE: gpu-operator + STATUS: deployed + REVISION: 2 + TEST SUITE: None + ``` + +### Option 2: Automatically Upgrading CRDs Using a Helm Hook + +Starting with GPU Operator v22.09, a `pre-upgrade` Helm [hook](https://helm.sh/docs/topics/charts_hooks/#the-available-hooks) can automatically upgrade the CRDs to the latest version. + +Starting with GPU Operator v24.9.0, the upgrade CRD Helm hook is enabled by default and runs an upgrade CRD job when you upgrade using Helm. + +1. Specify the Operator release tag in an environment variable, replacing `${version}` with the release tag that you are upgrading to, such as `v24.9.0`: + + ```console + $ export RELEASE_TAG=${version} + ``` + +1. Update the information about the Operator chart: + + ```console + $ helm repo update nvidia + ``` + + *Example Output* + + ```output + Hang tight while we grab the latest from your chart repositories... + ...Successfully got an update from the "nvidia" chart repository + Update Complete. ⎈Happy Helming!⎈ + ``` + +1. Fetch the values from the chart: + + ```console + $ helm show values nvidia/gpu-operator --version=$RELEASE_TAG > values-$RELEASE_TAG.yaml + ``` + +1. Update the values file as needed. + +1. Upgrade the Operator: + + ```console + $ helm upgrade gpu-operator nvidia/gpu-operator -n gpu-operator \ + --disable-openapi-validation -f values-$RELEASE_TAG.yaml --version $RELEASE_TAG + ``` + + **Note:** + + * The `--disable-openapi-validation` option is required in this case so that Helm does not validate the custom resource instance from the new chart against the old CRD. + Because the custom resource instance in the chart is valid for the upgraded CRD, the upgrade is compatible. + + * Helm hooks used with the GPU Operator use the Operator image itself. If the Operator image cannot be pulled successfully (either due to a network error or an invalid NGC registry secret in case of NVAIE), the hooks fail. + In this case, delete the chart with the `--no-hooks` option so that the deletion does not hang on hook failures.
+ +## Step 2: Cluster Policy Updates + +The GPU Operator also supports dynamic updates to the `ClusterPolicy` custom resource using `kubectl`: + +```console +$ kubectl edit clusterpolicy +``` + +After the edits are complete, Kubernetes automatically applies the updates to the cluster. + +## Step 3: Additional Controls for Driver Upgrades + +While most of the GPU Operator managed daemonsets can be upgraded seamlessly, the NVIDIA driver daemonset has special considerations. +Refer to the `gpu-operator-driver-upgrades` skill (GPU Driver Upgrades) for more information. + +## Step 4: Using Operator Lifecycle Manager (OLM) in OpenShift + +For upgrading the GPU Operator when running in OpenShift, refer to the official OpenShift documentation on [upgrading installed operators](https://docs.redhat.com/en/documentation/openshift_container_platform/latest/html/operators/administrator-tasks#olm-upgrading-operators). diff --git a/gpu-operator/amazon-eks.rst b/gpu-operator/amazon-eks.rst index 4eb7b7606..c5e338a21 100644 --- a/gpu-operator/amazon-eks.rst +++ b/gpu-operator/amazon-eks.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: NVIDIA GPU Operator with Amazon EKS + :description: Install and configure the NVIDIA GPU Operator on Amazon Elastic Kubernetes Service. + :description-agent: Guides users through installing and configuring the NVIDIA GPU Operator on Amazon EKS. Use when deploying GPU workloads on AWS or troubleshooting EKS-specific GPU Operator setup. + :keywords: NVIDIA GPU Operator, Amazon EKS, AWS, Kubernetes, installation + :tags: gpu-operator, nvidia, kubernetes, gpu, aws, eks + :content.type: how_to + :skill.priority: 40 + ..
headings (h1/h2/h3/h4/h5) are # * = - ################################### diff --git a/gpu-operator/cdi.rst b/gpu-operator/cdi.rst index 880cf7c24..982b32696 100644 --- a/gpu-operator/cdi.rst +++ b/gpu-operator/cdi.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Container Device Interface Support in the NVIDIA GPU Operator + :description: Configure Container Device Interface and Node Resource Interface support with the NVIDIA GPU Operator. + :description-agent: Explains how to configure CDI and NRI support for GPU workloads. Use when enabling CDI, configuring containerd, or troubleshooting CDI-based GPU injection. + :keywords: NVIDIA GPU Operator, CDI, NRI, containerd, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, cdi, nri, containerd + :content.type: how_to + :skill.priority: 70 + .. headings # #, * *, =, -, ^, " ################################################################################# diff --git a/gpu-operator/confidential-containers-deploy.rst b/gpu-operator/confidential-containers-deploy.rst index 252fd5d37..c9cbe45d3 100644 --- a/gpu-operator/confidential-containers-deploy.rst +++ b/gpu-operator/confidential-containers-deploy.rst @@ -1,5 +1,13 @@ .. _gpu-operator-confidential-containers-deploy: +.. meta:: + :title: Confidential Containers + :description: Deploy GPU workloads with Confidential Containers and the NVIDIA GPU Operator. + :description-agent: Points users to the Confidential Containers reference architecture and deployment documentation. Use when users ask about confidential GPU workloads or Confidential Containers with the GPU Operator. 
+ :keywords: NVIDIA GPU Operator, Confidential Containers, sandboxed workloads, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, confidential-computing, sandboxed-workloads + :content.type: concept + *********************** Confidential Containers *********************** diff --git a/gpu-operator/custom-driver-params.rst b/gpu-operator/custom-driver-params.rst index 3905269a1..64dc9d62d 100644 --- a/gpu-operator/custom-driver-params.rst +++ b/gpu-operator/custom-driver-params.rst @@ -1,3 +1,14 @@ + + +.. meta:: + :title: Custom GPU Driver Parameters + :description: Customize NVIDIA driver parameters for GPU Operator managed driver containers. + :description-agent: Shows how to provide custom NVIDIA driver parameters to GPU Operator driver containers. Use when changing driver module options or customizing driver container behavior. + :keywords: NVIDIA GPU Operator, driver parameters, NVIDIA driver, configuration + :tags: gpu-operator, nvidia, kubernetes, gpu, driver, configuration + :content.type: how_to + :skill.priority: 70 + .. Date: Mar 11 2022 .. Author: cdesiniotis diff --git a/gpu-operator/deploy-kata-containers.rst b/gpu-operator/deploy-kata-containers.rst index 6cd877c40..01acb9bf3 100644 --- a/gpu-operator/deploy-kata-containers.rst +++ b/gpu-operator/deploy-kata-containers.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Kata Containers + :description: Deploy GPU workloads with Kata Containers and the NVIDIA GPU Operator. + :description-agent: Guides users through configuring Kata Containers for GPU workloads with the GPU Operator. Use when deploying sandboxed GPU workloads with Kata Containers. + :keywords: NVIDIA GPU Operator, Kata Containers, sandboxed workloads, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, kata-containers, sandboxed-workloads + :content.type: how_to + :skill.priority: 70 + .. headings (h1/h2/h3/h4/h5) are # * = - .. 
diff --git a/gpu-operator/dra-intro-install.rst b/gpu-operator/dra-intro-install.rst index 7fa265a8b..46d76ac2e 100644 --- a/gpu-operator/dra-intro-install.rst +++ b/gpu-operator/dra-intro-install.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: NVIDIA DRA Driver for GPUs + :description: Install and use the NVIDIA DRA Driver for GPUs with the NVIDIA GPU Operator. + :description-agent: Explains how to install and use the NVIDIA DRA Driver for GPUs. Use when users ask about Dynamic Resource Allocation, DRA installation, or GPU resource claims. + :keywords: NVIDIA GPU Operator, DRA, Dynamic Resource Allocation, Kubernetes, installation + :tags: gpu-operator, nvidia, kubernetes, gpu, dra, dynamic-resource-allocation + :content.type: how_to + :skill.priority: 60 + .. headings (h1/h2/h3/h4/h5) are # * = - ########################## diff --git a/gpu-operator/getting-started.rst b/gpu-operator/getting-started.rst index 7b418c169..34deb494c 100644 --- a/gpu-operator/getting-started.rst +++ b/gpu-operator/getting-started.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Installing the NVIDIA GPU Operator + :description: Install the NVIDIA GPU Operator in a Kubernetes cluster. + :description-agent: Installs the NVIDIA GPU Operator in a Kubernetes cluster with Helm. Use when users are getting started, installing the Operator for the first time, or checking installation prerequisites. + :keywords: NVIDIA GPU Operator, installation, Helm, Kubernetes, getting started + :tags: gpu-operator, nvidia, kubernetes, gpu, installation, helm, getting-started + :content.type: get_started + :skill.priority: 10 + .. headings (h1/h2/h3/h4/h5) are # * = - .. 
_nvaie-tanzu: https://docs.nvidia.com/ai-enterprise/deployment-guide-vmware/0.1.0/index.html diff --git a/gpu-operator/google-gke.rst b/gpu-operator/google-gke.rst index 05424ff68..fb2a4bea1 100644 --- a/gpu-operator/google-gke.rst +++ b/gpu-operator/google-gke.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: NVIDIA GPU Operator with Google Kubernetes Engine + :description: Install and configure the NVIDIA GPU Operator on Google Kubernetes Engine. + :description-agent: Guides users through installing and configuring the NVIDIA GPU Operator on Google GKE. Use when deploying GPU workloads on GKE or troubleshooting GKE-specific GPU Operator setup. + :keywords: NVIDIA GPU Operator, Google GKE, Kubernetes, installation + :tags: gpu-operator, nvidia, kubernetes, gpu, google-cloud, gke + :content.type: how_to + :skill.priority: 40 + .. headings (h1/h2/h3/h4/h5) are # * = - ################################### diff --git a/gpu-operator/gpu-driver-configuration.rst b/gpu-operator/gpu-driver-configuration.rst index aa5d84077..f1b3803b8 100644 --- a/gpu-operator/gpu-driver-configuration.rst +++ b/gpu-operator/gpu-driver-configuration.rst @@ -15,6 +15,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: NVIDIA GPU Driver Custom Resource Definition + :description: Configure GPU driver management with the NVIDIA GPU Driver custom resource. + :description-agent: Explains how to configure NVIDIA GPU Driver custom resources for driver lifecycle management. Use when users need custom driver configuration or mixed operating system support. + :keywords: NVIDIA GPU Operator, GPU driver, custom resource, driver configuration + :tags: gpu-operator, nvidia, kubernetes, gpu, driver, custom-resource + :content.type: how_to + :skill.priority: 60 + .. 
headings (h1/h2/h3/h4/h5) are # * = - ############################################ diff --git a/gpu-operator/gpu-driver-upgrades.rst b/gpu-operator/gpu-driver-upgrades.rst index 18430cde9..3cc2c9b80 100644 --- a/gpu-operator/gpu-driver-upgrades.rst +++ b/gpu-operator/gpu-driver-upgrades.rst @@ -17,6 +17,15 @@ .. Date: Jan 30 2023 .. Author: cdesiniotis +.. meta:: + :title: GPU Driver Upgrades + :description: Understand how the NVIDIA GPU Operator manages GPU driver upgrades. + :description-agent: Explains GPU driver upgrade behavior and configuration. Use when planning driver upgrades or troubleshooting driver upgrade workflows managed by the GPU Operator. + :keywords: NVIDIA GPU Operator, GPU driver, driver upgrades, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, driver, upgrades + :content.type: how_to + :skill.priority: 60 + .. headings # #, * *, =, -, ^, " .. _gpu-driver-upgrades: diff --git a/gpu-operator/gpu-operator-kubevirt.rst b/gpu-operator/gpu-operator-kubevirt.rst index 967152c81..42676b745 100644 --- a/gpu-operator/gpu-operator-kubevirt.rst +++ b/gpu-operator/gpu-operator-kubevirt.rst @@ -1,6 +1,15 @@ .. Date: Jun 22 2022 .. Author: cdesiniotis +.. meta:: + :title: KubeVirt + :description: Configure the NVIDIA GPU Operator for GPU workloads with KubeVirt. + :description-agent: Guides users through configuring the GPU Operator for KubeVirt virtual machine workloads. Use when deploying GPU-enabled VMs or troubleshooting KubeVirt GPU passthrough. + :keywords: NVIDIA GPU Operator, KubeVirt, virtual machines, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, kubevirt, virtual-machines + :content.type: how_to + :skill.priority: 70 + .. headings (h1/h2/h3/h4/h5/h6) are # * = - ^ " .. _gpu-operator-kubevirt: diff --git a/gpu-operator/gpu-operator-mig.rst b/gpu-operator/gpu-operator-mig.rst index 7d8ee8f24..3cc5f16b2 100644 --- a/gpu-operator/gpu-operator-mig.rst +++ b/gpu-operator/gpu-operator-mig.rst @@ -17,6 +17,15 @@ .. 
Date: May 11 2021 .. Author: pramarao +.. meta:: + :title: Multi-Instance GPU + :description: Configure Multi-Instance GPU support with the NVIDIA GPU Operator. + :description-agent: Explains MIG strategies, labels, and configuration with the GPU Operator. Use when partitioning GPUs, enabling MIG, or troubleshooting MIG resource exposure. + :keywords: NVIDIA GPU Operator, MIG, Multi-Instance GPU, GPU partitioning + :tags: gpu-operator, nvidia, kubernetes, gpu, mig, gpu-partitioning + :content.type: how_to + :skill.priority: 60 + .. headings (h1/h2/h3/h4/h5) are # * = - .. _install-gpu-operator-mig: diff --git a/gpu-operator/gpu-operator-rdma.rst b/gpu-operator/gpu-operator-rdma.rst index 2d3893f00..57b927790 100644 --- a/gpu-operator/gpu-operator-rdma.rst +++ b/gpu-operator/gpu-operator-rdma.rst @@ -1,6 +1,15 @@ .. Date: Aug 4 2021 .. Author: pramarao +.. meta:: + :title: GPUDirect RDMA and GPUDirect Storage + :description: Configure GPUDirect RDMA and GPUDirect Storage with the NVIDIA GPU Operator. + :description-agent: Guides users through GPUDirect RDMA and GPUDirect Storage configuration. Use when enabling high-performance networking or storage access for GPU workloads. + :keywords: NVIDIA GPU Operator, GPUDirect RDMA, GPUDirect Storage, networking + :tags: gpu-operator, nvidia, kubernetes, gpu, gpudirect, rdma, storage, networking + :content.type: how_to + :skill.priority: 70 + .. headings (h1/h2/h3/h4/h5) are # * = - .. _net-op: https://docs.nvidia.com/networking/display/cokan10/network+operator diff --git a/gpu-operator/gpu-sharing.rst b/gpu-operator/gpu-sharing.rst index 0c2ff5882..7a441ca84 100644 --- a/gpu-operator/gpu-sharing.rst +++ b/gpu-operator/gpu-sharing.rst @@ -1,6 +1,15 @@ .. Date: Jun 21 2022 .. Author: smerla +.. meta:: + :title: Time-Slicing GPUs + :description: Configure GPU time-slicing and sharing with the NVIDIA GPU Operator. + :description-agent: Explains GPU sharing and time-slicing configuration. 
Use when users need multiple workloads to share GPUs or need to configure time-sliced GPU resources. + :keywords: NVIDIA GPU Operator, GPU sharing, time-slicing, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, gpu-sharing, time-slicing + :content.type: how_to + :skill.priority: 60 + .. headings (h1/h2/h3/h4/h5) are # * = - .. _gpu-sharing: diff --git a/gpu-operator/index.rst b/gpu-operator/index.rst index fef903b1f..00385e768 100644 --- a/gpu-operator/index.rst +++ b/gpu-operator/index.rst @@ -14,6 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: NVIDIA GPU Operator Documentation + :description: Browse the NVIDIA GPU Operator documentation for installation, upgrades, configuration, and troubleshooting. + :description-agent: Provides the top-level NVIDIA GPU Operator documentation navigation. Use when users need an overview of available GPU Operator topics. + :keywords: NVIDIA GPU Operator, documentation, Kubernetes, GPU workloads + :tags: gpu-operator, nvidia, kubernetes, gpu, documentation + :content.type: get_started + .. headings # #, * *, =, -, ^, " .. toctree:: diff --git a/gpu-operator/install-gpu-operator-air-gapped.rst b/gpu-operator/install-gpu-operator-air-gapped.rst index c4a1b23c9..8fbbec7f4 100644 --- a/gpu-operator/install-gpu-operator-air-gapped.rst +++ b/gpu-operator/install-gpu-operator-air-gapped.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Install in Air-Gapped Environments + :description: Install the NVIDIA GPU Operator in air-gapped and restricted network environments. + :description-agent: Guides users through installing the GPU Operator in air-gapped or restricted network environments. Use when users need mirrored images, private registries, or offline installation steps. 
+ :keywords: NVIDIA GPU Operator, air-gapped, restricted network, installation + :tags: gpu-operator, nvidia, kubernetes, gpu, air-gapped, private-registry, installation + :content.type: how_to + :skill.priority: 50 + .. headings # #, * *, =, -, ^, " .. Date: Dec 11 2020 diff --git a/gpu-operator/install-gpu-operator-gov-ready.rst b/gpu-operator/install-gpu-operator-gov-ready.rst index 5d85fad48..c41225980 100644 --- a/gpu-operator/install-gpu-operator-gov-ready.rst +++ b/gpu-operator/install-gpu-operator-gov-ready.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Install in Government-Ready Environments + :description: Install the NVIDIA GPU Operator in environments that require government-ready configuration. + :description-agent: Guides users through government-ready GPU Operator installation considerations. Use when deploying in hardened or regulated Kubernetes environments. + :keywords: NVIDIA GPU Operator, government-ready, installation, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, government-ready, installation + :content.type: how_to + :skill.priority: 50 + .. headings # #, * *, =, -, ^, " diff --git a/gpu-operator/install-gpu-operator-nvaie.rst b/gpu-operator/install-gpu-operator-nvaie.rst index 0facd1377..812f2b1c4 100644 --- a/gpu-operator/install-gpu-operator-nvaie.rst +++ b/gpu-operator/install-gpu-operator-nvaie.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Install with NVIDIA AI Enterprise + :description: Install the NVIDIA GPU Operator with NVIDIA AI Enterprise. + :description-agent: Guides users through installing the GPU Operator with NVIDIA AI Enterprise. Use when deploying licensed NVIDIA AI Enterprise GPU software on Kubernetes. 
+ :keywords: NVIDIA GPU Operator, NVIDIA AI Enterprise, installation, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, nvidia-ai-enterprise, installation + :content.type: how_to + :skill.priority: 50 + .. headings # #, * *, =, -, ^, " .. |ellipses-img| image:: https://brand-assets.cne.ngc.nvidia.com/assets/icons/2.2.2/fill/common-more-horiz.svg diff --git a/gpu-operator/install-gpu-operator-outdated-kernels.rst b/gpu-operator/install-gpu-operator-outdated-kernels.rst index b83483d78..0205a454c 100644 --- a/gpu-operator/install-gpu-operator-outdated-kernels.rst +++ b/gpu-operator/install-gpu-operator-outdated-kernels.rst @@ -1,3 +1,14 @@ + + +.. meta:: + :title: Install with Outdated Kernels + :description: Install the NVIDIA GPU Operator on nodes that run outdated Linux kernels. + :description-agent: Explains how to install the GPU Operator when nodes run outdated kernels. Use when driver containers fail because kernel versions are older than supported defaults. + :keywords: NVIDIA GPU Operator, outdated kernels, driver containers, installation + :tags: gpu-operator, nvidia, kubernetes, gpu, kernels, driver, installation + :content.type: how_to + :skill.priority: 50 + .. Date: Aug 2 2021 .. Author: cdesiniotis diff --git a/gpu-operator/install-gpu-operator-proxy.rst b/gpu-operator/install-gpu-operator-proxy.rst index d685d6913..5ba4729e2 100644 --- a/gpu-operator/install-gpu-operator-proxy.rst +++ b/gpu-operator/install-gpu-operator-proxy.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Install with an HTTP Proxy + :description: Install the NVIDIA GPU Operator in an HTTP proxy environment. + :description-agent: Guides users through installing the GPU Operator with HTTP proxy settings. Use when clusters require proxy configuration for image pulls or network access. 
+ :keywords: NVIDIA GPU Operator, HTTP proxy, installation, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, proxy, installation + :content.type: how_to + :skill.priority: 50 + .. headings # #, * *, =, -, ^, " .. Date: Sep 16 2021 diff --git a/gpu-operator/install-gpu-operator-service-mesh.rst b/gpu-operator/install-gpu-operator-service-mesh.rst index ec76f9e3d..280b3369b 100644 --- a/gpu-operator/install-gpu-operator-service-mesh.rst +++ b/gpu-operator/install-gpu-operator-service-mesh.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Install with a Service Mesh + :description: Install and configure the NVIDIA GPU Operator with a service mesh. + :description-agent: Guides users through GPU Operator service mesh considerations. Use when deploying with Istio or troubleshooting sidecar injection and service mesh interactions. + :keywords: NVIDIA GPU Operator, service mesh, Istio, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, service-mesh, istio + :content.type: how_to + :skill.priority: 50 + .. headings # #, * *, =, -, ^, " ###################################### diff --git a/gpu-operator/install-gpu-operator-vgpu.rst b/gpu-operator/install-gpu-operator-vgpu.rst index dde1687bf..177881bbd 100644 --- a/gpu-operator/install-gpu-operator-vgpu.rst +++ b/gpu-operator/install-gpu-operator-vgpu.rst @@ -1,3 +1,14 @@ + + +.. meta:: + :title: Install with NVIDIA vGPU + :description: Install the NVIDIA GPU Operator with NVIDIA vGPU. + :description-agent: Guides users through installing the GPU Operator with NVIDIA vGPU. Use when deploying virtual GPU software or configuring vGPU licensing with Kubernetes. + :keywords: NVIDIA GPU Operator, NVIDIA vGPU, installation, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, vgpu, installation + :content.type: how_to + :skill.priority: 50 + .. 
license-header SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0 diff --git a/gpu-operator/life-cycle-policy.rst b/gpu-operator/life-cycle-policy.rst index a7fcc56d7..f88243e45 100644 --- a/gpu-operator/life-cycle-policy.rst +++ b/gpu-operator/life-cycle-policy.rst @@ -14,6 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Life Cycle Policy + :description: Review the life cycle policy for NVIDIA GPU Operator releases. + :description-agent: Explains the GPU Operator life cycle and support policy. Use when users ask about release support windows, maintenance, or version lifecycle. + :keywords: NVIDIA GPU Operator, life cycle policy, support, releases + :tags: gpu-operator, nvidia, kubernetes, gpu, lifecycle, support, releases + :content.type: reference + .. headings # #, * *, =, -, ^, " .. Date: September 25 2022 diff --git a/gpu-operator/microsoft-aks.rst b/gpu-operator/microsoft-aks.rst index 937de5f5e..4ec35a426 100644 --- a/gpu-operator/microsoft-aks.rst +++ b/gpu-operator/microsoft-aks.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: NVIDIA GPU Operator with Azure Kubernetes Service + :description: Install and configure the NVIDIA GPU Operator on Microsoft Azure Kubernetes Service. + :description-agent: Guides users through installing and configuring the NVIDIA GPU Operator on Azure AKS. Use when deploying GPU workloads on Azure or troubleshooting AKS-specific GPU Operator setup. + :keywords: NVIDIA GPU Operator, Azure AKS, Microsoft Azure, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, azure, aks + :content.type: how_to + :skill.priority: 40 + .. 
headings (h1/h2/h3/h4/h5) are # * = - ################################################# diff --git a/gpu-operator/overview.rst b/gpu-operator/overview.rst index 8d2007e22..5b5025ac7 100644 --- a/gpu-operator/overview.rst +++ b/gpu-operator/overview.rst @@ -14,6 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: About the NVIDIA GPU Operator + :description: Learn how the NVIDIA GPU Operator manages NVIDIA software components in Kubernetes. + :description-agent: Explains what the NVIDIA GPU Operator is, which components it manages, and how it automates GPU node provisioning. Use when users ask for a GPU Operator overview or documentation orientation. + :keywords: NVIDIA GPU Operator, overview, Kubernetes, GPU workloads + :tags: gpu-operator, nvidia, kubernetes, gpu, overview + :content.type: concept + .. headings # #, * *, =, -, ^, " diff --git a/gpu-operator/platform-support.rst b/gpu-operator/platform-support.rst index 93edc6fe1..8d427acd8 100644 --- a/gpu-operator/platform-support.rst +++ b/gpu-operator/platform-support.rst @@ -14,6 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Platform Support + :description: Review supported platforms, operating systems, and configurations for the NVIDIA GPU Operator. + :description-agent: Lists supported Kubernetes platforms, operating systems, container runtimes, and GPU Operator configurations. Use when checking compatibility before installation or upgrade. + :keywords: NVIDIA GPU Operator, platform support, operating systems, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, platform-support, compatibility + :content.type: reference + .. headings # #, * *, =, -, ^, " .. 
Date: July 30 2020 diff --git a/gpu-operator/precompiled-drivers.rst b/gpu-operator/precompiled-drivers.rst index a7a880424..98d699e09 100644 --- a/gpu-operator/precompiled-drivers.rst +++ b/gpu-operator/precompiled-drivers.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Precompiled Drivers + :description: Use precompiled NVIDIA driver containers with the NVIDIA GPU Operator. + :description-agent: Explains how to use precompiled NVIDIA driver containers with the GPU Operator. Use when reducing driver build time or selecting precompiled driver images. + :keywords: NVIDIA GPU Operator, precompiled drivers, driver containers, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, driver, precompiled-drivers + :content.type: how_to + :skill.priority: 60 + .. headings # #, * *, =, -, ^, " .. _install-precompiled-drivers: diff --git a/gpu-operator/release-notes.rst b/gpu-operator/release-notes.rst index b809c9ffb..0a2957b49 100644 --- a/gpu-operator/release-notes.rst +++ b/gpu-operator/release-notes.rst @@ -1,3 +1,13 @@ + + +.. meta:: + :title: Release Notes + :description: Review release notes, component versions, and changes for the NVIDIA GPU Operator. + :description-agent: Includes release notes and component version information for the NVIDIA GPU Operator. Use when users ask what changed, which component versions are included, or whether a release contains a fix. + :keywords: NVIDIA GPU Operator, release notes, component versions, changelog + :tags: gpu-operator, nvidia, kubernetes, gpu, release-notes, changelog + :content.type: reference + .. license-header SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
SPDX-License-Identifier: Apache-2.0 diff --git a/gpu-operator/security.rst b/gpu-operator/security.rst index 319100acf..a846cfc74 100644 --- a/gpu-operator/security.rst +++ b/gpu-operator/security.rst @@ -1,4 +1,13 @@ + +.. meta:: + :title: Security Considerations + :description: Review security considerations for deploying and operating the NVIDIA GPU Operator. + :description-agent: Explains GPU Operator security considerations, elevated privileges, and known CVEs. Use when reviewing security posture, vulnerability exposure, or operator namespace access. + :keywords: NVIDIA GPU Operator, security, Kubernetes, deployment + :tags: gpu-operator, nvidia, kubernetes, gpu, security, cve + :content.type: concept + ***************************** Security Considerations ***************************** diff --git a/gpu-operator/troubleshooting.rst b/gpu-operator/troubleshooting.rst index c5317995e..24c397fca 100644 --- a/gpu-operator/troubleshooting.rst +++ b/gpu-operator/troubleshooting.rst @@ -14,6 +14,14 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Troubleshooting + :description: Troubleshoot common NVIDIA GPU Operator installation and runtime issues. + :description-agent: Provides troubleshooting steps for GPU Operator installation and runtime issues. Use when diagnosing failed pods, driver problems, validator failures, or GPU workload issues. + :keywords: NVIDIA GPU Operator, troubleshooting, diagnostics, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, troubleshooting, diagnostics + :content.type: reference + .. headings (h1/h2/h3/h4/h5) are # * = - ####################################### diff --git a/gpu-operator/uninstall.rst b/gpu-operator/uninstall.rst index 504315122..805985809 100644 --- a/gpu-operator/uninstall.rst +++ b/gpu-operator/uninstall.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. 
meta:: + :title: Uninstalling the NVIDIA GPU Operator + :description: Uninstall the NVIDIA GPU Operator from a Kubernetes cluster. + :description-agent: Guides users through uninstalling the NVIDIA GPU Operator and cleaning up related resources. Use when removing the Operator from a Kubernetes cluster. + :keywords: NVIDIA GPU Operator, uninstall, removal, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, uninstall, cleanup + :content.type: how_to + :skill.priority: 30 + .. headings # #, * *, =, -, ^, " ############################# diff --git a/gpu-operator/upgrade.rst b/gpu-operator/upgrade.rst index 1c812b09d..aaac52941 100644 --- a/gpu-operator/upgrade.rst +++ b/gpu-operator/upgrade.rst @@ -14,6 +14,15 @@ See the License for the specific language governing permissions and limitations under the License. +.. meta:: + :title: Upgrading the NVIDIA GPU Operator + :description: Upgrade the NVIDIA GPU Operator in a Kubernetes cluster. + :description-agent: Guides users through upgrading the NVIDIA GPU Operator with Helm and handling CRD updates. Use when planning or performing a GPU Operator upgrade. + :keywords: NVIDIA GPU Operator, upgrade, Helm, Kubernetes + :tags: gpu-operator, nvidia, kubernetes, gpu, upgrade, helm + :content.type: how_to + :skill.priority: 20 + .. headings (h1/h2/h3/h4/h5) are # * = - .. _operator-upgrades: