226 changes: 226 additions & 0 deletions gpu-operator/conf.py
@@ -0,0 +1,226 @@

import sphinx
import os
import logging
import sys
from string import Template

logger = logging.getLogger(__name__)

sys.path += [
    "/work/_repo/deps/repo_docs/omni/repo/docs/include",
]


project = "NVIDIA GPU Operator"

copyright = "2020-2026, NVIDIA Corporation"
author = "NVIDIA Corporation"

release = "25.10"
root_doc = "index"

extensions = [
    "sphinx.ext.autodoc",  # include documentation from docstrings
    "sphinx.ext.ifconfig",  # conditional include of text
    "sphinx.ext.napoleon",  # support for NumPy and Google style docstrings
    "sphinx.ext.intersphinx",  # link to other projects' documentation
    "sphinx.ext.extlinks",  # add roles to shorten external links
    "myst_parser",  # markdown parsing
    "sphinxcontrib.mermaid",  # create diagrams using text and code
    "sphinxcontrib.youtube",  # adds youtube:: directive
    "sphinxemoji.sphinxemoji",  # adds emoji substitutions (e.g. |:fire:|)
    "sphinx_design",
    "repo_docs.ext.inline_only",
    "repo_docs.ext.toctree",
    "repo_docs.ext.mdinclude",
    "repo_docs.ext.include_patch",
    "repo_docs.ext.youtube",
    "repo_docs.ext.ifconfig",
    "repo_docs.ext.source_substitutions",
    "repo_docs.ext.mermaid",
    "repo_docs.ext.exhale_file_fix",
    "repo_docs.ext.output_format_text",
    "repo_docs.ext.output_format_latex",
    "repo_docs.ext.include_licenses",
    "repo_docs.ext.add_templates",
    "repo_docs.ext.breadcrumbs",
    "repo_docs.ext.metadata",
    "repo_docs.ext.confval",
    "repo_docs.ext.customize_layout",
    "repo_docs.ext.cpp_xrefs",
]

# automatically add section level labels, up to level 4
myst_heading_anchors = 4


# configure sphinxcontrib.mermaid as we inject mermaid manually on pages that need it
mermaid_init_js = ""
mermaid_version = ""


intersphinx_mapping = {}
exclude_patterns = [
    ".git",
    "Thumbs.db",
    ".DS_Store",
    ".pytest_cache",
    "_repo",
    "README.md",
    "life-cycle-policy.rst",
    "_build/docs/secure-services-istio-keycloak",
    "_build/docs/openshift",
    "_build/docs/gpu-telemetry",
    "_build/docs/container-toolkit",
    "_build/docs/review",
    "_build/docs/partner-validated",
    "_build/docs/driver-containers",
    "_build/docs/sphinx_warnings.txt",
    "_build/docs/kubernetes",
    "_build/docs/tmp",
    "_build/docs/dra-driver",
    "_build/docs/edge",
    "_build/docs/gpu-operator/24.9.1",
    "_build/docs/gpu-operator/24.12.0",
    "_build/docs/gpu-operator/25.3.4",
    "_build/docs/gpu-operator/25.3.1",
    "_build/docs/gpu-operator/24.9.2",
    "_build/docs/gpu-operator/version1.json",
    "_build/docs/gpu-operator/24.9",
    "_build/docs/gpu-operator/25.3.0",
    "_build/docs/gpu-operator/25.3",
    "_build/docs/gpu-operator/25.10",
]

html_theme = "sphinx_rtd_theme"

html_logo = "/work/assets/nvidia-logo-white.png"
html_favicon = "/work/assets/favicon.ico"

# If true, links to the reST sources are added to the pages.
html_show_sourcelink = False

html_additional_search_indices = []

# If true, the raw source is copied which might be a problem if content is removed with `ifconfig`
html_copy_source = False

# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
html_show_sphinx = False

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = [
    "/work/_repo/deps/repo_docs/media",
]

html_last_updated_fmt = ""

# https://sphinx-rtd-theme.readthedocs.io/en/stable/configuring.html
html_theme_options = {
    "logo_only": True,
    "prev_next_buttons_location": None,  # our docs aren't a novel...
    "navigation_depth": 10,
}

html_extra_content_head = [' <script src="https://assets.adobedtm.com/5d4962a43b79/c1061d2c5e7b/launch-191c2462b890.min.js" ></script>\n ']
html_extra_content_footer = [' <script type="text/javascript">if (typeof _satellite !== "undefined") {_satellite.pageBottom();}</script>\n ']
html_logo_target_url = ""

html_breadcrumbs_home_url = ""
html_extra_breadcrumbs = []

html_css_files = [
    "omni-style.css",
    "api-styles.css",
]

html_js_files = [
    "version.js",
    "social-media.js",
]

# literal blocks default to c++ (useful for Doxygen \code blocks)
highlight_language = 'c++'


# add additional tags



source_substitutions = {
    'minor_version': '25.10',
    'version': 'v25.10.1',
    'recommended': '580.105.08',
    'dra_version': '25.12.0',
}
source_substitutions.update({
    'repo_docs_config': 'debug',
    'repo_docs_platform_target': 'linux-x86_64',
    'repo_docs_platform': 'linux-x86_64',
    'repo_docs_dash_build': '',
    'repo_docs_project': 'gpu-operator',
    'repo_docs_version': '25.10',
    'repo_docs_copyright': '2020-2026, NVIDIA Corporation',
    # note: the leading '/' means this is relative to the docs_root (the source directory)
    'repo_docs_api_path': '/../_build/docs/gpu-operator/latest',
})

# add global metadata for all built pages
metadata_global = {}

sphinx_event_handlers = []
myst_enable_extensions = [
    "colon_fence", "dollarmath",
]
templates_path = ['/work/templates']
extensions.extend([
    "linuxdoc.rstFlatTable",
    "sphinx.ext.autosectionlabel",
    "sphinx_copybutton",
    "sphinx_design",
])
suppress_warnings = [ 'autosectionlabel.*' ]
pygments_style = 'sphinx'
copybutton_exclude = '.linenos, .gp'

html_theme = "nvidia_sphinx_theme"
html_copy_source = False
html_show_sourcelink = False
html_show_sphinx = False

html_domain_indices = False
html_use_index = False
html_extra_path = ["versions1.json"]
html_static_path = ["/work/css"]
html_css_files = ["custom.css"]

html_theme_options = {
    "icon_links": [],
    "switcher": {
        "json_url": "../versions1.json",
        "version_match": release,
    },
}

highlight_language = 'console'

intersphinx_mapping = {
    "dcgm": ("https://docs.nvidia.com/datacenter/dcgm/latest/", "../work/dcgm-offline.inv"),
    "gpuop": ("https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/",
              ("_build/docs/gpu-operator/latest/objects.inv", None)),
    "ctk": ("https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/",
            ("_build/docs/container-toolkit/latest/objects.inv", None)),
    "drv": ("https://docs.nvidia.com/datacenter/cloud-native/driver-containers/latest/",
            ("_build/docs/driver-containers/latest/objects.inv", None)),
    "ocp": ("https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/",
            ("_build/docs/openshift/latest/objects.inv", None)),
    "edge": ("https://docs.nvidia.com/datacenter/cloud-native/edge/latest/",
             ("_build/docs/edge/latest/objects.inv", None)),
}
rst_epilog = ".. |gitlab_mr_url| replace:: Sorry Charlie...not a merge request."
if os.environ.get("CI_MERGE_REQUEST_IID") is not None:
    rst_epilog = ".. |gitlab_mr_url| replace:: {}/-/merge_requests/{}".format(
        os.environ["CI_MERGE_REQUEST_PROJECT_URL"], os.environ["CI_MERGE_REQUEST_IID"])

def setup(app):
    app.add_config_value('build_name', 'public', 'env')
    for (event, handler) in sphinx_event_handlers:
        app.connect(event, handler)
92 changes: 81 additions & 11 deletions gpu-operator/dra-intro-install.rst
@@ -40,6 +40,17 @@ With NVIDIA's DRA Driver for GPUs, your Kubernetes workload can allocate and con

You can use the NVIDIA DRA Driver for GPUs with the NVIDIA GPU Operator to deploy and manage your GPUs and ComputeDomains.

.. _known-issues:

Known Issues
************

* There is a known issue where the NVIDIA Driver Manager is not aware of the DRA driver kubelet plugin and does not correctly evict it on pod restarts.
  You must label the nodes that you plan to use for DRA GPU allocation and pass the node label to the GPU Operator Helm command through the ``driver.manager.env`` flag, as shown in the sketch after this list.
  This enables the NVIDIA Driver Manager to evict the GPU kubelet plugin correctly during driver container upgrades.
* For A100 GPUs, the MIG manager does not automatically evict the DRA kubelet plugin during MIG configuration changes.
  If the DRA kubelet plugin is deployed before a MIG configuration change, you must manually restart the DRA kubelet plugin.
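
A minimal sketch of the first workaround, assuming the default ``gpu-operator`` namespace and release name; the environment variable name passed through ``driver.manager.env`` is a placeholder, so substitute the variable documented for your GPU Operator version:

.. code-block:: console

   # Label every node that will be used for DRA-based GPU allocation.
   kubectl label node <node-name> nvidia.com/dra-kubelet-plugin=true

   # Pass the label to the NVIDIA Driver Manager through driver.manager.env.
   # DRA_KUBELET_PLUGIN_NODE_LABEL is a placeholder, not a documented variable name.
   helm upgrade -i gpu-operator nvidia/gpu-operator \
       --namespace gpu-operator \
       --set "driver.manager.env[0].name=DRA_KUBELET_PLUGIN_NODE_LABEL" \
       --set "driver.manager.env[0].value=nvidia.com/dra-kubelet-plugin=true"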

*************
Prerequisites
*************
@@ -60,19 +71,17 @@ Prerequisites
* Kubernetes v1.34.2 or newer.

.. note::
If you plan to use traditional extended resource requests such as `nvidia.com/gpu` with the DRA driver, you must enable the [DRAExtendedResource](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#extended-resource) feature gate. This feature allows the scheduler to automatically translate extended resource requests into ResourceClaims, which are then allocated by the DRA driver.
If you plan to use traditional extended resource requests such as `nvidia.com/gpu` with the DRA driver, you must enable the `DRAExtendedResource <https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#extended-resource>`_ feature gate. This feature allows the scheduler to automatically translate extended resource requests into ResourceClaims, which are then allocated by the DRA driver.

* GPU Operator v25.10.0 or later with the NVIDIA Kubernetes Device Plugin disabled to avoid conflicts with the DRA Driver for GPUs.

The DRA Driver requires Container Device Interface (CDI) to be enabled in the underlying container runtime (such as containerd or CRI-O) and NVIDIA Driver version 580 or later.
These are both default in GPU Operator v25.10.0 and later.

* Label nodes you plan to use for GPU allocation with something like ``nvidia.com/dra-kubelet-plugin=true`` and use them as nodeSelectors in the DRA driver helm chart.
Steps for labeling nodes are provided in the next section.

There is a known issue where the NVIDIA Driver Manager is not aware of the DRA driver kubelet plugin, and will not correctly evict it on pod restarts.
You must label the nodes you plan to use with DRA GPU allocation and pass the node label in the GPU Operator Helm command in the ``driver.manager.env`` flag.
This enables the NVIDIA Driver Manager to evict the GPU kubelet plugin correctly on driver container upgrades.
This is required to avoid the :ref:`known issue <known-issues>` when using the GPU Operator with the DRA Driver for GPUs.
Steps for labeling nodes are provided in the next section.
This label is then passed in the GPU Operator Helm command in the ``driver.manager.env`` flag.

.. tab-item:: ComputeDomain
:sync: computedomain
@@ -154,6 +163,8 @@ Install the NVIDIA GPU Operator

Refer to the `GPU Operator installation guide <https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-install.html>`_ for additional configuration options when installing the GPU Operator.

If you are planning to use MIG devices, refer to the `NVIDIA GPU Operator MIG documentation <https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/gpu-operator-mig.html>`_ to configure your cluster for MIG support.
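
MIG reconfiguration with the GPU Operator is label driven: you label a node with the desired MIG profile and the MIG Manager applies it. A minimal sketch, assuming the default MIG Manager configuration; the profile name is only an example and must match one that is valid for your GPU model:

.. code-block:: console

   # Request a MIG profile on a node; "all-1g.10gb" is an example profile name.
   kubectl label node <node-name> nvidia.com/mig.config=all-1g.10gb --overwrite

   # Check the MIG configuration state reported by the MIG Manager.
   kubectl get node <node-name> \
       -o jsonpath='{.metadata.labels.nvidia\.com/mig\.config\.state}'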

***************************
Install DRA Driver for GPUs
***************************
@@ -180,7 +191,7 @@ Install DRA Driver for GPUs
.. tab-item:: GPU Allocation
:sync: gpu-allocation

1. Create a custom ``dra-values.yaml`` file for installing the DRA driver helm chart.
1. Create a custom ``values.yaml`` file for installing the DRA driver helm chart.

.. tab-set::

@@ -241,7 +252,7 @@ Install DRA Driver for GPUs
--create-namespace \
--set nvidiaDriverRoot=/run/nvidia/driver \
--set gpuResourcesEnabledOverride=true \
-f dra-values.yaml
-f values.yaml

.. tab-item:: GKE install command

@@ -252,7 +263,7 @@ Install DRA Driver for GPUs
--namespace nvidia-dra-driver-gpu \
--create-namespace \
--set gpuResourcesEnabledOverride=true \
-f dra-values.yaml
-f values.yaml

.. tab-item:: ComputeDomain
:sync: computedomain
@@ -320,14 +331,73 @@ Validate Installation
compute-domain-default-channel.nvidia.com 55s
gpu.nvidia.com 55s
mig.nvidia.com 55s
vfio.gpu.nvidia.com 55s

The ``compute-domain-daemon.nvidia.com`` and ``compute-domain-default-channel.nvidia.com`` DeviceClasses are installed when ComputeDomain support is enabled.
The ``gpu.nvidia.com``, ``mig.nvidia.com``, and ``vfio.gpu.nvidia.com`` DeviceClasses are installed when GPU allocation support is enabled.
The ``gpu.nvidia.com`` and ``mig.nvidia.com`` DeviceClasses are installed when GPU allocation support is enabled.
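
As an illustration of how a workload consumes these DeviceClasses, the following sketch requests a single GPU through a ResourceClaimTemplate. It assumes the ``resource.k8s.io/v1beta1`` DRA API and an arbitrary CUDA image; adjust the ``apiVersion``, field names, and image to match your cluster:

.. code-block:: yaml

   apiVersion: resource.k8s.io/v1beta1
   kind: ResourceClaimTemplate
   metadata:
     name: single-gpu
   spec:
     spec:
       devices:
         requests:
         - name: gpu
           deviceClassName: gpu.nvidia.com
   ---
   apiVersion: v1
   kind: Pod
   metadata:
     name: gpu-claim-test
   spec:
     restartPolicy: Never
     containers:
     - name: cuda
       image: nvidia/cuda:12.4.1-base-ubuntu22.04
       command: ["nvidia-smi"]
       resources:
         claims:
         - name: gpu
     resourceClaims:
     - name: gpu
       resourceClaimTemplateName: single-gpu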

Additional validation steps are available in the DRA Driver repository documentation:

* `Validate setup for ComputeDomain allocation <https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Validate-setup-for-ComputeDomain-allocation>`_
* `Validate setup for GPU allocation <https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Validate-setup-for-GPU-allocation>`_
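
As a quick sanity check before working through the validation guides, you can confirm that the DRA driver pods are running; the namespace assumes the install commands shown earlier:

.. code-block:: console

   kubectl get pods -n nvidia-dra-driver-gpu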

*********************
Enable Health Checks
*********************

The NVIDIA DRA driver supports GPU health monitoring using the `NVIDIA Management Library (NVML) <https://developer.nvidia.com/management-library-nvml>`_.
This feature checks for `GPU XID errors <https://docs.nvidia.com/deploy/xid-errors/introduction.html>`_ through NVML and determines whether a GPU or MIG device is functioning properly.

Health checking is managed by the ``NVMLDeviceHealthCheck`` feature gate.
This is currently an alpha feature and is disabled by default.

When enabled, the DRA Driver for GPUs continuously monitors GPUs for XID errors and assigns health statuses:

* Healthy - GPU is functioning normally. The GPU may have a non-critical XID error but is still available for workloads.
* Unhealthy - GPU has a critical XID error and is not suitable for workloads.


To enable GPU health monitoring, deploy the DRA driver with the ``NVMLDeviceHealthCheck`` feature gate enabled:

.. code-block:: console

   helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update
   helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \
       --namespace nvidia-dra-driver-gpu \
       --set gpuResourcesEnabledOverride=true \
       --set featureGates.NVMLDeviceHealthCheck=true

.. note::
   Unhealthy GPUs will not appear in the ResourceSlice list. After the device recovers and is marked healthy again, you must restart the DRA Driver for the device to be added back into the available resources pool.
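
If a recovered GPU does not reappear in the ResourceSlice list, restarting the kubelet plugin is sufficient. A minimal sketch, assuming the default release name used in this guide; the DaemonSet name is inferred from the kubelet plugin pod names:

.. code-block:: console

   kubectl rollout restart daemonset/nvidia-dra-driver-gpu-kubelet-plugin \
       -n nvidia-dra-driver-gpu
   kubectl rollout status daemonset/nvidia-dra-driver-gpu-kubelet-plugin \
       -n nvidia-dra-driver-gpu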

After enabling health checks, you can monitor health status in the kubelet logs.

1. Check kubelet plugin logs.
   Health status changes are logged in the kubelet plugin container. Run ``kubectl get pods -n nvidia-dra-driver-gpu`` and find the ``nvidia-dra-driver-gpu-kubelet-plugin-<pod>`` pod name. Replace ``<pod>`` with your actual pod name.

   .. code-block:: console

      kubectl logs nvidia-dra-driver-gpu-kubelet-plugin-<pod> \
          -n nvidia-dra-driver-gpu \
          -c gpus

2. List all ResourceSlices.
   View all ResourceSlices in the cluster to see which devices are available:

   .. code-block:: console

      kubectl get resourceslice

3. Inspect a specific ResourceSlice.
   View detailed information about a specific ResourceSlice. Healthy devices are listed in the ResourceSlice, while unhealthy devices are not:

   .. code-block:: console

      kubectl get resourceslice <resourceslice-name> -o yaml
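
To print only the devices that each ResourceSlice currently advertises (and that are therefore considered healthy), you can also use a JSONPath query; this assumes the standard DRA ResourceSlice schema, where devices are listed under ``spec.devices``:

.. code-block:: console

   kubectl get resourceslices -o jsonpath='{range .items[*]}{.metadata.name}{": "}{range .spec.devices[*]}{.name}{" "}{end}{"\n"}{end}'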

*************************
Additional Documentation
*************************

Refer to the `DRA Driver for GPUs repository <https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki>`_ for additional documentation, including:

* `Upgrade Guide <https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Installation#upgrading>`_
* `Troubleshooting Guide <https://github.com/NVIDIA/k8s-dra-driver-gpu/wiki/Troubleshooting>`_