diff --git a/gpu-operator/conf.py b/gpu-operator/conf.py new file mode 100644 index 000000000..464c78557 --- /dev/null +++ b/gpu-operator/conf.py @@ -0,0 +1,226 @@ + +import sphinx +import os +import logging +import sys +from string import Template + +logger = logging.getLogger(__name__) + +sys.path += [ + "/work/_repo/deps/repo_docs/omni/repo/docs/include", +] + + +project = "NVIDIA GPU Operator" + +copyright = "2020-2026, NVIDIA Corporation" +author = "NVIDIA Corporation" + +release = "25.10" +root_doc = "index" + +extensions = [ + "sphinx.ext.autodoc", # include documentation from docstrings + "sphinx.ext.ifconfig", # conditional include of text + "sphinx.ext.napoleon", # support for NumPy and Google style docstrings + "sphinx.ext.intersphinx", # link to other projects' documentation + "sphinx.ext.extlinks", # add roles to shorten external links + "myst_parser", # markdown parsing + "sphinxcontrib.mermaid", # create diagrams using text and code + "sphinxcontrib.youtube", # adds youtube:: directive + "sphinxemoji.sphinxemoji", # adds emoji substitutions (e.g. |:fire:|) + "sphinx_design", + "repo_docs.ext.inline_only", + "repo_docs.ext.toctree", + "repo_docs.ext.mdinclude", + "repo_docs.ext.include_patch", + "repo_docs.ext.youtube", + "repo_docs.ext.ifconfig", + "repo_docs.ext.source_substitutions", + "repo_docs.ext.mermaid", + "repo_docs.ext.exhale_file_fix", + "repo_docs.ext.output_format_text", + "repo_docs.ext.output_format_latex", + "repo_docs.ext.include_licenses", + "repo_docs.ext.add_templates", + "repo_docs.ext.breadcrumbs", + "repo_docs.ext.metadata", + "repo_docs.ext.confval", + "repo_docs.ext.customize_layout", + "repo_docs.ext.cpp_xrefs", +] + +# automatically add section level labels, up to level 4 +myst_heading_anchors = 4 + + +# configure sphinxcontrib.mermaid as we inject mermaid manually on pages that need it +mermaid_init_js = "" +mermaid_version= "" + + +intersphinx_mapping = {} +exclude_patterns = [ + ".git", + "Thumbs.db", + ".DS_Store", + ".pytest_cache", + "_repo", + "README.md", + "life-cycle-policy.rst", + "_build/docs/secure-services-istio-keycloak", + "_build/docs/openshift", + "_build/docs/gpu-telemetry", + "_build/docs/container-toolkit", + "_build/docs/review", + "_build/docs/partner-validated", + "_build/docs/driver-containers", + "_build/docs/sphinx_warnings.txt", + "_build/docs/kubernetes", + "_build/docs/tmp", + "_build/docs/dra-driver", + "_build/docs/edge", + "_build/docs/gpu-operator/24.9.1", + "_build/docs/gpu-operator/24.12.0", + "_build/docs/gpu-operator/25.3.4", + "_build/docs/gpu-operator/25.3.1", + "_build/docs/gpu-operator/24.9.2", + "_build/docs/gpu-operator/version1.json", + "_build/docs/gpu-operator/24.9", + "_build/docs/gpu-operator/25.3.0", + "_build/docs/gpu-operator/25.3", + "_build/docs/gpu-operator/25.10", +] + +html_theme = "sphinx_rtd_theme" + +html_logo = "/work/assets/nvidia-logo-white.png" +html_favicon = "/work/assets/favicon.ico" + +# If true, links to the reST sources are added to the pages. +html_show_sourcelink = False + +html_additional_search_indices = [] + +# If true, the raw source is copied which might be a problem if content is removed with `ifconfig` +html_copy_source = False + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +html_show_sphinx = False + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = [ + "/work/_repo/deps/repo_docs/media", +] + +html_last_updated_fmt = "" + +# https://sphinx-rtd-theme.readthedocs.io/en/stable/configuring.html +html_theme_options = { + "logo_only": True, + "prev_next_buttons_location": None, # our docs aren't a novel... + "navigation_depth": 10, +} + +html_extra_content_head = [' \n '] +html_extra_content_footer = [' \n '] +html_logo_target_url = "" + +html_breadcrumbs_home_url = "" +html_extra_breadcrumbs = [] + +html_css_files = [ + "omni-style.css", + "api-styles.css", +] + +html_js_files = [ + "version.js", + "social-media.js", +] + +# literal blocks default to c++ (useful for Doxygen \code blocks) +highlight_language = 'c++' + + +# add additional tags + + + +source_substitutions = {'minor_version': '25.10', 'version': 'v25.10.1', 'recommended': '580.105.08', 'dra_version': '25.12.0'} +source_substitutions.update({ + 'repo_docs_config': 'debug', + 'repo_docs_platform_target': 'linux-x86_64', + 'repo_docs_platform': 'linux-x86_64', + 'repo_docs_dash_build': '', + 'repo_docs_project': 'gpu-operator', + 'repo_docs_version': '25.10', + 'repo_docs_copyright': '2020-2026, NVIDIA Corporation', + # note: the leading '/' means this is relative to the docs_root (the source directory) + 'repo_docs_api_path': '/../_build/docs/gpu-operator/latest', +}) + +# add global metadata for all built pages +metadata_global = {} + +sphinx_event_handlers = [] +myst_enable_extensions = [ + "colon_fence", "dollarmath", +] +templates_path = ['/work/templates'] +extensions.extend([ + "linuxdoc.rstFlatTable", + "sphinx.ext.autosectionlabel", + "sphinx_copybutton", + "sphinx_design", +]) +suppress_warnings = [ 'autosectionlabel.*' ] +pygments_style = 'sphinx' +copybutton_exclude = '.linenos, .gp' + +html_theme = "nvidia_sphinx_theme" +html_copy_source = False +html_show_sourcelink = False +html_show_sphinx = False + +html_domain_indices = False +html_use_index = False +html_extra_path = ["versions1.json"] +html_static_path = ["/work/css"] +html_css_files = ["custom.css"] + +html_theme_options = { + "icon_links": [], + "switcher": { + "json_url": "../versions1.json", + "version_match": release, + }, +} + +highlight_language = 'console' + +intersphinx_mapping = { + "dcgm": ("https://docs.nvidia.com/datacenter/dcgm/latest/", "../work/dcgm-offline.inv"), + "gpuop": ("https://docs.nvidia.com/datacenter/cloud-native/gpu-operator/latest/", + ("_build/docs/gpu-operator/latest/objects.inv", None)), + "ctk": ("https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/", + ("_build/docs/container-toolkit/latest/objects.inv", None)), + "drv": ("https://docs.nvidia.com/datacenter/cloud-native/driver-containers/latest/", + ("_build/docs/driver-containers/latest/objects.inv", None)), + "ocp": ("https://docs.nvidia.com/datacenter/cloud-native/openshift/latest/", + ("_build/docs/openshift/latest/objects.inv", None)), + "edge": ("https://docs.nvidia.com/datacenter/cloud-native/edge/latest/", + ("_build/docs/edge/latest/objects.inv", None)), +} +rst_epilog = ".. |gitlab_mr_url| replace:: Sorry Charlie...not a merge request." +if os.environ.get("CI_MERGE_REQUEST_IID") is not None: + rst_epilog = ".. 
|gitlab_mr_url| replace:: {}/-/merge_requests/{}".format( + os.environ["CI_MERGE_REQUEST_PROJECT_URL"], os.environ["CI_MERGE_REQUEST_IID"]) + +def setup(app): + app.add_config_value('build_name', 'public', 'env') + for (event, handler) in sphinx_event_handlers: + app.connect(event, handler) diff --git a/gpu-operator/dra-intro-install.rst b/gpu-operator/dra-intro-install.rst index 5e19ce6a0..7fa265a8b 100644 --- a/gpu-operator/dra-intro-install.rst +++ b/gpu-operator/dra-intro-install.rst @@ -40,6 +40,17 @@ With NVIDIA's DRA Driver for GPUs, your Kubernetes workload can allocate and con You can use the NVIDIA DRA Driver for GPUs with the NVIDIA GPU Operator to deploy and manage your GPUs and ComputeDomains. +.. _known-issues: + +Known Issues +************ + +* There is a known issue where the NVIDIA Driver Manager is not aware of the DRA driver kubelet plugin and will not correctly evict it on pod restarts. + You must label the nodes you plan to use for DRA GPU allocation and pass the node label in the GPU Operator Helm command in the ``driver.manager.env`` flag. + This enables the NVIDIA Driver Manager to evict the GPU kubelet plugin correctly on driver container upgrades. +* For A100 GPUs, the MIG manager does not automatically evict the DRA kubelet plugin during MIG configuration changes. + If the DRA kubelet plugin is deployed before a MIG change, then you must manually restart the DRA kubelet plugin. + ************* Prerequisites ************* @@ -60,7 +71,7 @@ Prerequisites * Kubernetes v1.34.2 or newer. .. note:: - If you plan to use traditional extended resource requests such as `nvidia.com/gpu` with the DRA driver, you must enable the [DRAExtendedResource](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#extended-resource) feature gate. This feature allows the scheduler to automatically translate extended resource requests into ResourceClaims, which are then allocated by the DRA driver. + If you plan to use traditional extended resource requests such as `nvidia.com/gpu` with the DRA driver, you must enable the `DRAExtendedResource <https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#extended-resource>`_ feature gate. This feature allows the scheduler to automatically translate extended resource requests into ResourceClaims, which are then allocated by the DRA driver. * GPU Operator v25.10.0 or later with the NVIDIA Kubernetes Device Plugin disabled to avoid conflicts with the DRA Driver for GPUs. @@ -68,11 +79,9 @@ Prerequisites These are both default in GPU Operator v25.10.0 and later. * Label nodes you plan to use for GPU allocation with something like ``nvidia.com/dra-kubelet-plugin=true`` and use them as nodeSelectors in the DRA driver helm chart. - Steps for labeling nodes are provided in the next section. - - There is a known issue where the NVIDIA Driver Manager is not aware of the DRA driver kubelet plugin, and will not correctly evict it on pod restarts. - You must label the nodes you plan to use with DRA GPU allocation and pass the node label in the GPU Operator Helm command in the ``driver.manager.env`` flag. - This enables the NVIDIA Driver Manager to evict the GPU kubelet plugin correctly on driver container upgrades. + This is required to avoid the :ref:`known issue <known-issues>` when using the GPU Operator with the DRA Driver for GPUs. + Steps for labeling nodes are provided in the next section. + This label is then passed in the GPU Operator Helm command in the ``driver.manager.env`` flag. ..
tab-item:: ComputeDomain :sync: computedomain @@ -154,6 +163,8 @@ Install the NVIDIA GPU Operator Refer to the `GPU Operator installation guide `_ for additional configuration options when installing the GPU Operator. +If you are planning to use MIG devices, refer to the `NVIDIA GPU Operator MIG documentation `_ to configure your cluster for MIG support. + *************************** Install DRA Driver for GPUs *************************** @@ -180,7 +191,7 @@ Install DRA Driver for GPUs .. tab-item:: GPU Allocation :sync: gpu-allocation - 1. Create a custom ``dra-values.yaml`` file for installing the DRA driver helm chart. + 1. Create a custom ``values.yaml`` file for installing the DRA driver helm chart. .. tab-set:: @@ -241,7 +252,7 @@ Install DRA Driver for GPUs --create-namespace \ --set nvidiaDriverRoot=/run/nvidia/driver \ --set gpuResourcesEnabledOverride=true \ - -f dra-values.yaml + -f values.yaml .. tab-item:: GKE install command @@ -252,7 +263,7 @@ Install DRA Driver for GPUs --namespace nvidia-dra-driver-gpu \ --create-namespace \ --set gpuResourcesEnabledOverride=true \ - -f dra-values.yaml + -f values.yaml .. tab-item:: ComputeDomain :sync: computedomain @@ -320,14 +331,73 @@ Validate Installation compute-domain-default-channel.nvidia.com 55s gpu.nvidia.com 55s mig.nvidia.com 55s - vfio.gpu.nvidia.com 55s The ``compute-domain-daemon.nvidia.com`` and ``compute-domain-default-channel.nvidia.com`` DeviceClasses are installed when ComputeDomain support is enabled. -The ``gpu.nvidia.com``, ``mig.nvidia.com``, and ``vfio.gpu.nvidia.com`` DeviceClasses are installed when GPU allocation support is enabled. +The ``gpu.nvidia.com`` and ``mig.nvidia.com`` DeviceClasses are installed when GPU allocation support is enabled. Additional validation steps are available in the DRA Driver repository documentation: * `Validate setup for ComputeDomain allocation `_ * `Validate setup for GPU allocation `_ +********************* +Enable Health Checks +********************* + +The NVIDIA DRA driver supports GPU health monitoring using the `NVIDIA Management Library (NVML) `_. +This feature uses NVML to check for `GPU XID errors `_ and determines whether a GPU or MIG device is functioning properly. + +Health checking is managed by the ``NVMLDeviceHealthCheck`` feature gate. +This is currently an alpha feature and is disabled by default. + +When enabled, the DRA Driver for GPUs continuously monitors GPUs for XID errors and assigns health statuses: + +* Healthy - GPU is functioning normally. The GPU may have a non-critical XID error but is still available for workloads. +* Unhealthy - GPU has a critical XID error and is not suitable for workloads. + + +To enable GPU health monitoring, deploy the DRA driver with the ``NVMLDeviceHealthCheck`` feature gate: + +.. code-block:: console + + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update + helm upgrade -i nvidia-dra-driver-gpu nvidia/nvidia-dra-driver-gpu \ + --namespace nvidia-dra-driver-gpu \ + --set gpuResourcesEnabledOverride=true \ + --set featureGates.NVMLDeviceHealthCheck=true + +.. note:: + Unhealthy GPUs will not appear in the ResourceSlice list. After the device recovers and is marked healthy again, you must restart the DRA Driver for the device to be added back into the available resources pool. + +After enabling health checks, you can monitor health status in the kubelet plugin logs. + +1. Check kubelet plugin logs. + Health status changes are logged in the kubelet plugin container.
Run ``kubectl get pods -n nvidia-dra-driver-gpu`` and find the ``nvidia-dra-driver-gpu-kubelet-plugin-<pod-suffix>`` pod name. Replace ``<pod-suffix>`` with the suffix from your actual pod name. + + .. code-block:: console + + kubectl logs nvidia-dra-driver-gpu-kubelet-plugin-<pod-suffix> \ + -n nvidia-dra-driver-gpu \ + -c gpus + +2. List all ResourceSlices. + View all ResourceSlices in the cluster to see which devices are available: + + .. code-block:: console + + kubectl get resourceslice + +3. Inspect a specific ResourceSlice. + View detailed information about a specific resource slice. Replace ``<resourceslice-name>`` with a name from the previous step. Healthy devices are listed in the resource slice, while unhealthy devices are not listed: + + .. code-block:: console + + kubectl get resourceslice <resourceslice-name> -o yaml + +************************* +Additional Documentation +************************* + +Refer to the `DRA Driver for GPUs repository `_ for additional documentation, including: + +* `Upgrade Guide `_ +* `Troubleshooting Guide `_
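The prerequisites above ask you to label the nodes that should run the DRA kubelet plugin, but the label command itself is not shown in this change. A minimal sketch, assuming a hypothetical node named ``worker-1`` (the label key and value ``nvidia.com/dra-kubelet-plugin=true`` come from the prerequisites; the node name is a placeholder):

.. code-block:: console

   # Apply the node label referenced in the prerequisites (node name is illustrative).
   kubectl label node worker-1 nvidia.com/dra-kubelet-plugin=true

   # Confirm which nodes carry the label before using it as a nodeSelector
   # in the DRA driver helm chart values.
   kubectl get nodes -l nvidia.com/dra-kubelet-plugin=true

The same label is what the known issue above says to pass to the GPU Operator through the ``driver.manager.env`` flag so that the Driver Manager can evict the DRA kubelet plugin correctly during driver container upgrades.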