From 7b17d6c5eee6941ae603fc68bb2d42acdb24abb7 Mon Sep 17 00:00:00 2001 From: njrrxx <416965825@qq.com> Date: Tue, 10 Mar 2026 17:52:02 +0800 Subject: [PATCH] Fix: Add thread-safe locking to NVIDIA GPU detection to prevent metrics endpoint hanging --- gpustack_runtime/detector/nvidia.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/gpustack_runtime/detector/nvidia.py b/gpustack_runtime/detector/nvidia.py index 5246582..f0e2f61 100644 --- a/gpustack_runtime/detector/nvidia.py +++ b/gpustack_runtime/detector/nvidia.py @@ -4,6 +4,7 @@ import logging import math import re +import threading import time from _ctypes import byref from functools import lru_cache @@ -75,10 +76,29 @@ def detect_pci_devices() -> dict[str, PCIDevice]: return {} return {dev.address: dev for dev in pci_devs} + _detect_lock = threading.Lock() + def __init__(self): super().__init__(ManufacturerEnum.NVIDIA) - def detect(self) -> Devices | None: # noqa: PLR0915 + def detect(self) -> Devices | None: + """ + Detect NVIDIA GPUs using pynvml with thread-safe locking. + + Returns: + A list of detected NVIDIA GPU devices, + or None if not supported. + + Raises: + If there is an error during detection. + + """ + with self._detect_lock: + result = self._detect_impl() + + return result + + def _detect_impl(self) -> Devices | None: # noqa: PLR0915 """ Detect NVIDIA GPUs using pynvml. @@ -97,7 +117,6 @@ def detect(self) -> Devices | None: # noqa: PLR0915 try: pci_devs = NVIDIADetector.detect_pci_devices() - pynvml.nvmlInit() if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL: try: