From 7b17d6c5eee6941ae603fc68bb2d42acdb24abb7 Mon Sep 17 00:00:00 2001
From: njrrxx <416965825@qq.com>
Date: Tue, 10 Mar 2026 17:52:02 +0800
Subject: [PATCH] Fix: Add thread-safe locking to NVIDIA GPU detection to
 prevent the metrics endpoint from hanging

---
 gpustack_runtime/detector/nvidia.py | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/gpustack_runtime/detector/nvidia.py b/gpustack_runtime/detector/nvidia.py
index 5246582..f0e2f61 100644
--- a/gpustack_runtime/detector/nvidia.py
+++ b/gpustack_runtime/detector/nvidia.py
@@ -4,6 +4,7 @@
 import logging
 import math
 import re
+import threading
 import time
 from _ctypes import byref
 from functools import lru_cache
@@ -75,10 +76,29 @@ def detect_pci_devices() -> dict[str, PCIDevice]:
             return {}
         return {dev.address: dev for dev in pci_devs}
 
+    _detect_lock = threading.Lock()
+
     def __init__(self):
         super().__init__(ManufacturerEnum.NVIDIA)
 
-    def detect(self) -> Devices | None:  # noqa: PLR0915
+    def detect(self) -> Devices | None:
+        """
+        Detect NVIDIA GPUs using pynvml, one caller at a time.
+
+        Returns:
+            A list of detected NVIDIA GPU devices,
+            or None if not supported.
+
+        Raises:
+            Whatever _detect_impl() raises; the lock is released first.
+
+        """
+        # _detect_lock is a class attribute, so this serializes detection
+        # across every instance of this detector, not just this one.
+        with self._detect_lock:
+            return self._detect_impl()
+
+    def _detect_impl(self) -> Devices | None:  # noqa: PLR0915
         """
         Detect NVIDIA GPUs using pynvml.
 
@@ -97,7 +117,6 @@ def detect(self) -> Devices | None:  # noqa: PLR0915
 
         try:
             pci_devs = NVIDIADetector.detect_pci_devices()
-
             pynvml.nvmlInit()
             if not envs.GPUSTACK_RUNTIME_DETECT_NO_TOOLKIT_CALL:
                 try: