From 27e69cc197aab2eeccdaa6ed47e6fe29716610df Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 2 Jun 2026 18:10:38 +0200 Subject: [PATCH 1/4] Fix some threading issues (some free-threading related) This fixes a few threading issues, but we may want to discuss some details still. * The GraphNode cleanup order is an important fix. Another thread may end up with the same pointer (but new object) as soon as we clean it up. So we have to remove it from the cache before cleaning it up. * Use of atomics: I think this is needed, but for this one place an atomic seemed more reasonable. (However, hard to test and if it can fail IIUC only on ARM.) * The critical sections should be pretty safe. I am not sure they will all ensure that the object is always the _identity_ but I am pretty sure it protects from worse races. (Testing did find this for MemPool.attributes, not others yet. Testing with thread-sanitizer might flush out some...) * The split mutex: This is thread-unsafe. But I am honestly not sure if that isn't just expected, or whether the mutex is good but it should also be safe from within CUDA. * Use of `setdefault` cached pattern is largely just normalizing. Without the `return dict.setdefault` a different instance may be returned on different threads (or a cache entry replaced). For the `cyGraphMemoryResource` that triggered a test with pytest-run-parallel although that doesn't mean it is problematic as such. `cuda-pathfinder` uses functools.cache, but usually for strings; the one we may want to look at is `load_nvidia_dynamic_lib`. Signed-off-by: Sebastian Berg --- cuda_core/cuda/core/_device.pyx | 17 ++++++++++------- cuda_core/cuda/core/_device_resources.pxd | 3 +++ cuda_core/cuda/core/_device_resources.pyx | 9 +++++---- cuda_core/cuda/core/_memory/_buffer.pxd | 15 ++++++++------- cuda_core/cuda/core/_memory/_buffer.pyx | 11 ++++++----- .../core/_memory/_graph_memory_resource.pyx | 14 +++++++++++--- cuda_core/cuda/core/_memory/_memory_pool.pyx | 2 ++ cuda_core/cuda/core/_memoryview.pyx | 6 +++++- cuda_core/cuda/core/_module.pyx | 6 +++++- cuda_core/cuda/core/graph/_graph_node.pyx | 8 ++++---- 10 files changed, 59 insertions(+), 32 deletions(-) diff --git a/cuda_core/cuda/core/_device.pyx b/cuda_core/cuda/core/_device.pyx index da6972f372..451ca25dda 100644 --- a/cuda_core/cuda/core/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -85,9 +85,12 @@ cdef class DeviceProperties: cdef inline int _get_cached_attribute(self, attr, default=0) except? -2: """Retrieve the attribute value, using cache if applicable.""" - if attr not in self._cache: - self._cache[attr] = self._get_attribute(attr, default) - return self._cache[attr] + cached = self._cache.get(attr) + if cached is not None: + return cached + cdef int value = self._get_attribute(attr, default) + self._cache[attr] = value # setdefault not needed for ints + return value @property def max_threads_per_block(self) -> int: @@ -1131,11 +1134,11 @@ class Device: def compute_capability(self) -> ComputeCapability: """Return a named tuple with 2 fields: major and minor.""" cdef DeviceProperties prop = self.properties - if "compute_capability" in prop._cache: - return prop._cache["compute_capability"] + cached = prop._cache.get("compute_capability") + if cached is not None: + return cached cc = ComputeCapability(prop.compute_capability_major, prop.compute_capability_minor) - prop._cache["compute_capability"] = cc - return cc + return prop._cache.setdefault("compute_capability", cc) @property def arch(self) -> str: diff --git a/cuda_core/cuda/core/_device_resources.pxd b/cuda_core/cuda/core/_device_resources.pxd index d618c24cf1..98f91ab473 100644 --- a/cuda_core/cuda/core/_device_resources.pxd +++ b/cuda_core/cuda/core/_device_resources.pxd @@ -2,6 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 +cimport cython + from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport ContextHandle, GreenCtxHandle @@ -15,6 +17,7 @@ cdef class SMResource: unsigned int _flags bint _is_usable object __weakref__ + cython.pymutex _split_mutex @staticmethod cdef SMResource _from_dev_resource(cydriver.CUdevResource res, int device_id) diff --git a/cuda_core/cuda/core/_device_resources.pyx b/cuda_core/cuda/core/_device_resources.pyx index ecd9e00bf0..bafc462c93 100644 --- a/cuda_core/cuda/core/_device_resources.pyx +++ b/cuda_core/cuda/core/_device_resources.pyx @@ -498,10 +498,11 @@ cdef class SMResource: ) _resolve_group_count(opts) _check_green_ctx_support() - if _can_use_structured_sm_split(): - return _split_with_general_api(self, opts, dry_run) - # SplitByCount requires the same 12.4+ as green ctx support (already checked above) - return _split_with_count_api(self, opts, dry_run) + with self._split_mutex: + if _can_use_structured_sm_split(): + return _split_with_general_api(self, opts, dry_run) + # SplitByCount requires the same 12.4+ as green ctx support (already checked above) + return _split_with_count_api(self, opts, dry_run) cdef class WorkqueueResource: diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 98c4b50db3..f744e75657 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uintptr_t +from libcpp.atomic cimport atomic as std_atomic, memory_order_acquire, memory_order_release from cuda.bindings cimport cydriver from cuda.core._resource_handles cimport DevicePtrHandle @@ -18,13 +19,13 @@ cdef struct _MemAttrs: cdef class Buffer: cdef: - DevicePtrHandle _h_ptr - MemoryResource _memory_resource - object _ipc_data - object _owner - _MemAttrs _mem_attrs - bint _mem_attrs_inited - object __weakref__ + DevicePtrHandle _h_ptr + MemoryResource _memory_resource + object _ipc_data + object _owner + _MemAttrs _mem_attrs + std_atomic[bool] _mem_attrs_inited + object __weakref__ cdef public: # Python code in _memory/_virtual_memory_resource.py needs to update # this value, though it is technically private. diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 88f9054385..d9cf350b35 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -96,7 +96,7 @@ cdef class Buffer: self._memory_resource = None self._ipc_data = None self._owner = None - self._mem_attrs_inited = False + self._mem_attrs_inited.store(False) def __init__(self, *args, **kwargs) -> None: raise RuntimeError("Buffer objects cannot be instantiated directly. " @@ -126,7 +126,7 @@ cdef class Buffer: self._memory_resource = mr self._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None self._owner = owner - self._mem_attrs_inited = False + self._mem_attrs_inited.store(False) return self @staticmethod @@ -191,6 +191,7 @@ cdef class Buffer: return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_descriptor, stream) @property + @cython.critical_section def ipc_descriptor(self) -> IPCBufferDescriptor: """Descriptor for sharing this buffer with other processes.""" if self._ipc_data is None: @@ -447,9 +448,9 @@ cdef class Buffer: # ------------------------------ cdef inline void _init_mem_attrs(Buffer self): """Initialize memory attributes by querying the pointer.""" - if not self._mem_attrs_inited: + if not self._mem_attrs_inited.load(memory_order_acquire): _query_memory_attrs(self._mem_attrs, as_cu(self._h_ptr)) - self._mem_attrs_inited = True + self._mem_attrs_inited.store(True, memory_order_release) cdef inline int _query_memory_attrs( @@ -597,7 +598,7 @@ cdef Buffer Buffer_from_deviceptr_handle( buf._memory_resource = mr buf._ipc_data = IPCDataForBuffer(ipc_descriptor, True) if ipc_descriptor is not None else None buf._owner = None - buf._mem_attrs_inited = False + buf._mem_attrs_inited.store(False) return buf diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index 479322ab01..e845a47b08 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -18,7 +18,6 @@ from cuda.core._resource_handles cimport ( from cuda.core._stream cimport Stream_accept, Stream from cuda.core._utils.cuda_utils cimport HANDLE_RETURN -from functools import cache from typing import TYPE_CHECKING if TYPE_CHECKING: @@ -161,6 +160,8 @@ cdef class cyGraphMemoryResource(MemoryResource): return False +cdef dict _mem_resource_cache = {} + class GraphMemoryResource(cyGraphMemoryResource): """ A memory resource for memory related to graphs. @@ -185,9 +186,16 @@ class GraphMemoryResource(cyGraphMemoryResource): return cls._create(c_device_id) @classmethod - @cache def _create(cls, int device_id) -> GraphMemoryResource: - return cyGraphMemoryResource.__new__(cls, device_id) + # we use a dict currently, because functools.cache is currently less + # thread-safe see also: https://github.com/python/cpython/issues/150708 + res = _mem_resource_cache.get(device_id) + if res is not None: + return res + + # create new instance, but in case of a race may return another: + new = cyGraphMemoryResource.__new__(cls, device_id) + return _mem_resource_cache.setdefault(device_id, new) # Raise an exception if the given stream is capturing. diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx index c6276f0f3d..ddcac2d606 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -4,6 +4,7 @@ from __future__ import annotations +cimport cython from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t from libc.string cimport memset @@ -177,6 +178,7 @@ cdef class _MemPool(MemoryResource): _MP_deallocate(self, ptr, size, s) @property + @cython.critical_section def attributes(self) -> _MemPoolAttributes: """Memory pool attributes.""" if self._attributes is None: diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index c65107ae27..260980c1da 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -4,6 +4,7 @@ from __future__ import annotations +cimport cython from ._dlpack cimport * from ._dlpack import classify_dl_device from libc.stdint cimport intptr_t @@ -80,7 +81,7 @@ cdef inline bint _is_torch_tensor(object obj): cdef str mod = tp.__module__ or "" cdef bint result = mod.startswith("torch") and hasattr(obj, "data_ptr") \ and _torch_version_check() - _torch_type_cache[tp] = result + _torch_type_cache[tp] = result # setdefault not needed for bools return result @@ -539,6 +540,7 @@ cdef class StridedMemoryView: + f" readonly={self.readonly},\n" + f" exporting_obj={get_simple_repr(self.exporting_obj)})") + @cython.critical_section cdef inline _StridedLayout get_layout(self): if self._layout is None: if self.dl_tensor: @@ -549,6 +551,7 @@ cdef class StridedMemoryView: raise ValueError("Cannot infer layout from the exporting object") return self._layout + @cython.critical_section cdef inline object get_buffer(self): """ Returns Buffer instance with the underlying data. @@ -562,6 +565,7 @@ cdef class StridedMemoryView: self._buffer = Buffer.from_handle(self.ptr, 0, owner=self.exporting_obj) return self._buffer + @cython.critical_section cdef inline object get_dtype(self): if self._dtype is None: if self.dl_tensor != NULL: diff --git a/cuda_core/cuda/core/_module.pyx b/cuda_core/cuda/core/_module.pyx index 5cb1b7f005..91c8ad4389 100644 --- a/cuda_core/cuda/core/_module.pyx +++ b/cuda_core/cuda/core/_module.pyx @@ -4,6 +4,7 @@ from __future__ import annotations +cimport cython from libc.stddef cimport size_t from collections import namedtuple @@ -83,7 +84,7 @@ cdef class KernelAttributes: cdef int result with nogil: HANDLE_RETURN(cydriver.cuKernelGetAttribute(&result, attribute, as_cu(self._h_kernel), device_id)) - self._cache[cache_key] = result + self._cache[cache_key] = result # setdefault not needed for ints return result def __getitem__(self, device: Device | int) -> KernelAttributes: @@ -454,6 +455,7 @@ cdef class Kernel: return ker @property + @cython.critical_section def attributes(self) -> KernelAttributes: """Get the read-only attributes of this kernel.""" if self._attributes is None: @@ -501,6 +503,7 @@ cdef class Kernel: return param_info @property + @cython.critical_section def occupancy(self) -> KernelOccupancy: """Get the occupancy information for launching this kernel.""" if self._occupancy is None: @@ -742,6 +745,7 @@ cdef class ObjectCode: # TODO: do we want to unload in a finalizer? Probably not.. + @cython.critical_section cdef int _lazy_load_module(self) except -1: if self._h_library: return 0 diff --git a/cuda_core/cuda/core/graph/_graph_node.pyx b/cuda_core/cuda/core/graph/_graph_node.pyx index f627edf9bb..d3d684aff3 100644 --- a/cuda_core/cuda/core/graph/_graph_node.pyx +++ b/cuda_core/cuda/core/graph/_graph_node.pyx @@ -78,8 +78,7 @@ _node_registry: weakref.WeakValueDictionary[int, GraphNode] = weakref.WeakValueD cdef inline GraphNode _registered(GraphNode n): - _node_registry[n._h_node.get()] = n - return n + return _node_registry.setdefault(n._h_node.get(), n) cdef class GraphNode: @@ -162,10 +161,11 @@ cdef class GraphNode: cdef cydriver.CUgraphNode node = as_cu(self._h_node) if node == NULL: return - with nogil: - HANDLE_RETURN(cydriver.cuGraphDestroyNode(node)) + _node_registry.pop(self._h_node.get(), None) invalidate_graph_node(self._h_node) + with nogil: + HANDLE_RETURN(cydriver.cuGraphDestroyNode(node)) @property def pred(self) -> AdjacencySetProxy: From f4cb18d9085f637f5595b49ed31c28c5a03cd905 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Tue, 2 Jun 2026 19:11:36 +0200 Subject: [PATCH 2/4] Use C++ bool for atomic Signed-off-by: Sebastian Berg --- cuda_core/cuda/core/_memory/_buffer.pxd | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index f744e75657..83dcd4f68c 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uintptr_t +from libcpp cimport bool as cpp_bool from libcpp.atomic cimport atomic as std_atomic, memory_order_acquire, memory_order_release from cuda.bindings cimport cydriver @@ -19,13 +20,13 @@ cdef struct _MemAttrs: cdef class Buffer: cdef: - DevicePtrHandle _h_ptr - MemoryResource _memory_resource - object _ipc_data - object _owner - _MemAttrs _mem_attrs - std_atomic[bool] _mem_attrs_inited - object __weakref__ + DevicePtrHandle _h_ptr + MemoryResource _memory_resource + object _ipc_data + object _owner + _MemAttrs _mem_attrs + std_atomic[cpp_bool] _mem_attrs_inited + object __weakref__ cdef public: # Python code in _memory/_virtual_memory_resource.py needs to update # this value, though it is technically private. From 91cf6d001f457c648f5b22cea9deef4552080571 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Wed, 10 Jun 2026 22:15:03 +0200 Subject: [PATCH 3/4] Forgot to commit a critical section on this branch Signed-off-by: Sebastian Berg --- cuda_core/cuda/core/_memory/_buffer.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index d9cf350b35..00359c1f0b 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -446,6 +446,7 @@ cdef class Buffer: # Memory Attribute Query Helpers # ------------------------------ +@cython.critical_section cdef inline void _init_mem_attrs(Buffer self): """Initialize memory attributes by querying the pointer.""" if not self._mem_attrs_inited.load(memory_order_acquire): From f5b239cd3fd36ed962cded17f78fb87a246e2243 Mon Sep 17 00:00:00 2001 From: Sebastian Berg Date: Thu, 11 Jun 2026 15:09:50 +0200 Subject: [PATCH 4/4] Regenerate .pyi files, but I am a bit unsure the `critical_section` is good... Signed-off-by: Sebastian Berg --- cuda_core/cuda/core/_memory/_buffer.pyi | 2 ++ cuda_core/cuda/core/_memory/_graph_memory_resource.pyi | 3 --- cuda_core/cuda/core/_memory/_memory_pool.pyi | 2 ++ cuda_core/cuda/core/_module.pyi | 3 +++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyi b/cuda_core/cuda/core/_memory/_buffer.pyi index 728853c4bc..7118a3a1e0 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyi +++ b/cuda_core/cuda/core/_memory/_buffer.pyi @@ -2,6 +2,7 @@ from __future__ import annotations +import cython from cuda.core._memory._device_memory_resource import DeviceMemoryResource from cuda.core._memory._ipc import IPCBufferDescriptor from cuda.core._memory._pinned_memory_resource import PinnedMemoryResource @@ -88,6 +89,7 @@ class Buffer: """ @property + @cython.critical_section def ipc_descriptor(self) -> IPCBufferDescriptor: """Descriptor for sharing this buffer with other processes.""" diff --git a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi index 4ff85eb597..b34f968fdc 100644 --- a/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyi @@ -2,8 +2,6 @@ from __future__ import annotations -from functools import cache - from cuda.core._device import Device from cuda.core._memory._buffer import Buffer, MemoryResource from cuda.core._stream import Stream @@ -113,7 +111,6 @@ class GraphMemoryResource(cyGraphMemoryResource): ... @classmethod - @cache def _create(cls, device_id: int) -> GraphMemoryResource: ... __all__ = ['GraphMemoryResource'] \ No newline at end of file diff --git a/cuda_core/cuda/core/_memory/_memory_pool.pyi b/cuda_core/cuda/core/_memory/_memory_pool.pyi index 3d15d9f679..7f8c64aedd 100644 --- a/cuda_core/cuda/core/_memory/_memory_pool.pyi +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyi @@ -4,6 +4,7 @@ from __future__ import annotations import uuid +import cython from cuda.core._memory._buffer import Buffer, MemoryResource from cuda.core._stream import Stream from cuda.core.graph import GraphBuilder @@ -97,6 +98,7 @@ class _MemPool(MemoryResource): """ @property + @cython.critical_section def attributes(self) -> _MemPoolAttributes: """Memory pool attributes.""" diff --git a/cuda_core/cuda/core/_module.pyi b/cuda_core/cuda/core/_module.pyi index caf6b09b71..5125b99131 100644 --- a/cuda_core/cuda/core/_module.pyi +++ b/cuda_core/cuda/core/_module.pyi @@ -5,6 +5,7 @@ from __future__ import annotations from collections import namedtuple from os import PathLike +import cython from cuda.core._device import Device from cuda.core._launch_config import LaunchConfig from cuda.core._stream import Stream @@ -253,6 +254,7 @@ class Kernel: ... @property + @cython.critical_section def attributes(self) -> KernelAttributes: """Get the read-only attributes of this kernel.""" @@ -265,6 +267,7 @@ class Kernel: """list[ParamInfo]: (offset, size) for each argument of this function""" @property + @cython.critical_section def occupancy(self) -> KernelOccupancy: """Get the occupancy information for launching this kernel."""