cuda_pathfinder/cuda/pathfinder/_dynamic_libs/canary_probe_subprocess.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import json
import sys

from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS

if IS_WINDOWS:
    from cuda.pathfinder._dynamic_libs.load_dl_windows import load_with_system_search
else:
    from cuda.pathfinder._dynamic_libs.load_dl_linux import load_with_system_search


def _probe_canary_abs_path(libname: str) -> str | None:
    loaded: LoadedDL | None = load_with_system_search(libname)
    if loaded is None:
        return None
    abs_path = loaded.abs_path
    if not isinstance(abs_path, str):
        return None
    return abs_path


def probe_canary_abs_path_and_print_json(libname: str) -> None:
    print(json.dumps(_probe_canary_abs_path(libname)))  # noqa: T201


def main(argv: list[str] | None = None) -> int:
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 1:
        return 2
    probe_canary_abs_path_and_print_json(args[0])
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
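For orientation, a minimal sketch of how a parent process could invoke this
probe and decode its single-line JSON payload. The library name "cudart" and
the resolved path are illustrative; the actual caller in this PR uses
run_in_spawned_child_process rather than subprocess:

    import json
    import subprocess
    import sys

    # Run the probe script in a fresh interpreter; it prints exactly one JSON
    # value: the resolved absolute path as a JSON string, or null on failure.
    proc = subprocess.run(
        [sys.executable, "-m", "cuda.pathfinder._dynamic_libs.canary_probe_subprocess", "cudart"],
        capture_output=True,
        text=True,
        timeout=10,
    )
    abs_path = json.loads(proc.stdout.strip()) if proc.returncode == 0 else None
    print(abs_path)  # e.g. "/usr/local/cuda-13.1/lib64/libcudart.so.13.1.80" or None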
cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py
@@ -101,7 +101,7 @@ def _find_lib_dir_using_anchor_point(libname: str, anchor_point: str, linux_lib_
    for rel_path in rel_paths:
        for dirname in sorted(glob.glob(os.path.join(anchor_point, rel_path))):
            if os.path.isdir(dirname):
-                return dirname
+                return os.path.normpath(dirname)

    return None

@@ -152,6 +152,57 @@ def _find_dll_using_lib_dir(
    return None


def _derive_ctk_root_linux(resolved_lib_path: str) -> str | None:
    """Derive the CTK installation root from a resolved library path on Linux.

    Standard system CTK layout: ``$CTK_ROOT/lib64/libfoo.so.XX``
    (some installs use ``lib`` instead of ``lib64``).

    Returns None if the path doesn't match a recognized layout.
    """
    lib_dir = os.path.dirname(resolved_lib_path)
    basename = os.path.basename(lib_dir)
    if basename in ("lib64", "lib"):
        return os.path.dirname(lib_dir)
    return None


def _derive_ctk_root_windows(resolved_lib_path: str) -> str | None:
    """Derive the CTK installation root from a resolved library path on Windows.

    Handles two CTK layouts:
    - CTK 13: ``$CTK_ROOT/bin/x64/foo.dll``
    - CTK 12: ``$CTK_ROOT/bin/foo.dll``

    Returns None if the path doesn't match a recognized layout.

    Uses ``ntpath`` explicitly so the function is testable on any platform.
    """
    import ntpath

    lib_dir = ntpath.dirname(resolved_lib_path)
    basename = ntpath.basename(lib_dir).lower()
    if basename == "x64":
        parent = ntpath.dirname(lib_dir)
        if ntpath.basename(parent).lower() == "bin":
            return ntpath.dirname(parent)
    elif basename == "bin":
        return ntpath.dirname(lib_dir)
    return None


def derive_ctk_root(resolved_lib_path: str) -> str | None:
    """Derive the CTK installation root from a resolved library path.

    Given the absolute path of a loaded CTK shared library, walk up the
    directory tree to find the CTK root. Returns None if the path doesn't
    match any recognized CTK directory layout.
    """
    if IS_WINDOWS:
        return _derive_ctk_root_windows(resolved_lib_path)
    return _derive_ctk_root_linux(resolved_lib_path)
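# Illustrative examples (hypothetical install prefixes, not part of this diff):
#   _derive_ctk_root_linux("/usr/local/cuda-13.1/lib64/libcudart.so.13.1.80")
#       -> "/usr/local/cuda-13.1"
#   _derive_ctk_root_windows(r"C:\CUDA\v13.1\bin\x64\cudart64_13.dll")
#       -> r"C:\CUDA\v13.1"
#   _derive_ctk_root_linux("/opt/foo/libbar.so")
#       -> None  (parent dir is neither "lib64" nor "lib")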


class _FindNvidiaDynamicLib:
    def __init__(self, libname: str):
        self.libname = libname
@@ -185,6 +236,16 @@ def try_with_conda_prefix(self) -> str | None:
    def try_with_cuda_home(self) -> str | None:
        return self._find_using_lib_dir(_find_lib_dir_using_cuda_home(self.libname))

    def try_via_ctk_root(self, ctk_root: str) -> str | None:
        """Find the library under a derived CTK root directory.

        Uses :func:`_find_lib_dir_using_anchor_point`, which already knows
        about non-standard sub-paths (e.g. ``nvvm/lib64`` for nvvm).
        """
        return self._find_using_lib_dir(
            _find_lib_dir_using_anchor_point(self.libname, anchor_point=ctk_root, linux_lib_dir="lib64")
        )
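    # Illustrative (hypothetical root): try_via_ctk_root("/usr/local/cuda-13.1")
    # would check e.g. /usr/local/cuda-13.1/lib64 and, for nvvm,
    # /usr/local/cuda-13.1/nvvm/lib64, via _find_lib_dir_using_anchor_point.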

    def _find_using_lib_dir(self, lib_dir: str | None) -> str | None:
        if lib_dir is None:
            return None
cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py
@@ -2,16 +2,22 @@
# SPDX-License-Identifier: Apache-2.0

import functools
+import json
import struct
import sys

-from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib
+from cuda.pathfinder._dynamic_libs.canary_probe_subprocess import probe_canary_abs_path_and_print_json
+from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import (
+    _FindNvidiaDynamicLib,
+    derive_ctk_root,
+)
from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL, load_dependencies
from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
    SUPPORTED_LINUX_SONAMES,
    SUPPORTED_WINDOWS_DLLS,
)
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
+from cuda.pathfinder._utils.spawned_process_runner import run_in_spawned_child_process

if IS_WINDOWS:
    from cuda.pathfinder._dynamic_libs.load_dl_windows import (
@@ -60,6 +66,66 @@ def _load_driver_lib_no_cache(libname: str) -> LoadedDL:
    )


# Libs that reside on the standard linker path in system CTK installs.
# Used to discover the CTK root when a lib with a non-standard path
# (e.g. nvvm under $CTK_ROOT/nvvm/lib64) can't be found directly.
_CTK_ROOT_CANARY_LIBNAMES = ("cudart",)


def _resolve_system_loaded_abs_path_in_subprocess(libname: str) -> str | None:
    """Resolve a library's system-search absolute path in a child process.

    This keeps any side-effects of loading the canary library scoped to the
    child process instead of polluting the current process.
    """
    try:
        result = run_in_spawned_child_process(
            probe_canary_abs_path_and_print_json,
            args=(libname,),
            timeout=10.0,
        )
    except (OSError, RuntimeError):
        return None
    if result.returncode != 0:
        return None

    # Read the final non-empty stdout line in case earlier lines are emitted.
    lines = [line for line in result.stdout.splitlines() if line.strip()]
    if not lines:
        return None
    try:
        payload = json.loads(lines[-1])
    except json.JSONDecodeError:
        return None
    if isinstance(payload, str):
        return payload
    return None


def _try_ctk_root_canary(finder: _FindNvidiaDynamicLib) -> str | None:
    """Derive the CTK root from a system-installed canary lib.

    For libs like nvvm whose shared object doesn't reside on the standard
    linker path, we locate a well-known CTK lib that IS on the linker path
    via system search, derive the CTK installation root from its resolved
    path, and then look for the target lib relative to that root.

    The canary load is performed in a subprocess to avoid introducing loader
    state into the current process.
    """
    for canary_libname in _CTK_ROOT_CANARY_LIBNAMES:
        canary_abs_path = _resolve_system_loaded_abs_path_in_subprocess(canary_libname)
        if canary_abs_path is None:
            continue
        ctk_root = derive_ctk_root(canary_abs_path)
        if ctk_root is None:
            continue
        abs_path: str | None = finder.try_via_ctk_root(ctk_root)
        if abs_path is not None:
            return abs_path
    return None
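# Illustrative end-to-end flow (hypothetical paths): the child process resolves
# the canary "cudart" to e.g. /usr/local/cuda-13.1/lib64/libcudart.so.13.1.80;
# derive_ctk_root() maps that to /usr/local/cuda-13.1; try_via_ctk_root() then
# locates e.g. /usr/local/cuda-13.1/nvvm/lib64/libnvvm.so.4.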


def _load_lib_no_cache(libname: str) -> LoadedDL:
    if libname in _DRIVER_ONLY_LIBNAMES:
        return _load_driver_lib_no_cache(libname)
@@ -90,11 +156,21 @@ def _load_lib_no_cache(libname: str) -> LoadedDL:
    loaded = load_with_system_search(libname)
    if loaded is not None:
        return loaded

    abs_path = finder.try_with_cuda_home()
-    if abs_path is None:
-        finder.raise_not_found_error()
-    else:
+    if abs_path is not None:
        found_via = "CUDA_HOME"
+    else:
+        # Canary probe: if the direct system search and CUDA_HOME both
+        # failed (e.g. nvvm isn't on the linker path and CUDA_HOME is
+        # unset), try to discover the CTK root by loading a well-known CTK
+        # lib in a subprocess, then look for the target lib relative to
+        # that root.
+        abs_path = _try_ctk_root_canary(finder)
+        if abs_path is not None:
+            found_via = "system-ctk-root"
+        else:
+            finder.raise_not_found_error()
Comment on lines +164 to +173
Member:
Sorry, I have been trapped in other fires and was unable to provide feedback in a timely manner 😢

Loading cudart in a subprocess is safer than loading it in the main process. That said, my recollection from the Feb-05 meeting was that we'd use other anchor points such as nvJitLink (basically, #1038). Does this mean we changed our mind a bit and decided to use cudart instead?

Contributor Author:

nvjitlink is fine to start with as well. I'm honestly not sure why one would be preferable over another. That said, as long as we're not choosing something super niche, it doesn't seem like it's worth spending too much time on and can be changed in a follow-up.

Member:

Thanks, Phillip. I do not have any objection.

Collaborator:

> my recollection from the Feb-05 meeting was that we'd use other anchor points such as nvJitLink (basically, #1038). Does this mean we changed our mind a bit and decided to use cudart instead?

The main idea behind #1038 is that the pivot library has two roles: 1. provide an anchor point from which to find other libraries (nvvm is the only case that needs it, I think), and 2. more importantly, limit the scope of future searches (via an object that remembers the pivot library).

This PR doesn't have the concept of a search scope; it's essentially only a trick to find nvvm independently, like any other independently found library.

I was a bit surprised when Phillip sent this PR, but I think it's useful, because I believe we'll have the independent and scoped searches side by side indefinitely. This PR makes #1038 less important; its remaining value would be mainly safety / internal consistency, and guiding users to a consistent setup via helpful error messages.

Re cudart vs nvjitlink:

For the scoped search, the choice of the pivot library is up to the user.

For the independent search (in an isolated process), cudart is the better choice, because it's much smaller than nvjitlink:

740 vs 96964 KiB

smc120-0009.ipp2a2.colossus.nvidia.com:/usr/local/cuda-13.1/lib64 $ (for so in *.so; do ls -s $(realpath $so); done) | sort -n
32 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libOpenCL.so.1.0.0
40 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvtx3interop.so.1.1.0
48 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcufile_rdma.so.1.16.1
400 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcuobjclient.so.1.0.0
724 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvblas.so.13.2.1.1
740 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcudart.so.13.1.80
972 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcufftw.so.12.1.0.78
1636 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppc.so.13.0.3.3
1640 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppisu.so.13.0.3.3
2424 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvfatbin.so.13.1.115
3380 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcufile.so.1.16.1
4236 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppitc.so.13.0.3.3
4284 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvrtc-builtins.so.13.1.115
5784 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvjpeg.so.13.0.3.75
6592 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppim.so.13.0.3.3
6836 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppicc.so.13.0.3.3
8456 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppidei.so.13.0.3.3
10060 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnpps.so.13.0.3.3
13112 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppial.so.13.0.3.3
25848 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppig.so.13.0.3.3
26332 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppist.so.13.0.3.3
52924 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcublas.so.13.2.1.1
58192 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppif.so.13.0.3.3
96964 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvJitLink.so.13.1.115
101472 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcusolverMg.so.12.0.9.81
111500 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvrtc.so.13.1.115
129928 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcurand.so.10.4.1.81
139108 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcusolver.so.12.0.9.81
165772 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcusparse.so.12.7.3.1
292796 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcufft.so.12.1.0.78
490832 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcublasLt.so.13.2.1.1


    return load_with_abs_path(libname, abs_path, found_via)

@@ -164,6 +240,14 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:

- If set, use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).

5. **CTK root canary probe**

- For libraries whose shared object doesn't reside on the standard
linker path (e.g. ``libnvvm.so`` lives under ``$CTK_ROOT/nvvm/lib64``),
attempt to discover the CTK installation root by system-loading a
well-known CTK library (``cudart``) in a subprocess, then derive
the root from its resolved absolute path.

**Driver libraries** (``"cuda"``, ``"nvml"``):

These are part of the NVIDIA display driver (not the CUDA Toolkit) and
133 changes: 133 additions & 0 deletions cuda_pathfinder/cuda/pathfinder/_utils/spawned_process_runner.py
@@ -0,0 +1,133 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import multiprocessing
import queue # for Empty
import sys
import traceback
from collections.abc import Callable, Sequence
from dataclasses import dataclass
from io import StringIO
from typing import Any

PROCESS_KILLED = -9
PROCESS_NO_RESULT = -999


# Similar to https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess
# (args, check_returncode() are intentionally not supported here.)
@dataclass
class CompletedProcess:
    returncode: int
    stdout: str
    stderr: str


class ChildProcessWrapper:
    def __init__(
        self,
        result_queue: Any,
        target: Callable[..., None],
        args: Sequence[Any] | None,
        kwargs: dict[str, Any] | None,
    ) -> None:
        self.target = target
        self.args = () if args is None else args
        self.kwargs = {} if kwargs is None else kwargs
        self.result_queue = result_queue

    def __call__(self) -> None:
        # Capture stdout/stderr in the child so the parent can collect them.
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = StringIO()
        sys.stderr = StringIO()

        try:
            self.target(*self.args, **self.kwargs)
            returncode = 0
        except SystemExit as e:  # Handle sys.exit()
            returncode = e.code if isinstance(e.code, int) else 0
        except BaseException:
            traceback.print_exc()
            returncode = 1
        finally:
            # Collect outputs and restore streams
            stdout = sys.stdout.getvalue()
            stderr = sys.stderr.getvalue()
            sys.stdout = old_stdout
            sys.stderr = old_stderr
            try:  # noqa: SIM105
                self.result_queue.put((returncode, stdout, stderr))
            except Exception:  # noqa: S110
                # If the queue is broken (e.g., parent gone), silently drop the result.
                pass


def run_in_spawned_child_process(
    target: Callable[..., None],
    *,
    args: Sequence[Any] | None = None,
    kwargs: dict[str, Any] | None = None,
    timeout: float | None = None,
    rethrow: bool = False,
) -> CompletedProcess:
    """Run `target` in a spawned child process, capturing stdout/stderr.

    The provided `target` must be defined at the top level of a module, and must
    be importable in the spawned child process. Lambdas, closures, or interactively
    defined functions (e.g., in Jupyter notebooks) will not work.

    If `rethrow=True` and the child process exits with a nonzero code,
    raises ChildProcessError with the captured stderr.
    """
    ctx = multiprocessing.get_context("spawn")
    result_queue = ctx.Queue()
    process = ctx.Process(target=ChildProcessWrapper(result_queue, target, args, kwargs))
    process.start()
process.start()

    try:
        process.join(timeout)
        if process.is_alive():
            process.terminate()
            process.join()
            result = CompletedProcess(
                returncode=PROCESS_KILLED,
                stdout="",
                stderr=f"Process timed out after {timeout} seconds and was terminated.",
            )
        else:
            try:
                returncode, stdout, stderr = result_queue.get(timeout=1.0)
            except (queue.Empty, EOFError):
                result = CompletedProcess(
                    returncode=PROCESS_NO_RESULT,
                    stdout="",
                    stderr="Process exited or crashed before returning results.",
                )
            else:
                result = CompletedProcess(
                    returncode=returncode,
                    stdout=stdout,
                    stderr=stderr,
                )

        if rethrow and result.returncode != 0:
            raise ChildProcessError(
                f"Child process exited with code {result.returncode}.\n"
                "--- stderr-from-child-process ---\n"
                f"{result.stderr}"
                "<end-of-stderr-from-child-process>\n"
            )

        return result

    finally:
        try:
            result_queue.close()
            result_queue.join_thread()
        except Exception:  # noqa: S110
            pass
        if process.is_alive():
            process.kill()
            process.join()
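A minimal usage sketch for run_in_spawned_child_process (the target function
_hello is hypothetical; per the docstring it must live at the top level of an
importable module so the spawned child can find it):

    # In some importable module:
    def _hello(name: str) -> None:
        print(f"hello {name}")

    result = run_in_spawned_child_process(_hello, args=("world",), timeout=5.0)
    assert result.returncode == 0
    assert result.stdout == "hello world\n"
    assert result.stderr == ""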