cuda_pathfinder/cuda/pathfinder/_dynamic_libs/canary_probe_subprocess.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import json
import sys

from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS

if IS_WINDOWS:
    from cuda.pathfinder._dynamic_libs.load_dl_windows import load_with_system_search
else:
    from cuda.pathfinder._dynamic_libs.load_dl_linux import load_with_system_search


def _probe_canary_abs_path(libname: str) -> str | None:
    loaded: LoadedDL | None = load_with_system_search(libname)
    if loaded is None:
        return None
    abs_path = loaded.abs_path
    if not isinstance(abs_path, str):
        return None
    return abs_path


def probe_canary_abs_path_and_print_json(libname: str) -> None:
    print(json.dumps(_probe_canary_abs_path(libname)))  # noqa: T201


def main(argv: list[str] | None = None) -> int:
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 1:
        return 2
    probe_canary_abs_path_and_print_json(args[0])
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
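For orientation, a minimal sketch of how a parent process could invoke this
probe and decode its single-line JSON payload. The library name "cudart" and
the resolved path are illustrative; the actual caller in this PR uses
run_in_spawned_child_process rather than subprocess:

    import json
    import subprocess
    import sys

    # Run the probe script in a fresh interpreter; it prints exactly one JSON
    # value: the resolved absolute path as a JSON string, or null on failure.
    proc = subprocess.run(
        [sys.executable, "-m", "cuda.pathfinder._dynamic_libs.canary_probe_subprocess", "cudart"],
        capture_output=True,
        text=True,
        timeout=10,
    )
    abs_path = json.loads(proc.stdout.strip()) if proc.returncode == 0 else None
    print(abs_path)  # e.g. "/usr/local/cuda-13.1/lib64/libcudart.so.13.1.80" or None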
cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py
@@ -101,7 +101,7 @@ def _find_lib_dir_using_anchor_point(libname: str, anchor_point: str, linux_lib_
    for rel_path in rel_paths:
        for dirname in sorted(glob.glob(os.path.join(anchor_point, rel_path))):
            if os.path.isdir(dirname):
-                return dirname
+                return os.path.normpath(dirname)

    return None

@@ -152,6 +152,57 @@ def _find_dll_using_lib_dir(
    return None


def _derive_ctk_root_linux(resolved_lib_path: str) -> str | None:
    """Derive the CTK installation root from a resolved library path on Linux.

    Standard system CTK layout: ``$CTK_ROOT/lib64/libfoo.so.XX``
    (some installs use ``lib`` instead of ``lib64``).

    Returns None if the path doesn't match a recognized layout.
    """
    lib_dir = os.path.dirname(resolved_lib_path)
    basename = os.path.basename(lib_dir)
    if basename in ("lib64", "lib"):
        return os.path.dirname(lib_dir)
    return None


def _derive_ctk_root_windows(resolved_lib_path: str) -> str | None:
    """Derive the CTK installation root from a resolved library path on Windows.

    Handles two CTK layouts:
    - CTK 13: ``$CTK_ROOT/bin/x64/foo.dll``
    - CTK 12: ``$CTK_ROOT/bin/foo.dll``

    Returns None if the path doesn't match a recognized layout.

    Uses ``ntpath`` explicitly so the function is testable on any platform.
    """
    import ntpath

    lib_dir = ntpath.dirname(resolved_lib_path)
    basename = ntpath.basename(lib_dir).lower()
    if basename == "x64":
        parent = ntpath.dirname(lib_dir)
        if ntpath.basename(parent).lower() == "bin":
            return ntpath.dirname(parent)
    elif basename == "bin":
        return ntpath.dirname(lib_dir)
    return None


def derive_ctk_root(resolved_lib_path: str) -> str | None:
    """Derive the CTK installation root from a resolved library path.

    Given the absolute path of a loaded CTK shared library, walk up the
    directory tree to find the CTK root. Returns None if the path doesn't
    match any recognized CTK directory layout.
    """
    if IS_WINDOWS:
        return _derive_ctk_root_windows(resolved_lib_path)
    return _derive_ctk_root_linux(resolved_lib_path)
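# Illustrative examples (hypothetical install prefixes, not part of this diff):
#   _derive_ctk_root_linux("/usr/local/cuda-13.1/lib64/libcudart.so.13.1.80")
#       -> "/usr/local/cuda-13.1"
#   _derive_ctk_root_windows(r"C:\CUDA\v13.1\bin\x64\cudart64_13.dll")
#       -> r"C:\CUDA\v13.1"
#   _derive_ctk_root_linux("/opt/foo/libbar.so")
#       -> None  (parent dir is neither "lib64" nor "lib")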


class _FindNvidiaDynamicLib:
    def __init__(self, libname: str):
        self.libname = libname
@@ -185,6 +236,16 @@ def try_with_conda_prefix(self) -> str | None:
    def try_with_cuda_home(self) -> str | None:
        return self._find_using_lib_dir(_find_lib_dir_using_cuda_home(self.libname))

    def try_via_ctk_root(self, ctk_root: str) -> str | None:
        """Find the library under a derived CTK root directory.

        Uses :func:`_find_lib_dir_using_anchor_point`, which already knows
        about non-standard sub-paths (e.g. ``nvvm/lib64`` for nvvm).
        """
        return self._find_using_lib_dir(
            _find_lib_dir_using_anchor_point(self.libname, anchor_point=ctk_root, linux_lib_dir="lib64")
        )
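    # Illustrative (hypothetical root): try_via_ctk_root("/usr/local/cuda-13.1")
    # would check e.g. /usr/local/cuda-13.1/lib64 and, for nvvm,
    # /usr/local/cuda-13.1/nvvm/lib64, via _find_lib_dir_using_anchor_point.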

    def _find_using_lib_dir(self, lib_dir: str | None) -> str | None:
        if lib_dir is None:
            return None
cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py
@@ -2,16 +2,22 @@
# SPDX-License-Identifier: Apache-2.0

import functools
+import json
import struct
import sys

-from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib
+from cuda.pathfinder._dynamic_libs.canary_probe_subprocess import probe_canary_abs_path_and_print_json
+from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import (
+    _FindNvidiaDynamicLib,
+    derive_ctk_root,
+)
from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL, load_dependencies
from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
    SUPPORTED_LINUX_SONAMES,
    SUPPORTED_WINDOWS_DLLS,
)
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
+from cuda.pathfinder._utils.spawned_process_runner import run_in_spawned_child_process

if IS_WINDOWS:
    from cuda.pathfinder._dynamic_libs.load_dl_windows import (
@@ -60,6 +66,66 @@ def _load_driver_lib_no_cache(libname: str) -> LoadedDL:
    )


# Libs that reside on the standard linker path in system CTK installs.
# Used to discover the CTK root when a lib with a non-standard path
# (e.g. nvvm under $CTK_ROOT/nvvm/lib64) can't be found directly.
_CTK_ROOT_CANARY_LIBNAMES = ("cudart",)


def _resolve_system_loaded_abs_path_in_subprocess(libname: str) -> str | None:
    """Resolve a library's system-search absolute path in a child process.

    This keeps any side-effects of loading the canary library scoped to the
    child process instead of polluting the current process.
    """
    try:
        result = run_in_spawned_child_process(
            probe_canary_abs_path_and_print_json,
            args=(libname,),
            timeout=10.0,
        )
    except (OSError, RuntimeError):
        return None
    if result.returncode != 0:
        return None

    # Read the final non-empty stdout line in case earlier lines are emitted.
    lines = [line for line in result.stdout.splitlines() if line.strip()]
    if not lines:
        return None
    try:
        payload = json.loads(lines[-1])
    except json.JSONDecodeError:
        return None
    if isinstance(payload, str):
        return payload
    return None


def _try_ctk_root_canary(finder: _FindNvidiaDynamicLib) -> str | None:
    """Derive the CTK root from a system-installed canary lib.

    For libs like nvvm whose shared object doesn't reside on the standard
    linker path, we locate a well-known CTK lib that IS on the linker path
    via system search, derive the CTK installation root from its resolved
    path, and then look for the target lib relative to that root.

    The canary load is performed in a subprocess to avoid introducing loader
    state into the current process.
    """
    for canary_libname in _CTK_ROOT_CANARY_LIBNAMES:
        canary_abs_path = _resolve_system_loaded_abs_path_in_subprocess(canary_libname)
        if canary_abs_path is None:
            continue
        ctk_root = derive_ctk_root(canary_abs_path)
        if ctk_root is None:
            continue
        abs_path: str | None = finder.try_via_ctk_root(ctk_root)
        if abs_path is not None:
            return abs_path
    return None
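# Illustrative end-to-end flow (hypothetical paths): the child process resolves
# the canary "cudart" to e.g. /usr/local/cuda-13.1/lib64/libcudart.so.13.1.80;
# derive_ctk_root() maps that to /usr/local/cuda-13.1; try_via_ctk_root() then
# locates e.g. /usr/local/cuda-13.1/nvvm/lib64/libnvvm.so.4.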


def _load_lib_no_cache(libname: str) -> LoadedDL:
    if libname in _DRIVER_ONLY_LIBNAMES:
        return _load_driver_lib_no_cache(libname)
@@ -90,11 +156,21 @@ def _load_lib_no_cache(libname: str) -> LoadedDL:
    loaded = load_with_system_search(libname)
    if loaded is not None:
        return loaded

    abs_path = finder.try_with_cuda_home()
-    if abs_path is None:
-        finder.raise_not_found_error()
-    else:
+    if abs_path is not None:
        found_via = "CUDA_HOME"
+    else:
+        # Canary probe: if the direct system search and CUDA_HOME both
+        # failed (e.g. nvvm isn't on the linker path and CUDA_HOME is
+        # unset), try to discover the CTK root by loading a well-known CTK
+        # lib in a subprocess, then look for the target lib relative to
+        # that root.
+        abs_path = _try_ctk_root_canary(finder)
+        if abs_path is not None:
+            found_via = "system-ctk-root"
+        else:
+            finder.raise_not_found_error()
Comment on lines +164 to +173
Member:
Sorry, I have been trapped in other fires and was unable to provide feedback in a timely manner 😢

Loading cudart in a subprocess is safer than loading it in the main process. That said, my recollection from the Feb-05 meeting was that we'd use other anchor points such as nvJitLink (basically, #1038). Does this mean we changed our mind a bit and decided to use cudart instead?

Contributor Author:

nvjitlink is fine to start with as well. I'm honestly not sure why one would be preferable over another. That said, as long as we're not choosing something super niche, it doesn't seem like it's worth spending too much time on and can be changed in a follow-up.

Member:

Thanks, Phillip. I do not have any objection.

Collaborator:

> my recollection from the Feb-05 meeting was that we'd use other anchor points such as nvJitLink (basically, #1038). Does this mean we changed our mind a bit and decided to use cudart instead?

The main idea behind #1038 is that the pivot library has two roles: 1. provide an anchor point from which to find other libraries (nvvm is the only case that needs it, I think), and 2. more importantly, limit the scope of future searches (via an object that remembers the pivot library).

This PR doesn't have the concept of a search scope; it's essentially only a trick to find nvvm independently, like any other independently found library.

I was a bit surprised when Phillip sent this PR, but I think it's useful, because I believe we'll have the independent and scoped searches side by side indefinitely. This PR makes #1038 less important; its remaining value would be mainly safety / internal consistency, and guiding users to a consistent setup via helpful error messages.

Re cudart vs nvjitlink:

For the scoped search, the choice of the pivot library is up to the user.

For the independent search (in an isolated process), cudart is the better choice, because it's much smaller than nvjitlink:

740 vs 96964 KiB

smc120-0009.ipp2a2.colossus.nvidia.com:/usr/local/cuda-13.1/lib64 $ (for so in *.so; do ls -s $(realpath $so); done) | sort -n
32 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libOpenCL.so.1.0.0
40 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvtx3interop.so.1.1.0
48 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcufile_rdma.so.1.16.1
400 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcuobjclient.so.1.0.0
724 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvblas.so.13.2.1.1
740 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcudart.so.13.1.80
972 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcufftw.so.12.1.0.78
1636 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppc.so.13.0.3.3
1640 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppisu.so.13.0.3.3
2424 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvfatbin.so.13.1.115
3380 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcufile.so.1.16.1
4236 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppitc.so.13.0.3.3
4284 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvrtc-builtins.so.13.1.115
5784 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvjpeg.so.13.0.3.75
6592 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppim.so.13.0.3.3
6836 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppicc.so.13.0.3.3
8456 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppidei.so.13.0.3.3
10060 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnpps.so.13.0.3.3
13112 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppial.so.13.0.3.3
25848 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppig.so.13.0.3.3
26332 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppist.so.13.0.3.3
52924 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcublas.so.13.2.1.1
58192 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnppif.so.13.0.3.3
96964 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvJitLink.so.13.1.115
101472 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcusolverMg.so.12.0.9.81
111500 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libnvrtc.so.13.1.115
129928 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcurand.so.10.4.1.81
139108 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcusolver.so.12.0.9.81
165772 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcusparse.so.12.7.3.1
292796 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcufft.so.12.1.0.78
490832 /usr/local/cuda-13.1/targets/x86_64-linux/lib/libcublasLt.so.13.2.1.1


    return load_with_abs_path(libname, abs_path, found_via)

@@ -164,6 +240,14 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:

- If set, use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).

5. **CTK root canary probe**

- For libraries whose shared object doesn't reside on the standard
linker path (e.g. ``libnvvm.so`` lives under ``$CTK_ROOT/nvvm/lib64``),
attempt to discover the CTK installation root by system-loading a
well-known CTK library (``cudart``) in a subprocess, then derive
the root from its resolved absolute path.

**Driver libraries** (``"cuda"``, ``"nvml"``):

These are part of the NVIDIA display driver (not the CUDA Toolkit) and
133 changes: 133 additions & 0 deletions cuda_pathfinder/cuda/pathfinder/_utils/spawned_process_runner.py
@@ -0,0 +1,133 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import multiprocessing
import queue # for Empty
import sys
import traceback
from collections.abc import Callable, Sequence
from dataclasses import dataclass
from io import StringIO
from typing import Any

PROCESS_KILLED = -9
PROCESS_NO_RESULT = -999


# Similar to https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess
# (args, check_returncode() are intentionally not supported here.)
@dataclass
class CompletedProcess:
    returncode: int
    stdout: str
    stderr: str


class ChildProcessWrapper:
    def __init__(
        self,
        result_queue: Any,
        target: Callable[..., None],
        args: Sequence[Any] | None,
        kwargs: dict[str, Any] | None,
    ) -> None:
        self.target = target
        self.args = () if args is None else args
        self.kwargs = {} if kwargs is None else kwargs
        self.result_queue = result_queue

    def __call__(self) -> None:
        # Capture stdout/stderr in the child so the parent can collect them.
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = StringIO()
        sys.stderr = StringIO()

        try:
            self.target(*self.args, **self.kwargs)
            returncode = 0
        except SystemExit as e:  # Handle sys.exit()
            returncode = e.code if isinstance(e.code, int) else 0
        except BaseException:
            traceback.print_exc()
            returncode = 1
        finally:
            # Collect outputs and restore streams
            stdout = sys.stdout.getvalue()
            stderr = sys.stderr.getvalue()
            sys.stdout = old_stdout
            sys.stderr = old_stderr
            try:  # noqa: SIM105
                self.result_queue.put((returncode, stdout, stderr))
            except Exception:  # noqa: S110
                # If the queue is broken (e.g., parent gone), silently drop the result.
                pass


def run_in_spawned_child_process(
    target: Callable[..., None],
    *,
    args: Sequence[Any] | None = None,
    kwargs: dict[str, Any] | None = None,
    timeout: float | None = None,
    rethrow: bool = False,
) -> CompletedProcess:
    """Run `target` in a spawned child process, capturing stdout/stderr.

    The provided `target` must be defined at the top level of a module, and must
    be importable in the spawned child process. Lambdas, closures, or interactively
    defined functions (e.g., in Jupyter notebooks) will not work.

    If `rethrow=True` and the child process exits with a nonzero code,
    raises ChildProcessError with the captured stderr.
    """
    ctx = multiprocessing.get_context("spawn")
    result_queue = ctx.Queue()
    process = ctx.Process(target=ChildProcessWrapper(result_queue, target, args, kwargs))
    process.start()
process.start()

    try:
        process.join(timeout)
        if process.is_alive():
            process.terminate()
            process.join()
            result = CompletedProcess(
                returncode=PROCESS_KILLED,
                stdout="",
                stderr=f"Process timed out after {timeout} seconds and was terminated.",
            )
        else:
            try:
                returncode, stdout, stderr = result_queue.get(timeout=1.0)
            except (queue.Empty, EOFError):
                result = CompletedProcess(
                    returncode=PROCESS_NO_RESULT,
                    stdout="",
                    stderr="Process exited or crashed before returning results.",
                )
            else:
                result = CompletedProcess(
                    returncode=returncode,
                    stdout=stdout,
                    stderr=stderr,
                )

        if rethrow and result.returncode != 0:
            raise ChildProcessError(
                f"Child process exited with code {result.returncode}.\n"
                "--- stderr-from-child-process ---\n"
                f"{result.stderr}"
                "<end-of-stderr-from-child-process>\n"
            )

        return result

    finally:
        try:
            result_queue.close()
            result_queue.join_thread()
        except Exception:  # noqa: S110
            pass
        if process.is_alive():
            process.kill()
            process.join()
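A minimal usage sketch for run_in_spawned_child_process (the target function
_hello is hypothetical; per the docstring it must live at the top level of an
importable module so the spawned child can find it):

    # In some importable module:
    def _hello(name: str) -> None:
        print(f"hello {name}")

    result = run_in_spawned_child_process(_hello, args=("world",), timeout=5.0)
    assert result.returncode == 0
    assert result.stdout == "hello world\n"
    assert result.stderr == ""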