diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/canary_probe_subprocess.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/canary_probe_subprocess.py
new file mode 100644
index 0000000000..902b57d6e4
--- /dev/null
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/canary_probe_subprocess.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import sys
+
+from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL
+from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
+
+if IS_WINDOWS:
+    from cuda.pathfinder._dynamic_libs.load_dl_windows import load_with_system_search
+else:
+    from cuda.pathfinder._dynamic_libs.load_dl_linux import load_with_system_search
+
+
+def _probe_canary_abs_path(libname: str) -> str | None:
+    loaded: LoadedDL | None = load_with_system_search(libname)
+    if loaded is None:
+        return None
+    abs_path = loaded.abs_path
+    if not isinstance(abs_path, str):
+        return None
+    return abs_path
+
+
+def probe_canary_abs_path_and_print_json(libname: str) -> None:
+    print(json.dumps(_probe_canary_abs_path(libname)))  # noqa: T201
+
+
+def main(argv: list[str] | None = None) -> int:
+    args = sys.argv[1:] if argv is None else argv
+    if len(args) != 1:
+        return 2
+    probe_canary_abs_path_and_print_json(args[0])
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py
index 65c9f4bf3c..6265992e4a 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/find_nvidia_dynamic_lib.py
@@ -101,7 +101,7 @@ def _find_lib_dir_using_anchor_point(libname: str, anchor_point: str, linux_lib_
     for rel_path in rel_paths:
         for dirname in sorted(glob.glob(os.path.join(anchor_point, rel_path))):
             if os.path.isdir(dirname):
-                return dirname
+                return os.path.normpath(dirname)
     return None
 
 
@@ -152,6 +152,57 @@ def _find_dll_using_lib_dir(
     return None
 
 
+def _derive_ctk_root_linux(resolved_lib_path: str) -> str | None:
+    """Derive the CTK installation root from a resolved library path on Linux.
+
+    Standard system CTK layout: ``$CTK_ROOT/lib64/libfoo.so.XX``
+    (some installs use ``lib`` instead of ``lib64``).
+
+    Returns None if the path doesn't match a recognized layout.
+    """
+    lib_dir = os.path.dirname(resolved_lib_path)
+    basename = os.path.basename(lib_dir)
+    if basename in ("lib64", "lib"):
+        return os.path.dirname(lib_dir)
+    return None
+
+
+def _derive_ctk_root_windows(resolved_lib_path: str) -> str | None:
+    """Derive the CTK installation root from a resolved library path on Windows.
+
+    Handles two CTK layouts:
+    - CTK 13: ``$CTK_ROOT/bin/x64/foo.dll``
+    - CTK 12: ``$CTK_ROOT/bin/foo.dll``
+
+    Returns None if the path doesn't match a recognized layout.
+
+    Uses ``ntpath`` explicitly so the function is testable on any platform.
+    """
+    import ntpath
+
+    lib_dir = ntpath.dirname(resolved_lib_path)
+    basename = ntpath.basename(lib_dir).lower()
+    if basename == "x64":
+        parent = ntpath.dirname(lib_dir)
+        if ntpath.basename(parent).lower() == "bin":
+            return ntpath.dirname(parent)
+    elif basename == "bin":
+        return ntpath.dirname(lib_dir)
+    return None
+
+
+def derive_ctk_root(resolved_lib_path: str) -> str | None:
+    """Derive the CTK installation root from a resolved library path.
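+
+    For example (illustrative path), ``/usr/local/cuda-13/lib64/libcudart.so.13``
+    yields ``/usr/local/cuda-13``.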
+
+    Given the absolute path of a loaded CTK shared library, walk up the
+    directory tree to find the CTK root. Returns None if the path doesn't
+    match any recognized CTK directory layout.
+    """
+    if IS_WINDOWS:
+        return _derive_ctk_root_windows(resolved_lib_path)
+    return _derive_ctk_root_linux(resolved_lib_path)
+
+
 class _FindNvidiaDynamicLib:
     def __init__(self, libname: str):
         self.libname = libname
@@ -185,6 +236,16 @@ def try_with_conda_prefix(self) -> str | None:
     def try_with_cuda_home(self) -> str | None:
         return self._find_using_lib_dir(_find_lib_dir_using_cuda_home(self.libname))
 
+    def try_via_ctk_root(self, ctk_root: str) -> str | None:
+        """Find the library under a derived CTK root directory.
+
+        Uses :func:`_find_lib_dir_using_anchor_point` which already knows
+        about non-standard sub-paths (e.g. ``nvvm/lib64`` for nvvm).
+        """
+        return self._find_using_lib_dir(
+            _find_lib_dir_using_anchor_point(self.libname, anchor_point=ctk_root, linux_lib_dir="lib64")
+        )
+
     def _find_using_lib_dir(self, lib_dir: str | None) -> str | None:
         if lib_dir is None:
             return None
diff --git a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py
index 8de2a5511e..1597a5b8b8 100644
--- a/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py
+++ b/cuda_pathfinder/cuda/pathfinder/_dynamic_libs/load_nvidia_dynamic_lib.py
@@ -2,16 +2,22 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import functools
+import json
 import struct
 import sys
 
-from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import _FindNvidiaDynamicLib
+from cuda.pathfinder._dynamic_libs.canary_probe_subprocess import probe_canary_abs_path_and_print_json
+from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import (
+    _FindNvidiaDynamicLib,
+    derive_ctk_root,
+)
 from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL, load_dependencies
 from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
     SUPPORTED_LINUX_SONAMES,
     SUPPORTED_WINDOWS_DLLS,
 )
 from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
+from cuda.pathfinder._utils.spawned_process_runner import run_in_spawned_child_process
 
 if IS_WINDOWS:
     from cuda.pathfinder._dynamic_libs.load_dl_windows import (
@@ -60,6 +66,66 @@ def _load_driver_lib_no_cache(libname: str) -> LoadedDL:
     )
 
 
+# Libs that reside on the standard linker path in system CTK installs.
+# Used to discover the CTK root when a lib with a non-standard path
+# (e.g. nvvm under $CTK_ROOT/nvvm/lib64) can't be found directly.
+_CTK_ROOT_CANARY_LIBNAMES = ("cudart",)
+
+
+def _resolve_system_loaded_abs_path_in_subprocess(libname: str) -> str | None:
+    """Resolve a library's system-search absolute path in a child process.
+
+    This keeps any side-effects of loading the canary library scoped to the
+    child process instead of polluting the current process.
+    """
+    try:
+        result = run_in_spawned_child_process(
+            probe_canary_abs_path_and_print_json,
+            args=(libname,),
+            timeout=10.0,
+        )
+    except (OSError, RuntimeError):
+        return None
+    if result.returncode != 0:
+        return None
+
+    # Read the final non-empty stdout line in case earlier lines are emitted.
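+    # (With default arguments, ``json.dumps`` emits a single line, so the
+    # payload is always the last line; any earlier lines would be stray
+    # output from import-time side effects in the child.)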
+    lines = [line for line in result.stdout.splitlines() if line.strip()]
+    if not lines:
+        return None
+    try:
+        payload = json.loads(lines[-1])
+    except json.JSONDecodeError:
+        return None
+    if isinstance(payload, str):
+        return payload
+    return None
+
+
+def _try_ctk_root_canary(finder: _FindNvidiaDynamicLib) -> str | None:
+    """Derive the CTK root from a system-installed canary lib.
+
+    For libs like nvvm whose shared object doesn't reside on the standard
+    linker path, we locate a well-known CTK lib that IS on the linker path
+    via system search, derive the CTK installation root from its resolved
+    path, and then look for the target lib relative to that root.
+
+    The canary load is performed in a subprocess to avoid introducing loader
+    state into the current process.
+    """
+    for canary_libname in _CTK_ROOT_CANARY_LIBNAMES:
+        canary_abs_path = _resolve_system_loaded_abs_path_in_subprocess(canary_libname)
+        if canary_abs_path is None:
+            continue
+        ctk_root = derive_ctk_root(canary_abs_path)
+        if ctk_root is None:
+            continue
+        abs_path: str | None = finder.try_via_ctk_root(ctk_root)
+        if abs_path is not None:
+            return abs_path
+    return None
+
+
 def _load_lib_no_cache(libname: str) -> LoadedDL:
     if libname in _DRIVER_ONLY_LIBNAMES:
         return _load_driver_lib_no_cache(libname)
@@ -90,11 +156,21 @@ def _load_lib_no_cache(libname: str) -> LoadedDL:
     loaded = load_with_system_search(libname)
     if loaded is not None:
         return loaded
+
     abs_path = finder.try_with_cuda_home()
-    if abs_path is None:
-        finder.raise_not_found_error()
-    else:
+    if abs_path is not None:
         found_via = "CUDA_HOME"
+    else:
+        # Canary probe: if the direct system search and CUDA_HOME both
+        # failed (e.g. nvvm isn't on the linker path and CUDA_HOME is
+        # unset), try to discover the CTK root by loading a well-known CTK
+        # lib in a subprocess, then look for the target lib relative to
+        # that root.
+        abs_path = _try_ctk_root_canary(finder)
+        if abs_path is not None:
+            found_via = "system-ctk-root"
+        else:
+            finder.raise_not_found_error()
 
     return load_with_abs_path(libname, abs_path, found_via)
 
@@ -164,6 +240,14 @@ def load_nvidia_dynamic_lib(libname: str) -> LoadedDL:
 
        - If set, use ``CUDA_HOME`` or ``CUDA_PATH`` (in that order).
 
+    5. **CTK root canary probe**
+
+       - For libraries whose shared object doesn't reside on the standard
+         linker path (e.g. ``libnvvm.so`` lives under ``$CTK_ROOT/nvvm/lib64``),
+         attempt to discover the CTK installation root by system-loading a
+         well-known CTK library (``cudart``) in a subprocess, then derive
+         the root from its resolved absolute path.
+
     **Driver libraries** (``"cuda"``, ``"nvml"``):
 
     These are part of the NVIDIA display driver (not the CUDA Toolkit) and
diff --git a/cuda_pathfinder/cuda/pathfinder/_utils/spawned_process_runner.py b/cuda_pathfinder/cuda/pathfinder/_utils/spawned_process_runner.py
new file mode 100644
index 0000000000..cba0390861
--- /dev/null
+++ b/cuda_pathfinder/cuda/pathfinder/_utils/spawned_process_runner.py
@@ -0,0 +1,133 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import multiprocessing
+import queue  # for Empty
+import sys
+import traceback
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass
+from io import StringIO
+from typing import Any
+
+PROCESS_KILLED = -9
+PROCESS_NO_RESULT = -999
+
+
+# Similar to https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess
+# (args, check_returncode() are intentionally not supported here.)
+@dataclass
+class CompletedProcess:
+    returncode: int
+    stdout: str
+    stderr: str
+
+
+class ChildProcessWrapper:
+    def __init__(
+        self,
+        result_queue: Any,
+        target: Callable[..., None],
+        args: Sequence[Any] | None,
+        kwargs: dict[str, Any] | None,
+    ) -> None:
+        self.target = target
+        self.args = () if args is None else args
+        self.kwargs = {} if kwargs is None else kwargs
+        self.result_queue = result_queue
+
+    def __call__(self) -> None:
+        # Capture stdout/stderr
+        old_stdout = sys.stdout
+        old_stderr = sys.stderr
+        sys.stdout = StringIO()
+        sys.stderr = StringIO()
+
+        try:
+            self.target(*self.args, **self.kwargs)
+            returncode = 0
+        except SystemExit as e:  # Handle sys.exit()
+            returncode = e.code if isinstance(e.code, int) else 0
+        except BaseException:
+            traceback.print_exc()
+            returncode = 1
+        finally:
+            # Collect outputs and restore streams
+            stdout = sys.stdout.getvalue()
+            stderr = sys.stderr.getvalue()
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
+            try:  # noqa: SIM105
+                self.result_queue.put((returncode, stdout, stderr))
+            except Exception:  # noqa: S110
+                # If the queue is broken (e.g., parent gone), drop the result (best effort).
+                pass
+
+
+def run_in_spawned_child_process(
+    target: Callable[..., None],
+    *,
+    args: Sequence[Any] | None = None,
+    kwargs: dict[str, Any] | None = None,
+    timeout: float | None = None,
+    rethrow: bool = False,
+) -> CompletedProcess:
+    """Run `target` in a spawned child process, capturing stdout/stderr.
+
+    The provided `target` must be defined at the top level of a module, and must
+    be importable in the spawned child process. Lambdas, closures, or interactively
+    defined functions (e.g., in Jupyter notebooks) will not work.
+
+    If `rethrow=True` and the child process exits with a nonzero code,
+    raises ChildProcessError with the captured stderr.
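+
+    Illustrative usage (``_probe`` is a hypothetical module-level function)::
+
+        def _probe(libname: str) -> None:
+            print(libname.upper())
+
+        result = run_in_spawned_child_process(_probe, args=("cudart",), timeout=5.0)
+        assert result.returncode == 0
+        assert result.stdout.strip() == "CUDART"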
+ """ + ctx = multiprocessing.get_context("spawn") + result_queue = ctx.Queue() + process = ctx.Process(target=ChildProcessWrapper(result_queue, target, args, kwargs)) + process.start() + + try: + process.join(timeout) + if process.is_alive(): + process.terminate() + process.join() + result = CompletedProcess( + returncode=PROCESS_KILLED, + stdout="", + stderr=f"Process timed out after {timeout} seconds and was terminated.", + ) + else: + try: + returncode, stdout, stderr = result_queue.get(timeout=1.0) + except (queue.Empty, EOFError): + result = CompletedProcess( + returncode=PROCESS_NO_RESULT, + stdout="", + stderr="Process exited or crashed before returning results.", + ) + else: + result = CompletedProcess( + returncode=returncode, + stdout=stdout, + stderr=stderr, + ) + + if rethrow and result.returncode != 0: + raise ChildProcessError( + f"Child process exited with code {result.returncode}.\n" + "--- stderr-from-child-process ---\n" + f"{result.stderr}" + "\n" + ) + + return result + + finally: + try: + result_queue.close() + result_queue.join_thread() + except Exception: # noqa: S110 + pass + if process.is_alive(): + process.kill() + process.join() diff --git a/cuda_pathfinder/tests/spawned_process_runner.py b/cuda_pathfinder/tests/spawned_process_runner.py index f4440743f5..ac0418445c 100644 --- a/cuda_pathfinder/tests/spawned_process_runner.py +++ b/cuda_pathfinder/tests/spawned_process_runner.py @@ -1,127 +1,6 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import multiprocessing -import queue # for Empty -import sys -import traceback -from collections.abc import Callable, Sequence -from dataclasses import dataclass -from io import StringIO -from typing import Any +from cuda.pathfinder._utils.spawned_process_runner import run_in_spawned_child_process -PROCESS_KILLED = -9 -PROCESS_NO_RESULT = -999 - - -# Similar to https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess -# (args, check_returncode() are intentionally not supported here.) 
-@dataclass
-class CompletedProcess:
-    returncode: int
-    stdout: str
-    stderr: str
-
-
-class ChildProcessWrapper:
-    def __init__(self, result_queue, target, args, kwargs):
-        self.target = target
-        self.args = () if args is None else args
-        self.kwargs = {} if kwargs is None else kwargs
-        self.result_queue = result_queue
-
-    def __call__(self):
-        # Capture stdout/stderr
-        old_stdout = sys.stdout
-        old_stderr = sys.stderr
-        sys.stdout = StringIO()
-        sys.stderr = StringIO()
-
-        try:
-            self.target(*self.args, **self.kwargs)
-            returncode = 0
-        except SystemExit as e:  # Handle sys.exit()
-            returncode = e.code if isinstance(e.code, int) else 0
-        except BaseException:
-            traceback.print_exc()
-            returncode = 1
-        finally:
-            # Collect outputs and restore streams
-            stdout = sys.stdout.getvalue()
-            stderr = sys.stderr.getvalue()
-            sys.stdout = old_stdout
-            sys.stderr = old_stderr
-            try:  # noqa: SIM105
-                self.result_queue.put((returncode, stdout, stderr))
-            except Exception:  # noqa: S110
-                # If the queue is broken (e.g., parent gone), best effort logging
-                pass
-
-
-def run_in_spawned_child_process(
-    target: Callable[..., None],
-    *,
-    args: Sequence[Any] | None = None,
-    kwargs: dict[str, Any] | None = None,
-    timeout: float | None = None,
-    rethrow: bool = False,
-) -> CompletedProcess:
-    """Run `target` in a spawned child process, capturing stdout/stderr.
-
-    The provided `target` must be defined at the top level of a module, and must
-    be importable in the spawned child process. Lambdas, closures, or interactively
-    defined functions (e.g., in Jupyter notebooks) will not work.
-
-    If `rethrow=True` and the child process exits with a nonzero code,
-    raises ChildProcessError with the captured stderr.
-    """
-    ctx = multiprocessing.get_context("spawn")
-    result_queue = ctx.Queue()
-    process = ctx.Process(target=ChildProcessWrapper(result_queue, target, args, kwargs))
-    process.start()
-
-    try:
-        process.join(timeout)
-        if process.is_alive():
-            process.terminate()
-            process.join()
-            result = CompletedProcess(
-                returncode=PROCESS_KILLED,
-                stdout="",
-                stderr=f"Process timed out after {timeout} seconds and was terminated.",
-            )
-        else:
-            try:
-                returncode, stdout, stderr = result_queue.get(timeout=1.0)
-            except (queue.Empty, EOFError):
-                result = CompletedProcess(
-                    returncode=PROCESS_NO_RESULT,
-                    stdout="",
-                    stderr="Process exited or crashed before returning results.",
-                )
-            else:
-                result = CompletedProcess(
-                    returncode=returncode,
-                    stdout=stdout,
-                    stderr=stderr,
-                )
-
-        if rethrow and result.returncode != 0:
-            raise ChildProcessError(
-                f"Child process exited with code {result.returncode}.\n"
-                "--- stderr-from-child-process ---\n"
-                f"{result.stderr}"
-                "\n"
-            )
-
-        return result
-
-    finally:
-        try:
-            result_queue.close()
-            result_queue.join_thread()
-        except Exception:  # noqa: S110
-            pass
-        if process.is_alive():
-            process.kill()
-            process.join()
+__all__ = ["run_in_spawned_child_process"]
diff --git a/cuda_pathfinder/tests/test_ctk_root_discovery.py b/cuda_pathfinder/tests/test_ctk_root_discovery.py
new file mode 100644
index 0000000000..71a61c86c0
--- /dev/null
+++ b/cuda_pathfinder/tests/test_ctk_root_discovery.py
@@ -0,0 +1,276 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+
+import pytest
+
+from cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib import (
+    _derive_ctk_root_linux,
+    _derive_ctk_root_windows,
+    _FindNvidiaDynamicLib,
+    derive_ctk_root,
+)
+from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL
+from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import (
+    _load_lib_no_cache,
+    _try_ctk_root_canary,
+)
+from cuda.pathfinder._utils.platform_aware import IS_WINDOWS
+
+_MODULE = "cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib"
+_FIND_MODULE = "cuda.pathfinder._dynamic_libs.find_nvidia_dynamic_lib"
+
+
+# ---------------------------------------------------------------------------
+# Platform-aware test helpers
+# ---------------------------------------------------------------------------
+
+
+def _create_nvvm_in_ctk(ctk_root):
+    """Create a fake nvvm lib in the platform-appropriate CTK subdirectory."""
+    if IS_WINDOWS:
+        nvvm_dir = ctk_root / "nvvm" / "bin"
+        nvvm_dir.mkdir(parents=True)
+        nvvm_lib = nvvm_dir / "nvvm64.dll"
+    else:
+        nvvm_dir = ctk_root / "nvvm" / "lib64"
+        nvvm_dir.mkdir(parents=True)
+        nvvm_lib = nvvm_dir / "libnvvm.so"
+    nvvm_lib.write_bytes(b"fake")
+    return nvvm_lib
+
+
+def _create_cudart_in_ctk(ctk_root):
+    """Create a fake cudart lib in the platform-appropriate CTK subdirectory."""
+    if IS_WINDOWS:
+        lib_dir = ctk_root / "bin"
+        lib_dir.mkdir(parents=True)
+        lib_file = lib_dir / "cudart64_12.dll"
+    else:
+        lib_dir = ctk_root / "lib64"
+        lib_dir.mkdir(parents=True)
+        lib_file = lib_dir / "libcudart.so"
+    lib_file.write_bytes(b"fake")
+    return lib_file
+
+
+def _fake_canary_path(ctk_root):
+    """Return the path a system-loaded canary lib would resolve to."""
+    if IS_WINDOWS:
+        return str(ctk_root / "bin" / "cudart64_13.dll")
+    return str(ctk_root / "lib64" / "libcudart.so.13")
+
+
+# ---------------------------------------------------------------------------
+# derive_ctk_root
+# ---------------------------------------------------------------------------
+
+
+def test_derive_ctk_root_linux_lib64():
+    assert _derive_ctk_root_linux("/usr/local/cuda-13/lib64/libcudart.so.13") == "/usr/local/cuda-13"
+
+
+def test_derive_ctk_root_linux_lib():
+    assert _derive_ctk_root_linux("/opt/cuda/lib/libcudart.so.12") == "/opt/cuda"
+
+
+def test_derive_ctk_root_linux_unrecognized():
+    assert _derive_ctk_root_linux("/some/weird/path/libcudart.so.13") is None
+
+
+def test_derive_ctk_root_linux_root_level():
+    assert _derive_ctk_root_linux("/lib64/libcudart.so.13") == "/"
+
+
+def test_derive_ctk_root_windows_ctk13():
+    path = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\x64\cudart64_13.dll"
+    assert _derive_ctk_root_windows(path) == r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0"
+
+
+def test_derive_ctk_root_windows_ctk12():
+    path = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\cudart64_12.dll"
+    assert _derive_ctk_root_windows(path) == r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8"
+
+
+def test_derive_ctk_root_windows_unrecognized():
+    assert _derive_ctk_root_windows(r"C:\weird\cudart64_13.dll") is None
+
+
+def test_derive_ctk_root_windows_case_insensitive_bin():
+    assert _derive_ctk_root_windows(r"C:\CUDA\Bin\cudart64_12.dll") == r"C:\CUDA"
+
+
+def test_derive_ctk_root_windows_case_insensitive_x64():
+    assert _derive_ctk_root_windows(r"C:\CUDA\BIN\X64\cudart64_13.dll") == r"C:\CUDA"
+
+
+def test_derive_ctk_root_dispatches_to_linux(mocker):
+    mocker.patch(f"{_FIND_MODULE}.IS_WINDOWS", False)
+    assert derive_ctk_root("/usr/local/cuda/lib64/libcudart.so.13") == "/usr/local/cuda"
derive_ctk_root("/usr/local/cuda/lib64/libcudart.so.13") == "/usr/local/cuda" + + +def test_derive_ctk_root_dispatches_to_windows(mocker): + mocker.patch(f"{_FIND_MODULE}.IS_WINDOWS", True) + assert derive_ctk_root(r"C:\CUDA\v13\bin\cudart64_13.dll") == r"C:\CUDA\v13" + + +# --------------------------------------------------------------------------- +# _FindNvidiaDynamicLib.try_via_ctk_root +# --------------------------------------------------------------------------- + + +def test_try_via_ctk_root_finds_nvvm(tmp_path): + ctk_root = tmp_path / "cuda-13" + nvvm_lib = _create_nvvm_in_ctk(ctk_root) + + assert _FindNvidiaDynamicLib("nvvm").try_via_ctk_root(str(ctk_root)) == str(nvvm_lib) + + +def test_try_via_ctk_root_returns_none_when_dir_missing(tmp_path): + ctk_root = tmp_path / "cuda-13" + ctk_root.mkdir() + + assert _FindNvidiaDynamicLib("nvvm").try_via_ctk_root(str(ctk_root)) is None + + +def test_try_via_ctk_root_regular_lib(tmp_path): + ctk_root = tmp_path / "cuda-13" + cudart_lib = _create_cudart_in_ctk(ctk_root) + + assert _FindNvidiaDynamicLib("cudart").try_via_ctk_root(str(ctk_root)) == str(cudart_lib) + + +# --------------------------------------------------------------------------- +# _try_ctk_root_canary +# --------------------------------------------------------------------------- + + +def _make_loaded_dl(path, found_via): + return LoadedDL(path, False, 0xDEAD, found_via) + + +def test_canary_finds_nvvm(tmp_path, mocker): + ctk_root = tmp_path / "cuda-13" + _create_cudart_in_ctk(ctk_root) + nvvm_lib = _create_nvvm_in_ctk(ctk_root) + + probe = mocker.patch( + f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", + return_value=_fake_canary_path(ctk_root), + ) + parent_system_loader = mocker.patch(f"{_MODULE}.load_with_system_search") + + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) == str(nvvm_lib) + probe.assert_called_once_with("cudart") + parent_system_loader.assert_not_called() + + +def test_canary_returns_none_when_subprocess_probe_fails(mocker): + mocker.patch(f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", return_value=None) + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) is None + + +def test_canary_returns_none_when_ctk_root_unrecognized(mocker): + mocker.patch( + f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", + return_value="/weird/path/libcudart.so.13", + ) + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) is None + + +def test_canary_returns_none_when_nvvm_not_in_ctk_root(tmp_path, mocker): + ctk_root = tmp_path / "cuda-13" + # Create only the canary lib dir, not nvvm + _create_cudart_in_ctk(ctk_root) + + mocker.patch( + f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", + return_value=_fake_canary_path(ctk_root), + ) + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) is None + + +def test_canary_skips_when_abs_path_none(mocker): + mocker.patch(f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", return_value=None) + assert _try_ctk_root_canary(_FindNvidiaDynamicLib("nvvm")) is None + + +# --------------------------------------------------------------------------- +# _load_lib_no_cache search-order +# --------------------------------------------------------------------------- + + +@pytest.fixture +def _isolate_load_cascade(mocker): + """Disable the search steps that run before system-search in _load_lib_no_cache. + + This lets the ordering tests focus on system-search, CUDA_HOME, and the + canary probe without needing a real site-packages or conda environment. 
+ """ + # No wheels installed + mocker.patch.object(_FindNvidiaDynamicLib, "try_site_packages", return_value=None) + # No conda env + mocker.patch.object(_FindNvidiaDynamicLib, "try_with_conda_prefix", return_value=None) + # Lib not already loaded by another component + mocker.patch(f"{_MODULE}.check_if_already_loaded_from_elsewhere", return_value=None) + # Skip transitive dependency loading + mocker.patch(f"{_MODULE}.load_dependencies") + + +@pytest.mark.usefixtures("_isolate_load_cascade") +def test_cuda_home_takes_priority_over_canary(tmp_path, mocker): + # Two competing CTK roots: one from CUDA_HOME, one the canary would find. + cuda_home_root = tmp_path / "cuda-home" + nvvm_home_lib = _create_nvvm_in_ctk(cuda_home_root) + + canary_root = tmp_path / "cuda-system" + _create_cudart_in_ctk(canary_root) + _create_nvvm_in_ctk(canary_root) + + canary_mock = mocker.MagicMock(return_value=_fake_canary_path(canary_root)) + + # System search finds nothing for nvvm. + mocker.patch(f"{_MODULE}.load_with_system_search", return_value=None) + # Canary subprocess probe would find cudart if consulted. + mocker.patch(f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", side_effect=canary_mock) + # CUDA_HOME points to a separate root that also has nvvm + mocker.patch(f"{_FIND_MODULE}.get_cuda_home_or_path", return_value=str(cuda_home_root)) + # Capture the final load call + mocker.patch( + f"{_MODULE}.load_with_abs_path", + side_effect=lambda _libname, path, via: _make_loaded_dl(path, via), + ) + + result = _load_lib_no_cache("nvvm") + + # CUDA_HOME must win; the canary should never have been consulted + assert result.found_via == "CUDA_HOME" + assert result.abs_path == str(nvvm_home_lib) + canary_mock.assert_not_called() + + +@pytest.mark.usefixtures("_isolate_load_cascade") +def test_canary_fires_only_after_all_earlier_steps_fail(tmp_path, mocker): + canary_root = tmp_path / "cuda-system" + _create_cudart_in_ctk(canary_root) + nvvm_lib = _create_nvvm_in_ctk(canary_root) + + # System search: nvvm not on linker path. + mocker.patch(f"{_MODULE}.load_with_system_search", return_value=None) + # Canary subprocess probe finds cudart under a system CTK root. + mocker.patch( + f"{_MODULE}._resolve_system_loaded_abs_path_in_subprocess", + return_value=_fake_canary_path(canary_root), + ) + # No CUDA_HOME set + mocker.patch(f"{_FIND_MODULE}.get_cuda_home_or_path", return_value=None) + # Capture the final load call + mocker.patch( + f"{_MODULE}.load_with_abs_path", + side_effect=lambda _libname, path, via: _make_loaded_dl(path, via), + ) + + result = _load_lib_no_cache("nvvm") + + assert result.found_via == "system-ctk-root" + assert result.abs_path == str(nvvm_lib)