Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cuda_pathfinder/cuda/pathfinder/_testing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#!/usr/bin/env python
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently wondering: do we need this new subdirectory?

I'll play with this for a few minutes, hoping that we don't have to move the test-only code here.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gist of my exploration with Cursor:

We'll merge this PR now and plan a follow-on PR to unify the production and testing subprocess code paths.

Why unify later

  • Reduce duplication between the canary probe subprocess and the test-only
    load subprocess entrypoint.
  • Share a single JSON response protocol for success, not-found, and errors.
  • Centralize subprocess invocation, timeout handling, and stderr formatting.
  • Keep production and tests aligned as future behavior evolves.
  • Make it easier to reason about, test, and maintain subprocess behavior.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is the result of continuing the exploration with Cursor: #1779

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

from __future__ import annotations

import json
import os
import sys
import traceback
from collections.abc import Sequence

# Sentinel line written to stdout by
# probe_load_nvidia_dynamic_lib_and_print_json() when the requested library
# cannot be found, in place of the JSON-encoded path written on success.
DYNAMIC_LIB_NOT_FOUND_MARKER = "CHILD_LOAD_NVIDIA_DYNAMIC_LIB_HELPER_DYNAMIC_LIB_NOT_FOUND_ERROR:"


def _validate_abs_path(abs_path: str) -> None:
    """Check that ``abs_path`` is a non-empty absolute path to an existing file.

    The checks raise ``AssertionError`` explicitly instead of relying on
    ``assert`` statements, so they are still enforced when the interpreter
    runs with ``-O`` / ``PYTHONOPTIMIZE`` (which strips ``assert``).

    Args:
        abs_path: Path to check.

    Raises:
        AssertionError: If ``abs_path`` is empty, is not absolute, or does not
            name an existing file.
    """
    if not abs_path:
        raise AssertionError(f"empty path: {abs_path=!r}")
    if not os.path.isabs(abs_path):
        raise AssertionError(f"not absolute: {abs_path=!r}")
    if not os.path.isfile(abs_path):
        raise AssertionError(f"not a file: {abs_path=!r}")


def _load_nvidia_dynamic_lib_for_test(libname: str) -> str:
    """Load ``libname`` through several code paths and cross-check the results.

    The library is loaded twice via ``load_nvidia_dynamic_lib`` (the second
    call must return the very same ``LoadedDL`` object as the first) and once
    more via ``_load_lib_no_cache``; both paths must resolve to the same file.

    Args:
        libname: Library name passed to the loader functions.

    Returns:
        The absolute path reported by the first (fresh) load.

    Raises:
        RuntimeError: If a consistency check between the loads fails, or a
            ``LoadedDL`` does not carry a string ``abs_path``.
        AssertionError: If a reported path is rejected by
            ``_validate_abs_path`` or the fresh load has no ``found_via``.

    Exceptions raised by the loader functions themselves are not caught here.
    """
    # Keep imports inside the subprocess body so startup stays focused on the
    # code under test rather than the parent test module.
    from cuda.pathfinder import load_nvidia_dynamic_lib
    from cuda.pathfinder._dynamic_libs.load_dl_common import LoadedDL
    from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import _load_lib_no_cache
    from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
        SUPPORTED_LINUX_SONAMES,
        SUPPORTED_WINDOWS_DLLS,
    )
    from cuda.pathfinder._utils.platform_aware import IS_WINDOWS

    def require_abs_path(loaded_dl: LoadedDL) -> str:
        """Return ``loaded_dl.abs_path`` after checking it is a valid path string."""
        abs_path = loaded_dl.abs_path
        if not isinstance(abs_path, str):
            raise RuntimeError(f"loaded dynamic library is missing abs_path: {loaded_dl!r}")
        _validate_abs_path(abs_path)
        return abs_path

    # First load: must not be reported as already loaded from elsewhere.
    loaded_dl_fresh: LoadedDL = load_nvidia_dynamic_lib(libname)
    if loaded_dl_fresh.was_already_loaded_from_elsewhere:
        raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")

    fresh_abs_path = require_abs_path(loaded_dl_fresh)
    # Raise explicitly rather than `assert` so the check survives `python -O`.
    if loaded_dl_fresh.found_via is None:
        raise AssertionError("loaded_dl_fresh.found_via is None")

    # Second load through the public API must hand back the identical object.
    loaded_dl_from_cache: LoadedDL = load_nvidia_dynamic_lib(libname)
    if loaded_dl_from_cache is not loaded_dl_fresh:
        raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")

    # Loading again without the cache: for libraries listed in the platform's
    # supported table the result must be flagged as already loaded, and in all
    # cases it must point at the same file as the fresh load.
    loaded_dl_no_cache: LoadedDL = _load_lib_no_cache(libname)
    no_cache_abs_path = require_abs_path(loaded_dl_no_cache)
    supported_libs = SUPPORTED_WINDOWS_DLLS if IS_WINDOWS else SUPPORTED_LINUX_SONAMES
    if not loaded_dl_no_cache.was_already_loaded_from_elsewhere and libname in supported_libs:
        raise RuntimeError("not loaded_dl_no_cache.was_already_loaded_from_elsewhere")
    if not os.path.samefile(no_cache_abs_path, fresh_abs_path):
        raise RuntimeError(f"not os.path.samefile({no_cache_abs_path=!r}, {fresh_abs_path=!r})")
    return fresh_abs_path


def probe_load_nvidia_dynamic_lib_and_print_json(libname: str) -> None:
    """Run the load checks for ``libname`` and report the outcome on stdout.

    On success, the absolute path is written as a single JSON-encoded line.
    If ``DynamicLibNotFoundError`` is raised, the not-found marker line is
    written instead, followed by the traceback (also on stdout). Any other
    exception propagates to the caller.
    """
    from cuda.pathfinder import DynamicLibNotFoundError

    try:
        loaded_path = _load_nvidia_dynamic_lib_for_test(libname)
    except DynamicLibNotFoundError:
        # "Not found" is a reportable outcome, not a failure of this helper.
        sys.stdout.write(DYNAMIC_LIB_NOT_FOUND_MARKER + "\n")
        traceback.print_exc(file=sys.stdout)
    else:
        sys.stdout.write(json.dumps(loaded_path) + "\n")


def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point: probe exactly one library name.

    Args:
        argv: Arguments to use instead of ``sys.argv[1:]``; ``None`` means
            read them from the command line.

    Returns:
        ``0`` after the probe has run.

    Raises:
        SystemExit: With a usage message unless exactly one argument is given.
    """
    cli_args = sys.argv[1:] if argv is None else list(argv)
    if len(cli_args) == 1:
        (libname,) = cli_args
        probe_load_nvidia_dynamic_lib_and_print_json(libname)
        return 0
    raise SystemExit("Usage: python -m cuda.pathfinder._testing.load_nvidia_dynamic_lib_subprocess <libname>")


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
131 changes: 0 additions & 131 deletions cuda_pathfinder/cuda/pathfinder/_utils/spawned_process_runner.py

This file was deleted.

83 changes: 38 additions & 45 deletions cuda_pathfinder/tests/child_load_nvidia_dynamic_lib_helper.py
Original file line number Diff line number Diff line change
@@ -1,61 +1,54 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# This helper is factored out so spawned child processes only import this
# lightweight module. That avoids re-importing the test module (and
# repeating its potentially expensive setup) in every child process.
from __future__ import annotations

import json
import os
import subprocess
import sys
import traceback
import tempfile
from pathlib import Path

from cuda.pathfinder._testing.load_nvidia_dynamic_lib_subprocess import DYNAMIC_LIB_NOT_FOUND_MARKER

def build_child_process_failed_for_libname_message(libname, result):
LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_MODULE = "cuda.pathfinder._testing.load_nvidia_dynamic_lib_subprocess"
# Launch the child from a neutral directory so `python -m cuda.pathfinder...`
# resolves the installed package instead of the source checkout. In CI the
# checkout does not contain the generated `_version.py` file.
LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_CWD = Path(tempfile.gettempdir())
# Return code placed on the substitute CompletedProcess that
# run_load_nvidia_dynamic_lib_in_subprocess() returns when the child exceeds
# its timeout.
PROCESS_TIMED_OUT = -9


def build_child_process_failed_for_libname_message(libname: str, result: subprocess.CompletedProcess[str]) -> str:
    """Format a failure report for the child process that handled ``libname``.

    The report has three parts: a header with the library name and exit code,
    then the captured stdout and stderr, each framed by begin/end markers so
    that empty or unterminated output stays recognisable.
    """
    header = f"Child process failed for {libname=!r} with exit code {result.returncode}\n"
    stdout_section = f"--- stdout-from-child-process ---\n{result.stdout}<end-of-stdout-from-child-process>\n"
    stderr_section = f"--- stderr-from-child-process ---\n{result.stderr}<end-of-stderr-from-child-process>\n"
    return header + stdout_section + stderr_section


def validate_abs_path(abs_path):
assert abs_path, f"empty path: {abs_path=!r}"
assert os.path.isabs(abs_path), f"not absolute: {abs_path=!r}"
assert os.path.isfile(abs_path), f"not a file: {abs_path=!r}"
def child_process_reported_dynamic_lib_not_found(result: subprocess.CompletedProcess[str]) -> bool:
    """Return True when the child's captured stdout begins with the not-found marker."""
    child_stdout = result.stdout
    return child_stdout.startswith(DYNAMIC_LIB_NOT_FOUND_MARKER)


def child_process_func(libname):
from cuda.pathfinder import DynamicLibNotFoundError, load_nvidia_dynamic_lib
from cuda.pathfinder._dynamic_libs.load_nvidia_dynamic_lib import _load_lib_no_cache
from cuda.pathfinder._dynamic_libs.supported_nvidia_libs import (
SUPPORTED_LINUX_SONAMES,
SUPPORTED_WINDOWS_DLLS,
)
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS

def run_load_nvidia_dynamic_lib_in_subprocess(
libname: str,
*,
timeout: float,
) -> subprocess.CompletedProcess[str]:
command = [sys.executable, "-m", LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_MODULE, libname]
try:
loaded_dl_fresh = load_nvidia_dynamic_lib(libname)
except DynamicLibNotFoundError:
print("CHILD_LOAD_NVIDIA_DYNAMIC_LIB_HELPER_DYNAMIC_LIB_NOT_FOUND_ERROR:")
traceback.print_exc(file=sys.stdout)
return
if loaded_dl_fresh.was_already_loaded_from_elsewhere:
raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")
validate_abs_path(loaded_dl_fresh.abs_path)
assert loaded_dl_fresh.found_via is not None

loaded_dl_from_cache = load_nvidia_dynamic_lib(libname)
if loaded_dl_from_cache is not loaded_dl_fresh:
raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")

loaded_dl_no_cache = _load_lib_no_cache(libname)
# check_if_already_loaded_from_elsewhere relies on these:
supported_libs = SUPPORTED_WINDOWS_DLLS if IS_WINDOWS else SUPPORTED_LINUX_SONAMES
if not loaded_dl_no_cache.was_already_loaded_from_elsewhere and libname in supported_libs:
raise RuntimeError("not loaded_dl_no_cache.was_already_loaded_from_elsewhere")
if not os.path.samefile(loaded_dl_no_cache.abs_path, loaded_dl_fresh.abs_path):
raise RuntimeError(f"not os.path.samefile({loaded_dl_no_cache.abs_path=!r}, {loaded_dl_fresh.abs_path=!r})")
validate_abs_path(loaded_dl_no_cache.abs_path)

print(json.dumps(loaded_dl_fresh.abs_path))
return subprocess.run( # noqa: S603 - trusted argv: current interpreter + internal test helper module
command,
capture_output=True,
text=True,
timeout=timeout,
check=False,
cwd=LOAD_NVIDIA_DYNAMIC_LIB_SUBPROCESS_CWD,
)
except subprocess.TimeoutExpired:
return subprocess.CompletedProcess(
args=command,
returncode=PROCESS_TIMED_OUT,
stdout="",
stderr=f"Process timed out after {timeout} seconds and was terminated.",
)
15 changes: 9 additions & 6 deletions cuda_pathfinder/tests/test_driver_lib_loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,11 @@
import os

import pytest
from child_load_nvidia_dynamic_lib_helper import build_child_process_failed_for_libname_message, child_process_func
from child_load_nvidia_dynamic_lib_helper import (
build_child_process_failed_for_libname_message,
child_process_reported_dynamic_lib_not_found,
run_load_nvidia_dynamic_lib_in_subprocess,
)

from cuda.pathfinder._dynamic_libs.lib_descriptor import LIB_DESCRIPTORS
from cuda.pathfinder._dynamic_libs.load_dl_common import DynamicLibNotFoundError, LoadedDL
Expand All @@ -22,7 +26,6 @@
_load_lib_no_cache,
)
from cuda.pathfinder._utils.platform_aware import IS_WINDOWS, quote_for_shell
from cuda.pathfinder._utils.spawned_process_runner import run_in_spawned_child_process

STRICTNESS = os.environ.get("CUDA_PATHFINDER_TEST_LOAD_NVIDIA_DYNAMIC_LIB_STRICTNESS", "see_what_works")
assert STRICTNESS in ("see_what_works", "all_must_work")
Expand Down Expand Up @@ -119,27 +122,27 @@ def test_load_lib_no_cache_does_not_dispatch_ctk_lib_to_driver_path(mocker):


# ---------------------------------------------------------------------------
# Real loading tests (spawned child process for isolation)
# Real loading tests (dedicated subprocess for isolation)
# ---------------------------------------------------------------------------


@pytest.mark.parametrize("libname", sorted(_DRIVER_ONLY_LIBNAMES))
def test_real_load_driver_lib(info_summary_append, libname):
"""Load a real driver library in a child process.
"""Load a real driver library in a dedicated subprocess.

This complements the mock tests above: it exercises the actual OS
loader path and logs results via INFO for CI/QA inspection.
"""
timeout = 120 if IS_WINDOWS else 30
result = run_in_spawned_child_process(child_process_func, args=(libname,), timeout=timeout)
result = run_load_nvidia_dynamic_lib_in_subprocess(libname, timeout=timeout)

def raise_child_process_failed():
raise RuntimeError(build_child_process_failed_for_libname_message(libname, result))

if result.returncode != 0:
raise_child_process_failed()
assert not result.stderr
if result.stdout.startswith("CHILD_LOAD_NVIDIA_DYNAMIC_LIB_HELPER_DYNAMIC_LIB_NOT_FOUND_ERROR:"):
if child_process_reported_dynamic_lib_not_found(result):
if STRICTNESS == "all_must_work":
raise_child_process_failed()
info_summary_append(f"Not found: {libname=!r}")
Expand Down
Loading
Loading