Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 32 additions & 18 deletions src/memory/topology_discovery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

#include <algorithm>
#include <array>
#include <bit>
#include <cctype>
#include <cstdlib>
#include <cstring>
Expand Down Expand Up @@ -46,6 +47,10 @@ struct NvmlLoader {
nvmlReturn_t (*p_nvmlDeviceIsMigDeviceHandle)(nvmlDevice_t, unsigned int*) = nullptr;
nvmlReturn_t (*p_nvmlDeviceGetDeviceHandleFromMigDeviceHandle)(nvmlDevice_t,
nvmlDevice_t*) = nullptr;
nvmlReturn_t (*p_nvmlDeviceGetMemoryAffinity)(nvmlDevice_t,
unsigned int,
unsigned long*,
nvmlAffinityScope_t) = nullptr;
const char* (*p_nvmlErrorString)(nvmlReturn_t) = nullptr;

NvmlLoader() { load(); }
Expand Down Expand Up @@ -82,12 +87,15 @@ struct NvmlLoader {
p_nvmlDeviceGetDeviceHandleFromMigDeviceHandle =
reinterpret_cast<nvmlReturn_t (*)(nvmlDevice_t, nvmlDevice_t*)>(
dlsym(handle, "nvmlDeviceGetDeviceHandleFromMigDeviceHandle"));
p_nvmlDeviceGetMemoryAffinity = reinterpret_cast<nvmlReturn_t (*)(
nvmlDevice_t, unsigned int, unsigned long*, nvmlAffinityScope_t)>(
dlsym(handle, "nvmlDeviceGetMemoryAffinity"));
p_nvmlErrorString =
reinterpret_cast<const char* (*)(nvmlReturn_t)>(dlsym(handle, "nvmlErrorString"));
// If any required symbol is missing, treat NVML as unavailable
if (!p_nvmlInit_v2 || !p_nvmlShutdown || !p_nvmlDeviceGetCount_v2 ||
!p_nvmlDeviceGetHandleByIndex_v2 || !p_nvmlDeviceGetName || !p_nvmlDeviceGetPciInfo_v3 ||
!p_nvmlDeviceGetUUID || !p_nvmlErrorString) {
!p_nvmlDeviceGetUUID || !p_nvmlDeviceGetMemoryAffinity || !p_nvmlErrorString) {
dlclose(handle);
handle = nullptr;
p_nvmlInit_v2 = nullptr;
Expand All @@ -100,6 +108,7 @@ struct NvmlLoader {
p_nvmlDeviceGetHandleByUUID = nullptr;
p_nvmlDeviceIsMigDeviceHandle = nullptr;
p_nvmlDeviceGetDeviceHandleFromMigDeviceHandle = nullptr;
p_nvmlDeviceGetMemoryAffinity = nullptr;
p_nvmlErrorString = nullptr;
}
}
Expand All @@ -108,7 +117,7 @@ struct NvmlLoader {
{
return handle && p_nvmlInit_v2 && p_nvmlShutdown && p_nvmlDeviceGetCount_v2 &&
p_nvmlDeviceGetHandleByIndex_v2 && p_nvmlDeviceGetName && p_nvmlDeviceGetPciInfo_v3 &&
p_nvmlDeviceGetUUID && p_nvmlErrorString;
p_nvmlDeviceGetUUID && p_nvmlDeviceGetMemoryAffinity && p_nvmlErrorString;
}
};

Expand Down Expand Up @@ -323,25 +332,31 @@ std::vector<size_t> resolve_visible_gpu_indices(
}

/**
* @brief Get NUMA node from /sys for a PCI device.
* @brief Get the host NUMA node id with the best memory affinity to a GPU.
*
* Reads /sys/bus/pci/devices/<pci>/numa_node. If the file is missing, empty, or
* cannot be parsed as an integer, returns -1.
 * Queries NVML's `nvmlDeviceGetMemoryAffinity` with `NVML_AFFINITY_SCOPE_NODE` and
 * returns the lowest-numbered NUMA node in the resulting bitmask. Same source as
 * `nvidia-smi topo -m`.
*
* @param pci_bus_id PCI bus ID of the device.
* @return NUMA node number on success; -1 if unavailable.
* @param nvml Loaded NVML loader; `nvml.available()` must already be true.
* @param device NVML device handle.
* @return NUMA node id on success; -1 if NVML reports no affinity (empty bitmask
* or query failure).
*
* @note `nodeSetSize = 1` (one `unsigned long` = 64 NUMA bits) is sufficient for
* any realistic system.
*/
int get_numa_node_from_sys(std::string const& pci_bus_id)
int get_numa_node_from_nvml(NvmlLoader const& nvml, nvmlDevice_t device)
{
std::string normalized_id = normalize_pci_bus_id(pci_bus_id);
std::string path = "/sys/bus/pci/devices/" + normalized_id + "/numa_node";
std::string content = read_file_content(path);
if (content.empty()) { return -1; }
try {
return std::stoi(content);
} catch (...) {
return -1;
unsigned long nodeset = 0;
if (nvml.p_nvmlDeviceGetMemoryAffinity(device, 1, &nodeset, NVML_AFFINITY_SCOPE_NODE) ==
NVML_SUCCESS &&
nodeset != 0) {
return std::countr_zero(nodeset);
}
return -1;
}

/**
Expand Down Expand Up @@ -856,8 +871,7 @@ bool topology_discovery::discover(NetworkDeviceVerification net_verification)
result = nvml.p_nvmlDeviceGetUUID(device, uuid.data(), NVML_DEVICE_UUID_BUFFER_SIZE);
gpu.uuid = (result == NVML_SUCCESS) ? std::string(uuid.data()) : "Unknown";

// Get NUMA node and CPU affinity from /sys
gpu.numa_node = get_numa_node_from_sys(gpu.pci_bus_id);
gpu.numa_node = get_numa_node_from_nvml(nvml, device);
gpu.cpu_affinity_list = get_cpu_affinity_from_sys(gpu.pci_bus_id);
gpu.cpu_cores = parse_cpu_list(gpu.cpu_affinity_list);

Expand Down
31 changes: 31 additions & 0 deletions test/memory/test_topology_discovery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,37 @@ TEST_CASE("Topology Discovery default uses strictest verification", "[hw_topolog
REQUIRE(topo_default.network_devices.size() == topo_explicit.network_devices.size());
}

// Invariant: when the host advertises NUMA topology and GPUs are present, every discovered
// GPU must resolve to a valid NUMA node.
TEST_CASE("Topology Discovery resolves GPU NUMA node on NUMA-aware hosts", "[hw_topology]")
{
    topology_discovery discovery;
    REQUIRE(discovery.discover());

    auto const& topology = discovery.get_topology();

    // Hardware-dependent test: without a GPU there is nothing to verify.
    if (topology.num_gpus == 0) {
        SUCCEED("Skipped: requires at least one GPU");
        return;
    }

    for (auto const& gpu : topology.gpus) {
        INFO("GPU " << gpu.id << " (" << gpu.name << " @ " << gpu.pci_bus_id << ")");
        // Every discovered GPU must resolve to a valid host NUMA node id.
        REQUIRE(gpu.numa_node >= 0);
        REQUIRE(gpu.numa_node < topology.num_numa_nodes);
        // memory_binding is populated from numa_node whenever the latter is known.
        REQUIRE(!gpu.memory_binding.empty());
        REQUIRE(gpu.memory_binding.front() == gpu.numa_node);
    }
}

TEST_CASE("Topology Discovery rejects out-of-range CUDA_VISIBLE_DEVICES", "[hw_topology]")
{
ScopedEnvVar env("CUDA_VISIBLE_DEVICES", "99999999");
Expand Down
Loading