Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 32 additions & 18 deletions src/memory/topology_discovery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

#include <algorithm>
#include <array>
#include <bit>
#include <cctype>
#include <cstdlib>
#include <cstring>
Expand Down Expand Up @@ -46,6 +47,10 @@ struct NvmlLoader {
nvmlReturn_t (*p_nvmlDeviceIsMigDeviceHandle)(nvmlDevice_t, unsigned int*) = nullptr;
nvmlReturn_t (*p_nvmlDeviceGetDeviceHandleFromMigDeviceHandle)(nvmlDevice_t,
nvmlDevice_t*) = nullptr;
nvmlReturn_t (*p_nvmlDeviceGetMemoryAffinity)(nvmlDevice_t,
unsigned int,
unsigned long*,
nvmlAffinityScope_t) = nullptr;
const char* (*p_nvmlErrorString)(nvmlReturn_t) = nullptr;

NvmlLoader() { load(); }
Expand Down Expand Up @@ -82,12 +87,15 @@ struct NvmlLoader {
p_nvmlDeviceGetDeviceHandleFromMigDeviceHandle =
reinterpret_cast<nvmlReturn_t (*)(nvmlDevice_t, nvmlDevice_t*)>(
dlsym(handle, "nvmlDeviceGetDeviceHandleFromMigDeviceHandle"));
p_nvmlDeviceGetMemoryAffinity = reinterpret_cast<nvmlReturn_t (*)(
nvmlDevice_t, unsigned int, unsigned long*, nvmlAffinityScope_t)>(
dlsym(handle, "nvmlDeviceGetMemoryAffinity"));
p_nvmlErrorString =
reinterpret_cast<const char* (*)(nvmlReturn_t)>(dlsym(handle, "nvmlErrorString"));
// If any required symbol is missing, treat NVML as unavailable
if (!p_nvmlInit_v2 || !p_nvmlShutdown || !p_nvmlDeviceGetCount_v2 ||
!p_nvmlDeviceGetHandleByIndex_v2 || !p_nvmlDeviceGetName || !p_nvmlDeviceGetPciInfo_v3 ||
!p_nvmlDeviceGetUUID || !p_nvmlErrorString) {
!p_nvmlDeviceGetUUID || !p_nvmlDeviceGetMemoryAffinity || !p_nvmlErrorString) {
dlclose(handle);
handle = nullptr;
p_nvmlInit_v2 = nullptr;
Expand All @@ -100,6 +108,7 @@ struct NvmlLoader {
p_nvmlDeviceGetHandleByUUID = nullptr;
p_nvmlDeviceIsMigDeviceHandle = nullptr;
p_nvmlDeviceGetDeviceHandleFromMigDeviceHandle = nullptr;
p_nvmlDeviceGetMemoryAffinity = nullptr;
p_nvmlErrorString = nullptr;
}
}
Expand All @@ -108,7 +117,7 @@ struct NvmlLoader {
{
return handle && p_nvmlInit_v2 && p_nvmlShutdown && p_nvmlDeviceGetCount_v2 &&
p_nvmlDeviceGetHandleByIndex_v2 && p_nvmlDeviceGetName && p_nvmlDeviceGetPciInfo_v3 &&
p_nvmlDeviceGetUUID && p_nvmlErrorString;
p_nvmlDeviceGetUUID && p_nvmlDeviceGetMemoryAffinity && p_nvmlErrorString;
}
};

Expand Down Expand Up @@ -323,25 +332,31 @@ std::vector<size_t> resolve_visible_gpu_indices(
}

/**
* @brief Get NUMA node from /sys for a PCI device.
* @brief Get the host NUMA node id with the best memory affinity to a GPU.
*
* Reads /sys/bus/pci/devices/<pci>/numa_node. If the file is missing, empty, or
* cannot be parsed as an integer, returns -1.
 * Queries NVML's `nvmlDeviceGetMemoryAffinity` with `NVML_AFFINITY_SCOPE_NODE` and
 * returns the lowest-numbered NUMA node in the resulting bitmask. Same source as
 * `nvidia-smi topo -m`.
*
* @param pci_bus_id PCI bus ID of the device.
* @return NUMA node number on success; -1 if unavailable.
* @param nvml Loaded NVML loader; `nvml.available()` must already be true.
* @param device NVML device handle.
* @return NUMA node id on success; -1 if NVML reports no affinity (empty bitmask
* or query failure).
*
* @note `nodeSetSize = 1` (one `unsigned long` = 64 NUMA bits) is sufficient for
* any realistic system.
*/
int get_numa_node_from_sys(std::string const& pci_bus_id)
int get_numa_node_from_nvml(NvmlLoader const& nvml, nvmlDevice_t device)
{
std::string normalized_id = normalize_pci_bus_id(pci_bus_id);
std::string path = "/sys/bus/pci/devices/" + normalized_id + "/numa_node";
std::string content = read_file_content(path);
if (content.empty()) { return -1; }
try {
return std::stoi(content);
} catch (...) {
return -1;
unsigned long nodeset = 0;
if (nvml.p_nvmlDeviceGetMemoryAffinity(device, 1, &nodeset, NVML_AFFINITY_SCOPE_NODE) ==
NVML_SUCCESS &&
nodeset != 0) {
return std::countr_zero(nodeset);
}
return -1;
}

/**
Expand Down Expand Up @@ -856,8 +871,7 @@ bool topology_discovery::discover(NetworkDeviceVerification net_verification)
result = nvml.p_nvmlDeviceGetUUID(device, uuid.data(), NVML_DEVICE_UUID_BUFFER_SIZE);
gpu.uuid = (result == NVML_SUCCESS) ? std::string(uuid.data()) : "Unknown";

// Get NUMA node and CPU affinity from /sys
gpu.numa_node = get_numa_node_from_sys(gpu.pci_bus_id);
gpu.numa_node = get_numa_node_from_nvml(nvml, device);
gpu.cpu_affinity_list = get_cpu_affinity_from_sys(gpu.pci_bus_id);
gpu.cpu_cores = parse_cpu_list(gpu.cpu_affinity_list);

Expand Down
31 changes: 31 additions & 0 deletions test/memory/test_topology_discovery.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,37 @@ TEST_CASE("Topology Discovery default uses strictest verification", "[hw_topolog
REQUIRE(topo_default.network_devices.size() == topo_explicit.network_devices.size());
}

// Invariant: when the host advertises NUMA topology and GPUs are present, every discovered
// GPU must resolve to a valid NUMA node.
TEST_CASE("Topology Discovery resolves GPU NUMA node on NUMA-aware hosts", "[hw_topology]")
{
    topology_discovery discovery;
    REQUIRE(discovery.discover());

    auto const& topology = discovery.get_topology();

    // Hardware-dependent test: without a GPU there is nothing to verify.
    if (topology.num_gpus == 0) {
        SUCCEED("Skipped: requires at least one GPU");
        return;
    }

    for (auto const& gpu : topology.gpus) {
        INFO("GPU " << gpu.id << " (" << gpu.name << " @ " << gpu.pci_bus_id << ")");
        // Every discovered GPU must resolve to a valid host NUMA node id.
        REQUIRE(gpu.numa_node >= 0);
        REQUIRE(gpu.numa_node < topology.num_numa_nodes);
        // memory_binding is populated from numa_node whenever the latter is known.
        REQUIRE(!gpu.memory_binding.empty());
        REQUIRE(gpu.memory_binding.front() == gpu.numa_node);
    }
}

TEST_CASE("Topology Discovery rejects out-of-range CUDA_VISIBLE_DEVICES", "[hw_topology]")
{
ScopedEnvVar env("CUDA_VISIBLE_DEVICES", "99999999");
Expand Down
Loading