From 9c3189589f2edb72b147dd2ab05e462e749a80bb Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Mon, 7 Feb 2022 12:16:19 -0700 Subject: [PATCH] [TransferBench] Fix for cases with subsets of configured numa nodes (#495) [ROCm/rccl commit: f3c2cafd9dcb8e2833f28502a17165f36bb01185] --- .../rccl/tools/TransferBench/TransferBench.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/projects/rccl/tools/TransferBench/TransferBench.cpp b/projects/rccl/tools/TransferBench/TransferBench.cpp index 0b9fc6e1e4..907b8dcbb6 100644 --- a/projects/rccl/tools/TransferBench/TransferBench.cpp +++ b/projects/rccl/tools/TransferBench/TransferBench.cpp @@ -867,11 +867,19 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memP if (memType == MEM_CPU) { // Set numa policy prior to call to hipHostMalloc - unsigned long nodemask = (1ULL << devIndex); + // NOTE: It may be possible that the actual configured numa nodes do not start at 0 + // so remapping may be necessary + // Find the 'devIndex'-th available NUMA node + int numaIdx = 0; + for (int i = 0; i <= devIndex; i++) + while (!numa_bitmask_isbitset(numa_get_mems_allowed(), numaIdx)) + ++numaIdx; + + unsigned long nodemask = (1ULL << numaIdx); long retCode = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); if (retCode) { - printf("[ERROR] Unable to set NUMA memory policy to bind to NUMA node %d\n", devIndex); + printf("[ERROR] Unable to set NUMA memory policy to bind to NUMA node %d\n", numaIdx); exit(1); } @@ -879,7 +887,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, float** memP HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser)); // Check that the allocated pages are actually on the correct NUMA node - CheckPages((char*)*memPtr, numBytes, devIndex); + CheckPages((char*)*memPtr, numBytes, numaIdx); // Reset to default numa mem policy retCode = 
set_mempolicy(MPOL_DEFAULT, NULL, 8);