From 69d976532baff08594bd7e20efad30586e5e12fd Mon Sep 17 00:00:00 2001 From: saurabhAMD <160164138+saurabhAMD@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:51:00 -0600 Subject: [PATCH] GPU allocation for CPX Unit Tests using PCI bus id (#1403) * mapping devices wrt pci * Gpu allocation by using pci mapping * Passing gpuPriorityOrder in as an argument rather than making the functions non-static. * Removing redundant testBed instance calling [ROCm/rccl commit: 69b2b712ab1ed310bcc36269b152970e186e85b9] --- projects/rccl/test/AllReduceTests.cpp | 3 +- projects/rccl/test/AllToAllVTests.cpp | 6 +- projects/rccl/test/GroupCallTests.cpp | 15 +++-- projects/rccl/test/NonBlockingTests.cpp | 3 +- projects/rccl/test/SendRecvTests.cpp | 6 +- projects/rccl/test/common/EnvVars.cpp | 80 +++++++++++++++++++++++++ projects/rccl/test/common/EnvVars.hpp | 2 + projects/rccl/test/common/TestBed.cpp | 16 +++-- projects/rccl/test/common/TestBed.hpp | 7 ++- 9 files changed, 119 insertions(+), 19 deletions(-) diff --git a/projects/rccl/test/AllReduceTests.cpp b/projects/rccl/test/AllReduceTests.cpp index a82a9ac19d..ac393dfe8e 100644 --- a/projects/rccl/test/AllReduceTests.cpp +++ b/projects/rccl/test/AllReduceTests.cpp @@ -193,7 +193,8 @@ namespace RcclUnitTesting for (int isMultiProcess : testBed.ev.GetIsMultiProcessList()) { int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder)); for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) { diff --git a/projects/rccl/test/AllToAllVTests.cpp b/projects/rccl/test/AllToAllVTests.cpp index b8d1afd966..f1a250a90d 100644 --- a/projects/rccl/test/AllToAllVTests.cpp +++ b/projects/rccl/test/AllToAllVTests.cpp @@ -73,7 +73,8 @@ namespace RcclUnitTesting for (int isMultiProcess : testBed.ev.GetIsMultiProcessList()) { int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder)); // Prepare AllToAllV options std::vector numInputElements; @@ -130,7 +131,8 @@ namespace RcclUnitTesting for (int isMultiProcess : testBed.ev.GetIsMultiProcessList()) { int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder)); // Prepare AllToAllV options std::vector numInputElements; diff --git a/projects/rccl/test/GroupCallTests.cpp b/projects/rccl/test/GroupCallTests.cpp index 6bc01be052..14b69ed247 100644 --- a/projects/rccl/test/GroupCallTests.cpp +++ b/projects/rccl/test/GroupCallTests.cpp @@ -28,7 +28,8 @@ namespace RcclUnitTesting { // Test either single process all GPUs, or 1 process per GPU int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup); if (testBed.ev.showNames) INFO("%s %d-ranks GroupCall Identical\n", isMultiProcess ? "MP" : "SP", totalRanks); @@ -84,7 +85,8 @@ namespace RcclUnitTesting { // Test either single process all GPUs, or 1 process per GPU int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup); if (testBed.ev.showNames) INFO("%s %d-ranks GroupCall Different\n", isMultiProcess ? "MP" : "SP", totalRanks); @@ -139,7 +141,8 @@ namespace RcclUnitTesting { // Test either single process all GPUs, or 1 process per GPU int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollPerGroup); + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup); if (testBed.ev.showNames) INFO("%s %d-ranks GroupCall MixedDayaType\n", isMultiProcess ? "MP" : "SP", totalRanks); @@ -194,7 +197,8 @@ namespace RcclUnitTesting INFO("%s %d-ranks Multistream %d-Group Calls across %d streams\n", isMultiProcess ? "MP" : "SP", totalRanks, numCollPerGroup, numStreamsPerGroup); - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollPerGroup, numStreamsPerGroup); // Set up each collective in group in different stream (modulo numStreamsPerGroup) @@ -244,7 +248,8 @@ namespace RcclUnitTesting int const numProcesses = isMultiProcess ? totalRanks : 1; // Initialize comms by specifying the # of group calls - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking); + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), numCollsPerGroup, numStreamsPerGroup, numGroupCalls, useBlocking); if (testBed.ev.showNames) INFO("%s %d-ranks GroupCall MultiGroupCall\n", isMultiProcess ? "MP" : "SP", totalRanks); diff --git a/projects/rccl/test/NonBlockingTests.cpp b/projects/rccl/test/NonBlockingTests.cpp index 5b505c1869..75b9cb9290 100644 --- a/projects/rccl/test/NonBlockingTests.cpp +++ b/projects/rccl/test/NonBlockingTests.cpp @@ -34,7 +34,8 @@ namespace RcclUnitTesting { int const numProcesses = isMultiProcess ? totalRanks : 1; // Initialize communicators in non-blocking mode - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1, 1, 1, useBlocking); + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks, gpuPriorityOrder), 1, 1, 1, useBlocking); // Loop over various collective functions for (auto funcType : funcTypes) diff --git a/projects/rccl/test/SendRecvTests.cpp b/projects/rccl/test/SendRecvTests.cpp index da152c0c04..9c7b21361c 100644 --- a/projects/rccl/test/SendRecvTests.cpp +++ b/projects/rccl/test/SendRecvTests.cpp @@ -27,7 +27,8 @@ namespace RcclUnitTesting int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu; int totalRanks = numGpus * ranksPerGpu; int const numProcesses = isMultiProcess ? numGpus : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu, gpuPriorityOrder), {1,2}, //two group, second group sendrecv to self, has 2 coll testBed.GetNumStreamsPerGroup(1,2), 2); @@ -119,7 +120,8 @@ namespace RcclUnitTesting int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu; int totalRanks = numGpus * ranksPerGpu; int const numProcesses = isMultiProcess ? numGpus : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), + const std::vector& gpuPriorityOrder = testBed.ev.GetGpuPriorityOrder(); + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu, gpuPriorityOrder), {1,2}, //two group, second group sendrecv to self, has 2 coll testBed.GetNumStreamsPerGroup(1,2), 2); diff --git a/projects/rccl/test/common/EnvVars.cpp b/projects/rccl/test/common/EnvVars.cpp index 99f96a32c4..3e0c73bb6e 100644 --- a/projects/rccl/test/common/EnvVars.cpp +++ b/projects/rccl/test/common/EnvVars.cpp @@ -123,6 +123,73 @@ namespace RcclUnitTesting return 0; } + ncclResult_t busIdToInt64(const char* busId, int64_t* id) { + char hexStr[17]; // Longest possible int64 hex string + null terminator. + int hexOffset = 0; + for (int i = 0; hexOffset < sizeof(hexStr) - 1; i++) { + char c = busId[i]; + if (c == ':') continue; + if (c == '.') break; //ignore everything after . as they belong to same physical pci + if ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'F') || + (c >= 'a' && c <= 'f')) { + hexStr[hexOffset++] = busId[i]; + } else break; + } + hexStr[hexOffset] = '\0'; + *id = strtol(hexStr, NULL, 16); + return ncclSuccess; + } + + int getDevicePriority (std::vector *gpuPriorityOrder){ + // Prepare parent->child pipe + int pipefd[2]; + if (pipe(pipefd) == -1) { + ERROR("Unable to create parent->child pipe for getting the device priority vector.\n"); + return TEST_FAIL; + } + pid_t pid = fork(); + if (0 == pid) { + std::vector result; + try { + int numDev; + hipGetDeviceCount(&numDev); + std::unordered_map> uniqueIdToGpuIndexes; + for(int dev=0;dev>> sortedIds(uniqueIdToGpuIndexes.begin(), uniqueIdToGpuIndexes.end()); + std::sort(sortedIds.begin(), sortedIds.end(), [](const auto& a, const auto& b) { + return a.second.size() > b.second.size(); + }); + for (const auto& pair : sortedIds) { + result.insert(result.end(), pair.second.begin(), pair.second.end()); + } + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + return 1; + } + if (write(pipefd[1], result.data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL; + close(pipefd[0]); + close(pipefd[1]); + exit(EXIT_SUCCESS); + } + else { + int status; + if (read(pipefd[0], gpuPriorityOrder->data(), gpuPriorityOrder->size() * sizeof(int)) != gpuPriorityOrder->size() * sizeof(int)) return TEST_FAIL; + waitpid(pid, &status, 0); + assert(!status); + close(pipefd[0]); + close(pipefd[1]); + } + return TEST_SUCCESS; + return 0; + } + EnvVars::EnvVars() { @@ -151,10 +218,18 @@ namespace RcclUnitTesting // Total number of reduction ops int numOps = ncclNumOps; + gpuPriorityOrder.resize(numDetectedGpus); + for(int i=0;i const& EnvVars::GetGpuPriorityOrder() + { + return gpuPriorityOrder; + } + std::vector const& EnvVars::GetIsMultiProcessList() { return isMultiProcessList; diff --git a/projects/rccl/test/common/EnvVars.hpp b/projects/rccl/test/common/EnvVars.hpp index c20e82cf45..f75f31a3c7 100644 --- a/projects/rccl/test/common/EnvVars.hpp +++ b/projects/rccl/test/common/EnvVars.hpp @@ -41,6 +41,7 @@ namespace RcclUnitTesting std::vector const& GetNumGpusList(); std::vector const& GetIsMultiProcessList(); + std::vector const& GetGpuPriorityOrder(); // Orders the gpus based on the associativity of them with OAM with higher gpus linked. void ShowConfig(); protected: @@ -49,6 +50,7 @@ namespace RcclUnitTesting std::vector numGpusList; // List of # Gpus to use [UT_MIN_GPUS/UT_MAX_GPUS/UT_POW2_GPUS] std::vector isMultiProcessList; // Single or multi process [UT_PROCESS_MASK] int numDetectedGpus; + std::vector gpuPriorityOrder; // Orders the gpus based on the associativity of them with OAM with higher gpus linked. // Helper functions to parse environment variables int GetEnvVar(std::string const varname, int defaultValue); diff --git a/projects/rccl/test/common/TestBed.cpp b/projects/rccl/test/common/TestBed.cpp index 1421ff5c94..612b2d68f1 100644 --- a/projects/rccl/test/common/TestBed.cpp +++ b/projects/rccl/test/common/TestBed.cpp @@ -193,7 +193,8 @@ namespace RcclUnitTesting void TestBed::InitComms(int const numGpus, int const numCollectivesInGroup, int const numStreamsPerGroup, int const numGroupCalls, bool const useBlocking) { - InitComms(TestBed::GetDeviceIdsList(1, numGpus), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking); + const std::vector& gpuPriorityOrder = ev.GetGpuPriorityOrder(); + InitComms(GetDeviceIdsList(1, numGpus, gpuPriorityOrder), TestBed::GetNumCollsPerGroup(numCollectivesInGroup, numGroupCalls), TestBed::GetNumStreamsPerGroup(numStreamsPerGroup, numGroupCalls), numGroupCalls, useBlocking); } void TestBed::SetCollectiveArgs(ncclFunc_t const funcType, @@ -562,21 +563,23 @@ namespace RcclUnitTesting } std::vector> TestBed::GetDeviceIdsList(int const numProcesses, - int const numGpus) + int const numGpus, + const std::vector& gpuPriorityOrder) { - return GetDeviceIdsList(numProcesses, numGpus, 1); + return GetDeviceIdsList(numProcesses, numGpus, 1, gpuPriorityOrder); } std::vector> TestBed::GetDeviceIdsList(int const numProcesses, int const numGpus, - int const ranksPerGpu) + int const ranksPerGpu, + const std::vector& gpuPriorityOrder) { std::vector> result(numProcesses); int ntasks = numProcesses == 1 ? numGpus : 1; int k=0; for (int i = 0; i < numProcesses; i++) for (int j = 0; j < ntasks * ranksPerGpu; j++) { - result[i].push_back(k%numGpus); + result[i].push_back(gpuPriorityOrder[k%numGpus]); k++; } return result; @@ -668,7 +671,8 @@ namespace RcclUnitTesting if(enableSweep == false && (numGpus < 8 || numRanks < 8)) { continue; } - this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu)); + const std::vector& gpuPriorityOrder = ev.GetGpuPriorityOrder(); + this->InitComms(this->GetDeviceIdsList(numChildren, numGpus, ranksPerGpu, gpuPriorityOrder)); if (testing::Test::HasFailure()) { isCorrect = false; diff --git a/projects/rccl/test/common/TestBed.hpp b/projects/rccl/test/common/TestBed.hpp index d74d10c048..132aa5256c 100644 --- a/projects/rccl/test/common/TestBed.hpp +++ b/projects/rccl/test/common/TestBed.hpp @@ -136,9 +136,12 @@ namespace RcclUnitTesting // Helper function that splits up GPUs to the given number of processes static std::vector> GetDeviceIdsList(int const numProcesses, int const numGpus, - int const ranksPerGpu); + int const ranksPerGpu, + const std::vector& gpuPriorityOrder); + static std::vector> GetDeviceIdsList(int const numProcesses, - int const numGpus); + int const numGpus, + const std::vector& gpuPriorityOrder); // Generate a test case name static std::string GetTestCaseName(int const totalRanks,