From a87d61db2b68fa4e38261b35b6b6c3938ee70eb8 Mon Sep 17 00:00:00 2001 From: Edgar Date: Tue, 15 Mar 2022 18:53:52 -0400 Subject: [PATCH] extending the unit-tests for multi-rank support --- test/SendRecv_SinglePairs.cpp | 10 +++--- test/common/EnvVars.cpp | 30 +++++++++--------- test/common/EnvVars.hpp | 1 + test/common/TestBed.cpp | 57 +++++++++++++++++++++++++++-------- test/common/TestBed.hpp | 6 +++- test/common/TestBedChild.cpp | 22 +++++++++++--- 6 files changed, 90 insertions(+), 36 deletions(-) diff --git a/test/SendRecv_SinglePairs.cpp b/test/SendRecv_SinglePairs.cpp index 1911fcf6dd..3a4ad38206 100644 --- a/test/SendRecv_SinglePairs.cpp +++ b/test/SendRecv_SinglePairs.cpp @@ -19,13 +19,15 @@ namespace RcclUnitTesting OptionalColArgs options; bool isCorrect = true; - int totalRanks = testBed.ev.maxGpus; + int numGpus = testBed.ev.maxGpus; + for (int rpg=0; rpg < 2 && isCorrect; ++rpg) for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess) { if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue; - - int const numProcesses = isMultiProcess ? totalRanks : 1; - testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1); + int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu; + int totalRanks = numGpus * ranksPerGpu; + int const numProcesses = isMultiProcess ? numGpus : 1; + testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1); for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx) for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx) diff --git a/test/common/EnvVars.cpp b/test/common/EnvVars.cpp index 5f8d615d49..120814cc40 100644 --- a/test/common/EnvVars.cpp +++ b/test/common/EnvVars.cpp @@ -32,12 +32,13 @@ namespace RcclUnitTesting hsa_iterate_agents(CountGpus, &numDevicesAvailable); hsa_shut_down(); - showNames = GetEnvVar("UT_SHOW_NAMES" , 1); - minGpus = GetEnvVar("UT_MIN_GPUS" , 2); - maxGpus = GetEnvVar("UT_MAX_GPUS" , numDevicesAvailable); - processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS); - verbose = GetEnvVar("UT_VERBOSE" , 0); - printValues = GetEnvVar("UT_PRINT_VALUES", 0); + showNames = GetEnvVar("UT_SHOW_NAMES" , 1); + minGpus = GetEnvVar("UT_MIN_GPUS" , 2); + maxGpus = GetEnvVar("UT_MAX_GPUS" , numDevicesAvailable); + processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS); + verbose = GetEnvVar("UT_VERBOSE" , 0); + printValues = GetEnvVar("UT_PRINT_VALUES", 0); + maxRanksPerGpu = GetEnvVar("UT_MAX_RANKS_PER_GPU", 2); // Limit number of supported reduction operators to just ncclSum if only allReduce is built #ifdef BUILD_ALLREDUCE_ONLY @@ -139,14 +140,15 @@ namespace RcclUnitTesting { std::vector> supported = { - std::make_pair("UT_SHOW_NAMES" , "Show test case names"), - std::make_pair("UT_MIN_GPUS" , "Minimum number of GPUs to use"), - std::make_pair("UT_MAX_GPUS" , "Maximum number of GPUs to use"), - std::make_pair("UT_PROCESS_MASK", "Whether to run single/multi process"), - std::make_pair("UT_VERBOSE" , "Show verbose unit test output"), - std::make_pair("UT_REDOPS" , "List of reduction ops to test"), - std::make_pair("UT_DATATYPES" , "List of datatypes to test"), - std::make_pair("UT_PRINT_VALUES", "Print array values (# of values to print, < 0 for all)") + std::make_pair("UT_SHOW_NAMES" , "Show test case names"), + std::make_pair("UT_MIN_GPUS" , "Minimum number of GPUs to use"), + std::make_pair("UT_MAX_GPUS" , "Maximum number of GPUs to use"), + std::make_pair("UT_PROCESS_MASK" , "Whether to run single/multi process"), + std::make_pair("UT_VERBOSE" , "Show verbose unit test output"), + std::make_pair("UT_REDOPS" , "List of reduction ops to test"), + std::make_pair("UT_DATATYPES" , "List of datatypes to test"), + std::make_pair("UT_MAX_RANKS_PER_GPU", "Maximum number of ranks using the same GPU"), + std::make_pair("UT_PRINT_VALUES" , "Print array values (# of values to print, < 0 for all)") }; printf("================================================================================\n"); diff --git a/test/common/EnvVars.hpp b/test/common/EnvVars.hpp index 5574df2321..b4c348338b 100644 --- a/test/common/EnvVars.hpp +++ b/test/common/EnvVars.hpp @@ -24,6 +24,7 @@ namespace RcclUnitTesting int processMask; // Filter single/multi process [UT_PROCESS_MASK] bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE] int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES] + int maxRanksPerGpu; // Number of ranks using the same GPU [UT_MAX_RANKS_PER_GPU] // Constructor that parses and collects environment variables EnvVars(); diff --git a/test/common/TestBed.cpp b/test/common/TestBed.cpp index d3a962ac7f..e07035356a 100644 --- a/test/common/TestBed.cpp +++ b/test/common/TestBed.cpp @@ -104,6 +104,12 @@ namespace RcclUnitTesting } } + //Determine number of unique GPUs being used. + std::set unique_devices; + for (auto a: this->rankToDeviceMap) + unique_devices.insert(a); + bool useMulti = unique_devices.size() < this->rankToDeviceMap.size() ? true : false; + // Tell first rank to get ncclUniqueId int getIdCmd = TestBedChild::CHILD_GET_UNIQUE_ID; PIPE_WRITE(0, getIdCmd); @@ -133,6 +139,9 @@ namespace RcclUnitTesting // Send the number of collectives to be run per group call PIPE_WRITE(childId, numCollectivesInGroup); + // Send whether to use MultiRank interfaces or not. + PIPE_WRITE(childId, useMulti); + // Send the GPUs this child uses int const numGpus = deviceIdsPerProcess[childId].size(); PIPE_WRITE(childId, numGpus); @@ -357,11 +366,23 @@ namespace RcclUnitTesting } std::vector> TestBed::GetDeviceIdsList(int const numProcesses, - int const numGpus) + int const numGpus) + { + return GetDeviceIdsList(numProcesses, numGpus, 1); + } + + std::vector> TestBed::GetDeviceIdsList(int const numProcesses, + int const numGpus, + int const ranksPerGpu) { std::vector> result(numProcesses); - for (int i = 0; i < numGpus; i++) - result[i % numProcesses].push_back(i); + int ntasks = numProcesses == 1 ? numGpus : 1; + int k=0; + for (int i = 0; i < numProcesses; i++) + for (int j = 0; j < ntasks * ranksPerGpu; j++) { + result[i].push_back(k%numGpus); + k++; + } return result; } @@ -372,11 +393,17 @@ namespace RcclUnitTesting ncclRedOp_t const redOp, int const root, bool const inPlace, - bool const managedMem) + bool const managedMem, + int const ranksPerProc) { std::stringstream ss; ss << (isMultiProcess ? "MP" : "SP") << " "; - ss << totalRanks << " ranks "; + ss << totalRanks; + if (ranksPerProc > 1) + ss << "(" << ranksPerProc << ") "; + else + ss << " "; + ss << "ranks "; ss << ncclFuncNames[funcType] << " "; ss << "(" << (inPlace ? "IP" : "OP") << "," << (managedMem ? "MM" : "GM") << ") "; ss << ncclDataTypeNames[dataType] << " "; @@ -430,14 +457,16 @@ namespace RcclUnitTesting bool isCorrect = true; // Sweep over the number of ranks - for (int totalRanks = ev.minGpus; totalRanks <= ev.maxGpus && isCorrect; ++totalRanks) + for (int ranksPerGpu=1; ranksPerGpu <= ev.maxRanksPerGpu; ranksPerGpu++) + for (int numGpus = ev.minGpus; numGpus <= ev.maxGpus && isCorrect; ++numGpus) for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess) { if (!(ev.processMask & (1 << isMultiProcess))) continue; // Test either single process all GPUs, or 1 process per GPU - int const numProcesses = isMultiProcess ? totalRanks : 1; - this->InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks)); + int const numChildren = isMultiProcess ? numGpus : 1; + int const numRanks = numGpus*ranksPerGpu; + this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu)); for (int ftIdx = 0; ftIdx < funcTypes.size() && isCorrect; ++ftIdx) for (int dtIdx = 0; dtIdx < dataTypes.size() && isCorrect; ++dtIdx) @@ -448,10 +477,11 @@ namespace RcclUnitTesting { if (ev.showNames) { - std::string name = this->GetTestCaseName(totalRanks, isMultiProcess, + std::string name = this->GetTestCaseName(numGpus, isMultiProcess, funcTypes[ftIdx], dataTypes[dtIdx], redOps[rdIdx], roots[rtIdx], - inPlaceList[ipIdx], managedMemList[mmIdx]); + inPlaceList[ipIdx], managedMemList[mmIdx], + ranksPerGpu); INFO("%s\n", name.c_str()); } @@ -460,7 +490,7 @@ namespace RcclUnitTesting int numInputElements, numOutputElements; CollectiveArgs::GetNumElementsForFuncType(funcTypes[ftIdx], sortedN[neIdx], - totalRanks, + numRanks, &numInputElements, &numOutputElements); optionalArgs.redOp = redOps[rdIdx]; @@ -486,10 +516,11 @@ namespace RcclUnitTesting this->ValidateResults(isCorrect); if (!isCorrect) { - std::string name = this->GetTestCaseName(totalRanks, isMultiProcess, + std::string name = this->GetTestCaseName(numGpus, isMultiProcess, funcTypes[ftIdx], dataTypes[dtIdx], redOps[rdIdx], roots[rtIdx], - inPlaceList[ipIdx], managedMemList[mmIdx]); + inPlaceList[ipIdx], managedMemList[mmIdx], + ranksPerGpu); ERROR("Incorrect output for %s\n", name.c_str()); } } diff --git a/test/common/TestBed.hpp b/test/common/TestBed.hpp index 73e4070fdd..e28dcf9208 100644 --- a/test/common/TestBed.hpp +++ b/test/common/TestBed.hpp @@ -94,6 +94,9 @@ namespace RcclUnitTesting std::vector const& GetAllSupportedDataTypes(); // Helper function that splits up GPUs to the given number of processes + static std::vector> GetDeviceIdsList(int const numProcesses, + int const numGpus, + int const ranksPerGpu); static std::vector> GetDeviceIdsList(int const numProcesses, int const numGpus); @@ -105,7 +108,8 @@ namespace RcclUnitTesting ncclRedOp_t const redOp, int const root, bool const inPlace, - bool const managedMem); + bool const managedMem, + int const ranksPerProc=1); // Run a simple sweep void RunSimpleSweep(std::vector const& funcTypes, diff --git a/test/common/TestBedChild.cpp b/test/common/TestBedChild.cpp index ae2d4bf755..233d99ff14 100644 --- a/test/common/TestBedChild.cpp +++ b/test/common/TestBedChild.cpp @@ -126,6 +126,8 @@ namespace RcclUnitTesting PIPE_READ(this->totalRanks); PIPE_READ(this->rankOffset); PIPE_READ(this->numCollectivesInGroup); + bool useMultiRankPerGpu; + PIPE_READ(useMultiRankPerGpu); // Read the GPUs this child uses and prepare storage for collective args / datasets int numGpus; @@ -166,11 +168,23 @@ namespace RcclUnitTesting break; } - if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess) + if (useMultiRankPerGpu) { - ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId); - status = TEST_FAIL; - break; + if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess) + { + ERROR("Rank %d on child %d unable to call ncclCommInitRankMulti\n", globalRank, this->childId); + status = TEST_FAIL; + break; + } + } + else + { + if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess) + { + ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId); + status = TEST_FAIL; + break; + } } } if (status == TEST_SUCCESS)