Extend the unit tests for multi-rank support

This commit is contained in:
Edgar
2022-03-15 18:53:52 -04:00
committed by Edgar Gabriel
parent 0336ffdf70
commit a87d61db2b
6 changed files with 90 additions and 36 deletions
+6 -4
View file
@@ -19,13 +19,15 @@ namespace RcclUnitTesting
OptionalColArgs options;
bool isCorrect = true;
int totalRanks = testBed.ev.maxGpus;
int numGpus = testBed.ev.maxGpus;
for (int rpg=0; rpg < 2 && isCorrect; ++rpg)
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
{
if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue;
int const numProcesses = isMultiProcess ? totalRanks : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1);
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
int totalRanks = numGpus * ranksPerGpu;
int const numProcesses = isMultiProcess ? numGpus : 1;
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
+16 -14
View file
@@ -32,12 +32,13 @@ namespace RcclUnitTesting
hsa_iterate_agents(CountGpus, &numDevicesAvailable);
hsa_shut_down();
showNames = GetEnvVar("UT_SHOW_NAMES" , 1);
minGpus = GetEnvVar("UT_MIN_GPUS" , 2);
maxGpus = GetEnvVar("UT_MAX_GPUS" , numDevicesAvailable);
processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS);
verbose = GetEnvVar("UT_VERBOSE" , 0);
printValues = GetEnvVar("UT_PRINT_VALUES", 0);
showNames = GetEnvVar("UT_SHOW_NAMES" , 1);
minGpus = GetEnvVar("UT_MIN_GPUS" , 2);
maxGpus = GetEnvVar("UT_MAX_GPUS" , numDevicesAvailable);
processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS);
verbose = GetEnvVar("UT_VERBOSE" , 0);
printValues = GetEnvVar("UT_PRINT_VALUES", 0);
maxRanksPerGpu = GetEnvVar("UT_MAX_RANKS_PER_GPU", 2);
// Limit number of supported reduction operators to just ncclSum if only allReduce is built
#ifdef BUILD_ALLREDUCE_ONLY
@@ -139,14 +140,15 @@ namespace RcclUnitTesting
{
std::vector<std::pair<std::string, std::string>> supported =
{
std::make_pair("UT_SHOW_NAMES" , "Show test case names"),
std::make_pair("UT_MIN_GPUS" , "Minimum number of GPUs to use"),
std::make_pair("UT_MAX_GPUS" , "Maximum number of GPUs to use"),
std::make_pair("UT_PROCESS_MASK", "Whether to run single/multi process"),
std::make_pair("UT_VERBOSE" , "Show verbose unit test output"),
std::make_pair("UT_REDOPS" , "List of reduction ops to test"),
std::make_pair("UT_DATATYPES" , "List of datatypes to test"),
std::make_pair("UT_PRINT_VALUES", "Print array values (# of values to print, < 0 for all)")
std::make_pair("UT_SHOW_NAMES" , "Show test case names"),
std::make_pair("UT_MIN_GPUS" , "Minimum number of GPUs to use"),
std::make_pair("UT_MAX_GPUS" , "Maximum number of GPUs to use"),
std::make_pair("UT_PROCESS_MASK" , "Whether to run single/multi process"),
std::make_pair("UT_VERBOSE" , "Show verbose unit test output"),
std::make_pair("UT_REDOPS" , "List of reduction ops to test"),
std::make_pair("UT_DATATYPES" , "List of datatypes to test"),
std::make_pair("UT_MAX_RANKS_PER_GPU", "Maximum number of ranks using the same GPU"),
std::make_pair("UT_PRINT_VALUES" , "Print array values (# of values to print, < 0 for all)")
};
printf("================================================================================\n");
+1
View file
@@ -24,6 +24,7 @@ namespace RcclUnitTesting
int processMask; // Filter single/multi process [UT_PROCESS_MASK]
bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE]
int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES]
int maxRanksPerGpu; // Maximum number of ranks sharing the same GPU [UT_MAX_RANKS_PER_GPU]
// Constructor that parses and collects environment variables
EnvVars();
+44 -13
View file
@@ -104,6 +104,12 @@ namespace RcclUnitTesting
}
}
// Collect the distinct device IDs that appear in rankToDeviceMap.
std::set<int> unique_devices;
for (auto a: this->rankToDeviceMap)
unique_devices.insert(a);
// useMulti is true when there are fewer distinct device IDs than entries in
// rankToDeviceMap, i.e. at least two entries refer to the same device.
// The flag is sent to every child further down via PIPE_WRITE(childId, useMulti).
bool useMulti = unique_devices.size() < this->rankToDeviceMap.size() ? true : false;
// Tell first rank to get ncclUniqueId
int getIdCmd = TestBedChild::CHILD_GET_UNIQUE_ID;
PIPE_WRITE(0, getIdCmd);
@@ -133,6 +139,9 @@ namespace RcclUnitTesting
// Send the number of collectives to be run per group call
PIPE_WRITE(childId, numCollectivesInGroup);
// Send whether to use MultiRank interfaces or not.
PIPE_WRITE(childId, useMulti);
// Send the GPUs this child uses
int const numGpus = deviceIdsPerProcess[childId].size();
PIPE_WRITE(childId, numGpus);
@@ -357,11 +366,23 @@ namespace RcclUnitTesting
}
// Convenience overload: forwards to the three-argument GetDeviceIdsList with
// ranksPerGpu fixed to 1.
// NOTE(review): this listing looks like a diff whose +/- markers were stripped; the
// repeated "int const numGpus)" line below is presumably the pre-change/post-change
// pair of the same parameter line - confirm against the real source file.
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
int const numGpus)
int const numGpus)
{
return GetDeviceIdsList(numProcesses, numGpus, 1);
}
// Builds one list of device IDs per process and returns them as result[process].
//   numProcesses - number of inner lists to produce
//   numGpus      - device IDs are generated modulo this value (0 .. numGpus-1)
//   ranksPerGpu  - multiplier on the number of entries pushed per process
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
int const numGpus,
int const ranksPerGpu)
{
std::vector<std::vector<int>> result(numProcesses);
// NOTE(review): this listing looks like a diff whose +/- markers were stripped; the
// round-robin loop directly below is presumably the removed pre-change implementation
// and not part of the new function body - confirm against the real source file.
for (int i = 0; i < numGpus; i++)
result[i % numProcesses].push_back(i);
// A single process receives numGpus * ranksPerGpu entries; with more than one process
// each process receives ranksPerGpu entries.
// NOTE(review): when numProcesses is neither 1 nor numGpus, the total number of entries
// is numProcesses * ranksPerGpu rather than numGpus * ranksPerGpu - confirm that callers
// only ever pass 1 or numGpus.
int ntasks = numProcesses == 1 ? numGpus : 1;
// k runs across all processes so that device IDs keep cycling 0 .. numGpus-1 globally
// instead of restarting at 0 for every process.
int k=0;
for (int i = 0; i < numProcesses; i++)
for (int j = 0; j < ntasks * ranksPerGpu; j++) {
result[i].push_back(k%numGpus);
k++;
}
return result;
}
@@ -372,11 +393,17 @@ namespace RcclUnitTesting
ncclRedOp_t const redOp,
int const root,
bool const inPlace,
bool const managedMem)
bool const managedMem,
int const ranksPerProc)
{
std::stringstream ss;
ss << (isMultiProcess ? "MP" : "SP") << " ";
ss << totalRanks << " ranks ";
ss << totalRanks;
if (ranksPerProc > 1)
ss << "(" << ranksPerProc << ") ";
else
ss << " ";
ss << "ranks ";
ss << ncclFuncNames[funcType] << " ";
ss << "(" << (inPlace ? "IP" : "OP") << "," << (managedMem ? "MM" : "GM") << ") ";
ss << ncclDataTypeNames[dataType] << " ";
@@ -430,14 +457,16 @@ namespace RcclUnitTesting
bool isCorrect = true;
// Sweep over the number of ranks
for (int totalRanks = ev.minGpus; totalRanks <= ev.maxGpus && isCorrect; ++totalRanks)
for (int ranksPerGpu=1; ranksPerGpu <= ev.maxRanksPerGpu; ranksPerGpu++)
for (int numGpus = ev.minGpus; numGpus <= ev.maxGpus && isCorrect; ++numGpus)
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
{
if (!(ev.processMask & (1 << isMultiProcess))) continue;
// Test either single process all GPUs, or 1 process per GPU
int const numProcesses = isMultiProcess ? totalRanks : 1;
this->InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
int const numChildren = isMultiProcess ? numGpus : 1;
int const numRanks = numGpus*ranksPerGpu;
this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
for (int ftIdx = 0; ftIdx < funcTypes.size() && isCorrect; ++ftIdx)
for (int dtIdx = 0; dtIdx < dataTypes.size() && isCorrect; ++dtIdx)
@@ -448,10 +477,11 @@ namespace RcclUnitTesting
{
if (ev.showNames)
{
std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
std::string name = this->GetTestCaseName(numGpus, isMultiProcess,
funcTypes[ftIdx], dataTypes[dtIdx],
redOps[rdIdx], roots[rtIdx],
inPlaceList[ipIdx], managedMemList[mmIdx]);
inPlaceList[ipIdx], managedMemList[mmIdx],
ranksPerGpu);
INFO("%s\n", name.c_str());
}
@@ -460,7 +490,7 @@ namespace RcclUnitTesting
int numInputElements, numOutputElements;
CollectiveArgs::GetNumElementsForFuncType(funcTypes[ftIdx],
sortedN[neIdx],
totalRanks,
numRanks,
&numInputElements,
&numOutputElements);
optionalArgs.redOp = redOps[rdIdx];
@@ -486,10 +516,11 @@ namespace RcclUnitTesting
this->ValidateResults(isCorrect);
if (!isCorrect)
{
std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
std::string name = this->GetTestCaseName(numGpus, isMultiProcess,
funcTypes[ftIdx], dataTypes[dtIdx],
redOps[rdIdx], roots[rtIdx],
inPlaceList[ipIdx], managedMemList[mmIdx]);
inPlaceList[ipIdx], managedMemList[mmIdx],
ranksPerGpu);
ERROR("Incorrect output for %s\n", name.c_str());
}
}
+5 -1
View file
@@ -94,6 +94,9 @@ namespace RcclUnitTesting
std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
// Helper functions that split up GPUs to the given number of processes.
// Both return one inner vector of device IDs per process.
// Three-argument form: additionally takes the number of ranks per GPU.
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
int const numGpus,
int const ranksPerGpu);
// Two-argument form: its definition forwards to the three-argument form with ranksPerGpu = 1.
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
int const numGpus);
@@ -105,7 +108,8 @@ namespace RcclUnitTesting
ncclRedOp_t const redOp,
int const root,
bool const inPlace,
bool const managedMem);
bool const managedMem,
int const ranksPerProc=1);
// Run a simple sweep
void RunSimpleSweep(std::vector<ncclFunc_t> const& funcTypes,
+18 -4
View file
@@ -126,6 +126,8 @@ namespace RcclUnitTesting
PIPE_READ(this->totalRanks);
PIPE_READ(this->rankOffset);
PIPE_READ(this->numCollectivesInGroup);
bool useMultiRankPerGpu;
PIPE_READ(useMultiRankPerGpu);
// Read the GPUs this child uses and prepare storage for collective args / datasets
int numGpus;
@@ -166,11 +168,23 @@ namespace RcclUnitTesting
break;
}
if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess)
if (useMultiRankPerGpu)
{
ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId);
status = TEST_FAIL;
break;
if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess)
{
ERROR("Rank %d on child %d unable to call ncclCommInitRankMulti\n", globalRank, this->childId);
status = TEST_FAIL;
break;
}
}
else
{
if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess)
{
ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId);
status = TEST_FAIL;
break;
}
}
}
if (status == TEST_SUCCESS)