extending the unit-tests for multi-rank support
This commit is contained in:
committed by
Edgar Gabriel
parent
0336ffdf70
commit
a87d61db2b
@@ -19,13 +19,15 @@ namespace RcclUnitTesting
|
||||
|
||||
OptionalColArgs options;
|
||||
bool isCorrect = true;
|
||||
int totalRanks = testBed.ev.maxGpus;
|
||||
int numGpus = testBed.ev.maxGpus;
|
||||
for (int rpg=0; rpg < 2 && isCorrect; ++rpg)
|
||||
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
|
||||
{
|
||||
if (!(testBed.ev.processMask & (1 << isMultiProcess))) continue;
|
||||
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks), 1);
|
||||
int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
|
||||
int totalRanks = numGpus * ranksPerGpu;
|
||||
int const numProcesses = isMultiProcess ? numGpus : 1;
|
||||
testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
|
||||
|
||||
for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
|
||||
for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
|
||||
|
||||
+16
-14
@@ -32,12 +32,13 @@ namespace RcclUnitTesting
|
||||
hsa_iterate_agents(CountGpus, &numDevicesAvailable);
|
||||
hsa_shut_down();
|
||||
|
||||
showNames = GetEnvVar("UT_SHOW_NAMES" , 1);
|
||||
minGpus = GetEnvVar("UT_MIN_GPUS" , 2);
|
||||
maxGpus = GetEnvVar("UT_MAX_GPUS" , numDevicesAvailable);
|
||||
processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS);
|
||||
verbose = GetEnvVar("UT_VERBOSE" , 0);
|
||||
printValues = GetEnvVar("UT_PRINT_VALUES", 0);
|
||||
showNames = GetEnvVar("UT_SHOW_NAMES" , 1);
|
||||
minGpus = GetEnvVar("UT_MIN_GPUS" , 2);
|
||||
maxGpus = GetEnvVar("UT_MAX_GPUS" , numDevicesAvailable);
|
||||
processMask = GetEnvVar("UT_PROCESS_MASK", UT_SINGLE_PROCESS | UT_MULTI_PROCESS);
|
||||
verbose = GetEnvVar("UT_VERBOSE" , 0);
|
||||
printValues = GetEnvVar("UT_PRINT_VALUES", 0);
|
||||
maxRanksPerGpu = GetEnvVar("UT_MAX_RANKS_PER_GPU", 2);
|
||||
|
||||
// Limit number of supported reduction operators to just ncclSum if only allReduce is built
|
||||
#ifdef BUILD_ALLREDUCE_ONLY
|
||||
@@ -139,14 +140,15 @@ namespace RcclUnitTesting
|
||||
{
|
||||
std::vector<std::pair<std::string, std::string>> supported =
|
||||
{
|
||||
std::make_pair("UT_SHOW_NAMES" , "Show test case names"),
|
||||
std::make_pair("UT_MIN_GPUS" , "Minimum number of GPUs to use"),
|
||||
std::make_pair("UT_MAX_GPUS" , "Maximum number of GPUs to use"),
|
||||
std::make_pair("UT_PROCESS_MASK", "Whether to run single/multi process"),
|
||||
std::make_pair("UT_VERBOSE" , "Show verbose unit test output"),
|
||||
std::make_pair("UT_REDOPS" , "List of reduction ops to test"),
|
||||
std::make_pair("UT_DATATYPES" , "List of datatypes to test"),
|
||||
std::make_pair("UT_PRINT_VALUES", "Print array values (# of values to print, < 0 for all)")
|
||||
std::make_pair("UT_SHOW_NAMES" , "Show test case names"),
|
||||
std::make_pair("UT_MIN_GPUS" , "Minimum number of GPUs to use"),
|
||||
std::make_pair("UT_MAX_GPUS" , "Maximum number of GPUs to use"),
|
||||
std::make_pair("UT_PROCESS_MASK" , "Whether to run single/multi process"),
|
||||
std::make_pair("UT_VERBOSE" , "Show verbose unit test output"),
|
||||
std::make_pair("UT_REDOPS" , "List of reduction ops to test"),
|
||||
std::make_pair("UT_DATATYPES" , "List of datatypes to test"),
|
||||
std::make_pair("UT_MAX_RANKS_PER_GPU", "Maximum number of ranks using the same GPU"),
|
||||
std::make_pair("UT_PRINT_VALUES" , "Print array values (# of values to print, < 0 for all)")
|
||||
};
|
||||
|
||||
printf("================================================================================\n");
|
||||
|
||||
@@ -24,6 +24,7 @@ namespace RcclUnitTesting
|
||||
int processMask; // Filter single/multi process [UT_PROCESS_MASK]
|
||||
bool verbose; // Show verbose TestBed output for debug [UT_VERBOSE]
|
||||
int printValues; // Print out input/output/expected arrays [UT_PRINT_VALUES]
|
||||
int maxRanksPerGpu; // Number of ranks using the same GPU [UT_MAX_RANKS_PER_GPU]
|
||||
|
||||
// Constructor that parses and collects environment variables
|
||||
EnvVars();
|
||||
|
||||
+44
-13
@@ -104,6 +104,12 @@ namespace RcclUnitTesting
|
||||
}
|
||||
}
|
||||
|
||||
//Determine number of unique GPUs being used.
|
||||
std::set<int> unique_devices;
|
||||
for (auto a: this->rankToDeviceMap)
|
||||
unique_devices.insert(a);
|
||||
bool useMulti = unique_devices.size() < this->rankToDeviceMap.size() ? true : false;
|
||||
|
||||
// Tell first rank to get ncclUniqueId
|
||||
int getIdCmd = TestBedChild::CHILD_GET_UNIQUE_ID;
|
||||
PIPE_WRITE(0, getIdCmd);
|
||||
@@ -133,6 +139,9 @@ namespace RcclUnitTesting
|
||||
// Send the number of collectives to be run per group call
|
||||
PIPE_WRITE(childId, numCollectivesInGroup);
|
||||
|
||||
// Send whether to use MultiRank interfaces or not.
|
||||
PIPE_WRITE(childId, useMulti);
|
||||
|
||||
// Send the GPUs this child uses
|
||||
int const numGpus = deviceIdsPerProcess[childId].size();
|
||||
PIPE_WRITE(childId, numGpus);
|
||||
@@ -357,11 +366,23 @@ namespace RcclUnitTesting
|
||||
}
|
||||
|
||||
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus)
|
||||
int const numGpus)
|
||||
{
|
||||
return GetDeviceIdsList(numProcesses, numGpus, 1);
|
||||
}
|
||||
|
||||
std::vector<std::vector<int>> TestBed::GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus,
|
||||
int const ranksPerGpu)
|
||||
{
|
||||
std::vector<std::vector<int>> result(numProcesses);
|
||||
for (int i = 0; i < numGpus; i++)
|
||||
result[i % numProcesses].push_back(i);
|
||||
int ntasks = numProcesses == 1 ? numGpus : 1;
|
||||
int k=0;
|
||||
for (int i = 0; i < numProcesses; i++)
|
||||
for (int j = 0; j < ntasks * ranksPerGpu; j++) {
|
||||
result[i].push_back(k%numGpus);
|
||||
k++;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -372,11 +393,17 @@ namespace RcclUnitTesting
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
bool const inPlace,
|
||||
bool const managedMem)
|
||||
bool const managedMem,
|
||||
int const ranksPerProc)
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << (isMultiProcess ? "MP" : "SP") << " ";
|
||||
ss << totalRanks << " ranks ";
|
||||
ss << totalRanks;
|
||||
if (ranksPerProc > 1)
|
||||
ss << "(" << ranksPerProc << ") ";
|
||||
else
|
||||
ss << " ";
|
||||
ss << "ranks ";
|
||||
ss << ncclFuncNames[funcType] << " ";
|
||||
ss << "(" << (inPlace ? "IP" : "OP") << "," << (managedMem ? "MM" : "GM") << ") ";
|
||||
ss << ncclDataTypeNames[dataType] << " ";
|
||||
@@ -430,14 +457,16 @@ namespace RcclUnitTesting
|
||||
bool isCorrect = true;
|
||||
|
||||
// Sweep over the number of ranks
|
||||
for (int totalRanks = ev.minGpus; totalRanks <= ev.maxGpus && isCorrect; ++totalRanks)
|
||||
for (int ranksPerGpu=1; ranksPerGpu <= ev.maxRanksPerGpu; ranksPerGpu++)
|
||||
for (int numGpus = ev.minGpus; numGpus <= ev.maxGpus && isCorrect; ++numGpus)
|
||||
for (int isMultiProcess = 0; isMultiProcess <= 1 && isCorrect; ++isMultiProcess)
|
||||
{
|
||||
if (!(ev.processMask & (1 << isMultiProcess))) continue;
|
||||
|
||||
// Test either single process all GPUs, or 1 process per GPU
|
||||
int const numProcesses = isMultiProcess ? totalRanks : 1;
|
||||
this->InitComms(TestBed::GetDeviceIdsList(numProcesses, totalRanks));
|
||||
int const numChildren = isMultiProcess ? numGpus : 1;
|
||||
int const numRanks = numGpus*ranksPerGpu;
|
||||
this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
|
||||
|
||||
for (int ftIdx = 0; ftIdx < funcTypes.size() && isCorrect; ++ftIdx)
|
||||
for (int dtIdx = 0; dtIdx < dataTypes.size() && isCorrect; ++dtIdx)
|
||||
@@ -448,10 +477,11 @@ namespace RcclUnitTesting
|
||||
{
|
||||
if (ev.showNames)
|
||||
{
|
||||
std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
|
||||
std::string name = this->GetTestCaseName(numGpus, isMultiProcess,
|
||||
funcTypes[ftIdx], dataTypes[dtIdx],
|
||||
redOps[rdIdx], roots[rtIdx],
|
||||
inPlaceList[ipIdx], managedMemList[mmIdx]);
|
||||
inPlaceList[ipIdx], managedMemList[mmIdx],
|
||||
ranksPerGpu);
|
||||
INFO("%s\n", name.c_str());
|
||||
}
|
||||
|
||||
@@ -460,7 +490,7 @@ namespace RcclUnitTesting
|
||||
int numInputElements, numOutputElements;
|
||||
CollectiveArgs::GetNumElementsForFuncType(funcTypes[ftIdx],
|
||||
sortedN[neIdx],
|
||||
totalRanks,
|
||||
numRanks,
|
||||
&numInputElements,
|
||||
&numOutputElements);
|
||||
optionalArgs.redOp = redOps[rdIdx];
|
||||
@@ -486,10 +516,11 @@ namespace RcclUnitTesting
|
||||
this->ValidateResults(isCorrect);
|
||||
if (!isCorrect)
|
||||
{
|
||||
std::string name = this->GetTestCaseName(totalRanks, isMultiProcess,
|
||||
std::string name = this->GetTestCaseName(numGpus, isMultiProcess,
|
||||
funcTypes[ftIdx], dataTypes[dtIdx],
|
||||
redOps[rdIdx], roots[rtIdx],
|
||||
inPlaceList[ipIdx], managedMemList[mmIdx]);
|
||||
inPlaceList[ipIdx], managedMemList[mmIdx],
|
||||
ranksPerGpu);
|
||||
ERROR("Incorrect output for %s\n", name.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,6 +94,9 @@ namespace RcclUnitTesting
|
||||
std::vector<ncclDataType_t> const& GetAllSupportedDataTypes();
|
||||
|
||||
// Helper function that splits up GPUs to the given number of processes
|
||||
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus,
|
||||
int const ranksPerGpu);
|
||||
static std::vector<std::vector<int>> GetDeviceIdsList(int const numProcesses,
|
||||
int const numGpus);
|
||||
|
||||
@@ -105,7 +108,8 @@ namespace RcclUnitTesting
|
||||
ncclRedOp_t const redOp,
|
||||
int const root,
|
||||
bool const inPlace,
|
||||
bool const managedMem);
|
||||
bool const managedMem,
|
||||
int const ranksPerProc=1);
|
||||
|
||||
// Run a simple sweep
|
||||
void RunSimpleSweep(std::vector<ncclFunc_t> const& funcTypes,
|
||||
|
||||
@@ -126,6 +126,8 @@ namespace RcclUnitTesting
|
||||
PIPE_READ(this->totalRanks);
|
||||
PIPE_READ(this->rankOffset);
|
||||
PIPE_READ(this->numCollectivesInGroup);
|
||||
bool useMultiRankPerGpu;
|
||||
PIPE_READ(useMultiRankPerGpu);
|
||||
|
||||
// Read the GPUs this child uses and prepare storage for collective args / datasets
|
||||
int numGpus;
|
||||
@@ -166,11 +168,23 @@ namespace RcclUnitTesting
|
||||
break;
|
||||
}
|
||||
|
||||
if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess)
|
||||
if (useMultiRankPerGpu)
|
||||
{
|
||||
ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId);
|
||||
status = TEST_FAIL;
|
||||
break;
|
||||
if (ncclCommInitRankMulti(&this->comms[localRank], this->totalRanks, id, globalRank, globalRank) != ncclSuccess)
|
||||
{
|
||||
ERROR("Rank %d on child %d unable to call ncclCommInitRankMulti\n", globalRank, this->childId);
|
||||
status = TEST_FAIL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (ncclCommInitRank(&this->comms[localRank], this->totalRanks, id, globalRank) != ncclSuccess)
|
||||
{
|
||||
ERROR("Rank %d on child %d unable to call ncclCommInitRank\n", globalRank, this->childId);
|
||||
status = TEST_FAIL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (status == TEST_SUCCESS)
|
||||
|
||||
Reference in New Issue
Block a user