Update MP UT to support arbitrary # of GPUs; multiple bugfixes (#16)

* Fixing temp file creation/deletion for Clique kernel mode.

* Refactoring of MP unit tests; include bugfixes and general support for any number of GPUs

* GroupCall MP UT properly quits when too many devices specified

* MP UT will programmatically set NCCL_COMM_ID if not specified; updated install script
Этот коммит содержится в:
Stanley Tsang
2021-02-05 17:49:25 -07:00
коммит произвёл GitHub
родитель 6dfdfef98f
Коммит d00b7d17bd
23 изменённых файлов: 538 добавлений и 716 удалений
+21 -57
Просмотреть файл
@@ -25,75 +25,39 @@ namespace CorrectnessTests
datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
}
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
int pid1 = 0;
int pid2 = 0;
int pid3 = 0;
pid1 = fork();
int const numGpusPerProcess = 2;
int const numProcesses = numDevices / numGpusPerProcess;
std::vector<int> pids(numProcesses);
int process = -1;
// From this point on, ignore original process as we cannot have it create a HIP context
if (pid1 == 0)
for (int i = 0; i < numDevices; i+= numGpusPerProcess)
{
pid2 = fork();
if (numDevices > 4)
process++;
int pid = fork();
if (pid == 0)
{
pid3 = fork();
}
if ((pid2 > 0 && pid3 == 0 && numDevices == 4) || (pid2 > 0 && pid3 > 0 && numDevices > 4))
{
// Process 0
std::vector<int> ranks;
ranks.push_back(0);
ranks.push_back(1);
int gpuIdx = i;
int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? numDevices : gpuIdx + numGpusPerProcess;
TestGroupCalls(0, ranks, datasets, ncclFuncs);
if (pid3 > 0)
std::vector<int> ranks;
for (; gpuIdx < maxIdx; gpuIdx++)
{
waitpid(pid3, NULL, 0);
ranks.push_back(gpuIdx);
}
}
else if ((pid2 == 0 && pid3 == 0 && numDevices == 4) || (pid2 == 0 && pid3 > 0 && numDevices > 4))
{
// Process 1
std::vector<int> ranks;
ranks.push_back(2);
ranks.push_back(3);
TestGroupCalls(1, ranks, datasets, ncclFuncs);
if (pid3 > 0)
{
waitpid(pid3, NULL, 0);
}
exit(0);
}
else if (pid2 > 0 && pid3 == 0 && numDevices == 8)
{
// Process 2 (available when numDevices == 8)
std::vector<int> ranks;
ranks.push_back(4);
ranks.push_back(5);
TestGroupCalls(2, ranks, datasets, ncclFuncs);
exit(0);
}
else if (pid2 == 0 && pid3 == 0 && numDevices == 8)
{
// Process 3 (available when numDevices == 8)
std::vector<int> ranks;
ranks.push_back(6);
ranks.push_back(7);
TestGroupCalls(3, ranks, datasets, ncclFuncs);
exit(0);
bool pass;
TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
TerminateChildProcess(pass);
}
else
{
exit(0);
pids[process] = pid;
}
waitpid(pid2, NULL, 0);
exit(0);
}
waitpid(pid1, NULL, 0);
ValidateProcesses(pids);
for (int i = 0; i < datasets.size(); i++)
{
munmap(datasets[i], sizeof(Dataset));
@@ -119,7 +83,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(3072, 3145728),
// Number of devices
testing::Values(4),
testing::Values(4,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),