Update MP UT to support arbitrary # of GPUs; multiple bugfixes (#16)

* Fixing temp file creation/deletion for Clique kernel mode.

* Refactoring of MP unit tests; includes bugfixes and general support for any number of GPUs

* GroupCall MP UT properly quits when too many devices are specified

* MP UT will programmatically set NCCL_COMM_ID if not specified; updated install script
Этот коммит содержится в:
Stanley Tsang
2021-02-05 17:49:25 -07:00
коммит произвёл GitHub
родитель 6dfdfef98f
Коммит d00b7d17bd
23 изменённых файлов: 538 добавлений и 716 удалений
+13 -49
Просмотреть файл
@@ -10,63 +10,27 @@ namespace CorrectnessTests
{
// NOTE(review): this span appears to be a rendered diff hunk in which removed and
// added lines are interleaved without +/- markers, so it is not compilable as
// written. The comments below describe what each visible group of lines does;
// confirm against the actual file in the repository before relying on them.
//
// Multi-process correctness test for ReduceScatter: a Dataset is placed in memory
// shared across fork(), child processes each call TestReduceScatter with a
// rank/GPU index, and the parent collects the children.
TEST_P(ReduceScatterMultiProcessCorrectnessTest, Correctness)
{
// Anonymous MAP_SHARED mapping so the Dataset stays visible to processes created by fork().
Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter);
// NOTE(review): getenv() result is passed straight to atoi(); assumes NCCL_COMM_ID
// is already set in the environment at this point — TODO confirm.
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
// One slot per device; filled with child pids in the fork loop below.
std::vector<int> pids(numDevices);
// NOTE(review): pid1/pid2/pid3 and the nested-fork branches that use them look like
// the pre-change implementation (fixed process tree selecting ranks 0-3) — TODO confirm.
int pid1 = 0;
int pid2 = 0;
int pid3 = 0;
pid1 = fork();
// From this point on, ignore original process as we cannot have it create a HIP context
if (pid1 == 0)
// gpu is incremented at the top of each loop iteration, so it starts at 0 for the first child.
int gpu = -1;
for (int i = 0; i < numDevices; i++)
{
pid2 = fork();
if (numDevices > 2)
gpu++;
int pid = fork();
// fork() returns 0 in the child process.
if (pid == 0)
{
pid3 = fork();
}
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
{
// Process 0
TestReduceScatter(0, *dataset);
if (pid3 > 0)
{
waitpid(pid3, NULL, 0);
}
}
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
{
// Process 1
TestReduceScatter(1, *dataset);
if (numDevices > 2)
{
waitpid(pid3, NULL, 0);
}
exit(0);
}
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
{
// Process 2 (available when numDevices > 2)
TestReduceScatter(2, *dataset);
exit(0);
}
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
{
// Process 3 (available when numDevices == 4)
TestReduceScatter(3, *dataset);
exit(0);
// Child path of the per-device loop: run the test for this gpu index, then hand
// the result to TerminateChildProcess.
// NOTE(review): pass is uninitialized here; presumably TestReduceScatter writes it
// as an out-parameter and TerminateChildProcess ends the child — verify both.
bool pass;
TestReduceScatter(gpu, *dataset, pass);
TerminateChildProcess(pass);
}
else
{
exit(0);
// Parent path: remember the child's pid under its gpu index.
pids[gpu] = pid;
}
waitpid(pid2, NULL, 0);
exit(0);
}
waitpid(pid1, NULL, 0);
munmap(dataset, sizeof(Dataset));
// NOTE(review): presumably waits on every recorded child pid and checks its exit
// status; ValidateProcesses is defined outside this view — TODO confirm.
ValidateProcesses(pids);
}
INSTANTIATE_TEST_SUITE_P(ReduceScatterMultiProcessCorrectnessSweep,
@@ -88,7 +52,7 @@ namespace CorrectnessTests
// Number of elements
testing::Values(3072, 3145728),
// Number of devices
testing::Values(2,3,4),
testing::Values(2,3,4,8),
// In-place or not
testing::Values(false, true),
testing::Values("")),