Update MP UT to support arbitrary # of GPUs; multiple bugfixes (#16)
* Fixing temp file creation/deletion for Clique kernel mode.
* Refactoring of MP unit tests; include bugfixes and general support for any number of GPUs
* GroupCall MP UT properly quits when too many devices specified
* MP UT will programmatically set NCCL_COMM_ID if not specified; updated install script
[ROCm/rccl commit: d00b7d17bd]
Cette révision appartient à :
@@ -200,10 +200,10 @@ if ($run_tests); then
|
||||
if (test -f "./test/UnitTests"); then
|
||||
if ($run_tests_all); then
|
||||
./test/UnitTests
|
||||
NCCL_COMM_ID=$HOSTNAME:55512 ./test/UnitTestsMultiProcess
|
||||
./test/UnitTestsMultiProcess
|
||||
else
|
||||
./test/UnitTests --gtest_filter="BroadcastCorrectnessSweep*:*float32*"
|
||||
NCCL_COMM_ID=$HOSTNAME:55512 ./test/UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*"
|
||||
./test/UnitTestsMultiProcess --gtest_filter="BroadcastMultiProcessCorrectnessSweep*:*float32*"
|
||||
fi
|
||||
else
|
||||
echo "Unit tests have not been built yet; please re-run script with -t to build unit tests."
|
||||
|
||||
@@ -40,6 +40,7 @@ THE SOFTWARE.
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
#include <unistd.h>
|
||||
|
||||
cliqueDevicePtrs_t CliqueManager::m_staticCliquePtrs[NCCL_MAX_OPS] = {};
|
||||
int* CliqueManager::m_staticGpuBarrierMem = NULL;
|
||||
@@ -502,28 +503,36 @@ void CliqueManager::WaitForBarrier()
|
||||
|
||||
ncclResult_t CliqueManager::BootstrapRootInit(int pid, unsigned long hash)
|
||||
{
|
||||
for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
|
||||
if (rcclParamEnableClique())
|
||||
{
|
||||
int msgid, fd;
|
||||
std::string msgQueueName = "/tmp/" + it->second + std::to_string(hash) + "_" + std::to_string(pid);
|
||||
SYSCHECKVAL(open(msgQueueName.c_str(), O_CREAT | O_RDWR, 0606), "open", fd);
|
||||
NCCLCHECK(MsgQueueGetId(msgQueueName, hash, true, msgid));
|
||||
SYSCHECK(close(fd), "close");
|
||||
for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
|
||||
{
|
||||
int msgid, fd;
|
||||
std::string msgQueueName = "/tmp/" + it->second + std::to_string(hash) + "_" + std::to_string(pid);
|
||||
SYSCHECKVAL(open(msgQueueName.c_str(), O_CREAT | O_RDWR, 0606), "open", fd);
|
||||
NCCLCHECK(MsgQueueGetId(msgQueueName, hash, true, msgid));
|
||||
SYSCHECK(unlink(msgQueueName.c_str()), "unlink");
|
||||
SYSCHECK(close(fd), "close");
|
||||
}
|
||||
|
||||
std::string shmDir = "/dev/shm/";
|
||||
|
||||
for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
|
||||
{
|
||||
struct stat fileStatus;
|
||||
std::string shmFileName = it->second + std::to_string(hash) + "_" + std::to_string(pid);
|
||||
std::string shmFullPath = shmDir + shmFileName;
|
||||
|
||||
// Check if shm file already exists; if so, unlink it
|
||||
if (stat(shmFullPath.c_str(), &fileStatus) == 0)
|
||||
{
|
||||
NCCLCHECK(shmUnlink(shmFileName.c_str()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string shmDir = "/dev/shm/";
|
||||
|
||||
for (auto it = CliqueShmNames.begin(); it != CliqueShmNames.end(); it++)
|
||||
else
|
||||
{
|
||||
struct stat fileStatus;
|
||||
std::string shmFileName = it->second + std::to_string(hash) + "_" + std::to_string(pid);
|
||||
std::string shmFullPath = shmDir + shmFileName;
|
||||
|
||||
// Check if shm file already exists; if so, unlink it
|
||||
if (stat(shmFullPath.c_str(), &fileStatus) == 0)
|
||||
{
|
||||
NCCLCHECK(shmUnlink(shmFileName.c_str()));
|
||||
}
|
||||
INFO(NCCL_INIT, "Not performing bootstrap root for clique kernels as clique mode not enabled.");
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -702,13 +702,18 @@ namespace CorrectnessTests
|
||||
class MultiProcessCorrectnessTest : public CorrectnessTest
|
||||
{
|
||||
protected:
|
||||
// IMPORTANT: We cannot have any HIP API calls in the parent process.
|
||||
// Do any HIP setup in SetupPerProcess().
|
||||
void SetUp() override
|
||||
{
|
||||
// Check for NCCL_COMM_ID env variable (otherwise will not init)
|
||||
// Check if NCCL_COMM_ID is already set; if not, set it now
|
||||
if (!getenv("NCCL_COMM_ID"))
|
||||
{
|
||||
printf("Must set NCCL_COMM_ID prior to execution\n");
|
||||
exit(0);
|
||||
char hostname[HOST_NAME_MAX+1];
|
||||
gethostname(hostname, HOST_NAME_MAX+1);
|
||||
std::string hostnameString(hostname);
|
||||
hostnameString.append(":55513");
|
||||
setenv("NCCL_COMM_ID", hostnameString.c_str(), 0);
|
||||
}
|
||||
|
||||
// Make the test tuple parameters accessible
|
||||
@@ -737,10 +742,14 @@ namespace CorrectnessTests
|
||||
|
||||
comms.resize(numDevices);
|
||||
streams.resize(numDevices);
|
||||
dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
}
|
||||
|
||||
void TearDown() override
|
||||
{
|
||||
munmap(dataset, sizeof(Dataset));
|
||||
|
||||
// Restore env vars after tests
|
||||
for (int i = 0; i < numTokens/2; i++) {
|
||||
if (savedEnv[i]) {
|
||||
@@ -773,9 +782,11 @@ namespace CorrectnessTests
|
||||
// Only proceed with testing if there are enough GPUs
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
fprintf(stdout, "[ SKIPPED ] Test requires %d devices (only %d available)\n",
|
||||
numDevices, numDevicesAvailable);
|
||||
|
||||
if (rank == 0)
|
||||
{
|
||||
fprintf(stdout, "[ SKIPPED ] Test requires %d devices (only %d available)\n",
|
||||
numDevices, numDevicesAvailable);
|
||||
}
|
||||
// Modify the number of devices so that tear-down doesn't occur
|
||||
// This is temporary until GTEST_SKIP() becomes available
|
||||
numDevices = 0;
|
||||
@@ -795,7 +806,7 @@ namespace CorrectnessTests
|
||||
if (res != ncclSuccess)
|
||||
{
|
||||
printf("Test failure:%s %d '%s' numRanks:%d\n", __FILE__,__LINE__,ncclGetErrorString(res), numDevices);
|
||||
ASSERT_EQ(res, hipSuccess);
|
||||
ASSERT_EQ(res, ncclSuccess);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -803,17 +814,22 @@ namespace CorrectnessTests
|
||||
void SetUpPerProcess(int rank, ncclFunc_t const func, ncclComm_t& comm, hipStream_t& stream, Dataset& dataset)
|
||||
{
|
||||
SetUpPerProcessHelper(rank, comm, stream);
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, func, rank);
|
||||
if (numDevices <= numDevicesAvailable)
|
||||
{
|
||||
dataset.Initialize(numDevices, numElements, dataType, inPlace, func, rank);
|
||||
}
|
||||
}
|
||||
|
||||
// To be called by each process/rank individually (see GroupCallsMultiProcess)
|
||||
void SetUpPerProcess(int rank, std::vector<ncclFunc_t> const& func, ncclComm_t& comm, hipStream_t& stream, std::vector<Dataset*>& datasets)
|
||||
{
|
||||
SetUpPerProcessHelper(rank, comm, stream);
|
||||
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
if (numDevices <= numDevicesAvailable)
|
||||
{
|
||||
datasets[i]->Initialize(numDevices, numElements, dataType, inPlace, func[i], rank);
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
datasets[i]->Initialize(numDevices, numElements, dataType, inPlace, func[i], rank);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -875,7 +891,7 @@ namespace CorrectnessTests
|
||||
free(arrayI1);
|
||||
}
|
||||
|
||||
void ValidateResults(Dataset const& dataset, int rank, int root = 0) const
|
||||
bool ValidateResults(Dataset const& dataset, int rank, int root = 0) const
|
||||
{
|
||||
int8_t* outputI1 = (int8_t *)malloc(dataset.NumBytes(ncclOutputBuffer));
|
||||
uint8_t* outputU1 = (uint8_t *)outputI1;
|
||||
@@ -894,8 +910,11 @@ namespace CorrectnessTests
|
||||
|
||||
// only output on root rank is valid for gather collective
|
||||
if (dataset.function == ncclCollGather && rank != root)
|
||||
return;
|
||||
HIP_CALL(hipMemcpy(outputI1, dataset.outputs[rank], dataset.NumBytes(ncclOutputBuffer), hipMemcpyDeviceToHost));
|
||||
return true;
|
||||
|
||||
hipError_t err = hipMemcpy(outputI1, dataset.outputs[rank], dataset.NumBytes(ncclOutputBuffer), hipMemcpyDeviceToHost);
|
||||
if (err != hipSuccess)
|
||||
return false;
|
||||
|
||||
int8_t* expectedI1 = (int8_t *)dataset.expected[rank];
|
||||
uint8_t* expectedU1 = (uint8_t *)expectedI1;
|
||||
@@ -953,8 +972,35 @@ namespace CorrectnessTests
|
||||
}
|
||||
}
|
||||
}
|
||||
ASSERT_EQ(isMatch, true);
|
||||
return isMatch;
|
||||
}
|
||||
|
||||
void ValidateProcesses(std::vector<int> const& pids)
|
||||
{
|
||||
int numProcesses = pids.size();
|
||||
int status[numProcesses];
|
||||
for (int i = 0; i < numProcesses; i++)
|
||||
{
|
||||
waitpid(pids[i], &status[i], 0);
|
||||
|
||||
ASSERT_NE(WIFEXITED(status[i]), 0) << "[ERROR] Child process " << i << " did not exit cleanly.";
|
||||
ASSERT_EQ(WEXITSTATUS(status[i]), EXIT_SUCCESS) << "[ERROR] Child process " << i << " had a test failure.";
|
||||
}
|
||||
}
|
||||
|
||||
void TerminateChildProcess(bool const pass)
|
||||
{
|
||||
if (pass)
|
||||
{
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
Dataset* dataset;
|
||||
};
|
||||
|
||||
std::string GenerateTestNameString(testing::TestParamInfo<MultiProcessCorrectnessTest::ParamType>& info);
|
||||
|
||||
@@ -9,63 +9,27 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllGatherMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllGather);
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 2)
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 0
|
||||
TestAllGather(0, *dataset);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 1
|
||||
TestAllGather(1, *dataset);
|
||||
if (numDevices > 2)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
|
||||
{
|
||||
// Process 2 (available when numDevices > 2)
|
||||
TestAllGather(2, *dataset);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
|
||||
{
|
||||
// Process 3 (available when numDevices == 4)
|
||||
TestAllGather(3, *dataset);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestAllGather(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
munmap(dataset, sizeof(Dataset));
|
||||
|
||||
ValidateProcesses(pids);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllGatherMultiProcessCorrectnessSweep,
|
||||
@@ -87,7 +51,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(3072, 3145728),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4),
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -13,38 +13,50 @@ namespace CorrectnessTests
|
||||
class AllGatherMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, int const rank, int const numDevices)
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, int const numDevices, std::vector<int> const& ranks)
|
||||
{
|
||||
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
|
||||
|
||||
HIP_CALL(hipMemcpy(static_cast<char*>(dataset.expected[0]) + rank * byteCount, (int8_t *)dataset.inputs[rank] + (rank * byteCount),
|
||||
byteCount, hipMemcpyDeviceToHost));
|
||||
|
||||
barrier.Wait();
|
||||
// Rank 0 sends answer to other ranks
|
||||
if (rank == 0)
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(static_cast<char*>(dataset.expected[0]) + rank * byteCount, (int8_t *)dataset.inputs[rank] + (rank * byteCount),
|
||||
byteCount, hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
// Rank 0 sends answer to other ranks
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
if (rank == 0)
|
||||
{
|
||||
if (i == rank) continue;
|
||||
memcpy(dataset.expected[i], dataset.expected[0], dataset.NumBytes());
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
{
|
||||
if (i == rank) continue;
|
||||
memcpy(dataset.expected[i], dataset.expected[0], dataset.NumBytes());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestAllGather(int rank, Dataset& dataset)
|
||||
void TestAllGather(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numElements % numDevices != 0) return;
|
||||
if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, barrier, rank, numDevices);
|
||||
|
||||
ComputeExpectedResults(dataset, barrier, numDevices, std::vector<int>(1, rank));
|
||||
|
||||
size_t const byteCount = dataset.NumBytes() / numDevices;
|
||||
size_t const sendCount = dataset.numElements / numDevices;
|
||||
@@ -58,7 +70,7 @@ namespace CorrectnessTests
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset, rank);
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
|
||||
@@ -10,63 +10,27 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllReduceMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllReduce);
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 2)
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 0
|
||||
TestAllReduce(0, *dataset);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 1
|
||||
TestAllReduce(1, *dataset);
|
||||
if (numDevices > 2)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
|
||||
{
|
||||
// Process 2 (available when numDevices > 2)
|
||||
TestAllReduce(2, *dataset);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
|
||||
{
|
||||
// Process 3 (available when numDevices == 4)
|
||||
TestAllReduce(3, *dataset);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestAllReduce(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
munmap(dataset, sizeof(Dataset));
|
||||
|
||||
ValidateProcesses(pids);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllReduceMultiProcessCorrectnessSweep,
|
||||
@@ -88,7 +52,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4),
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -13,13 +13,17 @@ namespace CorrectnessTests
|
||||
class AllReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const rank)
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
// Allocate temporary host array to accumulate results
|
||||
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
@@ -68,23 +72,31 @@ namespace CorrectnessTests
|
||||
}
|
||||
}
|
||||
barrier.Wait();
|
||||
// Copy results into expected array
|
||||
memcpy(dataset.expected[rank], resultI1, dataset.NumBytes());
|
||||
|
||||
// Copy results into expected array
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
memcpy(dataset.expected[rank], resultI1, dataset.NumBytes());
|
||||
}
|
||||
free(resultI1);
|
||||
}
|
||||
|
||||
void TestAllReduce(int rank, Dataset& dataset)
|
||||
void TestAllReduce(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollAllReduce, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, barrier, op, rank);
|
||||
ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
|
||||
|
||||
// Launch the reduction
|
||||
ncclAllReduce(dataset.inputs[rank], dataset.outputs[rank],
|
||||
@@ -94,7 +106,7 @@ namespace CorrectnessTests
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset, rank);
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
|
||||
@@ -10,62 +10,27 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(AllToAllMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollAllToAll);
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 2)
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 0
|
||||
TestAllToAll(0, *dataset);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 1
|
||||
TestAllToAll(1, *dataset);
|
||||
if (numDevices > 2)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
|
||||
{
|
||||
// Process 2 (available when numDevices > 2)
|
||||
TestAllToAll(2, *dataset);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
|
||||
{
|
||||
// Process 3 (available when numDevices == 4)
|
||||
TestAllToAll(3, *dataset);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestAllToAll(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
munmap(dataset, sizeof(Dataset));
|
||||
|
||||
ValidateProcesses(pids);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(AllToAllMultiProcessCorrectnessSweep,
|
||||
@@ -87,7 +52,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4),
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
|
||||
|
||||
@@ -13,24 +13,32 @@ namespace CorrectnessTests
|
||||
class AllToAllMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, int const rank)
|
||||
static void ComputeExpectedResults(Dataset& dataset, std::vector<int> const& ranks)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[i]+dataset.NumBytes()*rank, (int8_t *)dataset.inputs[rank]+dataset.NumBytes()*i,
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
int rank = ranks[i];
|
||||
for (int j = 0; j < dataset.numDevices; j++)
|
||||
{
|
||||
HIP_CALL(hipMemcpy((int8_t *)dataset.expected[j]+dataset.NumBytes()*rank, (int8_t *)dataset.inputs[rank]+dataset.NumBytes()*j,
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestAllToAll(int rank, Dataset& dataset)
|
||||
void TestAllToAll(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollAllToAll, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, rank);
|
||||
ComputeExpectedResults(dataset, std::vector<int>(1, rank));
|
||||
|
||||
// Launch the reduction
|
||||
ncclAllToAll(dataset.inputs[rank],
|
||||
@@ -42,7 +50,7 @@ namespace CorrectnessTests
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset, rank);
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
|
||||
@@ -17,63 +17,27 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(BroadcastMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollBroadcast);
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 2)
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 0
|
||||
TestBroadcast(0, *dataset);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 1
|
||||
TestBroadcast(1, *dataset);
|
||||
if (numDevices > 2)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
|
||||
{
|
||||
// Process 2 (available when numDevices > 2)
|
||||
TestBroadcast(2, *dataset);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
|
||||
{
|
||||
// Process 3 (available when numDevices == 4)
|
||||
TestBroadcast(3, *dataset);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestBroadcast(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
munmap(dataset, sizeof(Dataset));
|
||||
|
||||
ValidateProcesses(pids);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(BroadcastMultiProcessCorrectnessSweep,
|
||||
@@ -95,7 +59,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4),
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -13,26 +13,35 @@ namespace CorrectnessTests
|
||||
class BroadcastMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, int const root, int const rank)
|
||||
static void ComputeExpectedResults(Dataset& dataset, int const root, std::vector<int> const& ranks)
|
||||
{
|
||||
// Root has the answer; share it via host memcpy's
|
||||
if (rank == root)
|
||||
for (int h = 0; h < ranks.size(); h++)
|
||||
{
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
int rank = ranks[h];
|
||||
// Root has the answer; share it via host memcpy's
|
||||
if (rank == root)
|
||||
{
|
||||
if (i == rank) continue;
|
||||
memcpy(dataset.expected[i], dataset.expected[root], dataset.NumBytes());
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
{
|
||||
if (i == rank) continue;
|
||||
memcpy(dataset.expected[i], dataset.expected[root], dataset.NumBytes());
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestBroadcast(int rank, Dataset& dataset)
|
||||
void TestBroadcast(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollBroadcast, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
@@ -41,7 +50,7 @@ namespace CorrectnessTests
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, root, rank);
|
||||
ComputeExpectedResults(dataset, root, std::vector<int>(1, rank));
|
||||
|
||||
// Launch the reduction (1 process per GPU)
|
||||
ncclResult_t res = ncclBroadcast(dataset.inputs[rank],
|
||||
@@ -53,7 +62,7 @@ namespace CorrectnessTests
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset, rank);
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
// Ensure all processes have finished current iteration before proceeding
|
||||
barrier.Wait();
|
||||
|
||||
@@ -25,60 +25,28 @@ namespace CorrectnessTests
|
||||
datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
|
||||
}
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 2)
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 0
|
||||
TestCombinedCalls(0, datasets, ncclFuncs);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 1
|
||||
TestCombinedCalls(1, datasets, ncclFuncs);
|
||||
if (numDevices > 2)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
|
||||
{
|
||||
// Process 2 (available when numDevices > 2)
|
||||
TestCombinedCalls(2, datasets, ncclFuncs);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
|
||||
{
|
||||
// Process 3 (available when numDevices == 4)
|
||||
TestCombinedCalls(3, datasets, ncclFuncs);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestCombinedCalls(gpu, datasets, ncclFuncs, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
|
||||
ValidateProcesses(pids);
|
||||
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
munmap(datasets[i], sizeof(Dataset));
|
||||
@@ -104,7 +72,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(3072, 3145728),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4),
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -20,21 +20,26 @@ namespace CorrectnessTests
|
||||
class CombinedCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
void TestCombinedCalls(int rank, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs)
|
||||
void TestCombinedCalls(int rank, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, funcs, comms[rank], streams[rank], datasets);
|
||||
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
// Compute expected results for each dataset in combined
|
||||
int const root = 0;
|
||||
AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, rank, numDevices);
|
||||
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, rank);
|
||||
BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, rank);
|
||||
ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, rank);
|
||||
ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, rank);
|
||||
std::vector<int> ranks(1, rank);
|
||||
AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
|
||||
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
|
||||
BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
|
||||
ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
|
||||
ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
|
||||
|
||||
size_t const byteCount = datasets[0]->NumBytes() / numDevices;
|
||||
size_t const elemCount = numElements / numDevices;
|
||||
@@ -64,12 +69,26 @@ namespace CorrectnessTests
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results for each collective in the combined
|
||||
for (int i = 0; i < 5; i++)
|
||||
for (int i = 0; i < funcs.size(); i++)
|
||||
{
|
||||
ValidateResults(*datasets[i], rank);
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
{
|
||||
pass = ValidateResults(*datasets[i], ranks[j], root);
|
||||
if (!pass)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
barrier.Wait();
|
||||
datasets[i]->Release(rank);
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
{
|
||||
datasets[i]->Release(ranks[j]);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
TearDownPerProcess(comms[ranks[i]], streams[ranks[i]]);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
@@ -10,63 +10,27 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(GatherMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollGather);
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 2)
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 0
|
||||
TestGather(0, *dataset);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 1
|
||||
TestGather(1, *dataset);
|
||||
if (numDevices > 2)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
|
||||
{
|
||||
// Process 2 (available when numDevices > 2)
|
||||
TestGather(2, *dataset);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
|
||||
{
|
||||
// Process 3 (available when numDevices == 4)
|
||||
TestGather(3, *dataset);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestGather(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
munmap(dataset, sizeof(Dataset));
|
||||
|
||||
ValidateProcesses(pids);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(GatherMultiProcessCorrectnessSweep,
|
||||
@@ -88,7 +52,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4),
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
|
||||
|
||||
@@ -19,11 +19,15 @@ namespace CorrectnessTests
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
|
||||
void TestGather(int rank, Dataset& dataset)
|
||||
void TestGather(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollGather, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
@@ -44,7 +48,7 @@ namespace CorrectnessTests
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset, rank, root);
|
||||
pass = ValidateResults(dataset, rank, root);
|
||||
|
||||
// Ensure all processes have finished current iteration before proceeding
|
||||
barrier.Wait();
|
||||
|
||||
@@ -25,75 +25,39 @@ namespace CorrectnessTests
|
||||
datasets[i] = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
datasets[i]->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclFuncs[i]);
|
||||
}
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
int const numGpusPerProcess = 2;
|
||||
int const numProcesses = numDevices / numGpusPerProcess;
|
||||
std::vector<int> pids(numProcesses);
|
||||
int process = -1;
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
for (int i = 0; i < numDevices; i+= numGpusPerProcess)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 4)
|
||||
process++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 4) || (pid2 > 0 && pid3 > 0 && numDevices > 4))
|
||||
{
|
||||
// Process 0
|
||||
std::vector<int> ranks;
|
||||
ranks.push_back(0);
|
||||
ranks.push_back(1);
|
||||
int gpuIdx = i;
|
||||
int maxIdx = gpuIdx + (numGpusPerProcess - 1) >= numDevices ? numDevices : gpuIdx + numGpusPerProcess;
|
||||
|
||||
TestGroupCalls(0, ranks, datasets, ncclFuncs);
|
||||
if (pid3 > 0)
|
||||
std::vector<int> ranks;
|
||||
for (; gpuIdx < maxIdx; gpuIdx++)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
ranks.push_back(gpuIdx);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 4) || (pid2 == 0 && pid3 > 0 && numDevices > 4))
|
||||
{
|
||||
// Process 1
|
||||
std::vector<int> ranks;
|
||||
ranks.push_back(2);
|
||||
ranks.push_back(3);
|
||||
TestGroupCalls(1, ranks, datasets, ncclFuncs);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices == 8)
|
||||
{
|
||||
// Process 2 (available when numDevices == 8)
|
||||
std::vector<int> ranks;
|
||||
ranks.push_back(4);
|
||||
ranks.push_back(5);
|
||||
|
||||
TestGroupCalls(2, ranks, datasets, ncclFuncs);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 8)
|
||||
{
|
||||
// Process 3 (available when numDevices == 8)
|
||||
std::vector<int> ranks;
|
||||
ranks.push_back(6);
|
||||
ranks.push_back(7);
|
||||
|
||||
TestGroupCalls(3, ranks, datasets, ncclFuncs);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestGroupCalls(process, ranks, datasets, ncclFuncs, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[process] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
|
||||
ValidateProcesses(pids);
|
||||
|
||||
for (int i = 0; i < datasets.size(); i++)
|
||||
{
|
||||
munmap(datasets[i], sizeof(Dataset));
|
||||
@@ -119,7 +83,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(3072, 3145728),
|
||||
// Number of devices
|
||||
testing::Values(4),
|
||||
testing::Values(4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -21,34 +21,50 @@ namespace CorrectnessTests
|
||||
class GroupCallsMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs)
|
||||
void TestGroupCalls(int process, std::vector<int> const& ranks, std::vector<Dataset*>& datasets, std::vector<ncclFunc_t> const& funcs, bool& pass)
|
||||
{
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
|
||||
ncclGroupStart();
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
SetUpPerProcess(ranks[i], funcs, comms[ranks[i]], streams[ranks[i]], datasets);
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
ncclGroupEnd();
|
||||
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
int numProcesses = numDevices / ranks.size();
|
||||
Barrier barrier(process, numProcesses, std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
int const root = 0;
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks[i]);
|
||||
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks[i]);
|
||||
BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks[i]);
|
||||
ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks[i]);
|
||||
ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks[i]);
|
||||
for (int j = 0; j < datasets.size(); j++)
|
||||
{
|
||||
FillDatasetWithPattern(*datasets[j], ranks[i]);
|
||||
}
|
||||
}
|
||||
|
||||
int const root = 0;
|
||||
|
||||
AllGatherMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[0], barrier, numDevices, ranks);
|
||||
AllReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[1], barrier, op, ranks);
|
||||
BroadcastMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[2], root, ranks);
|
||||
ReduceMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[3], barrier, op, root, ranks);
|
||||
ReduceScatterMultiProcessCorrectnessTest::ComputeExpectedResults(*datasets[4], barrier, op, ranks);
|
||||
barrier.Wait();
|
||||
|
||||
ncclGroupStart();
|
||||
|
||||
// AllGather
|
||||
size_t const byteCount = datasets[0]->NumBytes() / numDevices;
|
||||
size_t const elemCount = numElements / numDevices;
|
||||
|
||||
ncclGroupStart();
|
||||
// AllGather
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
@@ -90,7 +106,7 @@ namespace CorrectnessTests
|
||||
{
|
||||
int rank = ranks[i];
|
||||
ncclReduceScatter(datasets[4]->inputs[rank],
|
||||
(int8_t *)datasets[4]->outputs[rank] + (i * byteCount),
|
||||
(int8_t *)datasets[4]->outputs[rank] + (rank * byteCount),
|
||||
elemCount, dataType, op,
|
||||
comms[rank], streams[rank]);
|
||||
}
|
||||
@@ -108,7 +124,11 @@ namespace CorrectnessTests
|
||||
{
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
{
|
||||
ValidateResults(*datasets[i], ranks[j]);
|
||||
pass = ValidateResults(*datasets[i], ranks[j], root);
|
||||
if (!pass)
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
barrier.Wait();
|
||||
for (int j = 0; j < ranks.size(); j++)
|
||||
|
||||
@@ -10,63 +10,27 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ReduceMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduce);
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 2)
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 0
|
||||
TestReduce(0, *dataset);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 1
|
||||
TestReduce(1, *dataset);
|
||||
if (numDevices > 2)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
|
||||
{
|
||||
// Process 2 (available when numDevices > 2)
|
||||
TestReduce(2, *dataset);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
|
||||
{
|
||||
// Process 3 (available when numDevices == 4)
|
||||
TestReduce(3, *dataset);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestReduce(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
munmap(dataset, sizeof(Dataset));
|
||||
|
||||
ValidateProcesses(pids);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(ReduceMultiProcessCorrectnessSweep,
|
||||
@@ -88,7 +52,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4),
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -13,77 +13,93 @@ namespace CorrectnessTests
|
||||
class ReduceMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const root, int const rank)
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const root, std::vector<int> const& ranks)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
if (rank == root)
|
||||
for (int h = 0; h < ranks.size(); h++)
|
||||
{
|
||||
// Allocate temporary host array to accumulate results
|
||||
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
int rank = ranks[h];
|
||||
if (rank == root)
|
||||
{
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
// Allocate temporary host array to accumulate results
|
||||
int8_t* resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
{
|
||||
switch (dataset.dataType)
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
switch (dataset.dataType)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
|
||||
free(resultI1);
|
||||
}
|
||||
memcpy(dataset.expected[root], resultI1, dataset.NumBytes());
|
||||
free(resultI1);
|
||||
barrier.Wait();
|
||||
}
|
||||
else
|
||||
barrier.Wait();
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
barrier.Wait();
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank], dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
int rank = ranks[i];
|
||||
if (rank != root)
|
||||
{
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank], dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestReduce(int rank, Dataset& dataset)
|
||||
void TestReduce(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
SetUpPerProcess(rank, ncclCollReduce, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
@@ -92,7 +108,7 @@ namespace CorrectnessTests
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, barrier, op, root, rank);
|
||||
ComputeExpectedResults(dataset, barrier, op, root, std::vector<int>(1, rank));
|
||||
// Launch the reduction (1 process per GPU)
|
||||
ncclResult_t res = ncclReduce(dataset.inputs[rank],
|
||||
dataset.outputs[rank],
|
||||
@@ -101,7 +117,7 @@ namespace CorrectnessTests
|
||||
// Wait for reduction to complete
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
// Check results
|
||||
ValidateResults(dataset, rank);
|
||||
pass = ValidateResults(dataset, rank);
|
||||
// Ensure all processes have finished current iteration before proceeding
|
||||
barrier.Wait();
|
||||
}
|
||||
|
||||
@@ -10,63 +10,27 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ReduceScatterMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollReduceScatter);
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 2)
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 0
|
||||
TestReduceScatter(0, *dataset);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 1
|
||||
TestReduceScatter(1, *dataset);
|
||||
if (numDevices > 2)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
|
||||
{
|
||||
// Process 2 (available when numDevices > 2)
|
||||
TestReduceScatter(2, *dataset);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
|
||||
{
|
||||
// Process 3 (available when numDevices == 4)
|
||||
TestReduceScatter(3, *dataset);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestReduceScatter(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
munmap(dataset, sizeof(Dataset));
|
||||
|
||||
ValidateProcesses(pids);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(ReduceScatterMultiProcessCorrectnessSweep,
|
||||
@@ -88,7 +52,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(3072, 3145728),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4),
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false, true),
|
||||
testing::Values("")),
|
||||
|
||||
@@ -13,60 +13,68 @@ namespace CorrectnessTests
|
||||
class ReduceScatterMultiProcessCorrectnessTest : public MultiProcessCorrectnessTest
|
||||
{
|
||||
public:
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, int const rank)
|
||||
static void ComputeExpectedResults(Dataset& dataset, Barrier& barrier, ncclRedOp_t const op, std::vector<int> const& ranks)
|
||||
{
|
||||
// Copy all inputs to expected arrays temporarily to perform reduction on host
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.inputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
// Have rank 0 do the expected calculation, then send results to other processes
|
||||
int8_t* resultI1;
|
||||
if (rank == 0)
|
||||
int8_t* resultI1;
|
||||
for (int h = 0; h < ranks.size(); h++)
|
||||
{
|
||||
// Allocate temporary host array to accumulate results
|
||||
resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
int rank = ranks[h];
|
||||
if (rank == 0)
|
||||
{
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
// Allocate temporary host array to accumulate results
|
||||
resultI1 = (int8_t *)malloc(dataset.NumBytes());
|
||||
uint8_t* resultU1 = (uint8_t *)resultI1;
|
||||
int32_t* resultI4 = (int32_t *)resultI1;
|
||||
uint32_t* resultU4 = (uint32_t *)resultI1;
|
||||
int64_t* resultI8 = (int64_t *)resultI1;
|
||||
uint64_t* resultU8 = (uint64_t *)resultI1;
|
||||
float* resultF4 = (float *)resultI1;
|
||||
double* resultF8 = (double *)resultI1;
|
||||
rccl_bfloat16* resultB2 = (rccl_bfloat16 *)resultI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
// Initialize the result with the first device's array
|
||||
memcpy(resultI1, dataset.expected[0], dataset.NumBytes());
|
||||
|
||||
// Perform reduction on the other device arrays
|
||||
for (int i = 1; i < dataset.numDevices; i++)
|
||||
{
|
||||
switch (dataset.dataType)
|
||||
int8_t* arrayI1 = (int8_t *)dataset.expected[i];
|
||||
uint8_t* arrayU1 = (uint8_t *)arrayI1;
|
||||
int32_t* arrayI4 = (int32_t *)arrayI1;
|
||||
uint32_t* arrayU4 = (uint32_t *)arrayI1;
|
||||
int64_t* arrayI8 = (int64_t *)arrayI1;
|
||||
uint64_t* arrayU8 = (uint64_t *)arrayI1;
|
||||
float* arrayF4 = (float *)arrayI1;
|
||||
double* arrayF8 = (double *)arrayI1;
|
||||
rccl_bfloat16* arrayB2 = (rccl_bfloat16 *)arrayI1;
|
||||
|
||||
for (int j = 0; j < dataset.numElements; j++)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
switch (dataset.dataType)
|
||||
{
|
||||
case ncclInt8: resultI1[j] = ReduceOp(op, resultI1[j], arrayI1[j]); break;
|
||||
case ncclUint8: resultU1[j] = ReduceOp(op, resultU1[j], arrayU1[j]); break;
|
||||
case ncclInt32: resultI4[j] = ReduceOp(op, resultI4[j], arrayI4[j]); break;
|
||||
case ncclUint32: resultU4[j] = ReduceOp(op, resultU4[j], arrayU4[j]); break;
|
||||
case ncclInt64: resultI8[j] = ReduceOp(op, resultI8[j], arrayI8[j]); break;
|
||||
case ncclUint64: resultU8[j] = ReduceOp(op, resultU8[j], arrayU8[j]); break;
|
||||
case ncclFloat32: resultF4[j] = ReduceOp(op, resultF4[j], arrayF4[j]); break;
|
||||
case ncclFloat64: resultF8[j] = ReduceOp(op, resultF8[j], arrayF8[j]); break;
|
||||
case ncclBfloat16: resultB2[j] = ReduceOp(op, resultB2[j], arrayB2[j]); break;
|
||||
default:
|
||||
fprintf(stderr, "[ERROR] Unsupported datatype\n");
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -75,34 +83,44 @@ namespace CorrectnessTests
|
||||
// Copy results into expected arrays
|
||||
size_t const byteCount = dataset.NumBytes() / dataset.numDevices;
|
||||
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
|
||||
for (int i = 0; i < ranks.size(); i++)
|
||||
{
|
||||
int rank = ranks[i];
|
||||
HIP_CALL(hipMemcpy(dataset.expected[rank], dataset.outputs[rank],
|
||||
dataset.NumBytes(), hipMemcpyDeviceToHost));
|
||||
}
|
||||
barrier.Wait();
|
||||
|
||||
if (rank == 0)
|
||||
for (int h = 0; h < ranks.size(); h++)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
|
||||
resultI1 + (i * byteCount), byteCount);
|
||||
int rank = ranks[h];
|
||||
if (rank == 0)
|
||||
{
|
||||
for (int i = 0; i < dataset.numDevices; i++)
|
||||
memcpy((int8_t *)dataset.expected[i] + (i * byteCount),
|
||||
resultI1 + (i * byteCount), byteCount);
|
||||
|
||||
free(resultI1);
|
||||
free(resultI1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void TestReduceScatter(int rank, Dataset& dataset)
|
||||
void TestReduceScatter(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
SetUpPerProcess(rank, ncclCollAllGather, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numElements % numDevices != 0) return;
|
||||
if (numDevices > numDevicesAvailable || numElements % numDevices != 0)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
// Prepare input / output / expected results
|
||||
FillDatasetWithPattern(dataset, rank);
|
||||
ComputeExpectedResults(dataset, barrier, op, rank);
|
||||
ComputeExpectedResults(dataset, barrier, op, std::vector<int>(1, rank));
|
||||
|
||||
size_t const byteCount = dataset.NumBytes() / numDevices;
|
||||
size_t const recvCount = dataset.numElements / numDevices;
|
||||
@@ -117,7 +135,7 @@ namespace CorrectnessTests
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset, rank);
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
TearDownPerProcess(comms[rank], streams[rank]);
|
||||
dataset.Release(rank);
|
||||
|
||||
@@ -10,63 +10,27 @@ namespace CorrectnessTests
|
||||
{
|
||||
TEST_P(ScatterMultiProcessCorrectnessTest, Correctness)
|
||||
{
|
||||
Dataset* dataset = (Dataset*)mmap(NULL, sizeof(Dataset), PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
|
||||
dataset->InitializeRootProcess(numDevices, numElements, dataType, inPlace, ncclCollScatter);
|
||||
Barrier::ClearShmFiles(std::atoi(getenv("NCCL_COMM_ID")));
|
||||
std::vector<int> pids(numDevices);
|
||||
|
||||
int pid1 = 0;
|
||||
int pid2 = 0;
|
||||
int pid3 = 0;
|
||||
pid1 = fork();
|
||||
|
||||
// From this point on, ignore original process as we cannot have it create a HIP context
|
||||
if (pid1 == 0)
|
||||
int gpu = -1;
|
||||
for (int i = 0; i < numDevices; i++)
|
||||
{
|
||||
pid2 = fork();
|
||||
if (numDevices > 2)
|
||||
gpu++;
|
||||
int pid = fork();
|
||||
if (pid == 0)
|
||||
{
|
||||
pid3 = fork();
|
||||
}
|
||||
if ((pid2 > 0 && pid3 == 0 && numDevices == 2) || (pid2 > 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 0
|
||||
TestScatter(0, *dataset);
|
||||
if (pid3 > 0)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
}
|
||||
else if ((pid2 == 0 && pid3 == 0 && numDevices == 2) || (pid2 == 0 && pid3 > 0 && numDevices > 2))
|
||||
{
|
||||
// Process 1
|
||||
TestScatter(1, *dataset);
|
||||
if (numDevices > 2)
|
||||
{
|
||||
waitpid(pid3, NULL, 0);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 > 0 && pid3 == 0 && numDevices > 2)
|
||||
{
|
||||
// Process 2 (available when numDevices > 2)
|
||||
TestScatter(2, *dataset);
|
||||
exit(0);
|
||||
}
|
||||
else if (pid2 == 0 && pid3 == 0 && numDevices == 4)
|
||||
{
|
||||
// Process 3 (available when numDevices == 4)
|
||||
TestScatter(3, *dataset);
|
||||
exit(0);
|
||||
bool pass;
|
||||
TestScatter(gpu, *dataset, pass);
|
||||
TerminateChildProcess(pass);
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(0);
|
||||
pids[gpu] = pid;
|
||||
}
|
||||
waitpid(pid2, NULL, 0);
|
||||
exit(0);
|
||||
}
|
||||
waitpid(pid1, NULL, 0);
|
||||
munmap(dataset, sizeof(Dataset));
|
||||
|
||||
ValidateProcesses(pids);
|
||||
}
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(ScatterMultiProcessCorrectnessSweep,
|
||||
@@ -88,7 +52,7 @@ namespace CorrectnessTests
|
||||
// Number of elements
|
||||
testing::Values(1024, 1048576),
|
||||
// Number of devices
|
||||
testing::Values(2,3,4),
|
||||
testing::Values(2,3,4,8),
|
||||
// In-place or not
|
||||
testing::Values(false),
|
||||
testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
|
||||
|
||||
@@ -23,12 +23,16 @@ namespace CorrectnessTests
|
||||
}
|
||||
}
|
||||
|
||||
void TestScatter(int rank, Dataset& dataset)
|
||||
void TestScatter(int rank, Dataset& dataset, bool& pass)
|
||||
{
|
||||
// Prepare input / output / expected results
|
||||
SetUpPerProcess(rank, ncclCollScatter, comms[rank], streams[rank], dataset);
|
||||
|
||||
if (numDevices > numDevicesAvailable) return;
|
||||
if (numDevices > numDevicesAvailable)
|
||||
{
|
||||
pass = true;
|
||||
return;
|
||||
}
|
||||
|
||||
Barrier barrier(rank, numDevices, std::atoi(getenv("NCCL_COMM_ID")));
|
||||
|
||||
@@ -50,7 +54,7 @@ namespace CorrectnessTests
|
||||
HIP_CALL(hipStreamSynchronize(streams[rank]));
|
||||
|
||||
// Check results
|
||||
ValidateResults(dataset, rank);
|
||||
pass = ValidateResults(dataset, rank);
|
||||
|
||||
barrier.Wait();
|
||||
}
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur