Unit Tests for testing channels (#1222)
This commit is contained in:
@@ -674,7 +674,11 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
|
||||
int minNchannels = ncclMinNchannels();
|
||||
if (comm->nNodes > 1) {
|
||||
minNchannels = std::min(64, minNchannels);
|
||||
minNchannels = std::min(64, maxChannels);
|
||||
}
|
||||
if (comm->nRanks < 8 && 64 < minNchannels) {
|
||||
minNchannels = 2;
|
||||
WARN("NCCL_MIN_NCHANNELS set by environment is ignored due to less than 8 GPUs.");
|
||||
}
|
||||
|
||||
if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled())) {
|
||||
|
||||
+20
-17
@@ -105,23 +105,26 @@ namespace RcclUnitTesting
|
||||
TEST(AllReduce, Channels)
|
||||
{
|
||||
TestBed testBed;
|
||||
if(testBed.ev.isGfx94) {
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclBfloat16, ncclHalf};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {64 * 1024 * 1024, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
std::vector<bool> const useHipGraphList = {false, true};
|
||||
std::vector<char *> const channelList = {"56", "84", "112"};
|
||||
for (auto channel : channelList) {
|
||||
setenv("NCCL_MIN_NCHANNELS", channel, 1);
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
|
||||
inPlaceList, managedMemList, useHipGraphList);
|
||||
testBed.Finalize();
|
||||
unsetenv("NCCL_MIN_NCHANNELS");
|
||||
if(testBed.ev.maxGpus >= 8) {
|
||||
if(testBed.ev.isGfx94) {
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {64 * 1024 * 1024, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
std::vector<bool> const useHipGraphList = {false, true};
|
||||
// NCCL_MIN_NCHANNELS values to test. String literals are const, so they are
// held as char const* (setenv only needs a const char* value).
std::vector<char const*> const channelList = {"84", "112"};
|
||||
bool const enableSweep = false;
|
||||
for (auto channel : channelList) {
|
||||
setenv("NCCL_MIN_NCHANNELS", channel, 1);
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
|
||||
inPlaceList, managedMemList, useHipGraphList, enableSweep);
|
||||
testBed.Finalize();
|
||||
unsetenv("NCCL_MIN_NCHANNELS");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+21
-18
@@ -89,24 +89,27 @@ namespace RcclUnitTesting
|
||||
TEST(AllToAll, Channels)
|
||||
{
|
||||
TestBed testBed;
|
||||
if(testBed.ev.isGfx94) {
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllToAll};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclBfloat16, ncclHalf};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {64 * 1024 * 1024, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
std::vector<bool> const useHipGraphList = {false, true};
|
||||
std::vector<char *> const channelList = {"56", "84", "112"};
|
||||
for (auto channel : channelList) {
|
||||
setenv("NCCL_MIN_NCHANNELS", channel, 1);
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
|
||||
inPlaceList, managedMemList, useHipGraphList);
|
||||
testBed.Finalize();
|
||||
unsetenv("NCCL_MIN_NCHANNELS");
|
||||
if(testBed.ev.maxGpus >= 8) {
|
||||
if(testBed.ev.isGfx94) {
|
||||
// Configuration
|
||||
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllToAll};
|
||||
std::vector<ncclDataType_t> const dataTypes = {ncclBfloat16};
|
||||
std::vector<ncclRedOp_t> const redOps = {ncclSum};
|
||||
std::vector<int> const roots = {0};
|
||||
std::vector<int> const numElements = {64 * 1024 * 1024, 1024};
|
||||
std::vector<bool> const inPlaceList = {false};
|
||||
std::vector<bool> const managedMemList = {false};
|
||||
std::vector<bool> const useHipGraphList = {false, true};
|
||||
// NCCL_MIN_NCHANNELS value to test. String literals are const, so they are
// held as char const* (setenv only needs a const char* value).
std::vector<char const*> const channelList = {"112"};
|
||||
bool const enableSweep = false;
|
||||
for (auto channel : channelList) {
|
||||
setenv("NCCL_MIN_NCHANNELS", channel, 1);
|
||||
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
|
||||
inPlaceList, managedMemList, useHipGraphList, enableSweep);
|
||||
testBed.Finalize();
|
||||
unsetenv("NCCL_MIN_NCHANNELS");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -616,7 +616,8 @@ namespace RcclUnitTesting
|
||||
std::vector<int> const& numElements,
|
||||
std::vector<bool> const& inPlaceList,
|
||||
std::vector<bool> const& managedMemList,
|
||||
std::vector<bool> const& useHipGraphList)
|
||||
std::vector<bool> const& useHipGraphList,
|
||||
bool const& enableSweep)
|
||||
{
|
||||
// Sort numElements in descending order to cut down on # of allocations
|
||||
std::vector<int> sortedN = numElements;
|
||||
@@ -662,6 +663,9 @@ namespace RcclUnitTesting
|
||||
// Test either single process all GPUs, or 1 process per GPU
|
||||
int const numChildren = isMultiProcess ? numGpus : 1;
|
||||
int const numRanks = numGpus*ranksPerGpu;
|
||||
if(enableSweep == false && (numGpus < 8 || numRanks < 8)) {
|
||||
continue;
|
||||
}
|
||||
this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
|
||||
if (testing::Test::HasFailure())
|
||||
{
|
||||
|
||||
@@ -159,7 +159,8 @@ namespace RcclUnitTesting
|
||||
std::vector<int> const& numElements,
|
||||
std::vector<bool> const& inPlaceList,
|
||||
std::vector<bool> const& managedMemList,
|
||||
std::vector<bool> const& useHipGraphList);
|
||||
std::vector<bool> const& useHipGraphList,
|
||||
bool const& enableSweep = true);
|
||||
|
||||
// Wait for user-input if in interactive mode
|
||||
void InteractiveWait(std::string message);
|
||||
|
||||
Reference in New Issue
Block a user