Unit Tests for testing channels (#1222)

This commit is contained in:
saurabhAMD
2024-06-25 10:10:10 -05:00
zatwierdzone przez GitHub
rodzic 5f2b88bc28
commit e170f41ddd
5 zmienionych plików z 53 dodań i 38 usunięć
+5 -1
Wyświetl plik
@@ -674,7 +674,11 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
int minNchannels = ncclMinNchannels();
if (comm->nNodes > 1) {
minNchannels = std::min(64, minNchannels);
minNchannels = std::min(64, maxChannels);
}
if (comm->nRanks < 8 && 64 < minNchannels) {
minNchannels = 2;
WARN("NCCL_MIN_NCHANNELS set by environment is ignored due to less than 8 GPUs.");
}
if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled())) {
+20 -17
Wyświetl plik
@@ -105,23 +105,26 @@ namespace RcclUnitTesting
TEST(AllReduce, Channels)
{
TestBed testBed;
if(testBed.ev.isGfx94) {
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclBfloat16, ncclHalf};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {64 * 1024 * 1024, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {false, true};
std::vector<char *> const channelList = {"56", "84", "112"};
for (auto channel : channelList) {
setenv("NCCL_MIN_NCHANNELS", channel, 1);
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList);
testBed.Finalize();
unsetenv("NCCL_MIN_NCHANNELS");
if(testBed.ev.maxGpus >= 8) {
if(testBed.ev.isGfx94) {
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllReduce};
std::vector<ncclDataType_t> const dataTypes = {ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {64 * 1024 * 1024, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {false, true};
std::vector<char *> const channelList = {"84", "112"};
bool const enableSweep = false;
for (auto channel : channelList) {
setenv("NCCL_MIN_NCHANNELS", channel, 1);
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList, enableSweep);
testBed.Finalize();
unsetenv("NCCL_MIN_NCHANNELS");
}
}
}
}
+21 -18
Wyświetl plik
@@ -89,24 +89,27 @@ namespace RcclUnitTesting
TEST(AllToAll, Channels)
{
TestBed testBed;
if(testBed.ev.isGfx94) {
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllToAll};
std::vector<ncclDataType_t> const dataTypes = {ncclBfloat16, ncclHalf};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {64 * 1024 * 1024, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {false, true};
std::vector<char *> const channelList = {"56", "84", "112"};
for (auto channel : channelList) {
setenv("NCCL_MIN_NCHANNELS", channel, 1);
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList);
testBed.Finalize();
unsetenv("NCCL_MIN_NCHANNELS");
if(testBed.ev.maxGpus >= 8) {
if(testBed.ev.isGfx94) {
// Configuration
std::vector<ncclFunc_t> const funcTypes = {ncclCollAllToAll};
std::vector<ncclDataType_t> const dataTypes = {ncclBfloat16};
std::vector<ncclRedOp_t> const redOps = {ncclSum};
std::vector<int> const roots = {0};
std::vector<int> const numElements = {64 * 1024 * 1024, 1024};
std::vector<bool> const inPlaceList = {false};
std::vector<bool> const managedMemList = {false};
std::vector<bool> const useHipGraphList = {false, true};
std::vector<char *> const channelList = {"112"};
bool const enableSweep = false;
for (auto channel : channelList) {
setenv("NCCL_MIN_NCHANNELS", channel, 1);
testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements,
inPlaceList, managedMemList, useHipGraphList, enableSweep);
testBed.Finalize();
unsetenv("NCCL_MIN_NCHANNELS");
}
}
}
}
}
}
+5 -1
Wyświetl plik
@@ -616,7 +616,8 @@ namespace RcclUnitTesting
std::vector<int> const& numElements,
std::vector<bool> const& inPlaceList,
std::vector<bool> const& managedMemList,
std::vector<bool> const& useHipGraphList)
std::vector<bool> const& useHipGraphList,
bool const& enableSweep)
{
// Sort numElements in descending order to cut down on # of allocations
std::vector<int> sortedN = numElements;
@@ -662,6 +663,9 @@ namespace RcclUnitTesting
// Test either single process all GPUs, or 1 process per GPU
int const numChildren = isMultiProcess ? numGpus : 1;
int const numRanks = numGpus*ranksPerGpu;
if(enableSweep == false && (numGpus < 8 || numRanks < 8)) {
continue;
}
this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu));
if (testing::Test::HasFailure())
{
+2 -1
Wyświetl plik
@@ -159,7 +159,8 @@ namespace RcclUnitTesting
std::vector<int> const& numElements,
std::vector<bool> const& inPlaceList,
std::vector<bool> const& managedMemList,
std::vector<bool> const& useHipGraphList);
std::vector<bool> const& useHipGraphList,
bool const& enableSweep = true);
// Wait for user-input if in interactive mode
void InteractiveWait(std::string message);