From de7ea612d72020681653be20cb8c89999d5d15d9 Mon Sep 17 00:00:00 2001 From: saurabhAMD <160164138+saurabhAMD@users.noreply.github.com> Date: Tue, 25 Jun 2024 10:10:10 -0500 Subject: [PATCH] Unit Tests for testing channels (#1222) [ROCm/rccl commit: e170f41ddd8a5a769f35ebed8abc15b39400f067] --- projects/rccl/src/graph/connect.cc | 6 ++++- projects/rccl/test/AllReduceTests.cpp | 37 +++++++++++++------------ projects/rccl/test/AllToAllTests.cpp | 39 ++++++++++++++------------- projects/rccl/test/common/TestBed.cpp | 6 ++++- projects/rccl/test/common/TestBed.hpp | 3 ++- 5 files changed, 53 insertions(+), 38 deletions(-) diff --git a/projects/rccl/src/graph/connect.cc b/projects/rccl/src/graph/connect.cc index 871750e0f7..de3e992de3 100644 --- a/projects/rccl/src/graph/connect.cc +++ b/projects/rccl/src/graph/connect.cc @@ -674,7 +674,11 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa int minNchannels = ncclMinNchannels(); if (comm->nNodes > 1) { - minNchannels = std::min(64, minNchannels); + minNchannels = std::min(64, maxChannels); + } + if (comm->nRanks < 8 && 64 < minNchannels) { + minNchannels = 2; + WARN("NCCL_MIN_NCHANNELS set by environment is ignored due to less than 8 GPUs."); } if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled())) { diff --git a/projects/rccl/test/AllReduceTests.cpp b/projects/rccl/test/AllReduceTests.cpp index 23f885ca58..5d7d8cdfc0 100644 --- a/projects/rccl/test/AllReduceTests.cpp +++ b/projects/rccl/test/AllReduceTests.cpp @@ -105,23 +105,26 @@ namespace RcclUnitTesting TEST(AllReduce, Channels) { TestBed testBed; - if(testBed.ev.isGfx94) { - // Configuration - std::vector const funcTypes = {ncclCollAllReduce}; - std::vector const dataTypes = {ncclBfloat16, ncclHalf}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {64 * 1024 * 1024, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {false}; - std::vector const useHipGraphList = {false, true}; - std::vector const channelList = {"56", "84", "112"}; - for (auto channel : channelList) { - setenv("NCCL_MIN_NCHANNELS", channel, 1); - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, - inPlaceList, managedMemList, useHipGraphList); - testBed.Finalize(); - unsetenv("NCCL_MIN_NCHANNELS"); + if(testBed.ev.maxGpus >= 8) { + if(testBed.ev.isGfx94) { + // Configuration + std::vector const funcTypes = {ncclCollAllReduce}; + std::vector const dataTypes = {ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {64 * 1024 * 1024, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; + std::vector const channelList = {"84", "112"}; + bool const enableSweep = false; + for (auto channel : channelList) { + setenv("NCCL_MIN_NCHANNELS", channel, 1); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList, enableSweep); + testBed.Finalize(); + unsetenv("NCCL_MIN_NCHANNELS"); + } } } } diff --git a/projects/rccl/test/AllToAllTests.cpp b/projects/rccl/test/AllToAllTests.cpp index 07e88e751d..926cf41488 100644 --- a/projects/rccl/test/AllToAllTests.cpp +++ b/projects/rccl/test/AllToAllTests.cpp @@ -89,24 +89,27 @@ namespace RcclUnitTesting TEST(AllToAll, Channels) { TestBed testBed; - if(testBed.ev.isGfx94) { - // Configuration - std::vector const funcTypes = {ncclCollAllToAll}; - std::vector const dataTypes = {ncclBfloat16, ncclHalf}; - std::vector const redOps = {ncclSum}; - std::vector const roots = {0}; - std::vector const numElements = {64 * 1024 * 1024, 1024}; - std::vector const inPlaceList = {false}; - std::vector const managedMemList = {false}; - std::vector const useHipGraphList = {false, true}; - std::vector const channelList = {"56", "84", "112"}; - for (auto channel : channelList) { - setenv("NCCL_MIN_NCHANNELS", channel, 1); - testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, - inPlaceList, managedMemList, useHipGraphList); - testBed.Finalize(); - unsetenv("NCCL_MIN_NCHANNELS"); + if(testBed.ev.maxGpus >= 8) { + if(testBed.ev.isGfx94) { + // Configuration + std::vector const funcTypes = {ncclCollAllToAll}; + std::vector const dataTypes = {ncclBfloat16}; + std::vector const redOps = {ncclSum}; + std::vector const roots = {0}; + std::vector const numElements = {64 * 1024 * 1024, 1024}; + std::vector const inPlaceList = {false}; + std::vector const managedMemList = {false}; + std::vector const useHipGraphList = {false, true}; + std::vector const channelList = {"112"}; + bool const enableSweep = false; + for (auto channel : channelList) { + setenv("NCCL_MIN_NCHANNELS", channel, 1); + testBed.RunSimpleSweep(funcTypes, dataTypes, redOps, roots, numElements, + inPlaceList, managedMemList, useHipGraphList, enableSweep); + testBed.Finalize(); + unsetenv("NCCL_MIN_NCHANNELS"); + } } } } -} +} \ No newline at end of file diff --git a/projects/rccl/test/common/TestBed.cpp b/projects/rccl/test/common/TestBed.cpp index df6f5f7593..251ebfd8ae 100644 --- a/projects/rccl/test/common/TestBed.cpp +++ b/projects/rccl/test/common/TestBed.cpp @@ -616,7 +616,8 @@ namespace RcclUnitTesting std::vector const& numElements, std::vector const& inPlaceList, std::vector const& managedMemList, - std::vector const& useHipGraphList) + std::vector const& useHipGraphList, + bool const& enableSweep) { // Sort numElements in descending order to cut down on # of allocations std::vector sortedN = numElements; @@ -662,6 +663,9 @@ namespace RcclUnitTesting // Test either single process all GPUs, or 1 process per GPU int const numChildren = isMultiProcess ? numGpus : 1; int const numRanks = numGpus*ranksPerGpu; + if(enableSweep == false && (numGpus < 8 || numRanks < 8)) { + continue; + } this->InitComms(TestBed::GetDeviceIdsList(numChildren, numGpus, ranksPerGpu)); if (testing::Test::HasFailure()) { diff --git a/projects/rccl/test/common/TestBed.hpp b/projects/rccl/test/common/TestBed.hpp index dbc251ddf5..e5e27e4b81 100644 --- a/projects/rccl/test/common/TestBed.hpp +++ b/projects/rccl/test/common/TestBed.hpp @@ -159,7 +159,8 @@ namespace RcclUnitTesting std::vector const& numElements, std::vector const& inPlaceList, std::vector const& managedMemList, - std::vector const& useHipGraphList); + std::vector const& useHipGraphList, + bool const& enableSweep = true); // Wait for user-input if in interactive mode void InteractiveWait(std::string message);