From 6ab20a7c6bfa1d67c3b5b377af05c20e8d4bcda5 Mon Sep 17 00:00:00 2001 From: Nusrat Islam Date: Tue, 4 Jun 2024 15:22:38 -0500 Subject: [PATCH 1/2] graph: fix multi-node minChannel count --- src/graph/connect.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/graph/connect.cc b/src/graph/connect.cc index f00a34fd8c..d59c092981 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -668,11 +668,9 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext); } - int minNchannels = 64; - if (comm->nNodes == 1) { - minNchannels = ncclMinNchannels(); - } else { - minNchannels = std::min(64,ncclMinNchannels()); + int minNchannels = ncclMinNchannels(); + if (comm->nNodes > 1) { + minNchannels = std::min(64, maxChannels); } if (mscclEnabled() && (comm->topo->mscclEnabled || mscclForceEnabled())) { From 526cce9bf41bb499fa95901cc2210f340477d75b Mon Sep 17 00:00:00 2001 From: Nusrat Islam Date: Thu, 6 Jun 2024 07:56:41 -0500 Subject: [PATCH 2/2] graph: restrict maxChannels to 64 for multi-node and RCCL_ENABLE_INTRANET=1 --- src/graph/connect.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/graph/connect.cc b/src/graph/connect.cc index d59c092981..5e1f7e4c89 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -627,6 +627,10 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa // Only use full MAXCHANNELS for gfx94x int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? MAXCHANNELS : 2*CHANNEL_LIMIT; + if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) { + maxChannels = std::min(64, maxChannels); + } + // Duplicate ringPrev/ringNext for ncclBuildRing if (nChannels <= maxChannels/2) memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); if (nChannels <= maxChannels/2) memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));