From 526cce9bf41bb499fa95901cc2210f340477d75b Mon Sep 17 00:00:00 2001 From: Nusrat Islam Date: Thu, 6 Jun 2024 07:56:41 -0500 Subject: [PATCH] graph: restrict maxChannels to 64 for multi-node and RCCL_ENABLE_INTRANET=1 --- src/graph/connect.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/graph/connect.cc b/src/graph/connect.cc index d59c092981..5e1f7e4c89 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -627,6 +627,10 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa // Only use full MAXCHANNELS for gfx94x int maxChannels = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx94") ? MAXCHANNELS : 2*CHANNEL_LIMIT; + if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) { + maxChannels = std::min(64, maxChannels); + } + // Duplicate ringPrev/ringNext for ncclBuildRing if (nChannels <= maxChannels/2) memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); if (nChannels <= maxChannels/2) memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));