diff --git a/projects/rccl/src/graph/connect.cc b/projects/rccl/src/graph/connect.cc index 6a366bb519..cc2c8210e7 100644 --- a/projects/rccl/src/graph/connect.cc +++ b/projects/rccl/src/graph/connect.cc @@ -722,6 +722,11 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa int maxChannels; int minNchannels, maxNchannels; int duplicateCount = 1; +#ifdef ENABLE_WARP_SPEED + const int wsEnabled = comm->topo->warpSpeedEnabled; + const bool singleNode = comm->nNodes == 1; +#endif + NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS)); NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail); NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail); @@ -805,17 +810,16 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa } } -#ifdef ENABLE_WARP_SPEED // Only use full MAXCHANNELS for gfx942 (MI300X) and gfx950 maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) ? MAXCHANNELS : 2*CHANNEL_LIMIT; - +#ifdef ENABLE_WARP_SPEED + if(!wsEnabled){ + maxChannels = std::min((singleNode? RCCL_MI3XX_MAX_SINGLE_NODE_CHANNELS : RCCL_MI3XX_MAX_MULTI_NODE_CHANNELS), maxChannels); + } + #else - // Only use full MAXCHANNELS for gfx942 (MI300X) and gfx950 - maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || - IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950")) - ? std::min(comm->topo->nodes[GPU].nodes[0].gpu.cu, MAXCHANNELS) : 2*CHANNEL_LIMIT; if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) { maxChannels = std::min(64, maxChannels); } @@ -849,7 +853,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa // Duplication should be complete now nChannels = comm->nChannels = std::min(maxChannels, (nChannels <= maxChannels/2) ? 
nChannels*2 : nChannels); - // Setup CollNet if (comm->config.collnetEnable) { struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN]; diff --git a/projects/rccl/src/include/rccl_common.h b/projects/rccl/src/include/rccl_common.h index 435ddbe0d3..dd3a4b396e 100644 --- a/projects/rccl/src/include/rccl_common.h +++ b/projects/rccl/src/include/rccl_common.h @@ -50,6 +50,8 @@ typedef enum RcclTunableColls { #define RCCL_DEFAULT_MAX_NTHREADS 256 // for Simple and LL64/LL128 other archs #define RCCL_LL_MAX_NTHREADS 256 #define RCCL_P2P_MAX_NTHREADS 256 +#define RCCL_MI3XX_MAX_MULTI_NODE_CHANNELS 64 +#define RCCL_MI3XX_MAX_SINGLE_NODE_CHANNELS 56 typedef enum { RCCL_VALUE_UNSET = -2, diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 130dd60af6..a5e1ea83ee 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1611,6 +1611,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p allGather3Data[rank].cpuVendor = comm->cpuVendor; comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels); + + // For a 1-rank job there's no topology constraint, so ncclTopoCompute drives the ring to its allowed maximum, which results in 4 x MAXCHANNELS channels for single-rank comms and causes issues. + if (comm->nRanks == 1) { + comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = 8; + } NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail); NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);