Fix channel overuse for 1 rank comms (#2760)
* Fix channel overuse for 1 rank comms * limit channels when warpSpeed is enabled but not used * enable std::min check against # of CUs for maxChannels computation when warpSpeed is enabled --------- Co-authored-by: Mustafa Abduljabbar <muabdulj@amd.com> Co-authored-by: isaki001 <ioannissakiotis@gmail.com> Co-authored-by: Corey Derochie <161367113+corey-derochie-amd@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
b509e9bd77
commit
58c203e252
@@ -722,6 +722,11 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
|||||||
int maxChannels;
|
int maxChannels;
|
||||||
int minNchannels, maxNchannels;
|
int minNchannels, maxNchannels;
|
||||||
int duplicateCount = 1;
|
int duplicateCount = 1;
|
||||||
|
#ifdef ENABLE_WARP_SPEED
|
||||||
|
const int wsEnabled = comm->topo->warpSpeedEnabled;
|
||||||
|
const bool singleNode = comm->nNodes == 1;
|
||||||
|
#endif
|
||||||
|
|
||||||
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
|
NCCLCHECK(ncclCalloc(&ringRecv, nNodes*MAXCHANNELS));
|
||||||
NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
|
NCCLCHECKGOTO(ncclCalloc(&ringSend, nNodes*MAXCHANNELS), ret, fail);
|
||||||
NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
|
NCCLCHECKGOTO(ncclCalloc(&ringPrev, nranks*MAXCHANNELS), ret, fail);
|
||||||
@@ -805,17 +810,16 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef ENABLE_WARP_SPEED
|
|
||||||
// Only use full MAXCHANNELS for gfx942 (MI300X) and gfx950
|
// Only use full MAXCHANNELS for gfx942 (MI300X) and gfx950
|
||||||
maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
|
maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
|
||||||
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))
|
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))
|
||||||
? MAXCHANNELS : 2*CHANNEL_LIMIT;
|
? MAXCHANNELS : 2*CHANNEL_LIMIT;
|
||||||
|
#ifdef ENABLE_WARP_SPEED
|
||||||
|
if(!wsEnabled){
|
||||||
|
maxChannels = std::min((singleNode? RCCL_MI3XX_MAX_SINGLE_NODE_CHANNELS : RCCL_MI3XX_MAX_MULTI_NODE_CHANNELS), maxChannels);
|
||||||
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
// Only use full MAXCHANNELS for gfx942 (MI300X) and gfx950
|
|
||||||
maxChannels = (IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") ||
|
|
||||||
IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx950"))
|
|
||||||
? std::min(comm->topo->nodes[GPU].nodes[0].gpu.cu, MAXCHANNELS) : 2*CHANNEL_LIMIT;
|
|
||||||
if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) {
|
if (graphs[NCCL_ALGO_RING]->nIntraChannels > 0 || comm->nNodes > 1) {
|
||||||
maxChannels = std::min(64, maxChannels);
|
maxChannels = std::min(64, maxChannels);
|
||||||
}
|
}
|
||||||
@@ -849,7 +853,6 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
|||||||
|
|
||||||
// Duplication should be complete now
|
// Duplication should be complete now
|
||||||
nChannels = comm->nChannels = std::min(maxChannels, (nChannels <= maxChannels/2) ? nChannels*2 : nChannels);
|
nChannels = comm->nChannels = std::min(maxChannels, (nChannels <= maxChannels/2) ? nChannels*2 : nChannels);
|
||||||
|
|
||||||
// Setup CollNet
|
// Setup CollNet
|
||||||
if (comm->config.collnetEnable) {
|
if (comm->config.collnetEnable) {
|
||||||
struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
|
struct ncclTopoGraph* collNetChainGraph = graphs[NCCL_ALGO_COLLNET_CHAIN];
|
||||||
|
|||||||
@@ -50,6 +50,8 @@ typedef enum RcclTunableColls {
|
|||||||
#define RCCL_DEFAULT_MAX_NTHREADS 256 // for Simple and LL64/LL128 other archs
|
#define RCCL_DEFAULT_MAX_NTHREADS 256 // for Simple and LL64/LL128 other archs
|
||||||
#define RCCL_LL_MAX_NTHREADS 256
|
#define RCCL_LL_MAX_NTHREADS 256
|
||||||
#define RCCL_P2P_MAX_NTHREADS 256
|
#define RCCL_P2P_MAX_NTHREADS 256
|
||||||
|
#define RCCL_MI3XX_MAX_MULTI_NODE_CHANNELS 64 // cap on maxChannels in ncclTopoPostset when comm->nNodes != 1 and warpSpeed is compiled in but not enabled
|
||||||
|
#define RCCL_MI3XX_MAX_SINGLE_NODE_CHANNELS 56 // cap on maxChannels in ncclTopoPostset when comm->nNodes == 1 and warpSpeed is compiled in but not enabled
|
||||||
|
|
||||||
typedef enum {
|
typedef enum {
|
||||||
RCCL_VALUE_UNSET = -2,
|
RCCL_VALUE_UNSET = -2,
|
||||||
|
|||||||
@@ -1611,6 +1611,11 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
|
|||||||
allGather3Data[rank].cpuVendor = comm->cpuVendor;
|
allGather3Data[rank].cpuVendor = comm->cpuVendor;
|
||||||
|
|
||||||
comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
|
comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
|
||||||
|
|
||||||
|
// For a 1‑rank job there’s no topology constraint, so ncclTopoCompute drives the ring to its allowed maximum, which results in 4 x MAXCHANNELS channels for single-rank comms and causes issues.
|
||||||
|
if (comm->nRanks == 1) {
|
||||||
|
comm->nChannels = treeGraph->nChannels = ringGraph->nChannels = 8;
|
||||||
|
}
|
||||||
NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
|
NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
|
||||||
|
|
||||||
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
|
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
|
||||||
|
|||||||
Reference in New Issue
Block a user