Optimize alltoall for 64 GPUs and above for gfx942 (#1828)
Add pxn and p2p net chunksize mi300x tuning
[ROCm/rccl commit: 4ce3df8d3a]
This commit is contained in:
committed by
GitHub
parent
cca5172260
commit
cafd7a5126
@@ -591,7 +591,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// The default value of PXN_DISABLE may be overridden at runtime by rcclSetPxn() in src/rccl_wrap.cc
|
||||
NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
|
||||
|
||||
// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
|
||||
@@ -603,7 +603,8 @@ int ncclPxnDisable(struct ncclComm* comm) {
|
||||
INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
|
||||
pxnDisable = 1;
|
||||
} else {
|
||||
pxnDisable = ncclParamPxnDisable();
|
||||
rcclSetPxn(comm, pxnDisable);
|
||||
pxnDisable = (pxnDisable > RCCL_VALUE_INVALID)? pxnDisable : ncclParamPxnDisable();
|
||||
}
|
||||
}
|
||||
return pxnDisable;
|
||||
|
||||
@@ -41,6 +41,11 @@ typedef enum RcclTunableColls {
|
||||
#define RCCL_PROTOCOL_FACTOR_IDX 2
|
||||
#define RCCL_PROTOCOL_THREAD_THRESHOLD_IDX 3
|
||||
|
||||
/*
 * Sentinel states for integer settings that RCCL may choose on its own.
 * Both are negative, so code that receives such a setting can test
 * `value > RCCL_VALUE_INVALID` to tell a real RCCL choice from a sentinel.
 */
typedef enum {
  /* Nothing has been decided yet; used as the initial value of a cached setting. */
  RCCL_VALUE_UNSET = -2,
  /* RCCL declines to pick a value; the consumer should use its own default. */
  RCCL_VALUE_INVALID = -1
} rcclValueState_t;
|
||||
|
||||
#ifdef RCCL_EXPOSE_STATIC
|
||||
#define RCCL_STATIC_EXPOSE_CHECK()
|
||||
#else
|
||||
@@ -78,7 +83,8 @@ void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, stru
|
||||
ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int* algo, int* protocol, int* maxChannels);
|
||||
|
||||
void rcclSetPxn(struct ncclComm* comm, int& rcclPxnDisable);
|
||||
void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize);
|
||||
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount);
|
||||
ncclResult_t commSetUnrollFactor(struct ncclComm* comm);
|
||||
#endif
|
||||
|
||||
@@ -396,7 +396,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
WARN("%s", comm->proxyState->proxyTrace->dump().c_str());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf *prof, *prof_seq;
|
||||
@@ -981,7 +981,7 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
|
||||
NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
|
||||
NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
|
||||
NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
|
||||
|
||||
// The default value of P2P_NET_CHUNKSIZE may be overridden at runtime by rcclSetP2pNetChunkSize() in src/rccl_wrap.cc
|
||||
NCCL_PARAM(P2pNetChunkSize, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */
|
||||
NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */
|
||||
NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */
|
||||
@@ -994,7 +994,10 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
|
||||
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
|
||||
}
|
||||
|
||||
if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
|
||||
if (comm->nNodes > 1) {
|
||||
rcclSetP2pNetChunkSize(comm, comm->p2pChunkSize);
|
||||
comm->p2pChunkSize = (comm->p2pChunkSize > RCCL_VALUE_INVALID)? comm->p2pChunkSize : ncclParamP2pNetChunkSize();
|
||||
}
|
||||
else if (comm->isAllNvlink) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
|
||||
else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
|
||||
|
||||
|
||||
@@ -120,6 +120,33 @@ ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t co
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
void rcclSetPxn(struct ncclComm* comm, int& rcclPxnDisable) {
|
||||
static int pxnDisable = RCCL_VALUE_UNSET;
|
||||
if(pxnDisable == RCCL_VALUE_UNSET) {
|
||||
const char *inputStr = getenv("NCCL_PXN_DISABLE");
|
||||
if(!IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || inputStr) {
|
||||
rcclPxnDisable = pxnDisable = RCCL_VALUE_INVALID;
|
||||
return;
|
||||
}
|
||||
pxnDisable = (comm->nRanks >= 64)? 0 : 1;
|
||||
INFO(NCCL_INIT, "RCCL PXN set as %s", !pxnDisable? "enabled" : "disabled");
|
||||
}
|
||||
rcclPxnDisable = pxnDisable;
|
||||
}
|
||||
|
||||
void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize) {
|
||||
static int p2pNetChunkSize = RCCL_VALUE_UNSET;
|
||||
if(p2pNetChunkSize == RCCL_VALUE_UNSET) {
|
||||
const char *inputStr = getenv("NCCL_P2P_NET_CHUNKSIZE");
|
||||
if(!IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") || inputStr) {
|
||||
rcclP2pNetChunkSize = p2pNetChunkSize = RCCL_VALUE_INVALID;
|
||||
return;
|
||||
}
|
||||
p2pNetChunkSize = (comm->nRanks >= 64)? (1 << 19) : (1 << 17);
|
||||
INFO(NCCL_INIT, "RCCL P2P net chunk size default set to: %d", p2pNetChunkSize);
|
||||
}
|
||||
rcclP2pNetChunkSize = p2pNetChunkSize;
|
||||
}
|
||||
|
||||
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount) {
|
||||
RCCL_STATIC_EXPOSE_CHECK();
|
||||
|
||||
Reference in New Issue
Block a user