Optimize alltoall for 64 GPUs and above for gfx942 (#1828)

Add pxn and p2p net chunksize mi300x tuning

[ROCm/rccl commit: 4ce3df8d3a]
This commit is contained in:
Mustafa Abduljabbar
2025-07-30 15:14:43 -04:00
committed by GitHub
parent cca5172260
commit cafd7a5126
4 changed files with 43 additions and 6 deletions
+3 -2
View File
@@ -591,7 +591,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank
}
return ncclSuccess;
}
// The default value of PXN_DISABLE may be overridden at runtime by rcclSetPxn() (see src/rccl_wrap.cc)
NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1);
// Net v4 plugins don't have non-blocking connect/accept. We can't therefore use
@@ -603,7 +603,8 @@ int ncclPxnDisable(struct ncclComm* comm) {
INFO(NCCL_INIT, "PXN Disabled as plugin is v4");
pxnDisable = 1;
} else {
pxnDisable = ncclParamPxnDisable();
rcclSetPxn(comm, pxnDisable);
pxnDisable = (pxnDisable > RCCL_VALUE_INVALID)? pxnDisable : ncclParamPxnDisable();
}
}
return pxnDisable;
+7 -1
View File
@@ -41,6 +41,11 @@ typedef enum RcclTunableColls {
#define RCCL_PROTOCOL_FACTOR_IDX 2
#define RCCL_PROTOCOL_THREAD_THRESHOLD_IDX 3
/* Sentinel values for RCCL-computed tuning overrides.
 * Both are negative, so callers test `value > RCCL_VALUE_INVALID` to decide
 * whether an override is usable. */
typedef enum {
  /* Override has not been evaluated yet (initial state of the cached value). */
  RCCL_VALUE_UNSET   = -2,
  /* Override was evaluated and does not apply; the caller keeps its own default. */
  RCCL_VALUE_INVALID = -1
} rcclValueState_t;
#ifdef RCCL_EXPOSE_STATIC
#define RCCL_STATIC_EXPOSE_CHECK()
#else
@@ -78,7 +83,8 @@ void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, stru
ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
int collNetSupport, int nvlsSupport, int numPipeOps,
int* algo, int* protocol, int* maxChannels);
void rcclSetPxn(struct ncclComm* comm, int& rcclPxnDisable);
void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize);
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount);
ncclResult_t commSetUnrollFactor(struct ncclComm* comm);
#endif
+6 -3
View File
@@ -396,7 +396,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
WARN("%s", comm->proxyState->proxyTrace->dump().c_str());
}
}
#ifdef ENABLE_PROFILING
struct ncclProf *prof, *prof_seq;
@@ -981,7 +981,7 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank,
NCCL_PARAM(BuffSize, "BUFFSIZE", -2);
NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2);
NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2);
// The default value of P2P_NET_CHUNKSIZE may be overridden at runtime by rcclSetP2pNetChunkSize() (see src/rccl_wrap.cc)
NCCL_PARAM(P2pNetChunkSize, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */
NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */
NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */
@@ -994,7 +994,10 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) {
comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p];
}
if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize();
if (comm->nNodes > 1) {
rcclSetP2pNetChunkSize(comm, comm->p2pChunkSize);
comm->p2pChunkSize = (comm->p2pChunkSize > RCCL_VALUE_INVALID)? comm->p2pChunkSize : ncclParamP2pNetChunkSize();
}
else if (comm->isAllNvlink) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
+27
View File
@@ -120,6 +120,33 @@ ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t co
return ncclSuccess;
}
/* Compute RCCL's default for the PXN-disable flag and store it in rcclPxnDisable.
 *
 * comm            communicator whose first GPU's architecture and nRanks are inspected.
 * rcclPxnDisable  [out] receives 0 (PXN enabled), 1 (PXN disabled), or
 *                 RCCL_VALUE_INVALID when no RCCL default applies.
 *
 * No default applies (RCCL_VALUE_INVALID) when the first GPU is not gfx942 or
 * when NCCL_PXN_DISABLE is present in the environment. Otherwise PXN is enabled
 * for 64 ranks and above and disabled below that.
 *
 * The result is kept in a function-local static, so the decision made on the
 * first call is returned by every later call regardless of which comm is passed.
 * NOTE(review): confirm that is intended when communicators differ in size. */
void rcclSetPxn(struct ncclComm* comm, int& rcclPxnDisable) {
  static int cachedPxnDisable = RCCL_VALUE_UNSET;

  // Already decided on an earlier call: hand back the cached answer.
  if (cachedPxnDisable != RCCL_VALUE_UNSET) {
    rcclPxnDisable = cachedPxnDisable;
    return;
  }

  const bool userOverride = (getenv("NCCL_PXN_DISABLE") != NULL);
  const bool isGfx942 = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942");

  if (!isGfx942 || userOverride) {
    // Other architecture, or the user set the variable explicitly: no RCCL default.
    cachedPxnDisable = RCCL_VALUE_INVALID;
  } else {
    cachedPxnDisable = (comm->nRanks >= 64) ? 0 : 1;
    INFO(NCCL_INIT, "RCCL PXN set as %s", cachedPxnDisable == 0 ? "enabled" : "disabled");
  }
  rcclPxnDisable = cachedPxnDisable;
}
/* Compute RCCL's default P2P network chunk size and store it in rcclP2pNetChunkSize.
 *
 * comm                 communicator whose first GPU's architecture and nRanks are inspected.
 * rcclP2pNetChunkSize  [out] receives the chunk size in bytes (1 << 19 or 1 << 17),
 *                      or RCCL_VALUE_INVALID when no RCCL default applies.
 *
 * No default applies (RCCL_VALUE_INVALID) when the first GPU is not gfx942 or
 * when NCCL_P2P_NET_CHUNKSIZE is present in the environment. Otherwise the size
 * is 1 << 19 for 64 ranks and above and 1 << 17 below that.
 *
 * The result is kept in a function-local static, so the decision made on the
 * first call is returned by every later call regardless of which comm is passed.
 * NOTE(review): confirm that is intended when communicators differ in size. */
void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize) {
  static int cachedChunkSize = RCCL_VALUE_UNSET;

  // Already decided on an earlier call: hand back the cached answer.
  if (cachedChunkSize != RCCL_VALUE_UNSET) {
    rcclP2pNetChunkSize = cachedChunkSize;
    return;
  }

  const bool userOverride = (getenv("NCCL_P2P_NET_CHUNKSIZE") != NULL);
  const bool isGfx942 = IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942");

  if (!isGfx942 || userOverride) {
    // Other architecture, or the user set the variable explicitly: no RCCL default.
    cachedChunkSize = RCCL_VALUE_INVALID;
  } else {
    if (comm->nRanks >= 64) {
      cachedChunkSize = (1 << 19);
    } else {
      cachedChunkSize = (1 << 17);
    }
    INFO(NCCL_INIT, "RCCL P2P net chunk size default set to: %d", cachedChunkSize);
  }
  rcclP2pNetChunkSize = cachedChunkSize;
}
ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount) {
RCCL_STATIC_EXPOSE_CHECK();