diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc index c53f15766d..61055eeb38 100644 --- a/projects/rccl/src/graph/paths.cc +++ b/projects/rccl/src/graph/paths.cc @@ -591,7 +591,7 @@ ncclResult_t ncclTopoGetIntermediateRank(struct ncclTopoSystem* system, int rank } return ncclSuccess; } - +// When NCCL_PXN_DISABLE is not set in the environment, rcclSetPxn() in src/rccl_wrap.cc may override this default NCCL_PARAM(PxnDisable, "PXN_DISABLE", 1); // Net v4 plugins don't have non-blocking connect/accept. We can't therefore use @@ -603,7 +603,8 @@ int ncclPxnDisable(struct ncclComm* comm) { INFO(NCCL_INIT, "PXN Disabled as plugin is v4"); pxnDisable = 1; } else { - pxnDisable = ncclParamPxnDisable(); + rcclSetPxn(comm, pxnDisable); + pxnDisable = (pxnDisable > RCCL_VALUE_INVALID)? pxnDisable : ncclParamPxnDisable(); } } return pxnDisable; diff --git a/projects/rccl/src/include/rccl_common.h b/projects/rccl/src/include/rccl_common.h index f29096a618..914dc2c9ed 100644 --- a/projects/rccl/src/include/rccl_common.h +++ b/projects/rccl/src/include/rccl_common.h @@ -41,6 +41,11 @@ typedef enum RcclTunableColls { #define RCCL_PROTOCOL_FACTOR_IDX 2 #define RCCL_PROTOCOL_THREAD_THRESHOLD_IDX 3 +typedef enum { + RCCL_VALUE_UNSET = -2, + RCCL_VALUE_INVALID = -1 +} rcclValueState_t; + #ifdef RCCL_EXPOSE_STATIC #define RCCL_STATIC_EXPOSE_CHECK() #else @@ -78,7 +83,8 @@ void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, stru ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType, int collNetSupport, int nvlsSupport, int numPipeOps, int* algo, int* protocol, int* maxChannels); - +void rcclSetPxn(struct ncclComm* comm, int& rcclPxnDisable); +void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize); ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount); ncclResult_t commSetUnrollFactor(struct ncclComm* comm); #endif diff --git 
a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 16a8a6179a..e8a8ca79e8 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -396,7 +396,7 @@ static ncclResult_t commFree(ncclComm_t comm) { WARN("%s", comm->proxyState->proxyTrace->dump().c_str()); } } - + #ifdef ENABLE_PROFILING struct ncclProf *prof, *prof_seq; @@ -981,7 +981,7 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, NCCL_PARAM(BuffSize, "BUFFSIZE", -2); NCCL_PARAM(LlBuffSize, "LL_BUFFSIZE", -2); NCCL_PARAM(Ll128BuffSize, "LL128_BUFFSIZE", -2); - +// When NCCL_P2P_NET_CHUNKSIZE is not set in the environment, rcclSetP2pNetChunkSize() in src/rccl_wrap.cc may override this default NCCL_PARAM(P2pNetChunkSize, "P2P_NET_CHUNKSIZE", (1 << 17)); /* 128 kB */ NCCL_PARAM(P2pPciChunkSize, "P2P_PCI_CHUNKSIZE", (1 << 17)); /* 128 kB */ NCCL_PARAM(P2pNvlChunkSize, "P2P_NVL_CHUNKSIZE", (1 << 19)); /* 512 kB */ @@ -994,7 +994,10 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { comm->buffSizes[p] = envs[p] != -2 ? envs[p] : defaults[p]; } - if (comm->nNodes > 1) comm->p2pChunkSize = ncclParamP2pNetChunkSize(); + if (comm->nNodes > 1) { + rcclSetP2pNetChunkSize(comm, comm->p2pChunkSize); + comm->p2pChunkSize = (comm->p2pChunkSize > RCCL_VALUE_INVALID)? 
comm->p2pChunkSize : ncclParamP2pNetChunkSize();
+  }
   else if (comm->isAllNvlink) comm->p2pChunkSize = ncclParamP2pNvlChunkSize();
   else comm->p2pChunkSize = ncclParamP2pPciChunkSize();
 
diff --git a/projects/rccl/src/rccl_wrap.cc b/projects/rccl/src/rccl_wrap.cc
index 0ef494961a..162df74862 100644
--- a/projects/rccl/src/rccl_wrap.cc
+++ b/projects/rccl/src/rccl_wrap.cc
@@ -120,6 +120,68 @@ ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t co
   return ncclSuccess;
 }
 
+// Returns true when RCCL should apply its own gfx942-specific default for the
+// tunable controlled by envName: the first GPU node in comm's topology must match
+// gfx942 and envName must not be set in the process environment, so an explicit
+// user setting always wins. comm and comm->topo must be non-NULL.
+// NOTE(review): only getenv() is consulted; a value that reaches NCCL_PARAM by
+// some other route would not be noticed here -- confirm that is intended.
+static bool rcclUseGfx942Default(struct ncclComm* comm, const char* envName) {
+  if (!IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942")) return false;
+  const char* userValue = getenv(envName);
+  return !userValue;
+}
+
+// Computes RCCL's override for PXN_DISABLE and stores it in rcclPxnDisable.
+//   comm           - communicator whose topology and nRanks drive the decision.
+//   rcclPxnDisable - out: 0 (PXN enabled) when comm->nRanks >= 64, 1 (PXN disabled)
+//                    otherwise, or RCCL_VALUE_INVALID when no override applies and
+//                    the caller should fall back to ncclParamPxnDisable().
+// The decision is made once and cached in a function-local static, so later calls
+// return the first result even if they pass a communicator of a different size.
+void rcclSetPxn(struct ncclComm* comm, int& rcclPxnDisable) {
+  static int pxnDisable = RCCL_VALUE_UNSET;
+  if (pxnDisable == RCCL_VALUE_UNSET) {
+    if (!comm || !comm->topo) {
+      // Nothing to inspect: report "no override" without caching it, so a later
+      // call with a usable communicator can still make the decision.
+      rcclPxnDisable = RCCL_VALUE_INVALID;
+      return;
+    }
+    if (!rcclUseGfx942Default(comm, "NCCL_PXN_DISABLE")) {
+      rcclPxnDisable = pxnDisable = RCCL_VALUE_INVALID;
+      return;
+    }
+    pxnDisable = (comm->nRanks >= 64) ? 0 : 1;
+    INFO(NCCL_INIT, "RCCL PXN set as %s", !pxnDisable? "enabled" : "disabled");
+  }
+  rcclPxnDisable = pxnDisable;
+}
+
+// Computes RCCL's override for P2P_NET_CHUNKSIZE and stores it in rcclP2pNetChunkSize.
+//   comm                - communicator whose topology and nRanks drive the decision.
+//   rcclP2pNetChunkSize - out: 1 << 19 when comm->nRanks >= 64, 1 << 17 otherwise,
+//                         or RCCL_VALUE_INVALID when no override applies and the
+//                         caller should fall back to ncclParamP2pNetChunkSize().
+// The decision is made once and cached in a function-local static, so later calls
+// return the first result even if they pass a communicator of a different size.
+void rcclSetP2pNetChunkSize(struct ncclComm* comm, int& rcclP2pNetChunkSize) {
+  static int p2pNetChunkSize = RCCL_VALUE_UNSET;
+  if (p2pNetChunkSize == RCCL_VALUE_UNSET) {
+    if (!comm || !comm->topo) {
+      // Nothing to inspect: report "no override" without caching it.
+      rcclP2pNetChunkSize = RCCL_VALUE_INVALID;
+      return;
+    }
+    if (!rcclUseGfx942Default(comm, "NCCL_P2P_NET_CHUNKSIZE")) {
+      rcclP2pNetChunkSize = p2pNetChunkSize = RCCL_VALUE_INVALID;
+      return;
+    }
+    p2pNetChunkSize = (comm->nRanks >= 64) ? (1 << 19) : (1 << 17);
+    INFO(NCCL_INIT, "RCCL P2P net chunk size default set to: %d", p2pNetChunkSize);
+  }
+  rcclP2pNetChunkSize = p2pNetChunkSize;
+}
 ncclResult_t rcclFuncMaxSendRecvCount(ncclFunc_t func, int nRanks, size_t count, size_t& maxCount) {
   RCCL_STATIC_EXPOSE_CHECK();