From 7f878baef066423b41cfce31cc34f5ec6bd388c8 Mon Sep 17 00:00:00 2001 From: PedramAlizadeh Date: Wed, 21 May 2025 08:50:55 -0700 Subject: [PATCH] Revert "[AG and RS channel tuning] Add thread work threshold to tuning models and precompute reg index in LL128 (#1641)" This reverts commit 00c1eb098ce45d936529b24f53f2a386a8735b15. --- CHANGELOG.md | 8 -------- src/device/prims_ll128.h | 17 ++++++----------- src/enqueue.cc | 1 - src/graph/tuning.cc | 8 ++++---- src/include/device.h | 4 ++-- src/include/rccl_common.h | 6 +++--- src/rccl_wrap.cc | 23 ----------------------- 7 files changed, 15 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7068467439..ef3e738ab8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,18 +6,10 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ### Resolved issues * Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call. -* Suboptimal algorithmic switching point for AllReduce on MI300x ### Added * Added new GPU target `gfx950`. -* Added support for `unroll=1` in device-code generation to improve performance -* Added MSCCL support for AllGather single node and multinode (i.e., 8, 16 and 32 GPUs). To enable on multinode, set the - environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`. -* Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. Channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS', or 'NCCL_MAX_NCHANNELS` are set. -* Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges. -* LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL. -* Two new APIs are exposed as part of an initiative to separate RCCL code. These APIs are `rcclGetAlgoInfo` and `rcclFuncMaxSendRecvCount`. However, user-level invocation requires that RCCL be built with `RCCL_EXPOSE_STATIC` enabled. ### Changed diff --git a/src/device/prims_ll128.h b/src/device/prims_ll128.h index bcc72d075c..21804f5526 100644 --- a/src/device/prims_ll128.h +++ b/src/device/prims_ll128.h @@ -126,11 +126,6 @@ private: template __device__ __forceinline__ void loadRegsBegin(uint64_t(®s)[WordPerThread], T const *src, int eltN) { constexpr int EltPer16B = 16/sizeof(T); - int ix[WordPerThread/2]; - #pragma unroll - for(int g=0; g < WordPerThread/2; g++) { - ix[g] = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4); - } if(reinterpret_cast(src)%16 == 0) { /* We are aligned to 16 bytes, so load directly to registers no shmem. * Flag threads load half as much data which gets shuffled to the even @@ -140,9 +135,10 @@ private: */ #pragma unroll for(int g=0; g < WordPerThread/2; g++) { + int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4); if(!flagThread || g%2==0) { - if(ix[g]*EltPer16B < eltN) - load128((uint64_t*)(src + ix[g]*EltPer16B), regs[2*g+0], regs[2*g+1]); + if(ix*EltPer16B < eltN) + load128((uint64_t*)(src + ix*EltPer16B), regs[2*g+0], regs[2*g+1]); } } } @@ -167,10 +163,10 @@ private: T *shm = (T*)shm8 + misalignment/sizeof(T); #pragma unroll for(int g=0; g < WordPerThread/2; g++) { - // int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4); + int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4); if(!flagThread || g%2==0) { - if(ix[g]*EltPer16B < eltN) - loadShmemMisaligned128(shm + ix[g]*EltPer16B, regs[2*g+0], regs[2*g+1]); + if(ix*EltPer16B < eltN) + loadShmemMisaligned128(shm + ix*EltPer16B, regs[2*g+0], regs[2*g+1]); } } } @@ -193,7 +189,6 @@ private: for (int g=1; g < WordPerThread/2; g+=2) { if (flagThread) regs[2*g-1] = regs[2*g]; } - // Write to dst if 4-byte aligned, shmem otherwise. int misalignment = reinterpret_cast(dst)%16; uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock)); diff --git a/src/enqueue.cc b/src/enqueue.cc index 128024a0e5..ebad6f68fe 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -1769,7 +1769,6 @@ static ncclResult_t topoGetAlgoInfo( // NVLS should not need more than 16 channels to get peak BW. nc = comm->nvlsChannels; } else { - rcclUpdateThreadThreshold(comm, nBytes, info, threadThreshold); // Ring/Tree channel tuning while (nBytes < nc * nt * threadThreshold) { if (nc >= 2) nc--; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 4944ce3a9c..3ec9f24fd9 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -329,11 +329,11 @@ static struct tuningModel tuning_model_5 { // Follow order in RcclTunableColls .llProtoRanges = { /*ReduceScatter*/ - {/*LL (min/max/factor/thread_threshold)*/ {0, 655360, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {131072, 4793500, 1, 64}}, + {/*LL (min/max/factor)*/ {0, 655360, 1}, /*LL64/128 (min/max/factor)*/ {131072, 3211264, 1}}, /*AllGather*/ - {/*LL (min/max/factor/thread_threshold)*/ {0, 98304, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {98304, 5592500, 1, 64}}, + {/*LL (min/max/factor)*/ {0, 98304, 1}, /*LL64/128 (min/max/factor)*/ {98304, 5046272, 1}}, /*AllReduce*/ - {/*LL (min/max/factor/thread_threshold)*/ {0, 1048576, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {1048576, 144217728, 3145728, 0}}, + {/*LL (min/max/factor)*/ {0, 1048576, 1},/*LL64/128 (min/max/factor)*/ {1048576, 9437184, 3145728}}, }, }; @@ -722,7 +722,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } } } - + // Set per-thread amount of work before we increase nThreads and nChannels for (int a=0; athreadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD; diff --git a/src/include/device.h b/src/include/device.h index dc8d1e9bfd..fe6f94c1ee 100644 --- a/src/include/device.h +++ b/src/include/device.h @@ -95,7 +95,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK #define NCCL_LL128_MAX_NTHREADS 256 #define NCCL_LL128_ELEMS_PER_THREAD 28 -#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 16 +#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4 #define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS) #define NCCL_P2P_WRITE 0x01 @@ -698,7 +698,7 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto) // RING / / / if (coll == ncclFuncReduce) { - row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * proto; + row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * proto; break; } row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS); diff --git a/src/include/rccl_common.h b/src/include/rccl_common.h index 9090924ccb..9b245685fb 100644 --- a/src/include/rccl_common.h +++ b/src/include/rccl_common.h @@ -33,11 +33,10 @@ typedef enum RcclTunableColls { } rcclTunableIndex_t; #define RCCL_LL_LIMITS_UNDEFINED 0 -#define RCCL_PROTOCOL_ENTRY_SIZE 4 +#define RCCL_PROTOCOL_ENTRY_SIZE 3 #define RCCL_PROTOCOL_MIN_IDX 0 #define RCCL_PROTOCOL_MAX_IDX 1 #define RCCL_PROTOCOL_FACTOR_IDX 2 -#define RCCL_PROTOCOL_THREAD_THRESHOLD_IDX 3 #ifdef RCCL_EXPOSE_STATIC #define RCCL_STATIC_EXPOSE_CHECK() @@ -72,7 +71,8 @@ inline size_t rcclGetSizePerRank(ncclFunc_t const& func, size_t const& nBytes, i return (func == ncclFuncReduceScatter || func == ncclFuncAllGather) ? nBytes / nRanks : nBytes; } void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info); -void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info, int& threadThreshold); + + ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType, int collNetSupport, int nvlsSupport, int numPipeOps, int* algo, int* protocol, int* maxChannels); diff --git a/src/rccl_wrap.cc b/src/rccl_wrap.cc index 397cbc7664..7ed13a6aeb 100644 --- a/src/rccl_wrap.cc +++ b/src/rccl_wrap.cc @@ -72,29 +72,6 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s } } -void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info, int& threadThreshold) { - // Honor user input for thread thresholds - static int userChannelControlInput = -2; - if (userChannelControlInput == -2) { - const char *inputStr = getenv("NCCL_THREAD_THRESHOLDS"); - if (!inputStr) { - inputStr = getenv("NCCL_MAX_NCHANNELS"); - } - if (!inputStr) { - inputStr = getenv("NCCL_MIN_NCHANNELS"); - } - userChannelControlInput = !inputStr ? 0 : 1; - } - - if(!userChannelControlInput && comm->nNodes >= 2 && (info->func == ncclFuncReduceScatter || info->func == ncclFuncAllGather)) { - auto tunableIndex = rcclGetTunableIndex(info->func); - auto tunedThreshold = comm->minMaxLLRange[tunableIndex][info->protocol][RCCL_PROTOCOL_THREAD_THRESHOLD_IDX]; - if(tunedThreshold != RCCL_LL_LIMITS_UNDEFINED) { - threadThreshold = tunedThreshold * comm->nRanks; - } - } -} - extern ncclResult_t getAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* task, int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL