Revert "[AG and RS channel tuning] Add thread work threshold to tuning models and precompute reg index in LL128 (#1641)"
This reverts commit 00c1eb098c.
This commit is contained in:
committed by
Arm Patinyasakdikul
parent
66ef428714
commit
7f878baef0
@@ -6,18 +6,10 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
|
||||
|
||||
### Resolved issues
|
||||
* Resolved an issue when using more than 64 channels when multiple collectives are used in the same `ncclGroup()` call.
|
||||
* Suboptimal algorithmic switching point for AllReduce on MI300x
|
||||
|
||||
### Added
|
||||
|
||||
* Added new GPU target `gfx950`.
|
||||
* Added support for `unroll=1` in device-code generation to improve performance
|
||||
* Added MSCCL support for AllGather single node and multinode (i.e., 8, 16 and 32 GPUs). To enable on multinode, set the
|
||||
environment variable `RCCL_MSCCL_FORCE_ENABLE=1`. Max message size for MSCCL AllGather usage is `12292 * sizeof(datatype) * nGPUs`.
|
||||
* Thread thresholds for LL/LL128 are selected in Tuning Models for the MI300X. This impacts the number of channels used for AG and RS. Channel tuning model is bypassed if `NCCL_THREAD_THRESHOLDS`, `NCCL_MIN_NCHANNELS', or 'NCCL_MAX_NCHANNELS` are set.
|
||||
* Multi-node tuning for AllGather, AllReduce, and ReduceScatter that leverages LL/LL64/LL128 protocol to use nontemporal vector load/store for tunable message size ranges.
|
||||
* LL/LL128 usage ranges for AR, AG, and RS are part of the tuning models, which enable architecture-specific tuning in conjunction with the existing Rome Models scheme in RCCL.
|
||||
* Two new APIs are exposed as part of an initiative to separate RCCL code. These APIs are `rcclGetAlgoInfo` and `rcclFuncMaxSendRecvCount`. However, user-level invocation requires that RCCL be built with `RCCL_EXPOSE_STATIC` enabled.
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
@@ -126,11 +126,6 @@ private:
|
||||
template<int WordPerThread>
|
||||
__device__ __forceinline__ void loadRegsBegin(uint64_t(®s)[WordPerThread], T const *src, int eltN) {
|
||||
constexpr int EltPer16B = 16/sizeof(T);
|
||||
int ix[WordPerThread/2];
|
||||
#pragma unroll
|
||||
for(int g=0; g < WordPerThread/2; g++) {
|
||||
ix[g] = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4);
|
||||
}
|
||||
if(reinterpret_cast<uintptr_t>(src)%16 == 0) {
|
||||
/* We are aligned to 16 bytes, so load directly to registers no shmem.
|
||||
* Flag threads load half as much data which gets shuffled to the even
|
||||
@@ -140,9 +135,10 @@ private:
|
||||
*/
|
||||
#pragma unroll
|
||||
for(int g=0; g < WordPerThread/2; g++) {
|
||||
int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4);
|
||||
if(!flagThread || g%2==0) {
|
||||
if(ix[g]*EltPer16B < eltN)
|
||||
load128((uint64_t*)(src + ix[g]*EltPer16B), regs[2*g+0], regs[2*g+1]);
|
||||
if(ix*EltPer16B < eltN)
|
||||
load128((uint64_t*)(src + ix*EltPer16B), regs[2*g+0], regs[2*g+1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -167,10 +163,10 @@ private:
|
||||
T *shm = (T*)shm8 + misalignment/sizeof(T);
|
||||
#pragma unroll
|
||||
for(int g=0; g < WordPerThread/2; g++) {
|
||||
// int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4);
|
||||
int ix = g*WARP_SIZE - 16*(g/2) + wid - (g%2)*(wid/4);
|
||||
if(!flagThread || g%2==0) {
|
||||
if(ix[g]*EltPer16B < eltN)
|
||||
loadShmemMisaligned128(shm + ix[g]*EltPer16B, regs[2*g+0], regs[2*g+1]);
|
||||
if(ix*EltPer16B < eltN)
|
||||
loadShmemMisaligned128(shm + ix*EltPer16B, regs[2*g+0], regs[2*g+1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -193,7 +189,6 @@ private:
|
||||
for (int g=1; g < WordPerThread/2; g+=2) {
|
||||
if (flagThread) regs[2*g-1] = regs[2*g];
|
||||
}
|
||||
|
||||
// Write to dst if 4-byte aligned, shmem otherwise.
|
||||
int misalignment = reinterpret_cast<uintptr_t>(dst)%16;
|
||||
uint64_t *shm8 = shmemCvtPtr((uint64_t*)ncclScratchForWarp(warpInBlock));
|
||||
|
||||
@@ -1769,7 +1769,6 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
// NVLS should not need more than 16 channels to get peak BW.
|
||||
nc = comm->nvlsChannels;
|
||||
} else {
|
||||
rcclUpdateThreadThreshold(comm, nBytes, info, threadThreshold);
|
||||
// Ring/Tree channel tuning
|
||||
while (nBytes < nc * nt * threadThreshold) {
|
||||
if (nc >= 2) nc--;
|
||||
|
||||
+4
-4
@@ -329,11 +329,11 @@ static struct tuningModel tuning_model_5 {
|
||||
// Follow order in RcclTunableColls
|
||||
.llProtoRanges = {
|
||||
/*ReduceScatter*/
|
||||
{/*LL (min/max/factor/thread_threshold)*/ {0, 655360, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {131072, 4793500, 1, 64}},
|
||||
{/*LL (min/max/factor)*/ {0, 655360, 1}, /*LL64/128 (min/max/factor)*/ {131072, 3211264, 1}},
|
||||
/*AllGather*/
|
||||
{/*LL (min/max/factor/thread_threshold)*/ {0, 98304, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {98304, 5592500, 1, 64}},
|
||||
{/*LL (min/max/factor)*/ {0, 98304, 1}, /*LL64/128 (min/max/factor)*/ {98304, 5046272, 1}},
|
||||
/*AllReduce*/
|
||||
{/*LL (min/max/factor/thread_threshold)*/ {0, 1048576, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {1048576, 144217728, 3145728, 0}},
|
||||
{/*LL (min/max/factor)*/ {0, 1048576, 1},/*LL64/128 (min/max/factor)*/ {1048576, 9437184, 3145728}},
|
||||
},
|
||||
};
|
||||
|
||||
@@ -722,7 +722,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Set per-thread amount of work before we increase nThreads and nChannels
|
||||
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
|
||||
comm->threadThresholds[a][NCCL_PROTO_LL] = NCCL_LL_THREAD_THRESHOLD;
|
||||
|
||||
@@ -95,7 +95,7 @@ static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK
|
||||
#define NCCL_LL128_MAX_NTHREADS 256
|
||||
#define NCCL_LL128_ELEMS_PER_THREAD 28
|
||||
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 16
|
||||
#define NCCL_LL128_SHMEM_ELEMS_PER_THREAD 4
|
||||
#define NCCL_LL128_SHMEM_SIZE (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*NCCL_LL128_MAX_NTHREADS)
|
||||
|
||||
#define NCCL_P2P_WRITE 0x01
|
||||
@@ -698,7 +698,7 @@ inline int ncclDevFuncId(int coll, int devRedOp, int type, int algo, int proto)
|
||||
|
||||
// RING / <all_protos> / <all_redops> / <all_types>
|
||||
if (coll == ncclFuncReduce) {
|
||||
row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * proto;
|
||||
row += ((proto * ncclNumDevRedOps + devRedOp) * ncclNumTypes + type) - NCCL_NUM_FLOATS * proto;
|
||||
break;
|
||||
}
|
||||
row += NCCL_NUM_PROTOCOLS * (ncclNumDevRedOps * ncclNumTypes - NCCL_NUM_FLOATS);
|
||||
|
||||
@@ -33,11 +33,10 @@ typedef enum RcclTunableColls {
|
||||
} rcclTunableIndex_t;
|
||||
|
||||
#define RCCL_LL_LIMITS_UNDEFINED 0
|
||||
#define RCCL_PROTOCOL_ENTRY_SIZE 4
|
||||
#define RCCL_PROTOCOL_ENTRY_SIZE 3
|
||||
#define RCCL_PROTOCOL_MIN_IDX 0
|
||||
#define RCCL_PROTOCOL_MAX_IDX 1
|
||||
#define RCCL_PROTOCOL_FACTOR_IDX 2
|
||||
#define RCCL_PROTOCOL_THREAD_THRESHOLD_IDX 3
|
||||
|
||||
#ifdef RCCL_EXPOSE_STATIC
|
||||
#define RCCL_STATIC_EXPOSE_CHECK()
|
||||
@@ -72,7 +71,8 @@ inline size_t rcclGetSizePerRank(ncclFunc_t const& func, size_t const& nBytes, i
|
||||
return (func == ncclFuncReduceScatter || func == ncclFuncAllGather) ? nBytes / nRanks : nBytes;
|
||||
}
|
||||
void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info);
|
||||
void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info, int& threadThreshold);
|
||||
|
||||
|
||||
ncclResult_t rcclGetAlgoInfo(struct ncclComm* comm, ncclFunc_t coll, uint64_t count, ncclDataType_t dataType,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps,
|
||||
int* algo, int* protocol, int* maxChannels);
|
||||
|
||||
@@ -72,29 +72,6 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s
|
||||
}
|
||||
}
|
||||
|
||||
void rcclUpdateThreadThreshold(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info, int& threadThreshold) {
|
||||
// Honor user input for thread thresholds
|
||||
static int userChannelControlInput = -2;
|
||||
if (userChannelControlInput == -2) {
|
||||
const char *inputStr = getenv("NCCL_THREAD_THRESHOLDS");
|
||||
if (!inputStr) {
|
||||
inputStr = getenv("NCCL_MAX_NCHANNELS");
|
||||
}
|
||||
if (!inputStr) {
|
||||
inputStr = getenv("NCCL_MIN_NCHANNELS");
|
||||
}
|
||||
userChannelControlInput = !inputStr ? 0 : 1;
|
||||
}
|
||||
|
||||
if(!userChannelControlInput && comm->nNodes >= 2 && (info->func == ncclFuncReduceScatter || info->func == ncclFuncAllGather)) {
|
||||
auto tunableIndex = rcclGetTunableIndex(info->func);
|
||||
auto tunedThreshold = comm->minMaxLLRange[tunableIndex][info->protocol][RCCL_PROTOCOL_THREAD_THRESHOLD_IDX];
|
||||
if(tunedThreshold != RCCL_LL_LIMITS_UNDEFINED) {
|
||||
threadThreshold = tunedThreshold * comm->nRanks;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern ncclResult_t getAlgoInfo(
|
||||
struct ncclComm* comm, struct ncclTaskColl* task,
|
||||
int collNetSupport, int nvlsSupport, int numPipeOps, ncclSimInfo_t* simInfo = NULL
|
||||
|
||||
Reference in New Issue
Block a user