diff --git a/projects/rccl/CHANGELOG.md b/projects/rccl/CHANGELOG.md index 6da80ac03d..4f43e0cd35 100644 --- a/projects/rccl/CHANGELOG.md +++ b/projects/rccl/CHANGELOG.md @@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https: ### Added * Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv. * Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages. +* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal adjustments based on threadThreshold. ### Changed diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index f94c11ea21..c03ba7b8cd 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -2015,6 +2015,8 @@ static ncclResult_t updateCollCostTable( return ncclSuccess; } +extern int64_t ncclParamMinNchannels(); + static ncclResult_t topoGetAlgoInfo( struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes, float** collCostTable, ncclSimInfo_t* simInfo @@ -2079,11 +2081,17 @@ static ncclResult_t topoGetAlgoInfo( nc = comm->nvlsChannels; } else { rcclUpdateThreadThreshold(comm, nBytes, info, threadThreshold); + INFO(NCCL_INIT, "pre-adjustment threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc); + + int minNChannels = ncclParamMinNchannels(); // Ring/Tree channel tuning - while (nBytes < nc * nt * threadThreshold) { + INFO(NCCL_INIT, "minNChannels:%i", minNChannels); + while (nBytes < nc * nt * threadThreshold && nc > minNChannels) { if (nc >= 2) nc--; else break; } + INFO(NCCL_INIT, "post-adjustment based on threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc); + rcclOverrideChannels(comm, info->func, 
nBytes, nc); } #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #else diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc index e858888ffc..dba12f3b26 100644 --- a/projects/rccl/src/graph/tuning.cc +++ b/projects/rccl/src/graph/tuning.cc @@ -151,6 +151,7 @@ struct tuningModel { float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27]; float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27]; uint64_t llProtoRanges[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE]; + uint64_t channelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; // for each collective, one {min,max,nchannels} entry per channel count: 2,4,8,16,32,40,48,56,64 }; static struct tuningModel tuning_model_0 { @@ -183,6 +184,7 @@ static struct tuningModel tuning_model_0 { }, .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, + .channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}}, }; static struct tuningModel tuning_model_1 { @@ -215,6 +217,7 @@ static struct tuningModel tuning_model_1 { }, .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, + .channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}}, }; static struct tuningModel tuning_model_2 { @@ -247,6 +250,7 @@ static struct tuningModel tuning_model_2 { }, .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, + .channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}}, }; static struct tuningModel tuning_model_3 { @@ -279,6 +283,7 @@ static struct tuningModel tuning_model_3 { }, .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, + .channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}}, }; static struct tuningModel tuning_model_4 { @@ -311,6 +316,7 @@ static struct tuningModel tuning_model_4 { }, .llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}}, + .channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}}, }; static struct tuningModel tuning_model_5 { @@ -354,6 +360,9 @@ static struct tuningModel tuning_model_5 { /*Broadcast*/ {/*LL (min/max/factor/thread_threshold)*/ {0, 8192, 1, 0},/*LL64/128 
(min/max/factor/thread_threshold)*/ {8192, 33554432, 1, 0}}, }, + + .channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}}, + }; static struct tuningModel tuning_model_6 { @@ -396,7 +405,14 @@ static struct tuningModel tuning_model_6 { {/*LL (min/max/factor/thread_threshold)*/ {0, 16383, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {16383, 16777216, 1, 0}}, /*Broadcast*/ {/*LL (min/max/factor/thread_threshold)*/ {0, 2048, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {2048, 16777216, 1, 0}}, - }, + }, + + .channelThresholds = { + // For each collective, define {min, max} per-rank size thresholds for 2,4,8,16,32,40,48,56,64 channels + /*ReduceScatter*/ {{512, 1024, 2},{1024, 2048, 4},{2048, 4096, 8},{4096, 65536, 16}, {65536, 262144, 32}, {262144, 524288, 40}, {1,1, 48}, {524288, 1048576, 56}, {1048576, 268435457, 64}}, + /*AllGather*/ {{2048, 4096, 2},{4096, 8192, 4},{8192, 16384, 8},{16384, 262144, 16},{262144, 524288, 32}, {524288, 1048576, 40}, {1,1, 48}, {1048576, 4194304, 56}, {4194304, 268435457, 64}}, + /*AllReduce*/ {{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}}, + }, }; static struct tuningModel rcclTuningModel[] = { @@ -407,7 +423,6 @@ static struct tuningModel rcclTuningModel[] = { tuning_model_4, tuning_model_5, tuning_model_6, - }; /* Array indexes used below */ @@ -519,6 +534,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom rcclTuningModel[comm->topo->tuning].llProtoRanges, sizeof(rcclTuningModel[comm->topo->tuning].llProtoRanges)); + memcpy(comm->minMaxChannelThresholds, + rcclTuningModel[comm->topo->tuning].channelThresholds, + sizeof(rcclTuningModel[comm->topo->tuning].channelThresholds)); + for (int coll=0; coll // Use this param to experiment pipelining new data types besides bfloat16 // Make sure you generate the device code with the new data type (i.e. 
in generate.py)
 RCCL_PARAM(PipelineAllDTypes, "PIPELINE_ALL_DATA_TYPES", 0);
@@ -108,6 +108,70 @@ ncclResult_t rcclGetAlgoProtoIndex(const char *envStr, const char* algoProtoStri
   return ncclInvalidUsage;
 }
 
+extern int64_t ncclParamMinNchannels();
+extern int64_t ncclParamMaxNchannels();
+RCCL_PARAM(ChannelTuningEnable, "CHANNEL_TUNING_ENABLE", 1);
+
+// Overrides the channel count `nc` for collective `coll` using the per-communicator
+// table comm->minMaxChannelThresholds, whose entries are {minBytes, maxBytes, nChannels}.
+//
+// The first entry whose range (minBytes, maxBytes] contains the per-rank size
+// divUp(nBytes, comm->nRanks) is selected. `nc` is overwritten with that entry's
+// channel count only if the count lies within
+// [ncclParamMinNchannels(), max(comm->nChannels, ncclParamMaxNchannels())] and within
+// [comm->config.minCTAs, comm->config.maxCTAs]; otherwise `nc` is left unchanged.
+// The scan stops at the first entry whose min or max threshold is CHAN_THRESHOLDS_UNDEFINED.
+//
+// Nothing is overridden when the communicator has fewer than two nodes, when
+// rcclParamChannelTuningEnable() returns 0, or when `coll` has no tunable index.
+// Always returns ncclSuccess.
+ncclResult_t rcclOverrideChannels(struct ncclComm* comm, ncclFunc_t coll, size_t nBytes, int& nc){
+  if(comm->nNodes < 2 || !rcclParamChannelTuningEnable()){
+    INFO(NCCL_TUNING, "RCCL Channel Tuning not applied");
+    return ncclSuccess;
+  }
+
+  auto tunableIndex = rcclGetTunableIndex(coll);
+  if(tunableIndex == RCCL_UNSUPPORTED_TUNABLE){
+    INFO(NCCL_TUNING, "tunableIndex:%i not supported", tunableIndex);
+    return ncclSuccess;
+  }
+
+  const int minCTAs = comm->config.minCTAs;
+  const int maxCTAs = comm->config.maxCTAs;
+  // The channel params are int64_t; narrow explicitly because channel counts are ints.
+  const int minNChannels = static_cast<int>(ncclParamMinNchannels());
+  // NOTE(review): std::max lets NCCL_MAX_NCHANNELS raise the limit above comm->nChannels - confirm this is intended.
+  const int maxNChannels = std::max(comm->nChannels, static_cast<int>(ncclParamMaxNchannels()));
+  const size_t bytesPerRank = divUp(nBytes, comm->nRanks);
+
+  for(int channelCountIndex = 0; channelCountIndex < RCCL_CHANNELS_TUNABLE_ENTRIES; ++channelCountIndex){
+    const size_t minByteThreshold = comm->minMaxChannelThresholds[tunableIndex][channelCountIndex][0];
+    const size_t maxByteThreshold = comm->minMaxChannelThresholds[tunableIndex][channelCountIndex][1];
+    INFO(NCCL_TUNING, "nBytes:%zu bytesPerRank:%zu minByteThreshold:%zu maxByteThreshold:%zu NCCL_MIN_NCHANNELS:%i or NCCL_MAX_NCHANNELS:%i minCTAs:%i maxCTAs:%i", nBytes, bytesPerRank, minByteThreshold, maxByteThreshold, minNChannels, maxNChannels, minCTAs, maxCTAs);
+    if(minByteThreshold == CHAN_THRESHOLDS_UNDEFINED || maxByteThreshold == CHAN_THRESHOLDS_UNDEFINED) {
+      INFO(NCCL_TUNING, "RCCL tuning model does not define threshold for coll:%i and nbytes:%zu", coll, nBytes);
+      break; // Stop scanning at the first undefined threshold
+    }
+
+    if(bytesPerRank > minByteThreshold && bytesPerRank <= maxByteThreshold){
+      const int channelCount = static_cast<int>(comm->minMaxChannelThresholds[tunableIndex][channelCountIndex][2]);
+
+      // Honor the user's min/max channels defined through NCCL_MIN_NCHANNELS and NCCL_MAX_NCHANNELS, and the config's CTA bounds
+      if(channelCount >= minNChannels && channelCount <= maxNChannels && channelCount >= minCTAs && channelCount <= maxCTAs){
+        nc = channelCount;
+        INFO(NCCL_TUNING, "RCCL tuning model overrides nchannels to %i, channels may be decreased further due to MinTrafficPerchannel thresholds", channelCount);
+      }
+      else{
+        INFO(NCCL_TUNING, "RCCL tuning model cannot override nchannels to %i due to conflicting NCCL_MIN_NCHANNELS:%i or NCCL_MAX_NCHANNELS:%i minCTAs:%i maxCTAs:%i", channelCount, minNChannels, maxNChannels, minCTAs, maxCTAs);
+      }
+
+      break; // Only the first matching range is considered
+    }
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t rcclOverrideProtocol(const char* ncclProtoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info) {
   static const char* protoOverrideEnv = ncclGetEnv("RCCL_OVERRIDE_PROTO");
   static bool validInput = true;