gfx950 channel tuning for ReduceScatter and AllGather (#1940)
* add channel thresholds to override channel-count adjustments
[ROCm/rccl commit: 0f99fd84a3]
Этот коммит содержится в:
@@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
|
||||
### Added
|
||||
* Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
|
||||
* Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
|
||||
* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal channel-count adjustments based on threadThreshold.
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
@@ -2015,6 +2015,8 @@ static ncclResult_t updateCollCostTable(
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
extern int64_t ncclParamMinNchannels();
|
||||
|
||||
static ncclResult_t topoGetAlgoInfo(
|
||||
struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes,
|
||||
float** collCostTable, ncclSimInfo_t* simInfo
|
||||
@@ -2079,11 +2081,17 @@ static ncclResult_t topoGetAlgoInfo(
|
||||
nc = comm->nvlsChannels;
|
||||
} else {
|
||||
rcclUpdateThreadThreshold(comm, nBytes, info, threadThreshold);
|
||||
INFO(NCCL_INIT, "pre-adjustment threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc);
|
||||
|
||||
int minNChannels = ncclParamMinNchannels();
|
||||
// Ring/Tree channel tuning
|
||||
while (nBytes < nc * nt * threadThreshold) {
|
||||
INFO(NCCL_INIT, "minNChannels:%i", minNChannels);
|
||||
while (nBytes < nc * nt * threadThreshold && nc > minNChannels) {
|
||||
if (nc >= 2) nc--;
|
||||
else break;
|
||||
}
|
||||
INFO(NCCL_INIT, "post-adjustment based on threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc);
|
||||
rcclOverrideChannels(comm, info->func, nBytes, nc);
|
||||
}
|
||||
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
|
||||
#else
|
||||
|
||||
@@ -151,6 +151,7 @@ struct tuningModel {
|
||||
float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
|
||||
float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
|
||||
uint64_t llProtoRanges[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE];
|
||||
uint64_t channelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; // for each collective, one entry per tuned channel count (2,4,8,16,32,40,48,56,64): {min,max,nchannels}
|
||||
};
|
||||
|
||||
static struct tuningModel tuning_model_0 {
|
||||
@@ -183,6 +184,7 @@ static struct tuningModel tuning_model_0 {
|
||||
},
|
||||
|
||||
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
|
||||
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
|
||||
};
|
||||
|
||||
static struct tuningModel tuning_model_1 {
|
||||
@@ -215,6 +217,7 @@ static struct tuningModel tuning_model_1 {
|
||||
},
|
||||
|
||||
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
|
||||
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
|
||||
};
|
||||
|
||||
static struct tuningModel tuning_model_2 {
|
||||
@@ -247,6 +250,7 @@ static struct tuningModel tuning_model_2 {
|
||||
},
|
||||
|
||||
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
|
||||
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
|
||||
};
|
||||
|
||||
static struct tuningModel tuning_model_3 {
|
||||
@@ -279,6 +283,7 @@ static struct tuningModel tuning_model_3 {
|
||||
},
|
||||
|
||||
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
|
||||
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
|
||||
};
|
||||
|
||||
static struct tuningModel tuning_model_4 {
|
||||
@@ -311,6 +316,7 @@ static struct tuningModel tuning_model_4 {
|
||||
},
|
||||
|
||||
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
|
||||
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
|
||||
};
|
||||
|
||||
static struct tuningModel tuning_model_5 {
|
||||
@@ -354,6 +360,9 @@ static struct tuningModel tuning_model_5 {
|
||||
/*Broadcast*/
|
||||
{/*LL (min/max/factor/thread_threshold)*/ {0, 8192, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {8192, 33554432, 1, 0}},
|
||||
},
|
||||
|
||||
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
|
||||
|
||||
};
|
||||
|
||||
static struct tuningModel tuning_model_6 {
|
||||
@@ -396,7 +405,14 @@ static struct tuningModel tuning_model_6 {
|
||||
{/*LL (min/max/factor/thread_threshold)*/ {0, 16383, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {16383, 16777216, 1, 0}},
|
||||
/*Broadcast*/
|
||||
{/*LL (min/max/factor/thread_threshold)*/ {0, 2048, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {2048, 16777216, 1, 0}},
|
||||
},
|
||||
},
|
||||
|
||||
.channelThresholds = {
|
||||
// For each collective, define {min, max, nchannels} per-rank size thresholds for 2,4,8,16,32,40,48,56,64 channels
|
||||
/*ReduceScatter*/ {{512, 1024, 2},{1024, 2048, 4},{2048, 4096, 8},{4096, 65536, 16}, {65536, 262144, 32}, {262144, 524288, 40}, {1,1, 48}, {524288, 1048576, 56}, {1048576, 268435457, 64}},
|
||||
/*AllGather*/ {{2048, 4096, 2},{4096, 8192, 4},{8192, 16384, 8},{16384, 262144, 16},{262144, 524288, 32}, {524288, 1048576, 40}, {1,1, 48}, {1048576, 4194304, 56}, {4194304, 268435457, 64}},
|
||||
/*AllReduce*/ {{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}},
|
||||
},
|
||||
};
|
||||
|
||||
static struct tuningModel rcclTuningModel[] = {
|
||||
@@ -407,7 +423,6 @@ static struct tuningModel rcclTuningModel[] = {
|
||||
tuning_model_4,
|
||||
tuning_model_5,
|
||||
tuning_model_6,
|
||||
|
||||
};
|
||||
|
||||
/* Array indexes used below */
|
||||
@@ -519,6 +534,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
rcclTuningModel[comm->topo->tuning].llProtoRanges,
|
||||
sizeof(rcclTuningModel[comm->topo->tuning].llProtoRanges));
|
||||
|
||||
memcpy(comm->minMaxChannelThresholds,
|
||||
rcclTuningModel[comm->topo->tuning].channelThresholds,
|
||||
sizeof(rcclTuningModel[comm->topo->tuning].channelThresholds));
|
||||
|
||||
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
|
||||
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
|
||||
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
|
||||
|
||||
@@ -550,6 +550,7 @@ struct ncclComm {
|
||||
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
|
||||
uint64_t minMaxLLRange[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE];
|
||||
uint64_t minMaxChannelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; // for each collective, one entry per tuned channel count (2,4,8,16,32,40,48,56,64): {min size-threshold, max size-threshold, nchannels}
|
||||
|
||||
/* This attribute can indicate the states of communicators and return code of
|
||||
* asynchronous NCCL operations. */
|
||||
|
||||
@@ -91,4 +91,5 @@ typedef enum {
|
||||
#define NCCL_UNROLL_4 2
|
||||
|
||||
#define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8
|
||||
|
||||
#endif
|
||||
|
||||
@@ -35,6 +35,9 @@ typedef enum RcclTunableColls {
|
||||
RCCL_TUNABLE_COLLS = 5 // LL/LL64/LL128 tunable collectives count
|
||||
} rcclTunableIndex_t;
|
||||
|
||||
#define CHAN_THRESHOLDS_UNDEFINED 0
|
||||
#define RCCL_CHANNELS_TUNABLE_ENTRIES 9 // 2,4,8,16,32,40,48,56,64 channels
|
||||
|
||||
#define RCCL_LL_LIMITS_UNDEFINED 0
|
||||
#define RCCL_PROTOCOL_ENTRY_SIZE 4
|
||||
#define RCCL_PROTOCOL_MIN_IDX 0
|
||||
@@ -88,6 +91,7 @@ inline size_t rcclGetSizePerRank(ncclFunc_t const& func, size_t const& nBytes, i
|
||||
// For AR, this is the send/recv size per rank
|
||||
return (func == ncclFuncReduceScatter || func == ncclFuncAllGather || func == ncclFuncBroadcast || func == ncclFuncReduce) ? nBytes / nRanks : nBytes;
|
||||
}
|
||||
ncclResult_t rcclOverrideChannels(struct ncclComm* comm, ncclFunc_t coll, size_t nBytes, int& nc);
|
||||
ncclResult_t rcclGetAlgoProtoIndex(const char *envStr, const char* algoProtoString[], int nEntries, int& result);
|
||||
ncclResult_t rcclOverrideProtocol(const char* ncclProtoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info);
|
||||
ncclResult_t rcclOverrideAlgorithm(const char* ncclAlgoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info);
|
||||
|
||||
@@ -25,7 +25,7 @@ THE SOFTWARE.
|
||||
#include "graph/topo.h"
|
||||
#include "enqueue.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
|
||||
#include <algorithm>
|
||||
// Use this param to experiment pipelining new data types besides bfloat16
|
||||
// Make sure you generate the device code with the new data type (i.e. in generate.py)
|
||||
RCCL_PARAM(PipelineAllDTypes, "PIPELINE_ALL_DATA_TYPES", 0);
|
||||
@@ -108,6 +108,56 @@ ncclResult_t rcclGetAlgoProtoIndex(const char *envStr, const char* algoProtoStri
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
|
||||
extern int64_t ncclParamMinNchannels();
|
||||
extern int64_t ncclParamMaxNchannels();
|
||||
RCCL_PARAM(ChannelTuningEnable, "CHANNEL_TUNING_ENABLE", 1);
|
||||
|
||||
/**
 * Optionally replace the channel count `nc` with a value taken from the
 * communicator's channel-threshold table (comm->minMaxChannelThresholds).
 *
 * Each table row is read as {min bytes, max bytes, channel count}. The rows
 * for the given collective are scanned in order; the first row for which
 * min < bytesPerRank <= max is selected, where bytesPerRank is
 * divUp(nBytes, comm->nRanks). The selected channel count is written to `nc`
 * only when it lies within [NCCL_MIN_NCHANNELS, max(comm->nChannels,
 * NCCL_MAX_NCHANNELS)] and within [config.minCTAs, config.maxCTAs];
 * otherwise `nc` is left untouched.
 *
 * Nothing is overridden when the communicator spans fewer than two nodes,
 * when RCCL_CHANNEL_TUNING_ENABLE is off, when the collective has no tunable
 * index, or when a row with an undefined (zero) min or max is reached before
 * any row matches.
 *
 * @param comm   communicator supplying node/rank counts, config and thresholds
 * @param coll   collective being tuned
 * @param nBytes message size in bytes
 * @param nc     in/out channel count; updated only on a successful override
 * @return always ncclSuccess
 */
ncclResult_t rcclOverrideChannels(struct ncclComm* comm, ncclFunc_t coll, size_t nBytes, int& nc) {
  const bool multiNode = comm->nNodes >= 2;
  if (!multiNode || !rcclParamChannelTuningEnable()) {
    INFO(NCCL_TUNING, "RCCL Channel Tuning not applied");
    return ncclSuccess;
  }

  auto tunableIndex = rcclGetTunableIndex(coll);
  if (tunableIndex == RCCL_UNSUPPORTED_TUNABLE) {
    INFO(NCCL_TUNING, "tunableIndex:%i not supported", tunableIndex);
    return ncclSuccess;
  }

  // Limits the selected channel count has to respect.
  int minCTAs = comm->config.minCTAs;
  int maxCTAs = comm->config.maxCTAs;
  int minNChannels = ncclParamMinNchannels();
  int maxNChannels = std::max(comm->nChannels, static_cast<int>(ncclParamMaxNchannels()));
  size_t bytesPerRank = divUp(nBytes, comm->nRanks);

  for (int row = 0; row < RCCL_CHANNELS_TUNABLE_ENTRIES; row++) {
    const auto& entry = comm->minMaxChannelThresholds[tunableIndex][row];
    size_t lowerBytes = entry[0];
    size_t upperBytes = entry[1];
    INFO(NCCL_TUNING, "nBytes:%lu bytesPerRank:%lu minByteThreshold:%lu maxByteThreshold:%lu NCCL_MIN_NCHANNELS:%i or NCCL_MAX_NCHANNELS:%i minCTAs:%i maxCTAs:%i", nBytes, bytesPerRank, lowerBytes, upperBytes, minNChannels, maxNChannels, minCTAs, maxCTAs);

    // An undefined row ends the scan: later rows are not examined.
    if (lowerBytes == CHAN_THRESHOLDS_UNDEFINED || upperBytes == CHAN_THRESHOLDS_UNDEFINED) {
      INFO(NCCL_TUNING, "RCCL tuning model does not define threshold for coll:%i and nbytes:%lu", coll, nBytes);
      return ncclSuccess;
    }

    // Range is exclusive at the lower bound and inclusive at the upper bound.
    const bool sizeMatches = (bytesPerRank > lowerBytes) && (bytesPerRank <= upperBytes);
    if (!sizeMatches) continue;

    int channelCount = entry[2];

    // Honor the user's min/max channels defined through NCCL_MIN_NCHANNELS and
    // NCCL_MAX_NCHANNELS, as well as the communicator's minCTAs/maxCTAs.
    const bool withinChannelLimits = (minNChannels <= channelCount) && (channelCount <= maxNChannels);
    const bool withinCtaLimits = (minCTAs <= channelCount) && (channelCount <= maxCTAs);
    if (withinChannelLimits && withinCtaLimits) {
      nc = channelCount;
      INFO(NCCL_TUNING, "RCCL tuning model overrides nchannels to %i, channels may be decreased further due to MinTrafficPerchannel thresholds", channelCount);
    } else {
      INFO(NCCL_TUNING, "RCCL tuning model cannot override nchannels to %i due to conflicting NCCL_MIN_NCHANNELS:%i or NCCL_MAX_NCHANNELS:%i minCTAs:%i maxCTAs:%i", channelCount, minNChannels, maxNChannels, minCTAs, maxCTAs);
    }

    // Only the first matching row is considered.
    return ncclSuccess;
  }

  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t rcclOverrideProtocol(const char* ncclProtoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info) {
|
||||
static const char* protoOverrideEnv = ncclGetEnv("RCCL_OVERRIDE_PROTO");
|
||||
static bool validInput = true;
|
||||
|
||||
Ссылка в новой задаче
Block a user