gfx950 channel tuning for ReduceScatter and AllGather (#1940)

* add channel thresholds to override channel-count adjustments

[ROCm/rccl commit: 0f99fd84a3]
Этот коммит содержится в:
isaki001
2025-10-14 09:50:44 -05:00
коммит произвёл GitHub
родитель 433251272b
Коммит 6d151d4e21
7 изменённых файлов: 88 добавлений и 4 удалений
+1
Просмотреть файл
@@ -7,6 +7,7 @@ Full documentation for RCCL is available at [https://rccl.readthedocs.io](https:
### Added
* Added `RCCL_P2P_BATCH_THRESHOLD` to set the message size limit for batching P2P operations. This mainly affects small message performance for alltoall at a large scale but also applies to alltoallv.
* Added `RCCL_P2P_BATCH_ENABLE` to enable batching P2P operations to receive performance gains for smaller messages up to 4MB for alltoall when the workload requires it. This is to avoid performance dips for larger messages.
* Added `RCCL_CHANNEL_TUNING_ENABLE` to enable channel tuning that overrides RCCL's internal channel-count adjustments based on threadThreshold.
### Changed
+9 -1
Просмотреть файл
@@ -2015,6 +2015,8 @@ static ncclResult_t updateCollCostTable(
return ncclSuccess;
}
extern int64_t ncclParamMinNchannels();
static ncclResult_t topoGetAlgoInfo(
struct ncclComm* comm, struct ncclTaskColl* info, size_t nBytes,
float** collCostTable, ncclSimInfo_t* simInfo
@@ -2079,11 +2081,17 @@ static ncclResult_t topoGetAlgoInfo(
nc = comm->nvlsChannels;
} else {
rcclUpdateThreadThreshold(comm, nBytes, info, threadThreshold);
INFO(NCCL_INIT, "pre-adjustment threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc);
int minNChannels = ncclParamMinNchannels();
// Ring/Tree channel tuning
while (nBytes < nc * nt * threadThreshold) {
INFO(NCCL_INIT, "minNChannels:%i", minNChannels);
while (nBytes < nc * nt * threadThreshold && nc > minNChannels) {
if (nc >= 2) nc--;
else break;
}
INFO(NCCL_INIT, "post-adjustment based on threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc);
rcclOverrideChannels(comm, info->func, nBytes, nc);
}
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
#else
+21 -2
Просмотреть файл
@@ -151,6 +151,7 @@ struct tuningModel {
float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27];
uint64_t llProtoRanges[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE];
uint64_t channelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; // per tunable collective: one {min,max,nchannels} row for each of the 9 channel counts 2,4,8,16,32,40,48,56,64
};
static struct tuningModel tuning_model_0 {
@@ -183,6 +184,7 @@ static struct tuningModel tuning_model_0 {
},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
};
static struct tuningModel tuning_model_1 {
@@ -215,6 +217,7 @@ static struct tuningModel tuning_model_1 {
},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
};
static struct tuningModel tuning_model_2 {
@@ -247,6 +250,7 @@ static struct tuningModel tuning_model_2 {
},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
};
static struct tuningModel tuning_model_3 {
@@ -279,6 +283,7 @@ static struct tuningModel tuning_model_3 {
},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
};
static struct tuningModel tuning_model_4 {
@@ -311,6 +316,7 @@ static struct tuningModel tuning_model_4 {
},
.llProtoRanges = {{{RCCL_LL_LIMITS_UNDEFINED}}},
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
};
static struct tuningModel tuning_model_5 {
@@ -354,6 +360,9 @@ static struct tuningModel tuning_model_5 {
/*Broadcast*/
{/*LL (min/max/factor/thread_threshold)*/ {0, 8192, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {8192, 33554432, 1, 0}},
},
.channelThresholds = {{{CHAN_THRESHOLDS_UNDEFINED}}},
};
static struct tuningModel tuning_model_6 {
@@ -396,7 +405,14 @@ static struct tuningModel tuning_model_6 {
{/*LL (min/max/factor/thread_threshold)*/ {0, 16383, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {16383, 16777216, 1, 0}},
/*Broadcast*/
{/*LL (min/max/factor/thread_threshold)*/ {0, 2048, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {2048, 16777216, 1, 0}},
},
},
.channelThresholds = {
// For each collective, define the {min, max, nchannels} per-rank size thresholds for 2,4,8,16,32,40,48,56,64 channels
/*ReduceScatter*/ {{512, 1024, 2},{1024, 2048, 4},{2048, 4096, 8},{4096, 65536, 16}, {65536, 262144, 32}, {262144, 524288, 40}, {1,1, 48}, {524288, 1048576, 56}, {1048576, 268435457, 64}},
/*AllGather*/ {{2048, 4096, 2},{4096, 8192, 4},{8192, 16384, 8},{16384, 262144, 16},{262144, 524288, 32}, {524288, 1048576, 40}, {1,1, 48}, {1048576, 4194304, 56}, {4194304, 268435457, 64}},
/*AllReduce*/ {{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}},
},
};
static struct tuningModel rcclTuningModel[] = {
@@ -407,7 +423,6 @@ static struct tuningModel rcclTuningModel[] = {
tuning_model_4,
tuning_model_5,
tuning_model_6,
};
/* Array indexes used below */
@@ -519,6 +534,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
rcclTuningModel[comm->topo->tuning].llProtoRanges,
sizeof(rcclTuningModel[comm->topo->tuning].llProtoRanges));
memcpy(comm->minMaxChannelThresholds,
rcclTuningModel[comm->topo->tuning].channelThresholds,
sizeof(rcclTuningModel[comm->topo->tuning].channelThresholds));
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
int nsteps = coll == ncclFuncAllReduce ? 2*(nRanks-1) :
coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nRanks-1 :
+1
Просмотреть файл
@@ -550,6 +550,7 @@ struct ncclComm {
float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
int maxThreads[NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS];
uint64_t minMaxLLRange[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE];
uint64_t minMaxChannelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; // per tunable collective: one {min size-threshold, max size-threshold, nchannels} row per tunable channel count (2,4,8,16,32,40,48,56,64)
/* This attribute can indicate the states of communicators and return code of
* asynchronous NCCL operations. */
+1
Просмотреть файл
@@ -91,4 +91,5 @@ typedef enum {
#define NCCL_UNROLL_4 2
#define NCCL_NUM_FLOATS 6 // half/float/double/rccl_bfloat16/rccl_float8/rccl_bfloat8
#endif
+4
Просмотреть файл
@@ -35,6 +35,9 @@ typedef enum RcclTunableColls {
RCCL_TUNABLE_COLLS = 5 // LL/LL64/LL128 tunable collectives count
} rcclTunableIndex_t;
// Sentinel for a channel-threshold min/max value that a tuning model leaves unset;
// rcclOverrideChannels stops scanning a collective's rows when it meets one.
#define CHAN_THRESHOLDS_UNDEFINED 0
// Number of {min,max,nchannels} rows per collective in the channel-threshold tables.
#define RCCL_CHANNELS_TUNABLE_ENTRIES 9 // 2,4,8,16,32,40,48,56,64 channels
#define RCCL_LL_LIMITS_UNDEFINED 0
#define RCCL_PROTOCOL_ENTRY_SIZE 4
#define RCCL_PROTOCOL_MIN_IDX 0
@@ -88,6 +91,7 @@ inline size_t rcclGetSizePerRank(ncclFunc_t const& func, size_t const& nBytes, i
// For AR, this is the send/recv size per rank
return (func == ncclFuncReduceScatter || func == ncclFuncAllGather || func == ncclFuncBroadcast || func == ncclFuncReduce) ? nBytes / nRanks : nBytes;
}
// Replaces `nc` (in/out) with the channel count from the communicator's
// channel-threshold table when a row matches `coll` and the per-rank size
// derived from `nBytes`; otherwise leaves `nc` unchanged.
ncclResult_t rcclOverrideChannels(struct ncclComm* comm, ncclFunc_t coll, size_t nBytes, int& nc);
ncclResult_t rcclGetAlgoProtoIndex(const char *envStr, const char* algoProtoString[], int nEntries, int& result);
ncclResult_t rcclOverrideProtocol(const char* ncclProtoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info);
ncclResult_t rcclOverrideAlgorithm(const char* ncclAlgoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info);
+51 -1
Просмотреть файл
@@ -25,7 +25,7 @@ THE SOFTWARE.
#include "graph/topo.h"
#include "enqueue.h"
#include "rocm_smi/rocm_smi.h"
#include <algorithm>
// Use this param to experiment pipelining new data types besides bfloat16
// Make sure you generate the device code with the new data type (i.e. in generate.py)
RCCL_PARAM(PipelineAllDTypes, "PIPELINE_ALL_DATA_TYPES", 0);
@@ -108,6 +108,56 @@ ncclResult_t rcclGetAlgoProtoIndex(const char *envStr, const char* algoProtoStri
return ncclInvalidUsage;
}
// Accessors defined elsewhere; rcclOverrideChannels treats their values as the
// user's NCCL_MIN_NCHANNELS / NCCL_MAX_NCHANNELS limits.
extern int64_t ncclParamMinNchannels();
extern int64_t ncclParamMaxNchannels();
// Switch for channel-threshold tuning, read through rcclParamChannelTuningEnable().
// NOTE(review): the trailing 1 is presumably the default (enabled) -- confirm against RCCL_PARAM.
RCCL_PARAM(ChannelTuningEnable, "CHANNEL_TUNING_ENABLE", 1);
/*
 * Override the channel count `nc` using the communicator's channel-threshold
 * table (comm->minMaxChannelThresholds).
 *
 * Nothing is changed when the communicator has fewer than two nodes, when
 * channel tuning is disabled, or when `coll` has no tunable index.
 *
 * Otherwise the rows for the collective are scanned in order. Each row is
 * {min, max, nchannels}; a row matches when
 *     min < divUp(nBytes, nRanks) <= max.
 * The scan ends at the first matching row, or at the first row whose min or
 * max equals CHAN_THRESHOLDS_UNDEFINED. A matching row's channel count is
 * written to `nc` only if it lies within both the min/max channel limits and
 * the communicator's minCTAs/maxCTAs configuration.
 *
 * comm    communicator providing the table, rank/node counts and config
 * coll    collective being tuned
 * nBytes  message size in bytes
 * nc      in/out channel count; untouched unless a row matches and is allowed
 *
 * Always returns ncclSuccess.
 */
ncclResult_t rcclOverrideChannels(struct ncclComm* comm, ncclFunc_t coll, size_t nBytes, int& nc) {
  const bool tuningActive = comm->nNodes >= 2 && rcclParamChannelTuningEnable();
  if (!tuningActive) {
    INFO(NCCL_TUNING, "RCCL Channel Tuning not applied");
    return ncclSuccess;
  }

  auto collIdx = rcclGetTunableIndex(coll);
  if (collIdx == RCCL_UNSUPPORTED_TUNABLE) {
    INFO(NCCL_TUNING, "tunableIndex:%i not supported", collIdx);
    return ncclSuccess;
  }

  // Limits the selected channel count has to respect.
  const int ctaFloor = comm->config.minCTAs;
  const int ctaCeil = comm->config.maxCTAs;
  const int chanFloor = ncclParamMinNchannels();
  const int chanCeil = std::max(comm->nChannels, static_cast<int>(ncclParamMaxNchannels()));

  const size_t perRankBytes = divUp(nBytes, comm->nRanks);

  for (int row = 0; row < RCCL_CHANNELS_TUNABLE_ENTRIES; ++row) {
    const uint64_t* entry = comm->minMaxChannelThresholds[collIdx][row];
    const size_t lowerBytes = entry[0];
    const size_t upperBytes = entry[1];
    INFO(NCCL_TUNING, "nBytes:%lu bytesPerRank:%lu minByteThreshold:%lu maxByteThreshold:%lu NCCL_MIN_NCHANNELS:%i or NCCL_MAX_NCHANNELS:%i minCTAs:%i maxCTAs:%i", nBytes, perRankBytes, lowerBytes, upperBytes, chanFloor, chanCeil, ctaFloor, ctaCeil);

    // An undefined bound terminates the scan for this collective.
    if (lowerBytes == CHAN_THRESHOLDS_UNDEFINED || upperBytes == CHAN_THRESHOLDS_UNDEFINED) {
      INFO(NCCL_TUNING, "RCCL tuning model does not define threshold for coll:%i and nbytes:%lu", coll, nBytes);
      return ncclSuccess;
    }

    // Row covers the half-open size range (lowerBytes, upperBytes].
    if (perRankBytes <= lowerBytes || perRankBytes > upperBytes) {
      continue;
    }

    const int candidate = entry[2];
    // Honor the user's NCCL_MIN_NCHANNELS / NCCL_MAX_NCHANNELS and the CTA config.
    const bool insideChannelLimits = candidate >= chanFloor && candidate <= chanCeil;
    const bool insideCtaLimits = candidate >= ctaFloor && candidate <= ctaCeil;
    if (insideChannelLimits && insideCtaLimits) {
      nc = candidate;
      INFO(NCCL_TUNING, "RCCL tuning model overrides nchannels to %i, channels may be decreased further due to MinTrafficPerchannel thresholds", candidate);
    } else {
      INFO(NCCL_TUNING, "RCCL tuning model cannot override nchannels to %i due to conflicting NCCL_MIN_NCHANNELS:%i or NCCL_MAX_NCHANNELS:%i minCTAs:%i maxCTAs:%i", candidate, chanFloor, chanCeil, ctaFloor, ctaCeil);
    }
    // Only the first matching row is considered.
    return ncclSuccess;
  }

  return ncclSuccess;
}
ncclResult_t rcclOverrideProtocol(const char* ncclProtoStr[], float table[][NCCL_NUM_PROTOCOLS], struct ncclTaskColl* info) {
static const char* protoOverrideEnv = ncclGetEnv("RCCL_OVERRIDE_PROTO");
static bool validInput = true;