diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 1a6b136499..1cdcf9edab 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -143,17 +143,25 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { #define NCCL_HW_PCI 1 #define NCCL_HW_NET 2 - - +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) +#define RCCL_FACTOR_TABLE_SIZE 27 +#define RCCL_FACTOR_TABLE_MAX_INDEX (RCCL_FACTOR_TABLE_SIZE - 1) +#endif struct tuningModel { float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; float bwRatio [2][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) + float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][RCCL_FACTOR_TABLE_SIZE]; + float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][RCCL_FACTOR_TABLE_SIZE]; +#else float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27]; float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27]; +#endif uint64_t llProtoRanges[RCCL_TUNABLE_COLLS][NCCL_NUM_PROTOCOLS - 1][RCCL_PROTOCOL_ENTRY_SIZE]; uint64_t channelThresholds[RCCL_TUNABLE_COLLS][RCCL_CHANNELS_TUNABLE_ENTRIES][3]; //for each collective, set for 5 channel-counts: 2,4,8,16,32,40,48,56,64, {min,max,nchannels} }; + static struct tuningModel tuning_model_0 { .hwLat = { /* NVLINK */ @@ -887,10 +895,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclComm* comm, int coll, int algorithm, } int logSize = log2i(nBytes>>6); #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) - - if (logSize < 0) logSize = 0; - if (logSize > 26) logSize = 26; - + logSize = std::max(0, std::min(RCCL_FACTOR_TABLE_MAX_INDEX, logSize)); if (algorithm == NCCL_ALGO_TREE) { bw *= rcclTuningModel[comm->topo->tuning].treeCorrectionFactor[protocol][logSize]; }