diff --git a/src/enqueue.cc b/src/enqueue.cc index d5416bb17e..284856b7ff 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -2060,6 +2060,14 @@ static ncclResult_t topoGetAlgoInfo( } } } + if(algorithm == NCCL_ALGO_UNDEF){ + INFO(NCCL_INIT,"Optimal algorithm is not found in collCostTable, Setting it a default value NCCL_ALGO_RING"); + algorithm = NCCL_ALGO_RING; + } + if(protocol == NCCL_PROTO_UNDEF){ + INFO(NCCL_INIT,"Optimal protocol is not found in collCostTable, Setting it a default value NCCL_PROTO_SIMPLE"); + protocol = NCCL_PROTO_SIMPLE; + } info->algorithm = algorithm; info->protocol = protocol; @@ -2119,9 +2127,8 @@ static ncclResult_t topoGetAlgoInfo( int minNChannels = ncclParamMinNchannels(); // Ring/Tree channel tuning INFO(NCCL_INIT, "minNChannels:%i", minNChannels); - while (nBytes < nc * nt * threadThreshold && nc > minNChannels) { - if (nc >= 2) nc--; - else break; + if(nBytes < nc * nt * threadThreshold && nc > minNChannels){ + nc = std::max(1,std::max(minNChannels,(int)(nBytes/std::max(1,nt * threadThreshold)))); } INFO(NCCL_INIT, "post-adjustment based on threadThreshold:%i nBytes:%lu nc:%i", threadThreshold, nBytes, nc); rcclOverrideChannels(comm, info->func, nBytes, nc); diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 7ec4911d07..68a4f25958 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -423,6 +423,56 @@ static struct tuningModel tuning_model_6 { }, }; +/* Tuning table appended to rcclTuningModel[] after tuning_model_6. It supplies hardware latencies (hwLat) per link type, bandwidth ratios (bwRatio) by node count, tree and ring correction factors, LL protocol size ranges (llProtoRanges) and channel thresholds (channelThresholds). NOTE(review): which platform selects this entry through comm->topo->tuning is not visible in this patch; confirm. */ static struct tuningModel tuning_model_7 { + .hwLat = { + /* NVLINK */ + { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 }, /* PAT */ { 0, 0, 3.6} }, + /* PCI */ + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* 
NVLS Tree */ { 0, 0, 0 }, /* PAT */ { 0, 0, 5.7} }, + /* NET */ + { /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 }, /* PAT */ { 0, 0, 15.1} }, + }, + + .bwRatio = { + /* 2 nodes */ + { /* Tree (LL/LL128/Simple)*/ { 0.051, 0.22, 0.64 }, /* Ring (LL/LL128/Simple)*/ { 0.74, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 }, /* PAT */ { 0, 0, 0} }, /* NOTE(review): PAT ratio here is 0 while the row for more than 2 nodes uses 1.00; confirm the difference is intended. */ + /* more than 2 nodes */ + { /* Tree (LL/LL128/Simple)*/ { 0.051, 0.22, 0.64 }, /* Ring (LL/LL128/Simple)*/ { 0.74, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 }, /* PAT */ { 0, 0, 1.00} }, + }, + + .treeCorrectionFactor = { + { 0.1, 0.2, 0.1, 0.1, 0.9, 0.3, 0.4, 0.1, 0.2, 0.4, 0.2, 0.1, 0.3, 0.3, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.1, 0.3, 1.0, 0.1, 0.5, 1.0, 0.9, 1.0, 1.0, 1.0, 0.3, 0.1, 0.4, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, }, + { 0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, }, + }, + + .ringCorrectionFactor = { + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.2, 0.3, 0.5, 0.3, 0.1, 0.5, 0.5, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, }, + { 1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, + }, + // Follow order in RcclTunableColls + .llProtoRanges = { + /*ReduceScatter*/ + {/*LL 
(min/max/factor/thread_threshold)*/ {0, 65536, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {65536, 8388608, 1, 64}}, + /*AllGather*/ + {/*LL (min/max/factor/thread_threshold)*/ {0, 65536, 1, 16}, /*LL64/128 (min/max/factor/thread_threshold)*/ {65536, 8388608, 1, 64}}, + /*AllReduce*/ + {/*LL (min/max/factor/thread_threshold)*/ {0, 1048576, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {1048576, 70640910, 3145728, 0}}, + /*Reduce*/ + {/*LL (min/max/factor/thread_threshold)*/ {0, 16383, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {16383, 16777216, 1, 0}}, + /*Broadcast*/ + {/*LL (min/max/factor/thread_threshold)*/ {0, 2048, 1, 0},/*LL64/128 (min/max/factor/thread_threshold)*/ {2048, 16777216, 1, 0}}, + }, + + .channelThresholds = { + // For each collective, define minMax per-rank size threshold for 32,40,48,56,64 channels + /* NOTE(review): entries appear to be {min size, max size, channel count}; the rows also list 2, 4, 8 and 16 channels, which the comment above omits, and every 48-channel entry is {1,1, 48}, presumably a placeholder that is not meant to match; confirm. */ /*ReduceScatter*/ {{512, 1024, 2},{1024, 2048, 4},{2048, 4096, 8},{4096, 65536, 16}, {65536, 262144, 32}, {262144, 524288, 40}, {1,1, 48}, {524288, 1048576, 56}, {1048576, 268435457, 64}}, + /*AllGather*/ {{2048, 4096, 2},{4096, 8192, 4},{8192, 16384, 8},{16384, 262144, 16},{262144, 524288, 32}, {524288, 1048576, 40}, {1,1, 48}, {1048576, 4194304, 56}, {4194304, 268435457, 64}}, + /*AllReduce*/ {{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}, {0,0,0}}, + }, +}; + static struct tuningModel rcclTuningModel[] = { tuning_model_0, tuning_model_1, @@ -431,6 +481,7 @@ static struct tuningModel rcclTuningModel[] = { tuning_model_4, tuning_model_5, tuning_model_6, + tuning_model_7, }; /* Array indexes used below */ @@ -542,7 +593,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; for (int a=0; atypeIntra == LINK_NVL ? 
NCCL_HW_NVLINK : NCCL_HW_PCI; for (int a=0; atopo->tuning); memcpy(comm->minMaxLLRange, rcclTuningModel[comm->topo->tuning].llProtoRanges, sizeof(rcclTuningModel[comm->topo->tuning].llProtoRanges)); @@ -766,8 +817,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom } for (int c=0; ctopo->nodes[GPU].nodes[0].gpu.gcn, "gfx12")) ? 0 : protoEnable[c*NCCL_NUM_PROTOCOLS+p]; + int pEnable = protoEnable[c*NCCL_NUM_PROTOCOLS+p]; if (pEnable != 0 && p == NCCL_PROTO_LL128) { #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #if defined(ENABLE_LL128) diff --git a/src/rccl_wrap.cc b/src/rccl_wrap.cc index ea0b83b0d4..c3a3ce243c 100644 --- a/src/rccl_wrap.cc +++ b/src/rccl_wrap.cc @@ -51,6 +51,24 @@ void rcclRestrictMaxChannels(struct ncclComm* comm, int& nc ) { } } +/* Chooses the protocol for a collective from its per-rank size: returns NCCL_PROTO_LL when sizePerRank is at or below the cutoff listed for collectiveFunc, otherwise NCCL_PROTO_SIMPLE. A collectiveFunc outside the table also yields NCCL_PROTO_SIMPLE. The only caller in this patch invokes it for gfx12 with comm->nNodes == 1. */ int32_t rcclGetProtoForGfx12(ncclFunc_t collectiveFunc, size_t sizePerRank){ + int returnVal = NCCL_PROTO_SIMPLE; + /* Cutoffs are indexed directly by collectiveFunc. NOTE(review): the entry order must match the ncclFunc_t enumerator values, and the int entries are compared against a size_t, so they must stay non-negative; confirm against the enum definition. */ int SingleNodeLLCutoffs[] = { + /*ncclFuncBroadcast*/ 1536, + /*ncclFuncReduce*/ 8192, + /*ncclFuncAllGather*/ 98304, + /*ncclFuncReduceScatter*/ 98304, + /*ncclFuncAllReduce*/ 913532, + /*ncclFuncSendRecv*/ 0, + /*ncclFuncSend*/ 0, + /*ncclFuncRecv*/ 0 + }; + /* Bounds check so that an unlisted collectiveFunc keeps the NCCL_PROTO_SIMPLE default. */ if(collectiveFunc < sizeof(SingleNodeLLCutoffs)/sizeof(int)) { + returnVal = (sizePerRank <= SingleNodeLLCutoffs[collectiveFunc]) ? 
NCCL_PROTO_LL : NCCL_PROTO_SIMPLE; + } + return returnVal; +} + void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, struct ncclTaskColl* info) { // Honor user input for protocol choice static int userProtocolInput = -2; @@ -69,6 +87,8 @@ void rcclUpdateCollectiveProtocol(struct ncclComm* comm, size_t const& nBytes, s } else if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx942") && comm->nNodes == 1 && (info->func == ncclFuncReduceScatter) && sizePerRank <= 352128) { // Change LL protocol threshold info->protocol = NCCL_PROTO_LL; + } else if (!userProtocolInput && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx12") && comm->nNodes == 1){ + info->protocol = rcclGetProtoForGfx12( info->func,sizePerRank); } else if(!userProtocolInput && comm->nNodes >= 2 && (info->func == ncclFuncReduceScatter || info->func == ncclFuncAllGather || info->func == ncclFuncAllReduce || info->func == ncclFuncBroadcast || info->func == ncclFuncReduce)) { auto tunableIndex = rcclGetTunableIndex(info->func); auto llMin = comm->minMaxLLRange[tunableIndex][NCCL_PROTO_LL][RCCL_PROTOCOL_MIN_IDX];