From 562eb089785c0a6cc1f614e80b69aa77936cb50c Mon Sep 17 00:00:00 2001 From: Pedram Alizadeh Date: Mon, 29 Jul 2024 15:43:36 -0400 Subject: [PATCH] Adding tuner plugin example for MI300 (#1274) [ROCm/rccl commit: b005c13292ee33ccd4cccc69bad17f6d27388dc3] --- projects/rccl/ext-tuner/example/Makefile | 6 +- projects/rccl/ext-tuner/example/plugin.c | 214 ++++++++++++++++++++++- 2 files changed, 214 insertions(+), 6 deletions(-) diff --git a/projects/rccl/ext-tuner/example/Makefile b/projects/rccl/ext-tuner/example/Makefile index 9d9ace4842..c11e321718 100644 --- a/projects/rccl/ext-tuner/example/Makefile +++ b/projects/rccl/ext-tuner/example/Makefile @@ -3,9 +3,9 @@ # # See LICENSE.txt for license information # -NCCL_HOME:=../../build/ -CUDA_HOME:=/usr/local/cuda -INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include -Inccl +RCCL_HOME:=../../build/release +HIP_HOME:=/opt/rocm +INC:= -I$(RCCL_HOME)/include/ -I$(HIP_HOME)/include/ -D__HIP_PLATFORM_AMD__ -Inccl PLUGIN_SO:=libnccl-tuner.so default: $(PLUGIN_SO) diff --git a/projects/rccl/ext-tuner/example/plugin.c b/projects/rccl/ext-tuner/example/plugin.c index d972de3d3a..5b7f53d0ff 100644 --- a/projects/rccl/ext-tuner/example/plugin.c +++ b/projects/rccl/ext-tuner/example/plugin.c @@ -5,14 +5,222 @@ ************************************************************************/ #include "tuner.h" - #define __hidden __attribute__ ((visibility("hidden"))) +#define HOPPER_COMPCAP_IDX 2 +// NVLink, PCI, Network +#define NCCL_HW_NVLINK 0 +#define NCCL_HW_PCI 1 +#define NCCL_HW_NET 2 -__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { return ncclSuccess; } +static long log2i(long n) { + long l = 0; + while (n>>=1) l++; + return l; +} +// Latencies in us, Bandwidths in GB/s +// Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } +static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { + { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Tree, Ring + { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, // Collnet Direct, Chain + { 0, 0, 0 }, { 0, 0, 0 }}; // NVLS, NVLS Tree + +struct tuningModel { + float hwLat[3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float bwRatio[2][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][27]; + float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][27]; +}; + +static struct tuningModel tuning_model = { + { + /* NVLINK */ + { /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + /* PCI */ + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + /* NET */ + { /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + }, + + { + /* 2 nodes */ + { /* Tree (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* Ring (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + /* more than 2 nodes */ + { /* Tree (LL/LL128/Simple)*/ { 0.41, 0.00, 0.86 }, /* Ring (LL/LL128/Simple)*/ { 0.41, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 }, /* NVLS */ { 0, 0, 0 }, /* NVLS Tree */ { 0, 0, 0 } }, + }, + + { + { 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.8, 0.1, 0.4, 0.5, 1.0, 0.6, 0.4, 0.6, 0.1, 0.3, 0.4, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, }, + { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 0.4, 1.0, 1.0, 1.0, 0.2, 0.7, 1.0, 1.0, 1.0, 0.8, 0.7, 0.7, 0.8, 0.8, 0.8, 0.9, }, + }, + + { + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 0.1, 0.2, 0.2, 0.1, 0.5, 0.8, 1.0, 0.2, 0.4, 0.5, 0.4, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, }, + { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.7, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, + }, +}; + +float latencies[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; +float bandwidths[NCCL_NUM_FUNCTIONS][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS]; + +ncclResult_t ncclTopoGetAlgoTime_Tuner(ncclFunc_t collType, int algorithm, int protocol, int numPipeOps, float* time, size_t nBytes) { + float bw = bandwidths[collType][algorithm][protocol]; + float lat = latencies[collType][algorithm][protocol]; + + if (bw == 0) { + *time = -1.0; return ncclSuccess; + } + int logSize = log2i(nBytes>>6); + if (algorithm == NCCL_ALGO_TREE) { + if (logSize < 27) bw *= tuning_model.treeCorrectionFactor[protocol][logSize]; + else bw *= tuning_model.treeCorrectionFactor[protocol][26]; + } + else if (algorithm == NCCL_ALGO_RING) { + if(logSize < 27) bw *= tuning_model.ringCorrectionFactor[protocol][logSize]; + else bw *= tuning_model.ringCorrectionFactor[protocol][26]; + } + + int latCount = 1; + *time = lat * latCount + (nBytes) / (1000 * bw); + return ncclSuccess; +} + +__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) { + if (nRanks <= 1) return ncclSuccess; + int compCapIndex = HOPPER_COMPCAP_IDX; + int index2 = nNodes <= 2 ? nNodes-1 : 2; + int index1 = nNodes == 1 ? compCapIndex : 1; + float ppn = (float)nRanks / nNodes; // if ppn < 2, then we are sending/receiving at the same GPU through the NIC, apply some bw discount + + int intraHw[NCCL_NUM_ALGORITHMS], hw[NCCL_NUM_ALGORITHMS]; + for (int a=0; a 1 ? 2*nNodes :0) : + coll == ncclFuncReduceScatter || coll == ncclFuncAllGather ? nNodes-1 : + nNodes; + + for (int a=0; abwIntra : graphs[a]->bwInter + if (a == NCCL_ALGO_NVLS) bw = 0.0; + if (a == NCCL_ALGO_NVLS_TREE) bw = 0.0; + if (collnet == 1) bw = 0.0; + int nChannels = 28; //nNodes==1 && MI300 + float busBw = nChannels * bw; //comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw + + // Various model refinements + if (nNodes <= 2) + busBw *= tuning_model.bwRatio[0][a][p]; + else + busBw *= tuning_model.bwRatio[1][a][p]; + if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL && (coll == ncclFuncBroadcast || coll == ncclFuncReduce) && nNodes == 1) { busBw = busBw * 1.65; } + + // Convert bus BW to algorithm BW + if (!(a == NCCL_ALGO_COLLNET_DIRECT && (coll == ncclFuncAllGather || coll == ncclFuncReduceScatter))) { + float ratio = 1.0f; + if (a == NCCL_ALGO_RING) ratio *= (1.0 * nRanks) / nsteps; + else if (a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) ratio *= 5.0/6.0; + else ratio *= .5; + busBw *= ratio; + } + bandwidths[coll][a][p] = busBw; + latencies[coll][a][p] = baseLat[a][p]; + float intraLat = tuning_model.hwLat[intraHw[a]][a][p]; + float interLat = tuning_model.hwLat[NCCL_HW_NET][a][p]; + + if (a == NCCL_ALGO_RING) { + float lat = tuning_model.hwLat[hw[a]][a][p]; + if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) { + latencies[coll][a][p] += lat; + } else { + // Inter-node rings still have to launch nsteps * net overhead. + float netOverhead = 0.0; + if (nNodes > 1) { + netOverhead = 1; + if (p == NCCL_PROTO_SIMPLE) netOverhead *= 3; + } + if (intraLat < netOverhead) intraLat = netOverhead; + latencies[coll][a][p] += (nsteps-nInterSteps)*intraLat + nInterSteps*interLat; + } + } else if (a == NCCL_ALGO_TREE) { + latencies[coll][a][p] += + 2 * ((nRanks/nNodes-1) * intraLat + log2i(nNodes) * interLat); + } else if (a == NCCL_ALGO_COLLNET_DIRECT) { + int minimum = 1; + if ((nRanks/nNodes-1) < 1) minimum = (nRanks/nNodes-1); + latencies[coll][a][p] += + 2 * (minimum * intraLat + (nRanks/nNodes-1) * 0.4) + interLat; // Add 0.4 us arity serialization latency + } else if (a == NCCL_ALGO_COLLNET_CHAIN) { + latencies[coll][a][p] += 2 * (nRanks/nNodes-1) * intraLat + interLat; + } else if (a == NCCL_ALGO_NVLS) { + if (nNodes > 1) latencies[coll][a][p] += tuning_model.hwLat[NCCL_HW_NET][a][p]; + } else if (a == NCCL_ALGO_NVLS_TREE) { + latencies[coll][a][p] += 2*(nNodes-1)*tuning_model.hwLat[NCCL_HW_NET][a][p]; + } + } + } + } + // Protocols/Algorithms enable/disable, and user overrides. + // All are enabled except ll128 which is enabled by default only in certain cases. + int protoEnable[NCCL_NUM_PROTOCOLS] = { 1, 2, 1 }; + int algoEnable[NCCL_NUM_ALGORITHMS] = { 1, 1, 1, 1, 1, 1 }; + + // MNNVL: NVLS not yet supported + algoEnable[NCCL_ALGO_NVLS_TREE] = 0; + algoEnable[NCCL_ALGO_COLLNET_DIRECT] = 0; + algoEnable[NCCL_ALGO_COLLNET_CHAIN] = 0; + algoEnable[NCCL_ALGO_NVLS] = 0; + + for (int c=0; c= 0 && time < minTime) { + *algorithm = a; + *protocol = p; + minTime = time; + } + } + } + return ncclSuccess; +} __hidden ncclResult_t pluginDestroy() { return ncclSuccess; }