diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc index a63ef585c4..07d6e6df7c 100644 --- a/projects/rccl/src/graph/tuning.cc +++ b/projects/rccl/src/graph/tuning.cc @@ -54,7 +54,7 @@ ncclResult_t parseList(const char* str, const char* elems[], int nelems, int* li // Latencies in us, Bandwidths in GB/s // Tree { LL, LL128, Simple } , Ring { LL, LL128, Simple } -static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 39.0, 39.0, 49.0 }, { 21.0, 21.0, 30.0 }, { 37.9, 37.9, 40.4 } }; +static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 }, { 12.0, 12.0, 17.0 } }; // NVLink, PCI, Network #define NCCL_HW_NVLINK 0 @@ -63,11 +63,11 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 39.0 // Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network). static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 2.5, 2.5, 5.5 }, /* Ring (LL/LL128/Simple)*/ { 2.5, 2.5, 5 }, /* CollNet (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 } }, + { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNet (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } }, + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 28.0, 28.0, 66.0 }, /* Ring (LL/LL128/Simple)*/ { 8.5, 8.5, 19.0 }, /* CollNet (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 } } + { /* Tree (LL/LL128/Simple)*/ { 40.0, 40.0, 50.0 }, /* Ring (LL/LL128/Simple)*/ { 4.0, 4.0, 25.0 }, /* CollNet (LL/LL128/Simple)*/ { 40.0, 40.0, 50.0 } } }; // LL128 max BW (per channel) for the different collectives @@ -76,7 +76,7 @@ static const double ll128MaxBwPerCh[NCCL_NUM_FUNCTIONS] = { 18.8, 12.0, 18.3, 15 static const double llMaxBws[2][3] = { /* Volta-N1/Intel-N2/Intel-N4) */ {39.0, 39.0, 20.4}, /* Ampere-N1/AMD-N2/AMD-N4) */ {87.7, 22.5 /*avg of ring & tree*/, 19.0} }; static const double perChMaxTreeBws[2][3] = { /* Volta (N1/N2/N4) */ {26.5, 18.5, 10.0}, /* Ampere (N1/N2/N4) */ {24.0, 22.5, 16.0} }; -ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph) { +ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, int gcn) { int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) @@ -133,14 +133,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Various model refinements #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) busBw *= 1.0/5.0; - if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (ppn < 2 ? 0.7 : 0.92 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels); - double maxTreeBw = comm->nNodes > 2 ? - compCap80 && p == NCCL_PROTO_LL128 ? 105.0 : 80.0 : - compCap80 && p == NCCL_PROTO_LL128 ? 130.0 : 110.0; - if (a == NCCL_ALGO_TREE) busBw = std::min(busBw*.27, comm->nNodes > 1 ? 70.0 : 90.0); - if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw *= 1.0/2.3; - if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (comm->nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels*7.0/9.0); + if (a == NCCL_ALGO_RING && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.05; + if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE) (nNodes == 2) ? busBw *= 0.33 : busBw *= 0.11; + if (a == NCCL_ALGO_TREE && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.04; + if (gcn == 910 && p == NCCL_PROTO_LL && nNodes == 1 && nRanks == 16) busBw *= 5.9; #else if (compCap80) busBw = std::min(busBw, 235.0f); if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); } @@ -159,7 +155,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom comm->latencies[coll][a][p] = baseLat[a][p]; float intraLat = hwLat[intraHw[a]][a][p]; float interLat = hwLat[NCCL_HW_NET][a][p]; - if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8; + //if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8; if (a == NCCL_ALGO_RING) { float lat = hwLat[hw[a]][a][p]; if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) { @@ -285,15 +281,15 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction // factor is not ideal but works quite well. Powers of two, 64 B to 128MB. static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .84, .49, .42, .60, .75, .87, .94, .94, .99, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .41, .27, .25, .39, .46, .72, .76, .87, .92, .97, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 } + { 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, }, + { 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, }, + { 0.4, 0.4, 0.3, 0.3, 0.2, 0.5, 0.5, 0.7, 0.2, 0.2, 0.3, 0.6, 0.7, 1.0, 1.3, 1.0, 1.2, 1.2, 1.1, 1.1, 1.2, 1.2, }, }; static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .25, .41, .55, .56, .78, .94, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .25, .41, .55, .56, .78, .94, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 }, - { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, .04, .08, .09, .09, .11, .13, .25, .40, .59, .76, .86, 1.0 , 1.0 , 1.0 , 1.0 , 1.0 } + { 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, }, + { 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, }, + { 0.6, 0.4, 0.4, 0.4, 0.2, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.6, 0.8, 0.9, }, }; ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time) { diff --git a/projects/rccl/src/include/graph.h b/projects/rccl/src/include/graph.h index 5072ba1249..34b93be4e7 100644 --- a/projects/rccl/src/include/graph.h +++ b/projects/rccl/src/include/graph.h @@ -105,7 +105,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc); -ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph); +ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, int gcn); #include "info.h" ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time); diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index d24b33cb55..74ae05e3f4 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -1114,7 +1114,7 @@ collnet_cleanup: TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); // Compute time models for algorithm and protocol combinations - NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph)); + NCCLCHECK(ncclTopoTuneModel(comm, minCompCap, maxCompCap, &treeGraph, &ringGraph, &collNetGraph, comm->topo->nodes[GPU].nodes[0].gpu.gcn)); // Compute nChannels per peer for p2p NCCLCHECK(ncclTopoComputeP2pChannels(comm));