From ffecb74b1e40bca97ea38cce5c4a54460edbe2cd Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Fri, 4 Nov 2022 22:54:29 +0000 Subject: [PATCH] Update tuning table and fix topo_expl [ROCm/rccl commit: 94ad7f6f51e925593decde12f75ece0446a55d41] --- projects/rccl/src/graph/rome_models.cc | 2 +- projects/rccl/src/graph/tuning.cc | 41 +++++++++++++------------ projects/rccl/tools/topo_expl/utils.cpp | 4 +-- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/projects/rccl/src/graph/rome_models.cc b/projects/rccl/src/graph/rome_models.cc index 018b1bef6c..0ee8149c26 100644 --- a/projects/rccl/src/graph/rome_models.cc +++ b/projects/rccl/src/graph/rome_models.cc @@ -721,7 +721,7 @@ newchannel: } while (str[offset++] != 0); end: graph->nChannels = nChannels; - graph->bwIntra = graph->bwInter = system->maxBw; + graph->bwIntra = graph->bwInter = system->totalBw/nChannels; if (graph->id == 1) { for (int i=0; inChannels; i++) { int net; diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc index f45b881cc5..2112279bb5 100644 --- a/projects/rccl/src/graph/tuning.cc +++ b/projects/rccl/src/graph/tuning.cc @@ -71,30 +71,30 @@ struct tuningModel { static struct tuningModel tuning_model_0 { .hwLat = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.8, 1.4, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 2.2, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 1.4 } }, /* PCI */ { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 28.3, 28.3, 45.4 }, /* Ring (LL/LL128/Simple)*/ { 2.0, 2.0, 24.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 45.4 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 45.4 } }, + { /* Tree (LL/LL128/Simple)*/ { 11.8, 18.2, 20.8 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 19.8, 15.1 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 11.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 18.2 } }, }, .bwRatio = { /* 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.06, 1.00, 1.30 }, /* Ring (LL/LL128/Simple)*/ { 0.07, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.91 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } }, /* more than 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.06, 1.00, 0.30 }, /* Ring (LL/LL128/Simple)*/ { 0.07, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.04, 0.22, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.04, 0.34, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } }, }, .treeCorrectionFactor = { - { 0.3, 0.9, 0.8, 0.7, 0.6, 0.3, 0.1, 0.4, 0.8, 0.8, 0.5, 0.3, 0.4, 0.3, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, - { 0.3, 0.9, 0.8, 0.7, 0.6, 0.3, 0.1, 0.4, 0.8, 0.8, 0.5, 0.3, 0.4, 0.3, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, - { 0.4, 1.0, 1.0, 0.8, 1.0, 1.0, 0.3, 1.0, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.7, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, }, + { 0.1, 0.2, 0.1, 0.1, 0.9, 0.3, 0.4, 0.1, 0.2, 0.4, 0.2, 0.1, 0.3, 0.3, 0.2, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.1, 0.3, 1.0, 0.1, 0.5, 1.0, 0.9, 1.0, 1.0, 1.0, 0.3, 0.1, 0.4, 0.5, 0.5, 0.4, 0.4, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, }, + { 0.2, 1.0, 0.1, 0.1, 0.7, 0.2, 0.4, 0.1, 0.1, 0.3, 0.4, 0.3, 0.6, 0.8, 1.0, 1.0, 1.0, 1.0, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 0.9, }, }, .ringCorrectionFactor = { - { 0.2, 0.7, 0.7, 0.6, 0.6, 0.3, 0.2, 0.5, 1.0, 1.0, 0.8, 0.6, 0.8, 0.6, 0.3, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, - { 0.2, 0.7, 0.7, 0.6, 0.6, 0.3, 0.2, 0.5, 1.0, 1.0, 0.8, 0.6, 0.8, 0.6, 0.3, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, - { 1.0, 0.6, 0.9, 1.0, 0.7, 0.7, 1.0, 0.6, 0.8, 0.2, 0.1, 0.1, 0.1, 0.1, 0.2, 0.5, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.4, 0.2, 0.3, 0.5, 0.3, 0.1, 0.5, 0.5, 0.3, 0.2, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.5, 0.4, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, }, + { 1.0, 0.8, 0.2, 1.0, 1.0, 0.3, 1.0, 0.1, 0.1, 0.2, 0.2, 0.1, 0.5, 1.0, 0.8, 0.8, 1.0, 0.9, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, }, }; @@ -161,30 +161,30 @@ static struct tuningModel tuning_model_2 { static struct tuningModel tuning_model_3 { .hwLat = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* Ring (LL/LL128/Simple)*/ { 1.5, 1.5, 4.5 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 4.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 4.5 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.8, 0.0, 2.5 }, /* Ring (LL/LL128/Simple)*/ { 0.8, 0.0, 3.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 0.8 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 } }, /* PCI */ { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 5.7 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 5.7 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 17.4, 17.4, 40.3 }, /* Ring (LL/LL128/Simple)*/ { 4.1, 4.1, 40.6 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 40.3 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 40.3 } }, + { /* Tree (LL/LL128/Simple)*/ { 12.5, 0.0, 22.4 }, /* Ring (LL/LL128/Simple)*/ { 9.5, 0.0, 19.8 }, /* CollNetDirect (Simple)*/ { 0.0, 0.0, 12.5 }, /* CollNetChain (Simple)*/ { 0.0, 0.0, 0.0 } }, }, .bwRatio = { /* 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.08, 1.00, 0.95 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 1.75 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } }, /* more than 2 nodes */ - { /* Tree (LL/LL128/Simple)*/ { 0.08, 1.00, 0.41 }, /* Ring (LL/LL128/Simple)*/ { 0.08, 1.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } }, + { /* Tree (LL/LL128/Simple)*/ { 0.20, 0.00, 0.96 }, /* Ring (LL/LL128/Simple)*/ { 0.20, 0.00, 1.00 }, /* CollNetDirect (Simple)*/ { 0.00, 0.00, 1.00 }, /* CollNetChain (Simple)*/ { 0.00, 0.00, 1.00 } }, }, .treeCorrectionFactor = { - { 0.6, 1.0, 1.0, 1.0, 0.1, 0.2, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.9, 0.9, 0.8, 0.6, 0.4, 0.5, 0.4, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, }, - { 0.6, 1.0, 1.0, 1.0, 0.1, 0.2, 0.1, 0.2, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 0.9, 0.9, 0.8, 0.6, 0.4, 0.5, 0.4, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, }, - { 1.0, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, 1.0, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5, 0.6, 0.6, 0.6, 0.6, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 0.2, 1.0, 0.9, 1.0, 0.6, 0.4, 0.6, 0.4, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, }, + { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, }, + { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1, 0.1, 0.1, 0.2, 1.0, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.8, 0.7, 0.8, 0.9, 0.7, 0.7, }, }, .ringCorrectionFactor = { - { 0.5, 0.2, 0.1, 0.1, 0.3, 0.3, 0.1, 0.3, 0.7, 0.8, 0.5, 0.4, 0.2, 0.1, 0.1, 0.1, 0.3, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, }, - { 0.5, 0.2, 0.1, 0.1, 0.3, 0.3, 0.1, 0.3, 0.7, 0.8, 0.5, 0.4, 0.2, 0.1, 0.1, 0.1, 0.3, 0.5, 0.4, 0.4, 0.3, 0.2, 0.1, 0.1, 0.1, 0.1, 0.1, }, - { 0.2, 0.3, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 1.0, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.1, 0.2, 0.1, 0.4, 0.4, 0.2, 0.2, 0.3, 0.7, 0.5, 0.4, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, }, + { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, }, + { 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 1.0, 0.1, 0.3, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.3, 0.4, 0.7, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, }, }, }; @@ -304,6 +304,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom int collnet = (a == NCCL_ALGO_COLLNET_DIRECT || a == NCCL_ALGO_COLLNET_CHAIN) ? 1 : 0; float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter; float busBw = comm->topo->baseBw != 0.0 ? comm->topo->baseBw : graphs[a]->nChannels * bw; + //INFO(NCCL_INIT, "algo %s proto %s busBw %f baseBw %f bw %f nChannels %d bwIntra %f bwInter %f", ncclAlgoStr[a], ncclProtoStr[p], busBw, comm->topo->baseBw, bw, graphs[a]->nChannels, graphs[a]->bwIntra, graphs[a]->bwInter); // Various model refinements #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) diff --git a/projects/rccl/tools/topo_expl/utils.cpp b/projects/rccl/tools/topo_expl/utils.cpp index 5a5728a886..18ff6a72ba 100644 --- a/projects/rccl/tools/topo_expl/utils.cpp +++ b/projects/rccl/tools/topo_expl/utils.cpp @@ -32,7 +32,7 @@ #include "rocm_smi/rocm_smi.h" const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+2] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAllPivot" }; -const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" }; +const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNetDirect", "CollNetChain" }; const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" }; extern NodeModel *node_model; @@ -698,7 +698,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather3Data_t comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count) ? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels; - NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks)); + NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks)); return ncclSuccess; }