diff --git a/projects/rccl/src/graph/rome_models.cc b/projects/rccl/src/graph/rome_models.cc index 70c1cb37cf..be213ae3b2 100644 --- a/projects/rccl/src/graph/rome_models.cc +++ b/projects/rccl/src/graph/rome_models.cc @@ -354,6 +354,42 @@ static struct rcclRomeModel rome_model_58 = { .ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0", }; +static struct rcclRomeModel rome_model_59 = { + .nGpus = 16, .nCpus = 4, .nNics = 8, .nLinks = 4, + .gpuIds = { 0x4e000, 0x51000, 0x56000, 0x59000, 0xe000, 0x11000, 0x16000, 0x19000, 0xcf000, 0xd2000, 0xd7000, 0xda000, 0x8f000, 0x92000, 0x97000, 0x9a000, }, + .nicIds = { 0x4b000, 0x5a000, 0xb000, 0x1a000, 0xcc000, 0xdb000, 0x8c000, 0x9b000, }, + .gpuNuma = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, }, + .nicNuma = { 0, 0, 1, 1, 2, 2, 3, 3, }, + .connMatrix = { 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, }, + .gdrLevel = { 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 4, 4, 4, 4, }, + .pattern = "42424242", + .ringBase = "N0 1 3 2 6 7 15 14 10 11 9 8 12 13 5 4 0 N0|N1 3 7 6 13 12 8 9 10 11 15 14 5 4 0 1 2 N1|N2 4 5 1 0 2 3 7 6 14 15 11 10 8 9 13 12 N6|N3 6 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 N3|N4 8 12 13 6 7 3 2 1 0 4 5 14 15 11 10 9 N4|N5 11 15 14 6 7 3 2 0 1 5 4 12 13 9 8 10 N5|N6 12 13 5 4 0 1 3 2 6 7 15 14 10 11 9 8 N4|N7 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 15 N7|N3 7 6 14 15 11 10 8 9 13 12 4 5 1 0 2 3 N1|N1 2 3 1 0 4 5 13 12 8 9 11 10 14 15 7 6 N3|N2 5 14 15 11 10 9 8 12 13 6 7 3 2 1 0 4 N2|N4 9 8 10 11 15 14 6 7 3 2 0 1 5 4 12 13 N6|N5 10 11 9 8 12 13 5 4 0 1 3 2 6 7 15 14 N7|N7 15 14 5 4 0 1 2 3 7 6 13 12 8 9 10 11 N5|N0 0 2 3 7 6 14 15 11 10 8 9 13 12 4 5 1 N0|N6 13 12 8 9 11 10 14 15 7 6 2 3 1 0 4 5 N2|", +}; + +static struct rcclRomeModel rome_model_62 = { + .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3, + .gpuIds = { 0xc1000, 0xc6000, 0xc9000, 0xce000, 0xd1000, 0xd6000, 0xd9000, 0xde000, }, + .nicIds = { }, + .gpuNuma = { 3, 3, 1, 1, 0, 0, 2, 2, }, + .nicNuma = { }, + .connMatrix = { 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, }, + .gdrLevel = { }, + .pattern = "20202020", + .ringBase = "0 1 3 2 4 5 7 6|6 7 5 4 2 3 1 0|0 1 5 4 6 7 3 2|2 3 7 6 4 5 1 0", +}; + +static struct rcclRomeModel rome_model_63 = { + .nGpus = 8, .nCpus = 4, .nNics = 4, .nLinks = 3, + .gpuIds = { 0xc1000, 0xc6000, 0xc9000, 0xce000, 0xd1000, 0xd6000, 0xd9000, 0xde000, }, + .nicIds = { 0xc5000, 0xcd000, 0xd5000, 0xdd000, }, + .gpuNuma = { 3, 3, 1, 1, 0, 0, 2, 2, }, + .nicNuma = { 3, 1, 0, 2, }, + .connMatrix = { 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, }, + .gdrLevel = { 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, }, + .pattern = "21212121", + .ringBase = "N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3|N0 0 1 5 4 6 7 3 2 N1|N1 2 3 7 6 4 5 1 0 N0|N3 7 6 0 1 3 2 4 5 N2|N2 5 4 2 3 1 0 6 7 N3", +}; + static struct rcclRomeModel romeTopoModels[] = { rome_model_22, rome_model_25, @@ -381,6 +417,9 @@ static struct rcclRomeModel romeTopoModels[] = { rome_model_55, rome_model_56, rome_model_58, + rome_model_59, + rome_model_62, + rome_model_63, }; /* Parse user defined rings. Format is like : diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc index 1c670ac5f0..35d8225f98 100644 --- a/projects/rccl/src/graph/tuning.cc +++ b/projects/rccl/src/graph/tuning.cc @@ -138,6 +138,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE) (nNodes == 2) ? busBw *= 0.33 : busBw *= 0.11; if (a == NCCL_ALGO_TREE && (p == NCCL_PROTO_LL || p == NCCL_PROTO_LL128)) busBw *= 0.04; if (gcn == 910 && p == NCCL_PROTO_LL && nNodes == 1 && nRanks == 16) busBw *= 5.9; + if (gcn == 910 && a == NCCL_ALGO_TREE && p == NCCL_PROTO_SIMPLE && nNodes == 2 && nRanks == 32) busBw *= 3.2; #else if (compCap80) busBw = std::min(busBw, 235.0f); if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) { busBw = std::min(llMaxBw, busBw * ((nNodes > 1 || coll == ncclFuncAllReduce || coll == ncclFuncReduce) ? 1.0/4.0 : 1.0/3.0)); } @@ -281,16 +282,16 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Trees are not perfectly sticking to the model for medium sizes. Applying a static correction // factor is not ideal but works quite well. Powers of two, 64 B to 128MB. -static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { - { 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, }, - { 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, }, - { 0.4, 0.4, 0.3, 0.3, 0.2, 0.5, 0.5, 0.7, 0.2, 0.2, 0.3, 0.6, 0.7, 1.0, 1.3, 1.0, 1.2, 1.2, 1.1, 1.1, 1.2, 1.2, }, +static float treeCorrectionFactor[NCCL_NUM_PROTOCOLS][25] = { + { 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, }, + { 0.7, 0.7, 0.7, 0.6, 0.6, 0.3, 0.9, 0.5, 0.5, 0.6, 0.5, 0.5, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8, 0.8, 0.9, 0.9, 1.0, 1.0, 1.0, 1.0, }, + { 0.4, 0.4, 0.3, 0.3, 0.2, 0.5, 0.5, 0.7, 0.2, 0.2, 0.3, 0.6, 0.7, 1.0, 1.3, 1.0, 1.2, 1.2, 1.1, 1.1, 1.2, 1.2, 1.5, 1.7, 2.4, }, }; -static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][22] = { - { 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, }, - { 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, }, - { 0.6, 0.4, 0.4, 0.4, 0.2, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.6, 0.8, 0.9, }, +static float ringCorrectionFactor[NCCL_NUM_PROTOCOLS][25] = { + { 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, }, + { 0.4, 0.6, 0.6, 0.3, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.7, 0.9, 1.4, 1.5, 1.0, 0.8, 0.7, 0.8, 0.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, }, + { 0.6, 0.4, 0.4, 0.4, 0.2, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3, 0.4, 0.6, 0.8, 0.9, 1.1, 2.0, 2.9, }, }; ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, int numPipeOps, float* time) { @@ -302,8 +303,14 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto int logSize = log2i(info->nBytes>>6); #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - if (algorithm == NCCL_ALGO_TREE && logSize < 22) bw *= treeCorrectionFactor[protocol][logSize]; - else if (algorithm == NCCL_ALGO_RING && logSize < 22) bw *= ringCorrectionFactor[protocol][logSize]; + if (algorithm == NCCL_ALGO_TREE) { + if (logSize < 25) bw *= treeCorrectionFactor[protocol][logSize]; + else bw *= treeCorrectionFactor[protocol][24]; + } + else if (algorithm == NCCL_ALGO_RING) { + if(logSize < 25) bw *= ringCorrectionFactor[protocol][logSize]; + else bw *= ringCorrectionFactor[protocol][24]; + } #else if (algorithm == NCCL_ALGO_TREE && logSize < 23) bw *= treeCorrectionFactor[protocol][logSize]; if (info->nChannels != 0) bw = bw / info->comm->nChannels * info->nChannels; diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 989274015d..a4f3ce3a10 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -931,6 +931,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm allGather3Data[rank].nc = 4; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) allGather3Data[rank].nc = 4; + if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) + allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); allGather3Data[rank].tree.pattern = treeGraph.pattern; allGather3Data[rank].tree.nChannels = treeGraph.nChannels; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; diff --git a/projects/rccl/tools/scripts/topo_val.sh b/projects/rccl/tools/scripts/topo_val.sh index 3a71359d92..2d3a096ef4 100755 --- a/projects/rccl/tools/scripts/topo_val.sh +++ b/projects/rccl/tools/scripts/topo_val.sh @@ -21,7 +21,7 @@ DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -for i in {0..58} +for i in {0..63} do if [[ $i -eq 50 ]] || [[ $i -eq 51 ]] then diff --git a/projects/rccl/tools/topo_expl/models/topo_16p1h.xml b/projects/rccl/tools/topo_expl/models/topo_16p1h.xml index 9225986e4a..3389967c79 100644 --- a/projects/rccl/tools/topo_expl/models/topo_16p1h.xml +++ b/projects/rccl/tools/topo_expl/models/topo_16p1h.xml @@ -191,7 +191,7 @@ - + @@ -217,15 +217,10 @@ - + - - - - - diff --git a/projects/rccl/tools/topo_expl/models/topo_3p_pcie.xml b/projects/rccl/tools/topo_expl/models/topo_3p_pcie.xml new file mode 100644 index 0000000000..16bea77b3e --- /dev/null +++ b/projects/rccl/tools/topo_expl/models/topo_3p_pcie.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/rccl/tools/topo_expl/models/topo_3p_pcie_1.xml b/projects/rccl/tools/topo_expl/models/topo_3p_pcie_1.xml new file mode 100644 index 0000000000..b478ddf627 --- /dev/null +++ b/projects/rccl/tools/topo_expl/models/topo_3p_pcie_1.xml @@ -0,0 +1,26 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/rccl/tools/topo_expl/models/topo_8p_4nics.xml b/projects/rccl/tools/topo_expl/models/topo_8p_4nics.xml new file mode 100644 index 0000000000..8129927d71 --- /dev/null +++ b/projects/rccl/tools/topo_expl/models/topo_8p_4nics.xml @@ -0,0 +1,126 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/projects/rccl/tools/topo_expl/topo_expl.cpp b/projects/rccl/tools/topo_expl/topo_expl.cpp index 320948febc..91109ed90a 100644 --- a/projects/rccl/tools/topo_expl/topo_expl.cpp +++ b/projects/rccl/tools/topo_expl/topo_expl.cpp @@ -128,6 +128,11 @@ NodeModelDesc model_descs[] = { {1, "topo_16p1h.xml", "single node 16P1H"}, {4, "topo_8p_rome_4n_2.xml", "4 nodes 8 gfx908 Rome 4 NICs NPS=4 Alt. Model"}, {1, "topo_8p_90a_1.xml", "single node gfx90a Alt. Model"}, + {4, "topo_16p1h.xml", "4 nodes 16P1H"}, + {4, "topo_3p_pcie.xml", "4 nodes 3P"}, + {4, "topo_3p_pcie_1.xml", "4 nodes 3P Alt. Model"}, + {1, "topo_8p_4nics.xml", "single nodes 8P 4 NICs"}, + {4, "topo_8p_4nics.xml", "4 nodes 8P 4 NICs"}, }; int main(int argc,char* argv[]) diff --git a/projects/rccl/tools/topo_expl/utils.cpp b/projects/rccl/tools/topo_expl/utils.cpp index 95d3702cef..d232f003e9 100644 --- a/projects/rccl/tools/topo_expl/utils.cpp +++ b/projects/rccl/tools/topo_expl/utils.cpp @@ -672,6 +672,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t allGather3Data[rank].nc = 4; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) allGather3Data[rank].nc = 4; + if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) + allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4/ringGraph.nChannels); allGather3Data[rank].tree.pattern = treeGraph.pattern; allGather3Data[rank].tree.nChannels = treeGraph.nChannels; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels;