From 95b30d9762889daeb231c33ea48d7ee3b3fe4e12 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Tue, 26 Apr 2022 15:40:07 -0700 Subject: [PATCH] topo_expl: fix build and add tuning support (#539) [ROCm/rccl commit: 063da25563ae8bc28f27fbd0ebbbef6b1f61e6f4] --- projects/rccl/tools/topo_expl/Makefile | 2 +- projects/rccl/tools/topo_expl/topo_expl.cpp | 29 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/projects/rccl/tools/topo_expl/Makefile b/projects/rccl/tools/topo_expl/Makefile index 9ca8fe4e8c..90cef51954 100644 --- a/projects/rccl/tools/topo_expl/Makefile +++ b/projects/rccl/tools/topo_expl/Makefile @@ -8,7 +8,7 @@ HIPCC = $(HIP_PATH)/bin/hipcc EXE = topo_expl CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -I/opt/rocm/rocm_smi/include/ -DTOPO_EXPL -DENABLE_TRACE -files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc \ +files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/misc/param.cc \ ../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc ../../src/graph/rome_models.cc all: $(EXE) diff --git a/projects/rccl/tools/topo_expl/topo_expl.cpp b/projects/rccl/tools/topo_expl/topo_expl.cpp index e4fe37e251..a44abacaf9 100644 --- a/projects/rccl/tools/topo_expl/topo_expl.cpp +++ b/projects/rccl/tools/topo_expl/topo_expl.cpp @@ -46,6 +46,7 @@ THE SOFTWARE. 
#include "model.h" #include "utils.h" #include "topo.h" +#include "graph.h" NodeModel *node_model; @@ -236,6 +237,34 @@ int main(int argc,char* argv[]) initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]); } + for (uint64_t len = 8; len <= 4294967296L; len *= 2) { + struct ncclInfo info; + float minTime = 3600000000.0; + info.comm = &comm[0]; + info.coll = ncclFuncAllReduce; + info.nBytes = len; + // Find algorithm / protocol. + info.algorithm = -1; + info.protocol = -1; + int nAlgos = NCCL_NUM_ALGORITHMS; + for (int a=0; a < nAlgos; a++) { + for (int p=0; p < NCCL_NUM_PROTOCOLS; p++) { + float time; + NCCLCHECK(ncclTopoGetAlgoTime(&info, a, p, 1, &time)); + if (time >= 0 && time < minTime) { + info.algorithm = a; + info.protocol = p; + minTime = time; + } + } + } + if (info.algorithm == -1 || info.protocol == -1) { + WARN("Error : no algorithm/protocol available"); + return ncclInternalError; + } + INFO(NCCL_TUNING, "%10ld %s %s time %f", info.nBytes, ncclAlgoStr[info.algorithm], ncclProtoStr[info.protocol], minTime); + } + for (int i = 0; i < nranks; i++) { free(comm[i].connectSend); free(comm[i].connectRecv);