topo_expl: fix build and add tuning support (#539)

This commit is contained in:
Wenkai Du
2022-04-26 15:40:07 -07:00
کامیت شده توسط GitHub
والد 379940dfac
کامیت 063da25563
2 فایلهای تغییر یافته به همراه 30 افزوده شده و 1 حذف شده
+1 -1
مشاهده پرونده
@@ -8,7 +8,7 @@ HIPCC = $(HIP_PATH)/bin/hipcc
EXE = topo_expl
CXXFLAGS = -g -O3 -Iinclude -I../../src -I../../src/include -I../../src/graph/ -I/opt/rocm/rocm_smi/include/ -DTOPO_EXPL -DENABLE_TRACE
files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc \
files = $(EXE).cpp model.cpp utils.cpp ../../src/graph/topo.cc ../../src/graph/rings.cc ../../src/graph/paths.cc ../../src/graph/trees.cc ../../src/misc/param.cc \
../../src/graph/search.cc ../../src/graph/connect.cc ../../src/graph/tuning.cc ../../src/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc ../../src/graph/rome_models.cc
all: $(EXE)
@@ -46,6 +46,7 @@ THE SOFTWARE.
#include "model.h"
#include "utils.h"
#include "topo.h"
#include "graph.h"
NodeModel *node_model;
@@ -236,6 +237,34 @@ int main(int argc,char* argv[])
initTransportsRank_3(&comm[i], allGather3Data, treeGraph[i], ringGraph[i], collNetGraph[i]);
}
for (uint64_t len = 8; len <= 4294967296L; len *= 2) {
struct ncclInfo info;
float minTime = 3600000000.0;
info.comm = &comm[0];
info.coll = ncclFuncAllReduce;
info.nBytes = len;
// Find algorithm / protocol.
info.algorithm = -1;
info.protocol = -1;
int nAlgos = NCCL_NUM_ALGORITHMS;
for (int a=0; a<nAlgos; a++) {
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
float time;
NCCLCHECK(ncclTopoGetAlgoTime(&info, a, p, 1, &time));
if (time >= 0 && time < minTime) {
info.algorithm = a;
info.protocol = p;
minTime = time;
}
}
}
if (info.algorithm == -1 || info.protocol == -1) {
WARN("Error : no algorithm/protocol available");
return ncclInternalError;
}
INFO(NCCL_TUNING, "%10ld %s %s time %f", info.nBytes, ncclAlgoStr[info.algorithm], ncclProtoStr[info.protocol], minTime);
}
for (int i = 0; i < nranks; i++) {
free(comm[i].connectSend);
free(comm[i].connectRecv);