topo_expl: 2.19.4 update and fix build error (#1098)

Этот коммит содержится в:
Wenkai Du
2024-03-07 08:52:50 -08:00
коммит произвёл GitHub
родитель 77615cce28
Коммит d2224fd3e1
2 изменённых файла: 14 добавлений и 17 удалений
+1 -1
Просмотреть файл
@@ -6,7 +6,7 @@ endif
HIPCC = $(HIP_PATH)/bin/hipcc
EXE = topo_expl
CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL -lpthread
files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc
+13 -16
Просмотреть файл
@@ -408,9 +408,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
int done = 0;
int maxPeers = ncclParamConnectRoundMaxPeers();
data = (ncclConnect**) malloc(sizeof(ncclConnect*) * maxPeers); // Store intermediate send/recvData structs for connect
recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * maxPeers); // Points to entries inside data for given recv connection within a channel
sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * maxPeers); // Points to entries inside data for given send connection within a channel
NCCLCHECK(ncclCalloc(&data, maxPeers));
NCCLCHECK(ncclCalloc(&recvData, maxPeers));
NCCLCHECK(ncclCalloc(&sendData, maxPeers));
//NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
// First time initialization
for (int i=1; i<comm->nRanks; i++) {
@@ -426,7 +426,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
// The next M entries contain sendData, connection information for send connections
// It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
int p = i-(done+1);
if (recvMask || sendMask) data[p] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS);
if (recvMask || sendMask) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
recvData[p] = data[p];
int sendChannels = 0, recvChannels = 0;
int type;
@@ -553,9 +553,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
}
}
}*/
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
*/
}
free(data);
@@ -789,6 +788,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
comm->topo->ll128Enabled = false;
// Topology hint for MSCCL internal scheduler about whether to enable MSCCL
comm->topo->mscclEnabled = false;
// Topology hint if tree has been defined by model or User
comm->topo->treeDefined = false;
// Compute paths between GPUs and NICs
NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
// Remove inaccessible GPUs and unused NICs
@@ -810,7 +811,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
// Determine local CollNet support
if (collNetSupport(comm)) {
char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE");
if (collNetEnable != NULL) {
INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
if (strcmp(collNetEnable, "1") == 0) {
@@ -825,7 +826,6 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
// Get rings and trees
ringGraph.id = 0;
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
ringGraph.collNet = 0;
ringGraph.minChannels = 1;
ringGraph.maxChannels = MAXCHANNELS/2;
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail);
@@ -846,20 +846,15 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
if (comm->collNetSupport) {
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
} else {
collNetGraph.nChannels = 0;
}
nvlsGraph.id = 3;
nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS;
nvlsGraph.collNet = 0;
nvlsGraph.minChannels = 1;
nvlsGraph.maxChannels = MAXCHANNELS;
if (comm->nvlsSupport) {
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail);
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail);
} else {
nvlsGraph.nChannels = 0;
}
bool allXgmi, hasPeerAccess;
@@ -960,6 +955,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
int *topParentLocalRanks = NULL;
int tpProxyRank;
int highestTransportType = TRANSPORT_P2P;
bool needsProxy = false;
//NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
// Determine nNodes, firstRanks, ...
@@ -1113,7 +1110,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, fail);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0, &highestTransportType, &needsProxy), ret, fail);
if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) {
comm->useIntraNet = 1;
// Connect NET for intranode use
@@ -1122,7 +1119,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
if (comm->nRanks == 1) continue;
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, fail);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET, &highestTransportType, &needsProxy), ret, fail);
}
INFO(NCCL_INIT, "Connected all rings");
@@ -1133,7 +1130,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail);
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail);
}
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail);
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0, &highestTransportType, &needsProxy), ret, fail);
INFO(NCCL_INIT, "Connected all trees");
#if 0