topo_expl: 2.19.4 update and fix build error (#1098)
Этот коммит содержится в:
@@ -6,7 +6,7 @@ endif
|
||||
HIPCC = $(HIP_PATH)/bin/hipcc
|
||||
|
||||
EXE = topo_expl
|
||||
CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL
|
||||
CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL -lpthread
|
||||
|
||||
files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \
|
||||
hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc
|
||||
|
||||
@@ -408,9 +408,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
int done = 0;
|
||||
|
||||
int maxPeers = ncclParamConnectRoundMaxPeers();
|
||||
data = (ncclConnect**) malloc(sizeof(ncclConnect*) * maxPeers); // Store intermediate send/recvData structs for connect
|
||||
recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * maxPeers); // Points to entries inside data for given recv connection within a channel
|
||||
sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * maxPeers); // Points to entries inside data for given send connection within a channel
|
||||
NCCLCHECK(ncclCalloc(&data, maxPeers));
|
||||
NCCLCHECK(ncclCalloc(&recvData, maxPeers));
|
||||
NCCLCHECK(ncclCalloc(&sendData, maxPeers));
|
||||
//NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail);
|
||||
// First time initialization
|
||||
for (int i=1; i<comm->nRanks; i++) {
|
||||
@@ -426,7 +426,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
// The next M entries contain sendData, connection information for send connections
|
||||
// It's not guaranteed that each entry of data has the same number of total or send/recv specific connections
|
||||
int p = i-(done+1);
|
||||
if (recvMask || sendMask) data[p] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS);
|
||||
if (recvMask || sendMask) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS));
|
||||
recvData[p] = data[p];
|
||||
int sendChannels = 0, recvChannels = 0;
|
||||
int type;
|
||||
@@ -553,9 +553,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
|
||||
NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail);
|
||||
}
|
||||
}
|
||||
}*/
|
||||
comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL;
|
||||
*/
|
||||
}
|
||||
|
||||
free(data);
|
||||
@@ -789,6 +788,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
|
||||
comm->topo->ll128Enabled = false;
|
||||
// Topology hint for MSCCL internal scheduler about whether to enable MSCCL
|
||||
comm->topo->mscclEnabled = false;
|
||||
// Topology hint if tree has been defined by model or User
|
||||
comm->topo->treeDefined = false;
|
||||
// Compute paths between GPUs and NICs
|
||||
NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail);
|
||||
// Remove inaccessible GPUs and unused NICs
|
||||
@@ -810,7 +811,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
|
||||
|
||||
// Determine local CollNet support
|
||||
if (collNetSupport(comm)) {
|
||||
char *collNetEnable = getenv("NCCL_COLLNET_ENABLE");
|
||||
const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE");
|
||||
if (collNetEnable != NULL) {
|
||||
INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable);
|
||||
if (strcmp(collNetEnable, "1") == 0) {
|
||||
@@ -825,7 +826,6 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
|
||||
// Get rings and trees
|
||||
ringGraph.id = 0;
|
||||
ringGraph.pattern = NCCL_TOPO_PATTERN_RING;
|
||||
ringGraph.collNet = 0;
|
||||
ringGraph.minChannels = 1;
|
||||
ringGraph.maxChannels = MAXCHANNELS/2;
|
||||
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail);
|
||||
@@ -846,20 +846,15 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
|
||||
if (comm->collNetSupport) {
|
||||
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail);
|
||||
} else {
|
||||
collNetGraph.nChannels = 0;
|
||||
}
|
||||
|
||||
nvlsGraph.id = 3;
|
||||
nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS;
|
||||
nvlsGraph.collNet = 0;
|
||||
nvlsGraph.minChannels = 1;
|
||||
nvlsGraph.maxChannels = MAXCHANNELS;
|
||||
if (comm->nvlsSupport) {
|
||||
NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail);
|
||||
} else {
|
||||
nvlsGraph.nChannels = 0;
|
||||
}
|
||||
|
||||
bool allXgmi, hasPeerAccess;
|
||||
@@ -960,6 +955,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
|
||||
int *topParentLocalRanks = NULL;
|
||||
int tpProxyRank;
|
||||
|
||||
int highestTransportType = TRANSPORT_P2P;
|
||||
bool needsProxy = false;
|
||||
//NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
|
||||
|
||||
// Determine nNodes, firstRanks, ...
|
||||
@@ -1113,7 +1110,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0, &highestTransportType, &needsProxy), ret, fail);
|
||||
if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) {
|
||||
comm->useIntraNet = 1;
|
||||
// Connect NET for intranode use
|
||||
@@ -1122,7 +1119,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
|
||||
if (comm->nRanks == 1) continue;
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, fail);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET, &highestTransportType, &needsProxy), ret, fail);
|
||||
}
|
||||
INFO(NCCL_INIT, "Connected all rings");
|
||||
|
||||
@@ -1133,7 +1130,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail);
|
||||
}
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail);
|
||||
NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0, &highestTransportType, &needsProxy), ret, fail);
|
||||
INFO(NCCL_INIT, "Connected all trees");
|
||||
|
||||
#if 0
|
||||
|
||||
Ссылка в новой задаче
Block a user