From d2224fd3e1a1196f5b4e878bea90fad3f01db95c Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Thu, 7 Mar 2024 08:52:50 -0800 Subject: [PATCH] topo_expl: 2.19.4 update and fix build error (#1098) --- tools/topo_expl/Makefile | 2 +- tools/topo_expl/utils.cpp | 29 +++++++++++++---------------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/tools/topo_expl/Makefile b/tools/topo_expl/Makefile index 23d76e8a16..7327d4b2a6 100644 --- a/tools/topo_expl/Makefile +++ b/tools/topo_expl/Makefile @@ -6,7 +6,7 @@ endif HIPCC = $(HIP_PATH)/bin/hipcc EXE = topo_expl -CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL +CXXFLAGS = -g -Iinclude -Ihipify_rccl/include -Ihipify_rccl/graph -I/opt/rocm/include/ -DTOPO_EXPL -DENABLE_TRACE -DNVTX_NO_IMPL -lpthread files = $(EXE).cpp model.cpp utils.cpp hipify_rccl/graph/topo.cc hipify_rccl/graph/rings.cc hipify_rccl/graph/paths.cc hipify_rccl/graph/trees.cc ../../src/misc/param.cc \ hipify_rccl/graph/search.cc hipify_rccl/graph/connect.cc hipify_rccl/graph/tuning.cc hipify_rccl/graph/xml.cc ../../src/misc/nvmlwrap_stub.cc hipify_rccl/graph/rome_models.cc hipify_rccl/graph/archinfo.cc diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp index 372a3ca369..3d9de6a0d4 100644 --- a/tools/topo_expl/utils.cpp +++ b/tools/topo_expl/utils.cpp @@ -408,9 +408,9 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* int done = 0; int maxPeers = ncclParamConnectRoundMaxPeers(); - data = (ncclConnect**) malloc(sizeof(ncclConnect*) * maxPeers); // Store intermediate send/recvData structs for connect - recvData = (ncclConnect**) malloc(sizeof(ncclConnect*) * maxPeers); // Points to entries inside data for given recv connection within a channel - sendData = (ncclConnect**) malloc(sizeof(ncclConnect*) * maxPeers); // Points to entries inside data for given send connection within a channel + NCCLCHECK(ncclCalloc(&data, maxPeers)); + NCCLCHECK(ncclCalloc(&recvData, maxPeers)); + NCCLCHECK(ncclCalloc(&sendData, maxPeers)); //NCCLCHECKGOTO(ncclStrongStreamAcquireUncaptured(&comm->sharedRes->hostStream), ret, fail); // First time initialization for (int i=1; inRanks; i++) { @@ -426,7 +426,7 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* // The next M entries contain sendData, connection information for send connections // It's not guaranteed that each entry of data has the same number of total or send/recv specific connections int p = i-(done+1); - if (recvMask || sendMask) data[p] = (ncclConnect*) malloc(sizeof(ncclConnect) * 2*MAXCHANNELS); + if (recvMask || sendMask) NCCLCHECK(ncclCalloc(data+p, 2*MAXCHANNELS)); recvData[p] = data[p]; int sendChannels = 0, recvChannels = 0; int type; @@ -553,9 +553,8 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); NCCLCHECKGOTO(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, &flag, sizeof(int)), ret, fail); } - } + }*/ comm->connectRecv[recvPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = comm->connectSend[sendPeer+comm->nRanks*(connIndex == NCCL_CONN_IDX_P2P_NET ? NCCL_CONN_IDX_P2P_NET : 0)] = 0UL; - */ } free(data); @@ -789,6 +788,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a comm->topo->ll128Enabled = false; // Topology hint for MSCCL internal scheduler about whether to enable MSCCL comm->topo->mscclEnabled = false; + // Topology hint if tree has been defined by model or User + comm->topo->treeDefined = false; // Compute paths between GPUs and NICs NCCLCHECKGOTO(ncclTopoComputePaths(comm->topo, comm), ret, fail); // Remove inaccessible GPUs and unused NICs @@ -810,7 +811,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a // Determine local CollNet support if (collNetSupport(comm)) { - char *collNetEnable = getenv("NCCL_COLLNET_ENABLE"); + const char *collNetEnable = ncclGetEnv("NCCL_COLLNET_ENABLE"); if (collNetEnable != NULL) { INFO(NCCL_ALL, "NCCL_COLLNET_ENABLE set by environment to %s.", collNetEnable); if (strcmp(collNetEnable, "1") == 0) { @@ -825,7 +826,6 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a // Get rings and trees ringGraph.id = 0; ringGraph.pattern = NCCL_TOPO_PATTERN_RING; - ringGraph.collNet = 0; ringGraph.minChannels = 1; ringGraph.maxChannels = MAXCHANNELS/2; NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &ringGraph), ret, fail); @@ -846,20 +846,15 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a if (comm->collNetSupport) { NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &collNetGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &collNetGraph), ret, fail); - } else { - collNetGraph.nChannels = 0; } nvlsGraph.id = 3; nvlsGraph.pattern = NCCL_TOPO_PATTERN_NVLS; - nvlsGraph.collNet = 0; nvlsGraph.minChannels = 1; nvlsGraph.maxChannels = MAXCHANNELS; if (comm->nvlsSupport) { NCCLCHECKGOTO(ncclTopoCompute(comm->topo, &nvlsGraph), ret, fail); NCCLCHECKGOTO(ncclTopoPrintGraph(comm->topo, &nvlsGraph), ret, fail); - } else { - nvlsGraph.nChannels = 0; } bool allXgmi, hasPeerAccess; @@ -960,6 +955,8 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a int *topParentLocalRanks = NULL; int tpProxyRank; + int highestTransportType = TRANSPORT_P2P; + bool needsProxy = false; //NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail); // Determine nNodes, firstRanks, ... @@ -1113,7 +1110,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a if (comm->nRanks == 1) continue; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, fail); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0, &highestTransportType, &needsProxy), ret, fail); if (ringGraph.nIntraChannels && rcclParamP2pNetDisable() == 0) { comm->useIntraNet = 1; // Connect NET for intranode use @@ -1122,7 +1119,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a if (comm->nRanks == 1) continue; NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->ring.prev, 1, &channel->ring.next, NCCL_CONN_IDX_P2P_NET), ret, fail); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, NCCL_CONN_IDX_P2P_NET, &highestTransportType, &needsProxy), ret, fail); } INFO(NCCL_INIT, "Connected all rings"); @@ -1133,7 +1130,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, fail); NCCLCHECKGOTO(ncclTransportP2pConnect(comm, c, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, fail); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, fail); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0, &highestTransportType, &needsProxy), ret, fail); INFO(NCCL_INIT, "Connected all trees"); #if 0