diff --git a/tools/topo_expl/model.cpp b/tools/topo_expl/model.cpp index 910da923e1..9c4449f280 100644 --- a/tools/topo_expl/model.cpp +++ b/tools/topo_expl/model.cpp @@ -78,7 +78,7 @@ int busIdToCudaDev(int64_t busId) { static int useMemcpy = 0; /* Determine if two peers can communicate with P2P */ -ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { if (!info1->hasFineGrain || !info2->hasFineGrain) { *ret = 0; return ncclSuccess; @@ -92,7 +92,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop // Check topology / p2p level. int intermediateRank; - NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank)); + NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank)); if (*ret == 0) return ncclSuccess; if (intermediateRank != -1) { if (useMemcpy) *ret = 0; @@ -101,7 +101,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop // Check if NET would work better int useNet = 0; - NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); + NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet)); if (useNet) { *ret = 0; return ncclSuccess; @@ -146,12 +146,12 @@ struct ncclTransport p2pTransport = { NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0); /* Determine if two peers can communicate with SHM */ -ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +ncclResult_t shmCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 0; if (ncclParamShmDisable() == 1) return ncclSuccess; int useNet = 0; - NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet)); + NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet)); if (useNet) return ncclSuccess; // Same host? @@ -197,7 +197,7 @@ struct setupReq { }; /* Determine if two peers can communicate with NET */ -ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +ncclResult_t netCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 1; return ncclSuccess; } @@ -257,7 +257,7 @@ struct ncclTransport netTransport = { }; /* Determine if two peers can communicate with NET */ -ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { +ncclResult_t collNetCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 1; return ncclSuccess; } diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp index 2e793b8346..340f1fff50 100644 --- a/tools/topo_expl/utils.cpp +++ b/tools/topo_expl/utils.cpp @@ -353,7 +353,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* struct ncclTransport *transport = ncclTransports[t]; struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv; int ret = 0; - NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo)); + NCCLCHECK(transport->canConnect(&ret, comm, graph, myInfo, peerInfo)); if (ret) { connector->transportComm = transportComm; NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex)); @@ -721,7 +721,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a //struct ncclTopoGraph treeGraph; //struct ncclTopoGraph collNetGraph; //struct ncclTopoGraph nvlsGraph; - struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph }; + struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph, &nvlsGraph}; int nChannelsOrig; struct ncclTopoRanks** allTopoRanks = NULL; @@ -963,6 +963,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a } comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + TRACE(NCCL_INIT,"treeGraph.nChannels: %d , ringGraph.nChannels: %d", treeGraph.nChannels, ringGraph.nChannels); NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail); fail: return ret; @@ -975,7 +976,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a int nranks = comm->nRanks; cpu_set_t affinitySave; - struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph }; + struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph, &nvlsGraph}; int nChannelsOrig; struct ncclTopoRanks** allTopoRanks = NULL; @@ -1090,6 +1091,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a } NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail); + TRACE(NCCL_INIT, "rank %d nranks %d nchannels %d", rank, nranks, comm->nChannels); NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, NULL, nc), ret, fail); if (comm->topo->treeDefined) NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));