Fix topo explorer's NCCL 2.23 compatibility (#1623)
* Fix compiler issues due to broken compatibility
* Fix segfault: pass rank instead of busId, and add a graph pointer to cover a new algorithm
This commit is contained in:
committed by
GitHub
parent
ffe255d285
commit
aace4e27f8
@@ -78,7 +78,7 @@ int busIdToCudaDev(int64_t busId) {
|
||||
static int useMemcpy = 0;
|
||||
|
||||
/* Determine if two peers can communicate with P2P */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
if (!info1->hasFineGrain || !info2->hasFineGrain) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
@@ -92,7 +92,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
|
||||
// Check topology / p2p level.
|
||||
int intermediateRank;
|
||||
NCCLCHECK(ncclTopoCheckP2p(topo, info1->busId, info2->busId, ret, NULL, &intermediateRank));
|
||||
NCCLCHECK(ncclTopoCheckP2p(comm->topo, info1->rank, info2->rank, ret, NULL, &intermediateRank));
|
||||
if (*ret == 0) return ncclSuccess;
|
||||
if (intermediateRank != -1) {
|
||||
if (useMemcpy) *ret = 0;
|
||||
@@ -101,7 +101,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
|
||||
// Check if NET would work better
|
||||
int useNet = 0;
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
|
||||
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
|
||||
if (useNet) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
@@ -146,12 +146,12 @@ struct ncclTransport p2pTransport = {
|
||||
NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0);
|
||||
|
||||
/* Determine if two peers can communicate with SHM */
|
||||
ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
ncclResult_t shmCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 0;
|
||||
if (ncclParamShmDisable() == 1) return ncclSuccess;
|
||||
|
||||
int useNet = 0;
|
||||
NCCLCHECK(ncclTopoCheckNet(topo, info1->busId, info2->busId, &useNet));
|
||||
NCCLCHECK(ncclTopoCheckNet(comm->topo, info1->rank, info2->rank, &useNet));
|
||||
if (useNet) return ncclSuccess;
|
||||
|
||||
// Same host?
|
||||
@@ -197,7 +197,7 @@ struct setupReq {
|
||||
};
|
||||
|
||||
/* Determine if two peers can communicate with NET */
|
||||
ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
ncclResult_t netCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -257,7 +257,7 @@ struct ncclTransport netTransport = {
|
||||
};
|
||||
|
||||
/* Determine if two peers can communicate with NET */
|
||||
ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
ncclResult_t collNetCanConnect(int* ret, struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
*ret = 1;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -353,7 +353,7 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph*
|
||||
struct ncclTransport *transport = ncclTransports[t];
|
||||
struct ncclTransportComm* transportComm = type == 1 ? &transport->send : &transport->recv;
|
||||
int ret = 0;
|
||||
NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo));
|
||||
NCCLCHECK(transport->canConnect(&ret, comm, graph, myInfo, peerInfo));
|
||||
if (ret) {
|
||||
connector->transportComm = transportComm;
|
||||
NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex));
|
||||
@@ -721,7 +721,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
|
||||
//struct ncclTopoGraph treeGraph;
|
||||
//struct ncclTopoGraph collNetGraph;
|
||||
//struct ncclTopoGraph nvlsGraph;
|
||||
struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph };
|
||||
struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph, &nvlsGraph};
|
||||
|
||||
int nChannelsOrig;
|
||||
struct ncclTopoRanks** allTopoRanks = NULL;
|
||||
@@ -963,6 +963,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGatherInfo *a
|
||||
}
|
||||
|
||||
comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);
|
||||
TRACE(NCCL_INIT,"treeGraph.nChannels: %d , ringGraph.nChannels: %d", treeGraph.nChannels, ringGraph.nChannels);
|
||||
NCCLCHECKGOTO(ncclTopoPreset(comm, graphs, &allGather3Data[rank].topoRanks), ret, fail);
|
||||
fail:
|
||||
return ret;
|
||||
@@ -975,7 +976,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
|
||||
int nranks = comm->nRanks;
|
||||
cpu_set_t affinitySave;
|
||||
|
||||
struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph };
|
||||
struct ncclTopoGraph* graphs[] = { &treeGraph, &ringGraph, &collNetGraph, &collNetGraph, &nvlsGraph, &nvlsGraph, &nvlsGraph};
|
||||
|
||||
int nChannelsOrig;
|
||||
struct ncclTopoRanks** allTopoRanks = NULL;
|
||||
@@ -1090,6 +1091,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGatherInfo *a
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclCalloc(&rings, nranks*MAXCHANNELS), ret, fail);
|
||||
TRACE(NCCL_INIT, "rank %d nranks %d nchannels %d", rank, nranks, comm->nChannels);
|
||||
|
||||
NCCLCHECKGOTO(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, graphs, NULL, nc), ret, fail);
|
||||
if (comm->topo->treeDefined) NCCLCHECK(ncclTreeBasePostset(comm, &treeGraph));
|
||||
|
||||
Reference in New Issue
Block a user