diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 6db2334183..084a98bc22 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -110,8 +110,8 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring channel1->ring.next = nextRecvRank; } } - INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next); - INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next); + TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next); + TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next); } return ncclSuccess; } @@ -288,6 +288,10 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int)); + char *str = NULL; + NCCLCHECK(parseChordalRing(comm->topo, &str)); + int end = std::min((int)ncclMaxNchannels(), (str ? nChannels*3 : ncclMinNchannels())); + // Duplication should be complete now nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2); @@ -295,7 +299,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl // We permit combining max, then min, to only use the first channels, then duplicate them. nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels); int c; - for (c=nChannels; cchannels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel)); diff --git a/src/graph/search.cc b/src/graph/search.cc index cedf71522d..4b35d7cf21 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -714,7 +714,7 @@ end: return ncclSuccess; } -static void parseChordalRing(struct ncclTopoSystem* system, char **str) { +ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str) { static const char *ringBase = "0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3|0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4"; static char ringRemap[256]; int id[8], dist[8]; @@ -723,7 +723,7 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) { int ngpus = system->nodes[GPU].count; // single node CR8G only if (ngpus != 8 || system->nodes[NET].count != 0) - return; + return ncclSuccess; // validate chordal ring and calculate distance for (i=0; inodes[GPU].nodes+i; @@ -741,7 +741,7 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) { count ++; } if(count != ngpus-2 || sum < 0 || sum > ngpus-1) { - return; + return ncclSuccess; } dist[i] = sum; } @@ -766,7 +766,7 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) { ringRemap[i] = 0; *str = ringRemap; INFO(NCCL_GRAPH, "Use chordal ring: %s", ringRemap); - return; + return ncclSuccess; } #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) @@ -799,7 +799,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph if (graph->nChannels > 0) return ncclSuccess; } - if (!str) parseChordalRing(system, &str); + if (!str) NCCLCHECK(parseChordalRing(system, &str)); if (str) { NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra)); for (int i=0; inChannels*ngpus; i++) { diff --git a/src/include/graph.h b/src/include/graph.h index 70117d5e91..f4d623584b 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -101,4 +101,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom #include "info.h" ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time); +ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str); + #endif diff --git a/src/init.cc b/src/init.cc index 70e51216a5..82a4c8ed96 100644 --- a/src/init.cc +++ b/src/init.cc @@ -870,6 +870,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d", c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up, treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]); + INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next); } line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s", line); diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp index 02afeeca0f..6324828134 100644 --- a/tools/topo_expl/utils.cpp +++ b/tools/topo_expl/utils.cpp @@ -448,6 +448,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d", c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up, treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]); + INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next); } line[1023] = '\0'; INFO(NCCL_INIT, "Trees%s", line);