Change default channels duplication for chordal ring (#233)

Этот коммит содержится в:
Wenkai Du
2020-07-14 15:16:50 -07:00
коммит произвёл GitHub
родитель f87ba17737
Коммит ab787c767e
5 изменённых файлов: 16 добавлений и 8 удалений
+7 -3
Просмотреть файл
@@ -110,8 +110,8 @@ static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ring
channel1->ring.next = nextRecvRank;
}
}
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next);
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next);
TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, channel0->ring.prev, comm->rank, channel0->ring.next);
TRACE(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c+nChannels, channel1->ring.prev, comm->rank, channel1->ring.next);
}
return ncclSuccess;
}
@@ -288,6 +288,10 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
char *str = NULL;
NCCLCHECK(parseChordalRing(comm->topo, &str));
int end = std::min((int)ncclMaxNchannels(), (str ? nChannels*3 : ncclMinNchannels()));
// Duplication should be complete now
nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
@@ -295,7 +299,7 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
// We permit combining max, then min, to only use the first channels, then duplicate them.
nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
int c;
for (c=nChannels; c<ncclMinNchannels(); c++) {
for (c=nChannels; c<end; c++) {
memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int));
memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel));
+5 -5
Просмотреть файл
@@ -714,7 +714,7 @@ end:
return ncclSuccess;
}
static void parseChordalRing(struct ncclTopoSystem* system, char **str) {
ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str) {
static const char *ringBase = "0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3|0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4";
static char ringRemap[256];
int id[8], dist[8];
@@ -723,7 +723,7 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) {
int ngpus = system->nodes[GPU].count;
// single node CR8G only
if (ngpus != 8 || system->nodes[NET].count != 0)
return;
return ncclSuccess;
// validate chordal ring and calculate distance
for (i=0; i<ngpus; i++) {
struct ncclTopoNode* node = system->nodes[GPU].nodes+i;
@@ -741,7 +741,7 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) {
count ++;
}
if(count != ngpus-2 || sum < 0 || sum > ngpus-1) {
return;
return ncclSuccess;
}
dist[i] = sum;
}
@@ -766,7 +766,7 @@ static void parseChordalRing(struct ncclTopoSystem* system, char **str) {
ringRemap[i] = 0;
*str = ringRemap;
INFO(NCCL_GRAPH, "Use chordal ring: %s", ringRemap);
return;
return ncclSuccess;
}
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
@@ -799,7 +799,7 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
if (graph->nChannels > 0) return ncclSuccess;
}
if (!str) parseChordalRing(system, &str);
if (!str) NCCLCHECK(parseChordalRing(system, &str));
if (str) {
NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
for (int i=0; i<graph->nChannels*ngpus; i++) {
+2
Просмотреть файл
@@ -101,4 +101,6 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
#include "info.h"
ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int protocol, float* time);
ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str);
#endif
+1
Просмотреть файл
@@ -870,6 +870,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
}
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s", line);
+1
Просмотреть файл
@@ -448,6 +448,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d/%d/%d->%d->%d|%d->%d->%d/%d/%d",
c, treeUp->down[0], treeUp->down[1], treeUp->down[2], rank, treeUp->up,
treeDn->up, rank, treeDn->down[0], treeDn->down[1], treeDn->down[2]);
INFO(NCCL_GRAPH, "Ring %d : %d -> %d -> %d", c, comm->channels[c].ring.prev, comm->rank, comm->channels[c].ring.next);
}
line[1023] = '\0';
INFO(NCCL_INIT, "Trees%s", line);