2
0

Improve 4P2H topology on Rome (#243)

1. Use bi-directional rings
2. GPU search is sorted by PCI device ID to get consistent results
Este cometimento está contido em:
Wenkai Du
2020-07-28 14:21:44 -07:00
cometido por GitHub
ascendente e7a10aa0e4
cometimento d1e20b4c5e
2 ficheiros modificados com 44 adições e 35 eliminações
+1 -2
Ver ficheiro
@@ -289,9 +289,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
int nc = 0;
int nc = nChannels*2;
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*3;
else if (comm->topo->nodes[NET].count != 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = nChannels*4;
int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));
// Duplication should be complete now
+43 -33
Ver ficheiro
@@ -783,7 +783,7 @@ static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *ngpu, int
}
static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int *gpu1, int *gpu2, int ex1, int ex2) {
int n, m;
int n, m, idx, gid;
int ngpus = system->nodes[GPU].count;
*gpu1 = -1; *gpu2 = -1;
int c1, c2;
@@ -794,6 +794,7 @@ static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int
if (system->nodes[GPU].nodes[n].paths[CPU][c1].count != 2) continue;
struct ncclTopoNode* node = system->nodes[GPU].nodes+n;
if (node->paths[GPU] == NULL) continue;
idx = -1; gid = 0;
for (m = 0; m < ngpus; m++) {
if (system->nodes[GPU].nodes[m].gpu.dev == ex2) continue;
if (system->nodes[GPU].nodes[m].paths[CPU][c2].count != 2) continue;
@@ -802,13 +803,18 @@ static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int
if (link->remNode->gpu.dev == system->nodes[GPU].nodes[m].gpu.dev) break;
}
if (!link->remNode) continue;
if (link->type == LINK_NVL) break;
if (link->type == LINK_NVL) {
if (idx == -1 || (idx != -1 && system->nodes[GPU].nodes[m].id < gid)) {
idx = m;
gid = system->nodes[GPU].nodes[m].id;
}
}
}
if (m < ngpus) break;
if (idx != -1) break;
}
if (n < ngpus) {
*gpu1 = system->nodes[GPU].nodes[n].gpu.dev;
*gpu2 = system->nodes[GPU].nodes[m].gpu.dev;
*gpu2 = system->nodes[GPU].nodes[idx].gpu.dev;
return true;
}
return false;
@@ -842,7 +848,7 @@ static bool validate4P1H(struct ncclTopoSystem* system, int *hive) {
}
static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
static const char *ringBase = "6 7 4 5 1 0 3 2";
static const char *ringBase = "6 7 4 5 1 0 3 2|7 6 2 3 0 1 5 4";
static char ringRemap[64];
int id[8], dist[8];
int i;
@@ -850,8 +856,8 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
*str = 0;
int ngpus = system->nodes[GPU].count;
int ncpus = system->nodes[CPU].count;
// 8 GPUs and 4 numa nodes on multi node only
if (ngpus != 8 || ncpus != 4 || !system->nodes[NET].count)
// 8 GPUs and 4 numa nodes only
if (ngpus != 8 || ncpus != 4)
return ncclSuccess;
// only valid on Rome
int arch, vendor, model;
@@ -942,34 +948,38 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
}
graph->speedIntra = graph->speedInter = system->maxWidth;
if (system->nodes[NET].count) {
int *intra, *used;
graph->nChannels = system->nodes[NET].count;
NCCLCHECK(ncclCalloc(&intra, ngpus));
NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
for (int n = 0; n < system->nodes[NET].count; n++) {
graph->inter[n*2] = graph->inter[n*2+1] = n;
// do not change ring order for 4P2H on Rome
if (system->type == RCCL_TOPO_4P2H_ROME) continue;
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoLinkList* paths = net->paths[GPU];
// find the first unused GPU that is closest to NIC
int f, m;
for (f = 0; f < ngpus; f++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
if(j >= n) break;
// do not change ring order for multi node 4P2H on Rome
if (system->type == RCCL_TOPO_4P2H_ROME) {
for (int n = 0; n < graph->nChannels; n++)
graph->inter[n*2] = graph->inter[n*2+1] = n%system->nodes[NET].count;
} else {
int *intra, *used;
graph->nChannels = system->nodes[NET].count;
NCCLCHECK(ncclCalloc(&intra, ngpus));
NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
for (int n = 0; n < system->nodes[NET].count; n++) {
graph->inter[n*2] = graph->inter[n*2+1] = n;
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoLinkList* paths = net->paths[GPU];
// find the first unused GPU that is closest to NIC
int f, m;
for (f = 0; f < ngpus; f++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
if(j >= n) break;
}
for (int i = 0; i < ngpus; i++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
if (j < n) continue;
if (paths[i].count < paths[f].count) f = i;
}
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
used[n] = graph->intra[n*ngpus+m];
for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
}
for (int i = 0; i < ngpus; i++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
if (j < n) continue;
if (paths[i].count < paths[f].count) f = i;
}
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
used[n] = graph->intra[n*ngpus+m];
for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
free(used);
free(intra);
}
free(used);
free(intra);
}
if (graph->nChannels) return ncclSuccess;
}