diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 76a10c5fc4..b5b075f6e7 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -255,7 +255,7 @@ int ncclMaxNchannels() { return maxNchannels; } -ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn) { +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) { // Gather data from all ranks int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend; int nranks = comm->nRanks; @@ -292,7 +292,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl int nc = nChannels*2; if (gcn == 908) nc = std::max(nc, 4); if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*4; - if (comm->topo->nodes[NET].count && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*comm->topo->nodes[NET].count; + if (!nnets) nnets = comm->topo->nodes[NET].count; + if (nnets && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*nnets; int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels())); // Duplication should be complete now diff --git a/src/graph/rome_models.h b/src/graph/rome_models.h new file mode 100755 index 0000000000..4788fb2801 --- /dev/null +++ b/src/graph/rome_models.h @@ -0,0 +1,106 @@ +/* +Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#define MAX_ROME_GPUS 8 +#define MAX_ROME_NICS 2 + +struct rcclRomeModel { + int nGpus; + int nCpus; + int nNics; + int nLinks; + int64_t gpuIds[MAX_ROME_GPUS]; + int64_t gpuNuma[MAX_ROME_GPUS]; + int64_t nicNuma[MAX_ROME_NICS]; + int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS]; + const char *pattern; + const char *ringBase; +}; + +static struct rcclRomeModel rome_model_22 = { + .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2, + .gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, }, + .gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, }, + .nicNuma = { 2, }, + .connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, }, + .pattern = "10302120", + .ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6", +}; + +static struct rcclRomeModel rome_model_25 = { + .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, + .nicNuma = { 0, 3, }, + .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, + .pattern = "11303011", + .ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0", +}; + +static struct rcclRomeModel rome_model_27 = { + .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, }, + .nicNuma = { 0, 3, }, + .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, + .pattern = "11303011", + .ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2", +}; + +static struct rcclRomeModel rome_model_29 = { + .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, }, + .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, }, + .nicNuma = { 2, }, + .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, }, + .pattern = "10302120", + .ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0", +}; + +static struct rcclRomeModel rome_model_31 = { + .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, + .nicNuma = { 0, 6, }, + .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, }, + .pattern = "0110201010200110", + .ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3", +}; + +static struct rcclRomeModel rome_model_33 = { + .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2, + .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, }, + .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, }, + .nicNuma = { 0, 6, }, + .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, }, + .pattern = "0110201010200110", + .ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0", +}; + +static struct rcclRomeModel romeTopoModels[] = { + rome_model_22, + rome_model_25, + rome_model_27, + rome_model_29, + rome_model_31, + rome_model_33, +}; \ No newline at end of file diff --git a/src/graph/search.cc b/src/graph/search.cc index 2297b5c752..b3589ff0c7 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -10,6 +10,8 @@ #include "topo.h" #include "xml.h" #include +#include +#include "rome_models.h" // Initialize system->maxWidth. This is the per-channel (i.e. per-SM) // max speed. @@ -663,64 +665,111 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs } /* Parse user defined rings. Format is like : - * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0" - * Rings with a non-matching number of ranks are ignored so we can provide + * "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1" + * Network interfaces can be optionally specified by N prefix. + * Rings with a non-matching number of gpus are ignored so we can provide * rings for multiple cases. */ -#define MAX_ENV_RANKS 512 -static ncclResult_t parseGraph(const char* str, int* nChannelsRet, int ngpus, int* channels) { - int ranks[MAX_ENV_RANKS]; +static ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int nnets, int* net_map ) { + int gpus[MAX_ROME_GPUS]; int nChannels = 0; - int rank = 0; + int gpu = 0; int offset = 0; - int status = 0; // 0 : between numbers, 1 : inside number + int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET + int nets[2]; + int net = 0; + int ngpus = system->nodes[GPU].count; do { - int digit = str[offset] - '0'; - if (digit >= 0 && digit <= 9) { + if (str[offset] == 'N') { if (status == 0) { - ranks[rank] = digit; - status = 1; - } else { - ranks[rank] = ranks[rank]*10+digit; + status = 2; } } else { - if (status == 1) { - rank++; - if (rank == MAX_ENV_RANKS) goto end; - } - status = 0; - if (str[offset] == '|' || str[offset] == '\0') { - // Ignore if ngpus doesn't match - if (rank != ngpus) goto newchannel; - - for (int r=0; r= ngpus) goto newchannel; - // Ignore if ranks are duplicate - for (int i=0; i= 0 && digit <= 9) { + if (status == 0) { + gpus[gpu] = digit; + status = 1; + } else if (status == 2) { + nets[net] = digit; } - nChannels++; + else{ + gpus[gpu] = gpus[gpu]*10+digit; + } + } else { + if (status == 1) { + gpu++; + if (gpu > MAX_ROME_GPUS) goto end; + } else if (status == 2) { + net++; + if (net > 2) goto end; + } + status = 0; + if (str[offset] == '|' || str[offset] == '\0') { + // Ignore if ngpus doesn't match + if (gpu != ngpus) goto newchannel; + // Ignore if nnets are not 0 or 2 + if (net && net != 2) goto newchannel; + + for (int r=0; r= ngpus) goto newchannel; + // Ignore if gpus are duplicate + for (int i=0; inodes[GPU].nodes[j].gpu.dev) + break; + if (j < ngpus) + graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank; + else + return ncclInternalError; + } + + if (net) { + if (nets[0] >= nnets || nets[1] >= nnets) goto newchannel; + graph->inter[nChannels*2] = nets[0]; + graph->inter[nChannels*2+1] = nets[1]; + } else if (net_map && nnets) { + graph->inter[nChannels*2] = net_map[nChannels%nnets]; + graph->inter[nChannels*2+1] = net_map[(nChannels+1)%nnets]; + } else if (nnets) { + graph->inter[nChannels*2] = nChannels%nnets; + graph->inter[nChannels*2+1] = (nChannels+1)%nnets; + } + nChannels++; newchannel: - rank = 0; + gpu = 0; + net = 0; + } } } } while (str[offset++] != 0); end: - *nChannelsRet = nChannels; + graph->nChannels = nChannels; + graph->speedIntra = graph->speedInter = system->maxWidth; +#if 0 + for (int i=0; inChannels; i++) { + printf("%d: ", i); + printf ("NET/%d ", graph->inter[i*2]); + for (int j=0; jintra[i*ngpus+j]); + printf ("NET/%d ", graph->inter[i*2+1]); + printf("\n"); + } +#endif return ncclSuccess; } -static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str) { +static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { static const char *ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3"; - static char ringRemap[256]; int id[8], dist[8]; int i; - *str = 0; int ngpus = system->nodes[GPU].count; if (ngpus != 8) return ncclSuccess; @@ -757,251 +806,242 @@ static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str) dist[m] = dist[n]; dist[n] = temp; } // create chordal ring based on reference and remapped ids - for (i = 0; i = '0' && ringBase[i] <= '9') - ringRemap[i] = id[ringBase[i]-'0']+'0'; - else - ringRemap[i] = ringBase[i]; - } - ringRemap[i] = 0; - *str = ringRemap; system->type = RCCL_TOPO_CR8G; - INFO(NCCL_GRAPH, "Use chordal ring: %s", ringRemap); + NCCLCHECK(parseGraph(ringBase, system, graph, id, 0, NULL)); + if (system->nodes[NET].count) { + int *intra, *used; + graph->nChannels = system->nodes[NET].count; + NCCLCHECK(ncclCalloc(&intra, ngpus)); + NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count)); + for (int n = 0; n < system->nodes[NET].count; n++) { + graph->inter[n*2] = graph->inter[n*2+1] = n; + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + struct ncclTopoLinkList* paths = net->paths[GPU]; + // find the first unsed GPU that is closest to NIC + int f, m; + for (f = 0; f < ngpus; f++) { + int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break; + if(j >= n) break; + } + for (int i = 0; i < ngpus; i++) { + int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break; + if (j < n) continue; + if (paths[i].count < paths[f].count) f = i; + } + for (m = 0; mintra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break; + used[n] = graph->intra[n*ngpus+m]; + for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)]; + for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i]; + } + free(used); + free(intra); + } return ncclSuccess; } -static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *ngpu, int *nnet) { - *ngpu = 0; *nnet = 0; +static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *g, int *n, int nnet, int *net_map) { + *g = 0; *n = 0; int i; if (ncclTopoIdToIndex(system, CPU, id, &i) == ncclInternalError) return false; - for (int n = 0; n < system->nodes[NET].count; n++) - if (system->nodes[NET].nodes[n].paths[CPU][i].count == 2) (*nnet)++; - for (int n = 0; n < system->nodes[GPU].count; n++) - if (system->nodes[GPU].nodes[n].paths[CPU][i].count == 2) (*ngpu)++; + for (int j = 0; j < nnet; j++) + if (system->nodes[NET].nodes[net_map[j]].paths[CPU][i].count == 2) (*n)++; + for (int j = 0; j < system->nodes[GPU].count; j++) + if (system->nodes[GPU].nodes[j].paths[CPU][i].count == 2) (*g)++; return true; } -/* compare GPUs by PCI ID */ -static int compareGPU (const void *g1, const void *g2, void *s) { - struct ncclTopoSystem* system = (struct ncclTopoSystem*)s; - return system->nodes[GPU].nodes[*(int *)g1].id > system->nodes[GPU].nodes[*(int *)g2].id; +static ncclResult_t ncclGpuIdToIndex(struct ncclTopoSystem* system, int id, int* index) { + *index = -1; + for (int i=0; inodes[GPU].count; i++) { + if (system->nodes[GPU].nodes[i].gpu.dev == id) { + *index = i; + return ncclSuccess; + } + } + return ncclInternalError; } -static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int *gpu1, int *gpu2, int use_shared, int ex1, int ex2) { - int n, m, k, idx, c1, c2; - uint64_t gid; - int ngpus = system->nodes[GPU].count; - if (ncclTopoIdToIndex(system, CPU, cpu1, &c1) == ncclInternalError) return false; - if (ncclTopoIdToIndex(system, CPU, cpu2, &c2) == ncclInternalError) return false; +static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRomeModel* romeTopo, char *pattern, int *net_map) { + pattern[0] = 0; // pattern will be NULL for invalid topology + romeTopo->nGpus = system->nodes[GPU].count; + romeTopo->nCpus = system->nodes[CPU].count; + romeTopo->nNics = 0; + romeTopo->nLinks = 0; + for (int i = 0; i < romeTopo->nGpus; i ++) { + int gpu, n; + NCCLCHECK(ncclGpuIdToIndex(system, i, &gpu)); + romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id; + for (n = 0; n < romeTopo->nCpus; n++) + if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count == 2) break; + if (n < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[n].id; - int *s_gpus = (int *)malloc(sizeof(int)*ngpus); - int s_ngpus = 0; - - // build a sorted list of source GPUs - for (n = 0; n < ngpus; n++) { - if (*gpu1 != -1 && system->nodes[GPU].nodes[n].gpu.dev != *gpu1) continue; - if (system->nodes[GPU].nodes[n].gpu.dev == ex1) continue; - if (system->nodes[GPU].nodes[n].paths[CPU][c1].count != 2) continue; - s_gpus[s_ngpus++] = n; - } - if (s_ngpus) qsort_r(s_gpus, s_ngpus, sizeof(int), compareGPU, system); - - for (n = 0; n < s_ngpus; n++) { - struct ncclTopoNode* node = system->nodes[GPU].nodes+s_gpus[n]; + struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu; if (node->paths[GPU] == NULL) continue; - idx = -1; gid = 0; - for (m = 0; m < ngpus; m++) { - if (*gpu2 != -1 && system->nodes[GPU].nodes[m].gpu.dev != *gpu2) continue; - if (system->nodes[GPU].nodes[m].gpu.dev == ex2) continue; - if (system->nodes[GPU].nodes[m].paths[CPU][c2].count != 2) continue; + int count = 0; + for (n = 0; n < romeTopo->nGpus; n++) { + romeTopo->connMatrix[i*romeTopo->nGpus+n] = 0; struct ncclTopoLink* link; for (link = node->links; link->remNode; link++) { - if (link->remNode->gpu.dev == system->nodes[GPU].nodes[m].gpu.dev) break; + if (link->remNode->gpu.dev == n) break; } if (!link->remNode) continue; - if (link->type == LINK_NVL) { - int is_shared = 0; - for (k = 0; k < ngpus; k++) { - if (k == m || k == s_gpus[n]) continue; - if ((system->nodes[GPU].nodes[k].id & 0xf0000) == (system->nodes[GPU].nodes[m].id & 0xf0000)) - break; - } - if (k < ngpus) is_shared = 1; - if (use_shared == -1 || is_shared == use_shared) { - if (idx == -1 || (idx != -1 && system->nodes[GPU].nodes[m].id < gid)) { - idx = m; - gid = system->nodes[GPU].nodes[m].id; - } - } + if (link->type != LINK_NVL) continue; + romeTopo->connMatrix[i*romeTopo->nGpus+n] = 1; + count ++; + } + if (!romeTopo->nLinks) romeTopo->nLinks = count; + else if (romeTopo->nLinks != count) return ncclSuccess; + } + + // trim ports and create NET map + for (int i = 0; i < system->nodes[NET].count; i ++) { + int j; + for (j = 0; j < romeTopo->nNics; j++) { + if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[net_map[j]].net.asic) { + if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[net_map[j]].net.width) + net_map[j] = i; + break; } } - if (idx != -1) break; + if (j >= romeTopo->nNics) { + net_map[j] = i; + (romeTopo->nNics)++; + if (romeTopo->nNics >= MAX_ROME_NICS) break; + } } - if (n < s_ngpus) { - *gpu1 = system->nodes[GPU].nodes[s_gpus[n]].gpu.dev; - *gpu2 = system->nodes[GPU].nodes[idx].gpu.dev; - //printf("%s+: c1 %d c2 %d gpu1 %d gpu2 %d use_shared %d ex1 %d, ex2 %d\n", - // __func__, cpu1, cpu2, *gpu1, *gpu2, use_shared, ex1, ex2); - free(s_gpus); + + // number of GPUs and NICs on each numa node is used as first screening pattern + for (int i = 0; i < romeTopo->nCpus; i++) { + int g, n; + if (!getGpuNetCount(system, i, &g, &n, romeTopo->nNics, net_map)) return ncclSuccess; + pattern[i*2] = '0' + g; + pattern[i*2+1] = '0' + n; + } + pattern[romeTopo->nCpus*2] = 0; + + for (int i = 0; i < romeTopo->nNics; i ++) { + int net, n; + NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net)); + for (n = 0; n < romeTopo->nCpus; n++) + if (system->nodes[NET].nodes[net].paths[CPU][n].count == 2) break; + if (n < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[n].id; + else return ncclSuccess; + } + + const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE"); + if (romeModelFile) { + INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile); + FILE* file = fopen(romeModelFile, "w"); + if (file == NULL) { + WARN("Unable to open %s, not dumping Rome model.", romeModelFile); + return ncclSuccess; + } + fprintf(file, "static struct rcclRomeModel rome_model_ = {\n"); + fprintf(file, " .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks); + fprintf(file, " .gpuIds = { "); + for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]); + fprintf(file, "},\n"); + fprintf(file, " .gpuNuma = { "); + for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]); + fprintf(file, "},\n"); + fprintf(file, " .nicNuma = { "); + for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "%ld, ", romeTopo->nicNuma[i]); + fprintf(file, "},\n"); + fprintf(file, " .connMatrix = { "); + for (int i = 0; i < romeTopo->nGpus; i ++) + for (int n = 0; n < romeTopo->nGpus; n++) fprintf(file, "%d, ", romeTopo->connMatrix[i*romeTopo->nGpus+n]); + fprintf(file, "},\n"); + fprintf(file, " .pattern = \"%s\",\n", pattern); + fprintf(file, " .ringBase = \"\",\n"); + fprintf(file, "};\n"); + fclose(file); + } + return ncclSuccess; +} + +static bool permuteGpuIds(int *g, int n, int last, struct rcclRomeModel* ref, struct rcclRomeModel* topo, int* time) { + (*time) ++; + if (n == last) { + int i, j; + // match GPU numa + for (i = 0; i < ref->nGpus; i++) + if (ref->gpuNuma[i] != topo->gpuNuma[g[i]]) break; + if (i < ref->nGpus) return false; + // match XGMI connection + for (i = 0; i < ref->nGpus; i++) { + for (j = 0; j < ref->nGpus; j++) + if (ref->connMatrix[i*ref->nGpus+j] != topo->connMatrix[g[i]*ref->nGpus+g[j]]) break; + if (j < ref->nGpus) break; + } + if (i < ref->nGpus) return false; + // match NBIO + for (i = 0; i < ref->nGpus; i++) { + for (j = 0; j < ref->nGpus; j++) { + if (i == j) continue; + bool nbio_ref = (ref->gpuIds[i]&0xf0000) == (ref->gpuIds[j]&0xf0000); + bool nbio_topo = (topo->gpuIds[g[i]]&0xf0000) == (topo->gpuIds[g[j]]&0xf0000); + if (nbio_ref != nbio_topo) break; + if (nbio_ref && ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0)) break; + } + if (j < ref->nGpus) break; + } + if (i < ref->nGpus) return false; return true; + } else { + for (int i = n; i <= last; i++) { + std::swap(g[n], g[i]); + if (permuteGpuIds(g, n+1, last, ref, topo, time)) return true; + std::swap(g[n], g[i]); + } } - free(s_gpus); return false; } -static bool validate4P1H(struct ncclTopoSystem* system, int *hive) { - int g, n, m; - int ngpus = system->nodes[GPU].count; - for (g = 0; g < 4; g++) { - int gpu = hive[g]; - int next_gpu = hive[(g+1)%4]; - for (n = 0; n < ngpus; n++) { - if (system->nodes[GPU].nodes[n].gpu.dev != gpu) continue; - struct ncclTopoNode* node = system->nodes[GPU].nodes+n; - if (node->paths[GPU] == NULL) continue; - for (m = 0; m < ngpus; m++) { - struct ncclTopoLink* link; - for (link = node->links; link->remNode; link++) { - if (link->remNode->gpu.dev == next_gpu) break; - } - if (!link->remNode) continue; - if (link->type == LINK_NVL) break; - } - if (m < ngpus) break; - } - if (n < ngpus) continue; - else break; - } - if (g < 4) return false; - else return true; -} - -static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) { - static const char *ringBase_10302120_1 = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6"; - static const char *ringBase_10302120_2 = "6 4 7 5 0 1 3 2|6 5 7 4 2 3 1 0"; - static const char *ringBase_11303011_1 = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0"; - static const char *ringBase_11303011_2 = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2"; - static const char *ringBase_0110201010200110_1 = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3"; - static const char *ringBase_0110201010200110_2 = "3 0 6 2 1 4 5 7|4 1 0 3 2 6 7 5"; - static const char *ringBase; +static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { static char ringRemap[64]; - int id[8], dist[8]; int i; - *str = 0; int ngpus = system->nodes[GPU].count; int ncpus = system->nodes[CPU].count; - // 8 GPUs only - if (ngpus != 8) - return ncclSuccess; + // only valid on Rome int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME) return ncclSuccess; - system->type = RCCL_TOPO_4P2H_ROME; - // 4 or 8 numa nodes only - if (ncpus != 4 && ncpus != 8) - return ncclSuccess; - // number of GPUs and NICs on each numa node is used as first screening pattern - char pattern[256]; - for (i = 0; i < ncpus; i++) { - int g, n; - if (!getGpuNetCount(system, i, &g, &n)) return ncclSuccess; - pattern[i*2] = '0' + g; - pattern[i*2+1] = '0' + n; - } - pattern[i*2] = 0; - int g[8], h1[4], h2[4]; - for (int i = 0; i <8; i++) g[i] = -1; - if (strcmp(pattern, "10302120") == 0) { - bool cross = findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1); - g[2] = g[6] = -1; - if (cross) { - // identify GPUs for pattern "10302120" - if (!findGpuByXGMI(system, 0, 1, &g[1], &g[0], 0, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 0, 1, &g[1], &g[2], 1, -1, g[0])) return ncclSuccess; - if (!findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 2, 1, &g[3], &g[5], 1, g[6], g[2])) return ncclSuccess; - if (!findGpuByXGMI(system, 1, 3, &g[5], &g[4], -1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 2, 3, &g[3], &g[7], -1, g[6], g[4])) return ncclSuccess; - // finally verify two XGMI hives for pattern "10302120" - h1[0] = g[1]; h1[1] = g[0]; h1[2] = g[6]; h1[3] = g[2]; - h2[0] = g[7]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[3]; - ringBase = ringBase_10302120_1; - } else { - // identify GPUs for pattern "10302120" - if (!findGpuByXGMI(system, 0, 1, &g[0], &g[1], 1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 1, 1, &g[1], &g[2], -1, -1, g[3])) return ncclSuccess; - if (!findGpuByXGMI(system, 2, 3, &g[5], &g[7], -1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 2, 3, &g[4], &g[6], -1, g[5], g[7])) return ncclSuccess; - // finally verify two XGMI hives for pattern "10302120" - h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3]; - h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6]; - ringBase = ringBase_10302120_2; - } - } - else if (strcmp(pattern, "11303011") == 0) { - // there are 2 configurations for pattern "11303011" - if (findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1)) { - if (!findGpuByXGMI(system, 2, 1, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess; - if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 3, 2, &g[7], &g[5], 1, -1, -1)) return ncclSuccess; - // finally verify two XGMI hives for pattern "11303011" - h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6]; - h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7]; - ringBase = ringBase_11303011_2; - } else { - // identify GPUs for pattern "11303011" - if (!findGpuByXGMI(system, 0, 1, &g[0], &g[1], 1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 1, 1, &g[1], &g[2], -1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 3, 2, &g[7], &g[5], -1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 3, 2, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess; - if (!findGpuByXGMI(system, 2, 2, &g[5], &g[4], -1, -1, -1)) return ncclSuccess; - // finally verify two XGMI hives for pattern "11303011" - h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3]; - h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6]; - ringBase = ringBase_11303011_1; - } - } - else if (strcmp(pattern, "0110201010200110") == 0) { - if (findGpuByXGMI(system, 2, 5, &g[2], &g[6], 1, -1, -1)) { - if (!findGpuByXGMI(system, 4, 2, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess; - if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], 1, -1, -1)) return ncclSuccess; - h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6]; - h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7]; - ringBase = ringBase_0110201010200110_2; - } else { - if (!findGpuByXGMI(system, 1, 2, &g[0], &g[1], 1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 2, 2, &g[1], &g[2], -1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], -1, -1, -1)) return ncclSuccess; - if (!findGpuByXGMI(system, 7, 5, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess; - if (!findGpuByXGMI(system, 4, 5, &g[4], &g[5], -1, -1, -1)) return ncclSuccess; - h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3]; - h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6]; - ringBase = ringBase_0110201010200110_1; - } -} - else - return ncclSuccess; - if (!validate4P1H(system, h1)) return ncclSuccess; - if (!validate4P1H(system, h2)) return ncclSuccess; - // passed all validation - // create 4P2H based on reference and remapped ids - for (i = 0; i = '0' && ringBase[i] <= '9') - ringRemap[i] = g[ringBase[i]-'0'] + '0'; - else - ringRemap[i] = ringBase[i]; + // number of GPUs and NICs on each numa node is used as first screening pattern + struct rcclRomeModel romeTopo; + char pattern[256]; + int net_map[MAX_ROME_NICS]; + parseRomeSystem(system, &romeTopo, pattern, net_map); + + // recognize system as Rome 4P2H even if no matching model + if (ngpus == 8 && romeTopo.nLinks) system->type = RCCL_TOPO_4P2H_ROME; + + int g[MAX_ROME_GPUS]; + int time = 0; + struct timeval tvs, tve; + gettimeofday(&tvs, NULL); + for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) { + if (romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus || + romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue; + if (strcmp(romeTopoModels[i].pattern, pattern)) continue; + for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus; + if (permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time)) break; } - ringRemap[i] = 0; - *str = ringRemap; - INFO(NCCL_GRAPH, "Use 4P2H on Rome: %s", ringRemap); + gettimeofday(&tve, NULL); + float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3; + if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) { + //printf("No solution in %.2fms (%d iter)\n", t, time); + return ncclSuccess; + } + //printf("Solution in %.2fms (%d iter): ", t, time); + //for (int k = 0; k < ngpus; k++) printf("%d ", g[k]); + //printf("\n"); + + // create 4P2H based on reference and remapped ids + NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, romeTopo.nNics, net_map)); return ncclSuccess; } @@ -1014,6 +1054,7 @@ float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; + int nnets = system->nodes[NET].count; int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0; graph->speedIntra = graph->speedInter = 0; if (graph->crossNic == 2) graph->crossNic = 0; @@ -1036,59 +1077,19 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } str = getenv("NCCL_RINGS"); - if (str) system->type = RCCL_TOPO_4P2H_ROME; - if (!str) NCCLCHECK(parseChordalRing(system, &str)); - if (!str) NCCLCHECK(parseRome4P2H(system, &str)); if (str) { - NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra)); - for (int i=0; inChannels*ngpus; i++) { - // Translate gpu numbers into ranks - int j = 0; - for (j = 0; j < system->nodes[GPU].count; j++) - if (graph->intra[i] == system->nodes[GPU].nodes[j].gpu.dev) - break; - if (j < system->nodes[GPU].count) - graph->intra[i] = system->nodes[GPU].nodes[j].gpu.rank; - else - return ncclInternalError; - } - graph->speedIntra = graph->speedInter = system->maxWidth; - if (system->nodes[NET].count) { - // do not change ring order for multi node 4P2H on Rome - if (system->type == RCCL_TOPO_4P2H_ROME) { - for (int n = 0; n < graph->nChannels; n++) { - graph->inter[n*2] = n%system->nodes[NET].count; - graph->inter[n*2+1] = (n+1)%system->nodes[NET].count; - } - } else { - int *intra, *used; - graph->nChannels = system->nodes[NET].count; - NCCLCHECK(ncclCalloc(&intra, ngpus)); - NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count)); - for (int n = 0; n < system->nodes[NET].count; n++) { - graph->inter[n*2] = graph->inter[n*2+1] = n; - struct ncclTopoNode* net = system->nodes[NET].nodes+n; - struct ncclTopoLinkList* paths = net->paths[GPU]; - // find the first unsed GPU that is closest to NIC - int f, m; - for (f = 0; f < ngpus; f++) { - int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break; - if(j >= n) break; - } - for (int i = 0; i < ngpus; i++) { - int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break; - if (j < n) continue; - if (paths[i].count < paths[f].count) f = i; - } - for (m = 0; mintra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break; - used[n] = graph->intra[n*ngpus+m]; - for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)]; - for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i]; - } - free(used); - free(intra); - } + // user supplied topo + NCCLCHECK(parseGraph(str, system, graph, NULL, nnets, NULL)); + if (graph->nChannels) { + system->type = RCCL_TOPO_4P2H_ROME; + return ncclSuccess; } + } else { + // try to match 8P6L + NCCLCHECK(parseChordalRing(system, graph)); + if (graph->nChannels) return ncclSuccess; + // try to match Rome 4P2H + NCCLCHECK(parseRome4P2H(system, graph)); if (graph->nChannels) return ncclSuccess; } diff --git a/src/include/graph.h b/src/include/graph.h index 4af2c529c1..8c97379cfd 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -95,7 +95,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoRanks* topoRanks); ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, - struct ncclTopoRanks** allTopoRanks, int* rings, int gcn); + struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets); ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank); diff --git a/src/init.cc b/src/init.cc index b14d4a1d2b..65cf83342b 100644 --- a/src/init.cc +++ b/src/init.cc @@ -849,6 +849,21 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm } INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled"); + // count NETs used by ring + int nNets = 0; + int nets[MAXCHANNELS*2]; + for (int i = 0; i < ringGraph.nChannels; i++) { + for (int j = 0; j < 2; j++) { + int k; + for (k = 0; k < nNets; k++) + if (nets[k] == ringGraph.inter[i*2+j]) break; + if (k >= nNets) { + nets[nNets] = ringGraph.inter[i*2+j]; + nNets++; + } + } + } + if (comm->nChannels < nChannelsOrig) { // We started duplicating channels during Preset(), so we need to move the // duplicated channels since we have removed some. @@ -858,7 +873,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int *rings; NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); - NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn)); + NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets)); if (comm->nNodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() && collNetGraph.nChannels) { diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 883c41ca05..5ada202649 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -21,29 +21,17 @@ struct p2pConnectInfo { void* directPtr; hipIpcMemHandle_t devIpc; }; - uint64_t pidHash; - int id; - int sendRank; - int recvRank; }; struct p2pSendResources { struct ncclSendMem* devMem; void* ipcPtr; uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only) - uint64_t* opCount; // opCount allocated in host memory - uint64_t* devOpCount; // device side pointer to opCount - uint64_t* remOpCount; // remote opCount allocated in host memory - uint64_t* devRemOpCount; // device side pointer to remote opCount }; struct p2pRecvResources { struct ncclRecvMem* devMem; void* ipcPtr; - uint64_t* opCount; // opCount allocated in host memory - uint64_t* devOpCount; // device side pointer to opCount - uint64_t* remOpCount; // remote opCount allocated in host memory - uint64_t* devRemOpCount; // device side pointer to remote opCount }; #include @@ -123,7 +111,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \ } while (0) -#define MAX_SHM_NAME_LEN 1024 // Setting this to non zero causes P2P to use Reads rather than Writes NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2); @@ -165,16 +152,6 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra } struct p2pConnectInfo info; - info.id = channelId; - info.pidHash = myInfo->pidHash; - info.sendRank = myInfo->cudaDev; - info.recvRank = peerInfo->cudaDev; - - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-p2p-send-opcount-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); - TRACE(NCCL_P2P,"Open shmName %s", shmName); - NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1)); - info.read = useRead; const char* useReadStr = info.read ? "/read" : ""; if (myInfo->pidHash == peerInfo->pidHash) { @@ -232,16 +209,6 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true)); struct p2pConnectInfo info; - info.id = channelId; - info.pidHash = myInfo->pidHash; - info.sendRank = peerInfo->cudaDev; - info.recvRank = myInfo->cudaDev; - - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-p2p-recv-opcount-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); - TRACE(NCCL_P2P,"Open shmName %s", shmName); - NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1)); - info.read = useRead; if (myInfo->pidHash == peerInfo->pidHash) { info.direct = 1; @@ -298,13 +265,6 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks, } } - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-p2p-recv-opcount-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); - TRACE(NCCL_P2P,"Open shmName %s", shmName); - NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->remOpCount, (void**)&resources->devRemOpCount, 0)); - // Remove the file to ensure proper clean-up - NCCLCHECK(shmUnlink(shmName)); - int offset = 0; for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { @@ -344,12 +304,6 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran } } - char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-p2p-send-opcount-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); - TRACE(NCCL_P2P,"Open shmName %s", shmName); - NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->remOpCount, (void**)&resources->devRemOpCount, 0)); - NCCLCHECK(shmUnlink(shmName)); - int offset = 0; for (int p=0; pread && p == NCCL_PROTO_SIMPLE) { @@ -370,8 +324,6 @@ ncclResult_t p2pSendFree(void* resources) { if (sendRes->ipcPtr) CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr)); CUDACHECK(hipFree(sendRes->devMem)); - NCCLCHECK(shmClose(sendRes->opCount, sendRes->devOpCount, sizeof(uint64_t))); - NCCLCHECK(shmClose(sendRes->remOpCount, sendRes->devRemOpCount, sizeof(uint64_t))); free(sendRes); return ncclSuccess; } @@ -381,8 +333,6 @@ ncclResult_t p2pRecvFree(void* resources) { if (recvRes->ipcPtr) CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr)); CUDACHECK(hipFree(recvRes->devMem)); - NCCLCHECK(shmClose(recvRes->opCount, recvRes->devOpCount, sizeof(uint64_t))); - NCCLCHECK(shmClose(recvRes->remOpCount, recvRes->devRemOpCount, sizeof(uint64_t))); free(recvRes); return ncclSuccess; } diff --git a/tools/scripts/topo_val.sh b/tools/scripts/topo_val.sh index b937eed34c..d20a849dcd 100755 --- a/tools/scripts/topo_val.sh +++ b/tools/scripts/topo_val.sh @@ -21,7 +21,7 @@ DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -for i in {0..35} +for i in {0..37} do $DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log" $DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log" diff --git a/tools/topo_expl/models/topo_8p_rome_n2_2.xml b/tools/topo_expl/models/topo_8p_rome_n2_2.xml new file mode 100644 index 0000000000..c198a55877 --- /dev/null +++ b/tools/topo_expl/models/topo_8p_rome_n2_2.xml @@ -0,0 +1,81 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/models/topo_8p_ts1_n4_2.xml b/tools/topo_expl/models/topo_8p_ts1_n4_2.xml new file mode 100644 index 0000000000..c12ea663a5 --- /dev/null +++ b/tools/topo_expl/models/topo_8p_ts1_n4_2.xml @@ -0,0 +1,93 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tools/topo_expl/topo_expl.cpp b/tools/topo_expl/topo_expl.cpp index 411bfa1903..67323651bf 100644 --- a/tools/topo_expl/topo_expl.cpp +++ b/tools/topo_expl/topo_expl.cpp @@ -105,6 +105,8 @@ NodeModelDesc model_descs[] = { {4, "topo_8p_ts1_n4_1.xml", "4 nodes 8 VEGA20 TS1 NPS=4 Alt. Model"}, {1, "topo_4p3l_ia.xml", "single node 8 gfx908"}, {4, "topo_4p3l_ia.xml", "4 nodes 8 gfx908"}, + {4, "topo_8p_rome_n2_2.xml", "4 nodes 8 VEGA20 Rome NPS=2 Alt. Model 2 NET/IF"}, + {4, "topo_8p_ts1_n4_2.xml", "4 nodes 8 VEGA20 TS1 NPS=4 3 NET/IF"}, }; int main(int argc,char* argv[]) diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp index 05d84ecf26..709fcee542 100644 --- a/tools/topo_expl/utils.cpp +++ b/tools/topo_expl/utils.cpp @@ -428,6 +428,21 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t } INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled"); + // count NETs used by ring + int nNets = 0; + int nets[MAXCHANNELS*2]; + for (int i = 0; i < ringGraph.nChannels; i++) { + for (int j = 0; j < 2; j++) { + int k; + for (k = 0; k < nNets; k++) + if (nets[k] == ringGraph.inter[i*2+j]) break; + if (k >= nNets) { + nets[nNets] = ringGraph.inter[i*2+j]; + nNets++; + } + } + } + if (comm->nChannels < nChannelsOrig) { // We started duplicating channels during Preset(), so we need to move the // duplicated channels since we have removed some. @@ -437,7 +452,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t int *rings; NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); - NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn)); + NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets)); if (comm->nNodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() && collNetGraph.nChannels) {