Rework Rome detection and add multiple network ports models (#274)

* Rework Rome detection and add multiple network ports models

* Remove unused opCount in p2p transport
This commit is contained in:
Wenkai Du
2020-10-07 13:37:36 -07:00
committed by GitHub
parent 88a062342b
commit ae008fd2db
11 changed files with 616 additions and 352 deletions
+3 -2
View File
@@ -255,7 +255,7 @@ int ncclMaxNchannels() {
return maxNchannels;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn) {
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
int nranks = comm->nRanks;
@@ -292,7 +292,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
int nc = nChannels*2;
if (gcn == 908) nc = std::max(nc, 4);
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*4;
if (comm->topo->nodes[NET].count && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*comm->topo->nodes[NET].count;
if (!nnets) nnets = comm->topo->nodes[NET].count;
if (nnets && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*nnets;
int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));
// Duplication should be complete now
+106
View File
@@ -0,0 +1,106 @@
/*
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#define MAX_ROME_GPUS 8
#define MAX_ROME_NICS 2
/*
 * Description of one node topology used for Rome model matching.
 * The same structure holds both the built-in reference models below and the
 * topology captured from the running system, so the two can be compared
 * field by field.
 */
struct rcclRomeModel {
// number of GPUs described (at most MAX_ROME_GPUS)
int nGpus;
// number of CPU (NUMA) nodes
int nCpus;
// number of NICs described (at most MAX_ROME_NICS)
int nNics;
// number of GPU-to-GPU links per GPU (identical for every GPU in a valid model)
int nLinks;
// topology node id of each GPU; permuteGpuIds compares (id & 0xf0000) between GPUs ("match NBIO")
int64_t gpuIds[MAX_ROME_GPUS];
// id of the CPU node each GPU is attached to
int64_t gpuNuma[MAX_ROME_GPUS];
// id of the CPU node each NIC is attached to
int64_t nicNuma[MAX_ROME_NICS];
// row-major nGpus x nGpus adjacency matrix: 1 when the two GPUs are directly linked, else 0
int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS];
// screening string: two characters per CPU node, '0'+GPU count followed by '0'+NIC count
const char *pattern;
// ring definitions in parseGraph syntax ('|' separates rings); numbers index this model's GPUs
const char *ringBase;
};
/* Reference model: 8 GPUs on 4 CPU nodes, 1 NIC (on CPU node 2), 2 links per GPU. */
static struct rcclRomeModel rome_model_22 = {
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2,
.gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, },
.gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, },
.nicNuma = { 2, },
// 8x8 adjacency matrix, row-major, one row per GPU
.connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, },
// per CPU node: GPU count then NIC count -> (1,0) (3,0) (2,1) (2,0)
.pattern = "10302120",
// two rings over the model's GPU indices
.ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
};
/* Reference model: 8 GPUs on 4 CPU nodes, 2 NICs (on CPU nodes 0 and 3), 2 links per GPU. */
static struct rcclRomeModel rome_model_25 = {
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
.nicNuma = { 0, 3, },
// 8x8 adjacency matrix, row-major, one row per GPU
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
// per CPU node: GPU count then NIC count -> (1,1) (3,0) (3,0) (1,1)
.pattern = "11303011",
// two rings over the model's GPU indices
.ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
};
/*
 * Reference model: 8 GPUs on 4 CPU nodes, 2 NICs, 2 links per GPU.
 * Shares GPU ids, NUMA placement and pattern with rome_model_25; the link
 * wiring (connMatrix) and therefore the rings differ.
 */
static struct rcclRomeModel rome_model_27 = {
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
.nicNuma = { 0, 3, },
// 8x8 adjacency matrix, row-major, one row per GPU
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
// per CPU node: GPU count then NIC count -> (1,1) (3,0) (3,0) (1,1)
.pattern = "11303011",
// two rings over the model's GPU indices
.ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
};
/*
 * Reference model: 8 GPUs on 4 CPU nodes, 1 NIC (on CPU node 2), 3 links per GPU.
 * Same screening pattern as rome_model_22, but GPUs 0-3 and GPUs 4-7 each
 * form a fully connected group in connMatrix.
 */
static struct rcclRomeModel rome_model_29 = {
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
.nicNuma = { 2, },
// 8x8 adjacency matrix, row-major, one row per GPU
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
// per CPU node: GPU count then NIC count -> (1,0) (3,0) (2,1) (2,0)
.pattern = "10302120",
// two rings over the model's GPU indices
.ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
};
/* Reference model: 8 GPUs on 8 CPU nodes, 2 NICs (on CPU nodes 0 and 6), 2 links per GPU. */
static struct rcclRomeModel rome_model_31 = {
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
.nicNuma = { 0, 6, },
// 8x8 adjacency matrix, row-major, one row per GPU
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
// per CPU node: GPU count then NIC count -> (0,1) (1,0) (2,0) (1,0) (1,0) (2,0) (0,1) (1,0)
.pattern = "0110201010200110",
// two rings over the model's GPU indices
.ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
};
/*
 * Reference model: 8 GPUs on 8 CPU nodes, 2 NICs, 2 links per GPU.
 * Shares GPU ids, NUMA placement and pattern with rome_model_31; the link
 * wiring (connMatrix) and therefore the rings differ.
 */
static struct rcclRomeModel rome_model_33 = {
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
.nicNuma = { 0, 6, },
// 8x8 adjacency matrix, row-major, one row per GPU
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
// per CPU node: GPU count then NIC count -> (0,1) (1,0) (2,0) (1,0) (1,0) (2,0) (0,1) (1,0)
.pattern = "0110201010200110",
// two rings over the model's GPU indices
.ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
};
/*
 * Table of all known reference models. The matcher walks this table in order
 * and stops at the first model that fits, so earlier entries take precedence.
 * Each element is a copy of the corresponding rome_model_* object above.
 */
static struct rcclRomeModel romeTopoModels[] = {
rome_model_22,
rome_model_25,
rome_model_27,
rome_model_29,
rome_model_31,
rome_model_33,
};
+297 -296
View File
@@ -10,6 +10,8 @@
#include "topo.h"
#include "xml.h"
#include <math.h>
#include <sys/time.h>
#include "rome_models.h"
// Initialize system->maxWidth. This is the per-channel (i.e. per-SM)
// max speed.
@@ -663,64 +665,111 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
}
/* Parse user defined rings. Format is like :
 * "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
 * Network interfaces can be optionally specified by N prefix.
 * Rings with a non-matching number of gpus are ignored so we can provide
 * rings for multiple cases.
 *
 * str      ring description; rings are separated by '|', numbers by any
 *          other non-digit character.
 * system   topology; supplies the GPU count and the gpu.dev -> gpu.rank table.
 * graph    output: intra[] receives the ranks of every accepted ring, inter[]
 *          the pair of nets of every accepted ring, nChannels the number of
 *          accepted rings, speedIntra/speedInter are set to system->maxWidth.
 * gpu_map  optional (may be NULL): each parsed GPU number g is replaced by
 *          gpu_map[g] before it is looked up by gpu.dev.
 * nnets    number of selectable nets; with 0 and no N entries inter[] is left
 *          untouched.
 * net_map  optional (may be NULL): translation applied to the default
 *          round-robin net choice when a ring names no nets itself.
 *
 * Returns ncclInternalError when a (remapped) GPU number matches no gpu.dev
 * in the system, ncclSuccess otherwise. Parsing stops early, keeping the
 * rings accepted so far, when a ring lists more than MAX_ROME_GPUS GPUs or
 * more than two N entries.
 *
 * NOTE(review): nChannels is not bounded against the capacity of
 * graph->intra / graph->inter, which is not visible here -- confirm callers
 * cannot supply more rings than those arrays hold.
 */
static ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int nnets, int* net_map ) {
  int gpus[MAX_ROME_GPUS];
  int nChannels = 0;
  int gpu = 0;
  int offset = 0;
  int status = 0; // 0 : between numbers, 1 : inside GPU number, 2 : inside NET number (after 'N')
  int nets[2];
  int net = 0;
  int ngpus = system->nodes[GPU].count;
  // Numbers stop growing once they reach this value; anything that large is
  // rejected by the range checks below, and capping avoids signed overflow.
  const int maxValue = 100000;
  do {
    if (str[offset] == 'N') {
      if (status == 0) {
        // A third NET entry would not fit in nets[]; stop before writing it.
        if (net >= 2) goto end;
        nets[net] = 0;
        status = 2;
      }
    } else {
      int digit = str[offset] - '0';
      if (digit >= 0 && digit <= 9) {
        if (status == 0) {
          // One GPU too many for gpus[]; stop before writing it.
          if (gpu >= MAX_ROME_GPUS) goto end;
          gpus[gpu] = digit;
          status = 1;
        } else if (status == 2) {
          if (nets[net] < maxValue) nets[net] = nets[net]*10+digit;
        } else {
          if (gpus[gpu] < maxValue) gpus[gpu] = gpus[gpu]*10+digit;
        }
      } else {
        // A separator ends whichever number was being read.
        if (status == 1) {
          gpu++;
        } else if (status == 2) {
          net++;
        }
        status = 0;
        if (str[offset] == '|' || str[offset] == '\0') {
          // Ignore if ngpus doesn't match
          if (gpu != ngpus) goto newchannel;
          // Ignore if nnets are not 0 or 2
          if (net && net != 2) goto newchannel;
          for (int r=0; r<ngpus; r++) {
            int g = gpus[r];
            // Ignore if gpus are out of bounds
            if (g < 0 || g >= ngpus) goto newchannel;
            // Ignore if gpus are duplicate
            for (int i=0; i<r; i++)
              if (gpus[i] == g) goto newchannel;
            // remap if needed
            if (gpu_map) g = gpu_map[g];
            // Translate gpu numbers into ranks
            int j = 0;
            for (j = 0; j < ngpus; j++)
              if (g == system->nodes[GPU].nodes[j].gpu.dev)
                break;
            if (j < ngpus)
              graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank;
            else
              return ncclInternalError;
          }
          if (net) {
            // Nets named explicitly in the ring take precedence
            if (nets[0] >= nnets || nets[1] >= nnets) goto newchannel;
            graph->inter[nChannels*2] = nets[0];
            graph->inter[nChannels*2+1] = nets[1];
          } else if (net_map && nnets) {
            graph->inter[nChannels*2] = net_map[nChannels%nnets];
            graph->inter[nChannels*2+1] = net_map[(nChannels+1)%nnets];
          } else if (nnets) {
            graph->inter[nChannels*2] = nChannels%nnets;
            graph->inter[nChannels*2+1] = (nChannels+1)%nnets;
          }
          nChannels++;
newchannel:
          gpu = 0;
          net = 0;
        }
      }
    }
  } while (str[offset++] != 0);
end:
  graph->nChannels = nChannels;
  graph->speedIntra = graph->speedInter = system->maxWidth;
#if 0
  for (int i=0; i<graph->nChannels; i++) {
    printf("%d: ", i);
    printf ("NET/%d ", graph->inter[i*2]);
    for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
    printf ("NET/%d ", graph->inter[i*2+1]);
    printf("\n");
  }
#endif
  return ncclSuccess;
}
static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str) {
static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
static const char *ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
static char ringRemap[256];
int id[8], dist[8];
int i;
*str = 0;
int ngpus = system->nodes[GPU].count;
if (ngpus != 8)
return ncclSuccess;
@@ -757,251 +806,242 @@ static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str)
dist[m] = dist[n]; dist[n] = temp;
}
// create chordal ring based on reference and remapped ids
for (i = 0; i <strlen(ringBase); i++) {
if (ringBase[i] >= '0' && ringBase[i] <= '9')
ringRemap[i] = id[ringBase[i]-'0']+'0';
else
ringRemap[i] = ringBase[i];
}
ringRemap[i] = 0;
*str = ringRemap;
system->type = RCCL_TOPO_CR8G;
INFO(NCCL_GRAPH, "Use chordal ring: %s", ringRemap);
NCCLCHECK(parseGraph(ringBase, system, graph, id, 0, NULL));
if (system->nodes[NET].count) {
int *intra, *used;
graph->nChannels = system->nodes[NET].count;
NCCLCHECK(ncclCalloc(&intra, ngpus));
NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
for (int n = 0; n < system->nodes[NET].count; n++) {
graph->inter[n*2] = graph->inter[n*2+1] = n;
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoLinkList* paths = net->paths[GPU];
// find the first unsed GPU that is closest to NIC
int f, m;
for (f = 0; f < ngpus; f++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
if(j >= n) break;
}
for (int i = 0; i < ngpus; i++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
if (j < n) continue;
if (paths[i].count < paths[f].count) f = i;
}
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
used[n] = graph->intra[n*ngpus+m];
for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
}
free(used);
free(intra);
}
return ncclSuccess;
}
static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *ngpu, int *nnet) {
*ngpu = 0; *nnet = 0;
static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *g, int *n, int nnet, int *net_map) {
*g = 0; *n = 0;
int i;
if (ncclTopoIdToIndex(system, CPU, id, &i) == ncclInternalError) return false;
for (int n = 0; n < system->nodes[NET].count; n++)
if (system->nodes[NET].nodes[n].paths[CPU][i].count == 2) (*nnet)++;
for (int n = 0; n < system->nodes[GPU].count; n++)
if (system->nodes[GPU].nodes[n].paths[CPU][i].count == 2) (*ngpu)++;
for (int j = 0; j < nnet; j++)
if (system->nodes[NET].nodes[net_map[j]].paths[CPU][i].count == 2) (*n)++;
for (int j = 0; j < system->nodes[GPU].count; j++)
if (system->nodes[GPU].nodes[j].paths[CPU][i].count == 2) (*g)++;
return true;
}
/* compare GPUs by PCI ID */
static int compareGPU (const void *g1, const void *g2, void *s) {
struct ncclTopoSystem* system = (struct ncclTopoSystem*)s;
return system->nodes[GPU].nodes[*(int *)g1].id > system->nodes[GPU].nodes[*(int *)g2].id;
/*
 * Find the GPU whose gpu.dev equals id.
 * On success stores its position within system->nodes[GPU].nodes in *index
 * and returns ncclSuccess; otherwise stores -1 and returns ncclInternalError.
 */
static ncclResult_t ncclGpuIdToIndex(struct ncclTopoSystem* system, int id, int* index) {
  const int count = system->nodes[GPU].count;
  int pos = 0;
  while (pos < count && system->nodes[GPU].nodes[pos].gpu.dev != id) pos++;
  if (pos == count) {
    *index = -1;
    return ncclInternalError;
  }
  *index = pos;
  return ncclSuccess;
}
/*
 * Capture the running system's topology into romeTopo so it can be compared
 * with the reference models.
 *
 * system    topology to describe.
 * romeTopo  output model; every field except ringBase is filled in.
 * pattern   output string, two characters per CPU node ('0'+GPU count,
 *           '0'+NIC count); left empty when the topology is rejected.
 *           The caller must provide at least 2*nCpus+1 bytes.
 * net_map   output, MAX_ROME_NICS entries: for each NIC kept, its index in
 *           system->nodes[NET].nodes. Ports sharing an asic are collapsed to
 *           the widest one.
 *
 * Always returns ncclSuccess unless a lookup helper fails; a rejected
 * topology is signalled by the empty pattern. When RCCL_DUMP_ROME_MODEL_FILE
 * is set, the captured model is also written to that file as C source.
 */
static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRomeModel* romeTopo, char *pattern, int *net_map) {
  pattern[0] = 0; // pattern stays empty for an invalid topology
  romeTopo->nGpus = system->nodes[GPU].count;
  romeTopo->nCpus = system->nodes[CPU].count;
  romeTopo->nNics = 0;
  romeTopo->nLinks = 0;

  // gpuIds/gpuNuma/connMatrix are sized for MAX_ROME_GPUS; a larger system
  // cannot be described (nor match any model), so reject it before writing.
  if (romeTopo->nGpus > MAX_ROME_GPUS) return ncclSuccess;

  for (int i = 0; i < romeTopo->nGpus; i ++) {
    int gpu, n;
    NCCLCHECK(ncclGpuIdToIndex(system, i, &gpu));
    romeTopo->gpuIds[i] = system->nodes[GPU].nodes[gpu].id;
    // -1 marks "no CPU node found" so the entry never matches a reference model
    romeTopo->gpuNuma[i] = -1;
    for (n = 0; n < romeTopo->nCpus; n++)
      if (system->nodes[GPU].nodes[gpu].paths[CPU][n].count == 2) break;
    if (n < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[n].id;

    // Clear the whole row first so it is defined even if this GPU is skipped
    for (n = 0; n < romeTopo->nGpus; n++)
      romeTopo->connMatrix[i*romeTopo->nGpus+n] = 0;

    struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu;
    if (node->paths[GPU] == NULL) continue;
    int count = 0;
    for (n = 0; n < romeTopo->nGpus; n++) {
      struct ncclTopoLink* link;
      for (link = node->links; link->remNode; link++) {
        if (link->remNode->gpu.dev == n) break;
      }
      if (!link->remNode) continue;
      if (link->type != LINK_NVL) continue;
      romeTopo->connMatrix[i*romeTopo->nGpus+n] = 1;
      count ++;
    }
    // Every GPU must have the same number of links, otherwise give up
    if (!romeTopo->nLinks) romeTopo->nLinks = count;
    else if (romeTopo->nLinks != count) return ncclSuccess;
  }

  // trim ports and create NET map
  for (int i = 0; i < system->nodes[NET].count; i ++) {
    int j;
    for (j = 0; j < romeTopo->nNics; j++) {
      if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[net_map[j]].net.asic) {
        // same asic: keep only the wider port
        if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[net_map[j]].net.width)
          net_map[j] = i;
        break;
      }
    }
    if (j >= romeTopo->nNics) {
      net_map[j] = i;
      (romeTopo->nNics)++;
      if (romeTopo->nNics >= MAX_ROME_NICS) break;
    }
  }

  // number of GPUs and NICs on each numa node is used as first screening pattern
  for (int i = 0; i < romeTopo->nCpus; i++) {
    int g, n;
    if (!getGpuNetCount(system, i, &g, &n, romeTopo->nNics, net_map)) return ncclSuccess;
    pattern[i*2] = '0' + g;
    pattern[i*2+1] = '0' + n;
  }
  pattern[romeTopo->nCpus*2] = 0;

  for (int i = 0; i < romeTopo->nNics; i ++) {
    int net, n;
    // NOTE(review): net_map[] holds indices into nodes[NET] but is passed here
    // as an id -- this only works if NET ids equal their indices; confirm.
    NCCLCHECK(ncclTopoIdToIndex(system, NET, net_map[i], &net));
    for (n = 0; n < romeTopo->nCpus; n++)
      if (system->nodes[NET].nodes[net].paths[CPU][n].count == 2) break;
    if (n < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[n].id;
    else return ncclSuccess;
  }

  const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
  if (romeModelFile) {
    INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
    FILE* file = fopen(romeModelFile, "w");
    if (file == NULL) {
      WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
      return ncclSuccess;
    }
    fprintf(file, "static struct rcclRomeModel rome_model_ = {\n");
    fprintf(file, "  .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
    fprintf(file, "  .gpuIds = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .gpuNuma = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .nicNuma = { ");
    for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "%ld, ", romeTopo->nicNuma[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .connMatrix = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++)
      for (int n = 0; n < romeTopo->nGpus; n++) fprintf(file, "%d, ", romeTopo->connMatrix[i*romeTopo->nGpus+n]);
    fprintf(file, "},\n");
    fprintf(file, "  .pattern = \"%s\",\n", pattern);
    fprintf(file, "  .ringBase = \"\",\n");
    fprintf(file, "};\n");
    fclose(file);
  }
  return ncclSuccess;
}
/*
 * Search, by recursive swapping, for a permutation g[] of the system's GPU
 * indices under which the captured topology `topo` matches the reference
 * model `ref`. Positions below n are fixed; positions n..last are permuted.
 *
 * g     in/out permutation; on success g[i] is the topo GPU playing the role
 *       of reference GPU i. Restored to its entry order on failure.
 * n     first position still free to permute.
 * last  last valid position (number of GPUs - 1).
 * ref   reference model; its nGpus is also used as the connMatrix row stride
 *       for topo, so the caller must ensure both models have the same nGpus.
 * topo  topology captured from the running system.
 * time  incremented once per call (iteration counter for diagnostics).
 *
 * Returns true as soon as a complete permutation satisfies all three checks:
 * same CPU node per GPU, same link matrix, and same NBIO grouping
 * (id & 0xf0000) with the same id ordering inside a group.
 */
static bool permuteGpuIds(int *g, int n, int last, struct rcclRomeModel* ref, struct rcclRomeModel* topo, int* time) {
  (*time) ++;
  if (n == last) {
    int i, j;
    // match GPU numa
    for (i = 0; i < ref->nGpus; i++)
      if (ref->gpuNuma[i] != topo->gpuNuma[g[i]]) break;
    if (i < ref->nGpus) return false;
    // match XGMI connection
    for (i = 0; i < ref->nGpus; i++) {
      for (j = 0; j < ref->nGpus; j++)
        if (ref->connMatrix[i*ref->nGpus+j] != topo->connMatrix[g[i]*ref->nGpus+g[j]]) break;
      if (j < ref->nGpus) break;
    }
    if (i < ref->nGpus) return false;
    // match NBIO
    for (i = 0; i < ref->nGpus; i++) {
      for (j = 0; j < ref->nGpus; j++) {
        if (i == j) continue;
        bool nbio_ref = (ref->gpuIds[i]&0xf0000) == (ref->gpuIds[j]&0xf0000);
        bool nbio_topo = (topo->gpuIds[g[i]]&0xf0000) == (topo->gpuIds[g[j]]&0xf0000);
        if (nbio_ref != nbio_topo) break;
        // within one NBIO the relative order of the two ids must agree
        if (nbio_ref && ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0)) break;
      }
      if (j < ref->nGpus) break;
    }
    if (i < ref->nGpus) return false;
    return true;
  }
  // Try every remaining candidate at position n, undoing the swap on failure
  for (int i = n; i <= last; i++) {
    std::swap(g[n], g[i]);
    if (permuteGpuIds(g, n+1, last, ref, topo, time)) return true;
    std::swap(g[n], g[i]);
  }
  return false;
}
/*
 * Check that the four GPUs listed in hive[] form a closed chain: for each
 * position, the GPU with that gpu.dev must have a LINK_NVL link to the GPU
 * at the next position (wrapping from the last back to the first).
 * Only the first link whose remote gpu.dev equals the next GPU is examined.
 * Returns true when all four hops are present, false otherwise.
 */
static bool validate4P1H(struct ncclTopoSystem* system, int *hive) {
  const int ngpus = system->nodes[GPU].count;
  for (int pos = 0; pos < 4; pos++) {
    const int cur = hive[pos];
    const int next = hive[(pos+1)%4];
    bool linked = false;
    for (int idx = 0; idx < ngpus && !linked; idx++) {
      struct ncclTopoNode* node = system->nodes[GPU].nodes+idx;
      if (node->gpu.dev != cur) continue;
      if (node->paths[GPU] == NULL) continue;
      // locate the first link that leads to the next GPU of the hive
      struct ncclTopoLink* link = node->links;
      while (link->remNode && link->remNode->gpu.dev != next) link++;
      linked = link->remNode && link->type == LINK_NVL;
    }
    if (!linked) return false;
  }
  return true;
}
static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
static const char *ringBase_10302120_1 = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6";
static const char *ringBase_10302120_2 = "6 4 7 5 0 1 3 2|6 5 7 4 2 3 1 0";
static const char *ringBase_11303011_1 = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0";
static const char *ringBase_11303011_2 = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2";
static const char *ringBase_0110201010200110_1 = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3";
static const char *ringBase_0110201010200110_2 = "3 0 6 2 1 4 5 7|4 1 0 3 2 6 7 5";
static const char *ringBase;
static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
static char ringRemap[64];
int id[8], dist[8];
int i;
*str = 0;
int ngpus = system->nodes[GPU].count;
int ncpus = system->nodes[CPU].count;
// 8 GPUs only
if (ngpus != 8)
return ncclSuccess;
// only valid on Rome
int arch, vendor, model;
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
return ncclSuccess;
system->type = RCCL_TOPO_4P2H_ROME;
// 4 or 8 numa nodes only
if (ncpus != 4 && ncpus != 8)
return ncclSuccess;
// number of GPUs and NICs on each numa node is used as first screening pattern
char pattern[256];
for (i = 0; i < ncpus; i++) {
int g, n;
if (!getGpuNetCount(system, i, &g, &n)) return ncclSuccess;
pattern[i*2] = '0' + g;
pattern[i*2+1] = '0' + n;
}
pattern[i*2] = 0;
int g[8], h1[4], h2[4];
for (int i = 0; i <8; i++) g[i] = -1;
if (strcmp(pattern, "10302120") == 0) {
bool cross = findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1);
g[2] = g[6] = -1;
if (cross) {
// identify GPUs for pattern "10302120"
if (!findGpuByXGMI(system, 0, 1, &g[1], &g[0], 0, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 0, 1, &g[1], &g[2], 1, -1, g[0])) return ncclSuccess;
if (!findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 2, 1, &g[3], &g[5], 1, g[6], g[2])) return ncclSuccess;
if (!findGpuByXGMI(system, 1, 3, &g[5], &g[4], -1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 2, 3, &g[3], &g[7], -1, g[6], g[4])) return ncclSuccess;
// finally verify two XGMI hives for pattern "10302120"
h1[0] = g[1]; h1[1] = g[0]; h1[2] = g[6]; h1[3] = g[2];
h2[0] = g[7]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[3];
ringBase = ringBase_10302120_1;
} else {
// identify GPUs for pattern "10302120"
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 1, 1, &g[1], &g[2], -1, -1, g[3])) return ncclSuccess;
if (!findGpuByXGMI(system, 2, 3, &g[5], &g[7], -1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 2, 3, &g[4], &g[6], -1, g[5], g[7])) return ncclSuccess;
// finally verify two XGMI hives for pattern "10302120"
h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
ringBase = ringBase_10302120_2;
}
}
else if (strcmp(pattern, "11303011") == 0) {
// there are 2 configurations for pattern "11303011"
if (findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1)) {
if (!findGpuByXGMI(system, 2, 1, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess;
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 3, 2, &g[7], &g[5], 1, -1, -1)) return ncclSuccess;
// finally verify two XGMI hives for pattern "11303011"
h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6];
h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7];
ringBase = ringBase_11303011_2;
} else {
// identify GPUs for pattern "11303011"
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 1, 1, &g[1], &g[2], -1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 3, 2, &g[7], &g[5], -1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 3, 2, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess;
if (!findGpuByXGMI(system, 2, 2, &g[5], &g[4], -1, -1, -1)) return ncclSuccess;
// finally verify two XGMI hives for pattern "11303011"
h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
ringBase = ringBase_11303011_1;
}
}
else if (strcmp(pattern, "0110201010200110") == 0) {
if (findGpuByXGMI(system, 2, 5, &g[2], &g[6], 1, -1, -1)) {
if (!findGpuByXGMI(system, 4, 2, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess;
if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], 1, -1, -1)) return ncclSuccess;
h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6];
h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7];
ringBase = ringBase_0110201010200110_2;
} else {
if (!findGpuByXGMI(system, 1, 2, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 2, 2, &g[1], &g[2], -1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], -1, -1, -1)) return ncclSuccess;
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess;
if (!findGpuByXGMI(system, 4, 5, &g[4], &g[5], -1, -1, -1)) return ncclSuccess;
h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
ringBase = ringBase_0110201010200110_1;
}
}
else
return ncclSuccess;
if (!validate4P1H(system, h1)) return ncclSuccess;
if (!validate4P1H(system, h2)) return ncclSuccess;
// passed all validation
// create 4P2H based on reference and remapped ids
for (i = 0; i <strlen(ringBase); i++) {
if (ringBase[i] >= '0' && ringBase[i] <= '9')
ringRemap[i] = g[ringBase[i]-'0'] + '0';
else
ringRemap[i] = ringBase[i];
// number of GPUs and NICs on each numa node is used as first screening pattern
struct rcclRomeModel romeTopo;
char pattern[256];
int net_map[MAX_ROME_NICS];
parseRomeSystem(system, &romeTopo, pattern, net_map);
// recognize system as Rome 4P2H even if no matching model
if (ngpus == 8 && romeTopo.nLinks) system->type = RCCL_TOPO_4P2H_ROME;
int g[MAX_ROME_GPUS];
int time = 0;
struct timeval tvs, tve;
gettimeofday(&tvs, NULL);
for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) {
if (romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus ||
romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
if (strcmp(romeTopoModels[i].pattern, pattern)) continue;
for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus;
if (permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time)) break;
}
ringRemap[i] = 0;
*str = ringRemap;
INFO(NCCL_GRAPH, "Use 4P2H on Rome: %s", ringRemap);
gettimeofday(&tve, NULL);
float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
//printf("No solution in %.2fms (%d iter)\n", t, time);
return ncclSuccess;
}
//printf("Solution in %.2fms (%d iter): ", t, time);
//for (int k = 0; k < ngpus; k++) printf("%d ", g[k]);
//printf("\n");
// create 4P2H based on reference and remapped ids
NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, romeTopo.nNics, net_map));
return ncclSuccess;
}
@@ -1014,6 +1054,7 @@ float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0,
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
int nnets = system->nodes[NET].count;
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
graph->speedIntra = graph->speedInter = 0;
if (graph->crossNic == 2) graph->crossNic = 0;
@@ -1036,59 +1077,19 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
}
str = getenv("NCCL_RINGS");
if (str) system->type = RCCL_TOPO_4P2H_ROME;
if (!str) NCCLCHECK(parseChordalRing(system, &str));
if (!str) NCCLCHECK(parseRome4P2H(system, &str));
if (str) {
NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
for (int i=0; i<graph->nChannels*ngpus; i++) {
// Translate gpu numbers into ranks
int j = 0;
for (j = 0; j < system->nodes[GPU].count; j++)
if (graph->intra[i] == system->nodes[GPU].nodes[j].gpu.dev)
break;
if (j < system->nodes[GPU].count)
graph->intra[i] = system->nodes[GPU].nodes[j].gpu.rank;
else
return ncclInternalError;
}
graph->speedIntra = graph->speedInter = system->maxWidth;
if (system->nodes[NET].count) {
// do not change ring order for multi node 4P2H on Rome
if (system->type == RCCL_TOPO_4P2H_ROME) {
for (int n = 0; n < graph->nChannels; n++) {
graph->inter[n*2] = n%system->nodes[NET].count;
graph->inter[n*2+1] = (n+1)%system->nodes[NET].count;
}
} else {
int *intra, *used;
graph->nChannels = system->nodes[NET].count;
NCCLCHECK(ncclCalloc(&intra, ngpus));
NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
for (int n = 0; n < system->nodes[NET].count; n++) {
graph->inter[n*2] = graph->inter[n*2+1] = n;
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoLinkList* paths = net->paths[GPU];
// find the first unsed GPU that is closest to NIC
int f, m;
for (f = 0; f < ngpus; f++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
if(j >= n) break;
}
for (int i = 0; i < ngpus; i++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
if (j < n) continue;
if (paths[i].count < paths[f].count) f = i;
}
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
used[n] = graph->intra[n*ngpus+m];
for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
}
free(used);
free(intra);
}
// user supplied topo
NCCLCHECK(parseGraph(str, system, graph, NULL, nnets, NULL));
if (graph->nChannels) {
system->type = RCCL_TOPO_4P2H_ROME;
return ncclSuccess;
}
} else {
// try to match 8P6L
NCCLCHECK(parseChordalRing(system, graph));
if (graph->nChannels) return ncclSuccess;
// try to match Rome 4P2H
NCCLCHECK(parseRome4P2H(system, graph));
if (graph->nChannels) return ncclSuccess;
}
+1 -1
View File
@@ -95,7 +95,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoRanks* topoRanks);
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn);
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets);
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
+16 -1
View File
@@ -849,6 +849,21 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
}
INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled");
// count NETs used by ring
int nNets = 0;
int nets[MAXCHANNELS*2];
for (int i = 0; i < ringGraph.nChannels; i++) {
for (int j = 0; j < 2; j++) {
int k;
for (k = 0; k < nNets; k++)
if (nets[k] == ringGraph.inter[i*2+j]) break;
if (k >= nNets) {
nets[nNets] = ringGraph.inter[i*2+j];
nNets++;
}
}
}
if (comm->nChannels < nChannelsOrig) {
// We started duplicating channels during Preset(), so we need to move the
// duplicated channels since we have removed some.
@@ -858,7 +873,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets));
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
collNetSupport() && collNetGraph.nChannels) {
-50
View File
@@ -21,29 +21,17 @@ struct p2pConnectInfo {
void* directPtr;
hipIpcMemHandle_t devIpc;
};
uint64_t pidHash;
int id;
int sendRank;
int recvRank;
};
// Per-connection state kept by the send side of the P2P transport.
// p2pSendFree() releases it: ipcPtr is closed with hipIpcCloseMemHandle()
// when non-NULL, devMem is released with hipFree(), and the struct itself
// is free()d.
struct p2pSendResources {
// Send-side memory block; released with hipFree() in p2pSendFree().
struct ncclSendMem* devMem;
// IPC mapping handle; only closed in p2pSendFree() when it is non-NULL.
void* ipcPtr;
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
uint64_t* opCount; // opCount allocated in host memory
uint64_t* devOpCount; // device side pointer to opCount
uint64_t* remOpCount; // remote opCount allocated in host memory
uint64_t* devRemOpCount; // device side pointer to remote opCount
};
// Per-connection state kept by the receive side of the P2P transport.
// p2pRecvFree() releases it: ipcPtr is closed with hipIpcCloseMemHandle()
// when non-NULL, devMem is released with hipFree(), and the struct itself
// is free()d.
struct p2pRecvResources {
// Receive-side memory block; allocated with ncclCudaCalloc() in
// p2pRecvSetup() and released with hipFree() in p2pRecvFree().
struct ncclRecvMem* devMem;
// IPC mapping handle; only closed in p2pRecvFree() when it is non-NULL.
void* ipcPtr;
uint64_t* opCount; // opCount allocated in host memory
uint64_t* devOpCount; // device side pointer to opCount
uint64_t* remOpCount; // remote opCount allocated in host memory
uint64_t* devRemOpCount; // device side pointer to remote opCount
};
#include <sys/types.h>
@@ -123,7 +111,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
} while (0)
#define MAX_SHM_NAME_LEN 1024
// Setting this to non zero causes P2P to use Reads rather than Writes
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
@@ -165,16 +152,6 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
}
struct p2pConnectInfo info;
info.id = channelId;
info.pidHash = myInfo->pidHash;
info.sendRank = myInfo->cudaDev;
info.recvRank = peerInfo->cudaDev;
char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-p2p-send-opcount-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
TRACE(NCCL_P2P,"Open shmName %s", shmName);
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1));
info.read = useRead;
const char* useReadStr = info.read ? "/read" : "";
if (myInfo->pidHash == peerInfo->pidHash) {
@@ -232,16 +209,6 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));
struct p2pConnectInfo info;
info.id = channelId;
info.pidHash = myInfo->pidHash;
info.sendRank = peerInfo->cudaDev;
info.recvRank = myInfo->cudaDev;
char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-p2p-recv-opcount-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
TRACE(NCCL_P2P,"Open shmName %s", shmName);
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1));
info.read = useRead;
if (myInfo->pidHash == peerInfo->pidHash) {
info.direct = 1;
@@ -298,13 +265,6 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
}
}
char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-p2p-recv-opcount-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
TRACE(NCCL_P2P,"Open shmName %s", shmName);
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->remOpCount, (void**)&resources->devRemOpCount, 0));
// Remove the file to ensure proper clean-up
NCCLCHECK(shmUnlink(shmName));
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
@@ -344,12 +304,6 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
}
}
char shmName[MAX_SHM_NAME_LEN];
sprintf(shmName, "nccl-p2p-send-opcount-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
TRACE(NCCL_P2P,"Open shmName %s", shmName);
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->remOpCount, (void**)&resources->devRemOpCount, 0));
NCCLCHECK(shmUnlink(shmName));
int offset = 0;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
if (info->read && p == NCCL_PROTO_SIMPLE) {
@@ -370,8 +324,6 @@ ncclResult_t p2pSendFree(void* resources) {
if (sendRes->ipcPtr)
CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
CUDACHECK(hipFree(sendRes->devMem));
NCCLCHECK(shmClose(sendRes->opCount, sendRes->devOpCount, sizeof(uint64_t)));
NCCLCHECK(shmClose(sendRes->remOpCount, sendRes->devRemOpCount, sizeof(uint64_t)));
free(sendRes);
return ncclSuccess;
}
@@ -381,8 +333,6 @@ ncclResult_t p2pRecvFree(void* resources) {
if (recvRes->ipcPtr)
CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
CUDACHECK(hipFree(recvRes->devMem));
NCCLCHECK(shmClose(recvRes->opCount, recvRes->devOpCount, sizeof(uint64_t)));
NCCLCHECK(shmClose(recvRes->remOpCount, recvRes->devRemOpCount, sizeof(uint64_t)));
free(recvRes);
return ncclSuccess;
}
+1 -1
View File
@@ -21,7 +21,7 @@
DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
for i in {0..35}
for i in {0..37}
do
$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
@@ -0,0 +1,81 @@
<system version="2">
<cpu numaid="1" affinity="00000000,00000000,ffffffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,ffffffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:e1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="100000" port="1" guid="0x20cd600003da341c" maxconn="262144" gdr="1"/>
<net name="mlx5_1" dev="1" speed="200000" port="2" guid="0x20cd600003da341c" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
<cpu numaid="3" affinity="ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
</system>
@@ -0,0 +1,93 @@
<system version="2">
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
<gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
</gpu>
</pci>
</pci>
</cpu>
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:61:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0xa8134300039f59b8" maxconn="262144" gdr="1"/>
<net name="mlx5_1" dev="1" speed="100000" port="2" guid="0xa8134300039f59b8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
<cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
<pci busid="0000:a1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
<nic>
<net name="mlx5_2" dev="2" speed="200000" port="1" guid="0x38815600039f59b8" maxconn="262144" gdr="1"/>
</nic>
</pci>
</cpu>
</system>
+2
View File
@@ -105,6 +105,8 @@ NodeModelDesc model_descs[] = {
{4, "topo_8p_ts1_n4_1.xml", "4 nodes 8 VEGA20 TS1 NPS=4 Alt. Model"},
{1, "topo_4p3l_ia.xml", "single node 8 gfx908"},
{4, "topo_4p3l_ia.xml", "4 nodes 8 gfx908"},
{4, "topo_8p_rome_n2_2.xml", "4 nodes 8 VEGA20 Rome NPS=2 Alt. Model 2 NET/IF"},
{4, "topo_8p_ts1_n4_2.xml", "4 nodes 8 VEGA20 TS1 NPS=4 3 NET/IF"},
};
int main(int argc,char* argv[])
+16 -1
View File
@@ -428,6 +428,21 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
}
INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled");
// count NETs used by ring
int nNets = 0;
int nets[MAXCHANNELS*2];
for (int i = 0; i < ringGraph.nChannels; i++) {
for (int j = 0; j < 2; j++) {
int k;
for (k = 0; k < nNets; k++)
if (nets[k] == ringGraph.inter[i*2+j]) break;
if (k >= nNets) {
nets[nNets] = ringGraph.inter[i*2+j];
nNets++;
}
}
}
if (comm->nChannels < nChannelsOrig) {
// We started duplicating channels during Preset(), so we need to move the
// duplicated channels since we have removed some.
@@ -437,7 +452,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
int *rings;
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets));
if (comm->nNodes > 1 &&
ncclParamCollNetEnable() == 1 &&
collNetSupport() && collNetGraph.nChannels) {