Rework Rome detection and add multiple network ports models (#274)
* Rework Rome detection and add multiple network ports models * Remove unused opCount in p2p transport
This commit is contained in:
@@ -255,7 +255,7 @@ int ncclMaxNchannels() {
|
||||
return maxNchannels;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn) {
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeUpRecv, *treeUpSend, *treeDnRecv,*treeDnSend;
|
||||
int nranks = comm->nRanks;
|
||||
@@ -292,7 +292,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
|
||||
int nc = nChannels*2;
|
||||
if (gcn == 908) nc = std::max(nc, 4);
|
||||
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*4;
|
||||
if (comm->topo->nodes[NET].count && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*comm->topo->nodes[NET].count;
|
||||
if (!nnets) nnets = comm->topo->nodes[NET].count;
|
||||
if (nnets && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = 4*nnets;
|
||||
int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));
|
||||
|
||||
// Duplication should be complete now
|
||||
|
||||
Executable
+106
@@ -0,0 +1,106 @@
|
||||
/*
|
||||
Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#define MAX_ROME_GPUS 8
|
||||
#define MAX_ROME_NICS 2
|
||||
|
||||
// Description of one node topology. The same layout is used for the built-in
// reference models in this file and for a model filled in from a live system.
struct rcclRomeModel {
|
||||
int nGpus; // number of GPUs in the model
|
||||
int nCpus; // number of CPU (NUMA) nodes in the model
|
||||
int nNics; // number of NICs in the model (at most MAX_ROME_NICS)
|
||||
int nLinks; // number of direct GPU-to-GPU links each GPU has
|
||||
int64_t gpuIds[MAX_ROME_GPUS]; // per-GPU topology node id
|
||||
int64_t gpuNuma[MAX_ROME_GPUS]; // per-GPU id of the CPU (NUMA) node it is attached to
|
||||
int64_t nicNuma[MAX_ROME_NICS]; // per-NIC id of the CPU (NUMA) node it is attached to
|
||||
int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS]; // row-major nGpus x nGpus; [i*nGpus+j] is 1 when GPU i links directly to GPU j
|
||||
const char *pattern; // two digits per CPU node: GPU count then NIC count on that node
|
||||
const char *ringBase; // '|'-separated rings of GPU numbers to use for this model
|
||||
};
|
||||
|
||||
// Reference model: 8 GPUs on 4 NUMA nodes, 1 NIC (on NUMA node 2), 2 links per GPU.
static struct rcclRomeModel rome_model_22 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2,
|
||||
.gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, },
|
||||
.gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, },
|
||||
.pattern = "10302120",
|
||||
.ringBase = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6",
|
||||
};
|
||||
|
||||
// Reference model: 8 GPUs on 4 NUMA nodes, 2 NICs (on NUMA nodes 0 and 3), 2 links per GPU.
static struct rcclRomeModel rome_model_25 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { 0, 3, },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "11303011",
|
||||
.ringBase = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0",
|
||||
};
|
||||
|
||||
// Reference model: same counts, GPU ids, NUMA placement and pattern as
// rome_model_25; only the link matrix and the rings differ.
static struct rcclRomeModel rome_model_27 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
|
||||
.nicNuma = { 0, 3, },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "11303011",
|
||||
.ringBase = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2",
|
||||
};
|
||||
|
||||
// Reference model: 8 GPUs on 4 NUMA nodes, 1 NIC (on NUMA node 2), 3 links per GPU.
static struct rcclRomeModel rome_model_29 = {
|
||||
.nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
|
||||
.gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
|
||||
.nicNuma = { 2, },
|
||||
.connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
|
||||
.pattern = "10302120",
|
||||
.ringBase = "6 5 7 4 0 1 3 2|6 4 7 5 2 3 1 0",
|
||||
};
|
||||
|
||||
// Reference model: 8 GPUs on 8 NUMA nodes, 2 NICs (on NUMA nodes 0 and 6), 2 links per GPU.
static struct rcclRomeModel rome_model_31 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { 0, 6, },
|
||||
.connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
|
||||
.pattern = "0110201010200110",
|
||||
.ringBase = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3",
|
||||
};
|
||||
|
||||
// Reference model: same counts, GPU ids, NUMA placement and pattern as
// rome_model_31; only the link matrix and the rings differ.
static struct rcclRomeModel rome_model_33 = {
|
||||
.nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
|
||||
.gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
|
||||
.gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
|
||||
.nicNuma = { 0, 6, },
|
||||
.connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
|
||||
.pattern = "0110201010200110",
|
||||
.ringBase = "1 4 5 7 0 3 2 6|4 1 7 5 6 2 3 0",
|
||||
};
|
||||
|
||||
// Table of all built-in reference models. Each entry is a copy of the
// corresponding rome_model_* object above.
static struct rcclRomeModel romeTopoModels[] = {
|
||||
rome_model_22,
|
||||
rome_model_25,
|
||||
rome_model_27,
|
||||
rome_model_29,
|
||||
rome_model_31,
|
||||
rome_model_33,
|
||||
};
|
||||
+297
-296
@@ -10,6 +10,8 @@
|
||||
#include "topo.h"
|
||||
#include "xml.h"
|
||||
#include <math.h>
|
||||
#include <sys/time.h>
|
||||
#include "rome_models.h"
|
||||
|
||||
// Initialize system->maxWidth. This is the per-channel (i.e. per-SM)
|
||||
// max speed.
|
||||
@@ -663,64 +665,111 @@ ncclResult_t ncclTopoGetXmlFromGraphs(int ngraphs, struct ncclTopoGraph** graphs
|
||||
}
|
||||
|
||||
/* Parse user defined rings. Format is like :
|
||||
* "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0"
|
||||
* Rings with a non-matching number of ranks are ignored so we can provide
|
||||
* "0 1|1 0|0 1 2 3|3 2 1 0|N0 0 2 3 1 N1|1 3 2 0|0 1 2 3 4 5 6 7|N2 7 6 5 4 3 2 1 0 N1"
|
||||
* Network interfaces can be optionally specified by N prefix.
|
||||
* Rings with a non-matching number of gpus are ignored so we can provide
|
||||
* rings for multiple cases.
|
||||
*/
|
||||
#define MAX_ENV_RANKS 512
|
||||
/*
 * Parse a ring description string into graph->intra / graph->inter.
 *
 * Rings are separated by '|'. Inside a ring, GPU numbers are decimal and
 * separated by any non-digit character; a network interface is written as a
 * number prefixed with 'N' (e.g. "N0 0 2 3 1 N1").
 * A ring is skipped when its GPU count differs from the system's GPU count,
 * when it names an out-of-range or duplicate GPU, when it names a number of
 * NETs other than 0 or 2, or when a named NET is not below nnets.
 * Parsing stops (keeping the rings accepted so far) as soon as a ring lists
 * more than MAX_ROME_GPUS GPUs or more than 2 NETs.
 *
 *   str     : ring description string
 *   system  : topology; GPU numbers are matched against gpu.dev and stored as gpu.rank
 *   graph   : output; intra, inter, nChannels, speedIntra and speedInter are written
 *   gpu_map : optional remap applied to each GPU number before the lookup (may be NULL)
 *   nnets   : number of NETs to spread rings over when a ring names none;
 *             with 0, graph->inter is left untouched for such rings
 *   net_map : optional translation applied to the round-robin NET slot (may be NULL)
 *
 * Returns ncclInternalError when a (remapped) GPU number matches no GPU of
 * system, ncclSuccess otherwise.
 */
static ncclResult_t parseGraph(const char* str, struct ncclTopoSystem* system, struct ncclTopoGraph* graph, int* gpu_map, int nnets, int* net_map ) {
  int gpus[MAX_ROME_GPUS];
  int nChannels = 0;
  int gpu = 0;
  int offset = 0;
  int status = 0; // 0 : between numbers, 1 : inside number, 2: start NET
  int nets[2];
  int net = 0;
  int ngpus = system->nodes[GPU].count;
  do {
    if (str[offset] == 'N') {
      if (status == 0) {
        // Check the bound before the slot is used, not after it was written.
        if (net >= 2) goto end;
        nets[net] = -1; // no digit seen yet for this NET
        status = 2;
      }
    } else {
      int digit = str[offset] - '0';
      if (digit >= 0 && digit <= 9) {
        if (status == 0) {
          // Check the bound before the slot is used, not after it was written.
          if (gpu >= MAX_ROME_GPUS) goto end;
          gpus[gpu] = digit;
          status = 1;
        } else if (status == 2) {
          // Accumulate multi-digit NET numbers the same way GPU numbers are.
          nets[net] = (nets[net] < 0) ? digit : nets[net]*10+digit;
        }
        else{
          gpus[gpu] = gpus[gpu]*10+digit;
        }
      } else {
        if (status == 1) {
          gpu++;
        } else if (status == 2) {
          net++;
        }
        status = 0;
        if (str[offset] == '|' || str[offset] == '\0') {
          // Ignore if ngpus doesn't match
          if (gpu != ngpus) goto newchannel;
          // Ignore if nnets are not 0 or 2
          if (net && net != 2) goto newchannel;

          for (int r=0; r<ngpus; r++) {
            int g = gpus[r];
            // Ignore if gpus are out of bounds
            if (g < 0 || g >= ngpus) goto newchannel;
            // Ignore if gpus are duplicate
            for (int i=0; i<r; i++)
              if (gpus[i] == g) goto newchannel;
            // remap if needed
            if (gpu_map) g = gpu_map[g];
            // Translate gpu numbers into ranks
            int j = 0;
            for (j = 0; j < ngpus; j++)
              if (g == system->nodes[GPU].nodes[j].gpu.dev)
                break;
            if (j < ngpus)
              graph->intra[nChannels*ngpus+r] = system->nodes[GPU].nodes[j].gpu.rank;
            else
              return ncclInternalError;
          }

          if (net) {
            // A NET written as a bare 'N' (no digits) is still -1 here.
            if (nets[0] < 0 || nets[1] < 0) goto newchannel;
            if (nets[0] >= nnets || nets[1] >= nnets) goto newchannel;
            graph->inter[nChannels*2] = nets[0];
            graph->inter[nChannels*2+1] = nets[1];
          } else if (net_map && nnets) {
            graph->inter[nChannels*2] = net_map[nChannels%nnets];
            graph->inter[nChannels*2+1] = net_map[(nChannels+1)%nnets];
          } else if (nnets) {
            graph->inter[nChannels*2] = nChannels%nnets;
            graph->inter[nChannels*2+1] = (nChannels+1)%nnets;
          }
          nChannels++;
newchannel:
          gpu = 0;
          net = 0;
        }
      }
    }
  } while (str[offset++] != 0);
end:
  graph->nChannels = nChannels;
  graph->speedIntra = graph->speedInter = system->maxWidth;
#if 0
  for (int i=0; i<graph->nChannels; i++) {
    printf("%d: ", i);
    printf ("NET/%d ", graph->inter[i*2]);
    for (int j=0; j<ngpus; j++) printf("GPU/%d ", graph->intra[i*ngpus+j]);
    printf ("NET/%d ", graph->inter[i*2+1]);
    printf("\n");
  }
#endif
  return ncclSuccess;
}
|
||||
|
||||
static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str) {
|
||||
static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
static const char *ringBase = "0 1 2 3 5 4 7 6|0 2 4 1 7 3 6 5|0 3 1 5 7 2 6 4|0 6 7 4 5 3 2 1|0 5 6 3 7 1 4 2|0 4 6 2 7 5 1 3";
|
||||
static char ringRemap[256];
|
||||
int id[8], dist[8];
|
||||
int i;
|
||||
|
||||
*str = 0;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
if (ngpus != 8)
|
||||
return ncclSuccess;
|
||||
@@ -757,251 +806,242 @@ static ncclResult_t parseChordalRing(struct ncclTopoSystem* system, char **str)
|
||||
dist[m] = dist[n]; dist[n] = temp;
|
||||
}
|
||||
// create chordal ring based on reference and remapped ids
|
||||
for (i = 0; i <strlen(ringBase); i++) {
|
||||
if (ringBase[i] >= '0' && ringBase[i] <= '9')
|
||||
ringRemap[i] = id[ringBase[i]-'0']+'0';
|
||||
else
|
||||
ringRemap[i] = ringBase[i];
|
||||
}
|
||||
ringRemap[i] = 0;
|
||||
*str = ringRemap;
|
||||
system->type = RCCL_TOPO_CR8G;
|
||||
INFO(NCCL_GRAPH, "Use chordal ring: %s", ringRemap);
|
||||
NCCLCHECK(parseGraph(ringBase, system, graph, id, 0, NULL));
|
||||
if (system->nodes[NET].count) {
|
||||
int *intra, *used;
|
||||
graph->nChannels = system->nodes[NET].count;
|
||||
NCCLCHECK(ncclCalloc(&intra, ngpus));
|
||||
NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
|
||||
for (int n = 0; n < system->nodes[NET].count; n++) {
|
||||
graph->inter[n*2] = graph->inter[n*2+1] = n;
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
struct ncclTopoLinkList* paths = net->paths[GPU];
|
||||
// find the first unused GPU that is closest to NIC
|
||||
int f, m;
|
||||
for (f = 0; f < ngpus; f++) {
|
||||
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
|
||||
if(j >= n) break;
|
||||
}
|
||||
for (int i = 0; i < ngpus; i++) {
|
||||
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
|
||||
if (j < n) continue;
|
||||
if (paths[i].count < paths[f].count) f = i;
|
||||
}
|
||||
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
|
||||
used[n] = graph->intra[n*ngpus+m];
|
||||
for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
|
||||
for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
|
||||
}
|
||||
free(used);
|
||||
free(intra);
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *ngpu, int *nnet) {
|
||||
*ngpu = 0; *nnet = 0;
|
||||
static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *g, int *n, int nnet, int *net_map) {
|
||||
*g = 0; *n = 0;
|
||||
int i;
|
||||
if (ncclTopoIdToIndex(system, CPU, id, &i) == ncclInternalError) return false;
|
||||
for (int n = 0; n < system->nodes[NET].count; n++)
|
||||
if (system->nodes[NET].nodes[n].paths[CPU][i].count == 2) (*nnet)++;
|
||||
for (int n = 0; n < system->nodes[GPU].count; n++)
|
||||
if (system->nodes[GPU].nodes[n].paths[CPU][i].count == 2) (*ngpu)++;
|
||||
for (int j = 0; j < nnet; j++)
|
||||
if (system->nodes[NET].nodes[net_map[j]].paths[CPU][i].count == 2) (*n)++;
|
||||
for (int j = 0; j < system->nodes[GPU].count; j++)
|
||||
if (system->nodes[GPU].nodes[j].paths[CPU][i].count == 2) (*g)++;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* compare GPUs by PCI ID */
|
||||
static int compareGPU (const void *g1, const void *g2, void *s) {
|
||||
struct ncclTopoSystem* system = (struct ncclTopoSystem*)s;
|
||||
return system->nodes[GPU].nodes[*(int *)g1].id > system->nodes[GPU].nodes[*(int *)g2].id;
|
||||
/*
 * Look up the GPU whose gpu.dev equals `id`.
 *
 *   system : topology to search
 *   id     : device number to look for
 *   index  : output; position of the match in system->nodes[GPU].nodes,
 *            or -1 when there is none
 *
 * Returns ncclSuccess on a match, ncclInternalError otherwise.
 */
static ncclResult_t ncclGpuIdToIndex(struct ncclTopoSystem* system, int id, int* index) {
  int pos = 0;

  *index = -1;
  while (pos < system->nodes[GPU].count) {
    if (system->nodes[GPU].nodes[pos].gpu.dev == id) {
      *index = pos;
      return ncclSuccess;
    }
    pos++;
  }
  return ncclInternalError;
}
|
||||
|
||||
static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int *gpu1, int *gpu2, int use_shared, int ex1, int ex2) {
|
||||
int n, m, k, idx, c1, c2;
|
||||
uint64_t gid;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
if (ncclTopoIdToIndex(system, CPU, cpu1, &c1) == ncclInternalError) return false;
|
||||
if (ncclTopoIdToIndex(system, CPU, cpu2, &c2) == ncclInternalError) return false;
|
||||
/*
 * Describe the detected system as a rcclRomeModel so that it can be compared
 * with the built-in reference models.
 *
 *   system   : topology to inspect
 *   romeTopo : output; counts, GPU ids, NUMA placement and link matrix
 *   pattern  : output; caller-provided buffer of at least 2*nCpus+1 chars.
 *              Receives two digits per CPU node (GPU count, NIC count), or an
 *              empty string when the topology could not be described
 *   net_map  : output; caller-provided array of at least MAX_ROME_NICS ints.
 *              Entry j is the index in system->nodes[NET].nodes of the widest
 *              port of the j-th distinct NIC asic
 *
 * Returns ncclSuccess in all "topology not recognised" cases; the caller must
 * look at pattern / romeTopo to tell. An error is returned only when a GPU
 * device number has no node (ncclGpuIdToIndex failure).
 * When RCCL_DUMP_ROME_MODEL_FILE is set, the model is also written to that
 * file as C source.
 */
static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRomeModel* romeTopo, char *pattern, int *net_map) {
  pattern[0] = 0; // pattern stays an empty string for an invalid topology
  romeTopo->nGpus = system->nodes[GPU].count;
  romeTopo->nCpus = system->nodes[CPU].count;
  romeTopo->nNics = 0;
  romeTopo->nLinks = 0;

  // gpuIds, gpuNuma and connMatrix are sized for MAX_ROME_GPUS; a larger
  // system cannot be stored and can never match a reference model.
  if (romeTopo->nGpus > MAX_ROME_GPUS) return ncclSuccess;

  for (int i = 0; i < romeTopo->nGpus; i ++) {
    int gpu, n;
    NCCLCHECK(ncclGpuIdToIndex(system, i, &gpu));
    struct ncclTopoNode* node = system->nodes[GPU].nodes+gpu;
    romeTopo->gpuIds[i] = node->id;

    // -1 marks "no CPU node found" instead of leaving the slot uninitialized
    romeTopo->gpuNuma[i] = -1;
    for (n = 0; n < romeTopo->nCpus; n++)
      if (node->paths[CPU][n].count == 2) break;
    if (n < romeTopo->nCpus) romeTopo->gpuNuma[i] = system->nodes[CPU].nodes[n].id;

    // clear the whole row first so it is defined even if the GPU is skipped below
    for (n = 0; n < romeTopo->nGpus; n++)
      romeTopo->connMatrix[i*romeTopo->nGpus+n] = 0;

    if (node->paths[GPU] == NULL) continue;
    int count = 0;
    for (n = 0; n < romeTopo->nGpus; n++) {
      struct ncclTopoLink* link;
      for (link = node->links; link->remNode; link++) {
        if (link->remNode->gpu.dev == n) break;
      }
      if (!link->remNode) continue;
      if (link->type != LINK_NVL) continue;
      romeTopo->connMatrix[i*romeTopo->nGpus+n] = 1;
      count ++;
    }
    // every GPU must have the same number of links
    if (!romeTopo->nLinks) romeTopo->nLinks = count;
    else if (romeTopo->nLinks != count) return ncclSuccess;
  }

  // trim ports and create NET map
  for (int i = 0; i < system->nodes[NET].count; i ++) {
    int j;
    for (j = 0; j < romeTopo->nNics; j++) {
      if (system->nodes[NET].nodes[i].net.asic == system->nodes[NET].nodes[net_map[j]].net.asic) {
        // same asic already mapped: keep only its widest port
        if (system->nodes[NET].nodes[i].net.width > system->nodes[NET].nodes[net_map[j]].net.width)
          net_map[j] = i;
        break;
      }
    }
    if (j >= romeTopo->nNics) {
      net_map[j] = i;
      (romeTopo->nNics)++;
      if (romeTopo->nNics >= MAX_ROME_NICS) break;
    }
  }

  // number of GPUs and NICs on each numa node is used as first screening pattern
  for (int i = 0; i < romeTopo->nCpus; i++) {
    int g, n;
    if (!getGpuNetCount(system, i, &g, &n, romeTopo->nNics, net_map)) return ncclSuccess;
    pattern[i*2] = '0' + g;
    pattern[i*2+1] = '0' + n;
  }
  pattern[romeTopo->nCpus*2] = 0;

  for (int i = 0; i < romeTopo->nNics; i ++) {
    // net_map[] already holds indices into nodes[NET].nodes (see the loop
    // above), so it is used directly rather than looked up as a node id.
    int net = net_map[i];
    int n;
    for (n = 0; n < romeTopo->nCpus; n++)
      if (system->nodes[NET].nodes[net].paths[CPU][n].count == 2) break;
    if (n < romeTopo->nCpus) romeTopo->nicNuma[i] = system->nodes[CPU].nodes[n].id;
    else return ncclSuccess;
  }

  const char* romeModelFile = getenv("RCCL_DUMP_ROME_MODEL_FILE");
  if (romeModelFile) {
    INFO(NCCL_ENV, "RCCL_DUMP_ROME_MODEL_FILE set by environment to %s", romeModelFile);
    FILE* file = fopen(romeModelFile, "w");
    if (file == NULL) {
      WARN("Unable to open %s, not dumping Rome model.", romeModelFile);
      return ncclSuccess;
    }
    fprintf(file, "static struct rcclRomeModel rome_model_ = {\n");
    fprintf(file, "  .nGpus = %d, .nCpus = %d, .nNics = %d, .nLinks = %d,\n", romeTopo->nGpus, romeTopo->nCpus, romeTopo->nNics, romeTopo->nLinks);
    fprintf(file, "  .gpuIds = { ");
    // casts make the int64_t arguments match the %lx / %ld conversions
    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "0x%lx, ", (unsigned long)romeTopo->gpuIds[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .gpuNuma = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "%ld, ", (long)romeTopo->gpuNuma[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .nicNuma = { ");
    for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "%ld, ", (long)romeTopo->nicNuma[i]);
    fprintf(file, "},\n");
    fprintf(file, "  .connMatrix = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++)
      for (int n = 0; n < romeTopo->nGpus; n++) fprintf(file, "%d, ", romeTopo->connMatrix[i*romeTopo->nGpus+n]);
    fprintf(file, "},\n");
    fprintf(file, "  .pattern = \"%s\",\n", pattern);
    fprintf(file, "  .ringBase = \"\",\n");
    fprintf(file, "};\n");
    fclose(file);
  }
  return ncclSuccess;
}
|
||||
|
||||
static bool permuteGpuIds(int *g, int n, int last, struct rcclRomeModel* ref, struct rcclRomeModel* topo, int* time) {
|
||||
(*time) ++;
|
||||
if (n == last) {
|
||||
int i, j;
|
||||
// match GPU numa
|
||||
for (i = 0; i < ref->nGpus; i++)
|
||||
if (ref->gpuNuma[i] != topo->gpuNuma[g[i]]) break;
|
||||
if (i < ref->nGpus) return false;
|
||||
// match XGMI connection
|
||||
for (i = 0; i < ref->nGpus; i++) {
|
||||
for (j = 0; j < ref->nGpus; j++)
|
||||
if (ref->connMatrix[i*ref->nGpus+j] != topo->connMatrix[g[i]*ref->nGpus+g[j]]) break;
|
||||
if (j < ref->nGpus) break;
|
||||
}
|
||||
if (i < ref->nGpus) return false;
|
||||
// match NBIO
|
||||
for (i = 0; i < ref->nGpus; i++) {
|
||||
for (j = 0; j < ref->nGpus; j++) {
|
||||
if (i == j) continue;
|
||||
bool nbio_ref = (ref->gpuIds[i]&0xf0000) == (ref->gpuIds[j]&0xf0000);
|
||||
bool nbio_topo = (topo->gpuIds[g[i]]&0xf0000) == (topo->gpuIds[g[j]]&0xf0000);
|
||||
if (nbio_ref != nbio_topo) break;
|
||||
if (nbio_ref && ((ref->gpuIds[i]-ref->gpuIds[j])*(topo->gpuIds[g[i]]-topo->gpuIds[g[j]]) < 0)) break;
|
||||
}
|
||||
if (j < ref->nGpus) break;
|
||||
}
|
||||
if (i < ref->nGpus) return false;
|
||||
return true;
|
||||
} else {
|
||||
for (int i = n; i <= last; i++) {
|
||||
std::swap(g[n], g[i]);
|
||||
if (permuteGpuIds(g, n+1, last, ref, topo, time)) return true;
|
||||
std::swap(g[n], g[i]);
|
||||
}
|
||||
}
|
||||
free(s_gpus);
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool validate4P1H(struct ncclTopoSystem* system, int *hive) {
|
||||
int g, n, m;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
for (g = 0; g < 4; g++) {
|
||||
int gpu = hive[g];
|
||||
int next_gpu = hive[(g+1)%4];
|
||||
for (n = 0; n < ngpus; n++) {
|
||||
if (system->nodes[GPU].nodes[n].gpu.dev != gpu) continue;
|
||||
struct ncclTopoNode* node = system->nodes[GPU].nodes+n;
|
||||
if (node->paths[GPU] == NULL) continue;
|
||||
for (m = 0; m < ngpus; m++) {
|
||||
struct ncclTopoLink* link;
|
||||
for (link = node->links; link->remNode; link++) {
|
||||
if (link->remNode->gpu.dev == next_gpu) break;
|
||||
}
|
||||
if (!link->remNode) continue;
|
||||
if (link->type == LINK_NVL) break;
|
||||
}
|
||||
if (m < ngpus) break;
|
||||
}
|
||||
if (n < ngpus) continue;
|
||||
else break;
|
||||
}
|
||||
if (g < 4) return false;
|
||||
else return true;
|
||||
}
|
||||
|
||||
static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
|
||||
static const char *ringBase_10302120_1 = "7 4 5 3 1 0 6 2|4 7 3 5 0 1 2 6";
|
||||
static const char *ringBase_10302120_2 = "6 4 7 5 0 1 3 2|6 5 7 4 2 3 1 0";
|
||||
static const char *ringBase_11303011_1 = "2 1 0 3 6 7 5 4|7 6 4 5 1 2 3 0";
|
||||
static const char *ringBase_11303011_2 = "0 6 2 3 1 7 5 4|7 1 4 5 6 0 3 2";
|
||||
static const char *ringBase_0110201010200110_1 = "1 2 3 0 6 4 5 7|4 6 7 5 2 1 0 3";
|
||||
static const char *ringBase_0110201010200110_2 = "3 0 6 2 1 4 5 7|4 1 0 3 2 6 7 5";
|
||||
static const char *ringBase;
|
||||
static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
static char ringRemap[64];
|
||||
int id[8], dist[8];
|
||||
int i;
|
||||
|
||||
*str = 0;
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int ncpus = system->nodes[CPU].count;
|
||||
// 8 GPUs only
|
||||
if (ngpus != 8)
|
||||
return ncclSuccess;
|
||||
|
||||
// only valid on Rome
|
||||
int arch, vendor, model;
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
|
||||
return ncclSuccess;
|
||||
system->type = RCCL_TOPO_4P2H_ROME;
|
||||
// 4 or 8 numa nodes only
|
||||
if (ncpus != 4 && ncpus != 8)
|
||||
return ncclSuccess;
|
||||
// number of GPUs and NICs on each numa node is used as first screening pattern
|
||||
char pattern[256];
|
||||
for (i = 0; i < ncpus; i++) {
|
||||
int g, n;
|
||||
if (!getGpuNetCount(system, i, &g, &n)) return ncclSuccess;
|
||||
pattern[i*2] = '0' + g;
|
||||
pattern[i*2+1] = '0' + n;
|
||||
}
|
||||
pattern[i*2] = 0;
|
||||
int g[8], h1[4], h2[4];
|
||||
for (int i = 0; i <8; i++) g[i] = -1;
|
||||
if (strcmp(pattern, "10302120") == 0) {
|
||||
bool cross = findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1);
|
||||
g[2] = g[6] = -1;
|
||||
if (cross) {
|
||||
// identify GPUs for pattern "10302120"
|
||||
if (!findGpuByXGMI(system, 0, 1, &g[1], &g[0], 0, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 0, 1, &g[1], &g[2], 1, -1, g[0])) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 2, 1, &g[3], &g[5], 1, g[6], g[2])) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 1, 3, &g[5], &g[4], -1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 2, 3, &g[3], &g[7], -1, g[6], g[4])) return ncclSuccess;
|
||||
// finally verify two XGMI hives for pattern "10302120"
|
||||
h1[0] = g[1]; h1[1] = g[0]; h1[2] = g[6]; h1[3] = g[2];
|
||||
h2[0] = g[7]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[3];
|
||||
ringBase = ringBase_10302120_1;
|
||||
} else {
|
||||
// identify GPUs for pattern "10302120"
|
||||
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 1, 1, &g[1], &g[2], -1, -1, g[3])) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 2, 3, &g[5], &g[7], -1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 2, 3, &g[4], &g[6], -1, g[5], g[7])) return ncclSuccess;
|
||||
// finally verify two XGMI hives for pattern "10302120"
|
||||
h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
|
||||
h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
|
||||
ringBase = ringBase_10302120_2;
|
||||
}
|
||||
}
|
||||
else if (strcmp(pattern, "11303011") == 0) {
|
||||
// there are 2 configurations for pattern "11303011"
|
||||
if (findGpuByXGMI(system, 1, 2, &g[2], &g[6], 1, -1, -1)) {
|
||||
if (!findGpuByXGMI(system, 2, 1, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 3, 2, &g[7], &g[5], 1, -1, -1)) return ncclSuccess;
|
||||
// finally verify two XGMI hives for pattern "11303011"
|
||||
h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6];
|
||||
h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7];
|
||||
ringBase = ringBase_11303011_2;
|
||||
} else {
|
||||
// identify GPUs for pattern "11303011"
|
||||
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 0, 1, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 1, 1, &g[1], &g[2], -1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 3, 2, &g[7], &g[5], -1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 3, 2, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 2, 2, &g[5], &g[4], -1, -1, -1)) return ncclSuccess;
|
||||
// finally verify two XGMI hives for pattern "11303011"
|
||||
h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
|
||||
h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
|
||||
ringBase = ringBase_11303011_1;
|
||||
}
|
||||
}
|
||||
else if (strcmp(pattern, "0110201010200110") == 0) {
|
||||
if (findGpuByXGMI(system, 2, 5, &g[2], &g[6], 1, -1, -1)) {
|
||||
if (!findGpuByXGMI(system, 4, 2, &g[4], &g[1], 1, g[6], g[2])) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], 1, -1, -1)) return ncclSuccess;
|
||||
h1[0] = g[0]; h1[1] = g[3]; h1[2] = g[2]; h1[3] = g[6];
|
||||
h2[0] = g[1]; h2[1] = g[4]; h2[2] = g[5]; h2[3] = g[7];
|
||||
ringBase = ringBase_0110201010200110_2;
|
||||
} else {
|
||||
if (!findGpuByXGMI(system, 1, 2, &g[0], &g[1], 1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 1, 3, &g[0], &g[3], 0, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 2, 2, &g[1], &g[2], -1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[5], -1, -1, -1)) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 7, 5, &g[7], &g[6], -1, -1, g[5])) return ncclSuccess;
|
||||
if (!findGpuByXGMI(system, 4, 5, &g[4], &g[5], -1, -1, -1)) return ncclSuccess;
|
||||
h1[0] = g[0]; h1[1] = g[1]; h1[2] = g[2]; h1[3] = g[3];
|
||||
h2[0] = g[4]; h2[1] = g[5]; h2[2] = g[7]; h2[3] = g[6];
|
||||
ringBase = ringBase_0110201010200110_1;
|
||||
}
|
||||
}
|
||||
else
|
||||
return ncclSuccess;
|
||||
|
||||
if (!validate4P1H(system, h1)) return ncclSuccess;
|
||||
if (!validate4P1H(system, h2)) return ncclSuccess;
|
||||
// passed all validation
|
||||
// create 4P2H based on reference and remapped ids
|
||||
for (i = 0; i <strlen(ringBase); i++) {
|
||||
if (ringBase[i] >= '0' && ringBase[i] <= '9')
|
||||
ringRemap[i] = g[ringBase[i]-'0'] + '0';
|
||||
else
|
||||
ringRemap[i] = ringBase[i];
|
||||
// number of GPUs and NICs on each numa node is used as first screening pattern
|
||||
struct rcclRomeModel romeTopo;
|
||||
char pattern[256];
|
||||
int net_map[MAX_ROME_NICS];
|
||||
parseRomeSystem(system, &romeTopo, pattern, net_map);
|
||||
|
||||
// recognize system as Rome 4P2H even if no matching model
|
||||
if (ngpus == 8 && romeTopo.nLinks) system->type = RCCL_TOPO_4P2H_ROME;
|
||||
|
||||
int g[MAX_ROME_GPUS];
|
||||
int time = 0;
|
||||
struct timeval tvs, tve;
|
||||
gettimeofday(&tvs, NULL);
|
||||
for (i = 0; i < sizeof(romeTopoModels)/sizeof(romeTopoModels[0]); i++) {
|
||||
if (romeTopo.nCpus != romeTopoModels[i].nCpus || romeTopo.nGpus != romeTopoModels[i].nGpus ||
|
||||
romeTopo.nNics != romeTopoModels[i].nNics || romeTopo.nLinks != romeTopoModels[i].nLinks) continue;
|
||||
if (strcmp(romeTopoModels[i].pattern, pattern)) continue;
|
||||
for (int j = 0; j < ngpus; j++) g[j] = (j+2)%ngpus;
|
||||
if (permuteGpuIds(g, 0, ngpus-1, romeTopoModels+i, &romeTopo, &time)) break;
|
||||
}
|
||||
ringRemap[i] = 0;
|
||||
*str = ringRemap;
|
||||
INFO(NCCL_GRAPH, "Use 4P2H on Rome: %s", ringRemap);
|
||||
gettimeofday(&tve, NULL);
|
||||
float t = (tve.tv_sec - tvs.tv_sec)*1E3 + (tve.tv_usec - tvs.tv_usec)/1E3;
|
||||
if (i >= sizeof(romeTopoModels)/sizeof(romeTopoModels[0])) {
|
||||
//printf("No solution in %.2fms (%d iter)\n", t, time);
|
||||
return ncclSuccess;
|
||||
}
|
||||
//printf("Solution in %.2fms (%d iter): ", t, time);
|
||||
//for (int k = 0; k < ngpus; k++) printf("%d ", g[k]);
|
||||
//printf("\n");
|
||||
|
||||
// create 4P2H based on reference and remapped ids
|
||||
NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, romeTopo.nNics, net_map));
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -1014,6 +1054,7 @@ float speedArray[] = { 42.0, 24.0, 21.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0,
|
||||
|
||||
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
|
||||
int ngpus = system->nodes[GPU].count;
|
||||
int nnets = system->nodes[NET].count;
|
||||
int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0;
|
||||
graph->speedIntra = graph->speedInter = 0;
|
||||
if (graph->crossNic == 2) graph->crossNic = 0;
|
||||
@@ -1036,59 +1077,19 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
|
||||
}
|
||||
|
||||
str = getenv("NCCL_RINGS");
|
||||
if (str) system->type = RCCL_TOPO_4P2H_ROME;
|
||||
if (!str) NCCLCHECK(parseChordalRing(system, &str));
|
||||
if (!str) NCCLCHECK(parseRome4P2H(system, &str));
|
||||
if (str) {
|
||||
NCCLCHECK(parseGraph(str, &graph->nChannels, ngpus, graph->intra));
|
||||
for (int i=0; i<graph->nChannels*ngpus; i++) {
|
||||
// Translate gpu numbers into ranks
|
||||
int j = 0;
|
||||
for (j = 0; j < system->nodes[GPU].count; j++)
|
||||
if (graph->intra[i] == system->nodes[GPU].nodes[j].gpu.dev)
|
||||
break;
|
||||
if (j < system->nodes[GPU].count)
|
||||
graph->intra[i] = system->nodes[GPU].nodes[j].gpu.rank;
|
||||
else
|
||||
return ncclInternalError;
|
||||
}
|
||||
graph->speedIntra = graph->speedInter = system->maxWidth;
|
||||
if (system->nodes[NET].count) {
|
||||
// do not change ring order for multi node 4P2H on Rome
|
||||
if (system->type == RCCL_TOPO_4P2H_ROME) {
|
||||
for (int n = 0; n < graph->nChannels; n++) {
|
||||
graph->inter[n*2] = n%system->nodes[NET].count;
|
||||
graph->inter[n*2+1] = (n+1)%system->nodes[NET].count;
|
||||
}
|
||||
} else {
|
||||
int *intra, *used;
|
||||
graph->nChannels = system->nodes[NET].count;
|
||||
NCCLCHECK(ncclCalloc(&intra, ngpus));
|
||||
NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
|
||||
for (int n = 0; n < system->nodes[NET].count; n++) {
|
||||
graph->inter[n*2] = graph->inter[n*2+1] = n;
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
struct ncclTopoLinkList* paths = net->paths[GPU];
|
||||
// find the first unused GPU that is closest to NIC
|
||||
int f, m;
|
||||
for (f = 0; f < ngpus; f++) {
|
||||
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
|
||||
if(j >= n) break;
|
||||
}
|
||||
for (int i = 0; i < ngpus; i++) {
|
||||
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
|
||||
if (j < n) continue;
|
||||
if (paths[i].count < paths[f].count) f = i;
|
||||
}
|
||||
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
|
||||
used[n] = graph->intra[n*ngpus+m];
|
||||
for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
|
||||
for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
|
||||
}
|
||||
free(used);
|
||||
free(intra);
|
||||
}
|
||||
// user supplied topo
|
||||
NCCLCHECK(parseGraph(str, system, graph, NULL, nnets, NULL));
|
||||
if (graph->nChannels) {
|
||||
system->type = RCCL_TOPO_4P2H_ROME;
|
||||
return ncclSuccess;
|
||||
}
|
||||
} else {
|
||||
// try to match 8P6L
|
||||
NCCLCHECK(parseChordalRing(system, graph));
|
||||
if (graph->nChannels) return ncclSuccess;
|
||||
// try to match Rome 4P2H
|
||||
NCCLCHECK(parseRome4P2H(system, graph));
|
||||
if (graph->nChannels) return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
+1
-1
@@ -95,7 +95,7 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclTopoRanks* topoRanks);
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks,
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn);
|
||||
struct ncclTopoRanks** allTopoRanks, int* rings, int gcn, int nnets);
|
||||
|
||||
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank);
|
||||
|
||||
|
||||
+16
-1
@@ -849,6 +849,21 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
}
|
||||
INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled");
|
||||
|
||||
// count NETs used by ring
|
||||
int nNets = 0;
|
||||
int nets[MAXCHANNELS*2];
|
||||
for (int i = 0; i < ringGraph.nChannels; i++) {
|
||||
for (int j = 0; j < 2; j++) {
|
||||
int k;
|
||||
for (k = 0; k < nNets; k++)
|
||||
if (nets[k] == ringGraph.inter[i*2+j]) break;
|
||||
if (k >= nNets) {
|
||||
nets[nNets] = ringGraph.inter[i*2+j];
|
||||
nNets++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (comm->nChannels < nChannelsOrig) {
|
||||
// We started duplicating channels during Preset(), so we need to move the
|
||||
// duplicated channels since we have removed some.
|
||||
@@ -858,7 +873,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
int *rings;
|
||||
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
|
||||
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets));
|
||||
if (comm->nNodes > 1 &&
|
||||
ncclParamCollNetEnable() == 1 &&
|
||||
collNetSupport() && collNetGraph.nChannels) {
|
||||
|
||||
@@ -21,29 +21,17 @@ struct p2pConnectInfo {
|
||||
void* directPtr;
|
||||
hipIpcMemHandle_t devIpc;
|
||||
};
|
||||
uint64_t pidHash;
|
||||
int id;
|
||||
int sendRank;
|
||||
int recvRank;
|
||||
};
|
||||
|
||||
struct p2pSendResources {
|
||||
struct ncclSendMem* devMem;
|
||||
void* ipcPtr;
|
||||
uint32_t* next_hdp_reg; // Next GPU in ring (for p2p transport use only)
|
||||
uint64_t* opCount; // opCount allocated in host memory
|
||||
uint64_t* devOpCount; // device side pointer to opCount
|
||||
uint64_t* remOpCount; // remote opCount allocated in host memory
|
||||
uint64_t* devRemOpCount; // device side pointer to remote opCount
|
||||
};
|
||||
|
||||
struct p2pRecvResources {
|
||||
struct ncclRecvMem* devMem;
|
||||
void* ipcPtr;
|
||||
uint64_t* opCount; // opCount allocated in host memory
|
||||
uint64_t* devOpCount; // device side pointer to opCount
|
||||
uint64_t* remOpCount; // remote opCount allocated in host memory
|
||||
uint64_t* devRemOpCount; // device side pointer to remote opCount
|
||||
};
|
||||
|
||||
#include <sys/types.h>
|
||||
@@ -123,7 +111,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
TRACE(P2P,"IPC: %016lx %016lx %016lx %016lx", devIpc[4], devIpc[5], devIpc[6], devIpc[7]); \
|
||||
} while (0)
|
||||
|
||||
#define MAX_SHM_NAME_LEN 1024
|
||||
// Setting this to non zero causes P2P to use Reads rather than Writes
|
||||
NCCL_PARAM(P2pReadEnable, "P2P_READ_ENABLE", -2);
|
||||
|
||||
@@ -165,16 +152,6 @@ ncclResult_t p2pSendSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
}
|
||||
|
||||
struct p2pConnectInfo info;
|
||||
info.id = channelId;
|
||||
info.pidHash = myInfo->pidHash;
|
||||
info.sendRank = myInfo->cudaDev;
|
||||
info.recvRank = peerInfo->cudaDev;
|
||||
|
||||
char shmName[MAX_SHM_NAME_LEN];
|
||||
sprintf(shmName, "nccl-p2p-send-opcount-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
|
||||
TRACE(NCCL_P2P,"Open shmName %s", shmName);
|
||||
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1));
|
||||
|
||||
info.read = useRead;
|
||||
const char* useReadStr = info.read ? "/read" : "";
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
@@ -232,16 +209,6 @@ ncclResult_t p2pRecvSetup(struct ncclTopoSystem* topo, struct ncclTopoGraph* gra
|
||||
NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize, true));
|
||||
|
||||
struct p2pConnectInfo info;
|
||||
info.id = channelId;
|
||||
info.pidHash = myInfo->pidHash;
|
||||
info.sendRank = peerInfo->cudaDev;
|
||||
info.recvRank = myInfo->cudaDev;
|
||||
|
||||
char shmName[MAX_SHM_NAME_LEN];
|
||||
sprintf(shmName, "nccl-p2p-recv-opcount-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank);
|
||||
TRACE(NCCL_P2P,"Open shmName %s", shmName);
|
||||
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->opCount, (void**)&resources->devOpCount, 1));
|
||||
|
||||
info.read = useRead;
|
||||
if (myInfo->pidHash == peerInfo->pidHash) {
|
||||
info.direct = 1;
|
||||
@@ -298,13 +265,6 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, int nranks,
|
||||
}
|
||||
}
|
||||
|
||||
char shmName[MAX_SHM_NAME_LEN];
|
||||
sprintf(shmName, "nccl-p2p-recv-opcount-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
|
||||
TRACE(NCCL_P2P,"Open shmName %s", shmName);
|
||||
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->remOpCount, (void**)&resources->devRemOpCount, 0));
|
||||
// Remove the file to ensure proper clean-up
|
||||
NCCLCHECK(shmUnlink(shmName));
|
||||
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (info->read && p == NCCL_PROTO_SIMPLE) {
|
||||
@@ -344,12 +304,6 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, int nranks, int ran
|
||||
}
|
||||
}
|
||||
|
||||
char shmName[MAX_SHM_NAME_LEN];
|
||||
sprintf(shmName, "nccl-p2p-send-opcount-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank);
|
||||
TRACE(NCCL_P2P,"Open shmName %s", shmName);
|
||||
NCCLCHECK(shmOpen(shmName, sizeof(uint64_t), (void**)&resources->remOpCount, (void**)&resources->devRemOpCount, 0));
|
||||
NCCLCHECK(shmUnlink(shmName));
|
||||
|
||||
int offset = 0;
|
||||
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
|
||||
if (info->read && p == NCCL_PROTO_SIMPLE) {
|
||||
@@ -370,8 +324,6 @@ ncclResult_t p2pSendFree(void* resources) {
|
||||
if (sendRes->ipcPtr)
|
||||
CUDACHECK(hipIpcCloseMemHandle(sendRes->ipcPtr));
|
||||
CUDACHECK(hipFree(sendRes->devMem));
|
||||
NCCLCHECK(shmClose(sendRes->opCount, sendRes->devOpCount, sizeof(uint64_t)));
|
||||
NCCLCHECK(shmClose(sendRes->remOpCount, sendRes->devRemOpCount, sizeof(uint64_t)));
|
||||
free(sendRes);
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -381,8 +333,6 @@ ncclResult_t p2pRecvFree(void* resources) {
|
||||
if (recvRes->ipcPtr)
|
||||
CUDACHECK(hipIpcCloseMemHandle(recvRes->ipcPtr));
|
||||
CUDACHECK(hipFree(recvRes->devMem));
|
||||
NCCLCHECK(shmClose(recvRes->opCount, recvRes->devOpCount, sizeof(uint64_t)));
|
||||
NCCLCHECK(shmClose(recvRes->remOpCount, recvRes->devRemOpCount, sizeof(uint64_t)));
|
||||
free(recvRes);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@
|
||||
|
||||
DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
|
||||
for i in {0..35}
|
||||
for i in {0..37}
|
||||
do
|
||||
$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
|
||||
$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
|
||||
|
||||
@@ -0,0 +1,81 @@
|
||||
|
||||
<system version="2">
|
||||
<cpu numaid="1" affinity="00000000,00000000,ffffffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
|
||||
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="0" affinity="00000000,00000000,00000000,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
|
||||
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="2" affinity="00000000,ffffffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
|
||||
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:e1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
|
||||
<nic>
|
||||
<net name="mlx5_0" dev="0" speed="100000" port="1" guid="0x20cd600003da341c" maxconn="262144" gdr="1"/>
|
||||
<net name="mlx5_1" dev="1" speed="200000" port="2" guid="0x20cd600003da341c" maxconn="262144" gdr="1"/>
|
||||
</nic>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="3" affinity="ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
|
||||
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
</system>
|
||||
@@ -0,0 +1,93 @@
|
||||
<system version="2">
|
||||
<cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="0" sm="96" gcn="906" arch="38911" rank="0" gdr="1">
|
||||
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="1" sm="96" gcn="906" arch="38911" rank="1" gdr="1">
|
||||
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="2" sm="96" gcn="906" arch="38911" rank="2" gdr="1">
|
||||
<xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="3" sm="96" gcn="906" arch="38911" rank="3" gdr="1">
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="4" sm="96" gcn="906" arch="38911" rank="4" gdr="1">
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="5" sm="96" gcn="906" arch="38911" rank="5" gdr="1">
|
||||
<xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
<pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="6" sm="96" gcn="906" arch="38911" rank="6" gdr="1">
|
||||
<xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
|
||||
<pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
|
||||
<gpu dev="7" sm="96" gcn="906" arch="38911" rank="7" gdr="1">
|
||||
<xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
|
||||
<xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
|
||||
</gpu>
|
||||
</pci>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="0" affinity="00000000,00000000,00000000,0000ffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:61:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
|
||||
<nic>
|
||||
<net name="mlx5_0" dev="0" speed="200000" port="1" guid="0xa8134300039f59b8" maxconn="262144" gdr="1"/>
|
||||
<net name="mlx5_1" dev="1" speed="100000" port="2" guid="0xa8134300039f59b8" maxconn="262144" gdr="1"/>
|
||||
</nic>
|
||||
</pci>
|
||||
</cpu>
|
||||
<cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
|
||||
<pci busid="0000:a1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
|
||||
<nic>
|
||||
<net name="mlx5_2" dev="2" speed="200000" port="1" guid="0x38815600039f59b8" maxconn="262144" gdr="1"/>
|
||||
</nic>
|
||||
</pci>
|
||||
</cpu>
|
||||
</system>
|
||||
@@ -105,6 +105,8 @@ NodeModelDesc model_descs[] = {
|
||||
{4, "topo_8p_ts1_n4_1.xml", "4 nodes 8 VEGA20 TS1 NPS=4 Alt. Model"},
|
||||
{1, "topo_4p3l_ia.xml", "single node 8 gfx908"},
|
||||
{4, "topo_4p3l_ia.xml", "4 nodes 8 gfx908"},
|
||||
{4, "topo_8p_rome_n2_2.xml", "4 nodes 8 VEGA20 Rome NPS=2 Alt. Model 2 NET/IF"},
|
||||
{4, "topo_8p_ts1_n4_2.xml", "4 nodes 8 VEGA20 TS1 NPS=4 3 NET/IF"},
|
||||
};
|
||||
|
||||
int main(int argc,char* argv[])
|
||||
|
||||
@@ -428,6 +428,21 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
}
|
||||
INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled");
|
||||
|
||||
// count NETs used by ring
|
||||
int nNets = 0;
|
||||
int nets[MAXCHANNELS*2];
|
||||
for (int i = 0; i < ringGraph.nChannels; i++) {
|
||||
for (int j = 0; j < 2; j++) {
|
||||
int k;
|
||||
for (k = 0; k < nNets; k++)
|
||||
if (nets[k] == ringGraph.inter[i*2+j]) break;
|
||||
if (k >= nNets) {
|
||||
nets[nNets] = ringGraph.inter[i*2+j];
|
||||
nNets++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (comm->nChannels < nChannelsOrig) {
|
||||
// We started duplicating channels during Preset(), so we need to move the
|
||||
// duplicated channels since we have removed some.
|
||||
@@ -437,7 +452,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
|
||||
int *rings;
|
||||
NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS));
|
||||
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn));
|
||||
NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, allTopoRanks, rings, gcn, nNets));
|
||||
if (comm->nNodes > 1 &&
|
||||
ncclParamCollNetEnable() == 1 &&
|
||||
collNetSupport() && collNetGraph.nChannels) {
|
||||
|
||||
Reference in New Issue
Block a user