2.9.6-1
Add support for CUDA graphs. Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439. Fix bootstrap issue caused by connection reordering. Fix CPU locking block. Improve CollNet algorithm. Improve performance on DGX A100 for communicators with only one GPU per node.
Esse commit está contido em:
+75
-46
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -14,7 +14,7 @@
|
||||
/******************************************************************/
|
||||
|
||||
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
|
||||
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
|
||||
struct ncclTopoRanks* topoRanks) {
|
||||
int rank = comm->rank;
|
||||
int localRanks = comm->localRanks;
|
||||
@@ -25,12 +25,15 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
channel->ring.prev = channel->ring.next = -1;
|
||||
channel->tree.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
|
||||
channel->collTree.up = -1;
|
||||
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;
|
||||
channel->collTree.out = -1;
|
||||
channel->collTree.headRank = -1;
|
||||
channel->collTree.nHeads = 0;
|
||||
channel->collTree.shift = 0;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.up[i] = -1;
|
||||
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.down[i] = -1;
|
||||
|
||||
int* ringIntra = ringGraph->intra+c*localRanks;
|
||||
int* treeIntra = treeGraph->intra+c*localRanks;
|
||||
int* collNetIntra = collNetGraph->intra+c*localRanks;
|
||||
|
||||
for (int i=0; i<localRanks; i++) {
|
||||
if (ringIntra[i] == rank) {
|
||||
@@ -50,12 +53,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
|
||||
channel->tree.up = i == 0 ? -1 : treeIntra[i-1];
|
||||
channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
|
||||
}
|
||||
if (collNetIntra[i] == rank) {
|
||||
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
|
||||
|
||||
channel->collTree.up = collNetIntra[prev];
|
||||
channel->collTree.down[0] = collNetIntra[next];
|
||||
}
|
||||
}
|
||||
topoRanks->ringPrev[c] = channel->ring.prev;
|
||||
topoRanks->ringNext[c] = channel->ring.next;
|
||||
@@ -167,36 +164,53 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int*
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) {
|
||||
int nranks = comm->nRanks;
|
||||
int depth = nranks/comm->nNodes;
|
||||
int sendIndex = collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern
|
||||
int sendEndIndex = (sendIndex+comm->localRanks-1)%comm->localRanks;
|
||||
for (int c=0; c<comm->nChannels/2; c++) {
|
||||
struct ncclChannel* channel = comm->channels+c;
|
||||
// Set root of collTree to id nranks
|
||||
if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
|
||||
channel->collTree.up = nranks;
|
||||
}
|
||||
if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
|
||||
channel->collTree.down[0] = -1;
|
||||
}
|
||||
channel->collTree.depth = depth;
|
||||
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
|
||||
// Wire up the CollNet tree fields of every channel of this communicator.
//
// The "head" of CollNet channel c is the first rank of that channel's intra
// list in collNetGraph (collNetGraph->intra[c*localRanks]). For each of the
// comm->nChannels channels:
//   - if this rank is a head, record its head index in headRank, set `out` to
//     comm->nRanks and list every other rank of that head's intra list in down[];
//   - list every head other than this rank in up[];
//   - store nHeads, a per-rank shift and a depth of 1 (no up and no down) or 2.
// One summary line per channel is logged under NCCL_GRAPH.
//
// Returns ncclSuccess, or the error propagated by NCCLCHECK if the temporary
// heads array cannot be allocated.
static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) {
  int rank = comm->rank;
  int localRanks = comm->localRanks;
  int nHeads = collNetGraph->nChannels;
  int *heads;
  NCCLCHECK(ncclCalloc(&heads, nHeads));
  // Find all head ranks
  // Head index is always 0
  for (int c=0; c<nHeads; c++) {
    int* collNetIntra = collNetGraph->intra+c*localRanks;
    heads[c] = collNetIntra[0];
  }
  // For all channels
  for (int c=0; c<comm->nChannels; c++) {
    struct ncclChannel* channel = comm->channels+c;
    char line[1024];
    sprintf(line, "CollNet channel %d rank %d ", c, rank);
    int nDown = 0;
    for (int i=0; i<nHeads; i++) {
      if (rank == heads[i]) { // is head
        channel->collTree.headRank = i; // Mark the index for deciding offset in the CUDA kernel
        channel->collTree.out = comm->nRanks; // Set root of collTree to id nranks
        int* collNetIntra = collNetGraph->intra+i*localRanks;
        sprintf(line+strlen(line), "down ");
        for (int r=0; r<localRanks; r++) {
          if (collNetIntra[r] == rank) continue;
          channel->collTree.down[nDown++] = collNetIntra[r]; // connect to all peers
          sprintf(line+strlen(line), " %d ", collNetIntra[r]);
        }
        sprintf(line+strlen(line), "nDown %d ", nDown);
        // Only the first matching head is used for this rank.
        break;
      }
    }
    // Connect to all heads
    int nUp = 0;
    sprintf(line+strlen(line), "up ");
    for (int h=0; h<nHeads; h++) {
      if (rank == heads[h]) continue;
      channel->collTree.up[nUp++] = heads[h];
      sprintf(line+strlen(line), " %d ", heads[h]);
    }
    channel->collTree.nHeads = nHeads;
    channel->collTree.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
    channel->collTree.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
    sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
    sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
    INFO(NCCL_GRAPH, "%s", line);
  }
  // heads is only needed while wiring the channels; release it so that the
  // array is not leaked on every call.
  free(heads);
  return ncclSuccess;
}
|
||||
@@ -231,7 +245,18 @@ int ncclMaxNchannels() {
|
||||
return maxNchannels;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings) {
|
||||
static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
|
||||
int nranks = comm->nRanks;
|
||||
int c;
|
||||
for (c=start; c<end; c++) {
|
||||
memcpy(ringPrev+c*nranks, ringPrev+(c-start)*nranks, nranks*sizeof(int));
|
||||
memcpy(ringNext+c*nranks, ringNext+(c-start)*nranks, nranks*sizeof(int));
|
||||
memcpy(comm->channels+c, comm->channels+c-start, sizeof(struct ncclChannel));
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph) {
|
||||
// Gather data from all ranks
|
||||
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
|
||||
int nranks = comm->nRanks;
|
||||
@@ -266,16 +291,20 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
|
||||
// Duplication should be complete now
|
||||
nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
|
||||
|
||||
// Setup CollNet
|
||||
if (comm->collNetSupport == 1) {
|
||||
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
|
||||
if (collNetGraph->speedIntra > collNetGraph->speedInter && comm->nRanks > comm->nNodes) {
|
||||
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
|
||||
}
|
||||
NCCLCHECK(connectCollNet(comm, collNetGraph));
|
||||
}
|
||||
|
||||
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
|
||||
// We permit combining max, then min, to only use the first channels, then duplicate them.
|
||||
nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
|
||||
int c;
|
||||
for (c=nChannels; c<ncclMinNchannels(); c++) {
|
||||
memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
|
||||
memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int));
|
||||
memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel));
|
||||
}
|
||||
nChannels = comm->nChannels = c;
|
||||
nChannels = comm->nChannels = copyChannels(comm, nChannels, ncclMinNchannels(), ringPrev, ringNext);
|
||||
|
||||
// Create rings array and check all is fine
|
||||
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -280,8 +280,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
||||
if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
|
||||
else p2pLevel = PATH_PHB;
|
||||
p2pLevel = PATH_PXB;
|
||||
}
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
p2pLevel = PATH_PXB;
|
||||
|
||||
+64
-4
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -393,9 +393,67 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance.
|
||||
// Builds the ordered list of NET node indices to try.
//
// For every NET node, the best path to any GPU is the one with the largest
// width, ties broken by the smallest hop count. Only NET nodes whose best
// width equals the largest best width seen are kept; they are written to
// nets[] in increasing order of hop count (insertion order is preserved
// among equal hop counts). Each run of entries sharing the same hop count is
// then rotated left by (gpu.dev of GPU node 0) modulo the run length.
//
// nets is written at indices [0, *netcountRet); the caller in this file sizes
// it to system->nodes[NET].count entries.
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) {
  float* maxwidths;
  int* minhops;
  int netcount = 0;
  NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count));
  NCCLCHECK(ncclCalloc(&maxwidths, system->nodes[NET].count));

  for (int n=0; n<system->nodes[NET].count; n++) {
    struct ncclTopoLinkList* gpuPaths = system->nodes[NET].nodes[n].paths[GPU];

    // Best (widest, then shortest) path from this NIC to any GPU.
    maxwidths[n] = 0.0;
    minhops[n] = 255;
    for (int g=0; g<system->nodes[GPU].count; g++) {
      bool wider = gpuPaths[g].width > maxwidths[n];
      bool sameWidthFewerHops = gpuPaths[g].width == maxwidths[n] && gpuPaths[g].count < minhops[n];
      if (!wider && !sameWidthFewerHops) continue;
      maxwidths[n] = gpuPaths[g].width;
      minhops[n] = gpuPaths[g].count;
    }

    if (netcount) {
      // All kept NICs share the same width, so nets[0] is representative.
      if (maxwidths[nets[0]] > maxwidths[n]) continue; // Do not keep NICs with lower BW
      if (maxwidths[nets[0]] < maxwidths[n]) netcount = 0; // Remove all NICs with lower BW
    }

    // Find the first kept NIC that is strictly further away than this one.
    int pos = 0;
    while (pos < netcount && minhops[n] >= minhops[nets[pos]]) pos++;
    // Shift all nets with higher nhops, then insert this net at pos.
    for (int i = netcount; i>pos; i--) nets[i] = nets[i-1];
    nets[pos] = n;
    netcount++;
  }

  *netcountRet = netcount;

  // Then shuffle NICs with the same nhops based on the GPU device number, so that when we have
  // 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs.
  int start = 0;
  while (start < netcount) {
    // [start, end) is the run of NICs at the same hop count.
    int end = start+1;
    while (end < netcount && minhops[nets[end]] == minhops[nets[start]]) end++;

    // Rotate the run left by one position, `rotations` times.
    int rotations = system->nodes[GPU].nodes[0].gpu.dev % (end-start);
    for (int r=0; r<rotations; r++) {
      int first = nets[start];
      for (int i=start; i<end-1; i++) nets[i] = nets[i+1];
      nets[end-1] = first;
    }
    start = end;
  }

  free(minhops);
  free(maxwidths);
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
|
||||
const int speed = graph->speedInter;
|
||||
for (int n=0; n<system->nodes[NET].count; n++) {
|
||||
int* nets;
|
||||
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
|
||||
int netcount;
|
||||
NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount));
|
||||
for (int i=0; i<netcount; i++) {
|
||||
int n = nets[i];
|
||||
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
|
||||
struct ncclTopoNode* gpu;
|
||||
if (graph->collNet && net->net.collSupport == 0) continue;
|
||||
@@ -463,6 +521,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
|
||||
}
|
||||
}
|
||||
}
|
||||
free(nets);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -705,6 +764,7 @@ search:
|
||||
for (int g=0; g<ngpus; g++) {
|
||||
printf("%d ", graph->intra[c*ngpus+g]);
|
||||
}
|
||||
printf("[%d %d]", graph->inter[0], graph->inter[1]);
|
||||
printf("\n");
|
||||
}
|
||||
#endif
|
||||
@@ -845,7 +905,7 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* dev) {
|
||||
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* dev) {
|
||||
if (graph) {
|
||||
// Honor the net device in the graph
|
||||
int channel = channelId%graph->nChannels;
|
||||
@@ -854,7 +914,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct n
|
||||
*dev = graph->inter[channel*2+index];
|
||||
} else {
|
||||
int64_t id;
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, channelId));
|
||||
NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr));
|
||||
*dev = id;
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
+73
-2
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -172,6 +172,65 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
// BCM Gen4 Switches present themselves as a two-level hierarchical switch
|
||||
// even though they're supposed to sustain full BW across all ports.
|
||||
// Flatten the switch as this extra level can break the search and make
|
||||
// NCCL take wrong topology decisions.
|
||||
// Merge two-level PCI switches into a single level.
//
// A PCI node is treated as a parent switch when its pci.device, with the low
// 12 bits masked off, equals 0x1000c0101000a000. Every PCI node linked to it
// that carries the exact same pci.device is a sub switch: its link is removed
// from the parent, its own links (except the one back to the parent) are moved
// onto the parent, the peers are re-pointed at the parent, and the sub switch
// node is removed from the system. The parent's pci.device is then rewritten
// so it no longer matches, and the scan over PCI nodes is restarted.
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
  for (int s=0; s<system->nodes[PCI].count; s++) {
    struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
    uint64_t device = pciSwitch->pci.device;
    // Only flatten PEX Gen 4 switches in base mode
    if ((device & 0xfffffffffffff000) != 0x1000c0101000a000) continue;

    // Pass 1: detach sub switches (same device ID) from the parent and
    // remember their ids; they are looked up again by id in pass 2.
    int64_t* subSwIds;
    NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
    int subs = 0;
    int li = 0;
    while (li < pciSwitch->nlinks) {
      struct ncclTopoNode* sub = pciSwitch->links[li].remNode;
      // Only fuse sub switches with the same device ID.
      if (sub->type != PCI || sub->pci.device != device) {
        li++;
        continue;
      }
      // Save sub switch for later
      subSwIds[subs++] = sub->id;
      // Remove link to that sub switch. The following links shift down by
      // one, so li is deliberately not advanced here.
      memmove(pciSwitch->links+li, pciSwitch->links+li+1, (pciSwitch->nlinks-li-1)*(sizeof(struct ncclTopoLink)));
      pciSwitch->nlinks--;
    }

    // Pass 2: re-parent every device behind each sub switch, then drop the sub switch.
    for (int i=0; i<subs; i++) {
      // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
      int index;
      NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[i], &index));
      struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
      // Connect all sub PCI devices to the parent switch
      for (int l=0; l<sub->nlinks; l++) {
        struct ncclTopoNode* remNode = sub->links[l].remNode;
        if (remNode == pciSwitch) continue;
        // Add link from parent PCI switch -> PCI device
        memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
        pciSwitch->nlinks++;
        // Update link from PCI device -> parent PCI switch (first match only)
        for (int rl=0; rl<remNode->nlinks; rl++) {
          if (remNode->links[rl].remNode != sub) continue;
          remNode->links[rl].remNode = pciSwitch;
          break;
        }
      }
      // NOTE(review): pciSwitch is still dereferenced after this call; this
      // presumably relies on the parent sitting before its sub switches in
      // system->nodes[PCI].nodes — confirm against ncclTopoRemoveNode.
      NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
    }

    // Set subdevice to 0x0000 to make sure we don't merge this switch again.
    pciSwitch->pci.device = 0x1000c01010000000;
    free(subSwIds);
    // Restart, as system->nodes[PCI].nodes has changed.
    // NOTE(review): the loop increment still runs after this assignment, so
    // the scan resumes at index 1 — confirm that skipping index 0 is intended.
    s = 0;
  }
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
|
||||
// And connect all CPU nodes together
|
||||
for (int n=0; n<system->nodes[CPU].count; n++) {
|
||||
@@ -190,6 +249,8 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
|
||||
sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
|
||||
} else if (node->type == CPU) {
|
||||
sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
|
||||
} else if (node->type == PCI) {
|
||||
sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
|
||||
} else {
|
||||
sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
|
||||
}
|
||||
@@ -345,6 +406,15 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
|
||||
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
|
||||
} else if (type == PCI) {
|
||||
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
|
||||
NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
|
||||
if (str) node->pci.device += strtol(str, NULL, 0) << 48;
|
||||
NCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
|
||||
if (str) node->pci.device += strtol(str, NULL, 0) << 32;
|
||||
NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
|
||||
if (str) node->pci.device += strtol(str, NULL, 0) << 16;
|
||||
NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
|
||||
if (str) node->pci.device += strtol(str, NULL, 0);
|
||||
|
||||
for (int s=0; s<xmlPci->nSubs; s++) {
|
||||
struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
|
||||
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
|
||||
@@ -475,6 +545,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
|
||||
}
|
||||
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
|
||||
|
||||
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
|
||||
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
|
||||
NCCLCHECK(ncclTopoSortSystem(*topoSystem));
|
||||
|
||||
@@ -602,7 +673,7 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_
|
||||
}
|
||||
if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
|
||||
}
|
||||
*id = nets[rr % count];
|
||||
*id = nets[rr%count];
|
||||
free(nets);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
+5
-3
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -27,8 +27,7 @@
|
||||
|
||||
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
|
||||
// to GPU traffic consumes more PCI bandwidth.
|
||||
#define INTEL_P2P(speed) (speed*9/12)
|
||||
#define INTEL_P2P_OVERHEAD(speed) (speed*12/9)
|
||||
#define INTEL_P2P_OVERHEAD(speed) (speed*6/5)
|
||||
|
||||
#define NCCL_TOPO_NODE_TYPES 7
|
||||
#define GPU 0
|
||||
@@ -105,6 +104,9 @@ struct ncclTopoNode {
|
||||
int model;
|
||||
cpu_set_t affinity;
|
||||
}cpu;
|
||||
struct {
|
||||
uint64_t device;
|
||||
}pci;
|
||||
};
|
||||
int nlinks;
|
||||
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -79,8 +79,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
|
||||
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
|
||||
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
|
||||
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
|
||||
@@ -128,8 +130,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
|
||||
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
|
||||
if (a == NCCL_ALGO_COLLNET) busBw *= .9;
|
||||
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
|
||||
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
|
||||
if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0; // Oneshot CollNet only supports Simple
|
||||
|
||||
// Convert bus BW to algorithm BW
|
||||
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
|
||||
@@ -233,6 +234,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
|
||||
comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
|
||||
}
|
||||
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
|
||||
comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = 512;
|
||||
|
||||
// Override defaults with user env
|
||||
char* str = getenv("NCCL_THREAD_THRESHOLDS");
|
||||
|
||||
+21
-1
@@ -1,5 +1,5 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -469,6 +469,26 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
|
||||
}
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"));
|
||||
}
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"));
|
||||
}
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"));
|
||||
}
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"));
|
||||
}
|
||||
NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
|
||||
if (index == -1) {
|
||||
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
|
||||
|
||||
Referência em uma Nova Issue
Bloquear um usuário