Add support for CUDA graphs.
Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439.
Fix bootstrap issue caused by connection reordering.
Fix CPU locking block.
Improve CollNet algorithm.
Improve performance on DGX A100 for communicators with only one GPU per node.
Esse commit está contido em:
Sylvain Jeaugey
2021-04-12 16:00:11 -07:00
commit a46ea10583
43 arquivos alterados com 2687 adições e 1244 exclusões
+75 -46
Ver Arquivo
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -14,7 +14,7 @@
/******************************************************************/
ncclResult_t ncclTopoPreset(struct ncclComm* comm,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
struct ncclTopoRanks* topoRanks) {
int rank = comm->rank;
int localRanks = comm->localRanks;
@@ -25,12 +25,15 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
channel->ring.prev = channel->ring.next = -1;
channel->tree.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
channel->collTree.up = -1;
for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;
channel->collTree.out = -1;
channel->collTree.headRank = -1;
channel->collTree.nHeads = 0;
channel->collTree.shift = 0;
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.up[i] = -1;
for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.down[i] = -1;
int* ringIntra = ringGraph->intra+c*localRanks;
int* treeIntra = treeGraph->intra+c*localRanks;
int* collNetIntra = collNetGraph->intra+c*localRanks;
for (int i=0; i<localRanks; i++) {
if (ringIntra[i] == rank) {
@@ -50,12 +53,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
channel->tree.up = i == 0 ? -1 : treeIntra[i-1];
channel->tree.down[0] = i == localRanks-1 ? -1 : treeIntra[i+1];
}
if (collNetIntra[i] == rank) {
int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
channel->collTree.up = collNetIntra[prev];
channel->collTree.down[0] = collNetIntra[next];
}
}
topoRanks->ringPrev[c] = channel->ring.prev;
topoRanks->ringNext[c] = channel->ring.next;
@@ -167,36 +164,53 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int*
return ncclSuccess;
}
ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) {
int nranks = comm->nRanks;
int depth = nranks/comm->nNodes;
int sendIndex = collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern
int sendEndIndex = (sendIndex+comm->localRanks-1)%comm->localRanks;
for (int c=0; c<comm->nChannels/2; c++) {
struct ncclChannel* channel = comm->channels+c;
// Set root of collTree to id nranks
if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
channel->collTree.up = nranks;
}
if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
channel->collTree.down[0] = -1;
}
channel->collTree.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) {
int rank = comm->rank;
int localRanks = comm->localRanks;
int nHeads = collNetGraph->nChannels;
int *heads;
NCCLCHECK(ncclCalloc(&heads, nHeads));
// Find all head ranks
// Head index is always 0
for (int c=0; c<nHeads; c++) {
int* collNetIntra = collNetGraph->intra+c*localRanks;
heads[c] = collNetIntra[0];
}
int recvIndex = 0; // recv GPU index is always 0
int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
for (int c=0; c<comm->nChannels/2; c++) {
struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
// Set root of collTree to id nranks
if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
channel->collTree.up = nranks;
// For all channels
for (int c=0; c<comm->nChannels; c++) {
struct ncclChannel* channel = comm->channels+c;
char line[1024];
sprintf(line, "CollNet channel %d rank %d ", c, rank);
int nDown = 0;
for (int i=0; i<nHeads; i++) {
if (rank == heads[i]) { // is head
channel->collTree.headRank = i; // Mark the index for deciding offset in the CUDA kernel
channel->collTree.out = comm->nRanks; // Set root of collTree to id nranks
int* collNetIntra = collNetGraph->intra+i*localRanks;
sprintf(line+strlen(line), "down ");
for (int r=0; r<localRanks; r++) {
if (collNetIntra[r] == rank) continue;
channel->collTree.down[nDown++] = collNetIntra[r]; // connect to all peers
sprintf(line+strlen(line), " %d ", collNetIntra[r]);
}
sprintf(line+strlen(line), "nDown %d ", nDown);
break;
}
}
if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
channel->collTree.down[0] = -1;
// Connect to all heads
int nUp = 0;
sprintf(line+strlen(line), "up ");
for (int h=0; h<nHeads; h++) {
if (rank == heads[h]) continue;
channel->collTree.up[nUp++] = heads[h];
sprintf(line+strlen(line), " %d ", heads[h]);
}
channel->collTree.depth = depth;
INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]);
channel->collTree.nHeads = nHeads;
channel->collTree.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
channel->collTree.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
INFO(NCCL_GRAPH, "%s", line);
}
return ncclSuccess;
}
@@ -231,7 +245,18 @@ int ncclMaxNchannels() {
return maxNchannels;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings) {
static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
int nranks = comm->nRanks;
int c;
for (c=start; c<end; c++) {
memcpy(ringPrev+c*nranks, ringPrev+(c-start)*nranks, nranks*sizeof(int));
memcpy(ringNext+c*nranks, ringNext+(c-start)*nranks, nranks*sizeof(int));
memcpy(comm->channels+c, comm->channels+c-start, sizeof(struct ncclChannel));
}
return c;
}
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph) {
// Gather data from all ranks
int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
int nranks = comm->nRanks;
@@ -266,16 +291,20 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
// Duplication should be complete now
nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);
// Setup CollNet
if (comm->collNetSupport == 1) {
// Add more channels to saturate intra-node bandwidth, except the 1 PPN case
if (collNetGraph->speedIntra > collNetGraph->speedInter && comm->nRanks > comm->nNodes) {
int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
}
NCCLCHECK(connectCollNet(comm, collNetGraph));
}
// Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
// We permit combining max, then min, to only use the first channels, then duplicate them.
nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
int c;
for (c=nChannels; c<ncclMinNchannels(); c++) {
memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int));
memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel));
}
nChannels = comm->nChannels = c;
nChannels = comm->nChannels = copyChannels(comm, nChannels, ncclMinNchannels(), ringPrev, ringNext);
// Create rings array and check all is fine
NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
+2 -3
Ver Arquivo
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -280,8 +280,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
else p2pLevel = PATH_PHB;
p2pLevel = PATH_PXB;
}
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
+64 -4
Ver Arquivo
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -393,9 +393,67 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
return ncclSuccess;
}
// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance.
ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) {
float* maxwidths;
int* minhops;
int netcount = 0;
NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count));
NCCLCHECK(ncclCalloc(&maxwidths, system->nodes[NET].count));
for (int n=0; n<system->nodes[NET].count; n++) {
maxwidths[n] = 0.0;
minhops[n] = 255;
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoLinkList* paths = net->paths[GPU];
for (int g=0; g<system->nodes[GPU].count; g++) {
if (paths[g].width > maxwidths[n] || (paths[g].width == maxwidths[n] && paths[g].count < minhops[n])) {
maxwidths[n] = paths[g].width;
minhops[n] = paths[g].count;
}
}
if (netcount && maxwidths[nets[0]] > maxwidths[n]) continue; // Do not keep NICs with lower BW
if (netcount && maxwidths[nets[0]] < maxwidths[n]) netcount = 0; // Remove all NICs with lower BW
int index;
for (index = 0; index < netcount; index++) {
if (minhops[n] < minhops[nets[index]]) break;
}
// Insert net at index
// Shift all nets with higher nhops
for (int i = netcount; i>index; i--) nets[i] = nets[i-1];
// Insert this net at index
nets[index] = n;
netcount++;
}
*netcountRet = netcount;
// Then shuffle NICs with the same nhops based on the GPU device number, so that when we have
// 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs.
for (int start = 0; start < netcount;) {
int end = start+1;
while (end < netcount && minhops[nets[end]] == minhops[nets[start]]) end++;
// Shuffle
for (int r=0; r<system->nodes[GPU].nodes[0].gpu.dev % (end-start); r++) {
int netStart = nets[start];
for (int i=start; i<end-1; i++) nets[i] = nets[i+1];
nets[end-1] = netStart;
}
start = end;
}
free(minhops);
free(maxwidths);
return ncclSuccess;
}
ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
const int speed = graph->speedInter;
for (int n=0; n<system->nodes[NET].count; n++) {
int* nets;
NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
int netcount;
NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount));
for (int i=0; i<netcount; i++) {
int n = nets[i];
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoNode* gpu;
if (graph->collNet && net->net.collSupport == 0) continue;
@@ -463,6 +521,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
}
}
}
free(nets);
return ncclSuccess;
}
@@ -705,6 +764,7 @@ search:
for (int g=0; g<ngpus; g++) {
printf("%d ", graph->intra[c*ngpus+g]);
}
printf("[%d %d]", graph->inter[0], graph->inter[1]);
printf("\n");
}
#endif
@@ -845,7 +905,7 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
return ncclSuccess;
}
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* dev) {
ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* dev) {
if (graph) {
// Honor the net device in the graph
int channel = channelId%graph->nChannels;
@@ -854,7 +914,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct n
*dev = graph->inter[channel*2+index];
} else {
int64_t id;
NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, channelId));
NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr));
*dev = id;
}
return ncclSuccess;
+73 -2
Ver Arquivo
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -172,6 +172,65 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
return ncclSuccess;
}
// BCM Gen4 Switches present themselves as a two-level hierarchical switch
// even though they're supposed to sustain full BW across all ports.
// Flatten the switch as this extra level can break the search and make
// NCCL take wrong topology decisions.
ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
for (int s=0; s<system->nodes[PCI].count; s++) {
struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
uint64_t device = pciSwitch->pci.device;
// Only flatten PEX Gen 4 switches in base mode
if ((device & 0xfffffffffffff000) == 0x1000c0101000a000) {
// Find sub switches with the same device ID.
int64_t* subSwIds;
NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
int subs = 0;
for (int l=0; l<pciSwitch->nlinks; l++) {
struct ncclTopoNode* sub = pciSwitch->links[l].remNode;
// Only fuse sub switches with the same device ID.
if (sub->type != PCI || sub->pci.device != device) continue;
// Save sub switch for later
subSwIds[subs++] = sub->id;
// Remove link to that sub switch
memmove(pciSwitch->links+l, pciSwitch->links+l+1, (pciSwitch->nlinks-l-1)*(sizeof(struct ncclTopoLink)));
pciSwitch->nlinks--;
// Don't increase l for the next iteration as we just shifted all links by one.
l--;
}
for (int s=0; s<subs; s++) {
// Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
int index;
NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index));
struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
// Connect all sub PCI devices to the parent switch
for (int l=0; l<sub->nlinks; l++) {
struct ncclTopoNode* remNode = sub->links[l].remNode;
if (remNode == pciSwitch) continue;
// Add link from parent PCI switch -> PCI device
memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
pciSwitch->nlinks++;
// Update link from PCI device -> parent PCI switch
for (int rl=0; rl<remNode->nlinks; rl++) {
if (remNode->links[rl].remNode == sub) {
remNode->links[rl].remNode = pciSwitch;
break;
}
}
}
NCCLCHECK(ncclTopoRemoveNode(system, PCI, index));
}
// Set subdevice to 0x0000 to make sure we don't merge this switch again.
pciSwitch->pci.device = 0x1000c01010000000;
free(subSwIds);
// Restart, as system->nodes[PCI].nodes has changed.
s = 0;
}
}
return ncclSuccess;
}
ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
// And connect all CPU nodes together
for (int n=0; n<system->nodes[CPU].count; n++) {
@@ -190,6 +249,8 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
} else if (node->type == CPU) {
sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
} else if (node->type == PCI) {
sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
} else {
sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
}
@@ -345,6 +406,15 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
} else if (type == PCI) {
NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 48;
NCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 32;
NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
if (str) node->pci.device += strtol(str, NULL, 0) << 16;
NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
if (str) node->pci.device += strtol(str, NULL, 0);
for (int s=0; s<xmlPci->nSubs; s++) {
struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
@@ -475,6 +545,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
}
NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
NCCLCHECK(ncclTopoSortSystem(*topoSystem));
@@ -602,7 +673,7 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_
}
if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
}
*id = nets[rr % count];
*id = nets[rr%count];
free(nets);
return ncclSuccess;
}
+5 -3
Ver Arquivo
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -27,8 +27,7 @@
// Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
// to GPU traffic consumes more PCI bandwidth.
#define INTEL_P2P(speed) (speed*9/12)
#define INTEL_P2P_OVERHEAD(speed) (speed*12/9)
#define INTEL_P2P_OVERHEAD(speed) (speed*6/5)
#define NCCL_TOPO_NODE_TYPES 7
#define GPU 0
@@ -105,6 +104,9 @@ struct ncclTopoNode {
int model;
cpu_set_t affinity;
}cpu;
struct {
uint64_t device;
}pci;
};
int nlinks;
struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
+6 -4
Ver Arquivo
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -79,8 +79,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
@@ -128,8 +130,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
if (a == NCCL_ALGO_COLLNET) busBw *= .9;
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128
if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0; // Oneshot CollNet only supports Simple
// Convert bus BW to algorithm BW
float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
@@ -233,6 +234,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
}
comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = 512;
// Override defaults with user env
char* str = getenv("NCCL_THREAD_THRESHOLDS");
+21 -1
Ver Arquivo
@@ -1,5 +1,5 @@
/*************************************************************************
* Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -469,6 +469,26 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"));
}
NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
if (index == -1) {
if (path == NULL) NCCLCHECK(getPciPath(busId, &path));