2.9.6-1

Add support for CUDA graphs. Fuse BCM Gen4 switches to avoid suboptimal performance on some platforms. Issue #439. Fix bootstrap issue caused by connection reordering. Fix CPU locking block. Improve CollNet algorithm. Improve performance on DGX A100 for communicators with only one GPU per node.
2021-04-12 16:00:11 -07:00
commit a46ea10583
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -14,7 +14,7 @@
 /******************************************************************/

 ncclResult_t ncclTopoPreset(struct ncclComm* comm,
-    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph,
+    struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph,
    struct ncclTopoRanks* topoRanks) {
  int rank = comm->rank;
  int localRanks = comm->localRanks;
@@ -25,12 +25,15 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
    channel->ring.prev = channel->ring.next = -1;
    channel->tree.up = -1;
    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->tree.down[i] = -1;
-    channel->collTree.up = -1;
-    for (int i=0; i<NCCL_MAX_TREE_ARITY; i++) channel->collTree.down[i] = -1;
+    channel->collTree.out = -1;
+    channel->collTree.headRank = -1;
+    channel->collTree.nHeads = 0;
+    channel->collTree.shift = 0;
+    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.up[i] = -1;
+    for (int i=0; i<NCCL_MAX_DIRECT_ARITY; i++) channel->collTree.down[i] = -1;

    int* ringIntra = ringGraph->intra+c*localRanks;
    int* treeIntra = treeGraph->intra+c*localRanks;
-    int* collNetIntra = collNetGraph->intra+c*localRanks;

    for (int i=0; i<localRanks; i++) {
      if (ringIntra[i] == rank) {
@@ -50,12 +53,6 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm,
        channel->tree.up         = i == 0 ? -1 : treeIntra[i-1];
        channel->tree.down[0]    = i == localRanks-1 ? -1 : treeIntra[i+1];
      }
-      if (collNetIntra[i] == rank) {
-        int prev = (i-1+localRanks)%localRanks, next = (i+1)%localRanks;
-
-        channel->collTree.up      = collNetIntra[prev];
-        channel->collTree.down[0] = collNetIntra[next];
-      }
    }
    topoRanks->ringPrev[c] = channel->ring.prev;
    topoRanks->ringNext[c] = channel->ring.next;
@@ -167,36 +164,53 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int*
  return ncclSuccess;
 }

-ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) {
-  int nranks = comm->nRanks;
-  int depth = nranks/comm->nNodes;
-  int sendIndex = collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1;  // send GPU index depends on topo pattern
-  int sendEndIndex = (sendIndex+comm->localRanks-1)%comm->localRanks;
-  for (int c=0; c<comm->nChannels/2; c++) {
-    struct ncclChannel* channel = comm->channels+c;
-    // Set root of collTree to id nranks
-    if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master
-      channel->collTree.up = nranks;
-    }
-    if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
-      channel->collTree.down[0] = -1;
-    }
-    channel->collTree.depth = depth;
-    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]);
+// Set up the CollNet connections of every channel of this communicator.
+// The first rank of each collNetGraph channel's intra-node list is a "head".
+// On every channel:
+//  - a rank that is a head records its head index in headRank, uses
+//    comm->nRanks as its `out` peer and lists every other rank of its
+//    intra-node list as a `down` peer;
+//  - every rank lists all heads other than itself as `up` peers.
+// nHeads, shift and depth are filled in as well and the resulting layout is
+// logged at NCCL_GRAPH level.
+// Returns ncclSuccess, or the error from allocating the temporary head list.
+static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) {
+  int rank = comm->rank;
+  int localRanks = comm->localRanks;
+  int nHeads = collNetGraph->nChannels;
+  int *heads;
+  NCCLCHECK(ncclCalloc(&heads, nHeads));
+  // Find all head ranks
+  // Head index is always 0
+  for (int c=0; c<nHeads; c++) {
+    int* collNetIntra = collNetGraph->intra+c*localRanks;
+    heads[c] = collNetIntra[0];
  }
-  int recvIndex = 0;  // recv GPU index is always 0
-  int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks;
-  for (int c=0; c<comm->nChannels/2; c++) {
-    struct ncclChannel* channel = comm->channels+comm->nChannels/2+c;
-    // Set root of collTree to id nranks
-    if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master
-      channel->collTree.up = nranks;
+  // For all channels
+  for (int c=0; c<comm->nChannels; c++) {
+    struct ncclChannel* channel = comm->channels+c;
+    char line[1024];
+    sprintf(line, "CollNet channel %d rank %d ", c, rank);
+    int nDown = 0;
+    for (int i=0; i<nHeads; i++) {
+      if (rank == heads[i]) { // is head
+        channel->collTree.headRank = i; // Mark the index for deciding offset in the CUDA kernel
+        channel->collTree.out = comm->nRanks; // Set root of collTree to id nranks
+        int* collNetIntra = collNetGraph->intra+i*localRanks;
+        sprintf(line+strlen(line), "down ");
+        for (int r=0; r<localRanks; r++) {
+          if (collNetIntra[r] == rank) continue;
+          channel->collTree.down[nDown++] = collNetIntra[r];  // connect to all peers
+          sprintf(line+strlen(line), " %d ", collNetIntra[r]);
+        }
+        sprintf(line+strlen(line), "nDown %d ", nDown);
+        break;
+      }
    }
-    if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain
-      channel->collTree.down[0] = -1;
+    // Connect to all heads
+    int nUp = 0;
+    sprintf(line+strlen(line), "up ");
+    for (int h=0; h<nHeads; h++) {
+      if (rank == heads[h]) continue;
+      channel->collTree.up[nUp++] = heads[h];
+      sprintf(line+strlen(line), " %d ", heads[h]);
    }
-    channel->collTree.depth = depth;
-    INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->nChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]);
+    channel->collTree.nHeads = nHeads;
+    channel->collTree.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously
+    channel->collTree.depth = (nUp == 0 && nDown == 0) ? 1 : 2;
+    sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads);
+    sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift);
+    INFO(NCCL_GRAPH, "%s", line);
  }
+  // The head list is only needed while wiring the channels; release it so
+  // that it does not leak on every call.
+  free(heads);
  return ncclSuccess;
 }
@@ -231,7 +245,18 @@ int ncclMaxNchannels() {
  return maxNchannels;
 }

-ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings) {
+// Replicate already-configured channels into slots [start, end).
+// Slot c receives a copy of slot c-start: its ringPrev and ringNext rows
+// (comm->nRanks ints each) and its ncclChannel struct.
+// Returns the first slot index that was not written: end when start < end,
+// start otherwise.
+static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) {
+  const int nranks = comm->nRanks;
+  int dst = start;
+  while (dst < end) {
+    const int src = dst-start;
+    memcpy(&ringPrev[dst*nranks], &ringPrev[src*nranks], nranks*sizeof(int));
+    memcpy(&ringNext[dst*nranks], &ringNext[src*nranks], nranks*sizeof(int));
+    memcpy(&comm->channels[dst], &comm->channels[src], sizeof(struct ncclChannel));
+    dst++;
+  }
+  return dst;
+}
+
+ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph) {
  // Gather data from all ranks
  int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1;
  int nranks = comm->nRanks;
@@ -266,16 +291,20 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa
  // Duplication should be complete now
  nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);

+  // Setup CollNet
+  if (comm->collNetSupport == 1) {
+    // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
+    if (collNetGraph->speedIntra > collNetGraph->speedInter && comm->nRanks > comm->nNodes) {
+      int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
+      nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
+    }
+    NCCLCHECK(connectCollNet(comm, collNetGraph));
+  }
+
  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
  // We permit combining max, then min, to only use the first channels, then duplicate them.
  nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels);
-  int c;
-  for (c=nChannels; c<ncclMinNchannels(); c++) {
-    memcpy(ringPrev+c*nranks, ringPrev+(c-nChannels)*nranks, nranks*sizeof(int));
-    memcpy(ringNext+c*nranks, ringNext+(c-nChannels)*nranks, nranks*sizeof(int));
-    memcpy(comm->channels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel));
-  }
-  nChannels = comm->nChannels = c;
+  nChannels = comm->nChannels = copyChannels(comm, nChannels, ncclMinNchannels(), ringPrev, ringNext);

  // Create rings array and check all is fine
  NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -280,8 +280,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
  if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB;
  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
-    if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
-    else p2pLevel = PATH_PHB;
+    p2pLevel = PATH_PXB;
  }
  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
    p2pLevel = PATH_PXB;
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -393,9 +393,67 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo
  return ncclSuccess;
 }

+// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance.
+// For each NET node, the best GPU path is the widest one, with the lowest hop
+// count as tie-breaker. Only the NET nodes whose best width equals the largest
+// best width are kept; their indexes are written to nets[] ordered by
+// increasing hop count (at most system->nodes[NET].count entries) and the
+// number kept is stored in *netcountRet.
+ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) {
+  int* minhops;
+  float* maxwidths;
+  NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count));
+  NCCLCHECK(ncclCalloc(&maxwidths, system->nodes[NET].count));
+
+  int kept = 0;
+  for (int n=0; n<system->nodes[NET].count; n++) {
+    // Evaluate the best path from this NIC to any GPU.
+    struct ncclTopoLinkList* gpuPaths = system->nodes[NET].nodes[n].paths[GPU];
+    float bestWidth = 0.0;
+    int bestHops = 255;
+    for (int g=0; g<system->nodes[GPU].count; g++) {
+      const bool wider = gpuPaths[g].width > bestWidth;
+      const bool sameButCloser = gpuPaths[g].width == bestWidth && gpuPaths[g].count < bestHops;
+      if (wider || sameButCloser) {
+        bestWidth = gpuPaths[g].width;
+        bestHops = gpuPaths[g].count;
+      }
+    }
+    maxwidths[n] = bestWidth;
+    minhops[n] = bestHops;
+
+    if (kept) {
+      if (maxwidths[nets[0]] > bestWidth) continue; // Do not keep NICs with lower BW
+      if (maxwidths[nets[0]] < bestWidth) kept = 0; // Remove all NICs with lower BW
+    }
+
+    // Insertion point: just before the first kept NIC with a strictly higher hop count.
+    int pos = 0;
+    while (pos < kept && minhops[nets[pos]] <= bestHops) pos++;
+    // Move the tail one slot to the right, then drop this NIC in place.
+    int i = kept;
+    while (i > pos) {
+      nets[i] = nets[i-1];
+      i--;
+    }
+    nets[pos] = n;
+    kept++;
+  }
+
+  *netcountRet = kept;
+
+  // Then shuffle NICs with the same nhops based on the GPU device number, so that when we have
+  // 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs.
+  int start = 0;
+  while (start < kept) {
+    // [start, end) is a run of NICs sharing the same hop count.
+    int end = start+1;
+    while (end < kept && minhops[nets[end]] == minhops[nets[start]]) end++;
+    // Rotate the run left, one position per iteration.
+    const int rotations = system->nodes[GPU].nodes[0].gpu.dev % (end-start);
+    for (int r=0; r<rotations; r++) {
+      const int first = nets[start];
+      for (int k=start+1; k<end; k++) nets[k-1] = nets[k];
+      nets[end-1] = first;
+    }
+    start = end;
+  }
+
+  free(minhops);
+  free(maxwidths);
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* saveGraph, int backToNet, int backToFirstRank, int* time) {
  const int speed = graph->speedInter;
-  for (int n=0; n<system->nodes[NET].count; n++) {
+  int* nets;
+  NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count));
+  int netcount;
+  NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount));
+  for (int i=0; i<netcount; i++) {
+    int n = nets[i];
    struct ncclTopoNode* net = system->nodes[NET].nodes+n;
    struct ncclTopoNode* gpu;
    if (graph->collNet && net->net.collSupport == 0) continue;
@@ -463,6 +521,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
      }
    }
  }
+  free(nets);
  return ncclSuccess;
 }

@@ -705,6 +764,7 @@ search:
    for (int g=0; g<ngpus; g++) {
      printf("%d ", graph->intra[c*ngpus+g]);
    }
+    printf("[%d %d]", graph->inter[0], graph->inter[1]);
    printf("\n");
  }
 #endif
@@ -845,7 +905,7 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru
  return ncclSuccess;
 }

-ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* dev) {
+ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* dev) {
  if (graph) {
    // Honor the net device in the graph
    int channel = channelId%graph->nChannels;
@@ -854,7 +914,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct n
    *dev = graph->inter[channel*2+index];
  } else {
    int64_t id;
-    NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, channelId));
+    NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr));
    *dev = id;
  }
  return ncclSuccess;
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -172,6 +172,65 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode
  return ncclSuccess;
 }

+// BCM Gen4 Switches present themselves as a two-level hierarchical switch
+// even though they're supposed to sustain full BW across all ports.
+// Flatten the switch as this extra level can break the search and make
+// NCCL take wrong topology decisions.
+//
+// For every PCI node whose device ID matches the PEX Gen4 base-mode pattern,
+// each directly linked PCI node carrying the same device ID is fused into it:
+// the sub switch's links are appended to the parent, the peers' back-links
+// are redirected to the parent, and the sub switch node is removed.
+// Returns ncclSuccess, the first error reported by a helper, or
+// ncclInternalError when the parent has no free link slot left. The scratch
+// list of sub switch IDs is released on every exit path.
+ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) {
+  for (int s=0; s<system->nodes[PCI].count; s++) {
+    struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s;
+    uint64_t device = pciSwitch->pci.device;
+    // Only flatten PEX Gen 4 switches in base mode
+    if ((device & 0xfffffffffffff000) != 0x1000c0101000a000) continue;
+
+    // Find sub switches with the same device ID.
+    int64_t* subSwIds;
+    NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks));
+    int subs = 0;
+    for (int l=0; l<pciSwitch->nlinks; l++) {
+      struct ncclTopoNode* sub = pciSwitch->links[l].remNode;
+      // Only fuse sub switches with the same device ID.
+      if (sub->type != PCI || sub->pci.device != device) continue;
+      // Save sub switch for later
+      subSwIds[subs++] = sub->id;
+      // Remove link to that sub switch
+      memmove(pciSwitch->links+l, pciSwitch->links+l+1, (pciSwitch->nlinks-l-1)*(sizeof(struct ncclTopoLink)));
+      pciSwitch->nlinks--;
+      // Don't increase l for the next iteration as we just shifted all links by one.
+      l--;
+    }
+
+    // Distinct index name: the outer loop variable `s` must stay visible for the restart below.
+    for (int i=0; i<subs; i++) {
+      // Find sub switch (system->nodes[PCI].nodes is changing every time we remove a node)
+      int index;
+      ncclResult_t ret = ncclTopoIdToIndex(system, PCI, subSwIds[i], &index);
+      if (ret != ncclSuccess) {
+        free(subSwIds);
+        return ret;
+      }
+      struct ncclTopoNode* sub = system->nodes[PCI].nodes+index;
+      // Connect all sub PCI devices to the parent switch
+      for (int l=0; l<sub->nlinks; l++) {
+        struct ncclTopoNode* remNode = sub->links[l].remNode;
+        if (remNode == pciSwitch) continue;
+        // links[] holds NCCL_TOPO_MAX_LINKS entries; refuse to write past the end.
+        if (pciSwitch->nlinks == NCCL_TOPO_MAX_LINKS) {
+          WARN("Error : too many Topo links (max %d)", NCCL_TOPO_MAX_LINKS);
+          free(subSwIds);
+          return ncclInternalError;
+        }
+        // Add link from parent PCI switch -> PCI device
+        memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink));
+        pciSwitch->nlinks++;
+        // Update link from PCI device -> parent PCI switch
+        for (int rl=0; rl<remNode->nlinks; rl++) {
+          if (remNode->links[rl].remNode == sub) {
+            remNode->links[rl].remNode = pciSwitch;
+            break;
+          }
+        }
+      }
+      ret = ncclTopoRemoveNode(system, PCI, index);
+      if (ret != ncclSuccess) {
+        free(subSwIds);
+        return ret;
+      }
+    }
+    // Set subdevice to 0x0000 to make sure we don't merge this switch again.
+    pciSwitch->pci.device = 0x1000c01010000000;
+    free(subSwIds);
+    // Restart, as system->nodes[PCI].nodes has changed.
+    s = 0;
+  }
+  return ncclSuccess;
+}
+
 ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) {
  // And connect all CPU nodes together
  for (int n=0; n<system->nodes[CPU].count; n++) {
@@ -190,6 +249,8 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN
    sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank);
  } else if (node->type == CPU) {
    sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model);
+  } else if (node->type == PCI) {
+    sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device);
  } else {
    sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id);
  }
@@ -345,6 +406,15 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
    NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
  } else if (type == PCI) {
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
+    NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str));
+    if (str) node->pci.device += strtol(str, NULL, 0) << 48;
+    NCCLCHECK(xmlGetAttr(xmlPci, "device", &str));
+    if (str) node->pci.device += strtol(str, NULL, 0) << 32;
+    NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str));
+    if (str) node->pci.device += strtol(str, NULL, 0) << 16;
+    NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str));
+    if (str) node->pci.device += strtol(str, NULL, 0);
+
    for (int s=0; s<xmlPci->nSubs; s++) {
      struct ncclXmlNode* xmlSubPci = xmlPci->subs[s];
      NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node));
@@ -475,6 +545,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
  }
  NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));

+  NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem));
  NCCLCHECK(ncclTopoConnectCpus(*topoSystem));
  NCCLCHECK(ncclTopoSortSystem(*topoSystem));

@@ -602,7 +673,7 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_
    }
    if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id;
  }
-  *id = nets[rr % count];
+  *id = nets[rr%count];
  free(nets);
  return ncclSuccess;
 }
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -27,8 +27,7 @@

 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
 // to GPU traffic consumes more PCI bandwidth.
-#define INTEL_P2P(speed) (speed*9/12)
-#define INTEL_P2P_OVERHEAD(speed) (speed*12/9)
+#define INTEL_P2P_OVERHEAD(speed) (speed*6/5)

 #define NCCL_TOPO_NODE_TYPES 7
 #define GPU 0
@@ -105,6 +104,9 @@ struct ncclTopoNode {
      int model;
      cpu_set_t affinity;
    }cpu;
+    struct {
+      uint64_t device;
+    }pci;
  };
  int nlinks;
  struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS];
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -79,8 +79,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
  int simpleDefaultThreads = (ringGraph->speedIntra*ringGraph->nChannels <= PCI_WIDTH) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads);
-  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
+  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] =
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
+  comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] =
+    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS);
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] =
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS);
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL128] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL128] =
@@ -128,8 +130,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL) busBw = std::min(busBw*1.0/3.8, llMaxBw);
        if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels);
        if (a == NCCL_ALGO_COLLNET) busBw *= .9;
-        if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/6.0; // Take into account that GDR read is disabled on both sides
-        if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0;  // CollNet does not support LL128
+        if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0;  // Oneshot CollNet only supports Simple

        // Convert bus BW to algorithm BW
        float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps;
@@ -233,6 +234,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD;
  }
  comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks;
+  comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = 512;

  // Override defaults with user env
  char* str = getenv("NCCL_THREAD_THRESHOLDS");
@@ -1,5 +1,5 @@
 /*************************************************************************
- * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
 *
 * See LICENSE.txt for license information
 ************************************************************************/
@@ -469,6 +469,26 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml*
    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class"));
  }
+  NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index));
+  if (index == -1) {
+    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor"));
+  }
+  NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index));
+  if (index == -1) {
+    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device"));
+  }
+  NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index));
+  if (index == -1) {
+    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor"));
+  }
+  NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index));
+  if (index == -1) {
+    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));
+    NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device"));
+  }
  NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index));
  if (index == -1) {
    if (path == NULL) NCCLCHECK(getPciPath(busId, &path));