Revert "Changes to topology based on XGMI (#272)"

This reverts commit 0a9adc16f4. [ROCm/rccl commit: e055229e56]
2020-12-08 16:51:34 -05:00
@@ -30,84 +30,6 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
  return ncclInternalError;
 }

-// [RCCL]
-// This function traverses only XGMI links (including multi-GPU hops) and builds them into the
-// topology system, which corresponds to how XGMI hardware operates
-static ncclResult_t ncclTopoSetXgmi(struct ncclTopoSystem* system)
-{
-  // Compute paths to GPU g
-  for (int g=0; g<system->nodes[GPU].count; g++) {
-    struct ncclTopoNode *baseNode = system->nodes[GPU].nodes+g;
-
-    if (baseNode->paths[baseNode->type] == NULL) {
-      NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
-    }
-
-    // breadth-first search to set all paths to that node in the system
-    struct ncclTopoNodeList nodeList;
-    struct ncclTopoNodeList nextNodeList;
-    nodeList.count = 1; nodeList.list[0] = baseNode;
-    nextNodeList.count = 0;
-    struct ncclTopoLinkList* basePath;
-    NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
-    basePath->count = 0;
-    basePath->width = LOC_WIDTH;
-    basePath->type = PATH_LOC;
-
-    while (nodeList.count) {
-      nextNodeList.count = 0;
-      for (int n=0; n<nodeList.count; n++) {
-        struct ncclTopoNode* node = nodeList.list[n];
-        struct ncclTopoLinkList* path;
-        NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
-        for (int l=0; l<node->nlinks; l++) {
-          struct ncclTopoLink* link = node->links+l;
-          struct ncclTopoNode* remNode = link->remNode;
-
-          // Skip non-XGMI links
-          if (link->type != LINK_NVL) continue;
-
-          if (remNode->paths[baseNode->type] == NULL) {
-            NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
-          }
-
-          struct ncclTopoLinkList* remPath;
-          NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
-          float width = std::min(path->width, link->width);
-          if (remPath->width < width) {
-            // Find reverse link
-            for (int l=0; l<remNode->nlinks; l++) {
-              if (remNode->links[l].remNode == node) {
-                remPath->list[0] = remNode->links+l;
-                break;
-              }
-            }
-            if (remPath->list[0] == NULL) {
-              WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
-                   remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
-              return ncclInternalError;
-            }
-            // Copy the rest of the path
-            for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
-            remPath->count = path->count + 1;
-            remPath->width = width;
-            remPath->type = PATH_NVL;
-
-            // Add to the list for the next iteration if not already in the list
-            // In this case, permit GPUs are intermediate XGMI steps
-            for (int i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) continue;
-            nextNodeList.list[nextNodeList.count++] = remNode;
-          }
-        }
-      }
-      memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
-    }
-  }
-  return ncclSuccess;
-}
-// [/RCCL]
-
-
 static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
  if (baseNode->paths[baseNode->type] == NULL) {
    NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
@@ -139,14 +61,7 @@ static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclT
        struct ncclTopoLinkList* remPath;
        NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
        float width = std::min(path->width, link->width);
-
-        // [RCCL] Do not let XGMI paths be overwritten (even if PCIe path may be faster)
-        //        Unless they are of shorter length
-     // if (remPath->width < width) {
-        bool notXGMI = remPath->type != PATH_NVL;
-        if (remPath->width < width && notXGMI) {
-        // [/RCCL]
-
+        if (remPath->width < width) {
          // Find reverse link
          for (int l=0; l<remNode->nlinks; l++) {
            if (remNode->links[l].remNode == node) {
@@ -451,10 +366,6 @@ ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclPeer
    NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
  }

-  // [RCCL] Add XGMI-only links between GPUs first before any other paths
-  NCCLCHECK(ncclTopoSetXgmi(system));
-  // [/RCCL]
-
  // Set direct paths from/to GPUs.
  for (int g=0; g<system->nodes[GPU].count; g++) {
    // Compute paths to GPU g
@@ -159,9 +159,6 @@ struct ncclGpuScore {
  int intraNhops;
  int intraWidth;
  int interNhops;
-  // [RCCL]
-  int intraType;     // New sort parameter to favor XGMI
-  // [/RCCL]
  int interPciWidth;
  int interWidth;    // Most important
 };
@@ -172,9 +169,6 @@ static int cmpScore(const void * g1, const void * g2) {
   int d;
   if ((d = (s2->interWidth - s1->interWidth))) return d;
   if ((d = (s2->interPciWidth - s1->interPciWidth))) return d;
-   // [RCCL]
-   if ((d = (s1->intraType - s2->intraType))) return d;   // Prefer XGMI over any other types
-   // [/RCCL]
   if ((d = (s1->interNhops - s2->interNhops))) return d;
   if ((d = (s2->intraWidth - s1->intraWidth))) return d;
   if ((d = (s1->intraNhops - s2->intraNhops))) return d;
@@ -234,25 +228,11 @@ ncclResult_t ncclTopoSearchNextGpuSort(struct ncclTopoSystem* system, struct ncc
  for (int i=1; i<ngpus; i++) {
    int g = (start+i)%ngpus;
    if (paths[g].count == 0) continue; // There is no path to that GPU
-    // [RCCL] - Prune earlier for performance
-    {
-      if (paths[g].type > graph->typeIntra) continue;         // Skip if the intra path type is already slower than the current target
-      if (paths[g].width < graph->speedIntra) continue;
-      if (netPaths)
-      {
-        if (netPaths[g].type > graph->typeInter) continue;    // Skip if the inter path type is already slower than the current target
-        if (netPaths[g].width < graph->speedInter) continue;
-      }
-    }
-    // [/RCCL]
    if (system->nodes[GPU].nodes[g].used & flag) continue;
    scores[count].g = g;
    scores[count].startIndex = i;
    scores[count].intraNhops = paths[g].count;
    scores[count].intraWidth = paths[g].width;
-    // [RCCL] - Add path type as sort factor
-    scores[count].intraType = paths[g].type;
-    // [/RCCL]
    if (netPaths) {
      scores[count].interNhops = netPaths[g].count;
      scores[count].interPciWidth = gpuPciWidth(system->nodes[GPU].nodes+g);
@@ -313,6 +293,35 @@ ncclResult_t ncclTopoSearchTryGpu(struct ncclTopoSystem* system, struct ncclTopo
  return ncclSuccess;
 }

+static int ncclTopoCountXGMI(struct ncclTopoSystem* system, struct ncclTopoGraph* graph) { // Returns how many consecutive GPU pairs, over all channels of graph, are joined by a single LINK_NVL link
+  int ngpus = system->nodes[GPU].count;
+  int count = 0;
+  for (int c=0; c<graph->nChannels; c++) { // channel c occupies entries [ngpus*c, ngpus*c+ngpus) of graph->intra
+    for (int i=0; i<ngpus; i++) {
+      int g = graph->intra[ngpus*c+i]; // rank at position i of channel c
+      int n = graph->intra[ngpus*c+((i+1)%ngpus)]; // rank at the following position, wrapping back to position 0
+      struct ncclTopoNode *node;
+      int j;
+      for (j=0; j<ngpus; j++) // look up the GPU node whose rank is g
+        if (system->nodes[GPU].nodes[j].gpu.rank == g) break;
+      if (j<ngpus) { // j == ngpus means no GPU node has rank g; such a pair contributes nothing
+        node = system->nodes[GPU].nodes+j;
+        for (int k = 0; k<system->nodes[GPU].count; k++) {
+          if (node->paths[GPU][k].count == 1) { // only paths made of exactly one link are considered
+            struct ncclTopoLink* link = node->paths[GPU][k].list[0];
+            struct ncclTopoNode* remNode = link->remNode;
+            if (remNode->gpu.rank == n) { // that single link must end at the next rank in the channel
+              if (link->type == LINK_NVL) // NOTE(review): LINK_NVL presumably stands for XGMI in this code base - confirm
+                count ++;
+            }
+          }
+        }
+      }
+    }
+  }
+  return count;
+}
+
 ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, struct ncclTopoGraph* refGraph, int* copy) {
  // 1. Constraint to get the same nChannels between Rings and Trees
  if (graph->nChannels < graph->minChannels) return ncclSuccess;
@@ -326,6 +335,9 @@ ncclResult_t ncclTopoCompareGraphs(struct ncclTopoSystem* system, struct ncclTop
  // 3. Less hops (but not at the price of going cross NICs)
  if (graph->crossNic == refGraph->crossNic && graph->nHops < refGraph->nHops) *copy = 1;

+  // 4. Prefer graph with more XGMI connections
+  if (graph->nChannels == refGraph->nChannels
+    && ncclTopoCountXGMI(system, refGraph) < ncclTopoCountXGMI(system, graph)) *copy = 1;
  return ncclSuccess;
 }

@@ -451,9 +463,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo
          if (paths[i].count < paths[f].count) f = i;
        int t = 1 << 10;
        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, &t, NET, n, f));
-        // [RCCL] Event if forced order PCI is found, continue the search instead of ending early
-        // if (t == -1) *time = -1;
-        // [/RCCL]
+        if (t == -1) *time = -1;
      }

      // Then try the most local GPUs
@@ -536,14 +546,6 @@ ncclResult_t ncclTopoSearchRec(struct ncclTopoSystem* system, struct ncclTopoGra
    ncclTopoSearchRecNet(system, graph, saveGraph, backToNet, backToFirstRank, time);
  } else {
    // Intra-node only.
-    // [RCCL] - Instead of trying PCI ordering, or replaying, just go straight to searching
-    {
-      for (int g=0; g<system->nodes[GPU].count; g++) {
-        NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, 0, time, -1, -1, g));
-      }
-      return ncclSuccess;
-    }
-    // [/RCCL]
    if (graph->nChannels == 0) {
      // Try PCI order first
      NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, FORCED_ORDER_PCI, time, -1, -1, 0));
@@ -23,7 +23,7 @@
 #define P9_WIDTH 32.0
 #define ARM_WIDTH 6.0
 #define NET_WIDTH 12.0           // 100Gbit
-#define VEGA_XGMI_WIDTH 20.0
+#define VEGA_XGMI_WIDTH 24.0

 // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU
 // to GPU traffic consumes more PCI bandwidth.