From f5c0b243a8bdf249f4e9831ef8cd5cd61f288601 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Fri, 12 Aug 2022 09:52:29 -0700
Subject: [PATCH] Add XGMI sys type and clean up detection code (#597)

---
 src/graph/paths.cc  | 50 +++++++++++++++----------------
 src/graph/search.cc | 71 +++++++++++++++++++++++----------------------
 src/graph/topo.h    |  1 +
 3 files changed, 60 insertions(+), 62 deletions(-)

diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 762c6a639c..64fbba59a8 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -653,34 +653,30 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
   NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
   if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD
     && model == NCCL_TOPO_CPU_TYPE_ROME) {
-    int gdr, ret = 1;
-    int net;
-    for (int g = 0; g < system->nodes[GPU].count; g++) {
-      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
-      NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
-      if (!gdr) {
-        ret = 0;
-        break;
+    int gdr = 1;
+    bool allXgmi = true;
+    // detect if all GPUs are connected by XGMI
+    for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
+      int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
+      for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
+        if (i == j) continue;
+        int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
+        bool isXGMI;
+        NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
+        allXgmi &= isXGMI;
       }
     }
-    if (ret) {
-      bool allXgmi = true;
-      // don't trim NICs unless all GPUs are connected by XGMI
-      for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
-        int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
-        for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
-          if (i == j) continue;
-          int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
-          bool isXGMI;
-          NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
-          allXgmi &= isXGMI;
-        }
-      }
-      if (!allXgmi) {
-        remove = 0;
-        system->type |= RCCL_TOPO_GDR_ALL;
-        INFO(NCCL_GRAPH, "GDR is available on all GPUs");
-      }
+    if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
+    for (int g = 0; g < system->nodes[GPU].count; g++) {
+      int net;
+      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
+      NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
+      if (!gdr) break;
+    }
+    if (gdr && !allXgmi) {
+      remove = 0;
+      system->type |= RCCL_TOPO_GDR_ALL;
+      INFO(NCCL_GRAPH, "GDR is available on all GPUs");
     }
   }
 
@@ -755,7 +751,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
     }
   }
 
-  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && (comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)) {
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) {
     // Adjust P2P channels on Rome
     comm->p2pnChannelsPerPeer = 2;
     comm->p2pnChannels = 2;
diff --git a/src/graph/search.cc b/src/graph/search.cc
index edbafc0dc1..4b5e5ff66f 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -1171,7 +1171,6 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str
 ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int *inter) {
   int interGpus[MAX_XGMI_INTER_GPUS+1];
   int ngpus = system->nodes[GPU].count;
-  *isXGMI = false;
   // check for direct XGMI connection
   for (int i=0; i<ngpus; i++) {
     if (system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
@@ -1188,42 +1187,44 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in
       }
     }
   }
-  if (maxInter == 0) return ncclSuccess;
-  // check if there are intermediate GPUs that are connected to both
-  bool res1, res2, res3;
-  int j;
-  for (j=0; j<nInter; j++) {
-    bool res1;
-    ncclTopoGetLinkType(system, inter[j], inter[j+1], &res1, 0);
-    if (!res1) break;
-  }
-  if (j<nInter) return ncclSuccess;
-  if (nInter > 0 && inter != nullptr) {
-    ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
-    if (res2) {
-      *isXGMI = true;
-      return ncclSuccess;
+  // try intermediate GPUs
+  if (maxInter) {
+    // check if there are intermediate GPUs that are connected to both
+    bool res1, res2, res3;
+    int j;
+    for (j=0; j<nInter; j++) {
+      ncclTopoGetLinkType(system, inter[j], inter[j+1], &res1, 0);
+      if (!res1) break;
     }
-    memcpy(interGpus+1, inter+1, sizeof(int)*nInter);
-  }
-  interGpus[0] = cudaDev1;
-  // add one more intermediate GPU recursively util reaching max depth
-  nInter++;
-  if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess;
-  for (int i=0; i<ngpus; i++) {
-    int dev = system->nodes[GPU].nodes[i].gpu.dev;
-    // skip duplicated GPU
-    if (dev == cudaDev2) continue;
-    for (j=0; j<nInter; j++)
-      if (dev == interGpus[j]) break;
-    if (j<nInter) continue;
-    // check connectivity with intermediate GPUs
-    interGpus[nInter] = dev;
-    ncclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
-    if (res3) {
-      *isXGMI = true;
-      return ncclSuccess;
+    if (j<nInter) return ncclSuccess;
+    if (nInter > 0 && inter != nullptr) {
+      ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
+      if (res2) {
+        *isXGMI = true;
+        return ncclSuccess;
+      }
+      memcpy(interGpus+1, inter+1, sizeof(int)*nInter);
+    }
+    interGpus[0] = cudaDev1;
+    // add one more intermediate GPU recursively until reaching max depth
+    nInter++;
+    if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess;
+    for (int i=0; i<ngpus; i++) {
+      int dev = system->nodes[GPU].nodes[i].gpu.dev;
+      // skip duplicated GPU
+      if (dev == cudaDev2) continue;
+      for (j=0; j<nInter; j++)
+        if (dev == interGpus[j]) break;
+      if (j<nInter) continue;
+      // check connectivity with intermediate GPUs
+      interGpus[nInter] = dev;
+      ncclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
+      if (res3) {
+        *isXGMI = true;
+        return ncclSuccess;
+      }
     }
   }
+  *isXGMI = false;
   return ncclSuccess;
 }
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 94de37abf4..d088cee3f7 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -104,6 +104,7 @@ struct ncclTopoLinkList {
 #define RCCL_TOPO_GDR_ALL   4
 #define RCCL_TOPO_16P1H     8
 #define RCCL_TOPO_FORCE_INTRA 16
+#define RCCL_TOPO_XGMI_ALL  32
 
 #define RCCL_TOPO_MAX_RANKS_PER_GPU 8
 struct ncclTopoNode {