From b250c01cbe06af0935461fecfa052c9a7d28c9ef Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Wed, 6 Jul 2022 07:58:41 -0700
Subject: [PATCH] Use nontemporal in slow path and add XGMI sys type (#575)

* Use nontemporal in slow path and add XGMI sys type

* Clean up XGMI detection
---
 src/collectives/device/common_kernel.h | 12 ++---
 src/graph/paths.cc                     | 50 +++++++++---------
 src/graph/search.cc                    | 71 +++++++++++++-------------
 src/graph/topo.h                       |  1 +
 4 files changed, 66 insertions(+), 68 deletions(-)
diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h
index d6fa08f186..e117f70561 100644
--- a/src/collectives/device/common_kernel.h
+++ b/src/collectives/device/common_kernel.h
@@ -425,12 +425,12 @@ struct MULTI<FUNC, int64_t> {
 
 template<typename T> inline __device__
 T vFetch(const volatile T* ptr) {
-  return *ptr;
+  return __builtin_nontemporal_load(ptr);
 }
 
 template<typename T> inline __device__
 void vStore(volatile T* ptr, const T val) {
-  *ptr = val;
+  __builtin_nontemporal_store(val, ptr);
 }
 
 #if CUDART_VERSION < 9000 && !(defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__))
@@ -449,25 +449,25 @@ void vStore<half>(volatile half* ptr, const half val) {
 template<> inline __device__
 half vFetch<half>(const volatile half* ptr) {
   half r;
-  r = ((half*)ptr)[0];
+  r = __builtin_nontemporal_load((uint16_t*)ptr);
   return r;
 }
 
 template<> inline __device__
 void vStore<half>(volatile half* ptr, const half val) {
-  ((half*)ptr)[0] = val;
+  __builtin_nontemporal_store(val, (uint16_t*)ptr);
 }
 
 template<> inline __device__
 rccl_bfloat16 vFetch<rccl_bfloat16>(const volatile rccl_bfloat16* ptr) {
   rccl_bfloat16 r;
-  r.data = ptr->data;
+  r.data = __builtin_nontemporal_load(&ptr->data);
   return r;
 }
 
 template<> inline __device__
 void vStore<rccl_bfloat16>(volatile rccl_bfloat16* ptr, const rccl_bfloat16 val) {
-  ptr->data = val.data;
+  __builtin_nontemporal_store(val.data, &ptr->data);
 }
 #endif
 
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 762c6a639c..64fbba59a8 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -653,34 +653,30 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm*
   NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
   if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD
     && model == NCCL_TOPO_CPU_TYPE_ROME) {
-    int gdr, ret = 1;
-    int net;
-    for (int g = 0; g < system->nodes[GPU].count; g++) {
-      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
-      NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
-      if (!gdr) {
-        ret = 0;
-        break;
+    int gdr = 1;
+    bool allXgmi = true;
+    // detect if all GPUs are connected by XGMI
+    for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
+      int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
+      for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
+        if (i == j) continue;
+        int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
+        bool isXGMI;
+        NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
+        allXgmi &= isXGMI;
       }
     }
-    if (ret) {
-      bool allXgmi = true;
-      // don't trim NICs unless all GPUs are connected by XGMI
-      for (int i = 0; i < system->nodes[GPU].count && allXgmi; i++) {
-        int cudaDev1 = system->nodes[GPU].nodes[i].gpu.dev;
-        for (int j = 0; j < system->nodes[GPU].count && allXgmi; j++) {
-          if (i == j) continue;
-          int cudaDev2 = system->nodes[GPU].nodes[j].gpu.dev;
-          bool isXGMI;
-          NCCLCHECK(ncclTopoGetLinkType(comm->topo, cudaDev1, cudaDev2, &isXGMI));
-          allXgmi &= isXGMI;
-        }
-      }
-      if (!allXgmi) {
-        remove = 0;
-        system->type |= RCCL_TOPO_GDR_ALL;
-        INFO(NCCL_GRAPH, "GDR is available on all GPUs");
-      }
+    if (allXgmi) system->type |= RCCL_TOPO_XGMI_ALL;
+    for (int g = 0; g < system->nodes[GPU].count; g++) {
+      int net;
+      NCCLCHECK(ncclTopoGetLocalNet(system, system->nodes[GPU].nodes[g].gpu.rank[0], &net));
+      NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, net, 1, &gdr));
+      if (!gdr) break;
+    }
+    if (gdr && !allXgmi) {
+      remove = 0;
+      system->type |= RCCL_TOPO_GDR_ALL;
+      INFO(NCCL_GRAPH, "GDR is available on all GPUs");
     }
   }
 
@@ -755,7 +751,7 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
     }
   }
 
-  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && (comm->topo->nodes[GPU].nodes[0].gpu.gcn != 910)) {
+  if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_4P2H_ROME) && !(comm->topo->type & RCCL_TOPO_GDR_ALL) && !(comm->topo->type & RCCL_TOPO_XGMI_ALL)) {
     // Adjust P2P channels on Rome
     comm->p2pnChannelsPerPeer = 2;
     comm->p2pnChannels = 2;
diff --git a/src/graph/search.cc b/src/graph/search.cc
index edbafc0dc1..4b5e5ff66f 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -1171,7 +1171,6 @@ ncclResult_t ncclTopoGetIntraNetDev(struct ncclTopoSystem* system, int rank, str
 ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, int cudaDev2, bool* isXGMI, int maxInter, int nInter, int *inter) {
   int interGpus[MAX_XGMI_INTER_GPUS+1];
   int ngpus = system->nodes[GPU].count;
-  *isXGMI = false;
   // check for direct XGMI connection
   for (int i=0; i<ngpus; i++) {
     if (system->nodes[GPU].nodes[i].gpu.dev == cudaDev1) {
@@ -1188,42 +1187,44 @@ ncclResult_t ncclTopoGetLinkType(struct ncclTopoSystem* system, int cudaDev1, in
       }
     }
   }
-  if (maxInter == 0) return ncclSuccess;
-  // check if there are intermediate GPUs that are connected to both
-  bool res1, res2, res3;
-  int j;
-  for (j=0; j<nInter; j++) {
-    bool res1;
-    ncclTopoGetLinkType(system, inter[j], inter[j+1], &res1, 0);
-    if (!res1) break;
-  }
-  if (j<nInter) return ncclSuccess;
-  if (nInter > 0 && inter != nullptr) {
-    ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
-    if (res2) {
-      *isXGMI = true;
-      return ncclSuccess;
+  // try intermediate GPUs
+  if (maxInter) {
+    // check if there are intermediate GPUs that are connected to both
+    bool res1, res2, res3;
+    int j;
+    for (j=0; j<nInter; j++) {
+      ncclTopoGetLinkType(system, inter[j], inter[j+1], &res1, 0);
+      if (!res1) break;
     }
-    memcpy(interGpus+1, inter+1, sizeof(int)*nInter);
-  }
-  interGpus[0] = cudaDev1;
-  // add one more intermediate GPU recursively util reaching max depth
-  nInter++;
-  if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess;
-  for (int i=0; i<ngpus; i++) {
-    int dev = system->nodes[GPU].nodes[i].gpu.dev;
-    // skip duplicated GPU
-    if (dev == cudaDev2) continue;
-    for (j=0; j<nInter; j++)
-      if (dev == interGpus[j]) break;
-    if (j<nInter) continue;
-    // check connectivity with intermediate GPUs
-    interGpus[nInter] = dev;
-    ncclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
-    if (res3) {
-      *isXGMI = true;
-      return ncclSuccess;
+    if (j<nInter) return ncclSuccess;
+    if (nInter > 0 && inter != nullptr) {
+      ncclTopoGetLinkType(system, inter[nInter], cudaDev2, &res2, 0);
+      if (res2) {
+        *isXGMI = true;
+        return ncclSuccess;
+      }
+      memcpy(interGpus+1, inter+1, sizeof(int)*nInter);
+    }
+    interGpus[0] = cudaDev1;
+    // add one more intermediate GPU recursively util reaching max depth
+    nInter++;
+    if (nInter+2 > ngpus || nInter > MAX_XGMI_INTER_GPUS || nInter > maxInter) return ncclSuccess;
+    for (int i=0; i<ngpus; i++) {
+      int dev = system->nodes[GPU].nodes[i].gpu.dev;
+      // skip duplicated GPU
+      if (dev == cudaDev2) continue;
+      for (j=0; j<nInter; j++)
+        if (dev == interGpus[j]) break;
+      if (j<nInter) continue;
+      // check connectivity with intermediate GPUs
+      interGpus[nInter] = dev;
+      ncclTopoGetLinkType(system, cudaDev1, cudaDev2, &res3, maxInter, nInter, interGpus);
+      if (res3) {
+        *isXGMI = true;
+        return ncclSuccess;
+      }
     }
   }
+  *isXGMI = false;
   return ncclSuccess;
 }
diff --git a/src/graph/topo.h b/src/graph/topo.h
index 94de37abf4..d088cee3f7 100644
--- a/src/graph/topo.h
+++ b/src/graph/topo.h
@@ -104,6 +104,7 @@ struct ncclTopoLinkList {
 #define RCCL_TOPO_GDR_ALL   4
 #define RCCL_TOPO_16P1H     8
 #define RCCL_TOPO_FORCE_INTRA 16
+#define RCCL_TOPO_XGMI_ALL  32
 
 #define RCCL_TOPO_MAX_RANKS_PER_GPU 8
 struct ncclTopoNode {