From e9bf01fb7ee153571dc14edac8562fe0e2e889f9 Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Fri, 19 Nov 2021 08:12:53 -0800
Subject: [PATCH] Determine fine-grained memory availability at RCCL
 bootstrapping (#471)

---
 src/include/alloc.h     | 10 ----------
 src/include/net.h       |  1 -
 src/include/transport.h |  1 +
 src/init.cc             | 13 ++++++++++++-
 src/transport/p2p.cc    | 13 +------------
 5 files changed, 14 insertions(+), 24 deletions(-)

diff --git a/src/include/alloc.h b/src/include/alloc.h
index 0dad1eb1ad..fe67acfeb8 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -89,16 +89,6 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
   return ncclSuccess;
 }
 
-static bool hasFineGrainVramPcie() {
-  int *ptr;
-  if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
-    CUDACHECK(hipFree(ptr));
-    return true;
-  }
-  else
-    return false;
-}
-
 // Allocate memory to be potentially ibv_reg_mr'd. This needs to be
 // allocated on separate pages as those pages will be marked DONTFORK
 // and if they are shared, that could cause a crash in a child process
diff --git a/src/include/net.h b/src/include/net.h
index 2c9a5ced9c..10a2d85432 100644
--- a/src/include/net.h
+++ b/src/include/net.h
@@ -43,7 +43,6 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
     NCCLCHECK(ncclNet->getProperties(dev, &props));
     if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    if (!hasFineGrainVramPcie()) continue;
     *gdrSupport = 1;
     break;
 #endif
diff --git a/src/include/transport.h b/src/include/transport.h
index 115bdc50f1..62913f5b42 100644
--- a/src/include/transport.h
+++ b/src/include/transport.h
@@ -29,6 +29,7 @@ struct ncclPeerInfo {
   int rank;
   int cudaDev;
   int gdrSupport;
+  bool hasFineGrain;
   uint64_t hostHash;
   uint64_t pidHash;
   dev_t shmDev;
diff --git a/src/init.cc b/src/init.cc
index d90d8ea53a..9ad5b4d1c6 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -613,7 +613,18 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
 
   info->busId = comm->busId;
 
-  NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
+  // Detect whether fine-grained memory is available on this GPU; if it is not, GDR support is reported as 0
+  int *ptr;
+  if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
+    CUDACHECK(hipFree(ptr));
+    info->hasFineGrain = true;
+    NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
+  }
+  else {
+    info->hasFineGrain = false;
+    info->gdrSupport = 0;
+  }
+
   return ncclSuccess;
 }
 
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index 36d9ba07ec..5a9863f9ec 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -56,7 +56,7 @@ int busIdToCudaDev(int64_t busId) {
 /* Determine if two peers can communicate through p2p */
 ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  if (!hasFineGrainVramPcie())  {
+  if (!info1->hasFineGrain || !info2->hasFineGrain)  {
     *ret = 0;
     return ncclSuccess;
   }
@@ -88,17 +88,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
 #endif
   }
 
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-  int dev;
-  CUDACHECK(hipGetDevice(&dev));
-  CUDACHECK(hipSetDevice(cudaDev2));
-  if (!hasFineGrainVramPcie())  {
-    *ret = 0;
-    CUDACHECK(hipSetDevice(dev));
-    return ncclSuccess;
-  }
-  CUDACHECK(hipSetDevice(dev));
-#endif
   // Check that CUDA can do P2P
   int p2p;
   if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {