Check for fine grain support using memory allocation

2019-11-01 15:58:49 -07:00
@@ -51,4 +51,13 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
  return ncclSuccess;
 }

+static bool hasFineGrainVramPcie() {
+  // Probe support by attempting a tiny fine-grained device allocation.
+  int *ptr = nullptr;
+  if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) != hipSuccess)
+    return false;
+  // Avoid CUDACHECK here: its error path is meant for ncclResult_t functions, not bool.
+  (void)hipFree(ptr);
+  return true;
+}
 #endif
@@ -163,25 +163,6 @@ int ncclThreadThreshold(int minCompCap, int multiNode) {
  return threshold;
 }

-bool useFineGrainVramPcie = false;
-
-void parseHsaForceFineGrainVramPcie() {
-  char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
-  if (str && strlen(str) > 0) {
-    errno = 0;
-    int64_t v = strtoll(str, NULL, 0);
-    if (errno || (v != 0 && v != 1)) {
-      INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);
-    } else {
-      useFineGrainVramPcie = v;
-      INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);
-    }
-  }
-  else {
-    INFO(NCCL_ALL,"%s not set by environment.", "HSA_FORCE_FINE_GRAIN_PCIE");
-  }
-}
-
 pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
 static bool initialized = false;
 static ncclResult_t ncclInit() {
@@ -193,8 +174,6 @@ static ncclResult_t ncclInit() {
    initNet();
    initialized = true;
  }
-  // Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env
-  parseHsaForceFineGrainVramPcie();
  pthread_mutex_unlock(&initLock);
  return ncclSuccess;
 }
@@ -245,8 +245,6 @@ end:
  return dev;
 }

-extern bool useFineGrainVramPcie;
-
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);

@@ -257,7 +255,7 @@ static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
  CUDACHECK(hipGetDevice(&cudaDev));
  NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))

-  if (!useFineGrainVramPcie) {
+  if (!hasFineGrainVramPcie()) {
    INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev);
    return ncclSuccess;
  }
@@ -58,8 +58,6 @@ struct p2pRecvResources {
 NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
 NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);

-extern bool useFineGrainVramPcie;
-
 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
 static int busIdToCudaDev(const char* busId) {
  int ndev;
@@ -86,6 +84,10 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc

  *ret = 0;

+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  if (!hasFineGrainVramPcie()) return ncclSuccess;
+#endif
+
  if (p2pLevel == 0) return ncclSuccess;

  // Rule out different nodes
@@ -112,9 +114,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc

  // Do not detect topology if we're on the same GPU. Note this is not really supported.
  if (myInfo->cudaDev == peerCudaDev) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    if (!useFineGrainVramPcie) return ncclSuccess;
-#endif
    *ret = 1 + PATH_SYS;
    return ncclSuccess;
  }
@@ -145,9 +144,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
  if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) {
    if (hops == 1)
      nvlinkp2p = CONNECT_NVLINK;
-  } else {
-    if (!useFineGrainVramPcie)
-      return ncclSuccess;
  }
 #else
 // Check for NVLink/NVswitch