diff --git a/src/include/alloc.h b/src/include/alloc.h
index 3d0f07aa95..601c362998 100644
--- a/src/include/alloc.h
+++ b/src/include/alloc.h
@@ -51,4 +51,17 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
   return ncclSuccess;
 }
 
+// Reports whether fine-grained VRAM can be allocated, by attempting a
+// sizeof(int) allocation with hipDeviceMallocFinegrained and freeing it again.
+static inline bool hasFineGrainVramPcie() {
+  int *ptr;
+  if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) != hipSuccess) {
+    return false;
+  }
+  // The probe result is already known here, so a failed free is ignored.
+  // NOTE(review): CUDACHECK is avoided because it appears to early-return an
+  // ncclResult_t error code, which would not fit this bool function - confirm.
+  (void)hipFree(ptr);
+  return true;
+}
 #endif
diff --git a/src/init.cc b/src/init.cc
index 14825e1309..af64ad01ee 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -163,25 +163,6 @@ int ncclThreadThreshold(int minCompCap, int multiNode) {
   return threshold;
 }
 
-bool useFineGrainVramPcie = false;
-
-void parseHsaForceFineGrainVramPcie() {
-  char* str = getenv("HSA_FORCE_FINE_GRAIN_PCIE");
-  if (str && strlen(str) > 0) {
-    errno = 0;
-    int64_t v = strtoll(str, NULL, 0);
-    if (errno || (v != 0 && v != 1)) {
-      INFO(NCCL_ALL,"Invalid value %s for %s, using default %u.", str, "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);
-    } else {
-      useFineGrainVramPcie = v;
-      INFO(NCCL_ALL,"%s set by environment to %u.", "HSA_FORCE_FINE_GRAIN_PCIE", useFineGrainVramPcie);
-    }
-  }
-  else {
-    INFO(NCCL_ALL,"%s not set by environment.", "HSA_FORCE_FINE_GRAIN_PCIE");
-  }
-}
-
 pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER;
 static bool initialized = false;
 static ncclResult_t ncclInit() {
@@ -193,8 +174,6 @@ static ncclResult_t ncclInit() {
     initNet();
     initialized = true;
   }
-  // Check if HSA_FORCE_FINE_GRAIN_PCIE is set in env
-  parseHsaForceFineGrainVramPcie();
   pthread_mutex_unlock(&initLock);
   return ncclSuccess;
 }
diff --git a/src/transport/net.cc b/src/transport/net.cc
index 86e332c72f..240e196b06 100644
--- a/src/transport/net.cc
+++ b/src/transport/net.cc
@@ -245,8 +245,6 @@ end:
   return dev;
 }
 
-extern bool useFineGrainVramPcie;
-
 NCCL_PARAM(NetGdrRead, "NET_GDR_READ", -2);
 NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB);
 
@@ -257,7 +255,7 @@ static ncclResult_t netGetGdrSupport(int dev, int read, int* useGdr) {
   CUDACHECK(hipGetDevice(&cudaDev));
   NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev))
 
-  if (!useFineGrainVramPcie) {
+  if (!hasFineGrainVramPcie()) {
     INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / Need Fine Grain VRAM over PCIe", ncclNetName(), cudaDev);
     return ncclSuccess;
   }
diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc
index bce7fd2370..7045ee8abc 100644
--- a/src/transport/p2p.cc
+++ b/src/transport/p2p.cc
@@ -58,8 +58,6 @@ struct p2pRecvResources {
 NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2);
 NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2);
 
-extern bool useFineGrainVramPcie;
-
 /* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */
 static int busIdToCudaDev(const char* busId) {
   int ndev;
@@ -86,6 +84,10 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
 
   *ret = 0;
 
+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+  if (!hasFineGrainVramPcie()) return ncclSuccess;
+#endif
+
   if (p2pLevel == 0) return ncclSuccess;
 
   // Rule out different nodes
@@ -112,9 +114,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
 
   // Do not detect topology if we're on the same GPU. Note this is not really supported.
   if (myInfo->cudaDev == peerCudaDev) {
-#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
-    if (!useFineGrainVramPcie) return ncclSuccess;
-#endif
     *ret = 1 + PATH_SYS;
     return ncclSuccess;
   }
@@ -145,9 +144,6 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc
   if (link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) {
     if (hops == 1)
       nvlinkp2p = CONNECT_NVLINK;
-  } else {
-    if (!useFineGrainVramPcie)
-      return ncclSuccess;
   }
 #else
   // Check for NVLink/NVswitch