From e9bf01fb7ee153571dc14edac8562fe0e2e889f9 Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Fri, 19 Nov 2021 08:12:53 -0800 Subject: [PATCH] Determine fine grained memory availability at RCCL bootstrapping (#471) --- src/include/alloc.h | 10 ---------- src/include/net.h | 1 - src/include/transport.h | 1 + src/init.cc | 13 ++++++++++++- src/transport/p2p.cc | 13 +------------ 5 files changed, 14 insertions(+), 24 deletions(-) diff --git a/src/include/alloc.h b/src/include/alloc.h index 0dad1eb1ad..fe67acfeb8 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -89,16 +89,6 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { return ncclSuccess; } -static bool hasFineGrainVramPcie() { - int *ptr; - if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) { - CUDACHECK(hipFree(ptr)); - return true; - } - else - return false; -} - // Allocate memory to be potentially ibv_reg_mr'd. This needs to be // allocated on separate pages as those pages will be marked DONTFORK // and if they are shared, that could cause a crash in a child process diff --git a/src/include/net.h b/src/include/net.h index 2c9a5ced9c..10a2d85432 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -43,7 +43,6 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { NCCLCHECK(ncclNet->getProperties(dev, &props)); if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - if (!hasFineGrainVramPcie()) continue; *gdrSupport = 1; break; #endif diff --git a/src/include/transport.h b/src/include/transport.h index 115bdc50f1..62913f5b42 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -29,6 +29,7 @@ struct ncclPeerInfo { int rank; int cudaDev; int gdrSupport; + bool hasFineGrain; uint64_t hostHash; uint64_t pidHash; dev_t shmDev; diff --git a/src/init.cc b/src/init.cc index d90d8ea53a..9ad5b4d1c6 100644 --- a/src/init.cc +++ b/src/init.cc @@ -613,7 +613,18 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->busId = comm->busId; - NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport)); + // detect if fine grained memory is available on this GPU + int *ptr; + if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) { + CUDACHECK(hipFree(ptr)); + info->hasFineGrain = true; + NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport)); + } + else { + info->hasFineGrain = false; + info->gdrSupport = 0; + } + return ncclSuccess; } diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 36d9ba07ec..5a9863f9ec 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -56,7 +56,7 @@ int busIdToCudaDev(int64_t busId) { /* Determine if two peers can communicate through p2p */ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - if (!hasFineGrainVramPcie()) { + if (!info1->hasFineGrain || !info2->hasFineGrain) { *ret = 0; return ncclSuccess; } @@ -88,17 +88,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop #endif } -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - int dev; - CUDACHECK(hipGetDevice(&dev)); - CUDACHECK(hipSetDevice(cudaDev2)); - if (!hasFineGrainVramPcie()) { - *ret = 0; - CUDACHECK(hipSetDevice(dev)); - return ncclSuccess; - } - CUDACHECK(hipSetDevice(dev)); -#endif // Check that CUDA can do P2P int p2p; if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {