Determine fine grained memory availability at RCCL bootstrapping (#471)
[ROCm/rccl commit: e9bf01fb7e]
This commit is contained in:
@@ -89,16 +89,6 @@ static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static bool hasFineGrainVramPcie() {
|
||||
int *ptr;
|
||||
if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
|
||||
CUDACHECK(hipFree(ptr));
|
||||
return true;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
// Allocate memory to be potentially ibv_reg_mr'd. This needs to be
|
||||
// allocated on separate pages as those pages will be marked DONTFORK
|
||||
// and if they are shared, that could cause a crash in a child process
|
||||
|
||||
@@ -43,7 +43,6 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
NCCLCHECK(ncclNet->getProperties(dev, &props));
|
||||
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (!hasFineGrainVramPcie()) continue;
|
||||
*gdrSupport = 1;
|
||||
break;
|
||||
#endif
|
||||
|
||||
@@ -29,6 +29,7 @@ struct ncclPeerInfo {
|
||||
int rank;
|
||||
int cudaDev;
|
||||
int gdrSupport;
|
||||
bool hasFineGrain;
|
||||
uint64_t hostHash;
|
||||
uint64_t pidHash;
|
||||
dev_t shmDev;
|
||||
|
||||
@@ -613,7 +613,18 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
|
||||
|
||||
info->busId = comm->busId;
|
||||
|
||||
NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
|
||||
// Detect whether fine-grained memory is available on this GPU
|
||||
int *ptr;
|
||||
if (hipExtMallocWithFlags((void**)&ptr, sizeof(int), hipDeviceMallocFinegrained) == hipSuccess) {
|
||||
CUDACHECK(hipFree(ptr));
|
||||
info->hasFineGrain = true;
|
||||
NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));
|
||||
}
|
||||
else {
|
||||
info->hasFineGrain = false;
|
||||
info->gdrSupport = 0;
|
||||
}
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +56,7 @@ int busIdToCudaDev(int64_t busId) {
|
||||
/* Determine if two peers can communicate through p2p */
|
||||
ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
if (!hasFineGrainVramPcie()) {
|
||||
if (!info1->hasFineGrain || !info2->hasFineGrain) {
|
||||
*ret = 0;
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -88,17 +88,6 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
|
||||
int dev;
|
||||
CUDACHECK(hipGetDevice(&dev));
|
||||
CUDACHECK(hipSetDevice(cudaDev2));
|
||||
if (!hasFineGrainVramPcie()) {
|
||||
*ret = 0;
|
||||
CUDACHECK(hipSetDevice(dev));
|
||||
return ncclSuccess;
|
||||
}
|
||||
CUDACHECK(hipSetDevice(dev));
|
||||
#endif
|
||||
// Check that CUDA can do P2P
|
||||
int p2p;
|
||||
if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess) {
|
||||
|
||||
Reference in New Issue
Block a user