Cache the GDR support detection result so it is probed only once per GPU device

This commit is contained in:
Wenkai Du
2020-05-22 17:15:21 +00:00
parent 957be85944
commit 67c8e72ce3
+11
Vedi File
@@ -36,6 +36,14 @@ static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->cl
static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
int netDevs;
NCCLCHECK(ncclNetDevices(&netDevs));
pthread_mutex_t ncclParamMutexGpuGdrSupport = PTHREAD_MUTEX_INITIALIZER;
static int gdrSupportCached[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
int cudaDev;
CUDACHECK(hipGetDevice(&cudaDev));
if (gdrSupportCached[cudaDev] != -1) {
*gdrSupport = gdrSupportCached[cudaDev];
return ncclSuccess;
}
*gdrSupport = 0;
for (int dev=0; dev<netDevs; dev++) {
// Find a net device which is GDR-capable
@@ -51,6 +59,7 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
ncclNetHandle_t handle;
void* gpuPtr = NULL;
void* mHandle = NULL;
pthread_mutex_lock(&ncclParamMutexGpuGdrSupport);
NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
NCCLCHECK(ncclNetAccept(lComm, &rComm));
@@ -67,8 +76,10 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
NCCLCHECK(ncclNetCloseRecv(rComm));
NCCLCHECK(ncclNetCloseSend(sComm));
NCCLCHECK(ncclNetCloseListen(lComm));
pthread_mutex_unlock(&ncclParamMutexGpuGdrSupport);
break;
}
gdrSupportCached[cudaDev] = *gdrSupport;
return ncclSuccess;
}