Merge pull request #214 from wenkaidu/gdr
Cache the detected GDR support value per GPU so that detection runs only once
Этот коммит содержится в:
@@ -36,6 +36,14 @@ static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->cl
|
||||
static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
int netDevs;
|
||||
NCCLCHECK(ncclNetDevices(&netDevs));
|
||||
pthread_mutex_t ncclParamMutexGpuGdrSupport = PTHREAD_MUTEX_INITIALIZER;
|
||||
static int gdrSupportCached[16] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
|
||||
int cudaDev;
|
||||
CUDACHECK(hipGetDevice(&cudaDev));
|
||||
if (gdrSupportCached[cudaDev] != -1) {
|
||||
*gdrSupport = gdrSupportCached[cudaDev];
|
||||
return ncclSuccess;
|
||||
}
|
||||
*gdrSupport = 0;
|
||||
for (int dev=0; dev<netDevs; dev++) {
|
||||
// Find a net device which is GDR-capable
|
||||
@@ -51,6 +59,7 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
ncclNetHandle_t handle;
|
||||
void* gpuPtr = NULL;
|
||||
void* mHandle = NULL;
|
||||
pthread_mutex_lock(&ncclParamMutexGpuGdrSupport);
|
||||
NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
|
||||
NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
|
||||
NCCLCHECK(ncclNetAccept(lComm, &rComm));
|
||||
@@ -67,8 +76,10 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
|
||||
NCCLCHECK(ncclNetCloseRecv(rComm));
|
||||
NCCLCHECK(ncclNetCloseSend(sComm));
|
||||
NCCLCHECK(ncclNetCloseListen(lComm));
|
||||
pthread_mutex_unlock(&ncclParamMutexGpuGdrSupport);
|
||||
break;
|
||||
}
|
||||
gdrSupportCached[cudaDev] = *gdrSupport;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user