From 161763aab2befd038da7cdecbbe8e52a56677200 Mon Sep 17 00:00:00 2001 From: Cao Zongyan Date: Wed, 13 Mar 2019 17:13:39 +0800 Subject: [PATCH 1/4] Fix share memory collision in multi-communicator case. Current SHM object name would only use pidHash and ranks as identification, which would collide each other when program runs with multiple communicators. Here we added commId info into pidHash, it makes 'pidHash'es of different communicators keeping in same process will be distincted with each other. --- src/include/utils.h | 1 + src/init.cu | 13 +++++++------ src/misc/utils.cu | 9 +++++++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/include/utils.h b/src/include/utils.h index 5a6a588c43..0ed875c161 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -11,6 +11,7 @@ #include ncclResult_t getHostName(char* hostname, int maxlen); +uint64_t getnHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); diff --git a/src/init.cu b/src/init.cu index 75822e60bd..b8032e8e33 100644 --- a/src/init.cu +++ b/src/init.cu @@ -302,12 +302,12 @@ static void showVersion() { } } -static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) { +static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank, uint64_t commHash) { info->rank = rank; CUDACHECK(cudaGetDevice(&info->cudaDev)); NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev)) - info->hostHash=getHostHash(); - info->pidHash=getPidHash(); + info->hostHash=getHostHash()+commHash; + info->pidHash=getPidHash()+commHash; // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the // cudaDev is a CUDA runtime dev number which could be different from the @@ -679,7 +679,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int rank = comm->rank; int nranks = comm->nRanks; - TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks); + uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES); + TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); // AllGather1 - begin @@ -690,7 +691,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); allGather1Data[rank].comm = comm; - NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank)); + NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank, commHash)); NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); @@ -960,7 +961,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, NCCLCHECK(ncclCalloc(&allInfo, nranks)); for (int rank=0; rank Date: Wed, 31 Jul 2019 12:21:35 +0800 Subject: [PATCH 2/4] Refine RPM package building spec file. Add /sbin/ldconfig into RPM package install operations. --- pkg/redhat/nccl.spec.in | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pkg/redhat/nccl.spec.in b/pkg/redhat/nccl.spec.in index f9d83a30df..5fad346f4a 100644 --- a/pkg/redhat/nccl.spec.in +++ b/pkg/redhat/nccl.spec.in @@ -7,6 +7,7 @@ Group: Development/Libraries License: BSD URL: http://developer.nvidia.com/nccl Source0: nccl_${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix}-${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor}_${pkg:Arch}.txz +Prereq: /sbin/ldconfig %description NCCL (pronounced "Nickel") is a stand-alone library of standard collective @@ -50,6 +51,12 @@ ln -s libnccl.so.${nccl:Major} $RPM_BUILD_ROOT/%{_libdir}/libnccl.so # static install -m 644 lib/libnccl_static.a $RPM_BUILD_ROOT/%{_libdir} +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig + +%post devel -p /sbin/ldconfig +%postun devel -p /sbin/ldconfig + %clean rm -rf $RPM_BUILD_ROOT From 7f2b337e703d73ed369937c9996e1f3d5f664ad0 Mon Sep 17 00:00:00 2001 From: David Addison Date: Tue, 13 Aug 2019 16:32:07 -0700 Subject: [PATCH 3/4] Make use of SO_REUSEPORT conditional Fixes: #244 SO_RESUEPORT was introduced in Linux 3.9 and later. This change allows NCCL to compile against older releases. The functionality is only required if the user is specifying a NCCL bootstrap address via an environment variable. --- src/include/socket.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/include/socket.h b/src/include/socket.h index 68ce235d62..b4f09b9cc1 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -327,7 +327,11 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) if (socketToPort(&localAddr->sa)) { // Port is forced by env. Make sure we get the port. int opt = 1; +#if defined(SO_REUSEPORT) SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)), "setsockopt"); +#else + SYSCHECK(setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), "setsockopt"); +#endif } // localAddr port should be 0 (Any port) From fad079a8aeb72f4fb30f3564a48ad4ec37ea58f6 Mon Sep 17 00:00:00 2001 From: David Addison Date: Wed, 14 Aug 2019 10:08:39 -0700 Subject: [PATCH 4/4] Updated PR#196 to use a common hash function --- src/include/utils.h | 2 +- src/init.cc | 4 ++-- src/misc/utils.cc | 15 +++------------ 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/src/include/utils.h b/src/include/utils.h index 3038e68314..5acccc2955 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -11,7 +11,7 @@ #include ncclResult_t getHostName(char* hostname, int maxlen, const char delim); -uint64_t getnHash(const char* string, int n); +uint64_t getHash(const char* string, int n); uint64_t getHostHash(); uint64_t getPidHash(); diff --git a/src/init.cc b/src/init.cc index 229742c1fa..706d3a6ffb 100644 --- a/src/init.cc +++ b/src/init.cc @@ -691,8 +691,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int rank = comm->rank; int nranks = comm->nRanks; - uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES); - TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); + uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); + TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); // AllGather1 - begin diff --git a/src/misc/utils.cc b/src/misc/utils.cc index b511af191a..da9977487c 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -87,18 +87,9 @@ void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *file } } -uint64_t getHash(const char* string) { +uint64_t getHash(const char* string, int n) { // Based on DJB2, result = result * 33 + char uint64_t result = 5381; - for (int c = 0; string[c] != '\0'; c++) { - result = ((result << 5) + result) + string[c]; - } - return result; -} - -uint64_t getnHash(const char* string, int n) { - // Based on DJB2, result = result * 33 + char - uint64_t result = 9527; for (int c = 0; c < n; c++) { result = ((result << 5) + result) + string[c]; } @@ -129,7 +120,7 @@ uint64_t getHostHash(void) { uname[offset]='\0'; TRACE(NCCL_INIT,"unique hostname '%s'", uname); - return getHash(uname); + return getHash(uname, strlen(uname)); } /* Generate a hash of the unique identifying string for this process @@ -149,7 +140,7 @@ uint64_t getPidHash(void) { pname[plen+len]='\0'; TRACE(NCCL_INIT,"unique PID '%s'", pname); - return getHash(pname); + return getHash(pname, strlen(pname)); } int parseStringList(const char* string, struct netIf* ifList, int maxList) {