From f4a9b9acbaa1dcb2bb9aa46b6aede6da4d70dfc4 Mon Sep 17 00:00:00 2001 From: gilbertlee-amd <44450918+gilbertlee-amd@users.noreply.github.com> Date: Fri, 26 Feb 2021 16:29:55 -0700 Subject: [PATCH] Adding pthread_join / pthread_detach to clean up pthreads to avoid leaks (#322) --- src/bootstrap.cc | 5 ++++- src/transport/net_ib.cc | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/bootstrap.cc b/src/bootstrap.cc index b4db1368fd..d7a50e26a0 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -194,6 +194,7 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) { rootStruct->hash = djb2Hash(id->internal); rootStruct->listenFd = listenFd; pthread_create(&thread, NULL, bootstrapRoot, (void *)rootStruct); + pthread_detach(thread); // [RCCL] Adding detach to properly clean up bootstrapRoot thread // [/RCCL] return ncclSuccess; @@ -533,7 +534,9 @@ ncclResult_t bootstrapClose(void* commState) { state->allocState->stop = 1; // Join the allocThread so we catch resource leaks as being hung here - // pthread_join(state->allocThread, nullptr); + // [RCCL] Uncommenting this join to clean up the allocThread + pthread_join(state->allocThread, nullptr); + // [/RCCL] free(state->peerCommAddresses); free(state->peerAllocAddresses); diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 228a3fd4f8..73893cfb72 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -186,6 +186,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { ncclNIbDevs++; nPorts++; pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); + // [RCCL] + pthread_detach(ncclIbAsyncThread); + // [/RCCL] } if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } @@ -915,4 +918,3 @@ ncclNet_t ncclNetIb = { ncclIbCloseRecv, ncclIbCloseListen }; -