diff --git a/src/bootstrap.cc b/src/bootstrap.cc index b4db1368fd..d7a50e26a0 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -194,6 +194,7 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) { rootStruct->hash = djb2Hash(id->internal); rootStruct->listenFd = listenFd; pthread_create(&thread, NULL, bootstrapRoot, (void *)rootStruct); + pthread_detach(thread); // [RCCL] Detach the bootstrapRoot thread so its resources are released automatically when it exits // [/RCCL] return ncclSuccess; @@ -533,7 +534,9 @@ ncclResult_t bootstrapClose(void* commState) { state->allocState->stop = 1; // Join the allocThread so we catch resource leaks as being hung here - // pthread_join(state->allocThread, nullptr); + // [RCCL] Re-enable this join so bootstrapClose waits for allocThread to exit and its resources are reclaimed + pthread_join(state->allocThread, nullptr); + // [/RCCL] free(state->peerCommAddresses); free(state->peerAllocAddresses); diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 228a3fd4f8..73893cfb72 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -186,6 +186,9 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { ncclNIbDevs++; nPorts++; pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); + // [RCCL] Detach the async thread so its resources are released automatically when it exits + pthread_detach(ncclIbAsyncThread); + // [/RCCL] } if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } @@ -915,4 +918,3 @@ ncclNet_t ncclNetIb = { ncclIbCloseRecv, ncclIbCloseListen }; -