From ccb082074351b560bbce3e1cb8d9ae2045b7beac Mon Sep 17 00:00:00 2001 From: Avinash <44542533+PJAvinash@users.noreply.github.com> Date: Mon, 17 Mar 2025 11:21:19 -0500 Subject: [PATCH] Memory leak fix when numIBDevices = 0 (#1429) * Initial commit for testing * Fix memory leak in checkOptions * Fix memory leak in checkOption * x * Delete cmake-3.28.2-linux-x86_64.sh * gcn changes * gcn memleak fixes * gcn leak fix * memory leak fixes for parseRome4P2H and ncclTopoAddGPU * Keeping only necessary file for fixes Deleting temporary scripts I created for debugging and testing * changing to GCN_ARCH_NAME_LEN * Added sanity check directory * refactoring scripts * Updated to sanity checks folder * Initial fixes * changes in tools * pointing RCCL lib build to debug version * Removed second pthread_detach * Removing sanity checks * Keeping only code changes * addressing memory leaks in ncclIbinit --------- Co-authored-by: Chao Chen --- src/transport/net_ib.cc | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 62f3066abb..409a4aadc2 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -428,6 +428,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { static int shownIbHcaEnv = 0; if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } + // Detect IB cards + int nIbDevs = 0; + struct ibv_device** devices = NULL; + if (ncclNIbDevs == -1) { pthread_mutex_lock(&ncclIbLock); wrap_ibv_fork_init(); @@ -438,11 +442,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { WARN("NET/IB : No IP interface found."); ret = ncclInternalError; goto fail; - } - - // Detect IB cards - int nIbDevs; - struct ibv_device** devices; + } // Check if user defined which IB device:port to use char* userIbEnv = getenv("NCCL_IB_HCA"); @@ -457,8 +457,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; } // Should NCCL merge multi-port devices into one? - int mergeNics; - mergeNics = ncclParamIbMergeNics(); + int mergeNics = ncclParamIbMergeNics(); + build_ib_list: for (int d=0; dname); - if (ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; } + if (ncclSuccess != wrap_ibv_close_device(context)) + { + ret = ncclInternalError; + goto fail; + } continue; } for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) { @@ -542,9 +546,6 @@ build_ib_list: ncclIbMergedDevs[mergedDev].speed += ncclIbDevs[ncclNIbDevs].speed; ncclNIbDevs++; nPorts++; - // [RCCL] - pthread_detach(ncclIbAsyncThread); - // [/RCCL] } if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; } } @@ -562,8 +563,7 @@ build_ib_list: } } } - - if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; }; + if (ncclSuccess != wrap_ibv_free_device_list(devices)) { ret = ncclInternalError; goto fail;} } if (ncclNIbDevs == 0) { INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found."); @@ -600,6 +600,7 @@ build_ib_list: } return ncclSuccess; fail: + if(ncclSuccess != wrap_ibv_free_device_list(devices)){WARN("NET/IB : Unable to free device list");} pthread_mutex_unlock(&ncclIbLock); return ret; }