Memory leak fix when numIBDevices = 0 (#1429)
* Initial commit for testing * Fix memory leak in checkOptions * Fix memory leak in checkOption * x * Delete cmake-3.28.2-linux-x86_64.sh * gcn changes * gcn memleak fixes * gcn leak fix * memory leak fixes for parseRome4P2H and ncclTopoAddGPU * Keeping only necessary file for fixes Deleting temporary scripts I created for debugging and testing * changing to GCN_ARCH_NAME_LEN * Added sanity check directory * refactoring scripts * Updated to sanity checks folder * Initial fixes * changes in tools * pointing RCCL lib build to debug version * Removed second pthread_detach * Removing sanity checks * Keeping only code changes * addressing memory leaks in ncclIbinit --------- Co-authored-by: Chao Chen <cchen104@amd.com>
This commit is contained in:
+14
-13
@@ -428,6 +428,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
static int shownIbHcaEnv = 0;
|
||||
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
|
||||
|
||||
// Detect IB cards
|
||||
int nIbDevs = 0;
|
||||
struct ibv_device** devices = NULL;
|
||||
|
||||
if (ncclNIbDevs == -1) {
|
||||
pthread_mutex_lock(&ncclIbLock);
|
||||
wrap_ibv_fork_init();
|
||||
@@ -438,11 +442,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
WARN("NET/IB : No IP interface found.");
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
// Detect IB cards
|
||||
int nIbDevs;
|
||||
struct ibv_device** devices;
|
||||
}
|
||||
|
||||
// Check if user defined which IB device:port to use
|
||||
char* userIbEnv = getenv("NCCL_IB_HCA");
|
||||
@@ -457,8 +457,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
|
||||
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; }
|
||||
|
||||
// Should NCCL merge multi-port devices into one?
|
||||
int mergeNics;
|
||||
mergeNics = ncclParamIbMergeNics();
|
||||
int mergeNics = ncclParamIbMergeNics();
|
||||
|
||||
build_ib_list:
|
||||
for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) {
|
||||
struct ibv_context * context;
|
||||
@@ -471,7 +471,11 @@ build_ib_list:
|
||||
memset(&devAttr, 0, sizeof(devAttr));
|
||||
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
|
||||
WARN("NET/IB : Unable to query device %s", devices[d]->name);
|
||||
if (ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
|
||||
if (ncclSuccess != wrap_ibv_close_device(context))
|
||||
{
|
||||
ret = ncclInternalError;
|
||||
goto fail;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) {
|
||||
@@ -542,9 +546,6 @@ build_ib_list:
|
||||
ncclIbMergedDevs[mergedDev].speed += ncclIbDevs[ncclNIbDevs].speed;
|
||||
ncclNIbDevs++;
|
||||
nPorts++;
|
||||
// [RCCL]
|
||||
pthread_detach(ncclIbAsyncThread);
|
||||
// [/RCCL]
|
||||
}
|
||||
if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
|
||||
}
|
||||
@@ -562,8 +563,7 @@ build_ib_list:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; };
|
||||
if (ncclSuccess != wrap_ibv_free_device_list(devices)) { ret = ncclInternalError; goto fail;}
|
||||
}
|
||||
if (ncclNIbDevs == 0) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
|
||||
@@ -600,6 +600,7 @@ build_ib_list:
|
||||
}
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
if(ncclSuccess != wrap_ibv_free_device_list(devices)){WARN("NET/IB : Unable to free device list");}
|
||||
pthread_mutex_unlock(&ncclIbLock);
|
||||
return ret;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user