Memory leak fix when numIBDevices = 0 (#1429)

* Initial commit for testing

* Fix memory leak in checkOptions

* Fix memory leak in checkOption

* x

* Delete cmake-3.28.2-linux-x86_64.sh

* gcn changes

* gcn memleak fixes

* gcn leak fix

* memory leak fixes for parseRome4P2H and ncclTopoAddGPU

* Keeping only necessary file for fixes

Deleting temporary scripts I created for debugging and testing

* changing to GCN_ARCH_NAME_LEN

* Added sanity check directory

* refactoring scripts

* Updated to sanity checks folder

* Initial fixes

* changes in tools

* pointing RCCL lib build to debug version

* Removed second pthread_detach

* Removing sanity checks

* Keeping only code changes

* addressing memory leaks in ncclIbInit

---------

Co-authored-by: Chao Chen <cchen104@amd.com>
This commit is contained in:
Avinash
2025-03-17 11:21:19 -05:00
zatwierdzone przez GitHub
rodzic 5f691aaf65
commit ccb0820743
+14 -13
Wyświetl plik
@@ -428,6 +428,10 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
static int shownIbHcaEnv = 0;
if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; }
// Detect IB cards
int nIbDevs = 0;
struct ibv_device** devices = NULL;
if (ncclNIbDevs == -1) {
pthread_mutex_lock(&ncclIbLock);
wrap_ibv_fork_init();
@@ -438,11 +442,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
WARN("NET/IB : No IP interface found.");
ret = ncclInternalError;
goto fail;
}
// Detect IB cards
int nIbDevs;
struct ibv_device** devices;
}
// Check if user defined which IB device:port to use
char* userIbEnv = getenv("NCCL_IB_HCA");
@@ -457,8 +457,8 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) {
if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) { ret = ncclInternalError; goto fail; }
// Should NCCL merge multi-port devices into one?
int mergeNics;
mergeNics = ncclParamIbMergeNics();
int mergeNics = ncclParamIbMergeNics();
build_ib_list:
for (int d=0; d<nIbDevs && ncclNIbDevs<MAX_IB_DEVS; d++) {
struct ibv_context * context;
@@ -471,7 +471,11 @@ build_ib_list:
memset(&devAttr, 0, sizeof(devAttr));
if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) {
WARN("NET/IB : Unable to query device %s", devices[d]->name);
if (ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
if (ncclSuccess != wrap_ibv_close_device(context))
{
ret = ncclInternalError;
goto fail;
}
continue;
}
for (int port_num = 1; port_num <= devAttr.phys_port_cnt; port_num++) {
@@ -542,9 +546,6 @@ build_ib_list:
ncclIbMergedDevs[mergedDev].speed += ncclIbDevs[ncclNIbDevs].speed;
ncclNIbDevs++;
nPorts++;
// [RCCL]
pthread_detach(ncclIbAsyncThread);
// [/RCCL]
}
if (nPorts == 0 && ncclSuccess != wrap_ibv_close_device(context)) { ret = ncclInternalError; goto fail; }
}
@@ -562,8 +563,7 @@ build_ib_list:
}
}
}
if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { ret = ncclInternalError; goto fail; };
if (ncclSuccess != wrap_ibv_free_device_list(devices)) { ret = ncclInternalError; goto fail;}
}
if (ncclNIbDevs == 0) {
INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found.");
@@ -600,6 +600,7 @@ build_ib_list:
}
return ncclSuccess;
fail:
if(ncclSuccess != wrap_ibv_free_device_list(devices)){WARN("NET/IB : Unable to free device list");}
pthread_mutex_unlock(&ncclIbLock);
return ret;
}