Merge remote-tracking branch 'nccl/master' into develop

Este commit está contenido en:
BertanDogancay
2025-08-28 15:45:42 -05:00
Se han modificado 108 ficheros con 7754 adiciones y 2129 borrados
+36 -29
Ver fichero
@@ -23,7 +23,7 @@ enum ncclPluginType {
static void *libHandles[NUM_LIBS];
static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" };
static const char *pluginPrefix[NUM_LIBS] = { "librccl-net", "librccl-tuner", "librccl-profiler" };
static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" };
static const char *pluginFallback[NUM_LIBS] = { "", "Using internal tuner plugin.", "" };
static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT };
static void* tryOpenLib(char* name, int* err, char* errStr) {
@@ -49,10 +49,9 @@ static void* tryOpenLib(char* name, int* err, char* errStr) {
return handle;
}
static void appendNameToList(char* nameList, int *nameListLen, char* name) {
snprintf(nameList, *nameListLen, " %s", name);
nameList += strlen(name) + 1;
*nameListLen -= strlen(name) + 1;
static void appendNameToList(char* nameList, int *leftChars, char* name) {
snprintf(nameList + PATH_MAX - *leftChars, *leftChars, " %s", name);
*leftChars -= strlen(name) + 1;
}
static void* openPluginLib(enum ncclPluginType type, const char* libName) {
@@ -62,28 +61,31 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) {
char eNoEntNameList[PATH_MAX] = { 0 };
if (libName && strlen(libName)) {
snprintf(libName_, MAX_STR_LEN, "%s", libName);
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
if (libHandles[type]) {
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
return libHandles[type];
}
if (openErr == ENOENT) {
appendNameToList(eNoEntNameList, &len, libName_);
// match names that start with 'lib' and end with '.so'
if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) {
snprintf(libName_, MAX_STR_LEN, "%s", libName);
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
if (libHandles[type]) {
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
return libHandles[type];
}
if (openErr == ENOENT) {
appendNameToList(eNoEntNameList, &len, libName_);
} else {
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
}
} else {
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
}
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
if (libHandles[type]) {
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
return libHandles[type];
}
if (openErr == ENOENT) {
appendNameToList(eNoEntNameList, &len, libName_);
} else {
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
if (libHandles[type]) {
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
return libHandles[type];
}
if (openErr == ENOENT) {
appendNameToList(eNoEntNameList, &len, libName_);
} else {
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
}
}
} else {
snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]);
@@ -123,12 +125,17 @@ void* ncclGetNetPluginLib(void) {
}
ncclResult_t ncclClosePluginLib(void* handle) {
bool found = false;
for (int l=0; l<NUM_LIBS; l++) {
if (libHandles[l] == handle) {
libHandles[l] = nullptr;
dlclose(handle);
return ncclSuccess;
if (!found) {
if (handle) {
dlclose(handle);
}
found = true;
}
}
}
return ncclInternalError;
return ncclSuccess;
}