Merge remote-tracking branch 'nccl/master' into develop

This commit is contained in:
BertanDogancay
2025-08-28 15:45:42 -05:00
108 fájl változott, egészen pontosan 7754 új sor hozzáadva és 2129 régi sor törölve
+219 -171
Fájl megtekintése
@@ -8,6 +8,7 @@
#include "bootstrap.h"
#include "checks.h"
#include "plugin.h"
#include "nccl_net.h"
#include <string.h>
#include <errno.h>
@@ -15,137 +16,100 @@
//#include <sys/stat.h>
//#include <unistd.h>
extern ncclNet_t* getNcclNet_v6(void* netPluginLib);
extern ncclNet_t* getNcclNet_v7(void* netPluginLib);
extern ncclNet_t* getNcclNet_v8(void* netPluginLib);
extern ncclNet_t* getNcclNet_v9(void* netPluginLib);
extern ncclNet_t* getNcclNet_v10(void* netPluginLib);
typedef ncclNet_t* getNcclNet_t(void* netPluginLib);
typedef ncclCollNet_t* getNcclCollNet_t(void* netPluginLib);
extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib);
extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib);
extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib);
extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib);
extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib);
static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket };
static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 };
ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr };
// Per-slot probe status of a network (or collnet) backend.
//   Init     - the backend has not been probed yet.
//   Enabled  - the backend's init() and devices() succeeded with at least one device.
//   Disabled - probing failed; the backend is not used.
enum ncclNetState {
  ncclNetStateInit     = 0,
  ncclNetStateEnabled  = 1,
  ncclNetStateDisabled = 2
};
enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
extern getNcclNet_t getNcclNet_v6;
extern getNcclNet_t getNcclNet_v7;
extern getNcclNet_t getNcclNet_v8;
extern getNcclNet_t getNcclNet_v9;
extern getNcclNet_t getNcclNet_v10;
extern getNcclCollNet_t getNcclCollNet_v6;
extern getNcclCollNet_t getNcclCollNet_v7;
extern getNcclCollNet_t getNcclCollNet_v8;
extern getNcclCollNet_t getNcclCollNet_v9;
extern getNcclCollNet_t getNcclCollNet_v10;
NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1);
// Number of plugin API versions that can be probed in an external library.
#define NCCL_NET_VERSION_COUNT 5
// Parallel tables: entry i of each array belongs to API version ncclNetVersion[i].
// Versions are listed newest first; the loader walks the tables in order and
// keeps the first getter that returns a non-null interface.
int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {10, 9, 8, 7, 6};
// Getters resolving the NET interface of the matching version from a library handle.
getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = {getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6};
// Getters resolving the CollNet interface of the matching version from a library handle.
getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6};
#define NCCL_NET_NUM_INTERNAL_PLUGINS 2
// Lifecycle of one plugin-table entry (tracked separately for NET and CollNet).
// Negative values are failure states; non-negative values are ordered from
// "not loaded yet" up to "fully usable", so callers may compare with >=.
typedef enum ncclNetPluginState {
  // Plugin library failed to initialize
  ncclNetPluginStateDisabled = -2,
  // Plugin library failed to load
  ncclNetPluginStateLoadFailed = -1,
  // Plugin library is ready to be loaded
  ncclNetPluginStateLoadReady = 0,
  // Plugin library is loaded and ready to be initialized
  ncclNetPluginStateInitReady = 1,
  // Plugin library is loaded and initialized
  ncclNetPluginStateEnabled = 2,
} ncclNetPluginState_t;
#define MAX_STR_LEN 255
// One entry of the NET plugin table: an external shared library or one of the
// built-in transports, together with its load/init state.
typedef struct netPluginLib {
  // Name of the plugin library (NUL-terminated, at most MAX_STR_LEN-1 characters)
  char name[MAX_STR_LEN];
  // Handle to the plugin library, as returned by ncclOpenNetPluginLib()
  void* dlHandle;
  // Pointer to the ncclNet_t structure
  ncclNet_t* ncclNet;
  // Version of the nccl net plugin
  int ncclNetVer;
  // Pointer to the ncclCollNet_t structure
  ncclCollNet_t* ncclCollNet;
  // State of the nccl net plugin
  ncclNetPluginState_t ncclNetPluginState;
  // State of the nccl coll net plugin
  ncclNetPluginState_t ncclCollNetPluginState;
  // Reference count for the nccl net plugin
  int ncclNetPluginRefCount;
} netPluginLib_t;
// Number of populated entries in netPluginLibs; assigned once the table has
// been built by initPluginLibsOnceFunc().
int pluginCount = 0;
// NOTE(review): not referenced by any code visible in this section - confirm it is still needed.
bool netPluginLibsInitialized = false;
// Plugin table. External plugins come first, followed by the
// NCCL_NET_NUM_INTERNAL_PLUGINS built-in transports (IB, then socket).
netPluginLib_t netPluginLibs[NCCL_NET_MAX_PLUGINS] = { 0 };
static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER;
static void* netPluginLib;
static pthread_once_t initPluginLibsOnceControl = PTHREAD_ONCE_INIT;
static int netPluginRefCount;
// pthread_once callback: seeds the global plugin reference count from the
// NET_PLUGIN_REF_COUNT parameter.
static void initNetPluginRefCountOnce(void) {
  netPluginRefCount = ncclParamNetPluginRefCount();
}
// Load status of the single external net plugin library.
enum {
  // A previous load attempt failed
  netPluginLoadFailed  = -1,
  // No load attempted yet (or the plugin has been unloaded)
  netPluginLoadReady   = 0,
  // The plugin library is currently loaded
  netPluginLoadSuccess = 1,
};
static int netPluginStatus = netPluginLoadReady;
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT;
pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce);
pthread_mutex_lock(&netPluginLock);
if (netPluginLoadFailed == netPluginStatus) {
goto exit;
static ncclResult_t ncclNetPluginUnload(netPluginLib_t* pluginLib) {
if ((pluginLib->dlHandle) && ((pluginLib->ncclNetPluginRefCount) == 0)) {
INFO(NCCL_INIT|NCCL_NET, "Unloading plugin %s", pluginLib->name);
NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle));
memset(pluginLib, 0, sizeof(netPluginLib_t));
}
if (netPluginLoadSuccess == netPluginStatus) {
++netPluginRefCount;
goto exit;
}
netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN"));
if (netPluginLib == nullptr) {
goto fail;
}
ncclNets[0] = getNcclNet_v10(netPluginLib);
if (ncclNets[0]) ncclNetsVer[0] = 10;
if (ncclNets[0] == nullptr) {
// Try v9 plugin
ncclNets[0] = getNcclNet_v9(netPluginLib);
if (ncclNets[0]) ncclNetsVer[0] = 9;
}
if (ncclNets[0] == nullptr) {
// Try v8 plugin
ncclNets[0] = getNcclNet_v8(netPluginLib);
if (ncclNets[0]) ncclNetsVer[0] = 8;
}
if (ncclNets[0] == nullptr) {
// Try v7 plugin
ncclNets[0] = getNcclNet_v7(netPluginLib);
if (ncclNets[0]) ncclNetsVer[0] = 7;
}
if (ncclNets[0] == nullptr) {
// Try v6 plugin
ncclNets[0] = getNcclNet_v6(netPluginLib);
if (ncclNets[0]) ncclNetsVer[0] = 6;
}
if (ncclNets[0] == nullptr) {
goto fail;
}
// Check for CollNet
ncclCollNets[0] = getNcclCollNet_v10(netPluginLib);
if (ncclCollNets[0] == nullptr) {
ncclCollNets[0] = getNcclCollNet_v9(netPluginLib);
}
if (ncclCollNets[0] == nullptr) {
ncclCollNets[0] = getNcclCollNet_v8(netPluginLib);
}
if (ncclCollNets[0] == nullptr) {
ncclCollNets[0] = getNcclCollNet_v7(netPluginLib);
}
if (ncclCollNets[0] == nullptr) {
ncclCollNets[0] = getNcclCollNet_v6(netPluginLib);
}
++netPluginRefCount;
netPluginStatus = netPluginLoadSuccess;
comm->netPluginLoaded = 1;
exit:
pthread_mutex_unlock(&netPluginLock);
return ncclSuccess;
fail:
if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib));
netPluginStatus = netPluginLoadFailed;
goto exit;
}
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) {
pthread_mutex_lock(&netPluginLock);
if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) {
if (ncclNets[0]) {
INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name);
}
if (ncclCollNets[0]) {
INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name);
}
NCCLCHECK(ncclClosePluginLib(netPluginLib));
netPluginLib = nullptr;
ncclNets[0] = nullptr;
ncclCollNets[0] = nullptr;
netPluginStatus = netPluginLoadReady;
comm->netPluginLoaded = 0;
for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i)
ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit;
// Opens the shared library named by pluginLib->name and resolves its NET and
// (optional) CollNet interfaces, probing API versions from newest to oldest.
//
// On success the NET state becomes InitReady and the CollNet state becomes
// InitReady or LoadFailed depending on whether a CollNet interface was found.
// On failure both states become LoadFailed, the library is closed and
// dlHandle is cleared so that no later unload can close it a second time.
//
// This function does not touch netPluginLock: the lock belongs to the caller
// (ncclNetInit holds it around the call).
//
// Returns ncclSuccess even when the plugin could not be loaded (the failure is
// recorded in the state fields); an error is returned only if closing the
// library fails.
static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) {
  pluginLib->dlHandle = ncclOpenNetPluginLib(pluginLib->name);
  if (pluginLib->dlHandle == nullptr) goto fail;

  // load ncclNet: keep the first (newest) version the library exports
  for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) {
    pluginLib->ncclNetVer = ncclNetVersion[i];
    pluginLib->ncclNet = getNcclNet[i](pluginLib->dlHandle);
    if (pluginLib->ncclNet) break;
  }

  // if we fail to find a net, exit
  if (pluginLib->ncclNet == nullptr) goto fail;

  pluginLib->ncclNetPluginState = ncclNetPluginStateInitReady;

  // load ncclCollNet: optional, a missing CollNet does not fail the plugin
  for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) {
    pluginLib->ncclCollNet = getNcclCollNet[i](pluginLib->dlHandle);
    if (pluginLib->ncclCollNet) break;
  }

  if (pluginLib->ncclCollNet == nullptr)
    pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed;
  else
    pluginLib->ncclCollNetPluginState = ncclNetPluginStateInitReady;

  INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external plugin %s", pluginLib->name);
exit:
  return ncclSuccess;
fail:
  // Record the failure first so that it is kept even if closing the library errors out.
  pluginLib->ncclNetPluginState = ncclNetPluginStateLoadFailed;
  pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed;
  if (pluginLib->dlHandle) {
    void* handle = pluginLib->dlHandle;
    pluginLib->dlHandle = nullptr;  // never keep a handle that is about to be closed
    NCCLCHECK(ncclClosePluginLib(handle));
  }
  goto exit;
}
ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) {
@@ -172,72 +136,156 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in
return ncclSuccess;
}
static ncclResult_t netGetState(int i, enum ncclNetState* state) {
pthread_mutex_lock(&netLock);
if (ncclNetStates[i] == ncclNetStateInit) {
int ndev;
if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled;
else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled;
else ncclNetStates[i] = ncclNetStateEnabled;
// Initializes the NET interface of a plugin entry and, when present, its
// CollNet interface.
//
// NET: if the entry is InitReady, init() and devices() are called; any failure
// (or a device count <= 0) disables both NET and CollNet. Otherwise the NET
// state becomes Enabled.
// CollNet: initialized only when it is InitReady and a CollNet interface
// exists; a CollNet failure disables CollNet only and leaves NET enabled.
// An entry without a NET interface is disabled instead of being dereferenced.
//
// Always returns ncclSuccess; the outcome is reported through the state fields.
static ncclResult_t ncclNetPluginInit(netPluginLib_t* pluginLib) {
  int ndev;

  // Without a NET interface there is nothing to enable, and the log line below
  // would dereference a null pointer.
  if (pluginLib->ncclNet == nullptr) goto fail;

  if (pluginLib->ncclNetPluginState == ncclNetPluginStateInitReady) {
    if (pluginLib->ncclNet->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) goto fail;
    if (pluginLib->ncclNet->devices(&ndev) != ncclSuccess || ndev <= 0) goto fail;
  }
  pluginLib->ncclNetPluginState = ncclNetPluginStateEnabled;
  INFO(NCCL_INIT|NCCL_NET, "Initialized NET plugin %s", pluginLib->ncclNet->name);

  if (pluginLib->ncclCollNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclCollNet) {
    if (pluginLib->ncclCollNet->init(ncclDebugLog) != ncclSuccess ||
        pluginLib->ncclCollNet->devices(&ndev) != ncclSuccess || ndev <= 0) {
      pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
    } else {
      pluginLib->ncclCollNetPluginState = ncclNetPluginStateEnabled;
    }
  }
exit:
  return ncclSuccess;
fail:
  pluginLib->ncclNetPluginState = ncclNetPluginStateDisabled;
  pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
  goto exit;
}
// Tries to bind the plugin at netPluginLibs[pluginIndex] to `comm`.
//
// The plugin is rejected (without error) when the communicator requests a
// different network by name, when the device-version check fails, or when the
// plugin's NET state is below Enabled. On success the communicator receives
// the NET interface, its version and the plugin index, the plugin's reference
// count is incremented, and the CollNet interface is attached if it is enabled.
//
// A rejection leaves the plugin's state fields untouched: the plugin stays
// available to other communicators, and a CollNet that failed to load or
// initialize is never turned back on.
//
// *isAssigned is written on every path. Always returns ncclSuccess.
static ncclResult_t ncclNetPluginAssignToComm(struct ncclComm* comm, int pluginIndex, bool* isAssigned) {
  netPluginLib_t* lib = &netPluginLibs[pluginIndex];
  const char* netName = comm->config.netName;

  *isAssigned = false;

  // The communicator asked for a specific network and this is not it.
  if (netName && strcasecmp(netName, lib->ncclNet->name) != 0) return ncclSuccess;
  // Mismatched device plugin version.
  if (ncclSuccess != ncclNetCheckDeviceVersion(comm, lib->ncclNet, 0)) return ncclSuccess;
  if (lib->ncclNetPluginState < ncclNetPluginStateEnabled) return ncclSuccess;

  comm->ncclNet = lib->ncclNet;
  comm->ncclNetVer = lib->ncclNetVer;
  comm->netPluginIndex = pluginIndex;
  lib->ncclNetPluginRefCount++;
  *isAssigned = true;
  INFO(NCCL_INIT|NCCL_NET, "Assigned NET plugin %s to comm", lib->ncclNet->name);
  if (lib->ncclCollNetPluginState >= ncclNetPluginStateEnabled) {
    comm->ncclCollNet = lib->ncclCollNet;
  }
  return ncclSuccess;
}
// Disables every external plugin except netPluginLibs[pluginIndex].
//
// Does nothing when pluginIndex refers to one of the internal plugins (the
// last NCCL_NET_NUM_INTERNAL_PLUGINS table entries). The names of the disabled
// plugins are collected into one comma-separated log line; the list is
// truncated if it does not fit the local buffer. Always returns ncclSuccess.
static ncclResult_t ncclNetPluginDisableOtherExternal(int pluginIndex) {
  const int numExternal = pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS;
  // Only if an external plugin is enabled, disable other external plugins
  if (pluginIndex >= numExternal) return ncclSuccess;

  char names[MAX_STR_LEN*(NCCL_NET_MAX_PLUGINS - NCCL_NET_NUM_INTERNAL_PLUGINS)] = { 0 };
  for (int i = 0; i < numExternal; i++) {
    if (i == pluginIndex) continue;
    // Append the disabled plugin name; snprintf keeps the buffer NUL-terminated
    // and silently truncates when the list gets too long.
    size_t used = strlen(names);
    snprintf(names + used, sizeof(names) - used, (used == 0) ? "%s" : ", %s", netPluginLibs[i].name);
    netPluginLibs[i].ncclNetPluginState = ncclNetPluginStateDisabled;
  }
  if (strlen(names) > 0) {
    INFO(NCCL_INIT|NCCL_NET, "Disabling external plugins: %s", names);
  }
  return ncclSuccess;
}
static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
pthread_mutex_lock(&netLock);
if (ncclCollNetStates[i] == ncclNetStateInit) {
int ndev;
if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled;
else ncclCollNetStates[i] = ncclNetStateEnabled;
static void initPluginLibsOnceFunc() {
char* netPluginName = nullptr;
const char* defaultNetPlugin = "libnccl-net.so";
const char* envNetPlugin = nullptr;
char* envNetPluginList = nullptr;
char* savePtr = nullptr;
int pluginCounter = 0;
memset(netPluginLibs, 0, NCCL_NET_MAX_PLUGINS * sizeof(netPluginLib_t));
envNetPlugin = ncclGetEnv("NCCL_NET_PLUGIN");
if (envNetPlugin) {
envNetPluginList = strdup(envNetPlugin);
// Iterate over list until the list is empty
netPluginName = strtok_r(envNetPluginList, ",", &savePtr);
while(netPluginName) {
// We have 2 internal plugins (ib and socket)
// So, we can have at most( NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS)) in the NCCL_NET_PLUGIN list
if (pluginCounter >= (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS))) {
INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains more than %d plugins, ignoring the rest", (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS + 1)));
break;
}
// need to leave space for the name + "\n"
if((strlen(netPluginName)+1) <= MAX_STR_LEN) {
netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady;
netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount();
strcpy(netPluginLibs[pluginCounter].name, netPluginName);
pluginCounter++;
} else {
INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains a plugin name %s longer than %d characters, ignoring it.", netPluginName, MAX_STR_LEN);
}
netPluginName = strtok_r(nullptr, ",", &savePtr);
}
if (envNetPluginList) free(envNetPluginList);
} else {
// Add default net plugin
netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady;
netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount();
strcpy(netPluginLibs[pluginCounter++].name, defaultNetPlugin);
}
*state = ncclCollNetStates[i];
pthread_mutex_unlock(&netLock);
return ncclSuccess;
// Add 2 internal ib and socket plugins
netPluginLibs[pluginCounter].ncclNet = &ncclNetIb;
netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady;
netPluginLibs[pluginCounter].ncclNet = &ncclNetSocket;
netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady;
pluginCount = pluginCounter;
}
ncclResult_t ncclNetInit(struct ncclComm* comm) {
// Initialize main communication network
const char* netName;
bool ok = false;
netName = comm->config.netName;
for (int i=0; i<3; i++) {
if (ncclNets[i] == nullptr) continue;
enum ncclNetState state;
NCCLCHECK(netGetState(i, &state));
if (state != ncclNetStateEnabled) continue;
if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) {
// Mismatched device plugin version
continue;
bool ncclNetPluginInitialized = false;
pthread_once(&initPluginLibsOnceControl, initPluginLibsOnceFunc);
pthread_mutex_lock(&netPluginLock);
for (int pluginIndex = 0; pluginIndex < pluginCount; pluginIndex++) {
if ((pluginIndex < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) && (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateLoadReady)) {
NCCLCHECK(ncclNetPluginLoad(&netPluginLibs[pluginIndex]));
}
comm->ncclNet = ncclNets[i];
comm->ncclNetVer = ncclNetsVer[i];
ok = true;
if (ncclCollNets[i]) {
NCCLCHECK(collNetGetState(i, &state));
if (state == ncclNetStateEnabled) {
comm->ncclCollNet = ncclCollNets[i];
if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateInitReady) {
NCCLCHECK(ncclNetPluginInit(&netPluginLibs[pluginIndex]));
}
if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateEnabled) {
bool isAssigned = false;
NCCLCHECK(ncclNetPluginAssignToComm(comm, pluginIndex, &isAssigned));
if (isAssigned) {
// If one external plugin is assigned to a comm, then disable all other external plugins
ncclNetPluginDisableOtherExternal(pluginIndex);
ncclNetPluginInitialized = true;
break;
}
}
break;
}
if (!ok) {
WARN("Error: network %s not found.", netName ? netName : "");
return ncclInvalidUsage;
}
return ncclSuccess;
pthread_mutex_unlock(&netPluginLock);
if (ncclNetPluginInitialized) return ncclSuccess;
WARN("Failed to initialize any NET plugin");
return ncclInvalidUsage;
}
// Detaches the network plugin from `comm` and unloads external plugins that
// are no longer referenced.
//
// The communicator's NET/CollNet pointers are cleared, the reference count of
// the plugin recorded in comm->netPluginIndex is decremented, and every
// external plugin is offered to ncclNetPluginUnload(), which only closes
// libraries whose reference count reached zero.
//
// Returns the first error reported while unloading, or ncclSuccess.
// netPluginLock is released on every path, including the error path.
ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;
  comm->ncclNet = nullptr;
  comm->ncclCollNet = nullptr;
  int pluginIndex = comm->netPluginIndex;

  pthread_mutex_lock(&netPluginLock);
  // Ignore an index that does not designate a table entry rather than
  // corrupting memory next to the table.
  if (pluginIndex >= 0 && pluginIndex < pluginCount) {
    netPluginLibs[pluginIndex].ncclNetPluginRefCount--;
  }
  for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) {
    ret = ncclNetPluginUnload(&netPluginLibs[i]);
    // Stop at the first failure, but fall through to the unlock below instead
    // of returning with the mutex held.
    if (ret != ncclSuccess) break;
  }
  pthread_mutex_unlock(&netPluginLock);
  return ret;
}
+36 -29
Fájl megtekintése
@@ -23,7 +23,7 @@ enum ncclPluginType {
static void *libHandles[NUM_LIBS];
static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" };
static const char *pluginPrefix[NUM_LIBS] = { "librccl-net", "librccl-tuner", "librccl-profiler" };
static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" };
static const char *pluginFallback[NUM_LIBS] = { "", "Using internal tuner plugin.", "" };
static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT };
static void* tryOpenLib(char* name, int* err, char* errStr) {
@@ -49,10 +49,9 @@ static void* tryOpenLib(char* name, int* err, char* errStr) {
return handle;
}
static void appendNameToList(char* nameList, int *nameListLen, char* name) {
snprintf(nameList, *nameListLen, " %s", name);
nameList += strlen(name) + 1;
*nameListLen -= strlen(name) + 1;
// Appends " <name>" to the list held in `nameList`, a buffer of PATH_MAX bytes.
//
// nameList  - start of the PATH_MAX-byte buffer.
// leftChars - in/out: number of bytes still unused at the end of the buffer
//             (PATH_MAX for an empty list). Reduced by the number of
//             characters actually stored.
// name      - NUL-terminated name to append.
//
// If the name does not fit it is truncated and the list stays NUL-terminated.
// *leftChars never drops below 1, so later calls become no-ops instead of
// writing past the end of the buffer.
static void appendNameToList(char* nameList, int *leftChars, char* name) {
  if (!nameList || !leftChars || !name) return;
  // Reject a counter that would place the write position outside the buffer.
  if (*leftChars <= 0 || *leftChars > PATH_MAX) return;

  int written = snprintf(nameList + PATH_MAX - *leftChars, (size_t)*leftChars, " %s", name);
  if (written < 0) return;
  // snprintf reports the length it wanted to write; on truncation only
  // (*leftChars - 1) characters plus the terminator were stored.
  if (written >= *leftChars) written = *leftChars - 1;
  *leftChars -= written;
}
static void* openPluginLib(enum ncclPluginType type, const char* libName) {
@@ -62,28 +61,31 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) {
char eNoEntNameList[PATH_MAX] = { 0 };
if (libName && strlen(libName)) {
snprintf(libName_, MAX_STR_LEN, "%s", libName);
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
if (libHandles[type]) {
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
return libHandles[type];
}
if (openErr == ENOENT) {
appendNameToList(eNoEntNameList, &len, libName_);
// match names that start with 'lib' and end with '.so'
if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) {
snprintf(libName_, MAX_STR_LEN, "%s", libName);
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
if (libHandles[type]) {
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
return libHandles[type];
}
if (openErr == ENOENT) {
appendNameToList(eNoEntNameList, &len, libName_);
} else {
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
}
} else {
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
}
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
if (libHandles[type]) {
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
return libHandles[type];
}
if (openErr == ENOENT) {
appendNameToList(eNoEntNameList, &len, libName_);
} else {
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
if (libHandles[type]) {
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
return libHandles[type];
}
if (openErr == ENOENT) {
appendNameToList(eNoEntNameList, &len, libName_);
} else {
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
}
}
} else {
snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]);
@@ -123,12 +125,17 @@ void* ncclGetNetPluginLib(void) {
}
// Closes a plugin library previously recorded in libHandles.
// As written, the first libHandles slot equal to `handle` is cleared, the
// handle is passed to dlclose() and ncclSuccess is returned; when no slot
// matches, ncclInternalError is returned.
// NOTE(review): this body appears to contain two variants of the same logic.
// The statements following the first `return ncclSuccess;` inside the loop and
// the final `return ncclSuccess;` are unreachable, and `found` is never read on
// a reachable path. The unreachable variant would close a non-null handle only
// once, clear every matching slot and always return ncclSuccess - confirm which
// behaviour is intended.
ncclResult_t ncclClosePluginLib(void* handle) {
bool found = false;
for (int l=0; l<NUM_LIBS; l++) {
if (libHandles[l] == handle) {
// Forget the handle before closing it so the slot never refers to a closed library.
libHandles[l] = nullptr;
dlclose(handle);
return ncclSuccess;
// Unreachable from here to the end of this if-block (see the note above).
if (!found) {
if (handle) {
dlclose(handle);
}
found = true;
}
}
}
// Reached only when no slot matched `handle`.
return ncclInternalError;
// Unreachable (see the note above).
return ncclSuccess;
}
+43 -48
Fájl megtekintése
@@ -19,6 +19,7 @@
extern ncclProfiler_t* getNcclProfiler_v1(void* lib);
extern ncclProfiler_t* getNcclProfiler_v2(void* lib);
extern ncclProfiler_t* getNcclProfiler_v3(void* lib);
extern ncclProfiler_t* getNcclProfiler_v4(void* lib);
static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER;
static int profilerPluginRefCount;
@@ -51,7 +52,10 @@ static ncclResult_t ncclProfilerPluginLoad(void) {
goto fail;
}
ncclProfiler = getNcclProfiler_v3(profilerPluginLib);
ncclProfiler = getNcclProfiler_v4(profilerPluginLib);
if (ncclProfiler == nullptr) {
ncclProfiler = getNcclProfiler_v3(profilerPluginLib);
}
if (ncclProfiler == nullptr) {
ncclProfiler = getNcclProfiler_v2(profilerPluginLib);
}
@@ -164,7 +168,7 @@ ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) {
TIME_START_EVENT(init);
ncclProfilerPluginLoad();
if (__builtin_expect(ncclProfiler != NULL, 0)) {
int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask);
int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask, comm->config.commName, comm->commHash, comm->nNodes, comm->nRanks, comm->rank, ncclDebugLog);
if (err) {
WARN("Profiler init failed with error (%d). Continue without profiler.", err);
ncclProfiler = NULL;
@@ -241,8 +245,6 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
eDescr.type = ncclProfileColl;
eDescr.parentObj = plan->groupEventHandle;
eDescr.rank = plan->comm->rank;
eDescr.coll.name = plan->comm->commName;
eDescr.coll.commHash = plan->comm->commHash;
eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func];
eDescr.coll.func = ncclFuncToString(ct->func);
eDescr.coll.sendBuff = ct->sendbuff;
@@ -250,7 +252,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
eDescr.coll.count = ct->count;
eDescr.coll.root = ct->root;
eDescr.coll.datatype = ncclDatatypeToString(ct->datatype);
eDescr.coll.nMaxChannels = ct->nMaxChannels;
eDescr.coll.nChannels = ct->nChannels;
eDescr.coll.nWarps = ct->nWarps;
eDescr.coll.algo = ncclAlgoToString(ct->algorithm);
eDescr.coll.proto = ncclProtoToString(ct->protocol);
@@ -266,7 +268,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
// gives the consistency.
if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle &&
(ct->eActivationMask & ncclProfileKernelCh)))
plan->comm->seqNumber[ct->func]++;
__atomic_fetch_add(&plan->comm->seqNumber[ct->func], 1, __ATOMIC_RELAXED);
ct = ct->next;
}
if (__builtin_expect(ncclProfiler != NULL, 0)) {
@@ -279,13 +281,12 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
eDescr.type = ncclProfileP2p;
eDescr.parentObj = plan->groupEventHandle;
eDescr.rank = plan->comm->rank;
eDescr.p2p.name = plan->comm->commName;
eDescr.p2p.commHash = plan->comm->commHash;
eDescr.p2p.func = ncclFuncToString(pt->func);
eDescr.p2p.buff = pt->buff;
eDescr.p2p.count = pt->count;
eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype);
eDescr.p2p.peer = pt->root;
eDescr.p2p.nChannels = pt->nChannels;
ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);
}
pt = pt->next;
@@ -321,7 +322,7 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
// made of sliceSteps steps rather than one step. In the profiler we are still
// interested in whole network transfers though, so we account for this when
// computing the actual network step number.
ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) {
ncclResult_t ncclProfilerStartProxyOpEvent(int s, struct ncclProxyArgs* args) {
TIME_START_EVENT(proxyOpStart);
struct ncclProxySubArgs* sub = &args->subs[s];
if (__builtin_expect(ncclProfiler != NULL, 0)) {
@@ -335,29 +336,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args
eDescr.proxyOp.peer = sub->peer;
eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
eDescr.proxyOp.isSend = 1;
ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
}
}
TIME_STOP_EVENT(proxyOpStart);
return ncclSuccess;
}
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) {
TIME_START_EVENT(proxyOpStart);
struct ncclProxySubArgs* sub = &args->subs[s];
if (__builtin_expect(ncclProfiler != NULL, 0)) {
if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) {
ncclProfilerEventDescr_t eDescr = { 0 };
eDescr.type = ncclProfileProxyOp;
eDescr.parentObj = sub->taskEventHandle;
eDescr.rank = sub->rank;
eDescr.proxyOp.pid = sub->pid;
eDescr.proxyOp.channelId = sub->channelId;
eDescr.proxyOp.peer = sub->peer;
eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
eDescr.proxyOp.isSend = 0;
eDescr.proxyOp.isSend = args->progress == ncclTransports[TRANSPORT_NET]->send.proxyProgress ? 1 : 0;
ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
}
}
@@ -387,7 +366,8 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar
eDescr.parentObj = sub->opEventHandle;
eDescr.rank = sub->rank;
eDescr.proxyStep.step = step_;
ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr);
ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr);
sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub;
}
}
TIME_STOP_EVENT(proxyStepStart);
@@ -405,7 +385,8 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar
eDescr.parentObj = sub->opEventHandle;
eDescr.rank = sub->rank;
eDescr.proxyStep.step = step_;
ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr);
ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr);
sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub;
}
}
TIME_STOP_EVENT(proxyStepStart);
@@ -417,9 +398,9 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int s, struct ncclProxyArgs* args, i
struct ncclProxySubArgs* sub = &args->subs[s];
if (__builtin_expect(ncclProfiler != NULL, 0)) {
int step_ = DIVUP(stepId, args->sliceSteps);
if (sub->stepEventHandles[step_%NCCL_STEPS]) {
ncclProfiler->stopEvent(sub->stepEventHandles[step_%NCCL_STEPS]);
sub->stepEventHandles[step_%NCCL_STEPS] = NULL;
if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) {
ncclProfiler->stopEvent(sub->pHandles[step_%NCCL_STEPS].stepEventHandle);
sub->pHandles[step_%NCCL_STEPS].stepEventHandle = NULL;
}
}
TIME_STOP_EVENT(proxyStepStop);
@@ -453,7 +434,7 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) {
return ncclSuccess;
}
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) {
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start) {
if (__builtin_expect(ncclProfiler != NULL, 0)) {
struct ncclProxySubArgs* sub = &args->subs[s];
if (sub->eActivationMask & ncclProfileKernelCh) {
@@ -461,29 +442,31 @@ ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) {
eDescr.type = ncclProfileKernelCh;
eDescr.parentObj = sub->taskEventHandle;
eDescr.kernelCh.channelId = sub->channelId;
eDescr.kernelCh.pTimer = start;
ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr);
}
}
return ncclSuccess;
}
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) {
// Stops the kernel-channel profiler event of sub-operation `s` of `args`.
//
// Nothing happens when no profiler plugin is active or when the sub-operation
// has no kernel event handle. Otherwise the `stop` timer value is recorded
// with the ncclProfilerKernelChStop state and the event is then stopped.
// Always returns ncclSuccess.
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop) {
  // The profiler is expected to be absent in the common case.
  if (__builtin_expect(ncclProfiler == NULL, 1)) return ncclSuccess;

  struct ncclProxySubArgs* subArgs = &args->subs[s];
  if (!subArgs->kernelEventHandle) return ncclSuccess;

  ncclProfilerEventStateArgs_t stateArgs = { };
  stateArgs.kernelCh.pTimer = stop;
  ncclProfiler->recordEventState(subArgs->kernelEventHandle, ncclProfilerKernelChStop, &stateArgs);
  ncclProfiler->stopEvent(subArgs->kernelEventHandle);
  return ncclSuccess;
}
ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) {
ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, ncclProfilerEventState_t eState) {
TIME_START_EVENT(proxyOpRecord);
struct ncclProxySubArgs* sub = &args->subs[s];
if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
ncclProfilerEventStateArgs_t a = { };
a.proxyOp.steps = DIVUP(steps, args->sliceSteps);
a.proxyOp.transSize = transSize;
ncclProfiler->recordEventState(sub->opEventHandle, eState, &a);
}
TIME_STOP_EVENT(proxyOpRecord);
@@ -495,8 +478,10 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs*
struct ncclProxySubArgs* sub = &args->subs[s];
if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
int step_ = DIVUP(stepId, args->sliceSteps);
if (sub->stepEventHandles[step_%NCCL_STEPS]) {
ncclProfiler->recordEventState(sub->stepEventHandles[step_%NCCL_STEPS], eState, 0);
if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) {
ncclProfilerEventStateArgs_t a = { };
a.proxyStep.transSize = sub->transSize;
ncclProfiler->recordEventState(sub->pHandles[step_%NCCL_STEPS].stepEventHandle, eState, &a);
}
}
TIME_STOP_EVENT(proxyStepRecord);
@@ -549,18 +534,28 @@ bool ncclProfilerPluginLoaded(void) {
ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) {
if (__builtin_expect(ncclProfiler != NULL, 0)) {
struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle;
if (type == 0) { // start
if (type == ncclProfilerNetEventStart) { // start
struct ncclProxyEventHandle* p = (struct ncclProxyEventHandle*)pHandle;
struct ncclProxySubArgs* sub = p->subArgPtr;
if (sub->eActivationMask & ncclProfileNetPlugin) {
ncclProfilerEventDescr_t eDescr = { 0 };
eDescr.type = ncclProfileNetPlugin;
eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS];
eDescr.parentObj = p->stepEventHandle;
eDescr.rank = sub->rank;
eDescr.netPlugin.id = pluginId;
eDescr.netPlugin.data = extData;
ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr);
}
} else { // stop
} else if (type == ncclProfilerNetEventStop) { // stop
ncclProfiler->stopEvent(*eHandle);
} else if (type == ncclProfilerNetEventUpdate) { // update
ncclProfilerEventStateArgs_t args = { };
args.netPlugin.data = extData;
ncclProfiler->recordEventState(*eHandle, ncclProfilerNetPluginUpdate, &args);
} else { // update and stop
ncclProfilerEventStateArgs_t args = { };
args.netPlugin.data = extData;
ncclProfiler->recordEventState(*eHandle, ncclProfilerNetPluginUpdate, &args);
ncclProfiler->stopEvent(*eHandle);
}
}
+27 -13
Fájl megtekintése
@@ -53,6 +53,7 @@ static uint8_t ncclStringToDatatype(const char* dt) {
}
static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
*eHandle = NULL;
ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 };
eDescr_v1.type = eDescr->type;
eDescr_v1.parentObj = eDescr->parentObj;
@@ -60,8 +61,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
switch(eDescr->type) {
case ncclProfileGroup: break;
case ncclProfileColl: {
eDescr_v1.coll.name = eDescr->coll.name;
eDescr_v1.coll.commHash = eDescr->coll.commHash;
eDescr_v1.coll.name = nullptr; // removed in v4
eDescr_v1.coll.commHash = 0; // removed in v4
eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func);
eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
@@ -71,14 +72,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype);
eDescr_v1.coll.op = 0; // removed in v2
eDescr_v1.coll.trafficBytes = 0; // removed in v3
eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
eDescr_v1.coll.nMaxChannels = eDescr->coll.nChannels;
eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo);
eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto);
} break;
case ncclProfileP2p: {
eDescr_v1.p2p.name = eDescr->p2p.name;
eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
eDescr_v1.p2p.name = nullptr; // removed in v4
eDescr_v1.p2p.commHash = 0; // removed in v4
eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func);
eDescr_v1.p2p.buff = eDescr->p2p.buff;
eDescr_v1.p2p.count = eDescr->p2p.count;
@@ -97,21 +98,34 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
eDescr_v1.proxyStep.step = eDescr->proxyStep.step;
} break;
case ncclProfileProxyCtrl: break;
case ncclProfileKernelCh:
case ncclProfileNetPlugin: {
*eHandle = NULL;
return ncclSuccess;
}
default:;
default: return ncclSuccess;
}
return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1);
}
// Forwards an event state transition to the v1 profiler plugin, translating
// the current-version argument struct into a v1 argument struct.
//
//   eHandle    - plugin event handle; passed through unchanged.
//   eState     - state being recorded. Only the proxy-ctrl states and the
//                proxy-step wait states listed in the switch are forwarded;
//                any other state is dropped.
//   eStateArgs - current-version arguments. Only proxyCtrl.appendedProxyOps
//                is read, and only for proxy-ctrl states.
//
// Returns the plugin's result, or ncclSuccess when the state is not forwarded.
static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
  // Build a zero-initialized v1 argument struct rather than casting
  // eStateArgs: the current and v1 argument structs are distinct types, so
  // only the fields copied explicitly below reach the plugin.
  ncclProfilerEventStateArgs_v1_t args = { };
  switch (eState) {
    case ncclProfilerProxyCtrlIdle:
    case ncclProfilerProxyCtrlActive:
    case ncclProfilerProxyCtrlSleep:
    case ncclProfilerProxyCtrlWakeup:
    case ncclProfilerProxyCtrlAppend:
    case ncclProfilerProxyCtrlAppendEnd:
      args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps;
      break;
    case ncclProfilerProxyStepSendGPUWait:
    case ncclProfilerProxyStepSendWait:
    case ncclProfilerProxyStepRecvWait:
    case ncclProfilerProxyStepRecvFlushWait:
    case ncclProfilerProxyStepRecvGPUWait:
      // Forwarded with all-zero arguments; nothing is copied for these states.
      break;
    default:
      // States outside the two groups above are not forwarded to the plugin.
      return ncclSuccess;
  }
  return ncclProfiler_v1->recordEventState(eHandle, eState, &args);
}
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask));
ncclProfiler.startEvent = ncclProfiler_startEvent;
ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent;
+25 -7
Fájl megtekintése
@@ -20,8 +20,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
switch(eDescr->type) {
case ncclProfileGroup: break;
case ncclProfileColl: {
eDescr_v2.coll.name = eDescr->coll.name;
eDescr_v2.coll.commHash = eDescr->coll.commHash;
eDescr_v2.coll.name = nullptr; // removed in v4
eDescr_v2.coll.commHash = 0; // removed in v4
eDescr_v2.coll.seqNumber = eDescr->coll.seqNumber;
eDescr_v2.coll.func = eDescr->coll.func;
eDescr_v2.coll.sendBuff = eDescr->coll.sendBuff;
@@ -30,14 +30,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
eDescr_v2.coll.root = eDescr->coll.root;
eDescr_v2.coll.datatype = eDescr->coll.datatype;
eDescr_v2.coll.trafficBytes = 0; // removed in v3
eDescr_v2.coll.nMaxChannels = eDescr->coll.nMaxChannels;
eDescr_v2.coll.nMaxChannels = eDescr->coll.nChannels;
eDescr_v2.coll.nWarps = eDescr->coll.nWarps;
eDescr_v2.coll.algo = eDescr->coll.algo;
eDescr_v2.coll.proto = eDescr->coll.proto;
} break;
case ncclProfileP2p: {
eDescr_v2.p2p.name = eDescr->p2p.name;
eDescr_v2.p2p.commHash = eDescr->p2p.commHash;
eDescr_v2.p2p.name = nullptr; // removed in v4
eDescr_v2.p2p.commHash = 0; // removed in v4
eDescr_v2.p2p.func = eDescr->p2p.func;
eDescr_v2.p2p.buff = eDescr->p2p.buff;
eDescr_v2.p2p.count = eDescr->p2p.count;
@@ -62,10 +62,28 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
}
// Forwards an event state transition to the v2 profiler plugin, translating
// the current-version argument struct into a v2 argument struct.
//
//   eHandle    - plugin event handle; passed through unchanged.
//   eState     - state being recorded. Only the proxy-ctrl states and the
//                proxy-step wait states listed in the switch are forwarded;
//                any other state is dropped.
//   eStateArgs - current-version arguments. Only proxyCtrl.appendedProxyOps
//                is read, and only for proxy-ctrl states.
//
// Returns the plugin's result, or ncclSuccess when the state is not forwarded.
static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
  // Build a zero-initialized v2 argument struct rather than casting
  // eStateArgs: the current and v2 argument structs are distinct types, so
  // only the fields copied explicitly below reach the plugin.
  ncclProfilerEventStateArgs_v2_t args = { };
  switch (eState) {
    case ncclProfilerProxyCtrlIdle:
    case ncclProfilerProxyCtrlActive:
    case ncclProfilerProxyCtrlSleep:
    case ncclProfilerProxyCtrlWakeup:
    case ncclProfilerProxyCtrlAppend:
    case ncclProfilerProxyCtrlAppendEnd:
      args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps;
      break;
    case ncclProfilerProxyStepSendGPUWait:
    case ncclProfilerProxyStepSendWait:
    case ncclProfilerProxyStepRecvWait:
    case ncclProfilerProxyStepRecvFlushWait:
    case ncclProfilerProxyStepRecvGPUWait:
      // Forwarded with all-zero arguments; nothing is copied for these states.
      break;
    default:
      // States outside the two groups above are not forwarded to the plugin.
      return ncclSuccess;
  }
  return ncclProfiler_v2->recordEventState(eHandle, eState, &args);
}
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask));
ncclProfiler.startEvent = ncclProfiler_startEvent;
ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent;
+92 -1
Fájl megtekintése
@@ -6,14 +6,105 @@
#include "comm.h"
#include "nccl_profiler.h"
#include "checks.h"
static ncclProfiler_t ncclProfiler;
static ncclProfiler_v3_t* ncclProfiler_v3;
// Starts an event on the v3 profiler plugin by converting the current-version
// event descriptor into a v3 descriptor.
//
//   context - plugin context; passed through unchanged.
//   eHandle - out parameter; reset to nullptr before anything else, then
//             handed to the plugin's startEvent.
//   eDescr  - current-version descriptor to convert.
//
// Returns the plugin's result, or ncclSuccess without calling the plugin when
// the event type is not one of the types handled below.
static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
  *eHandle = nullptr;

  // Fields shared by every event type.
  ncclProfilerEventDescr_v3_t v3Descr = { };
  v3Descr.rank = eDescr->rank;
  v3Descr.parentObj = eDescr->parentObj;
  v3Descr.type = eDescr->type;

  // Type-specific payload.
  switch (eDescr->type) {
    case ncclProfileGroup:
    case ncclProfileProxyCtrl:
      // Only the shared fields are forwarded for these types.
      break;

    case ncclProfileColl:
      v3Descr.coll.name = nullptr; // removed in v4
      v3Descr.coll.commHash = 0; // removed in v4
      v3Descr.coll.seqNumber = eDescr->coll.seqNumber;
      v3Descr.coll.func = eDescr->coll.func;
      v3Descr.coll.sendBuff = eDescr->coll.sendBuff;
      v3Descr.coll.recvBuff = eDescr->coll.recvBuff;
      v3Descr.coll.count = eDescr->coll.count;
      v3Descr.coll.root = eDescr->coll.root;
      v3Descr.coll.datatype = eDescr->coll.datatype;
      // The v3 field nMaxChannels is filled from the current nChannels field.
      v3Descr.coll.nMaxChannels = eDescr->coll.nChannels;
      v3Descr.coll.nWarps = eDescr->coll.nWarps;
      v3Descr.coll.algo = eDescr->coll.algo;
      v3Descr.coll.proto = eDescr->coll.proto;
      break;

    case ncclProfileP2p:
      v3Descr.p2p.name = nullptr; // removed in v4
      v3Descr.p2p.commHash = 0; // removed in v4
      v3Descr.p2p.func = eDescr->p2p.func;
      v3Descr.p2p.buff = eDescr->p2p.buff;
      v3Descr.p2p.count = eDescr->p2p.count;
      v3Descr.p2p.datatype = eDescr->p2p.datatype;
      v3Descr.p2p.peer = eDescr->p2p.peer;
      break;

    case ncclProfileProxyOp:
      v3Descr.proxyOp.pid = eDescr->proxyOp.pid;
      v3Descr.proxyOp.channelId = eDescr->proxyOp.channelId;
      v3Descr.proxyOp.peer = eDescr->proxyOp.peer;
      v3Descr.proxyOp.nSteps = eDescr->proxyOp.nSteps;
      v3Descr.proxyOp.chunkSize = eDescr->proxyOp.chunkSize;
      v3Descr.proxyOp.isSend = eDescr->proxyOp.isSend;
      break;

    case ncclProfileProxyStep:
      v3Descr.proxyStep.step = eDescr->proxyStep.step;
      break;

    case ncclProfileKernelCh:
      v3Descr.kernelCh.channelId = eDescr->kernelCh.channelId;
      break;

    case ncclProfileNetPlugin:
      v3Descr.netPlugin.id = eDescr->netPlugin.id;
      v3Descr.netPlugin.data = eDescr->netPlugin.data;
      break;

    default:
      // Any other event type is not forwarded; *eHandle stays nullptr.
      return ncclSuccess;
  }

  return ncclProfiler_v3->startEvent(context, eHandle, &v3Descr);
}
// Forwards an event state transition to the v3 profiler plugin, translating
// the current-version argument struct into a v3 argument struct.
//
//   eHandle    - plugin event handle; passed through unchanged.
//   eState     - state being recorded; states not listed in the switch are
//                dropped.
//   eStateArgs - current-version arguments. Only proxyCtrl.appendedProxyOps
//                is read, and only for proxy-ctrl states.
//
// Returns the plugin's result, or ncclSuccess when the state is not forwarded.
static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
  ncclProfilerEventStateArgs_v3_t v3Args = { };

  switch (eState) {
    // Proxy-step wait states: forwarded with all-zero arguments.
    case ncclProfilerProxyStepSendGPUWait:
    case ncclProfilerProxyStepSendWait:
    case ncclProfilerProxyStepRecvWait:
    case ncclProfilerProxyStepRecvFlushWait:
    case ncclProfilerProxyStepRecvGPUWait:
      break;

    // Proxy-ctrl states: carry over the appended-ops count.
    case ncclProfilerProxyCtrlIdle:
    case ncclProfilerProxyCtrlActive:
    case ncclProfilerProxyCtrlSleep:
    case ncclProfilerProxyCtrlWakeup:
    case ncclProfilerProxyCtrlAppend:
    case ncclProfilerProxyCtrlAppendEnd:
      v3Args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps;
      break;

    // Everything else is not forwarded to the plugin.
    default:
      return ncclSuccess;
  }

  return ncclProfiler_v3->recordEventState(eHandle, eState, &v3Args);
}
// Initializes the v3 plugin and then fills in the remaining entry points of
// the file-local ncclProfiler wrapper.
//
// Only context and eActivationMask are passed to the v3 init; commName,
// commHash, nNodes, nranks, rank and logfn are accepted but not used here.
// If the v3 init fails, NCCLCHECK returns before any entry point is assigned.
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclProfiler_v3->init(context, eActivationMask));

  // Entry points that go through the translation functions in this file.
  ncclProfiler.startEvent = ncclProfiler_startEvent;
  ncclProfiler.recordEventState = ncclProfiler_recordEventState;

  // Entry points taken directly from the plugin.
  ncclProfiler.stopEvent = ncclProfiler_v3->stopEvent;
  ncclProfiler.finalize = ncclProfiler_v3->finalize;

  return ncclSuccess;
}
ncclProfiler_t* getNcclProfiler_v3(void* lib) {
ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3");
if (ncclProfiler_v3) {
ncclProfiler.name = ncclProfiler_v3->name;
ncclProfiler.init = ncclProfiler_init;
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name);
return ncclProfiler_v3;
return &ncclProfiler;
}
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3");
return NULL;
@@ -0,0 +1,21 @@
/*************************************************************************
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include "comm.h"
#include "nccl_profiler.h"
#include "checks.h"
static ncclProfiler_v4_t* ncclProfiler_v4;
// Looks up the "ncclProfiler_v4" symbol in the given plugin library.
//
//   lib - library handle passed to dlsym.
//
// Stores the lookup result in the file-local ncclProfiler_v4 pointer and
// returns it, or returns NULL when the symbol is not found. Either outcome is
// logged via INFO.
ncclProfiler_t* getNcclProfiler_v4(void* lib) {
  ncclProfiler_v4 = (ncclProfiler_v4_t*)dlsym(lib, "ncclProfiler_v4");

  if (!ncclProfiler_v4) {
    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v4");
    return NULL;
  }

  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v4->name);
  return ncclProfiler_v4;
}