Merge remote-tracking branch 'nccl/master' into develop
This commit is contained in:
+219
-171
@@ -8,6 +8,7 @@
|
||||
#include "bootstrap.h"
|
||||
#include "checks.h"
|
||||
#include "plugin.h"
|
||||
#include "nccl_net.h"
|
||||
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
@@ -15,137 +16,100 @@
|
||||
//#include <sys/stat.h>
|
||||
//#include <unistd.h>
|
||||
|
||||
extern ncclNet_t* getNcclNet_v6(void* netPluginLib);
|
||||
extern ncclNet_t* getNcclNet_v7(void* netPluginLib);
|
||||
extern ncclNet_t* getNcclNet_v8(void* netPluginLib);
|
||||
extern ncclNet_t* getNcclNet_v9(void* netPluginLib);
|
||||
extern ncclNet_t* getNcclNet_v10(void* netPluginLib);
|
||||
typedef ncclNet_t* getNcclNet_t(void* netPluginLib);
|
||||
typedef ncclCollNet_t* getNcclCollNet_t(void* netPluginLib);
|
||||
|
||||
extern ncclCollNet_t* getNcclCollNet_v6(void* netPluginLib);
|
||||
extern ncclCollNet_t* getNcclCollNet_v7(void* netPluginLib);
|
||||
extern ncclCollNet_t* getNcclCollNet_v8(void* netPluginLib);
|
||||
extern ncclCollNet_t* getNcclCollNet_v9(void* netPluginLib);
|
||||
extern ncclCollNet_t* getNcclCollNet_v10(void* netPluginLib);
|
||||
|
||||
static pthread_mutex_t netLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
ncclNet_t* ncclNets[NCCL_NET_MAX_PLUGINS] = { nullptr, &ncclNetIb, &ncclNetSocket };
|
||||
static int ncclNetsVer[NCCL_NET_MAX_PLUGINS] = { -1, 10, 10 };
|
||||
ncclCollNet_t* ncclCollNets[NCCL_NET_MAX_PLUGINS] = { nullptr, nullptr, nullptr };
|
||||
enum ncclNetState {
|
||||
ncclNetStateInit = 0,
|
||||
ncclNetStateEnabled = 1,
|
||||
ncclNetStateDisabled = 2
|
||||
};
|
||||
enum ncclNetState ncclNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
enum ncclNetState ncclCollNetStates[NCCL_NET_MAX_PLUGINS] = { ncclNetStateInit, ncclNetStateInit, ncclNetStateInit };
|
||||
extern getNcclNet_t getNcclNet_v6;
|
||||
extern getNcclNet_t getNcclNet_v7;
|
||||
extern getNcclNet_t getNcclNet_v8;
|
||||
extern getNcclNet_t getNcclNet_v9;
|
||||
extern getNcclNet_t getNcclNet_v10;
|
||||
extern getNcclCollNet_t getNcclCollNet_v6;
|
||||
extern getNcclCollNet_t getNcclCollNet_v7;
|
||||
extern getNcclCollNet_t getNcclCollNet_v8;
|
||||
extern getNcclCollNet_t getNcclCollNet_v9;
|
||||
extern getNcclCollNet_t getNcclCollNet_v10;
|
||||
|
||||
NCCL_PARAM(NetPluginRefCount, "NET_PLUGIN_REF_COUNT", 1);
|
||||
#define NCCL_NET_VERSION_COUNT 5
|
||||
int ncclNetVersion[NCCL_NET_VERSION_COUNT] = {10, 9, 8, 7, 6};
|
||||
getNcclNet_t* getNcclNet[NCCL_NET_VERSION_COUNT] = {getNcclNet_v10, getNcclNet_v9, getNcclNet_v8, getNcclNet_v7, getNcclNet_v6};
|
||||
getNcclCollNet_t* getNcclCollNet[NCCL_NET_VERSION_COUNT] = {getNcclCollNet_v10, getNcclCollNet_v9, getNcclCollNet_v8, getNcclCollNet_v7, getNcclCollNet_v6};
|
||||
|
||||
#define NCCL_NET_NUM_INTERNAL_PLUGINS 2
|
||||
|
||||
typedef enum ncclNetPluginState {
|
||||
ncclNetPluginStateDisabled = -2, // Plugin library failed to initialize
|
||||
ncclNetPluginStateLoadFailed = -1, // Plugin library failed to load
|
||||
ncclNetPluginStateLoadReady = 0, // Plugin library is ready to be loaded
|
||||
ncclNetPluginStateInitReady = 1, // Plugin library is loaded and ready to be initialized
|
||||
ncclNetPluginStateEnabled = 2, // Plugin library is loaded and initialized
|
||||
} ncclNetPluginState_t;
|
||||
|
||||
#define MAX_STR_LEN 255
|
||||
typedef struct netPluginLib {
|
||||
char name[MAX_STR_LEN]; // Name of the plugin library
|
||||
void* dlHandle; // Handle to the plugin library
|
||||
ncclNet_t* ncclNet; // Pointer to the ncclNet_t structure
|
||||
int ncclNetVer; // Version of the nccl net plugin
|
||||
ncclCollNet_t* ncclCollNet; // Pointer to the ncclCollNet_t structure
|
||||
ncclNetPluginState_t ncclNetPluginState; // State of the nccl net plugin
|
||||
ncclNetPluginState_t ncclCollNetPluginState; // State of the nccl coll net plugin
|
||||
int ncclNetPluginRefCount; // Reference count for the nccl net plugin
|
||||
} netPluginLib_t;
|
||||
|
||||
int pluginCount = 0;
|
||||
bool netPluginLibsInitialized = false;
|
||||
netPluginLib_t netPluginLibs[NCCL_NET_MAX_PLUGINS] = { 0 };
|
||||
static pthread_mutex_t netPluginLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static void* netPluginLib;
|
||||
static pthread_once_t initPluginLibsOnceControl = PTHREAD_ONCE_INIT;
|
||||
|
||||
static int netPluginRefCount;
|
||||
static void initNetPluginRefCountOnce(void) { netPluginRefCount = ncclParamNetPluginRefCount();}
|
||||
|
||||
enum {
|
||||
netPluginLoadFailed = -1,
|
||||
netPluginLoadReady = 0,
|
||||
netPluginLoadSuccess = 1,
|
||||
};
|
||||
|
||||
static int netPluginStatus = netPluginLoadReady;
|
||||
|
||||
ncclResult_t ncclNetPluginLoad(struct ncclComm* comm) {
|
||||
static pthread_once_t netPluginRefCountOnce = PTHREAD_ONCE_INIT;
|
||||
pthread_once(&netPluginRefCountOnce, initNetPluginRefCountOnce);
|
||||
|
||||
pthread_mutex_lock(&netPluginLock);
|
||||
if (netPluginLoadFailed == netPluginStatus) {
|
||||
goto exit;
|
||||
static ncclResult_t ncclNetPluginUnload(netPluginLib_t* pluginLib) {
|
||||
if ((pluginLib->dlHandle) && ((pluginLib->ncclNetPluginRefCount) == 0)) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "Unloading plugin %s", pluginLib->name);
|
||||
NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle));
|
||||
memset(pluginLib, 0, sizeof(netPluginLib_t));
|
||||
}
|
||||
if (netPluginLoadSuccess == netPluginStatus) {
|
||||
++netPluginRefCount;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
netPluginLib = ncclOpenNetPluginLib(ncclGetEnv("NCCL_NET_PLUGIN"));
|
||||
if (netPluginLib == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
ncclNets[0] = getNcclNet_v10(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 10;
|
||||
if (ncclNets[0] == nullptr) {
|
||||
// Try v9 plugin
|
||||
ncclNets[0] = getNcclNet_v9(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 9;
|
||||
}
|
||||
if (ncclNets[0] == nullptr) {
|
||||
// Try v8 plugin
|
||||
ncclNets[0] = getNcclNet_v8(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 8;
|
||||
}
|
||||
if (ncclNets[0] == nullptr) {
|
||||
// Try v7 plugin
|
||||
ncclNets[0] = getNcclNet_v7(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 7;
|
||||
}
|
||||
if (ncclNets[0] == nullptr) {
|
||||
// Try v6 plugin
|
||||
ncclNets[0] = getNcclNet_v6(netPluginLib);
|
||||
if (ncclNets[0]) ncclNetsVer[0] = 6;
|
||||
}
|
||||
if (ncclNets[0] == nullptr) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
// Check for CollNet
|
||||
ncclCollNets[0] = getNcclCollNet_v10(netPluginLib);
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
ncclCollNets[0] = getNcclCollNet_v9(netPluginLib);
|
||||
}
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
ncclCollNets[0] = getNcclCollNet_v8(netPluginLib);
|
||||
}
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
ncclCollNets[0] = getNcclCollNet_v7(netPluginLib);
|
||||
}
|
||||
if (ncclCollNets[0] == nullptr) {
|
||||
ncclCollNets[0] = getNcclCollNet_v6(netPluginLib);
|
||||
}
|
||||
|
||||
++netPluginRefCount;
|
||||
netPluginStatus = netPluginLoadSuccess;
|
||||
comm->netPluginLoaded = 1;
|
||||
|
||||
exit:
|
||||
pthread_mutex_unlock(&netPluginLock);
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
if (netPluginLib) NCCLCHECK(ncclClosePluginLib(netPluginLib));
|
||||
netPluginStatus = netPluginLoadFailed;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetPluginUnload(struct ncclComm* comm) {
|
||||
pthread_mutex_lock(&netPluginLock);
|
||||
if (comm->netPluginLoaded && 0 == (--netPluginRefCount)) {
|
||||
if (ncclNets[0]) {
|
||||
INFO(NCCL_NET, "NET/Plugin: Closing net plugin '%s'", ncclNets[0]->name);
|
||||
}
|
||||
if (ncclCollNets[0]) {
|
||||
INFO(NCCL_NET, "NET/Plugin: Closing collnet plugin '%s'", ncclCollNets[0]->name);
|
||||
}
|
||||
NCCLCHECK(ncclClosePluginLib(netPluginLib));
|
||||
netPluginLib = nullptr;
|
||||
ncclNets[0] = nullptr;
|
||||
ncclCollNets[0] = nullptr;
|
||||
netPluginStatus = netPluginLoadReady;
|
||||
comm->netPluginLoaded = 0;
|
||||
for (int i = 0; i < NCCL_NET_MAX_PLUGINS; ++i)
|
||||
ncclCollNetStates[i] = ncclNetStates[i] = ncclNetStateInit;
|
||||
static ncclResult_t ncclNetPluginLoad(netPluginLib_t* pluginLib) {
|
||||
pluginLib->dlHandle = ncclOpenNetPluginLib(pluginLib->name);
|
||||
|
||||
if (pluginLib->dlHandle == nullptr) goto fail;
|
||||
// load ncclNet
|
||||
for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) {
|
||||
pluginLib->ncclNetVer = ncclNetVersion[i];
|
||||
pluginLib->ncclNet = getNcclNet[i](pluginLib->dlHandle);
|
||||
if (pluginLib->ncclNet) break;
|
||||
}
|
||||
pthread_mutex_unlock(&netPluginLock);
|
||||
|
||||
// if we fail to find a net, exit
|
||||
if (pluginLib->ncclNet == nullptr) goto fail;
|
||||
|
||||
pluginLib->ncclNetPluginState = ncclNetPluginStateInitReady;
|
||||
|
||||
// load ncclColNet
|
||||
for (int i = 0; i < NCCL_NET_VERSION_COUNT; i++) {
|
||||
pluginLib->ncclCollNet = getNcclCollNet[i](pluginLib->dlHandle);
|
||||
if (pluginLib->ncclCollNet) break;
|
||||
}
|
||||
|
||||
if (pluginLib->ncclCollNet == nullptr)
|
||||
pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed;
|
||||
else
|
||||
pluginLib->ncclCollNetPluginState = ncclNetPluginStateInitReady;
|
||||
|
||||
INFO(NCCL_INIT|NCCL_NET, "Successfully loaded external plugin %s", pluginLib->name);
|
||||
exit:
|
||||
return ncclSuccess;
|
||||
fail:
|
||||
if (pluginLib->dlHandle) {
|
||||
NCCLCHECK(ncclClosePluginLib(pluginLib->dlHandle));
|
||||
}
|
||||
pluginLib->ncclNetPluginState = ncclNetPluginStateLoadFailed;
|
||||
pluginLib->ncclCollNetPluginState = ncclNetPluginStateLoadFailed;
|
||||
goto exit;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, int dev) {
|
||||
@@ -172,72 +136,156 @@ ncclResult_t ncclNetCheckDeviceVersion(struct ncclComm* comm, ncclNet_t* net, in
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t netGetState(int i, enum ncclNetState* state) {
|
||||
pthread_mutex_lock(&netLock);
|
||||
if (ncclNetStates[i] == ncclNetStateInit) {
|
||||
int ndev;
|
||||
if (ncclNets[i]->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) ncclNetStates[i] = ncclNetStateDisabled;
|
||||
else if (ncclNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclNetStates[i] = ncclNetStateDisabled;
|
||||
else ncclNetStates[i] = ncclNetStateEnabled;
|
||||
// Initialize a loaded plugin entry: bring up its NET interface and, when one
// was found at load time, its CollNet interface.
//
// pluginLib: plugin table entry to initialize.
//
// Always returns ncclSuccess; the outcome is reported through the entry's
// state fields:
//   - entry is not in ncclNetPluginStateInitReady: left untouched.
//   - NET pointer missing, or NET init()/devices() fails or reports no device:
//     both NET and CollNet states become ncclNetPluginStateDisabled.
//   - NET comes up: NET state becomes ncclNetPluginStateEnabled, and CollNet
//     (if InitReady) becomes Enabled or Disabled based on its own
//     init()/devices() result.
static ncclResult_t ncclNetPluginInit(netPluginLib_t* pluginLib) {
  int ndev;

  // Never promote an entry that is not waiting for initialization. Falling
  // through here used to mark Disabled/LoadFailed entries as Enabled and then
  // dereference ncclNet for the log message.
  if (pluginLib->ncclNetPluginState != ncclNetPluginStateInitReady) return ncclSuccess;
  if (pluginLib->ncclNet == nullptr) goto fail;

  if (pluginLib->ncclNet->init(ncclDebugLog, ncclProfilerCallback) != ncclSuccess) goto fail;
  if (pluginLib->ncclNet->devices(&ndev) != ncclSuccess || ndev <= 0) goto fail;

  pluginLib->ncclNetPluginState = ncclNetPluginStateEnabled;
  INFO(NCCL_INIT|NCCL_NET, "Initialized NET plugin %s", pluginLib->ncclNet->name);

  // A CollNet failure only disables CollNet; the NET side stays enabled.
  if (pluginLib->ncclCollNetPluginState == ncclNetPluginStateInitReady && pluginLib->ncclCollNet) {
    if (pluginLib->ncclCollNet->init(ncclDebugLog) != ncclSuccess) {
      pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
    } else if (pluginLib->ncclCollNet->devices(&ndev) != ncclSuccess || ndev <= 0) {
      pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
    } else {
      pluginLib->ncclCollNetPluginState = ncclNetPluginStateEnabled;
    }
  }
exit:
  return ncclSuccess;
fail:
  pluginLib->ncclNetPluginState = ncclNetPluginStateDisabled;
  pluginLib->ncclCollNetPluginState = ncclNetPluginStateDisabled;
  goto exit;
}
|
||||
|
||||
// Try to bind the plugin at netPluginLibs[pluginIndex] to a communicator.
//
// comm:        communicator receiving the NET (and optionally CollNet) plugin.
// pluginIndex: index into netPluginLibs; the entry's ncclNet pointer is
//              dereferenced, so it must be non-null.
// isAssigned:  out parameter, set to true only when the plugin was bound.
//
// The plugin is rejected when the communicator requests a different network
// name (case-insensitive compare) or when ncclNetCheckDeviceVersion() does not
// return ncclSuccess. On success the plugin's reference count is incremented
// and comm->netPluginIndex records which entry was used.
//
// Always returns ncclSuccess; callers inspect *isAssigned.
static ncclResult_t ncclNetPluginAssignToComm(struct ncclComm* comm, int pluginIndex, bool* isAssigned) {
  const char* netName = comm->config.netName;
  *isAssigned = false;

  if (netName && strcasecmp(netName, netPluginLibs[pluginIndex].ncclNet->name) != 0) goto fail;
  if (ncclSuccess != ncclNetCheckDeviceVersion(comm, netPluginLibs[pluginIndex].ncclNet, 0)) goto fail;

  if (netPluginLibs[pluginIndex].ncclNetPluginState >= ncclNetPluginStateEnabled) {
    comm->ncclNet = netPluginLibs[pluginIndex].ncclNet;
    comm->ncclNetVer = netPluginLibs[pluginIndex].ncclNetVer;
    comm->netPluginIndex = pluginIndex;
    netPluginLibs[pluginIndex].ncclNetPluginRefCount++;
    *isAssigned = true;
    INFO(NCCL_INIT|NCCL_NET, "Assigned NET plugin %s to comm", netPluginLibs[pluginIndex].ncclNet->name);
    if (netPluginLibs[pluginIndex].ncclCollNetPluginState >= ncclNetPluginStateEnabled) {
      comm->ncclCollNet = netPluginLibs[pluginIndex].ncclCollNet;
    }
  }
exit:
  return ncclSuccess;
fail:
  // A mismatch for this communicator says nothing about the plugin's health,
  // so the plugin states are left exactly as they were. Forcing them to
  // Enabled here would promote a CollNet that failed to load or initialize,
  // and a later communicator could then be handed that CollNet.
  *isAssigned = false;
  goto exit;
}
|
||||
|
||||
static ncclResult_t ncclNetPluginDisableOtherExternal(int pluginIndex) {
|
||||
// Only if an external plugin is enabled, disable other external plugins
|
||||
if (pluginIndex >= (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) return ncclSuccess;
|
||||
char names[MAX_STR_LEN*(NCCL_NET_MAX_PLUGINS - NCCL_NET_NUM_INTERNAL_PLUGINS)] = { 0 };
|
||||
for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) {
|
||||
if (i != pluginIndex) {
|
||||
// Append all disabled plugin names to a string
|
||||
snprintf(names+strlen(names), sizeof(names)-strlen(names), (strlen(names) == 0) ? "%s" : ", %s", netPluginLibs[i].name);
|
||||
netPluginLibs[i].ncclNetPluginState = ncclNetPluginStateDisabled;
|
||||
}
|
||||
}
|
||||
if(strlen(names) > 0) {
|
||||
INFO(NCCL_INIT|NCCL_NET, "Disabling external plugins: %s", names);
|
||||
}
|
||||
*state = ncclNetStates[i];
|
||||
pthread_mutex_unlock(&netLock);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
static ncclResult_t collNetGetState(int i, enum ncclNetState* state) {
|
||||
pthread_mutex_lock(&netLock);
|
||||
if (ncclCollNetStates[i] == ncclNetStateInit) {
|
||||
int ndev;
|
||||
if (ncclCollNets[i]->init(ncclDebugLog) != ncclSuccess) ncclCollNetStates[i] = ncclNetStateDisabled;
|
||||
else if (ncclCollNets[i]->devices(&ndev) != ncclSuccess || ndev <= 0) ncclCollNetStates[i] = ncclNetStateDisabled;
|
||||
else ncclCollNetStates[i] = ncclNetStateEnabled;
|
||||
static void initPluginLibsOnceFunc() {
|
||||
char* netPluginName = nullptr;
|
||||
const char* defaultNetPlugin = "libnccl-net.so";
|
||||
const char* envNetPlugin = nullptr;
|
||||
char* envNetPluginList = nullptr;
|
||||
char* savePtr = nullptr;
|
||||
int pluginCounter = 0;
|
||||
|
||||
memset(netPluginLibs, 0, NCCL_NET_MAX_PLUGINS * sizeof(netPluginLib_t));
|
||||
envNetPlugin = ncclGetEnv("NCCL_NET_PLUGIN");
|
||||
if (envNetPlugin) {
|
||||
envNetPluginList = strdup(envNetPlugin);
|
||||
// Iterate over list until the list is empty
|
||||
netPluginName = strtok_r(envNetPluginList, ",", &savePtr);
|
||||
while(netPluginName) {
|
||||
// We have 2 internal plugins (ib and socket)
|
||||
// So, we can have at most( NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS)) in the NCCL_NET_PLUGIN list
|
||||
if (pluginCounter >= (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS))) {
|
||||
INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains more than %d plugins, ignoring the rest", (NCCL_NET_MAX_PLUGINS - (NCCL_NET_NUM_INTERNAL_PLUGINS + 1)));
|
||||
break;
|
||||
}
|
||||
// need to leave space for the name + "\n"
|
||||
if((strlen(netPluginName)+1) <= MAX_STR_LEN) {
|
||||
netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady;
|
||||
netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount();
|
||||
strcpy(netPluginLibs[pluginCounter].name, netPluginName);
|
||||
pluginCounter++;
|
||||
} else {
|
||||
INFO(NCCL_NET|NCCL_INIT,"NCCL_NET_PLUGIN list contains a plugin name %s longer than %d characters, ignoring it.", netPluginName, MAX_STR_LEN);
|
||||
}
|
||||
netPluginName = strtok_r(nullptr, ",", &savePtr);
|
||||
}
|
||||
if (envNetPluginList) free(envNetPluginList);
|
||||
} else {
|
||||
// Add default net plugin
|
||||
netPluginLibs[pluginCounter].ncclNetPluginState = ncclNetPluginStateLoadReady;
|
||||
netPluginLibs[pluginCounter].ncclNetPluginRefCount = ncclParamNetPluginRefCount();
|
||||
strcpy(netPluginLibs[pluginCounter++].name, defaultNetPlugin);
|
||||
}
|
||||
*state = ncclCollNetStates[i];
|
||||
pthread_mutex_unlock(&netLock);
|
||||
return ncclSuccess;
|
||||
|
||||
// Add 2 internal ib and socket plugins
|
||||
netPluginLibs[pluginCounter].ncclNet = &ncclNetIb;
|
||||
netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady;
|
||||
netPluginLibs[pluginCounter].ncclNet = &ncclNetSocket;
|
||||
netPluginLibs[pluginCounter++].ncclNetPluginState = ncclNetPluginStateInitReady;
|
||||
pluginCount = pluginCounter;
|
||||
}
|
||||
|
||||
ncclResult_t ncclNetInit(struct ncclComm* comm) {
|
||||
// Initialize main communication network
|
||||
const char* netName;
|
||||
bool ok = false;
|
||||
|
||||
netName = comm->config.netName;
|
||||
for (int i=0; i<3; i++) {
|
||||
if (ncclNets[i] == nullptr) continue;
|
||||
enum ncclNetState state;
|
||||
NCCLCHECK(netGetState(i, &state));
|
||||
if (state != ncclNetStateEnabled) continue;
|
||||
if (netName && strcasecmp(netName, ncclNets[i]->name) != 0) continue;
|
||||
if (ncclSuccess != ncclNetCheckDeviceVersion(comm, ncclNets[i], 0)) {
|
||||
// Mismatched device plugin version
|
||||
continue;
|
||||
bool ncclNetPluginInitialized = false;
|
||||
pthread_once(&initPluginLibsOnceControl, initPluginLibsOnceFunc);
|
||||
pthread_mutex_lock(&netPluginLock);
|
||||
for (int pluginIndex = 0; pluginIndex < pluginCount; pluginIndex++) {
|
||||
if ((pluginIndex < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS)) && (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateLoadReady)) {
|
||||
NCCLCHECK(ncclNetPluginLoad(&netPluginLibs[pluginIndex]));
|
||||
}
|
||||
|
||||
comm->ncclNet = ncclNets[i];
|
||||
comm->ncclNetVer = ncclNetsVer[i];
|
||||
ok = true;
|
||||
|
||||
if (ncclCollNets[i]) {
|
||||
NCCLCHECK(collNetGetState(i, &state));
|
||||
if (state == ncclNetStateEnabled) {
|
||||
comm->ncclCollNet = ncclCollNets[i];
|
||||
if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateInitReady) {
|
||||
NCCLCHECK(ncclNetPluginInit(&netPluginLibs[pluginIndex]));
|
||||
}
|
||||
if (netPluginLibs[pluginIndex].ncclNetPluginState == ncclNetPluginStateEnabled) {
|
||||
bool isAssigned = false;
|
||||
NCCLCHECK(ncclNetPluginAssignToComm(comm, pluginIndex, &isAssigned));
|
||||
if (isAssigned) {
|
||||
// If one external plugin is assigned to a comm, then disable all other external plugins
|
||||
ncclNetPluginDisableOtherExternal(pluginIndex);
|
||||
ncclNetPluginInitialized = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (!ok) {
|
||||
WARN("Error: network %s not found.", netName ? netName : "");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
return ncclSuccess;
|
||||
pthread_mutex_unlock(&netPluginLock);
|
||||
if (ncclNetPluginInitialized) return ncclSuccess;
|
||||
WARN("Failed to initialize any NET plugin");
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
|
||||
// Detach the network plugin from a communicator and release its reference.
//
// comm: communicator being torn down; its ncclNet / ncclCollNet pointers are
//       cleared and comm->netPluginIndex selects the plugin entry whose
//       reference count is decremented.
//
// After the decrement, every external plugin entry is passed to
// ncclNetPluginUnload(), which closes libraries whose reference count is 0.
//
// Returns ncclSuccess, or the first error reported by ncclNetPluginUnload().
// netPluginLock is always released before returning.
ncclResult_t ncclNetFinalize(struct ncclComm* comm) {
  ncclResult_t ret = ncclSuccess;
  comm->ncclNet = nullptr;
  comm->ncclCollNet = nullptr;
  int pluginIndex = comm->netPluginIndex;

  pthread_mutex_lock(&netPluginLock);
  netPluginLibs[pluginIndex].ncclNetPluginRefCount--;
  for (int i = 0; i < (pluginCount - NCCL_NET_NUM_INTERNAL_PLUGINS); i++) {
    // Do not use NCCLCHECK here: it returns from the function on error and
    // would leave netPluginLock held, blocking every later init/finalize.
    ret = ncclNetPluginUnload(&netPluginLibs[i]);
    if (ret != ncclSuccess) break;
  }
  pthread_mutex_unlock(&netPluginLock);
  return ret;
}
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ enum ncclPluginType {
|
||||
static void *libHandles[NUM_LIBS];
|
||||
static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" };
|
||||
static const char *pluginPrefix[NUM_LIBS] = { "librccl-net", "librccl-tuner", "librccl-profiler" };
|
||||
static const char *pluginFallback[NUM_LIBS] = { "Using internal net plugin.", "Using internal tuner plugin.", "" };
|
||||
static const char *pluginFallback[NUM_LIBS] = { "", "Using internal tuner plugin.", "" };
|
||||
static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT };
|
||||
|
||||
static void* tryOpenLib(char* name, int* err, char* errStr) {
|
||||
@@ -49,10 +49,9 @@ static void* tryOpenLib(char* name, int* err, char* errStr) {
|
||||
return handle;
|
||||
}
|
||||
|
||||
static void appendNameToList(char* nameList, int *nameListLen, char* name) {
|
||||
snprintf(nameList, *nameListLen, " %s", name);
|
||||
nameList += strlen(name) + 1;
|
||||
*nameListLen -= strlen(name) + 1;
|
||||
// Append " <name>" to a PATH_MAX-sized, NUL-terminated name list.
//
// nameList:  start of the list buffer, PATH_MAX bytes long.
// leftChars: in/out count of bytes still free in nameList, including the byte
//            holding the terminating NUL; PATH_MAX for an empty list.
// name:      NUL-terminated string to append.
//
// When the entry does not fit it is truncated, the list stays NUL-terminated
// and *leftChars drops to 1. Further calls are then no-ops. *leftChars never
// becomes zero or negative, so the write offset and size passed to snprintf
// stay inside the buffer.
static void appendNameToList(char* nameList, int *leftChars, char* name) {
  if (!nameList || !leftChars || !name) return;
  // Nothing but the terminator is left (or the counter is corrupt): writing
  // would either be pointless or land outside the buffer.
  if (*leftChars <= 1 || *leftChars > PATH_MAX) return;

  char* tail = nameList + (PATH_MAX - *leftChars);
  int written = snprintf(tail, (size_t)*leftChars, " %s", name);
  if (written < 0) return;
  // snprintf reports the untruncated length; only leftChars-1 characters were
  // actually stored when the entry did not fit.
  if (written >= *leftChars) written = *leftChars - 1;
  *leftChars -= written;
}
|
||||
|
||||
static void* openPluginLib(enum ncclPluginType type, const char* libName) {
|
||||
@@ -62,28 +61,31 @@ static void* openPluginLib(enum ncclPluginType type, const char* libName) {
|
||||
char eNoEntNameList[PATH_MAX] = { 0 };
|
||||
|
||||
if (libName && strlen(libName)) {
|
||||
snprintf(libName_, MAX_STR_LEN, "%s", libName);
|
||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||
if (libHandles[type]) {
|
||||
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
|
||||
return libHandles[type];
|
||||
}
|
||||
if (openErr == ENOENT) {
|
||||
appendNameToList(eNoEntNameList, &len, libName_);
|
||||
// match names that start with 'lib' and end with '.so'
|
||||
if (strlen(libName) >= strlen("libX.so") && strncmp(libName, "lib", strlen("lib")) == 0 && strncmp(libName + strlen(libName) - strlen(".so"), ".so", strlen(".so")) == 0) {
|
||||
snprintf(libName_, MAX_STR_LEN, "%s", libName);
|
||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||
if (libHandles[type]) {
|
||||
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
|
||||
return libHandles[type];
|
||||
}
|
||||
if (openErr == ENOENT) {
|
||||
appendNameToList(eNoEntNameList, &len, libName_);
|
||||
} else {
|
||||
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
||||
}
|
||||
} else {
|
||||
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
||||
}
|
||||
|
||||
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
|
||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||
if (libHandles[type]) {
|
||||
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
|
||||
return libHandles[type];
|
||||
}
|
||||
if (openErr == ENOENT) {
|
||||
appendNameToList(eNoEntNameList, &len, libName_);
|
||||
} else {
|
||||
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
||||
snprintf(libName_, MAX_STR_LEN, "%s-%s.so", pluginPrefix[type], libName);
|
||||
libHandles[type] = tryOpenLib(libName_, &openErr, openErrStr);
|
||||
if (libHandles[type]) {
|
||||
INFO(subsys[type], "%s/Plugin: Plugin name set by env to %s", pluginNames[type], libName_);
|
||||
return libHandles[type];
|
||||
}
|
||||
if (openErr == ENOENT) {
|
||||
appendNameToList(eNoEntNameList, &len, libName_);
|
||||
} else {
|
||||
INFO(subsys[type], "%s/Plugin: %s", pluginNames[type], openErrStr);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
snprintf(libName_, MAX_STR_LEN, "%s.so", pluginPrefix[type]);
|
||||
@@ -123,12 +125,17 @@ void* ncclGetNetPluginLib(void) {
|
||||
}
|
||||
|
||||
ncclResult_t ncclClosePluginLib(void* handle) {
|
||||
bool found = false;
|
||||
for (int l=0; l<NUM_LIBS; l++) {
|
||||
if (libHandles[l] == handle) {
|
||||
libHandles[l] = nullptr;
|
||||
dlclose(handle);
|
||||
return ncclSuccess;
|
||||
if (!found) {
|
||||
if (handle) {
|
||||
dlclose(handle);
|
||||
}
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ncclInternalError;
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
extern ncclProfiler_t* getNcclProfiler_v1(void* lib);
|
||||
extern ncclProfiler_t* getNcclProfiler_v2(void* lib);
|
||||
extern ncclProfiler_t* getNcclProfiler_v3(void* lib);
|
||||
extern ncclProfiler_t* getNcclProfiler_v4(void* lib);
|
||||
|
||||
static pthread_mutex_t profilerLock = PTHREAD_MUTEX_INITIALIZER;
|
||||
static int profilerPluginRefCount;
|
||||
@@ -51,7 +52,10 @@ static ncclResult_t ncclProfilerPluginLoad(void) {
|
||||
goto fail;
|
||||
}
|
||||
|
||||
ncclProfiler = getNcclProfiler_v3(profilerPluginLib);
|
||||
ncclProfiler = getNcclProfiler_v4(profilerPluginLib);
|
||||
if (ncclProfiler == nullptr) {
|
||||
ncclProfiler = getNcclProfiler_v3(profilerPluginLib);
|
||||
}
|
||||
if (ncclProfiler == nullptr) {
|
||||
ncclProfiler = getNcclProfiler_v2(profilerPluginLib);
|
||||
}
|
||||
@@ -164,7 +168,7 @@ ncclResult_t ncclProfilerPluginInit(struct ncclComm* comm) {
|
||||
TIME_START_EVENT(init);
|
||||
ncclProfilerPluginLoad();
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask);
|
||||
int err = ncclProfiler->init(&comm->profilerContext, &ncclProfilerEventMask, comm->config.commName, comm->commHash, comm->nNodes, comm->nRanks, comm->rank, ncclDebugLog);
|
||||
if (err) {
|
||||
WARN("Profiler init failed with error (%d). Continue without profiler.", err);
|
||||
ncclProfiler = NULL;
|
||||
@@ -241,8 +245,6 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
|
||||
eDescr.type = ncclProfileColl;
|
||||
eDescr.parentObj = plan->groupEventHandle;
|
||||
eDescr.rank = plan->comm->rank;
|
||||
eDescr.coll.name = plan->comm->commName;
|
||||
eDescr.coll.commHash = plan->comm->commHash;
|
||||
eDescr.coll.seqNumber = plan->comm->seqNumber[ct->func];
|
||||
eDescr.coll.func = ncclFuncToString(ct->func);
|
||||
eDescr.coll.sendBuff = ct->sendbuff;
|
||||
@@ -250,7 +252,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
|
||||
eDescr.coll.count = ct->count;
|
||||
eDescr.coll.root = ct->root;
|
||||
eDescr.coll.datatype = ncclDatatypeToString(ct->datatype);
|
||||
eDescr.coll.nMaxChannels = ct->nMaxChannels;
|
||||
eDescr.coll.nChannels = ct->nChannels;
|
||||
eDescr.coll.nWarps = ct->nWarps;
|
||||
eDescr.coll.algo = ncclAlgoToString(ct->algorithm);
|
||||
eDescr.coll.proto = ncclProtoToString(ct->protocol);
|
||||
@@ -266,7 +268,7 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
|
||||
// gives the consistency.
|
||||
if (!plan->persistent || (__builtin_expect(ncclProfiler != NULL, 0) && plan->groupEventHandle &&
|
||||
(ct->eActivationMask & ncclProfileKernelCh)))
|
||||
plan->comm->seqNumber[ct->func]++;
|
||||
__atomic_fetch_add(&plan->comm->seqNumber[ct->func], 1, __ATOMIC_RELAXED);
|
||||
ct = ct->next;
|
||||
}
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
@@ -279,13 +281,12 @@ ncclResult_t ncclProfilerStartTaskEvents(struct ncclKernelPlan* plan) {
|
||||
eDescr.type = ncclProfileP2p;
|
||||
eDescr.parentObj = plan->groupEventHandle;
|
||||
eDescr.rank = plan->comm->rank;
|
||||
eDescr.p2p.name = plan->comm->commName;
|
||||
eDescr.p2p.commHash = plan->comm->commHash;
|
||||
eDescr.p2p.func = ncclFuncToString(pt->func);
|
||||
eDescr.p2p.buff = pt->buff;
|
||||
eDescr.p2p.count = pt->count;
|
||||
eDescr.p2p.datatype = ncclDatatypeToString(pt->datatype);
|
||||
eDescr.p2p.peer = pt->root;
|
||||
eDescr.p2p.nChannels = pt->nChannels;
|
||||
ncclProfiler->startEvent(plan->comm->profilerContext, &pt->eventHandle, &eDescr);
|
||||
}
|
||||
pt = pt->next;
|
||||
@@ -321,7 +322,7 @@ ncclResult_t ncclProfilerStopTaskEvents(struct ncclKernelPlan* plan) {
|
||||
// made of sliceSteps steps rather than one step. In the profiler we are still
|
||||
// interested in whole network transfers though, so we account for this when
|
||||
// computing the actual network step number.
|
||||
ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args) {
|
||||
ncclResult_t ncclProfilerStartProxyOpEvent(int s, struct ncclProxyArgs* args) {
|
||||
TIME_START_EVENT(proxyOpStart);
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
@@ -335,29 +336,7 @@ ncclResult_t ncclProfilerStartSendProxyOpEvent(int s, struct ncclProxyArgs* args
|
||||
eDescr.proxyOp.peer = sub->peer;
|
||||
eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
|
||||
eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
|
||||
eDescr.proxyOp.isSend = 1;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyOpStart);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProfilerStartRecvProxyOpEvent(int s, struct ncclProxyArgs* args) {
|
||||
TIME_START_EVENT(proxyOpStart);
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
if (sub->eActivationMask & (ncclProfileProxyOp | ncclProfileProxyStep | ncclProfileNetPlugin)) {
|
||||
ncclProfilerEventDescr_t eDescr = { 0 };
|
||||
eDescr.type = ncclProfileProxyOp;
|
||||
eDescr.parentObj = sub->taskEventHandle;
|
||||
eDescr.rank = sub->rank;
|
||||
eDescr.proxyOp.pid = sub->pid;
|
||||
eDescr.proxyOp.channelId = sub->channelId;
|
||||
eDescr.proxyOp.peer = sub->peer;
|
||||
eDescr.proxyOp.nSteps = DIVUP(sub->nsteps, args->sliceSteps);
|
||||
eDescr.proxyOp.chunkSize = args->chunkSize * args->sliceSteps;
|
||||
eDescr.proxyOp.isSend = 0;
|
||||
eDescr.proxyOp.isSend = args->progress == ncclTransports[TRANSPORT_NET]->send.proxyProgress ? 1 : 0;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->opEventHandle, &eDescr);
|
||||
}
|
||||
}
|
||||
@@ -387,7 +366,8 @@ ncclResult_t ncclProfilerStartSendProxyStepEvent(int s, struct ncclProxyArgs* ar
|
||||
eDescr.parentObj = sub->opEventHandle;
|
||||
eDescr.rank = sub->rank;
|
||||
eDescr.proxyStep.step = step_;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr);
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr);
|
||||
sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub;
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyStepStart);
|
||||
@@ -405,7 +385,8 @@ ncclResult_t ncclProfilerStartRecvProxyStepEvent(int s, struct ncclProxyArgs* ar
|
||||
eDescr.parentObj = sub->opEventHandle;
|
||||
eDescr.rank = sub->rank;
|
||||
eDescr.proxyStep.step = step_;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->stepEventHandles[step_%NCCL_STEPS], &eDescr);
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->pHandles[step_%NCCL_STEPS].stepEventHandle, &eDescr);
|
||||
sub->pHandles[step_%NCCL_STEPS].subArgPtr = sub;
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyStepStart);
|
||||
@@ -417,9 +398,9 @@ ncclResult_t ncclProfilerStopProxyStepEvent(int s, struct ncclProxyArgs* args, i
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
int step_ = DIVUP(stepId, args->sliceSteps);
|
||||
if (sub->stepEventHandles[step_%NCCL_STEPS]) {
|
||||
ncclProfiler->stopEvent(sub->stepEventHandles[step_%NCCL_STEPS]);
|
||||
sub->stepEventHandles[step_%NCCL_STEPS] = NULL;
|
||||
if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) {
|
||||
ncclProfiler->stopEvent(sub->pHandles[step_%NCCL_STEPS].stepEventHandle);
|
||||
sub->pHandles[step_%NCCL_STEPS].stepEventHandle = NULL;
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyStepStop);
|
||||
@@ -453,7 +434,7 @@ ncclResult_t ncclProfilerStopProxyCtrlEvent(void* eHandle) {
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) {
|
||||
ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t start) {
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (sub->eActivationMask & ncclProfileKernelCh) {
|
||||
@@ -461,29 +442,31 @@ ncclResult_t ncclProfilerStartKernelChEvent(struct ncclProxyArgs* args, int s) {
|
||||
eDescr.type = ncclProfileKernelCh;
|
||||
eDescr.parentObj = sub->taskEventHandle;
|
||||
eDescr.kernelCh.channelId = sub->channelId;
|
||||
eDescr.kernelCh.pTimer = start;
|
||||
ncclProfiler->startEvent(sub->profilerContext, &sub->kernelEventHandle, &eDescr);
|
||||
}
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s) {
|
||||
// Closes the kernel-channel profiler event attached to sub-operation `s` of `args`.
//
// When a profiler plugin is installed and the sub-operation holds a kernel event
// handle, the `stop` value is first reported to the plugin through a
// ncclProfilerKernelChStop state record (as kernelCh.pTimer) and the event is
// then stopped. The results of both plugin calls are ignored; the function
// always returns ncclSuccess.
ncclResult_t ncclProfilerStopKernelChEvent(struct ncclProxyArgs* args, int s, uint64_t stop) {
  // Branch hint: an installed profiler plugin is treated as the unlikely case.
  if (__builtin_expect(ncclProfiler == NULL, 1)) return ncclSuccess;

  struct ncclProxySubArgs* subArgs = &args->subs[s];
  // Nothing to close when no kernel event handle is set for this sub-operation.
  if (!subArgs->kernelEventHandle) return ncclSuccess;

  ncclProfilerEventStateArgs_t stateArgs = { };
  stateArgs.kernelCh.pTimer = stop;
  // Record the stop state before closing the event itself.
  ncclProfiler->recordEventState(subArgs->kernelEventHandle, ncclProfilerKernelChStop, &stateArgs);
  ncclProfiler->stopEvent(subArgs->kernelEventHandle);
  return ncclSuccess;
}
|
||||
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, int steps, size_t transSize, ncclProfilerEventState_t eState) {
|
||||
ncclResult_t ncclProfilerRecordProxyOpEventState(int s, struct ncclProxyArgs* args, ncclProfilerEventState_t eState) {
|
||||
TIME_START_EVENT(proxyOpRecord);
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
|
||||
ncclProfilerEventStateArgs_t a = { };
|
||||
a.proxyOp.steps = DIVUP(steps, args->sliceSteps);
|
||||
a.proxyOp.transSize = transSize;
|
||||
ncclProfiler->recordEventState(sub->opEventHandle, eState, &a);
|
||||
}
|
||||
TIME_STOP_EVENT(proxyOpRecord);
|
||||
@@ -495,8 +478,10 @@ ncclResult_t ncclProfilerRecordProxyStepEventState(int s, struct ncclProxyArgs*
|
||||
struct ncclProxySubArgs* sub = &args->subs[s];
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0) && sub->opEventHandle) {
|
||||
int step_ = DIVUP(stepId, args->sliceSteps);
|
||||
if (sub->stepEventHandles[step_%NCCL_STEPS]) {
|
||||
ncclProfiler->recordEventState(sub->stepEventHandles[step_%NCCL_STEPS], eState, 0);
|
||||
if (sub->pHandles[step_%NCCL_STEPS].stepEventHandle) {
|
||||
ncclProfilerEventStateArgs_t a = { };
|
||||
a.proxyStep.transSize = sub->transSize;
|
||||
ncclProfiler->recordEventState(sub->pHandles[step_%NCCL_STEPS].stepEventHandle, eState, &a);
|
||||
}
|
||||
}
|
||||
TIME_STOP_EVENT(proxyStepRecord);
|
||||
@@ -549,18 +534,28 @@ bool ncclProfilerPluginLoaded(void) {
|
||||
|
||||
ncclResult_t ncclProfilerCallback(void** eHandle, int type, void* pHandle, int64_t pluginId, void* extData) {
|
||||
if (__builtin_expect(ncclProfiler != NULL, 0)) {
|
||||
struct ncclProxySubArgs* sub = (struct ncclProxySubArgs*)pHandle;
|
||||
if (type == 0) { // start
|
||||
if (type == ncclProfilerNetEventStart) { // start
|
||||
struct ncclProxyEventHandle* p = (struct ncclProxyEventHandle*)pHandle;
|
||||
struct ncclProxySubArgs* sub = p->subArgPtr;
|
||||
if (sub->eActivationMask & ncclProfileNetPlugin) {
|
||||
ncclProfilerEventDescr_t eDescr = { 0 };
|
||||
eDescr.type = ncclProfileNetPlugin;
|
||||
eDescr.parentObj = sub->stepEventHandles[sub->profilerSteps%NCCL_STEPS];
|
||||
eDescr.parentObj = p->stepEventHandle;
|
||||
eDescr.rank = sub->rank;
|
||||
eDescr.netPlugin.id = pluginId;
|
||||
eDescr.netPlugin.data = extData;
|
||||
ncclProfiler->startEvent(sub->profilerContext, eHandle, &eDescr);
|
||||
}
|
||||
} else { // stop
|
||||
} else if (type == ncclProfilerNetEventStop) { // stop
|
||||
ncclProfiler->stopEvent(*eHandle);
|
||||
} else if (type == ncclProfilerNetEventUpdate) { // update
|
||||
ncclProfilerEventStateArgs_t args = { };
|
||||
args.netPlugin.data = extData;
|
||||
ncclProfiler->recordEventState(*eHandle, ncclProfilerNetPluginUpdate, &args);
|
||||
} else { // update and stop
|
||||
ncclProfilerEventStateArgs_t args = { };
|
||||
args.netPlugin.data = extData;
|
||||
ncclProfiler->recordEventState(*eHandle, ncclProfilerNetPluginUpdate, &args);
|
||||
ncclProfiler->stopEvent(*eHandle);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -53,6 +53,7 @@ static uint8_t ncclStringToDatatype(const char* dt) {
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
|
||||
*eHandle = NULL;
|
||||
ncclProfilerEventDescr_v1_t eDescr_v1 = { 0 };
|
||||
eDescr_v1.type = eDescr->type;
|
||||
eDescr_v1.parentObj = eDescr->parentObj;
|
||||
@@ -60,8 +61,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
switch(eDescr->type) {
|
||||
case ncclProfileGroup: break;
|
||||
case ncclProfileColl: {
|
||||
eDescr_v1.coll.name = eDescr->coll.name;
|
||||
eDescr_v1.coll.commHash = eDescr->coll.commHash;
|
||||
eDescr_v1.coll.name = nullptr; // removed in v4
|
||||
eDescr_v1.coll.commHash = 0; // removed in v4
|
||||
eDescr_v1.coll.seqNumber = eDescr->coll.seqNumber;
|
||||
eDescr_v1.coll.func = ncclStringToFunc(eDescr->coll.func);
|
||||
eDescr_v1.coll.sendBuff = eDescr->coll.sendBuff;
|
||||
@@ -71,14 +72,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
eDescr_v1.coll.datatype = ncclStringToDatatype(eDescr->coll.datatype);
|
||||
eDescr_v1.coll.op = 0; // removed in v2
|
||||
eDescr_v1.coll.trafficBytes = 0; // removed in v3
|
||||
eDescr_v1.coll.nMaxChannels = eDescr->coll.nMaxChannels;
|
||||
eDescr_v1.coll.nMaxChannels = eDescr->coll.nChannels;
|
||||
eDescr_v1.coll.nWarps = eDescr->coll.nWarps;
|
||||
eDescr_v1.coll.algo = ncclStringToAlgo(eDescr->coll.algo);
|
||||
eDescr_v1.coll.proto = ncclStringToProto(eDescr->coll.proto);
|
||||
} break;
|
||||
case ncclProfileP2p: {
|
||||
eDescr_v1.p2p.name = eDescr->p2p.name;
|
||||
eDescr_v1.p2p.commHash = eDescr->p2p.commHash;
|
||||
eDescr_v1.p2p.name = nullptr; // removed in v4
|
||||
eDescr_v1.p2p.commHash = 0; // removed in v4
|
||||
eDescr_v1.p2p.func = ncclStringToFunc(eDescr->p2p.func);
|
||||
eDescr_v1.p2p.buff = eDescr->p2p.buff;
|
||||
eDescr_v1.p2p.count = eDescr->p2p.count;
|
||||
@@ -97,21 +98,34 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
eDescr_v1.proxyStep.step = eDescr->proxyStep.step;
|
||||
} break;
|
||||
case ncclProfileProxyCtrl: break;
|
||||
case ncclProfileKernelCh:
|
||||
case ncclProfileNetPlugin: {
|
||||
*eHandle = NULL;
|
||||
return ncclSuccess;
|
||||
}
|
||||
default:;
|
||||
default: return ncclSuccess;
|
||||
}
|
||||
return ncclProfiler_v1->startEvent(context, eHandle, &eDescr_v1);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
|
||||
return ncclProfiler_v1->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v1_t*)eStateArgs);
|
||||
ncclProfilerEventStateArgs_v1_t args = { };
|
||||
switch (eState) {
|
||||
case ncclProfilerProxyCtrlIdle:
|
||||
case ncclProfilerProxyCtrlActive:
|
||||
case ncclProfilerProxyCtrlSleep:
|
||||
case ncclProfilerProxyCtrlWakeup:
|
||||
case ncclProfilerProxyCtrlAppend:
|
||||
case ncclProfilerProxyCtrlAppendEnd:
|
||||
args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps;
|
||||
break;
|
||||
case ncclProfilerProxyStepSendGPUWait:
|
||||
case ncclProfilerProxyStepSendWait:
|
||||
case ncclProfilerProxyStepRecvWait:
|
||||
case ncclProfilerProxyStepRecvFlushWait:
|
||||
case ncclProfilerProxyStepRecvGPUWait:
|
||||
break;
|
||||
default: return ncclSuccess;
|
||||
}
|
||||
return ncclProfiler_v1->recordEventState(eHandle, eState, &args);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
|
||||
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclProfiler_v1->init(context, eActivationMask));
|
||||
ncclProfiler.startEvent = ncclProfiler_startEvent;
|
||||
ncclProfiler.stopEvent = ncclProfiler_v1->stopEvent;
|
||||
|
||||
@@ -20,8 +20,8 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
switch(eDescr->type) {
|
||||
case ncclProfileGroup: break;
|
||||
case ncclProfileColl: {
|
||||
eDescr_v2.coll.name = eDescr->coll.name;
|
||||
eDescr_v2.coll.commHash = eDescr->coll.commHash;
|
||||
eDescr_v2.coll.name = nullptr; // removed in v4
|
||||
eDescr_v2.coll.commHash = 0; // removed in v4
|
||||
eDescr_v2.coll.seqNumber = eDescr->coll.seqNumber;
|
||||
eDescr_v2.coll.func = eDescr->coll.func;
|
||||
eDescr_v2.coll.sendBuff = eDescr->coll.sendBuff;
|
||||
@@ -30,14 +30,14 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
eDescr_v2.coll.root = eDescr->coll.root;
|
||||
eDescr_v2.coll.datatype = eDescr->coll.datatype;
|
||||
eDescr_v2.coll.trafficBytes = 0; // removed in v3
|
||||
eDescr_v2.coll.nMaxChannels = eDescr->coll.nMaxChannels;
|
||||
eDescr_v2.coll.nMaxChannels = eDescr->coll.nChannels;
|
||||
eDescr_v2.coll.nWarps = eDescr->coll.nWarps;
|
||||
eDescr_v2.coll.algo = eDescr->coll.algo;
|
||||
eDescr_v2.coll.proto = eDescr->coll.proto;
|
||||
} break;
|
||||
case ncclProfileP2p: {
|
||||
eDescr_v2.p2p.name = eDescr->p2p.name;
|
||||
eDescr_v2.p2p.commHash = eDescr->p2p.commHash;
|
||||
eDescr_v2.p2p.name = nullptr; // removed in v4
|
||||
eDescr_v2.p2p.commHash = 0; // removed in v4
|
||||
eDescr_v2.p2p.func = eDescr->p2p.func;
|
||||
eDescr_v2.p2p.buff = eDescr->p2p.buff;
|
||||
eDescr_v2.p2p.count = eDescr->p2p.count;
|
||||
@@ -62,10 +62,28 @@ static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclP
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
|
||||
return ncclProfiler_v2->recordEventState(eHandle, eState, (ncclProfilerEventStateArgs_v2_t *)eStateArgs);
|
||||
ncclProfilerEventStateArgs_v2_t args = { };
|
||||
switch (eState) {
|
||||
case ncclProfilerProxyCtrlIdle:
|
||||
case ncclProfilerProxyCtrlActive:
|
||||
case ncclProfilerProxyCtrlSleep:
|
||||
case ncclProfilerProxyCtrlWakeup:
|
||||
case ncclProfilerProxyCtrlAppend:
|
||||
case ncclProfilerProxyCtrlAppendEnd:
|
||||
args.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps;
|
||||
break;
|
||||
case ncclProfilerProxyStepSendGPUWait:
|
||||
case ncclProfilerProxyStepSendWait:
|
||||
case ncclProfilerProxyStepRecvWait:
|
||||
case ncclProfilerProxyStepRecvFlushWait:
|
||||
case ncclProfilerProxyStepRecvGPUWait:
|
||||
break;
|
||||
default: return ncclSuccess;
|
||||
}
|
||||
return ncclProfiler_v2->recordEventState(eHandle, eState, &args);
|
||||
}
|
||||
|
||||
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask) {
|
||||
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
|
||||
NCCLCHECK(ncclProfiler_v2->init(context, eActivationMask));
|
||||
ncclProfiler.startEvent = ncclProfiler_startEvent;
|
||||
ncclProfiler.stopEvent = ncclProfiler_v2->stopEvent;
|
||||
|
||||
@@ -6,14 +6,105 @@
|
||||
|
||||
#include "comm.h"
|
||||
#include "nccl_profiler.h"
|
||||
#include "checks.h"
|
||||
|
||||
static ncclProfiler_t ncclProfiler;
|
||||
static ncclProfiler_v3_t* ncclProfiler_v3;
|
||||
|
||||
// startEvent adapter for v3 profiler plugins.
//
// Translates the current event descriptor into a ncclProfilerEventDescr_v3_t
// and forwards it to the loaded v3 plugin.
//
//   context  opaque plugin context, passed through unchanged
//   eHandle  out: cleared to nullptr first, then handed to the plugin's startEvent
//   eDescr   current-version descriptor to translate
//
// Returns the plugin's startEvent result, or ncclSuccess without calling the
// plugin (leaving *eHandle == nullptr) for event types not listed below.
static ncclResult_t ncclProfiler_startEvent(void* context, void** eHandle, ncclProfilerEventDescr_t* eDescr) {
  *eHandle = nullptr;

  // Start from an all-zero v3 descriptor and fill in the common header.
  ncclProfilerEventDescr_v3_t legacy = { };
  legacy.type = eDescr->type;
  legacy.parentObj = eDescr->parentObj;
  legacy.rank = eDescr->rank;

  switch (eDescr->type) {
    case ncclProfileGroup:
    case ncclProfileProxyCtrl:
      // Only the common header is forwarded for these event types.
      break;

    case ncclProfileColl:
      legacy.coll.name = nullptr; // removed in v4
      legacy.coll.commHash = 0; // removed in v4
      legacy.coll.seqNumber = eDescr->coll.seqNumber;
      legacy.coll.func = eDescr->coll.func;
      legacy.coll.sendBuff = eDescr->coll.sendBuff;
      legacy.coll.recvBuff = eDescr->coll.recvBuff;
      legacy.coll.count = eDescr->coll.count;
      legacy.coll.root = eDescr->coll.root;
      legacy.coll.datatype = eDescr->coll.datatype;
      // The v3 field nMaxChannels is filled from the current nChannels field.
      legacy.coll.nMaxChannels = eDescr->coll.nChannels;
      legacy.coll.nWarps = eDescr->coll.nWarps;
      legacy.coll.algo = eDescr->coll.algo;
      legacy.coll.proto = eDescr->coll.proto;
      break;

    case ncclProfileP2p:
      legacy.p2p.name = nullptr; // removed in v4
      legacy.p2p.commHash = 0; // removed in v4
      legacy.p2p.func = eDescr->p2p.func;
      legacy.p2p.buff = eDescr->p2p.buff;
      legacy.p2p.count = eDescr->p2p.count;
      legacy.p2p.datatype = eDescr->p2p.datatype;
      legacy.p2p.peer = eDescr->p2p.peer;
      break;

    case ncclProfileProxyOp:
      legacy.proxyOp.pid = eDescr->proxyOp.pid;
      legacy.proxyOp.channelId = eDescr->proxyOp.channelId;
      legacy.proxyOp.peer = eDescr->proxyOp.peer;
      legacy.proxyOp.nSteps = eDescr->proxyOp.nSteps;
      legacy.proxyOp.chunkSize = eDescr->proxyOp.chunkSize;
      legacy.proxyOp.isSend = eDescr->proxyOp.isSend;
      break;

    case ncclProfileProxyStep:
      legacy.proxyStep.step = eDescr->proxyStep.step;
      break;

    case ncclProfileKernelCh:
      // Only the channel id is copied into the v3 descriptor.
      legacy.kernelCh.channelId = eDescr->kernelCh.channelId;
      break;

    case ncclProfileNetPlugin:
      legacy.netPlugin.id = eDescr->netPlugin.id;
      legacy.netPlugin.data = eDescr->netPlugin.data;
      break;

    default:
      // Event types not handled above are silently dropped.
      return ncclSuccess;
  }

  return ncclProfiler_v3->startEvent(context, eHandle, &legacy);
}
|
||||
|
||||
// recordEventState adapter for v3 profiler plugins.
//
// Builds a zero-initialised ncclProfilerEventStateArgs_v3_t for the given state
// and forwards the record to the loaded v3 plugin.
//
//   eHandle     event handle, passed through unchanged
//   eState      state being recorded
//   eStateArgs  current-version arguments; dereferenced only for the proxy
//               control states
//
// Returns the plugin's result, or ncclSuccess without calling the plugin for
// any state not listed in the switch.
static ncclResult_t ncclProfiler_recordEventState(void* eHandle, ncclProfilerEventState_t eState, ncclProfilerEventStateArgs_t* eStateArgs) {
  ncclProfilerEventStateArgs_v3_t legacyArgs = { };

  switch (eState) {
    // Proxy step wait states are forwarded with all-zero arguments.
    case ncclProfilerProxyStepSendGPUWait:
    case ncclProfilerProxyStepSendWait:
    case ncclProfilerProxyStepRecvWait:
    case ncclProfilerProxyStepRecvFlushWait:
    case ncclProfilerProxyStepRecvGPUWait:
      break;

    // Proxy control states carry the number of appended proxy ops.
    case ncclProfilerProxyCtrlIdle:
    case ncclProfilerProxyCtrlActive:
    case ncclProfilerProxyCtrlSleep:
    case ncclProfilerProxyCtrlWakeup:
    case ncclProfilerProxyCtrlAppend:
    case ncclProfilerProxyCtrlAppendEnd:
      legacyArgs.proxyCtrl.appendedProxyOps = eStateArgs->proxyCtrl.appendedProxyOps;
      break;

    default:
      // Any other state is not forwarded to the v3 plugin.
      return ncclSuccess;
  }

  return ncclProfiler_v3->recordEventState(eHandle, eState, &legacyArgs);
}
|
||||
|
||||
// init adapter for v3 profiler plugins.
//
// Calls the v3 plugin's init with only `context` and `eActivationMask`; the
// remaining parameters (commName, commHash, nNodes, nranks, rank, logfn) are
// accepted to match the current init signature but are not forwarded.
// Once the plugin init has been checked by NCCLCHECK, the remaining entries of
// the file-local ncclProfiler table are populated: startEvent and
// recordEventState point at this file's translating adapters, while stopEvent
// and finalize are taken directly from the v3 plugin.
//
// Returns ncclSuccess after the table is filled in.
// NOTE(review): NCCLCHECK is defined outside this view; presumably it returns
// the plugin's error code early on failure — confirm.
static ncclResult_t ncclProfiler_init(void** context, int* eActivationMask, const char* commName, uint64_t commHash, int nNodes, int nranks, int rank, ncclDebugLogger_t logfn) {
  NCCLCHECK(ncclProfiler_v3->init(context, eActivationMask));

  // Entry points whose arguments need translating go through the local adapters.
  ncclProfiler.startEvent = ncclProfiler_startEvent;
  ncclProfiler.recordEventState = ncclProfiler_recordEventState;

  // Entry points used as-is from the v3 plugin.
  ncclProfiler.stopEvent = ncclProfiler_v3->stopEvent;
  ncclProfiler.finalize = ncclProfiler_v3->finalize;

  return ncclSuccess;
}
|
||||
|
||||
ncclProfiler_t* getNcclProfiler_v3(void* lib) {
|
||||
ncclProfiler_v3 = (ncclProfiler_v3_t*)dlsym(lib, "ncclProfiler_v3");
|
||||
if (ncclProfiler_v3) {
|
||||
ncclProfiler.name = ncclProfiler_v3->name;
|
||||
ncclProfiler.init = ncclProfiler_init;
|
||||
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v3->name);
|
||||
return ncclProfiler_v3;
|
||||
return &ncclProfiler;
|
||||
}
|
||||
INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v3");
|
||||
return NULL;
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#include "comm.h"
|
||||
#include "nccl_profiler.h"
|
||||
#include "checks.h"
|
||||
|
||||
static ncclProfiler_v4_t* ncclProfiler_v4;
|
||||
|
||||
// Looks up the "ncclProfiler_v4" symbol in the plugin library `lib`.
//
// The dlsym result is cached in the file-local ncclProfiler_v4 pointer. On
// success the plugin name is logged and the symbol is returned directly, with
// no adapter table in between; otherwise the failure is logged and NULL is
// returned.
ncclProfiler_t* getNcclProfiler_v4(void* lib) {
  ncclProfiler_v4 = (ncclProfiler_v4_t*)dlsym(lib, "ncclProfiler_v4");

  // Symbol missing: report it and let the caller handle the NULL.
  if (ncclProfiler_v4 == NULL) {
    INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: failed to find ncclProfiler_v4");
    return NULL;
  }

  INFO(NCCL_INIT|NCCL_ENV, "PROFILER/Plugin: loaded %s", ncclProfiler_v4->name);
  return ncclProfiler_v4;
}
|
||||
Reference in New Issue
Block a user