[Profiler plugin] Fix segfault issue with profiler plugin (#1973)
* Fix profiler plugin segfault by correctly setting p2p->func * Look for librccl-profiler.so instead of libnccl-profiler.so Signed-off-by: rahulvaidya20 <ravaidya@amd.com> --------- Signed-off-by: rahulvaidya20 <ravaidya@amd.com> Co-authored-by: Yongjie Qiu <Yongjie.Qiu@amd.com>
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
154350baaf
Коммит
624f68b2b2
@@ -9,7 +9,7 @@ interface. Any NCCL user can write profiler plugins to extract performance data
|
||||
use it for debugging and analysis.
|
||||
|
||||
Similarly to other plugins (e.g., network plugin), the profiler plugins come as a shared library
|
||||
called `libnccl-profiler.so`. That shared library contains one or more implementations of the
|
||||
called `librccl-profiler.so`. That shared library contains one or more implementations of the
|
||||
NCCL PROFILER API, in the form of versioned structs, filled with pointers to all required
|
||||
functions.
|
||||
|
||||
@@ -17,15 +17,15 @@ functions.
|
||||
|
||||
## Plugin name and supporting multiple profiler plugins
|
||||
|
||||
When NCCL is initialized, it will look for a `libnccl-profiler.so` library and dynamically load
|
||||
When NCCL is initialized, it will look for a `librccl-profiler.so` library and dynamically load
|
||||
it, then look for symbols inside the library.
|
||||
|
||||
The `NCCL_PROFILER_PLUGIN` environment variable allows multiple plugins to coexist. If set, NCCL
|
||||
will look for a library with a name of `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. It is therefore
|
||||
advised to name the library following that pattern, with a symlink pointing `libnccl-profiler.so`
|
||||
to `libnccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the
|
||||
will look for a library with a name of `librccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. It is therefore
|
||||
advised to name the library following that pattern, with a symlink pointing `librccl-profiler.so`
|
||||
to `librccl-profiler-${NCCL_PROFILER_PLUGIN}.so`. That way, if there are multiple plugins in the
|
||||
path, setting `NCCL_PROFILER_PLUGIN` will allow users to select the right plugin. Alternatively,
|
||||
the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `libnccl-profiler.so` library.
|
||||
the user can also set `NCCL_PROFILER_PLUGIN` to the pathname of the `librccl-profiler.so` library.
|
||||
|
||||
## Struct versioning
|
||||
|
||||
|
||||
@@ -11,12 +11,12 @@ NCCLDIR := $(BUILDDIR)
|
||||
|
||||
SRC_FILES := $(wildcard *.c)
|
||||
|
||||
build: ${BUILDDIR}/libnccl-profiler-example.so
|
||||
build: ${BUILDDIR}/librccl-profiler.so
|
||||
|
||||
${BUILDDIR}/libnccl-profiler-example.so: ${SRC_FILES}
|
||||
${BUILDDIR}/librccl-profiler.so: ${SRC_FILES}
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${BUILDDIR}
|
||||
$(CC) -Inccl -fPIC -shared -o $@ $^
|
||||
|
||||
clean:
|
||||
rm -f ${BUILDDIR}/libnccl-profiler-example.so
|
||||
rm -f ${BUILDDIR}/librccl-profiler.so
|
||||
|
||||
@@ -2574,6 +2574,7 @@ static ncclResult_t taskAppend(struct ncclComm* comm, struct ncclInfo* info) {
|
||||
// Must be in thread local group before tasks can be alloc'd in `comm->memScoped`.
|
||||
ncclGroupCommJoin(info->comm, ncclGroupTaskTypeCollective);
|
||||
struct ncclTaskP2p* p2p = ncclMemoryPoolAlloc<struct ncclTaskP2p>(&comm->memPool_ncclTaskP2p, &comm->memPermanent);
|
||||
p2p->func = info->coll;
|
||||
p2p->buff = (void*)info->recvbuff;
|
||||
p2p->count = info->count;
|
||||
p2p->datatype = info->datatype;
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
static char* libNames[NUM_LIBS];
|
||||
static void *libHandles[NUM_LIBS];
|
||||
static const char *pluginNames[NUM_LIBS] = { "NET", "TUNER", "PROFILER" };
|
||||
static const char *pluginPrefix[NUM_LIBS] = { "librccl-net", "libnccl-tuner", "libnccl-profiler" };
|
||||
static const char *pluginPrefix[NUM_LIBS] = { "librccl-net", "libnccl-tuner", "librccl-profiler" };
|
||||
static const char *pluginFallback[NUM_LIBS] = { "", "Using internal tuner plugin.", "" };
|
||||
static unsigned long subsys[NUM_LIBS] = { NCCL_INIT|NCCL_NET, NCCL_INIT|NCCL_TUNING, NCCL_INIT };
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user