diff --git a/projects/rccl/ext-net/dummy/Makefile b/projects/rccl/ext-net/dummy/Makefile new file mode 100644 index 0000000000..d1eb4c5a62 --- /dev/null +++ b/projects/rccl/ext-net/dummy/Makefile @@ -0,0 +1,17 @@ +# +# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# +NCCL_HOME:=../../build/ +CUDA_HOME:=/usr/local/cuda +INC:= -I$(NCCL_HOME)/include -I$(CUDA_HOME)/include +PLUGIN_SO:=libnccl-net.so + +default: $(PLUGIN_SO) + +$(PLUGIN_SO): plugin.c + $(CC) $(INC) -fPIC -shared -o $@ -Wl,-soname,$(PLUGIN_SO) $^ + +clean: + rm -f $(PLUGIN_SO) diff --git a/projects/rccl/ext-net/dummy/plugin.c b/projects/rccl/ext-net/dummy/plugin.c new file mode 100644 index 0000000000..b11cf76a61 --- /dev/null +++ b/projects/rccl/ext-net/dummy/plugin.c @@ -0,0 +1,44 @@ +/************************************************************************* + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include +#include +#include + +#define __hidden __attribute__ ((visibility("hidden"))) + +__hidden ncclResult_t pluginInit() { return ncclSuccess; } +__hidden ncclResult_t pluginDevices(int* ndev) { *ndev = 0; return ncclSuccess; } +__hidden ncclResult_t pluginPciPath(int dev, char** path) { return ncclInternalError; } +__hidden ncclResult_t pluginPtrSupport(int dev, int* supportedTypes) { return ncclInternalError; } +__hidden ncclResult_t pluginListen(int dev, void* handle, void** listenComm) { return ncclInternalError; } +__hidden ncclResult_t pluginConnect(int dev, void* handle, void** sendComm) { return ncclInternalError; } +__hidden ncclResult_t pluginAccept(void* listenComm, void** recvComm) { return ncclInternalError; } +__hidden ncclResult_t pluginIsend(void* sendComm, void* data, int size, int type, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginIrecv(void* recvComm, void* data, int size, int type, void** request) { return ncclInternalError; } +__hidden ncclResult_t pluginFlush(void* recvComm, void* data, int size) { return ncclInternalError; } +__hidden ncclResult_t pluginTest(void* request, int* done, int* size) { return ncclInternalError; } +__hidden ncclResult_t pluginCloseSend(void* sendComm) { return ncclInternalError; } +__hidden ncclResult_t pluginCloseRecv(void* recvComm) { return ncclInternalError; } +__hidden ncclResult_t pluginCloseListen(void* listenComm) { return ncclInternalError; } + +ncclNet_t NCCL_PLUGIN_SYMBOL = { + "Dummy", + pluginInit, + pluginDevices, + pluginPciPath, + pluginPtrSupport, + pluginListen, + pluginConnect, + pluginAccept, + pluginIsend, + pluginIrecv, + pluginFlush, + pluginTest, + pluginCloseSend, + pluginCloseRecv, + pluginCloseListen +}; diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile index 5d304441aa..481000ad16 100644 --- a/projects/rccl/src/Makefile +++ b/projects/rccl/src/Makefile @@ -8,7 +8,7 @@ include ../makefiles/common.mk include ../makefiles/version.mk ##### src files -INCEXPORTS := nccl.h +INCEXPORTS := nccl.h nccl_net.h LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \ misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \ transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \ @@ -80,6 +80,11 @@ $(INCDIR)/%.h : %.h mkdir -p $(INCDIR) cp -f $< $@ +$(INCDIR)/nccl_%.h : include/nccl_%.h + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(INCDIR) + cp -f $< $@ + $(OBJDIR)/%.o : %.cu @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` diff --git a/projects/rccl/src/bootstrap.cu b/projects/rccl/src/bootstrap.cu index 8593726b67..17aa1f45ef 100644 --- a/projects/rccl/src/bootstrap.cu +++ b/projects/rccl/src/bootstrap.cu @@ -217,7 +217,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { int rank = state->rank; int nranks = state->nranks; - TRACE(INIT, "rank %d nranks %d size %d", rank, nranks, size); + TRACE(NCCL_INIT, "rank %d nranks %d size %d", rank, nranks, size); /* Simple ring based AllGather * At each step i receive data from (rank-i-1) from left @@ -233,7 +233,7 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size)); } - TRACE(INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); + TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); return ncclSuccess; } diff --git a/projects/rccl/src/collectives/all_gather.cu b/projects/rccl/src/collectives/all_gather.cu index e19feff06b..8dec28e63b 100644 --- a/projects/rccl/src/collectives/all_gather.cu +++ b/projects/rccl/src/collectives/all_gather.cu @@ -12,7 +12,7 @@ ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { size_t nbytes = count*ncclTypeSize(datatype); - INFO(COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); + INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); if (comm->nRanks == 1) { if (sendbuff != recvbuff) CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); diff --git a/projects/rccl/src/collectives/all_reduce.cu b/projects/rccl/src/collectives/all_reduce.cu index 77ae4c8483..cc14083ab7 100644 --- a/projects/rccl/src/collectives/all_reduce.cu +++ b/projects/rccl/src/collectives/all_reduce.cu @@ -12,7 +12,7 @@ ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { size_t nbytes = count*ncclTypeSize(datatype); - INFO(COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); + INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); if (comm->nRanks == 1) { if (sendbuff != recvbuff) CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); diff --git a/projects/rccl/src/collectives/broadcast.cu b/projects/rccl/src/collectives/broadcast.cu index 0e5ec7b1cf..91ce905440 100644 --- a/projects/rccl/src/collectives/broadcast.cu +++ b/projects/rccl/src/collectives/broadcast.cu @@ -12,7 +12,7 @@ ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { size_t nbytes = count*ncclTypeSize(datatype); - INFO(COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); + INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); if (comm->nRanks == 1) { if (sendbuff != recvbuff) CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); diff --git a/projects/rccl/src/collectives/reduce.cu b/projects/rccl/src/collectives/reduce.cu index 76d4a194f7..d8fde80baa 100644 --- a/projects/rccl/src/collectives/reduce.cu +++ b/projects/rccl/src/collectives/reduce.cu @@ -12,7 +12,7 @@ ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { size_t nbytes = count*ncclTypeSize(datatype); - INFO(COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); + INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); if (comm->nRanks == 1) { if (sendbuff != recvbuff) CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); diff --git a/projects/rccl/src/collectives/reduce_scatter.cu b/projects/rccl/src/collectives/reduce_scatter.cu index af9d78b289..1447d4a91b 100644 --- a/projects/rccl/src/collectives/reduce_scatter.cu +++ b/projects/rccl/src/collectives/reduce_scatter.cu @@ -12,7 +12,7 @@ ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { size_t nbytes = count*ncclTypeSize(datatype); - INFO(COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); + INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); if (comm->nRanks == 1) { if (sendbuff != recvbuff) CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); diff --git a/projects/rccl/src/include/core.h b/projects/rccl/src/include/core.h index 2dd63d60e3..eff2172968 100644 --- a/projects/rccl/src/include/core.h +++ b/projects/rccl/src/include/core.h @@ -271,7 +271,7 @@ struct ncclComm { while (ret == -1) { \ SYSCHECKVAL(call, name, ret); \ if (ret == -1) { \ - INFO(ALL,"Got %s, retrying", strerror(errno)); \ + INFO(NCCL_ALL,"Got %s, retrying", strerror(errno)); \ }\ } \ } while (0); @@ -313,7 +313,7 @@ struct ncclComm { ncclResult_t res = call; \ if (res != ncclSuccess) { \ /* Print the back trace*/ \ - INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ return res; \ } \ } while (0); @@ -322,7 +322,7 @@ struct ncclComm { res = call; \ if (res != ncclSuccess) { \ /* Print the back trace*/ \ - INFO(ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ goto label; \ } \ } while (0); diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h index 83ae98b978..55dee1838c 100644 --- a/projects/rccl/src/include/debug.h +++ b/projects/rccl/src/include/debug.h @@ -16,65 +16,24 @@ #include #include #include "nccl.h" +#include "nccl_net.h" + #define gettid() (pid_t) syscall(SYS_gettid) -typedef enum {NONE=0, VERSION=1, WARN=2, INFO=3, ABORT=4, TRACE=5} DebugLevel; -typedef enum {INIT=1, COLL=2, P2P=4, SHM=8, NET=16, ALL=~0} SubSys; -extern DebugLevel ncclDebugLevel; +extern int ncclDebugLevel; extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; extern ncclResult_t getHostName(char* hostname, int maxlen); -#define WARN(...) do { \ - if (ncclDebugLevel >= WARN) { \ - char hostname[1024]; \ - getHostName(hostname, 1024); \ - int cudaDev; \ - cudaGetDevice(&cudaDev); \ - pthread_mutex_lock(&ncclDebugOutputLock); \ - fprintf(ncclDebugFile,"\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); \ - fprintf(ncclDebugFile,__VA_ARGS__); \ - fprintf(ncclDebugFile,"\n"); \ - fflush(ncclDebugFile); \ - pthread_mutex_unlock(&ncclDebugOutputLock); \ - if (ncclDebugLevel == ABORT) { fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", hostname, getpid(), gettid(), cudaDev, __FILE__, __LINE__); abort(); } \ - } \ -} while(0) +extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); -#define INFO(FLAGS, ...) do { \ - if (ncclDebugLevel >= INFO && ((FLAGS) & ncclDebugMask)) { \ - char hostname[1024]; \ - getHostName(hostname, 1024); \ - int cudaDev; \ - cudaGetDevice(&cudaDev); \ - pthread_mutex_lock(&ncclDebugOutputLock); \ - fprintf(ncclDebugFile,"%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); \ - fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \ - fflush(ncclDebugFile); \ - pthread_mutex_unlock(&ncclDebugOutputLock); \ - } \ -} while(0) +#define WARN(...) ncclDebugLog(NCCL_LOG_WARN, NCCL_ALL, __FILE__, __LINE__, __VA_ARGS__) +#define INFO(FLAGS, ...) ncclDebugLog(NCCL_LOG_INFO, (FLAGS), __func__, __LINE__, __VA_ARGS__) #ifdef ENABLE_TRACE -#define TRACE(FLAGS, ...) do { \ - if (ncclDebugLevel == TRACE && ((FLAGS) & ncclDebugMask)) { \ - char hostname[1024]; \ - getHostName(hostname, 1024); \ - int cudaDev; \ - cudaGetDevice(&cudaDev); \ - pthread_mutex_lock(&ncclDebugOutputLock); \ - auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; \ - double timestamp = std::chrono::duration_cast>(delta).count()*1000; \ - fprintf(ncclDebugFile,"%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, __func__, __LINE__); \ - fprintf(ncclDebugFile,__VA_ARGS__);fprintf(ncclDebugFile,"\n"); \ - fflush(ncclDebugFile); \ - pthread_mutex_unlock(&ncclDebugOutputLock); \ - } \ -} while(0) - +#define TRACE(FLAGS, ...) ncclDebugLog(NCCL_LOG_TRACE, (FLAGS), __func__, __LINE__, __VA_ARGS__) extern std::chrono::high_resolution_clock::time_point ncclEpoch; - #else #define TRACE(...) #endif @@ -84,17 +43,17 @@ extern std::chrono::high_resolution_clock::time_point ncclEpoch; static inline void initDebug() { const char* nccl_debug = getenv("NCCL_DEBUG"); if (nccl_debug == NULL) { - ncclDebugLevel = NONE; + ncclDebugLevel = NCCL_LOG_NONE; } else if (strcasecmp(nccl_debug, "VERSION") == 0) { - ncclDebugLevel = VERSION; + ncclDebugLevel = NCCL_LOG_VERSION; } else if (strcasecmp(nccl_debug, "WARN") == 0) { - ncclDebugLevel = WARN; + ncclDebugLevel = NCCL_LOG_WARN; } else if (strcasecmp(nccl_debug, "INFO") == 0) { - ncclDebugLevel = INFO; + ncclDebugLevel = NCCL_LOG_INFO; } else if (strcasecmp(nccl_debug, "ABORT") == 0) { - ncclDebugLevel = ABORT; + ncclDebugLevel = NCCL_LOG_ABORT; } else if (strcasecmp(nccl_debug, "TRACE") == 0) { - ncclDebugLevel = TRACE; + ncclDebugLevel = NCCL_LOG_TRACE; } /* Parse the NCCL_DEBUG_SUBSYS env var @@ -109,17 +68,17 @@ static inline void initDebug() { uint64_t mask = 0; if (subsys[0] == '^') { invert = 1; subsys++; } if (strcasecmp(subsys, "INIT") == 0) { - mask = INIT; + mask = NCCL_INIT; } else if (strcasecmp(subsys, "COLL") == 0) { - mask = COLL; + mask = NCCL_COLL; } else if (strcasecmp(subsys, "P2P") == 0) { - mask = P2P; + mask = NCCL_P2P; } else if (strcasecmp(subsys, "SHM") == 0) { - mask = SHM; + mask = NCCL_SHM; } else if (strcasecmp(subsys, "NET") == 0) { - mask = NET; + mask = NCCL_NET; } else if (strcasecmp(subsys, "ALL") == 0) { - mask = ALL; + mask = NCCL_ALL; } if (mask) { if (invert) ncclDebugMask &= ~mask; else ncclDebugMask |= mask; @@ -133,7 +92,7 @@ static inline void initDebug() { * NCCL_DEBUG level is > VERSION */ const char* nccl_debug_file = getenv("NCCL_DEBUG_FILE"); - if (ncclDebugLevel > VERSION && nccl_debug_file != NULL) { + if (ncclDebugLevel > NCCL_LOG_VERSION && nccl_debug_file != NULL) { int c = 0; char debug_fn[PATH_MAX+1] = ""; char *dfn = debug_fn; @@ -164,7 +123,7 @@ static inline void initDebug() { if (debug_fn[0] != '\0') { FILE *file = fopen(debug_fn, "w"); if (file != NULL) { - INFO(ALL,"DEBUG file is '%s'", debug_fn); + INFO(NCCL_ALL,"DEBUG file is '%s'", debug_fn); ncclDebugFile = file; } } diff --git a/projects/rccl/src/include/nccl_net.h b/projects/rccl/src/include/nccl_net.h index 58c1335d94..7dbbc37893 100644 --- a/projects/rccl/src/include/nccl_net.h +++ b/projects/rccl/src/include/nccl_net.h @@ -9,25 +9,25 @@ #include "nccl.h" -#define NCCL_NET_MAJOR 1 -#define NCCL_NET_MINOR 0 - #define NCCL_NET_HANDLE_MAXSIZE 64 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 -#define NCCL_MAX_SCORE 0x7 +typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_ALL=~0} ncclDebugLogSubSys; + +typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); typedef struct { // Name of the network (mainly for logs) const char* name; - // Return the number of network devices along with their scores relative to the - // current CUDA device. The per device score should be a value from 1-7 with a - // higher score representing a better choice for performance. - // This call should allocate the 'scores' array using malloc(3), and it - // will then be freed automatically by NCCL. - ncclResult_t (*devices)(int* ndev, int** scores); + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Return the device path in /sys. NCCL will call free on this path. + ncclResult_t (*pciPath)(int dev, char** path); // Return whether this device supports host pointers and/or CUDA pointers // as data from the current GPU. Supported types should be composed with // NCCL_PTR_HOST and NCCL_PTR_CUDA. @@ -53,12 +53,10 @@ typedef struct { ncclResult_t (*closeSend)(void* sendComm); ncclResult_t (*closeRecv)(void* recvComm); ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_t; +} ncclNet_v1_t; -extern -#ifdef __cplusplus -"C" -#endif -ncclNet_t* ncclNet; +typedef ncclNet_v1_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1 #endif // end include guard diff --git a/projects/rccl/src/include/net.h b/projects/rccl/src/include/net.h index d841419a55..ebc967782c 100644 --- a/projects/rccl/src/include/net.h +++ b/projects/rccl/src/include/net.h @@ -10,6 +10,7 @@ #include "nccl.h" #include "nccl_net.h" +extern ncclNet_t* ncclNet; typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; /* Socket Interface Selection type */ @@ -19,7 +20,8 @@ typedef enum { findSubnetIf = -1, // Translation to external API static const char* ncclNetName() { return ncclNet->name; } -static ncclResult_t ncclNetDevices(int* ndev, int** scores) { NCCLCHECK(ncclNet->devices(ndev, scores)); return ncclSuccess; } +static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } +static ncclResult_t ncclNetPciPath(int dev, char** path) { NCCLCHECK(ncclNet->pciPath(dev, path)); return ncclSuccess; } static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK(ncclNet->ptrSupport(dev, supportedTypes)); return ncclSuccess; } static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } @@ -32,7 +34,6 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } -extern bool ncclIbSupport(); extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str); extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; diff --git a/projects/rccl/src/include/param.h b/projects/rccl/src/include/param.h index 44c1b9a55f..dd5f697e34 100644 --- a/projects/rccl/src/include/param.h +++ b/projects/rccl/src/include/param.h @@ -67,10 +67,10 @@ int64_t ncclParam##name() { \ errno = 0; \ int64_t v = strtoll(str, NULL, 0); \ if (errno) { \ - INFO(ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \ + INFO(NCCL_ALL,"Invalid value %s for %s, using default %lu.", str, "NCCL_" env, value); \ } else { \ value = v; \ - INFO(ALL,"%s set by environment to %lu.", "NCCL_" env, value); \ + INFO(NCCL_ALL,"%s set by environment to %lu.", "NCCL_" env, value); \ } \ } \ } \ diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h index 533cacc9d5..9d2b2c8fc4 100644 --- a/projects/rccl/src/include/socket.h +++ b/projects/rccl/src/include/socket.h @@ -76,7 +76,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre if (family != AF_INET && family != AF_INET6) continue; - TRACE(INIT|NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); + TRACE(NCCL_INIT|NCCL_NET,"Found interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); /* Allow the caller to force the socket family type */ if (sock_family != -1 && family != sock_family) @@ -106,7 +106,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre // Store the IP address int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6); memcpy(addrs+found, interface->ifa_addr, salen); - INFO(INIT|NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); + INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); found++; } } @@ -183,7 +183,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd // Store the interface name strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); - INFO(INIT|NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a)); + INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a)); found++; if (found == maxIfs) break; } @@ -333,7 +333,7 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) #ifdef ENABLE_TRACE char line[1024]; - TRACE(INIT|NET,"Listening on socket %s", socketToString(&localAddr->sa, line)); + TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line)); #endif /* Put the socket in listen mode */ @@ -363,7 +363,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) { #ifdef ENABLE_TRACE char line[1024]; - TRACE(INIT|NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line)); + TRACE(NCCL_INIT|NCCL_NET,"Connecting to socket %s", socketToString(&remoteAddr->sa, line)); #endif SYSCHECKNTIMES(connect(*fd, &remoteAddr->sa, salen), "connect", RETRY_TIMES, SLEEP_INT, ECONNREFUSED); @@ -381,7 +381,7 @@ static ncclResult_t socketReceive(int fd, void* ptr, int size) { return ncclSystemError; } if (recvsize == -1) { - INFO(NET,"Recv : got retcode %d, retrying", errno); + INFO(NCCL_NET,"Recv : got retcode %d, retrying", errno); continue; } data += recvsize; @@ -397,7 +397,7 @@ static ncclResult_t socketSend(int fd, void* ptr, int size) { int sendsize; SYSCHECKVAL(write(fd, data, size-offset), "write", sendsize); if (sendsize == -1) { - INFO(NET,"Send : got retcode %d, retrying", errno); + INFO(NCCL_NET,"Send : got retcode %d, retrying", errno); continue; } data += sendsize; diff --git a/projects/rccl/src/include/topo.h b/projects/rccl/src/include/topo.h index 24d7e9d28e..971ae68c2f 100644 --- a/projects/rccl/src/include/topo.h +++ b/projects/rccl/src/include/topo.h @@ -8,53 +8,26 @@ #define NCCL_TOPO_H_ #include "nccl.h" +#include +#include #include -#define MAXPATHSIZE 1024 - static ncclResult_t getCudaPath(int cudaDev, char** path) { char busId[16]; CUDACHECK(cudaDeviceGetPCIBusId(busId, 16, cudaDev)); for (int i=0; i<16; i++) busId[i] = tolower(busId[i]); - char busPath[] = "/sys/class/pci_bus/0000:00/device"; + char busPath[] = "/sys/class/pci_bus/0000:00/../../0000:00:00.0"; memcpy(busPath+sizeof("/sys/class/pci_bus/")-1, busId, sizeof("0000:00")-1); - char* cudaRpath = realpath(busPath, NULL); - char pathname[MAXPATHSIZE]; - strncpy(pathname, cudaRpath, MAXPATHSIZE); - strncpy(pathname+strlen(pathname), "/", MAXPATHSIZE-strlen(pathname)); - strncpy(pathname+strlen(pathname), busId, MAXPATHSIZE-strlen(pathname)); - free(cudaRpath); - *path = realpath(pathname, NULL); + memcpy(busPath+sizeof("/sys/class/pci_bus/0000:00/../../")-1, busId, sizeof("0000:00:00.0")-1); + *path = realpath(busPath, NULL); if (*path == NULL) { - WARN("Could not find real path of %s", pathname); + WARN("Could not find real path of %s", busPath); return ncclSystemError; } return ncclSuccess; } -static ncclResult_t getMlxPath(char* ibName, char** path) { - char devicepath[MAXPATHSIZE]; - snprintf(devicepath, MAXPATHSIZE, "/sys/class/infiniband/%s/device", ibName); - *path = realpath(devicepath, NULL); - if (*path == NULL) { - WARN("Could not find real path of %s", devicepath); - return ncclSystemError; - } - return ncclSuccess; -} - -static ncclResult_t getSockPath(char* ifName, char** path) { - char devicepath[MAXPATHSIZE]; - snprintf(devicepath, MAXPATHSIZE, "/sys/class/net/%s/device", ifName); - *path = realpath(devicepath, NULL); - if (*path == NULL) { - INFO(NET|INIT, "Could not find real path of %s", devicepath); - return ncclSystemError; - } - return ncclSuccess; -} - -enum ncclIbPathDist { +enum ncclPathDist { PATH_PIX = 0, PATH_PXB = 1, PATH_PHB = 2, @@ -74,7 +47,7 @@ static int pciDistance(char* path1, char* path2) { if (same == 1) score++; } } - if (score == 3) return PATH_SOC; + if (score <= 3) return PATH_SOC; if (score == 4) return PATH_PHB; if (score == depth-1) return PATH_PIX; return PATH_PXB; diff --git a/projects/rccl/src/init.cu b/projects/rccl/src/init.cu index ed33d1aa43..c9700a8b4f 100644 --- a/projects/rccl/src/init.cu +++ b/projects/rccl/src/init.cu @@ -28,9 +28,13 @@ #include #include #include +#include -DebugLevel ncclDebugLevel; -uint64_t ncclDebugMask = INIT; // Default debug sub-system mask is INIT +#define STR2(v) #v +#define STR(v) STR2(v) + +int ncclDebugLevel; +uint64_t ncclDebugMask = NCCL_INIT; // Default debug sub-system mask is INIT pthread_mutex_t ncclDebugOutputLock; FILE *ncclDebugFile = stdout; @@ -48,7 +52,6 @@ NCCL_PARAM(GroupCudaStream, "GROUP_CUDA_STREAM", NCCL_GROUP_CUDA_STREAM); NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); -extern "C" __attribute__ ((visibility("default"))) ncclNet_t* ncclNet = NULL; // We define this as weak to let tests redefine their own @@ -69,13 +72,53 @@ int ncclCudaFullCompCap() { return ccMajor*10+ccMinor; } -void initNet() { - if (ncclNet != NULL) { - INFO(INIT,"Using external Network %s", ncclNetName()); - } else { - ncclNet = ncclIbSupport() ? &ncclNetIb : &ncclNetSocket; - INFO(INIT,"Using internal Network %s", ncclNetName()); +ncclResult_t initNet(ncclNet_t* net) { + int ndev; + NCCLCHECK(net->init(ncclDebugLog)); + NCCLCHECK(net->devices(&ndev)); + if (ndev <= 0) { + INFO(NCCL_INIT, "Net/%s: call to devices() returned 0 devices.", net->name); + return ncclSystemError; } + return ncclSuccess; +} + +ncclResult_t initNetPlugin(ncclNet_t** net) { + void* netPluginLib = dlopen("libnccl-net.so", RTLD_NOW | RTLD_LOCAL); + if (netPluginLib == NULL) { + INFO(NCCL_INIT, "Unable to load libnccl-net.so : %s", dlerror()); + return ncclSuccess; + } + ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); + if (extNet == NULL) { + INFO(NCCL_INIT, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol"); + goto cleanup; + } + if (initNet(extNet) == ncclSuccess) { + *net = extNet; + return ncclSuccess; + } +cleanup: + if (netPluginLib != NULL) dlclose(netPluginLib); + return ncclSuccess; +} + +ncclResult_t initNet() { + // Always initialize sockets as we use it for bootstrap + NCCLCHECK(initNet(&ncclNetSocket)); + + NCCLCHECK(initNetPlugin(&ncclNet)); + if (ncclNet != NULL) { + INFO(NCCL_INIT, "Using external Network %s", ncclNetName()); + return ncclSuccess; + } + if (initNet(&ncclNetIb) == ncclSuccess) { + ncclNet = &ncclNetIb; + } else { + ncclNet = &ncclNetSocket; + } + INFO(NCCL_INIT,"Using internal Network %s", ncclNetName()); + return ncclSuccess; } NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2); @@ -171,7 +214,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { struct ncclComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); - INFO(INIT,"comm %p rank %d nranks %d", comm, rank, ndev); + INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev); comm->rank = rank; comm->nRanks = ndev; cudaGetDevice(&comm->cudaDev); @@ -204,16 +247,14 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { } // Pre-process the string so that running "strings" on the lib can quickly reveal the version. -#define STR2(v) #v -#define STR(v) STR2(v) #define VERSION_STRING "NCCL version " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX "+cuda" STR(CUDA_MAJOR) "." STR(CUDA_MINOR) static void showVersion() { static int shown = 0; - if (shown == 0 && ncclDebugLevel >= VERSION) { + if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) { printf("%s\n", VERSION_STRING); fflush(stdout); if (ncclDebugFile != stdout) - INFO(ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files + INFO(NCCL_ALL,"%s", VERSION_STRING); // Also log NCCL version in one of the files shown = 1; } } @@ -294,12 +335,12 @@ void dumpMatrix(int* connectMatrix, int nranks) { line[STRLENGTH] = '\0'; memset(line, ' ', STRLENGTH); for (int j=0; jnThreads = std::max(allData[i], comm->nThreads); - if (rank == 0) INFO(INIT,"Using %d threads", comm->nThreads); + if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads); // Determine the minimum CUDA Compute capability of all GPUs int myCompCap = ncclCudaCompCap(); @@ -486,7 +527,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int))); for (int i=0; irank == 0 && *comm->intraCGMode & 0x10) { *comm->intraCGMode ^= 0x10; - INFO(INIT,"Launch mode %s%s%s", + INFO(NCCL_INIT,"Launch mode %s%s%s", comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", *comm->intraCGMode ? "/CGMD" : "", (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); diff --git a/projects/rccl/src/misc/group.cu b/projects/rccl/src/misc/group.cu index c7b31cfeb9..1716a75643 100644 --- a/projects/rccl/src/misc/group.cu +++ b/projects/rccl/src/misc/group.cu @@ -58,7 +58,7 @@ ncclResult_t ncclSetDevice(int cudaDev) { #define CHECK(a) do { \ if ((args->ret = (a)) != ncclSuccess) { \ - INFO(INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ + INFO(NCCL_INIT,"%s:%d -> %d [Async thread]", __FILE__, __LINE__, args->ret); \ return args; \ } \ } while(0) diff --git a/projects/rccl/src/misc/nvmlwrap.cu b/projects/rccl/src/misc/nvmlwrap.cu index 52d4d41473..d9407f4686 100644 --- a/projects/rccl/src/misc/nvmlwrap.cu +++ b/projects/rccl/src/misc/nvmlwrap.cu @@ -61,7 +61,7 @@ ncclResult_t wrapNvmlSymbols(void) { cast = (void**)&funcptr; \ tmp = dlsym(handle, symbol); \ if (tmp == NULL) { \ - INFO(INIT,"dlsym failed on %s, ignoring", symbol); \ + INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \ } \ *cast = tmp; \ } while (0) @@ -208,7 +208,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link } nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive); if (ret != NVML_SUCCESS) { - INFO(INIT,"nvmlDeviceGetNvLinkState() failed: %s ", + INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ", nvmlInternalErrorString(ret)); return ncclSystemError; } @@ -223,7 +223,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkRemotePciInfo(device, link, pci); if (ret != NVML_SUCCESS) { if (ret != NVML_ERROR_NOT_SUPPORTED) - INFO(INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ", + INFO(NCCL_INIT,"nvmlDeviceGetNvLinkRemotePciInfo() failed: %s ", nvmlInternalErrorString(ret)); return ncclSystemError; } @@ -239,7 +239,7 @@ ncclResult_t wrapNvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkCapability(device, link, capability, capResult); if (ret != NVML_SUCCESS) { if (ret != NVML_ERROR_NOT_SUPPORTED) - INFO(INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ", + INFO(NCCL_INIT,"nvmlDeviceGetNvLinkCapability() failed: %s ", nvmlInternalErrorString(ret)); return ncclSystemError; } diff --git a/projects/rccl/src/misc/rings.cu b/projects/rccl/src/misc/rings.cu index 9ecda49cd9..a5d4616019 100644 --- a/projects/rccl/src/misc/rings.cu +++ b/projects/rccl/src/misc/rings.cu @@ -5,9 +5,10 @@ ************************************************************************/ #include "core.h" -#include "net.h" #include "param.h" +#define NCCL_MAX_SCORE 7 + /* Parse user defined rings. Format is like : * "0 1|1 0|0 1 2 3|3 2 1 0|0 2 3 1|1 3 2 0|0 1 2 3 4 5 6 7|7 6 5 4 3 2 1 0" * Rings with a non-matching number of ranks are ignored so we can provide @@ -188,11 +189,11 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* if (str && strlen(str)>0) { int ret = parseRings(str, nrings, nranks, prev, next); if (ret == ncclSuccess && *nrings > 0) { - if (rank == 0) INFO(INIT,"%d ring(s) set by environment", *nrings); + if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings); NCCLCHECK(getEnvThreads(nthreads)); return ncclSuccess; } - if (rank == 0) INFO(INIT,"No valid ring found in environment, ignoring"); + if (rank == 0) INFO(NCCL_INIT,"No valid ring found in environment, ignoring"); *nrings = 0; } @@ -333,13 +334,13 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* minNrings = MAXRINGS; } if (maxNrings > 0 && maxNrings <= *nrings) { - if (rank == 0) INFO(INIT,"Limiting to %d rings per user request.", maxNrings); + if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings); *nrings = maxNrings; } else { int defaultMinNrings = ncclCudaCompCap() == 3 ? 2 : 1; if (minNrings < defaultMinNrings) minNrings = defaultMinNrings; if (minNrings > 0 && minNrings > *nrings) { - if (rank == 0 && minNrings > defaultMinNrings) INFO(INIT,"Duplicating rings to %d per user request.", minNrings); + if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings); for (int r=*nrings; r #include +#include ncclResult_t getHostName(char* hostname, int maxlen) { if (gethostname(hostname, maxlen) != 0) { @@ -20,6 +22,53 @@ ncclResult_t getHostName(char* hostname, int maxlen) { return ncclSuccess; } +/* Common logging function used by the INFO, WARN and TRACE macros + * Also exported to the dynamically loadable Net transport modules so + * they can share the debugging mechanisms and output files + */ +void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) { + if (ncclDebugLevel <= NCCL_LOG_NONE) return; + + char hostname[1024]; + getHostName(hostname, 1024); + int cudaDev; + cudaGetDevice(&cudaDev); + + char buffer[1024]; + size_t len = 0; + pthread_mutex_lock(&ncclDebugOutputLock); + if (level == NCCL_LOG_WARN && ncclDebugLevel >= NCCL_LOG_WARN) + len = snprintf(buffer, sizeof(buffer), + "\n%s:%d:%d [%d] %s:%d NCCL WARN ", hostname, getpid(), gettid(), cudaDev, filefunc, line); + else if (level == NCCL_LOG_INFO && ncclDebugLevel >= NCCL_LOG_INFO && (flags & ncclDebugMask)) + len = snprintf(buffer, sizeof(buffer), + "%s:%d:%d [%d] NCCL INFO ", hostname, getpid(), gettid(), cudaDev); +#ifdef ENABLE_TRACE + else if (level == NCCL_LOG_TRACE && ncclDebugLevel >= NCCL_LOG_TRACE && (flags & ncclDebugMask)) { + auto delta = std::chrono::high_resolution_clock::now() - ncclEpoch; + double timestamp = std::chrono::duration_cast>(delta).count()*1000; + len = snprintf(buffer, sizeof(buffer), + "%s:%d:%d [%d] %f %s:%d NCCL TRACE ", hostname, getpid(), gettid(), cudaDev, timestamp, filefunc, line); + } +#endif + if (len) { + va_list vargs; + va_start(vargs, fmt); + (void) vsnprintf(buffer+len, sizeof(buffer)-len, fmt, vargs); + va_end(vargs); + fprintf(ncclDebugFile,"%s\n", buffer); + fflush(ncclDebugFile); + } + pthread_mutex_unlock(&ncclDebugOutputLock); + + // If ncclDebugLevel == NCCL_LOG_ABORT then WARN() will also call abort() + if (level == NCCL_LOG_WARN && ncclDebugLevel == NCCL_LOG_ABORT) { + fprintf(stderr,"\n%s:%d:%d [%d] %s:%d NCCL ABORT\n", + hostname, getpid(), gettid(), cudaDev, filefunc, line); + abort(); + } +} + uint64_t getHash(const char* string) { // Based on DJB2, result = result * 33 + char uint64_t result = 5381; @@ -51,7 +100,7 @@ uint64_t getHostHash(void) { offset += len; // Trailing '\0' uname[offset]='\0'; - TRACE(INIT,"unique hostname '%s'", uname); + TRACE(NCCL_INIT,"unique hostname '%s'", uname); return getHash(uname); } @@ -71,7 +120,7 @@ uint64_t getPidHash(void) { if (len < 0) len = 0; pname[plen+len]='\0'; - TRACE(INIT,"unique PID '%s'", pname); + TRACE(NCCL_INIT,"unique PID '%s'", pname); return getHash(pname); } diff --git a/projects/rccl/src/ring.cu b/projects/rccl/src/ring.cu index e9e9573703..fede79387f 100644 --- a/projects/rccl/src/ring.cu +++ b/projects/rccl/src/ring.cu @@ -26,7 +26,7 @@ ncclResult_t initRing(struct ncclComm* comm, int ringid) { NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize)); ring->devMemRecv = recvMem; - TRACE(INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize); + TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize); // Pre-configure send/recv pointers. Those are the default, they may change later. ring->recv.conn.buff = recvMem->buff; diff --git a/projects/rccl/src/transport.cu b/projects/rccl/src/transport.cu index f5f9d75e29..7c13d5c351 100644 --- a/projects/rccl/src/transport.cu +++ b/projects/rccl/src/transport.cu @@ -113,8 +113,8 @@ ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRoun int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow int nsteps = nstepsPerRound * nrounds * substeps; - TRACE(NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm); - TRACE(NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm); + TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm); + TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm); for (int r=0; rrings+((comm->myParams->gridDim.x+r)%comm->nRings); struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 }; @@ -159,7 +159,7 @@ ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclCo struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send; threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy); if (proxyfunc) { - TRACE(NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm); + TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm); struct transportProxyInfo* info; NCCLCHECK(ncclCalloc(&info, 1)); connector->proxyInfo = info; diff --git a/projects/rccl/src/transport/net.cu b/projects/rccl/src/transport/net.cu index ed62a66c91..8a7e3b8757 100644 --- a/projects/rccl/src/transport/net.cu +++ b/projects/rccl/src/transport/net.cu @@ -19,11 +19,21 @@ #define NET_BITS_PER_IF 3 #define NET_BITS_PER_IF_MASK ((1<= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IFS*NET_BITS_PER_IF must fit in a ncclTvalue_t"); +static ncclTvalue_t getTvalue(short* distances, int ndev) { + ncclTvalue_t tvalue = 0; + for (int d=0; drank = rank; - int *scores; - NCCLCHECK(ncclNetDevices(&info->ndev, &scores)); + NCCLCHECK(ncclNetDevices(&info->ndev)); if (info->ndev == 0) { WARN("Error : Network returned 0 device"); return ncclSystemError; } if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS; - for (int d=0; dndev; d++) info->scores[d] = scores[d]; - free(scores); + + // Find distance with current GPU + int cudaDev; + cudaGetDevice(&cudaDev); + char* cudaPath; + NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); + + char line[1024]; + sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName()); + for (int d=0; dndev; d++) { + char* nicPath; + ncclResult_t err = ncclNetPciPath(d, &nicPath); + info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath); + sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]); + if (err == ncclSuccess) free(nicPath); + } + INFO(NCCL_INIT|NCCL_NET, "%s", line); + free(cudaPath); return ncclSuccess; } /* Determine if we can communicate with the peer */ ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) { - ret[0] = 0; struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; - for (int d=0; dndev; d++) { - // Keep 3 bits of score info per dev - ret[0] |= ((myInfo->scores[d] & NET_BITS_PER_IF_MASK)<<(NET_BITS_PER_IF*d)); - } + ret[0] = getTvalue(myInfo->distances, myInfo->ndev); return ncclSuccess; } @@ -175,13 +196,13 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* return ncclSuccess; } -int getDev(int ringId, int nDev, short* scores) { - int maxScore = 0; - for (int d=0; d maxScore) maxScore = scores[d]; +int getDev(int ringId, int nDev, short* distances) { + int minDistance = PATH_SOC; + for (int d=0; d= netGdrLevel) { + INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel); + return ncclSuccess; + } + + // Finally, check if the NIC supports it + int flags; + NCCLCHECK(ncclNetPtrSupport(dev, &flags)); + if (flags & NCCL_PTR_CUDA == 0) return ncclSuccess; + *useGdr = 1; + INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read); + return ncclSuccess; +} /* Determine if we will use this transport for this peer and return connect * information for this peer */ @@ -200,34 +255,11 @@ ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo ring->send.transportResources = resources; struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; - resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores); - resources->cudaSupport = false; - - // Get user's GDR READ setting - int gdrReadParam = ncclParamNetGdrRead(); - - // Determine whether the GPU has NVLink - int cudaDev; - CUDACHECK(cudaGetDevice(&cudaDev)); - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); - int nvlinks = getNumNvlinks(busId); - - // Enable GDR read when: - // 1) user sets it, or - // 2) we are on a NVSwitch platform (i.e. no P2P traffic over PCI-E switch) AND the GPU is Volta - bool enableGdrRead = (gdrReadParam > 0) || (nvlinks >= CONNECT_NVSWITCH && ncclCudaCompCap() > 6 && gdrReadParam != 0); - if (enableGdrRead) { - int flags; - NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags)); - if (flags & NCCL_PTR_CUDA) - resources->cudaSupport = true; - } - if (resources->cudaSupport) - INFO(INIT|NET, "Net: enabling net device %d to read from rank %d", resources->netDev, myInfo->rank); + resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances); + NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr)); int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize; - if (resources->cudaSupport) { + if (resources->useGdr) { NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size)); } @@ -243,10 +275,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo ring->recv.transportResources = resources; struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; - resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->scores); - int flags; - NCCLCHECK(ncclNetPtrSupport(resources->netDev, &flags)); - resources->cudaSupport = (flags & NCCL_PTR_CUDA) ? true : false; + resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances); + NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr)); int sendSize = sizeof(struct ncclSendMem); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); @@ -255,8 +285,8 @@ ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo; - INFO(INIT|NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev, - resources->cudaSupport ? "/GDRDMA" : "", + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev, + resources->useGdr ? "/GDRDMA" : "", (resources->hostDevMem != NULL) ? "/GDCopy" : ""); struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); @@ -267,7 +297,7 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto // Setup device pointers struct netSendResources* resources = (struct netSendResources*)send->transportResources; - if (resources->cudaSupport) { + if (resources->useGdr) { send->conn.buff = resources->devNetMem->buff; // We don't use devMem for llMode because the CPU has to read the data send->conn.llBuff = resources->devHostRecvMem->llBuff; @@ -299,7 +329,7 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto recv->conn.head = &resources->devHostSendMem->head; recv->conn.llHead = &resources->devHostSendMem->llHead; - if (resources->cudaSupport == false) { + if (resources->useGdr == 0) { recv->conn.buff = resources->devHostRecvMem->buff; recv->conn.llBuff = resources->devHostRecvMem->llBuff; } @@ -320,7 +350,7 @@ ncclResult_t netSendFree(void* transportResources) { struct netSendResources* resources = (struct netSendResources*)transportResources; NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); - if (resources->cudaSupport) + if (resources->useGdr) CUDACHECK(cudaFree(resources->devNetMem)); NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); free(resources); @@ -344,9 +374,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { volatile uint64_t* prevTail = &resources->hostRecvMem->tail; struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem; uint64_t* prevHead = llMode ? &prevMem->llHead : &prevMem->head; - struct ncclRecvMem* localMem = resources->cudaSupport ? resources->devNetMem : resources->hostRecvMem; + struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem; char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff; - int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST; + int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST; volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo; int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize; int sliceSize = buffSize / args->substeps; @@ -362,8 +392,8 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (!args->needProxy) goto nextColl; - TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); - TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); + TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); + TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); // Update in case we skipped some collectives if (llMode == 0) resources->hostRecvMem->opCount = args->opCount; @@ -440,10 +470,10 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { int llMode = args->llMode; volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head; - struct ncclRecvMem* localMem = resources->cudaSupport ? ring->devMemRecv : resources->hostRecvMem; + struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem; char* localBuff = llMode ? localMem->llBuff : localMem->buff; - char* nextBuff = (resources->cudaSupport == false && resources->hostDevMem) ? resources->hostDevMem->buff : NULL; - int ptrType = resources->cudaSupport ? NCCL_PTR_CUDA : NCCL_PTR_HOST; + char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL; + int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST; uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail; int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize; @@ -458,8 +488,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { if (!args->needProxy) goto nextColl; - TRACE(NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); - TRACE(NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); + TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); + TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); if (llMode == 0) { // Waiting for next opCount is only needed before writing nextTail. diff --git a/projects/rccl/src/transport/net_ib.cu b/projects/rccl/src/transport/net_ib.cu index 7b1fc99f46..fb8bd7b7da 100644 --- a/projects/rccl/src/transport/net_ib.cu +++ b/projects/rccl/src/transport/net_ib.cu @@ -82,8 +82,12 @@ static void* ncclIbAsyncThreadMain(void* args) { return NULL; } -static void initDevices() { - if(wrap_ibv_symbols() != ncclSuccess) { return; } +NCCL_PARAM(IbDisable, "IB_DISABLE", 0); + +ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { + if(wrap_ibv_symbols() != ncclSuccess) { return ncclInternalError; } + if (ncclParamIbDisable()) return ncclInternalError; + if (ncclNIbDevs == -1) { pthread_mutex_lock(&ncclIbLock); wrap_ibv_fork_init(); @@ -91,9 +95,9 @@ static void initDevices() { ncclNIbDevs = 0; if (findInterfaces(ncclIbIfName, &ncclIbIfAddr, MAX_IF_NAME_SIZE, 1) != 1) { WARN("NET/IB : No IP interface found."); - return; + return ncclInternalError; } - INFO(INIT|NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName); + INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName); // Detect IB cards int nIbDevs; @@ -105,7 +109,7 @@ static void initDevices() { bool searchNot = userIbEnv && userIbEnv[0] == '^'; int nUserIfs = parseStringList(userIbEnv, userIfs, MAX_IB_DEVS); - if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return; + if (ncclSuccess != wrap_ibv_get_device_list(&devices, &nIbDevs)) return ncclInternalError; for (int d=0; dname, port, userIfs, nUserIfs) ^ searchNot)) { continue; } - INFO(INIT|NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, + INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); ncclIbDevs[ncclNIbDevs].device = d; ncclIbDevs[ncclNIbDevs].port = port; @@ -145,38 +149,29 @@ static void initDevices() { pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); } - if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return; } } + if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } } } - if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return; }; + if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; }; } - pthread_mutex_unlock(&ncclIbLock); } + return ncclSuccess; } -ncclResult_t ncclIbDevices(int* ndev, int** scores) { - initDevices(); +ncclResult_t ncclIbDevices(int* ndev) { *ndev = ncclNIbDevs; - int cudaDev; - cudaGetDevice(&cudaDev); - char* cudaPath; - ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath); - int* sc; - NCCLCHECK(ncclCalloc(&sc, ncclNIbDevs)); - char line[1024]; - sprintf(line, "CUDA Dev %d, IB Ports : ", cudaDev); - for (int d=0; d 0) { - int gdrSupport = ncclIbGdrSupport(dev); - if (gdrSupport > 0) { - INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (%s)", cudaDev, ncclIbDevs[dev].devName, gdrSupport == 1 ? "no module" : "not supported by GPU"); - ibGdrLevel = 0; - } - } - - if (ibGdrLevel <= 0) return ncclSuccess; - - char* cudaPath; - if (getCudaPath(cudaDev, &cudaPath) != ncclSuccess) return ncclSuccess; - char* mlxPath; - if (getMlxPath(ncclIbDevs[dev].devName, &mlxPath) != ncclSuccess) { free(cudaPath); return ncclSuccess; } - int distance = (mlxPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(mlxPath, cudaPath); - free(mlxPath); free(cudaPath); - if (distance < ibGdrLevel) { - *supportedTypes |= NCCL_PTR_CUDA; - } else { - INFO(INIT|NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (distance %d >= %d)", cudaDev, ncclIbDevs[dev].devName, distance, ibGdrLevel); + if (ncclIbGdrSupport(dev) != ncclSuccess) { + INFO(NCCL_INIT|NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for GPU %d / HCA %s (no module or not supported by GPU)", cudaDev, ncclIbDevs[dev].devName); + return ncclSuccess; } + *supportedTypes |= NCCL_PTR_CUDA; return ncclSuccess; } static ncclResult_t GetSocketAddr(union socketAddress* addr) { - if (ncclNIbDevs == -1) initDevices(); memcpy(addr, &ncclIbIfAddr, sizeof(*addr)); return ncclSuccess; } @@ -442,7 +413,6 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { *sendComm = comm; // IB Setup - initDevices(); /*NOTE: We need to do this for ncclNet unit test that bypasses nccl initialization*/ ibv_context* ctx = ncclIbDevs[dev].context; NCCLCHECK(ncclIbInitVerbs(ctx, &comm->verbs)); uint8_t ib_port = ncclIbDevs[dev].port; @@ -464,13 +434,13 @@ ncclResult_t ncclIbConnect(int dev, void* opaqueHandle, void** sendComm) { // RoCE support qpInfo.lid = portAttr.lid; if (qpInfo.lid) { // IB - INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid); + INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d LID %d", dev, ib_port, qpInfo.qpn, qpInfo.mtu, qpInfo.lid); } else { // RoCE union ibv_gid gid; NCCLCHECK(wrap_ibv_query_gid(ctx, ib_port, ncclParamIbGidIndex(), &gid)); qpInfo.spn = gid.global.subnet_prefix; qpInfo.iid = gid.global.interface_id; - INFO(INIT|NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); + INFO(NCCL_INIT|NCCL_NET,"NET/IB: Dev %d Port %d qpn %d mtu %d GID %ld (%lX/%lX)", dev, ib_port, qpInfo.qpn, qpInfo.mtu, ncclParamIbGidIndex(), qpInfo.spn, qpInfo.iid); } NCCLCHECK(socketSend(comm->fd, &qpInfo, sizeof(qpInfo))); @@ -649,7 +619,7 @@ ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); *mrRet = verbs->mrPool+elem; verbs->mrPool[elem].refcnt++; - TRACE(INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey); + TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey); return ncclSuccess; } @@ -903,7 +873,9 @@ ncclResult_t ncclIbCloseListen(void* listenComm) { ncclNet_t ncclNetIb = { "IB", + ncclIbInit, ncclIbDevices, + ncclIbPciPath, ncclIbPtrSupport, ncclIbListen, ncclIbConnect, @@ -917,10 +889,3 @@ ncclNet_t ncclNetIb = { ncclIbCloseListen }; -NCCL_PARAM(IbDisable, "IB_DISABLE", 0); - -bool ncclIbSupport() { - if (ncclParamIbDisable()) return 0; - initDevices(); - return ncclNIbDevs > 0; -} diff --git a/projects/rccl/src/transport/net_socket.cu b/projects/rccl/src/transport/net_socket.cu index cff1973a4d..a8ae866e5a 100644 --- a/projects/rccl/src/transport/net_socket.cu +++ b/projects/rccl/src/transport/net_socket.cu @@ -8,67 +8,58 @@ #include "core.h" #include "socket.h" #include "net.h" -#include "topo.h" #include #include #include #include #include +#include /* Init functions */ +static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; +static union socketAddress ncclNetIfAddrs[MAX_IFS]; +static int ncclNetIfs = -1; +pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { + if (ncclNetIfs == -1) { + pthread_mutex_lock(&ncclSocketLock); + if (ncclNetIfs == -1) { + ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); + INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs); + if (ncclNetIfs <= 0) { + WARN("NET/Socket : no interface found"); + return ncclInternalError; + } + } + pthread_mutex_unlock(&ncclSocketLock); + } + return ncclSuccess; +} ncclResult_t ncclSocketPtrSupport(int dev, int* supportedTypes) { *supportedTypes = NCCL_PTR_HOST; return ncclSuccess; } -static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; -static union socketAddress ncclNetIfAddrs[MAX_IFS]; -static int ncclNetIfs = -1; -pthread_mutex_t ncclSocketLock = PTHREAD_MUTEX_INITIALIZER; - -static void initDevices() { - if (ncclNetIfs == -1) { - pthread_mutex_lock(&ncclSocketLock); - if (ncclNetIfs == -1) { - ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); - INFO(INIT|NET,"NET/Socket : %d interfaces found", ncclNetIfs); - if (ncclNetIfs <= 0) { - WARN("NET/Socket : no interface found"); - } - } - pthread_mutex_unlock(&ncclSocketLock); - } +ncclResult_t ncclSocketDevices(int* ndev) { + *ndev = ncclNetIfs; + return ncclSuccess; } -ncclResult_t ncclSocketDevices(int* ndev, int** scores) { - initDevices(); - *ndev = ncclNetIfs; - int cudaDev; - cudaGetDevice(&cudaDev); - char* cudaPath; - ncclResult_t err1 = getCudaPath(cudaDev, &cudaPath); - int* sc; - NCCLCHECK(ncclCalloc(&sc, ncclNetIfs)); - char line[1024]; - sprintf(line, "CUDA Dev %d, IP Interfaces : ", cudaDev); - for (int i=0; i= ncclNetIfs) return ncclInternalError; memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr)); return ncclSuccess; @@ -223,7 +214,9 @@ ncclResult_t ncclSocketClose(void* opaqueComm) { ncclNet_t ncclNetSocket = { "Socket", + ncclSocketInit, ncclSocketDevices, + ncclSocketPciPath, ncclSocketPtrSupport, ncclSocketListen, ncclSocketConnect, diff --git a/projects/rccl/src/transport/p2p.cu b/projects/rccl/src/transport/p2p.cu index 35aebb4dcd..6c4626a77c 100644 --- a/projects/rccl/src/transport/p2p.cu +++ b/projects/rccl/src/transport/p2p.cu @@ -85,7 +85,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin // See if CUDA can do P2P int p2p; if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) { - INFO(INIT|P2P,"peer query failed between dev %d and dev %d", + INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d", myInfo->cudaDev, peerInfo->cudaDev); return ncclSuccess; } @@ -454,7 +454,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo info.direct = 1; info.directPtr = ring->devMemSend; if (myInfo->cudaDev == peerInfo->cudaDev) { - INFO(INIT|P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank); + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank); } else { // Enable P2P access cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); @@ -465,7 +465,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo peerInfo->cudaDev, err, cudaGetErrorString(err)); return ncclInternalError; } - INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer", + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); } } else { @@ -477,7 +477,7 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err)); return ncclInternalError; } - INFO(INIT|P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC", + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); //TRACE_DUMP_IPC(&info.devIpc); } @@ -495,7 +495,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo info.direct = 1; info.directPtr = ring->devMemRecv; if (myInfo->cudaDev == peerInfo->cudaDev) { - TRACE(INIT|P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank); + TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank); } else { // Enable P2P access cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); @@ -506,7 +506,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo peerInfo->cudaDev, err, cudaGetErrorString(err)); return ncclInternalError; } - TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); } } else { info.direct = 0; @@ -517,7 +517,7 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err)); return ncclInternalError; } - TRACE(INIT|P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); //TRACE_DUMP_IPC(&info.devIpc); } static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); diff --git a/projects/rccl/src/transport/shm.cu b/projects/rccl/src/transport/shm.cu index 38b6c6a0c5..317f652dac 100644 --- a/projects/rccl/src/transport/shm.cu +++ b/projects/rccl/src/transport/shm.cu @@ -168,10 +168,10 @@ ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo char shmName[MAX_SHM_NAME_LEN]; sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank); info.shmSize = resources->shmSize = sizeof(struct ncclSendMem); - TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - INFO(INIT|SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo)); @@ -189,7 +189,7 @@ ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo char shmName[MAX_SHM_NAME_LEN]; sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank); info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize; - TRACE(SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; @@ -207,7 +207,7 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto char shmName[MAX_SHM_NAME_LEN]; sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank); resources->remShmSize = info->shmSize; - TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); // Remove the file to ensure proper clean-up NCCLCHECK(shmUnlink(shmName)); @@ -231,7 +231,7 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto char shmName[MAX_SHM_NAME_LEN]; sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank); resources->remShmSize = info->shmSize; - TRACE(SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); + TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); NCCLCHECK(shmUnlink(shmName)); recv->conn.head = &resources->devRemHostMem->head;