From ffc55560a15b8f1811e78048bab8526643ff3273 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Thu, 13 Dec 2018 15:56:12 -0800 Subject: [PATCH 01/20] 2.4.2-1 Add tree algorithms for allreduce to improve performance at scale. Add ncclCommAbort() and ncclCommGetAsyncError() to properly handle network errors and permit recovery. Detect initial CPU affinity and no longer escape it. [ROCm/rccl commit: 1450d42675be325cd3b7a684d4b231eedceb22fb] --- projects/rccl/makefiles/common.mk | 7 +- projects/rccl/makefiles/version.mk | 4 +- projects/rccl/pkg/redhat/nccl.spec.in | 4 +- projects/rccl/pkg/srctxz/Makefile | 1 + projects/rccl/pkg/srctxz/create_srctxz.sh.in | 3 +- projects/rccl/src/Makefile | 16 +- projects/rccl/src/bootstrap.cu | 242 ++++-- projects/rccl/src/channel.cu | 51 ++ projects/rccl/src/collectives/all_gather.cu | 22 +- projects/rccl/src/collectives/all_reduce.cu | 26 +- projects/rccl/src/collectives/broadcast.cu | 40 +- projects/rccl/src/collectives/collectives.h | 37 +- projects/rccl/src/collectives/device/Makefile | 39 +- .../rccl/src/collectives/device/all_gather.cu | 8 +- .../rccl/src/collectives/device/all_gather.h | 218 +---- .../rccl/src/collectives/device/all_reduce.cu | 14 +- .../rccl/src/collectives/device/all_reduce.h | 377 ++++----- .../rccl/src/collectives/device/broadcast.cu | 8 +- .../rccl/src/collectives/device/broadcast.h | 200 +---- projects/rccl/src/collectives/device/common.h | 112 ++- .../src/collectives/device/common_kernel.h | 192 ++--- .../rccl/src/collectives/device/functions.cu | 10 +- .../rccl/src/collectives/device/gen_rules.sh | 28 + .../rccl/src/collectives/device/ll_kernel.h | 154 ---- .../rccl/src/collectives/device/primitives.h | 745 +++++++++++++----- .../rccl/src/collectives/device/reduce.cu | 14 +- projects/rccl/src/collectives/device/reduce.h | 165 +--- .../src/collectives/device/reduce_kernel.h | 94 +-- .../src/collectives/device/reduce_scatter.cu | 14 +- .../src/collectives/device/reduce_scatter.h | 158 +--- 
projects/rccl/src/collectives/reduce.cu | 23 +- .../rccl/src/collectives/reduce_scatter.cu | 22 +- projects/rccl/src/enqueue.cu | 442 +++++++++++ projects/rccl/src/include/bootstrap.h | 2 + projects/rccl/src/include/channel.h | 14 + projects/rccl/src/include/checks.h | 10 + projects/rccl/src/include/common_coll.h | 195 ----- projects/rccl/src/include/core.h | 186 ++++- projects/rccl/src/include/cpuset.h | 61 ++ projects/rccl/src/include/debug.h | 1 + projects/rccl/src/include/enqueue.h | 7 +- projects/rccl/src/include/nccl_net.h | 46 +- projects/rccl/src/include/net.h | 8 +- projects/rccl/src/include/nvlink.h | 74 +- projects/rccl/src/include/nvmlwrap.h | 18 +- projects/rccl/src/include/ring.h | 14 - projects/rccl/src/include/rings.h | 2 +- projects/rccl/src/include/socket.h | 9 +- projects/rccl/src/include/transport.h | 87 +- projects/rccl/src/include/trees.h | 13 + projects/rccl/src/init.cu | 665 ++++++++++++---- projects/rccl/src/misc/checks.cu | 69 ++ projects/rccl/src/misc/enqueue.cu | 248 ------ projects/rccl/src/misc/group.cu | 12 +- projects/rccl/src/misc/nvmlwrap.cu | 61 +- projects/rccl/src/misc/rings.cu | 61 +- projects/rccl/src/misc/trees.cu | 108 +++ projects/rccl/src/misc/utils.cu | 18 + projects/rccl/src/nccl.h.in | 14 +- projects/rccl/src/ring.cu | 70 -- projects/rccl/src/transport.cu | 331 ++++---- projects/rccl/src/transport/net.cu | 568 ++++++------- projects/rccl/src/transport/net_ib.cu | 221 +++--- projects/rccl/src/transport/net_socket.cu | 28 +- projects/rccl/src/transport/p2p.cu | 229 +++--- projects/rccl/src/transport/shm.cu | 57 +- 66 files changed, 3746 insertions(+), 3251 deletions(-) create mode 100644 projects/rccl/src/channel.cu create mode 100755 projects/rccl/src/collectives/device/gen_rules.sh delete mode 100644 projects/rccl/src/collectives/device/ll_kernel.h create mode 100644 projects/rccl/src/enqueue.cu create mode 100644 projects/rccl/src/include/channel.h create mode 100644 projects/rccl/src/include/checks.h delete mode 
100644 projects/rccl/src/include/common_coll.h create mode 100644 projects/rccl/src/include/cpuset.h delete mode 100644 projects/rccl/src/include/ring.h create mode 100644 projects/rccl/src/include/trees.h create mode 100644 projects/rccl/src/misc/checks.cu delete mode 100644 projects/rccl/src/misc/enqueue.cu create mode 100644 projects/rccl/src/misc/trees.cu delete mode 100644 projects/rccl/src/ring.cu diff --git a/projects/rccl/makefiles/common.mk b/projects/rccl/makefiles/common.mk index 83a2a3951a..d0e2ca847d 100644 --- a/projects/rccl/makefiles/common.mk +++ b/projects/rccl/makefiles/common.mk @@ -15,8 +15,7 @@ PROFAPI ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 -CUDA_INC ?= $(CUDA_HOME)/include -CUDA_VERSION = $(strip $(shell $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) +CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." 
-f 2) @@ -36,14 +35,14 @@ CUDA8_PTX = -gencode=arch=compute_61,code=compute_61 CUDA9_PTX = -gencode=arch=compute_70,code=compute_70 # Include Volta support if we're using CUDA9 or above -ifeq ($(shell test "$(CUDA_MAJOR)" -gt 8; echo $$?),0) +ifeq ($(shell test "0$(CUDA_MAJOR)" -gt 8; echo $$?),0) NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA9_GENCODE) $(CUDA9_PTX) else NVCC_GENCODE ?= $(CUDA8_GENCODE) $(CUDA8_PTX) endif #$(info NVCC_GENCODE is ${NVCC_GENCODE}) -CXXFLAGS := -I$(CUDA_INC) -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden +CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden CXXFLAGS += -Wall -Wno-sign-compare NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk index f9cee6a5a8..a8c6e3ab03 100644 --- a/projects/rccl/makefiles/version.mk +++ b/projects/rccl/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 3 -NCCL_PATCH := 7 +NCCL_MINOR := 4 +NCCL_PATCH := 2 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/projects/rccl/pkg/redhat/nccl.spec.in b/projects/rccl/pkg/redhat/nccl.spec.in index 65a2c60154..f9d83a30df 100644 --- a/projects/rccl/pkg/redhat/nccl.spec.in +++ b/projects/rccl/pkg/redhat/nccl.spec.in @@ -1,6 +1,6 @@ Name: libnccl -Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} -Release: ${pkg:Revision} +Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch}${nccl:Suffix} +Release: ${pkg:Revision}+cuda${cuda:Major}.${cuda:Minor} Summary: NVIDIA Collectives Communication Library (NCCL) Runtime Group: Development/Libraries diff --git a/projects/rccl/pkg/srctxz/Makefile b/projects/rccl/pkg/srctxz/Makefile index 1cb7c06a99..ed677fe3b1 100644 --- a/projects/rccl/pkg/srctxz/Makefile +++ b/projects/rccl/pkg/srctxz/Makefile @@ -36,4 +36,5 @@ $(TXZPREPDIR)/% : %.in -e 
"s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ -e "s/\$${nccl:Suffix}/$(NCCL_SUFFIX)/g" \ + -e "s/\$${pkg:Revision}/$(PKG_REVISION)/g" \ $< > $@ diff --git a/projects/rccl/pkg/srctxz/create_srctxz.sh.in b/projects/rccl/pkg/srctxz/create_srctxz.sh.in index 0b8e6d2b4c..ae7d01f2ff 100644 --- a/projects/rccl/pkg/srctxz/create_srctxz.sh.in +++ b/projects/rccl/pkg/srctxz/create_srctxz.sh.in @@ -25,8 +25,9 @@ NCCL_MAJOR=${nccl:Major} NCCL_MINOR=${nccl:Minor} NCCL_PATCH=${nccl:Patch} NCCL_SUFFIX=${nccl:Suffix} +NCCL_BUILD=${pkg:Revision} -NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}" +NCCLNAME="nccl-src_${NCCL_MAJOR}.${NCCL_MINOR}.${NCCL_PATCH}${NCCL_SUFFIX}-${NCCL_BUILD}" tar --exclude build \ --exclude ".git*" \ diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile index 481000ad16..fe60b115f9 100644 --- a/projects/rccl/src/Makefile +++ b/projects/rccl/src/Makefile @@ -9,8 +9,8 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h -LIBSRCFILES := init.cu ring.cu bootstrap.cu transport.cu misc/group.cu \ - misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/enqueue.cu \ +LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \ + misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \ transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \ collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu @@ -29,11 +29,10 @@ LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) -LDFLAGS += -L${CUDA_LIB} -lcudart_static -lrt +LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a - ##### rules build : 
lib staticlib @@ -41,9 +40,12 @@ lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) staticlib : $(LIBDIR)/$(STATICLIBTARGET) -devicelib: $(INCDIR)/nccl.h +$(DEVICELIB): ALWAYS_REBUILD $(MAKE) -C collectives/device +# Empty target to force rebuild +ALWAYS_REBUILD: + -include $(DEPFILES) $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ) @@ -59,14 +61,14 @@ $(INCDIR)/nccl.h : nccl.h.in -e "s/\$${nccl:Version}/$(NCCL_VERSION)/g" \ $< > $@ -$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) devicelib +$(LIBDIR)/$(LIBTARGET): $(LIBOBJ) $(DEVICELIB) @printf "Linking %-35s > %s\n" $(LIBTARGET) $@ mkdir -p $(LIBDIR) $(CXX) $(CXXFLAGS) -shared -Wl,--no-as-needed -Wl,-soname,$(LIBSONAME) -o $@ $(LIBOBJ) $(DEVICELIB) $(LDFLAGS) ln -sf $(LIBSONAME) $(LIBDIR)/$(LIBNAME) ln -sf $(LIBTARGET) $(LIBDIR)/$(LIBSONAME) -$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) devicelib +$(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) @printf "Archiving %-35s > %s\n" $(STATICLIBTARGET) $@ mkdir -p $(LIBDIR) $(eval TMP := $(shell mktemp -d)) diff --git a/projects/rccl/src/bootstrap.cu b/projects/rccl/src/bootstrap.cu index 13c6e922b1..6b1d5732df 100644 --- a/projects/rccl/src/bootstrap.cu +++ b/projects/rccl/src/bootstrap.cu @@ -15,27 +15,31 @@ // Always use sockets for bootstrap ncclNet_t* ncclBootstrapNet = &ncclNetSocket; -static ncclResult_t bootstrapListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t bootstrapConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; } -static ncclResult_t bootstrapAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t bootstrapTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; } -static ncclResult_t bootstrapCloseSend(void* sendComm) { 
NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; } -static ncclResult_t bootstrapCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; } -static ncclResult_t bootstrapCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; } +static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; } // Additional sync functions based on async + test for bootstrap, using host ptrs. 
-static ncclResult_t bootstrapSend(void* sendComm, void* data, int size) { - void* request; - NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, NCCL_PTR_HOST, &request)); +static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) { + void* request, *mhandle; + NCCLCHECK(ncclBootstrapNet->regMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle)); + NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, mhandle, &request)); + NCCLCHECK(ncclBootstrapNet->deregMr(sendComm, mhandle)); int done = 0; - while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL)); + while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL)); return ncclSuccess; } -static ncclResult_t bootstrapRecv(void* recvComm, void* data, int size) { - void* request; - NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, NCCL_PTR_HOST, &request)); +static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) { + void* request, *mhandle; + NCCLCHECK(ncclBootstrapNet->regMr(recvComm, data, size, NCCL_PTR_HOST, &mhandle)); + NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, mhandle, &request)); + NCCLCHECK(ncclBootstrapNet->deregMr(recvComm, mhandle)); int done = 0; - while (!done) NCCLCHECK(bootstrapTest(request, &done, NULL)); + while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL)); return ncclSuccess; } @@ -51,8 +55,8 @@ struct extId { struct extInfo { int rank; int nranks; - ncclNetHandle_t extHandleListenFromRoot; - ncclNetHandle_t extHandleRing; + ncclNetHandle_t extHandleListenRoot; + ncclNetHandle_t extHandleListen; }; #include @@ -68,28 +72,25 @@ static ncclResult_t setFilesLimit() { static void *bootstrapRoot(void* commId) { struct extInfo info; struct extId* id = (struct extId*)commId; - ncclNetHandle_t *extHandleBstrap = NULL; // for initial rank <-> root information exchange - ncclNetHandle_t *extHandleRing = NULL; // for bootstrap ring creation + ncclNetHandle_t *rankHandles = NULL; + ncclNetHandle_t *rankHandlesRoot = NULL; // for 
initial rank <-> root information exchange ncclNetHandle_t zero = { 0 }; // for sanity checking void* tmpComm; ncclResult_t res; setFilesLimit(); + TRACE(NCCL_INIT, "BEGIN"); /* Receive addresses from all ranks */ int nranks = 0, c = 0; do { - NCCLCHECKGOTO(bootstrapAccept(id->extListenComm, &tmpComm), res, out); - NCCLCHECKGOTO(bootstrapRecv(tmpComm, &info, sizeof(info)), res, out); - NCCLCHECKGOTO(bootstrapCloseRecv(tmpComm), res, out); + NCCLCHECKGOTO(bootstrapNetAccept(id->extListenComm, &tmpComm), res, out); + NCCLCHECKGOTO(bootstrapNetRecv(tmpComm, &info, sizeof(info)), res, out); + NCCLCHECKGOTO(bootstrapNetCloseRecv(tmpComm), res, out); if (c == 0) { - extHandleBstrap = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t)); - extHandleRing = (ncclNetHandle_t *)calloc(info.nranks, sizeof(ncclNetHandle_t)); - if (extHandleBstrap == NULL || extHandleRing == NULL) { - WARN("Bootstrap thread : failed to allocate memory"); - goto out; - } nranks = info.nranks; + NCCLCHECKGOTO(ncclCalloc(&rankHandles, nranks), res, out); + NCCLCHECKGOTO(ncclCalloc(&rankHandlesRoot, nranks), res, out); } if (nranks != info.nranks) { @@ -97,40 +98,43 @@ static void *bootstrapRoot(void* commId) { goto out; } - if (memcmp(&zero, &extHandleBstrap[info.rank], sizeof(ncclNetHandle_t)) != 0) { + if (memcmp(&zero, &rankHandlesRoot[info.rank], sizeof(ncclNetHandle_t)) != 0) { WARN("Bootstrap Root : rank %d of %d ranks has already checked in", info.rank, nranks); goto out; } - // Save the connection handle for connecting back to the ranks - memcpy(&extHandleBstrap[info.rank], info.extHandleListenFromRoot, sizeof(ncclNetHandle_t)); - // Save the connection handle for the AllGather ring - memcpy(&extHandleRing[info.rank], info.extHandleRing, sizeof(ncclNetHandle_t)); + // Save the connection handle for that rank + memcpy(rankHandlesRoot+info.rank, info.extHandleListenRoot, sizeof(ncclNetHandle_t)); + memcpy(rankHandles+info.rank, info.extHandleListen, sizeof(ncclNetHandle_t)); ++c; } 
while (c < nranks); + TRACE(NCCL_INIT, "COLLECTED HANDLES"); // Send the connect handle for the next rank in the AllGather ring for (int r=0; rextListenComm); + bootstrapNetCloseListen(id->extListenComm); free(commId); - free(extHandleBstrap); - free(extHandleRing); + if (rankHandles) free(rankHandles); + if (rankHandlesRoot) free(rankHandlesRoot); + + TRACE(NCCL_INIT, "DONE"); return NULL; } ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv) { struct extId* id = (struct extId*)commId; id->hostHash = getHostHash(); - NCCLCHECK(bootstrapListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm)); + NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, &id->extHandleRoot, &id->extListenComm)); ncclUniqueId* threadIdCopy; NCCLCHECK(ncclCalloc(&threadIdCopy, 1)); memcpy(threadIdCopy, id, sizeof(ncclUniqueId)); @@ -157,10 +161,18 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) { return ncclSuccess; } +struct unexConn { + int peer; + void* comm; + struct unexConn* next; +}; + struct extState { + void* extBstrapListenComm; void* extBstrapRingRecvComm; void* extBstrapRingSendComm; - ncclNetHandle_t extBstrapRootHandle; + ncclNetHandle_t* peerBstrapHandles; + struct unexConn* unexpectedConnections; int rank; int nranks; int dev; @@ -174,39 +186,56 @@ ncclResult_t bootstrapInit(ncclUniqueId* commId, int rank, int nranks, void** co state->rank = rank; state->nranks = nranks; *commState = state; - void* extBstrapRootListenComm; // comm on which we accept root's connections + + TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); struct extInfo info = { 0 }; info.rank = rank; info.nranks = nranks; - void *tmpSendComm, *extBstrapRingListenComm, *tmpRecvComm; + void *tmpSendComm, *tmpRecvComm; // Pass the remote address to listen via info if (idFromEnv) { - memcpy(&info.extHandleListenFromRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t)); - memcpy(&info.extHandleRing, &id->extHandleRoot, sizeof(ncclNetHandle_t)); + 
memcpy(&info.extHandleListen, &id->extHandleRoot, sizeof(ncclNetHandle_t)); + memcpy(&info.extHandleListenRoot, &id->extHandleRoot, sizeof(ncclNetHandle_t)); } // listen will return the local address via info (specify interface type 'findSubnetIf') state->dev = idFromEnv ? findSubnetIf : 0; - NCCLCHECK(bootstrapListen(state->dev, &info.extHandleListenFromRoot, &extBstrapRootListenComm)); - NCCLCHECK(bootstrapListen(state->dev, &info.extHandleRing, &extBstrapRingListenComm)); // AllGather Ring + void* extBstrapListenCommRoot; + NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListen, &state->extBstrapListenComm)); + NCCLCHECK(bootstrapNetListen(state->dev, &info.extHandleListenRoot, &extBstrapListenCommRoot)); - memcpy(&state->extBstrapRootHandle, &id->extHandleRoot, sizeof(ncclNetHandle_t)); - // send info on my listening sockets to root - NCCLCHECK(bootstrapConnect(state->dev, id->extHandleRoot, &tmpSendComm)); - NCCLCHECK(bootstrapSend(tmpSendComm, &info, sizeof(info))); - NCCLCHECK(bootstrapCloseSend(tmpSendComm)); + // stagger connection times to avoid an overload of the root at very high rank counts + if (nranks > 128) { + long msec = rank; + struct timespec tv; + tv.tv_sec = msec / 1000; + tv.tv_nsec = 1000000 * (msec % 1000); + TRACE(NCCL_INIT, "rank %d delaying connection to root by %ld msec", rank, msec); + (void) nanosleep(&tv, NULL); + } + + // send info on my listening socket to root + NCCLCHECK(bootstrapNetConnect(state->dev, id->extHandleRoot, &tmpSendComm)); + NCCLCHECK(bootstrapNetSend(tmpSendComm, &info, sizeof(info))); + NCCLCHECK(bootstrapNetCloseSend(tmpSendComm)); // get info on my "next" rank in the bootstrap ring from root ncclNetHandle_t extHandleNext; - NCCLCHECK(bootstrapAccept(extBstrapRootListenComm, &tmpRecvComm)); - NCCLCHECK(bootstrapRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext))); - NCCLCHECK(bootstrapCloseRecv(tmpRecvComm)); + NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm)); + 
NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext))); + NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm)); + NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot)); - NCCLCHECK(bootstrapConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm)); + NCCLCHECK(bootstrapNetConnect(state->dev, extHandleNext, &state->extBstrapRingSendComm)); // Accept the connect request from the previous rank in the AllGather ring - NCCLCHECK(bootstrapAccept(extBstrapRingListenComm, &state->extBstrapRingRecvComm)); - NCCLCHECK(bootstrapCloseListen(extBstrapRingListenComm)); - NCCLCHECK(bootstrapCloseListen(extBstrapRootListenComm)); + NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &state->extBstrapRingRecvComm)); + + // AllGather all listen handlers + NCCLCHECK(ncclCalloc(&state->peerBstrapHandles, nranks)); + memcpy(state->peerBstrapHandles+rank, info.extHandleListen, sizeof(ncclNetHandle_t)); + NCCLCHECK(bootstrapAllGather(state, state->peerBstrapHandles, sizeof(ncclNetHandle_t))); + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; } @@ -224,25 +253,106 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { * and send previous step's data from (rank-i) to right */ for (int i=0; iextBstrapRingSendComm, data+sslice*size, size)); + NCCLCHECK(bootstrapNetSend(state->extBstrapRingSendComm, data+sslice*size, size)); // Recv slice from the left - NCCLCHECK(bootstrapRecv(state->extBstrapRingRecvComm, data+rslice*size, size)); + NCCLCHECK(bootstrapNetRecv(state->extBstrapRingRecvComm, data+rslice*size, size)); } TRACE(NCCL_INIT, "rank %d nranks %d size %d - DONE", rank, nranks, size); return ncclSuccess; } -ncclResult_t bootstrapClose(void* commState) { +ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) { + struct extState* state = (struct extState*)commState; + void* tmpSendComm; + NCCLCHECK(bootstrapNetConnect(state->dev, state->peerBstrapHandles[peer], 
&tmpSendComm)); + NCCLCHECK(bootstrapNetSend(tmpSendComm, &state->rank, sizeof(int))); + NCCLCHECK(bootstrapNetSend(tmpSendComm, data, size)); + NCCLCHECK(bootstrapNetCloseSend(tmpSendComm)); + return ncclSuccess; +} + +ncclResult_t unexpectedEnqueue(struct extState* state, int peer, void* comm) { + // New unex + struct unexConn* unex; + NCCLCHECK(ncclCalloc(&unex, 1)); + unex->peer = peer; + unex->comm = comm; + + // Enqueue + struct unexConn* list = state->unexpectedConnections; + if (list == NULL) { + state->unexpectedConnections = unex; + return ncclSuccess; + } + while (list->next) list = list->next; + list->next = unex; + return ncclSuccess; +} + +void* unexpectedDequeue(struct extState* state, int peer) { + struct unexConn* elem = state->unexpectedConnections; + struct unexConn* prev = NULL; + while (elem) { + if (elem->peer == peer) { + if (prev == NULL) { + state->unexpectedConnections = elem->next; + } else { + prev->next = elem->next; + } + void* comm = elem->comm; + free(elem); + return comm; + } + prev = elem; + elem = elem->next; + } + return NULL; +} + +// We can't know who we'll receive from, so we need to receive everything at once +ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) { struct extState* state = (struct extState*)commState; - NCCLCHECK(bootstrapCloseSend(state->extBstrapRingSendComm)); - NCCLCHECK(bootstrapCloseRecv(state->extBstrapRingRecvComm)); + void* tmpRecvComm; + // Search unexpected connections first + if ((tmpRecvComm = unexpectedDequeue(state, peer)) != NULL) { + NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size)); + NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm)); + return ncclSuccess; + } + + // Then look for new connections + while (1) { + NCCLCHECK(bootstrapNetAccept(state->extBstrapListenComm, &tmpRecvComm)); + int newPeer; + NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &newPeer, sizeof(int))); + if (newPeer == peer) { + NCCLCHECK(bootstrapNetRecv(tmpRecvComm, ((char*)data), size)); + 
NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm)); + return ncclSuccess; + } + // Unexpected connection. Save for later. + NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvComm)); + } +} + +ncclResult_t bootstrapClose(void* commState) { + struct extState* state = (struct extState*)commState; + if (state->unexpectedConnections != NULL) { + WARN("Unexpected connections are not empty.\n"); + return ncclInternalError; + } + NCCLCHECK(bootstrapNetCloseListen(state->extBstrapListenComm)); + NCCLCHECK(bootstrapNetCloseSend(state->extBstrapRingSendComm)); + NCCLCHECK(bootstrapNetCloseRecv(state->extBstrapRingRecvComm)); + + free(state->peerBstrapHandles); free(state); return ncclSuccess; diff --git a/projects/rccl/src/channel.cu b/projects/rccl/src/channel.cu new file mode 100644 index 0000000000..937e84e7a8 --- /dev/null +++ b/projects/rccl/src/channel.cu @@ -0,0 +1,51 @@ +/************************************************************************* + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "channel.h" +#include "param.h" + +NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES); + +ncclResult_t initChannel(struct ncclComm* comm, int channelid) { + struct ncclChannel* channel = comm->channels+channelid; + channel->id = channelid; + + // Setup intermediate buffering + channel->buffSize = ncclParamBuffsize(); + + // Ring index to user rank table. + NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); + NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); + + // Communication structures with peers. + NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks)); + NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks)); + for (size_t i=0; inRanks; ++i) { + channel->peers[i].send.comm = comm; + channel->peers[i].recv.comm = comm; + } + + // Per-channel operation list. 
+ NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS)); + return ncclSuccess; +} + +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { + // Operation list + NCCLCHECK(ncclCudaHostFree(channel->collectives)); + + // Free Ring index to rank tables + free(channel->ring.userRanks); + CUDACHECK(cudaFree(channel->ring.devUserRanks)); + + // Free transport proxy resources + for (int r=0; rpeers+r; + if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources)); + if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources)); + } + return ncclSuccess; +} diff --git a/projects/rccl/src/collectives/all_gather.cu b/projects/rccl/src/collectives/all_gather.cu index 8dec28e63b..db21deef25 100644 --- a/projects/rccl/src/collectives/all_gather.cu +++ b/projects/rccl/src/collectives/all_gather.cu @@ -4,29 +4,15 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "common_coll.h" #include "enqueue.h" #include "collectives.h" -ncclResult_t ncclAllGatherFunc(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"AllGather: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(ALLGATHER_SUBSTEPS, ALLGATHER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm)); - 
NCCLCHECK(saveKernel(ncclCollAllGather, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes*comm->nRanks, 1)); - } - return ncclSuccess; -} - NCCL_API(ncclResult_t, ncclAllGather, const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclAllGatherFunc, "AllGather", sendbuff, recvbuff, sendcount, datatype, - ncclSum, 0, comm, stream); + struct ncclInfo info = { ncclCollAllGather, "AllGather", + sendbuff, recvbuff, sendcount, datatype, ncclSum, 0, comm, stream, /* Args */ + ALLGATHER_CHUNKSTEPS, ALLGATHER_SLICESTEPS }; + return ncclEnqueueCheck(&info); } diff --git a/projects/rccl/src/collectives/all_reduce.cu b/projects/rccl/src/collectives/all_reduce.cu index cc14083ab7..1492c90949 100644 --- a/projects/rccl/src/collectives/all_reduce.cu +++ b/projects/rccl/src/collectives/all_reduce.cu @@ -4,29 +4,15 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "common_coll.h" #include "enqueue.h" #include "collectives.h" -ncclResult_t ncclAllReduceFunc(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"AllReduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(ALLREDUCE_SUBSTEPS, ALLREDUCE_BUFCHUNKS, (comm->nRanks)*2-2, comm->nRanks, 
nbytes, proxyPatternRing, comm)); - NCCLCHECK(saveKernel(ncclCollAllReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, comm->nRanks)); - } - return ncclSuccess; -} - NCCL_API(ncclResult_t, ncclAllReduce, const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclAllReduceFunc, "AllReduce", sendbuff, recvbuff, count, datatype, - op, 0, comm, stream); + ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { + struct ncclInfo info = { ncclCollAllReduce, "AllReduce", + sendbuff, recvbuff, count, datatype, op, 0, comm, stream, /* Args */ + ALLREDUCE_CHUNKSTEPS, ALLREDUCE_SLICESTEPS }; + return ncclEnqueueCheck(&info); } diff --git a/projects/rccl/src/collectives/broadcast.cu b/projects/rccl/src/collectives/broadcast.cu index 91ce905440..6a3d0a8b84 100644 --- a/projects/rccl/src/collectives/broadcast.cu +++ b/projects/rccl/src/collectives/broadcast.cu @@ -4,39 +4,23 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "common_coll.h" #include "enqueue.h" #include "collectives.h" -ncclResult_t ncclBroadcastFunc(const void* sendbuff, void* recvbuff, const size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"Broadcast: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - 
CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(BROADCAST_SUBSTEPS, BROADCAST_BUFCHUNKS, 1, 1, nbytes, proxyPatternFrom(root), comm)); - NCCLCHECK(saveKernel(ncclCollBroadcast, sendbuff, recvbuff, nbytes, ncclInt8, op, root, comm, stream, nbytes, 1)); - } - - return ncclSuccess; -} - -/* Deprecated original "in place" function, similar to MPI */ -NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclBroadcastFunc, "Bcast", buff, buff, count, datatype, - ncclSum, root, comm, stream); -} - NCCL_API(ncclResult_t, ncclBroadcast, const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclBroadcastFunc, "Broadcast", sendbuff, recvbuff, count, datatype, - ncclSum, root, comm, stream); + struct ncclInfo info = { ncclCollBroadcast, "Broadcast", + sendbuff, recvbuff, count, datatype, ncclSum, root, comm, stream, /* Args */ + BROADCAST_CHUNKSTEPS, BROADCAST_SLICESTEPS }; + return ncclEnqueueCheck(&info); } +/* Deprecated original "in place" function, similar to MPI */ +NCCL_API(ncclResult_t, ncclBcast, void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, + ncclComm_t comm, cudaStream_t stream) { + return ncclBroadcast(buff, buff, count, datatype, root, comm, stream); +} + diff --git a/projects/rccl/src/collectives/collectives.h b/projects/rccl/src/collectives/collectives.h index 
4a5cb7a98d..e6b19cb786 100644 --- a/projects/rccl/src/collectives/collectives.h +++ b/projects/rccl/src/collectives/collectives.h @@ -7,9 +7,7 @@ #ifndef NCCL_COLLECTIVES_H_ #define NCCL_COLLECTIVES_H_ -typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; - -#define FUNC_INDEX(coll, redop, dtype, ll) ((((coll*ncclNumOps + redop)*ncclNumTypes) + dtype)*2+ll) +#define FUNC_INDEX(coll, redop, dtype, ll, al) ((((((coll)*ncclNumOps + (redop))*ncclNumTypes) + (dtype))*2+(al))*2+(ll)) #define NCCL_COLL_NAME(coll, op, dtype) \ coll##_##op##_##dtype @@ -18,13 +16,17 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed coll##Kernel_##op##_##dtype /* Declare all collective operations */ -#define DECL_COLL4(coll, op, dtype) \ +#define DECL_COLL5(coll, op, dtype) \ extern __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* args); \ - extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl coll); \ + extern __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl c); \ + +#define DECL_COLL4(coll, op, dtype) \ + DECL_COLL5(coll, op, dtype) \ + DECL_COLL5(coll##LL, op, dtype) #define DECL_COLL3(coll, op, dtype) \ - DECL_COLL4(coll##LL, op, dtype) \ - DECL_COLL4(coll, op, dtype) + DECL_COLL4(coll##Ring, op, dtype) \ + DECL_COLL4(coll##Tree, op, dtype) #define DECL_COLL2(coll, op) \ DECL_COLL3(coll, op, i8) \ @@ -52,15 +54,16 @@ typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollRed DECL_ALL_COLLS -#define ALLREDUCE_SUBSTEPS 2 -#define ALLREDUCE_BUFCHUNKS 2 -#define ALLGATHER_SUBSTEPS 2 -#define ALLGATHER_BUFCHUNKS 2 -#define REDUCESCATTER_SUBSTEPS 2 -#define REDUCESCATTER_BUFCHUNKS 2 -#define BROADCAST_SUBSTEPS 8 -#define BROADCAST_BUFCHUNKS 2 -#define REDUCE_SUBSTEPS 8 -#define REDUCE_BUFCHUNKS 2 +// CHUNKSIZE must be a multiple of SLICESIZE +#define ALLREDUCE_SLICESTEPS (NCCL_STEPS/4) +#define 
ALLREDUCE_CHUNKSTEPS (NCCL_STEPS/2) +#define ALLGATHER_SLICESTEPS (NCCL_STEPS/4) +#define ALLGATHER_CHUNKSTEPS (NCCL_STEPS/2) +#define REDUCESCATTER_SLICESTEPS (NCCL_STEPS/4) +#define REDUCESCATTER_CHUNKSTEPS (NCCL_STEPS/2) +#define BROADCAST_SLICESTEPS 1 +#define BROADCAST_CHUNKSTEPS 1 +#define REDUCE_SLICESTEPS 1 +#define REDUCE_CHUNKSTEPS 1 #endif diff --git a/projects/rccl/src/collectives/device/Makefile b/projects/rccl/src/collectives/device/Makefile index e2bcd49007..8e92596f27 100644 --- a/projects/rccl/src/collectives/device/Makefile +++ b/projects/rccl/src/collectives/device/Makefile @@ -12,18 +12,13 @@ OBJDIR := $(BUILDDIR)/obj/collectives/device LIBSRCFILES := all_reduce.cu broadcast.cu reduce.cu all_gather.cu reduce_scatter.cu -LIBOBJ := $(patsubst %.cu,$(OBJDIR)/%_sum.o, $(LIBSRCFILES)) \ - $(patsubst %.cu,$(OBJDIR)/%_prod.o, $(LIBSRCFILES)) \ - $(patsubst %.cu,$(OBJDIR)/%_min.o, $(LIBSRCFILES)) \ - $(patsubst %.cu,$(OBJDIR)/%_max.o, $(LIBSRCFILES)) \ - $(OBJDIR)/functions.o - LIBSRCFILES += functions.cu DEPFILES := $(patsubst %.cu, $(OBJDIR)/%.d, $(LIBSRCFILES)) -DEPENDFILES := $(DEPFILES:%.d=%.dep) +DEPENDFILES:= $(DEPFILES:%.d=%.dep) STATICLIB := $(OBJDIR)/colldevice.a DEVOBJ := $(OBJDIR)/devlink.o +RULESFILE := $(OBJDIR)/Makefile.rules NVCUFLAGS += -I. -I.. 
-I$(BUILDDIR)/include -I../../include --compiler-options "-fPIC -fvisibility=hidden" @@ -33,6 +28,16 @@ all: $(STATICLIB) # Dummy rule so that the extra dependency (%.dep) files are preserved by make all_deps: $(DEPENDFILES) +# Auto-generating the rules per op/reduction/datatype/algorithm +$(RULESFILE) : + @printf "Generating %-35s > %s\n" rules $@ + @mkdir -p $(OBJDIR) + @./gen_rules.sh $(OBJDIR) > $@ + +-include $(RULESFILE) + +LIBOBJ := $(GENOBJS) $(OBJDIR)/functions.o + -include $(DEPFILES) $(STATICLIB): $(LIBOBJ) $(DEVOBJ) @@ -58,26 +63,6 @@ $(OBJDIR)/functions.o : functions.cu $(OBJDIR)/functions.dep mkdir -p `dirname $@` $(NVCC) $(NVCUFLAGS) -dc $< -o $@ -$(OBJDIR)/%_sum.o : %.cu $(OBJDIR)/%.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) -DNCCL_OP=0 $(NVCUFLAGS) -dc $< -o $@ - -$(OBJDIR)/%_prod.o : %.cu $(OBJDIR)/%.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) -DNCCL_OP=1 $(NVCUFLAGS) -dc $< -o $@ - -$(OBJDIR)/%_min.o : %.cu $(OBJDIR)/%.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) -DNCCL_OP=2 $(NVCUFLAGS) -dc $< -o $@ - -$(OBJDIR)/%_max.o : %.cu $(OBJDIR)/%.dep - @printf "Compiling %-35s > %s\n" $< $@ - mkdir -p `dirname $@` - $(NVCC) -DNCCL_OP=3 $(NVCUFLAGS) -dc $< -o $@ - # ... and create the device-side linked object with all those. 
$(DEVOBJ) : $(LIBOBJ) $(NVCC) $(NVCUFLAGS) -dlink $^ -o $@ diff --git a/projects/rccl/src/collectives/device/all_gather.cu b/projects/rccl/src/collectives/device/all_gather.cu index 0f572ce7cb..530bf1457d 100644 --- a/projects/rccl/src/collectives/device/all_gather.cu +++ b/projects/rccl/src/collectives/device/all_gather.cu @@ -4,12 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "all_gather.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 -IMPL_COLL3(ncclAllGather, copy, FuncSum, i8, int8_t, ncclCollAllGather, ncclSum, ncclInt8); -#endif +IMPL_COLL_C(ncclAllGather, ncclCollAllGather); diff --git a/projects/rccl/src/collectives/device/all_gather.h b/projects/rccl/src/collectives/device/all_gather.h index a30e575570..36809c916c 100644 --- a/projects/rccl/src/collectives/device/all_gather.h +++ b/projects/rccl/src/collectives/device/all_gather.h @@ -8,72 +8,35 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - template -__device__ void ncclAllGatherKernel(struct CollectiveArgs* args) { +__device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - __shared__ T* sharedNextOutput; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - int prevdirect = ring->recv.conn.direct; - int nextdirect = ring->send.conn.direct; - - WaitFlag waitDoneFromNext(ring->send.conn.head, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLGATHER_SUBSTEPS); - PostFlag postDoneToPrev(ring->recv.conn.head, ALLGATHER_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, 
ring->send.conn.fifo, ALLGATHER_BUFCHUNKS*ALLGATHER_SUBSTEPS); - - typedef Primitives Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / ALLGATHER_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; - - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - if (prevdirect) { - *ring->recv.conn.ptrExchange = args->ThisOutput; - } - if (nextdirect) { - void* volatile* ptr = &(ring->devMemSend->ptrExchange); - while (*ptr == nullptr); - sharedNextOutput = (T*)*ptr; - *ptr = nullptr; - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int poffset, noffset = 0; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives + prims(tid, nthreads, &ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + bid*chunkSize; + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t chunkOffset = gridOffset + bid*realChunkSize; 
/////////////// begin AllGather steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(realChunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU @@ -81,129 +44,51 @@ __device__ void ncclAllGatherKernel(struct CollectiveArgs* args) { offset = chunkOffset + rankDest * size; if (thisInput + chunkOffset == thisOutput + offset) { // In place - Prims::Copy(tid, nthreads, - thisInput + chunkOffset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.directSend(thisInput+chunkOffset, offset, nelem); } else { - Prims::DoubleCopy(tid, nthreads, - thisInput + chunkOffset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.directCopySend(thisInput+chunkOffset, thisOutput+offset, offset, nelem); } - NEXT_STEP; // Increases step, poffset, noffset - // k-2 steps: copy to next GPU - if (prevdirect) { - for (int j=1; jdevUserRanks[nranks-j]; - offset = chunkOffset + rankDest * size; - - Prims::Copy(tid, nthreads, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; - } - Prims::Copy(tid, nthreads, - NULL, - NULL, - 0, 0, - step, - waitReadyFromPrev, - postDoneToPrev); - } else { - for (int j=1; jdevUserRanks[nranks-j]; - offset = chunkOffset + rankDest * size; - - Prims::DoubleCopy(tid, nthreads, - prevInput + poffset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; - } - - // Make final copy from buffer to dest. 
- rankDest = ring->devUserRanks[1]; + for (int j=1; jdevUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - // Here we need to copy from buffer to this output. - Prims::Copy(tid, nthreads, - prevInput + poffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.directRecvCopySend(thisOutput+offset, offset, nelem); } - } - if (tid == 0) { - waitDoneFromNext.wait(ALLGATHER_SUBSTEPS*(step + ALLGATHER_BUFCHUNKS)); - *ring->send.conn.head = 0ULL; - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; + // Make final copy from buffer to dest. + rankDest = ring->devUserRanks[1]; + offset = chunkOffset + rankDest * size; + + // Final wait/copy. + prims.directRecv(thisOutput+offset, offset, nelem); } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - poffset = noffset; \ - pflag = nflag; \ - noffset += NCCL_LL_SLICE_LINES; \ - if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \ - nflag++; \ - step++; +template +__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { } template -__device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) { +__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives LL; + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; //const int rank = comm->rank; const int nranks = 
comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t pflag, nflag = step + 1; - int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -213,57 +98,34 @@ __device__ void ncclAllGatherLLKernel(struct CollectiveArgs* args) { /////////////// begin AllGather steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(chunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; if (thisInput + chunkOffset == thisOutput + offset) { // In place - LL::ReduceCopy( - thisInput + chunkOffset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); + LLprims.send(thisInput+chunkOffset, nelem); } else { - LL::ReduceCopy( - thisInput + chunkOffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); + LLprims.copySend(thisInput+chunkOffset, thisOutput+offset, nelem); } - POST_SIZE; - - NEXT_STEP_LL; // k-2 steps: copy to next GPU for (int j=1; jdevUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvCopySend(thisOutput+offset, nelem); } // step k-1: final store rankDest = ring->devUserRanks[1]; 
offset = chunkOffset + rankDest * size; - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recv(thisOutput+offset, nelem); } - - FIFO_CLEANING_AND_SAVE_STEP(nflag); } + +template +__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/projects/rccl/src/collectives/device/all_reduce.cu b/projects/rccl/src/collectives/device/all_reduce.cu index caa1479c12..aaa96b4175 100644 --- a/projects/rccl/src/collectives/device/all_reduce.cu +++ b/projects/rccl/src/collectives/device/all_reduce.cu @@ -4,18 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "all_reduce.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 -IMPL_COLL2(ncclAllReduce, sum, FuncSum, ncclCollAllReduce, ncclSum); -#elif NCCL_OP == 1 -IMPL_COLL2(ncclAllReduce, prod, FuncProd, ncclCollAllReduce, ncclProd); -#elif NCCL_OP == 2 -IMPL_COLL2(ncclAllReduce, min, FuncMin, ncclCollAllReduce, ncclMin); -#elif NCCL_OP == 3 -IMPL_COLL2(ncclAllReduce, max, FuncMax, ncclCollAllReduce, ncclMax); -#endif +IMPL_COLL_R(ncclAllReduce, ncclCollAllReduce); diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h index d7abc6445b..ea89a71255 100644 --- a/projects/rccl/src/collectives/device/all_reduce.h +++ b/projects/rccl/src/collectives/device/all_reduce.h @@ -8,233 +8,152 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - template -__device__ void ncclAllReduceKernel(struct CollectiveArgs* args) { +__device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = 
args->bid; - __shared__ T* sharedNextOutput; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - int prevdirect = ring->recv.conn.direct; - int nextdirect = ring->send.conn.direct; - - WaitFlag waitDoneFromNext(ring->send.conn.head, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, ALLREDUCE_SUBSTEPS); - PostFlag postDoneToPrev(ring->recv.conn.head, ALLREDUCE_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, ALLREDUCE_BUFCHUNKS*ALLREDUCE_SUBSTEPS); - - typedef Primitives Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; - //const int rank = comm->rank; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / ALLREDUCE_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; - - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - if (prevdirect) { - *ring->recv.conn.ptrExchange = args->ThisOutput; - } - if (nextdirect) { - void* volatile* ptr = &(ring->devMemSend->ptrExchange); - while (*ptr == nullptr); - sharedNextOutput = (T*)*ptr; - *ptr = nullptr; - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int poffset, noffset = 0; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives + prims(tid, nthreads, 
&ring->prev, &ring->next, thisOutput, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += nranks*loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,nranks*args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + bid*nranks*chunkSize; + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,nranks*args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t chunkOffset = gridOffset + bid*nranks*realChunkSize; /////////////// begin AllReduce steps /////////////// ssize_t offset; - int maxOffset; + int nelem; int slice; // step 0: push data to next GPU slice = ring->devUserRanks[nranks-1]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + offset = chunkOffset + slice * realChunkSize; + nelem = min(realChunkSize, size-offset); - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - - NEXT_STEP; // Increases step, poffset, noffset + prims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; jdevUserRanks[nranks-j]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + offset = chunkOffset + slice * realChunkSize; + nelem = min(realChunkSize, size-offset); - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; + prims.recvReduceSend(thisInput+offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU slice = ring->devUserRanks[0]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + offset = chunkOffset + slice * 
realChunkSize; + nelem = min(realChunkSize, size-offset); - Prims::ReduceCopy(tid, nthreads, - prevInput + poffset, - thisInput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - thisOutput + offset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; + prims.directRecvReduceCopySend(thisInput+offset, thisOutput+offset, offset, nelem); // k-2 steps: copy to next GPU - if (prevdirect) { - for (int j=1; jdevUserRanks[nranks - j]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + for (int j=1; jdevUserRanks[nranks-j]; + offset = chunkOffset + slice * realChunkSize; + nelem = min(realChunkSize, size-offset); - Prims::Copy(tid, nthreads, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; - } - Prims::Copy(tid, nthreads, - NULL, - NULL, - 0, 0, - step, - waitReadyFromPrev, - postDoneToPrev); - } else { - for (int j=1; jdevUserRanks[nranks - j]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); - - Prims::DoubleCopy(tid, nthreads, - prevInput + poffset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + noffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; - } - - // Make final copy from buffer to dest. - slice = ring->devUserRanks[1]; - offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); - - // Here we need to copy from buffer to this output. 
- Prims::Copy(tid, nthreads, - prevInput + poffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.directRecvCopySend(thisOutput+offset, offset, nelem); } - } - if (tid == 0) { - // Wait for next to have consumed all data before we reset the flag - waitDoneFromNext.wait(ALLREDUCE_SUBSTEPS*(step + ALLREDUCE_BUFCHUNKS)); - *ring->send.conn.head = 0ULL; - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; + // Make final copy from buffer to dest. + slice = ring->devUserRanks[1]; + offset = chunkOffset + slice * realChunkSize; + nelem = min(realChunkSize, size-offset); + + // Final wait/copy. + prims.directRecv(thisOutput+offset, offset, nelem); } } -#include "ll_kernel.h" +template +__device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = blockDim.x - 1; + const int bid = args->bid; + struct ncclComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclTree* tree = &channel->tree; + const ssize_t size = args->N; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = args->lastChunkSize; + const ssize_t loopSize = args->nChannels*chunkSize; -#define NEXT_STEP_LL \ - poffset = noffset; \ - pflag = nflag; \ - noffset += NCCL_LL_SLICE_LINES; \ - if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \ - nflag++; \ - step++; + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + do { + // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) + ncclPrimitives prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); 
+ if (tree->up == -1) { + prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + prims.send(thisInput+offset, nelem); + } else { + prims.recvReduceSend(thisInput+offset, nelem); + } + } + } while(0); + + do { + // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) + ncclPrimitives prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + prims.send(thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + prims.recv(thisOutput+offset, nelem); + } else { + prims.recvCopySend(thisOutput+offset, nelem); + } + } + } while(0); +} template -__device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) { +__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives LL; + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; //const int rank = comm->rank; const int nranks = comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*nranks*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t pflag, nflag = step + 1; - int poffset, noffset = 
NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*nranks*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -244,89 +163,99 @@ __device__ void ncclAllReduceLLKernel(struct CollectiveArgs* args) { /////////////// begin AllReduce steps /////////////// ssize_t offset; - int maxOffset; + int nelem; int slice; // step 0: push data to next GPU slice = ring->devUserRanks[nranks-1]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); - POST_SIZE; - - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; jdevUserRanks[nranks-j]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvReduceSend(thisInput+offset, nelem); } // step k-1: reduce this buffer and data, which will produce the final // result that we store in this data and push to the next GPU slice = ring->devUserRanks[0]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; 
- ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem); // k-2 steps: copy to next GPU for (int j=1; jdevUserRanks[nranks - j]; + slice = ring->devUserRanks[nranks-j]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); - WAIT_NEXT; - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvCopySend(thisOutput+offset, nelem); } // Make final copy from buffer to dest. slice = ring->devUserRanks[1]; offset = chunkOffset + slice * chunkSize; - maxOffset = min(chunkSize, size-offset); + nelem = min(chunkSize, size-offset); // Here we need to copy from buffer to this output. - LL::ReduceCopy( - prevInput + poffset, - thisOutput + offset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recv(thisOutput+offset, nelem); } - - FIFO_CLEANING_AND_SAVE_STEP(nflag); +} + +template +__device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { + const int tid = threadIdx.x; + const int nthreads = args->nThreads; + const int bid = args->bid; + struct ncclComm* comm = args->comm; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclTree* tree = &channel->tree; + const ssize_t size = args->N; + ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); + const ssize_t loopSize = args->nChannels*chunkSize; + + // Compute pointers + const T * __restrict__ thisInput = (const T*)args->ThisInput; + T * __restrict__ thisOutput = (T*)args->ThisOutput; + + do { + // Reduce : max number of recv is 3, max number of send is 1 (binary tree + local) + ncclLLPrimitives LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Up + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, 
size-offset); + if (tree->up == -1) { + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.send(thisInput+offset, nelem); + } else { + LLprims.recvReduceSend(thisInput+offset, nelem); + } + } + } while(0); + + do { + // Broadcast : max number of recv is 1, max number of send is 3 (binary tree + local) + ncclLLPrimitives LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + // Down + ssize_t offset = gridOffset + bid*chunkSize; + int nelem = min(chunkSize, size-offset); + if (tree->up == -1) { + LLprims.send(thisOutput+offset, nelem); + } else if (tree->down[0] == -1) { + LLprims.recv(thisOutput+offset, nelem); + } else { + LLprims.recvCopySend(thisOutput+offset, nelem); + } + } + } while(0); } diff --git a/projects/rccl/src/collectives/device/broadcast.cu b/projects/rccl/src/collectives/device/broadcast.cu index 4125de41f9..b83ee7091d 100644 --- a/projects/rccl/src/collectives/device/broadcast.cu +++ b/projects/rccl/src/collectives/device/broadcast.cu @@ -4,12 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "broadcast.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 -IMPL_COLL3(ncclBroadcast, copy, FuncSum, i8, int8_t, ncclCollBroadcast, ncclSum, ncclInt8); -#endif +IMPL_COLL_C(ncclBroadcast, ncclCollBroadcast); diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h index c2f6d001e1..fb183122ff 100644 --- a/projects/rccl/src/collectives/device/broadcast.h +++ b/projects/rccl/src/collectives/device/broadcast.h @@ -8,174 +8,74 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and boffset for buffer sync -#define NEXT_STEP \ - step++; \ - boffset += sliceSize; \ - if (boffset == buffSize) 
boffset = 0; - template -__device__ void ncclBroadcastKernel(struct CollectiveArgs* args) { +__device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - __shared__ T* sharedNextOutput; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - int prevdirect = ring->recv.conn.direct; - int nextdirect = ring->send.conn.direct; - - WaitFlag waitDoneFromNext(ring->send.conn.head, (BROADCAST_BUFCHUNKS-1)*BROADCAST_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); - PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, BROADCAST_BUFCHUNKS*BROADCAST_SUBSTEPS); - - typedef Primitives Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / BROADCAST_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * BROADCAST_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; const int rank = ring->devUserRanks[0]; const int nextRank = ring->devUserRanks[1]; const int root = args->root; - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - if (nextRank != root) { - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - if (rank != root && prevdirect) { - *ring->recv.conn.ptrExchange = args->ThisOutput; - } - if (nextRank != root && nextdirect) { - void* volatile* ptr = &(ring->devMemSend->ptrExchange); - while (*ptr == nullptr); - sharedNextOutput = (T*)*ptr; - *ptr = nullptr; - } - } - __syncthreads(); - - uint64_t step = 
0ULL; - int boffset = 0; - // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t offset = gridOffset + bid*realChunkSize; + int nelem = min(realChunkSize, size-offset); if (rank == root) { if (thisInput == thisOutput) { - Prims::Copy(tid, nthreads, - thisInput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.send(thisInput+offset, nelem); } else { - Prims::DoubleCopy(tid, nthreads, - thisInput + offset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.copySend(thisInput+offset, thisOutput+offset, nelem); } } else if (nextRank == root) { - if (prevdirect) maxOffset = 0; // Only wait for signals - Prims::Copy(tid, nthreads, - prevInput + boffset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.recv(thisOutput+offset, nelem); } else { - if (prevdirect) { - Prims::Copy(tid, nthreads, - thisOutput + offset, - nextdirect ? 
(sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } else { - Prims::DoubleCopy(tid, nthreads, - prevInput + boffset, - thisOutput + offset, - nextdirect ? (sharedNextOutput + offset) : (nextOutput + boffset), - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - } + prims.recvCopySend(thisOutput+offset, nelem); } - NEXT_STEP; // Increases step, boffset - } - - if (tid == 0) { - if (nextRank != root) { - // Wait for next to have consumed data before resetting the flag - waitDoneFromNext.wait(BROADCAST_SUBSTEPS*(step + BROADCAST_BUFCHUNKS - 1)); - *ring->send.conn.head = 0ULL; - } - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - boffset += NCCL_LL_SLICE_LINES; \ - if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \ - flag++; \ - step++; +template +__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { } template -__device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) { +__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; - const int rank = comm->rank; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; + const int rank = 
ring->devUserRanks[0]; const int nextRank = ring->devUserRanks[1]; const int root = args->root; - typedef LLPrimitives LL; - - const ssize_t size = args->N; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t flag = step + 1; - int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -183,46 +83,20 @@ __device__ void ncclBroadcastLLKernel(struct CollectiveArgs* args) { } ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int nelem = min(chunkSize, size-offset); if (rank == root) { - WAIT_NEXT; if (thisInput == thisOutput) { - LL::ReduceCopy( - thisInput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); + LLprims.send(thisInput+offset, nelem); } else { - LL::ReduceCopy( - thisInput + offset, - thisOutput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); + LLprims.copySend(thisInput + offset, thisOutput + offset, nelem); } - POST_SIZE; - NEXT_STEP_LL; } else if (nextRank == root) { - LL::ReduceCopy( - prevInput + boffset, - thisOutput + offset, - maxOffset, flag, llNthreads); - NEXT_STEP_LL; - ACK_PREV; + LLprims.recv(thisOutput + offset, nelem); } else { - WAIT_NEXT; - LL::ReduceCopy( - prevInput + boffset, - thisOutput + offset, - nextOutput + boffset, - maxOffset, flag, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvCopySend(thisOutput + offset, nelem); } } - - // We need everyone 
to acknowledge data even if they didn't receive anything - // so that the next collective can start right away. - ACK_PREV; - - FIFO_CLEANING_AND_SAVE_STEP(flag); } + +template +__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/projects/rccl/src/collectives/device/common.h b/projects/rccl/src/collectives/device/common.h index c9889133eb..e4aecbd3a0 100644 --- a/projects/rccl/src/collectives/device/common.h +++ b/projects/rccl/src/collectives/device/common.h @@ -11,13 +11,29 @@ #include "core.h" #include "nccl.h" +// Exit If Abort Barrier across CTA: make sure all threads exit consistently +// Each thread sets a predicate to true if abort == 1 +// all CTA's threads enter the barrier and do a popc on their predicates being True +// If any of the thread's predicate was True, all the threads call exit() +static inline __device__ void exitIfAbortBarrier(int abort) { + uint32_t popc; + asm ("{"); + asm volatile (" .reg .pred barr_pred;"); + asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); + asm volatile (" bar.red.popc.u32 %0, 13, barr_pred;" : "=r"(popc)); + asm ("}"); + if (popc) { asm volatile ("exit;"); } +} + typedef void(*ncclKern_t)(struct CollectiveArgs* args); extern __device__ ncclKern_t ncclFuncs[]; static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) { int* d = (int*)dst; int* s = (int*)src; - __syncthreads(); + // When aggregation is effective, if some threads have aborted inside the LL kernel, + // make sure the rest of the threads abort as well + exitIfAbortBarrier(0); for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o]; __syncthreads(); } @@ -27,12 +43,14 @@ static __device__ void load_coll(struct ncclColl* localColl, struct ncclColl* ho } /* Functions for aggregation case */ -#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \ +#define IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ __device__ void NCCL_COLL_NAME(coll, op, dtype)(struct CollectiveArgs* 
args) { \ - coll##Kernel, ctype>(args); \ + coll##Kernel, ctype>(args); \ } + +#if NCCL_OP == 0 /* Kernels with the first operation inlined */ -#define IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, fIndex) \ +#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) \ __launch_bounds__(MAXTHREADS+WARP_SIZE, 1) \ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ int tid = threadIdx.x; \ @@ -40,25 +58,25 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ __shared__ struct ncclColl localColl; \ \ struct ncclComm* comm = firstColl.args.comm; \ - struct ncclRing* ring = comm->rings+bid; \ + struct ncclChannel* channel = comm->channels+bid; \ struct ncclColl* c; \ if (bid == 0) { \ /* To optimize for latency, (only) the first operation is passed as argument.*/ \ c = &firstColl; \ } else { \ c = &localColl; \ - load_coll(c, ring->devCollectives+ring->collFifoHead, tid); \ + load_coll(c, channel->devCollectives+channel->collFifoHead, tid); \ } \ while (1) { \ - if (tid < c->nThreads) { \ + if (tid < c->args.nThreads) { \ if (c->funcIndex == fIndex) { \ - coll##Kernel, ctype>(&c->args); \ + coll##Kernel, ctype>(&c->args); \ } else { \ ncclFuncs[c->funcIndex](&c->args); \ } \ } \ int nextIndex = c->nextIndex; \ - if (tid == 0) ring->collFifoHead = nextIndex; \ + if (tid == 0) channel->collFifoHead = nextIndex; \ \ if (c->active == 2) { \ return; \ @@ -66,25 +84,75 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ \ /* Load next collective operation*/ \ c = &localColl; /* for bid 0 */ \ - load_coll(c, ring->devCollectives+nextIndex, tid); \ + load_coll(c, channel->devCollectives+nextIndex, tid); \ } \ } +#else +#define IMPL_COLL_KERN(coll, op, ncclFunc, dtype, ctype, fIndex) +#endif + +// Only generate inline kernels for LL +#define IMPL_COLL4(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, al) \ + IMPL_COLL_FUNC(coll, op, ncclFunc, dtype, ctype) \ + 
IMPL_COLL_FUNC(coll##LL, op, ncclFunc, dtype, ctype) \ + IMPL_COLL_KERN(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1, al)) \ #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \ - IMPL_COLL4(coll##LL, op, ncclFunc, dtype, ctype) \ - IMPL_COLL4K(coll##LL, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 1)) \ - IMPL_COLL4(coll, op, ncclFunc, dtype, ctype) \ - IMPL_COLL4K(coll, op, ncclFunc, dtype, ctype, FUNC_INDEX(ncclColl, ncclOp, ncclType, 0)) \ + IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 0) \ + IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, 1) +#if NCCL_TYPE == 0 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) +#elif NCCL_TYPE == 1 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) +#elif NCCL_TYPE == 2 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) +#elif NCCL_TYPE == 3 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) +#elif NCCL_TYPE == 4 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) +#elif NCCL_TYPE == 5 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) +#elif NCCL_TYPE == 6 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) +#elif NCCL_TYPE == 7 +#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ + IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) +#elif NCCL_TYPE == 8 #define 
IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \ - IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \ - IMPL_COLL3(coll, op, ncclFunc, u8, uint8_t, ncclColl, ncclOp, ncclUint8) \ - IMPL_COLL3(coll, op, ncclFunc, i32, int32_t, ncclColl, ncclOp, ncclInt32) \ - IMPL_COLL3(coll, op, ncclFunc, u32, uint32_t, ncclColl, ncclOp, ncclUint32) \ - IMPL_COLL3(coll, op, ncclFunc, i64, int64_t, ncclColl, ncclOp, ncclInt64) \ - IMPL_COLL3(coll, op, ncclFunc, u64, uint64_t, ncclColl, ncclOp, ncclUint64) \ - IMPL_COLL3(coll, op, ncclFunc, f16, half, ncclColl, ncclOp, ncclFloat16) \ - IMPL_COLL3(coll, op, ncclFunc, f32, float, ncclColl, ncclOp, ncclFloat32) \ IMPL_COLL3(coll, op, ncclFunc, f64, double, ncclColl, ncclOp, ncclFloat64) +#endif + +// Reduction define all functions +#if NCCL_OP == 0 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, sum, FuncSum, colln, ncclSum); +#elif NCCL_OP == 1 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, prod, FuncProd, colln, ncclProd); +#elif NCCL_OP == 2 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, min, FuncMin, colln, ncclMin); +#elif NCCL_OP == 3 +#define IMPL_COLL_R(collf, colln) \ + IMPL_COLL2(collf, max, FuncMax, colln, ncclMax); +#endif + +// Copy primitives only define one +#if NCCL_OP == 0 && NCCL_TYPE == 0 +#define IMPL_COLL_C(collf, colln) \ + IMPL_COLL3(collf, copy, FuncSum, i8, int8_t, colln, ncclSum, ncclInt8); +#else +#define IMPL_COLL_C(collf, colln) +#endif + +#define COLL_UNROLL 4 #endif diff --git a/projects/rccl/src/collectives/device/common_kernel.h b/projects/rccl/src/collectives/device/common_kernel.h index 0eaa0610d0..e1fb096c29 100644 --- a/projects/rccl/src/collectives/device/common_kernel.h +++ b/projects/rccl/src/collectives/device/common_kernel.h @@ -192,14 +192,6 @@ struct MULTI { } }; -#define ALIGNUP(x, a) ((((x)-1) & ~((a)-1)) + (a)) - -template -__device__ inline volatile T* AlignUp(volatile T * ptr, size_t align) { - size_t ptrval = reinterpret_cast(ptr); - 
return reinterpret_cast(ALIGNUP(ptrval, align)); -} - template inline __device__ T vFetch(const volatile T* ptr) { return *ptr; @@ -236,25 +228,6 @@ void vStore(volatile half* ptr, const half val) { } #endif -template -__device__ inline void ReduceCopy( - const int tid, const int nthreads, - const volatile T * __restrict__ const src0, - const volatile T * __restrict__ const src1, - volatile T * __restrict__ const dest0, - volatile T * __restrict__ const dest1, const int N) { - for (int idx = tid; idx < N; idx += nthreads) { - T val = vFetch(src0+idx); - if (TWO_INPUTS) { - val = FUNC()(val, vFetch(src1+idx)); - } - vStore(dest0+idx, val); - if (TWO_OUTPUTS) { - vStore(dest1+idx, val); - } - } -} - typedef ulong2 Pack128; template @@ -265,72 +238,111 @@ struct MULTI128 { } }; -inline __device__ void Fetch128(Pack128& v, Pack128* p) { +inline __device__ void Fetch128(Pack128& v, const Pack128* p) { asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory"); } inline __device__ void Store128(Pack128* p, Pack128& v) { asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" :: "l"(p), "l"(v.x), "l"(v.y) : "memory"); } -#define WARP_SIZE 32 -template -__device__ inline void ReduceCopy128b( const int w, const int nw, const int t, - Pack128 * src0, Pack128 * src1, Pack128 * dest0, Pack128 * dest1, - const int N) { - Pack128 t0[UNROLL]; - Pack128 t1[UNROLL]; - const Pack128* src0_end = src0 + N; - const int inc = nw * UNROLL * WARP_SIZE; - const int offset = w * UNROLL * WARP_SIZE + t; - src0 += offset; if (TWO_INPUTS) src1 += offset; - dest0 += offset; if (TWO_OUTPUTS) dest1 += offset; +template +__device__ __forceinline__ void ReduceCopyMulti(const int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], + const int offset, const int N) { + for (int idx = offset+tid; idx < offset+N; idx += nthreads) { + T val = vFetch(srcs[0]+idx); + #pragma unroll + for (int i=1; i()(t0[u], t1[u]); - 
Store128(dest0+u*WARP_SIZE, t0[u]); - if (TWO_OUTPUTS) Store128(dest1+u*WARP_SIZE, t0[u]); - } - src0 += inc; if (TWO_INPUTS) src1 += inc; - dest0 += inc; if (TWO_OUTPUTS) dest1 += inc; + #pragma unroll + for (int i=0; i -__device__ inline void ReduceOrCopy(const int tid, const int nthreads, - volatile T * __restrict__ dest0, volatile T * __restrict__ dest1, - const volatile T * __restrict__ src0, const volatile T * __restrict__ src1, +#define WARP_SIZE 32 + +template +__device__ __forceinline__ void ReduceCopy128bMulti( const int w, const int nw, const int t, + int nsrcs, const T* s[MAXSRCS], int ndsts, T* d[MAXDSTS], + const int elemOffset, const int Npack) { + const int inc = nw * UNROLL * WARP_SIZE; + int offset = w * UNROLL * WARP_SIZE + t; + + const Pack128* srcs[MAXSRCS]; + for (int i=0; i()(vals[u], vals2[u]); + } + #pragma unroll 1 + for (int i=MINSRCS; i()(vals[u], vals2[u]); + } + + // Store + for (int i = 0; i < MINDSTS; i++) { + for (int u = 0; u < UNROLL; ++u) Store128(dsts[i]+u*WARP_SIZE, vals[u]); + } + #pragma unroll 1 + for (int i=MINDSTS; i +__device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); } + +// Try to limit consecutive load/stores to 8. 
+// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise +#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS))) + +template +__device__ __forceinline__ void ReduceOrCopyMulti(const int tid, const int nthreads, + int nsrcs, const T* srcs[MAXSRCS], int ndsts, T* dsts[MAXDSTS], int N) { int Nrem = N; if (Nrem <= 0) return; - int Npreamble = (Nrem(tid, nthreads, src0, src1, dest0, dest1, Npreamble); - - Nrem -= Npreamble; - if (Nrem == 0) return; - - dest0 += Npreamble; if (HAS_DEST1) { dest1 += Npreamble; } - src0 += Npreamble; if (HAS_SRC1) { src1 += Npreamble; } + if (Npreamble) { + ReduceCopyMulti(tid, nthreads, nsrcs, srcs, ndsts, dsts, 0, Npreamble); + Nrem -= Npreamble; + if (Nrem == 0) return; + } + int offset = Npreamble; // stage 2: fast path: use 128b loads/stores to do the bulk of the work, // assuming the pointers we have are all 128-bit alignable. @@ -338,35 +350,33 @@ __device__ inline void ReduceOrCopy(const int tid, const int nthreads, int nw = nthreads / WARP_SIZE; // Number of warps int t = tid % WARP_SIZE; // Thread (inside the warp) - const int PackFactor = sizeof(Pack128) / sizeof(T); + const int packFactor = sizeof(Pack128) / sizeof(T); // stage 2a: main loop - int Nalign2a = (Nrem / (PackFactor * UNROLL * nthreads)) - * (UNROLL * nthreads); // round down + int Npack2a = (Nrem / (packFactor * AUTOUNROLL * WARP_SIZE)) + * (AUTOUNROLL * WARP_SIZE); // round down + int Nelem2a = Npack2a * packFactor; - ReduceCopy128b(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2a); + ReduceCopy128bMulti(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2a); - int Ndone2a = Nalign2a * PackFactor; - Nrem -= Ndone2a; + Nrem -= Nelem2a; if (Nrem == 0) return; - dest0 += Ndone2a; if (HAS_DEST1) { dest1 += Ndone2a; } - src0 += Ndone2a; if (HAS_SRC1) { src1 += Ndone2a; } + offset += Nelem2a; // stage 2b: slightly less optimized for section when we don't have full - // UNROLLs + // unrolling - int Nalign2b = Nrem / 
PackFactor; + int Npack2b = Nrem / packFactor; + int Nelem2b = Npack2b * packFactor; - ReduceCopy128b(w, nw, t, (Pack128*)src0, (Pack128*)src1, (Pack128*)dest0, (Pack128*)dest1, Nalign2b); + ReduceCopy128bMulti(w, nw, t, nsrcs, srcs, ndsts, dsts, offset, Npack2b); - int Ndone2b = Nalign2b * PackFactor; - Nrem -= Ndone2b; + Nrem -= Nelem2b; if (Nrem == 0) return; - dest0 += Ndone2b; if (HAS_DEST1) { dest1 += Ndone2b; } - src0 += Ndone2b; if (HAS_SRC1) { src1 += Ndone2b; } + offset += Nelem2b; // stage 2c: tail - ReduceCopy(tid, nthreads, src0, src1, dest0, dest1, Nrem); + ReduceCopyMulti(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem); } #endif // COMMON_KERNEL_H_ diff --git a/projects/rccl/src/collectives/device/functions.cu b/projects/rccl/src/collectives/device/functions.cu index 1fb8108166..ea06b6894b 100644 --- a/projects/rccl/src/collectives/device/functions.cu +++ b/projects/rccl/src/collectives/device/functions.cu @@ -8,9 +8,13 @@ #include "collectives.h" #include "common.h" -#define NCCL_FUNC4(coll, op, dtype) \ +#define NCCL_FUNC5(coll, op, dtype) \ NCCL_COLL_NAME(coll, op, dtype), \ - NCCL_COLL_NAME(coll##LL, op, dtype) \ + NCCL_COLL_NAME(coll##LL, op, dtype) + +#define NCCL_FUNC4(coll, op, dtype) \ + NCCL_FUNC5(coll##Ring, op, dtype), \ + NCCL_FUNC5(coll##Tree, op, dtype) // Must be consistent with ncclDataType_t #define NCCL_FUNCS3A(coll, op) \ @@ -55,7 +59,7 @@ NCCL_FUNCS2A(ncclAllReduce) } // Must be consistent with the ncclFuncSet enum -__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2] = { +__device__ ncclKern_t ncclFuncs[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { // Don't try to initialize the host shadow copy of this device-side global // variable. There is no host pointer to a device-side function, which // confuses clang. This will be fixed in the next clang release. 
diff --git a/projects/rccl/src/collectives/device/gen_rules.sh b/projects/rccl/src/collectives/device/gen_rules.sh new file mode 100755 index 0000000000..3942c8c2b0 --- /dev/null +++ b/projects/rccl/src/collectives/device/gen_rules.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# See LICENSE.txt for license information +# + +dir=$1 + +targets="GENOBJS := \\\\\n" + +for base in all_reduce all_gather broadcast reduce reduce_scatter; do + opn=0 + for op in sum prod min max; do + dtn=0 + for dt in i8 u8 i32 u32 i64 u64 f16 f32 f64; do + echo "${dir}/${base}_${op}_${dt}.o : ${base}.cu ${dir}/${base}.dep" + echo " @printf \"Compiling %-35s > %s\\\\n\" ${base}.cu ${dir}/${base}_${op}_${dt}.o" + echo " mkdir -p ${dir}" + echo " \${NVCC} -DNCCL_OP=${opn} -DNCCL_TYPE=${dtn} \${NVCUFLAGS} -dc ${base}.cu -o ${dir}/${base}_${op}_${dt}.o" + echo "" + targets="$targets\t${dir}/${base}_${op}_${dt}.o \\\\\n" + dtn=$(($dtn + 1)) + done + opn=$(($opn + 1)) + done +done +echo -e "$targets" diff --git a/projects/rccl/src/collectives/device/ll_kernel.h b/projects/rccl/src/collectives/device/ll_kernel.h deleted file mode 100644 index 5ec3c9a871..0000000000 --- a/projects/rccl/src/collectives/device/ll_kernel.h +++ /dev/null @@ -1,154 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_LL_KERNEL_H_ -#define NCCL_LL_KERNEL_H_ - -static __device__ uint64_t readLL(union ncclLLFifoLine* src, uint32_t flag) { - uint32_t data1, flag1, data2, flag2; - do { - asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); - } while ((flag1 != flag) || (flag2 != flag)); - uint64_t val64 = data1 + (((uint64_t)data2) << 32); - return val64; -} - -static __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { - asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); -} - -// Using memcpy handles misaligned pointers. -static __device__ uint64_t readAL(uint64_t* src) { - uint64_t val; - memcpy((char*)&val, (char*)src, sizeof(uint64_t)); - return val; -} -static __device__ void storeAL(uint64_t* dst, uint64_t val) { - memcpy((char*)dst, (char*)&val, sizeof(uint64_t)); -} - -template -class LLPrimitives { - private: - template - static __device__ void ReduceCopyGeneric(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - if (size <= 0) return; - size_t size64 = size * sizeof(T) / sizeof(uint64_t); - uint64_t* src1A = (uint64_t*)src1; - uint64_t* dst1A = (uint64_t*)dst1; - int offset = threadIdx.x; - // Do multiples of 64 bits -#pragma unroll 1 - for (; offset < size64; offset += nthreads) { - uint64_t val; - if (HAS_SRC1) { - val = readAL(src1A+offset); - if (HAS_SRC2) val = MULTI()(readLL(src2+offset, iflag), val); - } else if (HAS_SRC2) { - val = readLL(src2+offset, iflag); - } - if (HAS_DST1) storeAL(dst1A+offset, val); - if (HAS_DST2) storeLL(dst2+offset, val, oflag); - } - // Finish last word - int sizeDone = size64*(sizeof(uint64_t)/sizeof(T)); - 
int sizeRem = size - sizeDone; - if (threadIdx.x == 0 && sizeRem) { - const T* src1B = src1 + sizeDone; - T* dst1B = dst1 + sizeDone; - - uint64_t lastVal; - T* vals = (T*)&lastVal; - - if (HAS_SRC2) { - uint64_t lastVal2 = readLL(src2+size64, iflag); - T* src2B = (T*)&lastVal2; - for (int offset = 0; offset < sizeRem; offset++) { - vals[offset] = HAS_SRC1 ? FUNC()(src2B[offset], src1B[offset]) : src2B[offset]; - } - } else if (HAS_SRC1) { - for (int offset = 0; offset < sizeRem; offset++) { - vals[offset] = src1B[offset]; - } - } - if (HAS_DST2) storeLL(dst2+size64, lastVal, oflag); - if (HAS_DST1) { - for (int offset = 0; offset < sizeRem; offset++) { - dst1B[offset] = vals[offset]; - } - } - } - } - public: - static __device__ void ReduceCopy(const T* src, union ncclLLFifoLine* dst, int size, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 0, 0, 1>(src, NULL, NULL, dst, size, 0, oflag, nthreads); - } - - static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst, int size, uint32_t iflag, int nthreads) { - return ReduceCopyGeneric<0, 1, 1, 0>(NULL, src, dst, NULL, size, iflag, 0, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, union ncclLLFifoLine* dst, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 0, 1>(src1, src2, NULL, dst, size, iflag, oflag, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst, int size, uint32_t iflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 1, 0>(src1, src2, dst, NULL, size, iflag, 0, nthreads); - } - - static __device__ void ReduceCopy(const T* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 0, 1, 1>(src, NULL, dst1, dst2, size, 0, oflag, nthreads); - } - - static __device__ void ReduceCopy(union ncclLLFifoLine* src, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int 
nthreads) { - return ReduceCopyGeneric<0, 1, 1, 1>(NULL, src, dst1, dst2, size, iflag, oflag, nthreads); - } - - static __device__ void ReduceCopy(const T* src1, union ncclLLFifoLine* src2, T* dst1, union ncclLLFifoLine* dst2, int size, uint32_t iflag, uint32_t oflag, int nthreads) { - return ReduceCopyGeneric<1, 1, 1, 1>(src1, src2, dst1, dst2, size, iflag, oflag, nthreads); - } -}; - -// Common macros - -#define STEP_TO_SLOT(step) \ - (step % NCCL_LL_CHUNKS) - -#define WAIT_NEXT \ - if (tid == 0) { \ - while (sendHead + NCCL_LL_CHUNKS <= step) { \ - sendHead = sendHeadPtr[0]; \ - } \ - } \ - asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); - -#define POST_SIZE \ - if (tid == 0 && sizesFifo) sizesFifo[step % NCCL_LL_CHUNKS] = (maxOffset <= 0) ? -1 : (maxOffset*2*(int)sizeof(T)); - -#define ACK_PREV \ - asm volatile ("bar.sync 1, %0;" :: "r"(llNthreads)); \ - if (tid == 0) recvHeadPtr[0] = step; - -#define FIFO_CLEANING_AND_SAVE_STEP(flag) do { \ - if (step > ring->send.conn.llLastCleaning + NCCL_LL_CLEAN_FREQ) { \ - /* Reset all flags */ \ - static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); \ - static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); \ - const union ncclLLFifoLine resetLine = { 0, flag, 0, flag }; \ - for (int i=0; isend.conn.llLastCleaning = step; \ - } \ - ring->send.conn.llStep = step; \ -} while (0); - -#endif diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h index e2baa4b301..c5aaf549b4 100644 --- a/projects/rccl/src/collectives/device/primitives.h +++ b/projects/rccl/src/collectives/device/primitives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. 
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,218 +9,579 @@ #include #include "reduce_kernel.h" // for reduction funcs +#include "common.h" +#define SPINS_BEFORE_CHECK_ABORT 1000000 -/* Defines primitive operations: Copy, Reduce, DoubleCopy, and ReduceCopy. - * - * In order to reduce the reptetion of template arguments, the operations - * are bundled as static methods of the Primitives class. - * - * Each primitive operation copies/reduces a contiguous buffer and syncs - * an optional set of flags against a sub-step counter. The sync value is - * based on the step parameter. Sync flags must be of type WaitFlag or - * PostFlag. The primitive routines wait for all WaitFlag args to attain - * at least a value of SUBSTEPS*(step-1)+substep+1 (i.e. completion of - * corresponding substep by previous step) before executing the transfer. - * After each substep is transfered, all PostFlag arguments get updated to - * the value SUBSTEPS*step+substep+1. - */ - - -class WaitFlag { - volatile uint64_t * const flag; - const int shift; - public: - __device__ __forceinline__ - WaitFlag(volatile uint64_t * const flag, const int shift) : flag(flag), shift(shift) { } - __device__ __forceinline__ - void wait(uint64_t val) { while ((*flag + shift) < val) /*SPIN*/; } -}; - - -class PostFlag { - volatile uint64_t * const flag; - const int shift; - volatile int * const fifo; - const int fifo_size; - public: - __device__ __forceinline__ - PostFlag(volatile uint64_t* const flag, const int shift, volatile int* const fifo, const int fifo_size) : flag(flag), shift(shift), fifo(fifo), fifo_size(fifo_size) { } - __device__ __forceinline__ - void post(uint64_t val) { *flag = (val - shift); } - __device__ __forceinline__ - void postSize(uint64_t step, int size) { if (fifo != NULL) fifo[step%fifo_size] = size; }; -}; - - -// Helper to check if any argument is of type T. -// e.g. 
AnyAre(Flag1, Flag2, ...) -template __device__ __forceinline__ -bool AnyAre() { return false; } - -template -__device__ __forceinline__ -bool AnyAre(FIRST_T first, TAIL_Ts... tail) { - return std::is_same::value || AnyAre(tail...); -} - - -// Wait on all WaitFlags, ignore PostFlags -__device__ __forceinline__ -void WaitOnFlags(uint64_t val) { } - -template __device__ __forceinline__ -void WaitOnFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) { - flag.wait(val); - WaitOnFlags(val, tail...); -} - -template __device__ __forceinline__ -void WaitOnFlags(uint64_t val, PostFlag, TAIL_Ts... tail) { - WaitOnFlags(val, tail...); -} - - -// Post all PostFlags, ignore WaitFlags -__device__ __forceinline__ -void PostToFlags(uint64_t val) { } - -template __device__ __forceinline__ -void PostToFlags(uint64_t val, WaitFlag flag, TAIL_Ts... tail) { - PostToFlags(val, tail...); -} - -template __device__ __forceinline__ -void PostToFlags(uint64_t val, PostFlag flag, TAIL_Ts... tail) { - flag.post(val); - PostToFlags(val, tail...); -} - - -// Post sizes for PostFlags, ignore WaitFlags -__device__ __forceinline__ -void PostSizeToFlags(uint64_t step, int size) { } - -template __device__ __forceinline__ -void PostSizeToFlags(uint64_t step, int size, WaitFlag flag, TAIL_Ts... tail) { - PostSizeToFlags(step, size, tail...); -} - -template __device__ __forceinline__ -void PostSizeToFlags(uint64_t step, int size, PostFlag flag, TAIL_Ts... tail) { - flag.postSize(step, size); - PostSizeToFlags(step, size, tail...); -} - - -// Create pointer arithmetic syntax that doesn't break for std::nullptr_t -template __device__ __forceinline__ -Tptr ptradd(Tptr ptr, int i) { - return ptr + i; -} - -__device__ __forceinline__ -std::nullptr_t ptradd(std::nullptr_t ptr, int i) { - return nullptr; -} +// Unroll unconditionally the first send/recv since nsend/nrecv should be at +// least 1 if SEND/RECV is set. +#define FOR_SEND(func, ...) 
do { \ + if (SEND) { \ + /* Send to far first, then close */ \ + for (int i=1; i > -class Primitives { +template +class ncclPrimitives { private: - template // either WaitFunc or PostFunc - static __device__ __forceinline__ void - GenericOp(const int tid, const int nthreads, - const T* src1, - const SRC2_T src2, - T* dst1, - DST2_T dst2, - int len, int maxoffset, uint64_t step, SYNC_Ts... flags) { + const int tid; + const int nthreads; + int nrecv = 0; + int nsend = 0; + const int stepSize; + struct ncclConnInfo* recvConn[NRECV]; + struct ncclConnInfo* sendConn[NSEND]; + volatile uint64_t* waitPtr; + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t sendConnHead[NSEND]; + const T* recvDirectBuff[NRECV]; + T* sendDirectBuff[NSEND]; + const T* recvBuff[NRECV]; + T* sendBuff[NSEND]; + struct ncclComm* comm; - enum { noSrc2 = std::is_same::value }; - enum { noDst2 = std::is_same::value }; - static_assert(noSrc2 || std::is_same::value, - "src2 must be of type T* or std::nullptr_t"); - static_assert(noDst2 || std::is_same::value, - "dst2 must be of type T* or std::nullptr_t"); + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } + inline __device__ const T* recvPtr(int i) { return ((const T*)recvBuff[i])+recvOffset(i); } + inline __device__ T* sendPtr(int i) { return ((T*)sendBuff[i])+sendOffset(i); } - using OpType = typename std::conditional, REDOP>::type; + inline __device__ void barrier() { + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + } - int sliceSize = len / SUBSTEPS; - int sliceOffset = 0; + uint32_t mismatch = 0; + const uint64_t opCount; -#pragma unroll 1 - for (int sub=0; sub(flags...)) { - if (tid == 0) { - WaitOnFlags(SUBSTEPS*step + sub + 1, flags...); - } - asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); - } - ReduceOrCopy - < - UNROLL, - OpType, - T, - !std::is_same::value, // HAS_DEST1 - 
!std::is_same::value // HAS_SRC1 - > - ( - tid, nthreads, - ptradd(dst1, sliceOffset), - ptradd(dst2, sliceOffset), - ptradd(src1, sliceOffset), - ptradd(src2, sliceOffset), - realSize - ); - if (AnyAre(flags...)) { - __syncthreads(); - } - } else { - if (AnyAre(flags...)) { - __syncthreads(); - PostSizeToFlags(SUBSTEPS*step+sub, realSize*sizeof(T), flags...); - __threadfence_system(); - PostToFlags(SUBSTEPS*step + sub + 1, flags...); - } + inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + if (mismatch) { + // In non-LL, we use _threadfence_system before incrementing opCount, yet we are still waiting for credits here, so there must be a size mismatch + *(comm->fatalDevError) = ncclDevAssertedMismatch; + } else if (remoteOpCount && *remoteOpCount > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + spins++; + if (spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + checkMismatch(remoteOpCount); + spins = 0; + } + return abort; + } + + inline __device__ void waitRecv(int i) { + spins = 0; + mismatch = 0; + recvStep[i] += SLICESTEPS; + if (tid == i) { + while (*(waitPtr) < recvStep[i]) { + if (checkAbort(recvConn[i]->opCountRem)) break; } - sliceOffset += sliceSize; + } + } + + inline __device__ void waitSend(int i) { + spins = 0; + mismatch = 0; + sendStep[i] += SLICESTEPS; + if (tid == WARP_SIZE+i) { + while (sendConnHead[i] + NCCL_STEPS < sendStep[i]) { + sendConnHead[i] = *waitPtr; + if (checkAbort(sendConn[i]->opCountRem)) break; + } + } + } + + inline __device__ void postRecv(int i) { + *(recvConn[i]->head) = recvStep[i] += SLICESTEPS; + } + + inline __device__ void postSend(int i) { + *(sendConn[i]->tail) = sendStep[i] += SLICESTEPS; + } + + inline __device__ void postSendSize(int i, int size) { + if (sendConn[i]->fifo) sendConn[i]->fifo[sendStep[i]%NCCL_STEPS] = size; + } + + template + inline __device__ const T* 
directRecvPtr(int i, int directOffset) { + return DIRECTRECV && recvDirectBuff[i] ? recvDirectBuff[i]+directOffset : recvPtr(i); + } + + template + inline __device__ T* directSendPtr(int i, int directOffset) { + return DIRECTSEND && sendDirectBuff[i] ? sendDirectBuff[i]+directOffset : sendPtr(i); + } + + template + inline __device__ void + GenericOp(const T* srcPtr, T* dstPtr, int nelem, int directOffset) { + int offset = 0; + int sliceSize = stepSize * SLICESTEPS; + + const T* srcs[RECV*NRECV+SRC]; + srcs[0] = SRC ? srcPtr : directRecvPtr(0, directOffset); + if (RECV) { + if (SRC) srcs[1] = recvPtr(0); + for (int i=1; i(0, directOffset); + if (SEND) { + if (DST) dsts[1] = directSendPtr(0, directOffset); + for (int i=1; i(i, directOffset); + } + + #pragma unroll 1 + for (int slice=0; slice 0) { + barrier(); + if (DIRECTRECV && recvDirectBuff[0]) { + // We can only have one direct receive. Since srcs[0] == dstPtr+offset, skip one copy + if (SEND) { + ReduceOrCopyMulti(tid, nthreads, 1, srcs, nsend, dsts+1, realSize); + } + } else { + ReduceOrCopyMulti(tid, nthreads, RECV*nrecv+SRC, srcs, SEND*nsend+DST, dsts, realSize); + } + } + exitIfAbortBarrier(abort); + } else { + exitIfAbortBarrier(abort); + FOR_SEND(postSendSize, realSize*sizeof(T)); + if (SEND) __threadfence_system(); + FOR_SEND(postSend); + FOR_RECV(postRecv); + } + for (int i=0; ibuff; + recvStep[i] = recvConn[i]->step; + recvStep[i] = ROUNDUP(recvStep[i], SLICESPERCHUNK*SLICESTEPS); + // Return credits in case we rounded up. 
+ if (tid == nthreads) *recvConn[i]->head = recvStep[i]; + if (tid == i) { + waitPtr = recvConn[i]->tail; + *(recvConn[i]->opCountLoc) = opCount; + } + recvDirectBuff[i] = NULL; + if (directBuff && recvConn[i]->direct) { + recvDirectBuff[i] = directBuff; + if (tid == 0) *recvConn[i]->ptrExchange = directBuff; + } + nrecv++; + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i, T* directBuff) { + sendConn[i] = conn; + sendBuff[i] = (T*)sendConn[i]->buff; + sendStep[i] = sendConn[i]->step; + sendStep[i] = ROUNDUP(sendStep[i], SLICESPERCHUNK*SLICESTEPS); + if (tid == WARP_SIZE+i) { + waitPtr = sendConn[i]->head; + sendConnHead[i] = *waitPtr; + *(sendConn[i]->opCountLoc) = opCount; + } + sendDirectBuff[i] = NULL; + if (directBuff && sendConn[i]->direct) { + void* volatile* ptr = sendConn[i]->ptrExchange; + while ((sendDirectBuff[i] = (T*)(*ptr)) == NULL); + __syncthreads(); + if (tid == 0) *ptr = NULL; + } + nsend++; + } + + __device__ __forceinline__ void saveRecvConn(int i) { + if (tid == i) { + recvConn[i]->step = recvStep[i]; + __threadfence_system(); + *(recvConn[i]->opCountLoc) += 1; + } + } + + __device__ __forceinline__ void saveSendConn(int i) { + if (tid == WARP_SIZE+i) { + sendConn[i]->step = sendStep[i]; + __threadfence_system(); + *(sendConn[i]->opCountLoc) += 1; } } public: - template - static __device__ __forceinline__ void - Copy(const int tid, const int nthreads, const T* src, T* dst, - int len, int maxOffset, uint64_t step, SYNC_Ts... 
flags) { - GenericOp(tid, nthreads, src, nullptr, dst, nullptr, len, maxOffset, step, flags...); + __device__ __forceinline__ + ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) { + // Make sure step is updated before we read it + __syncthreads(); + + for (int i=0; i= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i, directBuff); + for (int i=0; i= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i, directBuff); } - template - static __device__ __forceinline__ void - DoubleCopy(const int tid, const int nthreads, const T* src, T* dst1, T* dst2, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src, nullptr, dst1, dst2, len, maxOffset, step, flags...); + __device__ __forceinline__ void + send(const T* src, int nelem) { + GenericOp<0, 0, 0, 1, 1, 0>(src, NULL, nelem, 0); + } + __device__ __forceinline__ void + directSend(const T* src, int directOffset, int nelem) { + GenericOp<0, 1, 0, 1, 1, 0>(src, NULL, nelem, directOffset); } - template - static __device__ __forceinline__ void - Reduce(const int tid, const int nthreads, const T* src1, const T* src2, T* dst, - int len, int maxOffset, uint64_t step, SYNC_Ts... flags) { - GenericOp(tid, nthreads, src1, src2, dst, nullptr, len, maxOffset, step, flags...); + __device__ __forceinline__ void + recv(T* dst, int nelem) { + GenericOp<0, 0, 1, 0, 0, 1>(NULL, dst, nelem, 0); + } + __device__ __forceinline__ void + directRecv(T* dst, int directOffset, int nelem) { + GenericOp<1, 0, 1, 0, 0, 1>(NULL, dst, nelem, directOffset); } - template - static __device__ __forceinline__ void - ReduceCopy(const int tid, const int nthreads, const T* src1, const T* src2, T* dst1, T* dst2, - int len, int maxOffset, uint64_t step, SYNC_Ts... 
flags) { - GenericOp(tid, nthreads, src1, src2, dst1, dst2, len, maxOffset, step, flags...); + __device__ __forceinline__ void + copySend(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 0, 1, 1, 1>(src, dst, nelem, 0); + } + __device__ __forceinline__ void + directCopySend(const T* src, T* dst, int directOffset, int nelem) { + GenericOp<0, 1, 0, 1, 1, 1>(src, dst, nelem, directOffset); + } + + __device__ __forceinline__ void + recvCopySend(T* dst, int nelem) { + GenericOp<0, 0, 1, 1, 0, 1>(NULL, dst, nelem, 0); + } + __device__ __forceinline__ void + directRecvCopySend(T* dst, int directOffset, int nelem) { + GenericOp<1, 1, 1, 1, 0, 1>(NULL, dst, nelem, directOffset); + } + + __device__ __forceinline__ void + recvReduceCopy(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 1, 0, 1, 1>(src, dst, nelem, 0); + } + + __device__ __forceinline__ void + recvReduceSend(const T* src, int nelem) { + GenericOp<0, 0, 1, 1, 1, 0>(src, NULL, nelem, 0); + } + + __device__ __forceinline__ void + recvReduceCopySend(const T* src, T* dst, int nelem) { + GenericOp<0, 0, 1, 1, 1, 1>(src, dst, nelem, 0); + } + __device__ __forceinline__ void + directRecvReduceCopySend(const T* src, T* dst, int directOffset, int nelem) { + // Direct is only for the send part + GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset); + } + + __device__ __forceinline__ ~ncclPrimitives() { + // Save steps for next collective. Have thread 0 do it to be compatible + // with the way LL works. 
+ for (int i=0; i +class ncclLLPrimitives { + private: + const int tid; + const int nthreads; + int nrecv = 0; + int nsend = 0; + struct ncclConnInfo* recvConn[NRECV]; + struct ncclConnInfo* sendConn[NSEND]; + volatile uint64_t* waitPtr; + volatile uint64_t* postPtr; + volatile int* fifoPtr; + uint64_t recvStep[NRECV]; + uint64_t sendStep[NSEND]; + uint64_t sendConnHead; + union ncclLLFifoLine* recvBuff[NRECV]; + union ncclLLFifoLine* sendBuff[NSEND]; + struct ncclComm* comm; + + inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } + inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } + inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } + inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; } + inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; } + + // Exit If Abort Barrier : make sure all threads exit consistently + // Each thread sets a predicate to true if val == 1 + // all CTA's threads enter the barrier and do a popc on their predicates being True + // If any of the thread's predicate was True, all the threads call exit() + inline __device__ void exitIfAbortLocalBarrier() { + uint32_t popc; + asm ("{"); + asm volatile (" .reg .pred barr_pred;"); + asm volatile (" setp.eq.u32 barr_pred,%0,1;" :: "r"(abort)); + asm volatile (" bar.red.popc.u32 %0, 14, %1, barr_pred;" : "=r"(popc) : "r"(nthreads)); + asm ("}"); + if (popc) { + // Make sure threads not participating in the operation get the abort and all threads exit + exitIfAbortBarrier(1); + } + } + + inline __device__ void barrier() { + asm volatile ("bar.sync 1, %0;" :: "r"(nthreads)); + } + + uint32_t mismatch = 0; + const uint64_t opCount; + + inline __device__ void checkMismatch(volatile uint64_t* remoteOpCount) { + if (mismatch > 20) { + // We have seen 
that the peer advanced opcount so many times yet we are still waiting for credit of current op, so it is _most likely_ a mismatch + // Note that we are not using _threadfence_system in LL so the error cannot be asserted + *(comm->fatalDevError) = ncclDevSuspectedMismatch; + } else if (remoteOpCount && *remoteOpCount > opCount) { + mismatch += 1; + } + } + + uint32_t spins = 0; + uint32_t abort = 0; + + inline __device__ int checkAbort(volatile uint64_t* remoteOpCount) { + spins++; + if (spins == SPINS_BEFORE_CHECK_ABORT) { + abort = *(comm->abortFlag); + checkMismatch(remoteOpCount); + spins = 0; + } + return abort; + } + + inline __device__ void waitSend(int i, int nbytes) { + spins = 0; + mismatch = 0; + if (tid == WARP_SIZE+i) { + while (sendConnHead + NCCL_STEPS < sendStep[i] + 1) { + sendConnHead = *waitPtr; + if (checkAbort(sendConn[i]->opCountRem)) break; + } + if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes; + } + } + + inline __device__ void postRecv(int i) { + recvStep[i]++; + if (tid == i) *postPtr = recvStep[i]; + } + + inline __device__ void postSend(int i) { + sendStep[i]++; + } + + __device__ uint64_t readLL(int i, int offset) { + union ncclLLFifoLine* src = recvPtr(i) + offset; + uint32_t flag = recvFlag(i); + uint32_t data1, flag1, data2, flag2; + spins = 0; + mismatch = 0; + do { + asm volatile("ld.volatile.global.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(data1), "=r"(flag1), "=r"(data2), "=r"(flag2) : "l"(&src->i4)); + if (checkAbort(recvConn[i]->opCountRem)) break; + } while ((flag1 != flag) || (flag2 != flag)); + uint64_t val64 = data1 + (((uint64_t)data2) << 32); + return val64; + } + + __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { + asm volatile("st.volatile.global.v4.u32 [%0], {%1,%2,%3,%4};" :: "l"(&dst->i4), "r"((uint32_t)val), "r"(flag), "r"((uint32_t)(val >> 32)), "r"(flag)); + } + + // Using memcpy handles misaligned pointers. 
+ __device__ uint64_t readAL(uint64_t* src) { + uint64_t val; + memcpy((char*)&val, (char*)src, sizeof(uint64_t)); + return val; + } + + __device__ void storeAL(uint64_t* dst, uint64_t val, uint32_t nbytes) { + memcpy((char*)dst, (char*)&val, nbytes); + } + + template + __device__ void LLGenericOp(const T* srcPtr, T* dstPtr, int nelem) { + uint32_t nbytes = nelem < 0 ? 0 : nelem*sizeof(T); + FOR_SEND(waitSend, nbytes*2); + barrier(); + uint32_t npack = DIVUP(nbytes, sizeof(uint64_t)); + uint64_t* srcPack = (uint64_t*)srcPtr; + uint64_t* dstPack = (uint64_t*)dstPtr; + // Do multiples of 64 bits + #pragma unroll 2 + for (int offset=tid; offset()(readLL(0, offset), val); + for (int i=1; i()(readLL(i, offset), val); + } + } + + // Send : inter-node, then intra-node, then local + if (SEND) { + for (int i=1; illBuff; + recvStep[i] = recvConn[i]->step; + if (tid == i) { + postPtr = recvConn[i]->head; + *(recvConn[i]->opCountLoc) = opCount; + } + nrecv++; + } + + __device__ __forceinline__ void loadSendConn(struct ncclConnInfo* conn, int i) { + sendConn[i] = conn; + sendBuff[i] = sendConn[i]->llBuff; + sendStep[i] = sendConn[i]->step; + if (tid == WARP_SIZE+i) { + waitPtr = sendConn[i]->head; + fifoPtr = sendConn[i]->fifo; + sendConnHead = *waitPtr; + *(sendConn[i]->opCountLoc) = opCount; + } + nsend++; + } + + __device__ __forceinline__ void saveRecvConn(int i) { + if (tid == i) { + recvConn[i]->step = recvStep[i]; + *(recvConn[i]->opCountLoc) += 1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void saveSendConn(int i) { + if (tid == WARP_SIZE+i) { + sendConn[i]->step = sendStep[i]; + *(sendConn[i]->opCountLoc) += 1; + __threadfence_block(); + } + } + + __device__ __forceinline__ void llSendCleaning(int i) { + if (sendStep[i] > sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { + /* Reset all flags */ + static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); + 
static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); + for (int s=0; sllLastCleaning = sendStep[i]; + } + } + + __device__ __forceinline__ void llRecvCleaning(int i) { + if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { + recvStep[i] += NCCL_STEPS; + if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i]; + } + } + + public: + __device__ __forceinline__ + ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) { + // Make sure step is updated before we read it. + barrier(); + + for (int i=0; i= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); + for (int i=0; i= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); + } + + __device__ void send(const T* src, int nelem) { + return LLGenericOp<0, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recv(T* dst, int nelem) { + return LLGenericOp<1, 0, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceSend(const T* src, int nelem) { + return LLGenericOp<1, 1, 1, 0>(src, NULL, nelem); + } + + __device__ void recvReduceCopy(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 0, 1, 1>(src, dst, nelem); + } + + __device__ void copySend(const T* src, T* dst, int nelem) { + return LLGenericOp<0, 1, 1, 1>(src, dst, nelem); + } + + __device__ void recvCopySend(T* dst, int nelem) { + return LLGenericOp<1, 1, 0, 1>(NULL, dst, nelem); + } + + __device__ void recvReduceCopySend(const T* src, T* dst, int nelem) { + return LLGenericOp<1, 1, 1, 1>(src, dst, nelem); + } + + __device__ __forceinline__ ~ncclLLPrimitives() { + for (int i=0; i -__device__ void ncclReduceKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int 
nthreads = blockDim.x - 1; const int bid = args->bid; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - - WaitFlag waitDoneFromNext(ring->send.conn.head, (REDUCE_BUFCHUNKS-1)*REDUCE_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, 0); - PostFlag postDoneToPrev(ring->recv.conn.head, 0, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCE_BUFCHUNKS*REDUCE_SUBSTEPS); - - typedef Primitives Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / REDUCE_BUFCHUNKS; - const ssize_t loopSize = args->nRings*(ssize_t)sliceSize; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * REDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; const int rank = ring->devUserRanks[0]; const int prevRank = ring->devUserRanks[nranks-1]; const int root = args->root; - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - - if (rank != root) { - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - } - __syncthreads(); - - uint64_t step = 0ULL; - int boffset = 0; - // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, 
nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t offset = gridOffset + bid*realChunkSize; + int nelem = min(realChunkSize, size-offset); if (prevRank == root) { - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); + prims.send(thisInput+offset, nelem); } else if (rank == root) { - Prims::Reduce(tid, nthreads, - prevInput + boffset, - thisInput + offset, - thisOutput + offset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); + prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); } else { - Prims::Reduce(tid, nthreads, - prevInput + boffset, - thisInput + offset, - nextOutput + boffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); + prims.recvReduceSend(thisInput+offset, nelem); } - NEXT_STEP; // Increases step, boffset - } - - if (tid == 0) { - if (rank != root) { - // Wait for next to have consumed data before resetting the flag - waitDoneFromNext.wait(REDUCE_SUBSTEPS*(step + REDUCE_BUFCHUNKS - 1)); - *ring->send.conn.head = 0ULL; - } - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - boffset += NCCL_LL_SLICE_LINES; \ - if (boffset == NCCL_LL_BUFF_LINES) boffset = 0; \ - flag++; \ - step++; +template +__device__ void ncclReduceTreeKernel(struct CollectiveArgs* args) { } template -__device__ void ncclReduceLLKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = 
args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = sendHeadPtr[0]; - const int nranks = comm->nRanks; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; + + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); + + const ssize_t size = args->N; const int rank = comm->rank; + const int nranks = comm->nRanks; const int prevRank = ring->devUserRanks[nranks-1]; const int root = args->root; - typedef LLPrimitives LL; - - const ssize_t size = args->N; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t flag = step + 1; - int boffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -152,39 +80,16 @@ __device__ void ncclReduceLLKernel(struct CollectiveArgs* args) { } ssize_t offset = gridOffset + bid*chunkSize; - int maxOffset = min(chunkSize, size-offset); + int nelem = min(chunkSize, size-offset); if (prevRank == root) { - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + boffset, - maxOffset, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); } else if (rank == root) { - LL::ReduceCopy( - thisInput + 
offset, - prevInput + boffset, - thisOutput + offset, - maxOffset, flag, llNthreads); - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); } else { - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + boffset, - nextOutput + boffset, - maxOffset, flag, flag, llNthreads); - POST_SIZE; - NEXT_STEP_LL; - ACK_PREV; + LLprims.recvReduceSend(thisInput+offset, nelem); } } - - // We need everyone to acknowledge data even if they didn't receive anything - // so that the next collective can start right away. - ACK_PREV; - - FIFO_CLEANING_AND_SAVE_STEP(flag); } + +template +__device__ void ncclReduceTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/projects/rccl/src/collectives/device/reduce_kernel.h b/projects/rccl/src/collectives/device/reduce_kernel.h index 0cb8f139f7..0e907939fc 100644 --- a/projects/rccl/src/collectives/device/reduce_kernel.h +++ b/projects/rccl/src/collectives/device/reduce_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -46,30 +46,28 @@ struct FuncMin { } }; +#define MASK0 0x00ff00ff +#define MASK1 0xff00ff00 +static __device__ uint32_t addChar4(const uint32_t x, const uint32_t y) { + /* This can be used both for signed and unsigned 8-bit addition */ + const uint32_t x0 = x & MASK0; + const uint32_t x1 = x & MASK1; + const uint32_t y0 = y & MASK0; + const uint32_t y1 = y & MASK1; + const uint32_t r0 = (x0+y0); + const uint32_t r1 = (x1+y1); + return (r0 & MASK0) | (r1 & MASK1); +} + template<> struct FuncSum { - union converter { uint32_t storage; char4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vadd4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vadd.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vadd.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vadd.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vadd.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x + cy.a.x; - cr.a.y = cx.a.y + cy.a.y; - cr.a.z = cx.a.z + cy.a.z; - cr.a.w = cx.a.w + cy.a.w; - return cr.storage; + return addChar4(x, y); #endif } __device__ int8_t operator()(const int8_t x, const int8_t y) const { @@ -78,28 +76,13 @@ struct FuncSum { }; template<> struct FuncSum { - union converter { uint32_t storage; uchar4 a; }; __device__ uint32_t operator()(const uint32_t x, const uint32_t y) const { #if (__CUDA_ARCH__ >= 300) && (__CUDA_ARCH__ < 500) int32_t rv, z=0; asm("vadd4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vadd.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - 
"vadd.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vadd.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vadd.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else - converter cx, cy, cr; - cx.storage = x; - cy.storage = y; - cr.a.x = cx.a.x + cy.a.x; - cr.a.y = cx.a.y + cy.a.y; - cr.a.z = cx.a.z + cy.a.z; - cr.a.w = cx.a.w + cy.a.w; - return cr.storage; + return addChar4(x, y); #endif } __device__ uint8_t operator()(const uint8_t x, const uint8_t y) const { @@ -109,22 +92,6 @@ struct FuncSum { static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) { /* This can be used both for signed and unsigned 8-bit multiplication */ -#if (__CUDA_ARCH__ >= 300) - uint32_t rv; - asm("{ .reg .u32 t0, t1, t2, t3;\n\t" - " vmad.u32.u32.u32 t3, %1.b3, %2.b3, 0;\n\t" - " vmad.u32.u32.u32 t2, %1.b2, %2.b2, 0;\n\t" - " shl.b32 t3, t3, 16;\n\t" - " shl.b32 t2, t2, 16;\n\t" - " vmad.u32.u32.u32 t1, %1.b1, %2.b1, t3;\n\t" - " shl.b32 t1, t1, 8;\n\t" - " vmad.u32.u32.u32 t0, %1.b0, %2.b0, t2;\n\t" - " and.b32 t1, t1, 0xff00ff00;\n\t" - " and.b32 t0, t0, 0x00ff00ff;\n\t" - " or.b32 %0, t0, t1;\n\t" - "}" : "=r"(rv) : "r"(x), "r"(y)); - return rv; -#else union converter { uint32_t storage; char4 a; }; converter cx, cy, cr; cx.storage = x; @@ -134,7 +101,6 @@ static __device__ uint32_t mulChar4(const uint32_t x, const uint32_t y) { cr.a.z = cx.a.z * cy.a.z; cr.a.w = cx.a.w * cy.a.w; return cr.storage; -#endif } template<> @@ -164,13 +130,6 @@ struct FuncMax { int32_t rv, z=0; asm("vmax4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmax.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmax.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -194,13 +153,6 @@ struct FuncMax { 
int32_t rv, z=0; asm("vmax4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmax.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmax.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmax.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmax.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -225,13 +177,6 @@ struct FuncMin { int32_t rv, z=0; asm("vmin4.s32.s32.s32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmin.s32.s32.s32 %0, %1.b0, %2.b0; \n\t" - "vmin.s32.s32.s32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.s32.s32.s32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.s32.s32.s32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; @@ -255,13 +200,6 @@ struct FuncMin { int32_t rv, z=0; asm("vmin4.u32.u32.u32 %0, %1, %2, %3;" : "=r"(rv) : "r"(x), "r"(y), "r"(z)); return rv; -#elif (__CUDA_ARCH__ >= 500) && (__CUDA_ARCH__ < 700) - int32_t rv; - asm("vmin.u32.u32.u32 %0, %1.b0, %2.b0; \n\t" - "vmin.u32.u32.u32 %0.b1, %1.b1, %2.b1, %0;\n\t" - "vmin.u32.u32.u32 %0.b2, %1.b2, %2.b2, %0;\n\t" - "vmin.u32.u32.u32 %0.b3, %1.b3, %2.b3, %0;" : "=r"(rv) : "r"(x), "r"(y)); - return rv; #else converter cx, cy, cr; cx.storage = x; diff --git a/projects/rccl/src/collectives/device/reduce_scatter.cu b/projects/rccl/src/collectives/device/reduce_scatter.cu index b16053c41b..10857eda54 100644 --- a/projects/rccl/src/collectives/device/reduce_scatter.cu +++ b/projects/rccl/src/collectives/device/reduce_scatter.cu @@ -4,18 +4,8 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "common.h" #include "reduce_scatter.h" +#include "common.h" #include "collectives.h" -#define UNROLL 4 - -#if NCCL_OP == 0 
-IMPL_COLL2(ncclReduceScatter, sum, FuncSum, ncclCollReduceScatter, ncclSum); -#elif NCCL_OP == 1 -IMPL_COLL2(ncclReduceScatter, prod, FuncProd, ncclCollReduceScatter, ncclProd); -#elif NCCL_OP == 2 -IMPL_COLL2(ncclReduceScatter, min, FuncMin, ncclCollReduceScatter, ncclMin); -#elif NCCL_OP == 3 -IMPL_COLL2(ncclReduceScatter, max, FuncMax, ncclCollReduceScatter, ncclMax); -#endif +IMPL_COLL_R(ncclReduceScatter, ncclCollReduceScatter); diff --git a/projects/rccl/src/collectives/device/reduce_scatter.h b/projects/rccl/src/collectives/device/reduce_scatter.h index cad011b22e..c70c845267 100644 --- a/projects/rccl/src/collectives/device/reduce_scatter.h +++ b/projects/rccl/src/collectives/device/reduce_scatter.h @@ -8,156 +8,82 @@ #include "primitives.h" #include "collectives.h" -// Increase Step and poffset/noffset for buffer sync -#define NEXT_STEP \ - step++; \ - poffset = noffset; \ - noffset += sliceSize; \ - if (noffset == buffSize) noffset = 0; - template -__device__ void ncclReduceScatterKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - - WaitFlag waitDoneFromNext(ring->send.conn.head, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS); - WaitFlag waitReadyFromPrev(ring->recv.conn.tail, REDUCESCATTER_SUBSTEPS); - PostFlag postDoneToPrev(ring->recv.conn.head, REDUCESCATTER_SUBSTEPS, NULL, 0); - PostFlag postReadyToNext(ring->send.conn.tail, 0, ring->send.conn.fifo, REDUCESCATTER_BUFCHUNKS*REDUCESCATTER_SUBSTEPS); - - typedef Primitives Prims; - + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; const int nranks = comm->nRanks; - const int buffSize = ring->buffSize / sizeof(T); - const int sliceSize = buffSize / REDUCESCATTER_BUFCHUNKS; - const 
ssize_t loopSize = args->nRings*(ssize_t)sliceSize; - - if (tid == 0) { - // Update in case we skipped some collectives - *ring->recv.conn.opCount = args->opCount; - // Wait for next to be ready - WaitFlag waitOpCountNext(ring->send.conn.opCount, 0); - waitOpCountNext.wait(args->opCount); - } - __syncthreads(); - - uint64_t step = 0ULL; - int poffset, noffset = 0; + const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS); + const int chunkSize = stepSize * ALLREDUCE_CHUNKSTEPS; + const ssize_t loopSize = args->nChannels*(ssize_t)chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - T * __restrict__ prevInput = (T*)ring->recv.conn.buff; - T * __restrict__ nextOutput = (T*)ring->send.conn.buff; + + ncclPrimitives + prims(tid, nthreads, &ring->prev, &ring->next, NULL, stepSize, channel, comm, args->opCount); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - int chunkSize = min(sliceSize, DIVUP(size-gridOffset,args->nRings)); - ALIGN_SIZE(chunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); - ssize_t chunkOffset = gridOffset + bid*chunkSize; + int realChunkSize = min(chunkSize, DIVUP(size-gridOffset,args->nChannels)); + ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T)); + ssize_t chunkOffset = gridOffset + bid*realChunkSize; /////////////// begin ReduceScatter steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(realChunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[nranks-1]; offset = chunkOffset + rankDest * size; - Prims::Copy(tid, nthreads, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, - postReadyToNext); - - NEXT_STEP; // Increases step, poffset, noffset + prims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; 
jdevUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - nextOutput + noffset, - sliceSize, maxOffset, - step, - waitDoneFromNext, waitReadyFromPrev, - postReadyToNext, postDoneToPrev); - - NEXT_STEP; + prims.recvReduceSend(thisInput+offset, nelem); } - // step k-1: reduce this buffer and data, which will produce the final - // result that we store in this data and push to the next GPU + // step k-1: reduce this buffer and data, which will produce the final result rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * size; - Prims::Reduce(tid, nthreads, - prevInput + poffset, - thisInput + offset, - thisOutput + chunkOffset, - sliceSize, maxOffset, - step, - waitReadyFromPrev, - postDoneToPrev); - } - - if (tid == 0) { - waitDoneFromNext.wait(REDUCESCATTER_SUBSTEPS*(step + REDUCESCATTER_BUFCHUNKS)); - *ring->send.conn.head = 0ULL; - *ring->recv.conn.tail = 0ULL; - __threadfence_system(); - *ring->recv.conn.opCount = args->opCount+1; + prims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); } } -#include "ll_kernel.h" - -#define NEXT_STEP_LL \ - poffset = noffset; \ - pflag = nflag; \ - noffset += NCCL_LL_SLICE_LINES; \ - if (noffset == NCCL_LL_BUFF_LINES) { noffset = 0; } \ - nflag++; \ - step++; +template +__device__ void ncclReduceScatterTreeKernel(struct CollectiveArgs* args) { } template -__device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { +__device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; - const int llNthreads = args->nThreads; + const int nthreads = args->nThreads; struct ncclComm* comm = args->comm; - struct ncclRing* ring = comm->rings+blockIdx.x; - volatile uint64_t * recvHeadPtr = ring->recv.conn.llHead; - volatile uint64_t * sendHeadPtr = ring->send.conn.llHead; - volatile int * sizesFifo = ring->send.conn.llFifo; - uint64_t sendHead = 
sendHeadPtr[0]; + struct ncclChannel* channel = comm->channels+blockIdx.x; + struct ncclRing* ring = &channel->ring; - typedef LLPrimitives LL; + ncclLLPrimitives LLprims(tid, nthreads, &ring->prev, &ring->next, channel, comm, args->opCount); const ssize_t size = args->N; //const int rank = comm->rank; const int nranks = comm->nRanks; ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = args->nRings*chunkSize; - - uint64_t step = ring->send.conn.llStep; - uint32_t pflag, nflag = step + 1; - int poffset, noffset = NCCL_LL_SLICE_LINES * STEP_TO_SLOT(step); + const ssize_t loopSize = args->nChannels*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->ThisInput; T * __restrict__ thisOutput = (T*)args->ThisOutput; - union ncclLLFifoLine * prevInput = (union ncclLLFifoLine *)ring->recv.conn.llBuff; - union ncclLLFifoLine * nextOutput = (union ncclLLFifoLine *)ring->send.conn.llBuff; for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { if (size-gridOffset < loopSize) { @@ -167,37 +93,21 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { /////////////// begin ReduceScatter steps /////////////// ssize_t offset; - int maxOffset = min(chunkSize, size-chunkOffset); + int nelem = min(chunkSize, size-chunkOffset); int rankDest; // step 0: push data to next GPU rankDest = ring->devUserRanks[nranks-1]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - nextOutput + noffset, - maxOffset, nflag, llNthreads); - POST_SIZE; - - NEXT_STEP_LL; + LLprims.send(thisInput+offset, nelem); // k-2 steps: reduce and copy to next GPU for (int j=2; jdevUserRanks[nranks-j]; offset = chunkOffset + rankDest * size; - WAIT_NEXT; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - nextOutput + noffset, - maxOffset, pflag, nflag, llNthreads); - POST_SIZE; - ACK_PREV; - - NEXT_STEP_LL; + LLprims.recvReduceSend(thisInput+offset, 
nelem); } // step k-1: reduce this buffer and data, which will produce the final @@ -205,13 +115,9 @@ __device__ void ncclReduceScatterLLKernel(struct CollectiveArgs* args) { rankDest = ring->devUserRanks[0]; offset = chunkOffset + rankDest * size; - LL::ReduceCopy( - thisInput + offset, - prevInput + poffset, - thisOutput + chunkOffset, - maxOffset, pflag, llNthreads); - ACK_PREV; + LLprims.recvReduceCopy(thisInput+offset, thisOutput+chunkOffset, nelem); } - - FIFO_CLEANING_AND_SAVE_STEP(nflag); } + +template +__device__ void ncclReduceScatterTreeLLKernel(struct CollectiveArgs* args) { } diff --git a/projects/rccl/src/collectives/reduce.cu b/projects/rccl/src/collectives/reduce.cu index d8fde80baa..302d4bcfd9 100644 --- a/projects/rccl/src/collectives/reduce.cu +++ b/projects/rccl/src/collectives/reduce.cu @@ -4,30 +4,15 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "common_coll.h" #include "enqueue.h" #include "collectives.h" -ncclResult_t ncclReduceFunc(const void* sendbuff, void* recvbuff, const size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"Reduce: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(REDUCE_SUBSTEPS, REDUCE_BUFCHUNKS, 1, 1, nbytes, proxyPatternTo(root), comm)); - NCCLCHECK(saveKernel(ncclCollReduce, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes, 1)); - } - - return ncclSuccess; -} - NCCL_API(ncclResult_t, ncclReduce, const void* sendbuff, void* recvbuff, size_t count, 
ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclReduceFunc, "Reduce", sendbuff, recvbuff, count, datatype, - op, root, comm, stream); + struct ncclInfo info = { ncclCollReduce, "Reduce", + sendbuff, recvbuff, count, datatype, op, root, comm, stream, /* Args */ + REDUCE_CHUNKSTEPS, REDUCE_SLICESTEPS }; + return ncclEnqueueCheck(&info); } diff --git a/projects/rccl/src/collectives/reduce_scatter.cu b/projects/rccl/src/collectives/reduce_scatter.cu index 1447d4a91b..4ee77ef985 100644 --- a/projects/rccl/src/collectives/reduce_scatter.cu +++ b/projects/rccl/src/collectives/reduce_scatter.cu @@ -4,29 +4,15 @@ * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" -#include "common_coll.h" #include "enqueue.h" #include "collectives.h" -ncclResult_t ncclReduceScatterFunc(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { - size_t nbytes = count*ncclTypeSize(datatype); - INFO(NCCL_COLL,"ReduceScatter: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", comm->opCount, sendbuff, recvbuff, count, datatype, op, root, comm, comm->nRanks, stream); - if (comm->nRanks == 1) { - if (sendbuff != recvbuff) - CUDACHECK(cudaMemcpyAsync(recvbuff, sendbuff, nbytes, cudaMemcpyDeviceToDevice, stream)); - } else { - NCCLCHECK(transportSaveProxies(REDUCESCATTER_SUBSTEPS, REDUCESCATTER_BUFCHUNKS, comm->nRanks-1, comm->nRanks, nbytes*comm->nRanks, proxyPatternRing, comm)); - NCCLCHECK(saveKernel(ncclCollReduceScatter, sendbuff, recvbuff, count, datatype, op, root, comm, stream, nbytes*comm->nRanks, 1)); - } - return ncclSuccess; -} - 
NCCL_API(ncclResult_t, ncclReduceScatter, const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream); ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm* comm, cudaStream_t stream) { - return ncclEnqueueCheck(ncclReduceScatterFunc, "ReduceScatter", sendbuff, recvbuff, recvcount, datatype, - op, 0, comm, stream); + struct ncclInfo info = { ncclCollReduceScatter, "ReduceScatter", + sendbuff, recvbuff, recvcount, datatype, op, 0, comm, stream, /* Args */ + REDUCESCATTER_CHUNKSTEPS, REDUCESCATTER_SLICESTEPS }; + return ncclEnqueueCheck(&info); } diff --git a/projects/rccl/src/enqueue.cu b/projects/rccl/src/enqueue.cu new file mode 100644 index 0000000000..d283223fa1 --- /dev/null +++ b/projects/rccl/src/enqueue.cu @@ -0,0 +1,442 @@ +/************************************************************************* + * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "enqueue.h" +#include "checks.h" +#include "param.h" + +#include "collectives/collectives.h" + +// Only generate inline kernels for LL +#define NCCL_FUNC5(coll, op, dtype) \ + (void*)NCCL_KERN_NAME(coll##LL, op, dtype), \ + (void*)NCCL_KERN_NAME(coll##LL, op, dtype) + +#define NCCL_FUNC4(coll, op, dtype) \ + (void*)NCCL_FUNC5(coll##Ring, op, dtype), \ + (void*)NCCL_FUNC5(coll##Tree, op, dtype) + +// Must be consistent with ncclDataType_t +#define NCCL_FUNCS3A(coll, op) \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, u8), \ + (void*)NCCL_FUNC4(coll, op, i32), \ + (void*)NCCL_FUNC4(coll, op, u32), \ + (void*)NCCL_FUNC4(coll, op, i64), \ + (void*)NCCL_FUNC4(coll, op, u64), \ + (void*)NCCL_FUNC4(coll, op, f16), \ + (void*)NCCL_FUNC4(coll, op, f32), \ + (void*)NCCL_FUNC4(coll, op, f64) +#define NCCL_FUNCS3B(coll, op) \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8), \ + (void*)NCCL_FUNC4(coll, op, i8) + +// Must be consistent with ncclRedOp_t -- but we only generate kernel for sums. 
+#define NCCL_FUNCS2A(coll) \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum), \ + NCCL_FUNCS3A(coll, sum) +#define NCCL_FUNCS2B(coll) \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy), \ + NCCL_FUNCS3B(coll, copy) + +// Must be consistent with the ncclFuncSet enum +static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2*2] = { + NCCL_FUNCS2B(ncclBroadcast), + NCCL_FUNCS2A(ncclReduce), + NCCL_FUNCS2B(ncclAllGather), + NCCL_FUNCS2A(ncclReduceScatter), + NCCL_FUNCS2A(ncclAllReduce) +}; + +/*****************************************************************************/ +/* Launch system : synchronization and CUDA kernel launch */ +/*****************************************************************************/ + +ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) { +#if CUDART_VERSION >= 9000 + if (cgMode & 0x01) { + CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices, + // These flags are to reduce the latency of using this API + cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync)); + return ncclSuccess; + } +#endif + int savedDev; + CUDACHECK(cudaGetDevice(&savedDev)); + for (int i = 0; i < numDevices; i++) { + struct cudaLaunchParams* params = paramsList+i; + CUDACHECK(cudaSetDevice(cudaDevs[i])); + CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + } + CUDACHECK(cudaSetDevice(savedDev)); + return ncclSuccess; +} + +ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) { + params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels); + + // Set active = 2 for the last operation + for (int r=0; rgridDim.x; r++) { + struct ncclChannel* channel = comm->channels+r; + 
channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active = 2; + } + + // Find the first operation, choose the kernel accordingly and pass it + // as the first argument. + struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart; + memcpy(&comm->args, coll, sizeof(struct ncclColl)); + // As we pass that coll directly, we can free it immediately. + coll->active = 0; + + params->func = ncclKerns[coll->funcIndex]; + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + int val = *ptr; + bool done = false; + while (done == false) { + if (val >= comm->intraRanks) { + WARN("Trying to launch too many collectives"); + return ncclInvalidUsage; + } + if (val+1 == comm->intraRanks) { + // Reset the barrier. + comm->intraBarrier[comm->intraPhase^1] = 0; + *isLast = 1; + return ncclSuccess; + } + done = __sync_bool_compare_and_swap(ptr, val, val+1); + val++; + } + *isLast = 0; + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + int val = *ptr; + if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) { + WARN("Trying to launch too many collectives"); + return ncclInternalError; + } + return ncclSuccess; +} + +ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { + volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); + while (*ptr < comm->intraRanks) pthread_yield(); + comm->intraPhase ^= 1; + return ncclSuccess; +} + +ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { + if (comm->nRanks == 1) return ncclSuccess; + struct cudaLaunchParams* params = comm->myParams; + + NCCLCHECK(setupLaunch(comm, params)); + + // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL + if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || 
comm->userStream == NULL)) { + // Enqueue event in user stream + CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream)); + // Create dependency between user stream and internal NCCL stream + CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0)); + params->stream = comm->groupStream; + } else { + if (comm->userStream != params->stream) { + // Stream changed from last call, create dependency against last NCCL kernel launch + CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); + } + params->stream = comm->userStream; + } + + int isLast = 0; + NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); + + if (isLast) { + if (comm->launchMode == ncclComm::GROUP) { + // I'm the last. Launch all operations. + NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode)); + } + NCCLCHECK(ncclCpuBarrierLast(comm)); + } + return ncclSuccess; +} + +ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { + if (comm->nRanks == 1) return ncclSuccess; + // We can't print the CG mode before the first barrier happened. + if (comm->rank == 0 && *comm->intraCGMode & 0x10) { + *comm->intraCGMode ^= 0x10; + INFO(NCCL_INIT,"Launch mode %s%s%s", + comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", + *comm->intraCGMode ? "/CGMD" : "", + (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); + } + + NCCLCHECK(ncclCpuBarrierOut(comm)); + + struct cudaLaunchParams *params = comm->myParams; + if (comm->launchMode == ncclComm::PARALLEL) { + CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); + } + // Start the network proxies as soon as the kernel has been launched. We can't + // perform any CUDA call between the two or having a cudaFree between the CUDA + // launch and the transportStartProxy call could cause a deadlock. 
+ // Also, starting the proxies after the CUDA launch seems to be better for + // performance (latency). + for (int r=0; rgridDim.x; r++) { + struct ncclChannel* channel = comm->channels+r; + channel->collStart = channel->collFifoTail; + channel->collCount = 0; + } + params->gridDim.x = params->blockDim.x = 0; + NCCLCHECK(transportStartProxy(comm)); + return ncclSuccess; +} + +ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { + struct cudaLaunchParams *params = comm->myParams; + // Enqueue event after NCCL kernel + CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream)); + // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL + if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { + // Create dependency between NCCL internal stream and user stream + CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); + } + comm->userStreamSet = false; + return ncclSuccess; +} + +/*****************************************************************************/ +/* Enqueueing system : computation of kernel and proxy operations parameters */ +/*****************************************************************************/ + +static ncclResult_t getPatternInfo(struct ncclInfo* info) { + if (info->coll == ncclCollBroadcast) info->pattern = ncclPatternPipelineFrom; + else if (info->coll == ncclCollReduce) info->pattern = ncclPatternPipelineTo; + else if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->pattern = ncclPatternRing; + else if (info->coll == ncclCollAllReduce) { + if (info->nBytes <= info->comm->treeThreshold) + info->pattern = ncclPatternTreeUpDown; + else + info->pattern = ncclPatternRingTwice; + } + else { + WARN("Unknown collective %d", info->coll); + return ncclInternalError; + } + return ncclSuccess; +} + +static ncclResult_t getLoopInfo(struct ncclInfo* info) { + switch (info->pattern) { + case ncclPatternTreeUp: + case ncclPatternTreeDown: 
+ case ncclPatternTreeUpDown: + case ncclPatternPipelineFrom: + case ncclPatternPipelineTo: + info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; + case ncclPatternRing: + info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; + case ncclPatternRingTwice: + info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break; + default: + WARN("Unknown pattern %d\n", info->pattern); + return ncclInternalError; + } + return ncclSuccess; +} + +static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) { + // Compute thresholds and limits that users can override + int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD); + int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads); + + // First compute nThreads + int nt = NCCL_LL_MIN_NTHREADS; + while (DIVUP(info->nBytes, nt*info->nchunksPerLoop) > perThreadLLThreshold && nt*2 <= maxLLNthreads) nt *= 2; + + // Then compute nChannels + int nc = DIVUP(info->nBytes, nt*info->nchunksPerLoop*perThreadLLThreshold); + if (nc == 0) nc = 1; + if (nc > info->comm->nChannels) nc = info->comm->nChannels; + + // Check if we have a fixed LL threshold, otherwise compute it. + int perThreadThreshold = info->comm->threadThreshold; + if (info->pattern >= ncclPatternTreeUp) perThreadThreshold *= 4; + ssize_t llThreshold = info->comm->llThreshold >= 0 ? 
+ info->comm->llThreshold : + nc*nt*info->nchunksPerLoop*perThreadThreshold; + + if (info->nBytes <= llThreshold) { + *llMode = 1; + *nChannels = nc; + *nThreads = nt; + } else { + *llMode = 0; + *nChannels = info->comm->nChannels; + *nThreads = info->comm->nThreads+1; + } +} + +static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclColl* coll, struct ncclProxyArgs* proxyArgs /* output */) { + // Set nstepsPerLoop and nchunksPerLoop + NCCLCHECK(getPatternInfo(info)); + NCCLCHECK(getLoopInfo(info)); + + coll->args.root = info->root; + coll->args.N = info->count; + coll->args.ThisInput = info->sendbuff; + coll->args.ThisOutput = info->recvbuff; + coll->args.comm = info->comm->devComm; + coll->args.opCount = info->comm->opCount; + + // Compute llMode, nChannels, nThreads + int llMode; + getKernelInfo(info, &coll->args.nChannels, &coll->args.nThreads, &llMode); + + int treeMode = info->pattern >= ncclPatternTreeUp ? 1 : 0; + coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, llMode, treeMode); + + int stepSize = ( llMode ? NCCL_LL_BUFF_SIZE : info->comm->channels[0].buffSize ) / NCCL_STEPS; + int chunkSteps = (llMode|treeMode) ? 1 : info->chunkSteps; + int sliceSteps = (llMode|treeMode) ? 
1 : info->sliceSteps; + int chunkSize = stepSize*chunkSteps; + + // Compute lastChunkSize + if (treeMode == 1 && llMode == 0) { + if (info->pattern == ncclPatternTreeUpDown) { + // Optimize chunkSize / nSteps + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*8 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth*4 && chunkSize > 65536) chunkSize /= 2; + while (info->nBytes / (coll->args.nChannels*chunkSize) < info->comm->channels[0].tree.depth && chunkSize > 32768) chunkSize /= 2; + } + // Use lastChunkSize as chunkSize + coll->args.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); + } else if (llMode == 1) { + int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t); + const ssize_t loopSize = coll->args.nChannels*info->nchunksPerLoop*(ssize_t)sliceSize; + coll->args.lastChunkSize = DIVUP((info->nBytes-(info->nBytes/loopSize)*loopSize), coll->args.nChannels*info->nchunksPerLoop); + ALIGN_SIZE(coll->args.lastChunkSize, coll->args.nThreads*sizeof(uint64_t)); + coll->args.lastChunkSize /= ncclTypeSize(info->datatype); + } + + // Compute nSteps for proxies + size_t nBytes = llMode ? 
info->nBytes*2 : info->nBytes; + + int nLoops = (int)(DIVUP(nBytes, (((size_t)(coll->args.nChannels))*info->nchunksPerLoop*chunkSize))); + proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; + proxyArgs->sliceSteps = sliceSteps; + proxyArgs->chunkSteps = chunkSteps; + proxyArgs->llMode = llMode; + proxyArgs->opCount = info->comm->opCount; + TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> llmode %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", + coll->args.opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, nBytes, llMode, coll->args.nChannels, coll->args.nThreads, + nLoops, proxyArgs->nsteps, info->comm); + return ncclSuccess; +} + +static ncclResult_t saveKernel(struct ncclInfo* info) { + if (info->comm->nRanks == 1) { + if (info->sendbuff != info->recvbuff) + CUDACHECK(cudaMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, cudaMemcpyDeviceToDevice, info->stream)); + return ncclSuccess; + } + + struct ncclColl coll; + struct ncclProxyArgs proxyArgs; + memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs)); + NCCLCHECK(computeColl(info, &coll, &proxyArgs)); + + info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads); + if (info->comm->userStreamSet == false) { + info->comm->userStream = info->stream; + info->comm->userStreamSet = true; + } else if (info->stream != info->comm->userStream) { + WARN("Error : mixing different streams within a group call is not supported."); + return ncclInvalidUsage; + } + for (int bid=0; bidcomm->channels+(info->comm->myParams->gridDim.x % info->comm->nChannels); + + if (channel->collCount == NCCL_MAX_OPS) { + WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); + return ncclInvalidUsage; + } + + // Proxy + proxyArgs.channel = channel; + NCCLCHECK(transportSaveProxies(&proxyArgs, info->pattern, info->root, info->comm->nRanks)); + + info->comm->myParams->gridDim.x++; + + int opIndex = channel->collFifoTail; + struct 
ncclColl* c = channel->collectives+opIndex; + volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; + while (activePtr[0] != 0) sched_yield(); + + memcpy(c, &coll, sizeof(struct ncclColl)); + + c->args.bid = bid; + c->active = 1; + opIndex = (opIndex+1)%NCCL_MAX_OPS; + c->nextIndex = opIndex; + channel->collFifoTail = opIndex; + channel->collCount++; + } + /*if (llMode == 0)*/ info->comm->opCount++; + return ncclSuccess; +} + + +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { + if (info->comm == NULL) return ncclInvalidArgument; + + INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", + info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, + info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); + + // Launch asynchronously if needed + if (ncclAsyncMode()) { + ncclResult_t ret = ncclSuccess; + int savedDev = -1; + if (info->comm->checkPointers) { + CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end); + CUDACHECKGOTO(cudaSetDevice(info->comm->cudaDev), ret, end); + } + // Check arguments + NCCLCHECKGOTO(ArgsCheck(info), ret, end); + // Always register comm even in case of error to make sure ncclGroupEnd + // cleans it up. 
+ NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end); + NCCLCHECKGOTO(saveKernel(info), ret, end); +end: + if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev)); + ncclAsyncErrCheck(ret); + return ret; + } else { + NCCLCHECK(ArgsCheck(info)); + NCCLCHECK(saveKernel(info)); + NCCLCHECK(ncclBarrierEnqueue(info->comm)); + NCCLCHECK(ncclBarrierEnqueueWait(info->comm)); + NCCLCHECK(ncclEnqueueEvents(info->comm)); + return ncclSuccess; + } +} diff --git a/projects/rccl/src/include/bootstrap.h b/projects/rccl/src/include/bootstrap.h index 278593c8cd..a1aaf50a89 100644 --- a/projects/rccl/src/include/bootstrap.h +++ b/projects/rccl/src/include/bootstrap.h @@ -13,5 +13,7 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState); ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); +ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size); +ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size); ncclResult_t bootstrapClose(void* commState); #endif diff --git a/projects/rccl/src/include/channel.h b/projects/rccl/src/include/channel.h new file mode 100644 index 0000000000..76c5e8ad92 --- /dev/null +++ b/projects/rccl/src/include/channel.h @@ -0,0 +1,14 @@ +/************************************************************************* + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CHANNEL_H_ +#define NCCL_CHANNEL_H_ +#include "core.h" + +ncclResult_t initChannel(struct ncclComm* comm, int channelid); +ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks); + +#endif diff --git a/projects/rccl/src/include/checks.h b/projects/rccl/src/include/checks.h new file mode 100644 index 0000000000..bf7750edba --- /dev/null +++ b/projects/rccl/src/include/checks.h @@ -0,0 +1,10 @@ +/************************************************************************* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "core.h" + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); +ncclResult_t ArgsCheck(struct ncclInfo* info); diff --git a/projects/rccl/src/include/common_coll.h b/projects/rccl/src/include/common_coll.h deleted file mode 100644 index 3ec7354f5d..0000000000 --- a/projects/rccl/src/include/common_coll.h +++ /dev/null @@ -1,195 +0,0 @@ -/************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef COMMON_COLL_H_ -#define COMMON_COLL_H_ - -#include "core.h" -#include "enqueue.h" -#include "collectives/collectives.h" - -static ncclResult_t PointerCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { - cudaPointerAttributes attr; - cudaError_t err = cudaPointerGetAttributes(&attr, pointer); - if (err != cudaSuccess || attr.devicePointer == NULL) { - WARN("%s : %s is not a valid pointer", opname, ptrname); - return ncclInvalidArgument; - } -#if CUDART_VERSION >= 10000 - if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { -#else - if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { -#endif - WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); - return ncclInvalidArgument; - } - return ncclSuccess; -} - -static ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { - if (ptr == NULL) { - WARN("%s : %s argument is NULL", opname, ptrname); - return ncclInvalidArgument; - } - return ncclSuccess; -} - -static ncclResult_t ArgsCheck(const void* sendbuff, const void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, struct ncclComm* comm, const char* opname) { - NCCLCHECK(PtrCheck(comm, opname, "comm")); - // First, the easy ones - if (root < 0 || root >= comm->nRanks) { - WARN("%s : invalid root %d (root should be in the 0..%d range)", opname, root, comm->nRanks); - return ncclInvalidArgument; - } - if (type < 0 || type >= ncclNumTypes) { - WARN("%s : invalid type %d", opname, type); - return ncclInvalidArgument; - } - if (op < 0 || op >= ncclNumOps) { - WARN("%s : invalid reduction operation %d", opname, op); - return ncclInvalidArgument; - } - - if (comm->checkPointers) { - // Check CUDA device pointers - if (strcmp(opname, "Broadcast") != 0 || 
comm->rank == root) { - NCCLCHECK(PointerCheck(sendbuff, comm, "sendbuff", opname)); - } - if (strcmp(opname, "Reduce") != 0 || comm->rank == root) { - NCCLCHECK(PointerCheck(recvbuff, comm, "recvbuff", opname)); - } - } - return ncclSuccess; -} - -static __inline__ int ncclTypeSize(ncclDataType_t type) { - switch (type) { - case ncclInt8: - case ncclUint8: - return 1; - case ncclFloat16: - return 2; - case ncclInt32: - case ncclUint32: - case ncclFloat32: - return 4; - case ncclInt64: - case ncclUint64: - case ncclFloat64: - return 8; - default: - return -1; - } -} - -// In : comm, nbytes ; Out : nrings, nthreads, ll -// - We start with the minimum number of threads possible (64) and see if the size fits in LL; -// If not, we increase the number of threads by 2x, until we reach the max number of LL threads (256, or set by user via NCCL_NTHREADS, or platform non-LL default) -// - We use "maxRings" to limit the max number of rings we can use before reaching the max number of LL threads -// This ensures we don't use a large number of rings with a small number of threads -// - We use the NCCL_LL_RING_THRESHOLD as the per-thread threshold before we reach the max number of threads -// we use NCCL_THREAD_THRESHOLD when we reach the max -// - If by the max number of LL threads, the size still cannot fit in LL, then we use non-LL setting -// - We honor the NCCL_LL_THRESHOLD (total threshold) set by user too -static inline void ncclGetCollResource(ncclComm_t comm, size_t nbytes, int* nrings, int* nthreads, int* ll) { - *ll = 0; - int llEnforced = 0; /* see if the size falls in the NCCL_LL_THRESHOLD range set by user */ - if (comm->llThreshold >= 0) { /* user sets total LL threshold */ - if (nbytes > comm->llThreshold) { /* non-LL */ - *nthreads = comm->nThreads+1; - *nrings = comm->nRings; - return; - } else { - llEnforced = 1; /* user wants to use LL */ - } - } - int nt = NCCL_LL_MIN_NTHREADS; /* start with min number of LL threads */ - size_t nr; - int ll_max_nthreads = 
std::min(NCCL_LL_MAX_NTHREADS, comm->nThreads); /* respect user's setting or platform's default setting */ - int maxRings = (comm->nRanks <= 4) ? 1 : ll_max_nthreads / NCCL_LL_MIN_NTHREADS; - ssize_t threshold = std::min(comm->threadThreshold, (ssize_t)NCCL_LL_RING_THRESHOLD); - while (nt < ll_max_nthreads && *ll == 0) { - nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*nt*comm->nRanks)); - if (nr <= maxRings) { /* avoid using few threads but many rings */ - nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr; - *ll = nbytes > comm->nRanks*nr*nt*threshold ? 0 : 1; - } - if (*ll == 0) { - nt = nt << 1; - } - } - if (*ll == 1) { - *nthreads = nt; - *nrings = (int)nr; - return; /* we can use smaller number of threads to make LL work, stop here */ - } - nr = DIVUP(nbytes, (NCCL_LL_RING_THRESHOLD*ll_max_nthreads*comm->nRanks)); /* else we try the max number of LL threads */ - nr = nr == 0 ? 1 : nr > comm->nRings ? comm->nRings : nr; - *ll = nbytes > comm->nRanks*nr*ll_max_nthreads*comm->threadThreshold ? llEnforced : 1; - *nthreads = *ll ? ll_max_nthreads : comm->nThreads+1; - *nrings = *ll ? 
(int)nr : comm->nRings; -} - -static ncclResult_t saveKernel(int coll, const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t dtype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream, size_t nbytes, int loopFactor) { - int llMode, nBlocks, nThreads; - ncclGetCollResource(comm, nbytes, &nBlocks, &nThreads, &llMode); - comm->myParams->blockDim.x = std::max((int)comm->myParams->blockDim.x, nThreads); - if (comm->userStreamSet == false) { - comm->userStream = stream; - comm->userStreamSet = true; - } else if (stream != comm->userStream) { - WARN("Error : mixing different streams within a group call is not supported."); - return ncclInvalidUsage; - } - int lastChunkSize = 0; - if (llMode == 1) { - int sliceSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / ncclTypeSize(dtype); - const ssize_t loopSize = nBlocks*loopFactor*(ssize_t)sliceSize; - lastChunkSize = DIVUP((count-count/loopSize*loopSize), nBlocks*loopFactor); - ALIGN_SIZE(lastChunkSize, nThreads*sizeof(uint64_t)/ncclTypeSize(dtype)); - } - for (int bid=0; bidrings+(comm->myParams->gridDim.x % comm->nRings); - if (ring->collCount == NCCL_MAX_OPS) { - WARN("Too many aggregated operations (%d max)", NCCL_MAX_OPS); - return ncclInvalidUsage; - } - - comm->myParams->gridDim.x++; - - int opIndex = ring->collFifoTail; - struct ncclColl* c = ring->collectives+opIndex; - volatile uint8_t* activePtr = (volatile uint8_t*)&c->active; - while (activePtr[0] != 0) sched_yield(); - - struct CollectiveArgs* args = &c->args; - args->root = root; - args->N = count; - args->ThisInput = sendbuff; - args->ThisOutput = recvbuff; - args->comm = comm->devComm; - args->opCount = comm->opCount; - args->bid = bid; - args->nRings = nBlocks; - args->nThreads = nThreads; - args->lastChunkSize = lastChunkSize; - - c->nThreads = nThreads; - c->funcIndex = FUNC_INDEX(coll, op, dtype, llMode); - c->active = 1; - opIndex = (opIndex+1)%NCCL_MAX_OPS; - c->nextIndex = opIndex; - ring->collFifoTail = opIndex; - 
ring->collCount++; - } - /*if (llMode == 0)*/ comm->opCount++; - return ncclSuccess; -} - -extern __global__ void ncclMultiOpKernel (struct ncclColl firstColl); - -#endif diff --git a/projects/rccl/src/include/core.h b/projects/rccl/src/include/core.h index 8285df5d45..d57d27107e 100644 --- a/projects/rccl/src/include/core.h +++ b/projects/rccl/src/include/core.h @@ -8,6 +8,7 @@ #define NCCL_CORE_H_ #define NCCL_MAX_OPS 2048 +#define NCCL_STEPS 8 #include "nccl.h" #include "transport.h" @@ -29,15 +30,15 @@ struct cudaLaunchParams { }; #endif -#define MAXRINGS 16 +#define MAXCHANNELS 16 #define MAXTHREADS 256 #define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ -// Rings / LL tuning -#define NCCL_LL_RING_THRESHOLD 8 // Per thread size before we start increasing nrings -#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL for Volta and above +// Channels / LL tuning +#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings +#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL #define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs -#define NCCL_LL_MAX_NTHREADS 256 +#define NCCL_LL_MAX_NTHREADS MAXTHREADS #define NCCL_LL_MIN_NTHREADS 64 #define DIVUP(x, y) \ @@ -63,43 +64,84 @@ union ncclLLFifoLine { int4 i4; }; +typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; + +typedef enum { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown +} ncclPattern_t; + +typedef enum { + ncclDevSuccess, + ncclDevAssertedMismatch, + ncclDevSuspectedMismatch +} ncclDevError_t; + +// Used to pass NCCL call information between functions +struct ncclInfo { + ncclColl_t coll; + const char* opName; + // NCCL Coll Args + const void* sendbuff; + void* recvbuff; + 
size_t count; + ncclDataType_t datatype; + ncclRedOp_t op; + int root; + ncclComm_t comm; + cudaStream_t stream; + // Algorithm details + int chunkSteps; + int sliceSteps; + // Computed later + ncclPattern_t pattern; + size_t nBytes; + int nstepsPerLoop; + int nchunksPerLoop; +}; + struct ncclConnInfo { // Regular comm mechanism char *buff; // Local for recv, remote for send uint64_t *tail; // Local for recv, remote for send uint64_t *head; // Local for send, remote for recv - uint64_t *opCount; // Local for recv, remote for send + uint64_t *opCountLoc; // opCount of local rank + uint64_t *opCountRem; // opCount of remote rank int direct; // Direct communication void **ptrExchange; // Pointer exchange for direct communication int *fifo; // Size fifo for proxy + uint64_t step; // Keep where we are + // Low latency mechanism - char *llBuff; // Local for recv, remote for send - uint64_t *llHead; // Local for send, remote for recv - int *llFifo; // LL Size fifo for proxy - uint64_t llStep; // Keep where we are + union ncclLLFifoLine *llBuff; // Local for recv, remote for send uint64_t llLastCleaning; }; struct ncclConnector { - struct transportProxyInfo* proxyInfo; - struct ncclTransport* transport; + int connected; + struct ncclProxyArgs *proxyAppend; + struct ncclTransportComm* transportComm; void* transportResources; // Host-side resources struct ncclConnInfo conn; + struct ncclComm *comm; }; #define CACHE_LINE_SIZE 128 #define MEM_ALIGN 4096 -#define SIZES_FIFO_SIZE 32 #define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */ -#define NCCL_LL_CHUNKS 8 #define NUM_LINES_PER_THREAD 8 -#define NCCL_LL_BUFF_SIZE (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS*NCCL_LL_CHUNKS*sizeof(union ncclLLFifoLine)) // 256K -#define NCCL_LL_BUFF_LINES (NCCL_LL_BUFF_SIZE / (2*sizeof(uint64_t))) -#define NCCL_LL_SLICE_LINES (NCCL_LL_BUFF_LINES / NCCL_LL_CHUNKS) +#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) +#define NCCL_LL_BUFF_LINES 
(NCCL_LL_SLICE_LINES*NCCL_STEPS) +#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) #define NCCL_LL_CLEAN_FREQ 0x10000000 struct ncclSendMem { @@ -109,7 +151,7 @@ struct ncclSendMem { char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; void* ptrExchange; char pad2[CACHE_LINE_SIZE-sizeof(void*)]; - uint64_t llHead; + uint64_t opCount; }; char pad3[MEM_ALIGN]; }; @@ -119,37 +161,54 @@ struct ncclRecvMem { union { struct { uint64_t tail; - char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; uint64_t opCount; - char pad4[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[SIZES_FIFO_SIZE]; - int llSizesFifo[SIZES_FIFO_SIZE]; + char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + int sizesFifo[NCCL_STEPS]; }; - char pad5[MEM_ALIGN]; + char pad4[MEM_ALIGN]; }; - char llBuff[NCCL_LL_BUFF_SIZE]; + ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; char buff[1]; // Actually larger than that }; struct ncclRing { + // Shortcuts for userRanks[1] and userRanks[n-1] + int prev; + int next; + + // Maps an internal nccl index to user-specified rank order. This is necessary + // since we need to know how the user expects data to be ordered across + // devices. Ordered from current device. + int* userRanks; + int* devUserRanks; +}; + +#define NCCL_MAX_TREE_ARITY 3 +struct ncclTree { + int depth; + int up; + int down[NCCL_MAX_TREE_ARITY]; +}; + +struct ncclPeer { + struct ncclConnector send; + struct ncclConnector recv; +}; + +struct ncclChannel { union { struct { + struct ncclRing ring; + struct ncclTree tree; + int id; int nthreads; - // Per ring resources - struct ncclSendMem* devMemSend; // CUDA-size resources - struct ncclRecvMem* devMemRecv; // CUDA-size resources int buffSize; - int devMemSendSize; // Keep the size for IPCs - int devMemRecvSize; // Keep the size for IPCs - struct ncclConnector send; - struct ncclConnector recv; - // Maps an internal nccl index to user-specified rank order. 
This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. - int* userRanks; - int* devUserRanks; + // Communication structures + struct ncclPeer* peers; + struct ncclPeer* devPeers; // Operation list for aggregation struct ncclColl* collectives; @@ -162,7 +221,7 @@ struct ncclRing { int data[0x80]; }; }; -static_assert(sizeof(struct ncclRing) == 0x80*sizeof(int), "ncclRing must have a pow2 size"); +static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); /* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ /* to make sure reads to host from the CUDA kernel are aligned. */ @@ -179,7 +238,7 @@ struct CollectiveArgs { size_t N; uint32_t root; uint8_t bid; - uint8_t nRings; + uint8_t nChannels; uint16_t nThreads; int lastChunkSize; @@ -188,7 +247,6 @@ struct ncclColl { union { struct { struct CollectiveArgs args; - uint16_t nThreads; uint16_t funcIndex; uint16_t nextIndex; uint8_t active; @@ -199,11 +257,16 @@ struct ncclColl { static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); struct ncclComm { - struct ncclRing rings[MAXRINGS]; + struct ncclChannel channels[MAXCHANNELS]; + + struct ncclPeerInfo* peerInfo; + + void* bootstrap; int rank; // my rank in the communicator int nRanks; // number of GPUs in communicator int cudaDev; // my cuda device index + int nvmlDev; // my NVML device number enum { GROUP, PARALLEL } launchMode; cudaStream_t userStream; @@ -215,18 +278,31 @@ struct ncclComm { // where syncs are not symmetric). 
uint64_t opCount; - // Rings for collectives - int nRings; + // Channels for collectives + int nChannels; int nThreads; // Low-latency algorithm threshold ssize_t llThreshold; ssize_t threadThreshold; + // Tree algorithm threshold + ssize_t treeThreshold; + // An internal CUDA stream for NCCL kernel CGMD launches int groupCudaStream; cudaStream_t groupStream; + // Whether there has been a fatal error in this communicator. + ncclResult_t fatalError; + + // Error reported by GPU + volatile ncclDevError_t* fatalDevError; + + // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped) + // On device: this pointer has been obtained from cudaHostGetDevicePointer() + volatile uint32_t *abortFlag; + // Device copy of the communicator struct ncclComm *devComm; @@ -244,6 +320,10 @@ struct ncclComm { int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not struct ncclColl args; void* argsptr; + + // Global proxy thread + pthread_t proxyThread; + struct ncclProxyState proxyState; }; // Check CUDA calls @@ -324,6 +404,28 @@ struct ncclComm { #endif // end PROFAPI int ncclCudaCompCap(); +ncclResult_t ncclNvlinkGpu(int* nvlink); +int64_t ncclTreeThreshold(); + +static __inline__ int ncclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + return 1; + case ncclFloat16: + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; + } +} #include static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { diff --git a/projects/rccl/src/include/cpuset.h b/projects/rccl/src/include/cpuset.h new file mode 100644 index 0000000000..f70d1d8090 --- /dev/null +++ b/projects/rccl/src/include/cpuset.h @@ -0,0 +1,61 @@ +/************************************************************************* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_CPUSET_H_ +#define NCCL_CPUSET_H_ + +// Convert local_cpus, e.g. 0003ff,f0003fff to cpu_set_t + +static int hexToInt(char c) { + int v = c - '0'; + if (v < 0) return -1; + if (v > 9) v = 10 + c - 'a'; + if ((v < 0) || (v > 15)) return -1; + return v; +} + +#define CPU_SET_N_U32 (sizeof(cpu_set_t)/sizeof(uint32_t)) + +ncclResult_t ncclStrToCpuset(char* str, cpu_set_t* mask) { + uint32_t cpumasks[CPU_SET_N_U32]; + int m = CPU_SET_N_U32-1; + cpumasks[m] = 0; + for (int o=0; o=0; o--) { + if (c == 0 && m8[o] == 0) continue; + sprintf(str+c, "%02x", m8[o]); + c+=2; + if (o && o%4 == 0) { + sprintf(str+c, ","); + c++; + } + } + str[c] = '\0'; + return ncclSuccess; +} + +#endif diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h index 55dee1838c..3acdf8c28a 100644 --- a/projects/rccl/src/include/debug.h +++ b/projects/rccl/src/include/debug.h @@ -25,6 +25,7 @@ extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; extern ncclResult_t getHostName(char* hostname, int maxlen); +extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev); extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); diff --git a/projects/rccl/src/include/enqueue.h b/projects/rccl/src/include/enqueue.h index 69d0463d99..4db7094c4e 100644 --- a/projects/rccl/src/include/enqueue.h +++ b/projects/rccl/src/include/enqueue.h @@ -10,12 +10,7 @@ #include "core.h" #include "group.h" -typedef ncclResult_t(*ncclFunc_t)(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t type, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); - -ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff, - void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int 
root, - ncclComm_t comm, cudaStream_t stream); +ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast); ncclResult_t ncclCpuBarrierLast(ncclComm_t comm); ncclResult_t ncclCpuBarrierOut(ncclComm_t comm); diff --git a/projects/rccl/src/include/nccl_net.h b/projects/rccl/src/include/nccl_net.h index ce3f6cab6d..89edbf5024 100644 --- a/projects/rccl/src/include/nccl_net.h +++ b/projects/rccl/src/include/nccl_net.h @@ -58,8 +58,50 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v1_t; -typedef ncclNet_v1_t ncclNet_t; +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Return the device path in /sys. NCCL will call free on this path. + ncclResult_t (*pciPath)(int dev, char** path); + // Return whether this device supports host pointers and/or CUDA pointers + // as data from the current GPU. Supported types should be composed with + // NCCL_PTR_HOST and NCCL_PTR_CUDA. + ncclResult_t (*ptrSupport)(int dev, int* supportedTypes); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connectHandle + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. 
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); + // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*flush)(void* recvComm, void* data, int size, void* mhandle); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v2_t; -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v1 +typedef ncclNet_v2_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v2 #endif // end include guard diff --git a/projects/rccl/src/include/net.h b/projects/rccl/src/include/net.h index ebc967782c..e75e6bbfe2 100644 --- a/projects/rccl/src/include/net.h +++ b/projects/rccl/src/include/net.h @@ -26,9 +26,11 @@ static ncclResult_t ncclNetPtrSupport(int dev, int* supportedTypes) { NCCLCHECK( static ncclResult_t ncclNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclNet->listen(dev, handle, listenComm)); return ncclSuccess; } static ncclResult_t ncclNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclNet->connect(dev, handle, sendComm)); return ncclSuccess; } static ncclResult_t ncclNetAccept(void* 
listenComm, void** recvComm) { NCCLCHECK(ncclNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, type, request)); return ncclSuccess; } -static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, int type, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, type, request)); return ncclSuccess; } -static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size) { NCCLCHECK(ncclNet->flush(recvComm, data, size)); return ncclSuccess; } +static ncclResult_t ncclNetRegMr(void* comm, void* data, int size, int type, void** mhandle) { NCCLCHECK(ncclNet->regMr(comm, data, size, type, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetDeregMr(void* comm, void* mhandle) { NCCLCHECK(ncclNet->deregMr(comm, mhandle)); return ncclSuccess; } +static ncclResult_t ncclNetIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->isend(sendComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { NCCLCHECK(ncclNet->irecv(recvComm, data, size, mhandle, request)); return ncclSuccess; } +static ncclResult_t ncclNetFlush(void* recvComm, void* data, int size, void* mhandle) { NCCLCHECK(ncclNet->flush(recvComm, data, size, mhandle)); return ncclSuccess; } static ncclResult_t ncclNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclNet->test(request, done, size)); return ncclSuccess; } static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeSend(sendComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } diff --git a/projects/rccl/src/include/nvlink.h b/projects/rccl/src/include/nvlink.h index 7eb74c9f91..1baf9e536f 100644 --- 
a/projects/rccl/src/include/nvlink.h +++ b/projects/rccl/src/include/nvlink.h @@ -67,18 +67,15 @@ static int getNvlinkGpu(const char* busId1, const char* busId2) { if (res != ncclSuccess) return 0; for(int l=0; l 6 ? 6 : 4; - for(int l=0; lifa_addr, salen); - INFO(NCCL_INIT|NCCL_NET,"NET : Using interface %s:%s", interface->ifa_name, socketToString(interface->ifa_addr, line)); found++; } } @@ -336,8 +337,10 @@ static ncclResult_t createListenSocket(int *fd, union socketAddress *localAddr) TRACE(NCCL_INIT|NCCL_NET,"Listening on socket %s", socketToString(&localAddr->sa, line)); #endif - /* Put the socket in listen mode */ - SYSCHECK(listen(sockfd, 128), "listen"); + /* Put the socket in listen mode + * NB: The backlog will be silently truncated to the value in /proc/sys/net/core/somaxconn + */ + SYSCHECK(listen(sockfd, 16384), "listen"); *fd = sockfd; return ncclSuccess; } diff --git a/projects/rccl/src/include/transport.h b/projects/rccl/src/include/transport.h index 59f83c9a88..6231a71438 100644 --- a/projects/rccl/src/include/transport.h +++ b/projects/rccl/src/include/transport.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -9,6 +9,7 @@ #include "nccl.h" #include +#include "nvmlwrap.h" #define NTRANSPORTS 3 @@ -19,11 +20,13 @@ struct ncclRing; struct ncclConnector; struct ncclComm; -#define RANK_INFO_SIZE 64 -typedef char ncclTinfo_t[RANK_INFO_SIZE]; - -struct ncclInfo { - ncclTinfo_t tinfo[NTRANSPORTS]; +struct ncclPeerInfo { + int rank; + int cudaDev; + int nvmlDev; + uint64_t hostHash; + uint64_t pidHash; + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; }; // Used to hold the transport connection values @@ -34,18 +37,47 @@ struct ncclConnect { char data[CONNECT_SIZE]; }; +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone }; + +struct ncclProxyArgs; +typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); + struct ncclProxyArgs { - struct ncclRing* ring; - int substeps; + proxyProgressFunc_t progress; + struct ncclChannel* channel; + struct ncclConnector* connector; + int sliceSteps; + int chunkSteps; int nsteps; uint64_t opCount; int llMode; - bool needProxy; - int active; // add component before this line -- it is left out during initialization + int state; // add component before this line -- it is left out during initialization + + // Internal state + uint64_t head; + uint64_t tail; + uint64_t end; + void* requests[NCCL_STEPS]; + int idle; + + // Element linking + pthread_mutex_t mutex; + struct ncclProxyArgs* next; + struct ncclProxyArgs* nextPeer; +}; + +struct ncclProxyPool; +struct ncclProxyState { + pthread_cond_t cond; + pthread_mutex_t mutex; + bool stop; + struct ncclProxyArgs* ops; + struct ncclProxyArgs* pool; + struct ncclProxyPool* pools; }; struct ncclTransportComm { - ncclResult_t (*setup)(ncclTinfo_t*, ncclTinfo_t*, struct ncclConnect*, struct ncclRing*); + ncclResult_t (*setup)(struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int buffSize, int 
channelId); ncclResult_t (*connect)(struct ncclConnect*, struct ncclConnector*); ncclResult_t (*free)(void*); ncclResult_t (*proxy)(struct ncclProxyArgs*); @@ -53,8 +85,7 @@ struct ncclTransportComm { struct ncclTransport { const char name[4]; - ncclResult_t (*fillInfo)(ncclTinfo_t*, int); - ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*); + ncclResult_t (*canConnect)(ncclTvalue_t*, struct ncclPeerInfo*, struct ncclPeerInfo*); ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*); struct ncclTransportComm send; struct ncclTransportComm recv; @@ -64,37 +95,17 @@ struct ncclTransport { typedef ncclResult_t (*threadFunc_t)(struct ncclProxyArgs*); -#define TRANSPORT_PROXY_FIFO_SIZE NCCL_MAX_OPS - -struct transportProxyInfo { - struct ncclComm* comm; - pthread_t thread; - threadFunc_t func; - volatile int proxyReady; - struct ncclProxyArgs argsFifo[TRANSPORT_PROXY_FIFO_SIZE]; - volatile uint64_t argsFifoHead; - volatile uint64_t argsFifoTail; - pthread_cond_t cond; - pthread_mutex_t mutex; -}; - -ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm); -ncclResult_t transportDestroyProxy(struct ncclConnector* connector); - enum proxyMode { proxyRing = 0, proxyFrom = 1, proxyTo = 2 }; -static int proxyPatternRing = proxyRing; -static inline int proxyPatternFrom(int root) { return 1+root; } -static inline int proxyPatternTo(int root) { return -1-root; } -static inline enum proxyMode proxyPatternMode(int pattern) { return (pattern == 0) ? proxyRing : ((pattern > 0) ? proxyFrom : proxyTo); } -static inline int proxyPatternRoot(int pattern) { return (pattern > 0) ? 
pattern-1 : -pattern-1; } - -ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t size, int pattern, struct ncclComm* comm); -ncclResult_t transportStartProxies(struct ncclComm* comm); +ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr); +ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks); +ncclResult_t transportStartProxy(struct ncclComm* comm); +ncclResult_t transportCreateProxy(struct ncclComm* comm); +ncclResult_t transportDestroyProxy(struct ncclComm* comm); #include diff --git a/projects/rccl/src/include/trees.h b/projects/rccl/src/include/trees.h new file mode 100644 index 0000000000..1a151d1388 --- /dev/null +++ b/projects/rccl/src/include/trees.h @@ -0,0 +1,13 @@ +/************************************************************************* + * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_TREES_H_ +#define NCCL_TREES_H_ + +ncclResult_t ncclGetBtree(int nranks, int rank, int* u0, int* d1, int* d0); +ncclResult_t ncclGetDtree(int nranks, int rank, int* u0, int* d0_0, int* d0_1, int* u1, int* d1_0, int* d1_1); + +#endif diff --git a/projects/rccl/src/init.cu b/projects/rccl/src/init.cu index 9d0188edd7..75822e60bd 100644 --- a/projects/rccl/src/init.cu +++ b/projects/rccl/src/init.cu @@ -1,21 +1,26 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "nccl.h" #include "core.h" -#include "ring.h" +#include "channel.h" #include "param.h" #include "nvmlwrap.h" #include "rings.h" +#include "trees.h" #include "bootstrap.h" #include "transport.h" -#include "common_coll.h" #include "group.h" #include "utils.h" #include "net.h" +#include "checks.h" +#include "enqueue.h" +#include "topo.h" +#include "nvlink.h" +#include "cpuset.h" #include #include #include @@ -54,6 +59,16 @@ NCCL_PARAM(CheckPointers, "CHECK_POINTERS", 0); ncclNet_t* ncclNet = NULL; +// We define this as weak to let tests redefine their own +#pragma weak ncclNvlinkGpu +ncclResult_t ncclNvlinkGpu(int* nvlink) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); + *nvlink = getNvlinkGpu(busId, NULL); + return ncclSuccess; +} // We define this as weak to let tests redefine their own #pragma weak ncclCudaCompCap int ncclCudaCompCap() { @@ -77,10 +92,7 @@ ncclResult_t initNet(ncclNet_t* net) { int ndev; if (net->init(ncclDebugLog) != ncclSuccess) return ncclInternalError; if (net->devices(&ndev) != ncclSuccess) return ncclInternalError; - if (ndev <= 0) { - INFO(NCCL_INIT|NCCL_NET, "Net/%s: call to devices() returned 0 devices.", net->name); - return ncclSystemError; - } + if (ndev <= 0) return ncclSystemError; return ncclSuccess; } @@ -91,15 +103,15 @@ ncclResult_t initNetPlugin(ncclNet_t** net) { // string, so checking errno doesn't hurt to try to provide a better // error message if (errno == ENOENT) { - INFO(NCCL_INIT|NCCL_NET, "No network plugin found."); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : No plugin found (libnccl-net.so)."); } else { - INFO(NCCL_INIT|NCCL_NET, "Unable to load libnccl-net.so : %s", dlerror()); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin : Plugin load returned %d : %s.", 
errno, dlerror()); } return ncclSuccess; } ncclNet_t* extNet = (ncclNet_t*) dlsym(netPluginLib, STR(NCCL_PLUGIN_SYMBOL)); if (extNet == NULL) { - INFO(NCCL_INIT|NCCL_NET, "NetPlugin: could not find " STR(NCCL_PLUGIN_SYMBOL) " symbol"); + INFO(NCCL_INIT|NCCL_NET, "NET/Plugin: Failed to find " STR(NCCL_PLUGIN_SYMBOL) " symbol."); goto cleanup; } if (initNet(extNet) == ncclSuccess) { @@ -116,21 +128,18 @@ ncclResult_t initNet() { NCCLCHECK(initNet(&ncclNetSocket)); NCCLCHECK(initNetPlugin(&ncclNet)); - if (ncclNet != NULL) { - INFO(NCCL_INIT|NCCL_NET, "Using network plugin %s", ncclNetName()); - return ncclSuccess; - } + if (ncclNet != NULL) return ncclSuccess; if (initNet(&ncclNetIb) == ncclSuccess) { ncclNet = &ncclNetIb; } else { ncclNet = &ncclNetSocket; } - INFO(NCCL_INIT|NCCL_NET,"Using network %s", ncclNetName()); return ncclSuccess; } NCCL_PARAM(LlThreshold, "LL_THRESHOLD", -2); NCCL_PARAM(ThreadThreshold, "THREAD_THRESHOLD", -2); +NCCL_PARAM(TreeThreshold, "TREE_THRESHOLD", -2); int ncclThreadThreshold(int minCompCap, int multiNode) { int threshold = ncclParamThreadThreshold(); @@ -177,10 +186,15 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + free(comm->peerInfo); + + if (comm->bootstrap) + NCCLCHECK(bootstrapClose(comm->bootstrap)); + CUDACHECK(cudaFree(comm->devComm)); - for (int ring=0; ringnRings; ring++) - NCCLCHECK(freeRing(comm->rings+ring)); + for (int channel=0; channelnChannels; channel++) + NCCLCHECK(freeChannel(comm->channels+channel, comm->nRanks)); if (comm->doneEvent != NULL) CUDACHECK(cudaEventDestroy(comm->doneEvent)); @@ -199,6 +213,8 @@ static ncclResult_t commFree(ncclComm_t comm) { free(comm->intraCGMode); free(comm->intraCC); } + CUDACHECK(cudaFreeHost((void *)comm->abortFlag)); + CUDACHECK(cudaFreeHost((void *)comm->fatalDevError)); free(comm); return ncclSuccess; @@ -222,12 +238,15 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { struct ncclComm* comm; 
NCCLCHECK(ncclCalloc(&comm, 1)); - INFO(NCCL_INIT,"comm %p rank %d nranks %d", comm, rank, ndev); comm->rank = rank; comm->nRanks = ndev; cudaGetDevice(&comm->cudaDev); + getNvmlDevice(comm->cudaDev, &comm->nvmlDev); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); + comm->doneEvent = doneEvent; comm->llThreshold = ncclParamLlThreshold(); + comm->treeThreshold = ncclParamTreeThreshold(); comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; #if CUDART_VERSION >= 9200 comm->groupCudaStream = ncclParamGroupCudaStream(); @@ -235,6 +254,13 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { // Don't allow the user to overload the default setting in older CUDA builds comm->groupCudaStream = NCCL_GROUP_CUDA_STREAM; #endif + comm->fatalError = ncclSuccess; + + CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped)); + *comm->fatalDevError = ncclDevSuccess; + + CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped)); + *comm->abortFlag = 0; comm->argsptr = &comm->args; @@ -248,9 +274,18 @@ static ncclResult_t devCommSetup(ncclComm_t comm) { // Copy the comm on the device NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1)); // Copy userRanks - for (int r=0; rnRings; r++) { - NCCLCHECK(ncclCudaMemcpy(comm->rings[r].devUserRanks, comm->rings[r].userRanks, comm->nRanks)); + for (int r=0; rnChannels; r++) { + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); + NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks)); } + // Copy the device-accessible pointer to comm->abortFlag + void *devAbortFlag; + CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0)); + CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice)); + // Copy the 
device-accessible pointer to comm->fatalDevError + void *devFatalError; + CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0)); + CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice)); return ncclSuccess; } @@ -267,35 +302,81 @@ static void showVersion() { } } -static ncclResult_t fillInfo(struct ncclInfo* info, int rank) { - for (int t=0; ttinfo+t, rank)); - } +static ncclResult_t fillInfo(struct ncclPeerInfo* info, int rank) { + info->rank = rank; + CUDACHECK(cudaGetDevice(&info->cudaDev)); + NCCLCHECK(getNvmlDevice(info->cudaDev, &info->nvmlDev)) + info->hostHash=getHostHash(); + info->pidHash=getPidHash(); + + // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the + // cudaDev is a CUDA runtime dev number which could be different from the + // NVML device number. Then we get the busID from NVML to be sure it is + // consistent with NVML remote PCI bus Ids. + CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev)); + nvmlDevice_t nvmlDevice; + NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice)); + nvmlPciInfo_t pciInfo; + NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo)); + strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE); return ncclSuccess; } template -static ncclResult_t selectTransport(struct ncclInfo* myInfo, struct ncclInfo* peerInfo, struct ncclConnect* connect, struct ncclTransport** transportRet, struct ncclRing* ring) { +static ncclResult_t selectTransport(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int buffSize, int channelId) { for (int t=0; tsend : &transport->recv; ncclTvalue_t ret = 0; - NCCLCHECK(transport->canConnect(&ret, myInfo->tinfo+t, peerInfo->tinfo+t)); + NCCLCHECK(transport->canConnect(&ret, myInfo, peerInfo)); if (ret > 0) { - 
NCCLCHECK(transportComm->setup(myInfo->tinfo+t, peerInfo->tinfo+t, connect, ring)); - *transportRet = transport; + connector->transportComm = transportComm; + NCCLCHECK(transportComm->setup(myInfo, peerInfo, connect, connector, buffSize, channelId)); return ncclSuccess; } } WARN("No transport found !"); - *transportRet = NULL; return ncclInternalError; } -static ncclResult_t setupRing(struct ncclComm* comm, int ringid, int rank, int nranks, int* ringRanks, struct ncclInfo* allInfo, struct ncclConnect* connect) { - NCCLCHECK(initRing(comm, ringid)); +static int log2(int n) { + int l = 0; + while (n>>=1) l++; + return l; +} + +static ncclResult_t ncclTreeThreshold(int nnodes, int nranks, int nChannels, ssize_t *treeThreshold) { + int nvlink; + NCCLCHECK(ncclNvlinkGpu(&nvlink)); + float ringbw = nvlink ? 5000*nChannels : 5000; // approx, in MB/s or B/us + float ringlatinter = 6; + float treelatintra = 4; + float treelatinter = 15; + float treebw; + if (!nvlink) { + treebw = ringbw * 2 / 3; + } else { + treebw = ringbw * 3 / 4; + if (nnodes == 2) treebw *= 2; + } + float ringlat = ringlatinter*(nranks-1); + float treelat = treelatinter*log2(nnodes)+treelatintra*(nranks/nnodes-1); + if (nnodes < 2 || ringlat <= treelat) + *treeThreshold = 0; + else if (treebw > ringbw) + *treeThreshold = 0x7fffffffffffffff; + else + *treeThreshold = (ssize_t)(((ringbw*treebw/(ringbw-treebw)))*(ringlat-treelat)); + return ncclSuccess; +} + +static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, int nranks, int* ringRanks, int* treeMasters) { + TRACE(NCCL_INIT, "rank %d nranks %d", rank, nranks); + NCCLCHECK(initChannel(comm, channelId)); + + struct ncclChannel* channel = comm->channels+channelId; + struct ncclRing* ring = &channel->ring; - struct ncclRing* ring = comm->rings+ringid; // Reorganize ranks to start with rank. 
int shift; for (shift = 0; shiftuserRanks[i] = ringRanks[(i+shift)%nranks]; } - int prev = ring->userRanks[nranks-1]; - int next = ring->userRanks[1]; + int prev = ring->prev = ring->userRanks[nranks-1]; + int next = ring->next = ring->userRanks[1]; - NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+0, &ring->recv.transport, ring)); - NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+1, &ring->send.transport, ring)); - NCCLCHECK(transportCreateProxy(0, ring, comm)); - NCCLCHECK(transportCreateProxy(1, ring, comm)); + struct ncclTree* tree = &channel->tree; + tree->up = -1; + tree->down[0] = tree->down[1] = tree->down[2] = -1; + + // + // Find per-node masters and connect them via a binary tree + // + + int nMasters = 0; + for (int r=0; rtreeThreshold == -2) + NCCLCHECK(ncclTreeThreshold(nMasters, comm->nRanks, comm->nChannels, &comm->treeThreshold)); + + if (comm->treeThreshold > 0) { + // Compute tree depth. Not an exact value but a good approximation in most + // cases and consistent across nodes + tree->depth = nranks/nMasters + log2(nMasters); + + // Find my master : go backwards in the ring to find my root + int master = 0; + for (int i = 0; iuserRanks[(nranks-i)%nranks]; + if (treeMasters[r]) { + master = r; + break; + } + } + + int ranks[nMasters]; + int i = 0, masterIndex = -1; + // Build binary tree + for (int r=0; rnChannels, 2)) { + btreeUp = u0; btreeDown0 = d0_0; btreeDown1 = d0_1; + } else { + btreeUp = u1; btreeDown0 = d1_0; btreeDown1 = d1_1; + } + + // + // Now build the full tree, combining the intra-node ring and the + // inter-node binary tree. 
+ // + + if (rank == master) { + int nDown = 0; + if (btreeUp != -1) tree->up = ranks[btreeUp]; + if (treeMasters[next] == 0) tree->down[nDown++] = next; + if (btreeDown0 != -1) tree->down[nDown++] = ranks[btreeDown0]; + if (btreeDown1 != -1) tree->down[nDown++] = ranks[btreeDown1]; + } else { + tree->up = prev; + if (treeMasters[next] == 0) tree->down[0] = next; + } + } + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; } -static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { +static ncclResult_t fillConnect(struct ncclPeerInfo* peerInfo, int nranks, int rank, int* connectTransport, ncclTvalue_t* connectValue) { for (int r=0; r 0) { connectTransport[r] = t; break; @@ -330,11 +475,6 @@ static ncclResult_t fillConnect(struct ncclInfo* allInfo, int nranks, int rank, return ncclSuccess; } -static void swap(void* mem1, void* mem2, int size) { - char tmp[size]; - memcpy(tmp, mem1, size); memcpy(mem1, mem2, size); memcpy(mem2, tmp, size); -} - #define MAXWIDTH 20 #define PREFIXLEN 15 #define STRLENGTH (PREFIXLEN+5*MAXWIDTH) @@ -380,9 +520,9 @@ void dumpLine(int* values, int nranks, const char* prefix) { static ncclResult_t buildRings(int nrings, int* rings, int rank, int nranks, int* prev, int* next) { for (int r=0; rpeers[peer].recv; + if (conn->connected) { ++nSkippedRecv; continue; } + NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; ipeers[peer].send; + if (conn->connected) { ++nSkippedSend; continue; } + NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); + NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + } + for (int i=0; ipeers[peer].send; + if 
(conn->connected) {++nSkippedSend; continue; } + NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + NCCLCHECK(conn->transportComm->connect(&connect, conn)); + conn->connected = 1; + } + for (int i=0; ipeers[peer].recv; + if (conn->connected) {++nSkippedRecv; continue; } + NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); + NCCLCHECK(conn->transportComm->connect(&connect, conn)); + conn->connected = 1; + } + TRACE(NCCL_INIT, "nsend %d nrecv %d nSkippedSend %u nSkippedRecv %u - DONE", nsend, nrecv, nSkippedSend, nSkippedRecv); + return ncclSuccess; +} + static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) { + // We use 3 AllGathers + // 1. { peerInfo, comm } + // 2. ConnectTransport[nranks], ConnectValue[nranks] + // 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] } + int rank = comm->rank; int nranks = comm->nRanks; - void* commState; - NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState)); + TRACE(NCCL_INIT, "rank %d nranks %d - BEGIN", rank, nranks); + NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); - struct ncclInfo* allInfo; - NCCLCHECK(ncclCalloc(&allInfo, nranks)); - NCCLCHECK(fillInfo(allInfo+rank, rank)); - NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo))); + // AllGather1 - begin + struct { + struct ncclPeerInfo peerInfo; + struct ncclComm* comm; + } *allGather1Data; + + NCCLCHECK(ncclCalloc(&allGather1Data, nranks)); + allGather1Data[rank].comm = comm; + NCCLCHECK(fillInfo(&allGather1Data[rank].peerInfo, rank)); + NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather1Data, sizeof(*allGather1Data))); + + NCCLCHECK(ncclCalloc(&comm->peerInfo, nranks)); + for (int i = 0; i < nranks; i++) { + memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); + } + // AllGather1 data is used again below + // AllGather1 - end + + // AllGather2 - begin + size_t 
allGather2DataRowSize = sizeof(int)*nranks + sizeof(ncclTvalue_t)*nranks; + void *allGather2Data; + NCCLCHECK(ncclCalloc((char **)&allGather2Data, allGather2DataRowSize*nranks)); + int *myTransportRow = (int *)((char *)allGather2Data + allGather2DataRowSize*rank); + ncclTvalue_t *myValueRow = (ncclTvalue_t *)(myTransportRow + nranks); + + NCCLCHECK(fillConnect(comm->peerInfo, nranks, rank, myTransportRow, myValueRow)); + NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather2Data, allGather2DataRowSize)); int* connectTransport; ncclTvalue_t* connectValue; NCCLCHECK(ncclCalloc(&connectTransport, nranks*nranks)); NCCLCHECK(ncclCalloc(&connectValue, nranks*nranks)); + for (int i = 0; i < nranks; i++) { + memcpy(connectTransport + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize, sizeof(int)*nranks); + memcpy(connectValue + i*nranks, (char *)allGather2Data + i*allGather2DataRowSize + nranks*sizeof(int), sizeof(ncclTvalue_t)*nranks); + } + free(allGather2Data); + // AllGather2 - end - NCCLCHECK(fillConnect(allInfo, nranks, rank, connectTransport+nranks*rank, connectValue+nranks*rank)); - NCCLCHECK(bootstrapAllGather(commState, connectTransport, nranks*(sizeof(int)))); - NCCLCHECK(bootstrapAllGather(commState, connectValue, nranks*(sizeof(ncclTvalue_t)))); //if (rank == 0) dumpMatrix(connectTransport, nranks); //if (rank == 0) dumpMatrixTvalue(connectValue, nranks); // Get my rings int nrings; - int* prev, *next; - NCCLCHECK(ncclCalloc(&prev, nranks*MAXRINGS)); - NCCLCHECK(ncclCalloc(&next, nranks*MAXRINGS)); + int* prev, *next, *treeIn, *treeOut; + NCCLCHECK(ncclCalloc(&prev, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&next, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeIn, nranks*MAXCHANNELS)); + NCCLCHECK(ncclCalloc(&treeOut, nranks*MAXCHANNELS)); comm->nThreads = getDefaultThreads(); - NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, nranks, connectTransport, connectValue, prev, next)); + NCCLCHECK(ncclGetRings(&nrings, &comm->nThreads, rank, 
nranks, connectTransport, connectValue, prev, next, treeIn, treeOut)); + TRACE(NCCL_INIT, "rank %d nranks %d - BUILD %d RINGS", rank, nranks, nrings); + assert(nrings <= MAXCHANNELS); free(connectTransport); free(connectValue); + // AllGather3 - begin + struct { + int nThreads; + int nrings; + int cudaCompCap; + int prev[MAXCHANNELS]; + int next[MAXCHANNELS]; + } *allGather3Data; + + NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); + allGather3Data[rank].nThreads = comm->nThreads; + allGather3Data[rank].nrings = nrings; + allGather3Data[rank].cudaCompCap = ncclCudaCompCap(); + for (int r=0; rbootstrap, allGather3Data, sizeof(*allGather3Data))); + // Find max nThreads - int allData[nranks]; - allData[rank] = comm->nThreads; - NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int))); for (int i=0; inThreads = std::max(allData[i], comm->nThreads); - if (rank == 0) INFO(NCCL_INIT,"Using %d threads", comm->nThreads); + comm->nThreads = std::max(allGather3Data[i].nThreads, comm->nThreads); // Determine the minimum CUDA Compute capability of all GPUs - int myCompCap = ncclCudaCompCap(); + int myCompCap = allGather3Data[rank].cudaCompCap; int minCompCap = myCompCap; - allData[rank] = myCompCap; - NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int))); - for (int i=0; ithreadThreshold = ncclThreadThreshold(minCompCap, nnodes); // Find min nrings across ranks - allData[rank] = nrings; - NCCLCHECK(bootstrapAllGather(commState, allData, sizeof(int))); for (int i=0; inChannels = nrings; - // Exchange data with others to build complete rings - comm->nRings = nrings; - for (int r=0; rrings+r; - NCCLCHECK(setupRing(comm, r, rank, nranks, ringRanks, allInfo, connectData+2*rank)); - int prev_offset = ring->userRanks[nranks-1]*2+1; - int next_offset = ring->userRanks[1]*2; - NCCLCHECK(bootstrapAllGather(commState, connectData, sizeof(struct ncclConnect)*2)); - NCCLCHECK(ring->send.transport->send.connect(connectData+next_offset, &ring->send)); - 
NCCLCHECK(ring->recv.transport->recv.connect(connectData+prev_offset, &ring->recv)); + struct ncclChannel* channel = comm->channels+r; + NCCLCHECK(setupChannel(comm, r, rank, nranks, rings+r*nranks, treeIn+r*nranks)); + NCCLCHECK(p2pSetup(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next)); + NCCLCHECK(p2pSetup(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up)); + NCCLCHECK(p2pSetup(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down)); } - free(connectData); + if (comm->treeThreshold > 0) { + char line[1024]; + line[0]='\0'; + for (int c=0; cchannels[c].tree; + snprintf(line+strlen(line), 1023-strlen(line), " [%d] %d->%d->%d/%d/%d", + c, tree->up, rank, tree->down[0], tree->down[1], tree->down[2]); + } + line[1023] = '\0'; + INFO(NCCL_INIT, "Trees%s", line); + } + if (rank == 0) { + char treeline[64]; + snprintf(treeline, 64, "enabled up to size %ld", comm->treeThreshold); + INFO(NCCL_INIT,"Using %d threads, Min Comp Cap %d, Trees %s", comm->nThreads, minCompCap, + comm->treeThreshold == 0 ? "disabled" : + comm->treeThreshold == 0x7fffffffffffffff ? 
"enabled for all sizes" : + treeline); + } + + TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, nrings); + free(connect); free(rings); - free(allInfo); + free(treeIn); + free(treeOut); - // Intra-process barrier setup - struct rankInfo { - uint64_t hostHash; - uint64_t pidHash; - struct ncclComm* comm; - } rankInfos[nranks]; - rankInfos[rank].hostHash = getHostHash(); - rankInfos[rank].pidHash = getPidHash(); - rankInfos[rank].comm = comm; - NCCLCHECK(bootstrapAllGather(commState, rankInfos, sizeof(struct rankInfo))); - - // Compute intra ranks + // Compute intra ranks (using AllGather1 data) int intraRank0 = -1, intraRank = -1, intraRanks = 0; - int multiNode = 0; - for (int r=0; rthreadThreshold = ncclThreadThreshold(minCompCap, multiNode); + // Done with AllGather1 data + free(allGather1Data); - // Barrier - bootstrapClose(commState); + if (nnodes) NCCLCHECK(transportCreateProxy(comm)); + + TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); return ncclSuccess; } -bool SetCpuAffinity(int cudaDev, nvmlDevice_t* nvmlDevice) { - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; - if (cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev) != cudaSuccess) return false; - if (wrapNvmlDeviceGetHandleByPciBusId(busId, nvmlDevice) != ncclSuccess) return false; - if (wrapNvmlDeviceSetCpuAffinity(*nvmlDevice) != ncclSuccess) { - WARN("Failed to set CPU affinity"); - return false; +static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) { + CPU_ZERO_S(sizeof(cpu_set_t), mask); + char* cudaPath; + NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); + char path[PATH_MAX]; + strncpy(path, cudaPath, PATH_MAX-1); + snprintf(path+strlen(path), PATH_MAX-1-strlen(path), "/local_cpus"); + path[PATH_MAX-1] = '\0'; + int fd; + SYSCHECKVAL(open(path, O_RDONLY), "open", fd); + char affinityStr[sizeof(cpu_set_t)*2]; + int r = read(fd, affinityStr, sizeof(cpu_set_t)*2); + if (r > 0) + NCCLCHECK(ncclStrToCpuset(affinityStr, 
mask)); + close(fd); + free(cudaPath); + return ncclSuccess; +} + +static ncclResult_t setCpuAffinity(int cudaDev) { + // Work within the enveloppe we were provided + cpu_set_t mask; + SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); + + // Find the subpart that is local to our GPU + cpu_set_t gpuMask; + NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask)); + cpu_set_t finalMask; + CPU_AND(&finalMask, &mask, &gpuMask); + + // If those are not disjoint, try to stay local + if (CPU_COUNT(&finalMask)) { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); + INFO(NCCL_INIT, "Setting affinity for GPU %d to %s", cudaDev, affinityStr); + SYSCHECK(sched_setaffinity(0, sizeof(cpu_set_t), &finalMask), "sched_setaffinity"); } - return true; + return ncclSuccess; } ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) { @@ -633,9 +907,8 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId // Make sure all host memory allocation are close to the GPU int cudaDev; - nvmlDevice_t nvmlDevice; CUDACHECK(cudaGetDevice(&cudaDev)); - SetCpuAffinity(cudaDev, &nvmlDevice); + NCCLCHECK(setCpuAffinity(cudaDev)); ncclResult_t res; NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); @@ -645,7 +918,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); NCCLCHECKGOTO(wrapNvmlShutdown(), res, cleanup); - INFO(NCCL_INIT,"comm %p rank %d nranks %d - COMPLETE", *newcomm, myrank, nranks); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->nvmlDev); return ncclSuccess; cleanup: @@ -664,8 +937,6 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm NCCLCHECK(ncclInit()); if (myrank == 0) showVersion(); - INFO(NCCL_INIT,"rank %d nranks %d", 
myrank, nranks); - // Make sure the CUDA runtime is initialized. CUDACHECK(cudaFree(NULL)); @@ -685,7 +956,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm } static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int nranks) { - struct ncclInfo* allInfo; + struct ncclPeerInfo* allInfo; NCCLCHECK(ncclCalloc(&allInfo, nranks)); for (int rank=0; ranknRings = nrings; + comms[rank]->nChannels = nrings; comms[rank]->nThreads = nthreads; comms[rank]->threadThreshold = threadThreshold; } @@ -751,26 +1023,32 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, int* ringRanks = rings+r*nranks; for (int rank=0; rankprev and prevRank->next - struct ncclRing *ring = comms[rank]->rings+r; - int prevRank = ring->userRanks[nranks-1]; - struct ncclConnect* prevRankNextConnect = connect+2*prevRank+1; - struct ncclConnect* rankPrevConnect = connect+2*rank; - swap(prevRankNextConnect, rankPrevConnect, sizeof(struct ncclConnect)); + struct ncclChannel* channel = comms[rank]->channels+r; + struct ncclRing *ring = &channel->ring; + NCCLCHECK(setupChannel(comms[rank], r, rank, nranks, ringRanks, treeIn)); + // Make sure we don't use trees, we cannot use them with initAll + comms[rank]->treeThreshold = 0; + int prev = channel->ring.prev = ring->userRanks[nranks-1]; + int next = channel->ring.next = ring->userRanks[1]; + struct ncclConnector* recv = &channel->peers[prev].recv; + struct ncclConnector* send = &channel->peers[next].send; + NCCLCHECK(selectTransport<0>(allInfo+rank, allInfo+prev, connect+rank*2+0, recv, channel->buffSize, channel->id)); + NCCLCHECK(selectTransport<1>(allInfo+rank, allInfo+next, connect+rank*2+1, send, channel->buffSize, channel->id)); } for (int rank=0; rankrings+r; - NCCLCHECK(ring->send.transport->send.connect(connect+2*rank+1, &ring->send)); - NCCLCHECK(ring->recv.transport->recv.connect(connect+2*rank+0, &ring->recv)); + struct ncclChannel* channel = 
comms[rank]->channels+r; + struct ncclRing *ring = &channel->ring; + struct ncclConnector* recv = &channel->peers[ring->prev].recv; + struct ncclConnector* send = &channel->peers[ring->next].send; + NCCLCHECK(recv->transportComm->connect(connect+ring->prev*2+1, recv)); + NCCLCHECK(send->transportComm->connect(connect+ring->next*2+0, send)); } } - free(rings); free(allInfo); + free(rings); + free(treeIn); + free(treeOut); return ncclSuccess; } @@ -794,7 +1072,6 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { int savedDevice; int rank, cudaDev; ncclComm_t comm = NULL; - nvmlDevice_t nvmlDevice; int ncclDevList[ndev]; for (int i=0; icudaDev; + int rank = comm->rank; if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(commDevice)); } + TRACE(NCCL_INIT, "Destroying comm %p rank %d abortFlag %d fatalError %d", comm, rank, *comm->abortFlag, comm->fatalError); + + CUDACHECK(cudaStreamSynchronize(comm->groupStream)); + NCCLCHECK(transportDestroyProxy(comm)); NCCLCHECK(commFree(comm)); if (savedDevice != commDevice) CUDACHECK(cudaSetDevice(savedDevice)); + INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); + return ncclSuccess; } +NCCL_API(ncclResult_t, ncclCommDestroy, ncclComm_t comm); +ncclResult_t ncclCommDestroy(ncclComm_t comm) { + if (comm == NULL) + return ncclSuccess; + + return commDestroy(comm); +} + +NCCL_API(ncclResult_t, ncclCommAbort, ncclComm_t comm); +ncclResult_t ncclCommAbort(ncclComm_t comm) { + if (comm == NULL) + return ncclSuccess; + + // Ask anything that might still be running on the device to quit + *comm->abortFlag = 1; + + return commDestroy(comm); +} + NCCL_API(const char*, ncclGetErrorString, ncclResult_t code); const char* ncclGetErrorString(ncclResult_t code) { switch (code) { @@ -882,6 +1182,39 @@ const char* ncclGetErrorString(ncclResult_t code) { } } +NCCL_API(ncclResult_t, ncclCommGetAsyncError, ncclComm_t comm, ncclResult_t *asyncError); +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, 
ncclResult_t *asyncError) { + NCCLCHECK(PtrCheck(comm, "ncclGetAsyncError", "comm")); + NCCLCHECK(PtrCheck(asyncError, "ncclGetAsyncError", "asyncError")); + + // Check device reported error + static ncclDevError_t printedDevErr = ncclDevSuccess; + switch(*comm->fatalDevError) { + case ncclDevSuccess : + break; + case ncclDevAssertedMismatch : + if (printedDevErr != ncclDevAssertedMismatch) { + WARN("Mismatched collective detected, please check your collective calls at and around rank %d. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank); + printedDevErr = ncclDevAssertedMismatch; + } + if (comm->fatalError == ncclSuccess) { + comm->fatalError = ncclInvalidUsage; + } + break; + case ncclDevSuspectedMismatch : + if (printedDevErr != ncclDevSuspectedMismatch) { + WARN("Your program may be hanging, this may be caused by a collective mismatch around rank %d. Please check your collective calls at and around this rank. You can use NCCL_DEBUG=INFO and NCCL_DEBUG_SUBSYS=COLL to see the collective logs", comm->rank); + printedDevErr = ncclDevSuspectedMismatch; + } + break; + default: + WARN("Unknown device error %d", *comm->fatalDevError); + return ncclInternalError; + } + *asyncError = comm->fatalError; + return ncclSuccess; +} + NCCL_API(ncclResult_t, ncclCommCount, const ncclComm_t comm, int* count); ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { NCCLCHECK(PtrCheck(comm, "CommCount", "comm")); diff --git a/projects/rccl/src/misc/checks.cu b/projects/rccl/src/misc/checks.cu new file mode 100644 index 0000000000..a07e577b3c --- /dev/null +++ b/projects/rccl/src/misc/checks.cu @@ -0,0 +1,69 @@ +/************************************************************************* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "checks.h" + +static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { + cudaPointerAttributes attr; + cudaError_t err = cudaPointerGetAttributes(&attr, pointer); + if (err != cudaSuccess || attr.devicePointer == NULL) { + WARN("%s : %s is not a valid pointer", opname, ptrname); + return ncclInvalidArgument; + } +#if CUDART_VERSION >= 10000 + if (attr.type == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { +#else + if (attr.memoryType == cudaMemoryTypeDevice && attr.device != comm->cudaDev) { +#endif + WARN("%s : %s allocated on device %d mismatchs with NCCL device %d", opname, ptrname, attr.device, comm->cudaDev); + return ncclInvalidArgument; + } + return ncclSuccess; +} + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname) { + if (ptr == NULL) { + WARN("%s : %s argument is NULL", opname, ptrname); + return ncclInvalidArgument; + } + return ncclSuccess; +} + +ncclResult_t ArgsCheck(struct ncclInfo* info) { + NCCLCHECK(PtrCheck(info->comm, info->opName, "comm")); + // First, the easy ones + if (info->root < 0 || info->root >= info->comm->nRanks) { + WARN("%s : invalid root %d (root should be in the 0..%d range)", info->opName, info->root, info->comm->nRanks); + return ncclInvalidArgument; + } + if (info->datatype < 0 || info->datatype >= ncclNumTypes) { + WARN("%s : invalid type %d", info->opName, info->datatype); + return ncclInvalidArgument; + } + // Type is OK, compute nbytes. Convert Allgather/Broadcast calls to chars. 
+ info->nBytes = info->count * ncclTypeSize(info->datatype); + if (info->coll == ncclCollAllGather || info->coll == ncclCollBroadcast) { + info->count = info->nBytes; + info->datatype = ncclInt8; + } + if (info->coll == ncclCollAllGather || info->coll == ncclCollReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank + + if (info->op < 0 || info->op >= ncclNumOps) { + WARN("%s : invalid reduction operation %d", info->opName, info->op); + return ncclInvalidArgument; + } + + if (info->comm->checkPointers) { + // Check CUDA device pointers + if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) { + NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName)); + } + if (info->coll != ncclCollReduce || info->comm->rank == info->root) { + NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName)); + } + } + return ncclSuccess; +} diff --git a/projects/rccl/src/misc/enqueue.cu b/projects/rccl/src/misc/enqueue.cu deleted file mode 100644 index 80846dd656..0000000000 --- a/projects/rccl/src/misc/enqueue.cu +++ /dev/null @@ -1,248 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "enqueue.h" -#include "common_coll.h" -#include "param.h" - -#include "collectives/collectives.h" - -#define NCCL_FUNC4(coll, op, dtype) \ - (void*)NCCL_KERN_NAME(coll, op, dtype), \ - (void*)NCCL_KERN_NAME(coll##LL, op, dtype) - -// Must be consistent with ncclDataType_t -#define NCCL_FUNCS3A(coll, op) \ - (void*)NCCL_FUNC4(coll, op, i8), \ - (void*)NCCL_FUNC4(coll, op, u8), \ - (void*)NCCL_FUNC4(coll, op, i32), \ - (void*)NCCL_FUNC4(coll, op, u32), \ - (void*)NCCL_FUNC4(coll, op, i64), \ - (void*)NCCL_FUNC4(coll, op, u64), \ - (void*)NCCL_FUNC4(coll, op, f16), \ - (void*)NCCL_FUNC4(coll, op, f32), \ - (void*)NCCL_FUNC4(coll, op, f64) -#define NCCL_FUNCS3B(coll, op) \ - (void*)NCCL_FUNC4(coll, op, i8), \ - (void*)NCCL_FUNC4(coll, op, i8), \ - (void*)NCCL_FUNC4(coll, op, i8), \ - (void*)NCCL_FUNC4(coll, op, i8), \ - (void*)NCCL_FUNC4(coll, op, i8), \ - (void*)NCCL_FUNC4(coll, op, i8), \ - (void*)NCCL_FUNC4(coll, op, i8), \ - (void*)NCCL_FUNC4(coll, op, i8), \ - (void*)NCCL_FUNC4(coll, op, i8) - -// Must be consistent with ncclRedOp_t -#define NCCL_FUNCS2A(coll) \ - NCCL_FUNCS3A(coll, sum ), \ - NCCL_FUNCS3A(coll, prod), \ - NCCL_FUNCS3A(coll, max ), \ - NCCL_FUNCS3A(coll, min ) -#define NCCL_FUNCS2B(coll) \ - NCCL_FUNCS3B(coll, copy), \ - NCCL_FUNCS3B(coll, copy), \ - NCCL_FUNCS3B(coll, copy), \ - NCCL_FUNCS3B(coll, copy) - -// Must be consistent with the ncclFuncSet enum -static void* const ncclKerns[ncclCollCount*ncclNumOps*ncclNumTypes*2] = { - NCCL_FUNCS2B(ncclBroadcast), - NCCL_FUNCS2A(ncclReduce), - NCCL_FUNCS2B(ncclAllGather), - NCCL_FUNCS2A(ncclReduceScatter), - NCCL_FUNCS2A(ncclAllReduce) -}; - -ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *paramsList, int* cudaDevs, int numDevices, int cgMode) { -#if CUDART_VERSION >= 9000 - if (cgMode & 0x01) { - 
CUDACHECK(cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices, - // These flags are to reduce the latency of using this API - cudaCooperativeLaunchMultiDeviceNoPreSync|cudaCooperativeLaunchMultiDeviceNoPostSync)); - return ncclSuccess; - } -#endif - int savedDev; - CUDACHECK(cudaGetDevice(&savedDev)); - for (int i = 0; i < numDevices; i++) { - struct cudaLaunchParams* params = paramsList+i; - CUDACHECK(cudaSetDevice(cudaDevs[i])); - CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); - } - CUDACHECK(cudaSetDevice(savedDev)); - return ncclSuccess; -} - -ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) { - params->gridDim.x = std::min((int) params->gridDim.x, comm->nRings); - - // Set active = 2 for the last operation - for (int r=0; rgridDim.x; r++) { - struct ncclRing* ring = comm->rings+r; - ring->collectives[(ring->collStart+ring->collCount-1)%NCCL_MAX_OPS].active = 2; - } - - // Find the first operation, choose the kernel accordingly and pass it - // as the first argument. - struct ncclColl* coll = comm->rings[0].collectives+comm->rings[0].collStart; - memcpy(&comm->args, coll, sizeof(struct ncclColl)); - // As we pass that coll directly, we can free it immediately. - coll->active = 0; - - params->func = ncclKerns[coll->funcIndex]; - return ncclSuccess; -} - -ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - int val = *ptr; - bool done = false; - while (done == false) { - if (val >= comm->intraRanks) { - WARN("Trying to launch too many collectives"); - return ncclInvalidUsage; - } - if (val+1 == comm->intraRanks) { - // Reset the barrier. 
- comm->intraBarrier[comm->intraPhase^1] = 0; - *isLast = 1; - return ncclSuccess; - } - done = __sync_bool_compare_and_swap(ptr, val, val+1); - val++; - } - *isLast = 0; - return ncclSuccess; -} - -ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - int val = *ptr; - if (__sync_bool_compare_and_swap(ptr, val, val+1) != true) { - WARN("Trying to launch too many collectives"); - return ncclInternalError; - } - return ncclSuccess; -} - -ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { - volatile int* ptr = (volatile int*)(comm->intraBarrier+comm->intraPhase); - while (*ptr < comm->intraRanks) pthread_yield(); - comm->intraPhase ^= 1; - return ncclSuccess; -} - -ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { - if (comm->nRanks == 1) return ncclSuccess; - struct cudaLaunchParams* params = comm->myParams; - - NCCLCHECK(setupLaunch(comm, params)); - - // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { - // Enqueue event in user stream - CUDACHECK(cudaEventRecord(comm->doneEvent, comm->userStream)); - // Create dependency between user stream and internal NCCL stream - CUDACHECK(cudaStreamWaitEvent(comm->groupStream, comm->doneEvent, 0)); - params->stream = comm->groupStream; - } else { - if (comm->userStream != params->stream) { - // Stream changed from last call, create dependency against last NCCL kernel launch - CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); - } - params->stream = comm->userStream; - } - - int isLast = 0; - NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); - - if (isLast) { - if (comm->launchMode == ncclComm::GROUP) { - // I'm the last. Launch all operations. 
- NCCLCHECK(ncclLaunchCooperativeKernelMultiDevice(comm->intraParams, comm->intraCudaDevs, comm->intraRanks, *comm->intraCGMode)); - } - NCCLCHECK(ncclCpuBarrierLast(comm)); - } - return ncclSuccess; -} - -ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { - if (comm->nRanks == 1) return ncclSuccess; - // We can't print the CG mode before the first barrier happened. - if (comm->rank == 0 && *comm->intraCGMode & 0x10) { - *comm->intraCGMode ^= 0x10; - INFO(NCCL_INIT,"Launch mode %s%s%s", - comm->launchMode == ncclComm::GROUP ? "Group" : "Parallel", - *comm->intraCGMode ? "/CGMD" : "", - (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); - } - - NCCLCHECK(ncclCpuBarrierOut(comm)); - - struct cudaLaunchParams *params = comm->myParams; - if (comm->launchMode == ncclComm::PARALLEL) { - CUDACHECK(cudaLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); - } - // Start the network proxies as soon as the kernel has been launched. We can't - // perform any CUDA call between the two or having a cudaFree between the CUDA - // launch and the transportStartProxies call could cause a deadlock. - // Also, starting the proxies after the CUDA launch seems to be better for - // performance (latency). 
- for (int r=0; rgridDim.x; r++) { - struct ncclRing* ring = comm->rings+r; - ring->collStart = ring->collFifoTail; - ring->collCount = 0; - } - params->gridDim.x = params->blockDim.x = 0; - NCCLCHECK(transportStartProxies(comm)); - return ncclSuccess; -} - -ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { - struct cudaLaunchParams *params = comm->myParams; - // Enqueue event after NCCL kernel - CUDACHECK(cudaEventRecord(comm->doneEvent, params->stream)); - // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { - // Create dependency between NCCL internal stream and user stream - CUDACHECK(cudaStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); - } - comm->userStreamSet = false; - return ncclSuccess; -} - -ncclResult_t ncclEnqueueCheck(ncclFunc_t func, const char* primName, const void* sendbuff, - void* recvbuff, size_t count, ncclDataType_t type, ncclRedOp_t op, int root, - ncclComm_t comm, cudaStream_t stream) { - if (comm == NULL) return ncclInvalidArgument; - // Launch asynchronously if needed - if (ncclAsyncMode()) { - ncclResult_t ret = ncclSuccess; - int savedDev = -1; - if (comm->checkPointers) { - CUDACHECKGOTO(cudaGetDevice(&savedDev), ret, end); - CUDACHECKGOTO(cudaSetDevice(comm->cudaDev), ret, end); - } - // Check arguments - NCCLCHECKGOTO(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName), ret, end); - // Always register comm even in case of error to make sure ncclGroupEnd - // cleans it up. 
- NCCLCHECK(ncclAsyncColl(comm)); - NCCLCHECKGOTO(func(sendbuff, recvbuff, count, type, op, root, comm, stream), ret, end); -end: - if (savedDev != -1) CUDACHECK(cudaSetDevice(savedDev)); - ncclAsyncErrCheck(ret); - return ret; - } else { - NCCLCHECK(ArgsCheck(sendbuff, recvbuff, count, type, op, root, comm, primName)); - NCCLCHECK(func(sendbuff, recvbuff, count, type, op, root, comm, stream)); - NCCLCHECK(ncclBarrierEnqueue(comm)); - NCCLCHECK(ncclBarrierEnqueueWait(comm)); - NCCLCHECK(ncclEnqueueEvents(comm)); - return ncclSuccess; - } -} diff --git a/projects/rccl/src/misc/group.cu b/projects/rccl/src/misc/group.cu index 1716a75643..c428a22aa8 100644 --- a/projects/rccl/src/misc/group.cu +++ b/projects/rccl/src/misc/group.cu @@ -179,13 +179,13 @@ group_cleanup: // an atomic operation, we need to cancel all operations. for (int i=0; inRings; r++) { - struct ncclRing* ring = comm->rings+r; - for (int i=0; icollCount; i++) { - ring->collectives[(ring->collStart + i)%NCCL_MAX_OPS].active = 0; + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + for (int i=0; icollCount; i++) { + channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0; } - ring->collFifoTail = ring->collStart; - ring->collCount = 0; + channel->collFifoTail = channel->collStart; + channel->collCount = 0; } comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0; comm->userStreamSet = false; diff --git a/projects/rccl/src/misc/nvmlwrap.cu b/projects/rccl/src/misc/nvmlwrap.cu index d9407f4686..635f332a25 100644 --- a/projects/rccl/src/misc/nvmlwrap.cu +++ b/projects/rccl/src/misc/nvmlwrap.cu @@ -16,14 +16,14 @@ static nvmlReturn_t (*nvmlInternalInit)(void); static nvmlReturn_t (*nvmlInternalShutdown)(void); static nvmlReturn_t (*nvmlInternalDeviceGetHandleByPciBusId)(const char* pciBusId, nvmlDevice_t* device); static nvmlReturn_t (*nvmlInternalDeviceGetIndex)(nvmlDevice_t device, unsigned* index); -static nvmlReturn_t 
(*nvmlInternalDeviceSetCpuAffinity)(nvmlDevice_t device); -static nvmlReturn_t (*nvmlInternalDeviceClearCpuAffinity)(nvmlDevice_t device); static const char* (*nvmlInternalErrorString)(nvmlReturn_t r); static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkState)(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive); static nvmlReturn_t (*nvmlInternalDeviceGetPciInfo)(nvmlDevice_t device, nvmlPciInfo_t* pci); static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkRemotePciInfo)(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci); static nvmlReturn_t (*nvmlInternalDeviceGetNvLinkCapability)(nvmlDevice_t device, unsigned int link, nvmlNvLinkCapability_t capability, unsigned int *capResult); +static nvmlReturn_t (*nvmlInternalDeviceGetMinorNumber)(nvmlDevice_t device, unsigned int* minorNumber); + ncclResult_t wrapNvmlSymbols(void) { if (nvmlState == nvmlInitialized) @@ -70,10 +70,9 @@ ncclResult_t wrapNvmlSymbols(void) { LOAD_SYM(nvmlhandle, "nvmlShutdown", nvmlInternalShutdown); LOAD_SYM(nvmlhandle, "nvmlDeviceGetHandleByPciBusId", nvmlInternalDeviceGetHandleByPciBusId); LOAD_SYM(nvmlhandle, "nvmlDeviceGetIndex", nvmlInternalDeviceGetIndex); - LOAD_SYM(nvmlhandle, "nvmlDeviceSetCpuAffinity", nvmlInternalDeviceSetCpuAffinity); - LOAD_SYM(nvmlhandle, "nvmlDeviceClearCpuAffinity", nvmlInternalDeviceClearCpuAffinity); LOAD_SYM(nvmlhandle, "nvmlErrorString", nvmlInternalErrorString); LOAD_SYM(nvmlhandle, "nvmlDeviceGetPciInfo", nvmlInternalDeviceGetPciInfo); + LOAD_SYM(nvmlhandle, "nvmlDeviceGetMinorNumber", nvmlInternalDeviceGetMinorNumber); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkState", nvmlInternalDeviceGetNvLinkState); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkRemotePciInfo", nvmlInternalDeviceGetNvLinkRemotePciInfo); LOAD_SYM_OPTIONAL(nvmlhandle, "nvmlDeviceGetNvLinkCapability", nvmlInternalDeviceGetNvLinkCapability); @@ -86,9 +85,8 @@ teardown: nvmlInternalShutdown = NULL; nvmlInternalDeviceGetHandleByPciBusId = NULL; 
nvmlInternalDeviceGetIndex = NULL; - nvmlInternalDeviceSetCpuAffinity = NULL; - nvmlInternalDeviceClearCpuAffinity = NULL; nvmlInternalDeviceGetPciInfo = NULL; + nvmlInternalDeviceGetMinorNumber = NULL; nvmlInternalDeviceGetNvLinkState = NULL; nvmlInternalDeviceGetNvLinkRemotePciInfo = NULL; nvmlInternalDeviceGetNvLinkCapability = NULL; @@ -155,38 +153,6 @@ ncclResult_t wrapNvmlDeviceGetIndex(nvmlDevice_t device, unsigned* index) { return ncclSuccess; } -ncclResult_t wrapNvmlDeviceSetCpuAffinity(nvmlDevice_t device) { - if (nvmlInternalDeviceSetCpuAffinity == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - // Workaround : it seems SetCpuAffinity is not thread safe. - static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; - pthread_mutex_lock(&lock); - nvmlReturn_t ret = nvmlInternalDeviceSetCpuAffinity(device); - pthread_mutex_unlock(&lock); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceSetCpuAffinity() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - -ncclResult_t wrapNvmlDeviceClearCpuAffinity(nvmlDevice_t device) { - if (nvmlInternalInit == NULL) { - WARN("lib wrapper not initialized."); - return ncclInternalError; - } - nvmlReturn_t ret = nvmlInternalDeviceClearCpuAffinity(device); - if (ret != NVML_SUCCESS) { - WARN("nvmlDeviceClearCpuAffinity() failed: %s ", - nvmlInternalErrorString(ret)); - return ncclSystemError; - } - return ncclSuccess; -} - ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { if (nvmlInternalDeviceGetPciInfo == NULL) { WARN("lib wrapper not initialized."); @@ -201,6 +167,20 @@ ncclResult_t wrapNvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t* pci) { return ncclSuccess; } +ncclResult_t wrapNvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int* minorNumber) { + if (nvmlInternalDeviceGetMinorNumber == NULL) { + WARN("lib wrapper not initialized."); + return ncclInternalError; + } + nvmlReturn_t ret = 
nvmlInternalDeviceGetMinorNumber(device, minorNumber); + if (ret != NVML_SUCCESS) { + WARN("nvmlDeviceGetMinorNumber() failed: %s ", + nvmlInternalErrorString(ret)); + return ncclSystemError; + } + return ncclSuccess; +} + ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive) { if (nvmlInternalDeviceGetNvLinkState == NULL) { /* Do not warn, this symbol is optional. */ @@ -208,8 +188,9 @@ ncclResult_t wrapNvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link } nvmlReturn_t ret = nvmlInternalDeviceGetNvLinkState(device, link, isActive); if (ret != NVML_SUCCESS) { - INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ", - nvmlInternalErrorString(ret)); + if (ret != NVML_ERROR_NOT_SUPPORTED) + INFO(NCCL_INIT,"nvmlDeviceGetNvLinkState() failed: %s ", + nvmlInternalErrorString(ret)); return ncclSystemError; } return ncclSuccess; diff --git a/projects/rccl/src/misc/rings.cu b/projects/rccl/src/misc/rings.cu index a5d4616019..a7b122c1e9 100644 --- a/projects/rccl/src/misc/rings.cu +++ b/projects/rccl/src/misc/rings.cu @@ -160,7 +160,10 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo while ((rank = findConnected(curRank, matrix, nranks, transport, coords)) == -1) { current[transport] = 0; transport++; - if (transport == NTRANSPORTS) { free(p2pConnected); return ncclInternalError; } + if (transport == NTRANSPORTS) { + WARN("Error : Could not find transport to connect next group\n"); + free(p2pConnected); + return ncclInternalError; } } curRank = rank; current[transport]++; @@ -179,8 +182,20 @@ ncclResult_t getEnvThreads(int* nthreads) { return ncclSuccess; } +static inline int copyRings(int nrings, int newNrings, int nranks, int* a, int* b, int* c, int* d) { + if (newNrings > MAXCHANNELS) newNrings = MAXCHANNELS; + for (int r=nrings; r 0) { if (rank == 0) INFO(NCCL_INIT,"%d ring(s) set by environment", *nrings); NCCLCHECK(getEnvThreads(nthreads)); + for (int r = 0; 
r<*nrings; r++) { + for (int i = 0; i=0; t--) { for (int i=0; i 1 && nvlink) { + *nrings = copyRings(*nrings, *nrings*2, nranks, prev, next, treeIn, treeOut); + } + if (*nrings == 0) { WARN("Could not create rings, falling back on simple ring"); *nrings = 1; @@ -329,9 +364,9 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than NCCL_MAX_NRINGS, ignoring NCCL_MIN_NRINGS"); minNrings = 0; } - if (minNrings > MAXRINGS) { - if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXRINGS, MAXRINGS); - minNrings = MAXRINGS; + if (minNrings > MAXCHANNELS) { + if (rank == 0) WARN("NCCL_MIN_NRINGS set to a value greater than the maximum number of rings supported (%d), limiting it to %d", MAXCHANNELS, MAXCHANNELS); + minNrings = MAXCHANNELS; } if (maxNrings > 0 && maxNrings <= *nrings) { if (rank == 0) INFO(NCCL_INIT,"Limiting to %d rings per user request.", maxNrings); @@ -341,13 +376,7 @@ ncclResult_t ncclGetRings(int* nrings, int* nthreads, int rank, int nranks, int* if (minNrings < defaultMinNrings) minNrings = defaultMinNrings; if (minNrings > 0 && minNrings > *nrings) { if (rank == 0 && minNrings > defaultMinNrings) INFO(NCCL_INIT,"Duplicating rings to %d per user request.", minNrings); - for (int r=*nrings; r root ? rank-1 : rank) + +/* Btree which alternates leaves and nodes. + * Assumes root is 0, which conveniently builds a tree on powers of two, + * (because we have pow2-1 ranks) which lets us manipulate bits. + * Find first non-zero bit, then : + * Find the parent : + * xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below) + * xx11[0] -> xx10[0] (3,7,11 below) + * Find the children : + * xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13) + * xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... 
or -1 (1,3,5,7,9,11,13) + * + * Illustration : + * 0---------------8 + * ______/ \______ + * 4 12 + * / \ / \ + * 2 6 10 \ + * / \ / \ / \ \ + * 1 3 5 7 9 11 13 + */ +ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1) { + int up, down0, down1; + int bit; + for (bit=1; bit 1 ? bit >> 1 : -1; + *d1 = -1; + return ncclSuccess; + } + + up = (rank ^ bit) | (bit << 1); + if (up >= nranks) up = (rank ^ bit); + *u = up; + + int lowbit = bit >> 1; + // down0 is always within bounds + down0 = lowbit == 0 ? -1 : rank-lowbit; + + down1 = lowbit == 0 ? -1 : rank+lowbit; + // Make sure down1 is within bounds + while (down1 >= nranks) { + down1 = lowbit == 0 ? -1 : rank+lowbit; + lowbit >>= 1; + } + *d0 = down0; *d1 = down1; + + return ncclSuccess; +} + +/* Build a double binary tree. Take the previous tree for the first tree. + * For the second tree, we use a mirror tree (if nranks is odd) + * + * 8---------0---------5 + * ______/ \______ _____/ \______ + * 4 12 1 9 + * / \ / \ / \ + * 2 6 10 3 7 10 + * / \ / \ / \ / \ / \ / \ + * 1 3 5 7 9 11 2 4 6 8 11 12 + * + * or shift it by one rank (if nranks is even) + * + * 8---------0--------------9 + * ______/ \ ______/ \ + * 4 \ 5 \ + * / \ \ / \ \ + * 2 6 10 3 7 11 + * / \ / \ / \ / \ / \ / \ + * 1 3 5 7 9 11 2 4 6 8 10 1 + */ +ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* s1, int* d1_0, int* d1_1) { + // First tree ... use a btree + ncclGetBtree(nranks, rank, s0, d0_0, d0_1); + // Second tree ... mirror or shift + if (nranks % 2 == 0) { + // shift + int shiftrank = (rank-1+nranks) % nranks; + int u, d0, d1; + ncclGetBtree(nranks, shiftrank, &u, &d0, &d1); + *s1 = u == -1 ? -1 : (u+1) % nranks; + *d1_0 = d0 == -1 ? -1 : (d0+1) % nranks; + *d1_1 = d1 == -1 ? -1 : (d1+1) % nranks; + } else { + // mirror + int u, d0, d1; + ncclGetBtree(nranks, nranks-1-rank, &u, &d0, &d1); + *s1 = u == -1 ? -1 : nranks-1-u; + *d1_0 = d0 == -1 ? -1 : nranks-1-d0; + *d1_1 = d1 == -1 ? 
-1 : nranks-1-d1; + } + return ncclSuccess; +} diff --git a/projects/rccl/src/misc/utils.cu b/projects/rccl/src/misc/utils.cu index d8e3aec5f5..c618e71974 100644 --- a/projects/rccl/src/misc/utils.cu +++ b/projects/rccl/src/misc/utils.cu @@ -11,6 +11,24 @@ #include #include +#include "nvmlwrap.h" +#include "core.h" + +// Convert a logical cudaDev index to the NVML device minor number +ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev) { + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + nvmlDevice_t nvmlDevice; + unsigned int dev; + *nvmlDev = -1; + CUDACHECK(cudaDeviceGetPCIBusId(busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, cudaDev)); + NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(busId, &nvmlDevice)); + NCCLCHECK(wrapNvmlDeviceGetMinorNumber(nvmlDevice, &dev)); + + *nvmlDev = dev; + + return ncclSuccess; +} + ncclResult_t getHostName(char* hostname, int maxlen) { if (gethostname(hostname, maxlen) != 0) { strncpy(hostname, "unknown", maxlen); diff --git a/projects/rccl/src/nccl.h.in b/projects/rccl/src/nccl.h.in index 72276254cd..985274eae9 100644 --- a/projects/rccl/src/nccl.h.in +++ b/projects/rccl/src/nccl.h.in @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -68,14 +68,24 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -/* Frees resources associated with communicator object. */ +/* Frees resources associated with communicator object, but waits for any operations + * that might still be running on the device. 
*/ ncclResult_t ncclCommDestroy(ncclComm_t comm); ncclResult_t pncclCommDestroy(ncclComm_t comm); +/* Frees resources associated with communicator object and aborts any operations + * that might still be running on the device. */ +ncclResult_t ncclCommAbort(ncclComm_t comm); +ncclResult_t pncclCommAbort(ncclComm_t comm); + /* Returns a human-readable error message. */ const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); +/* Checks whether the comm has encountered any asynchronous errors */ +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); +ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); + /* Gets the number of ranks in the communicator clique. */ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); diff --git a/projects/rccl/src/ring.cu b/projects/rccl/src/ring.cu deleted file mode 100644 index fede79387f..0000000000 --- a/projects/rccl/src/ring.cu +++ /dev/null @@ -1,70 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
- * - * See LICENSE.txt for license information - ************************************************************************/ - -#include "ring.h" -#include "param.h" - -NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES); - -ncclResult_t initRing(struct ncclComm* comm, int ringid) { - struct ncclRing* ring = comm->rings+ringid; - ring->id = ringid; - - // Setup intermediate buffering - ring->buffSize = ncclParamBuffsize(); - - const int sendSize = ring->devMemSendSize = sizeof(struct ncclSendMem); - struct ncclSendMem* sendMem; - NCCLCHECK(ncclCudaCalloc((char**)&sendMem, sendSize)); - ring->devMemSend = sendMem; - - const int recvSize = ring->devMemRecvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize; - struct ncclRecvMem* recvMem; - NCCLCHECK(ncclCudaCalloc((char**)&recvMem, recvSize)); - ring->devMemRecv = recvMem; - - TRACE(NCCL_INIT,"sendMem %p size %d recvMem %p size %d", sendMem, sendSize, recvMem, recvSize); - - // Pre-configure send/recv pointers. Those are the default, they may change later. - ring->recv.conn.buff = recvMem->buff; - ring->recv.conn.llBuff = recvMem->llBuff; - ring->recv.conn.tail = &recvMem->tail; - ring->recv.conn.opCount = &recvMem->opCount; - ring->recv.conn.direct = 0; - ring->send.conn.head = &sendMem->head; - ring->send.conn.llHead = &sendMem->llHead; - ring->send.conn.direct = 0; - ring->send.conn.llStep = 0; - ring->send.conn.llLastCleaning = 0; - - // Ring index to user rank table. - NCCLCHECK(ncclCudaCalloc(&ring->devUserRanks, comm->nRanks)); - NCCLCHECK(ncclCalloc(&ring->userRanks, comm->nRanks)); - - // Per-ring operation list. 
- NCCLCHECK(ncclCudaHostAlloc((void**)&ring->collectives, (void**)&ring->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS)); - return ncclSuccess; -} - -ncclResult_t freeRing(struct ncclRing* ring) { - // Intermediate buffering - CUDACHECK(cudaFree(ring->devMemSend)); - CUDACHECK(cudaFree(ring->devMemRecv)); - - // Index to rank table - free(ring->userRanks); - CUDACHECK(cudaFree(ring->devUserRanks)); - - // Operation list - NCCLCHECK(ncclCudaHostFree(ring->collectives)); - - // Free transport proxy resources - if (ring->send.transportResources) NCCLCHECK(ring->send.transport->send.free(ring->send.transportResources)); - NCCLCHECK(transportDestroyProxy(&ring->send)); - if (ring->recv.transportResources) NCCLCHECK(ring->recv.transport->recv.free(ring->recv.transportResources)); - NCCLCHECK(transportDestroyProxy(&ring->recv)); - return ncclSuccess; -} diff --git a/projects/rccl/src/transport.cu b/projects/rccl/src/transport.cu index 7c13d5c351..1436a5b3b3 100644 --- a/projects/rccl/src/transport.cu +++ b/projects/rccl/src/transport.cu @@ -1,11 +1,10 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ #include "core.h" -#include "common_coll.h" extern struct ncclTransport p2pTransport; extern struct ncclTransport shmTransport; @@ -17,74 +16,16 @@ struct ncclTransport ncclTransports[NTRANSPORTS] = { netTransport, }; -static void FifoPullArgs(struct transportProxyInfo* info, struct ncclProxyArgs *args) { - struct ncclProxyArgs *fifoArgs = info->argsFifo + (info->argsFifoHead % TRANSPORT_PROXY_FIFO_SIZE); - pthread_mutex_lock(&info->mutex); - while (fifoArgs->active == 0) - pthread_cond_wait(&info->cond, &info->mutex); - __sync_synchronize(); - memcpy(args, fifoArgs, sizeof(struct ncclProxyArgs)); - __sync_synchronize(); - fifoArgs->active = 0; - pthread_cond_signal(&info->cond); - pthread_mutex_unlock(&info->mutex); - info->argsFifoHead++; -} - -static struct ncclProxyArgs* FifoGetNextArgs(struct transportProxyInfo* info) { - if (info == NULL) return NULL; - struct ncclProxyArgs* fifoArgs = info->argsFifo + (info->argsFifoTail % TRANSPORT_PROXY_FIFO_SIZE); - pthread_mutex_lock(&info->mutex); - while (fifoArgs->active == 1) - pthread_cond_wait(&info->cond, &info->mutex); - pthread_mutex_unlock(&info->mutex); - info->argsFifoTail++; - return fifoArgs; -} - -static void FifoPushArgs(struct transportProxyInfo* info) { - if (info == NULL) return; - - struct ncclProxyArgs* fifoArgs = info->argsFifo + ((info->argsFifoTail-1) % TRANSPORT_PROXY_FIFO_SIZE); - if (fifoArgs->active == 0) return; - - pthread_mutex_lock(&info->mutex); - pthread_cond_signal(&info->cond); - pthread_mutex_unlock(&info->mutex); -} - -static void WaitProxyReady(struct transportProxyInfo* info) { - pthread_mutex_lock(&info->mutex); - while (info->proxyReady == 0) - pthread_cond_wait(&info->cond, &info->mutex); - pthread_mutex_unlock(&info->mutex); -} - -static void SetProxyReady(struct transportProxyInfo* info) { - pthread_mutex_lock(&info->mutex); - info->proxyReady = 1; - 
pthread_cond_signal(&info->cond); - pthread_mutex_unlock(&info->mutex); -} - -static void StopProxy(struct transportProxyInfo* info) { - struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info); - fifoArgs->active = -1; - FifoPushArgs(info); -} - #define RECV 0 #define SEND 1 -static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) { - enum proxyMode mode = proxyPatternMode(pattern); - if (mode == proxyRing) return true; +static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { + if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true; /* In chains, one rank does not need a proxy. Let's figure out which one it is */ - int root = proxyPatternRoot(pattern); // Which index in the reorganized rings should we compare root against */ const int myrank = 0, nextrank = 1, prevrank = nranks-1; - int index = mode == proxyFrom ? + int index = pattern == ncclPatternPipelineFrom ? /* no recv / no send if root = */ /* bcast */ (type == RECV ? myrank : nextrank ): /* reduce */ (type == RECV ? 
prevrank : myrank ); @@ -92,96 +33,216 @@ static bool NeedProxy(int type, int pattern, struct ncclRing* ring, int nranks) return (root != rank); } -static void SaveProxy(struct ncclConnector* connector, struct ncclProxyArgs* args, int needProxy) { - struct transportProxyInfo* info = connector->proxyInfo; - if (info == NULL) return; - struct ncclProxyArgs* fifoArgs = FifoGetNextArgs(info); - args->needProxy = needProxy; - __sync_synchronize(); - memcpy(fifoArgs, args, sizeof(struct ncclProxyArgs)); - __sync_synchronize(); - fifoArgs->active = 1; +enum { proxyRecv=0, proxySend=1 }; + +#define PROXYARGS_ALLOCATE_SIZE 32 +struct ncclProxyPool { + struct ncclProxyPool *next; + struct ncclProxyArgs elems[PROXYARGS_ALLOCATE_SIZE]; +}; + +ncclResult_t transportAllocateProxyArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) { + struct ncclProxyState* state = &comm->proxyState; + struct ncclProxyArgs* elem; + pthread_mutex_lock(&state->mutex); + if (state->pool == NULL) { + // Allocate a new pool of elements + struct ncclProxyPool* newPool; + NCCLCHECK(ncclCalloc(&newPool, 1)); + struct ncclProxyArgs* newElems = newPool->elems; + // Chain newly allocated elements + for (int i=0; ipool = newElems; + // Save the pool memory block for later resource release + newPool->next = state->pools; + state->pools = newPool; + } + elem = state->pool; + state->pool = state->pool->next; + pthread_mutex_unlock(&state->mutex); + elem->next = elem->nextPeer = NULL; + *argsptr = elem; + return ncclSuccess; } -ncclResult_t transportSaveProxies(int substeps, int subchunks, int nstepsPerRound, int nblocksPerRound, size_t nbytes, int pattern, struct ncclComm* comm) { - int llMode, nrings, nthreads; - ncclGetCollResource(comm, nbytes, &nrings, &nthreads, &llMode); - nbytes = llMode ? nbytes * 2 : nbytes; - substeps = llMode ? 1 : substeps; - subchunks = llMode ? NCCL_LL_CHUNKS : subchunks; - int buffSize = llMode ? 
NCCL_LL_BUFF_SIZE : comm->rings[0].buffSize; +static void ProxyAppend(struct ncclConnector* connector, struct ncclProxyArgs* args) { + struct ncclComm* comm = connector->comm; + struct ncclProxyState* state = &comm->proxyState; + pthread_mutex_lock(&state->mutex); + if (connector->proxyAppend == NULL) { + // Nothing running for that peer. Add to the circular list + if (state->ops == NULL) { + // Create the list + args->next = args; + state->ops = args; + } else { + // Insert element in the list + args->next = state->ops->next; + state->ops->next = args; + } + connector->proxyAppend = args; + } else { + // There is an active operation already for that peer. + // Add it to the per-peer list + connector->proxyAppend->nextPeer = args; + connector->proxyAppend = args; + } + pthread_mutex_unlock(&state->mutex); +} - int nrounds = (int)(DIVUP(nbytes, ((size_t)nrings * nblocksPerRound * (buffSize/subchunks)))); // Fixed 32-bit overflow - int nsteps = nstepsPerRound * nrounds * substeps; - TRACE(NCCL_NET,"opCount %lx substeps %d subchunks %d nrounds %d nsteps %d comm %p", comm->opCount, subchunks, subchunks, nrounds, nsteps, comm); - TRACE(NCCL_NET,"opCount %lx nbytes %zi nrings %d buffSize %d pattern %d comm %p", comm->opCount, nbytes, nrings, buffSize, pattern, comm); - for (int r=0; rrings+((comm->myParams->gridDim.x+r)%comm->nRings); - struct ncclProxyArgs args = { ring, substeps*subchunks, nsteps, comm->opCount, llMode, 0 }; - SaveProxy(&ring->recv, &args, NeedProxy(RECV, pattern, ring, comm->nRanks)); - SaveProxy(&ring->send, &args, NeedProxy(SEND, pattern, ring, comm->nRanks)); +template +static ncclResult_t SaveProxy(int peer, struct ncclProxyArgs* args) { + if (peer < 0) return ncclSuccess; + + struct ncclPeer* peerComm = args->channel->peers+peer; + struct ncclConnector* connector = type == proxyRecv ? 
&peerComm->recv : &peerComm->send; + if (connector->transportComm->proxy == NULL) return ncclSuccess; + + struct ncclProxyArgs* op; + NCCLCHECK(transportAllocateProxyArgs(connector->comm, &op)); + memcpy(op, args, sizeof(struct ncclProxyArgs)); + op->connector = connector; + op->progress = connector->transportComm->proxy; + op->state = ncclProxyOpReady; + ProxyAppend(connector, op); + return ncclSuccess; +} + +ncclResult_t transportSaveProxies(struct ncclProxyArgs* args, int pattern, int root, int nranks) { + if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) { + struct ncclRing* ring = &args->channel->ring; + if (NeedProxy(RECV, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(ring->prev, args)); + if (NeedProxy(SEND, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(ring->next, args)); + } + if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { + // Tree up + struct ncclTree* tree = &args->channel->tree; + for (int i=0; i(tree->down[i], args)); + NCCLCHECK(SaveProxy(tree->up, args)); + } + if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { + // Tree down + struct ncclTree* tree = &args->channel->tree; + for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(tree->down[i], args)); + NCCLCHECK(SaveProxy(tree->up, args)); } return ncclSuccess; } -ncclResult_t transportStartProxies(ncclComm* comm) { - for (int r=0; rnRings; r++) { - FifoPushArgs(comm->rings[r].send.proxyInfo); - FifoPushArgs(comm->rings[r].recv.proxyInfo); - } - pthread_yield(); // Let other threads run - return ncclSuccess; -} - -void* persistentThread(void *opaqueInfo) { - struct transportProxyInfo* info = (struct transportProxyInfo*)opaqueInfo; - // We need to initialize the context before launching any NCCL cuda kernel, - // otherwise we would create it during the first cudaMemcpyAsync inside the - // proxy function and that would cause a deadlock - 
cudaSetDevice(info->comm->cudaDev); - // Signal the main thread the context is created and it can proceed. - SetProxyReady(info); +void* persistentThread(void *comm_) { + struct ncclComm* comm = (struct ncclComm*)comm_; + struct ncclProxyState* state = &comm->proxyState; + struct ncclProxyArgs* op = NULL; + ncclResult_t ret = ncclSuccess; + int idle = 1; + int idleSpin = 0; while (1) { - struct ncclProxyArgs args; - FifoPullArgs(info, &args); - if (args.active == -1) { - // Main thread asked to stop + do { + if (*comm->abortFlag) return NULL; + if (op == NULL) { + pthread_mutex_lock(&state->mutex); + op = state->ops; + if (op == NULL) { + if (state->stop) { + // No more commands to process and proxy has been requested to stop + pthread_mutex_unlock(&state->mutex); + return NULL; + } + pthread_cond_wait(&state->cond, &state->mutex); + } + pthread_mutex_unlock(&state->mutex); + } + } while (op == NULL); + op->idle = 0; + if (op->state != ncclProxyOpNone) ret = op->progress(op); + if (ret != ncclSuccess) { + comm->fatalError = ret; + INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); return NULL; } - ncclResult_t res = info->func(&args); - if (res != ncclSuccess) { - WARN("%s:%d -> %d [Proxy thread error]", __FILE__, __LINE__, res); + idle &= op->idle; + pthread_mutex_lock(&state->mutex); + if (!idle) idleSpin = 0; + struct ncclProxyArgs *next = op->next; + if (next->state == ncclProxyOpNone) { + struct ncclProxyArgs *freeOp = next; + if (next->nextPeer) { + // Replace next by its next per-peer element. 
+ next = next->nextPeer; + if (op != freeOp) { + next->next = freeOp->next; + op->next = next; + } else { + next->next = next; + } + } else { + // Remove next from circular list + next->connector->proxyAppend = NULL; + if (op != freeOp) { + next = next->next; + op->next = next; + } else { + next = NULL; + } + } + if (freeOp == state->ops) state->ops = next; + freeOp->next = state->pool; + state->pool = freeOp; } + op = next; + if (op == state->ops) { + if (idle == 1) { + if (++idleSpin == 10) { + sched_yield(); + idleSpin = 0; + } + } + idle = 1; + } + pthread_mutex_unlock(&state->mutex); } } -ncclResult_t transportCreateProxy(int type, struct ncclRing* ring, struct ncclComm* comm) { - struct ncclConnector* connector = (type == RECV) ? &ring->recv : &ring->send; - threadFunc_t proxyfunc = (threadFunc_t) ((type == RECV) ? connector->transport->recv.proxy : connector->transport->send.proxy); - if (proxyfunc) { - TRACE(NCCL_NET,"type %d ring %p proxyfunc %p comm %p", type, ring, proxyfunc, comm); - struct transportProxyInfo* info; - NCCLCHECK(ncclCalloc(&info, 1)); - connector->proxyInfo = info; - info->comm = comm; - info->cond = PTHREAD_COND_INITIALIZER; - info->mutex = PTHREAD_MUTEX_INITIALIZER; - info->func = proxyfunc; - info->argsFifoHead = info->argsFifoTail = 0; - info->proxyReady = 0; - pthread_create(&connector->proxyInfo->thread, NULL, persistentThread, info); - // Wait for thread to initialize its CUDA context. 
- WaitProxyReady(info); +ncclResult_t transportStartProxy(struct ncclComm* comm) { + pthread_mutex_lock(&comm->proxyState.mutex); + if (comm->proxyState.ops != NULL) + pthread_cond_signal(&comm->proxyState.cond); + pthread_mutex_unlock(&comm->proxyState.mutex); + return ncclSuccess; +} + +ncclResult_t transportCreateProxy(struct ncclComm* comm) { + if (!comm->proxyThread) { + comm->proxyState.cond = PTHREAD_COND_INITIALIZER; + comm->proxyState.mutex = PTHREAD_MUTEX_INITIALIZER; + comm->proxyState.ops = NULL; + pthread_create(&comm->proxyThread, NULL, persistentThread, comm); } return ncclSuccess; } -ncclResult_t transportDestroyProxy(struct ncclConnector* connector) { - if (connector->proxyInfo) { - StopProxy(connector->proxyInfo); - pthread_join(connector->proxyInfo->thread, NULL); - free(connector->proxyInfo); - connector->proxyInfo = NULL; +ncclResult_t transportDestroyProxy(struct ncclComm* comm) { + struct ncclProxyState* state = &comm->proxyState; + + // Request the proxy to stop and then wake it + pthread_mutex_lock(&state->mutex); + state->stop = true; + pthread_cond_signal(&state->cond); + pthread_mutex_unlock(&state->mutex); + if (comm->proxyThread) pthread_join(comm->proxyThread, NULL); + + // Free off any memory allocated for the proxy arg pools + pthread_mutex_lock(&state->mutex); + struct ncclProxyState* proxyState = &comm->proxyState; + while (proxyState->pools != NULL) { + struct ncclProxyPool *next = proxyState->pools->next; + free(proxyState->pools); + proxyState->pools = next; } + pthread_mutex_unlock(&state->mutex); + return ncclSuccess; } diff --git a/projects/rccl/src/transport/net.cu b/projects/rccl/src/transport/net.cu index 9c366b32f5..06a6e2359b 100644 --- a/projects/rccl/src/transport/net.cu +++ b/projects/rccl/src/transport/net.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. 
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,11 +9,17 @@ #include "nvmlwrap.h" #include "net.h" #include "param.h" -#include "nvlink.h" +#include "topo.h" #include #include #define NET_MAX_IFS 16 +#define NET_MAX_GPUS 32 + +// Cache GPU-NIC distances to avoid re-computing them +#define NET_TVALUE_UNKNOWN 0ULL +static ncclTvalue_t ncclNetTvalues[NET_MAX_GPUS] = { NET_TVALUE_UNKNOWN }; +static int ncclNetNDev; // We encode 3 bits of distance per interface into a ncclTvalue_t (64-bit) #define NET_BITS_PER_IF 3 @@ -28,13 +34,9 @@ static ncclTvalue_t getTvalue(short* distances, int ndev) { } return tvalue; } - -struct netInfo { - int rank; - int ndev; - ncclTvalue_t tValue; - short distances[NET_MAX_IFS]; -}; +static int getScore(ncclTvalue_t tvalue, int dev) { + return (tvalue >> (dev*NET_BITS_PER_IF)) & NET_BITS_PER_IF_MASK; +} struct netConnectInfo { ncclNetHandle_t netHandle; @@ -46,11 +48,13 @@ struct netSendResources { struct ncclRecvMem* hostRecvMem; struct ncclSendMem* devHostSendMem; struct ncclRecvMem* devHostRecvMem; - struct ncclSendMem* hostDevMem; int netDev; int useGdr; - struct ncclRecvMem* devNetMem; - uint64_t llStep; + int buffSize; + void* mhandle; + void* llMhandle; + struct ncclRecvMem* devRecvMem; + uint64_t step; uint64_t llLastCleaning; }; @@ -61,50 +65,70 @@ struct netRecvResources { struct ncclRecvMem* hostRecvMem; struct ncclSendMem* devHostSendMem; struct ncclRecvMem* devHostRecvMem; - struct ncclRecvMem* hostDevMem; int netDev; int useGdr; - uint64_t llStep; + int buffSize; + void* mhandle; + void* llMhandle; + struct ncclRecvMem* devRecvMem; + uint64_t step; uint64_t llLastCleaning; }; -/* Fill information necessary to exchange between ranks to choose whether or not - * to use this transport */ -ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) { - struct netInfo* info = (struct netInfo*)opaqueInfo; - static_assert(sizeof(struct 
netInfo) <= sizeof(ncclTinfo_t), "NET Info too large"); - info->rank = rank; - NCCLCHECK(ncclNetDevices(&info->ndev)); - if (info->ndev == 0) { +static ncclResult_t netDistance(int cudaDev, int dev, short* distance) { + char* cudaPath = NULL; + char* nicPath = NULL; + ncclResult_t err; + NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); + err = ncclNetPciPath(dev, &nicPath); + *distance = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? PATH_SOC : pciDistance(nicPath, cudaPath); + if (nicPath) free(nicPath); + if (cudaPath) free(cudaPath); + return ncclSuccess; +} + +static ncclResult_t netDevices(int* ndev, short** distances) { + NCCLCHECK(ncclNetDevices(ndev)); + if (*ndev == 0) { WARN("Error : Network returned 0 device"); return ncclSystemError; } - if (info->ndev > NET_MAX_IFS) info->ndev = NET_MAX_IFS; + if (*ndev > NET_MAX_IFS) *ndev = NET_MAX_IFS; + + *distances = (short*)malloc(*ndev*sizeof(short)); + if (*distances == NULL) return ncclSystemError; // Find distance with current GPU - int cudaDev; - cudaGetDevice(&cudaDev); - char* cudaPath; - NCCLCHECK(getCudaPath(cudaDev, &cudaPath)); - + int cudaDev, nvmlDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + NCCLCHECK(getNvmlDevice(cudaDev, &nvmlDev)) char line[1024]; - sprintf(line, "CUDA Dev %d, %s NIC distance : ", cudaDev, ncclNetName()); - for (int d=0; dndev; d++) { - char* nicPath; - ncclResult_t err = ncclNetPciPath(d, &nicPath); - info->distances[d] = (err != ncclSuccess || nicPath == NULL || cudaPath == NULL) ? 
PATH_SOC : pciDistance(nicPath, cudaPath); - sprintf(line+strlen(line), " %s", pathDists[info->distances[d]]); - if (err == ncclSuccess) free(nicPath); + sprintf(line, "CUDA Dev %d[%d], %s NIC distance : ", cudaDev, nvmlDev, ncclNetName()); + for (int d=0; d<*ndev; d++) { + NCCLCHECK(netDistance(cudaDev, d, *distances+d)); + sprintf(line+strlen(line), " %s", pathDists[(*distances)[d]]); } INFO(NCCL_INIT|NCCL_NET, "%s", line); - free(cudaPath); return ncclSuccess; } /* Determine if we can communicate with the peer */ -ncclResult_t netCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) { - struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; - ret[0] = getTvalue(myInfo->distances, myInfo->ndev); +ncclResult_t netCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + ret[0] = ncclNetTvalues[cudaDev]; + if (ret[0] == NET_TVALUE_UNKNOWN) { + if (cudaDev >= NET_MAX_GPUS) { + WARN("CUDA device %d >= MAX %d\n", cudaDev, NET_MAX_GPUS); + return ncclInternalError; + } + int nDev; + short* distances; + NCCLCHECK(netDevices(&nDev, &distances)); + ncclNetTvalues[cudaDev] = ret[0] = getTvalue(distances, nDev); + ncclNetNDev = nDev; + free(distances); + } return ncclSuccess; } @@ -196,45 +220,51 @@ ncclResult_t netGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* return ncclSuccess; } -int getDev(int ringId, int nDev, short* distances) { - int minDistance = PATH_SOC; - for (int d=0; d maxScore) maxScore = getScore(tvalues,d); int skip = ringId+1; while (skip) { - for (int d=0; d= netGdrLevel) { - INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, dev, distance, netGdrLevel); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Disabled for GPU %d[%d] / HCA %d (distance %d >= %d)", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel); return ncclSuccess; } @@ -243,51 
+273,59 @@ static ncclResult_t netGetGdrSupport(int dev, int distance, int read, int* useGd NCCLCHECK(ncclNetPtrSupport(dev, &flags)); if ((flags & NCCL_PTR_CUDA) == 0) return ncclSuccess; *useGdr = 1; - INFO(NCCL_INIT|NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d / HCA %d (distance %d >= %d), read %d", ncclNetName(), cudaDev, dev, distance, netGdrLevel, read); + INFO(NCCL_NET,"NET/%s : GPU Direct RDMA Enabled for GPU %d[%d] / HCA %d (distance %d < %d), read %d", ncclNetName(), cudaDev, nvmlDev, dev, distance, netGdrLevel, read); return ncclSuccess; } /* Determine if we will use this transport for this peer and return connect * information for this peer */ -ncclResult_t netSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { +ncclResult_t netSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { struct netSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); - ring->send.transportResources = resources; + send->transportResources = resources; - struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; - resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances); - NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 1, &resources->useGdr)); - - int size = offsetof(struct ncclRecvMem, buff)+ring->buffSize; - if (resources->useGdr) { - NCCLCHECK(ncclCudaCalloc((char**)(&resources->devNetMem), size)); - } - - NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, size)); - NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, size)); - - return ncclSuccess; -} - -ncclResult_t netRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { - struct netRecvResources* resources; - 
NCCLCHECK(ncclCalloc(&resources, 1)); - ring->recv.transportResources = resources; - - struct netInfo* myInfo = (struct netInfo*)myOpaqueInfo; - resources->netDev = getDev(ring->id, myInfo->ndev, myInfo->distances); - NCCLCHECK(netGetGdrSupport(resources->netDev, myInfo->distances[resources->netDev], 0, &resources->useGdr)); + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + resources->netDev = getDev(cudaDev, channelId); + NCCLCHECK(netGetGdrSupport(resources->netDev, 1, &resources->useGdr)); int sendSize = sizeof(struct ncclSendMem); NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); - int recvSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize; + int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; + if (resources->useGdr) { + NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize)); + } NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); + resources->buffSize = buffSize; - struct netInfo* peerInfo = (struct netInfo*)peerOpaqueInfo; - INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d via NET/%s/%d%s%s", ring->id, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev, - resources->useGdr ? "/GDRDMA" : "", - (resources->hostDevMem != NULL) ? "/GDCopy" : ""); + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [send] via NET/%s/%d%s", channelId, myInfo->rank, peerInfo->rank, ncclNetName(), resources->netDev, + resources->useGdr ? 
"/GDRDMA" : ""); + return ncclSuccess; +} + +ncclResult_t netRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { + struct netRecvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + recv->transportResources = resources; + + int cudaDev; + CUDACHECK(cudaGetDevice(&cudaDev)); + resources->netDev = getDev(cudaDev, channelId); + NCCLCHECK(netGetGdrSupport(resources->netDev, 0, &resources->useGdr)); + + int sendSize = sizeof(struct ncclSendMem); + NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostSendMem, (void**)&resources->devHostSendMem, sendSize)); + + int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; + if (resources->useGdr) { + NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize)); + } + NCCLCHECK(ncclCudaHostAlloc((void**)&resources->hostRecvMem, (void**)&resources->devHostRecvMem, recvSize)); + resources->buffSize = buffSize; + + INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d -> %d [receive] via NET/%s/%d%s", channelId, peerInfo->rank, myInfo->rank, ncclNetName(), resources->netDev, + resources->useGdr ? 
"/GDRDMA" : ""); struct netConnectInfo* info = (struct netConnectInfo*) connectInfo; NCCLCHECK(ncclNetListen(resources->netDev, &info->netHandle, &resources->netListenComm)); return ncclSuccess; @@ -297,27 +335,28 @@ ncclResult_t netSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto // Setup device pointers struct netSendResources* resources = (struct netSendResources*)send->transportResources; - if (resources->useGdr) { - send->conn.buff = resources->devNetMem->buff; - // We don't use devMem for llMode because the CPU has to read the data - send->conn.llBuff = resources->devHostRecvMem->llBuff; - } else { - send->conn.buff = resources->devHostRecvMem->buff; - send->conn.llBuff = resources->devHostRecvMem->llBuff; - } - send->conn.tail = &resources->devHostRecvMem->tail; - send->conn.opCount = &resources->devHostRecvMem->opCount; - send->conn.fifo = resources->devHostRecvMem->sizesFifo; - send->conn.llFifo = resources->devHostRecvMem->llSizesFifo; + // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host + struct ncclRecvMem* recvMem = resources->useGdr ? 
resources->devRecvMem : resources->devHostRecvMem; + send->conn.buff = recvMem->buff; + send->conn.llBuff = resources->devHostRecvMem->llBuff; - if (resources->hostDevMem == NULL) { - send->conn.head = &resources->devHostSendMem->head; - send->conn.llHead = &resources->devHostSendMem->llHead; - } + // Head/Tail/Opcount/Fifos are always on host + send->conn.tail = &resources->devHostRecvMem->tail; + send->conn.opCountRem = &resources->devHostRecvMem->opCount; + send->conn.fifo = resources->devHostRecvMem->sizesFifo; + send->conn.head = &resources->devHostSendMem->head; + send->conn.opCountLoc = &resources->devHostSendMem->opCount; + for (int i=0; iconn.fifo[i] = -1; // Connect to remote peer struct netConnectInfo* info = (struct netConnectInfo*)connectInfo; NCCLCHECK(ncclNetConnect(resources->netDev, info->netHandle, &resources->netSendComm)); + + NCCLCHECK(ncclNetRegMr(resources->netSendComm, recvMem->buff, resources->buffSize, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle)); + NCCLCHECK(ncclNetRegMr(resources->netSendComm, resources->devHostRecvMem->llBuff, + NCCL_LL_BUFF_SIZE, NCCL_PTR_HOST, &resources->llMhandle)); + return ncclSuccess; } @@ -326,32 +365,37 @@ ncclResult_t netRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto // Setup device pointers struct netRecvResources* resources = (struct netRecvResources*)recv->transportResources; + // Intermediate buffering on GPU for GPU Direct RDMA + struct ncclRecvMem* recvMem = resources->useGdr ? 
resources->devRecvMem : resources->devHostRecvMem; + recv->conn.buff = recvMem->buff; + recv->conn.llBuff = recvMem->llBuff; + + // Head/Tail/Opcount are always on host + recv->conn.tail = &resources->devHostRecvMem->tail; + recv->conn.opCountLoc = &resources->devHostRecvMem->opCount; recv->conn.head = &resources->devHostSendMem->head; - recv->conn.llHead = &resources->devHostSendMem->llHead; + recv->conn.opCountRem = &resources->devHostSendMem->opCount; - if (resources->useGdr == 0) { - recv->conn.buff = resources->devHostRecvMem->buff; - recv->conn.llBuff = resources->devHostRecvMem->llBuff; - } - - if (resources->hostDevMem == NULL) { - recv->conn.tail = &resources->devHostRecvMem->tail; - recv->conn.opCount = &resources->devHostRecvMem->opCount; - } - - // Finish connection establishment + // Finish connection establishment from remote peer NCCLCHECK(ncclNetAccept(resources->netListenComm, &resources->netRecvComm)); NCCLCHECK(ncclNetCloseListen(resources->netListenComm)); + NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->buff, resources->buffSize, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandle)); + NCCLCHECK(ncclNetRegMr(resources->netRecvComm, recvMem->llBuff, NCCL_LL_BUFF_SIZE, + resources->useGdr ? 
NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->llMhandle)); + return ncclSuccess; } ncclResult_t netSendFree(void* transportResources) { struct netSendResources* resources = (struct netSendResources*)transportResources; NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); + NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->mhandle)); + NCCLCHECK(ncclNetDeregMr(resources->netSendComm, resources->llMhandle)); NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); if (resources->useGdr) - CUDACHECK(cudaFree(resources->devNetMem)); + CUDACHECK(cudaFree(resources->devRecvMem)); NCCLCHECK(ncclNetCloseSend(resources->netSendComm)); free(resources); return ncclSuccess; @@ -360,196 +404,166 @@ ncclResult_t netSendFree(void* transportResources) { ncclResult_t netRecvFree(void* transportResources) { struct netRecvResources* resources = (struct netRecvResources*)transportResources; NCCLCHECK(ncclCudaHostFree(resources->hostSendMem)); + NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->mhandle)); + NCCLCHECK(ncclNetDeregMr(resources->netRecvComm, resources->llMhandle)); NCCLCHECK(ncclCudaHostFree(resources->hostRecvMem)); + if (resources->useGdr) + CUDACHECK(cudaFree(resources->devRecvMem)); NCCLCHECK(ncclNetCloseRecv(resources->netRecvComm)); free(resources); return ncclSuccess; } ncclResult_t netSendProxy(struct ncclProxyArgs* args) { - struct ncclRing* ring = args->ring; - struct netSendResources* resources = (struct netSendResources*) (ring->send.transportResources); - const int llMode = args->llMode; + struct netSendResources* resources = (struct netSendResources*) (args->connector->transportResources); + if (args->state == ncclProxyOpReady) { + // Update opCount + resources->hostRecvMem->opCount = args->opCount; - volatile uint64_t* prevTail = &resources->hostRecvMem->tail; - struct ncclSendMem* prevMem = resources->hostDevMem ? resources->hostDevMem : resources->hostSendMem; - uint64_t* prevHead = llMode ? 
&prevMem->llHead : &prevMem->head; - struct ncclRecvMem* localMem = resources->useGdr ? resources->devNetMem : resources->hostRecvMem; - char* localBuff = llMode ? resources->hostRecvMem->llBuff : localMem->buff; - int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST; - volatile int* sizesFifo = llMode ? resources->hostRecvMem->llSizesFifo : resources->hostRecvMem->sizesFifo; - int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize; - int sliceSize = buffSize / args->substeps; - - assert(args->substeps <= SIZES_FIFO_SIZE); - - uint64_t head = llMode ? resources->llStep : 0ULL; - uint64_t tail = llMode ? resources->llStep : 0ULL; - uint64_t end = head + args->nsteps; - - int idle = 0; - void* requests[args->substeps]; - - if (!args->needProxy) goto nextColl; - - TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); - TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); - - // Update in case we skipped some collectives - if (llMode == 0) resources->hostRecvMem->opCount = args->opCount; - - while (head < end) { - idle++; - if (llMode) { - if (tail < end && tail < head + args->substeps) { - int slot = tail%args->substeps; - int size = sizesFifo[slot]; - if (size != 0) { - if (size == -1) size = 0; - uint32_t flag = tail + 1; - int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); - size = nFifoLines * sizeof(union ncclLLFifoLine); - union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+slot*sliceSize); - for (int i=0; inetSendComm, lines, size, ptrType, requests+slot)); - if (requests[slot] != NULL) { - sizesFifo[slot] = size; - tail++; - idle = 0; - } - } - } - } else while (tail < *prevTail) { - // Send through network - int slot = tail%args->substeps; - NCCLCHECK(ncclNetIsend(resources->netSendComm, localBuff+slot*sliceSize, sizesFifo[slot], ptrType, requests+slot)); - if (requests[slot] != NULL) { - 
tail++; - idle = 0; - } - } - if (head < tail) { - int done; - int slot = head%args->substeps; - NCCLCHECK(ncclNetTest(requests[slot], &done, NULL)); - if (done) { - if (llMode) { - sizesFifo[slot] = 0; - // Make sure size is reset to zero before we update the head. - __sync_synchronize(); - } - head++; - *prevHead = head; - idle = 0; - } - } - if (idle) transportProxyIdle(idle); + // Round to next multiple of sliceSteps + resources->step = ROUNDUP(resources->step, args->chunkSteps); + args->head = resources->step; + args->tail = resources->step; + args->end = args->head + args->nsteps; + args->state = ncclProxyOpProgress; } - - // Reset - if (llMode == 0) *prevTail = 0; - -nextColl: - if (llMode) { - resources->llStep += args->nsteps; - // Don't forget to ack otherwise the GPU won't be able to push data. - *prevHead = resources->llStep; - if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - memset(localBuff, 0, NCCL_LL_BUFF_SIZE); - resources->llStep += NCCL_LL_CHUNKS; - *prevHead = resources->llStep; - resources->llLastCleaning = resources->llStep; + if (args->state == ncclProxyOpProgress) { + args->idle = 1; + if (args->head < args->end) { + if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) { + volatile int* sizesFifo = resources->hostRecvMem->sizesFifo; + if (args->llMode) { + int buffSlot = args->tail%NCCL_STEPS; + int size = sizesFifo[buffSlot]; + if (size != -1) { + uint32_t flag = args->tail + 1; + int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); + size = nFifoLines * sizeof(union ncclLLFifoLine); + union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES; + int ready = 1; + for (int i=0; inetSendComm, lines, size, resources->llMhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + sizesFifo[buffSlot] = -1; + // Make sure size is reset to zero before we update the head. 
+ __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } + } + } + } else if (args->tail < resources->hostRecvMem->tail) { + struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; + int stepSize = args->channel->buffSize/NCCL_STEPS; + // Send through network + int buffSlot = args->tail%NCCL_STEPS; + NCCLCHECK(ncclNetIsend(resources->netSendComm, localMem->buff+buffSlot*stepSize, sizesFifo[buffSlot], resources->mhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + sizesFifo[buffSlot] = -1; + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + args->tail += args->sliceSteps; + args->idle = 0; + } + } + } + if (args->head < args->tail) { + int done; + int buffSlot = args->head%NCCL_STEPS; + NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL)); + if (done) { + args->head += args->sliceSteps; + resources->hostSendMem->head = args->head; + args->idle = 0; + } + } } + if (args->head == args->end) { + resources->step = args->end; + args->idle = 0; + args->state = ncclProxyOpDone; + } + } + if (args->state == ncclProxyOpDone) { + union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff; + if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { + for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step; + resources->step += NCCL_STEPS; + resources->hostSendMem->head = resources->step; + resources->llLastCleaning = resources->step; + } + args->state = ncclProxyOpNone; } return ncclSuccess; } ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { - struct ncclRing* ring = args->ring; - struct netRecvResources* resources = (struct netRecvResources*) (ring->recv.transportResources); - int llMode = args->llMode; + struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources); + if (args->state == ncclProxyOpReady) { + // Update 
opCount + resources->hostSendMem->opCount = args->opCount; - volatile uint64_t* nextHead = llMode ? &resources->hostSendMem->llHead : &resources->hostSendMem->head; - struct ncclRecvMem* localMem = resources->useGdr ? ring->devMemRecv : resources->hostRecvMem; - char* localBuff = llMode ? localMem->llBuff : localMem->buff; - char* nextBuff = (resources->useGdr == 0 && resources->hostDevMem) ? resources->hostDevMem->buff : NULL; - int ptrType = resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST; - uint64_t* nextTail = resources->hostDevMem ? &resources->hostDevMem->tail : &resources->hostRecvMem->tail; - - int buffSize = llMode ? NCCL_LL_BUFF_SIZE : ring->buffSize; - int sliceSize = buffSize / args->substeps; - - uint64_t head = llMode ? resources->llStep : 0ULL; - uint64_t tail = llMode ? resources->llStep : 0ULL; - uint64_t end = head + args->nsteps; - - int idle = 0; - void* requests[args->substeps]; - - if (!args->needProxy) goto nextColl; - - TRACE(NCCL_NET,"opCount %lx head %lx tail %lx end %lx nsteps %d llMode %d", args->opCount, head, tail, end, args->nsteps, llMode); - TRACE(NCCL_NET,"opCount %lx buffSize %d sliceSize %d ptrType %d", args->opCount, buffSize, sliceSize, ptrType); - - if (llMode == 0) { - // Waiting for next opCount is only needed before writing nextTail. - uint64_t* nextOpCount = resources->hostDevMem ? 
&resources->hostDevMem->opCount : &resources->hostRecvMem->opCount; - transportProxyWait([=] { return *nextOpCount >= args->opCount; }); + // Round to next multiple of sliceSteps + resources->step = ROUNDUP(resources->step, args->chunkSteps); + args->head = resources->step; + args->tail = resources->step; + args->end = args->head + args->nsteps; + args->state = ncclProxyOpProgress; } - - while (head < end) { - idle++; - if ((tail < head + args->substeps) && (tail < *nextHead + args->substeps) && (tail < end)) { - int slot = tail%args->substeps; - NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+slot*sliceSize, sliceSize, ptrType, requests+slot)); - if (requests[slot] != NULL) { - tail++; - idle = 0; - } - } - if (tail > head) { - int done; - int slot = head%args->substeps; - int size; - NCCLCHECK(ncclNetTest(requests[slot], &done, &size)); - if (done) { - if (nextBuff) memcpy(nextBuff+slot*sliceSize, localBuff+slot*sliceSize, size); - head++; - if (llMode == 0) { - if (ptrType == NCCL_PTR_CUDA) ncclNetFlush(resources->netRecvComm, localBuff+slot*sliceSize, size); - *nextTail = head; + if (args->state == ncclProxyOpProgress) { + args->idle = 1; + int stepSize = ( args->llMode ? NCCL_LL_BUFF_SIZE : args->channel->buffSize ) / NCCL_STEPS; + if (args->head < args->end) { + struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; + char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff; + void* mhandle = args->llMode ? 
resources->llMhandle : resources->mhandle; + if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) { + int buffSlot = args->tail%NCCL_STEPS; + int sliceSize = stepSize * args->sliceSteps; + NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot)); + if (args->requests[buffSlot] != NULL) { + args->tail += args->sliceSteps; + args->idle = 0; + } + } + if (args->tail > args->head) { + int buffSlot = args->head%NCCL_STEPS; + int done, size; + NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size)); + if (done) { + args->head += args->sliceSteps; + if (args->llMode == 0) { + if (resources->useGdr) ncclNetFlush(resources->netRecvComm, localBuff+buffSlot*stepSize, size, mhandle); + resources->hostRecvMem->tail = args->head; + } + args->idle = 0; } - idle = 0; } } - if (idle) transportProxyIdle(idle); - } - - // Wait for last ack and reset - if (llMode == 0) { - transportProxyWait([=] { return *nextHead == head; }); - *nextHead = 0; - } - -nextColl: - if (llMode) { - resources->llStep += args->nsteps; - if (resources->llStep > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - resources->llStep += NCCL_LL_CHUNKS; - while (*nextHead < resources->llStep); - resources->llLastCleaning = resources->llStep; + if (args->head == args->end) { + resources->step = args->end; + args->idle = 0; + args->state = ncclProxyOpDone; } } + if (args->state == ncclProxyOpDone) { + if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { + resources->step += NCCL_STEPS; + while (resources->hostSendMem->head < resources->step); + resources->llLastCleaning = resources->step; + } + args->state = ncclProxyOpNone; + } return ncclSuccess; } struct ncclTransport netTransport = { "NET", - netFillInfo, netCanConnect, netGetRings, { netSendSetup, netSendConnect, netSendFree, netSendProxy }, diff --git 
a/projects/rccl/src/transport/net_ib.cu b/projects/rccl/src/transport/net_ib.cu index 18e158df7c..f7c574b5b0 100644 --- a/projects/rccl/src/transport/net_ib.cu +++ b/projects/rccl/src/transport/net_ib.cu @@ -32,6 +32,7 @@ static int ncclNIbDevs = -1; struct ncclIbDev { int device; uint8_t port; + uint8_t link; ibv_context* context; char devName[MAXNAMESIZE]; }; @@ -97,7 +98,6 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { WARN("NET/IB : No IP interface found."); return ncclInternalError; } - INFO(NCCL_INIT|NCCL_NET,"NET/IB : Using interface %s for sideband communication", ncclIbIfName); // Detect IB cards int nIbDevs; @@ -113,47 +113,59 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { for (int d=0; dname); continue; } int found = 0; - if (context) { - struct ibv_device_attr devAttr; - if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) { - WARN("NET/IB : Unable to query device %s", devices[d]->name); + struct ibv_device_attr devAttr; + if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) { + WARN("NET/IB : Unable to query device %s", devices[d]->name); + if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } + continue; + } + for (int port = 1; port <= devAttr.phys_port_cnt; port++) { + struct ibv_port_attr portAttr; + if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) { + WARN("NET/IB : Unable to query port %d", port); continue; } - for (int port = 1; port <= devAttr.phys_port_cnt; port++) { - struct ibv_port_attr portAttr; - if (ncclSuccess != wrap_ibv_query_port(context, port, &portAttr)) { - WARN("NET/IB : Unable to query port %d", port); - continue; - } - if (portAttr.state != IBV_PORT_ACTIVE) continue; - if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND - && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) continue; + if (portAttr.state != IBV_PORT_ACTIVE) continue; + if (portAttr.link_layer != IBV_LINK_LAYER_INFINIBAND + && portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) 
continue; - // check against user specified HCAs/ports - if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) { - continue; - } - INFO(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, - portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); - ncclIbDevs[ncclNIbDevs].device = d; - ncclIbDevs[ncclNIbDevs].port = port; - ncclIbDevs[ncclNIbDevs].context = context; - strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); - ncclNIbDevs++; - found++; - pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); + // check against user specified HCAs/ports + if (! (matchIfList(devices[d]->name, port, userIfs, nUserIfs) ^ searchNot)) { + continue; } - - if (found == 0) { if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } + TRACE(NCCL_INIT|NCCL_NET,"NET/IB: [%d] %s:%d/%s ", d, devices[d]->name, port, + portAttr.link_layer == IBV_LINK_LAYER_INFINIBAND ? "IB" : "RoCE"); + ncclIbDevs[ncclNIbDevs].device = d; + ncclIbDevs[ncclNIbDevs].port = port; + ncclIbDevs[ncclNIbDevs].link = portAttr.link_layer; + ncclIbDevs[ncclNIbDevs].context = context; + strncpy(ncclIbDevs[ncclNIbDevs].devName, devices[d]->name, MAXNAMESIZE); + ncclNIbDevs++; + found++; + pthread_create(&ncclIbAsyncThread, NULL, ncclIbAsyncThreadMain, context); } + if (found == 0 && ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } } if (nIbDevs && (ncclSuccess != wrap_ibv_free_device_list(devices))) { return ncclInternalError; }; } + if (ncclNIbDevs == 0) { + INFO(NCCL_INIT|NCCL_NET, "NET/IB : No device found."); + } else { + char line[1024]; + line[0] = '\0'; + for (int d=0; dfd, &qpInfo, sizeof(qpInfo))); @@ -537,7 +542,6 @@ ncclResult_t ncclIbGetRequest(struct ncclIbRequest* reqs, struct ncclIbRequest** r->used = 1; r->type = 0; r->verbs = NULL; - r->ibMr = NULL; r->done = 0; r->size = -1; r->free = 0; @@ -583,57 +587,34 @@ ncclResult_t ncclIbTest(void* request, int* 
done, int* size); #define REG_ALIGN (4096) -// Cache previous MRs to avoid registering/unregistering for each Isend/Irecv -ncclResult_t ncclIbGetMr(struct ncclIbVerbs* verbs, void* data, int size, struct ncclIbMr** mrRet) { +ncclResult_t ncclIbRegMr(void* comm, void* data, int size, int type, void** mhandle) { + struct ncclIbVerbs* verbs = (struct ncclIbVerbs*)comm; uint64_t addr = (uint64_t)data; - int elem = -1; assert(size > 0); - // Look for an already existing MR - for (int i=0; imrPool[i].mr == NULL) continue; - uint64_t regAddr = (uint64_t)verbs->mrPool[i].mr->addr; - uint64_t regSize = (uint64_t)verbs->mrPool[i].mr->length; - if (regAddr <= addr && addr+size <= regAddr+regSize) { - *mrRet = verbs->mrPool+i; - verbs->mrPool[i].refcnt++; - return ncclSuccess; - } - } - - // Find an unused element - if (elem == -1) { - elem = (verbs->mrRotation++); - for (int i=0; imrPool[elem].refcnt > 0) elem++; else break; - } - if (verbs->mrPool[elem].refcnt > 0) { - WARN("NET/IB : memory register : no MR available"); - return ncclInternalError; - } - } - - assert(elem < MAX_REQUESTS); - assert(verbs->mrPool[elem].refcnt == 0); - // Deregister / register uint64_t regAddr = addr & (~(REG_ALIGN-1)); uint64_t regSize = addr+size - regAddr; regSize = ((regSize + REG_ALIGN-1) / REG_ALIGN ) * REG_ALIGN; - if (verbs->mrPool[elem].mr) NCCLCHECK(wrap_ibv_dereg_mr(verbs->mrPool[elem].mr)); - NCCLCHECK(wrap_ibv_reg_mr(&verbs->mrPool[elem].mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); - *mrRet = verbs->mrPool+elem; - verbs->mrPool[elem].refcnt++; - TRACE(NCCL_INIT,"elem %d regAddr %lx size %ld rkey %x", elem, regAddr, regSize, (verbs->mrPool+elem)->mr->rkey); + struct ibv_mr* mr; + NCCLCHECK(wrap_ibv_reg_mr(&mr, verbs->pd, (void*)regAddr, regSize, IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE|IBV_ACCESS_REMOTE_READ)); + *mhandle = (void*)mr; + TRACE(NCCL_INIT,"regAddr %lx size %ld rkey %x", regAddr, regSize, 
mr->rkey); return ncclSuccess; } -ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** request) { +ncclResult_t ncclIbDeregMr(void* comm, void* mhandle) { + NCCLCHECK(wrap_ibv_dereg_mr((struct ibv_mr*)mhandle)); + return ncclSuccess; +} + +ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { struct ncclIbSendComm* comm = (struct ncclIbSendComm*)sendComm; if (comm->ready == 0) NCCLCHECK(ncclSendCheck(comm)); if (comm->ready == 0) { *request = NULL; return ncclSuccess; } + struct ibv_mr* mr = (struct ibv_mr*)mhandle; + // Wait for the receiver to have posted the corresponding receive volatile struct ncclIbSendFifo* slot = comm->fifo + (comm->fifoHead%MAX_REQUESTS); volatile uint32_t * readyPtr = &slot->ready; @@ -641,7 +622,6 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(comm->reqs, &req)); - req->type = type; req->verbs = &comm->verbs; req->size = size; @@ -654,8 +634,7 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, int type, void** wr.sg_list = NULL; wr.num_sge = 0; } else { - NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr)); - sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey; + sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey; wr.sg_list = &sge; wr.num_sge = 1; } @@ -720,14 +699,15 @@ ncclResult_t ncclIbPostFifo(struct ncclIbRecvComm* comm, uint32_t rkey, uint64_t return ncclSuccess; } -ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** request) { +ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->ready == 0) NCCLCHECK(ncclRecvCheck(comm)); if (comm->ready == 0) { *request = NULL; return ncclSuccess; } + struct ibv_mr* mr = (struct ibv_mr*)mhandle; + struct 
ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(comm->reqs, &req)); - req->type = type; req->verbs = &comm->verbs; req->size = size; @@ -739,10 +719,8 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** if (size == 0) { wr.sg_list = NULL; wr.num_sge = 0; - req->ibMr = NULL; } else { - NCCLCHECK(ncclIbGetMr(&comm->verbs, data, size, &req->ibMr)); - sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=req->ibMr->mr->lkey; + sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey; wr.sg_list = &sge; wr.num_sge = 1; } @@ -752,25 +730,25 @@ ncclResult_t ncclIbIrecv(void* recvComm, void* data, int size, int type, void** *request = req; // Post to FIFO to notify sender - NCCLCHECK(ncclIbPostFifo(comm, req->ibMr->mr->rkey, (uint64_t)data, size)); + NCCLCHECK(ncclIbPostFifo(comm, mr->rkey, (uint64_t)data, size)); return ncclSuccess; } -ncclResult_t ncclIbFlush(void* recvComm, void* data, int size) { +ncclResult_t ncclIbFlush(void* recvComm, void* data, int size, void* mhandle) { struct ncclIbRecvComm* comm = (struct ncclIbRecvComm*)recvComm; if (comm->gpuFlush.enabled == 0 || size == 0) return ncclSuccess; struct ncclIbRequest* req; NCCLCHECK(ncclIbGetRequest(comm->reqs, &req)); req->verbs = &comm->verbs; - NCCLCHECK(ncclIbGetMr(&comm->verbs, data, 1, &req->ibMr)); + struct ibv_mr* mr = (struct ibv_mr*)mhandle; struct ibv_send_wr wr; memset(&wr, 0, sizeof(wr)); wr.wr_id = (uint64_t)req; wr.wr.rdma.remote_addr = (uint64_t)data; - wr.wr.rdma.rkey = req->ibMr->mr->rkey; + wr.wr.rdma.rkey = mr->rkey; wr.sg_list = &comm->gpuFlush.sge; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_READ; @@ -800,32 +778,31 @@ ncclResult_t ncclIbTest(void* request, int* done, int* size) { } int wrDone = 0; - struct ibv_wc wc; - NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 1, &wc, &wrDone)); + struct ibv_wc wcs[4]; + NCCLCHECK(wrap_ibv_poll_cq(r->verbs->cq, 4, wcs, &wrDone)); if (wrDone == 0) return ncclSuccess; - if (wc.status != IBV_WC_SUCCESS) { 
- WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc.status, wc.opcode, wc.byte_len, wc.vendor_err); - return ncclSystemError; - } + for (int w=0; wstatus != IBV_WC_SUCCESS) { + WARN("NET/IB : Got completion with error %d, opcode %d, len %d, vendor err %d", wc->status, wc->opcode, wc->byte_len, wc->vendor_err); + return ncclSystemError; + } - struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc.wr_id; - if (doneReq) { - if (wc.opcode == IBV_WC_RECV) { - doneReq->size = wc.byte_len; + struct ncclIbRequest* doneReq = (struct ncclIbRequest*)wc->wr_id; + if (doneReq) { + if (wc->opcode == IBV_WC_RECV) { + doneReq->size = wc->byte_len; #if USE_RDMA_WRITE - } else if (wc.opcode == IBV_WC_RECV_RDMA_WITH_IMM) { - doneReq->size = wc.imm_data; + } else if (wc->opcode == IBV_WC_RECV_RDMA_WITH_IMM) { + doneReq->size = wc->imm_data; #endif - } - if (doneReq->ibMr != NULL) { - doneReq->ibMr->refcnt--; - if (doneReq->ibMr->refcnt < 0) WARN("NET/IB : doneReq %p MR %p refcount now %d", doneReq, doneReq->ibMr, doneReq->ibMr->refcnt); - } - doneReq->done = 1; - if (doneReq->free == 1) { - // This is an internal (FIFO post) req. Free it immediately. - doneReq->used = 0; + } + doneReq->done = 1; + if (doneReq->free == 1) { + // This is an internal (FIFO post) req. Free it immediately. 
+ doneReq->used = 0; + } } } } @@ -837,12 +814,6 @@ ncclResult_t ncclIbCloseSend(void* sendComm) { close(comm->fd); if (comm->qp != NULL) NCCLCHECK(wrap_ibv_destroy_qp(comm->qp)); if (comm->fifoMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->fifoMr)); - for (int i=0; iverbs.mrPool[i].mr != NULL) { - if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : TX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt); - NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr)); - } - } NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); free(comm); } @@ -859,12 +830,6 @@ ncclResult_t ncclIbCloseRecv(void* recvComm) { if (comm->gpuFlush.hostMr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->gpuFlush.hostMr)); } if (comm->remFifo.mr != NULL) NCCLCHECK(wrap_ibv_dereg_mr(comm->remFifo.mr)); - for (int i=0; iverbs.mrPool[i].mr != NULL) { - if (comm->verbs.mrPool[i].refcnt != 0) WARN("NET/IB : RX MR #%d has non-zero (%d) refcnt", i, comm->verbs.mrPool[i].refcnt); - NCCLCHECK(wrap_ibv_dereg_mr(comm->verbs.mrPool[i].mr)); - } - } NCCLCHECK(ncclIbDestroyVerbs(&comm->verbs)); free(comm); } @@ -889,6 +854,8 @@ ncclNet_t ncclNetIb = { ncclIbListen, ncclIbConnect, ncclIbAccept, + ncclIbRegMr, + ncclIbDeregMr, ncclIbIsend, ncclIbIrecv, ncclIbFlush, diff --git a/projects/rccl/src/transport/net_socket.cu b/projects/rccl/src/transport/net_socket.cu index 1efee15dda..0464b43482 100644 --- a/projects/rccl/src/transport/net_socket.cu +++ b/projects/rccl/src/transport/net_socket.cu @@ -27,10 +27,19 @@ ncclResult_t ncclSocketInit(ncclDebugLogger_t logFunction) { pthread_mutex_lock(&ncclSocketLock); if (ncclNetIfs == -1) { ncclNetIfs = findInterfaces(ncclNetIfNames, ncclNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); - INFO(NCCL_INIT|NCCL_NET,"NET/Socket : %d interfaces found", ncclNetIfs); if (ncclNetIfs <= 0) { WARN("NET/Socket : no interface found"); return ncclInternalError; + } else { + char line[1024]; + char addrline[1024]; + line[0] = '\0'; + for (int i=0; iconnectAddr, MAX_IF_NAME_SIZE, 1) <= 0) { - 
WARN("No usable listening interface found"); + WARN("NET/Socket : No usable listening interface found"); return ncclSystemError; } // pass the local address back @@ -205,21 +214,24 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { return ncclSuccess; } -ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, int type, void** request) { - if (type != NCCL_PTR_HOST) return ncclInternalError; +ncclResult_t ncclSocketRegMr(void* comm, void* data, int size, int type, void** mhandle) { + return (type != NCCL_PTR_HOST) ? ncclInternalError : ncclSuccess; +} +ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; } + +ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm; NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request)); return ncclSuccess; } -ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, int type, void** request) { - if (type != NCCL_PTR_HOST) return ncclInternalError; +ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm; NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request)); return ncclSuccess; } -ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size) { +ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle) { // We don't support CUDA pointers, so we don't need a flush operation return ncclInternalError; } @@ -243,6 +255,8 @@ ncclNet_t ncclNetSocket = { ncclSocketListen, ncclSocketConnect, ncclSocketAccept, + ncclSocketRegMr, + ncclSocketDeregMr, ncclSocketIsend, ncclSocketIrecv, ncclSocketFlush, diff --git a/projects/rccl/src/transport/p2p.cu b/projects/rccl/src/transport/p2p.cu index 
6c4626a77c..9f3e0b6558 100644 --- a/projects/rccl/src/transport/p2p.cu +++ b/projects/rccl/src/transport/p2p.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,18 +11,9 @@ #include "param.h" #include #include -#include "nvmlwrap.h" #include #include "nvlink.h" -struct p2pInfo { - int rank; - int cudaDev; - uint64_t hostHash; - uint64_t pidHash; - char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; -}; - struct p2pConnectInfo { int direct; union { @@ -31,36 +22,40 @@ struct p2pConnectInfo { }; }; +struct p2pSendResources { + struct ncclSendMem* devMem; + void* ipcPtr; +}; + +struct p2pRecvResources { + struct ncclRecvMem* devMem; + void* ipcPtr; +}; + #include -/* Fill information necessary to exchange between ranks to choose whether or not - * to use this transport */ -ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) { - struct p2pInfo* info = (struct p2pInfo*)opaqueInfo; - static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large"); - info->rank = rank; - CUDACHECK(cudaGetDevice(&info->cudaDev)); - info->hostHash=getHostHash(); - info->pidHash=getPidHash(); - - // Get PCI Bus Id. We need to get the bus ID through CUDA first, since the - // cudaDev is a CUDA runtime dev number which could be different from the - // NVML device number. Then we get the busID from NVML to be sure it is - // consistent with NVML remote PCI bus Ids. 
- CUDACHECK(cudaDeviceGetPCIBusId(info->busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, info->cudaDev)); - nvmlDevice_t nvmlDevice; - NCCLCHECK(wrapNvmlDeviceGetHandleByPciBusId(info->busId, &nvmlDevice)); - nvmlPciInfo_t pciInfo; - NCCLCHECK(wrapNvmlDeviceGetPciInfo(nvmlDevice, &pciInfo)); - strncpy(info->busId, pciInfo.busId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE); - return ncclSuccess; -} - NCCL_PARAM(P2pLevel, "P2P_LEVEL", -2); NCCL_PARAM(P2pDisable, "P2P_DISABLE", -2); +/* Convert a PCI busId string into a local cudaDev device index (cf. CUDA_VISIBLE_DEVICES) */ +static int busIdToCudaDev(const char* busId) { + int ndev; + if (cudaGetDeviceCount(&ndev) != cudaSuccess) + return -1; + for (int i = 0; i < ndev; i++) { + char devBusId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; + if (cudaDeviceGetPCIBusId(devBusId, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE, i) != cudaSuccess) + return -1; + if (strcmp(busId, devBusId) == 0) { + return i; + } + } + // BusId was not found in our locally visible CUDA devices + return -1; +} + /* Determine if we can communicate with the peer through p2p */ -ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) { +ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { // Do not use P2P across root complexes by default (provided CUDA permits it) int p2pLevel = PATH_SOC; if (ncclParamP2pDisable() == 1) p2pLevel = 0; @@ -70,23 +65,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin if (p2pLevel == 0) return ncclSuccess; - struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo; - struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo; - // Rule out different nodes if (myInfo->hostHash != peerInfo->hostHash) return ncclSuccess; + // Convert the peer's busId into a local cudaDev index (cf. 
CUDA_VISIBLE_DEVICES) + int peerCudaDev = busIdToCudaDev(peerInfo->busId); + if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process + + TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); + // Do not detect topology if we're on the same GPU. Note this is not really supported. - if (myInfo->cudaDev == peerInfo->cudaDev) { + if (myInfo->cudaDev == peerCudaDev) { *ret = 1 + PATH_SOC; return ncclSuccess; } // See if CUDA can do P2P int p2p; - if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerInfo->cudaDev) != cudaSuccess) { - INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d and dev %d", - myInfo->cudaDev, peerInfo->cudaDev); + if (cudaDeviceCanAccessPeer(&p2p, myInfo->cudaDev, peerCudaDev) != cudaSuccess) { + INFO(NCCL_INIT|NCCL_P2P,"peer query failed between dev %d(=%d) and dev %d(=%d)", + myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); return ncclSuccess; } if (p2p == 0) return ncclSuccess; @@ -102,7 +100,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTin char* myPath; char* peerPath; ncclResult_t err1 = getCudaPath(myInfo->cudaDev, &myPath); - ncclResult_t err2 = getCudaPath(peerInfo->cudaDev, &peerPath); + ncclResult_t err2 = getCudaPath(peerCudaDev, &peerPath); if (err1 == ncclSuccess && err2 == ncclSuccess) { int distance = pciDistance(myPath, peerPath); if (distance < p2pLevel) { @@ -174,8 +172,8 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR static inline int copyRings(int nranks, int* rings, int nrings, int newNrings) { if (nrings == 0) return 0; // Copy rings by dup times - if (newNrings > MAXRINGS) { - newNrings = MAXRINGS; + if (newNrings > MAXCHANNELS) { + newNrings = MAXCHANNELS; } for (int r=nrings; r MAXRINGS) { - WARN("Max rings reached, limiting to %d", MAXRINGS); - nrings = MAXRINGS; + if (nrings > MAXCHANNELS) { 
+ WARN("Max rings reached, limiting to %d", MAXCHANNELS); + nrings = MAXCHANNELS; } // Find existing constraints / connections int connect = 0; @@ -239,9 +236,9 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin if (compNrings && compNrings < nrings && nranks <= 4) { // Try to oversubscribe to get a better result - int *rings2 = (int *)malloc(sizeof(int)*MAXRINGS*nranks); - if (rings2 == NULL) { WARN("malloc of %ld bytes failed", sizeof(int)*MAXRINGS*nranks); return 0; } - for (int i=0; i compNrings*2) { @@ -255,7 +252,6 @@ int p2pComputeRingsNvLink(ncclTvalue_t* values, int nranks, int* rings, int nrin // Duplicate the rings for direct NVLink compNrings = copyRings(nranks, rings, compNrings, compNrings*2); - if (ncclCudaCompCap() == 6) *nthreads /= 2; return compNrings; } @@ -367,8 +363,8 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, ncclResult_t p2pGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { if (*nringsRet == 0) return ncclSuccess; int *rings; - NCCLCHECK(ncclCalloc(&rings, MAXRINGS*nranks)); - for (int i=0; itransportResources = resources; + const int sendSize = sizeof(struct ncclSendMem); + NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, sendSize)); + struct p2pConnectInfo info; if (myInfo->pidHash == peerInfo->pidHash) { info.direct = 1; - info.directPtr = ring->devMemSend; + info.directPtr = resources->devMem; if (myInfo->cudaDev == peerInfo->cudaDev) { - INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", ring->id, myInfo->rank, peerInfo->rank); + INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d -> %d via P2P/common device", channelId, myInfo->rank, peerInfo->rank); } else { // Enable P2P access cudaError_t err = cudaDeviceEnablePeerAccess(peerInfo->cudaDev, 0); if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); } else if (err != cudaSuccess) { - WARN("failed to 
peer with device %d: %d %s", - peerInfo->cudaDev, err, cudaGetErrorString(err)); + WARN("failed to peer with device %d(=%d): %d %s", + peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); return ncclInternalError; } INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/direct pointer", - ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); } } else { + // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) + int peerCudaDev = busIdToCudaDev(peerInfo->busId); info.direct = 0; // Map IPC and enable P2P access - cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemSend); + cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); if (err != cudaSuccess) { - WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s", - myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err)); + WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s", + myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); return ncclInternalError; } INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/IPC", - ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); //TRACE_DUMP_IPC(&info.devIpc); } static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); @@ -487,13 +491,19 @@ ncclResult_t p2pSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo } /* Create and return connect structures for this peer to connect to me */ -ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { - struct p2pInfo* myInfo = (struct p2pInfo*)myOpaqueInfo; - struct p2pInfo* peerInfo = (struct p2pInfo*)peerOpaqueInfo; +ncclResult_t 
p2pRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, + struct ncclConnect* connectInfo, struct ncclConnector * recv, int buffSize, int channelId) { + + struct p2pRecvResources* resources; + NCCLCHECK(ncclCalloc(&resources, 1)); + recv->transportResources = resources; + const int recvSize = offsetof(struct ncclRecvMem, buff)+buffSize; + NCCLCHECK(ncclCudaCalloc((char**)&resources->devMem, recvSize)); + struct p2pConnectInfo info; if (myInfo->pidHash == peerInfo->pidHash) { info.direct = 1; - info.directPtr = ring->devMemRecv; + info.directPtr = resources->devMem; if (myInfo->cudaDev == peerInfo->cudaDev) { TRACE(NCCL_INIT|NCCL_P2P,"%d <- %d via P2P/common device", myInfo->rank, peerInfo->rank); } else { @@ -502,22 +512,24 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo if (err == cudaErrorPeerAccessAlreadyEnabled) { cudaGetLastError(); } else if (err != cudaSuccess) { - WARN("failed to peer with device %d: %d %s", - peerInfo->cudaDev, err, cudaGetErrorString(err)); + WARN("failed to peer with device %d(=%d): %d %s", + peerInfo->cudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); return ncclInternalError; } - TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/direct pointer", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); } } else { + // Convert the peer's busId into a local cudaDev index (cf. 
CUDA_VISIBLE_DEVICES) + int peerCudaDev = busIdToCudaDev(peerInfo->busId); info.direct = 0; // Map IPC and enable P2P access - cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)ring->devMemRecv); + cudaError_t err = cudaIpcGetMemHandle(&info.devIpc, (void*)resources->devMem); if (err != cudaSuccess) { - WARN("rank %d failed to get CUDA IPC handle to device %d : %d %s", - myInfo->rank, peerInfo->cudaDev, err, cudaGetErrorString(err)); + WARN("rank %d failed to get CUDA IPC handle to device %d(=%d) : %d %s", + myInfo->rank, peerCudaDev, peerInfo->nvmlDev, err, cudaGetErrorString(err)); return ncclInternalError; } - TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + TRACE(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] <- %d[%d] via P2P/IPC", channelId, myInfo->rank, myInfo->nvmlDev, peerInfo->rank, peerInfo->nvmlDev); //TRACE_DUMP_IPC(&info.devIpc); } static_assert(sizeof(struct p2pConnectInfo) <= sizeof(struct ncclConnect), "p2p Connect Info is too big"); @@ -527,22 +539,16 @@ ncclResult_t p2pRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo /* Connect/Send to this peer */ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { - void** resources = &send->transportResources; + struct p2pSendResources* resources = (struct p2pSendResources*)send->transportResources; struct ncclRecvMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; if (info->direct) { remDevMem = (struct ncclRecvMem*)(info->directPtr); send->conn.direct = 1; - *resources = NULL; } else { - void* remPtr = NULL; //TRACE_DUMP_IPC(&info->devIpc); - cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess); - void** ipcPtrSave; - NCCLCHECK(ncclCalloc(&ipcPtrSave, 1)); - *resources = ipcPtrSave; - *ipcPtrSave = remPtr; - remDevMem = (struct ncclRecvMem*)remPtr; + cudaError_t err = 
cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess); + remDevMem = (struct ncclRecvMem*)resources->ipcPtr; if (err != cudaSuccess) { WARN("failed to open CUDA IPC handle : %d %s", err, cudaGetErrorString(err)); @@ -553,30 +559,26 @@ static ncclResult_t p2pSendConnect(struct ncclConnect* connectInfo, struct ncclC send->conn.buff = remDevMem->buff; send->conn.llBuff = remDevMem->llBuff; send->conn.tail = &remDevMem->tail; - send->conn.opCount = &remDevMem->opCount; - // send->conn->head should have been set to devMemSend already + send->conn.opCountRem = &remDevMem->opCount; + send->conn.head = &resources->devMem->head; + send->conn.ptrExchange = &resources->devMem->ptrExchange; + send->conn.opCountLoc = &resources->devMem->opCount; return ncclSuccess; } /* Connect/Recv from this peer */ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { - void** resources = &recv->transportResources; + struct p2pRecvResources* resources = (struct p2pRecvResources*)recv->transportResources; struct ncclSendMem* remDevMem; struct p2pConnectInfo* info = (struct p2pConnectInfo*)connectInfo; if (info->direct) { remDevMem = (struct ncclSendMem*)(info->directPtr); recv->conn.direct = 1; recv->conn.ptrExchange = &remDevMem->ptrExchange; - *resources = NULL; } else { - void* remPtr = NULL; //TRACE_DUMP_IPC(&info->devIpc); - cudaError_t err = cudaIpcOpenMemHandle(&remPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess); - void** ipcPtrSave; - NCCLCHECK(ncclCalloc(&ipcPtrSave, 1)); - *resources = ipcPtrSave; - *ipcPtrSave = remPtr; - remDevMem = (struct ncclSendMem*)remPtr; + cudaError_t err = cudaIpcOpenMemHandle(&resources->ipcPtr, info->devIpc, cudaIpcMemLazyEnablePeerAccess); + remDevMem = (struct ncclSendMem*)resources->ipcPtr; if (err != cudaSuccess) { WARN("failed to open CUDA IPC handle : %d %s", err, cudaGetErrorString(err)); @@ -584,28 +586,35 @@ ncclResult_t p2pRecvConnect(struct ncclConnect* connectInfo, struct 
ncclConnecto } } - // recv->conn->buff should have been set to devMemRecv already - // recv->conn->tail should have been set to devMemRecv already - // recv->conn->opCount should have been set to devMemRecv already + recv->conn.buff = resources->devMem->buff; + recv->conn.llBuff = resources->devMem->llBuff; + recv->conn.tail = &resources->devMem->tail; + recv->conn.opCountLoc = &resources->devMem->opCount; recv->conn.head = &remDevMem->head; - recv->conn.llHead = &remDevMem->llHead; + recv->conn.opCountRem = &remDevMem->opCount; return ncclSuccess; } -ncclResult_t p2pFree(void* resources) { - if (resources != NULL) { - void** ipcPtrSave = (void**) resources; - CUDACHECK(cudaIpcCloseMemHandle(*ipcPtrSave)); - free(resources); - } +ncclResult_t p2pSendFree(void* resources) { + struct p2pSendResources* sendRes = (struct p2pSendResources*)resources; + if (sendRes->ipcPtr) + CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr)); + CUDACHECK(cudaFree(sendRes->devMem)); + return ncclSuccess; +} + +ncclResult_t p2pRecvFree(void* resources) { + struct p2pRecvResources* recvRes = (struct p2pRecvResources*)resources; + if (recvRes->ipcPtr) + CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr)); + CUDACHECK(cudaFree(recvRes->devMem)); return ncclSuccess; } struct ncclTransport p2pTransport = { "P2P", - p2pFillInfo, p2pCanConnect, p2pGetRings, - { p2pSendSetup, p2pSendConnect, p2pFree, NULL }, - { p2pRecvSetup, p2pRecvConnect, p2pFree, NULL } + { p2pSendSetup, p2pSendConnect, p2pSendFree, NULL }, + { p2pRecvSetup, p2pRecvConnect, p2pRecvFree, NULL } }; diff --git a/projects/rccl/src/transport/shm.cu b/projects/rccl/src/transport/shm.cu index 317f652dac..56e0242af2 100644 --- a/projects/rccl/src/transport/shm.cu +++ b/projects/rccl/src/transport/shm.cu @@ -12,13 +12,6 @@ #include #include -struct shmInfo { - int rank; - int cudaDev; - uint64_t hostHash; - uint64_t pidHash; -}; - struct shmSendConnectInfo { uint64_t pidHash; int id; @@ -51,24 +44,10 @@ struct shmRecvResources { struct 
ncclRecvMem* devHostMem; }; -/* Fill information necessary to exchange between ranks to choose whether or not - * to use this transport */ -ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank) { - struct shmInfo* info = (struct shmInfo*)opaqueInfo; - static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large"); - info->rank = rank; - CUDACHECK(cudaGetDevice(&info->cudaDev)); - info->hostHash=getHostHash(); - info->pidHash=getPidHash(); - return ncclSuccess; -} - NCCL_PARAM(ShmDisable, "SHM_DISABLE", 0); /* Determine if we can communicate with the peer */ -ncclResult_t shmCanConnect(ncclTvalue_t* ret, ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo) { - struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo; - struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo; +ncclResult_t shmCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { *ret = ((ncclParamShmDisable() == 1) || (myInfo->hostHash != peerInfo->hostHash)) ? 
0 : 1; return ncclSuccess; } @@ -88,7 +67,7 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) } ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { - if (*nringsRet == MAXRINGS) *nringsRet = 1; + if (*nringsRet == MAXCHANNELS) *nringsRet = 1; int nGroups = groups[nranks-1] + 1; int starts[nGroups]; int ends[nGroups]; @@ -156,43 +135,40 @@ ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* #define MAX_SHM_NAME_LEN 1024 /* Create and return connect structures for this peer to connect to me */ -ncclResult_t shmSendSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { - struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo; - struct shmInfo* peerInfo = (struct shmInfo*)peerOpaqueInfo; +ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int buffSize, int channelId) { struct shmSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); - ring->send.transportResources = resources; + send->transportResources = resources; struct shmRecvConnectInfo info; char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank); + sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank); info.shmSize = resources->shmSize = sizeof(struct ncclSendMem); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", ring->id, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); - info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; + 
INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); + info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo)); return ncclSuccess; } -ncclResult_t shmRecvSetup(ncclTinfo_t* myOpaqueInfo, ncclTinfo_t* peerOpaqueInfo, struct ncclConnect* connectInfo, struct ncclRing* ring) { - struct shmInfo* myInfo = (struct shmInfo*)myOpaqueInfo; +ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int buffSize, int channelId) { struct shmRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); - ring->recv.transportResources = resources; + recv->transportResources = resources; struct shmSendConnectInfo info; char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, ring->id, myInfo->rank); - info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+ring->buffSize; + sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank); + info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize; TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - info.id = ring->id; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; + info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big"); memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo)); return ncclSuccess; @@ -216,10 +192,10 @@ ncclResult_t shmSendConnect(struct 
ncclConnect* connectInfo, struct ncclConnecto send->conn.buff = resources->devRemHostMem->buff; send->conn.llBuff = resources->devRemHostMem->llBuff; send->conn.tail = &resources->devRemHostMem->tail; - send->conn.opCount = &resources->devRemHostMem->opCount; + send->conn.opCountRem = &resources->devRemHostMem->opCount; send->conn.head = &resources->devHostMem->head; - send->conn.llHead = &resources->devHostMem->llHead; + send->conn.opCountLoc = &resources->devHostMem->opCount; return ncclSuccess; } @@ -235,12 +211,12 @@ ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnecto NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); NCCLCHECK(shmUnlink(shmName)); recv->conn.head = &resources->devRemHostMem->head; - recv->conn.llHead = &resources->devRemHostMem->llHead; + recv->conn.opCountRem = &resources->devRemHostMem->opCount; recv->conn.buff = resources->devHostMem->buff; recv->conn.llBuff = resources->devHostMem->llBuff; recv->conn.tail = &resources->devHostMem->tail; - recv->conn.opCount = &resources->devHostMem->opCount; + recv->conn.opCountLoc = &resources->devHostMem->opCount; return ncclSuccess; } @@ -262,7 +238,6 @@ ncclResult_t shmRecvFree(void* transportResources) { struct ncclTransport shmTransport = { "SHM", - shmFillInfo, shmCanConnect, shmGetRings, { shmSendSetup, shmSendConnect, shmSendFree, NULL }, From ab2dd12f3e907c3457ff7e346dac048d477ca57b Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Mon, 4 Mar 2019 11:42:47 -0800 Subject: [PATCH 02/20] Fix crash during shared memory creation (#185) The shared memory filename was only based on the destination. While this was OK for rings since only one rank would send data to a given rank, it would crash with trees because they communicate in both directions. 
Co-authored-by: Rong Ou [ROCm/rccl commit: 14e0cf644b9ba2214f2b6d2e299e8218f6145d32] --- projects/rccl/src/transport/shm.cu | 47 +++++++++++++++--------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/projects/rccl/src/transport/shm.cu b/projects/rccl/src/transport/shm.cu index 56e0242af2..83cc9d1830 100644 --- a/projects/rccl/src/transport/shm.cu +++ b/projects/rccl/src/transport/shm.cu @@ -12,17 +12,11 @@ #include #include -struct shmSendConnectInfo { +struct shmConnectInfo { uint64_t pidHash; int id; - int rank; - int shmSize; -}; - -struct shmRecvConnectInfo { - uint64_t pidHash; - int id; - int rank; + int sendRank; + int recvRank; int shmSize; }; @@ -141,17 +135,21 @@ ncclResult_t shmSendSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; - struct shmRecvConnectInfo info; + struct shmConnectInfo info; + info.id = channelId; + info.pidHash = myInfo->pidHash; + info.sendRank = myInfo->rank; + info.recvRank = peerInfo->rank; + char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank); + sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); info.shmSize = resources->shmSize = sizeof(struct ncclSendMem); TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%d] -> %d[%d] via direct shared memory", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); - info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; - static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmRecvConnectInfo)); + static_assert(sizeof(struct shmConnectInfo) <= 
sizeof(struct ncclConnect), "shm Connect Recv Info is too big"); + memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); return ncclSuccess; } @@ -160,28 +158,31 @@ ncclResult_t shmRecvSetup(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peer NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; - struct shmSendConnectInfo info; + struct shmConnectInfo info; + info.id = channelId; + info.pidHash = myInfo->pidHash; + info.sendRank = peerInfo->rank; + info.recvRank = myInfo->rank; char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", myInfo->pidHash, channelId, myInfo->rank); + sprintf(shmName, "nccl-shm-recv-%lx-%d-%d-%d", info.pidHash, info.id, info.sendRank, info.recvRank); info.shmSize = resources->shmSize = offsetof(struct ncclRecvMem, buff)+buffSize; TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info.shmSize); NCCLCHECK(shmOpen(shmName, resources->shmSize, (void**)&resources->hostMem, (void**)&resources->devHostMem, 1)); - info.id = channelId; info.rank = myInfo->rank; info.pidHash = myInfo->pidHash; - static_assert(sizeof(struct shmRecvConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big"); - memcpy(connectInfo, &info, sizeof(struct shmSendConnectInfo)); + static_assert(sizeof(struct shmConnectInfo) <= sizeof(struct ncclConnect), "shm Connect Send Info is too big"); + memcpy(connectInfo, &info, sizeof(struct shmConnectInfo)); return ncclSuccess; } /* Connect to this peer */ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnector* send) { // Setup device pointers - struct shmSendConnectInfo* info = (struct shmSendConnectInfo*)connectInfo; + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; struct shmSendResources* resources = (struct shmSendResources*)send->transportResources; char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-recv-%lx-%d-%d", info->pidHash, info->id, info->rank); + sprintf(shmName, 
"nccl-shm-recv-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); resources->remShmSize = info->shmSize; TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); @@ -202,10 +203,10 @@ ncclResult_t shmSendConnect(struct ncclConnect* connectInfo, struct ncclConnecto ncclResult_t shmRecvConnect(struct ncclConnect* connectInfo, struct ncclConnector* recv) { // Setup device pointers struct shmRecvResources* resources = (struct shmRecvResources*)recv->transportResources; - struct shmRecvConnectInfo* info = (struct shmRecvConnectInfo*)connectInfo; + struct shmConnectInfo* info = (struct shmConnectInfo*)connectInfo; char shmName[MAX_SHM_NAME_LEN]; - sprintf(shmName, "nccl-shm-send-%lx-%d-%d", info->pidHash, info->id, info->rank); + sprintf(shmName, "nccl-shm-send-%lx-%d-%d-%d", info->pidHash, info->id, info->sendRank, info->recvRank); resources->remShmSize = info->shmSize; TRACE(NCCL_SHM,"Open shmName %s shmSize %d", shmName, info->shmSize); NCCLCHECK(shmOpen(shmName, resources->remShmSize, (void**)&resources->remHostMem, (void**)&resources->devRemHostMem, 0)); From 17c8317cb1e0f803cc79c439eb15661114f3fdfe Mon Sep 17 00:00:00 2001 From: David Addison Date: Thu, 14 Mar 2019 19:39:20 -0700 Subject: [PATCH 03/20] NCCL 2.4.6-1 Added detection of IBM/Power NVLink bridge device. Add NUMA support to PCI distance calculations. Added NCCL_IGNORE_CPU_AFFINITY env var. Fix memory leaks; GithubIssue#180 Compiler warning fix; GithubIssue#178 Replace non-standard variable length arrays. GithubIssue#171 Fix Tree+Shared Memory crash. GithubPR#185 Fix LL cleanup hang during long running DL jobs. Fix NCCL_RINGS environment variable handling. Added extra checks to catch repeat calls to ncclCommDestroy() GithubIssue#191 Improve bootstrap socket connection reliability at scale. Fix hostname hashing issue. 
GithubIssue#187 Code cleanup to rename all non device files from *.cu to *.cc [ROCm/rccl commit: f40ce73e8987d2990e4b9ef6c75f4b3423acce78] --- projects/rccl/LICENSE.txt | 2 +- projects/rccl/Makefile | 2 +- projects/rccl/README.md | 2 +- projects/rccl/ext-net/dummy/Makefile | 2 +- projects/rccl/ext-net/dummy/plugin.c | 2 +- projects/rccl/makefiles/common.mk | 8 +- projects/rccl/makefiles/formatting.mk | 2 +- projects/rccl/makefiles/version.mk | 2 +- projects/rccl/pkg/Makefile | 2 +- projects/rccl/pkg/debian/Makefile | 2 +- projects/rccl/pkg/redhat/Makefile | 2 +- projects/rccl/pkg/srctxz/Makefile | 2 +- projects/rccl/pkg/srctxz/create_srctxz.sh.in | 2 +- projects/rccl/pkg/txz/Makefile | 2 +- projects/rccl/pkg/txz/create_txz.sh.in | 2 +- projects/rccl/src/Makefile | 20 +- .../rccl/src/{bootstrap.cu => bootstrap.cc} | 2 +- projects/rccl/src/{channel.cu => channel.cc} | 7 +- .../{all_gather.cu => all_gather.cc} | 2 +- .../{all_reduce.cu => all_reduce.cc} | 2 +- .../{broadcast.cu => broadcast.cc} | 2 +- projects/rccl/src/collectives/collectives.h | 2 +- projects/rccl/src/collectives/device/Makefile | 2 +- .../rccl/src/collectives/device/all_gather.cu | 2 +- .../rccl/src/collectives/device/all_gather.h | 8 +- .../rccl/src/collectives/device/all_reduce.cu | 2 +- .../rccl/src/collectives/device/all_reduce.h | 12 +- .../rccl/src/collectives/device/broadcast.cu | 2 +- .../rccl/src/collectives/device/broadcast.h | 8 +- projects/rccl/src/collectives/device/common.h | 6 +- .../src/collectives/device/common_kernel.h | 4 +- .../rccl/src/collectives/device/functions.cu | 4 +- .../rccl/src/collectives/device/gen_rules.sh | 2 +- .../rccl/src/collectives/device/primitives.h | 54 +-- .../rccl/src/collectives/device/reduce.cu | 2 +- projects/rccl/src/collectives/device/reduce.h | 8 +- .../src/collectives/device/reduce_scatter.cu | 2 +- .../src/collectives/device/reduce_scatter.h | 8 +- .../src/collectives/{reduce.cu => reduce.cc} | 2 +- .../{reduce_scatter.cu => reduce_scatter.cc} | 2 
+- projects/rccl/src/{enqueue.cu => enqueue.cc} | 8 +- projects/rccl/src/include/alloc.h | 51 +++ projects/rccl/src/include/argcheck.h | 15 + projects/rccl/src/include/bootstrap.h | 2 +- projects/rccl/src/include/channel.h | 2 +- projects/rccl/src/include/checks.h | 71 ++- projects/rccl/src/include/comm.h | 127 ++++++ projects/rccl/src/include/core.h | 423 +----------------- projects/rccl/src/include/cpuset.h | 2 +- projects/rccl/src/include/debug.h | 6 +- projects/rccl/src/include/devcomm.h | 194 ++++++++ projects/rccl/src/include/enqueue.h | 8 +- projects/rccl/src/include/ibvwrap.h | 2 +- projects/rccl/src/include/info.h | 45 ++ projects/rccl/src/include/nccl_net.h | 7 +- projects/rccl/src/include/net.h | 2 +- projects/rccl/src/include/nvlink.h | 14 +- projects/rccl/src/include/nvmlwrap.h | 2 +- projects/rccl/src/include/param.h | 3 +- projects/rccl/src/include/rings.h | 2 +- projects/rccl/src/include/shm.h | 2 +- projects/rccl/src/include/socket.h | 21 +- projects/rccl/src/include/topo.h | 60 +-- projects/rccl/src/include/transport.h | 7 +- projects/rccl/src/include/trees.h | 2 +- projects/rccl/src/include/utils.h | 4 +- projects/rccl/src/{init.cu => init.cc} | 111 +++-- .../rccl/src/misc/{checks.cu => argcheck.cc} | 4 +- projects/rccl/src/misc/{group.cu => group.cc} | 4 +- .../rccl/src/misc/{ibvwrap.cu => ibvwrap.cc} | 2 +- .../src/misc/{nvmlwrap.cu => nvmlwrap.cc} | 2 +- projects/rccl/src/misc/{rings.cu => rings.cc} | 6 +- projects/rccl/src/misc/topo.cc | 51 +++ projects/rccl/src/misc/{trees.cu => trees.cc} | 2 +- projects/rccl/src/misc/{utils.cu => utils.cc} | 12 +- .../rccl/src/{transport.cu => transport.cc} | 0 .../rccl/src/transport/{net.cu => net.cc} | 52 +-- .../src/transport/{net_ib.cu => net_ib.cc} | 3 +- .../{net_socket.cu => net_socket.cc} | 2 +- .../rccl/src/transport/{p2p.cu => p2p.cc} | 40 +- .../rccl/src/transport/{shm.cu => shm.cc} | 8 +- 81 files changed, 892 insertions(+), 692 deletions(-) rename projects/rccl/src/{bootstrap.cu => 
bootstrap.cc} (99%) rename projects/rccl/src/{channel.cu => channel.cc} (91%) rename projects/rccl/src/collectives/{all_gather.cu => all_gather.cc} (92%) rename projects/rccl/src/collectives/{all_reduce.cu => all_reduce.cc} (92%) rename projects/rccl/src/collectives/{broadcast.cu => broadcast.cc} (94%) rename projects/rccl/src/collectives/{reduce.cu => reduce.cc} (92%) rename projects/rccl/src/collectives/{reduce_scatter.cu => reduce_scatter.cc} (92%) rename projects/rccl/src/{enqueue.cu => enqueue.cc} (97%) create mode 100644 projects/rccl/src/include/alloc.h create mode 100644 projects/rccl/src/include/argcheck.h create mode 100644 projects/rccl/src/include/comm.h create mode 100644 projects/rccl/src/include/devcomm.h create mode 100644 projects/rccl/src/include/info.h rename projects/rccl/src/{init.cu => init.cc} (93%) rename projects/rccl/src/misc/{checks.cu => argcheck.cc} (96%) rename projects/rccl/src/misc/{group.cu => group.cc} (98%) rename projects/rccl/src/misc/{ibvwrap.cu => ibvwrap.cc} (99%) rename projects/rccl/src/misc/{nvmlwrap.cu => nvmlwrap.cc} (99%) rename projects/rccl/src/misc/{rings.cu => rings.cc} (98%) create mode 100644 projects/rccl/src/misc/topo.cc rename projects/rccl/src/misc/{trees.cu => trees.cc} (98%) rename projects/rccl/src/misc/{utils.cu => utils.cc} (94%) rename projects/rccl/src/{transport.cu => transport.cc} (100%) rename projects/rccl/src/transport/{net.cu => net.cc} (93%) rename projects/rccl/src/transport/{net_ib.cu => net_ib.cc} (99%) rename projects/rccl/src/transport/{net_socket.cu => net_socket.cc} (99%) rename projects/rccl/src/transport/{p2p.cu => p2p.cc} (94%) rename projects/rccl/src/transport/{shm.cu => shm.cc} (98%) diff --git a/projects/rccl/LICENSE.txt b/projects/rccl/LICENSE.txt index 3593a7aa69..e318c66695 100644 --- a/projects/rccl/LICENSE.txt +++ b/projects/rccl/LICENSE.txt @@ -1,5 +1,5 @@ - Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2015-2019, NVIDIA CORPORATION. 
All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/projects/rccl/Makefile b/projects/rccl/Makefile index 605e3bfaad..caed3d42ac 100644 --- a/projects/rccl/Makefile +++ b/projects/rccl/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/README.md b/projects/rccl/README.md index fa5145323b..abfd1cd4db 100644 --- a/projects/rccl/README.md +++ b/projects/rccl/README.md @@ -89,4 +89,4 @@ $ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g ## Copyright -All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. diff --git a/projects/rccl/ext-net/dummy/Makefile b/projects/rccl/ext-net/dummy/Makefile index d1eb4c5a62..efa841c53c 100644 --- a/projects/rccl/ext-net/dummy/Makefile +++ b/projects/rccl/ext-net/dummy/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/ext-net/dummy/plugin.c b/projects/rccl/ext-net/dummy/plugin.c index f11b36590d..67d7d88411 100644 --- a/projects/rccl/ext-net/dummy/plugin.c +++ b/projects/rccl/ext-net/dummy/plugin.c @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/makefiles/common.mk b/projects/rccl/makefiles/common.mk index d0e2ca847d..2ad5c73200 100644 --- a/projects/rccl/makefiles/common.mk +++ b/projects/rccl/makefiles/common.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -15,6 +15,7 @@ PROFAPI ?= 0 NVCC = $(CUDA_HOME)/bin/nvcc CUDA_LIB ?= $(CUDA_HOME)/lib64 +CUDA_INC ?= $(CUDA_HOME)/include CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) #CUDA_VERSION ?= $(shell ls $(CUDA_LIB)/libcudart.so.* | head -1 | rev | cut -d "." -f -2 | rev) CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1) @@ -43,7 +44,8 @@ endif #$(info NVCC_GENCODE is ${NVCC_GENCODE}) CXXFLAGS := -DCUDA_MAJOR=$(CUDA_MAJOR) -DCUDA_MINOR=$(CUDA_MINOR) -fPIC -fvisibility=hidden -CXXFLAGS += -Wall -Wno-sign-compare +CXXFLAGS += -Wall -Wno-unused-function -Wno-sign-compare -std=c++11 -Wvla +CXXFLAGS += -I $(CUDA_INC) NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -lineinfo -std=c++11 -Xptxas -maxrregcount=96 -Xfatbin -compress-all # Use addprefix so that we can specify more than one path NVLDFLAGS := -L${CUDA_LIB} -lcudart -lrt @@ -67,7 +69,7 @@ CXXFLAGS += -O0 -g -ggdb3 endif ifneq ($(VERBOSE), 0) -NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra +NVCUFLAGS += -Xptxas -v -Xcompiler -Wall,-Wextra,-Wno-unused-parameter CXXFLAGS += -Wall -Wextra else .SILENT: diff --git a/projects/rccl/makefiles/formatting.mk b/projects/rccl/makefiles/formatting.mk index 4a4ab885cf..a543131d59 100644 --- a/projects/rccl/makefiles/formatting.mk +++ b/projects/rccl/makefiles/formatting.mk @@ -1,5 +1,5 @@ # -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk index a8c6e3ab03..7abaaaff22 100644 --- a/projects/rccl/makefiles/version.mk +++ b/projects/rccl/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 4 -NCCL_PATCH := 2 +NCCL_PATCH := 6 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/projects/rccl/pkg/Makefile b/projects/rccl/pkg/Makefile index 04b23da70e..ab6487be9b 100644 --- a/projects/rccl/pkg/Makefile +++ b/projects/rccl/pkg/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/debian/Makefile b/projects/rccl/pkg/debian/Makefile index 439635f948..7884cf2545 100644 --- a/projects/rccl/pkg/debian/Makefile +++ b/projects/rccl/pkg/debian/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/redhat/Makefile b/projects/rccl/pkg/redhat/Makefile index ffcc973bcd..0808478624 100644 --- a/projects/rccl/pkg/redhat/Makefile +++ b/projects/rccl/pkg/redhat/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/srctxz/Makefile b/projects/rccl/pkg/srctxz/Makefile index ed677fe3b1..01cab95a43 100644 --- a/projects/rccl/pkg/srctxz/Makefile +++ b/projects/rccl/pkg/srctxz/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/srctxz/create_srctxz.sh.in b/projects/rccl/pkg/srctxz/create_srctxz.sh.in index ae7d01f2ff..11bdd52db7 100644 --- a/projects/rccl/pkg/srctxz/create_srctxz.sh.in +++ b/projects/rccl/pkg/srctxz/create_srctxz.sh.in @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/txz/Makefile b/projects/rccl/pkg/txz/Makefile index fa587ef186..b7d9aa53c8 100644 --- a/projects/rccl/pkg/txz/Makefile +++ b/projects/rccl/pkg/txz/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/pkg/txz/create_txz.sh.in b/projects/rccl/pkg/txz/create_txz.sh.in index 73922e0929..deae854830 100644 --- a/projects/rccl/pkg/txz/create_txz.sh.in +++ b/projects/rccl/pkg/txz/create_txz.sh.in @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile index fe60b115f9..2d32dca78d 100644 --- a/projects/rccl/src/Makefile +++ b/projects/rccl/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # @@ -9,10 +9,10 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h -LIBSRCFILES := init.cu channel.cu bootstrap.cu transport.cu enqueue.cu \ - misc/group.cu misc/nvmlwrap.cu misc/ibvwrap.cu misc/rings.cu misc/utils.cu misc/checks.cu misc/trees.cu \ - transport/p2p.cu transport/shm.cu transport/net.cu transport/net_socket.cu transport/net_ib.cu \ - collectives/all_reduce.cu collectives/all_gather.cu collectives/broadcast.cu collectives/reduce.cu collectives/reduce_scatter.cu +LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \ + misc/group.cc misc/nvmlwrap.cc misc/ibvwrap.cc misc/rings.cc misc/utils.cc misc/argcheck.cc misc/trees.cc misc/topo.cc \ + transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \ + collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc ##### lib files LIBNAME := libnccl.so @@ -27,7 +27,7 @@ INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) -LIBOBJ := $(LIBSRCFILES:%.cu=$(OBJDIR)/%.o) +LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl @@ -87,11 +87,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h mkdir -p $(INCDIR) cp -f $< $@ -$(OBJDIR)/%.o : %.cu +$(OBJDIR)/%.o : %.cc @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` - $(NVCC) -I. -I$(INCDIR) -Iinclude -c $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< -o $@ - @$(NVCC) -I. -I$(INCDIR) -Iinclude -M $(NVCUFLAGS) --compiler-options "$(CXXFLAGS)" $< > $(@:%.o=%.d.tmp) + $(CXX) -I. -I$(INCDIR) $(CXXFLAGS) -Iinclude -c $< -o $@ + @$(CXX) -I. 
-I$(INCDIR) $(CXXFLAGS) -Iinclude -M $< > $(@:%.o=%.d.tmp) @sed "0,/^.*:/s//$(subst /,\/,$@):/" $(@:%.o=%.d.tmp) > $(@:%.o=%.d) @sed -e 's/.*://' -e 's/\\$$//' < $(@:%.o=%.d.tmp) | fmt -1 | \ sed -e 's/^ *//' -e 's/$$/:/' >> $(@:%.o=%.d) @@ -107,7 +107,7 @@ install : lib cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ -FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cu" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') +FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') # Note that formatting.mk defines a new target so in order to not overwrite the default target, # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well # as the BUILDDIR variable. diff --git a/projects/rccl/src/bootstrap.cu b/projects/rccl/src/bootstrap.cc similarity index 99% rename from projects/rccl/src/bootstrap.cu rename to projects/rccl/src/bootstrap.cc index 6b1d5732df..9df38e4433 100644 --- a/projects/rccl/src/bootstrap.cu +++ b/projects/rccl/src/bootstrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/channel.cu b/projects/rccl/src/channel.cc similarity index 91% rename from projects/rccl/src/channel.cu rename to projects/rccl/src/channel.cc index 937e84e7a8..b053e5b947 100644 --- a/projects/rccl/src/channel.cu +++ b/projects/rccl/src/channel.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. 
All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -47,5 +47,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources)); if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources)); } + + // Free the peer structures. + CUDACHECK(cudaFree(channel->devPeers)); + free(channel->peers); + return ncclSuccess; } diff --git a/projects/rccl/src/collectives/all_gather.cu b/projects/rccl/src/collectives/all_gather.cc similarity index 92% rename from projects/rccl/src/collectives/all_gather.cu rename to projects/rccl/src/collectives/all_gather.cc index db21deef25..348c176001 100644 --- a/projects/rccl/src/collectives/all_gather.cu +++ b/projects/rccl/src/collectives/all_gather.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/collectives/all_reduce.cu b/projects/rccl/src/collectives/all_reduce.cc similarity index 92% rename from projects/rccl/src/collectives/all_reduce.cu rename to projects/rccl/src/collectives/all_reduce.cc index 1492c90949..921f2dec94 100644 --- a/projects/rccl/src/collectives/all_reduce.cu +++ b/projects/rccl/src/collectives/all_reduce.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/collectives/broadcast.cu b/projects/rccl/src/collectives/broadcast.cc similarity index 94% rename from projects/rccl/src/collectives/broadcast.cu rename to projects/rccl/src/collectives/broadcast.cc index 6a3d0a8b84..042301b376 100644 --- a/projects/rccl/src/collectives/broadcast.cu +++ b/projects/rccl/src/collectives/broadcast.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/collectives/collectives.h b/projects/rccl/src/collectives/collectives.h index e6b19cb786..73fe7d5c81 100644 --- a/projects/rccl/src/collectives/collectives.h +++ b/projects/rccl/src/collectives/collectives.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/collectives/device/Makefile b/projects/rccl/src/collectives/device/Makefile index 8e92596f27..0ee587bd9a 100644 --- a/projects/rccl/src/collectives/device/Makefile +++ b/projects/rccl/src/collectives/device/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # diff --git a/projects/rccl/src/collectives/device/all_gather.cu b/projects/rccl/src/collectives/device/all_gather.cu index 530bf1457d..109c3416c3 100644 --- a/projects/rccl/src/collectives/device/all_gather.cu +++ b/projects/rccl/src/collectives/device/all_gather.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/collectives/device/all_gather.h b/projects/rccl/src/collectives/device/all_gather.h index 36809c916c..8e78730aa7 100644 --- a/projects/rccl/src/collectives/device/all_gather.h +++ b/projects/rccl/src/collectives/device/all_gather.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclAllGatherRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -74,7 +74,7 @@ __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/projects/rccl/src/collectives/device/all_reduce.cu b/projects/rccl/src/collectives/device/all_reduce.cu index aaa96b4175..85d007e806 100644 --- a/projects/rccl/src/collectives/device/all_reduce.cu +++ b/projects/rccl/src/collectives/device/all_reduce.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/collectives/device/all_reduce.h b/projects/rccl/src/collectives/device/all_reduce.h index ea89a71255..9b058cc621 100644 --- a/projects/rccl/src/collectives/device/all_reduce.h +++ b/projects/rccl/src/collectives/device/all_reduce.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclAllReduceRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -87,7 +87,7 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; @@ -139,7 +139,7 @@ __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; @@ -214,7 +214,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = args->nThreads; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclTree* tree = &channel->tree; const ssize_t size = args->N; diff --git a/projects/rccl/src/collectives/device/broadcast.cu b/projects/rccl/src/collectives/device/broadcast.cu index b83ee7091d..8c8dbb602e 100644 --- 
a/projects/rccl/src/collectives/device/broadcast.cu +++ b/projects/rccl/src/collectives/device/broadcast.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/collectives/device/broadcast.h b/projects/rccl/src/collectives/device/broadcast.h index fb183122ff..ae8667fdb0 100644 --- a/projects/rccl/src/collectives/device/broadcast.h +++ b/projects/rccl/src/collectives/device/broadcast.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclBroadcastRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -59,7 +59,7 @@ __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/projects/rccl/src/collectives/device/common.h 
b/projects/rccl/src/collectives/device/common.h index e4aecbd3a0..8c336bf945 100644 --- a/projects/rccl/src/collectives/device/common.h +++ b/projects/rccl/src/collectives/device/common.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,7 +8,7 @@ #define NCCL_DEVICE_COMMON_H_ #include "../collectives.h" -#include "core.h" +#include "devcomm.h" #include "nccl.h" // Exit If Abort Barrier across CTA: make sure all threads exit consistently @@ -57,7 +57,7 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclColl firstColl) { \ int bid = blockIdx.x; \ __shared__ struct ncclColl localColl; \ \ - struct ncclComm* comm = firstColl.args.comm; \ + struct ncclDevComm* comm = firstColl.args.comm; \ struct ncclChannel* channel = comm->channels+bid; \ struct ncclColl* c; \ if (bid == 0) { \ diff --git a/projects/rccl/src/collectives/device/common_kernel.h b/projects/rccl/src/collectives/device/common_kernel.h index e1fb096c29..435a5983eb 100644 --- a/projects/rccl/src/collectives/device/common_kernel.h +++ b/projects/rccl/src/collectives/device/common_kernel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,7 +7,7 @@ #ifndef NCCL_COMMON_KERNEL_H_ #define NCCL_COMMON_KERNEL_H_ -#include "core.h" +#include "devcomm.h" #include #include diff --git a/projects/rccl/src/collectives/device/functions.cu b/projects/rccl/src/collectives/device/functions.cu index ea06b6894b..010c4548c0 100644 --- a/projects/rccl/src/collectives/device/functions.cu +++ b/projects/rccl/src/collectives/device/functions.cu @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "collectives.h" #include "common.h" diff --git a/projects/rccl/src/collectives/device/gen_rules.sh b/projects/rccl/src/collectives/device/gen_rules.sh index 3942c8c2b0..4413213e1e 100755 --- a/projects/rccl/src/collectives/device/gen_rules.sh +++ b/projects/rccl/src/collectives/device/gen_rules.sh @@ -1,6 +1,6 @@ #!/bin/bash # -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. 
# # See LICENSE.txt for license information # diff --git a/projects/rccl/src/collectives/device/primitives.h b/projects/rccl/src/collectives/device/primitives.h index c5aaf549b4..7beeaf496b 100644 --- a/projects/rccl/src/collectives/device/primitives.h +++ b/projects/rccl/src/collectives/device/primitives.h @@ -50,7 +50,7 @@ class ncclPrimitives { T* sendDirectBuff[NSEND]; const T* recvBuff[NRECV]; T* sendBuff[NSEND]; - struct ncclComm* comm; + struct ncclDevComm* comm; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*stepSize; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*stepSize; } @@ -239,7 +239,7 @@ class ncclPrimitives { public: __device__ __forceinline__ - ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + ncclPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, const uint64_t opCount) : comm(comm), tid(tid), nthreads(nthreads), stepSize(stepSize), opCount(opCount) { // Make sure step is updated before we read it __syncthreads(); @@ -329,14 +329,14 @@ class ncclLLPrimitives { uint64_t sendConnHead; union ncclLLFifoLine* recvBuff[NRECV]; union ncclLLFifoLine* sendBuff[NSEND]; - struct ncclComm* comm; + struct ncclDevComm* comm; inline __device__ int recvOffset(int i) { return (recvStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } inline __device__ int sendOffset(int i) { return (sendStep[i]%NCCL_STEPS)*NCCL_LL_SLICE_LINES; } inline __device__ union ncclLLFifoLine* recvPtr(int i) { return recvBuff[i]+recvOffset(i); } inline __device__ union ncclLLFifoLine* sendPtr(int i) { return sendBuff[i]+sendOffset(i); } - inline __device__ uint32_t recvFlag(int i) { return recvStep[i]+1; } - inline __device__ uint32_t sendFlag(int i) { return sendStep[i]+1; } + inline __device__ 
uint32_t recvFlag(int i) { return NCCL_LL_FLAG(recvStep[i]+1); } + inline __device__ uint32_t sendFlag(int i) { return NCCL_LL_FLAG(sendStep[i]+1); } // Exit If Abort Barrier : make sure all threads exit consistently // Each thread sets a predicate to true if val == 1 @@ -393,7 +393,10 @@ class ncclLLPrimitives { sendConnHead = *waitPtr; if (checkAbort(sendConn[i]->opCountRem)) break; } - if (fifoPtr) fifoPtr[sendStep[i]%NCCL_STEPS] = nbytes; + if (fifoPtr) { + int size = ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) ? NCCL_LL_SLICE_LINES*sizeof(union ncclLLFifoLine) : nbytes; + fifoPtr[sendStep[i]%NCCL_STEPS] = size; + } } } @@ -402,7 +405,12 @@ class ncclLLPrimitives { if (tid == i) *postPtr = recvStep[i]; } - inline __device__ void postSend(int i) { + inline __device__ void postSend(int i, int offset) { + // LL Cleanup : write all flags in the slice to make sure we don't have + // data corruption when flag loops over. + if ((sendStep[i] & NCCL_LL_CLEAN_MASK) == NCCL_LL_CLEAN_MASK) { + for (int o = offset; o sendConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - /* Reset all flags */ - static_assert((NCCL_LL_BUFF_SIZE % NCCL_LL_MAX_NTHREADS) == 0, "NCCL_LL_BUFF_SIZE must be a multiple of THREADS"); - static_assert(NCCL_LL_BUFF_SIZE/(sizeof(union ncclLLFifoLine)*NCCL_LL_MAX_NTHREADS) > 0, "NCCL_LL_BUFF_SIZE is less than 16 bytes*THREADS"); - for (int s=0; sllLastCleaning = sendStep[i]; - } - } - - __device__ __forceinline__ void llRecvCleaning(int i) { - if (recvStep[i] > recvConn[i]->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - recvStep[i] += NCCL_STEPS; - if (tid == 0) recvConn[i]->llLastCleaning = recvStep[i]; - } - } - public: __device__ __forceinline__ - ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclComm* comm, const uint64_t opCount) + ncclLLPrimitives(const int tid, const int nthreads, int* recvPeers, int* sendPeers, struct ncclChannel* channel, struct ncclDevComm* comm, 
const uint64_t opCount) : comm(comm), tid(tid), nthreads(nthreads), opCount(opCount) { // Make sure step is updated before we read it. barrier(); @@ -577,8 +563,6 @@ class ncclLLPrimitives { } __device__ __forceinline__ ~ncclLLPrimitives() { - for (int i=0; ibid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -55,7 +55,7 @@ __device__ void ncclReduceRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/projects/rccl/src/collectives/device/reduce_scatter.cu b/projects/rccl/src/collectives/device/reduce_scatter.cu index 10857eda54..8b45299757 100644 --- a/projects/rccl/src/collectives/device/reduce_scatter.cu +++ b/projects/rccl/src/collectives/device/reduce_scatter.cu @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/collectives/device/reduce_scatter.h b/projects/rccl/src/collectives/device/reduce_scatter.h index c70c845267..09ba56ee02 100644 --- a/projects/rccl/src/collectives/device/reduce_scatter.h +++ b/projects/rccl/src/collectives/device/reduce_scatter.h @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#include "devcomm.h" #include "primitives.h" #include "collectives.h" @@ -13,7 +13,7 @@ __device__ void ncclReduceScatterRingKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int nthreads = blockDim.x - 1; const int bid = args->bid; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; const ssize_t size = args->N; @@ -69,7 +69,7 @@ __device__ void ncclReduceScatterRingLLKernel(struct CollectiveArgs* args) { const int tid = threadIdx.x; const int bid = args->bid; const int nthreads = args->nThreads; - struct ncclComm* comm = args->comm; + struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; struct ncclRing* ring = &channel->ring; diff --git a/projects/rccl/src/collectives/reduce.cu b/projects/rccl/src/collectives/reduce.cc similarity index 92% rename from projects/rccl/src/collectives/reduce.cu rename to projects/rccl/src/collectives/reduce.cc index 302d4bcfd9..67f2fae969 100644 --- a/projects/rccl/src/collectives/reduce.cu +++ b/projects/rccl/src/collectives/reduce.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/collectives/reduce_scatter.cu b/projects/rccl/src/collectives/reduce_scatter.cc similarity index 92% rename from projects/rccl/src/collectives/reduce_scatter.cu rename to projects/rccl/src/collectives/reduce_scatter.cc index 4ee77ef985..5ad7f5fa13 100644 --- a/projects/rccl/src/collectives/reduce_scatter.cu +++ b/projects/rccl/src/collectives/reduce_scatter.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/enqueue.cu b/projects/rccl/src/enqueue.cc similarity index 97% rename from projects/rccl/src/enqueue.cu rename to projects/rccl/src/enqueue.cc index d283223fa1..b485634569 100644 --- a/projects/rccl/src/enqueue.cu +++ b/projects/rccl/src/enqueue.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -87,7 +87,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *par } ncclResult_t setupLaunch(struct ncclComm* comm, struct cudaLaunchParams* params) { - params->gridDim.x = std::min((int) params->gridDim.x, comm->nChannels); + params->gridDim.x = std::min(params->gridDim.x, comm->nChannels); // Set active = 2 for the last operation for (int r=0; rgridDim.x; r++) { @@ -266,7 +266,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { static void getKernelInfo(struct ncclInfo* info, uint8_t* nChannels, uint16_t* nThreads, int* llMode) { // Compute thresholds and limits that users can override - int perThreadLLThreshold = std::min(info->comm->threadThreshold, (ssize_t)NCCL_LL_CHANNEL_THRESHOLD); + ssize_t perThreadLLThreshold = std::min(info->comm->threadThreshold, NCCL_LL_CHANNEL_THRESHOLD); int maxLLNthreads = std::min(NCCL_LL_MAX_NTHREADS, info->comm->nThreads); // First compute nThreads @@ -365,7 +365,7 @@ static ncclResult_t saveKernel(struct ncclInfo* info) { memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs)); NCCLCHECK(computeColl(info, &coll, &proxyArgs)); - info->comm->myParams->blockDim.x = max(info->comm->myParams->blockDim.x, coll.args.nThreads); + info->comm->myParams->blockDim.x = std::max(info->comm->myParams->blockDim.x, coll.args.nThreads); if (info->comm->userStreamSet == false) { info->comm->userStream = info->stream; info->comm->userStreamSet = true; diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h new file mode 100644 index 0000000000..bcdbd18a03 --- /dev/null +++ b/projects/rccl/src/include/alloc.h @@ -0,0 +1,51 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ALLOC_H_ +#define NCCL_ALLOC_H_ + +#include "nccl.h" +#include "checks.h" +#include + +static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { + CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped)); + memset(*ptr, 0, size); + *devPtr = *ptr; + return ncclSuccess; +} + +static inline ncclResult_t ncclCudaHostFree(void* ptr) { + CUDACHECK(cudaFreeHost(ptr)); + return ncclSuccess; +} + +template +static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { + void* p = malloc(nelem*sizeof(T)); + if (p == NULL) { + WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); + return ncclSystemError; + } + memset(p, 0, nelem*sizeof(T)); + *ptr = (T*)p; + return ncclSuccess; +} + +template +static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) { + CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); + CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T))); + return ncclSuccess; +} + +template +static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { + CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); + return ncclSuccess; +} + +#endif diff --git a/projects/rccl/src/include/argcheck.h b/projects/rccl/src/include/argcheck.h new file mode 100644 index 0000000000..0d6cca7c30 --- /dev/null +++ b/projects/rccl/src/include/argcheck.h @@ -0,0 +1,15 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_ARGCHECK_H_ +#define NCCL_ARGCHECK_H_ + +#include "core.h" + +ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); +ncclResult_t ArgsCheck(struct ncclInfo* info); + +#endif diff --git a/projects/rccl/src/include/bootstrap.h b/projects/rccl/src/include/bootstrap.h index a1aaf50a89..dd7de2ce0e 100644 --- a/projects/rccl/src/include/bootstrap.h +++ b/projects/rccl/src/include/bootstrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/channel.h b/projects/rccl/src/include/channel.h index 76c5e8ad92..c01d942e4f 100644 --- a/projects/rccl/src/include/channel.h +++ b/projects/rccl/src/include/channel.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/checks.h b/projects/rccl/src/include/checks.h index bf7750edba..50737b014e 100644 --- a/projects/rccl/src/include/checks.h +++ b/projects/rccl/src/include/checks.h @@ -1,10 +1,73 @@ /************************************************************************* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ -#include "core.h" +#ifndef NCCL_CHECKS_H_ +#define NCCL_CHECKS_H_ -ncclResult_t PtrCheck(void* ptr, const char* opname, const char* ptrname); -ncclResult_t ArgsCheck(struct ncclInfo* info); +#include "debug.h" + +// Check CUDA calls +#define CUDACHECK(cmd) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ + return ncclUnhandledCudaError; \ + } \ +} while(false) + +#define CUDACHECKGOTO(cmd, res, label) do { \ + cudaError_t e = cmd; \ + if( e != cudaSuccess ) { \ + WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ + res = ncclUnhandledCudaError; \ + goto label; \ + } \ +} while(false) + +#include +// Check system calls +#define SYSCHECK(call, name) do { \ + int retval; \ + SYSCHECKVAL(call, name, retval); \ +} while (false) + +#define SYSCHECKVAL(call, name, retval) do { \ + SYSCHECKSYNC(call, name, retval); \ + if (retval == -1) { \ + WARN("Call to " name " failed : %s", strerror(errno)); \ + return ncclSystemError; \ + } \ +} while (false) + +#define SYSCHECKSYNC(call, name, retval) do { \ + retval = call; \ + if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ + } else { \ + break; \ + } \ +} while(true) + +// Propagate errors up +#define NCCLCHECK(call) do { \ + ncclResult_t res = call; \ + if (res != ncclSuccess) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + return res; \ + } \ +} while (0); + +#define NCCLCHECKGOTO(call, res, label) do { \ + res = call; \ + if (res != ncclSuccess) { \ + /* Print the back trace*/ \ + INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ + goto label; \ + } \ +} while (0); + +#endif diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h new file mode 100644 
index 0000000000..132eb39c0d --- /dev/null +++ b/projects/rccl/src/include/comm.h @@ -0,0 +1,127 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_COMM_H_ +#define NCCL_COMM_H_ + +#if CUDART_VERSION < 9000 +struct cudaLaunchParams { + void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; +}; +#endif + +#define MAXCHANNELS 16 +#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ + +#define CACHE_LINE_SIZE 128 +#define MEM_ALIGN 4096 +#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently used */ + +struct ncclSendMem { + union { + struct { + uint64_t head; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + void* ptrExchange; + char pad2[CACHE_LINE_SIZE-sizeof(void*)]; + uint64_t opCount; + }; + char pad3[MEM_ALIGN]; + }; +}; + +struct ncclRecvMem { + union { + struct { + uint64_t tail; + char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; + uint64_t opCount; + char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; + int sizesFifo[NCCL_STEPS]; + }; + char pad4[MEM_ALIGN]; + }; + ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; + char buff[1]; // Actually larger than that +}; + +struct ncclComm { + struct ncclChannel channels[MAXCHANNELS]; + + struct ncclPeerInfo* peerInfo; + + void* bootstrap; + + int rank; // my rank in the communicator + int nRanks; // number of GPUs in communicator + int cudaDev; // my cuda device index + int nvmlDev; // my NVML device number + + enum { GROUP, PARALLEL } launchMode; + cudaStream_t userStream; + bool userStreamSet; + cudaEvent_t doneEvent; + bool checkPointers; + + // Counter to make sure collectives match (needed for bcast/reduce + // where syncs are not symmetric). 
+ uint64_t opCount; + + // Channels for collectives + int nChannels; + int nThreads; + + // Low-latency algorithm threshold + ssize_t llThreshold; + ssize_t threadThreshold; + + // Tree algorithm threshold + ssize_t treeThreshold; + + // An internal CUDA stream for NCCL kernel CGMD launches + int groupCudaStream; + cudaStream_t groupStream; + + // Whether there has been a fatal error in this communicator. + ncclResult_t fatalError; + + // Error reported by GPU + volatile ncclDevError_t* fatalDevError; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + + // Device side of the communicator + struct ncclDevComm *devComm; + // Host copy of the devComm (to free CUDA allocs) + struct ncclDevComm hostDevComm; + + // Intra-process sync + int intraRank; + int intraRanks; + int* intraBarrier; + int intraPhase; + + // Storage for deferred intra-process launch + struct cudaLaunchParams * intraParams; + struct cudaLaunchParams *myParams; + int* intraCudaDevs; + int* intraCGMode; // Whether we can use CUDA9 CGMD or not + int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not + struct ncclColl args; + void* argsptr; + + // Global proxy thread + pthread_t proxyThread; + struct ncclProxyState proxyState; +}; + +#endif diff --git a/projects/rccl/src/include/core.h b/projects/rccl/src/include/core.h index d57d27107e..8a08b914b0 100644 --- a/projects/rccl/src/include/core.h +++ b/projects/rccl/src/include/core.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -7,385 +7,20 @@ #ifndef NCCL_CORE_H_ #define NCCL_CORE_H_ -#define NCCL_MAX_OPS 2048 -#define NCCL_STEPS 8 - +#include +#include #include "nccl.h" -#include "transport.h" #include "debug.h" +#include "checks.h" +#include "alloc.h" +#include "transport.h" +#include "devcomm.h" +#include "comm.h" +#include "info.h" +#include "argcheck.h" #include -#include // std::min/std::max #include #include -#include - -#if CUDART_VERSION < 9000 -struct cudaLaunchParams { - void *func; - dim3 gridDim; - dim3 blockDim; - void **args; - size_t sharedMem; - cudaStream_t stream; -}; -#endif - -#define MAXCHANNELS 16 -#define MAXTHREADS 256 -#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */ - -// Channels / LL tuning -#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings -#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL -#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs -#define NCCL_LL_MAX_NTHREADS MAXTHREADS -#define NCCL_LL_MIN_NTHREADS 64 - -#define DIVUP(x, y) \ - (((x)+(y)-1)/(y)) -#define ROUNDUP(x, y) \ - (DIVUP((x), (y))*(y)) - -#define ALIGN_SIZE(size, align) \ - size = ((size + (align) - 1) / (align)) * (align); - -union ncclLLFifoLine { - /* Flags have to be *after* data, because otherwise, an incomplete receive - from the network may receive the flag but not the data. - Note this is assuming that either we receive contiguous chunks of data - (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). 
*/ - struct { - uint32_t data1; - uint32_t flag1; - uint32_t data2; - uint32_t flag2; - }; - uint64_t v[2]; - int4 i4; -}; - -typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; - -typedef enum { - ncclPatternRing, - ncclPatternRingTwice, - ncclPatternPipelineFrom, - ncclPatternPipelineTo, - ncclPatternTreeUp, - ncclPatternTreeDown, - ncclPatternTreeUpDown -} ncclPattern_t; - -typedef enum { - ncclDevSuccess, - ncclDevAssertedMismatch, - ncclDevSuspectedMismatch -} ncclDevError_t; - -// Used to pass NCCL call information between functions -struct ncclInfo { - ncclColl_t coll; - const char* opName; - // NCCL Coll Args - const void* sendbuff; - void* recvbuff; - size_t count; - ncclDataType_t datatype; - ncclRedOp_t op; - int root; - ncclComm_t comm; - cudaStream_t stream; - // Algorithm details - int chunkSteps; - int sliceSteps; - // Computed later - ncclPattern_t pattern; - size_t nBytes; - int nstepsPerLoop; - int nchunksPerLoop; -}; - -struct ncclConnInfo { - // Regular comm mechanism - char *buff; // Local for recv, remote for send - uint64_t *tail; // Local for recv, remote for send - uint64_t *head; // Local for send, remote for recv - uint64_t *opCountLoc; // opCount of local rank - uint64_t *opCountRem; // opCount of remote rank - - int direct; // Direct communication - void **ptrExchange; // Pointer exchange for direct communication - - int *fifo; // Size fifo for proxy - - uint64_t step; // Keep where we are - - // Low latency mechanism - union ncclLLFifoLine *llBuff; // Local for recv, remote for send - uint64_t llLastCleaning; -}; - -struct ncclConnector { - int connected; - struct ncclProxyArgs *proxyAppend; - struct ncclTransportComm* transportComm; - void* transportResources; // Host-side resources - struct ncclConnInfo conn; - struct ncclComm *comm; -}; - -#define CACHE_LINE_SIZE 128 -#define MEM_ALIGN 4096 -#define CUDA_IPC_MIN 2097152UL /* 2MiB - not currently 
used */ - -#define NUM_LINES_PER_THREAD 8 -#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) -#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) -#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) -#define NCCL_LL_CLEAN_FREQ 0x10000000 - -struct ncclSendMem { - union { - struct { - uint64_t head; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - void* ptrExchange; - char pad2[CACHE_LINE_SIZE-sizeof(void*)]; - uint64_t opCount; - }; - char pad3[MEM_ALIGN]; - }; -}; - -struct ncclRecvMem { - union { - struct { - uint64_t tail; - char pad1[CACHE_LINE_SIZE-sizeof(uint64_t)]; - uint64_t opCount; - char pad2[CACHE_LINE_SIZE-sizeof(uint64_t)]; - int sizesFifo[NCCL_STEPS]; - }; - char pad4[MEM_ALIGN]; - }; - ncclLLFifoLine llBuff[NCCL_LL_BUFF_LINES]; - char buff[1]; // Actually larger than that -}; - -struct ncclRing { - // Shortcuts for userRanks[1] and userRanks[n-1] - int prev; - int next; - - // Maps an internal nccl index to user-specified rank order. This is necessary - // since we need to know how the user expects data to be ordered across - // devices. Ordered from current device. 
- int* userRanks; - int* devUserRanks; -}; - -#define NCCL_MAX_TREE_ARITY 3 -struct ncclTree { - int depth; - int up; - int down[NCCL_MAX_TREE_ARITY]; -}; - -struct ncclPeer { - struct ncclConnector send; - struct ncclConnector recv; -}; - -struct ncclChannel { - union { - struct { - struct ncclRing ring; - struct ncclTree tree; - - int id; - int nthreads; - int buffSize; - - // Communication structures - struct ncclPeer* peers; - struct ncclPeer* devPeers; - - // Operation list for aggregation - struct ncclColl* collectives; - struct ncclColl* devCollectives; - int collStart; - int collCount; - int collFifoHead; // Only used by GPU - int collFifoTail; // Only used by CPU - }; - int data[0x80]; - }; -}; -static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); - -/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ -/* to make sure reads to host from the CUDA kernel are aligned. */ -/* Make sure to adjust padding at the end of ncclColl. 
*/ -struct CollectiveArgs { - struct ncclComm* comm; - uint64_t opCount; - - // local and remote input, output, and buffer - const void * ThisInput; - void * ThisOutput; - - // general parameters - size_t N; - uint32_t root; - uint8_t bid; - uint8_t nChannels; - uint16_t nThreads; - - int lastChunkSize; -}; -struct ncclColl { - union { - struct { - struct CollectiveArgs args; - uint16_t funcIndex; - uint16_t nextIndex; - uint8_t active; - }; - int data[0x10]; - }; -}; -static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); - -struct ncclComm { - struct ncclChannel channels[MAXCHANNELS]; - - struct ncclPeerInfo* peerInfo; - - void* bootstrap; - - int rank; // my rank in the communicator - int nRanks; // number of GPUs in communicator - int cudaDev; // my cuda device index - int nvmlDev; // my NVML device number - - enum { GROUP, PARALLEL } launchMode; - cudaStream_t userStream; - bool userStreamSet; - cudaEvent_t doneEvent; - bool checkPointers; - - // Counter to make sure collectives match (needed for bcast/reduce - // where syncs are not symmetric). - uint64_t opCount; - - // Channels for collectives - int nChannels; - int nThreads; - - // Low-latency algorithm threshold - ssize_t llThreshold; - ssize_t threadThreshold; - - // Tree algorithm threshold - ssize_t treeThreshold; - - // An internal CUDA stream for NCCL kernel CGMD launches - int groupCudaStream; - cudaStream_t groupStream; - - // Whether there has been a fatal error in this communicator. 
- ncclResult_t fatalError; - - // Error reported by GPU - volatile ncclDevError_t* fatalDevError; - - // On host: this pointer has been obtained from cudaHostAlloc(cudaHostAllocMapped) - // On device: this pointer has been obtained from cudaHostGetDevicePointer() - volatile uint32_t *abortFlag; - - // Device copy of the communicator - struct ncclComm *devComm; - - // Intra-process sync - int intraRank; - int intraRanks; - int* intraBarrier; - int intraPhase; - - // Storage for deferred intra-process launch - struct cudaLaunchParams * intraParams; - struct cudaLaunchParams *myParams; - int* intraCudaDevs; - int* intraCGMode; // Whether we can use CUDA9 CGMD or not - int* intraCC; // Only to check all have the same ComputeCap and disable CGMode if not - struct ncclColl args; - void* argsptr; - - // Global proxy thread - pthread_t proxyThread; - struct ncclProxyState proxyState; -}; - -// Check CUDA calls -#define CUDACHECK(cmd) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ - return ncclUnhandledCudaError; \ - } \ -} while(false) - -#define CUDACHECKGOTO(cmd, res, label) do { \ - cudaError_t e = cmd; \ - if( e != cudaSuccess ) { \ - WARN("Cuda failure '%s'", cudaGetErrorString(e)); \ - res = ncclUnhandledCudaError; \ - goto label; \ - } \ -} while(false) - -#include -// Check system calls -#define SYSCHECK(call, name) do { \ - int retval; \ - SYSCHECKVAL(call, name, retval); \ -} while (false) - -#define SYSCHECKVAL(call, name, retval) do { \ - SYSCHECKSYNC(call, name, retval); \ - if (retval == -1) { \ - WARN("Call to " name " failed : %s", strerror(errno)); \ - return ncclSystemError; \ - } \ -} while (false) - -#define SYSCHECKSYNC(call, name, retval) do { \ - retval = call; \ - if (retval == -1 && (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ - INFO(NCCL_ALL,"Call to " name " returned %s, retrying", strerror(errno)); \ - } else { \ - break; \ - } \ -} while(true) - -// 
Propagate errors up -#define NCCLCHECK(call) do { \ - ncclResult_t res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - return res; \ - } \ -} while (0); - -#define NCCLCHECKGOTO(call, res, label) do { \ - res = call; \ - if (res != ncclSuccess) { \ - /* Print the back trace*/ \ - INFO(NCCL_ALL,"%s:%d -> %d", __FILE__, __LINE__, res); \ - goto label; \ - } \ -} while (0); #ifdef PROFAPI #define NCCL_API(ret, func, args...) \ @@ -427,42 +62,4 @@ static __inline__ int ncclTypeSize(ncclDataType_t type) { } } -#include -static inline ncclResult_t ncclCudaHostAlloc(void** ptr, void** devPtr, size_t size) { - CUDACHECK(cudaHostAlloc(ptr, size, cudaHostAllocMapped)); - memset(*ptr, 0, size); - *devPtr = *ptr; - return ncclSuccess; -} - -static inline ncclResult_t ncclCudaHostFree(void* ptr) { - CUDACHECK(cudaFreeHost(ptr)); - return ncclSuccess; -} - -template -static ncclResult_t ncclCalloc(T** ptr, size_t nelem) { - void* p = malloc(nelem*sizeof(T)); - if (p == NULL) { - WARN("Failed to malloc %ld bytes", nelem*sizeof(T)); - return ncclSystemError; - } - memset(p, 0, nelem*sizeof(T)); - *ptr = (T*)p; - return ncclSuccess; -} - -template -static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem) { - CUDACHECK(cudaMalloc(ptr, nelem*sizeof(T))); - CUDACHECK(cudaMemset(*ptr, 0, nelem*sizeof(T))); - return ncclSuccess; -} - -template -static ncclResult_t ncclCudaMemcpy(T* dst, T* src, size_t nelem) { - CUDACHECK(cudaMemcpy(dst, src, nelem*sizeof(T), cudaMemcpyDefault)); - return ncclSuccess; -} - #endif // end include guard diff --git a/projects/rccl/src/include/cpuset.h b/projects/rccl/src/include/cpuset.h index f70d1d8090..98b93de87d 100644 --- a/projects/rccl/src/include/cpuset.h +++ b/projects/rccl/src/include/cpuset.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2018-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h index 3acdf8c28a..c3e8fa04bd 100644 --- a/projects/rccl/src/include/debug.h +++ b/projects/rccl/src/include/debug.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -24,7 +24,7 @@ extern int ncclDebugLevel; extern uint64_t ncclDebugMask; extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; -extern ncclResult_t getHostName(char* hostname, int maxlen); +extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); extern ncclResult_t getNvmlDevice(int cudaDev, int *nvmlDev); extern void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); @@ -108,7 +108,7 @@ static inline void initDebug() { break; case 'h': // %h = hostname char hostname[1024]; - getHostName(hostname, 1024); + getHostName(hostname, 1024, '.'); dfn += snprintf(dfn, PATH_MAX, "%s", hostname); break; case 'p': // %p = pid diff --git a/projects/rccl/src/include/devcomm.h b/projects/rccl/src/include/devcomm.h new file mode 100644 index 0000000000..0a2ef9617c --- /dev/null +++ b/projects/rccl/src/include/devcomm.h @@ -0,0 +1,194 @@ +/************************************************************************* + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_DEVICE_H_ +#define NCCL_DEVICE_H_ + +#include "nccl.h" +#include + +#define NCCL_MAX_OPS 2048 +#define NCCL_STEPS 8 + +typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollCount } ncclColl_t; + +#define DIVUP(x, y) \ + (((x)+(y)-1)/(y)) +#define ROUNDUP(x, y) \ + (DIVUP((x), (y))*(y)) + +#define ALIGN_SIZE(size, align) \ + size = ((size + (align) - 1) / (align)) * (align); + +union ncclLLFifoLine { + /* Flags have to be *after* data, because otherwise, an incomplete receive + from the network may receive the flag but not the data. + Note this is assuming that either we receive contiguous chunks of data + (sockets) or data is written with an atomicity of 8 bytes (IB/RDMA). */ + struct { + uint32_t data1; + uint32_t flag1; + uint32_t data2; + uint32_t flag2; + }; + uint64_t v[2]; + int4 i4; +}; + +#define MAXTHREADS 256 +#define NCCL_LL_MAX_NTHREADS MAXTHREADS +#define NUM_LINES_PER_THREAD 8 +#define NCCL_LL_SLICE_LINES (NUM_LINES_PER_THREAD*NCCL_LL_MAX_NTHREADS) +#define NCCL_LL_BUFF_LINES (NCCL_LL_SLICE_LINES*NCCL_STEPS) +#define NCCL_LL_BUFF_SIZE (NCCL_LL_BUFF_LINES*sizeof(union ncclLLFifoLine)) +#ifdef DEBUG_LL +#define NCCL_LL_CLEAN_MASK 0x00000ff8 +#define NCCL_LL_FLAG_MAX 0x00001000 +#define NCCL_LL_FLAG(a) ((uint32_t)(a % NCCL_LL_FLAG_MAX)) +#else +#define NCCL_LL_CLEAN_MASK 0x7ffffff8 +#define NCCL_LL_FLAG(a) ((uint32_t)(a)) +#endif +// Make sure the clean mask will last for at least NCCL_NSTEPS +static_assert(NCCL_LL_CLEAN_MASK % NCCL_STEPS == 0, "Invalid NCCL_LL_CLEAN_MASK value"); + +struct ncclConnInfo { + // Regular comm mechanism + char *buff; // Local for recv, remote for send + uint64_t *tail; // Local for recv, remote for send + uint64_t *head; // Local for send, remote for recv + uint64_t *opCountLoc; // opCount of local rank + uint64_t 
*opCountRem; // opCount of remote rank + + int direct; // Direct communication + void **ptrExchange; // Pointer exchange for direct communication + + int *fifo; // Size fifo for proxy + + uint64_t step; // Keep where we are + + // Low latency mechanism + union ncclLLFifoLine *llBuff; // Local for recv, remote for send + uint64_t llLastCleaning; +}; + +struct ncclConnector { + int connected; + struct ncclProxyArgs *proxyAppend; + struct ncclTransportComm* transportComm; + void* transportResources; // Host-side resources + struct ncclConnInfo conn; + struct ncclComm *comm; +}; + +struct ncclRing { + // Shortcuts for userRanks[1] and userRanks[n-1] + int prev; + int next; + + // Maps an internal nccl index to user-specified rank order. This is necessary + // since we need to know how the user expects data to be ordered across + // devices. Ordered from current device. + int* userRanks; + int* devUserRanks; +}; + + +#define NCCL_MAX_TREE_ARITY 3 +struct ncclTree { + int depth; + int up; + int down[NCCL_MAX_TREE_ARITY]; +}; + +struct ncclPeer { + struct ncclConnector send; + struct ncclConnector recv; +}; + +struct ncclDevComm; + +/* CollectiveArgs + ncclColl are to be a power of two, currently 64 bytes, */ +/* to make sure reads to host from the CUDA kernel are aligned. */ +/* Make sure to adjust padding at the end of ncclColl. 
*/ +struct CollectiveArgs { + struct ncclDevComm* comm; + uint64_t opCount; + + // local and remote input, output, and buffer + const void * ThisInput; + void * ThisOutput; + + // general parameters + size_t N; + uint32_t root; + uint8_t bid; + uint8_t nChannels; + uint16_t nThreads; + + int lastChunkSize; +}; +struct ncclColl { + union { + struct { + struct CollectiveArgs args; + uint16_t funcIndex; + uint16_t nextIndex; + uint8_t active; + }; + int data[0x10]; + }; +}; +static_assert(sizeof(struct ncclColl) == (0x10*sizeof(int)), "ncclColl must have a pow2 size"); + +struct ncclChannel { + union { + struct { + struct ncclRing ring; + struct ncclTree tree; + + int id; + int nthreads; + int buffSize; + + // Communication structures + struct ncclPeer* peers; + struct ncclPeer* devPeers; + + // Operation list for aggregation + struct ncclColl* collectives; + struct ncclColl* devCollectives; + int collStart; + int collCount; + int collFifoHead; // Only used by GPU + int collFifoTail; // Only used by CPU + }; + int data[0x80]; + }; +}; +static_assert(sizeof(struct ncclChannel) == 0x80*sizeof(int), "ncclChannel must have a pow2 size"); + +#define MAXCHANNELS 16 + +typedef enum { + ncclDevSuccess, + ncclDevAssertedMismatch, + ncclDevSuspectedMismatch +} ncclDevError_t; + +struct ncclDevComm { + int rank; + int nRanks; + + // Flag to ask NCCL kernels to abort + volatile uint32_t *abortFlag; + volatile ncclDevError_t *fatalDevError; + + // Channels, device side + struct ncclChannel* channels; +}; + +#endif diff --git a/projects/rccl/src/include/enqueue.h b/projects/rccl/src/include/enqueue.h index 4db7094c4e..3b7a18c821 100644 --- a/projects/rccl/src/include/enqueue.h +++ b/projects/rccl/src/include/enqueue.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -10,6 +10,12 @@ #include "core.h" #include "group.h" +// Channels / LL tuning +#define NCCL_LL_CHANNEL_THRESHOLD 8 // Per thread size before we start increasing nrings +#define NCCL_THREAD_THRESHOLD 64 // Per thread size before we switch to non-LL +#define NCCL_THREAD_THRESHOLD_PREVOLTA 32 // Per thread size before we switch to non-LL for pre-Volta archs +#define NCCL_LL_MIN_NTHREADS 64 + ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(ncclComm_t comm, int* isLast); ncclResult_t ncclCpuBarrierLast(ncclComm_t comm); diff --git a/projects/rccl/src/include/ibvwrap.h b/projects/rccl/src/include/ibvwrap.h index 4f3e8311dc..0943f9962c 100644 --- a/projects/rccl/src/include/ibvwrap.h +++ b/projects/rccl/src/include/ibvwrap.h @@ -4,7 +4,7 @@ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/info.h b/projects/rccl/src/include/info.h new file mode 100644 index 0000000000..401298a931 --- /dev/null +++ b/projects/rccl/src/include/info.h @@ -0,0 +1,45 @@ +/************************************************************************* + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
+ * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_INFO_H_ +#define NCCL_INFO_H_ + +#include "nccl.h" + +typedef enum { + ncclPatternRing, + ncclPatternRingTwice, + ncclPatternPipelineFrom, + ncclPatternPipelineTo, + ncclPatternTreeUp, + ncclPatternTreeDown, + ncclPatternTreeUpDown +} ncclPattern_t; + +// Used to pass NCCL call information between functions +struct ncclInfo { + ncclColl_t coll; + const char* opName; + // NCCL Coll Args + const void* sendbuff; + void* recvbuff; + size_t count; + ncclDataType_t datatype; + ncclRedOp_t op; + int root; + ncclComm_t comm; + cudaStream_t stream; + // Algorithm details + int chunkSteps; + int sliceSteps; + // Computed later + ncclPattern_t pattern; + size_t nBytes; + int nstepsPerLoop; + int nchunksPerLoop; +}; + +#endif diff --git a/projects/rccl/src/include/nccl_net.h b/projects/rccl/src/include/nccl_net.h index 89edbf5024..797c759e69 100644 --- a/projects/rccl/src/include/nccl_net.h +++ b/projects/rccl/src/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -80,12 +80,13 @@ typedef struct { // Finalize connection establishment after remote peer has called connectHandle ncclResult_t (*accept)(void* listenComm, void** recvComm); // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // Asynchronous send to a peer. 
// May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); - // Asynchronous recv from a peer. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + // Asynchronous recv from a peer. // May return request == NULL if the call cannot be performed (or would block) ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is diff --git a/projects/rccl/src/include/net.h b/projects/rccl/src/include/net.h index e75e6bbfe2..da3eceaa36 100644 --- a/projects/rccl/src/include/net.h +++ b/projects/rccl/src/include/net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/nvlink.h b/projects/rccl/src/include/nvlink.h index 1baf9e536f..8a0f99e729 100644 --- a/projects/rccl/src/include/nvlink.h +++ b/projects/rccl/src/include/nvlink.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -18,6 +18,7 @@ enum ncclNvLinkDeviceType { ncclNvLinkDeviceGpu, ncclNvLinkDeviceSwitch, + ncclNvLinkDeviceBridge, // IBM/Power NVLink bridge (Device 04ea) }; static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* type) { @@ -25,7 +26,13 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* memcpy(classPath+sizeof("/sys/bus/pci/devices/")-1, busId, sizeof("0000:00:00.0")-1); char* rPath = realpath(classPath, NULL); int fd; - SYSCHECKVAL(open(rPath, O_RDONLY), "open", fd); + if ((fd = open(rPath, O_RDONLY)) == -1) { + // Could not find device. It might be because we're in a VM and + // we don't see the whole machine. This is handled silently so + // we don't want to print an INFO error. + TRACE(NCCL_INIT, "Open of %s failed : %s\n", rPath, strerror(errno)); + return ncclSystemError; + } free(rPath); char pciClass[9]; strncpy(pciClass, "0x000000", 9); @@ -35,6 +42,9 @@ static ncclResult_t ncclDeviceType(const char* busId, enum ncclNvLinkDeviceType* if (strcmp(pciClass, "0x068000") == 0) { // PCI device is of type "Bridge / Other Bridge Device" (NVswitch) *type = ncclNvLinkDeviceSwitch; + } else if (strcmp(pciClass, "0x068001") == 0) { + // PCI device is of type "Bridge: IBM Device 04ea" + *type = ncclNvLinkDeviceBridge; } else if (strcmp(pciClass, "0x030200") == 0 // "3D Controller" (Tesla) || strcmp(pciClass, "0x030000") == 0) { // "VGA Controller" (GeForce) *type = ncclNvLinkDeviceGpu; diff --git a/projects/rccl/src/include/nvmlwrap.h b/projects/rccl/src/include/nvmlwrap.h index 0b6198abdb..f658279807 100644 --- a/projects/rccl/src/include/nvmlwrap.h +++ b/projects/rccl/src/include/nvmlwrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. 
+ * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/param.h b/projects/rccl/src/include/param.h index dd5f697e34..54317571e7 100644 --- a/projects/rccl/src/include/param.h +++ b/projects/rccl/src/include/param.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -36,7 +36,6 @@ static void setEnvFile(const char* fileName) { s++; strncpy(envValue, line+s, 1024); setenv(envVar, envValue, 0); - char *str = getenv(envVar); } if (line) free(line); fclose(file); diff --git a/projects/rccl/src/include/rings.h b/projects/rccl/src/include/rings.h index 43fc595a69..9701f845d7 100644 --- a/projects/rccl/src/include/rings.h +++ b/projects/rccl/src/include/rings.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/shm.h b/projects/rccl/src/include/shm.h index 4fb49cbb8f..9cd9d053e3 100644 --- a/projects/rccl/src/include/shm.h +++ b/projects/rccl/src/include/shm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h index fb5cfc04c5..739c0c4968 100644 --- a/projects/rccl/src/include/socket.h +++ b/projects/rccl/src/include/socket.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -18,8 +18,9 @@ #define MAX_IFS 16 #define MAX_IF_NAME_SIZE 16 -#define SLEEP_INT 1000 // sleep interval in usec -#define RETRY_TIMES 2e4 // retry times before reporting a timeout (20 sec) +#define SLEEP_INT 1000 // connection retry sleep interval in usec +#define RETRY_REFUSED_TIMES 2e4 // connection refused retry times before reporting a timeout (20 sec) +#define RETRY_TIMEDOUT_TIMES 3 // connection timed out retry times (each one can take 20s) /* Common socket address storage structure for IPv4/IPv6 */ union socketAddress { @@ -370,14 +371,18 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) { #endif int ret; - int retries = 0; + int timedout_retries = 0; + int refused_retries = 0; retry: SYSCHECKSYNC(connect(*fd, &remoteAddr->sa, salen), "connect", ret); if (ret == 0) return ncclSuccess; - if (errno == ECONNREFUSED && ++retries < RETRY_TIMES) { - INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); \ - usleep(SLEEP_INT); - goto retry; + if ((errno == ECONNREFUSED || errno == ETIMEDOUT)) { + if ((errno == ECONNREFUSED && ++refused_retries < RETRY_REFUSED_TIMES) || + (errno == ETIMEDOUT && ++timedout_retries < RETRY_TIMEDOUT_TIMES)) { + INFO(NCCL_ALL,"Call to connect returned %s, retrying", strerror(errno)); + usleep(SLEEP_INT); + goto retry; + } } 
WARN("Connect to %s failed : %s", socketToString(&remoteAddr->sa, line), strerror(errno)); return ncclSystemError; diff --git a/projects/rccl/src/include/topo.h b/projects/rccl/src/include/topo.h index e824a81023..69cd100743 100644 --- a/projects/rccl/src/include/topo.h +++ b/projects/rccl/src/include/topo.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,49 +11,35 @@ #include #include #include +#include -#define BUSID_SIZE (sizeof("0000:00:00.0")) -#define BUSID_REDUCED_SIZE (sizeof("0000:00")) +ncclResult_t getCudaPath(int cudaDev, char** path); -static ncclResult_t getCudaPath(int cudaDev, char** path) { - char busId[BUSID_SIZE]; - CUDACHECK(cudaDeviceGetPCIBusId(busId, BUSID_SIZE, cudaDev)); - for (int i=0; i #include "nvmlwrap.h" @@ -37,7 +38,7 @@ struct ncclConnect { char data[CONNECT_SIZE]; }; -enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress, ncclProxyOpDone }; +enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress }; struct ncclProxyArgs; typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); @@ -117,8 +118,4 @@ inline void transportProxyWait(const FUNC& func) { } } -inline void transportProxyIdle(int idle) { - sched_yield(); -} - #endif diff --git a/projects/rccl/src/include/trees.h b/projects/rccl/src/include/trees.h index 1a151d1388..7eadd8556e 100644 --- a/projects/rccl/src/include/trees.h +++ b/projects/rccl/src/include/trees.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/include/utils.h b/projects/rccl/src/include/utils.h index 5a6a588c43..29b72ad186 100644 --- a/projects/rccl/src/include/utils.h +++ b/projects/rccl/src/include/utils.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,7 +10,7 @@ #include "nccl.h" #include -ncclResult_t getHostName(char* hostname, int maxlen); +ncclResult_t getHostName(char* hostname, int maxlen, const char delim); uint64_t getHostHash(); uint64_t getPidHash(); diff --git a/projects/rccl/src/init.cu b/projects/rccl/src/init.cc similarity index 93% rename from projects/rccl/src/init.cu rename to projects/rccl/src/init.cc index 75822e60bd..80af287012 100644 --- a/projects/rccl/src/init.cu +++ b/projects/rccl/src/init.cc @@ -47,7 +47,7 @@ FILE *ncclDebugFile = stdout; std::chrono::high_resolution_clock::time_point ncclEpoch; #endif -#if CUDART_VERSION >= 9200 +#if CUDART_VERSION >= 9020 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream #else #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream @@ -182,6 +182,11 @@ ncclResult_t ncclGetUniqueId(ncclUniqueId* out) { return bootstrapGetUniqueId(out); } +// Prevent compiler from optimizing out these operations +void __attribute__((optimize("O0"))) commPoison(ncclComm_t comm) { + comm->rank = comm->cudaDev = comm->nvmlDev = comm->nRanks = -1; +} + static ncclResult_t commFree(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; @@ -191,6 +196,7 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm->bootstrap) 
NCCLCHECK(bootstrapClose(comm->bootstrap)); + CUDACHECK(cudaFree(comm->hostDevComm.channels)); CUDACHECK(cudaFree(comm->devComm)); for (int channel=0; channelnChannels; channel++) @@ -216,6 +222,9 @@ static ncclResult_t commFree(ncclComm_t comm) { CUDACHECK(cudaFreeHost((void *)comm->abortFlag)); CUDACHECK(cudaFreeHost((void *)comm->fatalDevError)); + // Poison comm to try and catch a double free + commPoison(comm); + free(comm); return ncclSuccess; } @@ -238,17 +247,17 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { struct ncclComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); - comm->rank = rank; - comm->nRanks = ndev; + comm->rank = comm->hostDevComm.rank =rank; + comm->nRanks = comm->hostDevComm.nRanks = ndev; cudaGetDevice(&comm->cudaDev); getNvmlDevice(comm->cudaDev, &comm->nvmlDev); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); + TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d nvmlDev %d", comm, rank, ndev, comm->cudaDev, comm->nvmlDev); comm->doneEvent = doneEvent; comm->llThreshold = ncclParamLlThreshold(); comm->treeThreshold = ncclParamTreeThreshold(); comm->checkPointers = ncclParamCheckPointers() == 1 ? 
true : false; -#if CUDART_VERSION >= 9200 +#if CUDART_VERSION >= 9020 comm->groupCudaStream = ncclParamGroupCudaStream(); #else // Don't allow the user to overload the default setting in older CUDA builds @@ -256,10 +265,10 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { #endif comm->fatalError = ncclSuccess; - CUDACHECK(cudaHostAlloc((void**) &comm->fatalDevError, sizeof(ncclDevError_t), cudaHostAllocMapped)); + NCCLCHECK(ncclCudaHostAlloc((void**) &comm->fatalDevError, (void**) &comm->hostDevComm.fatalDevError, sizeof(ncclDevError_t))); *comm->fatalDevError = ncclDevSuccess; - CUDACHECK(cudaHostAlloc((void**) &comm->abortFlag, sizeof(uint32_t), cudaHostAllocMapped)); + NCCLCHECK(ncclCudaHostAlloc((void**) &comm->abortFlag, (void**) &comm->hostDevComm.abortFlag, sizeof(uint32_t))); *comm->abortFlag = 0; comm->argsptr = &comm->args; @@ -269,23 +278,19 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { } static ncclResult_t devCommSetup(ncclComm_t comm) { - // Fully duplicate the comm on the device - NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1)); - // Copy the comm on the device - NCCLCHECK(ncclCudaMemcpy(comm->devComm, comm, 1)); - // Copy userRanks + // Duplicate the channels on the device + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->nChannels)); + NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->nChannels)); + + // Copy userRanks and peers for (int r=0; rnChannels; r++) { NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); NCCLCHECK(ncclCudaMemcpy(comm->channels[r].devPeers, comm->channels[r].peers, comm->nRanks)); } - // Copy the device-accessible pointer to comm->abortFlag - void *devAbortFlag; - CUDACHECK(cudaHostGetDevicePointer(&devAbortFlag, (uint32_t *)comm->abortFlag, 0)); - CUDACHECK(cudaMemcpy(&comm->devComm->abortFlag, &devAbortFlag, sizeof(int *), cudaMemcpyHostToDevice)); - // Copy the 
device-accessible pointer to comm->fatalDevError - void *devFatalError; - CUDACHECK(cudaHostGetDevicePointer(&devFatalError, (ncclDevError_t *)comm->fatalDevError, 0)); - CUDACHECK(cudaMemcpy(&comm->devComm->fatalDevError, &devFatalError, sizeof(ncclDevError_t *), cudaMemcpyHostToDevice)); + + // Duplicate the dev comm on the device + NCCLCHECK(ncclCudaCalloc(&comm->devComm, 1)); + NCCLCHECK(ncclCudaMemcpy(comm->devComm, &comm->hostDevComm, 1)); return ncclSuccess; } @@ -423,7 +428,8 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, } } - int ranks[nMasters]; + int* ranks; + NCCLCHECK(ncclCalloc(&ranks, nMasters)); int i = 0, masterIndex = -1; // Build binary tree for (int r=0; rup = prev; if (treeMasters[next] == 0) tree->down[0] = next; } + free(ranks); } TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); @@ -638,6 +645,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, if (peer == -1) continue; conn = &channel->peers[peer].recv; if (conn->connected) { ++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); NCCLCHECK(selectTransport<0>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); } @@ -646,6 +654,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, if (peer == -1) continue; conn = &channel->peers[peer].send; if (conn->connected) { ++nSkippedSend; continue; } + memset(&connect, 0, sizeof(connect)); NCCLCHECK(selectTransport<1>(comm->peerInfo+comm->rank, comm->peerInfo+peer, &connect, conn, channel->buffSize, channel->id)); NCCLCHECK(bootstrapSend(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); } @@ -654,6 +663,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, if (peer == -1) continue; conn = &channel->peers[peer].send; if (conn->connected) {++nSkippedSend; 
continue; } + memset(&connect, 0, sizeof(connect)); NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); NCCLCHECK(conn->transportComm->connect(&connect, conn)); conn->connected = 1; @@ -663,6 +673,7 @@ static ncclResult_t p2pSetup(struct ncclComm* comm, struct ncclChannel* channel, if (peer == -1) continue; conn = &channel->peers[peer].recv; if (conn->connected) {++nSkippedRecv; continue; } + memset(&connect, 0, sizeof(connect)); NCCLCHECK(bootstrapRecv(comm->bootstrap, peer, &connect, sizeof(struct ncclConnect))); NCCLCHECK(conn->transportComm->connect(&connect, conn)); conn->connected = 1; @@ -877,18 +888,42 @@ static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) { return ncclSuccess; } +NCCL_PARAM(IgnoreCpuAffinity, "IGNORE_CPU_AFFINITY", 0); + static ncclResult_t setCpuAffinity(int cudaDev) { - // Work within the enveloppe we were provided + // Query the CPU affinity set we were provided cpu_set_t mask; SYSCHECK(sched_getaffinity(0, sizeof(cpu_set_t), &mask), "sched_getaffinity"); - // Find the subpart that is local to our GPU +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&mask, affinityStr)); + TRACE(NCCL_INIT, "Current affinity for GPU %d is %s", cudaDev, affinityStr); + } +#endif + + // Find the CPUs that are local to the supplied GPU cpu_set_t gpuMask; NCCLCHECK(getCpuGpuAffinity(cudaDev, &gpuMask)); - cpu_set_t finalMask; - CPU_AND(&finalMask, &mask, &gpuMask); - // If those are not disjoint, try to stay local +#ifdef ENABLE_TRACE + { + char affinityStr[sizeof(cpu_set_t)*2]; + NCCLCHECK(ncclCpusetToStr(&gpuMask, affinityStr)); + TRACE(NCCL_INIT, "CPU GPU affinity for GPU %d is %s", cudaDev, affinityStr); + } +#endif + + cpu_set_t finalMask; + if (ncclParamIgnoreCpuAffinity()) + // Ignore the CPU affinity set and use the GPU one instead + finalMask = gpuMask; + else + // Use a subset of the GPU affinity set + CPU_AND(&finalMask, &mask, &gpuMask); + + // If 
there is a non empty set, use it to set affinity if (CPU_COUNT(&finalMask)) { char affinityStr[sizeof(cpu_set_t)*2]; NCCLCHECK(ncclCpusetToStr(&finalMask, affinityStr)); @@ -1018,8 +1053,9 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs, comms[rank]->threadThreshold = threadThreshold; } + struct ncclConnect* connect; + NCCLCHECK(ncclCalloc(&connect, 2*nranks)); for (int r=0; rtransportComm->connect(connect+ring->next*2+0, send)); } } + free(connect); free(allInfo); free(rings); free(treeIn); @@ -1072,12 +1109,13 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { int savedDevice; int rank, cudaDev; ncclComm_t comm = NULL; - int ncclDevList[ndev]; + int* ncclDevList = NULL; + NCCLCHECK(ncclCalloc(&ncclDevList, ndev)); for (int i=0; irank; +#endif CUDACHECK(cudaGetDevice(&savedDevice)); int commDevice = comm->cudaDev; - int rank = comm->rank; if (savedDevice != commDevice) { CUDACHECK(cudaSetDevice(commDevice)); @@ -1145,7 +1186,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) { if (savedDevice != commDevice) CUDACHECK(cudaSetDevice(savedDevice)); - INFO(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); + TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); return ncclSuccess; } @@ -1155,6 +1196,14 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d nvmlDev %d", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->nvmlDev); + + // Try and prevent a double free of the comm struct (user error) + if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->nvmlDev == -1) { + WARN("comm %p has already been destroyed", comm); + return ncclInvalidArgument; + } + return commDestroy(comm); } diff --git a/projects/rccl/src/misc/checks.cu b/projects/rccl/src/misc/argcheck.cc similarity index 96% rename from projects/rccl/src/misc/checks.cu rename to projects/rccl/src/misc/argcheck.cc index 
a07e577b3c..364f04152f 100644 --- a/projects/rccl/src/misc/checks.cu +++ b/projects/rccl/src/misc/argcheck.cc @@ -1,10 +1,10 @@ /************************************************************************* - * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ -#include "checks.h" +#include "argcheck.h" static ncclResult_t CudaPtrCheck(const void* pointer, struct ncclComm* comm, const char* ptrname, const char* opname) { cudaPointerAttributes attr; diff --git a/projects/rccl/src/misc/group.cu b/projects/rccl/src/misc/group.cc similarity index 98% rename from projects/rccl/src/misc/group.cu rename to projects/rccl/src/misc/group.cc index c428a22aa8..7bc64cd26c 100644 --- a/projects/rccl/src/misc/group.cu +++ b/projects/rccl/src/misc/group.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -118,7 +118,7 @@ ncclResult_t ncclGroupEnd() { int savedDev; CUDACHECK(cudaGetDevice(&savedDev)); int done = ncclGroupIndex; - int doneArray[ncclGroupIndex]; + int doneArray[MAX_ASYNC_OPS]; for (int i=0; i= NET_MAX_IFS*NET_BITS_PER_IF, "NET_MAX_IF static ncclTvalue_t getTvalue(short* distances, int ndev) { ncclTvalue_t tvalue = 0; for (int d=0; dhead < args->end) { if (args->tail < args->end && args->tail < args->head + NCCL_STEPS) { volatile int* sizesFifo = resources->hostRecvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->hostRecvMem->tail; if (args->llMode) { int buffSlot = args->tail%NCCL_STEPS; int size = sizesFifo[buffSlot]; if (size != -1) { - uint32_t flag = args->tail + 1; + uint32_t flag = NCCL_LL_FLAG(args->tail + 1); int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); size = nFifoLines * sizeof(union ncclLLFifoLine); union ncclLLFifoLine* lines = resources->hostRecvMem->llBuff+buffSlot*NCCL_LL_SLICE_LINES; @@ -457,7 +462,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { } } } - } else if (args->tail < resources->hostRecvMem->tail) { + } else if (args->tail < *recvTail) { struct ncclRecvMem* localMem = resources->useGdr ? 
resources->devRecvMem : resources->hostRecvMem; int stepSize = args->channel->buffSize/NCCL_STEPS; // Send through network @@ -486,19 +491,9 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { if (args->head == args->end) { resources->step = args->end; args->idle = 0; - args->state = ncclProxyOpDone; + args->state = ncclProxyOpNone; } } - if (args->state == ncclProxyOpDone) { - union ncclLLFifoLine* llBuff = resources->hostRecvMem->llBuff; - if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - for (int i=0; i< NCCL_LL_BUFF_LINES; i++) llBuff[i].flag1 = llBuff[i].flag2 = resources->step; - resources->step += NCCL_STEPS; - resources->hostSendMem->head = resources->step; - resources->llLastCleaning = resources->step; - } - args->state = ncclProxyOpNone; - } return ncclSuccess; } @@ -522,7 +517,8 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { struct ncclRecvMem* localMem = resources->useGdr ? resources->devRecvMem : resources->hostRecvMem; char* localBuff = args->llMode ? (char*)localMem->llBuff : localMem->buff; void* mhandle = args->llMode ? 
resources->llMhandle : resources->mhandle; - if ((args->tail < args->head + NCCL_STEPS) && (args->tail < (resources->hostSendMem->head) + NCCL_STEPS) && (args->tail < args->end)) { + volatile uint64_t* sendHead = &resources->hostSendMem->head; + if ((args->tail < args->head + NCCL_STEPS) && (args->tail < *sendHead + NCCL_STEPS) && (args->tail < args->end)) { int buffSlot = args->tail%NCCL_STEPS; int sliceSize = stepSize * args->sliceSteps; NCCLCHECK(ncclNetIrecv(resources->netRecvComm, localBuff+buffSlot*stepSize, sliceSize, mhandle, args->requests+buffSlot)); @@ -548,17 +544,9 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { if (args->head == args->end) { resources->step = args->end; args->idle = 0; - args->state = ncclProxyOpDone; + args->state = ncclProxyOpNone; } } - if (args->state == ncclProxyOpDone) { - if (args->llMode && resources->step > resources->llLastCleaning + NCCL_LL_CLEAN_FREQ) { - resources->step += NCCL_STEPS; - while (resources->hostSendMem->head < resources->step); - resources->llLastCleaning = resources->step; - } - args->state = ncclProxyOpNone; - } return ncclSuccess; } diff --git a/projects/rccl/src/transport/net_ib.cu b/projects/rccl/src/transport/net_ib.cc similarity index 99% rename from projects/rccl/src/transport/net_ib.cu rename to projects/rccl/src/transport/net_ib.cc index f7c574b5b0..de72f891e2 100644 --- a/projects/rccl/src/transport/net_ib.cu +++ b/projects/rccl/src/transport/net_ib.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -119,6 +119,7 @@ ncclResult_t ncclIbInit(ncclDebugLogger_t logFunction) { } int found = 0; struct ibv_device_attr devAttr; + memset(&devAttr, 0, sizeof(devAttr)); if (ncclSuccess != wrap_ibv_query_device(context, &devAttr)) { WARN("NET/IB : Unable to query device %s", devices[d]->name); if (ncclSuccess != wrap_ibv_close_device(context)) { return ncclInternalError; } diff --git a/projects/rccl/src/transport/net_socket.cu b/projects/rccl/src/transport/net_socket.cc similarity index 99% rename from projects/rccl/src/transport/net_socket.cu rename to projects/rccl/src/transport/net_socket.cc index 0464b43482..9958936201 100644 --- a/projects/rccl/src/transport/net_socket.cu +++ b/projects/rccl/src/transport/net_socket.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ diff --git a/projects/rccl/src/transport/p2p.cu b/projects/rccl/src/transport/p2p.cc similarity index 94% rename from projects/rccl/src/transport/p2p.cu rename to projects/rccl/src/transport/p2p.cc index 9f3e0b6558..42b549e72e 100644 --- a/projects/rccl/src/transport/p2p.cu +++ b/projects/rccl/src/transport/p2p.cc @@ -57,7 +57,7 @@ static int busIdToCudaDev(const char* busId) { /* Determine if we can communicate with the peer through p2p */ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo) { // Do not use P2P across root complexes by default (provided CUDA permits it) - int p2pLevel = PATH_SOC; + int p2pLevel = PATH_NODE; if (ncclParamP2pDisable() == 1) p2pLevel = 0; if (ncclParamP2pLevel() != -2) p2pLevel = ncclParamP2pLevel(); @@ -70,13 +70,26 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc // Convert the peer's busId into a local cudaDev index (cf. CUDA_VISIBLE_DEVICES) int peerCudaDev = busIdToCudaDev(peerInfo->busId); - if (peerCudaDev == -1) return ncclSuccess; // Peer's CUDA device is not visible in this process + if (peerCudaDev == -1) { + // Peer's CUDA device is not visible in this process +#if CUDART_VERSION >= 10010 + // But in CUDA 10.1 we can still communicate with 'invisible' devices + TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between %d(%s) and %d(%s)", myInfo->nvmlDev, myInfo->busId, peerInfo->nvmlDev, peerInfo->busId); + // Check for NVLink/NVswitch including P2P access + int nvlinkp2p = getNvlinkGpu(myInfo->busId, peerInfo->busId); + if (nvlinkp2p > 0) { + *ret = nvlinkp2p; + return ncclSuccess; + } +#endif + return ncclSuccess; + } TRACE(NCCL_INIT|NCCL_P2P, "Checking P2P connection between [%d=%d] and [%d=%d]", myInfo->cudaDev, myInfo->nvmlDev, peerCudaDev, peerInfo->nvmlDev); // Do not detect topology if we're on the same GPU. 
Note this is not really supported. if (myInfo->cudaDev == peerCudaDev) { - *ret = 1 + PATH_SOC; + *ret = 1 + PATH_SYS; return ncclSuccess; } @@ -104,7 +117,7 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc if (err1 == ncclSuccess && err2 == ncclSuccess) { int distance = pciDistance(myPath, peerPath); if (distance < p2pLevel) { - *ret = 1 + PATH_SOC - distance; + *ret = 1 + PATH_SYS - distance; } } if (err1 == ncclSuccess) free(myPath); @@ -112,6 +125,9 @@ ncclResult_t p2pCanConnect(ncclTvalue_t* ret, struct ncclPeerInfo* myInfo, struc return ncclSuccess; } +#define MAXGPUS_NVLINKP2P 8 // 16 would take an almost infinite time anyway +#define MAXGPUS_PCI 64 + static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentRing, int nRingsMax, int* inTheRing, int current, int remaining, int connect) { int nrings = 0; ncclTvalue_t* line = matrix+current*n; @@ -139,7 +155,7 @@ static int computeRingsRec(ncclTvalue_t* matrix, int n, int *rings, int currentR } } } else { - int ringsSave[nRingsMax*n]; + int ringsSave[MAXCHANNELS*MAXGPUS_NVLINKP2P]; int maxStep = 0; for (int i=0; i 0) { @@ -297,9 +313,9 @@ int p2pComputeRingsSeqNew(ncclTvalue_t* values, int nranks, int* rings, int nrin } static int findClosestPci(ncclTvalue_t* values, int* inRing, int rank, int end, int nranks, int minScore) { - for (int score = PATH_SOC+1; score >= minScore; score--) { + for (int score = PATH_SYS+1; score >= minScore; score--) { int best = -1; - int worst_end_score = PATH_SOC+2; // find the closest to rank, farthest from end + int worst_end_score = PATH_SYS+2; // find the closest to rank, farthest from end for (int n = 0; n < nranks; n++) { if (inRing[n]) continue; if (values[rank*nranks+n] == score) { @@ -321,7 +337,7 @@ int p2pComputeRingsPci(ncclTvalue_t* values, int nranks, int* rings, int nrings, int start = findConnect(nranks, prev+r*nranks); int end = findConnect(nranks, next+r*nranks); - int inRing[nranks]; + int 
inRing[MAXGPUS_PCI]; for (int i=0; i 0) { // NVLink : Connect rings or create new ones + if (nranks > MAXGPUS_NVLINKP2P) { + WARN("Recursive P2P computation cannot work for >8 GPUs"); + return ncclInternalError; + } nrings = p2pComputeRingsNvLink(values, nranks, rings, nrings, prev, next, 0, nthreads); goto end; } @@ -600,6 +620,7 @@ ncclResult_t p2pSendFree(void* resources) { if (sendRes->ipcPtr) CUDACHECK(cudaIpcCloseMemHandle(sendRes->ipcPtr)); CUDACHECK(cudaFree(sendRes->devMem)); + free(sendRes); return ncclSuccess; } @@ -608,6 +629,7 @@ ncclResult_t p2pRecvFree(void* resources) { if (recvRes->ipcPtr) CUDACHECK(cudaIpcCloseMemHandle(recvRes->ipcPtr)); CUDACHECK(cudaFree(recvRes->devMem)); + free(recvRes); return ncclSuccess; } diff --git a/projects/rccl/src/transport/shm.cu b/projects/rccl/src/transport/shm.cc similarity index 98% rename from projects/rccl/src/transport/shm.cu rename to projects/rccl/src/transport/shm.cc index 83cc9d1830..2ec5f2342b 100644 --- a/projects/rccl/src/transport/shm.cu +++ b/projects/rccl/src/transport/shm.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. 
* * See LICENSE.txt for license information ************************************************************************/ @@ -60,11 +60,13 @@ static inline int groupLast(int nranks, int* groups, int group, int rankToAvoid) return -1; } +#define MAXGROUPS 16 + ncclResult_t shmGetRings(int nranks, int* groups, int* subgroups, ncclTvalue_t* values, int* nringsRet, int* prev, int* next, int minScore, int* nthreads) { if (*nringsRet == MAXCHANNELS) *nringsRet = 1; int nGroups = groups[nranks-1] + 1; - int starts[nGroups]; - int ends[nGroups]; + int starts[MAXGROUPS]; + int ends[MAXGROUPS]; for (int ring = 0; ring<*nringsRet; ring++) { int startGroup = -1, endGroup = -1; for (int group = 0; group Date: Mon, 8 Apr 2019 18:16:54 +0200 Subject: [PATCH 04/20] Add pkgconfig file (#190) [ROCm/rccl commit: 9db4b1d801624a00591b7aafd426d6dd23547443] --- projects/rccl/src/Makefile | 28 ++++++++++++++++++++++++---- projects/rccl/src/nccl.pc.in | 10 ++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) create mode 100755 projects/rccl/src/nccl.pc.in diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile index 2d32dca78d..b5baa29b91 100644 --- a/projects/rccl/src/Makefile +++ b/projects/rccl/src/Makefile @@ -17,16 +17,20 @@ LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc \ ##### lib files LIBNAME := libnccl.so STATICLIBNAME := libnccl_static.a +##### pkgconfig files +PKGCONFIGFILE := nccl.pc ##### dirs BUILDDIR ?= $(abspath ../build) INCDIR := $(BUILDDIR)/include LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj +PKGDIR := $(BUILDDIR)/lib/pkgconfig ##### target files INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) STATICLIBTARGET := $(STATICLIBNAME) +PKGTARGET := $(PKGCONFIGFILE) LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl @@ -36,7 +40,7 @@ 
DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a ##### rules build : lib staticlib -lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) +lib : $(INCTARGETS) $(LIBDIR)/$(LIBTARGET) $(PKGDIR)/$(PKGTARGET) staticlib : $(LIBDIR)/$(STATICLIBTARGET) @@ -53,7 +57,7 @@ $(INCDIR)/nccl.h : nccl.h.in # NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) @$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH))) mkdir -p $(INCDIR) - printf "Generating %-35s > %s\n" $< $@ + @printf "Generating %-35s > %s\n" $< $@ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ @@ -77,6 +81,15 @@ $(LIBDIR)/$(STATICLIBTARGET): $(LIBOBJ) $(DEVICELIB) ar cr $@ $(LIBOBJ) $(TMP)/*.o rm -Rf $(TMP) +$(PKGDIR)/nccl.pc : nccl.pc.in + mkdir -p $(PKGDIR) + @printf "Generating %-35s > %s\n" $< $@ + sed -e 's|$${nccl:Prefix}|\$(PREFIX)|g' \ + -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ + -e "s/\$${nccl:Minor}/$(NCCL_MINOR)/g" \ + -e "s/\$${nccl:Patch}/$(NCCL_PATCH)/g" \ + $< > $@ + $(INCDIR)/%.h : %.h @printf "Grabbing %-35s > %s\n" $< $@ mkdir -p $(INCDIR) @@ -87,6 +100,11 @@ $(INCDIR)/nccl_%.h : include/nccl_%.h mkdir -p $(INCDIR) cp -f $< $@ +$(PKGDIR)/%.pc : %.pc + @printf "Grabbing %-35s > %s\n" $< $@ + mkdir -p $(PKGDIR) + cp -f $< $@ + $(OBJDIR)/%.o : %.cc @printf "Compiling %-35s > %s\n" $< $@ mkdir -p `dirname $@` @@ -98,13 +116,15 @@ $(OBJDIR)/%.o : %.cc @rm -f $(@:%.o=%.d.tmp) clean : - rm -rf ${INCDIR} ${LIBDIR} ${OBJDIR} + rm -rf ${INCDIR} ${LIBDIR} ${PKGDIR} ${OBJDIR} $(MAKE) -C collectives/device clean install : lib mkdir -p $(PREFIX)/lib + mkdir -p $(PREFIX)/lib/pkgconfig mkdir -p $(PREFIX)/include - cp -P -v $(BUILDDIR)/lib/* $(PREFIX)/lib/ + cp -P -v $(BUILDDIR)/lib/lib* $(PREFIX)/lib/ + cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ FILESTOFORMAT := $(shell find . 
-name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') diff --git a/projects/rccl/src/nccl.pc.in b/projects/rccl/src/nccl.pc.in new file mode 100755 index 0000000000..0d98494999 --- /dev/null +++ b/projects/rccl/src/nccl.pc.in @@ -0,0 +1,10 @@ +prefix=${nccl:Prefix} +exec_prefix=${prefix} +libdir=${exec_prefix}/lib +includedir=${prefix}/include + +Name: nccl +Description: Optimized primitives for collective multi-GPU communication +Version: ${nccl:Major}.${nccl:Minor}.${nccl:Patch} +Libs: -L${libdir} -lnccl +Cflags: -I${includedir} From 1a4357b99ab10b7176cf35e11f5ab1deb081e252 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 7 May 2019 20:35:14 -0400 Subject: [PATCH 05/20] Allow CUDA runtime library selection (#220) Makes a change to allow the user to select between the static CUDA runtime library (default) and the dynamic CUDA runtime library. Does this by allowing `CUDARTLIB` to be overridden. [ROCm/rccl commit: 60a586ded9312c201cf3ed59818b23514d4a9888] --- projects/rccl/src/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/rccl/src/Makefile b/projects/rccl/src/Makefile index b5baa29b91..452adf52ae 100644 --- a/projects/rccl/src/Makefile +++ b/projects/rccl/src/Makefile @@ -26,6 +26,7 @@ LIBDIR := $(BUILDDIR)/lib OBJDIR := $(BUILDDIR)/obj PKGDIR := $(BUILDDIR)/lib/pkgconfig ##### target files +CUDARTLIB ?= cudart_static INCTARGETS := $(INCEXPORTS:%=$(INCDIR)/%) LIBSONAME := $(LIBNAME:%=%.$(NCCL_MAJOR)) LIBTARGET := $(LIBNAME:%=%.$(NCCL_MAJOR).$(NCCL_MINOR).$(NCCL_PATCH)) @@ -33,7 +34,7 @@ STATICLIBTARGET := $(STATICLIBNAME) PKGTARGET := $(PKGCONFIGFILE) LIBOBJ := $(LIBSRCFILES:%.cc=$(OBJDIR)/%.o) DEPFILES := $(LIBOBJ:%.o=%.d) -LDFLAGS += -L${CUDA_LIB} -lcudart_static -lpthread -lrt -ldl +LDFLAGS += -L${CUDA_LIB} -l$(CUDARTLIB) -lpthread -lrt -ldl DEVICELIB := $(BUILDDIR)/obj/collectives/device/colldevice.a From 00bc3a724aadc8c727d964da24f47bef14429d4c Mon Sep 17 00:00:00 
2001 From: David Addison Date: Tue, 16 Apr 2019 15:27:06 -0700 Subject: [PATCH 06/20] NCCL 2.4.7-1 Performance tweaks for PowerPC builds only; Set default NCCL_MIN_NRINGS to 4 Disable PCI-E NUMA distance detection [ROCm/rccl commit: 0ceaec9cee96ae7658aa45686853286651f36384] --- projects/rccl/makefiles/version.mk | 2 +- projects/rccl/src/misc/rings.cc | 8 +++++++- projects/rccl/src/misc/topo.cc | 6 ++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk index 7abaaaff22..8341f336b6 100644 --- a/projects/rccl/makefiles/version.mk +++ b/projects/rccl/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 4 -NCCL_PATCH := 6 +NCCL_PATCH := 7 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/projects/rccl/src/misc/rings.cc b/projects/rccl/src/misc/rings.cc index 27ca9b6582..7e1fc1b823 100644 --- a/projects/rccl/src/misc/rings.cc +++ b/projects/rccl/src/misc/rings.cc @@ -170,7 +170,13 @@ static ncclResult_t fillCoords(int nranks, int* matrix, int* coords, int* rankTo } } -NCCL_PARAM(MinNrings, "MIN_NRINGS", 0); +#ifdef __PPC__ +// Make the default NCCL_MIN_NRINGS=4 for IBM/Power nodes +#define DEFAULT_MIN_NRINGS 4 +#else +#define DEFAULT_MIN_NRINGS 0 +#endif +NCCL_PARAM(MinNrings, "MIN_NRINGS", DEFAULT_MIN_NRINGS); NCCL_PARAM(MaxNrings, "MAX_NRINGS", 0); /* Users can force the number of threads with an environment variable */ diff --git a/projects/rccl/src/misc/topo.cc b/projects/rccl/src/misc/topo.cc index 636497899e..3f5bdf9c2c 100644 --- a/projects/rccl/src/misc/topo.cc +++ b/projects/rccl/src/misc/topo.cc @@ -39,11 +39,17 @@ int pciDistance(char* path1, char* path2) { } } if (score <= 3) { +#ifdef __PPC__ + // NUMA distance detection and PATH_SYS not supported on IBM/Power nodes + // nodes currently + return PATH_NODE; +#else /* Split the former PATH_SOC distance into PATH_NODE and PATH_SYS based on numaId */ int numaId1 = getNumaId(path1); int numaId2 = 
getNumaId(path2); TRACE(NCCL_INIT, "depth %d score %d path1 %s numaId %d path2 %s numaId %d", depth, score, path1, numaId1, path2, numaId2); return ((numaId1 == numaId2) ? PATH_NODE : PATH_SYS); +#endif } if (score == 4) return PATH_PHB; if (score == depth-1) return PATH_PIX; From 1071f54eeb335551a0504eafacf732cef4993cb5 Mon Sep 17 00:00:00 2001 From: Rajat Chopra Date: Wed, 22 May 2019 21:19:36 -0700 Subject: [PATCH 07/20] Update debian dependencies in README (#228) 'fakeroot' is needed for building deb packages [ROCm/rccl commit: 6d8b2421bc087f142a1edfb5f60a53040a5eac82] --- projects/rccl/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rccl/README.md b/projects/rccl/README.md index abfd1cd4db..7f0a72f5df 100644 --- a/projects/rccl/README.md +++ b/projects/rccl/README.md @@ -55,7 +55,7 @@ To install NCCL on the system, create a package then install it as root. Debian/Ubuntu : ```shell $ # Install tools to create debian packages -$ sudo apt install build-essential devscripts debhelper +$ sudo apt install build-essential devscripts debhelper fakeroot $ # Build NCCL deb package $ make pkg.debian.build $ ls build/pkg/deb/ From d2f579ba8bfd028668d676cc061ae3e0f3f1274a Mon Sep 17 00:00:00 2001 From: Felix Abecassis Date: Fri, 21 Jun 2019 01:25:08 -0700 Subject: [PATCH 08/20] Fix out-of-bounds read in ncclStrToCpuset (#233) The affinityStr string was not null-terminated but was passed to strlen(3). 
Signed-off-by: Felix Abecassis [ROCm/rccl commit: 37e4f8729e5e6604ab739b2353064139af43fe2d] --- projects/rccl/src/init.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 80af287012..66a4865c8f 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -879,10 +879,12 @@ static ncclResult_t getCpuGpuAffinity(int cudaDev, cpu_set_t* mask) { path[PATH_MAX-1] = '\0'; int fd; SYSCHECKVAL(open(path, O_RDONLY), "open", fd); - char affinityStr[sizeof(cpu_set_t)*2]; + char affinityStr[sizeof(cpu_set_t)*2 + 1]; int r = read(fd, affinityStr, sizeof(cpu_set_t)*2); - if (r > 0) + if (r > 0) { + affinityStr[r] = '\0'; NCCLCHECK(ncclStrToCpuset(affinityStr, mask)); + } close(fd); free(cudaPath); return ncclSuccess; From b91d8170f85319340d1a22ae0e266ea1411958ea Mon Sep 17 00:00:00 2001 From: Ke Wen Date: Tue, 25 Jun 2019 13:22:47 -0700 Subject: [PATCH 09/20] 2.4.8-1 Fix #209: improve socket transport performance Split transfers over multiple sockets Launch multiple threads to drive sockets Detect AWS NICs and set nsockets/nthreads accordingly [ROCm/rccl commit: 7c72dee660e4d055b81721dd6b03e4e1c0a983cf] --- projects/rccl/makefiles/version.mk | 2 +- projects/rccl/src/bootstrap.cc | 152 ++++++++-- projects/rccl/src/include/bootstrap.h | 1 + projects/rccl/src/include/net.h | 6 - projects/rccl/src/include/socket.h | 21 +- projects/rccl/src/init.cc | 5 +- projects/rccl/src/transport/net_socket.cc | 334 ++++++++++++++++++---- 7 files changed, 425 insertions(+), 96 deletions(-) diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk index 8341f336b6..bab58ec0bf 100644 --- a/projects/rccl/makefiles/version.mk +++ b/projects/rccl/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 4 -NCCL_PATCH := 7 +NCCL_PATCH := 8 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/projects/rccl/src/bootstrap.cc b/projects/rccl/src/bootstrap.cc index 
9df38e4433..d7c2ac6760 100644 --- a/projects/rccl/src/bootstrap.cc +++ b/projects/rccl/src/bootstrap.cc @@ -9,37 +9,145 @@ #include "utils.h" #include "bootstrap.h" #include "net.h" +#include "socket.h" #include #include // Always use sockets for bootstrap -ncclNet_t* ncclBootstrapNet = &ncclNetSocket; +struct bootstrapNetHandle { + union socketAddress connectAddr; +}; -static ncclResult_t bootstrapNetListen(int dev, void* handle, void** listenComm) { NCCLCHECK(ncclBootstrapNet->listen(dev, handle, listenComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetConnect(int dev, void* handle, void** sendComm) { NCCLCHECK(ncclBootstrapNet->connect(dev, handle, sendComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { NCCLCHECK(ncclBootstrapNet->accept(listenComm, recvComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetTest(void* request, int* done, int* size) { NCCLCHECK(ncclBootstrapNet->test(request, done, size)); return ncclSuccess; } -static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(ncclBootstrapNet->closeSend(sendComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(ncclBootstrapNet->closeRecv(recvComm)); return ncclSuccess; } -static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(ncclBootstrapNet->closeListen(listenComm)); return ncclSuccess; } +struct bootstrapNetComm { + int fd; +}; -// Additional sync functions based on async + test for bootstrap, using host ptrs. 
+/* Init functions */ +static char bootstrapNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; +static union socketAddress bootstrapNetIfAddrs[MAX_IFS]; +static int bootstrapNetIfs = -1; +pthread_mutex_t bootstrapNetLock = PTHREAD_MUTEX_INITIALIZER; + +ncclResult_t bootstrapNetInit() { + if (bootstrapNetIfs == -1) { + pthread_mutex_lock(&bootstrapNetLock); + if (bootstrapNetIfs == -1) { + bootstrapNetIfs = findInterfaces(bootstrapNetIfNames, bootstrapNetIfAddrs, MAX_IF_NAME_SIZE, MAX_IFS); + if (bootstrapNetIfs <= 0) { + WARN("Bootstrap : no socket interface found"); + return ncclInternalError; + } else { + char line[1024]; + char addrline[1024]; + line[0] = '\0'; + for (int i=0; ifd = -1; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetGetSocketAddr(int dev, union socketAddress* addr) { + if (dev >= bootstrapNetIfs) return ncclInternalError; + memcpy(addr, bootstrapNetIfAddrs+dev, sizeof(*addr)); + return ncclSuccess; +} + +/* Socket Interface Selection type */ +enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 }; + +static ncclResult_t bootstrapNetListen(int dev, void* opaqueHandle, void** listenComm) { + struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle; + static_assert(sizeof(struct bootstrapNetHandle) < NCCL_NET_HANDLE_MAXSIZE, "bootstrapNetHandle size too large"); + // if dev >= 0, listen based on dev + if (dev >= 0) { + NCCLCHECK(bootstrapNetGetSocketAddr(dev, &(handle->connectAddr))); + } else if (dev == findSubnetIf) { + // handle stores a remote address + // need to find a local addr that is in the same network as the remote addr + union socketAddress localAddr; + char ifName[MAX_IF_NAME_SIZE]; + if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) { + WARN("NET/Socket : No usable listening interface found"); + return ncclSystemError; + } + // pass the local address back + memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr)); + } // Otherwise, handle 
stores a local address + struct bootstrapNetComm* comm; + NCCLCHECK(bootstrapNetNewComm(&comm)); + NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + *listenComm = comm; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetConnect(int dev, void* opaqueHandle, void** sendComm) { + struct bootstrapNetComm* comm; + NCCLCHECK(bootstrapNetNewComm(&comm)); + struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle; + NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr)); + *sendComm = comm; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetAccept(void* listenComm, void** recvComm) { + struct bootstrapNetComm* lComm = (struct bootstrapNetComm*)listenComm; + struct bootstrapNetComm* rComm; + NCCLCHECK(bootstrapNetNewComm(&rComm)); + struct sockaddr_in sockaddr; + socklen_t socklen = sizeof(struct sockaddr_in); + SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd); + *recvComm = rComm; + return ncclSuccess; +} + +static ncclResult_t bootstrapNetClose(void* opaqueComm) { + struct bootstrapNetComm* comm = (struct bootstrapNetComm*)opaqueComm; + if (comm) { + close(comm->fd); + free(comm); + } + return ncclSuccess; +} + +static ncclResult_t bootstrapNetCloseSend(void* sendComm) { NCCLCHECK(bootstrapNetClose(sendComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetCloseRecv(void* recvComm) { NCCLCHECK(bootstrapNetClose(recvComm)); return ncclSuccess; } +static ncclResult_t bootstrapNetCloseListen(void* listenComm) { NCCLCHECK(bootstrapNetClose(listenComm)); return ncclSuccess; } + +// Additional sync functions static ncclResult_t bootstrapNetSend(void* sendComm, void* data, int size) { - void* request, *mhandle; - NCCLCHECK(ncclBootstrapNet->regMr(sendComm, data, size, NCCL_PTR_HOST, &mhandle)); - NCCLCHECK(ncclBootstrapNet->isend(sendComm, data, size, mhandle, &request)); - NCCLCHECK(ncclBootstrapNet->deregMr(sendComm, mhandle)); - int done = 0; - while (!done) 
NCCLCHECK(bootstrapNetTest(request, &done, NULL)); + struct bootstrapNetComm* comm = (struct bootstrapNetComm*)sendComm; + NCCLCHECK(socketSend(comm->fd, &size, sizeof(int))); + NCCLCHECK(socketSend(comm->fd, data, size)); return ncclSuccess; } static ncclResult_t bootstrapNetRecv(void* recvComm, void* data, int size) { - void* request, *mhandle; - NCCLCHECK(ncclBootstrapNet->regMr(recvComm, data, size, NCCL_PTR_HOST, &mhandle)); - NCCLCHECK(ncclBootstrapNet->irecv(recvComm, data, size, mhandle, &request)); - NCCLCHECK(ncclBootstrapNet->deregMr(recvComm, mhandle)); - int done = 0; - while (!done) NCCLCHECK(bootstrapNetTest(request, &done, NULL)); + struct bootstrapNetComm* comm = (struct bootstrapNetComm*)recvComm; + int recvSize; + NCCLCHECK(socketReceive(comm->fd, &recvSize, sizeof(int))); + if (recvSize > size) { + WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size); + return ncclInternalError; + } + NCCLCHECK(socketReceive(comm->fd, data, std::min(recvSize, size))); + return ncclSuccess; +} + +ncclResult_t bootstrapNetCreateHandle(void* opaqueHandle, const char* str) { + struct bootstrapNetHandle* handle = (struct bootstrapNetHandle*) opaqueHandle; + NCCLCHECK(GetSocketAddrFromString(&handle->connectAddr, str)); return ncclSuccess; } @@ -148,7 +256,7 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out) { char* env = getenv("NCCL_COMM_ID"); if (env) { - if (ncclSocketCreateHandle(&id->extHandleRoot, env) != 0) { + if (bootstrapNetCreateHandle(&id->extHandleRoot, env) != 0) { WARN("Invalid NCCL_COMM_ID, please use format: : or []: or :"); return ncclInvalidArgument; } diff --git a/projects/rccl/src/include/bootstrap.h b/projects/rccl/src/include/bootstrap.h index dd7de2ce0e..dacbc7c5e1 100644 --- a/projects/rccl/src/include/bootstrap.h +++ b/projects/rccl/src/include/bootstrap.h @@ -9,6 +9,7 @@ #include "nccl.h" +ncclResult_t bootstrapNetInit(); ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); ncclResult_t 
bootstrapGetUniqueId(ncclUniqueId* out); ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState); diff --git a/projects/rccl/src/include/net.h b/projects/rccl/src/include/net.h index da3eceaa36..950b5e5c0c 100644 --- a/projects/rccl/src/include/net.h +++ b/projects/rccl/src/include/net.h @@ -13,11 +13,6 @@ extern ncclNet_t* ncclNet; typedef char ncclNetHandle_t[NCCL_NET_HANDLE_MAXSIZE]; -/* Socket Interface Selection type */ -typedef enum { findSubnetIf = -1, - dontCareIf = -2 -} ncclSocketIfSl_t; - // Translation to external API static const char* ncclNetName() { return ncclNet->name; } static ncclResult_t ncclNetDevices(int* ndev) { NCCLCHECK(ncclNet->devices(ndev)); return ncclSuccess; } @@ -36,7 +31,6 @@ static ncclResult_t ncclNetCloseSend(void* sendComm) { NCCLCHECK(ncclNet->closeS static ncclResult_t ncclNetCloseRecv(void* recvComm) { NCCLCHECK(ncclNet->closeRecv(recvComm)); return ncclSuccess; } static ncclResult_t ncclNetCloseListen(void* listenComm) { NCCLCHECK(ncclNet->closeListen(listenComm)); return ncclSuccess; } -extern ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str); extern ncclNet_t ncclNetIb; extern ncclNet_t ncclNetSocket; diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h index 739c0c4968..8197a658d9 100644 --- a/projects/rccl/src/include/socket.h +++ b/projects/rccl/src/include/socket.h @@ -42,7 +42,7 @@ static inline const char *socketToString(struct sockaddr *saddr, char *buf) { return buf; } -static inline short socketToPort(struct sockaddr *saddr) { +static inline uint16_t socketToPort(struct sockaddr *saddr) { return ntohs(saddr->sa_family == AF_INET ? 
((struct sockaddr_in*)saddr)->sin_port : ((struct sockaddr_in6*)saddr)->sin6_port); } @@ -161,7 +161,10 @@ static bool matchSubnet(struct ifaddrs local_if, union socketAddress remote) { } static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAddrs, union socketAddress remoteAddr, int ifNameMaxSize, int maxIfs) { - char line[1024], line_a[1024]; +#ifdef ENABLE_TRACE + char line[1024]; +#endif + char line_a[1024]; int found = 0; struct ifaddrs *interfaces, *interface; getifaddrs(&interfaces); @@ -185,7 +188,7 @@ static int findInterfaceMatchSubnet(char* ifNames, union socketAddress* localAdd // Store the interface name strncpy(ifNames+found*ifNameMaxSize, interface->ifa_name, ifNameMaxSize); - INFO(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a)); + TRACE(NCCL_INIT|NCCL_NET,"NET : Found interface %s:%s in the same subnet as remote address %s", interface->ifa_name, socketToString(&(localAddrs[found].sa), line), socketToString(&(remoteAddr.sa), line_a)); found++; if (found == maxIfs) break; } @@ -390,12 +393,12 @@ retry: #define NCCL_SOCKET_SEND 0 #define NCCL_SOCKET_RECV 1 -static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) { +static ncclResult_t socketProgressOpt(int op, int fd, void* ptr, int size, int* offset, int block) { int bytes = 0; char* data = (char*)ptr; do { - if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), MSG_DONTWAIT); - if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), MSG_DONTWAIT); + if (op == NCCL_SOCKET_RECV) bytes = recv(fd, data+(*offset), size-(*offset), block ? 0 : MSG_DONTWAIT); + if (op == NCCL_SOCKET_SEND) bytes = send(fd, data+(*offset), size-(*offset), block ? 
0 : MSG_DONTWAIT); if (op == NCCL_SOCKET_RECV && bytes == 0) { WARN("Net : Connection closed by remote peer"); return ncclSystemError; @@ -413,9 +416,13 @@ static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* off return ncclSuccess; } +static ncclResult_t socketProgress(int op, int fd, void* ptr, int size, int* offset) { + return socketProgressOpt(op, fd, ptr, size, offset, 0); +} + static ncclResult_t socketWait(int op, int fd, void* ptr, int size, int* offset) { while (*offset < size) - NCCLCHECK(socketProgress(op, fd, ptr, size, offset)); + NCCLCHECK(socketProgressOpt(op, fd, ptr, size, offset, 1)); return ncclSuccess; } diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 80af287012..42499c0313 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -124,14 +124,15 @@ cleanup: } ncclResult_t initNet() { - // Always initialize sockets as we use it for bootstrap - NCCLCHECK(initNet(&ncclNetSocket)); + // Always initialize bootstrap network + NCCLCHECK(bootstrapNetInit()); NCCLCHECK(initNetPlugin(&ncclNet)); if (ncclNet != NULL) return ncclSuccess; if (initNet(&ncclNetIb) == ncclSuccess) { ncclNet = &ncclNetIb; } else { + NCCLCHECK(initNet(&ncclNetSocket)); ncclNet = &ncclNetSocket; } return ncclSuccess; diff --git a/projects/rccl/src/transport/net_socket.cc b/projects/rccl/src/transport/net_socket.cc index 9958936201..ab5e8ecbee 100644 --- a/projects/rccl/src/transport/net_socket.cc +++ b/projects/rccl/src/transport/net_socket.cc @@ -8,6 +8,7 @@ #include "core.h" #include "socket.h" #include "net.h" +#include "param.h" #include #include @@ -15,6 +16,7 @@ #include #include #include +#include /* Init functions */ static char ncclNetIfNames[MAX_IF_NAME_SIZE*MAX_IFS]; @@ -68,7 +70,7 @@ ncclResult_t ncclSocketPciPath(int dev, char** path) { return ncclSuccess; } -static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { +ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { if (dev 
>= ncclNetIfs) return ncclInternalError; memcpy(addr, ncclNetIfAddrs+dev, sizeof(*addr)); return ncclSuccess; @@ -76,105 +78,281 @@ static ncclResult_t GetSocketAddr(int dev, union socketAddress* addr) { /* Communication functions */ +#define MAX_SOCKETS 64 +#define MAX_THREADS 16 +#define MAX_REQUESTS 128 +#define MAX_QUEUE_LEN MAX_REQUESTS +#define MIN_CHUNKSIZE (64*1024) + +NCCL_PARAM(SocketNsocksPerThread, "NSOCKS_PERTHREAD", -2); +NCCL_PARAM(SocketNthreads, "SOCKET_NTHREADS", -2); + struct ncclSocketHandle { union socketAddress connectAddr; + int nSocks; + int nThreads; }; -struct ncclSocketRequest { +struct ncclSocketTask { int op; void* data; int size; int fd; int offset; int used; + ncclResult_t result; }; -struct ncclSocketReqs { - struct ncclSocketRequest* requests; +struct ncclSocketRequest { + int op; + void* data; + int size; + int ctrlFd; + int used; + struct ncclSocketComm* comm; + struct ncclSocketTask* tasks[MAX_SOCKETS]; + int nSubs; +}; + +struct ncclSocketTaskQueue { + int next; + struct ncclSocketTask* tasks; +}; + +enum threadState {start, stop}; + +struct ncclSocketThreadResources { + struct ncclSocketTaskQueue threadTaskQueue; + enum threadState state; + struct ncclSocketComm* comm; + pthread_mutex_t threadLock; + pthread_cond_t threadCond; +}; + +struct ncclSocketListenComm { + int fd; + int nSocks; + int nThreads; }; struct ncclSocketComm { - int fd; - struct ncclSocketReqs reqs; + int ctrlFd; + int fds[MAX_SOCKETS]; + int nSocks; + int nThreads; + int nextFd; + struct ncclSocketRequest requests[MAX_REQUESTS]; + pthread_t helperThread[MAX_THREADS]; + struct ncclSocketThreadResources threadResources[MAX_THREADS]; }; -ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) { +void* persistentSocketThread(void *args_) { + struct ncclSocketThreadResources* resource = (struct ncclSocketThreadResources*)args_; + struct ncclSocketComm* comm = resource->comm; + volatile enum threadState* state = &resource->state; + struct ncclSocketTaskQueue* 
myQueue = &resource->threadTaskQueue; + int nSocksPerThread = comm->nSocks / comm->nThreads; + while (1) { + int idle = 1; + int mark = myQueue->next; // mark newest task seen + for (int i=0; itasks+i+j; + if (r != NULL && r->used == 1 && r->offset < r->size) { + r->result = socketProgress(r->op, r->fd, r->data, r->size, &r->offset); + if (r->result != ncclSuccess) { + WARN("NET/Socket : socket progress error"); + return NULL; + } + idle = 0; + if (r->offset < r->size) repeat = 1; + } + } + } while (repeat); + } + if (idle) { + pthread_mutex_lock(&resource->threadLock); + while (mark == myQueue->next && *state != stop) { // no new tasks, wait + pthread_cond_wait(&resource->threadCond, &resource->threadLock); + } + pthread_mutex_unlock(&resource->threadLock); + } + if (*state == stop) return NULL; + } +} + +ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) { + int nSocksPerThread = ncclParamSocketNsocksPerThread(); + int nThreads = ncclParamSocketNthreads(); + if (nThreads > MAX_THREADS) { + WARN("NET/Socket : NCCL_SOCKET_NTHREADS is greater than the maximum allowed, setting to %d", MAX_THREADS); + nThreads = MAX_THREADS; + } + if (nThreads == -2 || nSocksPerThread == -2) { + // Auto-detection + int autoNt=1, autoNs=1; + char vendorPath[PATH_MAX]; + snprintf(vendorPath, PATH_MAX, "/sys/class/net/%s/device/vendor", ncclNetIfNames+dev*MAX_IF_NAME_SIZE); + char* rPath = realpath(vendorPath, NULL); + int fd = open(rPath, O_RDONLY); + free(rPath); + if (fd == -1) { + // Could not find device vendor. This is handled silently so + // we don't want to print an INFO error. 
+ TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno)); + goto end; + } + char vendor[7]; + strncpy(vendor, "0x0000", 7); + int len; + SYSCHECKVAL(read(fd, vendor, 6), "read", len); + SYSCHECK(close(fd), "close"); + if (strcmp(vendor, "0x1d0f") == 0) { // AWS + autoNt = 2; + autoNs = 8; + } +end: + if (nThreads == -2) nThreads = autoNt; + if (nSocksPerThread == -2) nSocksPerThread = autoNs; + } + int nSocks = nSocksPerThread * nThreads; + if (nSocks > MAX_SOCKETS) { + nSocksPerThread = MAX_SOCKETS/nThreads; + WARN("NET/Socket : the total number of sockets is greater than the maximum allowed, setting NCCL_NSOCKS_PERTHREAD to %d", nSocksPerThread); + nSocks = nSocksPerThread * nThreads; + } + *ns = nSocks; + *nt = nThreads; + INFO(NCCL_INIT, "NET/Socket: Using %d threads and %d sockets per thread", nThreads, nSocksPerThread); + return ncclSuccess; +} + +ncclResult_t ncclSocketNewListenComm(struct ncclSocketListenComm** comm) { NCCLCHECK(ncclCalloc(comm, 1)); (*comm)->fd = -1; return ncclSuccess; } -ncclResult_t ncclSocketCreateHandle(void* opaqueHandle, const char* str) { - struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - NCCLCHECK(GetSocketAddrFromString(&(handle->connectAddr), str)); +ncclResult_t ncclSocketNewComm(struct ncclSocketComm** comm) { + NCCLCHECK(ncclCalloc(comm, 1)); + (*comm)->ctrlFd = -1; + for (int i=0; i < MAX_SOCKETS; i++) { + (*comm)->fds[i] = -1; + } + (*comm)->nextFd = 0; return ncclSuccess; } ncclResult_t ncclSocketListen(int dev, void* opaqueHandle, void** listenComm) { + if (dev < 0) { // data transfer socket is based on specified dev + return ncclInternalError; + } struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; static_assert(sizeof(struct ncclSocketHandle) < NCCL_NET_HANDLE_MAXSIZE, "ncclSocketHandle size too large"); - // if dev >= 0, listen based on dev - if (dev >= 0) { - NCCLCHECK(GetSocketAddr(dev, &(handle->connectAddr))); - } else if (dev == findSubnetIf) { - 
// handle stores a remote address - // need to find a local addr that is in the same network as the remote addr - union socketAddress localAddr; - char ifName[MAX_IF_NAME_SIZE]; - if (findInterfaceMatchSubnet(ifName, &localAddr, handle->connectAddr, MAX_IF_NAME_SIZE, 1) <= 0) { - WARN("NET/Socket : No usable listening interface found"); - return ncclSystemError; - } - // pass the local address back - memcpy(&handle->connectAddr, &localAddr, sizeof(handle->connectAddr)); - } // Otherwise, handle stores a local address - struct ncclSocketComm* comm; - NCCLCHECK(ncclSocketNewComm(&comm)); + struct ncclSocketListenComm* comm; + NCCLCHECK(ncclSocketNewListenComm(&comm)); + NCCLCHECK(GetSocketAddr(dev, &handle->connectAddr)); NCCLCHECK(createListenSocket(&comm->fd, &handle->connectAddr)); + NCCLCHECK(ncclSocketGetNsockNthread(dev, &comm->nSocks, &comm->nThreads)); + handle->nSocks = comm->nSocks; + handle->nThreads = comm->nThreads; *listenComm = comm; return ncclSuccess; } ncclResult_t ncclSocketConnect(int dev, void* opaqueHandle, void** sendComm) { + if (dev < 0) { // data transfer socket is based on specified dev + return ncclInternalError; + } struct ncclSocketComm* comm; NCCLCHECK(ncclSocketNewComm(&comm)); struct ncclSocketHandle* handle = (struct ncclSocketHandle*) opaqueHandle; - NCCLCHECK(connectAddress(&comm->fd, &handle->connectAddr)); + comm->nSocks = handle->nSocks; + comm->nThreads = handle->nThreads; + for (int i=0; inSocks+1; i++) { + int tmpFd, offset=0; + NCCLCHECK(connectAddress(&tmpFd, &handle->connectAddr)); + NCCLCHECK(socketWait(NCCL_SOCKET_SEND, tmpFd, &i, sizeof(int), &offset)); + if (i == comm->nSocks) comm->ctrlFd = tmpFd; + else comm->fds[i] = tmpFd; + } *sendComm = comm; return ncclSuccess; } ncclResult_t ncclSocketAccept(void* listenComm, void** recvComm) { - struct ncclSocketComm* lComm = (struct ncclSocketComm*)listenComm; + struct ncclSocketListenComm* lComm = (struct ncclSocketListenComm*)listenComm; struct ncclSocketComm* rComm; 
NCCLCHECK(ncclSocketNewComm(&rComm)); - struct sockaddr_in sockaddr; - socklen_t socklen = sizeof(struct sockaddr_in); - SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", rComm->fd); + rComm->nSocks = lComm->nSocks; + rComm->nThreads = lComm->nThreads; + for (int i=0; inSocks+1; i++) { + int tmpFd, sendSockIdx, offset=0; + struct sockaddr_in sockaddr; + socklen_t socklen = sizeof(struct sockaddr_in); + SYSCHECKVAL(accept(lComm->fd, (struct sockaddr*)&sockaddr, &socklen), "accept", tmpFd); + NCCLCHECK(socketWait(NCCL_SOCKET_RECV, tmpFd, &sendSockIdx, sizeof(int), &offset)); + if (sendSockIdx == rComm->nSocks) rComm->ctrlFd = tmpFd; + else rComm->fds[sendSockIdx] = tmpFd; + } *recvComm = rComm; return ncclSuccess; } -#define MAX_REQUESTS 128 - -ncclResult_t ncclSocketGetRequest(struct ncclSocketReqs* reqs, int op, void* data, int size, int fd, struct ncclSocketRequest** req) { - if (reqs->requests == NULL) { - NCCLCHECK(ncclCalloc(&reqs->requests, MAX_REQUESTS)); - } +ncclResult_t ncclSocketGetRequest(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketRequest** req) { for (int i=0; irequests+i; + struct ncclSocketRequest* r = comm->requests+i; if (r->used == 0) { r->op = op; r->data = data; r->size = size; - r->fd = fd; - r->offset = -1; + r->ctrlFd = comm->ctrlFd; r->used = 1; + r->comm = comm; + r->nSubs = 0; *req = r; return ncclSuccess; } } - WARN("Socket : unable to allocate requests"); + WARN("NET/Socket : unable to allocate requests"); + return ncclInternalError; +} + +ncclResult_t ncclSocketGetTask(struct ncclSocketComm* comm, int op, void* data, int size, struct ncclSocketTask** req) { + int tid = comm->nextFd % comm->nThreads; + struct ncclSocketThreadResources* res = comm->threadResources+tid; + struct ncclSocketTaskQueue* queue = &res->threadTaskQueue; + // create helper threads and prepare per-thread task queue + if (queue->tasks == NULL) { + NCCLCHECK(ncclCalloc(&queue->tasks, MAX_QUEUE_LEN)); + 
queue->next = 0; + res->comm = comm; + pthread_mutex_init(&res->threadLock, NULL); + pthread_cond_init(&res->threadCond, NULL); + pthread_create(comm->helperThread+tid, NULL, persistentSocketThread, res); + } + struct ncclSocketTask* r = queue->tasks+queue->next; + if (r->used == 0) { + r->op = op; + r->data = data; + r->size = size; + r->fd = comm->fds[comm->nextFd]; + r->offset = 0; + r->result = ncclSuccess; + comm->nextFd = (comm->nextFd + 1) % comm->nSocks; + r->used = 1; + *req = r; + pthread_mutex_lock(&res->threadLock); + queue->next = (queue->next+1)%MAX_QUEUE_LEN; + res->state = start; + pthread_cond_signal(&res->threadCond); + pthread_mutex_unlock(&res->threadLock); + return ncclSuccess; + } + WARN("NET/Socket : unable to allocate subtasks"); return ncclInternalError; } @@ -185,15 +363,15 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { WARN("NET/Socket : test called with NULL request"); return ncclInternalError; } - if (r->offset == -1) { /* try to send/recv size */ + if (r->used == 1) { /* try to send/recv size */ int data = r->size; int offset = 0; - NCCLCHECK(socketProgress(r->op, r->fd, &data, sizeof(int), &offset)); + NCCLCHECK(socketProgress(r->op, r->ctrlFd, &data, sizeof(int), &offset)); if (offset == 0) return ncclSuccess; /* Not ready -- retry later */ // Not sure we could ever receive less than 4 bytes, but just in case ... 
- if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->fd, &data, sizeof(int), &offset)); + if (offset < sizeof(int)) NCCLCHECK(socketWait(r->op, r->ctrlFd, &data, sizeof(int), &offset)); // Check size is less or equal to the size provided by the user if (r->op == NCCL_SOCKET_RECV && data > r->size) { @@ -201,15 +379,33 @@ ncclResult_t ncclSocketTest(void* request, int* done, int* size) { return ncclInternalError; } r->size = data; - r->offset = 0; + r->used = 2; // done exchanging size + // divide into subtasks + int taskSize = std::max(MIN_CHUNKSIZE, DIVUP(r->size, r->comm->nSocks)); + int chunkOffset = 0, i = 0; + while (chunkOffset < r->size) { + int chunkSize = std::min(taskSize, r->size-chunkOffset); + NCCLCHECK(ncclSocketGetTask(r->comm, r->op, (char*)(r->data)+chunkOffset, chunkSize, r->tasks+i++)); + chunkOffset += chunkSize; + } + r->nSubs = i; } - if (r->offset < r->size) { - NCCLCHECK(socketProgress(r->op, r->fd, r->data, r->size, &r->offset)); - } - if (r->offset == r->size) { - if (size) *size = r->size; - *done = 1; - r->used = 0; + if (r->used == 2) { // already exchanged size + int nCompleted = 0; + for (int i=0; inSubs; i++) { + struct ncclSocketTask* sub = r->tasks[i]; + if (sub->result != ncclSuccess) return sub->result; + if (sub->offset == sub->size) nCompleted++; + } + if (nCompleted == r->nSubs) { + if (size) *size = r->size; + *done = 1; + r->used = 0; + for (int i=0; inSubs; i++) { + struct ncclSocketTask* sub = r->tasks[i]; + sub->used = 0; + } + } } return ncclSuccess; } @@ -221,13 +417,13 @@ ncclResult_t ncclSocketDeregMr(void* comm, void* mhandle) { return ncclSuccess; ncclResult_t ncclSocketIsend(void* sendComm, void* data, int size, void* mhandle, void** request) { struct ncclSocketComm* comm = (struct ncclSocketComm*)sendComm; - NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_SEND, data, size, comm->fd, (struct ncclSocketRequest**)request)); + NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_SEND, data, size, (struct 
ncclSocketRequest**)request)); return ncclSuccess; } ncclResult_t ncclSocketIrecv(void* recvComm, void* data, int size, void* mhandle, void** request) { struct ncclSocketComm* comm = (struct ncclSocketComm*)recvComm; - NCCLCHECK(ncclSocketGetRequest(&comm->reqs, NCCL_SOCKET_RECV, data, size, comm->fd, (struct ncclSocketRequest**)request)); + NCCLCHECK(ncclSocketGetRequest(comm, NCCL_SOCKET_RECV, data, size, (struct ncclSocketRequest**)request)); return ncclSuccess; } @@ -236,11 +432,33 @@ ncclResult_t ncclSocketFlush(void* recvComm, void* data, int size, void* mhandle return ncclInternalError; } +ncclResult_t ncclSocketCloseListen(void* opaqueComm) { + struct ncclSocketListenComm* comm = (struct ncclSocketListenComm*)opaqueComm; + if (comm) { + if (comm->fd != -1) close(comm->fd); + free(comm); + } + return ncclSuccess; +} + ncclResult_t ncclSocketClose(void* opaqueComm) { struct ncclSocketComm* comm = (struct ncclSocketComm*)opaqueComm; if (comm) { - free(comm->reqs.requests); - close(comm->fd); + for (int i=0; inThreads; i++) { + struct ncclSocketThreadResources* res = comm->threadResources+i; + if (comm->helperThread[i]) { + pthread_mutex_lock(&res->threadLock); + res->state = stop; + pthread_cond_signal(&res->threadCond); + pthread_mutex_unlock(&res->threadLock); + pthread_join(comm->helperThread[i], NULL); + } + free(res->threadTaskQueue.tasks); + } + if (comm->ctrlFd != -1) close(comm->ctrlFd); + for (int i=0; inSocks; i++) { + if (comm->fds[i] != -1) close(comm->fds[i]); + } free(comm); } return ncclSuccess; @@ -263,5 +481,5 @@ ncclNet_t ncclNetSocket = { ncclSocketTest, ncclSocketClose, ncclSocketClose, - ncclSocketClose + ncclSocketCloseListen }; From ee08e8b421054770fef24c1b6d892a116d5546d9 Mon Sep 17 00:00:00 2001 From: Hirochika Asai Date: Wed, 10 Jul 2019 06:45:41 +0900 Subject: [PATCH 10/20] Add the exact matching modifier support "=" to the NCCL_IB_HCA variable (#236) Perform exact matching when the prefix "=" is specified in the NCCL_IB_HCA variable 
to exclude HCAs mlx5_X[0-9]+ when mlx5_X is specified. [ROCm/rccl commit: 0b192d2299146e64a096aee16f8b8f7638d2d9d4] --- projects/rccl/src/include/socket.h | 3 ++- projects/rccl/src/include/utils.h | 2 +- projects/rccl/src/misc/utils.cc | 14 ++++++++------ projects/rccl/src/transport/net_ib.cc | 3 ++- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h index 8197a658d9..68ce235d62 100644 --- a/projects/rccl/src/include/socket.h +++ b/projects/rccl/src/include/socket.h @@ -66,6 +66,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre #endif struct netIf userIfs[MAX_IFS]; bool searchNot = prefixList && prefixList[0] == '^'; + bool searchExact = prefixList && prefixList[0] == '='; int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS); int found = 0; @@ -92,7 +93,7 @@ static int findInterfaces(const char* prefixList, char* names, union socketAddre } // check against user specified interfaces - if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs) ^ searchNot)) { + if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) { continue; } diff --git a/projects/rccl/src/include/utils.h b/projects/rccl/src/include/utils.h index 29b72ad186..93e72c80b4 100644 --- a/projects/rccl/src/include/utils.h +++ b/projects/rccl/src/include/utils.h @@ -20,6 +20,6 @@ struct netIf { }; int parseStringList(const char* string, struct netIf* ifList, int maxList); -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize); +bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact); #endif diff --git a/projects/rccl/src/misc/utils.cc b/projects/rccl/src/misc/utils.cc index 5e884ae582..509375563e 100644 --- a/projects/rccl/src/misc/utils.cc +++ b/projects/rccl/src/misc/utils.cc @@ -147,8 +147,8 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) 
{ if (!string) return 0; const char* ptr = string; - // Ignore "^" prefix, will be detected outside of this function - if (ptr[0] == '^') ptr++; + // Ignore "^" or "=" prefix, will be detected outside of this function + if (ptr[0] == '^' || ptr[0] == '=') ptr++; int ifNum = 0; int ifC = 0; @@ -177,8 +177,10 @@ int parseStringList(const char* string, struct netIf* ifList, int maxList) { return ifNum; } -static bool matchPrefix(const char* string, const char* prefix) { - return (strncmp(string, prefix, strlen(prefix)) == 0); +static bool matchIf(const char* string, const char* ref, bool matchExact) { + // Make sure to include '\0' in the exact case + int matchLen = matchExact ? strlen(string) + 1 : strlen(ref); + return strncmp(string, ref, matchLen) == 0; } static bool matchPort(const int port1, const int port2) { @@ -189,12 +191,12 @@ static bool matchPort(const int port1, const int port2) { } -bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize) { +bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact) { // Make an exception for the case where no user list is defined if (listSize == 0) return true; for (int i=0; i + + project.paths.construct_build_prefix() + def command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix} + LD_LIBRARY_PATH=/opt/rocm/hcc/lib CXX=${project.compiler.compiler_path} ${project.paths.build_command} + """ + + sh command + } + + def testCommand = + { + platform, project-> + + def command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix}/build/release/test + HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes + """ + + sh command + //junit "${project.paths.project_build_prefix}/build/release/*.xml" + } + + def packageCommand = + { + platform, project-> + + def command = """ + set -x + cd ${project.paths.project_build_prefix}/build + make package + rm -rf package && mkdir -p package + mv *.deb 
package/ + sudo dpkg -i package/*.deb + """ + + + //platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/package/*.deb""") + } + + buildProjectNoDocker(rccl, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) + +} \ No newline at end of file diff --git a/projects/rccl/LICENSE.txt b/projects/rccl/LICENSE.txt index e318c66695..60db84a684 100644 --- a/projects/rccl/LICENSE.txt +++ b/projects/rccl/LICENSE.txt @@ -1,5 +1,6 @@ Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions diff --git a/projects/rccl/NOTICES.txt b/projects/rccl/NOTICES.txt new file mode 100644 index 0000000000..1b9bcc8eec --- /dev/null +++ b/projects/rccl/NOTICES.txt @@ -0,0 +1,66 @@ +Notices and Licenses file +_______________________________________________________________ + +Dependencies on nvidia-nccl v2.3.7-1 (BSD3) +Copyright (c) 2015-2018, NVIDIA CORPORATION. +Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National + Laboratory, the U.S. Department of Energy, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The U.S. Department of Energy funded the development of this software +under subcontract 7078610 with Lawrence Berkeley National Laboratory. + + +nvidia-nccl v2.3.7-1 (BSD2) +Copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of NVIDIA CORPORATION, Lawrence Berkeley National + Laboratory, the U.S. Department of Energy, nor the names of their + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The U.S. Department of Energy funded the development of this software +under subcontract 7078610 with Lawrence Berkeley National Laboratory. \ No newline at end of file diff --git a/projects/rccl/README.md b/projects/rccl/README.md index 7f0a72f5df..56eca69c52 100644 --- a/projects/rccl/README.md +++ b/projects/rccl/README.md @@ -1,92 +1,80 @@ -# NCCL +# RCCL -Optimized primitives for collective multi-GPU communication. +ROCm Communication Collectives Library ## Introduction -NCCL (pronounced "Nickel") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, NVLink, NVswitch, as well as networking using InfiniBand Verbs or TCP/IP sockets. NCCL supports an arbitrary number of GPUs installed in a single node or across multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications. +RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node, and can be used in either single- or multi-process (e.g., MPI) applications. 
Multi node support is planned for a future release. -For more information on NCCL usage, please refer to the [NCCL documentation](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/index.html). - -## What's inside - -At present, the library implements the following collectives operations: - -- all-reduce -- all-gather -- reduce-scatter -- reduce -- broadcast - -These operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API. +The collective operations are implemented using ring algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API. ## Requirements -NCCL requires at least CUDA 7.0 and Kepler or newer GPUs. For PCIe based platforms, best performance is achieved when all GPUs are located on a common PCIe root complex, but multi-socket configurations are also supported. +1. ROCm supported GPUs +2. ROCm stack installed on the system (HIP runtime & HCC) +3. For building and running the unit tests, chrpath will need to be installed on your machine first. (sudo apt-get install chrpath) -## Build +## Quickstart RCCL Build -Note: the official and tested builds of NCCL can be downloaded from: https://developer.nvidia.com/nccl. You can skip the following build steps if you choose to use the official builds. +RCCL directly depends on HIP runtime & HCC C++ compiler which are part of the ROCm software stack. +In addition, HC Direct Function call support needs to be present on your machine. There are binaries for hcc and HIP that need to be installed to get HC Direct Function call support. These binaries are currently packaged with roc-master, and will be included in ROCm 2.4. 
-To build the library : +The root of this repository has a helper script 'install.sh' to build and install RCCL on Ubuntu with a single command. It does not take a lot of options and hard-codes configuration that can be specified through invoking cmake directly, but it's a great way to get started quickly and can serve as an example of how to build/install. + +* `./install.sh` -- builds library including unit tests +* `./install.sh -i` -- builds and installs the library to /opt/rocm/rccl; installation path can be changed with --prefix argument (see below.) +* `./install.sh -h` -- shows help +* `./install.sh -t` -- builds library including unit tests +* `./install.sh -r` -- runs unit tests (must be already built) +* `./install.sh -p` -- builds RCCL package +* `./install.sh --prefix` -- specify custom path to install RCCL to (default:/opt/rocm) + +## Manual build +#### To build the library : ```shell -$ cd nccl -$ make -j src.build +$ git clone https://github.com/ROCmSoftwarePlatform/rccl.git +$ cd rccl +$ mkdir build +$ cd build +$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install .. +$ make -j 8 ``` +You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`. -If CUDA is not installed in the default /usr/local/cuda path, you can define the CUDA path with : +#### To build the RCCL package and install package : + +Assuming you have already cloned this repository and built the library as shown in the previous section: ```shell -$ make src.build CUDA_HOME= +$ cd rccl/build +$ make package +$ sudo dpkg -i *.deb ``` -NCCL will be compiled and installed in `build/` unless `BUILDDIR` is set. - -By default, NCCL is compiled for all supported architectures. 
To accelerate the compilation and reduce the binary size, consider redefining `NVCC_GENCODE` (defined in `makefiles/common.mk`) to only include the architecture of the target platform : -```shell -$ make -j src.build NVCC_GENCODE="-gencode=arch=compute_70,code=sm_70" -``` - -## Install - -To install NCCL on the system, create a package then install it as root. - -Debian/Ubuntu : -```shell -$ # Install tools to create debian packages -$ sudo apt install build-essential devscripts debhelper fakeroot -$ # Build NCCL deb package -$ make pkg.debian.build -$ ls build/pkg/deb/ -``` - -RedHat/CentOS : -```shell -$ # Install tools to create rpm packages -$ sudo yum install rpm-build rpmdevtools -$ # Build NCCL rpm package -$ make pkg.redhat.build -$ ls build/pkg/rpm/ -``` - -OS-agnostic tarball : -```shell -$ make pkg.txz.build -$ ls build/pkg/txz/ -``` +RCCL package install requires sudo/root access because it creates a directory called "rccl" under /opt/rocm/. This is an optional step and RCCL can be used directly by including the path containing librccl.so. ## Tests -Tests for NCCL are maintained separately at https://github.com/nvidia/nccl-tests. +There are unit tests implemented with the Googletest framework in RCCL, which are currently a work-in-progress. To invoke the unit tests, go to the rccl-install folder, then the test/ subfolder, and execute the appropriate unit test executable(s). Several notes for running the unit tests: +1. The LD_LIBRARY_PATH environment variable will need to be set to include /path/to/rccl-install/lib/ in order to run the unit tests. +2. The HSA_FORCE_FINE_GRAIN_PCIE environment variable will need to be set to 1 in order to run the unit tests. 
+ +An example call to the unit tests: ```shell -$ git clone https://github.com/NVIDIA/nccl-tests.git -$ cd nccl-tests -$ make -$ ./build/all_reduce_perf -b 8 -e 256M -f 2 -g +$ LD_LIBRARY_PATH=rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 rccl-install/test/UnitTests ``` +There are also other performance and error-checking tests for RCCL. These are maintained separately at https://github.com/ROCmSoftwarePlatform/rccl-tests. +See the rccl-tests README for more information on how to build and run those tests. + +## Library and API Documentation + +Please refer to the [Library documentation](http://rccl.readthedocs.io/) for current documentation. + ## Copyright -All source code and accompanying documentation is copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. +All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. + +All modifications are copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. diff --git a/projects/rccl/docs/Doxyfile b/projects/rccl/docs/Doxyfile new file mode 100644 index 0000000000..42dae7cc30 --- /dev/null +++ b/projects/rccl/docs/Doxyfile @@ -0,0 +1,2456 @@ +# Doxyfile 1.8.10 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the config file +# that follow. The default is UTF-8 which is also the encoding used for all text +# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv +# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv +# for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = "RCCL" + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = v3.0.1.0 + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HiP" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = ./rocm.jpg + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. 
If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docBin + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise cause +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English.
+ +OUTPUT_LANGUAGE = English + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. 
+ +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before file names in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful if your file system doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO.
+ +JAVADOC_AUTOBRIEF = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = NO + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". 
You can put \n's in the value part of an alias to insert +# newlines. + +ALIASES = + +# This tag can be used to specify a number of word-keyword mappings (TCL only). +# A mapping has the form "name=value". For example adding "class=itcl::class" +# will allow you to use the command class in the itcl::class meaning. + +TCL_SUBST = + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, Javascript, +# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: +# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: +# Fortran. 
In the latter case the parser tries to guess whether the code is fixed +# or free formatted code, this is the default for Fortran type files), VHDL. For +# instance to make doxygen treat .inc files as Fortran files (default is PHP), +# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See http://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibility issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match function declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also makes the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = NO + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support.
+# The default value is: NO. + +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = YES + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES.
+ +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = NO + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = NO + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = YES + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. 
The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +SHOW_NAMESPACES = NO + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = NO + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = NO + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = NO + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = YES + +# This flag is only useful for Objective-C code. 
If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# (class|struct|union) declarations. If set to NO, these declarations will be +# included in the documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. 
If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file +# names in lower-case letters. If set to YES, upper-case letters are also +# allowed. This is useful if you have classes or files whose names only differ +# in case and if your file system supports case sensitive file names. Windows +# and Mac users are advised to set this option to NO. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. 
+ +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if <section_label> ... \endif and \cond <section_label> +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely.
The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command command input-file, where command is the value of the +# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. 
You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. +# The default value is: NO. + +WARN_NO_PARAMDOC = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). + +WARN_LOGFILE = + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. 
+ +INPUT = nccl.h + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf, *.as and *.js. + +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.f90 \ + *.f \ + *.for \ + *.tcl \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.as \ + *.js + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = NO + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. 
This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = * + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. 
+# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. + +USE_MDFILE_AS_MAINPAGE = ../README.md + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = YES + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# function all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. 
+ +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. + +REFERENCES_LINK_SOURCE = YES + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see http://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this.
+# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the +# cost of reduced performance. This can be particularly helpful with template +# rich C++ code for which doxygen's built-in parser lacks the necessary type +# information. +# Note: The availability of this option depends on whether or not doxygen was +# compiled with the --with-libclang option. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in +# which the alphabetical index list will be split. +# Minimum value: 1, maximum value: 20, default value: 5. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +COLS_IN_ALPHA_INDEX = 5 + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. 
The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. 
For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. +# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. 
+# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries to 1 will produce a fully collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a fully expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: http://developer.apple.com/tools/xcode/), introduced with +# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# Makefile in the HTML output directory.
Running make will produce the docset in +# that directory and running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html +# for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the master .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location of Qt's +# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the +# generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 1 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES.
+ +FORMULA_TRANSPARENT = YES + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# http://www.mathjax.org) which uses client side Javascript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = YES + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from http://www.mathjax.org before deployment. +# The default value is: http://cdn.mathjax.org/mathjax/latest. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering.
For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use <access key> + S +# (what the <access key> is depends on the OS and browser, but it is typically +# <CTRL>, <ALT>/