diff --git a/CMakeLists.txt b/CMakeLists.txt index 08e5ef1744..16e1a5208b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -203,8 +203,8 @@ if("${HIP_COMPILER}" MATCHES "clang") find_program( hipcc_executable hipcc ) execute_process(COMMAND bash "-c" "${hipcc_executable} -help | grep 'parallel-jobs'" OUTPUT_VARIABLE hipcc_parallel_jobs) if("${hipcc_parallel_jobs}" MATCHES "parallel-jobs") - target_compile_options(rccl PRIVATE -parallel-jobs=4 PRIVATE -Wno-format-nonliteral) - target_link_libraries(rccl PRIVATE -parallel-jobs=4) + target_compile_options(rccl PRIVATE -parallel-jobs=8 PRIVATE -Wno-format-nonliteral) + target_link_libraries(rccl PRIVATE -parallel-jobs=8) endif() # RCCL static lib uses -fgpu-rdc which requires hipcc as the linker and archiver diff --git a/makefiles/version.mk b/makefiles/version.mk index f64e8ad49a..6a40a92637 100644 --- a/makefiles/version.mk +++ b/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 -NCCL_MINOR := 8 -NCCL_PATCH := 4 +NCCL_MINOR := 9 +NCCL_PATCH := 6 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/src/Makefile b/src/Makefile index 487f790985..a548840b3d 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. # # See LICENSE.txt for license information # @@ -10,7 +10,7 @@ include ../makefiles/version.mk ##### src files INCEXPORTS := nccl.h nccl_net.h LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc proxy.cc \ - misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \ + misc/nvmlwrap.cc misc/ibvwrap.cc misc/gdrwrap.cc misc/utils.cc misc/argcheck.cc \ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \ collectives/sendrecv.cc collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc @@ -56,8 +56,8 @@ ALWAYS_REBUILD: $(LIBDIR)/$(LIBTARGET) $(LIBDIR)/$(STATICLIBTARGET) : $(LIBOBJ) $(INCDIR)/nccl.h : nccl.h.in -# NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) - @$(eval NCCL_VERSION := $(shell printf "%d%d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH))) +# NCCL_VERSION(X,Y,Z) ((X) * 10000 + (Y) * 100 + (Z)) + @$(eval NCCL_VERSION := $(shell printf "%d%02d%02d" $(NCCL_MAJOR) $(NCCL_MINOR) $(NCCL_PATCH))) mkdir -p $(INCDIR) @printf "Generating %-35s > %s\n" $< $@ sed -e "s/\$${nccl:Major}/$(NCCL_MAJOR)/g" \ @@ -129,7 +129,7 @@ install : lib cp -P -v $(BUILDDIR)/lib/pkgconfig/* $(PREFIX)/lib/pkgconfig/ cp -v $(BUILDDIR)/include/* $(PREFIX)/include/ -FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|nccl.h') +FILESTOFORMAT := $(shell find . -name ".\#*" -prune -o \( -name "*.cc" -o -name "*.h" \) -print | grep -v -E 'ibvwrap.h|nvmlwrap.h|gdrwrap.h|nccl.h') # Note that formatting.mk defines a new target so in order to not overwrite the default target, # it shouldn't be included at the top. Also, it uses the above definition of FILESTOFORMAT as well # as the BUILDDIR variable. diff --git a/src/bootstrap.cc b/src/bootstrap.cc index d7a50e26a0..4488bbc665 100644 --- a/src/bootstrap.cc +++ b/src/bootstrap.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -222,6 +222,7 @@ ncclResult_t bootstrapGetUniqueId(ncclUniqueId* id) { struct unexConn { int peer; + int tag; int fd; struct unexConn* next; }; @@ -445,21 +446,23 @@ ncclResult_t bootstrapAllGather(void* commState, void* allData, int size) { return ncclSuccess; } -ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size) { +ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size) { struct extState* state = (struct extState*)commState; int tmpSendFd; NCCLCHECK(connectAddress(&tmpSendFd, state->peerCommAddresses+peer)); NCCLCHECK(bootstrapNetSend(tmpSendFd, &state->rank, sizeof(int))); + NCCLCHECK(bootstrapNetSend(tmpSendFd, &tag, sizeof(int))); NCCLCHECK(bootstrapNetSend(tmpSendFd, data, size)); close(tmpSendFd); return ncclSuccess; } -ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) { +ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int tag, int fd) { // New unex struct unexConn* unex; NCCLCHECK(ncclCalloc(&unex, 1)); unex->peer = peer; + unex->tag = tag; unex->fd = fd; // Enqueue @@ -473,11 +476,11 @@ ncclResult_t unexpectedEnqueue(struct extState* state, int peer, int fd) { return ncclSuccess; } -int unexpectedDequeue(struct extState* state, int peer) { +int unexpectedDequeue(struct extState* state, int peer, int tag) { struct unexConn* elem = state->unexpectedConnections; struct unexConn* prev = NULL; while (elem) { - if (elem->peer == peer) { + if (elem->peer == peer && elem->tag == tag) { if (prev == NULL) { state->unexpectedConnections = elem->next; } else { @@ -494,13 +497,13 @@ int unexpectedDequeue(struct extState* state, int peer) { } // We can't know who we'll receive from, so we need to receive everything at once -ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) { +ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size) { struct extState* state = (struct extState*)commState; int tmpRecvFd; // Search unexpected connections first - if ((tmpRecvFd = unexpectedDequeue(state, peer)) != -1) { + if ((tmpRecvFd = unexpectedDequeue(state, peer, tag)) != -1) { NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size)); close(tmpRecvFd); return ncclSuccess; @@ -509,15 +512,16 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) { // Then look for new connections while (1) { NCCLCHECK(bootstrapNetAccept(state->extListenFd, &tmpRecvFd)); - int newPeer; + int newPeer, newTag; NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newPeer, sizeof(int))); - if (newPeer == peer) { + NCCLCHECK(bootstrapNetRecv(tmpRecvFd, &newTag, sizeof(int))); + if (newPeer == peer && newTag == tag) { NCCLCHECK(bootstrapNetRecv(tmpRecvFd, ((char*)data), size)); close(tmpRecvFd); return ncclSuccess; } // Unexpected connection. Save for later. - NCCLCHECK(unexpectedEnqueue(state, newPeer, tmpRecvFd)); + NCCLCHECK(unexpectedEnqueue(state, newPeer, newTag, tmpRecvFd)); } } diff --git a/src/channel.cc b/src/channel.cc index 12cc44238d..5fa25c7bce 100644 --- a/src/channel.cc +++ b/src/channel.cc @@ -1,12 +1,16 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "channel.h" #include "param.h" +#include "gdrwrap.h" + +// GDRCOPY support: FIFO_ENABLE when enabled locates a workFifo in CUDA memory +NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1); ncclResult_t initChannel(struct ncclComm* comm, int channelid) { struct ncclChannel* channel = comm->channels+channelid; @@ -21,14 +25,25 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) { NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network) NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); for (size_t i=0; inRanks+1; ++i) { - channel->peers[i].send.comm = comm; - channel->peers[i].recv.comm = comm; - channel->peers[i].p2pSend.comm = comm; - channel->peers[i].p2pRecv.comm = comm; + for (int b=0; bpeers[i].send[b].comm = comm; + channel->peers[i].recv[b].comm = comm; + } } // Per-channel operation list. NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS)); + if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { + // GDRCOPY support + // We allocate a workFifo in GDR mapped CUDA memory + // But we still allocate the Host workFifo so that we + // can copy the work elements to CUDA memory on kernel launch + NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc)); + } else { + // The device workFifo is the Host one + channel->workFifoDev = channel->workFifo; + } + return ncclSuccess; } @@ -36,6 +51,10 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { if (channel->id == -1) return ncclSuccess; // Operation list NCCLCHECK(ncclCudaHostFree(channel->workFifo)); + if (channel->gdrMemDesc) { + // GDRCOPY support + NCCLCHECK(ncclGdrCudaFree(channel->gdrMemDesc)); + } // Free Ring index to rank tables free(channel->ring.userRanks); @@ -45,17 +64,15 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) { // Note: free all send resources first due to CollNet arrangement for (int r=0; rpeers+r; - if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources)); - if (peer->send.transportResources == peer->p2pSend.transportResources) peer->p2pSend.transportResources = NULL; - peer->send.transportResources = NULL; - if (peer->p2pSend.transportResources) NCCLCHECK(peer->p2pSend.transportComm->free(peer->p2pSend.transportResources)); + for (int b=0; bsend[b].transportResources) NCCLCHECK(peer->send[b].transportComm->free(peer->send[b].transportResources)); + } } for (int r=0; rpeers+r; - if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources)); - if (peer->recv.transportResources == peer->p2pRecv.transportResources) peer->p2pRecv.transportResources = NULL; - peer->recv.transportResources = NULL; - if (peer->p2pRecv.transportResources) NCCLCHECK(peer->p2pRecv.transportComm->free(peer->p2pRecv.transportResources)); + for (int b=0; brecv[b].transportResources) NCCLCHECK(peer->recv[b].transportComm->free(peer->recv[b].transportResources)); + } } // Free the peer structures. diff --git a/src/clique/CliqueManager.cc b/src/clique/CliqueManager.cc index 7e6100d603..c63da205da 100644 --- a/src/clique/CliqueManager.cc +++ b/src/clique/CliqueManager.cc @@ -350,7 +350,7 @@ ncclResult_t CliqueManager::SetCliqueArgs(ncclWorkElem* args) } // Prepare clique argments (NOTE: clique pointers are not ready yet) - int opIndex = args->opCount % NCCL_MAX_OPS; + int opIndex = args->op.opCount % NCCL_MAX_OPS; args->clique.ptrs = &m_pinnedCliquePtrs[opIndex]; return ncclSuccess; diff --git a/src/collectives/all_gather_api.cc b/src/collectives/all_gather_api.cc index 7f23bb7894..83ff3cbbd1 100644 --- a/src/collectives/all_gather_api.cc +++ b/src/collectives/all_gather_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/all_reduce_api.cc b/src/collectives/all_reduce_api.cc index aee52927d0..1acbe60e38 100644 --- a/src/collectives/all_reduce_api.cc +++ b/src/collectives/all_reduce_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/all_to_all_api.cc b/src/collectives/all_to_all_api.cc index a58f42e1eb..562f451a35 100644 --- a/src/collectives/all_to_all_api.cc +++ b/src/collectives/all_to_all_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/all_to_allv_api.cc b/src/collectives/all_to_allv_api.cc index b0ac549d43..42e021ed4a 100755 --- a/src/collectives/all_to_allv_api.cc +++ b/src/collectives/all_to_allv_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/broadcast_api.cc b/src/collectives/broadcast_api.cc index 9193eb3195..a08961ca43 100644 --- a/src/collectives/broadcast_api.cc +++ b/src/collectives/broadcast_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 52d3bd96f8..b86d755783 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 21cb89b39a..2fe3d1d2ab 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -169,7 +169,7 @@ class ncclFunction= 256) nthreadsSplit += 64; if (tree->up == -1) { if (tid < nthreads) { // ReduceAndBroadcast : max number of recv is 3, max number of send is 3 @@ -218,59 +218,78 @@ class ncclFunction class ncclFunction { +#define COLLNET_COPY_THREADS 64 public: __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) { const int tid = threadIdx.x; - const int nthreads = args->nThreads; + //const int nthreads = args->nThreads-3*WARP_SIZE; const int bid = args->coll.bid; const int nChannels = args->coll.nChannels; struct ncclDevComm* comm = args->comm; struct ncclChannel* channel = comm->channels+blockIdx.x; - struct ncclTree* tree = &channel->collTree; + struct ncclDirect* tree = &channel->collTree; const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS); int chunkSize = args->coll.lastChunkSize; - const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = nChannels*chunkSize; const ssize_t size = args->coll.count; - - if (loopSize > size) { - chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize; - } + const ssize_t loopSize = nChannels*tree->nHeads*chunkSize; // Compute pointers const T * __restrict__ thisInput = (const T*)args->sendbuff; T * __restrict__ thisOutput = (T*)args->recvbuff; - if (blockIdx.x < nChannels) { // first half of the channels do reduce - ncclPrimitives - prims(tid, nthreads, tree->down, &tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0); + const int hasUp = (tree->up[0] >= 0) ? 1 : 0; + const int hasDn = (tree->down[0] >= 0) ? 1 : 0; + const int nThreadsScatter = (hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 2*COLLNET_COPY_THREADS : 0; + const int nThreadsGather = (hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 1*COLLNET_COPY_THREADS : 0; + const int nThreadsBcast = (hasUp && hasDn) ? COLLNET_COPY_THREADS : hasUp ? 0 : 1*COLLNET_COPY_THREADS; + // Gather does not need sync threads, sparing one more warp for reduce + const int nThreadsReduce = NCCL_SIMPLE_MAX_NTHREADS - nThreadsScatter - nThreadsGather - nThreadsBcast; + const int tidStartBcast = nThreadsGather; + const int tidStartScatter = tidStartBcast + nThreadsBcast; + const int tidStartReduce = tidStartScatter + nThreadsScatter; + + if (tid >= tidStartScatter && tid < tidStartReduce && hasUp) { + // Scatter + ncclPrimitives + prims(tid-tidStartScatter, nThreadsScatter, NULL, tree->up, NULL, stepSize, channel, comm, ncclShmem->ptrs, 2); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - // Up - ssize_t offset = gridOffset + bid*chunkSize; + ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; + int nelem = min(tree->nHeads*chunkSize, size-offset); + prims.scatter(thisInput+offset, nelem, chunkSize, tree->headRank, tree->shift); + } + } else if (tid >= tidStartReduce && tree->out != -1) { + // Reduce, send to network + ncclPrimitives + prims(tid-tidStartReduce, nThreadsReduce, tree->down, &tree->out, NULL, stepSize, channel, comm, ncclShmem->ptrs, 3); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); - if (tree->up == -1) { - prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); - } else if (tree->down[0] == -1) { - prims.send(thisInput+offset, nelem); - } else { + if (hasDn) { prims.recvReduceSend(thisInput+offset, nelem); + } else { + prims.send(thisInput+offset, nelem); } } - } - - if (blockIdx.x >= nChannels && blockIdx.x < 2*nChannels) { // second half of the channels do broadcast - ncclPrimitives - prims(tid, nthreads, &tree->up, tree->down, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0); + } else if (tid < tidStartBcast && hasUp) { + // Gather + ncclPrimitives + prims(tid, nThreadsGather, tree->up, NULL, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 0); for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - // Down - ssize_t offset = gridOffset + bid*chunkSize; + ssize_t offset = gridOffset + bid*tree->nHeads*chunkSize; + int nelem = min(tree->nHeads*chunkSize, size-offset); + prims.gather(thisOutput+offset, nelem, chunkSize, tree->headRank, tree->shift); + } + } else if (tid >= tidStartBcast && tid < tidStartScatter && tree->out != -1) { + // Recv from network, broadcast + ncclPrimitives + prims(tid-tidStartBcast, nThreadsBcast, &tree->out, tree->down, thisOutput, stepSize, channel, comm, ncclShmem->ptrs, 1); + for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { + ssize_t offset = gridOffset + (bid*tree->nHeads+tree->headRank)*chunkSize; int nelem = min(chunkSize, size-offset); - if (tree->up == -1) { - prims.send(thisOutput+offset, nelem); - } else if (tree->down[0] == -1) { - prims.recv(thisOutput+offset, nelem); - } else { + if (hasDn) { prims.recvCopySend(thisOutput+offset, nelem); + } else { + prims.recv(thisOutput+offset, nelem); } } } @@ -417,60 +436,7 @@ class ncclFunction class ncclFunction { public: - __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) { - const int tid = threadIdx.x; - const int nthreads = args->nThreads; - const int bid = args->coll.bid; - const int nChannels = args->coll.nChannels; - struct ncclDevComm* comm = args->comm; - struct ncclChannel* channel = comm->channels+blockIdx.x; - struct ncclTree* tree = &channel->collTree; - const int stepLines = comm->buffSizes[NCCL_PROTO_LL] / (sizeof(union ncclLLFifoLine)*NCCL_STEPS); - ssize_t chunkSize = stepLines * sizeof(uint64_t) / sizeof(T); - const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T); - const ssize_t loopSize = nChannels*chunkSize; - const ssize_t size = args->coll.count; - - if (loopSize > size) { - chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize; - } - - // Compute pointers - const T * __restrict__ thisInput = (const T*)args->sendbuff; - T * __restrict__ thisOutput = (T*)args->recvbuff; - - if (blockIdx.x < nChannels) { // first half of the channels do reduce - ncclLLPrimitives LLprims(tid, nthreads, tree->down, &tree->up, stepLines, channel, comm); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - // Up - ssize_t offset = gridOffset + bid*chunkSize; - int nelem = min(chunkSize, size-offset); - if (tree->up == -1) { - LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem); - } else if (tree->down[0] == -1) { - LLprims.send(thisInput+offset, nelem); - } else { - LLprims.recvReduceSend(thisInput+offset, nelem); - } - } - } - - if (blockIdx.x >= nChannels && blockIdx.x < 2*nChannels) { // second half of the channels do broadcast - ncclLLPrimitives LLprims(tid, nthreads, &tree->up, tree->down, stepLines, channel, comm); - for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) { - // Down - ssize_t offset = gridOffset + bid*chunkSize; - int nelem = min(chunkSize, size-offset); - if (tree->up == -1) { - LLprims.send(thisOutput+offset, nelem); - } else if (tree->down[0] == -1) { - LLprims.recv(thisOutput+offset, nelem); - } else { - LLprims.recvCopySend(thisOutput+offset, nelem); - } - } - } - } + __device__ void run(struct ncclWorkElem* args) { } }; #include "prims_ll128.h" diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index b673f034d9..539544e36a 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 6fd5ea28ae..980989ab62 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -181,9 +181,8 @@ static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) for (int o = tid; o < (size/sizeof(int)); o += blockDim.x) d[o] = s[o]; } -static __device__ bool load_coll(struct ncclWork* localWork, struct ncclWork* hostWork, int tid, struct ncclDevComm* comm, uint32_t* abortCount) { - __syncthreads(); - load_parallel(localWork, hostWork, sizeof(struct ncclWork), tid); +static __device__ bool load_coll(struct ncclWork* localWork, struct ncclWork *hostWork, struct ncclWork* workFifo, int tid, struct ncclDevComm* comm, uint32_t* abortCount) { + load_parallel(localWork, workFifo, sizeof(struct ncclWork), tid); // Check whether the last operation was aborted and make sure all threads exit int abort = tid == 0 ? LOAD(comm->abortFlag) : 0; exitIfAbortBarrier(abort, abortCount); @@ -201,7 +200,7 @@ class ncclFunction { #define traceColl(fIdx) \ uint32_t pos = __atomic_fetch_add(comm->collTraceTail, 1, __ATOMIC_SEQ_CST)%COLLTRACE_NUM_ITEMS; \ comm->collTrace[pos].timeStamp = __builtin_amdgcn_s_memrealtime(); \ - comm->collTrace[pos].opCount = w->opCount; \ + comm->collTrace[pos].opCount = w->op.opCount; \ comm->collTrace[pos].bid = bid; \ comm->collTrace[pos].funcIndex = fIdx; \ if (fIdx == FUNC_INDEX_P2P) { \ @@ -246,8 +245,8 @@ class ncclFunction { #define MAXWARPS (NCCL_MAX_NTHREADS/WARP_SIZE) struct ncclShmemPtrs { - void* srcs[NCCL_MAX_DEV_ARITY+1]; - void* dsts[NCCL_MAX_DEV_ARITY+1]; + void* srcs[NCCL_MAX_DIRECT_ARITY+1]; + void* dsts[NCCL_MAX_DIRECT_ARITY+1]; uint64_t barrier; uint64_t barrier_next[MAXWARPS]; }; @@ -287,7 +286,6 @@ __device__ void ncclKernel(struct ncclWorkElem first) { struct ncclDevComm* comm = first.comm; struct ncclChannel* channel = comm->channels+bid; struct ncclWorkElem* w = NULL; - uint16_t index = first.index; bool firstLaunch = true; if (bid == 0 && first.funcIndex != FUNC_INDEX_P2P) w = &first; @@ -295,7 +293,8 @@ __device__ void ncclKernel(struct ncclWorkElem first) { while (1) { if (w == NULL) { w = shmem.localWork.elems; - if (!load_coll(&shmem.localWork, channel->workFifo+index, tid, comm, &abortCount)) { + __syncthreads(); + if (!load_coll(&shmem.localWork, channel->workFifo+channel->index, channel->workFifoDev+channel->index, tid, comm, &abortCount)) { if (COLLTRACE && tid == 0) traceAbort(0xffff); return; } @@ -315,7 +314,7 @@ __device__ void ncclKernel(struct ncclWorkElem first) { NCCL_CALL_FUNCTIONS(w); } } - index = (index+1) % NCCL_MAX_OPS; + if (tid == 0) channel->index = (channel->index+1) % NCCL_MAX_OPS; if (w->active == 2) { if (COLLTRACE && tid == 0) traceCollEnd(0xffff); return; diff --git a/src/collectives/device/common_kernel.h b/src/collectives/device/common_kernel.h index 30c9619887..8117dad8bf 100644 --- a/src/collectives/device/common_kernel.h +++ b/src/collectives/device/common_kernel.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/functions.cu b/src/collectives/device/functions.cu index 9ff3ee35fa..1689b3df09 100644 --- a/src/collectives/device/functions.cu +++ b/src/collectives/device/functions.cu @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/op128.h b/src/collectives/device/op128.h index 1db4a18439..23abb143af 100644 --- a/src/collectives/device/op128.h +++ b/src/collectives/device/op128.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/primitives.h b/src/collectives/device/primitives.h index ad5593c20c..88bead68a0 100644 --- a/src/collectives/device/primitives.h +++ b/src/collectives/device/primitives.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -71,11 +71,11 @@ class ncclPrimitives { int peer = -1; int role = 0; int group; - const int p2p; uint64_t step; T* direct = NULL; T* buff; struct ncclDevComm* comm; + const int p2pNet; const T** srcs; T** dsts; @@ -130,7 +130,7 @@ class ncclPrimitives { STORE(connSizesFifoPtr+step%NCCL_STEPS, nbytes); } - if (connPtrsFifoPtr) dsts[DST+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS]; + if (connPtrsFifoPtr) dsts[DST+index] = (T *)LOAD(connPtrsFifoPtr+step%NCCL_STEPS); else dsts[DST+index] = directPtr(directOffset); step += SLICESTEPS; } @@ -148,7 +148,7 @@ class ncclPrimitives { #ifdef ENABLE_PROFILING if (tid == 0) __atomic_fetch_add(&comm->devProf->wait_recv_cycle[blockIdx.x], __builtin_amdgcn_s_memrealtime() - t0, __ATOMIC_SEQ_CST); #endif - if (connPtrsFifoPtr) srcs[SRC+index] = ((T **)connPtrsFifoPtr)[step%NCCL_STEPS]; + if (connPtrsFifoPtr) srcs[SRC+index] = (const T *)LOAD(connPtrsFifoPtr+step%NCCL_STEPS); else srcs[SRC+index] = directPtr(directOffset); step += SLICESTEPS; } @@ -197,9 +197,58 @@ class ncclPrimitives { } } + // Scatter and gather do not support DIRECT + template + inline __device__ void + ScatterGatherOp(const T* srcPtr, T* dstPtr, int totalElem, int peerElem, int skip, int shift) { + int offset = 0; // slice offset + int sliceSize = stepSize*SLICESTEPS; + int dataSize = max(DIVUP(peerElem, 16*SLICESPERCHUNK)*16, sliceSize/32); // per-peer slice size + + #pragma unroll + for (int slice=0; slice(0); + // realSize is not accurate here; but intra-node does not rely on sizes FIFO + if (SEND && (role & ROLE_WAIT_SEND)) waitSend<0, 0>(0, realSize*sizeof(T)); + subBarrier(); + if (SEND) { + #pragma unroll 1 + for (int j=0; j=0 && i >= skip) peerOffset += peerElem; + const T* src0 = srcPtr + peerOffset; + int realPeerSize = min(realSize, totalElem-peerOffset); + if (realPeerSize > 0) ReduceOrCopyMulti(tid, nworkers, 1, &src0, 1, dsts+i, realPeerSize); + } + } else if (RECV) { + #pragma unroll 1 + for (int j=0; j= 0 && i >= skip) peerOffset += peerElem; + T* dst0 = dstPtr + peerOffset; + int realPeerSize = min(realSize, totalElem-peerOffset); + if (realPeerSize > 0) ReduceOrCopyMulti(tid, nworkers, 1, srcs+i, 1, &dst0, realPeerSize); + } + } + } + barrier(); + if (SEND && (role & ROLE_POST_SEND) && realSize > 0 && index == 0) __threadfence_system(); + __syncwarp(); + if (SEND && (role & ROLE_POST_SEND)) postSend(); + if (RECV && (role & ROLE_POST_RECV)) postRecv(); + offset += realSize; + } + } + __device__ __forceinline__ void loadRecvConn(struct ncclChannel* channel, T* directBuff) { if (role & (ROLE_WAIT_RECV|ROLE_POST_RECV)) { - conn = (LOAD(comm->p2pNet) && p2p) ? &channel->devPeers[peer].p2pRecv.conn : &channel->devPeers[peer].recv.conn; + // For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1 + const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 : (((p2pNet && (NSEND+NRECV) == 1)) ? NCCL_CONN_IDX_P2P_NET : 0); + conn = &channel->devPeers[peer].recv[connIndex].conn; step = conn->step; step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS); if (role & ROLE_POST_RECV) { @@ -222,7 +271,9 @@ class ncclPrimitives { __device__ __forceinline__ void loadSendConn(struct ncclChannel* channel) { if (role & (ROLE_WAIT_SEND|ROLE_POST_SEND)) { - conn = (LOAD(comm->p2pNet) && p2p) ? &channel->devPeers[peer].p2pSend.conn : &channel->devPeers[peer].send.conn; + // For oneshot: groups 0,1 use conn 0, groups 2,3 use conn 1 + const int connIndex = (NSEND == NCCL_MAX_DIRECT_ARITY || NRECV == NCCL_MAX_DIRECT_ARITY) ? group/2 : (((p2pNet && (NSEND+NRECV) == 1)) ? NCCL_CONN_IDX_P2P_NET : 0); + conn = &channel->devPeers[peer].send[connIndex].conn; step = conn->step; step = ROUNDUP(step, SLICESPERCHUNK*SLICESTEPS); if (role & ROLE_POST_SEND) { @@ -230,11 +281,13 @@ class ncclPrimitives { } if (role & ROLE_WAIT_SEND) { buff = (T*)conn->buffs[NCCL_PROTO_SIMPLE]; - //if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) { - // void* volatile* ptr = conn->ptrExchange; - // while ((direct = (T*)(*ptr)) == NULL); - // *ptr = NULL; - //} +#if 0 + if (DIRECT && (conn->direct & NCCL_DIRECT_GPU)) { + void* volatile* ptr = conn->ptrExchange; + while ((direct = (T*)(*ptr)) == NULL) { if (checkAbort()) break; } + *ptr = NULL; + } +#endif connHeadPtr = conn->head; connHeadCache = LOAD(connHeadPtr); connSizesFifoPtr = conn->sizesFifo; @@ -252,8 +305,8 @@ class ncclPrimitives { public: __device__ __forceinline__ - ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group, int p2p = 0) - : comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next), p2p(p2p) { + ncclPrimitives(const int tid, const int nworkers, int* recvPeers, int* sendPeers, T* directBuff, int stepSize, struct ncclChannel* channel, struct ncclDevComm* comm, struct ncclShmemPtrs* ptrs, int group) + : comm(comm), tid(tid), nworkers(nworkers), stepSize(stepSize), srcs((const T**)ptrs[group].srcs), dsts((T**)ptrs[group].dsts), group(group), barriers(&ptrs[group].barrier), barrier_next(ptrs[group].barrier_next), p2pNet(*comm->p2pNet) { nthreads = nworkers; // For send operations, we need an extra warp to overlap the threadfence and the copy // int postThreads = NSEND && nworkers >= 64 ? WARP_SIZE : 0; @@ -347,6 +400,16 @@ class ncclPrimitives { GenericOp<0, 1, 1, 1, 1, 1>(src, dst, nelem, directOffset); } + __device__ __forceinline__ void + scatter(const T* src, int totalElem, int peerElem, int skip, int shift) { + ScatterGatherOp<0, 1>(src, NULL, totalElem, peerElem, skip, shift); + } + + __device__ __forceinline__ void + gather(T* dst, int totalElem, int peerElem, int skip, int shift) { + ScatterGatherOp<1, 0>(NULL, dst, totalElem, peerElem, skip, shift); + } + __device__ __forceinline__ ~ncclPrimitives() { // Save steps for the next operation saveSync(); diff --git a/src/collectives/device/prims_ll.h b/src/collectives/device/prims_ll.h index ce6ea697ee..87850d877c 100644 --- a/src/collectives/device/prims_ll.h +++ b/src/collectives/device/prims_ll.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -228,8 +228,9 @@ class ncclLLPrimitives { // Make sure step is updated before we read it. barrier(); - for (int i=0; i= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); - for (int i=0; i= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); + // If we are going to support oneshot collNet + LL, then we would need to add connector index here + for (int i=0; i= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv->conn, i); + for (int i=0; i= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send->conn, i); loadRecvSync(); loadSendSync(); } diff --git a/src/collectives/device/prims_ll128.h b/src/collectives/device/prims_ll128.h index 212832c9ef..b5a4cc8165 100644 --- a/src/collectives/device/prims_ll128.h +++ b/src/collectives/device/prims_ll128.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -394,8 +394,8 @@ class ncclLL128Primitives { // Make sure step is updated before we read it. barrier(); - for (int i=0; i= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv.conn, i); - for (int i=0; i= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send.conn, i); + for (int i=0; i= 0; i++) loadRecvConn(&channel->devPeers[recvPeers[i]].recv->conn, i); + for (int i=0; i= 0; i++) loadSendConn(&channel->devPeers[sendPeers[i]].send->conn, i); loadRecvSync(); loadSendSync(); } diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 0412258be7..29441326a0 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/reduce_kernel.h b/src/collectives/device/reduce_kernel.h index 7de8248c4a..9a15d12e75 100644 --- a/src/collectives/device/reduce_kernel.h +++ b/src/collectives/device/reduce_kernel.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index a765f26499..b11f236307 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index abc6368ae3..2471a43501 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -1,5 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -49,14 +50,14 @@ class ncclFunctionchannels+blockIdx.x; const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(sizeof(T)*NCCL_STEPS); - const int chunkSize = stepSize/SENDRECV_SLICEFACTOR; int nThreadsSplit = nThreads/2; if ((tid < nThreadsSplit) && recvCount >= 0) { + const int chunkSize = args->p2p.recvChunkSize/sizeof(T); int peer = (comm->rank-delta+comm->nRanks)%comm->nRanks; int nt = nThreadsSplit; ncclPrimitives - prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv, 1); + prims(tid, nt, &peer, NULL, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupRecv); if (recvCount == 0) { prims.recv(recvbuff, 0); @@ -68,10 +69,11 @@ class ncclFunction= nThreadsSplit) && sendCount >= 0) { + const int chunkSize = args->p2p.sendChunkSize/sizeof(T); int peer = (comm->rank+delta)%comm->nRanks; int nt = nThreads-nThreadsSplit; ncclPrimitives - prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend, 1); + prims(tid-nThreadsSplit, nt, NULL, &peer, recvbuff, stepSize, channel, comm, ncclShmem->ptrs, groupSend); if (sendCount == 0) { prims.send(sendbuff, 0); diff --git a/src/collectives/gather_api.cc b/src/collectives/gather_api.cc index 8b53538933..e6d227fe1b 100644 --- a/src/collectives/gather_api.cc +++ b/src/collectives/gather_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/reduce_api.cc b/src/collectives/reduce_api.cc index 2dc599e6bb..9735342b30 100644 --- a/src/collectives/reduce_api.cc +++ b/src/collectives/reduce_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/reduce_scatter_api.cc b/src/collectives/reduce_scatter_api.cc index 35c93342dc..e64e621ab9 100644 --- a/src/collectives/reduce_scatter_api.cc +++ b/src/collectives/reduce_scatter_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/scatter_api.cc b/src/collectives/scatter_api.cc index 2a0bc20df4..30ae12b810 100644 --- a/src/collectives/scatter_api.cc +++ b/src/collectives/scatter_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/collectives/sendrecv_api.cc b/src/collectives/sendrecv_api.cc index b85e11d96c..296144dff1 100644 --- a/src/collectives/sendrecv_api.cc +++ b/src/collectives/sendrecv_api.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/debug.cc b/src/debug.cc index e30c1406ca..76a84649ab 100644 --- a/src/debug.cc +++ b/src/debug.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/enqueue.cc b/src/enqueue.cc index b5365a492f..b9a6107fa6 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -11,6 +11,7 @@ #include "graph/topo.h" #include #include +#include "gdrwrap.h" // Only generate inline kernels for LL #define NCCL_FUNC5(func, algo, redop, dtype) \ @@ -65,6 +66,21 @@ static ncclKern_t const ncclKerns[1] = { NCCL_KERN_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), }; +// Determine the maximum kernel stack size of all CUDA kernels +size_t ncclKernMaxLocalSize() { + ncclResult_t res = ncclSuccess; + int numNcclKerns = sizeof(ncclKerns)/sizeof(ncclKerns[0]); + hipFuncAttributes attr = {0}; + size_t max = 0; + for (int i = 0; i < numNcclKerns; i++) { + CUDACHECKGOTO(hipFuncGetAttributes(&attr, (const void*)(ncclKerns[i])), res, error); + if (attr.localSizeBytes > max) max = attr.localSizeBytes; + } + +error: + return (res != ncclSuccess) ? 0 : max; +} + /*****************************************************************************/ /* Launch system : synchronization and CUDA kernel launch */ /*****************************************************************************/ @@ -108,14 +124,23 @@ static ncclResult_t getNextOp(struct ncclChannel* channel, struct ncclWork** wor return ncclSuccess; } -static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) { +static ncclResult_t setupLaunch(struct ncclQueueInfo* eqInfo, int usingCudaGraph) { + ncclComm_t comm = eqInfo->comm; + hipLaunchParams* params = comm->myParams; + // Only launch blocks where we have work to do. - for (int c=0; cnChannels, comm->p2pnChannels); c++) { - if (comm->channels[c].workCount) params->gridDim.x = c+1; + // This is not supported when we are in cudaGraph mode. + // Because in cudaGraph mode the launch param needs to be determined + // at capture time instead of launch time. + if (!usingCudaGraph) { + for (int c=0; cnChannels, comm->p2pnChannels); c++) { + if (comm->channels[c].workCount) params->gridDim.x = c+1; + } + eqInfo->maxChannels = params->gridDim.x; } // Set active = 2 for the last operation and add a no-op on empty channels (p2p case). - for (int c=0; cgridDim.x; c++) { + for (int c=0; cmaxChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (channel->workCount == 0) { struct ncclWork* w; @@ -126,22 +151,39 @@ static ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) e->p2p.nThreads = 0; } STORE(&channel->workFifo[(channel->workFifoTail-1)%NCCL_MAX_OPS].elems[0].active, 2); + + { // [RCCL] Wait for any clique-based collectives + NCCLCHECK(comm->cliqueManager->WaitForPointers()); + } // [/RCCL] + + if (c == 0) { + // Find the first operation, choose the kernel accordingly and pass it as the first argument. + // Note that changing cuda launch argument after capture is not supported by cudaGraph + struct ncclWork* work = channel->workFifo+((channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS); + struct ncclWorkElem* elem = work->elems; + if (!usingCudaGraph) { + params->func = (void *)ncclKerns[0]; + memcpy(&comm->args, elem, sizeof(struct ncclWorkElem)); + } + // As we inline the first coll directly, we can free it immediately. + if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0; + } + + if (channel->gdrMemDesc) { + // GDRCOPY support + uint64_t first = (channel->workFifoTail-channel->workCount)%NCCL_MAX_OPS; + uint64_t nelems = channel->workCount; + TRACE(NCCL_INIT, "GDRCOPY : copy workFifo %p to %p first %ld nelems %zi", + channel->workFifo, channel->workFifoGdr, first, nelems); + + for (int i = 0; i < nelems; i++) { + int elem = (first+i) % NCCL_MAX_OPS; + // Copy Host workFifo to CUDA workFifo via the GDRCOPY mapping + NCCLCHECK(ncclGdrCudaCopy(channel->gdrMemDesc, channel->workFifoGdr+elem, channel->workFifo+elem, 1)); + } + } } - { // [RCCL] Wait for any clique-based collectives - NCCLCHECK(comm->cliqueManager->WaitForPointers()); - } // [/RCCL] - - // Find the first operation, choose the kernel accordingly and pass it - // as the first argument. - struct ncclChannel* c0 = comm->channels; - struct ncclWork* work = c0->workFifo+((c0->workFifoTail-c0->workCount)%NCCL_MAX_OPS); - struct ncclWorkElem* elem = work->elems; - memcpy(&comm->args, elem, sizeof(struct ncclWorkElem)); - // As we inline the first coll directly, we can free it immediately. - if (elem->funcIndex != FUNC_INDEX_P2P) elem->active = 0; - - params->func = (void *)ncclKerns[0]; return ncclSuccess; } @@ -184,21 +226,23 @@ ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm) { return ncclSuccess; } -ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { +ncclResult_t ncclLaunchBarrier(struct ncclComm* comm) { hipLaunchParams* params = comm->myParams; if (params->gridDim.x == 0) return ncclSuccess; - NCCLCHECK(setupLaunch(comm, params)); - // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { + if (comm->launchMode == ncclComm::GROUP && + (comm->groupCudaStream || + comm->userStream == hipStreamDefault/* || + comm->userStream == hipStreamLegacy || + comm->userStream == hipStreamPerThread*/)) { // Enqueue event in user stream - CUDACHECK(hipEventRecord(comm->doneEvent, comm->userStream)); + CUDACHECK(hipEventRecord(comm->intDoneEvent, comm->userStream)); // Create dependency between user stream and internal NCCL stream - CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->doneEvent, 0)); + CUDACHECK(hipStreamWaitEvent(comm->groupStream, comm->intDoneEvent, 0)); params->stream = comm->groupStream; } else { - if (comm->userStream != params->stream) { + if (comm->userStream != params->stream && !comm->usingCudaGraph) { // Stream changed from last call, create dependency against last NCCL kernel launch CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); } @@ -217,7 +261,7 @@ ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm) { return ncclSuccess; } -ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { +ncclResult_t ncclLaunchKernel(ncclComm_t comm) { hipLaunchParams *params = comm->myParams; if (params->gridDim.x == 0) return ncclSuccess; @@ -230,51 +274,80 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) { (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : ""); } - hipEvent_t startEvent; - hipEvent_t stopEvent; - if (comm->launchMode == ncclComm::PARALLEL) { - hipLaunchKernelGGL(((void (*)(struct ncclWorkElem))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclWorkElem**)params->args)); - } else { + if (comm->launchMode == ncclComm::GROUP) { NCCLCHECK(ncclCpuBarrierOut(comm)); + } else { + CUDACHECK(hipLaunchKernel(params->func, params->gridDim, params->blockDim, params->args, params->sharedMem, params->stream)); } + return ncclSuccess; +} + +static ncclResult_t ncclLaunchProxy(struct ncclQueueInfo* eqInfo) { // Start the network proxies as soon as the kernel has been launched. We can't // perform any CUDA call between the two or having a cudaFree between the CUDA // launch and the ncclProxyStart call could cause a deadlock. // Also, starting the proxies after the CUDA launch seems to be better for // performance (latency). - uint64_t max = 0ULL; - for (int r=0; rgridDim.x; r++) { + ncclComm_t comm = eqInfo->comm; + if (eqInfo->maxChannels == 0) return ncclSuccess; + + for (int r=0; rmaxChannels; r++) { struct ncclChannel* channel = comm->channels+r; - max = std::max(max, channel->workFifoTail); channel->workCount = 0; } - for (int r=0; rnChannels, comm->p2pnChannels); r++) { - struct ncclChannel* channel = comm->channels+r; - channel->workFifoTail = max; - } - params->gridDim.x = params->blockDim.x = 0; - comm->lastOpCount = max; + comm->lastChannel = 0; NCCLCHECK(ncclProxyStart(comm)); return ncclSuccess; } -ncclResult_t ncclEnqueueEvents(ncclComm_t comm) { +ncclResult_t ncclRecordEvents(ncclComm_t comm) { hipLaunchParams *params = comm->myParams; - // Enqueue event after NCCL kernel - CUDACHECK(hipEventRecord(comm->doneEvent, params->stream)); + + // Enqueue event after NCCL kernel (only in non-graph mode) + if (!comm->usingCudaGraph) CUDACHECK(hipEventRecord(comm->doneEvent, params->stream)); // Use internal NCCL stream for CGMD/GROUP launch if required or if the user stream is NULL - if (comm->launchMode == ncclComm::GROUP && (comm->groupCudaStream || comm->userStream == NULL)) { + if (comm->launchMode == ncclComm::GROUP && + (comm->groupCudaStream || + comm->userStream == hipStreamDefault/* || + comm->userStream == hipStreamLegacy || + comm->userStream == hipStreamPerThread*/)) { + CUDACHECK(hipEventRecord(comm->intDoneEvent, params->stream)); // Create dependency between NCCL internal stream and user stream - CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->doneEvent, 0)); + CUDACHECK(hipStreamWaitEvent(comm->userStream, comm->intDoneEvent, 0)); } + return ncclSuccess; +} + +ncclResult_t ncclLaunchReset(ncclComm_t comm) { comm->userStreamSet = false; + + // We are finishing capture of the current launch + // But we need to keep the current enqueue info for CUDA graph + // Thus we need to creating a new enqueue info for the next run + if (comm->usingCudaGraph) { + NCCLCHECK(ncclCalloc(&comm->enqueueInfo, 1)); + comm->enqueueInfo->comm = comm; + } else { + // If not in CUDA graph mode, we reuse the same info space + NCCLCHECK(ncclResetQueueInfo(comm->enqueueInfo)); + } + + hipLaunchParams *params = comm->myParams; + params->gridDim.x = params->blockDim.x = 0; + params->func = NULL; + + // Reset launch mode to GROUP if changed + if (comm->launchMode == ncclComm::GROUP_GRAPH) comm->launchMode = ncclComm::GROUP; + comm->usingCudaGraph = 0; + return ncclSuccess; } /*****************************************************************************/ /* Enqueueing system : computation of kernel and proxy operations parameters */ /*****************************************************************************/ +RCCL_PARAM(SharpThreshold, "SHARP_THRESHOLD", 16384); static ncclResult_t getAlgoInfo(struct ncclInfo* info) { struct ncclComm* comm = info->comm; @@ -283,14 +356,13 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) { info->algorithm = -1; info->protocol = -1; int nAlgos = NCCL_NUM_ALGORITHMS; - #define SHARP_COLL_SAT_THRESHOLD 16384 // Check collNet support int collNetTypeSupport = 0; - if (info->comm->collNetSupport && info->nBytes < SHARP_COLL_SAT_THRESHOLD*comm->collNetnChannels/2) + if (info->comm->collNetSupport > 0 && info->nBytes < rcclParamSharpThreshold()) NCCLCHECK(collNetReduceSupport(info->datatype, info->op, &collNetTypeSupport)); - if (collNetTypeSupport != 1) nAlgos--; for (int a=0; arank == 0) INFO(NCCL_TUNING, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); TRACE(NCCL_COLL, "%ld Bytes -> Algo %d proto %d time %f", info->nBytes, info->algorithm, info->protocol, minTime); - int nc = (info->nChannels > 0) ? info->nChannels : - (info->algorithm == NCCL_ALGO_COLLNET) ? comm->collNetnChannels/2 : comm->nChannels; // CollNet uses one channel for up and one channel for down + int nc = (info->nChannels > 0) ? info->nChannels : comm->nChannels; int nt = comm->maxThreads[info->algorithm][info->protocol]; int threadThreshold = comm->threadThresholds[info->algorithm][info->protocol]; - while (info->nBytes < nc*nt*threadThreshold) { - if (info->algorithm != NCCL_ALGO_COLLNET && nc >= 2) nc--; + if (info->algorithm == NCCL_ALGO_COLLNET) { + int ncSwitch = 16; + bool flag = true; + while (ncSwitch >= 1 && flag) { + while ((flag = info->nBytes < nc*nt*info->comm->channels[0].collTree.nHeads*threadThreshold) && nc > ncSwitch) { + if (nc == ncSwitch+ncSwitch/2) threadThreshold /= 2; + nc--; + } + ncSwitch /= 2; + } + } else { + while (info->nBytes < nc*nt*threadThreshold) { + if (nc >= 2) nc--; #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) - // do not reduce threads count on VEGA + // do not reduce threads count on VEGA #else - else if ((nt % 128) == 0) nt/=2; + else if ((nt % 128) == 0) nt/=2; #endif - else break; + else break; + } } #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) #else - if (info->protocol == NCCL_PROTO_SIMPLE) nt += WARP_SIZE; // Extra warp for sync - if (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE; + if (info->protocol == NCCL_PROTO_SIMPLE) { + nt += WARP_SIZE; // Extra warp for sync + if (info->algorithm == NCCL_ALGO_TREE) nt += WARP_SIZE; + if (info->algorithm == NCCL_ALGO_COLLNET) nt += 3*WARP_SIZE; + } #endif info->nChannels = nc; info->nThreads = nt; @@ -341,7 +427,7 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) { case ncclFuncAllGather: info->pattern = ncclPatternRing; break; case ncclFuncAllReduce: - info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break; + info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUpDown : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break; default: WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm); return ncclInternalError; @@ -356,9 +442,9 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { case ncclPatternTreeUpDown: case ncclPatternPipelineFrom: case ncclPatternPipelineTo: - case ncclPatternCollTreeUp: - case ncclPatternCollTreeDown: info->nstepsPerLoop = info-> nchunksPerLoop = 1; break; + case ncclPatternCollTreeUpDown: + info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->channels[0].collTree.nHeads; break; case ncclPatternRing: info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break; case ncclPatternRingTwice: @@ -378,7 +464,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo NCCLCHECK(getPatternInfo(info)); NCCLCHECK(getLoopInfo(info)); - work->opCount = info->comm->opCount; + work->op.opCount = info->comm->collOpCount; work->sendbuff = info->sendbuff; work->recvbuff = info->recvbuff; work->coll.root = info->root; @@ -432,9 +518,10 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->algorithm == NCCL_ALGO_COLLNET && info->protocol == NCCL_PROTO_SIMPLE) { // Optimize chunkSize / nSteps - while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2; - while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth*4 && chunkSize > 65536) chunkSize /= 2; - while (info->nBytes / (info->nChannels*chunkSize) < info->comm->channels[0].collTree.depth && chunkSize > 32768) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*32 && chunkSize > 262144) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*16 && chunkSize > 131072) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth*8 && chunkSize > 32768) chunkSize /= 2; + while (info->nBytes / (info->nChannels*info->comm->channels[0].collTree.nHeads*chunkSize) < info->comm->channels[0].collTree.depth/2 && chunkSize > 16384) chunkSize /= 2; // Use lastChunkSize as chunkSize work->coll.lastChunkSize = chunkSize / ncclTypeSize(info->datatype); } else if (info->protocol == NCCL_PROTO_LL) { @@ -459,20 +546,23 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo if (info->protocol == NCCL_PROTO_LL128) chunkEffectiveSize = (chunkSize / NCCL_LL128_LINEELEMS) * NCCL_LL128_DATAELEMS; //if (info->comm->rank == 0) printf("Coll %d, size %ld -> %dx%d, chunkSize %d (algo %d proto%d)\n", info->coll, info->nBytes, info->nChannels, info->nThreads, chunkSize, info->algorithm, info->protocol); int nLoops = (int)(DIVUP(info->nBytes, (((size_t)(info->nChannels))*info->nchunksPerLoop*chunkEffectiveSize))); - proxyArgs->nsteps = info->nstepsPerLoop * nLoops * chunkSteps; + proxyArgs->subs[0].nsteps = info->nstepsPerLoop * nLoops * chunkSteps; proxyArgs->sliceSteps = sliceSteps; proxyArgs->chunkSteps = chunkSteps; + proxyArgs->chunkSize = chunkSize; proxyArgs->protocol = info->protocol; proxyArgs->dtype = info->datatype; - proxyArgs->redOp = info->op; + proxyArgs->redOp = (info->algorithm == NCCL_ALGO_COLLNET) ? info->op : ncclNumOps; // Only set redOp when using CollNet + proxyArgs->pattern = info->pattern; + proxyArgs->root = info->root; // This is used by P2P to reduce the receive buffer size. We don't use it in collectives // because some protocols need to transmit more than the total size, plus they sometimes // round up - proxyArgs->recvbytes = stepSize*proxyArgs->sliceSteps; + proxyArgs->subs[0].recvbytes = stepSize*proxyArgs->sliceSteps; - TRACE(NCCL_NET,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d comm %p", - proxyArgs->opCount, proxyArgs->sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, - nLoops, proxyArgs->nsteps, info->comm); + TRACE(NCCL_COLL,"opCount %lx slicesteps %d spl %d cpl %d nbytes %zi -> protocol %d nchannels %d nthreads %d, nloops %d nsteps %d chunksize %d comm %p", + proxyArgs->opCount, sliceSteps, info->nstepsPerLoop, info->nchunksPerLoop, info->nBytes, info->protocol, info->nChannels, info->nThreads, + nLoops, proxyArgs->subs[0].nsteps, chunkSize, info->comm); return ncclSuccess; } @@ -487,72 +577,100 @@ static ncclResult_t checkSetStream(struct ncclInfo* info) { return ncclSuccess; } -ncclResult_t ncclSaveKernel(struct ncclInfo* info) { - if (info->comm->nRanks == 1) { +// Compute enqueue element, save it in list +// Compute CUDA launch parameters +// Capture time code in view of CUDA graph +static ncclResult_t ncclSetupCollKernel(struct ncclInfo* info) { + ncclComm_t comm = info->comm; + if (comm->nRanks == 1) { if (info->sendbuff != info->recvbuff) CUDACHECK(hipMemcpyAsync(info->recvbuff, info->sendbuff, info->nBytes, hipMemcpyDeviceToDevice, info->stream)); return ncclSuccess; } - struct ncclWorkElem work; - struct ncclProxyArgs proxyArgs; - memset(&proxyArgs, 0, sizeof(struct ncclProxyArgs)); - NCCLCHECK(computeColl(info, &work, &proxyArgs)); + // Compute cuda kernel arg and proxy arg templates + struct ncclQueueElem* eqElem; + NCCLCHECK(ncclAddQueueElem(comm->enqueueInfo, &eqElem)); + struct ncclWorkElem* work = &eqElem->work; + eqElem->proxyArgs.nsubs = 1; + NCCLCHECK(computeColl(info, work, &eqElem->proxyArgs)); - info->comm->myParams->blockDim.x = std::max(info->comm->myParams->blockDim.x, info->nThreads); + // Determine grid size + hipLaunchParams* params = comm->myParams; + params->gridDim.x += info->nChannels; + params->gridDim.x = std::min(params->gridDim.x, comm->nChannels); + params->blockDim.x = std::max(params->blockDim.x, info->nThreads); + comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here - int nChannels = work.coll.nChannels; - int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1; + // Inline the first kernel + if (params->func == NULL) { + params->func = (void *)ncclKerns[0]; + memcpy(&comm->args, work, sizeof(struct ncclWorkElem)); + comm->args.coll.bid = 0; // Only inline for channel 0 + comm->args.active = 2; // I am so far the last element; may be changed later in aggregation mode + } - for (int bid=0; bidcomm->myParams->gridDim.x % info->comm->nChannels; - struct ncclChannel* channel = info->comm->channels+channelId; + return ncclSuccess; +} + +// Dynamic enqueue code +static ncclResult_t ncclEnqueueCollKernel(ncclComm_t comm, struct ncclQueueElem* eqElem) { + struct ncclWorkElem* work = &eqElem->work; + struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs; + + int nChannels = work->coll.nChannels; + for (int bid=0; bidlastChannel % comm->nChannels; + struct ncclChannel* channel = comm->channels+channelId; // Proxy - proxyArgs.channel = channel; - // Adjust pattern for CollNet based on channel index - if (nSubChannels == 2) { - info->pattern = (channelId < info->comm->collNetnChannels/nSubChannels) ? ncclPatternCollTreeUp : ncclPatternCollTreeDown; - } + proxyArgs->subs[0].channel = channel; + proxyArgs->opCount = comm->collOpCount; + proxyArgs->commOpCount = comm->opCount; - if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks)); - - info->comm->myParams->gridDim.x++; + if (proxyArgs->subs[0].nsteps) NCCLCHECK(ncclProxySaveColl(proxyArgs, comm->nRanks)); + comm->lastChannel++; // [RCCL] Setup pointers to where all the input/output pointers will be - if (info->protocol == NCCL_PROTO_CLIQUE) { - NCCLCHECK(info->comm->cliqueManager->SetCliqueArgs(&work)); + if (proxyArgs->protocol == NCCL_PROTO_CLIQUE) { + NCCLCHECK(comm->cliqueManager->SetCliqueArgs(work)); } // [/RCCL] - - work.coll.bid = bid % nChannels; - NCCLCHECK(getNextOp(channel, NULL, &work)); + work->coll.bid = bid % nChannels; + NCCLCHECK(getNextOp(channel, NULL, work)); + //INFO(NCCL_COLL, "Host enqueue: bid %d channel %d index %ld nThreads %d funcIndex %d count %ld nChannels %d", + // work->coll.bid, channelId, channel->workFifoTail, work->nThreads, work->funcIndex, work->coll.count, work->coll.nChannels); } - info->comm->opCount++; + comm->collOpCount++; return ncclSuccess; } #define NCCL_MIN_CHANNEL_SIZE (NCCL_LL_THREAD_THRESHOLD*64) #define NCCL_AGG_CHANNEL_SIZE (1LL << 21) /* 2 MiB, ideal per-channel size to fully utilize bandwidth */ -ncclResult_t ncclSaveCommKernels(ncclComm_t comm) { +ncclResult_t ncclSetupAsyncKernels(ncclComm_t comm) { if (comm->asyncOpCount == 0) { return ncclSuccess; } else if (comm->asyncOpCount == 1) { // No aggregation struct ncclInfo* info = comm->asyncOps; info->nChannels = 0; - NCCLCHECK(ncclSaveKernel(info)); + NCCLCHECK(ncclSetupCollKernel(info)); } else { // Aggregation size_t channelSize = NCCL_AGG_CHANNEL_SIZE * comm->nRanks; // scale channel size based on nranks as latency increases // Reduce the per-channel size if we cannot fully utilize the channels while (comm->asyncTotalSize < channelSize * comm->nChannels && channelSize > NCCL_MIN_CHANNEL_SIZE) channelSize /= 2; + int channelUsed = 0; for (int c = 0; c < comm->asyncOpCount; c++) { struct ncclInfo* info = comm->asyncOps+c; info->nChannels = std::min((int)DIVUP(info->nBytes, channelSize), comm->nChannels); // assign number of channels - NCCLCHECK(ncclSaveKernel(info)); + channelUsed += info->nChannels; + NCCLCHECK(ncclSetupCollKernel(info)); } + // If we wrap around on channels, then the inlined op on channel 0 is not the last one on this channel + // Then we need to change active from 2 to 1 + if (channelUsed > comm->nChannels) comm->args.active = 1; } // Reset counters comm->asyncOpCount = 0; @@ -583,8 +701,7 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) { int delta = (comm->nRanks - (comm->rank-peer)) % comm->nRanks; for (int c=0; cp2pnChannelsPerPeer; c++) { int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if ((LOAD(comm->p2pNet) ? comm->channels[channelId].peers[peer].p2pSend.connected : - comm->channels[channelId].peers[peer].send.connected) == 0) { + if (comm->channels[channelId].peers[peer].send[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector comm->connectSend[peer] |= (1<connect = 1; } @@ -597,8 +714,7 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) { int delta = (comm->nRanks + (comm->rank-peer)) % comm->nRanks; for (int c=0; cp2pnChannelsPerPeer; c++) { int channelId = (delta+comm->p2pChannels[c]) % comm->p2pnChannels; - if ((LOAD(comm->p2pNet) ? comm->channels[channelId].peers[peer].p2pRecv.connected : - comm->channels[channelId].peers[peer].recv.connected ) == 0) { + if (comm->channels[channelId].peers[peer].recv[NCCL_CONN_IDX_P2P].connected == 0) { // P2P uses only 1 connector comm->connectRecv[peer] |= (1<connect = 1; } @@ -610,59 +726,169 @@ static ncclResult_t ncclSaveP2p(struct ncclInfo* info) { return ncclSuccess; } -static int getSegment(struct ncclInfo* info, struct ncclWork* work) { - const int e = (info->comm->topo->nodes[GPU].count == info->comm->topo->nRanks && (info->comm->topo->type & RCCL_TOPO_4P2H_ROME)) - ? 1 : NCCL_MAX_WORK_ELEMENTS; - for (int s=0; selems[s].p2p.delta != info->delta; s++) { +static int getSegment(int delta, struct ncclWork* work, int e) { + for (int s=0; selems[s].p2p.delta != delta; s++) { if (work->elems[s].p2p.nThreads == 0) return s; } return -1; } -static ncclResult_t saveP2pOp(struct ncclInfo* info /* input */, struct ncclWork* work, int s) { - struct ncclWorkElem* elem = work->elems+s; +static ncclResult_t computeP2pWorkElem(struct ncclInfo* info /* input */, struct ncclWorkElem* elem /* output */) { elem->comm = info->comm->devComm; elem->funcIndex = FUNC_INDEX_P2P; - elem->nThreads = info->nThreads = NCCL_MAX_NTHREADS; + elem->nThreads = NCCL_MAX_NTHREADS; elem->sendbuff = info->sendbuff; elem->recvbuff = info->recvbuff; - elem->opCount = info->comm->lastOpCount; + elem->op.opCount = info->comm->collOpCount; elem->p2p.sendCount = info->sendbytes; elem->p2p.recvCount = info->recvbytes; + elem->p2p.sendChunkSize = info->sendChunkSize; + elem->p2p.recvChunkSize = info->recvChunkSize; elem->p2p.delta = info->delta; + return ncclSuccess; +} + +static ncclResult_t enqueueP2pOp(struct ncclWorkElem* elem /* input */, struct ncclWork* work, int s) { + // Copy element into corresponding segment of ncclWork + memcpy(work->elems+s, elem, sizeof(struct ncclWorkElem)); + + // Determine nThreads at dynamic time const int nsegments = s+1; int nThreads = 512; while (nsegments*nThreads > 256) nThreads /= 2; //if (nThreads >= 128) nThreads += WARP_SIZE; for (int i=0; ielems[i].p2p.nThreads = nThreads; + return ncclSuccess; } -ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info) { - int channelId = info->channelId; - struct ncclChannel* channel = info->comm->channels+channelId; +ncclResult_t ncclEnqueueP2pKernel(struct ncclComm* comm, struct ncclQueueElem* eqElem) { + struct ncclWorkElem* workElem = &eqElem->work; + struct ncclProxyArgs* proxyArgs = &eqElem->proxyArgs; // Try to reuse last p2p operation if not full yet + struct ncclChannel* channel = proxyArgs->subs[0].channel; int opIndex = (channel->workFifoTail-1+NCCL_MAX_OPS)%NCCL_MAX_OPS; struct ncclWork* w = channel->workFifo+opIndex; int segment = -1; + const int e = ((comm->topo->nodes[GPU].count == comm->topo->nRanks) && (comm->topo->type & RCCL_TOPO_4P2H_ROME)) + ? 1 : NCCL_MAX_WORK_ELEMENTS; if (channel->workCount && w->elems[0].funcIndex == FUNC_INDEX_P2P && w->elems[NCCL_MAX_WORK_ELEMENTS-1].p2p.nThreads == 0) { // Try to pack more segments into a single operation - segment = getSegment(info, w); + segment = getSegment(workElem->p2p.delta, w, e); } if (segment == -1) { NCCLCHECK(getNextOp(channel, &w, NULL)); segment = 0; } - NCCLCHECK(ncclProxySaveP2p(info, channel, segment)); - NCCLCHECK(saveP2pOp(info, w, segment)); - info->comm->myParams->gridDim.x = std::max(info->comm->myParams->gridDim.x, channelId+1); - info->comm->myParams->blockDim.x = std::max(info->comm->myParams->blockDim.x, info->nThreads); - + // store work element into FIFO + NCCLCHECK(ncclProxySaveP2p(comm, proxyArgs)); + NCCLCHECK(enqueueP2pOp(workElem, w, segment)); + comm->collOpCount++; return ncclSuccess; } +ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info) { + ncclComm* comm = info->comm; + // Compute cuda kernel arg and proxy arg templates + struct ncclQueueElem* eqElem; + NCCLCHECK(ncclAddQueueElem(comm->enqueueInfo, &eqElem)); + // The proxy code will set and tune the send/recv chunk size, make sure to run it first. + NCCLCHECK(ncclProxyComputeP2p(info, &eqElem->proxyArgs)); + NCCLCHECK(computeP2pWorkElem(info, &eqElem->work)); + + int channelId = info->channelId; + hipLaunchParams* params = comm->myParams; + params->gridDim.x = std::max(params->gridDim.x, channelId+1); + params->blockDim.x = std::max(params->blockDim.x, eqElem->work.nThreads); + comm->enqueueInfo->maxChannels = params->gridDim.x; // params may be varied by a second graph hence we need to capture it here + + // Record the first kernel to launch + // Just for CUDA kernel to know this is a P2P operation + // The CUDA kernel does not use the inlined first work element as fastpath argument + if (params->func == NULL) { + params->func = (void *)ncclKerns[0]; + memcpy(&comm->args, &eqElem->work, sizeof(struct ncclWorkElem)); + } + return ncclSuccess; +} + +template +void HIPRT_CB ncclEnqueueHostSetup(void* arg) { + ncclResult_t ret; + struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)arg; + ncclComm_t comm = eqInfo->comm; + + // Iterate through the element list + struct ncclQueueElem* eqElem = eqInfo->elemList.head; + while (eqElem != eqInfo->elemList.tail) { // The queue always has one extra element + if (eqElem->work.funcIndex == FUNC_INDEX_P2P) { + NCCLCHECKGOTO(ncclEnqueueP2pKernel(comm, eqElem), ret, cb_end); + } else { + NCCLCHECKGOTO(ncclEnqueueCollKernel(comm, eqElem), ret, cb_end); + } + eqElem = eqElem->next; + } + + NCCLCHECKGOTO(setupLaunch(eqInfo, USING_CUDA_GRAPH), ret, cb_end); + NCCLCHECKGOTO(ncclLaunchProxy(eqInfo), ret, cb_end); + +cb_end: + if (ret != ncclSuccess) { + WARN("Failure in host setup : %s", ncclGetErrorString(ret)); + } + eqInfo->ret = ret; +} + +template void HIPRT_CB ncclEnqueueHostSetup<0>(void*); +template void HIPRT_CB ncclEnqueueHostSetup<1>(void*); + +ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph) { + comm->usingCudaGraph = 0; +#if CUDART_VERSION >= 11030 + cudaStreamCaptureStatus captureStatus; + unsigned long long cudaGraphId; + CUDACHECK(cudaStreamGetCaptureInfo_v2(comm->userStream, &captureStatus, &cudaGraphId, graph, NULL, NULL)); + if (captureStatus == cudaStreamCaptureStatusActive) { + if (cudaGraphId != comm->lastCudaGraphId) { + INFO(NCCL_COLL, "stream is being captured by a new graph, id %llu", cudaGraphId); + // We are in a new graph, hence need to forget the last setup node so that + // the first setup node in the new graph will not have a dependency + comm->lastCudaGraphId = cudaGraphId; + comm->lastSetupNode = NULL; + } + if (comm->launchMode == ncclComm::GROUP) comm->launchMode = ncclComm::GROUP_GRAPH; + comm->usingCudaGraph = 1; + } +#endif + return ncclSuccess; +} + +ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph) { +#if CUDART_VERSION >= 11030 + struct ncclQueueInfo* eqInfo = comm->enqueueInfo; + // Create a CUDA object to wrap around the argument space + // which CUDA graph would manage lifetime of + cudaUserObject_t object; + CUDACHECK(cudaUserObjectCreate(&object, eqInfo, ncclDestroyQueueInfo, 1/*initialRefcount*/, cudaUserObjectNoDestructorSync)); + CUDACHECK(cudaGraphRetainUserObject(graph, object, 1, cudaGraphUserObjectMove)); + + cudaHostFn_t fn = ncclEnqueueHostSetup<1>; + // Add a CPU node to the graph + cudaGraphNode_t setupNode; + cudaHostNodeParams setupNodeParams = {fn, eqInfo}; + int numDependencies = comm->lastSetupNode == NULL ? 0 : 1; + CUDACHECK(cudaGraphAddHostNode(&setupNode, graph, &comm->lastSetupNode, numDependencies, &setupNodeParams)); + CUDACHECK(cudaStreamUpdateCaptureDependencies(comm->userStream, &setupNode, 1, cudaStreamAddCaptureDependencies)); + comm->lastSetupNode = setupNode; + return ncclSuccess; +#else + WARN("NCCL does not support this CUDA version for CUDA graph feature"); + return ncclInternalError; +#endif +} + ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { // Launch asynchronously if needed if (ncclAsyncMode()) { @@ -681,12 +907,12 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) { NCCLCHECKGOTO(checkSetStream(info), ret, end); INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", - info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, + info->opName, info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); if (info->coll == ncclFuncSendRecv) { //p2p stored separately INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", - info->opName, info->comm->lastOpCount, info->sendbuff, info->recvbuff, info->count, + info->opName, info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); NCCLCHECKGOTO(ncclSaveP2p(info), ret, end); } else { @@ -703,13 +929,30 @@ end: NCCLCHECK(checkSetStream(info)); INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p", - info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count, + info->opName, info->comm->collOpCount, info->sendbuff, info->recvbuff, info->count, info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream); - NCCLCHECK(ncclSaveKernel(info)); - NCCLCHECK(ncclBarrierEnqueue(info->comm)); - NCCLCHECK(ncclBarrierEnqueueWait(info->comm)); - NCCLCHECK(ncclEnqueueEvents(info->comm)); + // Check whether we are in cuda graph mode + cudaGraph_t graph; + ncclComm_t comm = info->comm; + NCCLCHECK(ncclGetCudaGraph(comm, &graph)); + + // Common part between graph mode and non-graph mode + NCCLCHECK(ncclSetupCollKernel(info)); + + // Host setup + if (comm->usingCudaGraph) { + NCCLCHECK(ncclCudaGraphHostSetup(comm, graph)); + } else { + ncclEnqueueHostSetup<0>(comm->enqueueInfo); + NCCLCHECK(comm->enqueueInfo->ret); + } + + // Common part between graph mode and non-graph mode + NCCLCHECK(ncclLaunchBarrier(comm)); + NCCLCHECK(ncclLaunchKernel(comm)); + NCCLCHECK(ncclRecordEvents(comm)); + NCCLCHECK(ncclLaunchReset(comm)); return ncclSuccess; } } diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 7f52375ae8..e7e249e4fe 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -9,23 +9,29 @@ #include "graph.h" #include "trees.h" #include "rings.h" -#include "topo.h" /******************************************************************/ /********************* Internode connection ***********************/ /******************************************************************/ ncclResult_t ncclTopoPreset(struct ncclComm* comm, - struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, + struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoRanks* topoRanks) { int rank = comm->rank; int localRanks = comm->localRanks; + int nChannels = comm->nChannels; - for (int c=0; cnChannels; c++) { + for (int c=0; cchannels+c; channel->ring.prev = channel->ring.next = -1; channel->tree.up = -1; for (int i=0; itree.down[i] = -1; + channel->collTree.out = -1; + channel->collTree.headRank = -1; + channel->collTree.nHeads = 0; + channel->collTree.shift = 0; + for (int i=0; icollTree.up[i] = -1; + for (int i=0; icollTree.down[i] = -1; int* ringIntra = ringGraph->intra+c*localRanks; int* treeIntra = treeGraph->intra+c*localRanks; @@ -54,25 +60,8 @@ ncclResult_t ncclTopoPreset(struct ncclComm* comm, } // Duplicate channels rings/trees struct ncclChannel* channel0 = comm->channels; - struct ncclChannel* channel1 = channel0+comm->nChannels; - memcpy(channel1, channel0, comm->nChannels*sizeof(struct ncclChannel)); - // Setup collnet tree - for (int c=0; ccollNetnChannels; c++) { - struct ncclChannel* channel = comm->channels+c; - channel->collTree.up = -1; - for (int i=0; icollTree.down[i] = -1; - - int* collNetIntra = collNetGraph->intra+c*localRanks; - - for (int i=0; icollTree.up = collNetIntra[prev]; - channel->collTree.down[0] = collNetIntra[next]; - } - } - } + struct ncclChannel* channel1 = channel0+nChannels; + memcpy(channel1, channel0, nChannels*sizeof(struct ncclChannel)); return ncclSuccess; } @@ -176,36 +165,53 @@ static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* return ncclSuccess; } -ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank) { - int nranks = comm->nRanks; - int depth = nranks/comm->nNodes; - int sendIndex = collNetGraph->pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern - int sendEndIndex = (sendIndex+comm->localRanks-1)%comm->localRanks; - for (int c=0; ccollNetnChannels/2; c++) { - struct ncclChannel* channel = comm->channels+c; - // Set root of collTree to id nranks - if (rank == collNetGraph->intra[sendIndex+c*comm->localRanks]) { // is master - channel->collTree.up = nranks; - } - if (rank == collNetGraph->intra[sendEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain - channel->collTree.down[0] = -1; - } - channel->collTree.depth = depth; - INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", c, rank, channel->collTree.up, channel->collTree.down[0]); +static ncclResult_t connectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph) { + int rank = comm->rank; + int localRanks = comm->localRanks; + int nHeads = collNetGraph->nChannels; + int *heads; + NCCLCHECK(ncclCalloc(&heads, nHeads)); + // Find all head ranks + // Head index is always 0 + for (int c=0; cintra+c*localRanks; + heads[c] = collNetIntra[0]; } - int recvIndex = 0; // recv GPU index is always 0 - int recvEndIndex = (recvIndex+comm->localRanks-1)%comm->localRanks; - for (int c=0; ccollNetnChannels/2; c++) { - struct ncclChannel* channel = comm->channels+comm->collNetnChannels/2+c; - // Set root of collTree to id nranks - if (rank == collNetGraph->intra[recvIndex+c*comm->localRanks]) { // is master - channel->collTree.up = nranks; + // For all channels + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + char line[1024]; + sprintf(line, "CollNet channel %d rank %d ", c, rank); + int nDown = 0; + for (int i=0; icollTree.headRank = i; // Mark the index for deciding offset in the CUDA kernel + channel->collTree.out = comm->nRanks; // Set root of collTree to id nranks + int* collNetIntra = collNetGraph->intra+i*localRanks; + sprintf(line+strlen(line), "down "); + for (int r=0; rcollTree.down[nDown++] = collNetIntra[r]; // connect to all peers + sprintf(line+strlen(line), " %d ", collNetIntra[r]); + } + sprintf(line+strlen(line), "nDown %d ", nDown); + break; + } } - if (rank == collNetGraph->intra[recvEndIndex+c*comm->localRanks]) { // is bottom of intra-node chain - channel->collTree.down[0] = -1; + // Connect to all heads + int nUp = 0; + sprintf(line+strlen(line), "up "); + for (int h=0; hcollTree.up[nUp++] = heads[h]; + sprintf(line+strlen(line), " %d ", heads[h]); } - channel->collTree.depth = depth; - INFO(NCCL_GRAPH, "CollNet Channel %d rank %d up %d down %d", comm->collNetnChannels/2+c, rank, channel->collTree.up, channel->collTree.down[0]); + channel->collTree.nHeads = nHeads; + channel->collTree.shift = (rank%localRanks)%nHeads; // Shift by intraRank so that leaves don't send to same head simultaneously + channel->collTree.depth = (nUp == 0 && nDown == 0) ? 1 : 2; + sprintf(line+strlen(line), "nUp %d nHeads %d ", nUp, nHeads); + sprintf(line+strlen(line), "headRank %d out %d shift %d", channel->collTree.headRank, channel->collTree.out, channel->collTree.shift); + INFO(NCCL_GRAPH, "%s", line); } return ncclSuccess; } @@ -240,7 +246,18 @@ int ncclMaxNchannels() { return maxNchannels; } -ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, int nc) { +static int copyChannels(struct ncclComm* comm, int start, int end, int* ringPrev, int* ringNext) { + int nranks = comm->nRanks; + int c; + for (c=start; cchannels+c, comm->channels+c-start, sizeof(struct ncclChannel)); + } + return c; +} + +ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc) { // Gather data from all ranks int *ringRecv, *ringSend, *ringPrev, *ringNext, *treeToParent, *treeToChild0, *treeToChild1; int nranks = comm->nRanks; @@ -272,19 +289,25 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePa memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int)); + // Get number of channels after duplication + nc *= comm->nChannels; // Duplication should be complete now nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2); + // Setup CollNet + if (comm->collNetSupport == 1) { + // Add more channels to saturate intra-node bandwidth, except the 1 PPN case + if (collNetGraph->speedIntra > collNetGraph->speedInter && comm->nRanks > comm->nNodes) { + int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2); + nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext); + } + NCCLCHECK(connectCollNet(comm, collNetGraph)); + } + // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS. // We permit combining max, then min, to only use the first channels, then duplicate them. nChannels = comm->nChannels = std::min((int)ncclMaxNchannels(), nChannels); - int c; - for (c=nChannels; cchannels+c, comm->channels+c-nChannels, sizeof(struct ncclChannel)); - } - nChannels = comm->nChannels = c; + nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(nc, ncclMinNchannels()), ringPrev, ringNext); // Create rings array and check all is fine NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext)); diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 7c29920d1e..29938c7211 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -279,8 +279,7 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_ NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch == NCCL_TOPO_CPU_ARCH_ARM) p2pLevel = PATH_PXB; if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { - if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB; - else p2pLevel = PATH_SYS; + p2pLevel = PATH_PXB; } if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { p2pLevel = PATH_PXB; @@ -595,6 +594,6 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { for (int b=1, mb=(comm->p2pnChannels>>1); bp2pnChannels; b<<=1, mb>>=1) if (c & b) mirror |= mb; comm->p2pChannels[c] = mirror; } - INFO(NCCL_INIT, "%d coll channels, %d collnet channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->collNetnChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); + INFO(NCCL_INIT, "%d coll channels, %d p2p channels, %d p2p channels per peer", comm->nChannels, comm->p2pnChannels, comm->p2pnChannelsPerPeer); return ncclSuccess; } diff --git a/src/graph/search.cc b/src/graph/search.cc index 55456e0073..35842c7eb7 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -429,9 +429,67 @@ ncclResult_t ncclTopoSearchRecGpu(struct ncclTopoSystem* system, struct ncclTopo return ncclSuccess; } +// Select only NICs with the maximum bandwidth w.r.t. GPUs, and sort them by distance. +ncclResult_t ncclTopoSelectNets(struct ncclTopoSystem* system, int* nets, int* netcountRet) { + float* maxwidths; + int* minhops; + int netcount = 0; + NCCLCHECK(ncclCalloc(&minhops, system->nodes[NET].count)); + NCCLCHECK(ncclCalloc(&maxwidths, system->nodes[NET].count)); + for (int n=0; nnodes[NET].count; n++) { + maxwidths[n] = 0.0; + minhops[n] = 255; + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + struct ncclTopoLinkList* paths = net->paths[GPU]; + for (int g=0; gnodes[GPU].count; g++) { + if (paths[g].width > maxwidths[n] || (paths[g].width == maxwidths[n] && paths[g].count < minhops[n])) { + maxwidths[n] = paths[g].width; + minhops[n] = paths[g].count; + } + } + if (netcount && maxwidths[nets[0]] > maxwidths[n]) continue; // Do not keep NICs with lower BW + if (netcount && maxwidths[nets[0]] < maxwidths[n]) netcount = 0; // Remove all NICs with lower BW + int index; + for (index = 0; index < netcount; index++) { + if (minhops[n] < minhops[nets[index]]) break; + } + // Insert net at index + // Shift all nets with higher nhops + for (int i = netcount; i>index; i--) nets[i] = nets[i-1]; + // Insert this net at index + nets[index] = n; + netcount++; + } + + *netcountRet = netcount; + + // Then shuffle NICs with the same nhops based on the GPU device number, so that when we have + // 2 NICs and 2 GPUs and create communicators with only one GPU, we will use both NICs. + for (int start = 0; start < netcount;) { + int end = start+1; + while (end < netcount && minhops[nets[end]] == minhops[nets[start]]) end++; + // Shuffle + for (int r=0; rnodes[GPU].nodes[0].gpu.dev % (end-start); r++) { + int netStart = nets[start]; + for (int i=start; ispeedInter; - for (int n=0; nnodes[NET].count; n++) { + int* nets; + NCCLCHECK(ncclCalloc(&nets, system->nodes[NET].count)); + int netcount; + NCCLCHECK(ncclTopoSelectNets(system, nets, &netcount)); + for (int i=0; inodes[NET].nodes+n; struct ncclTopoNode* gpu; if (graph->collNet && net->net.collSupport == 0) continue; @@ -457,10 +515,19 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo if (graph->nChannels == 0) { // Always try the PCI order first to set a reference, but don't count in the timeout nor let it run for long struct ncclTopoLinkList* paths = net->paths[GPU]; + int f = 0, f_gdr = 0; // find the first GPU that is closest to NIC - int f = 0; - for (int i = 0; inodes[GPU].count; i++) - if (paths[i].count < paths[f].count) f = i; + for (int i = 0; inodes[GPU].count; i++) { + if (paths[i].count <= paths[f].count) { + // prefer GPU direct RDMA + int gdr; + NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[i].id, net->id, 0, &gdr)); + if (paths[i].count < paths[f].count || (paths[i].count == paths[f].count && !f_gdr && gdr)) { + f = i; + f_gdr = gdr; + } + } + } int t = 1 << 10; NCCLCHECK(ncclTopoSearchTryGpu(system, graph, saveGraph, 0, backToNet, backToFirstRank, (f == 0) ? FORCED_ORDER_PCI : 0, &t, NET, n, f)); if (t == -1) *time = -1; @@ -504,6 +571,7 @@ ncclResult_t ncclTopoSearchRecNet(struct ncclTopoSystem* system, struct ncclTopo } } } + free(nets); return ncclSuccess; } @@ -700,7 +768,6 @@ RCCL_PARAM(ModelMatchingDisable, "MODEL_MATCHING_DISABLE", 0); ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) { int ngpus = system->nodes[GPU].count; - int nnets = system->nodes[NET].count; int crossNic = (system->nodes[NET].count > 1) && graph->crossNic ? 1 : 0; graph->speedIntra = graph->speedInter = 0; if (graph->crossNic == 2) graph->crossNic = 0; @@ -729,41 +796,13 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph if (graph->nChannels) { system->type |= RCCL_TOPO_4P2H_ROME; } - } else if (!rcclParamModelMatchingDisable()) { + } else if (!rcclParamModelMatchingDisable() && !graph->collNet) { // try to match 8P6L NCCLCHECK(parseChordalRing(system, graph)); if (graph->nChannels) return ncclSuccess; // try to match Rome 4P2H NCCLCHECK(parseRome4P2H(system, graph)); } - if (graph->collNet && graph->nChannels) { - struct ncclTopoGraph tmpGraph; - memcpy(&tmpGraph, graph, sizeof(struct ncclTopoGraph)); - int nets[MAXCHANNELS], n = 0; - for (int i = 0; i < tmpGraph.nChannels; i++) { - int j; - for (j = 0; j < n; j++) { - if (nets[j] == tmpGraph.inter[i*2]) - break; - } - if (j >= n) - nets[n++] = tmpGraph.inter[i*2]; - } - for (int i = 0; i < n; i++) { - int j; - for (j = 0; j < tmpGraph.nChannels; j++) { - if (nets[i] == tmpGraph.inter[j*2]) - break; - } - if (j < tmpGraph.nChannels) { - memcpy(graph->intra+i*ngpus, &tmpGraph.intra[j*ngpus], ngpus*sizeof(int)); - memcpy(graph->inter+i*2, &tmpGraph.inter[j*2], 2*sizeof(int)); - } - } - memcpy(graph->intra+n*ngpus, graph->intra, ngpus*sizeof(int)*n); - memcpy(graph->inter+n*2, graph->inter, 2*sizeof(int)*n); - graph->nChannels = n; - } if (graph->nChannels) return ncclSuccess; if ((graph->pattern == NCCL_TOPO_PATTERN_RING) && (system->type & RCCL_TOPO_4P2H_ROME) && (ngpus == system->nRanks)) { @@ -806,6 +845,7 @@ search: for (int g=0; gintra[c*ngpus+g]); } + printf("[%d %d]", graph->inter[0], graph->inter[1]); printf("\n"); } #endif @@ -897,12 +937,7 @@ done: graph->nChannels = 1; } - if (graph->nChannels && graph->collNet) { - // duplicate collnet channels - memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, ngpus*sizeof(int)*graph->nChannels); - memcpy(graph->inter+graph->nChannels*2, graph->inter, 2*sizeof(int)*graph->nChannels); - } - else if (graph->speedIntra >= 25.0) { + if (graph->speedIntra >= 25.0) { int dupChannels = std::min(graph->nChannels*2, graph->maxChannels); memcpy(graph->intra+graph->nChannels*ngpus, graph->intra, (dupChannels-graph->nChannels)*ngpus*sizeof(int)); memcpy(graph->inter+graph->nChannels*2,graph->inter, (dupChannels-graph->nChannels)*2*sizeof(int)); @@ -951,7 +986,7 @@ ncclResult_t ncclTopoDumpGraphs(struct ncclTopoSystem* system, int ngraphs, stru return ncclSuccess; } -ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* dev) { +ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* dev) { if (graph) { // Honor the net device in the graph int channel = channelId%graph->nChannels; @@ -960,7 +995,7 @@ ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct n *dev = graph->inter[channel*2+index]; } else { int64_t id; - NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, channelId)); + NCCLCHECK(ncclTopoGetLocalNet(system, rank, &id, rr)); *dev = id; } return ncclSuccess; diff --git a/src/graph/topo.cc b/src/graph/topo.cc index d7ce17fd6f..b385d5c3ad 100644 --- a/src/graph/topo.cc +++ b/src/graph/topo.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -182,6 +182,65 @@ ncclResult_t ncclTopoConnectNodes(struct ncclTopoNode* node, struct ncclTopoNode return ncclSuccess; } +// BCM Gen4 Switches present themselves as a two-level hierarchical switch +// even though they're supposed to sustain full BW across all ports. +// Flatten the switch as this extra level can break the search and make +// NCCL take wrong topology decisions. +ncclResult_t ncclTopoFlattenBcmSwitches(struct ncclTopoSystem* system) { + for (int s=0; snodes[PCI].count; s++) { + struct ncclTopoNode* pciSwitch = system->nodes[PCI].nodes+s; + uint64_t device = pciSwitch->pci.device; + // Only flatten PEX Gen 4 switches in base mode + if ((device & 0xfffffffffffff000) == 0x1000c0101000a000) { + // Find sub switches with the same device ID. + int64_t* subSwIds; + NCCLCHECK(ncclCalloc(&subSwIds, pciSwitch->nlinks)); + int subs = 0; + for (int l=0; lnlinks; l++) { + struct ncclTopoNode* sub = pciSwitch->links[l].remNode; + // Only fuse sub switches with the same device ID. + if (sub->type != PCI || sub->pci.device != device) continue; + // Save sub switch for later + subSwIds[subs++] = sub->id; + // Remove link to that sub switch + memmove(pciSwitch->links+l, pciSwitch->links+l+1, (pciSwitch->nlinks-l-1)*(sizeof(struct ncclTopoLink))); + pciSwitch->nlinks--; + // Don't increase l for the next iteration as we just shifted all links by one. + l--; + } + + for (int s=0; snodes[PCI].nodes is changing every time we remove a node) + int index; + NCCLCHECK(ncclTopoIdToIndex(system, PCI, subSwIds[s], &index)); + struct ncclTopoNode* sub = system->nodes[PCI].nodes+index; + // Connect all sub PCI devices to the parent switch + for (int l=0; lnlinks; l++) { + struct ncclTopoNode* remNode = sub->links[l].remNode; + if (remNode == pciSwitch) continue; + // Add link from parent PCI switch -> PCI device + memcpy(pciSwitch->links+pciSwitch->nlinks, sub->links+l, sizeof(struct ncclTopoLink)); + pciSwitch->nlinks++; + // Update link from PCI device -> parent PCI switch + for (int rl=0; rlnlinks; rl++) { + if (remNode->links[rl].remNode == sub) { + remNode->links[rl].remNode = pciSwitch; + break; + } + } + } + NCCLCHECK(ncclTopoRemoveNode(system, PCI, index)); + } + // Set subdevice to 0x0000 to make sure we don't merge this switch again. + pciSwitch->pci.device = 0x1000c01010000000; + free(subSwIds); + // Restart, as system->nodes[PCI].nodes has changed. + s = 0; + } + } + return ncclSuccess; +} + ncclResult_t ncclTopoConnectCpus(struct ncclTopoSystem* system) { // And connect all CPU nodes together for (int n=0; nnodes[CPU].count; n++) { @@ -200,6 +259,8 @@ static ncclResult_t ncclTopoPrintRec(struct ncclTopoNode* node, struct ncclTopoN sprintf(line+offset, "%s/%lX (%d)", topoNodeTypeStr[node->type], node->id, node->gpu.rank); } else if (node->type == CPU) { sprintf(line+offset, "%s/%lX (%d/%d/%d)", topoNodeTypeStr[node->type], node->id, node->cpu.arch, node->cpu.vendor, node->cpu.model); + } else if (node->type == PCI) { + sprintf(line+offset, "%s/%lX (%lx)", topoNodeTypeStr[node->type], node->id, node->pci.device); } else { sprintf(line+offset, "%s/%lX", topoNodeTypeStr[node->type], node->id); } @@ -360,6 +421,15 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode, busId)); } else if (type == PCI) { NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId)); + NCCLCHECK(xmlGetAttr(xmlPci, "vendor", &str)); + if (str) node->pci.device += strtol(str, NULL, 0) << 48; + NCCLCHECK(xmlGetAttr(xmlPci, "device", &str)); + if (str) node->pci.device += strtol(str, NULL, 0) << 32; + NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_vendor", &str)); + if (str) node->pci.device += strtol(str, NULL, 0) << 16; + NCCLCHECK(xmlGetAttr(xmlPci, "subsystem_device", &str)); + if (str) node->pci.device += strtol(str, NULL, 0); + for (int s=0; snSubs; s++) { struct ncclXmlNode* xmlSubPci = xmlPci->subs[s]; NCCLCHECK(ncclTopoAddPci(xmlSubPci, system, node)); @@ -554,6 +624,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL)); #endif + NCCLCHECK(ncclTopoFlattenBcmSwitches(*topoSystem)); NCCLCHECK(ncclTopoConnectCpus(*topoSystem)); NCCLCHECK(ncclTopoSortSystem(*topoSystem)); @@ -681,7 +752,7 @@ ncclResult_t ncclTopoGetLocalNet(struct ncclTopoSystem* system, int rank, int64_ } if (path->width == maxWidth && path->type == minType) nets[count++] = system->nodes[NET].nodes[n].id; } - *id = nets[rr % count]; + *id = nets[rr%count]; free(nets); return ncclSuccess; } diff --git a/src/graph/topo.h b/src/graph/topo.h index f4054c23f3..fdef9a0069 100644 --- a/src/graph/topo.h +++ b/src/graph/topo.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -29,8 +29,7 @@ // Intel CPU convert GPU P2P traffic into 64B PCI TLPs, so GPU // to GPU traffic consumes more PCI bandwidth. -#define INTEL_P2P(speed) (speed*9/12) -#define INTEL_P2P_OVERHEAD(speed) (speed*12/9) +#define INTEL_P2P_OVERHEAD(speed) (speed*6/5) #define NCCL_TOPO_NODE_TYPES 7 #define GPU 0 @@ -114,6 +113,9 @@ struct ncclTopoNode { int model; cpu_set_t affinity; }cpu; + struct { + uint64_t device; + }pci; }; int nlinks; struct ncclTopoLink links[NCCL_TOPO_MAX_LINKS]; diff --git a/src/graph/tuning.cc b/src/graph/tuning.cc index 88f19e0e90..a63ef585c4 100644 --- a/src/graph/tuning.cc +++ b/src/graph/tuning.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -63,11 +63,11 @@ static const float baseLat [NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { { 39.0 // Tree/Simple is the latency a 256kB chunk, which is ~ base lat + 256k/12GB/s (+ 256k/12GB/s for the network). static const float hwLat [3][NCCL_NUM_ALGORITHMS][NCCL_NUM_PROTOCOLS] = { /* NVLINK */ - { /* Tree (LL/LL128/Simple)*/ { 2.5, 2.5, 5.5 }, /* Ring (LL/LL128/Simple)*/ { 2.5, 2.5, 5 }, /* CollNet (LL/LL128/Simple)*/ { 1.1, 1.1, 3.3 } }, + { /* Tree (LL/LL128/Simple)*/ { 2.5, 2.5, 5.5 }, /* Ring (LL/LL128/Simple)*/ { 2.5, 2.5, 5 }, /* CollNet (LL/LL128/Simple)*/ { 1.2, 1.2, 3.8 } }, /* PCI */ - { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 }, /* CollNet (LL/LL128/Simple)*/ { 1.1, 1.1, 1.7 } }, + { /* Tree (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 }, /* Ring (LL/LL128/Simple)*/ { 1.3, 1.3, 1.9 }, /* CollNet (LL/LL128/Simple)*/ { 2.2, 2.2, 5.7 } }, /* NET */ - { /* Tree (LL/LL128/Simple)*/ { 28.0, 28.0, 66.0 }, /* Ring (LL/LL128/Simple)*/ { 8.5, 8.5, 19.0 }, /* CollNet (LL/LL128/Simple)*/ { 6.5, 6.5, 14.5 } } + { /* Tree (LL/LL128/Simple)*/ { 28.0, 28.0, 66.0 }, /* Ring (LL/LL128/Simple)*/ { 8.5, 8.5, 19.0 }, /* CollNet (LL/LL128/Simple)*/ { 9.8, 9.8, 19.5 } } }; // LL128 max BW (per channel) for the different collectives @@ -87,8 +87,10 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*WARP_SIZE, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS); #else getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, simpleDefaultThreads); - comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = + comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); + comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = + getNthreads("NCCL_NTHREADS", ncclParamNthreads(), NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS, NCCL_SIMPLE_MAX_NTHREADS); comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_LL] = comm->maxThreads[NCCL_ALGO_COLLNET][NCCL_PROTO_LL] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 2*WARP_SIZE, NCCL_LL_MAX_NTHREADS, NCCL_LL_MAX_NTHREADS); #endif @@ -148,8 +150,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_TREE && p == NCCL_PROTO_LL128) busBw = std::min(busBw * (nNodes == 1 ? 7.0/9.0 : 0.915 /*120.0/128.0*/), ll128MaxBwPerCh[coll]*graphs[a]->nChannels); #endif if (a == NCCL_ALGO_COLLNET) busBw *= .9; - if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL) busBw *= 1.0/2.0; // Take into account that GDR read is disabled on both sides - if (a == NCCL_ALGO_COLLNET && p == NCCL_PROTO_LL128) busBw = 0; // CollNet does not support LL128 + if (a == NCCL_ALGO_COLLNET && p != NCCL_PROTO_SIMPLE) busBw = 0; // Oneshot CollNet only supports Simple // Convert bus BW to algorithm BW float ratio = (a != NCCL_ALGO_RING) ? .5 : (1.0 * nRanks) / nsteps; @@ -158,6 +159,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom comm->latencies[coll][a][p] = baseLat[a][p]; float intraLat = hwLat[intraHw[a]][a][p]; float interLat = hwLat[NCCL_HW_NET][a][p]; + if (nNodes > 1 && p == NCCL_PROTO_LL) intraLat *= 1.8; if (a == NCCL_ALGO_RING) { float lat = hwLat[hw[a]][a][p]; if ((coll == ncclFuncReduce || coll == ncclFuncBroadcast)) { @@ -252,6 +254,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom comm->threadThresholds[a][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_THREAD_THRESHOLD; } comm->threadThresholds[NCCL_ALGO_RING][NCCL_PROTO_LL] *= nRanks; + comm->threadThresholds[NCCL_ALGO_COLLNET][NCCL_PROTO_SIMPLE] = 512; // Override defaults with user env char* str = getenv("NCCL_THREAD_THRESHOLDS"); diff --git a/src/graph/xml.cc b/src/graph/xml.cc index e1a0b431d8..c5891a2507 100644 --- a/src/graph/xml.cc +++ b/src/graph/xml.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -17,10 +17,6 @@ #include "core.h" #include "nvmlwrap.h" #include "xml.h" -#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) -#include -#include -#endif /*******************/ /* XML File Parser */ @@ -482,6 +478,26 @@ ncclResult_t ncclTopoGetXmlFromSys(struct ncclXmlNode* pciNode, struct ncclXml* if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "class", "class")); } + NCCLCHECK(xmlGetAttrIndex(pciNode, "vendor", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "vendor", "vendor")); + } + NCCLCHECK(xmlGetAttrIndex(pciNode, "device", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "device", "device")); + } + NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_vendor", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_vendor", "subsystem_vendor")); + } + NCCLCHECK(xmlGetAttrIndex(pciNode, "subsystem_device", &index)); + if (index == -1) { + if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); + NCCLCHECK(ncclTopoSetAttrFromSys(pciNode, path, "subsystem_device", "subsystem_device")); + } NCCLCHECK(xmlGetAttrIndex(pciNode, "link_speed", &index)); if (index == -1) { if (path == NULL) NCCLCHECK(getPciPath(busId, &path)); diff --git a/src/group.cc b/src/group.cc index 56dc892e0d..03d3538d72 100644 --- a/src/group.cc +++ b/src/group.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -127,7 +127,7 @@ static ncclResult_t scheduleSendRecv(struct ncclComm* comm, int delta, int chann info.sendbytes = sendbytes; info.recvbytes = recvbytes; if (delta == 0 && sendbytes != recvbytes) return ncclInvalidUsage; - NCCLCHECK(ncclSaveP2pKernel(&info)); + NCCLCHECK(ncclSetupP2pKernel(&info)); return ncclSuccess; } @@ -135,7 +135,7 @@ void* ncclAsyncThreadPreconnect(void* args_) { struct ncclAsyncArgs* args = (struct ncclAsyncArgs*)args_; struct ncclComm* comm = args->coll.comm; CUDACHECKTHREAD(hipSetDevice(comm->cudaDev)); - NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL)); + NCCLCHECKTHREAD(ncclTransportP2pSetup(comm, NULL, NCCL_CONN_IDX_P2P)); return args; } @@ -165,6 +165,7 @@ ncclResult_t ncclGroupEnd() { int doneArray[MAX_ASYNC_OPS]; for (int i=0; ifuncType == ASYNC_FUNC_COLL) { ncclComm_t comm = args->coll.comm; - NCCLCHECKGOTO(ncclSaveCommKernels(comm), ret, group_cleanup); + NCCLCHECKGOTO(ncclGetCudaGraph(comm, graphs+i), ret, group_cleanup); + if (usingCudaGraphAll == -1) { + usingCudaGraphAll = comm->usingCudaGraph; + } else if (usingCudaGraphAll != comm->usingCudaGraph) { + WARN("Illegal to have some communicators in graph mode while others not"); + ret = ncclInvalidUsage; + goto group_cleanup; + } } } for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - if (args->coll.comm->userStream == NULL) + ncclComm_t comm = args->coll.comm; + NCCLCHECKGOTO(ncclSetupAsyncKernels(comm), ret, group_cleanup); + } + } + for (int i=0; ifuncType == ASYNC_FUNC_COLL) { + if (args->coll.comm->userStream == hipStreamDefault/* || + args->coll.comm->userStream == hipStreamPerThread || + args->coll.comm->userStream == hipStreamLegacy*/) CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end); - NCCLCHECKGOTO(ncclBarrierEnqueue(args->coll.comm), ret, end); + if (usingCudaGraphAll == 1) { + NCCLCHECKGOTO(ncclCudaGraphHostSetup(args->coll.comm, graphs[i]), ret, end); + } else { + ncclEnqueueHostSetup<0>(args->coll.comm->enqueueInfo); + } + NCCLCHECKGOTO(ncclLaunchBarrier(args->coll.comm), ret, end); } } for (int i=0; ifuncType == ASYNC_FUNC_COLL) { CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end); - NCCLCHECKGOTO(ncclBarrierEnqueueWait(args->coll.comm), ret, end); + NCCLCHECKGOTO(ncclLaunchKernel(args->coll.comm), ret, end); } } for (int i=0; ifuncType == ASYNC_FUNC_COLL) { - if (args->coll.comm->userStream == NULL) + if (args->coll.comm->userStream == hipStreamDefault/* || + args->coll.comm->userStream == hipStreamPerThread || + args->coll.comm->userStream == hipStreamLegacy*/) CUDACHECKGOTO(hipSetDevice(args->coll.comm->cudaDev), ret, end); - NCCLCHECKGOTO(ncclEnqueueEvents(args->coll.comm), ret, end); + NCCLCHECKGOTO(ncclRecordEvents(args->coll.comm), ret, end); + NCCLCHECKGOTO(ncclLaunchReset(args->coll.comm), ret, end); } } @@ -373,8 +402,7 @@ group_cleanup: pthread_mutex_unlock(&state->poolMutex); state->nextOps = NULL; - comm->myParams->gridDim.x = comm->myParams->blockDim.x = 0; - comm->userStreamSet = false; + ncclLaunchReset(comm); } } } diff --git a/src/include/alloc.h b/src/include/alloc.h index 27a9ab7761..be71f6a54f 100644 --- a/src/include/alloc.h +++ b/src/include/alloc.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -52,11 +52,16 @@ extern struct allocationTracker allocTracker[]; template static ncclResult_t ncclCudaCalloc(T** ptr, size_t nelem, bool isFineGrain = false) { + // Need async stream for P2P pre-connect + CUDA Graph + hipStream_t stream; + CUDACHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); if (isFineGrain) CUDACHECK(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained)); else CUDACHECK(hipMalloc(ptr, nelem*sizeof(T))); - CUDACHECK(hipMemset(*ptr, 0, nelem*sizeof(T))); + CUDACHECK(hipMemsetAsync(*ptr, 0, nelem*sizeof(T), stream)); + CUDACHECK(hipStreamSynchronize(stream)); + CUDACHECK(hipStreamDestroy(stream)); int dev; CUDACHECK(hipGetDevice(&dev)); if (dev < MAX_ALLOC_TRACK_NGPU) { diff --git a/src/include/bootstrap.h b/src/include/bootstrap.h index 7926785e73..629d15fc7b 100644 --- a/src/include/bootstrap.h +++ b/src/include/bootstrap.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -14,8 +14,8 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv); ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out); ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid ncclResult_t bootstrapAllGather(void* commState, void* allData, int size); -ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size); -ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size); +ncclResult_t bootstrapSend(void* commState, int peer, int tag, void* data, int size); +ncclResult_t bootstrapRecv(void* commState, int peer, int tag, void* data, int size); ncclResult_t bootstrapRemAlloc(size_t size, int rank, void* commState, int* id, hipIpcMemHandle_t* ipc, void** ptr); ncclResult_t bootstrapRemFree(int id, int rank, void* commState); ncclResult_t bootstrapClose(void* commState); diff --git a/src/include/checks.h b/src/include/checks.h index 48489e368f..98cf164133 100644 --- a/src/include/checks.h +++ b/src/include/checks.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -28,6 +28,15 @@ } \ } while(false) +// Report failure but clear error and continue +#define CUDACHECKIGNORE(cmd) do { \ + hipError_t err = cmd; \ + if( err != hipSuccess ) { \ + INFO(NCCL_ALL,"%s:%d Cuda failure '%s'", __FILE__, __LINE__, hipGetErrorString(err)); \ + (void) hipGetLastError(); \ + } \ +} while(false) + #include // Check system calls #define SYSCHECK(call, name) do { \ diff --git a/src/include/collectives.h b/src/include/collectives.h index ee4cc9b53c..50c309d08c 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -81,4 +81,5 @@ DECL_ALL #define REDUCE_SLICESTEPS 1 #define REDUCE_CHUNKSTEPS 1 #define SENDRECV_SLICEFACTOR 1 + #endif diff --git a/src/include/comm.h b/src/include/comm.h index 665e259c68..4b02e559d5 100644 --- a/src/include/comm.h +++ b/src/include/comm.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -15,6 +15,9 @@ // [/RCCL] #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) + typedef void *cudaGraph_t; + typedef void *cudaGraphNode_t; + #define HIPRT_CB #else #if CUDART_VERSION < 9000 struct cudaLaunchParams { @@ -84,16 +87,17 @@ struct ncclComm { int nNodes; int localRanks; - enum { GROUP, PARALLEL } launchMode; + enum { GROUP, PARALLEL, GROUP_GRAPH } launchMode; hipStream_t userStream; bool userStreamSet; hipEvent_t doneEvent; + hipEvent_t intDoneEvent; bool checkPointers; - // Counter to make sure collectives match (needed for bcast/reduce - // where syncs are not symmetric). + // Counter for tracking CUDA launches (P2P and collectives included) uint64_t opCount; - uint64_t lastOpCount; + // Collective operation counter + uint64_t collOpCount; // Channels for collectives int nChannels; @@ -101,8 +105,6 @@ struct ncclComm { int p2pnChannels; int p2pnChannelsPerPeer; int p2pChannels[MAXCHANNELS]; - //Channels for collnet - int collNetnChannels; // Buffer sizes int buffSizes[NCCL_NUM_PROTOCOLS]; @@ -157,6 +159,7 @@ struct ncclComm { struct ncclInfo* asyncOps; int asyncOpCount; size_t asyncTotalSize; + int lastChannel; //list of async p2p operation queued in a group semantics struct ncclP2Plist* p2pSends; @@ -169,6 +172,11 @@ struct ncclComm { int rootPid; // Process ID of root // [/RCCL] + // Store info for cudaGraph + int usingCudaGraph; // Only use it during capture time, not launch time + struct ncclQueueInfo* enqueueInfo; + cudaGraphNode_t lastSetupNode; + unsigned long long lastCudaGraphId; }; #endif diff --git a/src/include/core.h b/src/include/core.h index 75a903fbf0..f2efd2853d 100644 --- a/src/include/core.h +++ b/src/include/core.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/devcomm.h b/src/include/devcomm.h index 117dc40d74..0221e1e99b 100644 --- a/src/include/devcomm.h +++ b/src/include/devcomm.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -125,8 +125,9 @@ struct ncclConnInfo { struct ncclConnector { int connected; struct ncclProxyArgs *proxyAppend; + struct ncclProxyArgs **proxyAppendPtr; struct ncclTransportComm* transportComm; - void* transportResources; // Host-side resources + void* transportResources; struct ncclConnInfo conn; struct ncclComm *comm; }; @@ -151,11 +152,23 @@ struct ncclTree { int down[NCCL_MAX_TREE_ARITY]; }; +#define NCCL_MAX_DIRECT_ARITY 7 +struct ncclDirect { + int depth; + int out; + int nHeads; + int headRank; + int shift; + int up[NCCL_MAX_DIRECT_ARITY]; + int down[NCCL_MAX_DIRECT_ARITY]; +}; + +#define NCCL_CONN_IDX_P2P (*(comm->p2pNet)*2) +#define NCCL_CONN_IDX_P2P_NET 2 +#define NCCL_MAX_CONNS 3 struct ncclPeer { - struct ncclConnector send; - struct ncclConnector recv; - struct ncclConnector p2pSend; - struct ncclConnector p2pRecv; + struct ncclConnector send[NCCL_MAX_CONNS]; + struct ncclConnector recv[NCCL_MAX_CONNS]; }; struct ncclDevComm; @@ -179,7 +192,6 @@ struct ncclWorkElem { const void * sendbuff; void * recvbuff; - uint64_t opCount; // Op-specific fields. union { struct { @@ -192,9 +204,15 @@ struct ncclWorkElem { struct { size_t sendCount; size_t recvCount; + int sendChunkSize; + int recvChunkSize; int32_t delta; uint16_t nThreads; } p2p; + struct { + uint16_t padding[15]; + uint16_t opCount; + } op; // [RCCL] Clique-based arguments // NOTE: Follows same field structure as coll // because nChannels is accessed from "coll" struct. @@ -206,7 +224,7 @@ struct ncclWorkElem { uint8_t nChannels; } clique; // [/RCCL] - uint64_t align[3]; + uint64_t align[4]; }; }; struct ncclWork { @@ -219,7 +237,7 @@ struct ncclChannel { struct { struct ncclRing ring; struct ncclTree tree; - struct ncclTree collTree; + struct ncclDirect collTree; int id; @@ -241,6 +259,12 @@ struct ncclChannel { float bw_cumulative; int bw_count; #endif + uint16_t index; // Only used by GPU + + // GDRCOPY support + struct ncclWork* workFifoGdr; + struct ncclWork* workFifoDev; + void* gdrMemDesc; }; int data[0x80]; }; diff --git a/src/include/enqueue.h b/src/include/enqueue.h index 2c2ab1fe78..d806ab7fb5 100644 --- a/src/include/enqueue.h +++ b/src/include/enqueue.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,15 +11,82 @@ #include "group.h" #include "collectives.h" +size_t ncclKernMaxLocalSize(); ncclResult_t ncclEnqueueCheck(struct ncclInfo* info); ncclResult_t ncclCpuBarrierIn(struct ncclComm* comm, int* isLast); ncclResult_t ncclCpuBarrierLast(struct ncclComm* comm); ncclResult_t ncclCpuBarrierOut(struct ncclComm* comm); -ncclResult_t ncclBarrierEnqueue(struct ncclComm* comm); -ncclResult_t ncclBarrierEnqueueWait(struct ncclComm* comm); -ncclResult_t ncclEnqueueEvents(struct ncclComm* comm); -ncclResult_t ncclSaveKernel(struct ncclInfo* info); -ncclResult_t ncclSaveP2pKernel(struct ncclInfo* info); -ncclResult_t ncclSaveCommKernels(struct ncclComm* comm); +ncclResult_t ncclLaunchBarrier(struct ncclComm* comm); +ncclResult_t ncclLaunchKernel(ncclComm_t comm); +ncclResult_t ncclRecordEvents(struct ncclComm* comm); +ncclResult_t ncclLaunchReset(ncclComm_t comm); +ncclResult_t ncclSetupP2pKernel(struct ncclInfo* info); +ncclResult_t ncclSetupAsyncKernels(struct ncclComm* comm); +template +void HIPRT_CB ncclEnqueueHostSetup(void* arg); +ncclResult_t ncclGetCudaGraph(ncclComm_t comm, cudaGraph_t* graph); +ncclResult_t ncclCudaGraphHostSetup(ncclComm_t comm, cudaGraph_t graph); +// Enqueue information (for kernel and proxy) for each operation +struct ncclQueueElem { + struct ncclWorkElem work; + struct ncclProxyArgs proxyArgs; + struct ncclQueueElem* next; +}; + +// Store enqueue elements in a list +struct ncclQueueElemList { + struct ncclQueueElem* head; + struct ncclQueueElem* tail; +}; + +// Structure passed to CUDA graph +struct ncclQueueInfo { + ncclComm_t comm; + int maxChannels; // Dynamic version of gridDim + ncclResult_t ret; // Return value of host setup call + struct ncclQueueElemList elemList; +}; + +// Get next element from enqueue list +static ncclResult_t ncclAddQueueElem(struct ncclQueueInfo* eqInfo, struct ncclQueueElem** elemOut) { + if (eqInfo == NULL) return ncclInternalError; + struct ncclQueueElemList* list = &eqInfo->elemList; + if (list->tail != NULL) { + *elemOut = list->tail; + memset(*elemOut, 0, sizeof(struct ncclWorkElem) + sizeof(struct ncclProxyArgs)); + } else { + NCCLCHECK(ncclCalloc(&list->tail, 1)); + *elemOut = list->tail; + list->head = list->tail; + } + if (list->tail->next == NULL) { + NCCLCHECK(ncclCalloc(&list->tail->next, 1)); + } + list->tail = list->tail->next; + return ncclSuccess; +} + +// Reset element queue +static ncclResult_t ncclResetQueueInfo(struct ncclQueueInfo* eqInfo) { + if (eqInfo == NULL) return ncclInternalError; + eqInfo->maxChannels = 0; + eqInfo->ret = ncclSuccess; + eqInfo->elemList.tail = eqInfo->elemList.head; + return ncclSuccess; +} + +// Destroy enqueue info space +// used by both CUDA graph and non CUDA graph +static void ncclDestroyQueueInfo(void* ptr) { + if (ptr == NULL) return; + struct ncclQueueInfo* eqInfo = (struct ncclQueueInfo*)ptr; + struct ncclQueueElem* head = eqInfo->elemList.head; + while (head != NULL) { + struct ncclQueueElem* temp = head; + head = head->next; + free(temp); + } + free(eqInfo); +} #endif // End include guard diff --git a/src/include/gdrwrap.h b/src/include/gdrwrap.h new file mode 100644 index 0000000000..e3d923661f --- /dev/null +++ b/src/include/gdrwrap.h @@ -0,0 +1,272 @@ +/************************************************************************* + * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#ifndef NCCL_GDRWRAP_H_ +#define NCCL_GDRWRAP_H_ + +#include "nccl.h" +#include // for standard [u]intX_t types +#include + +// These can be used if the GDR library isn't thread safe +#include +extern pthread_mutex_t gdrLock; +#define GDRLOCK() pthread_mutex_lock(&gdrLock) +#define GDRUNLOCK() pthread_mutex_unlock(&gdrLock) +#define GDRLOCKCALL(cmd, ret) do { \ + GDRLOCK(); \ + ret = cmd; \ + GDRUNLOCK(); \ +} while(false) + +#define GDRCHECK(cmd) do { \ + int e; \ + /* GDRLOCKCALL(cmd, e); */ \ + e = cmd; \ + if( e != 0 ) { \ + WARN("GDRCOPY failure %d", e); \ + return ncclSystemError; \ + } \ +} while(false) + +// This is required as the GDR memory is mapped WC +#if !defined(__NVCC__) +#if defined(__PPC__) +static inline void wc_store_fence(void) { asm volatile("sync") ; } +#elif defined(__x86_64__) +#include +static inline void wc_store_fence(void) { _mm_sfence(); } +#elif defined(__aarch64__) +#ifdef __cplusplus +#include +static inline void wc_store_fence(void) { std::atomic_thread_fence(std::memory_order_release); } +#else +#include +static inline void wc_store_fence(void) { atomic_thread_fence(memory_order_release); } +#endif +#endif +#endif + +//#define GDR_DIRECT 1 +#ifdef GDR_DIRECT +// Call the GDR API library code directly rather than via +// dlopen() wrappers +#include + +static ncclResult_t wrap_gdr_symbols(void) { return ncclSuccess; } +static gdr_t wrap_gdr_open(void) { gdr_t g = gdr_open(); return g; } +static ncclResult_t wrap_gdr_close(gdr_t g) { GDRCHECK(gdr_close(g)); return ncclSuccess; } +static ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { + GDRCHECK(gdr_pin_buffer(g, addr, size, p2p_token, va_space, handle)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { + GDRCHECK(gdr_unpin_buffer(g, handle)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { + GDRCHECK(gdr_get_info(g, handle, info)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) { + GDRCHECK(gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { + GDRCHECK(gdr_unmap(gdr_t g, gdr_mh_t handle, void **va, size_t size)); + return ncclSuccess; +} +static void wrap_gdr_runtime_get_version(int *major, int *minor) { + gdr_runtime_get_version(major, minor); + return ncclSuccess; +} +static void wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) { + gdr_driver_get_version(g, major, minor); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) { + GDRCHECK(gdr_copy_to_mapping(handle, map_d_ptr, h_ptr, size)); + return ncclSuccess; +} +static ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) { + GDRCHECK(gdr_copy_from_mapping(handle, h_ptr, map_d_ptr, size)); + return ncclSuccess; +} + +#else +// Dynamically handle dependency the GDR API library + +/* Extracted from gdrapi.h (v2.1 Nov 2020) */ + +#define GPU_PAGE_SHIFT 16 +#define GPU_PAGE_SIZE (1UL << GPU_PAGE_SHIFT) +#define GPU_PAGE_OFFSET (GPU_PAGE_SIZE-1) +#define GPU_PAGE_MASK (~GPU_PAGE_OFFSET) + +struct gdr; +typedef struct gdr *gdr_t; + +typedef struct gdr_mh_s { + unsigned long h; +} gdr_mh_t; + +struct gdr_info { + uint64_t va; + uint64_t mapped_size; + uint32_t page_size; + uint64_t tm_cycles; + uint32_t cycles_per_ms; + unsigned mapped:1; + unsigned wc_mapping:1; +}; +typedef struct gdr_info gdr_info_t; + +/* End of gdrapi.h */ + +ncclResult_t wrap_gdr_symbols(void); + +gdr_t wrap_gdr_open(void); +ncclResult_t wrap_gdr_close(gdr_t g); +ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle); +ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle); +ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info); +ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size); +ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size); +ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor); +ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor); +ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size); +ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); + +#endif // GDR_DIRECT + +// Global GDR driver handle +extern gdr_t ncclGdrCopy; + +#include "alloc.h" + +typedef struct gdr_mem_desc { + void *gdrDevMem; + void *gdrMap; + size_t gdrOffset; + size_t gdrMapSize; + gdr_mh_t gdrMh; +} gdr_mem_desc_t; + +#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) +static gdr_t ncclGdrInit() { + return NULL; +} + +template +static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) { + return ncclSuccess; +} + +template +static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { + return ncclSuccess; +} + +static ncclResult_t ncclGdrCudaFree(void* gdrHandle) { + return ncclSuccess; +} +#else +static gdr_t ncclGdrInit() { + int libMajor, libMinor, drvMajor, drvMinor; + gdr_t handle = NULL; + // Dynamically load the GDRAPI library symbols + if (wrap_gdr_symbols() == ncclSuccess) { + handle = wrap_gdr_open(); + + if (handle != NULL) { + ncclResult_t res; + + // Query the version of libgdrapi + NCCLCHECKGOTO(wrap_gdr_runtime_get_version(&libMajor, &libMinor), res, error); + + // Query the version of gdrdrv driver + NCCLCHECKGOTO(wrap_gdr_driver_get_version(handle, &drvMajor, &drvMinor), res, error); + + // Only support GDRAPI 2.1 and later + if (libMajor < 2 || (libMajor == 2 && libMinor < 1) || drvMajor < 2 || (drvMajor == 2 && drvMinor < 1)) { + goto error; + } + else + INFO(NCCL_INIT, "GDRCOPY enabled library %d.%d driver %d.%d", libMajor, libMinor, drvMajor, drvMinor); + } + } + return handle; +error: + if (handle != NULL) (void) wrap_gdr_close(handle); + return NULL; +} + +template +static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void** gdrHandle) { + gdr_info_t info; + size_t mapSize; + gdr_mh_t mh; + char *devMem; + void *gdrMap; + + mapSize = sizeof(T)*nelem; + + // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE + ALIGN_SIZE(mapSize, GPU_PAGE_SIZE); + // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too + NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1)); + uint64_t alignedAddr = (((uint64_t) devMem) + GPU_PAGE_OFFSET) & GPU_PAGE_MASK; + size_t align = alignedAddr - (uint64_t)devMem; + + //TRACE(NCCL_INIT, "GDRCOPY: Pin buffer 0x%lx (%p) align %zi size %zi", alignedAddr, devMem, align, mapSize); + NCCLCHECK(wrap_gdr_pin_buffer(ncclGdrCopy, alignedAddr, mapSize, 0, 0, &mh)); + + NCCLCHECK(wrap_gdr_map(ncclGdrCopy, mh, &gdrMap, mapSize)); + //TRACE(NCCL_INIT, "GDRCOPY : mapped %p (0x%lx) at %p", devMem, alignedAddr, gdrMap); + + NCCLCHECK(wrap_gdr_get_info(ncclGdrCopy, mh, &info)); + + // Will offset ever be non zero ? + ssize_t off = info.va - alignedAddr; + + gdr_mem_desc_t* md; + NCCLCHECK(ncclCalloc(&md, 1)); + md->gdrDevMem = devMem; + md->gdrMap = gdrMap; + md->gdrMapSize = mapSize; + md->gdrOffset = off+align; + md->gdrMh = mh; + *gdrHandle = md; + + *ptr = (T *)((char *)gdrMap+off); + if (devPtr) *devPtr = (T *)(devMem+off+align); + + TRACE(NCCL_INIT, "GDRCOPY : allocated devMem %p gdrMap %p offset %lx mh %lx mapSize %zi at %p", + md->gdrDevMem, md->gdrMap, md->gdrOffset, md->gdrMh.h, md->gdrMapSize, *ptr); + + return ncclSuccess; +} + +template +static ncclResult_t ncclGdrCudaCopy(void *gdrHandle, T* dst, T* src, size_t nelem) { + gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; + NCCLCHECK(wrap_gdr_copy_to_mapping(md->gdrMh, dst, src, nelem*sizeof(T))); + return ncclSuccess; +} + +static ncclResult_t ncclGdrCudaFree(void* gdrHandle) { + gdr_mem_desc_t *md = (gdr_mem_desc_t*)gdrHandle; + NCCLCHECK(wrap_gdr_unmap(ncclGdrCopy, md->gdrMh, md->gdrMap, md->gdrMapSize)); + NCCLCHECK(wrap_gdr_unpin_buffer(ncclGdrCopy, md->gdrMh)); + CUDACHECK(hipFree(md->gdrDevMem)); + free(md); + + return ncclSuccess; +} +#endif + +#endif // End include guard diff --git a/src/include/graph.h b/src/include/graph.h index 893db11814..cfdf6359d5 100644 --- a/src/include/graph.h +++ b/src/include/graph.h @@ -1,5 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -28,7 +29,7 @@ ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm); // Query topology -ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int* net); +ncclResult_t ncclTopoGetNetDev(struct ncclTopoSystem* system, int rank, struct ncclTopoGraph* graph, int channelId, int rr, int* net); ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_t id2, int* p2p, int *read, int* intermediateRank); ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* topo, int64_t busId, int netDev, int read, int* useGdr); @@ -93,13 +94,11 @@ struct ncclTopoRanks { }; ncclResult_t ncclTopoPreset(struct ncclComm* comm, - struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph, + struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoRanks* topoRanks); ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, - struct ncclTopoRanks** allTopoRanks, int* rings, int nc); - -ncclResult_t ncclTopoConnectCollNet(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, int rank); + struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph* collNetGraph, int nc); ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph* treeGraph, struct ncclTopoGraph* ringGraph, struct ncclTopoGraph* collNetGraph); #include "info.h" diff --git a/src/include/group.h b/src/include/group.h index ab75d01a1c..dec32b6476 100644 --- a/src/include/group.h +++ b/src/include/group.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2017, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/ibvwrap.h b/src/include/ibvwrap.h index 0943f9962c..4ec1ac6d4e 100644 --- a/src/include/ibvwrap.h +++ b/src/include/ibvwrap.h @@ -4,7 +4,7 @@ * Copyright (c) 2005, 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2005 PathScale, Inc. All rights reserved. * - * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -1089,7 +1089,7 @@ static inline int ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struc static inline ncclResult_t wrap_ibv_post_send(struct ibv_qp *qp, struct ibv_send_wr *wr, struct ibv_send_wr **bad_wr) { int ret = qp->context->ops.post_send(qp, wr, bad_wr); /*returns 0 on success, or the value of errno on failure (which indicates the failure reason)*/ if (ret != IBV_SUCCESS) { - WARN("ibv_post_send() failed with error %s", strerror(ret)); + WARN("ibv_post_send() failed with error %s, Bad WR %p, First WR %p", strerror(ret), wr, *bad_wr); return ncclSystemError; } return ncclSuccess; diff --git a/src/include/info.h b/src/include/info.h index 6b71492bd1..a4856893bf 100644 --- a/src/include/info.h +++ b/src/include/info.h @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -19,8 +19,7 @@ typedef enum { ncclPatternTreeUp, ncclPatternTreeDown, ncclPatternTreeUpDown, - ncclPatternCollTreeUp, - ncclPatternCollTreeDown + ncclPatternCollTreeUpDown } ncclPattern_t; // Used to pass NCCL call information between functions @@ -50,6 +49,8 @@ struct ncclInfo { int nchunksPerLoop; ssize_t sendbytes; ssize_t recvbytes; + int recvChunkSize; + int sendChunkSize; uint32_t delta; int channelId; }; diff --git a/src/include/net.h b/src/include/net.h index e78a3cd088..2c9a5ced9c 100644 --- a/src/include/net.h +++ b/src/include/net.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -53,11 +53,12 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { ncclNetHandle_t handle; void* gpuPtr = NULL; void* mHandle = NULL; - NCCLCHECK(ncclNetListen(dev, &handle, &lComm)); - NCCLCHECK(ncclNetConnect(dev, &handle, &sComm)); - NCCLCHECK(ncclNetAccept(lComm, &rComm)); - CUDACHECK(hipExtMallocWithFlags(&gpuPtr, GPU_BUF_SIZE, hipDeviceMallocFinegrained)); + ncclResult_t ret; ncclDebugNoWarn = NCCL_NET; + NCCLCHECKGOTO(ncclNetListen(dev, &handle, &lComm), ret, cleanup1); + NCCLCHECKGOTO(ncclNetConnect(dev, &handle, &sComm), ret, cleanup2); + NCCLCHECKGOTO(ncclNetAccept(lComm, &rComm), ret, cleanup3); + CUDACHECKGOTO(hipMalloc(&gpuPtr, GPU_BUF_SIZE), ret, cleanup4); if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) { NCCLCHECK(ncclNetDeregMr(sComm, mHandle)); NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle)); @@ -66,9 +67,13 @@ static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) { } ncclDebugNoWarn = 0; CUDACHECK(hipFree(gpuPtr)); +cleanup4: NCCLCHECK(ncclNetCloseRecv(rComm)); +cleanup3: NCCLCHECK(ncclNetCloseSend(sComm)); +cleanup2: NCCLCHECK(ncclNetCloseListen(lComm)); +cleanup1: break; } return ncclSuccess; diff --git a/src/include/param.h b/src/include/param.h index c295c2c89d..6303ef8959 100644 --- a/src/include/param.h +++ b/src/include/param.h @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/proxy.h b/src/include/proxy.h index 2e7773e964..a2ff39c0a7 100644 --- a/src/include/proxy.h +++ b/src/include/proxy.h @@ -1,5 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -14,48 +15,67 @@ enum ncclProxyOpState { ncclProxyOpNone, ncclProxyOpReady, ncclProxyOpProgress } struct ncclProxyArgs; typedef ncclResult_t (*proxyProgressFunc_t)(struct ncclProxyArgs*); -struct ncclProxyArgs { - proxyProgressFunc_t progress; +#define NCCL_PROXY_MAX_SUBS MAXCHANNELS +static_assert(NCCL_MAX_WORK_ELEMENTS <= MAXCHANNELS, "Not enough sub space for max work elements"); + +struct ncclProxySubArgs { struct ncclChannel* channel; struct ncclConnector* connector; - size_t sendbytes; - size_t recvbytes; - int sliceSteps; - int chunkSteps; int nsteps; - uint64_t opCount; - int protocol; - int segment; // Only for profiling - ncclDataType_t dtype; - ncclRedOp_t redOp; - int state; // add component before this line -- it is left out during initialization + ssize_t sendbytes; + ssize_t recvbytes; + int sendChunkSize; + int recvChunkSize; + int delta; // Internal state + uint64_t base; uint64_t posted; - uint64_t received; // Only used by recv proxy to wait for flush. + uint64_t received; + uint64_t flushed; uint64_t transmitted; uint64_t done; uint64_t end; - uint64_t hdp_flushed; void* requests[NCCL_STEPS]; +}; + +struct ncclProxyArgs { + proxyProgressFunc_t progress; + struct ncclProxySubArgs subs[NCCL_PROXY_MAX_SUBS]; + int nsubs; + int done; + int sliceSteps; + int chunkSteps; + int chunkSize; + uint64_t opCount; + uint64_t commOpCount; + int protocol; + ncclDataType_t dtype; + ncclRedOp_t redOp; + ncclPattern_t pattern; + int root; + int state; + char* sharedBuff[NCCL_STEPS]; + int sharedSize[NCCL_STEPS]; + int idle; + uint64_t hdp_flushed; // Element linking pthread_mutex_t mutex; struct ncclProxyArgs* next; struct ncclProxyArgs* nextPeer; - struct ncclProxyArgs* nextGroup; struct ncclProxyArgs** proxyAppendPtr; }; struct ncclProxySharedBuffers { - int nslots; - int slotSize; - char* cudaBuff[2*MAXCHANNELS]; - int* cudaUsed[2*MAXCHANNELS]; - char* hostBuff[2*MAXCHANNELS]; - int* hostUsed[2*MAXCHANNELS]; + int size; + char* cudaBuff; + char* hostBuff; struct ncclProxyArgs* proxyAppend[2*MAXCHANNELS]; // Separate send and recv + // Collnet sharing is technically per device, but for now MAXDEVICES == MAXCHANNELS. + struct ncclProxyArgs* proxyAppendCollNet[2*MAXCHANNELS]; + void* collNetResources; }; struct ncclProxyPool; @@ -64,11 +84,16 @@ struct ncclProxyState { pthread_mutex_t opsMutex; pthread_mutex_t poolMutex; bool stop; - struct ncclProxySharedBuffers* sharedBuffs; - struct ncclProxyArgs* ops; - struct ncclProxyArgs* nextOps; + struct ncclProxySharedBuffers sharedBuffs; + struct ncclProxyArgs* ops; // Running operations, used by proxy thread + struct ncclProxyArgs* postedOps; // Posted operations, shared between proxy and main thread, locked with opsMutex + struct ncclProxyArgs* postedOpsEnd; + struct ncclProxyArgs* nextOps; // Pending operations, used by main thread (could still be cancelled) struct ncclProxyArgs* nextOpsEnd; - struct ncclProxyArgs* pool; + struct ncclProxyArgs* pool; // Free operations for main thread + struct ncclProxyArgs* poolFreed; // Freed operations by the progress thread + struct ncclProxyArgs* poolReturned; // Shared between main and progress thread, lock with poolMutex + struct ncclProxyPool* pools; }; @@ -80,15 +105,16 @@ enum proxyMode { proxyTo = 2 }; -ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks); -ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment); +ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks); +ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args); +ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args); ncclResult_t ncclProxyStart(struct ncclComm* comm); ncclResult_t ncclProxyCreate(struct ncclComm* comm); ncclResult_t ncclProxyDestroy(struct ncclComm* comm); ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr); -ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr); -ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr); +ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr); +ncclResult_t ncclProxySharedBuffersGetCollNet(struct ncclComm* comm, int cuda, int type, int slot, int channel, char** ptr); ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm); #include diff --git a/src/include/shm.h b/src/include/shm.h index 6a3e556c1e..0b93995089 100644 --- a/src/include/shm.h +++ b/src/include/shm.h @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/socket.h b/src/include/socket.h index aaf137713e..fb7f85d5c1 100644 --- a/src/include/socket.h +++ b/src/include/socket.h @@ -1,5 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/include/transport.h b/src/include/transport.h index 2ecc72748d..33ff43237a 100644 --- a/src/include/transport.h +++ b/src/include/transport.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -41,7 +41,7 @@ struct ncclConnect { }; struct ncclTransportComm { - ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId); + ncclResult_t (*setup)(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo*, struct ncclPeerInfo*, struct ncclConnect*, struct ncclConnector*, int channelId, int connIndex); ncclResult_t (*connect)(struct ncclComm* comm, struct ncclConnect*, int nranks, int rank, struct ncclConnector*); ncclResult_t (*free)(void*); ncclResult_t (*proxy)(struct ncclProxyArgs*); @@ -54,7 +54,10 @@ struct ncclTransport { struct ncclTransportComm recv; }; -ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend); -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph); +ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex); +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex); +enum { collNetRecv=0, collNetSend=1 }; +int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type); +ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail); #endif diff --git a/src/include/utils.h b/src/include/utils.h index 03dcd96b58..86ab3a2ad2 100644 --- a/src/include/utils.h +++ b/src/include/utils.h @@ -37,6 +37,4 @@ static long log2i(long n) { return l; } -int busIdToCudaDev(int64_t busId); - #endif diff --git a/src/init.cc b/src/init.cc index 80de771fe0..03e28e88b2 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information @@ -8,6 +8,7 @@ #include "nccl.h" #include "channel.h" #include "nvmlwrap.h" +#include "gdrwrap.h" #include "bootstrap.h" #include "transport.h" #include "group.h" @@ -123,15 +124,31 @@ ncclResult_t initNet() { return ncclSuccess; } +// GDRCOPY support: Off by default +NCCL_PARAM(GdrCopyEnable, "GDRCOPY_ENABLE", 0); + +// GDRCOPY support +gdr_t ncclGdrCopy = NULL; + +ncclResult_t initGdrCopy() { + if (ncclParamGdrCopyEnable() == 1) { + ncclGdrCopy = ncclGdrInit(); + } + return ncclSuccess; +} + NCCL_PARAM(CollNetEnable, "COLLNET_ENABLE", 0); pthread_mutex_t initLock = PTHREAD_MUTEX_INITIALIZER; static bool initialized = false; +static size_t maxLocalSizeBytes = 0; static ncclResult_t ncclInit() { if (initialized) return ncclSuccess; pthread_mutex_lock(&initLock); if (!initialized) { initEnv(); + initGdrCopy(); + maxLocalSizeBytes = ncclKernMaxLocalSize(); NCCLCHECK(initNet()); INFO(NCCL_INIT, "Using network %s", ncclNetName()); initialized = true; @@ -339,10 +356,15 @@ static ncclResult_t commFree(ncclComm_t comm) { if (comm->doneEvent != NULL) CUDACHECK(hipEventDestroy(comm->doneEvent)); + if (comm->intDoneEvent != NULL) + CUDACHECK(hipEventDestroy(comm->intDoneEvent)); + if (comm->launchMode == ncclComm::GROUP) { CUDACHECK(hipStreamDestroy(comm->groupStream)); } + ncclDestroyQueueInfo(comm->enqueueInfo); + // Last rank frees shared resources between threads int isLast; NCCLCHECK(ncclCpuBarrierIn(comm, &isLast)); @@ -380,6 +402,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { // the device we're on (failure cause #1) , better know it early. hipEvent_t doneEvent; CUDACHECK(hipEventCreateWithFlags(&doneEvent, hipEventDisableTiming)); + hipEvent_t intDoneEvent; + CUDACHECK(hipEventCreateWithFlags(&intDoneEvent, hipEventDisableTiming)); struct ncclComm* comm; NCCLCHECK(ncclCalloc(&comm, 1)); @@ -391,6 +415,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId); comm->doneEvent = doneEvent; + comm->intDoneEvent = intDoneEvent; comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; #if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__) comm->groupCudaStream = ncclParamGroupCudaStream(); @@ -429,6 +454,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->asyncOpCount = 0; comm->asyncTotalSize = 0; + NCCLCHECK(ncclCalloc(&comm->enqueueInfo, 1)); + comm->enqueueInfo->comm = comm; + comm->lastSetupNode = NULL; + comm->lastCudaGraphId = -1; + static_assert(MAXCHANNELS <= sizeof(*comm->connectSend)*8, "comm->connectSend must have enough bits for all channels"); static_assert(MAXCHANNELS <= sizeof(*comm->connectRecv)*8, "comm->connectRecv must have enough bits for all channels"); NCCLCHECK(ncclCalloc(&comm->connectSend, comm->nRanks)); @@ -567,11 +597,11 @@ ncclResult_t ncclCommSetIntra(struct ncclComm* comm, int rank, int ranks, struct int cgMdLaunch = 1; // Set CG Mode - comm->launchMode = ncclComm::GROUP; + comm->launchMode = ncclComm::PARALLEL; char* str = getenv("NCCL_LAUNCH_MODE"); if (str) INFO(NCCL_ENV, "NCCL_LAUNCH_MODE set by environment to %s", str); - if (comm->intraRanks == 1 || (str && strcmp(str, "PARALLEL") == 0)) { - comm->launchMode = ncclComm::PARALLEL; + if (str && strcmp(str, "GROUP") == 0) { + comm->launchMode = ncclComm::GROUP; } if (comm->launchMode == ncclComm::GROUP) { CUDACHECK(hipStreamCreateWithFlags(&comm->groupStream, hipStreamNonBlocking)); @@ -619,128 +649,6 @@ static ncclResult_t computeBuffSizes(struct ncclComm* comm) { return ncclSuccess; } -extern struct ncclTransport collNetTransport; - -// All ranks must participate in collNetSetup call -// type: 0 for send, 1 for recv -// return: 0 - unsupported, 1 - supported -// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails -static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int rank, int nranks, int masterRank, int masterPeer, int nMasters, int type) { - int rankInCollNet = -1; - int supported = 0; - int isMaster = (rank == masterRank) ? 1 : 0; - struct { - int collNetRank; - ncclConnect connect; - } sendrecvExchange; - - // check if we can connect to collnet, whose root is the nranks-th rank - struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks; - peerInfo->rank = nranks; - int ret = 1; - if (isMaster) { - NCCLCHECK(collNetTransport.canConnect(&ret, comm->topo, collNetGraph, myInfo, peerInfo)); - } - - // send master receives connect info from peer recv master - if (isMaster && type == 0) { - NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange))); - rankInCollNet = sendrecvExchange.collNetRank; - INFO(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer); - } - - // select - struct ncclPeer* root = channel->peers+nranks; - struct ncclConnector* conn = (type == 1) ? &root->recv : &root->send; - struct ncclTransportComm* transportComm = (type == 1) ? &(collNetTransport.recv) : &(collNetTransport.send); - conn->transportComm = transportComm; - // setup - struct ncclConnect myConnect; - if (isMaster && ret > 0) { - NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id)); - } - // prepare connect handles - ncclResult_t res; - struct { - int isMaster; - ncclConnect connect; - } *allConnects = NULL; - ncclConnect *masterConnects = NULL; - NCCLCHECK(ncclCalloc(&masterConnects, nMasters)); - if (type == 1) { // recv side: AllGather - // all ranks must participate - NCCLCHECK(ncclCalloc(&allConnects, nranks)); - allConnects[rank].isMaster = isMaster; - memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect)); - NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup); - // consolidate - int c = 0; - for (int r = 0; r < nranks; r++) { - if (allConnects[r].isMaster) { - memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect)); - if (r == rank) rankInCollNet = c; - c++; - } - } - } else { // send side : copy in connect info received from peer recv master - if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect)); - } - // connect - if (isMaster && ret > 0) { - NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup); - struct ncclPeer* devRoot = channel->devPeers+nranks; - struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send; - CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup); - } - // recv side sends connect info to send side - if (isMaster && type == 1) { - sendrecvExchange.collNetRank = rankInCollNet; - memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect)); - NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup); - INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer); - } - if (ret > 0) { - supported = 1; - } -cleanup: - if (allConnects != NULL) free(allConnects); - if (masterConnects != NULL) free(masterConnects); - return supported; -} - -static ncclResult_t checkCollNetSetup(struct ncclComm* comm, int rank, int collNetSetupFail) { - int nranks = comm->nRanks; - // AllGather collNet setup results - int* allGatherFailures; - NCCLCHECK(ncclCalloc(&allGatherFailures, nranks)); - allGatherFailures[rank] = collNetSetupFail; - NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int))); - for (int i=0; icollNetnChannels; r++) { - struct ncclChannel* channel = comm->channels+r; - struct ncclPeer* peer = channel->peers+nranks; - if (peer->send.transportResources && peer->send.transportComm) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources)); - if (peer->recv.transportResources && peer->recv.transportComm) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources)); - peer->send.transportResources = NULL; // avoid double free - peer->recv.transportResources = NULL; // avoid double free - } - // Set support to 0 - comm->collNetSupport = 0; - } else { - comm->collNetSupport = 1; - } - return ncclSuccess; -} - NCCL_PARAM(CrossNic, "CROSS_NIC", 2); NCCL_PARAM(GraphDumpFileRank, "GRAPH_DUMP_FILE_RANK", 0); @@ -857,7 +765,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; collNetGraph.crossNic = ncclParamCrossNic(); - collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; + collNetGraph.minChannels = 1; + collNetGraph.maxChannels = ringGraph.nChannels; NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); @@ -910,6 +819,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs)); } + // Determine CollNet support + if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1; + if (intraRanks > 8) { + if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node"); + comm->collNetSupport = 0; + } + if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) { if (rcclParamP2pNetDisable() == 0) { STORE(comm->p2pNet, 1); @@ -921,6 +837,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm // AllGather3 - begin struct ncclGraphInfo { int pattern; + int nChannels; int sameChannels; float speedIntra; float speedInter; @@ -929,9 +846,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm }; struct { - int cudaCompCap; - int fullCudaCompCap; - int nChannels; + int collNetSupport; int nc; struct ncclGraphInfo tree; struct ncclGraphInfo ring; @@ -942,39 +857,37 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm NCCLCHECK(ncclCalloc(&allGather3Data, nranks)); int idx; NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx)); - allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap; - allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = - std::min(treeGraph.nChannels, ringGraph.nChannels); - allGather3Data[rank].nc = comm->nChannels*2; - if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908) allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4); + allGather3Data[rank].nc = 2; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G)) - allGather3Data[rank].nc = comm->nChannels*4; - if (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count && (comm->topo->type & RCCL_TOPO_4P2H_ROME)) - allGather3Data[rank].nc = (comm->topo->nodes[NET].count > 3 ? 2 : 4)*comm->topo->nodes[NET].count; + allGather3Data[rank].nc = 4; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) - allGather3Data[rank].nc = comm->nChannels*6; + allGather3Data[rank].nc = 6; allGather3Data[rank].tree.pattern = treeGraph.pattern; + allGather3Data[rank].tree.nChannels = treeGraph.nChannels; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; allGather3Data[rank].tree.speedInter = treeGraph.speedInter; allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra; allGather3Data[rank].tree.typeInter = treeGraph.typeInter; allGather3Data[rank].ring.pattern = ringGraph.pattern; + allGather3Data[rank].ring.nChannels = ringGraph.nChannels; allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; allGather3Data[rank].ring.speedInter = ringGraph.speedInter; allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra; allGather3Data[rank].ring.typeInter = ringGraph.typeInter; allGather3Data[rank].collNet.pattern = collNetGraph.pattern; + allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels; allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels; allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra; allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter; allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra; allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter; + allGather3Data[rank].collNetSupport = comm->collNetSupport; - // CollNet channels are already duplicated - comm->collNetnChannels = 2*collNetGraph.nChannels; - NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks)); + comm->nChannels = (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count) + ? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels; + NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks)); NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data))); @@ -1005,24 +918,30 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm allTopoRanks[i] = &allGather3Data[i].topoRanks; nc = std::min(allGather3Data[i].nc, nc); // Make sure we align all ranks so that the tuning is consistent across ranks - treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels); + treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels); treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter); + ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter); + collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels); collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra); collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); + comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport); } + comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = + (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count) + ? std::min(treeGraph.nChannels, ringGraph.nChannels) : ringGraph.nChannels; if (comm->nChannels < nChannelsOrig) { // We started duplicating channels during Preset(), so we need to move the // duplicated channels since we have removed some. @@ -1031,15 +950,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm int *rings; NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); - - NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, nc)); - if (comm->nNodes > 1 && - ncclParamCollNetEnable() == 1 && - collNetSupport() && collNetGraph.nChannels) { - NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank)); - } else { - comm->collNetnChannels = 0; - } + NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc)); free(allTopoRanks); free(nodesTreePatterns); @@ -1076,46 +987,58 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm struct ncclChannel* channel = comm->channels+c; NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore); if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore); INFO(NCCL_INIT, "Connected all rings"); // Connect Trees for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up), ret, affinity_restore); - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore); INFO(NCCL_INIT, "Connected all trees"); // Check if we can setup CollNet - if (comm->nNodes > 1 && - ncclParamCollNetEnable() == 1 && - collNetSupport() && collNetGraph.nChannels) { - for (int c=comm->nChannels; ccollNetnChannels; c++) - NCCLCHECK(initChannel(comm, c));; - int logicChannels = comm->collNetnChannels/2; + if (comm->collNetSupport > 0) { int collNetSetupFail = 0; - const int recvIndex = 0; // recv GPU index is always 0 - const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern - for (int c=0; cchannels+logicChannels+c; - struct ncclChannel* channelSend = comm->channels+c; - NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, 1, &channelRecv->collTree.up, 1, channelRecv->collTree.down)); - NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, 1, channelSend->collTree.down, 1, &channelSend->collTree.up)); - const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex]; - const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex]; - if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1) - collNetSetupFail = 1; - else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1) - collNetSetupFail = 1; + // Find all head ranks + int nHeads = collNetGraph.nChannels; + int *heads; + NCCLCHECK(ncclCalloc(&heads, nHeads)); + // Head GPU index is always 0 + for (int c=0; clocalRanks+0]; + } + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + for (int h=0; hcollNetSupport) { + TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); + for (int c=0; cnChannels; c++) { + struct ncclChannel* channelRecv = comm->channels+c; + NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0)); + } + NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 0)); + for (int c=0; cnChannels; c++) { + struct ncclChannel* channelSend = comm->channels+c; + NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1)); + } + NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 1)); + INFO(NCCL_INIT, "rank %d Connected CollNet", rank); + } } TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); free(rings); @@ -1140,10 +1063,18 @@ affinity_restore: return ncclSuccess; } +NCCL_PARAM(SetStackSize, "SET_STACK_SIZE", 0); + ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int cudaDev) { ncclResult_t res; CUDACHECK(hipSetDevice(cudaDev)); + // Set the maximum kernel stack size of all kernels to avoid + // a CUDA memory reconfig on load (c.f. NVSHMEM issue) + //if (maxLocalSizeBytes > 0 && ncclParamSetStackSize() == 1) { + // TRACE(NCCL_INIT, "Setting hipLimitStackSize to %zi", maxLocalSizeBytes); + // CUDACHECKIGNORE(hipDeviceSetLimit(hipLimitStackSize, maxLocalSizeBytes)); + //} NCCLCHECKGOTO(commAlloc(newcomm, nranks, myrank), res, cleanup); NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); @@ -1184,6 +1115,7 @@ static ncclResult_t ncclCommInitRankDev(ncclComm_t* newcomm, int nranks, ncclUni } else { NCCLCHECKGOTO(ncclCommInitRankSync(newcomm, nranks, commId, myrank, cudaDev), res, end); } + end: if (ncclAsyncMode()) return ncclAsyncErrCheck(res); else return res; diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc index 716cd53408..285720f87b 100644 --- a/src/misc/argcheck.cc +++ b/src/misc/argcheck.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/misc/gdrwrap.cc b/src/misc/gdrwrap.cc new file mode 100644 index 0000000000..ed0c697946 --- /dev/null +++ b/src/misc/gdrwrap.cc @@ -0,0 +1,246 @@ +/************************************************************************* + * Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. + * + * See LICENSE.txt for license information + ************************************************************************/ + +#include "gdrwrap.h" + +#ifndef GDR_DIRECT +#include "core.h" + +static enum { gdrUninitialized, gdrInitializing, gdrInitialized, gdrError } gdrState = gdrUninitialized; + +/* Function pointers assigned from dlopen() */ +static gdr_t (*gdr_internal_open)(void); +static int (*gdr_internal_close)(gdr_t g); +static int (*gdr_internal_pin_buffer)(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle); +static int (*gdr_internal_unpin_buffer)(gdr_t g, gdr_mh_t handle); +static int (*gdr_internal_get_info)(gdr_t g, gdr_mh_t handle, gdr_info_t *info); +static int (*gdr_internal_map)(gdr_t g, gdr_mh_t handle, void **va, size_t size); +static int (*gdr_internal_unmap)(gdr_t g, gdr_mh_t handle, void *va, size_t size); +static void (*gdr_internal_runtime_get_version)(int *major, int *minor); +static void (*gdr_internal_driver_get_version)(gdr_t g, int *major, int *minor); +static int (*gdr_internal_copy_to_mapping)(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size); +static int (*gdr_internal_copy_from_mapping)(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size); + + +// Used to make the GDR library calls thread safe +pthread_mutex_t gdrLock = PTHREAD_MUTEX_INITIALIZER; + +#define GDRAPI_LIBNAME "libgdrapi.so" + +#define LOAD_SYM(handle, symbol, funcptr) do { \ + cast = (void**)&funcptr; \ + tmp = dlsym(handle, symbol); \ + if (tmp == NULL) { \ + WARN("dlsym failed on %s - %s", symbol, dlerror());\ + goto teardown; \ + } \ + *cast = tmp; \ + } while (0) + +#define LOAD_SYM_OPTIONAL(handle, symbol, funcptr) do {\ + cast = (void**)&funcptr; \ + tmp = dlsym(handle, symbol); \ + if (tmp == NULL) { \ + INFO(NCCL_INIT,"dlsym failed on %s, ignoring", symbol); \ + } \ + *cast = tmp; \ + } while (0) + +ncclResult_t wrap_gdr_symbols(void) { + if (gdrState == gdrInitialized) + return ncclSuccess; + if (gdrState == gdrError) + return ncclSystemError; + + if (__sync_bool_compare_and_swap(&gdrState, gdrUninitialized, gdrInitializing) == false) { + // Another thread raced in front of us. Wait for it to be done. + while (gdrState == gdrInitializing) pthread_yield(); + return (gdrState == gdrInitialized) ? ncclSuccess : ncclSystemError; + } + + static void* gdrhandle = NULL; + void* tmp; + void** cast; + + gdrhandle=dlopen(GDRAPI_LIBNAME, RTLD_NOW); + if (!gdrhandle) { + WARN("Failed to open %s", GDRAPI_LIBNAME); + goto teardown; + } + + /* Load the function pointers from the DL library image */ + LOAD_SYM(gdrhandle, "gdr_open", gdr_internal_open); + LOAD_SYM(gdrhandle, "gdr_close", gdr_internal_close); + LOAD_SYM(gdrhandle, "gdr_pin_buffer", gdr_internal_pin_buffer); + LOAD_SYM(gdrhandle, "gdr_unpin_buffer", gdr_internal_unpin_buffer); + LOAD_SYM(gdrhandle, "gdr_get_info", gdr_internal_get_info); + LOAD_SYM(gdrhandle, "gdr_map", gdr_internal_map); + LOAD_SYM(gdrhandle, "gdr_unmap", gdr_internal_unmap); + LOAD_SYM(gdrhandle, "gdr_runtime_get_version", gdr_internal_runtime_get_version); + LOAD_SYM(gdrhandle, "gdr_driver_get_version", gdr_internal_driver_get_version); + LOAD_SYM(gdrhandle, "gdr_copy_to_mapping", gdr_internal_copy_to_mapping); + LOAD_SYM(gdrhandle, "gdr_copy_from_mapping", gdr_internal_copy_from_mapping); + + gdrState = gdrInitialized; + return ncclSuccess; + +teardown: + gdr_internal_open = NULL; + gdr_internal_close = NULL; + gdr_internal_pin_buffer = NULL; + gdr_internal_unpin_buffer = NULL; + gdr_internal_get_info = NULL; + gdr_internal_map = NULL; + gdr_internal_unmap = NULL; + gdr_internal_runtime_get_version = NULL; + gdr_internal_driver_get_version = NULL; + gdr_internal_copy_to_mapping = NULL; + gdr_internal_copy_from_mapping = NULL; + + if (gdrhandle != NULL) dlclose(gdrhandle); + gdrState = gdrError; + return ncclSystemError; +} + + +gdr_t wrap_gdr_open(void) { + if (gdr_internal_open == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return NULL; + } + return gdr_internal_open(); +} + +ncclResult_t wrap_gdr_close(gdr_t g) { + if (gdr_internal_close == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + int ret = gdr_internal_close(g); + if (ret != 0) { + WARN("gdr_close() failed: %d", ret); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t wrap_gdr_pin_buffer(gdr_t g, unsigned long addr, size_t size, uint64_t p2p_token, uint32_t va_space, gdr_mh_t *handle) { + if (gdr_internal_pin_buffer == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + int ret; + GDRLOCKCALL(gdr_internal_pin_buffer(g, addr, size, p2p_token, va_space, handle), ret); + if (ret != 0) { + WARN("gdr_pin_buffer(addr %lx, size %zi) failed: %d", addr, size, ret); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t wrap_gdr_unpin_buffer(gdr_t g, gdr_mh_t handle) { + if (gdr_internal_unpin_buffer == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + int ret; + GDRLOCKCALL(gdr_internal_unpin_buffer(g, handle), ret); + if (ret != 0) { + WARN("gdr_unpin_buffer(handle %lx) failed: %d", handle.h, ret); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t wrap_gdr_get_info(gdr_t g, gdr_mh_t handle, gdr_info_t *info) { + if (gdr_internal_get_info == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + int ret; + GDRLOCKCALL(gdr_internal_get_info(g, handle, info), ret); + if (ret != 0) { + WARN("gdr_get_info(handle %lx) failed: %d", handle.h, ret); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t wrap_gdr_map(gdr_t g, gdr_mh_t handle, void **va, size_t size) { + if (gdr_internal_map == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + int ret; + GDRLOCKCALL(gdr_internal_map(g, handle, va, size), ret); + if (ret != 0) { + WARN("gdr_map(handle %lx, size %zi) failed: %d", handle.h, size, ret); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t wrap_gdr_unmap(gdr_t g, gdr_mh_t handle, void *va, size_t size) { + if (gdr_internal_unmap == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + int ret; + GDRLOCKCALL(gdr_internal_unmap(g, handle, va, size), ret); + if (ret != 0) { + WARN("gdr_unmap(handle %lx, va %p, size %zi) failed: %d", handle.h, va, size, ret); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t wrap_gdr_runtime_get_version(int *major, int *minor) { + if (gdr_internal_runtime_get_version == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + gdr_internal_runtime_get_version(major, minor); + return ncclSuccess; +} + +ncclResult_t wrap_gdr_driver_get_version(gdr_t g, int *major, int *minor) { + if (gdr_internal_driver_get_version == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + gdr_internal_driver_get_version(g, major, minor); + return ncclSuccess; +} + +ncclResult_t wrap_gdr_copy_to_mapping(gdr_mh_t handle, void *map_d_ptr, const void *h_ptr, size_t size) { + if (gdr_internal_copy_to_mapping == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + int ret; + GDRLOCKCALL(gdr_internal_copy_to_mapping(handle, map_d_ptr, h_ptr, size), ret); + if (ret != 0) { + WARN("gdr_copy_to_mapping(handle %lx, map_d_ptr %p, h_ptr %p, size %zi) failed: %d", handle.h, map_d_ptr, h_ptr, size, ret); + return ncclSystemError; + } + return ncclSuccess; +} + +ncclResult_t wrap_gdr_copy_from_mapping(gdr_mh_t handle, void *h_ptr, const void *map_d_ptr, size_t size) { + if (gdr_internal_copy_from_mapping == NULL) { + WARN("GDRCOPY lib wrapper not initialized."); + return ncclInternalError; + } + int ret; + GDRLOCKCALL(gdr_internal_copy_from_mapping(handle, h_ptr, map_d_ptr, size), ret); + if (ret != 0) { + WARN("gdr_copy_from_mapping(handle %lx, h_ptr %p, map_d_ptr %p, size %zi) failed: %d", handle.h, h_ptr, map_d_ptr, size, ret); + return ncclSystemError; + } + return ncclSuccess; +} + +#endif /* !GDR_DIRECT */ diff --git a/src/misc/nvmlwrap_stub.cc b/src/misc/nvmlwrap_stub.cc index eed704de49..41485df101 100644 --- a/src/misc/nvmlwrap_stub.cc +++ b/src/misc/nvmlwrap_stub.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/misc/utils.cc b/src/misc/utils.cc index 29b618196c..74190de6b7 100644 --- a/src/misc/utils.cc +++ b/src/misc/utils.cc @@ -1,6 +1,6 @@ /************************************************************************* * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ diff --git a/src/nccl.h.in b/src/nccl.h.in index 06b51d1871..8b99d0f096 100644 --- a/src/nccl.h.in +++ b/src/nccl.h.in @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -17,7 +17,7 @@ #define NCCL_SUFFIX "${NCCL_SUFFIX}" #define NCCL_VERSION_CODE ${NCCL_VERSION} -#define NCCL_VERSION(X,Y,Z) ((X) * 1000 + (Y) * 100 + (Z)) +#define NCCL_VERSION(X,Y,Z) (((X) >= 2 && (Y) >= 9) ? (X) * 10000 + (Y) * 100 + (Z) : (X) * 1000 + (Y) * 100 + (Z)) #define RCCL_BFLOAT16 1 #define RCCL_GATHER_SCATTER 1 diff --git a/src/proxy.cc b/src/proxy.cc index 84a632e724..6a63919e48 100644 --- a/src/proxy.cc +++ b/src/proxy.cc @@ -1,15 +1,15 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ #include "comm.h" #include "info.h" -#include "graph.h" #include "collectives.h" -enum { proxyRecv=0, proxySend=1, p2pProxyRecv=2, p2pProxySend=3 }; +enum { proxyRecv=0, proxySend=1 }; static bool NeedProxy(int type, int pattern, int root, struct ncclRing* ring, int nranks) { if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice) return true; @@ -34,26 +34,32 @@ struct ncclProxyPool { static ncclResult_t allocateArgs(struct ncclComm* comm, struct ncclProxyArgs** argsptr) { struct ncclProxyState* state = &comm->proxyState; struct ncclProxyArgs* elem; - pthread_mutex_lock(&state->poolMutex); if (state->pool == NULL) { - // Allocate a new pool of elements - struct ncclProxyPool* newPool; - NCCLCHECK(ncclCalloc(&newPool, 1)); - struct ncclProxyArgs* newElems = newPool->elems; - // Chain newly allocated elements - for (int i=0; ipoolReturned) { + pthread_mutex_lock(&state->poolMutex); + state->pool = state->poolReturned; + state->poolReturned = NULL; + pthread_mutex_unlock(&state->poolMutex); + } else { + // Allocate a new pool of elements + struct ncclProxyPool* newPool; + NCCLCHECK(ncclCalloc(&newPool, 1)); + struct ncclProxyArgs* newElems = newPool->elems; + // Chain newly allocated elements + for (int i=0; ipool = newElems; + // Save the pool memory block for later resource release + newPool->next = state->pools; + state->pools = newPool; } - // Add them all to the pool list - state->pool = newElems; - // Save the pool memory block for later resource release - newPool->next = state->pools; - state->pools = newPool; } elem = state->pool; state->pool = state->pool->next; - pthread_mutex_unlock(&state->poolMutex); - elem->next = elem->nextPeer = elem->nextGroup = NULL; + elem->next = elem->nextPeer = NULL; *argsptr = elem; return ncclSuccess; } @@ -75,23 +81,18 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) { WARN("Active list loop at element %ld", OP_INDEX(op)); } op->idle |= OP_SEEN; - printf("[%ld]", OP_INDEX(op)); + printf("[%ld(%ld/%d)]", OP_INDEX(op), op->opCount, op->nsubs); if (op->nextPeer) { printf("(%ld)", OP_INDEX(op->nextPeer)); struct ncclProxyArgs* n = op->nextPeer; n->idle |= OP_SEEN; - while (n->nextGroup || n->nextPeer) { - n = n->nextGroup ? n->nextGroup : n->nextPeer; + while (n->nextPeer) { + n = n->nextPeer; n->idle |= OP_SEEN; } } - if (op->nextGroup) { - printf("--G->"); - op = op->nextGroup; - } else { - printf("--N->"); - op = op->next; - } + printf("->"); + op = op->next; } printf("[X]\n"); @@ -128,46 +129,62 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) { return ncclSuccess; } -static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args, int shared) { +static ncclResult_t ProxyAppend(struct ncclProxyState* state, struct ncclProxyArgs* args) { struct ncclProxyArgs* proxyAppend = *args->proxyAppendPtr; + int shared = args->subs[0].connector->conn.shared; if (proxyAppend) { if (shared && proxyAppend->opCount == args->opCount) { + if ((proxyAppend->sliceSteps != args->sliceSteps) || + (proxyAppend->chunkSteps != args->chunkSteps) || + (proxyAppend->protocol != args->protocol) || + (proxyAppend->dtype != args->dtype) || + (proxyAppend->redOp != args->redOp)) { + WARN("Proxy append mismatch"); + return ncclInternalError; + } + if (proxyAppend->nsubs >= NCCL_PROXY_MAX_SUBS) { + WARN("Proxy append out of bound"); + return ncclInternalError; + } + memcpy(proxyAppend->subs+proxyAppend->nsubs, args->subs, sizeof(struct ncclProxySubArgs)); + proxyAppend->nsubs++; args->next = proxyAppend->next; - proxyAppend->next = NULL; - proxyAppend->nextGroup = args; - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group, prevGroup %5ld, next %5ld : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend), OP_INDEX(args->next)); + // Free args as we merged them + args->next = state->poolFreed; + state->poolFreed = args; + DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as group with %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend)); } else { proxyAppend->nextPeer = args; - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld : \n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend)); + DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld/%5ld) as nextPeer of %5ld\n", OP_INDEX(args), shared, proxyAppend->opCount, args->opCount, OP_INDEX(proxyAppend)); + *(args->proxyAppendPtr) = args; } } else { // Nothing running for that peer. Add to the list if (state->ops == NULL) { // Create the list - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element : \n", OP_INDEX(args), shared, args->opCount); + DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as first element\n", OP_INDEX(args), shared, args->opCount); state->ops = args; } else { // Append element at the end of the list struct ncclProxyArgs* last = state->ops; - while (last->nextGroup || last->next) last = last->nextGroup ? last->nextGroup : last->next; + while (last->next) last = last->next; last->next = args; - DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element : \n", OP_INDEX(args),shared, args->opCount); + DEBUG_PROXY_PRINT("Insert %5ld (%d/%5ld) as last element\n", OP_INDEX(args),shared, args->opCount); } + *(args->proxyAppendPtr) = args; } - *(args->proxyAppendPtr) = args; return ncclSuccess; } -static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) { +static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args, int connIndex) { if (peer < 0) return ncclSuccess; - struct ncclPeer* peerComm = args->channel->peers+peer; - struct ncclConnector* connector = type < p2pProxyRecv ? (type == proxyRecv ? &peerComm->recv : &peerComm->send) - : (type == p2pProxyRecv ? &peerComm->p2pRecv : &peerComm->p2pSend); + struct ncclChannel* channel = args->subs[0].channel; + struct ncclPeer* peerComm = channel->peers+peer; + struct ncclConnector* connector = type == proxyRecv ? peerComm->recv+connIndex : peerComm->send+connIndex; if (connector->transportComm == NULL) { - WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank, - type < p2pProxyRecv ? (type == proxyRecv ? "recv" : "send") : (type == p2pProxyRecv ? "p2pRecv" : "p2pSend"), - peer, args->channel->id); + WARN("Rank %d has no transport for %s peer %d on channel %d", connector->comm->rank, + type == proxyRecv ? "recv" : "send", peer, channel->id); return ncclInternalError; } if (connector->transportComm->proxy == NULL) return ncclSuccess; @@ -176,14 +193,10 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) { struct ncclProxyArgs* op; NCCLCHECK(allocateArgs(connector->comm, &op)); memcpy(op, args, sizeof(struct ncclProxyArgs)); - op->connector = connector; + op->subs[0].connector = connector; op->progress = connector->transportComm->proxy; op->state = ncclProxyOpReady; - - op->proxyAppendPtr = - connector->conn.shared ? - state->sharedBuffs->proxyAppend+2*args->channel->id+type : // Shared buffers - &connector->proxyAppend; // Dedicated buffers + op->proxyAppendPtr = connector->proxyAppendPtr; if (state->nextOps == NULL) state->nextOps = op; else state->nextOpsEnd->next = op; @@ -191,120 +204,131 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) { return ncclSuccess; } -ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int pattern, int root, int nranks) { +ncclResult_t ncclProxySaveColl(struct ncclProxyArgs* args, int nranks) { + struct ncclChannel* channel = args->subs[0].channel; + int pattern = args->pattern; if (pattern == ncclPatternRing || pattern == ncclPatternRingTwice || pattern == ncclPatternPipelineFrom || pattern == ncclPatternPipelineTo) { - struct ncclRing* ring = &args->channel->ring; - if (NeedProxy(proxyRecv, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args)); - if (NeedProxy(proxySend, pattern, root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args)); + struct ncclRing* ring = &channel->ring; + if (NeedProxy(proxyRecv, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxyRecv, ring->prev, args, 0)); + if (NeedProxy(proxySend, pattern, args->root, ring, nranks)) NCCLCHECK(SaveProxy(proxySend, ring->next, args, 0)); } if (pattern == ncclPatternTreeUp || pattern == ncclPatternTreeUpDown) { // Tree up - struct ncclTree* tree = &args->channel->tree; - for (int i=0; idown[i], args)); - NCCLCHECK(SaveProxy(proxySend, tree->up, args)); + struct ncclTree* tree = &channel->tree; + for (int i=0; idown[i], args, 0)); + NCCLCHECK(SaveProxy(proxySend, tree->up, args, 0)); } if (pattern == ncclPatternTreeDown || pattern == ncclPatternTreeUpDown) { // Tree down - struct ncclTree* tree = &args->channel->tree; - for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args)); - NCCLCHECK(SaveProxy(proxyRecv, tree->up, args)); + struct ncclTree* tree = &channel->tree; + for (int i=0; i< NCCL_MAX_TREE_ARITY; i++) NCCLCHECK(SaveProxy(proxySend, tree->down[i], args, 0)); + NCCLCHECK(SaveProxy(proxyRecv, tree->up, args, 0)); } - if (pattern == ncclPatternCollTreeUp) { + if (pattern == ncclPatternCollTreeUpDown) { // CollTree up - struct ncclTree* tree = &args->channel->collTree; - NCCLCHECK(SaveProxy(proxyRecv, tree->down[0], args)); - NCCLCHECK(SaveProxy(proxySend, tree->up, args)); - } - if (pattern == ncclPatternCollTreeDown) { + NCCLCHECK(SaveProxy(proxySend, channel->collTree.out, args, 1)); // For CollTree up, we are using push // CollTree down - struct ncclTree* tree = &args->channel->collTree; - NCCLCHECK(SaveProxy(proxySend, tree->down[0], args)); - NCCLCHECK(SaveProxy(proxyRecv, tree->up, args)); + NCCLCHECK(SaveProxy(proxyRecv, channel->collTree.out, args, 0)); } return ncclSuccess; } -ncclResult_t ncclProxySaveP2p(struct ncclInfo* info, struct ncclChannel* channel, int segment) { - struct ncclProxyArgs args; - memset(&args, 0, sizeof(struct ncclProxyArgs)); - args.channel = channel; - args.sliceSteps = 1; - args.chunkSteps = 1; - args.protocol = NCCL_PROTO_SIMPLE; - args.segment = segment; - args.opCount = channel->workFifoTail-1; - args.dtype = info->datatype; +ncclResult_t ncclProxyComputeP2p(struct ncclInfo* info, struct ncclProxyArgs* args) { + memset(args, 0, sizeof(struct ncclProxyArgs)); + int channelId = info->channelId; + args->nsubs = 1; + struct ncclProxySubArgs* sub = args->subs; + + struct ncclChannel* channel = info->comm->channels+channelId; + sub->channel = channel; + args->sliceSteps = 1; + args->chunkSteps = 1; + args->protocol = NCCL_PROTO_SIMPLE; + args->dtype = info->datatype; + sub->delta = info->delta; + sub->recvbytes = info->recvbytes; + sub->sendbytes = info->sendbytes; + + int stepSize = info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR; + info->recvChunkSize = stepSize; + info->sendChunkSize = stepSize; + if (info->delta > 0 && info->recvbytes >= 0) { int peerrecv = (info->comm->nRanks+info->comm->rank-info->delta)%info->comm->nRanks; - args.nsteps = DIVUP(info->recvbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR); - if (args.nsteps == 0) args.nsteps = 1; - args.recvbytes = info->recvbytes; - args.sendbytes = 0; - NCCLCHECK(SaveProxy(LOAD(info->comm->p2pNet) ? p2pProxyRecv : proxyRecv, peerrecv, &args)); + if (channel->peers[peerrecv].recv[0].transportComm && channel->peers[peerrecv].recv[0].transportComm->proxy) { + // Tune chunk size for the network + if (info->recvbytes < stepSize) info->recvChunkSize /= 4; + else if (info->recvbytes < 8*stepSize) info->recvChunkSize /= 2; + } + sub->recvChunkSize = info->recvChunkSize; } if (info->delta > 0 && info->sendbytes >= 0) { int peersend = (info->comm->rank+info->delta)%info->comm->nRanks; - args.nsteps = DIVUP(info->sendbytes, info->comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS/SENDRECV_SLICEFACTOR); - if (args.nsteps == 0) args.nsteps = 1; - args.sendbytes = info->sendbytes; - args.recvbytes = 0; - NCCLCHECK(SaveProxy(LOAD(info->comm->p2pNet) ? p2pProxySend : proxySend, peersend, &args)); + if (channel->peers[peersend].send[0].transportComm && channel->peers[peersend].send[0].transportComm->proxy) { + // Tune chunk size for the network + if (info->sendbytes < stepSize) info->sendChunkSize /= 4; + else if (info->sendbytes < 8*stepSize) info->sendChunkSize /= 2; + } + sub->sendChunkSize = info->sendChunkSize; } return ncclSuccess; } -static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr, struct ncclProxyArgs** prevGroupPtr) { +ncclResult_t ncclProxySaveP2p(struct ncclComm* comm, struct ncclProxyArgs* args) { + struct ncclProxySubArgs* sub = args->subs; + struct ncclChannel* channel = sub->channel; + args->opCount = channel->workFifoTail-1; + args->commOpCount = comm->opCount; + const ssize_t recvbytesOrig = sub->recvbytes; + const ssize_t sendbytesOrig = sub->sendbytes; + if (sub->delta > 0 && recvbytesOrig >= ssize_t(0)) { + int peerrecv = (comm->nRanks+comm->rank-sub->delta)%comm->nRanks; + sub->recvbytes = recvbytesOrig; + sub->sendbytes = 0; + sub->nsteps = DIVUP(sub->recvbytes, sub->recvChunkSize); + if (sub->nsteps == 0) sub->nsteps = 1; + NCCLCHECK(SaveProxy(proxyRecv, peerrecv, args, NCCL_CONN_IDX_P2P)); + } + if (sub->delta > 0 && sendbytesOrig >= ssize_t(0)) { + int peersend = (comm->rank+sub->delta)%comm->nRanks; + sub->sendbytes = sendbytesOrig; + sub->recvbytes = 0; + sub->nsteps = DIVUP(sub->sendbytes, sub->sendChunkSize); + if (sub->nsteps == 0) sub->nsteps = 1; + NCCLCHECK(SaveProxy(proxySend, peersend, args, NCCL_CONN_IDX_P2P)); + } + // Reset proxy args for potentially multiple cuda graph launches + // It is safe as long as SaveProxy copies contents of args to op + sub->recvbytes = recvbytesOrig; + sub->sendbytes = sendbytesOrig; + return ncclSuccess; +} + +static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs** opPtr, struct ncclProxyArgs** prevOpPtr) { struct ncclProxyArgs* freeOp = *opPtr; - DEBUG_PROXY_PRINT("Remove %ld/%ld -> %ld -> %ld/%ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(*prevGroupPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next), OP_INDEX(freeOp->nextGroup)); - if (*prevGroupPtr && *prevOpPtr) return ncclInternalError; - if (freeOp->nextGroup) { - // Part of a group : remove the element - struct ncclProxyArgs* next = freeOp->nextGroup; - *opPtr = next; - if (*prevGroupPtr) { - (*prevGroupPtr)->nextGroup = next; - } else if (*prevOpPtr) { + DEBUG_PROXY_PRINT("Remove %ld -> %ld -> %ld\n", OP_INDEX(*prevOpPtr), OP_INDEX(freeOp), OP_INDEX(freeOp->next)); + struct ncclProxyArgs* next = freeOp->next; + *opPtr = next; + if (freeOp->nextPeer) { + // replace op by nextPeer + struct ncclProxyArgs* nextPeer = freeOp->nextPeer; + if (*prevOpPtr) { + (*prevOpPtr)->next = nextPeer; + } else { + state->ops = nextPeer; + } + nextPeer->next = next; + *(prevOpPtr) = nextPeer; + } else { + *(freeOp->proxyAppendPtr) = NULL; + if (*prevOpPtr) { (*prevOpPtr)->next = next; } else { state->ops = next; } - } else { - struct ncclProxyArgs* next = freeOp->next; - *opPtr = next; - if ((*prevGroupPtr)) { - (*prevGroupPtr)->next = next; - (*prevGroupPtr)->nextGroup = NULL; - (*prevGroupPtr)->nextPeer = freeOp->nextPeer; - if (*(freeOp->proxyAppendPtr) == freeOp) *(freeOp->proxyAppendPtr) = *prevGroupPtr; - (*prevOpPtr) = *prevGroupPtr; - (*prevGroupPtr) = NULL; - } else { - if (freeOp->nextPeer) { - // replace op by nextPeer - struct ncclProxyArgs* nextPeer = freeOp->nextPeer; - if (*prevOpPtr) { - (*prevOpPtr)->next = nextPeer; - } else { - state->ops = nextPeer; - } - struct ncclProxyArgs* lastGroup = nextPeer; - while (lastGroup->nextGroup) lastGroup = lastGroup->nextGroup; - lastGroup->next = next; - *(prevOpPtr) = lastGroup; - } else { - *(freeOp->proxyAppendPtr) = NULL; - if (*prevOpPtr) { - (*prevOpPtr)->next = next; - } else { - state->ops = next; - } - } - } } - pthread_mutex_lock(&state->poolMutex); - freeOp->next = state->pool; - state->pool = freeOp; - pthread_mutex_unlock(&state->poolMutex); + freeOp->next = state->poolFreed; + state->poolFreed = freeOp; DEBUG_PROXY_PRINT("Removed %5ld (%5ld) : ", OP_INDEX(freeOp), OP_INDEX(*freeOp->proxyAppendPtr)); NCCLCHECK(dumpProxyState(state)); return ncclSuccess; @@ -312,33 +336,81 @@ static ncclResult_t removeOp(struct ncclProxyState* state, struct ncclProxyArgs* static ncclResult_t progressOps(struct ncclProxyState* state, struct ncclProxyArgs** opsPtr, int* idle, struct ncclComm* comm) { struct ncclProxyArgs* prevOp = NULL; - struct ncclProxyArgs* prevGroup = NULL; struct ncclProxyArgs* op = *opsPtr; while (op) { if (op->state == ncclProxyOpNone) return ncclInternalError; - // opCount >= lastOpCount are part of an ongoing GroupStart/GroupEnd that hasn't started - // yet and might be cancelled before they even start. Hold on on those. - if (op->opCount < comm->lastOpCount) { - NCCLCHECK(op->progress(op)); - *idle &= op->idle; - } + NCCLCHECK(op->progress(op)); + *idle &= op->idle; if (op->state == ncclProxyOpNone) { - NCCLCHECK(removeOp(state, &op, &prevOp, &prevGroup)); + NCCLCHECK(removeOp(state, &op, &prevOp)); } else { - if (op->nextGroup) { - prevGroup = op; - prevOp = NULL; - op = op->nextGroup; - } else { - prevOp = op; - prevGroup = NULL; - op = op->next; - } + prevOp = op; + op = op->next; } } return ncclSuccess; } +ncclResult_t ncclProxyAppendPosted(struct ncclProxyState* state) { + // Return any freed element first + if (state->poolFreed) { + struct ncclProxyArgs* end = state->poolFreed; + while (end->next) end = end->next; + pthread_mutex_lock(&state->poolMutex); + end->next = state->poolReturned; + state->poolReturned = state->poolFreed; + pthread_mutex_unlock(&state->poolMutex); + state->poolFreed = NULL; + } + + // Then wait until we have new work to do + pthread_mutex_lock(&state->opsMutex); + while (state->postedOps == NULL) { + if (state->stop) return ncclSuccess; + pthread_cond_wait(&state->cond, &state->opsMutex); + } + + // Sort operations as we append them : collectives and + // receives first, then sends. + + struct ncclProxyArgs* next, *prev = NULL, *op = state->postedOps; + int commOpCount = op->commOpCount; + while (op && op->commOpCount == commOpCount) { + next = op->next; + if (op->subs[0].sendbytes) { + if (prev) prev->next = next; + else state->postedOps = next; + op->next = NULL; + NCCLCHECK(ProxyAppend(state, op)); + } else prev = op; + op = next; + } + op = state->postedOps; + while (op && op->commOpCount == commOpCount) { + next = op->next; + op->next = NULL; + NCCLCHECK(ProxyAppend(state, op)); + op = next; + } + state->postedOps = op; + if (op == NULL) state->postedOpsEnd = NULL; + NCCLCHECK(dumpProxyState(state)); + pthread_mutex_unlock(&state->opsMutex); + + if (state->poolFreed) { + struct ncclProxyArgs* end = state->poolFreed; + while (end->next) end = end->next; + pthread_mutex_lock(&state->poolMutex); + end->next = state->poolReturned; + state->poolReturned = state->poolFreed; + pthread_mutex_unlock(&state->poolMutex); + state->poolFreed = NULL; + } + + return ncclSuccess; +} + + void* persistentThread(void *comm_) { struct ncclComm* comm = (struct ncclComm*)comm_; struct ncclProxyState* state = &comm->proxyState; @@ -346,158 +418,95 @@ void* persistentThread(void *comm_) { sprintf(threadName, "NCCLproxy %5d", comm->rank); nvtxNameOsThreadA(syscall(SYS_gettid), threadName); - pthread_mutex_lock(&state->opsMutex); struct ncclProxyArgs** opsPtr = &state->ops; while (1) { if (LOAD(comm->abortFlag)) { - pthread_mutex_unlock(&state->opsMutex); return NULL; } while (LOAD(opsPtr) == NULL) { if (state->stop) { // No more commands to process and proxy has been requested to stop - pthread_mutex_unlock(&state->opsMutex); return NULL; } - pthread_cond_wait(&state->cond, &state->opsMutex); + ncclResult_t ret = ncclProxyAppendPosted(state); + if (ret != ncclSuccess) { + comm->fatalError = ret; + INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); + return NULL; + } } int idle = 1; ncclResult_t ret = progressOps(state, opsPtr, &idle, comm); if (ret != ncclSuccess) { comm->fatalError = ret; INFO(NCCL_ALL,"%s:%d -> %d [Proxy Thread]", __FILE__, __LINE__, ret); - pthread_mutex_unlock(&state->opsMutex); return NULL; } if (idle) { - pthread_mutex_unlock(&state->opsMutex); sched_yield(); // No request progressed. Let others run. - pthread_mutex_lock(&state->opsMutex); } } } ncclResult_t ncclProxyStart(struct ncclComm* comm) { struct ncclProxyState* state = &comm->proxyState; + if (state->nextOps == NULL) return ncclSuccess; pthread_mutex_lock(&state->opsMutex); - - // Sort operations as we append them : collectives and - // receives first, then sends. - ncclProxyArgs* next, *prev = NULL, *op = state->nextOps; - while (op) { - next = op->next; - if (op->sendbytes) { - if (prev) prev->next = next; - else state->nextOps = next; - op->next = NULL; - NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared)); - } else prev = op; - op = next; - } - op = state->nextOps; - while (op) { - next = op->next; - op->next = NULL; - NCCLCHECK(ProxyAppend(state, op, op->connector->conn.shared)); - op = next; - } + if (state->postedOps) state->postedOpsEnd->next = state->nextOps; + else state->postedOps = state->nextOps; + state->postedOpsEnd = state->nextOpsEnd; state->nextOps = state->nextOpsEnd = NULL; - NCCLCHECK(dumpProxyState(state)); - - if (state->ops != NULL) - pthread_cond_signal(&state->cond); + pthread_cond_signal(&state->cond); pthread_mutex_unlock(&state->opsMutex); + comm->opCount++; return ncclSuccess; } -NCCL_PARAM(ProxySharedBuffersCount, "SHARED_BUFF_COUNT", -2); - ncclResult_t ncclProxySharedBuffersInit(struct ncclComm* comm, int cuda, int* size, char** ptr) { - struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs; - if (state == NULL) { - NCCLCHECK(ncclCalloc(&state, 1)); - comm->proxyState.sharedBuffs = state; - state->nslots = ncclParamProxySharedBuffersCount(); - if (state->nslots == -2) { - state->nslots = NCCL_STEPS*NCCL_MAX_WORK_ELEMENTS; - } - state->slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR); + struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; + if (state->size == 0) { + int p2pnChannels = 1; + while (p2pnChannels < comm->nChannels) p2pnChannels *= 2; + int p2pSize = 2*p2pnChannels*NCCL_MAX_WORK_ELEMENTS*comm->buffSizes[NCCL_PROTO_SIMPLE]/SENDRECV_SLICEFACTOR; + int collNetSize = 2*comm->nChannels*comm->buffSizes[NCCL_PROTO_SIMPLE]; + state->size = std::max(p2pSize, collNetSize); } - char* buff; - int* used; - *size = 2*std::max(comm->nChannels, comm->p2pnChannels)*state->slotSize*state->nslots; + *size = state->size; - if (cuda && state->cudaBuff[0] == NULL) { - NCCLCHECK(ncclCudaCalloc(&buff, *size, cuda)); - NCCLCHECK(ncclCalloc(&used, 2*std::max(comm->nChannels, comm->p2pnChannels)*state->nslots)); - for (int i=0; i<2*std::max(comm->nChannels, comm->p2pnChannels); i++) { - state->cudaBuff[i] = buff + state->nslots*state->slotSize*i; - state->cudaUsed[i] = used + state->nslots*i; - } - } else if (state->hostBuff[0] == NULL) { - NCCLCHECK(ncclCudaHostCalloc(&buff, *size)); - NCCLCHECK(ncclCalloc(&used, 2*std::max(comm->nChannels, comm->p2pnChannels)*state->nslots)); - for (int i=0; i<2*std::max(comm->nChannels, comm->p2pnChannels); i++) { - state->hostBuff[i] = buff + state->nslots*state->slotSize*i; - state->hostUsed[i] = used + state->nslots*i; - } + if (cuda && state->cudaBuff == NULL) { + NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, *size, cuda)); + } else if (state->hostBuff == NULL) { + NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, *size)); } - buff = cuda ? state->cudaBuff[0] : state->hostBuff[0]; - - *ptr = buff; + *ptr = cuda ? state->cudaBuff : state->hostBuff; return ncclSuccess; } -ncclResult_t ncclProxySharedBuffersAlloc(struct ncclComm* comm, int cuda, int type, int channel, int size, char** ptr) { - struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs; - // Use different pools for different channels and also separate send/recv. - int p = 2*channel+type; - int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p]; - char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p]; - if (buff == NULL) return ncclInternalError; - int nslots = 1; - while (nslots*state->slotSize < size) nslots *= 2; - for (int s=0; snslots; s+=nslots) { - int u = 0; - for (int i=0; islotSize*s; - return ncclSuccess; - } - } - *ptr = NULL; +ncclResult_t ncclProxySharedBuffersGetP2p(struct ncclComm* comm, int cuda, int type, int channel, int slot, int index, char** ptr) { + struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; + // Use different pools for separate send/recv. + char* buff = cuda ? state->cudaBuff : state->hostBuff; + int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/(NCCL_STEPS*SENDRECV_SLICEFACTOR); + int globalSlot = (((type*comm->p2pnChannels+channel)*NCCL_STEPS)+slot)*NCCL_MAX_WORK_ELEMENTS+index; + *ptr = buff + slotSize * globalSlot; return ncclSuccess; } - -ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int type, int channel, int size, char* ptr) { - struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs; - int p = 2*channel+type; - int* used = cuda ? state->cudaUsed[p] : state->hostUsed[p]; - char* buff = cuda ? state->cudaBuff[p] : state->hostBuff[p]; - if (buff == NULL) return ncclInternalError; - int nslots = 1; - while (nslots*state->slotSize < size) nslots *= 2; - int s = (ptr-buff)/state->slotSize; - if (s < 0 || s+nslots > state->nslots) { - WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots); - return ncclInternalError; - } - for (int i=0; iproxyState.sharedBuffs; + // Use different pools for different channels. + char* buff = cuda ? state->cudaBuff : state->hostBuff; + int slotSize = comm->buffSizes[NCCL_PROTO_SIMPLE]/NCCL_STEPS; + int globalSlot = (type*NCCL_STEPS+slot)*comm->nChannels+channel; + *ptr = buff + slotSize * globalSlot; return ncclSuccess; } ncclResult_t ncclProxySharedBuffersDestroy(struct ncclComm* comm) { - struct ncclProxySharedBuffers* state = comm->proxyState.sharedBuffs; - if (state) { - CUDACHECK(hipFree(state->cudaBuff[0])); - free(state->cudaUsed[0]); - NCCLCHECK(ncclCudaHostFree(state->hostBuff[0])); - free(state->hostUsed[0]); - free(state); - } + struct ncclProxySharedBuffers* state = &comm->proxyState.sharedBuffs; + CUDACHECK(hipFree(state->cudaBuff)); + NCCLCHECK(ncclCudaHostFree(state->hostBuff)); return ncclSuccess; } diff --git a/src/transport.cc b/src/transport.cc index c81f9b9c48..538cc302a2 100644 --- a/src/transport.cc +++ b/src/transport.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -31,33 +31,23 @@ static ncclResult_t connectedByXGMI(int* ret, struct ncclTopoSystem* system, str } template -static ncclResult_t selectTransportN(struct ncclComm* comm, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId, int n) { - for (int t=n; tsend : &transport->recv; - int ret = 0; - NCCLCHECK(transport->canConnect(&ret, comm->topo, NULL, myInfo, peerInfo)); - if (ret) { - connector->transportComm = transportComm; - NCCLCHECK(transportComm->setup(comm, NULL, myInfo, peerInfo, connect, connector, channelId)); - return ncclSuccess; - } - } - WARN("No transport found !"); - return ncclInternalError; -} - -template -static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) { +static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex) { + struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank; + struct ncclPeerInfo* peerInfo = comm->peerInfo+peer; + struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex : + comm->channels[channelId].peers[peer].recv + connIndex; + int xgmi; + NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, myInfo, peerInfo)); for (int t=0; tsend : &transport->recv; int ret = 0; NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo)); if (ret) { connector->transportComm = transportComm; - NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId)); + NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex)); return ncclSuccess; } } @@ -65,17 +55,17 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* return ncclInternalError; } -ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { +ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); uint32_t mask = 1 << channel->id; for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].recv.connected) continue; + if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue; comm->connectRecv[peer] |= mask; } for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].send.connected) continue; + if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue; comm->connectSend[peer] |= mask; } return ncclSuccess; @@ -90,10 +80,14 @@ void dumpData(struct ncclConnect* data, int ndata) { } } -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) { +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) { + // Stream used during transport setup; need for P2P pre-connect + CUDA Graph + hipStream_t transportSetupStream; + CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking)); + struct ncclConnect data[2*MAXCHANNELS]; - uint32_t p2pNet = LOAD(comm->p2pNet); for (int i=1; inRanks; i++) { + int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0); int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + i) % comm->nRanks; uint32_t recvMask = comm->connectRecv[recvPeer]; @@ -103,79 +97,173 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* int sendChannels = 0, recvChannels = 0; for (int c=0; cchannels[c].peers[recvPeer].p2pRecv; - NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer)); - if (xgmi) { - NCCLCHECK(selectTransportN<0>(comm, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c, TRANSPORT_P2P)); - } - else { - NCCLCHECK(selectTransportN<0>(comm, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c, TRANSPORT_NET)); - } - } - else { - struct ncclConnector* conn = &comm->channels[c].peers[recvPeer].recv; - NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c)); - } + NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex)); } } struct ncclConnect* sendData = recvData+recvChannels; for (int c=0; cchannels[c].peers[sendPeer].p2pSend; - NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer)); - if (xgmi) { - NCCLCHECK(selectTransportN<1>(comm, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c, TRANSPORT_P2P)); - } - else { - NCCLCHECK(selectTransportN<1>(comm, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c, TRANSPORT_NET)); - } - } - else { - struct ncclConnector* conn = &comm->channels[c].peers[sendPeer].send; - NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c)); - } + NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex)); } } if (sendPeer == recvPeer) { if (recvChannels+sendChannels) { - NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); - NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); + NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); + NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); sendData = data; recvData = data+sendChannels; } } else { - if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels)); - if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels)); - if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels)); - if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels)); + if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels)); + if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels)); + if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels)); + if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels)); } for (int c=0; cchannels[c].peers[sendPeer].p2pSend - : &comm->channels[c].peers[sendPeer].send; + struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex; NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn)); conn->connected = 1; - if (p2pNet && graph == NULL) CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].p2pSend, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); - else CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); + CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); } } for (int c=0; cchannels[c].peers[recvPeer].p2pRecv - : &comm->channels[c].peers[recvPeer].recv; + struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex; NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn)); conn->connected = 1; - if (p2pNet && graph == NULL) CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].p2pRecv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); - else CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); + CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); } } comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0; } + CUDACHECK(hipStreamSynchronize(transportSetupStream)); + CUDACHECK(hipStreamDestroy(transportSetupStream)); + return ncclSuccess; +} + +extern struct ncclTransport collNetTransport; + +// All ranks must participate in collNetSetup call +// return: 0 - unsupported, 1 - supported +// We do not NCCLCHECK this call because we would fall back to P2P network in case CollNet setup fails +int ncclTransportCollNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGraph, struct ncclChannel* channel, int masterRank, int masterPeer, int collNetGraphChannelId, int type) { + int rank = comm->rank; + int nranks = comm->nRanks; + int nMasters = comm->nNodes; + int rankInCollNet = -1; + int supported = 0; + int isMaster = (rank == masterRank) ? 1 : 0; + struct { + int collNetRank; + ncclConnect connect; + } sendrecvExchange; + + // check if we can connect to collnet, whose root is the nranks-th rank + struct ncclPeerInfo *myInfo = comm->peerInfo+rank, *peerInfo = comm->peerInfo+nranks; + peerInfo->rank = nranks; + int ret = 1; + if (isMaster) { + NCCLCHECK(collNetTransport.canConnect(&ret, comm->topo, collNetGraph, myInfo, peerInfo)); + } + + // send master receives connect info from peer recv master + if (isMaster && type == collNetSend) { + NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange))); + rankInCollNet = sendrecvExchange.collNetRank; + TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer); + } + + // select + struct ncclPeer* root = channel->peers+nranks; + // connector index: 0 for recv, 1 for send + struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type; + struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send); + conn->transportComm = transportComm; + // setup + struct ncclConnect myConnect; + if (isMaster && ret > 0) { + NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type)); + } + // prepare connect handles + ncclResult_t res; + struct { + int isMaster; + ncclConnect connect; + } *allConnects = NULL; + ncclConnect *masterConnects = NULL; + NCCLCHECK(ncclCalloc(&masterConnects, nMasters)); + if (type == collNetRecv) { // recv side: AllGather + // all ranks must participate + NCCLCHECK(ncclCalloc(&allConnects, nranks)); + allConnects[rank].isMaster = isMaster; + memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect)); + NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup); + // consolidate + int c = 0; + for (int r = 0; r < nranks; r++) { + if (allConnects[r].isMaster) { + memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect)); + if (r == rank) rankInCollNet = c; + c++; + } + } + } else { // send side : copy in connect info received from peer recv master + if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect)); + } + // connect + if (isMaster && ret > 0) { + NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup); + struct ncclPeer* devRoot = channel->devPeers+nranks; + struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type; + CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup); + } + // recv side sends connect info to send side + if (isMaster && type == collNetRecv) { + sendrecvExchange.collNetRank = rankInCollNet; + memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect)); + NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup); + TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer); + } + if (ret > 0) { + supported = 1; + } +cleanup: + if (allConnects != NULL) free(allConnects); + if (masterConnects != NULL) free(masterConnects); + return supported; +} + +ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) { + int rank = comm->rank; + int nranks = comm->nRanks; + // AllGather collNet setup results + int* allGatherFailures; + NCCLCHECK(ncclCalloc(&allGatherFailures, nranks)); + allGatherFailures[rank] = collNetSetupFail; + NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int))); + for (int i=0; inChannels; r++) { + struct ncclChannel* channel = comm->channels+r; + struct ncclPeer* peer = channel->peers+nranks; + if (peer->send->transportResources && peer->send->transportComm) NCCLCHECK(peer->send->transportComm->free(peer->send->transportResources)); + if (peer->recv->transportResources && peer->recv->transportComm) NCCLCHECK(peer->recv->transportComm->free(peer->recv->transportResources)); + peer->send->transportResources = NULL; // avoid double free + peer->recv->transportResources = NULL; // avoid double free + } + // Set support to 0 + comm->collNetSupport = 0; + } return ncclSuccess; } diff --git a/src/transport/coll_net.cc b/src/transport/coll_net.cc index 9399ba2af1..db6650321e 100644 --- a/src/transport/coll_net.cc +++ b/src/transport/coll_net.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -8,18 +8,17 @@ #include "comm.h" #include "coll_net.h" #include "graph.h" -#include -#include + +#define COLLNET_GROUP_NSUBS 8 +#define COLLNET_MAX_GROUPS (NCCL_PROXY_MAX_SUBS/COLLNET_GROUP_NSUBS) struct collNetRecvConnectInfo { collNetHandle_t collNetHandle; }; struct collNetSendConnectInfo { - collNetHandle_t collNetHandle; - void* collNetComm; void* mhandles[NCCL_NUM_PROTOCOLS]; - struct reqSlot* reqFifo; + void* reqFifo; }; struct reqSlot { @@ -28,10 +27,10 @@ struct reqSlot { }; struct collNetSendResources { - void* collNetSendComm; + struct ncclComm* comm; + void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; - uint32_t* llData; int netDev; int useGdr; void* sendMhandles[NCCL_NUM_PROTOCOLS]; @@ -39,47 +38,68 @@ struct collNetSendResources { struct ncclRecvMem* devRecvMem; uint64_t step; uint64_t llLastCleaning; - struct reqSlot* reqFifo; + struct reqSlot (*reqFifo)[NCCL_STEPS]; int collNetRank; uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only) }; struct collNetRecvResources { - void* netListenComm; - void* collNetRecvComm; + struct ncclComm* comm; + void* collNetComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; - uint32_t* llData; int netDev; int useGdr; void* mhandles[NCCL_NUM_PROTOCOLS]; struct ncclRecvMem* devRecvMem; uint64_t step; uint64_t llLastCleaning; - struct reqSlot* reqFifo; + struct reqSlot reqFifo[COLLNET_MAX_GROUPS][NCCL_STEPS]; int collNetRank; uint32_t* curr_hdp_reg; // Curr GPU in ring (for rdma transport use only) }; +struct collNetSharedResources { + void* collNetListenComms[MAXCHANNELS]; + void* collNetComms[MAXCHANNELS]; + int collNetCommRefCount[MAXCHANNELS]; +}; + /* Determine if we can communicate with the peer */ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 1; return ncclSuccess; } +ncclResult_t collNetSharedListen(struct ncclComm* comm, int netDev, void* collNetHandle) { + struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources; + if (resources == NULL) { + NCCLCHECK(ncclCalloc(&resources, 1)); + comm->proxyState.sharedBuffs.collNetResources = resources; + } + if (resources->collNetComms[netDev] == NULL) + NCCLCHECK(collNetListen(netDev, collNetHandle, resources->collNetListenComms+netDev)); + return ncclSuccess; +} + /* Setup send connector, and return connect information for others in the coll communicator to connect to me */ -ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { +ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct collNetSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; + send->conn.shared = 1; + resources->comm = comm; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev)); + NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); + send->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev+1; + NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); int recvSize = offsetof(struct ncclRecvMem, buff); - for (int p=0; pcomm->buffSizes[p]; + // Simple uses shared buffers and we don't support LL128 + recvSize += send->comm->buffSizes[NCCL_PROTO_LL]; if (resources->useGdr) { NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, resources->useGdr)); @@ -87,38 +107,64 @@ ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph send->conn.curr_hdp_reg = resources->curr_hdp_reg; } NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize)); - NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), send->comm->buffSizes[NCCL_PROTO_LL]/2)); - INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, resources->useGdr ? "/GDRDMA" : ""); return ncclSuccess; } /* Setup recv connector */ -ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) { +ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct collNetRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; + recv->conn.shared = 1; + resources->comm = comm; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev)); + NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &resources->netDev)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); + recv->proxyAppendPtr = comm->proxyState.sharedBuffs.proxyAppendCollNet+2*resources->netDev; + NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); int recvSize = offsetof(struct ncclRecvMem, buff); - for (int p=0; pcomm->buffSizes[p]; + // Simple uses shared buffers and we don't support LL128 + recvSize += recv->comm->buffSizes[NCCL_PROTO_LL]; if (resources->useGdr) { NCCLCHECK(ncclCudaCalloc((char**)(&resources->devRecvMem), recvSize, resources->useGdr)); } NCCLCHECK(ncclCudaHostCalloc((char**)&resources->recvMem, recvSize)); - NCCLCHECK(ncclIbMalloc((void**)&(resources->llData), recv->comm->buffSizes[NCCL_PROTO_LL]/2)); - - INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, + INFO(NCCL_INIT|NCCL_NET,"CollNet %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, collNetName(), resources->netDev, resources->useGdr ? "/GDRDMA" : ""); struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*) connectInfo; - NCCLCHECK(collNetListen(resources->netDev, &info->collNetHandle, &resources->netListenComm)); + + NCCLCHECK(collNetSharedListen(comm, resources->netDev, &info->collNetHandle)); + return ncclSuccess; +} + +ncclResult_t collNetSharedConnect(struct ncclComm* comm, int netDev, struct ncclConnect* connectInfos, int nranks, int rank, void** collNetComm) { + struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources; + if (resources->collNetComms[netDev] == NULL) { + // Connect to coll comm + collNetHandle_t** handlePtrs = NULL; + NCCLCHECK(ncclCalloc(&handlePtrs, nranks)); + for (int i = 0; i < nranks; i++) { + struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i); + handlePtrs[i] = &(info->collNetHandle); + } + ncclResult_t ret = collNetConnect((void**)handlePtrs, nranks, rank, + resources->collNetListenComms[netDev], + resources->collNetComms+netDev); + free(handlePtrs); + NCCLCHECK(ret); + // Close listen comm + NCCLCHECK(collNetCloseListen(resources->collNetListenComms[netDev])); + } + *collNetComm = resources->collNetComms[netDev]; + resources->collNetCommRefCount[netDev]++; return ncclSuccess; } @@ -128,33 +174,40 @@ ncclResult_t collNetSendConnect(struct ncclComm* comm, struct ncclConnect* conne struct collNetSendConnectInfo* info = (struct collNetSendConnectInfo*)(connectInfos+rank); // Intermediate buffering on GPU for GPU Direct RDMA, but LL buffer is always on host - struct ncclRecvMem* recvMem = resources->useGdr ? resources->devRecvMem : resources->recvMem; - int offset = 0; - for (int p=0; pconn.buffs[p] = (p == NCCL_PROTO_LL ? resources->recvMem->buff : recvMem->buff) + offset; - offset += send->comm->buffSizes[p]; - } + send->conn.buffs[NCCL_PROTO_LL] = resources->recvMem->buff; + send->conn.buffs[NCCL_PROTO_LL128] = send->conn.buffs[NCCL_PROTO_SIMPLE] = NULL; send->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; // Head/Tail/Opcount/Fifos are always on host send->conn.tail = &resources->recvMem->tail; send->conn.sizesFifo = resources->recvMem->sizesFifo; + send->conn.ptrsFifo = resources->recvMem->ptrsFifo; send->conn.head = &resources->sendMem->head; + resources->sendMem->head = -NCCL_STEPS; // Don't give any credit yet when sharing buffers for (int i=0; iconn.sizesFifo[i] = -1; // Get info from recv side resources->collNetRank = rank; - resources->reqFifo = info->reqFifo; - resources->collNetSendComm = info->collNetComm; + resources->reqFifo = (struct reqSlot (*)[NCCL_STEPS])(info->reqFifo); for (int p=0; precvMhandles[p] = info->mhandles[p]; - // Register buffers - NCCLCHECK(collNetRegMr(resources->collNetSendComm, send->conn.buffs[NCCL_PROTO_SIMPLE], send->comm->buffSizes[NCCL_PROTO_SIMPLE], - resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_SIMPLE])); - NCCLCHECK(collNetRegMr(resources->collNetSendComm, resources->llData, send->comm->buffSizes[NCCL_PROTO_LL]/2, - NCCL_PTR_HOST, &resources->sendMhandles[NCCL_PROTO_LL])); + NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm)); + + int size; + char* ptr; + // Allocate & Register shared buffers for the Simple protocol + NCCLCHECK(ncclProxySharedBuffersInit(send->comm, resources->useGdr, &size, &ptr)); + NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, + &resources->sendMhandles[NCCL_PROTO_SIMPLE])); + + // Allocate & Register shared buffers for the LL protocol + NCCLCHECK(ncclProxySharedBuffersInit(send->comm, 0, &size, &ptr)); + NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, + NCCL_PTR_HOST, + &resources->sendMhandles[NCCL_PROTO_LL])); return ncclSuccess; } @@ -175,52 +228,57 @@ ncclResult_t collNetRecvConnect(struct ncclComm* comm, struct ncclConnect* conne // Head/Tail/Opcount are always on host recv->conn.tail = &resources->recvMem->tail; + recv->conn.ptrsFifo = resources->recvMem->ptrsFifo; recv->conn.head = &resources->sendMem->head; - // Connect to coll comm - collNetHandle_t** handlePtrs = NULL; - NCCLCHECK(ncclCalloc(&handlePtrs, nranks)); - for (int i = 0; i < nranks; i++) { - struct collNetRecvConnectInfo* info = (struct collNetRecvConnectInfo*)(connectInfos+i); - handlePtrs[i] = &(info->collNetHandle); - } - ncclResult_t res; - NCCLCHECKGOTO(collNetConnect((void**)handlePtrs, nranks, rank, resources->netListenComm, &resources->collNetRecvComm), res, cleanup); + NCCLCHECK(collNetSharedConnect(comm, resources->netDev, connectInfos, nranks, rank, &resources->collNetComm)); - // Register buffers - NCCLCHECK(collNetRegMr(resources->collNetRecvComm, recv->conn.buffs[NCCL_PROTO_SIMPLE], recv->comm->buffSizes[NCCL_PROTO_SIMPLE], - resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_SIMPLE])); - NCCLCHECK(collNetRegMr(resources->collNetRecvComm, resources->llData, recv->comm->buffSizes[NCCL_PROTO_LL]/2, - NCCL_PTR_HOST, &resources->mhandles[NCCL_PROTO_LL])); + int size; + char* ptr; - // Create shared info between send and recv proxies - NCCLCHECK(ncclCalloc(&(resources->reqFifo), NCCL_STEPS)); + // Allocate & Register shared buffers for the Simple protocol + NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, resources->useGdr, &size, &ptr)); + NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, + resources->useGdr ? NCCL_PTR_CUDA : NCCL_PTR_HOST, + &resources->mhandles[NCCL_PROTO_SIMPLE])); + + // Allocate & Register shared buffers for the LL protocol + NCCLCHECK(ncclProxySharedBuffersInit(recv->comm, 0, &size, &ptr)); + NCCLCHECK(collNetRegMr(resources->collNetComm, ptr, size, + NCCL_PTR_HOST, + &resources->mhandles[NCCL_PROTO_LL])); // Pass info to send side info->reqFifo = resources->reqFifo; - info->collNetComm = resources->collNetRecvComm; for (int p=0; pmhandles[p] = resources->mhandles[p]; -cleanup: - if (handlePtrs != NULL) free(handlePtrs); - // Close listen comm - NCCLCHECK(collNetCloseListen(resources->netListenComm)); + return ncclSuccess; +} - return res; +ncclResult_t collNetSharedFree(struct ncclComm* comm, int netDev) { + struct collNetSharedResources* resources = (struct collNetSharedResources*)comm->proxyState.sharedBuffs.collNetResources; + resources->collNetCommRefCount[netDev]--; + if (resources->collNetCommRefCount[netDev] == 0) { + NCCLCHECK(collNetCloseColl(resources->collNetComms[netDev])); + } + for (int c=0; ccollNetCommRefCount[c]) return ncclSuccess; + comm->proxyState.sharedBuffs.collNetResources = NULL; + free(resources); + return ncclSuccess; } ncclResult_t collNetSendFree(void* sendTransportResources) { struct collNetSendResources* resources = (struct collNetSendResources*)sendTransportResources; NCCLCHECK(ncclCudaHostFree(resources->sendMem)); NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - if (resources->collNetSendComm) { - NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_LL])); - NCCLCHECK(collNetDeregMr(resources->collNetSendComm, resources->sendMhandles[NCCL_PROTO_SIMPLE])); + if (resources->collNetComm) { + NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_LL])); + NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->sendMhandles[NCCL_PROTO_SIMPLE])); } - if (resources->useGdr) - CUDACHECK(hipFree(resources->devRecvMem)); - free(resources->llData); + if (resources->useGdr) CUDACHECK(hipFree(resources->devRecvMem)); + + NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev)); free(resources); return ncclSuccess; } @@ -228,115 +286,151 @@ ncclResult_t collNetSendFree(void* sendTransportResources) { ncclResult_t collNetRecvFree(void* recvTransportResources) { struct collNetRecvResources* resources = (struct collNetRecvResources*)recvTransportResources; NCCLCHECK(ncclCudaHostFree(resources->sendMem)); - if (resources->collNetRecvComm) { - NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_LL])); - NCCLCHECK(collNetDeregMr(resources->collNetRecvComm, resources->mhandles[NCCL_PROTO_SIMPLE])); - } NCCLCHECK(ncclCudaHostFree(resources->recvMem)); - if (resources->useGdr) - CUDACHECK(hipFree(resources->devRecvMem)); - free(resources->llData); - free(resources->reqFifo); - - // Make sure SendFree is called before RecvFree - if (resources->collNetRecvComm) { - NCCLCHECK(collNetCloseColl(resources->collNetRecvComm)); + if (resources->collNetComm) { + NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_LL])); + NCCLCHECK(collNetDeregMr(resources->collNetComm, resources->mhandles[NCCL_PROTO_SIMPLE])); } + if (resources->useGdr) CUDACHECK(hipFree(resources->devRecvMem)); + + NCCLCHECK(collNetSharedFree(resources->comm, resources->netDev)); free(resources); return ncclSuccess; } +#define LAST_OF_GROUP(s) \ + (s % COLLNET_GROUP_NSUBS == COLLNET_GROUP_NSUBS-1 || s == args->nsubs-1) + ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { if (args->protocol == NCCL_PROTO_LL128) { WARN("CollNet does not support LL128"); return ncclInternalError; } - struct collNetSendResources* resources = (struct collNetSendResources*) (args->connector->transportResources); if (args->state == ncclProxyOpReady) { - // Round to next multiple of sliceSteps - resources->step = ROUNDUP(resources->step, args->chunkSteps); - args->posted = args->transmitted = args->done = resources->step; - args->end = resources->step + args->nsteps; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->received = sub->transmitted = sub->done = 0; + resources->step = sub->base + sub->nsteps; + } args->state = ncclProxyOpProgress; + args->hdp_flushed = 0; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; - int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS; - char* localBuff = args->connector->conn.buffs[p]; - void* sendMhandle = resources->sendMhandles[p]; - void* recvMhandle = resources->recvMhandles[p]; - struct reqSlot* reqFifo = resources->reqFifo; - int buffSlot = args->transmitted%NCCL_STEPS; - if (args->transmitted < args->end && args->transmitted < args->done + NCCL_STEPS - && LOAD(&reqFifo[buffSlot].recvBuff) != NULL) { - volatile int* sizesFifo = resources->recvMem->sizesFifo; - volatile uint64_t* recvTail = &resources->recvMem->tail; - if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) { - // We have something to receive, let's check if it's completely ready. - int size = LOAD(sizesFifo+buffSlot); - char* buff = localBuff+buffSlot*stepSize; - int ready = 1; - if (args->protocol == NCCL_PROTO_LL) { - uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1); - int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); - union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff; - // Pack data into another buffer - int stepLines = stepSize / sizeof(union ncclLLFifoLine); - uint32_t* sendBuff = resources->llData+buffSlot*2*stepLines; // each line has two data elements - buff = (char*)sendBuff; - for (int i=0; insubs, COLLNET_GROUP_NSUBS); + int perGroupSteps = NCCL_STEPS / nGroups; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct collNetSendResources* resources = (struct collNetSendResources*) (sub->connector->transportResources); + void* sendMhandle = resources->sendMhandles[p]; + void* recvMhandle = resources->recvMhandles[p]; + int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; + auto reqFifo = resources->reqFifo; + if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { + int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; + if (p == NCCL_PROTO_SIMPLE) { + char* ptr; + int sharedBuffSlot = sub->posted%NCCL_STEPS; + NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, resources->useGdr, 0, sharedBuffSlot, 0, &ptr)); + resources->recvMem->ptrsFifo[buffSlot] = ptr + s*args->chunkSize; + __sync_synchronize(); } - if (ready) { - // flush HDP if not done - if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) { - args->hdp_flushed = LOAD(recvTail); - STORE(resources->curr_hdp_reg, 1); + volatile uint64_t* sendHead = &resources->sendMem->head; + sub->posted += args->sliceSteps; + *sendHead = sub->base + sub->posted - NCCL_STEPS; + } + // Enforce sync between operations of the same group. + bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->received == sub->received)) || (s && (sub-1)->received > sub->received)); + if (groupSync && sub->received < sub->posted && sub->received < sub->done + perGroupSteps) { + int buffSlot = (sub->base+sub->received)%NCCL_STEPS; + int sharedBuffSlot = sub->received%NCCL_STEPS; + volatile int* sizesFifo = resources->recvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->recvMem->tail; + if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->received)) || p == NCCL_PROTO_LL)) { + // We have something to receive, let's check whether data is ready. + int size = sizesFifo[buffSlot]; + int ready = 1; + if (s == 0) { + NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 0, sharedBuffSlot, 0, &args->sharedBuff[sharedBuffSlot])); + args->sharedSize[sharedBuffSlot] = p == NCCL_PROTO_SIMPLE ? args->chunkSize : size/2; } - // Data is ready, try to send. - int count = size/ncclTypeSize(args->dtype); - NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot)); - if (args->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "sendProxy [%lu/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]); - STORE(sizesFifo+buffSlot, -1); - // Make sure size is reset to zero before we update the head. - __sync_synchronize(); - args->transmitted += args->sliceSteps; + if (p == NCCL_PROTO_LL) { + char* localBuff = sub->connector->conn.buffs[p]; + uint32_t flag = NCCL_LL_FLAG(sub->base + sub->received + 1); + int nFifoLines = size / sizeof(union ncclLLFifoLine); + union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize); + // Pack data into the shared buffer + uint32_t* sendBuff = (uint32_t*)(args->sharedBuff[sharedBuffSlot]+args->sharedSize[sharedBuffSlot]*s); + for (int i=0; ireceived += args->sliceSteps; args->idle = 0; - return ncclSuccess; + //continue; + // flush HDP if not done + if (resources->curr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) { + args->hdp_flushed = LOAD(recvTail); + STORE(resources->curr_hdp_reg, 1); + } } } } - } - // Check whether the network has completed some send operations. - if (args->done < args->transmitted) { - int done, size; - int buffSlot = args->done%NCCL_STEPS; - NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size)); - if (done) { - TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size); - STORE(&reqFifo[buffSlot].size, size); - // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush) - // (reordered store after store is possible on POWER, though not on x86) - __sync_synchronize(); - STORE(&reqFifo[buffSlot].recvBuff, NULL); // Notify recvProxy - args->done += args->sliceSteps; - resources->sendMem->head = args->done; - args->idle = 0; - if (args->done == args->end) { - resources->step = args->end; - args->state = ncclProxyOpNone; + if (LAST_OF_GROUP(s) && (sub->transmitted < sub->received)) { + int group = s / COLLNET_GROUP_NSUBS; + int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + int sharedBuffSlot = sub->transmitted%NCCL_STEPS; + if (reqFifo[group][buffSlot].recvBuff != NULL) { + int totalSize = (s-group*COLLNET_GROUP_NSUBS+1) * args->sharedSize[sharedBuffSlot]; + int count = totalSize / ncclTypeSize(args->dtype); + reqFifo[group][buffSlot].size = args->sharedSize[sharedBuffSlot]; + char* sendAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*args->sharedSize[sharedBuffSlot]; + NCCLCHECK(collNetIallreduce(resources->collNetComm, sendAddress, (void*)(reqFifo[group][buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, sub->requests+buffSlot)); + if (sub->requests[buffSlot] == NULL) continue; + + TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] Iallreduce posted, size %d req %p", sub->transmitted, group, buffSlot, totalSize, sub->requests[buffSlot]); + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + sub->transmitted += args->sliceSteps; + args->idle = 0; + continue; + } + } + // Check whether the network has completed some send operations. + if (LAST_OF_GROUP(s) && sub->done < sub->transmitted) { + int done, size; + int group = s / COLLNET_GROUP_NSUBS; + int buffSlot = (sub->base+sub->done)%NCCL_STEPS; + NCCLCHECK(collNetTest((void*)(sub->requests[buffSlot]), &done, &size)); + if (done) { + TRACE(NCCL_NET, "sendProxy [%lu/%d/%d] request %p done, size %d", sub->done, group, buffSlot, sub->requests[buffSlot], size); + // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush) + // (reordered store after store is possible on POWER, though not on x86) + __sync_synchronize(); + reqFifo[group][buffSlot].recvBuff = NULL; // Notify recvProxy + for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].done += args->sliceSteps; + args->idle = 0; + int allDone = 1; + for (int i=0; insubs; i++) { + if (args->subs[i].done < args->subs[i].nsteps) { allDone = 0; break; } + } + if (allDone) { + args->state = ncclProxyOpNone; + TRACE(NCCL_NET, "sendProxy [%lu/%d] stopped", sub->done, s); + } } - return ncclSuccess; } } } @@ -348,81 +442,115 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { WARN("CollNet does not support LL128"); return ncclInternalError; } - struct collNetRecvResources* resources = (struct collNetRecvResources*) (args->connector->transportResources); if (args->state == ncclProxyOpReady) { - // Round to next multiple of sliceSteps - resources->step = ROUNDUP(resources->step, args->chunkSteps); - args->posted = args->received = args->transmitted = args->done = resources->step; - args->end = resources->step + args->nsteps; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->received = sub->flushed = sub->transmitted = sub->done = 0; + resources->step = sub->base + sub->nsteps; + } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; - int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS; - char* localBuff = args->connector->conn.buffs[p]; - void* mhandle = resources->mhandles[p]; - struct reqSlot* reqFifo = resources->reqFifo; - if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) { - int buffSlot = args->posted%NCCL_STEPS; - char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff; - int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize; - STORE(&reqFifo[buffSlot].recvBuff, recvBuff+buffSlot*recvStepSize); - TRACE(NCCL_NET, "recvProxy [%lu/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff); - args->posted += args->sliceSteps; - args->idle = 0; - return ncclSuccess; - } - if (args->posted > args->received) { - int buffSlot = args->received%NCCL_STEPS; - if (LOAD(&reqFifo[buffSlot].recvBuff) == NULL) { // Buffer is cleared : coll is complete - TRACE(NCCL_NET, "recvProxy [%lu/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size)); - if (args->protocol == NCCL_PROTO_LL) { // ll + int nGroups = DIVUP(args->nsubs, COLLNET_GROUP_NSUBS); + int perGroupSteps = NCCL_STEPS / nGroups; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct collNetRecvResources* resources = (struct collNetRecvResources*) (sub->connector->transportResources); + void* mhandle = resources->mhandles[p]; + int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; + auto reqFifo = resources->reqFifo; + // Enforce sync between operations of the same group. + if (LAST_OF_GROUP(s) && (sub->posted < sub->done + perGroupSteps) && (sub->posted < sub->nsteps)) { + int group = s / COLLNET_GROUP_NSUBS; + int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; + char* ptr; + int sharedBuffSlot = sub->posted%NCCL_STEPS; + NCCLCHECK(ncclProxySharedBuffersGetCollNet(sub->connector->comm, p == NCCL_PROTO_SIMPLE ? resources->useGdr : 0, 1, sharedBuffSlot, 0, &ptr)); + args->sharedBuff[sharedBuffSlot] = ptr; + int slotSize = sub->connector->comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; + reqFifo[group][buffSlot].recvBuff = args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*slotSize; + TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] posted buffer %p", sub->posted, group, buffSlot, reqFifo[group][buffSlot].recvBuff); + sub->posted += args->sliceSteps; + args->idle = 0; + continue; + } + if (LAST_OF_GROUP(s) && (sub->posted > sub->received)) { + int group = s / COLLNET_GROUP_NSUBS; + int buffSlot = (sub->base+sub->received)%NCCL_STEPS; + int sharedBuffSlot = sub->received%NCCL_STEPS; + if (reqFifo[group][buffSlot].recvBuff == NULL) { // Buffer is cleared : coll is complete + args->sharedSize[sharedBuffSlot] = reqFifo[group][buffSlot].size; + int totalSize = args->sharedSize[sharedBuffSlot]*(s-group*COLLNET_GROUP_NSUBS+1); + TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] received, size %d", sub->received, group, buffSlot, totalSize); + sub->received += args->sliceSteps; + if (reqFifo[group][buffSlot].size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) { + int slotSize = sub->connector->comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; + char* recvAddress = (char*)args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*slotSize; + NCCLCHECK(collNetIflush(resources->collNetComm, recvAddress, totalSize, mhandle, sub->requests+buffSlot)); + } else { + for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; + } + args->idle = 0; + continue; + } + } + if (LAST_OF_GROUP(s) && (sub->received > sub->flushed)) { + // Progress flush operations + int group = s / COLLNET_GROUP_NSUBS; + int buffSlot = (sub->base + sub->flushed)%NCCL_STEPS; + int done = 1; + if (sub->requests[buffSlot]) NCCLCHECK(collNetTest(sub->requests[buffSlot], &done, NULL)); + if (done) { + TRACE(NCCL_NET, "recvProxy [%lu/%d/%d] flushed", sub->flushed, group, buffSlot); + for (int i=group*COLLNET_GROUP_NSUBS; i<=s; i++) args->subs[i].flushed += args->sliceSteps; + args->idle = 0; + //continue; + } + } + if (sub->flushed > sub->transmitted) { + int group = s / COLLNET_GROUP_NSUBS; + int buffSlot = (sub->base + sub->transmitted)%NCCL_STEPS; + int sharedBuffSlot = sub->transmitted%NCCL_STEPS; + int slotSize = sub->connector->comm->buffSizes[NCCL_PROTO_SIMPLE] / NCCL_STEPS; + char* ptr = args->sharedBuff[sharedBuffSlot] + group*COLLNET_GROUP_NSUBS*slotSize + (s%COLLNET_GROUP_NSUBS)*args->sharedSize[sharedBuffSlot]; + if (p == NCCL_PROTO_SIMPLE) { + volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; + ptrsFifo[buffSlot] = ptr; + __sync_synchronize(); + resources->recvMem->tail = sub->base + sub->flushed; + } + if (p == NCCL_PROTO_LL) { // ll // re-attach flag - uint32_t flag = NCCL_LL_FLAG(args->received + 1); - int stepLines = stepSize / sizeof(union ncclLLFifoLine); + char* localBuff = sub->connector->conn.buffs[p]; + uint32_t flag = NCCL_LL_FLAG(sub->base + sub->transmitted + 1); union ncclLLFifoLine* lines = (union ncclLLFifoLine*)(localBuff+buffSlot*stepSize); - uint32_t* recvData = resources->llData+buffSlot*2*stepLines; - int nFifoLines = DIVUP(LOAD(&reqFifo[buffSlot].size), 2*sizeof(uint32_t)); + uint32_t* recvData = (uint32_t*)ptr; + int nFifoLines = DIVUP(args->sharedSize[sharedBuffSlot], 2*sizeof(uint32_t)); for (int i=0; ireceived += args->sliceSteps; - if (LOAD(&reqFifo[buffSlot].size) > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) { - NCCLCHECK(collNetIflush(resources->collNetRecvComm, localBuff+buffSlot*stepSize, LOAD(&reqFifo[buffSlot].size), mhandle, args->requests+buffSlot)); - } else { - args->requests[buffSlot] = NULL; - } + sub->transmitted += args->sliceSteps; args->idle = 0; - return ncclSuccess; + continue; } - } - if (args->received > args->transmitted) { - // Progress flush operations - int buffSlot = args->transmitted%NCCL_STEPS; - int done = 1; - if (args->requests[buffSlot]) NCCLCHECK(collNetTest(args->requests[buffSlot], &done, NULL)); - if (done) { - args->transmitted += args->sliceSteps; - __sync_synchronize(); - resources->recvMem->tail = args->transmitted; - args->idle = 0; - return ncclSuccess; - } - } - if (args->transmitted > args->done) { + // Enforce sync here to make sure the last sub doesn't increase "done" before all others in the group have + // reached the same point, otherwise we would start posting buffers to the send proxy before we're done + // processing all the shared buffer. + bool groupSync = (((s == 0) && ((sub+args->nsubs-1)->done == sub->done)) || (s && (sub-1)->done > sub->done)); volatile uint64_t* sendHead = &resources->sendMem->head; - uint64_t done = LOAD(sendHead); - while (done > args->done && - // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. - args->transmitted > args->done) { - args->done += args->sliceSteps; + if (groupSync && sub->done < sub->transmitted && (sub->base+sub->done) < *sendHead) { + sub->done += args->sliceSteps; args->idle = 0; - if (args->done == args->end) { - resources->step = args->end; + if (sub->done == sub->nsteps && s == args->nsubs-1) { args->state = ncclProxyOpNone; + TRACE(NCCL_NET, "recvProxy [%lu/%d] stopped", sub->done, s); } } } diff --git a/src/transport/net.cc b/src/transport/net.cc index ff9480d688..a0c4875460 100644 --- a/src/transport/net.cc +++ b/src/transport/net.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,6 +11,7 @@ #include #include "collectives.h" #include +#include "gdrwrap.h" struct netConnectInfo { ncclNetHandle_t netHandle; @@ -41,6 +42,13 @@ struct netRecvResources { void* netRecvComm; struct ncclSendMem* sendMem; struct ncclRecvMem* recvMem; + + // GDRCOPY support + void* gdrMemDesc; + struct ncclRecvMem* devRecvMem; + void* gdrFlushDesc; + int* devFlushMem; + int netDev; int useGdr; int shared; @@ -63,13 +71,16 @@ NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2); /* Determine if we will use this transport for this peer and return connect * information for this peer */ -ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { +ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct netSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; send->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1; + send->proxyAppendPtr = send->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId+1 : &send->proxyAppend; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev)); + // Send/Receive: Round-robin NICs based on the receiver's CUDA device + int nicRR = comm->peerInfo[peerInfo->rank].cudaDev; + NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 1, &resources->useGdr)); NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); @@ -120,20 +131,45 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st return ncclSuccess; } -ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) { +// GDRCOPY support: TAIL_ENABLE When enabled locates the RX proxy tail in CUDA memory +NCCL_PARAM(GdrCopyTailEnable, "GDRCOPY_TAIL_ENABLE", 1); +// GDRCOPY support: FLUSH_ENABLE When enabled uses a PCI-E read to flush GDRDMA buffers +NCCL_PARAM(GdrCopyFlushEnable, "GDRCOPY_FLUSH_ENABLE", 0); + +ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct netRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; recv->conn.shared = resources->shared = ncclParamNetSharedBuffers() != -2 ? ncclParamNetSharedBuffers() : graph ? 0 : 1; + recv->proxyAppendPtr = recv->conn.shared ? comm->proxyState.sharedBuffs.proxyAppend+2*channelId : &recv->proxyAppend; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &resources->netDev)); + // Send/Receive: Round-robin NICs based on the receiver's CUDA device + int nicRR = comm->cudaDev; + NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &resources->netDev)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, resources->netDev, 0, &resources->useGdr)); NCCLCHECK(ncclCudaHostCalloc(&resources->sendMem, 1)); NCCLCHECK(ncclCudaHostCalloc(&resources->recvMem, 1)); + // GDRCOPY tail support + if (ncclGdrCopy != NULL && ncclParamGdrCopyTailEnable() == 1) { + struct ncclRecvMem* devCudaPtr; + NCCLCHECK(ncclGdrCudaCalloc(&resources->devRecvMem, &devCudaPtr, 1, &resources->gdrMemDesc)); + // The GDR mapped VA doesn't work on the SMs + recv->conn.tail = &((struct ncclRecvMem*)devCudaPtr)->tail; + } else { + recv->conn.tail = &resources->recvMem->tail; + } + + // GDRCOPY flush support +#if defined (__x86_64__) + if (ncclGdrCopy != NULL && ncclParamGdrCopyFlushEnable() == 1) { + int* cudaPtr; + NCCLCHECK(ncclGdrCudaCalloc(&resources->devFlushMem, &cudaPtr, 1, &resources->gdrFlushDesc)); + } +#endif + recv->conn.direct |= resources->useGdr ? NCCL_DIRECT_NIC : 0; - recv->conn.tail = &resources->recvMem->tail; // Only fuse P2P buffers, continue to allocate dedicated buffers for ring/tree recv->conn.ptrsFifo = resources->shared ? resources->recvMem->ptrsFifo : NULL; recv->conn.head = &resources->sendMem->head; @@ -242,6 +278,14 @@ ncclResult_t netSendFree(void* transportResources) { ncclResult_t netRecvFree(void* transportResources) { struct netRecvResources* resources = (struct netRecvResources*)transportResources; + // GDRCOPY support + if (resources->gdrFlushDesc) { + NCCLCHECK(ncclGdrCudaFree(resources->gdrFlushDesc)); + } + // GDRCOPY support + if (resources->gdrMemDesc) { + NCCLCHECK(ncclGdrCudaFree(resources->gdrMemDesc)); + } NCCLCHECK(ncclCudaHostFree(resources->sendMem)); NCCLCHECK(ncclCudaHostFree(resources->recvMem)); for (int l=0; lconnector->transportResources); if (args->state == ncclProxyOpReady) { - // Round to next multiple of sliceSteps - resources->step = ROUNDUP(resources->step, args->chunkSteps); - args->posted = args->transmitted = args->done = args->hdp_flushed = resources->step; - args->end = resources->step + args->nsteps; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->transmitted = sub->done = 0; + } args->state = ncclProxyOpProgress; + args->hdp_flushed = 0; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; - int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS; - char* localBuff = args->connector->conn.buffs[p]; - void* mhandle = *(resources->mhandlesProto[p]); - int buffSize = stepSize*args->sliceSteps; - if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR; - if (args->sendbytes < buffSize) buffSize = args->sendbytes; - // Post buffers to the GPU - if (args->posted < args->end && args->posted < args->done + NCCL_STEPS) { - if (resources->shared) { - char* ptr; - NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, &ptr)); - if (ptr == NULL) return ncclInternalError; - resources->recvMem->ptrsFifo[args->posted%NCCL_STEPS] = ptr; - __sync_synchronize(); - volatile uint64_t* sendHead = &resources->sendMem->head; - args->posted += args->sliceSteps; - STORE(sendHead, args->posted - NCCL_STEPS); - } else args->posted += args->sliceSteps; - args->idle = 0; - return ncclSuccess; - } - // Check whether we received data from the GPU and send it to the network - int buffSlot = args->transmitted%NCCL_STEPS; - if (args->transmitted < args->posted && args->transmitted < args->done + NCCL_STEPS) { - volatile int* sizesFifo = resources->recvMem->sizesFifo; - volatile uint64_t* recvTail = &resources->recvMem->tail; - if (LOAD(sizesFifo+buffSlot) != -1 && (LOAD(recvTail) > args->transmitted || args->protocol == NCCL_PROTO_LL)) { - // We have something to receive, let's check if it's completely ready. - int size = LOAD(sizesFifo+buffSlot); - char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize; - int ready = 1; - if (args->protocol == NCCL_PROTO_LL128) { - int ready = resources->useGdr; - if (!ready) { - // When data is in sysmem, we need to wait until all flags are correct since the GPU only - // called threadfence() - uint64_t flag = args->transmitted + 1; - int nFifoLines = DIVUP(LOAD(sizesFifo+buffSlot), sizeof(uint64_t)*NCCL_LL128_LINEELEMS); - volatile uint64_t* lines = (volatile uint64_t*)buff; - ready = 1; - for (int i=0; iprotocol == NCCL_PROTO_LL) { - uint32_t flag = NCCL_LL_FLAG(args->transmitted + 1); - int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); - union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff; - for (int i=0; icurr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) { - args->hdp_flushed = LOAD(recvTail); - STORE(resources->curr_hdp_reg, 1); - } - // Data is ready, try to send. - NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, args->requests+buffSlot)); - if (args->requests[buffSlot] != NULL) { -#ifdef ENABLE_PROFILING - if (args->channel->active_req == 0) { - gettimeofday(&args->channel->tvs, NULL); - args->channel->sizes = 0; - } - args->channel->active_req ++; - args->channel->sizes += LOAD(sizesFifo+buffSlot); - args->channel->send_byte += LOAD(sizesFifo+buffSlot); -#endif - TRACE(NCCL_NET, "sendProxy [%lu/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]); - STORE(sizesFifo+buffSlot, -1); - // Make sure size is reset to zero before we update the head. - __sync_synchronize(); - args->transmitted += args->sliceSteps; - args->idle = 0; - return ncclSuccess; - } - } - } - } - // Check whether the network has completed some send operations. - if (args->done < args->transmitted) { - int done; - int buffSlot = args->done%NCCL_STEPS; - NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL)); - if (done) { - TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", args->done, buffSlot, args->requests[buffSlot]); -#ifdef ENABLE_PROFILING - if (args->protocol == NCCL_PROTO_SIMPLE) { - args->channel->active_req --; - if (args->channel->active_req == 0) { - struct timeval tv; - gettimeofday(&tv, NULL); - float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec; - if (delta) { - args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3; - args->channel->bw_count ++; - } - } - } -#endif + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + if (sub->done == sub->nsteps) continue; + struct netSendResources* resources = (struct netSendResources*) (sub->connector->transportResources); + void* mhandle = *(resources->mhandlesProto[p]); + int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; + char* localBuff = sub->connector->conn.buffs[p]; + int buffSize = stepSize*args->sliceSteps; + if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR; + if (sub->sendbytes < buffSize) buffSize = sub->sendbytes; + // Post buffers to the GPU + if (sub->posted < sub->nsteps && sub->posted < sub->done + NCCL_STEPS) { + int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; if (resources->shared) { - char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS]; - NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 0, args->channel->id, buffSize, ptr)); - } - args->done += args->sliceSteps; - - if (resources->shared == 0) { - resources->sendMem->head = args->done; - } + char* ptr; + int sharedBuffSlot = sub->posted%NCCL_STEPS; + NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 0, sub->channel->id, sharedBuffSlot, s, &ptr)); + resources->recvMem->ptrsFifo[buffSlot] = ptr; + __sync_synchronize(); + volatile uint64_t* sendHead = &resources->sendMem->head; + sub->posted += args->sliceSteps; + *sendHead = sub->base + sub->posted - NCCL_STEPS; + } else sub->posted += args->sliceSteps; args->idle = 0; - if (args->done == args->end) { - resources->step = args->end; - args->state = ncclProxyOpNone; - } - return ncclSuccess; + continue; } + // Check whether we received data from the GPU and send it to the network + if (sub->transmitted < sub->posted && sub->transmitted < sub->done + NCCL_STEPS) { + int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + volatile int* sizesFifo = resources->recvMem->sizesFifo; + volatile uint64_t* recvTail = &resources->recvMem->tail; + if (sizesFifo[buffSlot] != -1 && ((*recvTail > (sub->base+sub->transmitted)) || p == NCCL_PROTO_LL)) { + // We have something to receive, let's check if it's completely ready. + int size = sizesFifo[buffSlot]; + char* buff = resources->shared ? (char*)resources->recvMem->ptrsFifo[buffSlot] : localBuff+buffSlot*stepSize; + int ready = 1; + if (p == NCCL_PROTO_LL128) { + ready = resources->useGdr; + if (!ready) { + // When data is in sysmem, we need to wait until all flags are correct since the GPU only + // called threadfence() + uint64_t flag = sub->base+sub->transmitted+1; + int nFifoLines = DIVUP(sizesFifo[buffSlot], sizeof(uint64_t)*NCCL_LL128_LINEELEMS); + volatile uint64_t* lines = (volatile uint64_t*)buff; + ready = 1; + for (int i=0; ibase+sub->transmitted+1); + int nFifoLines = DIVUP(size, sizeof(union ncclLLFifoLine)); + union ncclLLFifoLine* lines = (union ncclLLFifoLine*)buff; + for (int i=0; icurr_hdp_reg && args->hdp_flushed < LOAD(recvTail)) { + args->hdp_flushed = LOAD(recvTail); + STORE(resources->curr_hdp_reg, 1); + } + // Data is ready, try to send. + NCCLCHECK(ncclNetIsend(resources->netSendComm, buff, size, mhandle, sub->requests+buffSlot)); + if (sub->requests[buffSlot] != NULL) { +#ifdef ENABLE_PROFILING + if (sub->channel->active_req == 0) { + gettimeofday(&sub->channel->tvs, NULL); + sub->channel->sizes = 0; + } + sub->channel->active_req ++; + sub->channel->sizes += LOAD(sizesFifo+buffSlot); + sub->channel->send_byte += LOAD(sizesFifo+buffSlot); +#endif + TRACE(NCCL_NET, "sendProxy [%lu/%d] Isend (LL) posted, req %p", sub->transmitted, buffSlot, sub->requests[buffSlot]); + sizesFifo[buffSlot] = -1; + // Make sure size is reset to zero before we update the head. + __sync_synchronize(); + sub->transmitted += args->sliceSteps; + args->idle = 0; + continue; + } + } + } + } + // Check whether the network has completed some send operations. + if (sub->done < sub->transmitted) { + int done; + int buffSlot = (sub->base+sub->done)%NCCL_STEPS; + NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL)); + if (done) { + TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", sub->done, buffSlot, sub->requests[buffSlot]); +#ifdef ENABLE_PROFILING + if (args->protocol == NCCL_PROTO_SIMPLE) { + sub->channel->active_req --; + if (sub->channel->active_req == 0) { + struct timeval tv; + gettimeofday(&tv, NULL); + float delta = (tv.tv_sec - sub->channel->tvs.tv_sec)*1E6 + tv.tv_usec - sub->channel->tvs.tv_usec; + if (delta) { + sub->channel->bw_cumulative += (float)sub->channel->sizes/delta/1E3; + sub->channel->bw_count ++; + } + } + } +#endif + sub->done += args->sliceSteps; + + if (resources->shared == 0) { + resources->sendMem->head = sub->base + sub->done; + } + args->idle = 0; + if (sub->done == sub->nsteps) { + resources->step = sub->base + sub->nsteps; + args->done++; + } + } + } + } + if (args->done == args->nsubs) { + args->state = ncclProxyOpNone; } } return ncclSuccess; } ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { - struct netRecvResources* resources = (struct netRecvResources*) (args->connector->transportResources); if (args->state == ncclProxyOpReady) { - // Round to next multiple of sliceSteps - resources->step = ROUNDUP(resources->step, args->chunkSteps); - args->posted = args->received = args->transmitted = args->done = resources->step; - args->end = resources->step + args->nsteps; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources); + // Round to next multiple of sliceSteps + sub->base = ROUNDUP(resources->step, args->chunkSteps); + sub->posted = sub->received = sub->transmitted = sub->done = 0; + } args->state = ncclProxyOpProgress; } args->idle = 1; if (args->state == ncclProxyOpProgress) { int p = args->protocol; - int stepSize = args->connector->comm->buffSizes[p] / NCCL_STEPS; - char* localBuff = args->connector->conn.buffs[p]; - void* mhandle = *(resources->mhandlesProto[p]); - int buffSize = stepSize*args->sliceSteps; - if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR; - if (args->recvbytes < buffSize) buffSize = args->recvbytes; - if ((args->posted < args->done + NCCL_STEPS) && (args->posted < args->end)) { - int buffSlot = args->posted%NCCL_STEPS; - char* ptr; - if (resources->shared) { - NCCLCHECK(ncclProxySharedBuffersAlloc(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, &ptr)); - if (ptr == NULL) return ncclInternalError; - volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; - STORE(ptrsFifo+buffSlot, ptr); - } else { - ptr = localBuff+buffSlot*stepSize; - } - NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot)); - if (args->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]); -#ifdef ENABLE_PROFILING - if (args->protocol == NCCL_PROTO_SIMPLE) { - if (args->channel->active_req == 0) { - gettimeofday(&args->channel->tvs, NULL); - args->channel->sizes = 0; - } - args->channel->active_req ++; + for (int s=0; snsubs; s++) { + struct ncclProxySubArgs* sub = args->subs+s; + if (sub->done == sub->nsteps) continue; + struct netRecvResources* resources = (struct netRecvResources*) (sub->connector->transportResources); + void* mhandle = *(resources->mhandlesProto[p]); + int stepSize = sub->connector->comm->buffSizes[p] / NCCL_STEPS; + char* localBuff = sub->connector->conn.buffs[p]; + int buffSize = stepSize*args->sliceSteps; + if (resources->shared) buffSize /= SENDRECV_SLICEFACTOR; + if (sub->recvbytes < buffSize) buffSize = sub->recvbytes; + + if ((sub->posted < sub->done + NCCL_STEPS) && (sub->posted < sub->nsteps)) { + int buffSlot = (sub->base+sub->posted)%NCCL_STEPS; + char* ptr; + if (resources->shared) { + int sharedBuffSlot = sub->posted%NCCL_STEPS; + NCCLCHECK(ncclProxySharedBuffersGetP2p(sub->connector->comm, resources->useGdr, 1, sub->channel->id, sharedBuffSlot, s, &ptr)); + volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; + ptrsFifo[buffSlot] = ptr; + } else { + ptr = localBuff+buffSlot*stepSize; } -#endif - args->posted += args->sliceSteps; - args->idle = 0; - return ncclSuccess; - } else if (resources->shared) { - NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr)); - } - } - if (args->posted > args->received) { - int buffSlot = args->received%NCCL_STEPS; - int done, size; - NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, &size)); - if (done) { - args->received += args->sliceSteps; + NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, sub->requests+buffSlot)); + if (sub->requests[buffSlot] != NULL) { + TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", sub->posted, buffSlot, sub->requests[buffSlot]); #ifdef ENABLE_PROFILING - if (args->protocol == NCCL_PROTO_SIMPLE) { - args->channel->active_req --; - args->channel->sizes += size; - args->channel->recv_byte += size; - if (args->channel->active_req == 0) { - struct timeval tv; - gettimeofday(&tv, NULL); - float delta = (tv.tv_sec - args->channel->tvs.tv_sec)*1E6 + tv.tv_usec - args->channel->tvs.tv_usec; - if (delta) { - args->channel->bw_cumulative += (float)args->channel->sizes/delta/1E3; - args->channel->bw_count ++; + if (args->protocol == NCCL_PROTO_SIMPLE) { + if (sub->channel->active_req == 0) { + gettimeofday(&sub->channel->tvs, NULL); + sub->channel->sizes = 0; + } + sub->channel->active_req ++; + } +#endif + sub->posted += args->sliceSteps; + args->idle = 0; + continue; + } + } + if (sub->posted > sub->received) { + int buffSlot = (sub->base+sub->received)%NCCL_STEPS; + int done, size; + NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, &size)); + if (done) { + sub->received += args->sliceSteps; +#ifdef ENABLE_PROFILING + if (args->protocol == NCCL_PROTO_SIMPLE) { + sub->channel->active_req --; + sub->channel->sizes += size; + sub->channel->recv_byte += size; + if (sub->channel->active_req == 0) { + struct timeval tv; + gettimeofday(&tv, NULL); + float delta = (tv.tv_sec - sub->channel->tvs.tv_sec)*1E6 + tv.tv_usec - sub->channel->tvs.tv_usec; + if (delta) { + sub->channel->bw_cumulative += (float)sub->channel->sizes/delta/1E3; + sub->channel->bw_count ++; + } } } - } #endif - if (size > 0 && args->protocol == NCCL_PROTO_SIMPLE && resources->useGdr) { - // Don't pass data to the GPU yet, flush first. - volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; - char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize; - NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, args->requests+buffSlot)); - } else { - args->requests[buffSlot] = NULL; + if (size > 0 && p == NCCL_PROTO_SIMPLE && resources->useGdr) { + // Don't pass data to the GPU yet, flush first. + + // GDRCOPY support + if (resources->devFlushMem) { +#if defined (__x86_64__) + // Force a PCI-E read from GPU memory + asm volatile ("mov (%0), %%eax" :: "l"(resources->devFlushMem) : "%eax"); +#else + WARN("NET: GDR Flush only supported on x86_64"); + return ncclInternalError; +#endif + sub->requests[buffSlot] = NULL; + } else { + volatile void** ptrsFifo = (volatile void**)resources->recvMem->ptrsFifo; + char* ptr = resources->shared ? (char*)(ptrsFifo[buffSlot]) : localBuff+buffSlot*stepSize; + NCCLCHECK(ncclNetIflush(resources->netRecvComm, ptr, size, mhandle, sub->requests+buffSlot)); + } + } else { + sub->requests[buffSlot] = NULL; + } + args->idle = 0; + continue; + } + } + if (sub->received > sub->transmitted) { + // Progress flush operations + int buffSlot = (sub->base+sub->transmitted)%NCCL_STEPS; + int done = 1; + if (sub->requests[buffSlot]) NCCLCHECK(ncclNetTest(sub->requests[buffSlot], &done, NULL)); + if (done) { + sub->transmitted += args->sliceSteps; + __sync_synchronize(); + if (resources->devRecvMem) { + // GDRCOPY support: Write updated tail directly to the device memory + resources->devRecvMem->tail = sub->base + sub->transmitted; + wc_store_fence(); // Flush out WC write + } else { + resources->recvMem->tail = sub->base + sub->transmitted; + } + args->idle = 0; + continue; + } + } + if (sub->transmitted > sub->done) { + volatile uint64_t* sendHead = &resources->sendMem->head; + uint64_t done = *sendHead; + while (done > sub->base + sub->done && + // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. + sub->transmitted > sub->done) { + sub->done += args->sliceSteps; + args->idle = 0; + if (sub->done == sub->nsteps) { + resources->step = sub->base + sub->nsteps; + args->done++; + } } - args->idle = 0; - return ncclSuccess; } } - if (args->received > args->transmitted) { - // Progress flush operations - int buffSlot = args->transmitted%NCCL_STEPS; - int done = 1; - if (args->requests[buffSlot]) NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL)); - if (done) { - args->transmitted += args->sliceSteps; - __sync_synchronize(); - resources->recvMem->tail = args->transmitted; - args->idle = 0; - return ncclSuccess; - } - } - if (args->transmitted > args->done) { - volatile uint64_t* sendHead = &resources->sendMem->head; - uint64_t done = LOAD(sendHead); - while (done > args->done && - // LL and LL128 can acknowledge 0-bytes send before they even happen. Don't go past what we transmitted. - args->transmitted > args->done) { - if (resources->shared) { - char* ptr = (char*)resources->recvMem->ptrsFifo[args->done%NCCL_STEPS]; - NCCLCHECK(ncclProxySharedBuffersFree(args->connector->comm, resources->useGdr, 1, args->channel->id, buffSize, ptr)); - } - args->done += args->sliceSteps; - args->idle = 0; - if (args->done == args->end) { - resources->step = args->end; - args->state = ncclProxyOpNone; - } - } + if (args->done == args->nsubs) { + args->state = ncclProxyOpNone; } } return ncclSuccess; diff --git a/src/transport/net_ib.cc b/src/transport/net_ib.cc index 73893cfb72..8d59af5fad 100644 --- a/src/transport/net_ib.cc +++ b/src/transport/net_ib.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -649,22 +649,22 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo NCCLCHECK(ncclIbGetRequest(&comm->verbs, &req)); req->size = size; - struct ibv_send_wr wr; - memset(&wr, 0, sizeof(wr)); - wr.wr_id = (uint64_t)req; + struct ibv_send_wr wr[2]; + memset(&wr[0], 0, sizeof(wr[0])); + wr[0].wr_id = (uint64_t)req; struct ibv_sge sge; if (size == 0) { - wr.sg_list = NULL; - wr.num_sge = 0; + wr[0].sg_list = NULL; + wr[0].num_sge = 0; } else { sge.addr=(uintptr_t)data; sge.length=(unsigned int)size; sge.lkey=mr->lkey; - wr.sg_list = &sge; - wr.num_sge = 1; + wr[0].sg_list = &sge; + wr[0].num_sge = 1; } #if USE_RDMA_WRITE == 0 - wr.opcode = IBV_WR_SEND; - wr.send_flags = IBV_SEND_SIGNALED; + wr[0].opcode = IBV_WR_SEND; + wr[0].send_flags = IBV_SEND_SIGNALED; #else __sync_synchronize(); // order the readyPtr load against rkey load below // Sanity checks to catch user collective call count/size mismatches @@ -674,15 +674,11 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo size, LOAD(&slot->size), LOAD(&slot->addr), LOAD(&slot->rkey), LOAD(&slot->seq), comm->fifoHead); return ncclInternalError; } - int useAr = 0; - if (size > ncclParamIbArThreshold()) { - useAr = 1; - } - wr.opcode = useAr ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_WRITE_WITH_IMM; - wr.send_flags = useAr ? 0 : IBV_SEND_SIGNALED; - wr.wr.rdma.remote_addr = LOAD(&slot->addr); - wr.wr.rdma.rkey = LOAD(&slot->rkey); - wr.imm_data = size; // Send the message size via imm_data + wr[0].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + wr[0].send_flags = IBV_SEND_SIGNALED; + wr[0].wr.rdma.remote_addr = slot->addr; + wr[0].wr.rdma.rkey = slot->rkey; + wr[0].imm_data = size; // Send the message size via imm_data __sync_synchronize(); #endif // We must clear slot->ready, but reset other fields to aid @@ -692,21 +688,29 @@ ncclResult_t ncclIbIsend(void* sendComm, void* data, int size, void* mhandle, vo STORE(&slot->rkey, 0); STORE(&slot->size, 0); STORE(&slot->seq, 0); comm->fifoHead++; - struct ibv_send_wr* bad_wr; - NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr)); #if USE_RDMA_WRITE // When using adaptive routing, send the bulk of the data first as an // RDMA_WRITE, then a 0-byte RDMA_WRITE_WITH_IMM to trigger a remote // completion. - if (useAr) { - wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; - wr.sg_list = NULL; - wr.num_sge = 0; - wr.send_flags |= IBV_SEND_SIGNALED; - NCCLCHECK(wrap_ibv_post_send(comm->qp, &wr, &bad_wr)); + if (size > ncclParamIbArThreshold()) { + memset(&wr[1], 0, sizeof(wr[1])); + memcpy(&wr[1], &wr[0], sizeof(wr[0])); + wr[1].sg_list = NULL; + wr[1].num_sge = 0; + wr[0].next = &wr[1]; + + wr[0].opcode = IBV_WR_RDMA_WRITE; + wr[1].opcode = IBV_WR_RDMA_WRITE_WITH_IMM; + + wr[0].send_flags = 0; + wr[1].send_flags = IBV_SEND_SIGNALED; } #endif + + struct ibv_send_wr* bad_wr; + NCCLCHECK(wrap_ibv_post_send(comm->qp, wr, &bad_wr)); + *request = req; return ncclSuccess; } diff --git a/src/transport/net_socket.cc b/src/transport/net_socket.cc index 13e155c0fb..0db532c92a 100644 --- a/src/transport/net_socket.cc +++ b/src/transport/net_socket.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -11,9 +11,7 @@ #include "net.h" #include "param.h" -#include #include -#include #include #include #include diff --git a/src/transport/p2p.cc b/src/transport/p2p.cc index 1484380af5..ea83e6975c 100644 --- a/src/transport/p2p.cc +++ b/src/transport/p2p.cc @@ -1,6 +1,6 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. - * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. + * Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -165,17 +165,12 @@ static ncclResult_t p2pMap(struct ncclPeerInfo* myInfo, struct ncclPeerInfo* pee /* Send: Create and return connect structures for this peer to connect to me */ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, - struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { - + struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct p2pSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); - int sendSize = sizeof(struct ncclSendMem); - // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure - if (useRead) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE]; - ALIGN_SIZE(sendSize, CUDA_IPC_MIN); resources->next_hdp_reg = 0; uint32_t linktype, hops; @@ -189,16 +184,22 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st } struct p2pConnectInfo info; - info.read = useRead; + // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) + info.read = (connIndex == 0) ? useRead : 0; const char* useReadStr = info.read ? "/read" : ""; + int sendSize = sizeof(struct ncclSendMem); + // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure + if (info.read) sendSize += send->comm->buffSizes[NCCL_PROTO_SIMPLE]; + ALIGN_SIZE(sendSize, CUDA_IPC_MIN); + resources->remoteId = -1; resources->bootstrap = comm->bootstrap; if (intermediateRank == -1) { NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, sendSize, true)); info.rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash) { - if (useRead == 0) send->conn.direct |= NCCL_DIRECT_GPU; + if (info.read == 0) send->conn.direct |= NCCL_DIRECT_GPU; INFO(NCCL_INIT|NCCL_P2P, "Channel %02d : %d[%lx] -> %d[%lx] via P2P/direct pointer%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, useReadStr); } else { @@ -224,20 +225,21 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st /* Create and return connect structures for this peer to connect to me */ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, - struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) { - + struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { struct p2pRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; int useRead, intermediateRank; NCCLCHECK(p2pGetInfo(comm->topo, myInfo, peerInfo, &useRead, &intermediateRank)); - int recvSize = offsetof(struct ncclRecvMem, buff); - // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure - for (int p=0; pcomm->buffSizes[p]; - ALIGN_SIZE(recvSize, CUDA_IPC_MIN); struct p2pConnectInfo info; - info.read = useRead; + // For CollNet, we use write for scatter-reduce (conn 1), read for broadcast-gather (conn 0) + info.read = (connIndex == 0) ? useRead : 0; + + int recvSize = offsetof(struct ncclRecvMem, buff); + // For P2P Read the SIMPLE buffer is tagged on the end of the ncclSendMem structure + for (int p=0; pcomm->buffSizes[p]; + ALIGN_SIZE(recvSize, CUDA_IPC_MIN); resources->remoteId = -1; resources->bootstrap = comm->bootstrap; @@ -245,7 +247,7 @@ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st NCCLCHECK(ncclCudaCalloc((char**)&info.directPtr, recvSize, true)); info.rank = myInfo->rank; if (myInfo->pidHash == peerInfo->pidHash) { - if (useRead == 0) recv->conn.direct |= NCCL_DIRECT_GPU; + if (info.read == 0) recv->conn.direct |= NCCL_DIRECT_GPU; } else { CUDACHECK(hipIpcGetMemHandle(&info.devIpc, info.directPtr)); } diff --git a/src/transport/shm.cc b/src/transport/shm.cc index d8a5b52d2c..98e25a9dd2 100644 --- a/src/transport/shm.cc +++ b/src/transport/shm.cc @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -57,8 +57,7 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop #define MAX_SHM_NAME_LEN 1024 /* Create and return connect structures for this peer to connect to me */ -ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { - +ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { struct shmSendResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); send->transportResources = resources; @@ -81,7 +80,7 @@ ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st return ncclSuccess; } -ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) { +ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { struct shmRecvResources* resources; NCCLCHECK(ncclCalloc(&resources, 1)); recv->transportResources = resources; diff --git a/test/test_AllReduceAbort.cpp b/test/test_AllReduceAbort.cpp index 23d7129814..e91dcfa0b9 100644 --- a/test/test_AllReduceAbort.cpp +++ b/test/test_AllReduceAbort.cpp @@ -40,7 +40,7 @@ namespace CorrectnessTests hipStream_t stream; HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); struct ncclChannel* channel = comm->channels; - uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send.conn.head)); + uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head)); uint64_t *real_head, *fake_head, *fake_h; // get original head diff --git a/test/test_BroadcastAbort.cpp b/test/test_BroadcastAbort.cpp index ddab18a8e9..e7b3dfd07d 100644 --- a/test/test_BroadcastAbort.cpp +++ b/test/test_BroadcastAbort.cpp @@ -41,7 +41,7 @@ namespace CorrectnessTests hipStream_t stream; HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); struct ncclChannel* channel = comm->channels; - uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send.conn.head)); + uint64_t **p_dev_head = (uint64_t **)((uint8_t*)(channel->devPeers + channel->ring.next) + offsetof(struct ncclPeer, send[0].conn.head)); uint64_t *real_head, *fake_head, *fake_h; // get original head diff --git a/tools/topo_expl/include/utils.h b/tools/topo_expl/include/utils.h index 62dd89d1cb..d249dd3697 100644 --- a/tools/topo_expl/include/utils.h +++ b/tools/topo_expl/include/utils.h @@ -17,6 +17,7 @@ struct allGather1Data_t { // AllGather3 - begin struct ncclGraphInfo { int pattern; + int nChannels; int sameChannels; float speedIntra; float speedInter; @@ -25,11 +26,8 @@ struct ncclGraphInfo { }; struct allGather3Data_t{ - int cudaCompCap; - int fullCudaCompCap; - int nChannels; + int collNetSupport; int nc; - int alltoallDisable; struct ncclGraphInfo tree; struct ncclGraphInfo ring; struct ncclGraphInfo collNet; diff --git a/tools/topo_expl/model.cpp b/tools/topo_expl/model.cpp index 4585f3048d..58a0cd0d06 100644 --- a/tools/topo_expl/model.cpp +++ b/tools/topo_expl/model.cpp @@ -84,7 +84,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop /* Send: Create and return connect structures for this peer to connect to me */ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, - struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { + struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { if (myInfo->pidHash == peerInfo->pidHash) { if (myInfo->cudaDev == peerInfo->cudaDev) { INFO(NCCL_INIT|NCCL_P2P,"Ring %02d : %d[%d] -> %d[%d] via P2P/common device", channelId, myInfo->rank, myInfo->cudaDev, peerInfo->rank, peerInfo->cudaDev); @@ -103,7 +103,7 @@ ncclResult_t p2pSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st /* Create and return connect structures for this peer to connect to me */ ncclResult_t p2pRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, - struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId) { + struct ncclConnect* connectInfo, struct ncclConnector * recv, int channelId, int connIndex) { return ncclSuccess; } @@ -126,12 +126,12 @@ ncclResult_t shmCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop } /* Create and return connect structures for this peer to connect to me */ -ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { +ncclResult_t shmSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { INFO(NCCL_INIT|NCCL_SHM,"Ring %02d : %d[%lx] -> %d[%lx] via direct shared memory", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId); return ncclSuccess; } -ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) { +ncclResult_t shmRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { return ncclSuccess; } @@ -148,10 +148,12 @@ ncclResult_t netCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop return ncclSuccess; } -ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { +ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { int netDev, useGdr = 0; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &netDev)); + // Send/Receive: Round-robin NICs based on the receiver's CUDA device + int nicRR = comm->peerInfo[peerInfo->rank].cudaDev; + NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &netDev)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 1, &useGdr)); INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [send] via NET/%s/%d%s", channelId, myInfo->rank, myInfo->busId, peerInfo->rank, peerInfo->busId, ncclNetName(), netDev, @@ -159,10 +161,14 @@ ncclResult_t netSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, st return ncclSuccess; } -ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) { +NCCL_PARAM(NetGdrLevel, "NET_GDR_LEVEL", PATH_PHB); + +ncclResult_t netRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { int netDev, useGdr = 0; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &netDev)); + // Send/Receive: Round-robin NICs based on the receiver's CUDA device + int nicRR = comm->cudaDev; + NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, nicRR, &netDev)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 0, &useGdr)); INFO(NCCL_INIT|NCCL_NET,"Ring %02d : %d[%lx] -> %d[%lx] [receive] via NET/%s/%d%s", channelId, peerInfo->rank, peerInfo->busId, myInfo->rank, myInfo->busId, ncclNetName(), netDev, @@ -183,20 +189,20 @@ ncclResult_t collNetCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncc return ncclSuccess; } -ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId) { +ncclResult_t collNetSendSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* send, int channelId, int connIndex) { int netDev, useGdr = 0; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &netDev)); + NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 1, &useGdr)); INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [send] via COLLNET/%s/%d%s", channelId, myInfo->rank, "SHARP", netDev, useGdr ? "/GDRDMA" : ""); return ncclSuccess; } -ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId) { +ncclResult_t collNetRecvSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connectInfo, struct ncclConnector* recv, int channelId, int connIndex) { int netDev, useGdr = 0; - NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, &netDev)); + NCCLCHECK(ncclTopoGetNetDev(comm->topo, myInfo->rank, graph, channelId, 0, &netDev)); NCCLCHECK(ncclTopoCheckGdr(comm->topo, myInfo->busId, netDev, 0, &useGdr)); INFO(NCCL_INIT|NCCL_NET,"Coll %02d : %d [receive] via COLLNET/%s/%d%s", channelId, myInfo->rank, "SHARP", netDev, useGdr ? "/GDRDMA" : ""); diff --git a/tools/topo_expl/topo_expl.cpp b/tools/topo_expl/topo_expl.cpp index 2c671e8a7b..4c7e1b2b85 100644 --- a/tools/topo_expl/topo_expl.cpp +++ b/tools/topo_expl/topo_expl.cpp @@ -191,6 +191,9 @@ int main(int argc,char* argv[]) assert(node_model!=0); comm[i].topo = node_model->getSystem(i); bootstrapAllGather(&comm[i], allGather1Data); + // Mark channels as non initialized. + for (int c=0; crank; + int nranks = comm->nRanks; + int nMasters = comm->nNodes; int rankInCollNet = -1; int supported = 0; int isMaster = (rank == masterRank) ? 1 : 0; @@ -161,21 +166,22 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap } // send master receives connect info from peer recv master - if (isMaster && type == 0) { - //NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange))); + if (isMaster && type == collNetSend) { + //NCCLCHECK(bootstrapRecv(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange))); rankInCollNet = sendrecvExchange.collNetRank; - INFO(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer); + TRACE(NCCL_INIT, "CollNet [send] : rank %d collNetRank %d collNetNranks %d received connect from rank %d", rank, rankInCollNet, nMasters, masterPeer); } // select struct ncclPeer* root = channel->peers+nranks; - struct ncclConnector* conn = (type == 1) ? &root->recv : &root->send; - struct ncclTransportComm* transportComm = (type == 1) ? &(collNetTransport.recv) : &(collNetTransport.send); + // connector index: 0 for recv, 1 for send + struct ncclConnector* conn = (type == collNetRecv) ? root->recv+type : root->send+type; + struct ncclTransportComm* transportComm = (type == collNetRecv) ? &(collNetTransport.recv) : &(collNetTransport.send); conn->transportComm = transportComm; // setup struct ncclConnect myConnect; if (isMaster && ret > 0) { - NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, channel->id)); + NCCLCHECK(transportComm->setup(comm, collNetGraph, myInfo, peerInfo, &myConnect, conn, collNetGraphChannelId, type)); } // prepare connect handles ncclResult_t res; @@ -184,63 +190,89 @@ static int collNetSetup(struct ncclComm* comm, struct ncclTopoGraph* collNetGrap ncclConnect connect; } *allConnects = NULL; ncclConnect *masterConnects = NULL; - //NCCLCHECK(ncclCalloc(&masterConnects, nMasters)); - if (type == 1) { // recv side: AllGather + NCCLCHECK(ncclCalloc(&masterConnects, nMasters)); + if (type == collNetRecv) { // recv side: AllGather // all ranks must participate - //NCCLCHECK(ncclCalloc(&allConnects, nranks)); - //allConnects[rank].isMaster = isMaster; - //memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect)); + NCCLCHECK(ncclCalloc(&allConnects, nranks)); + allConnects[rank].isMaster = isMaster; + memcpy(&(allConnects[rank].connect), &myConnect, sizeof(struct ncclConnect)); //NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allConnects, sizeof(*allConnects)), res, cleanup); // consolidate - //int c = 0; - //for (int r = 0; r < nranks; r++) { - //if (allConnects[r].isMaster) { - //memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect)); - //if (r == rank) rankInCollNet = c; - //c++; - //} - //} + int c = 0; + for (int r = 0; r < nranks; r++) { + if (allConnects[r].isMaster) { + memcpy(masterConnects+c, &(allConnects[r].connect), sizeof(struct ncclConnect)); + if (r == rank) rankInCollNet = c; + c++; + } + } } else { // send side : copy in connect info received from peer recv master //if (isMaster) memcpy(masterConnects+rankInCollNet, &(sendrecvExchange.connect), sizeof(struct ncclConnect)); } // connect if (isMaster && ret > 0) { //NCCLCHECKGOTO(transportComm->connect(comm, masterConnects, nMasters, rankInCollNet, conn), res, cleanup); - //struct ncclPeer* devRoot = channel->devPeers+nranks; - //struct ncclConnector* devConn = (type == 1) ? &devRoot->recv : &devRoot->send; + struct ncclPeer* devRoot = channel->devPeers+nranks; + struct ncclConnector* devConn = (type == collNetRecv) ? devRoot->recv+type : devRoot->send+type; //CUDACHECKGOTO(hipMemcpy(devConn, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice), res, cleanup); } // recv side sends connect info to send side - if (isMaster && type == 1) { - //sendrecvExchange.collNetRank = rankInCollNet; + if (isMaster && type == collNetRecv) { + sendrecvExchange.collNetRank = rankInCollNet; //memcpy(&sendrecvExchange.connect, masterConnects+rankInCollNet, sizeof(struct ncclConnect)); - //NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup); - INFO(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer); + //NCCLCHECKGOTO(bootstrapSend(comm->bootstrap, masterPeer, collNetGraph->id, &sendrecvExchange, sizeof(sendrecvExchange)), res, cleanup); + TRACE(NCCL_INIT, "CollNet [recv] : rank %d collNetRank %d collNetNranks %d sent connect to rank %d", rank, rankInCollNet, nMasters, masterPeer); } if (ret > 0) { supported = 1; } cleanup: - //if (allConnects != NULL) free(allConnects); - //if (masterConnects != NULL) free(masterConnects); + if (allConnects != NULL) free(allConnects); + if (masterConnects != NULL) free(masterConnects); return supported; } -static ncclResult_t checkCollNetSetup(struct ncclComm* comm, int rank, int collNetSetupFail) { - comm->collNetSupport = 1; - return ncclSuccess; -} - void initCollNet() { if (ncclParamCollNetEnable() == 1 && ncclCollNet == 0) ncclCollNet = (ncclCollNet_t*)0x12345678; } +ncclResult_t ncclTransportCollNetCheck(struct ncclComm* comm, int collNetSetupFail) { + int rank = comm->rank; + int nranks = comm->nRanks; + // AllGather collNet setup results + int* allGatherFailures; + NCCLCHECK(ncclCalloc(&allGatherFailures, nranks)); + allGatherFailures[rank] = collNetSetupFail; + //NCCLCHECK(bootstrapAllGather(comm->bootstrap, allGatherFailures, sizeof(int))); + for (int i=0; inChannels; r++) { + struct ncclChannel* channel = comm->channels+r; + struct ncclPeer* peer = channel->peers+nranks; + //if (peer->send->transportResources && peer->send->transportComm) NCCLCHECK(peer->send->transportComm->free(peer->send->transportResources)); + //if (peer->recv->transportResources && peer->recv->transportComm) NCCLCHECK(peer->recv->transportComm->free(peer->recv->transportResources)); + peer->send->transportResources = NULL; // avoid double free + peer->recv->transportResources = NULL; // avoid double free + } + // Set support to 0 + comm->collNetSupport = 0; + } + return ncclSuccess; +} + ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t *allGather1Data, struct allGather3Data_t *allGather3Data, struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) { int rank = comm->rank; int nranks = comm->nRanks; - //uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES); //TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks); //NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap)); @@ -301,12 +333,11 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t } struct ncclComm* intraRank0Comm = allGather1Data[intraRank0].comm; - //free(allGather1Data); - // AllGather1 - end // Topo detection / System graph creation //NCCLCHECK(ncclTopoGetSystem(comm, &comm->topo)); + // save nRanks to ncclTopoSystem as indicator of multi-node comm->topo->nRanks = comm->nRanks; // Compute paths between GPUs and NICs NCCLCHECK(ncclTopoComputePaths(comm->topo, comm->peerInfo)); @@ -345,7 +376,8 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t collNetGraph.pattern = NCCL_TOPO_PATTERN_TREE; collNetGraph.collNet = 1; collNetGraph.crossNic = ncclParamCrossNic(); - collNetGraph.minChannels = collNetGraph.maxChannels = ringGraph.nChannels; + collNetGraph.minChannels = 1; + collNetGraph.maxChannels = ringGraph.nChannels; NCCLCHECK(ncclTopoCompute(comm->topo, &collNetGraph)); NCCLCHECK(ncclTopoPrintGraph(comm->topo, &collNetGraph)); @@ -354,10 +386,26 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t NCCLCHECK(ncclTopoDumpGraphs(comm->topo, 3, graphs)); } + // Determine CollNet support + if (tmpNnodes > 1 && ncclParamCollNetEnable() == 1 && collNetSupport() == 1 && collNetGraph.nChannels > 0) comm->collNetSupport = 1; + if (intraRanks > 8) { + if (comm->collNetSupport == 1) WARN("CollNet currently only supports up to 8 GPUs per node"); + comm->collNetSupport = 0; + } + + if ((comm->topo->type & RCCL_TOPO_4P2H_ROME) && (comm->topo->type & RCCL_TOPO_GDR_ALL)) { + if (rcclParamP2pNetDisable() == 0) { + STORE(comm->p2pNet, 1); + INFO(NCCL_INIT, "RCCL enabled same node P2P over network"); + } + else + INFO(NCCL_INIT, "RCCL force disabled same node P2P over network"); + } // AllGather3 - begin #if 0 struct ncclGraphInfo { int pattern; + int nChannels; int sameChannels; float speedIntra; float speedInter; @@ -366,9 +414,7 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t }; struct { - int cudaCompCap; - int fullCudaCompCap; - int nChannels; + int collNetSupport; int nc; struct ncclGraphInfo tree; struct ncclGraphInfo ring; @@ -380,52 +426,44 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t #endif int idx; NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx)); - allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap; - allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = - std::min(treeGraph.nChannels, ringGraph.nChannels); - allGather3Data[rank].nc = comm->nChannels*2; - if (comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 908) allGather3Data[rank].nc = std::max(allGather3Data[rank].nc, 4); + allGather3Data[rank].nc = 2; if (comm->topo->nodes[GPU].count == comm->topo->nRanks && (comm->topo->type & RCCL_TOPO_CR8G)) - allGather3Data[rank].nc = comm->nChannels*4; - if (comm->topo->nodes[GPU].count != comm->topo->nRanks && comm->topo->nodes[NET].count && (comm->topo->type & RCCL_TOPO_4P2H_ROME)) - allGather3Data[rank].nc = (comm->topo->nodes[NET].count > 3 ? 2 : 4)*comm->topo->nodes[NET].count; + allGather3Data[rank].nc = 4; + if (comm->topo->nodes[GPU].count == comm->topo->nRanks && comm->topo->nodes[GPU].nodes[idx].gpu.gcn == 910) + allGather3Data[rank].nc = 6; allGather3Data[rank].tree.pattern = treeGraph.pattern; + allGather3Data[rank].tree.nChannels = treeGraph.nChannels; allGather3Data[rank].tree.sameChannels = treeGraph.sameChannels; allGather3Data[rank].tree.speedIntra = treeGraph.speedIntra; allGather3Data[rank].tree.speedInter = treeGraph.speedInter; allGather3Data[rank].tree.typeIntra = treeGraph.typeIntra; allGather3Data[rank].tree.typeInter = treeGraph.typeInter; allGather3Data[rank].ring.pattern = ringGraph.pattern; + allGather3Data[rank].ring.nChannels = ringGraph.nChannels; allGather3Data[rank].ring.sameChannels = ringGraph.sameChannels; allGather3Data[rank].ring.speedIntra = ringGraph.speedIntra; allGather3Data[rank].ring.speedInter = ringGraph.speedInter; allGather3Data[rank].ring.typeIntra = ringGraph.typeIntra; allGather3Data[rank].ring.typeInter = ringGraph.typeInter; allGather3Data[rank].collNet.pattern = collNetGraph.pattern; + allGather3Data[rank].collNet.nChannels = collNetGraph.nChannels; allGather3Data[rank].collNet.sameChannels = collNetGraph.sameChannels; allGather3Data[rank].collNet.speedIntra = collNetGraph.speedIntra; allGather3Data[rank].collNet.speedInter = collNetGraph.speedInter; allGather3Data[rank].collNet.typeIntra = collNetGraph.typeIntra; allGather3Data[rank].collNet.typeInter = collNetGraph.typeInter; + allGather3Data[rank].collNetSupport = comm->collNetSupport; - // CollNet channels are already duplicated - comm->collNetnChannels = 2*collNetGraph.nChannels; - NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &collNetGraph, &allGather3Data[rank].topoRanks)); - + comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); + NCCLCHECK(ncclTopoPreset(comm, &treeGraph, &ringGraph, &allGather3Data[rank].topoRanks)); return ncclSuccess; } ncclResult_t initChannel(struct ncclComm* comm, int channelid) { struct ncclChannel* channel = comm->channels+channelid; + if (channel->id != -1) return ncclSuccess; channel->id = channelid; - // Setup intermediate buffering - //int buffSize = ncclParamBuffsize(); - int cpuArch, cpuVendor, cpuModel; - NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel)); - //channel->buffSize = buffSize != -2 ? buffSize : - // cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES; - // Ring index to user rank table. //NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks)); NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks)); @@ -434,12 +472,25 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) { //NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network) NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1)); for (size_t i=0; inRanks+1; ++i) { - channel->peers[i].send.comm = comm; - channel->peers[i].recv.comm = comm; + for (int b=0; bpeers[i].send[b].comm = comm; + channel->peers[i].recv[b].comm = comm; + } } // Per-channel operation list. - //NCCLCHECK(ncclCudaHostAlloc((void**)&channel->collectives, (void**)&channel->devCollectives, sizeof(struct ncclColl)*NCCL_MAX_OPS)); + //NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS)); + //if (ncclGdrCopy != NULL && ncclParamGdrCopyFifoEnable() == 1) { + // GDRCOPY support + // We allocate a workFifo in GDR mapped CUDA memory + // But we still allocate the Host workFifo so that we + // can copy the work elements to CUDA memory on kernel launch + //NCCLCHECK(ncclGdrCudaCalloc(&channel->workFifoGdr, &channel->workFifoDev, NCCL_MAX_OPS, &channel->gdrMemDesc)); + //} else { + // The device workFifo is the Host one + //channel->workFifoDev = channel->workFifo; + //} + return ncclSuccess; } @@ -461,16 +512,34 @@ static ncclResult_t setupChannel(struct ncclComm* comm, int channelId, int rank, return ncclSuccess; } +static ncclResult_t connectedByXGMI(int* ret, struct ncclTopoSystem* system, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { + *ret = 0; + if (info1->hostHash != info2->hostHash) return ncclSuccess; + int g1, g2; + NCCLCHECK(ncclTopoRankToIndex(system, info1->rank, &g1)); + NCCLCHECK(ncclTopoRankToIndex(system, info2->rank, &g2)); + if (system->nodes[GPU].nodes[g1].paths[GPU][g2].type == PATH_NVL) *ret = 1; + return ncclSuccess; +} + template -static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclPeerInfo* myInfo, struct ncclPeerInfo* peerInfo, struct ncclConnect* connect, struct ncclConnector* connector, int channelId) { +static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* graph, struct ncclConnect* connect, int channelId, int peer, int connIndex) { + struct ncclPeerInfo* myInfo = comm->peerInfo+comm->rank; + struct ncclPeerInfo* peerInfo = comm->peerInfo+peer; + struct ncclConnector* connector = (type == 1) ? comm->channels[channelId].peers[peer].send + connIndex : + comm->channels[channelId].peers[peer].recv + connIndex; + int xgmi; + NCCLCHECK(connectedByXGMI(&xgmi, comm->topo, myInfo, peerInfo)); for (int t=0; tsend : &transport->recv; int ret = 0; NCCLCHECK(transport->canConnect(&ret, comm->topo, graph, myInfo, peerInfo)); if (ret) { connector->transportComm = transportComm; - NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId)); + NCCLCHECK(transportComm->setup(comm, graph, myInfo, peerInfo, connect, connector, channelId, connIndex)); return ncclSuccess; } } @@ -478,25 +547,30 @@ static ncclResult_t selectTransport(struct ncclComm* comm, struct ncclTopoGraph* return ncclInternalError; } -ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend) { +ncclResult_t ncclTransportP2pConnect(struct ncclComm* comm, struct ncclChannel* channel, int nrecv, int* peerRecv, int nsend, int* peerSend, int connIndex) { TRACE(NCCL_INIT, "nsend %d nrecv %d", nsend, nrecv); uint32_t mask = 1 << channel->id; for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].recv.connected) continue; + if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].recv[connIndex].connected) continue; comm->connectRecv[peer] |= mask; } for (int i=0; i= comm->nRanks || peer == comm->rank || channel->peers[peer].send.connected) continue; + if (peer == -1 || peer >= comm->nRanks || peer == comm->rank || channel->peers[peer].send[connIndex].connected) continue; comm->connectSend[peer] |= mask; } return ncclSuccess; } -ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph) { +ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* graph, int connIndex) { + // Stream used during transport setup; need for P2P pre-connect + CUDA Graph + //hipStream_t transportSetupStream; + //CUDACHECK(hipStreamCreateWithFlags(&transportSetupStream, hipStreamNonBlocking)); + struct ncclConnect data[2*MAXCHANNELS]; for (int i=1; inRanks; i++) { + int bootstrapTag = (i<<8) + (graph ? graph->id+1 : 0); int recvPeer = (comm->rank - i + comm->nRanks) % comm->nRanks; int sendPeer = (comm->rank + i) % comm->nRanks; uint32_t recvMask = comm->connectRecv[recvPeer]; @@ -506,50 +580,50 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph* int sendChannels = 0, recvChannels = 0; for (int c=0; cchannels[c].peers[recvPeer].recv; - NCCLCHECK(selectTransport<0>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+recvPeer, recvData+recvChannels++, conn, c)); + NCCLCHECK(selectTransport<0>(comm, graph, recvData+recvChannels++, c, recvPeer, connIndex)); } } struct ncclConnect* sendData = recvData+recvChannels; for (int c=0; cchannels[c].peers[sendPeer].send; - NCCLCHECK(selectTransport<1>(comm, graph, comm->peerInfo+comm->rank, comm->peerInfo+sendPeer, sendData+sendChannels++, conn, c)); + NCCLCHECK(selectTransport<1>(comm, graph, sendData+sendChannels++, c, sendPeer, connIndex)); } } if (sendPeer == recvPeer) { if (recvChannels+sendChannels) { - //NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); - //NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); + //NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); + //NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, data, sizeof(struct ncclConnect)*(recvChannels+sendChannels))); sendData = data; recvData = data+sendChannels; } } else { - //if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels)); - //if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels)); - //if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, sendData, sizeof(struct ncclConnect)*sendChannels)); - //if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, recvData, sizeof(struct ncclConnect)*recvChannels)); + //if (recvChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels)); + //if (sendChannels) NCCLCHECK(bootstrapSend(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels)); + //if (sendChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, sendPeer, bootstrapTag, sendData, sizeof(struct ncclConnect)*sendChannels)); + //if (recvChannels) NCCLCHECK(bootstrapRecv(comm->bootstrap, recvPeer, bootstrapTag, recvData, sizeof(struct ncclConnect)*recvChannels)); } for (int c=0; cchannels[c].peers[sendPeer].send; + struct ncclConnector* conn = comm->channels[c].peers[sendPeer].send + connIndex; //NCCLCHECK(conn->transportComm->connect(comm, sendData++, 1, comm->rank, conn)); conn->connected = 1; - //CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[sendPeer].send, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); + //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[sendPeer].send+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); } } for (int c=0; cchannels[c].peers[recvPeer].recv; + struct ncclConnector* conn = comm->channels[c].peers[recvPeer].recv + connIndex; //NCCLCHECK(conn->transportComm->connect(comm, recvData++, 1, comm->rank, conn)); conn->connected = 1; - //CUDACHECK(hipMemcpy(&comm->channels[c].devPeers[recvPeer].recv, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice)); + //CUDACHECK(hipMemcpyAsync(comm->channels[c].devPeers[recvPeer].recv+connIndex, conn, sizeof(struct ncclConnector), hipMemcpyHostToDevice, transportSetupStream)); } } comm->connectRecv[recvPeer] = comm->connectSend[sendPeer] = 0; } + //CUDACHECK(hipStreamSynchronize(transportSetupStream)); + //CUDACHECK(hipStreamDestroy(transportSetupStream)); return ncclSuccess; } @@ -587,24 +661,28 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t allTopoRanks[i] = &allGather3Data[i].topoRanks; nc = std::min(allGather3Data[i].nc, nc); // Make sure we align all ranks so that the tuning is consistent across ranks - treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels); + treeGraph.nChannels = std::min(allGather3Data[i].tree.nChannels, treeGraph.nChannels); treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels); treeGraph.speedIntra = std::min(allGather3Data[i].tree.speedIntra, treeGraph.speedIntra); treeGraph.speedInter = std::min(allGather3Data[i].tree.speedInter, treeGraph.speedInter); treeGraph.typeIntra = std::min(allGather3Data[i].tree.typeIntra, treeGraph.typeIntra); treeGraph.typeInter = std::min(allGather3Data[i].tree.typeInter, treeGraph.typeInter); + ringGraph.nChannels = std::min(allGather3Data[i].ring.nChannels, ringGraph.nChannels); ringGraph.sameChannels = std::min(allGather3Data[i].ring.sameChannels, ringGraph.sameChannels); ringGraph.speedIntra = std::min(allGather3Data[i].ring.speedIntra, ringGraph.speedIntra); ringGraph.speedInter = std::min(allGather3Data[i].ring.speedInter, ringGraph.speedInter); ringGraph.typeIntra = std::min(allGather3Data[i].ring.typeIntra, ringGraph.typeIntra); ringGraph.typeInter = std::min(allGather3Data[i].ring.typeInter, ringGraph.typeInter); + collNetGraph.nChannels = std::min(allGather3Data[i].collNet.nChannels, collNetGraph.nChannels); collNetGraph.sameChannels = std::min(allGather3Data[i].collNet.sameChannels, collNetGraph.sameChannels); collNetGraph.speedIntra = std::min(allGather3Data[i].collNet.speedIntra, collNetGraph.speedIntra); collNetGraph.speedInter = std::min(allGather3Data[i].collNet.speedInter, collNetGraph.speedInter); collNetGraph.typeIntra = std::min(allGather3Data[i].collNet.typeIntra, collNetGraph.typeIntra); collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter); + comm->collNetSupport = std::min(allGather3Data[i].collNetSupport, comm->collNetSupport); } + comm->nChannels = treeGraph.nChannels = ringGraph.nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels); if (comm->nChannels < nChannelsOrig) { // We started duplicating channels during Preset(), so we need to move the // duplicated channels since we have removed some. @@ -613,15 +691,7 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t int *rings; NCCLCHECK(ncclCalloc(&rings, nranks*MAXCHANNELS)); - - NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, nc)); - if (comm->nNodes > 1 && - ncclParamCollNetEnable() == 1 && - collNetSupport() && collNetGraph.nChannels) { - NCCLCHECK(ncclTopoConnectCollNet(comm, &collNetGraph, rank)); - } else { - comm->collNetnChannels = 0; - } + NCCLCHECK(ncclTopoPostset(comm, nodesFirstRank, nodesTreePatterns, allTopoRanks, rings, &collNetGraph, nc)); free(allTopoRanks); free(nodesTreePatterns); @@ -657,46 +727,58 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t struct ncclChannel* channel = comm->channels+c; NCCLCHECKGOTO(setupChannel(comm, c, rank, nranks, rings+c*nranks), ret, affinity_restore); if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->ring.prev, 1, &channel->ring.next, 0), ret, affinity_restore); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &ringGraph, 0), ret, affinity_restore); INFO(NCCL_INIT, "Connected all rings"); // Connect Trees for (int c=0; cnChannels; c++) { struct ncclChannel* channel = comm->channels+c; if (comm->nRanks == 1) continue; - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up), ret, affinity_restore); - NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, NCCL_MAX_TREE_ARITY, channel->tree.down, 1, &channel->tree.up, 0), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pConnect(comm, channel, 1, &channel->tree.up, NCCL_MAX_TREE_ARITY, channel->tree.down, 0), ret, affinity_restore); } - NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph), ret, affinity_restore); + NCCLCHECKGOTO(ncclTransportP2pSetup(comm, &treeGraph, 0), ret, affinity_restore); INFO(NCCL_INIT, "Connected all trees"); // Check if we can setup CollNet - if (comm->nNodes > 1 && - ncclParamCollNetEnable() == 1 && - collNetSupport() && collNetGraph.nChannels) { - for (int c=comm->nChannels; ccollNetnChannels; c++) - NCCLCHECK(initChannel(comm, c));; - int logicChannels = comm->collNetnChannels/2; + if (comm->collNetSupport > 0) { int collNetSetupFail = 0; - const int recvIndex = 0; // recv GPU index is always 0 - const int sendIndex = collNetGraph.pattern == NCCL_TOPO_PATTERN_TREE ? 0 : 1; // send GPU index depends on topo pattern - for (int c=0; cchannels+logicChannels+c; - struct ncclChannel* channelSend = comm->channels+c; - NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, 1, &channelRecv->collTree.up, 1, channelRecv->collTree.down)); - NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, 1, channelSend->collTree.down, 1, &channelSend->collTree.up)); - const int recvMaster = collNetGraph.intra[c*comm->localRanks+recvIndex]; - const int sendMaster = collNetGraph.intra[c*comm->localRanks+sendIndex]; - if (collNetSetup(comm, &collNetGraph, channelRecv, rank, nranks, recvMaster, sendMaster, comm->nNodes, 1) != 1) - collNetSetupFail = 1; - else if (collNetSetup(comm, &collNetGraph, channelSend, rank, nranks, sendMaster, recvMaster, comm->nNodes, 0) != 1) - collNetSetupFail = 1; + // Find all head ranks + int nHeads = collNetGraph.nChannels; + int *heads; + NCCLCHECK(ncclCalloc(&heads, nHeads)); + // Head GPU index is always 0 + for (int c=0; clocalRanks+0]; + } + for (int c=0; cnChannels; c++) { + struct ncclChannel* channel = comm->channels+c; + for (int h=0; hcollNetSupport) { + TRACE(NCCL_INIT, "rank %d Connected inter-node CollNet", rank); + for (int c=0; cnChannels; c++) { + struct ncclChannel* channelRecv = comm->channels+c; + NCCLCHECK(ncclTransportP2pConnect(comm, channelRecv, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.up, NCCL_MAX_DIRECT_ARITY, channelRecv->collTree.down, 0)); + } + NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 0)); + for (int c=0; cnChannels; c++) { + struct ncclChannel* channelSend = comm->channels+c; + NCCLCHECK(ncclTransportP2pConnect(comm, channelSend, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.down, NCCL_MAX_DIRECT_ARITY, channelSend->collTree.up, 1)); + } + NCCLCHECK(ncclTransportP2pSetup(comm, &collNetGraph, 1)); + INFO(NCCL_INIT, "rank %d Connected CollNet", rank); + } } TRACE(NCCL_INIT, "rank %d nranks %d - CONNECTED %d RINGS AND TREES", rank, nranks, comm->nChannels); free(rings); @@ -714,10 +796,9 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t // We should have allocated all buffers, collective fifos, ... we can // restore the affinity. affinity_restore: - sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); + //sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave); if (ret != ncclSuccess) return ret; TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks); - return ncclSuccess; }